[AMDGPU] Add S_MOV_B64_IMM_PSEUDO for wide constants

This is to allow 64-bit constant rematerialization. If a constant
is split into two separate moves initializing sub0 and sub1, as is
done now, RA cannot rematerialize a 64-bit register.

This gives 10-20% uplift in a set of huge apps heavily using double
precision math.

Fixes: SWDEV-292645

Differential Revision: https://reviews.llvm.org/D104874
This commit is contained in:
Stanislav Mekhanoshin 2021-06-28 13:24:24 -07:00
parent 822b92aae4
commit 381ded345b
38 changed files with 606 additions and 269 deletions

View File

@ -75,6 +75,7 @@ FunctionPass *createAMDGPURewriteOutArgumentsPass();
ModulePass *createAMDGPUReplaceLDSUseWithPointerPass();
ModulePass *createAMDGPULowerModuleLDSPass();
FunctionPass *createSIModeRegisterPass();
FunctionPass *createGCNPreRAOptimizationsPass();
struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
AMDGPUSimplifyLibCallsPass(TargetMachine &TM) : TM(TM) {}
@ -348,6 +349,9 @@ extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;
void initializeGCNNSAReassignPass(PassRegistry &);
extern char &GCNNSAReassignID;
void initializeGCNPreRAOptimizationsPass(PassRegistry &);
extern char &GCNPreRAOptimizationsID;
namespace AMDGPU {
enum TargetIndex {
TI_CONSTDATA_START,

View File

@ -208,6 +208,11 @@ static cl::opt<bool, true> EnableLowerModuleLDS(
cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
cl::Hidden);
// Hidden command-line switch for the GCN pre-RA optimizations pass. Note that
// addOptimizedRegAlloc() only honours this value when the option is given
// explicitly on the command line; otherwise the pass is enabled based on the
// optimization level.
static cl::opt<bool> EnablePreRAOptimizations(
"amdgpu-enable-pre-ra-optimizations",
cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
cl::Hidden);
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@ -275,6 +280,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUSimplifyLibCallsPass(*PR);
initializeAMDGPUPrintfRuntimeBindingPass(*PR);
initializeGCNNSAReassignPass(*PR);
initializeGCNPreRAOptimizationsPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@ -1191,6 +1197,11 @@ void GCNPassConfig::addOptimizedRegAlloc() {
if (OptExecMaskPreRA)
insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
if (EnablePreRAOptimizations.getNumOccurrences()
? EnablePreRAOptimizations
: TM->getOptLevel() > CodeGenOpt::Less)
insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);
// This is not an essential optimization and it has a noticeable impact on
// compilation time, so we only enable it from O2.
if (TM->getOptLevel() > CodeGenOpt::Less)

View File

@ -143,6 +143,7 @@ add_llvm_target(AMDGPUCodeGen
GCNILPSched.cpp
GCNNSAReassign.cpp
GCNDPPCombine.cpp
GCNPreRAOptimizations.cpp
SIModeRegister.cpp
LINK_COMPONENTS

View File

@ -0,0 +1,162 @@
//===-- GCNPreRAOptimizations.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass combines split register tuple initialization into a single pseudo:
///
/// undef %0.sub1:sreg_64 = S_MOV_B32 1
/// %0.sub0:sreg_64 = S_MOV_B32 2
/// =>
/// %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 0x200000001
///
/// This is to allow rematerialization of a value instead of spilling. It is
/// supposed to be done after register coalescer to allow it to do its job and
/// before actual register allocation to allow rematerialization.
///
/// Right now the pass only handles 64 bit SGPRs with immediate initializers,
/// although the same shall be possible with other register classes and
/// instructions if necessary.
///
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
using namespace llvm;
#define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
namespace {

/// Machine function pass that merges split immediate initializations of
/// 64-bit SGPR tuples into a single S_MOV_B64_IMM_PSEUDO so that the value
/// can be rematerialized instead of spilled.
class GCNPreRAOptimizations : public MachineFunctionPass {
private:
  // Per-function state, assigned at the start of runOnMachineFunction().
  // Initialized to null so the members never hold indeterminate values.
  const SIInstrInfo *TII = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  LiveIntervals *LIS = nullptr;

  /// Tries to combine the sub0/sub1 S_MOV_B32 definitions of \p Reg into one
  /// 64-bit pseudo move. Returns true if the code was changed.
  bool processReg(Register Reg);

public:
  static char ID;

  GCNPreRAOptimizations() : MachineFunctionPass(ID) {
    initializeGCNPreRAOptimizationsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AMDGPU Pre-RA optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // LiveIntervals is required because processReg() updates the instruction
    // maps and recomputes the interval of every register it rewrites.
    AU.addRequired<LiveIntervals>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.
// Register the pass together with its LiveIntervals dependency. The
// human-readable name is the same in both macros and matches getPassName().
INITIALIZE_PASS_BEGIN(GCNPreRAOptimizations, DEBUG_TYPE,
                      "AMDGPU Pre-RA optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(GCNPreRAOptimizations, DEBUG_TYPE,
                    "AMDGPU Pre-RA optimizations", false, false)

char GCNPreRAOptimizations::ID = 0;

// Exported pass identifier; the target pass configuration refers to the pass
// through this reference.
char &llvm::GCNPreRAOptimizationsID = GCNPreRAOptimizations::ID;

/// Creates a new instance of the pass.
FunctionPass *llvm::createGCNPreRAOptimizationsPass() {
  return new GCNPreRAOptimizations();
}
/// Attempts to fold the two 32-bit immediate moves that initialize the halves
/// of \p Reg into a single S_MOV_B64_IMM_PSEUDO.
///
/// The rewrite is only performed when every definition of \p Reg is a plain
/// two-operand S_MOV_B32 of an immediate, there is exactly one definition of
/// sub0 and exactly one of sub1, and both are in the same basic block.
///
/// \returns true if the instructions were combined, false otherwise.
bool GCNPreRAOptimizations::processReg(Register Reg) {
  MachineInstr *LoDef = nullptr; // The definition of sub0, once seen.
  MachineInstr *HiDef = nullptr; // The definition of sub1, once seen.
  uint64_t Imm64 = 0;            // Immediate assembled from both halves.

  for (MachineInstr &MI : MRI->def_instructions(Reg)) {
    if (MI.getOpcode() != AMDGPU::S_MOV_B32)
      return false;

    const auto &Dst = MI.getOperand(0);
    if (Dst.getReg() != Reg)
      return false;

    const auto &Src = MI.getOperand(1);
    if (!Src.isImm() || MI.getNumOperands() != 2)
      return false;

    const unsigned SubReg = Dst.getSubReg();
    if (SubReg == AMDGPU::sub0) {
      // A second write to the low half disqualifies the register.
      if (LoDef)
        return false;
      LoDef = &MI;
      Imm64 |= Src.getImm() & 0xffffffff;
    } else if (SubReg == AMDGPU::sub1) {
      // A second write to the high half disqualifies the register.
      if (HiDef)
        return false;
      HiDef = &MI;
      Imm64 |= static_cast<uint64_t>(Src.getImm()) << 32;
    } else {
      // Full-register or any other subregister definition: not our pattern.
      return false;
    }
  }

  const bool HaveBothHalves = LoDef && HiDef;
  if (!HaveBothHalves || LoDef->getParent() != HiDef->getParent())
    return false;

  LLVM_DEBUG(dbgs() << "Combining:\n " << *LoDef << " " << *HiDef
                    << " =>\n");

  // Order the two definitions by position so that the replacement is
  // inserted where the earlier one currently is.
  MachineInstr *FirstDef = LoDef;
  MachineInstr *SecondDef = HiDef;
  if (SlotIndex::isEarlierInstr(LIS->getInstructionIndex(*SecondDef),
                                LIS->getInstructionIndex(*FirstDef)))
    std::swap(FirstDef, SecondDef);

  // Drop the old instructions from the slot index maps before erasing them.
  LIS->RemoveMachineInstrFromMaps(*FirstDef);
  LIS->RemoveMachineInstrFromMaps(*SecondDef);

  auto &Block = *FirstDef->getParent();
  auto NewI = BuildMI(Block, *FirstDef, FirstDef->getDebugLoc(),
                      TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), Reg)
                  .addImm(Imm64);

  FirstDef->eraseFromParent();
  SecondDef->eraseFromParent();

  // Index the new instruction and rebuild the live interval from scratch.
  LIS->InsertMachineInstrInMaps(*NewI);
  LIS->removeInterval(Reg);
  LIS->createAndComputeVirtRegInterval(Reg);

  LLVM_DEBUG(dbgs() << " " << *NewI);

  return true;
}
/// Pass entry point. Walks all virtual registers of \p MF and runs
/// processReg() on each one that has a live interval and belongs to a 64-bit
/// SGPR register class.
///
/// \returns true if at least one register was rewritten.
bool GCNPreRAOptimizations::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  // Cache the per-function objects used by processReg().
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  TII = Subtarget.getInstrInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();
  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();

  bool MadeChange = false;
  const unsigned NumVRegs = MRI->getNumVirtRegs();
  for (unsigned Idx = 0; Idx < NumVRegs; ++Idx) {
    Register VReg = Register::index2VirtReg(Idx);
    if (!LIS->hasInterval(VReg))
      continue;

    const TargetRegisterClass *RC = MRI->getRegClass(VReg);
    const bool IsSGPR64 =
        RC->MC->getSizeInBits() == 64 && TRI->isSGPRClass(RC);
    if (IsSGPR64 && processReg(VReg))
      MadeChange = true;
  }

  return MadeChange;
}

View File

@ -1765,6 +1765,30 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
expandMovDPP64(MI);
break;
}
// Expand the 64-bit scalar immediate move pseudo.
case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
const MachineOperand &SrcOp = MI.getOperand(1);
assert(!SrcOp.isFPImm());
APInt Imm(64, SrcOp.getImm());
// An immediate that fits in 32 bits or is an inline constant needs no
// splitting: just switch the instruction descriptor to S_MOV_B64.
if (Imm.isIntN(32) || isInlineConstant(Imm)) {
MI.setDesc(get(AMDGPU::S_MOV_B64));
break;
}
// Otherwise emit one S_MOV_B32 per 32-bit half of the destination register.
Register Dst = MI.getOperand(0).getReg();
Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
APInt Lo(32, Imm.getLoBits(32).getZExtValue());
APInt Hi(32, Imm.getHiBits(32).getZExtValue());
// Each half move also carries an implicit def of the full 64-bit register.
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
.addImm(Lo.getSExtValue())
.addReg(Dst, RegState::Implicit | RegState::Define);
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
.addImm(Hi.getSExtValue())
.addReg(Dst, RegState::Implicit | RegState::Define);
// The pseudo has been replaced by the two moves above.
MI.eraseFromParent();
break;
}
case AMDGPU::V_SET_INACTIVE_B32: {
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

View File

@ -111,6 +111,18 @@ def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64> {
let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete.
}
// 64-bit scalar move immediate instruction. This is used to avoid subregs
// initialization and allow rematerialization.
def S_MOV_B64_IMM_PSEUDO : SPseudoInstSI <(outs SReg_64:$sdst),
(ins i64imm:$src0)> {
// Mark the pseudo as a cheap, rematerializable move of an immediate so the
// value can be recomputed instead of spilled.
let isReMaterializable = 1;
let isAsCheapAsAMove = 1;
let isMoveImm = 1;
let SchedRW = [WriteSALU, Write64Bit];
let Size = 16; // Needs maximum 2 s_mov_b32 instructions 8 byte long each.
// No implicit register uses. NOTE(review): presumably this overrides a
// default inherited from SPseudoInstSI -- confirm.
let Uses = [];
}
// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
// WQM pass processes it.
def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;

View File

@ -1097,11 +1097,11 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
; SI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v3
; SI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: v_and_b32_e32 v3, s6, v3
; SI-NEXT: s_movk_i32 s5, 0x80
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: s_movk_i32 s5, 0x80
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
; SI-NEXT: v_and_b32_e32 v1, 1, v0
@ -1129,11 +1129,11 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v3
; VI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
; VI-NEXT: s_mov_b32 s4, 0
; VI-NEXT: v_and_b32_e32 v3, s6, v3
; VI-NEXT: s_movk_i32 s5, 0x80
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; VI-NEXT: s_mov_b32 s4, 0
; VI-NEXT: s_movk_i32 s5, 0x80
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
; VI-NEXT: v_and_b32_e32 v1, 1, v0
@ -1165,10 +1165,10 @@ define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) {
; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v3
; SI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
; SI-NEXT: v_and_b32_e32 v3, s4, v3
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: s_movk_i32 s5, 0x80
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
; SI-NEXT: v_and_b32_e32 v1, 1, v0
@ -1195,10 +1195,10 @@ define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) {
; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v3
; VI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
; VI-NEXT: v_and_b32_e32 v3, s4, v3
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; VI-NEXT: s_mov_b32 s4, 0
; VI-NEXT: s_movk_i32 s5, 0x80
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
; VI-NEXT: v_and_b32_e32 v1, 1, v0

View File

@ -2751,9 +2751,9 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
; GPRIDX-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GPRIDX-NEXT: s_load_dword s8, s[4:5], 0x8
; GPRIDX-NEXT: s_mov_b32 s0, 0
; GPRIDX-NEXT: s_mov_b32 s1, 0x40140000
; GPRIDX-NEXT: s_mov_b32 s3, 0x40080000
; GPRIDX-NEXT: s_mov_b32 s2, s0
; GPRIDX-NEXT: s_mov_b32 s1, 0x40140000
; GPRIDX-NEXT: s_waitcnt lgkmcnt(0)
; GPRIDX-NEXT: s_cmp_eq_u32 s8, 1
; GPRIDX-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
@ -2842,9 +2842,9 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
; MOVREL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; MOVREL-NEXT: s_load_dword s8, s[4:5], 0x8
; MOVREL-NEXT: s_mov_b32 s0, 0
; MOVREL-NEXT: s_mov_b32 s1, 0x40140000
; MOVREL-NEXT: s_mov_b32 s3, 0x40080000
; MOVREL-NEXT: s_mov_b32 s2, s0
; MOVREL-NEXT: s_mov_b32 s1, 0x40140000
; MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; MOVREL-NEXT: s_cmp_eq_u32 s8, 1
; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
@ -2935,9 +2935,9 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
; GFX10-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: s_mov_b32 s3, 0x40140000
; GFX10-NEXT: s_mov_b32 s5, 0x40080000
; GFX10-NEXT: s_mov_b32 s4, s2
; GFX10-NEXT: s_mov_b32 s3, 0x40140000
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_eq_u32 s8, 1
@ -3837,21 +3837,21 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(double addrspace(1)* %out, i3
; GPRIDX-NEXT: runtime_loader_kernel_symbol = 0
; GPRIDX-NEXT: .end_amd_kernel_code_t
; GPRIDX-NEXT: ; %bb.0: ; %entry
; GPRIDX-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GPRIDX-NEXT: s_load_dword s6, s[4:5], 0x8
; GPRIDX-NEXT: s_mov_b32 s0, 0
; GPRIDX-NEXT: s_mov_b32 s1, 0x40080000
; GPRIDX-NEXT: s_mov_b32 s2, 0
; GPRIDX-NEXT: s_mov_b32 s3, 0x40080000
; GPRIDX-NEXT: v_mov_b32_e32 v2, 0
; GPRIDX-NEXT: s_waitcnt lgkmcnt(0)
; GPRIDX-NEXT: s_cmp_eq_u32 s6, 1
; GPRIDX-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
; GPRIDX-NEXT: s_cmp_eq_u32 s6, 2
; GPRIDX-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
; GPRIDX-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
; GPRIDX-NEXT: s_cmp_eq_u32 s6, 3
; GPRIDX-NEXT: s_cselect_b64 s[0:1], 4.0, s[0:1]
; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
; GPRIDX-NEXT: v_mov_b32_e32 v1, s1
; GPRIDX-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GPRIDX-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3]
; GPRIDX-NEXT: v_mov_b32_e32 v0, s2
; GPRIDX-NEXT: v_mov_b32_e32 v1, s3
; GPRIDX-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GPRIDX-NEXT: s_endpgm
;
; MOVREL-LABEL: dyn_extract_v4f64_s_s_s:
@ -3924,21 +3924,21 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(double addrspace(1)* %out, i3
; MOVREL-NEXT: runtime_loader_kernel_symbol = 0
; MOVREL-NEXT: .end_amd_kernel_code_t
; MOVREL-NEXT: ; %bb.0: ; %entry
; MOVREL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; MOVREL-NEXT: s_load_dword s6, s[4:5], 0x8
; MOVREL-NEXT: s_mov_b32 s0, 0
; MOVREL-NEXT: s_mov_b32 s1, 0x40080000
; MOVREL-NEXT: s_mov_b32 s2, 0
; MOVREL-NEXT: s_mov_b32 s3, 0x40080000
; MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; MOVREL-NEXT: v_mov_b32_e32 v2, s2
; MOVREL-NEXT: v_mov_b32_e32 v3, s1
; MOVREL-NEXT: s_cmp_eq_u32 s6, 1
; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
; MOVREL-NEXT: s_cmp_eq_u32 s6, 2
; MOVREL-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
; MOVREL-NEXT: s_cmp_eq_u32 s6, 3
; MOVREL-NEXT: s_cselect_b64 s[0:1], 4.0, s[0:1]
; MOVREL-NEXT: v_mov_b32_e32 v0, s0
; MOVREL-NEXT: v_mov_b32_e32 v1, s1
; MOVREL-NEXT: v_mov_b32_e32 v3, s3
; MOVREL-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3]
; MOVREL-NEXT: v_mov_b32_e32 v0, s2
; MOVREL-NEXT: v_mov_b32_e32 v1, s3
; MOVREL-NEXT: v_mov_b32_e32 v2, s0
; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; MOVREL-NEXT: s_endpgm
;
@ -4078,8 +4078,7 @@ define i32 @v_extract_v64i32_32(<64 x i32> addrspace(1)* %ptr) {
; MOVREL-LABEL: v_extract_v64i32_32:
; MOVREL: ; %bb.0:
; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MOVREL-NEXT: s_movk_i32 s4, 0x80
; MOVREL-NEXT: s_mov_b32 s5, 0
; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80
; MOVREL-NEXT: v_mov_b32_e32 v2, s4
; MOVREL-NEXT: v_mov_b32_e32 v3, s5
; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
@ -4112,8 +4111,7 @@ define i32 @v_extract_v64i32_33(<64 x i32> addrspace(1)* %ptr) {
; MOVREL-LABEL: v_extract_v64i32_33:
; MOVREL: ; %bb.0:
; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MOVREL-NEXT: s_movk_i32 s4, 0x80
; MOVREL-NEXT: s_mov_b32 s5, 0
; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80
; MOVREL-NEXT: v_mov_b32_e32 v2, s4
; MOVREL-NEXT: v_mov_b32_e32 v3, s5
; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
@ -4140,8 +4138,7 @@ define i32 @v_extract_v64i32_37(<64 x i32> addrspace(1)* %ptr) {
; GPRIDX-LABEL: v_extract_v64i32_37:
; GPRIDX: ; %bb.0:
; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GPRIDX-NEXT: s_movk_i32 s4, 0x80
; GPRIDX-NEXT: s_mov_b32 s5, 0
; GPRIDX-NEXT: s_mov_b64 s[4:5], 0x80
; GPRIDX-NEXT: v_mov_b32_e32 v2, s4
; GPRIDX-NEXT: v_mov_b32_e32 v3, s5
; GPRIDX-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
@ -4154,8 +4151,7 @@ define i32 @v_extract_v64i32_37(<64 x i32> addrspace(1)* %ptr) {
; MOVREL-LABEL: v_extract_v64i32_37:
; MOVREL: ; %bb.0:
; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MOVREL-NEXT: s_movk_i32 s4, 0x80
; MOVREL-NEXT: s_mov_b32 s5, 0
; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80
; MOVREL-NEXT: v_mov_b32_e32 v2, s4
; MOVREL-NEXT: v_mov_b32_e32 v3, s5
; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
@ -4171,8 +4167,7 @@ define i32 @v_extract_v64i32_37(<64 x i32> addrspace(1)* %ptr) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_movk_i32 s4, 0x80
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b64 s[4:5], 0x80
; GFX10-NEXT: v_mov_b32_e32 v2, s4
; GFX10-NEXT: v_mov_b32_e32 v3, s5
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2

View File

@ -7,8 +7,8 @@ define double @v_floor_f64_ieee(double %x) {
; GFX6-LABEL: v_floor_f64_ieee:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
; GFX6-NEXT: s_mov_b32 s4, -1
; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
@ -30,8 +30,8 @@ define double @v_floor_f64_ieee_nnan(double %x) {
; GFX6-LABEL: v_floor_f64_ieee_nnan:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
; GFX6-NEXT: s_mov_b32 s4, -1
; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
@ -50,8 +50,8 @@ define double @v_floor_f64_ieee_fneg(double %x) {
; GFX6-LABEL: v_floor_f64_ieee_fneg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1]
; GFX6-NEXT: s_mov_b32 s4, -1
; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1]
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
@ -74,8 +74,8 @@ define double @v_floor_f64_nonieee(double %x) #1 {
; GFX6-LABEL: v_floor_f64_nonieee:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
; GFX6-NEXT: s_mov_b32 s4, -1
; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
@ -97,8 +97,8 @@ define double @v_floor_f64_nonieee_nnan(double %x) #1 {
; GFX6-LABEL: v_floor_f64_nonieee_nnan:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
; GFX6-NEXT: s_mov_b32 s4, -1
; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
@ -117,8 +117,8 @@ define double @v_floor_f64_non_ieee_fneg(double %x) #1 {
; GFX6-LABEL: v_floor_f64_non_ieee_fneg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1]
; GFX6-NEXT: s_mov_b32 s4, -1
; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1]
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
@ -141,8 +141,8 @@ define double @v_floor_f64_fabs(double %x) {
; GFX6-LABEL: v_floor_f64_fabs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_fract_f64_e64 v[2:3], |v[0:1]|
; GFX6-NEXT: s_mov_b32 s4, -1
; GFX6-NEXT: v_fract_f64_e64 v[2:3], |v[0:1]|
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
@ -170,8 +170,8 @@ define double @v_floor_f64_fneg_fabs(double %x) {
; GFX6-LABEL: v_floor_f64_fneg_fabs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_fract_f64_e64 v[2:3], -|v[0:1]|
; GFX6-NEXT: s_mov_b32 s4, -1
; GFX6-NEXT: v_fract_f64_e64 v[2:3], -|v[0:1]|
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
@ -194,8 +194,8 @@ define double @v_floor_f64_fneg_fabs(double %x) {
define amdgpu_ps <2 x float> @s_floor_f64(double inreg %x) {
; GFX6-LABEL: s_floor_f64:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_fract_f64_e32 v[0:1], s[2:3]
; GFX6-NEXT: s_mov_b32 s0, -1
; GFX6-NEXT: v_fract_f64_e32 v[0:1], s[2:3]
; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1]
; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
@ -218,8 +218,8 @@ define amdgpu_ps <2 x float> @s_floor_f64(double inreg %x) {
define amdgpu_ps <2 x float> @s_floor_f64_fneg(double inreg %x) {
; GFX6-LABEL: s_floor_f64_fneg:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_fract_f64_e64 v[0:1], -s[2:3]
; GFX6-NEXT: s_mov_b32 s0, -1
; GFX6-NEXT: v_fract_f64_e64 v[0:1], -s[2:3]
; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1]
; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
@ -243,8 +243,8 @@ define amdgpu_ps <2 x float> @s_floor_f64_fneg(double inreg %x) {
define amdgpu_ps <2 x float> @s_floor_f64_fabs(double inreg %x) {
; GFX6-LABEL: s_floor_f64_fabs:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_fract_f64_e64 v[0:1], |s[2:3]|
; GFX6-NEXT: s_mov_b32 s0, -1
; GFX6-NEXT: v_fract_f64_e64 v[0:1], |s[2:3]|
; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1]
; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
@ -268,8 +268,8 @@ define amdgpu_ps <2 x float> @s_floor_f64_fabs(double inreg %x) {
define amdgpu_ps <2 x float> @s_floor_f64_fneg_fabs(double inreg %x) {
; GFX6-LABEL: s_floor_f64_fneg_fabs:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_fract_f64_e64 v[0:1], -|s[2:3]|
; GFX6-NEXT: s_mov_b32 s0, -1
; GFX6-NEXT: v_fract_f64_e64 v[0:1], -|s[2:3]|
; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1]
; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]

View File

@ -4703,8 +4703,7 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) {
; GFX6-LABEL: s_fshl_i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_movk_i32 s10, 0x7f
; GFX6-NEXT: s_mov_b32 s11, 0
; GFX6-NEXT: s_mov_b64 s[10:11], 0x7f
; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
; GFX6-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
; GFX6-NEXT: s_sub_i32 s9, s12, 64
@ -4751,8 +4750,7 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
;
; GFX8-LABEL: s_fshl_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_movk_i32 s10, 0x7f
; GFX8-NEXT: s_mov_b32 s11, 0
; GFX8-NEXT: s_mov_b64 s[10:11], 0x7f
; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
; GFX8-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
; GFX8-NEXT: s_sub_i32 s9, s12, 64
@ -4799,8 +4797,7 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
;
; GFX9-LABEL: s_fshl_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_movk_i32 s10, 0x7f
; GFX9-NEXT: s_mov_b32 s11, 0
; GFX9-NEXT: s_mov_b64 s[10:11], 0x7f
; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
; GFX9-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
; GFX9-NEXT: s_sub_i32 s9, s12, 64
@ -4847,8 +4844,7 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
;
; GFX10-LABEL: s_fshl_i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_movk_i32 s10, 0x7f
; GFX10-NEXT: s_mov_b32 s11, 0
; GFX10-NEXT: s_mov_b64 s[10:11], 0x7f
; GFX10-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
; GFX10-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
; GFX10-NEXT: s_sub_i32 s9, s12, 64
@ -5321,8 +5317,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) {
; GFX6-LABEL: v_fshl_i128_svs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_movk_i32 s6, 0x7f
; GFX6-NEXT: s_mov_b32 s7, 0
; GFX6-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX6-NEXT: s_sub_i32 s5, s8, 64
@ -5379,8 +5374,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
;
; GFX8-LABEL: v_fshl_i128_svs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_movk_i32 s6, 0x7f
; GFX8-NEXT: s_mov_b32 s7, 0
; GFX8-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX8-NEXT: s_sub_i32 s5, s8, 64
@ -5437,8 +5431,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
;
; GFX9-LABEL: v_fshl_i128_svs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_movk_i32 s6, 0x7f
; GFX9-NEXT: s_mov_b32 s7, 0
; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX9-NEXT: s_sub_i32 s5, s8, 64
@ -5495,8 +5488,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
;
; GFX10-LABEL: v_fshl_i128_svs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_movk_i32 s6, 0x7f
; GFX10-NEXT: s_mov_b32 s7, 0
; GFX10-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX10-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1]
; GFX10-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX10-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
@ -5556,8 +5548,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) {
; GFX6-LABEL: v_fshl_i128_vss:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_movk_i32 s6, 0x7f
; GFX6-NEXT: s_mov_b32 s7, 0
; GFX6-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX6-NEXT: s_sub_i32 s6, 64, s8
@ -5612,8 +5603,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
;
; GFX8-LABEL: v_fshl_i128_vss:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_movk_i32 s6, 0x7f
; GFX8-NEXT: s_mov_b32 s7, 0
; GFX8-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX8-NEXT: s_sub_i32 s6, 64, s8
@ -5668,8 +5658,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
;
; GFX9-LABEL: v_fshl_i128_vss:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_movk_i32 s6, 0x7f
; GFX9-NEXT: s_mov_b32 s7, 0
; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX9-NEXT: s_sub_i32 s6, 64, s8
@ -5724,8 +5713,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
;
; GFX10-LABEL: v_fshl_i128_vss:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_movk_i32 s6, 0x7f
; GFX10-NEXT: s_mov_b32 s7, 0
; GFX10-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX10-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX10-NEXT: s_andn2_b64 s[6:7], s[6:7], s[4:5]
; GFX10-NEXT: s_sub_i32 s4, 64, s8
@ -5902,8 +5890,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) {
; GFX6-LABEL: s_fshl_v2i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_movk_i32 s18, 0x7f
; GFX6-NEXT: s_mov_b32 s19, 0
; GFX6-NEXT: s_mov_b64 s[18:19], 0x7f
; GFX6-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
; GFX6-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
; GFX6-NEXT: s_sub_i32 s17, s22, 64
@ -5991,8 +5978,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
;
; GFX8-LABEL: s_fshl_v2i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_movk_i32 s18, 0x7f
; GFX8-NEXT: s_mov_b32 s19, 0
; GFX8-NEXT: s_mov_b64 s[18:19], 0x7f
; GFX8-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
; GFX8-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
; GFX8-NEXT: s_sub_i32 s17, s22, 64
@ -6080,8 +6066,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
;
; GFX9-LABEL: s_fshl_v2i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_movk_i32 s18, 0x7f
; GFX9-NEXT: s_mov_b32 s19, 0
; GFX9-NEXT: s_mov_b64 s[18:19], 0x7f
; GFX9-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
; GFX9-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
; GFX9-NEXT: s_sub_i32 s17, s22, 64
@ -6169,8 +6154,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
;
; GFX10-LABEL: s_fshl_v2i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_movk_i32 s18, 0x7f
; GFX10-NEXT: s_mov_b32 s19, 0
; GFX10-NEXT: s_mov_b64 s[18:19], 0x7f
; GFX10-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
; GFX10-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
; GFX10-NEXT: s_sub_i32 s17, s22, 64

View File

@ -4840,8 +4840,7 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) {
; GFX6-LABEL: s_fshr_i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_movk_i32 s10, 0x7f
; GFX6-NEXT: s_mov_b32 s11, 0
; GFX6-NEXT: s_mov_b64 s[10:11], 0x7f
; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
; GFX6-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
; GFX6-NEXT: s_sub_i32 s9, 64, 1
@ -4888,8 +4887,7 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
;
; GFX8-LABEL: s_fshr_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_movk_i32 s10, 0x7f
; GFX8-NEXT: s_mov_b32 s11, 0
; GFX8-NEXT: s_mov_b64 s[10:11], 0x7f
; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
; GFX8-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
; GFX8-NEXT: s_sub_i32 s9, 64, 1
@ -4936,8 +4934,7 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
;
; GFX9-LABEL: s_fshr_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_movk_i32 s10, 0x7f
; GFX9-NEXT: s_mov_b32 s11, 0
; GFX9-NEXT: s_mov_b64 s[10:11], 0x7f
; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
; GFX9-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
; GFX9-NEXT: s_sub_i32 s9, 64, 1
@ -4984,8 +4981,7 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
;
; GFX10-LABEL: s_fshr_i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_movk_i32 s10, 0x7f
; GFX10-NEXT: s_mov_b32 s11, 0
; GFX10-NEXT: s_mov_b64 s[10:11], 0x7f
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX10-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
; GFX10-NEXT: s_sub_i32 s13, 64, 1
@ -5458,8 +5454,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) {
; GFX6-LABEL: v_fshr_i128_svs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_movk_i32 s6, 0x7f
; GFX6-NEXT: s_mov_b32 s7, 0
; GFX6-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX6-NEXT: s_sub_i32 s5, 64, 1
@ -5515,8 +5510,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
;
; GFX8-LABEL: v_fshr_i128_svs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_movk_i32 s6, 0x7f
; GFX8-NEXT: s_mov_b32 s7, 0
; GFX8-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX8-NEXT: s_sub_i32 s5, 64, 1
@ -5572,8 +5566,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
;
; GFX9-LABEL: v_fshr_i128_svs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_movk_i32 s6, 0x7f
; GFX9-NEXT: s_mov_b32 s7, 0
; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX9-NEXT: s_sub_i32 s5, 64, 1
@ -5629,8 +5622,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
;
; GFX10-LABEL: v_fshr_i128_svs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_movk_i32 s6, 0x7f
; GFX10-NEXT: s_mov_b32 s7, 0
; GFX10-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX10-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX10-NEXT: s_sub_i32 s9, 64, 1
@ -5689,8 +5681,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) {
; GFX6-LABEL: v_fshr_i128_vss:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_movk_i32 s6, 0x7f
; GFX6-NEXT: s_mov_b32 s7, 0
; GFX6-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX6-NEXT: s_sub_i32 s5, 64, 1
@ -5746,8 +5737,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
;
; GFX8-LABEL: v_fshr_i128_vss:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_movk_i32 s6, 0x7f
; GFX8-NEXT: s_mov_b32 s7, 0
; GFX8-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX8-NEXT: s_sub_i32 s5, 64, 1
@ -5803,8 +5793,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
;
; GFX9-LABEL: v_fshr_i128_vss:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_movk_i32 s6, 0x7f
; GFX9-NEXT: s_mov_b32 s7, 0
; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; GFX9-NEXT: s_sub_i32 s5, 64, 1
@ -5863,19 +5852,18 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX10-NEXT: s_sub_i32 s6, 64, 1
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; GFX10-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1]
; GFX10-NEXT: s_movk_i32 s6, 0x7f
; GFX10-NEXT: s_mov_b32 s7, 0
; GFX10-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; GFX10-NEXT: s_andn2_b64 s[8:9], s[6:7], s[4:5]
; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7]
; GFX10-NEXT: s_sub_i32 s4, 64, s8
; GFX10-NEXT: v_or_b32_e32 v2, v4, v2
; GFX10-NEXT: v_or_b32_e32 v3, v5, v3
; GFX10-NEXT: s_sub_i32 s4, 64, s8
; GFX10-NEXT: s_sub_i32 s5, s8, 64
; GFX10-NEXT: s_cmp_lt_u32 s8, 64
; GFX10-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3]
; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0
; GFX10-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3]
; GFX10-NEXT: s_cmp_eq_u32 s8, 0
; GFX10-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1]
; GFX10-NEXT: s_cselect_b32 s7, 1, 0
@ -6044,8 +6032,7 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) {
define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) {
; GFX6-LABEL: s_fshr_v2i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_movk_i32 s18, 0x7f
; GFX6-NEXT: s_mov_b32 s19, 0
; GFX6-NEXT: s_mov_b64 s[18:19], 0x7f
; GFX6-NEXT: s_sub_i32 s28, 64, 1
; GFX6-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
; GFX6-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
@ -6133,8 +6120,7 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
;
; GFX8-LABEL: s_fshr_v2i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_movk_i32 s18, 0x7f
; GFX8-NEXT: s_mov_b32 s19, 0
; GFX8-NEXT: s_mov_b64 s[18:19], 0x7f
; GFX8-NEXT: s_sub_i32 s28, 64, 1
; GFX8-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
; GFX8-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
@ -6222,8 +6208,7 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
;
; GFX9-LABEL: s_fshr_v2i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_movk_i32 s18, 0x7f
; GFX9-NEXT: s_mov_b32 s19, 0
; GFX9-NEXT: s_mov_b64 s[18:19], 0x7f
; GFX9-NEXT: s_sub_i32 s28, 64, 1
; GFX9-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
; GFX9-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
@ -6311,13 +6296,12 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
;
; GFX10-LABEL: s_fshr_v2i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b64 s[18:19], 0x7f
; GFX10-NEXT: s_sub_i32 s28, 64, 1
; GFX10-NEXT: s_movk_i32 s18, 0x7f
; GFX10-NEXT: s_mov_b32 s19, 0
; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s28
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX10-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
; GFX10-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s28
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
; GFX10-NEXT: s_or_b64 s[2:3], s[24:25], s[2:3]
; GFX10-NEXT: s_sub_i32 s23, s16, 64

View File

@ -19,8 +19,7 @@ define i32 @global_atomic_csub_offset(i32 addrspace(1)* %ptr, i32 %data) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_movk_i32 s4, 0x1000
; GCN-NEXT: s_mov_b32 s5, 0
; GCN-NEXT: s_mov_b64 s[4:5], 0x1000
; GCN-NEXT: v_mov_b32_e32 v3, s4
; GCN-NEXT: v_mov_b32_e32 v4, s5
; GCN-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
@ -50,8 +49,7 @@ define void @global_atomic_csub_offset_nortn(i32 addrspace(1)* %ptr, i32 %data)
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_movk_i32 s4, 0x1000
; GCN-NEXT: s_mov_b32 s5, 0
; GCN-NEXT: s_mov_b64 s[4:5], 0x1000
; GCN-NEXT: v_mov_b32_e32 v3, s4
; GCN-NEXT: v_mov_b32_e32 v4, s5
; GCN-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3

View File

@ -336,8 +336,7 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset_offset256(i32 addrspace(
define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset256_offset(i32 addrspace(1)* %ptr, i32 inreg %soffset) {
; GFX6-LABEL: mubuf_store_vgpr_ptr_sgpr_offset256_offset:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_movk_i32 s4, 0x400
; GFX6-NEXT: s_mov_b32 s5, 0
; GFX6-NEXT: s_mov_b64 s[4:5], 0x400
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
@ -352,8 +351,7 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset256_offset(i32 addrspace(
;
; GFX7-LABEL: mubuf_store_vgpr_ptr_sgpr_offset256_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_movk_i32 s4, 0x400
; GFX7-NEXT: s_mov_b32 s5, 0
; GFX7-NEXT: s_mov_b64 s[4:5], 0x400
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
@ -792,8 +790,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset_offset256(float addrspac
define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(float addrspace(1)* %ptr, i32 inreg %soffset) {
; GFX6-LABEL: mubuf_load_vgpr_ptr_sgpr_offset256_offset:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_movk_i32 s4, 0x400
; GFX6-NEXT: s_mov_b32 s5, 0
; GFX6-NEXT: s_mov_b64 s[4:5], 0x400
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
; GFX6-NEXT: v_mov_b32_e32 v3, s5
@ -808,8 +805,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(float addrspac
;
; GFX7-LABEL: mubuf_load_vgpr_ptr_sgpr_offset256_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_movk_i32 s4, 0x400
; GFX7-NEXT: s_mov_b32 s5, 0
; GFX7-NEXT: s_mov_b64 s[4:5], 0x400
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
; GFX7-NEXT: v_mov_b32_e32 v3, s5

View File

@ -568,8 +568,8 @@ define double @v_roundeven_f64_fneg(double %x) {
; GFX6-NEXT: v_mov_b32_e32 v3, 0
; GFX6-NEXT: v_or_b32_e32 v4, 0x43300000, v4
; GFX6-NEXT: v_add_f64 v[5:6], -v[0:1], v[3:4]
; GFX6-NEXT: v_mov_b32_e32 v1, v0
; GFX6-NEXT: s_mov_b32 s4, -1
; GFX6-NEXT: v_mov_b32_e32 v1, v0
; GFX6-NEXT: s_mov_b32 s5, 0x432fffff
; GFX6-NEXT: v_add_f64 v[3:4], v[5:6], -v[3:4]
; GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[1:2]|, s[4:5]

View File

@ -2509,8 +2509,7 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-LABEL: v_sdiv_i64_pow2_shl_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_movk_i32 s4, 0x1000
; CHECK-NEXT: s_mov_b32 s5, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000
; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_or_b32_e32 v3, v1, v5
@ -2703,8 +2702,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-LABEL: v_sdiv_v2i64_pow2_shl_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_movk_i32 s6, 0x1000
; GISEL-NEXT: s_mov_b32 s7, 0
; GISEL-NEXT: s_mov_b64 s[6:7], 0x1000
; GISEL-NEXT: v_lshl_b64 v[4:5], s[6:7], v4
; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v1
; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v5
@ -2996,8 +2994,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-LABEL: v_sdiv_v2i64_pow2_shl_denom:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: s_movk_i32 s4, 0x1000
; CGP-NEXT: s_mov_b32 s5, 0
; CGP-NEXT: s_mov_b64 s[4:5], 0x1000
; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4
; CGP-NEXT: v_mov_b32_e32 v7, v1
; CGP-NEXT: v_mov_b32_e32 v5, v0

View File

@ -2473,8 +2473,7 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-LABEL: v_srem_i64_pow2_shl_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_movk_i32 s4, 0x1000
; CHECK-NEXT: s_mov_b32 s5, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000
; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_or_b32_e32 v3, v1, v5
@ -2663,8 +2662,7 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-LABEL: v_srem_v2i64_pow2_shl_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_movk_i32 s6, 0x1000
; GISEL-NEXT: s_mov_b32 s7, 0
; GISEL-NEXT: s_mov_b64 s[6:7], 0x1000
; GISEL-NEXT: v_lshl_b64 v[4:5], s[6:7], v4
; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1
; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v5
@ -2952,8 +2950,7 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-LABEL: v_srem_v2i64_pow2_shl_denom:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: s_movk_i32 s4, 0x1000
; CGP-NEXT: s_mov_b32 s5, 0
; CGP-NEXT: s_mov_b64 s[4:5], 0x1000
; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4
; CGP-NEXT: v_mov_b32_e32 v7, v1
; CGP-NEXT: v_mov_b32_e32 v5, v0

View File

@ -2291,8 +2291,7 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-LABEL: v_udiv_i64_pow2_shl_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_movk_i32 s4, 0x1000
; CHECK-NEXT: s_mov_b32 s5, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000
; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2
; CHECK-NEXT: v_or_b32_e32 v3, v1, v5
; CHECK-NEXT: v_mov_b32_e32 v2, 0
@ -2470,8 +2469,7 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-LABEL: v_udiv_v2i64_pow2_shl_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_movk_i32 s4, 0x1000
; GISEL-NEXT: s_mov_b32 s5, 0
; GISEL-NEXT: s_mov_b64 s[4:5], 0x1000
; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v4
; GISEL-NEXT: v_lshl_b64 v[6:7], s[4:5], v6
; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4
@ -2735,8 +2733,7 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: v_mov_b32_e32 v5, v0
; CGP-NEXT: v_mov_b32_e32 v7, v1
; CGP-NEXT: s_movk_i32 s4, 0x1000
; CGP-NEXT: s_mov_b32 s5, 0
; CGP-NEXT: s_mov_b64 s[4:5], 0x1000
; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4
; CGP-NEXT: v_lshl_b64 v[8:9], s[4:5], v6
; CGP-NEXT: v_or_b32_e32 v1, v7, v11

View File

@ -1651,8 +1651,7 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-LABEL: v_urem_i64_pow2_shl_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_movk_i32 s4, 0x1000
; CHECK-NEXT: s_mov_b32 s5, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000
; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2
; CHECK-NEXT: v_or_b32_e32 v3, v1, v5
; CHECK-NEXT: v_mov_b32_e32 v2, 0
@ -1827,8 +1826,7 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-LABEL: v_urem_v2i64_pow2_shl_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_movk_i32 s4, 0x1000
; GISEL-NEXT: s_mov_b32 s5, 0
; GISEL-NEXT: s_mov_b64 s[4:5], 0x1000
; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v4
; GISEL-NEXT: v_lshl_b64 v[6:7], s[4:5], v6
; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4
@ -2090,8 +2088,7 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: v_mov_b32_e32 v5, v0
; CGP-NEXT: v_mov_b32_e32 v7, v1
; CGP-NEXT: s_movk_i32 s4, 0x1000
; CGP-NEXT: s_mov_b32 s5, 0
; CGP-NEXT: s_mov_b64 s[4:5], 0x1000
; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4
; CGP-NEXT: v_lshl_b64 v[8:9], s[4:5], v6
; CGP-NEXT: v_or_b32_e32 v1, v7, v11

View File

@ -67,6 +67,7 @@ define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
@ -157,6 +158,7 @@ define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: urem_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
@ -268,6 +270,7 @@ define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
@ -383,6 +386,7 @@ define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
@ -464,6 +468,7 @@ define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
@ -534,6 +539,7 @@ define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: urem_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
@ -612,6 +618,7 @@ define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -696,6 +703,7 @@ define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
@ -768,6 +776,7 @@ define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
@ -835,6 +844,7 @@ define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: urem_i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
@ -912,6 +922,7 @@ define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -997,6 +1008,7 @@ define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
@ -1249,6 +1261,7 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_v4i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
@ -1542,6 +1555,7 @@ define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: urem_v4i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
@ -1915,6 +1929,7 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v3
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v4i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
@ -2308,6 +2323,7 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s5, v3
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_v4i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
@ -2567,6 +2583,7 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -2791,6 +2808,7 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: urem_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -3043,6 +3061,7 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -3322,6 +3341,7 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -3455,6 +3475,7 @@ define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
; GFX6-NEXT: v_and_b32_e32 v0, 7, v0
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_i3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -3528,6 +3549,7 @@ define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
; GFX6-NEXT: v_and_b32_e32 v0, 7, v0
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: urem_i3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
@ -3609,6 +3631,7 @@ define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
; GFX6-NEXT: v_and_b32_e32 v0, 7, v0
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_i3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -3696,6 +3719,7 @@ define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
; GFX6-NEXT: v_and_b32_e32 v0, 7, v0
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_i3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
@ -3843,6 +3867,7 @@ define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -4023,6 +4048,7 @@ define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: urem_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -4219,6 +4245,7 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -4438,6 +4465,7 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
@ -4631,6 +4659,7 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1
; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_v3i15:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -4827,6 +4856,7 @@ define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1
; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: urem_v3i15:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -5043,6 +5073,7 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1
; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v3i15:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -5285,6 +5316,7 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1
; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_v3i15:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -5394,6 +5426,7 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_i32_oddk_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -5430,6 +5463,7 @@ define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_i32_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -5464,6 +5498,7 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_i32_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -5505,6 +5540,7 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_v2i32_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -5550,6 +5586,7 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)*
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -5694,6 +5731,7 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_v2i32_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
@ -5776,6 +5814,7 @@ define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: urem_i32_oddk_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -5814,6 +5853,7 @@ define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: urem_i32_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -5849,6 +5889,7 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: urem_i32_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -5892,6 +5933,7 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: urem_v2i32_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -6025,6 +6067,7 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: urem_v2i32_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
@ -6100,6 +6143,7 @@ define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_i32_oddk_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -6139,6 +6183,7 @@ define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_i32_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -6202,6 +6247,7 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_i32_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
@ -6275,6 +6321,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v2i32_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -6329,6 +6376,7 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)*
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -6511,6 +6559,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
@ -6610,6 +6659,7 @@ define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_i32_oddk_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -6652,6 +6702,7 @@ define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_i32_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -6713,6 +6764,7 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_i32_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
@ -6787,6 +6839,7 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_v2i32_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -6960,6 +7013,7 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_v2i32_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -7159,6 +7213,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_i64_oddk_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f176a73
@ -7303,6 +7358,7 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_i64_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@ -7340,6 +7396,7 @@ define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
@ -7384,6 +7441,7 @@ define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_v2i64_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -7525,6 +7583,7 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000
@ -7672,6 +7731,7 @@ define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_v2i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -7824,6 +7884,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: urem_i64_oddk_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f1761f8
@ -7967,6 +8028,7 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: urem_i64_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@ -7997,8 +8059,7 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s0, s4
; GFX6-NEXT: s_mov_b32 s1, s5
; GFX6-NEXT: s_mov_b32 s5, 0
; GFX6-NEXT: s_movk_i32 s4, 0x1000
; GFX6-NEXT: s_mov_b64 s[4:5], 0x1000
; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
; GFX6-NEXT: s_add_u32 s4, s4, -1
; GFX6-NEXT: s_addc_u32 s5, s5, -1
@ -8007,12 +8068,12 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: urem_i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: s_movk_i32 s0, 0x1000
; GFX9-NEXT: s_mov_b64 s[0:1], 0x1000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
@ -8056,6 +8117,7 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
; GFX6-NEXT: v_mov_b32_e32 v3, v1
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: urem_v2i64_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -8094,8 +8156,7 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11
; GFX6-NEXT: s_mov_b32 s13, 0
; GFX6-NEXT: s_movk_i32 s12, 0x1000
; GFX6-NEXT: s_mov_b64 s[12:13], 0x1000
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@ -8113,13 +8174,13 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: urem_v2i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: s_movk_i32 s0, 0x1000
; GFX9-NEXT: s_mov_b64 s[0:1], 0x1000
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], s10
@ -8267,6 +8328,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_i64_oddk_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000
@ -8410,6 +8472,7 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_i64_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@ -8439,10 +8502,10 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
; GFX6-LABEL: sdiv_i64_pow2_shl_denom:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd
; GFX6-NEXT: s_mov_b32 s3, 0
; GFX6-NEXT: s_movk_i32 s2, 0x1000
; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
; GFX6-NEXT: s_ashr_i32 s12, s3, 31
@ -8458,7 +8521,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: s_mov_b32 s15, s14
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
@ -8576,11 +8638,11 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: s_movk_i32 s2, 0x1000
; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
@ -8753,6 +8815,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v2i64_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -8917,6 +8980,7 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x457ff000
@ -9064,10 +9128,10 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11
; GFX6-NEXT: s_mov_b32 s3, 0
; GFX6-NEXT: s_movk_i32 s2, 0x1000
; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000
; GFX6-NEXT: s_mov_b32 s18, 0x4f800000
; GFX6-NEXT: s_mov_b32 s19, 0x5f7ffffc
; GFX6-NEXT: s_mov_b32 s20, 0x2f800000
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b64 s[12:13], s[2:3], s6
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
@ -9078,12 +9142,11 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX6-NEXT: s_xor_b64 s[14:15], s[2:3], s[16:17]
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s14
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s15
; GFX6-NEXT: s_mov_b32 s20, 0x2f800000
; GFX6-NEXT: s_mov_b32 s21, 0xcf800000
; GFX6-NEXT: s_sub_u32 s6, 0, s14
; GFX6-NEXT: s_subb_u32 s7, 0, s15
; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: s_subb_u32 s7, 0, s15
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
; GFX6-NEXT: v_mul_f32_e32 v0, s19, v0
@ -9330,13 +9393,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44
; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: s_movk_i32 s2, 0x1000
; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000
; GFX9-NEXT: s_mov_b32 s18, 0x4f800000
; GFX9-NEXT: s_mov_b32 s19, 0x5f7ffffc
; GFX9-NEXT: s_mov_b32 s20, 0x2f800000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s6
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
@ -9347,12 +9411,11 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX9-NEXT: s_xor_b64 s[10:11], s[2:3], s[12:13]
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11
; GFX9-NEXT: s_mov_b32 s20, 0x2f800000
; GFX9-NEXT: s_mov_b32 s21, 0xcf800000
; GFX9-NEXT: s_sub_u32 s14, 0, s10
; GFX9-NEXT: s_subb_u32 s4, 0, s11
; GFX9-NEXT: v_mac_f32_e32 v0, s18, v1
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: s_subb_u32 s4, 0, s11
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: v_mul_f32_e32 v0, s19, v0
; GFX9-NEXT: v_mul_f32_e32 v1, s20, v0
@ -9727,6 +9790,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_i64_oddk_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000
@ -9870,6 +9934,7 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_i64_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@ -9901,10 +9966,10 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
; GFX6-LABEL: srem_i64_pow2_shl_denom:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd
; GFX6-NEXT: s_mov_b32 s3, 0
; GFX6-NEXT: s_movk_i32 s2, 0x1000
; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
; GFX6-NEXT: s_ashr_i32 s4, s3, 31
@ -9920,15 +9985,14 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: s_mov_b32 s15, s14
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_mov_b32 s4, s8
; GFX6-NEXT: s_mov_b32 s5, s9
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: s_mov_b32 s5, s9
; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0
; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1
; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0
@ -10036,11 +10100,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: s_movk_i32 s2, 0x1000
; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
@ -10216,6 +10280,7 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_v2i64_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@ -10265,10 +10330,10 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX6-LABEL: srem_v2i64_pow2_shl_denom:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11
; GFX6-NEXT: s_mov_b32 s3, 0
; GFX6-NEXT: s_movk_i32 s2, 0x1000
; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000
; GFX6-NEXT: s_mov_b32 s18, 0x4f800000
; GFX6-NEXT: s_mov_b32 s19, 0x5f7ffffc
; GFX6-NEXT: s_mov_b32 s20, 0x2f800000
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b64 s[14:15], s[2:3], s6
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
@ -10279,12 +10344,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX6-NEXT: s_xor_b64 s[16:17], s[2:3], s[4:5]
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s16
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s17
; GFX6-NEXT: s_mov_b32 s20, 0x2f800000
; GFX6-NEXT: s_mov_b32 s21, 0xcf800000
; GFX6-NEXT: s_sub_u32 s6, 0, s16
; GFX6-NEXT: s_subb_u32 s7, 0, s17
; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: s_subb_u32 s7, 0, s17
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
; GFX6-NEXT: v_mul_f32_e32 v0, s19, v0
@ -10527,13 +10591,14 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_v2i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44
; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: s_movk_i32 s2, 0x1000
; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000
; GFX9-NEXT: s_mov_b32 s16, 0x4f800000
; GFX9-NEXT: s_mov_b32 s17, 0x5f7ffffc
; GFX9-NEXT: s_mov_b32 s18, 0x2f800000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[12:13], s[2:3], s6
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
@ -10544,12 +10609,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
; GFX9-NEXT: s_xor_b64 s[14:15], s[2:3], s[4:5]
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s14
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s15
; GFX9-NEXT: s_mov_b32 s18, 0x2f800000
; GFX9-NEXT: s_mov_b32 s19, 0xcf800000
; GFX9-NEXT: s_sub_u32 s4, 0, s14
; GFX9-NEXT: s_subb_u32 s5, 0, s15
; GFX9-NEXT: v_mac_f32_e32 v0, s16, v1
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: s_subb_u32 s5, 0, s15
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX9-NEXT: v_mul_f32_e32 v0, s17, v0

View File

@ -0,0 +1,98 @@
# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass=liveintervals,amdgpu-pre-ra-optimizations %s -o - | FileCheck -check-prefix=GCN %s
---
# Positive case: sub0 and sub1 of %0 are each written once by an S_MOV_B32
# immediate in the same block (sub0 = 1, sub1 = 2), separated by an S_NOP.
# The checks expect a single S_MOV_B64_IMM_PSEUDO defining all of %0 with
# 8589934593 (0x0000000200000001, i.e. sub1 in the high half and sub0 in the
# low half), matched before the S_NOP.
# GCN-LABEL: name: combine_sreg64_inits
# GCN: %0:sgpr_64 = S_MOV_B64_IMM_PSEUDO 8589934593
# GCN: S_NOP 0
name: combine_sreg64_inits
tracksRegLiveness: true
body: |
bb.0:
undef %0.sub0:sgpr_64 = S_MOV_B32 1
S_NOP 0
%0.sub1:sgpr_64 = S_MOV_B32 2
...
---
# Same as the basic positive case but with the definition order swapped: sub1
# (= 2) is written first and sub0 (= 1) second. The checks expect the same
# combined 64-bit immediate, 8589934593, so the result must not depend on
# which half is defined first.
# GCN-LABEL: name: combine_sreg64_inits_swap
# GCN: %0:sgpr_64 = S_MOV_B64_IMM_PSEUDO 8589934593
# GCN: S_NOP 0
name: combine_sreg64_inits_swap
tracksRegLiveness: true
body: |
bb.0:
undef %0.sub1:sgpr_64 = S_MOV_B32 2
S_NOP 0
%0.sub0:sgpr_64 = S_MOV_B32 1
...
---
# Negative case: the two halves of %0 are initialized in different basic
# blocks (sub0 in bb.0, sub1 in bb.1). The checks expect both original
# S_MOV_B32 instructions to still be present after the pass.
# GCN-LABEL: name: sreg64_inits_different_blocks
# GCN: undef %0.sub0:sgpr_64 = S_MOV_B32 1
# GCN: %0.sub1:sgpr_64 = S_MOV_B32 2
name: sreg64_inits_different_blocks
tracksRegLiveness: true
body: |
bb.0:
undef %0.sub0:sgpr_64 = S_MOV_B32 1
bb.1:
%0.sub1:sgpr_64 = S_MOV_B32 2
...
---
# Negative case: sub1 of %0 is written twice (2, then 3) after sub0 is
# initialized. The checks expect all three S_MOV_B32 instructions to remain
# in their original order.
# GCN-LABEL: name: sreg64_inits_two_defs_sub1
# GCN: undef %0.sub0:sgpr_64 = S_MOV_B32 1
# GCN: %0.sub1:sgpr_64 = S_MOV_B32 2
# GCN: %0.sub1:sgpr_64 = S_MOV_B32 3
name: sreg64_inits_two_defs_sub1
tracksRegLiveness: true
body: |
bb.0:
undef %0.sub0:sgpr_64 = S_MOV_B32 1
%0.sub1:sgpr_64 = S_MOV_B32 2
%0.sub1:sgpr_64 = S_MOV_B32 3
...
---
# Negative case: sub0 of %0 is written twice (1 first, 3 last) with the single
# sub1 write in between. The checks expect all three S_MOV_B32 instructions
# to remain in their original order.
# GCN-LABEL: name: sreg64_inits_two_defs_sub0
# GCN: undef %0.sub0:sgpr_64 = S_MOV_B32 1
# GCN: %0.sub1:sgpr_64 = S_MOV_B32 2
# GCN: %0.sub0:sgpr_64 = S_MOV_B32 3
name: sreg64_inits_two_defs_sub0
tracksRegLiveness: true
body: |
bb.0:
undef %0.sub0:sgpr_64 = S_MOV_B32 1
%0.sub1:sgpr_64 = S_MOV_B32 2
%0.sub0:sgpr_64 = S_MOV_B32 3
...
---
# Negative case: a partial write of sub0 is followed by a write of the whole
# 64-bit register with S_MOV_B64. The checks expect both the S_MOV_B32 and
# the S_MOV_B64 to still be present after the pass.
# NOTE(review): the expected S_MOV_B32 is on %1 while the input body writes
# %0.sub0; presumably the register is renumbered before the output is
# printed -- confirm against the actual llc output.
# GCN-LABEL: name: sreg64_inits_full_def
# GCN: undef %1.sub0:sgpr_64 = S_MOV_B32 1
# GCN: %0:sgpr_64 = S_MOV_B64 3
name: sreg64_inits_full_def
tracksRegLiveness: true
body: |
bb.0:
undef %0.sub0:sgpr_64 = S_MOV_B32 1
%0:sgpr_64 = S_MOV_B64 3
...
---
# Negative case: the S_MOV_B32 that writes sub0 carries an extra implicit use
# of $m0. The checks expect both S_MOV_B32 instructions, including the
# implicit operand, to still be present after the pass.
# GCN-LABEL: name: sreg64_inits_imp_use
# GCN: %0.sub0:sgpr_64 = S_MOV_B32 1, implicit $m0
# GCN: %0.sub1:sgpr_64 = S_MOV_B32 2
name: sreg64_inits_imp_use
tracksRegLiveness: true
body: |
bb.0:
undef %0.sub0:sgpr_64 = S_MOV_B32 1, implicit $m0
%0.sub1:sgpr_64 = S_MOV_B32 2
...
---
# Negative case: the S_MOV_B32 that writes sub0 carries an extra implicit
# definition of $scc. The checks expect both S_MOV_B32 instructions,
# including the implicit-def operand, to still be present after the pass.
# GCN-LABEL: name: sreg64_inits_imp_def
# GCN: %0.sub0:sgpr_64 = S_MOV_B32 1, implicit-def $scc
# GCN: %0.sub1:sgpr_64 = S_MOV_B32 2
name: sreg64_inits_imp_def
tracksRegLiveness: true
body: |
bb.0:
undef %0.sub0:sgpr_64 = S_MOV_B32 1, implicit-def $scc
%0.sub1:sgpr_64 = S_MOV_B32 2
...

View File

@ -844,8 +844,7 @@ define amdgpu_kernel void @store_misaligned64_constant_offsets() {
define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
; CI-LABEL: store_misaligned64_constant_large_offsets:
; CI: ; %bb.0:
; CI-NEXT: s_movk_i32 s0, 0x7b
; CI-NEXT: s_mov_b32 s1, 0
; CI-NEXT: s_mov_b64 s[0:1], 0x7b
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: v_mov_b32_e32 v1, s1
@ -856,8 +855,7 @@ define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
;
; GFX9-LABEL: store_misaligned64_constant_large_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_movk_i32 s0, 0x7b
; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: s_mov_b64 s[0:1], 0x7b
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s1

View File

@ -205,8 +205,7 @@ entry:
; FIXME: Should not have intermediate sgprs
; CHECK-LABEL: {{^}}i64_imm_input_phys_vgpr:
; CHECK-DAG: s_mov_b32 s1, 0
; CHECK-DAG: s_mov_b32 s0, 0x1e240
; CHECK: s_mov_b64 s[0:1], 0x1e240
; CHECK: v_mov_b32_e32 v0, s0
; CHECK: v_mov_b32_e32 v1, s1
; CHECK: use v[0:1]

View File

@ -59,20 +59,17 @@ define amdgpu_kernel void @inline_sreg_constraint_imm_f32() {
ret void
}
; FIXME: Should be able to use s_mov_b64
; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i64:
; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], -4{{$}}
; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], -1{{$}}
; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
; GCN: s_mov_b64 [[REG:s\[[0-9:]+\]]], -4{{$}}
; GCN: ; use [[REG]]
define amdgpu_kernel void @inline_sreg_constraint_imm_i64() {
tail call void asm sideeffect "; use $0", "s"(i64 -4)
ret void
}
; GCN-LABEL: {{^}}inline_sreg_constraint_imm_f64:
; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], 0{{$}}
; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], 0x3ff00000{{$}}
; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
; GCN: s_mov_b64 [[REG:s\[[0-9:]+\]]], 1.0{{$}}
; GCN: ; use [[REG]]
define amdgpu_kernel void @inline_sreg_constraint_imm_f64() {
tail call void asm sideeffect "; use $0", "s"(double 1.0)
ret void

View File

@ -900,12 +900,11 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2
; SI-NEXT: s_load_dword s4, s[4:5], 0x4
; SI-NEXT: s_mov_b32 s5, 0
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s8, s4, 4
; SI-NEXT: s_mov_b32 s4, 0xffff
; SI-NEXT: s_mov_b64 s[4:5], 0xffff
; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
; SI-NEXT: s_mov_b32 s8, 0x50005
; SI-NEXT: s_and_b32 s9, s5, s8
@ -923,12 +922,11 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: s_mov_b32 s5, 0
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s8, s4, 4
; VI-NEXT: s_mov_b32 s4, 0xffff
; VI-NEXT: s_mov_b64 s[4:5], 0xffff
; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
; VI-NEXT: s_mov_b32 s8, 0x50005
; VI-NEXT: s_mov_b32 s9, s8
@ -1075,14 +1073,13 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; SI-NEXT: s_load_dword s6, s[4:5], 0x4
; SI-NEXT: s_mov_b32 s7, 0
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0
; SI-NEXT: s_mov_b32 s0, s8
; SI-NEXT: s_lshl_b32 s8, s6, 3
; SI-NEXT: s_mov_b32 s6, 0xffff
; SI-NEXT: s_mov_b64 s[6:7], 0xffff
; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
; SI-NEXT: s_mov_b32 s8, 0x5050505
; SI-NEXT: s_mov_b32 s1, s9
@ -1100,14 +1097,13 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; VI-NEXT: s_load_dword s6, s[4:5], 0x10
; VI-NEXT: s_mov_b32 s7, 0
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0
; VI-NEXT: s_mov_b32 s0, s8
; VI-NEXT: s_lshl_b32 s8, s6, 3
; VI-NEXT: s_mov_b32 s6, 0xffff
; VI-NEXT: s_mov_b64 s[6:7], 0xffff
; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
; VI-NEXT: s_mov_b32 s8, 0x5050505
; VI-NEXT: s_mov_b32 s1, s9

View File

@ -1584,8 +1584,7 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_mov_b64 s[2:3], 0xffff
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3]
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s6, s6
@ -1607,9 +1606,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_mov_b32 s2, 0xffff
; VI-NEXT: s_mov_b64 s[2:3], 0xffff
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_mov_b32 s3, 0
; VI-NEXT: s_and_b32 s1, s4, s2
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: s_lshl_b32 s0, s1, 16
@ -1635,8 +1633,7 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
; CI-NEXT: flat_load_dword v4, v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: s_mov_b32 s3, 0
; CI-NEXT: s_mov_b32 s2, 0xffff
; CI-NEXT: s_mov_b64 s[2:3], 0xffff
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_lshl_b32 s1, s4, 16
; CI-NEXT: s_and_b32 s4, s4, s2
@ -1672,8 +1669,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: s_pack_ll_b32_b16 s5, s6, s6
; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_mov_b64 s[2:3], 0xffff
; GFX9-NEXT: s_lshl_b32 s4, s7, 4
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
@ -1694,9 +1690,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_mov_b32 s2, 0xffff
; VI-NEXT: s_mov_b64 s[2:3], 0xffff
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_mov_b32 s3, 0
; VI-NEXT: s_lshl_b32 s1, s5, 4
; VI-NEXT: s_and_b32 s4, s4, s2
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
@ -1722,10 +1717,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: s_mov_b32 s2, 0xffff
; CI-NEXT: s_mov_b64 s[2:3], 0xffff
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_and_b32 s6, s4, s2
; CI-NEXT: s_mov_b32 s3, 0
; CI-NEXT: s_lshl_b32 s1, s5, 4
; CI-NEXT: s_lshl_b32 s4, s4, 16
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2

View File

@ -7,7 +7,7 @@
; RUN: llc -O1 -mtriple=amdgcn--amdhsa -disable-verify -debug-pass=Structure < %s 2>&1 \
; RUN: | grep -v 'Verify generated machine code' | FileCheck -check-prefix=GCN-O1 %s
; RUN: llc -O1 -mtriple=amdgcn--amdhsa -disable-verify -amdgpu-scalar-ir-passes -amdgpu-sdwa-peephole \
; RUN: -amdgpu-load-store-vectorizer -debug-pass=Structure < %s 2>&1 \
; RUN: -amdgpu-load-store-vectorizer -amdgpu-enable-pre-ra-optimizations -debug-pass=Structure < %s 2>&1 \
; RUN: | grep -v 'Verify generated machine code' | FileCheck -check-prefix=GCN-O1-OPTS %s
; RUN: llc -O2 -mtriple=amdgcn--amdhsa -disable-verify -debug-pass=Structure < %s 2>&1 \
; RUN: | grep -v 'Verify generated machine code' | FileCheck -check-prefix=GCN-O2 %s
@ -619,6 +619,7 @@
; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
; GCN-O1-OPTS-NEXT: Simple Register Coalescing
; GCN-O1-OPTS-NEXT: Rename Disconnected Subregister Components
; GCN-O1-OPTS-NEXT: AMDGPU Pre-RA optimizations
; GCN-O1-OPTS-NEXT: Machine Instruction Scheduler
; GCN-O1-OPTS-NEXT: MachinePostDominator Tree Construction
; GCN-O1-OPTS-NEXT: SI Whole Quad Mode
@ -899,6 +900,7 @@
; GCN-O2-NEXT: Machine Natural Loop Construction
; GCN-O2-NEXT: Simple Register Coalescing
; GCN-O2-NEXT: Rename Disconnected Subregister Components
; GCN-O2-NEXT: AMDGPU Pre-RA optimizations
; GCN-O2-NEXT: Machine Instruction Scheduler
; GCN-O2-NEXT: MachinePostDominator Tree Construction
; GCN-O2-NEXT: SI Whole Quad Mode
@ -1193,6 +1195,7 @@
; GCN-O3-NEXT: Machine Natural Loop Construction
; GCN-O3-NEXT: Simple Register Coalescing
; GCN-O3-NEXT: Rename Disconnected Subregister Components
; GCN-O3-NEXT: AMDGPU Pre-RA optimizations
; GCN-O3-NEXT: Machine Instruction Scheduler
; GCN-O3-NEXT: MachinePostDominator Tree Construction
; GCN-O3-NEXT: SI Whole Quad Mode

View File

@ -77,9 +77,9 @@ define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspa
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[0:1], s[6:7]
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s0, -1
; SI-NEXT: s_movk_i32 s7, 0xfc01
; SI-NEXT: s_mov_b32 s1, 0xfffff
; SI-NEXT: s_mov_b32 s0, -1
; SI-NEXT: s_brev_b32 s6, -2
; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
; SI-NEXT: s_waitcnt vmcnt(0)

View File

@ -92,9 +92,8 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(<2 x float> addrspace(1)* %a) {
; GCN-LABEL: {{^}}fadd_v2_v_lit_hi0:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; GFX90A-DAG: s_mov_b32 s[[HI:[0-9]+]], 0
; GFX90A-DAG: s_mov_b32 s[[LO:[0-9]+]], 1.0
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s{{\[}}[[LO]]:[[HI]]]{{$}}
; GFX90A-DAG: s_mov_b64 [[K:s\[[0-9:]+\]]], 0x3f800000
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], [[K]]
define amdgpu_kernel void @fadd_v2_v_lit_hi0(<2 x float> addrspace(1)* %a) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id

View File

@ -103,8 +103,7 @@ entry:
; OPT-LABEL: define amdgpu_kernel void @half4_alloca_load4
; GCN-NOT: buffer_
; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0
; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff
; GCN: s_mov_b64 s[{{[0-9:]+}}], 0xffff
; OPT: %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
; OPT: %0 = load <4 x half>, <4 x half> addrspace(5)* %alloca
@ -163,8 +162,7 @@ entry:
; OPT-LABEL: define amdgpu_kernel void @short4_alloca_load4
; GCN-NOT: buffer_
; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0
; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff
; GCN: s_mov_b64 s[{{[0-9:]+}}], 0xffff
; OPT: %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
; OPT: %0 = load <4 x i16>, <4 x i16> addrspace(5)* %alloca

View File

@ -0,0 +1,45 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}test_remat_sgpr:
; GCN-NOT: v_writelane_b32
; GCN: {{^}}[[LOOP:BB[0-9_]+]]:
; GCN-COUNT-6: s_mov_b32 s{{[0-9]+}}, 0x
; GCN-NOT: v_writelane_b32
; GCN: s_cbranch_{{[^ ]+}} [[LOOP]]
; GCN: .sgpr_spill_count: 0
; Kernel with a single self-looping block (bb3). Each iteration loads one
; double from %arg, runs it through an fadd and a chain of eleven fma calls
; that each take a different 64-bit floating-point literal, and stores a
; result to %arg1 at the same index.
; NOTE(review): presumably the many distinct 64-bit literals are there to
; create scalar register pressure under the --stress-regalloc=10 RUN lines, so
; that the constants have to be rematerialized inside the loop instead of
; being spilled -- confirm against the commit description.
define amdgpu_kernel void @test_remat_sgpr(double addrspace(1)* %arg, double addrspace(1)* %arg1) {
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb3
bb2: ; preds = %bb3
ret void
bb3: ; preds = %bb3, %bb
; %i4 is the loop counter: 0 on entry from %bb, %i22 (= %i4 + 1) on the back edge.
%i4 = phi i32 [ 0, %bb ], [ %i22, %bb3 ]
; Element index = loop counter + workitem id, zero-extended to i64.
%i5 = add nuw nsw i32 %i4, %i
%i6 = zext i32 %i5 to i64
%i7 = getelementptr inbounds double, double addrspace(1)* %arg, i64 %i6
%i8 = load double, double addrspace(1)* %i7, align 8
; %i9 is reused as the multiplier operand of every fma in the chain below.
%i9 = fadd double %i8, 0x3EFC01997CC9E6B0
; Dependent fma chain: each call feeds the previous result back in as the
; first operand and adds a new literal constant.
%i10 = tail call double @llvm.fma.f64(double %i8, double %i9, double 0x3FBE25E43ABE935A)
%i11 = tail call double @llvm.fma.f64(double %i10, double %i9, double 0x3FC110EF47E6C9C2)
%i12 = tail call double @llvm.fma.f64(double %i11, double %i9, double 0x3FC3B13BCFA74449)
%i13 = tail call double @llvm.fma.f64(double %i12, double %i9, double 0x3FC745D171BF3C30)
%i14 = tail call double @llvm.fma.f64(double %i13, double %i9, double 0x3FCC71C71C7792CE)
%i15 = tail call double @llvm.fma.f64(double %i14, double %i9, double 0x3FD24924924920DA)
%i16 = tail call double @llvm.fma.f64(double %i15, double %i9, double 0x3FD999999999999C)
%i17 = tail call double @llvm.fma.f64(double %i16, double %i9, double 0x3FD899999999899C)
%i18 = tail call double @llvm.fma.f64(double %i17, double %i9, double 0x3FD799999999799C)
%i19 = tail call double @llvm.fma.f64(double %i18, double %i9, double 0x3FD699999999699C)
; NOTE(review): %i20 has no users in this function; the store below writes
; %i19. Confirm this is intentional before changing it, since the expected
; instruction counts in the checks above depend on the current IR.
%i20 = tail call double @llvm.fma.f64(double %i19, double %i9, double 0x3FD599999999599C)
%i21 = getelementptr inbounds double, double addrspace(1)* %arg1, i64 %i6
store double %i19, double addrspace(1)* %i21, align 8
; Advance the counter and leave the loop once it reaches 1024.
%i22 = add nuw nsw i32 %i4, 1
%i23 = icmp eq i32 %i22, 1024
br i1 %i23, label %bb2, label %bb3
}
declare double @llvm.fma.f64(double, double, double)
declare i32 @llvm.amdgcn.workitem.id.x()

View File

@ -1698,7 +1698,7 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5]
; GCN-IR-NEXT: s_mov_b32 s8, 0x8000
; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: v_mov_b32_e32 v6, s8
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5]
@ -1706,7 +1706,6 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GCN-IR-NEXT: v_mov_b32_e32 v3, v2
; GCN-IR-NEXT: s_mov_b32 s9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v7, v11
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
@ -1724,8 +1723,7 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB12_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: s_mov_b32 s5, 0
; GCN-IR-NEXT: s_mov_b32 s4, 0x8000
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
; GCN-IR-NEXT: v_lshr_b64 v[12:13], s[4:5], v8
; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc

View File

@ -146,8 +146,7 @@ define i128 @v_lshr_i128_kv(i128 %rhs) {
; GCN-LABEL: v_lshr_i128_kv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_movk_i32 s4, 0x41
; GCN-NEXT: s_mov_b32 s5, 0
; GCN-NEXT: s_mov_b64 s[4:5], 0x41
; GCN-NEXT: v_lshr_b64 v[1:2], s[4:5], v0
; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0
; GCN-NEXT: v_mov_b32_e32 v3, s4

View File

@ -1248,8 +1248,8 @@ define amdgpu_kernel void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrsp
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_movk_i32 s7, 0x11e
; SI-NEXT: s_mov_b32 s6, 0xab19b207
; SI-NEXT: s_movk_i32 s7, 0x11e
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
@ -1266,8 +1266,8 @@ define amdgpu_kernel void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrsp
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_movk_i32 s1, 0x11e
; VI-NEXT: s_mov_b32 s0, 0xab19b207
; VI-NEXT: s_movk_i32 s1, 0x11e
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
; VI-NEXT: v_mov_b32_e32 v0, s0
@ -1319,8 +1319,7 @@ define amdgpu_kernel void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s7, 0
; SI-NEXT: s_mov_b32 s6, 0x12d687
; SI-NEXT: s_mov_b64 s[6:7], 0x12d687
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
@ -1337,8 +1336,7 @@ define amdgpu_kernel void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_mov_b32 s1, 0
; VI-NEXT: s_mov_b32 s0, 0x12d687
; VI-NEXT: s_mov_b64 s[0:1], 0x12d687
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
; VI-NEXT: v_mov_b32_e32 v0, s0
@ -1927,8 +1925,7 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(i64 addrspace(1)* %out,
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s1, 0
; SI-NEXT: s_mov_b32 s0, 4.0
; SI-NEXT: s_mov_b64 s[0:1], 0x40800000
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
@ -1942,8 +1939,7 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(i64 addrspace(1)* %out,
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x34
; VI-NEXT: s_mov_b32 s1, 0
; VI-NEXT: s_mov_b32 s0, 4.0
; VI-NEXT: s_mov_b64 s[0:1], 0x40800000
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
@ -2039,8 +2035,8 @@ define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(i64 addrspace(1)* %
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s1, 4.0
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_mov_b32 s1, 4.0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
@ -2054,8 +2050,8 @@ define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(i64 addrspace(1)* %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x34
; VI-NEXT: s_mov_b32 s1, 4.0
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_mov_b32 s1, 4.0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
@ -2090,8 +2086,8 @@ define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(i64 addrspace(1
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s1, -4.0
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_mov_b32 s1, -4.0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
@ -2105,8 +2101,8 @@ define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(i64 addrspace(1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x34
; VI-NEXT: s_mov_b32 s1, -4.0
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_mov_b32 s1, -4.0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)

View File

@ -589,13 +589,13 @@ endif:
; GCN-LABEL: {{^}}br_scc_eq_i64_simm16:
; VI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x4d2
; VI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0
; VI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 1
; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
; SI: v_cmp_eq_u64_e32
define amdgpu_kernel void @br_scc_eq_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp eq i64 %cond, 1234
%cmp0 = icmp eq i64 %cond, 4294968530
br i1 %cmp0, label %endif, label %if
if:
@ -627,13 +627,13 @@ endif:
; GCN-LABEL: {{^}}br_scc_ne_i64_simm16:
; VI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x4d2
; VI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0
; VI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 1
; VI: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
; SI: v_cmp_ne_u64_e32
define amdgpu_kernel void @br_scc_ne_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp ne i64 %cond, 1234
%cmp0 = icmp ne i64 %cond, 4294968530
br i1 %cmp0, label %endif, label %if
if:

View File

@ -1876,14 +1876,13 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3]
; GCN-IR-NEXT: s_mov_b32 s8, 0x8000
; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: v_mov_b32_e32 v4, s8
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3]
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GCN-IR-NEXT: s_mov_b32 s9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, v9
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
@ -1901,8 +1900,7 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB12_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: s_mov_b32 s5, 0
; GCN-IR-NEXT: s_mov_b32 s4, 0x8000
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v6
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc

View File

@ -1249,14 +1249,13 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5]
; GCN-IR-NEXT: s_mov_b32 s8, 0x8000
; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: v_mov_b32_e32 v2, s8
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5]
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GCN-IR-NEXT: s_mov_b32 s9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v3, v9
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
@ -1274,8 +1273,7 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB9_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: s_mov_b32 s5, 0
; GCN-IR-NEXT: s_mov_b32 s4, 0x8000
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v6
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc

View File

@ -1269,14 +1269,13 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3]
; GCN-IR-NEXT: s_mov_b32 s8, 0x8000
; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: v_mov_b32_e32 v4, s8
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3]
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GCN-IR-NEXT: s_mov_b32 s9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, v9
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
@ -1294,8 +1293,7 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB8_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: s_mov_b32 s5, 0
; GCN-IR-NEXT: s_mov_b32 s4, 0x8000
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v6
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc

View File

@ -807,8 +807,8 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; GFX9-O3-NEXT: buffer_load_dwordx4 v[1:4], v0, s[4:7], 0 offen
; GFX9-O3-NEXT: buffer_load_dwordx2 v[5:6], v0, s[4:7], 0 offen offset:16
; GFX9-O3-NEXT: s_brev_b32 s9, -2
; GFX9-O3-NEXT: s_mov_b32 s8, -1
; GFX9-O3-NEXT: s_brev_b32 s9, -2
; GFX9-O3-NEXT: s_not_b64 exec, exec
; GFX9-O3-NEXT: s_waitcnt vmcnt(1)
; GFX9-O3-NEXT: v_mov_b32_e32 v1, s8