[AMDGPU] Add S_MOV_B64_IMM_PSEUDO for wide constants
This is to allow 64-bit constant rematerialization. If a constant is split into two separate moves initializing sub0 and sub1, as happens now, RA cannot rematerialize the 64-bit register. This gives a 10-20% uplift in a set of huge apps that make heavy use of double precision math.

Fixes: SWDEV-292645

Differential Revision: https://reviews.llvm.org/D104874
Parent: 822b92aae4
Commit: 381ded345b
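At the ISA level, constants that previously needed a pair of 32-bit scalar moves into the two halves of an SGPR pair are now emitted as a single 64-bit move when the value fits in a 32-bit literal, as the updated tests further below show. For example:

    ; before
    s_movk_i32 s4, 0x1000
    s_mov_b32 s5, 0
    ; after
    s_mov_b64 s[4:5], 0x1000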
@@ -75,6 +75,7 @@ FunctionPass *createAMDGPURewriteOutArgumentsPass();
 ModulePass *createAMDGPUReplaceLDSUseWithPointerPass();
 ModulePass *createAMDGPULowerModuleLDSPass();
 FunctionPass *createSIModeRegisterPass();
+FunctionPass *createGCNPreRAOptimizationsPass();

 struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
   AMDGPUSimplifyLibCallsPass(TargetMachine &TM) : TM(TM) {}
@@ -348,6 +349,9 @@ extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;
 void initializeGCNNSAReassignPass(PassRegistry &);
 extern char &GCNNSAReassignID;

+void initializeGCNPreRAOptimizationsPass(PassRegistry &);
+extern char &GCNPreRAOptimizationsID;
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
@@ -208,6 +208,11 @@ static cl::opt<bool, true> EnableLowerModuleLDS(
     cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
     cl::Hidden);

+static cl::opt<bool> EnablePreRAOptimizations(
+    "amdgpu-enable-pre-ra-optimizations",
+    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
+    cl::Hidden);
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -275,6 +280,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUSimplifyLibCallsPass(*PR);
   initializeAMDGPUPrintfRuntimeBindingPass(*PR);
   initializeGCNNSAReassignPass(*PR);
+  initializeGCNPreRAOptimizationsPass(*PR);
 }

 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -1191,6 +1197,11 @@ void GCNPassConfig::addOptimizedRegAlloc() {
   if (OptExecMaskPreRA)
     insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

+  if (EnablePreRAOptimizations.getNumOccurrences()
+          ? EnablePreRAOptimizations
+          : TM->getOptLevel() > CodeGenOpt::Less)
+    insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);
+
   // This is not an essential optimization and it has a noticeable impact on
   // compilation time, so we only enable it from O2.
   if (TM->getOptLevel() > CodeGenOpt::Less)
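With the flag unset, the new pass runs only above -O1 (TM->getOptLevel() > CodeGenOpt::Less); passing the option explicitly overrides that in either direction. A hypothetical llc invocation to turn it off for comparison (the flag name comes from the cl::opt above, the rest of the command line is illustrative):

    llc -march=amdgcn -mcpu=gfx900 -O2 -amdgpu-enable-pre-ra-optimizations=0 input.ll -o out.s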
@@ -143,6 +143,7 @@ add_llvm_target(AMDGPUCodeGen
   GCNILPSched.cpp
   GCNNSAReassign.cpp
   GCNDPPCombine.cpp
+  GCNPreRAOptimizations.cpp
   SIModeRegister.cpp

 LINK_COMPONENTS
@@ -0,0 +1,162 @@ (new file: GCNPreRAOptimizations.cpp)
//===-- GCNPreRAOptimizations.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass combines split register tuple initialization into a single pseudo:
///
///   undef %0.sub1:sreg_64 = S_MOV_B32 1
///   %0.sub0:sreg_64 = S_MOV_B32 2
/// =>
///   %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 0x100000002
///
/// This is to allow rematerialization of a value instead of spilling. It is
/// supposed to be done after register coalescer to allow it to do its job and
/// before actual register allocation to allow rematerialization.
///
/// Right now the pass only handles 64 bit SGPRs with immediate initializers,
/// although the same shall be possible with other register classes and
/// instructions if necessary.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-pre-ra-optimizations"

namespace {

class GCNPreRAOptimizations : public MachineFunctionPass {
private:
  const SIInstrInfo *TII;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;

  bool processReg(Register Reg);

public:
  static char ID;

  GCNPreRAOptimizations() : MachineFunctionPass(ID) {
    initializeGCNPreRAOptimizationsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AMDGPU Pre-RA optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(GCNPreRAOptimizations, DEBUG_TYPE,
                      "AMDGPU Pre-RA optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(GCNPreRAOptimizations, DEBUG_TYPE, "Pre-RA optimizations",
                    false, false)

char GCNPreRAOptimizations::ID = 0;

char &llvm::GCNPreRAOptimizationsID = GCNPreRAOptimizations::ID;

FunctionPass *llvm::createGCNPreRAOptimizationsPass() {
  return new GCNPreRAOptimizations();
}

bool GCNPreRAOptimizations::processReg(Register Reg) {
  MachineInstr *Def0 = nullptr;
  MachineInstr *Def1 = nullptr;
  uint64_t Init = 0;

  for (MachineInstr &I : MRI->def_instructions(Reg)) {
    if (I.getOpcode() != AMDGPU::S_MOV_B32 || I.getOperand(0).getReg() != Reg ||
        !I.getOperand(1).isImm() || I.getNumOperands() != 2)
      return false;

    switch (I.getOperand(0).getSubReg()) {
    default:
      return false;
    case AMDGPU::sub0:
      if (Def0)
        return false;
      Def0 = &I;
      Init |= I.getOperand(1).getImm() & 0xffffffff;
      break;
    case AMDGPU::sub1:
      if (Def1)
        return false;
      Def1 = &I;
      Init |= static_cast<uint64_t>(I.getOperand(1).getImm()) << 32;
      break;
    }
  }

  if (!Def0 || !Def1 || Def0->getParent() != Def1->getParent())
    return false;

  LLVM_DEBUG(dbgs() << "Combining:\n " << *Def0 << " " << *Def1
                    << " =>\n");

  if (SlotIndex::isEarlierInstr(LIS->getInstructionIndex(*Def1),
                                LIS->getInstructionIndex(*Def0)))
    std::swap(Def0, Def1);

  LIS->RemoveMachineInstrFromMaps(*Def0);
  LIS->RemoveMachineInstrFromMaps(*Def1);
  auto NewI = BuildMI(*Def0->getParent(), *Def0, Def0->getDebugLoc(),
                      TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), Reg)
                  .addImm(Init);

  Def0->eraseFromParent();
  Def1->eraseFromParent();
  LIS->InsertMachineInstrInMaps(*NewI);
  LIS->removeInterval(Reg);
  LIS->createAndComputeVirtRegInterval(Reg);

  LLVM_DEBUG(dbgs() << " " << *NewI);

  return true;
}

bool GCNPreRAOptimizations::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  bool Changed = false;

  for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
    Register Reg = Register::index2VirtReg(I);
    if (!LIS->hasInterval(Reg))
      continue;
    const TargetRegisterClass *RC = MRI->getRegClass(Reg);
    if (RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC))
      continue;
    Changed |= processReg(Reg);
  }

  return Changed;
}
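Since the pass is registered under its DEBUG_TYPE, it should be runnable in isolation on MIR for testing; a sketch (the input file name and -mcpu choice are placeholders):

    llc -march=amdgcn -mcpu=gfx900 -run-pass=amdgpu-pre-ra-optimizations -verify-machineinstrs -o - input.mir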
@@ -1765,6 +1765,30 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     expandMovDPP64(MI);
     break;
   }
+  case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
+    const MachineOperand &SrcOp = MI.getOperand(1);
+    assert(!SrcOp.isFPImm());
+    APInt Imm(64, SrcOp.getImm());
+    if (Imm.isIntN(32) || isInlineConstant(Imm)) {
+      MI.setDesc(get(AMDGPU::S_MOV_B64));
+      break;
+    }
+
+    Register Dst = MI.getOperand(0).getReg();
+    Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
+    Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
+
+    APInt Lo(32, Imm.getLoBits(32).getZExtValue());
+    APInt Hi(32, Imm.getHiBits(32).getZExtValue());
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
+        .addImm(Lo.getSExtValue())
+        .addReg(Dst, RegState::Implicit | RegState::Define);
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
+        .addImm(Hi.getSExtValue())
+        .addReg(Dst, RegState::Implicit | RegState::Define);
+    MI.eraseFromParent();
+    break;
+  }
   case AMDGPU::V_SET_INACTIVE_B32: {
     unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
     unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
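A minimal standalone model (plain C++, not the LLVM API) of the decision made in the expansion above: an immediate that fits in 32 bits stays a single s_mov_b64, anything else is split into two s_mov_b32 writes to sub0/sub1. The inline-constant shortcut is omitted for brevity.

    #include <cstdint>
    #include <cstdio>

    int main() {
      // 0x100000002 = sub1 (high) == 1, sub0 (low) == 2, as in the pass header example.
      const uint64_t Imm = 0x100000002ull;
      if (Imm <= 0xffffffffull) {
        // Fits in a 32-bit literal: the pseudo is simply retyped to s_mov_b64.
        std::printf("s_mov_b64 s[0:1], 0x%llx\n", static_cast<unsigned long long>(Imm));
      } else {
        // Otherwise split into two 32-bit halves, one per subregister.
        std::printf("s_mov_b32 s0, 0x%x ; low half (sub0)\n",
                    static_cast<uint32_t>(Imm));
        std::printf("s_mov_b32 s1, 0x%x ; high half (sub1)\n",
                    static_cast<uint32_t>(Imm >> 32));
      }
      return 0;
    }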
@@ -111,6 +111,18 @@ def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64> {
   let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete.
 }

+// 64-bit scalar move immediate instruction. This is used to avoid subregs
+// initialization and allow rematerialization.
+def S_MOV_B64_IMM_PSEUDO : SPseudoInstSI <(outs SReg_64:$sdst),
+                                          (ins i64imm:$src0)> {
+  let isReMaterializable = 1;
+  let isAsCheapAsAMove = 1;
+  let isMoveImm = 1;
+  let SchedRW = [WriteSALU, Write64Bit];
+  let Size = 16; // Needs maximum 2 s_mov_b32 instructions 8 byte long each.
+  let Uses = [];
+}
+
 // Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
 // WQM pass processes it.
 def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
@ -1097,11 +1097,11 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
|
|||
; SI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v3
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
|
||||
; SI-NEXT: s_mov_b32 s4, 0
|
||||
; SI-NEXT: v_and_b32_e32 v3, s6, v3
|
||||
; SI-NEXT: s_movk_i32 s5, 0x80
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
|
||||
; SI-NEXT: s_mov_b32 s4, 0
|
||||
; SI-NEXT: s_movk_i32 s5, 0x80
|
||||
; SI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; SI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
|
||||
; SI-NEXT: v_and_b32_e32 v1, 1, v0
|
||||
|
@ -1129,11 +1129,11 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
|
|||
; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
||||
; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v3
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
|
||||
; VI-NEXT: s_mov_b32 s4, 0
|
||||
; VI-NEXT: v_and_b32_e32 v3, s6, v3
|
||||
; VI-NEXT: s_movk_i32 s5, 0x80
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
|
||||
; VI-NEXT: s_mov_b32 s4, 0
|
||||
; VI-NEXT: s_movk_i32 s5, 0x80
|
||||
; VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
|
||||
; VI-NEXT: v_and_b32_e32 v1, 1, v0
|
||||
|
@ -1165,10 +1165,10 @@ define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) {
|
|||
; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v3
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
|
||||
; SI-NEXT: v_and_b32_e32 v3, s4, v3
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
|
||||
; SI-NEXT: s_mov_b32 s4, 0
|
||||
; SI-NEXT: s_movk_i32 s5, 0x80
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
|
||||
; SI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; SI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
|
||||
; SI-NEXT: v_and_b32_e32 v1, 1, v0
|
||||
|
@ -1195,10 +1195,10 @@ define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) {
|
|||
; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v3
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
|
||||
; VI-NEXT: v_and_b32_e32 v3, s4, v3
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
|
||||
; VI-NEXT: s_mov_b32 s4, 0
|
||||
; VI-NEXT: s_movk_i32 s5, 0x80
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
|
||||
; VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
|
||||
; VI-NEXT: v_and_b32_e32 v1, 1, v0
|
||||
|
|
|
@ -2751,9 +2751,9 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
|
|||
; GPRIDX-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
|
||||
; GPRIDX-NEXT: s_load_dword s8, s[4:5], 0x8
|
||||
; GPRIDX-NEXT: s_mov_b32 s0, 0
|
||||
; GPRIDX-NEXT: s_mov_b32 s1, 0x40140000
|
||||
; GPRIDX-NEXT: s_mov_b32 s3, 0x40080000
|
||||
; GPRIDX-NEXT: s_mov_b32 s2, s0
|
||||
; GPRIDX-NEXT: s_mov_b32 s1, 0x40140000
|
||||
; GPRIDX-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GPRIDX-NEXT: s_cmp_eq_u32 s8, 1
|
||||
; GPRIDX-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
|
||||
|
@ -2842,9 +2842,9 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
|
|||
; MOVREL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
|
||||
; MOVREL-NEXT: s_load_dword s8, s[4:5], 0x8
|
||||
; MOVREL-NEXT: s_mov_b32 s0, 0
|
||||
; MOVREL-NEXT: s_mov_b32 s1, 0x40140000
|
||||
; MOVREL-NEXT: s_mov_b32 s3, 0x40080000
|
||||
; MOVREL-NEXT: s_mov_b32 s2, s0
|
||||
; MOVREL-NEXT: s_mov_b32 s1, 0x40140000
|
||||
; MOVREL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; MOVREL-NEXT: s_cmp_eq_u32 s8, 1
|
||||
; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
|
||||
|
@ -2935,9 +2935,9 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
|
|||
; GFX10-NEXT: s_load_dword s8, s[4:5], 0x8
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-NEXT: s_mov_b32 s2, 0
|
||||
; GFX10-NEXT: s_mov_b32 s3, 0x40140000
|
||||
; GFX10-NEXT: s_mov_b32 s5, 0x40080000
|
||||
; GFX10-NEXT: s_mov_b32 s4, s2
|
||||
; GFX10-NEXT: s_mov_b32 s3, 0x40140000
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_cmp_eq_u32 s8, 1
|
||||
|
@ -3837,21 +3837,21 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(double addrspace(1)* %out, i3
|
|||
; GPRIDX-NEXT: runtime_loader_kernel_symbol = 0
|
||||
; GPRIDX-NEXT: .end_amd_kernel_code_t
|
||||
; GPRIDX-NEXT: ; %bb.0: ; %entry
|
||||
; GPRIDX-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
|
||||
; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GPRIDX-NEXT: s_load_dword s6, s[4:5], 0x8
|
||||
; GPRIDX-NEXT: s_mov_b32 s0, 0
|
||||
; GPRIDX-NEXT: s_mov_b32 s1, 0x40080000
|
||||
; GPRIDX-NEXT: s_mov_b32 s2, 0
|
||||
; GPRIDX-NEXT: s_mov_b32 s3, 0x40080000
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GPRIDX-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GPRIDX-NEXT: s_cmp_eq_u32 s6, 1
|
||||
; GPRIDX-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
|
||||
; GPRIDX-NEXT: s_cmp_eq_u32 s6, 2
|
||||
; GPRIDX-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
|
||||
; GPRIDX-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
|
||||
; GPRIDX-NEXT: s_cmp_eq_u32 s6, 3
|
||||
; GPRIDX-NEXT: s_cselect_b64 s[0:1], 4.0, s[0:1]
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GPRIDX-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
|
||||
; GPRIDX-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3]
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GPRIDX-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; GPRIDX-NEXT: s_endpgm
|
||||
;
|
||||
; MOVREL-LABEL: dyn_extract_v4f64_s_s_s:
|
||||
|
@ -3924,21 +3924,21 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(double addrspace(1)* %out, i3
|
|||
; MOVREL-NEXT: runtime_loader_kernel_symbol = 0
|
||||
; MOVREL-NEXT: .end_amd_kernel_code_t
|
||||
; MOVREL-NEXT: ; %bb.0: ; %entry
|
||||
; MOVREL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
|
||||
; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; MOVREL-NEXT: s_load_dword s6, s[4:5], 0x8
|
||||
; MOVREL-NEXT: s_mov_b32 s0, 0
|
||||
; MOVREL-NEXT: s_mov_b32 s1, 0x40080000
|
||||
; MOVREL-NEXT: s_mov_b32 s2, 0
|
||||
; MOVREL-NEXT: s_mov_b32 s3, 0x40080000
|
||||
; MOVREL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; MOVREL-NEXT: v_mov_b32_e32 v2, s2
|
||||
; MOVREL-NEXT: v_mov_b32_e32 v3, s1
|
||||
; MOVREL-NEXT: s_cmp_eq_u32 s6, 1
|
||||
; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
|
||||
; MOVREL-NEXT: s_cmp_eq_u32 s6, 2
|
||||
; MOVREL-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
|
||||
; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
|
||||
; MOVREL-NEXT: s_cmp_eq_u32 s6, 3
|
||||
; MOVREL-NEXT: s_cselect_b64 s[0:1], 4.0, s[0:1]
|
||||
; MOVREL-NEXT: v_mov_b32_e32 v0, s0
|
||||
; MOVREL-NEXT: v_mov_b32_e32 v1, s1
|
||||
; MOVREL-NEXT: v_mov_b32_e32 v3, s3
|
||||
; MOVREL-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3]
|
||||
; MOVREL-NEXT: v_mov_b32_e32 v0, s2
|
||||
; MOVREL-NEXT: v_mov_b32_e32 v1, s3
|
||||
; MOVREL-NEXT: v_mov_b32_e32 v2, s0
|
||||
; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; MOVREL-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -4078,8 +4078,7 @@ define i32 @v_extract_v64i32_32(<64 x i32> addrspace(1)* %ptr) {
|
|||
; MOVREL-LABEL: v_extract_v64i32_32:
|
||||
; MOVREL: ; %bb.0:
|
||||
; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; MOVREL-NEXT: s_movk_i32 s4, 0x80
|
||||
; MOVREL-NEXT: s_mov_b32 s5, 0
|
||||
; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80
|
||||
; MOVREL-NEXT: v_mov_b32_e32 v2, s4
|
||||
; MOVREL-NEXT: v_mov_b32_e32 v3, s5
|
||||
; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
|
||||
|
@ -4112,8 +4111,7 @@ define i32 @v_extract_v64i32_33(<64 x i32> addrspace(1)* %ptr) {
|
|||
; MOVREL-LABEL: v_extract_v64i32_33:
|
||||
; MOVREL: ; %bb.0:
|
||||
; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; MOVREL-NEXT: s_movk_i32 s4, 0x80
|
||||
; MOVREL-NEXT: s_mov_b32 s5, 0
|
||||
; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80
|
||||
; MOVREL-NEXT: v_mov_b32_e32 v2, s4
|
||||
; MOVREL-NEXT: v_mov_b32_e32 v3, s5
|
||||
; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
|
||||
|
@ -4140,8 +4138,7 @@ define i32 @v_extract_v64i32_37(<64 x i32> addrspace(1)* %ptr) {
|
|||
; GPRIDX-LABEL: v_extract_v64i32_37:
|
||||
; GPRIDX: ; %bb.0:
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GPRIDX-NEXT: s_movk_i32 s4, 0x80
|
||||
; GPRIDX-NEXT: s_mov_b32 s5, 0
|
||||
; GPRIDX-NEXT: s_mov_b64 s[4:5], 0x80
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GPRIDX-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
|
||||
|
@ -4154,8 +4151,7 @@ define i32 @v_extract_v64i32_37(<64 x i32> addrspace(1)* %ptr) {
|
|||
; MOVREL-LABEL: v_extract_v64i32_37:
|
||||
; MOVREL: ; %bb.0:
|
||||
; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; MOVREL-NEXT: s_movk_i32 s4, 0x80
|
||||
; MOVREL-NEXT: s_mov_b32 s5, 0
|
||||
; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80
|
||||
; MOVREL-NEXT: v_mov_b32_e32 v2, s4
|
||||
; MOVREL-NEXT: v_mov_b32_e32 v3, s5
|
||||
; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
|
||||
|
@ -4171,8 +4167,7 @@ define i32 @v_extract_v64i32_37(<64 x i32> addrspace(1)* %ptr) {
|
|||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_movk_i32 s4, 0x80
|
||||
; GFX10-NEXT: s_mov_b32 s5, 0
|
||||
; GFX10-NEXT: s_mov_b64 s[4:5], 0x80
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
|
||||
|
|
|
@ -7,8 +7,8 @@ define double @v_floor_f64_ieee(double %x) {
|
|||
; GFX6-LABEL: v_floor_f64_ieee:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
|
||||
; GFX6-NEXT: s_mov_b32 s4, -1
|
||||
; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
|
||||
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
|
||||
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
|
||||
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
|
||||
|
@ -30,8 +30,8 @@ define double @v_floor_f64_ieee_nnan(double %x) {
|
|||
; GFX6-LABEL: v_floor_f64_ieee_nnan:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
|
||||
; GFX6-NEXT: s_mov_b32 s4, -1
|
||||
; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
|
||||
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
|
||||
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
|
||||
; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
|
||||
|
@ -50,8 +50,8 @@ define double @v_floor_f64_ieee_fneg(double %x) {
|
|||
; GFX6-LABEL: v_floor_f64_ieee_fneg:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1]
|
||||
; GFX6-NEXT: s_mov_b32 s4, -1
|
||||
; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1]
|
||||
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
|
||||
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
|
||||
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
|
||||
|
@ -74,8 +74,8 @@ define double @v_floor_f64_nonieee(double %x) #1 {
|
|||
; GFX6-LABEL: v_floor_f64_nonieee:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
|
||||
; GFX6-NEXT: s_mov_b32 s4, -1
|
||||
; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
|
||||
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
|
||||
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
|
||||
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
|
||||
|
@ -97,8 +97,8 @@ define double @v_floor_f64_nonieee_nnan(double %x) #1 {
|
|||
; GFX6-LABEL: v_floor_f64_nonieee_nnan:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
|
||||
; GFX6-NEXT: s_mov_b32 s4, -1
|
||||
; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
|
||||
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
|
||||
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
|
||||
; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
|
||||
|
@ -117,8 +117,8 @@ define double @v_floor_f64_non_ieee_fneg(double %x) #1 {
|
|||
; GFX6-LABEL: v_floor_f64_non_ieee_fneg:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1]
|
||||
; GFX6-NEXT: s_mov_b32 s4, -1
|
||||
; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1]
|
||||
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
|
||||
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
|
||||
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
|
||||
|
@ -141,8 +141,8 @@ define double @v_floor_f64_fabs(double %x) {
|
|||
; GFX6-LABEL: v_floor_f64_fabs:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_fract_f64_e64 v[2:3], |v[0:1]|
|
||||
; GFX6-NEXT: s_mov_b32 s4, -1
|
||||
; GFX6-NEXT: v_fract_f64_e64 v[2:3], |v[0:1]|
|
||||
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
|
||||
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
|
||||
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
|
||||
|
@ -170,8 +170,8 @@ define double @v_floor_f64_fneg_fabs(double %x) {
|
|||
; GFX6-LABEL: v_floor_f64_fneg_fabs:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_fract_f64_e64 v[2:3], -|v[0:1]|
|
||||
; GFX6-NEXT: s_mov_b32 s4, -1
|
||||
; GFX6-NEXT: v_fract_f64_e64 v[2:3], -|v[0:1]|
|
||||
; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
|
||||
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
|
||||
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
|
||||
|
@ -194,8 +194,8 @@ define double @v_floor_f64_fneg_fabs(double %x) {
|
|||
define amdgpu_ps <2 x float> @s_floor_f64(double inreg %x) {
|
||||
; GFX6-LABEL: s_floor_f64:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: v_fract_f64_e32 v[0:1], s[2:3]
|
||||
; GFX6-NEXT: s_mov_b32 s0, -1
|
||||
; GFX6-NEXT: v_fract_f64_e32 v[0:1], s[2:3]
|
||||
; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff
|
||||
; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1]
|
||||
; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
|
||||
|
@ -218,8 +218,8 @@ define amdgpu_ps <2 x float> @s_floor_f64(double inreg %x) {
|
|||
define amdgpu_ps <2 x float> @s_floor_f64_fneg(double inreg %x) {
|
||||
; GFX6-LABEL: s_floor_f64_fneg:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: v_fract_f64_e64 v[0:1], -s[2:3]
|
||||
; GFX6-NEXT: s_mov_b32 s0, -1
|
||||
; GFX6-NEXT: v_fract_f64_e64 v[0:1], -s[2:3]
|
||||
; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff
|
||||
; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1]
|
||||
; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
|
||||
|
@ -243,8 +243,8 @@ define amdgpu_ps <2 x float> @s_floor_f64_fneg(double inreg %x) {
|
|||
define amdgpu_ps <2 x float> @s_floor_f64_fabs(double inreg %x) {
|
||||
; GFX6-LABEL: s_floor_f64_fabs:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: v_fract_f64_e64 v[0:1], |s[2:3]|
|
||||
; GFX6-NEXT: s_mov_b32 s0, -1
|
||||
; GFX6-NEXT: v_fract_f64_e64 v[0:1], |s[2:3]|
|
||||
; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff
|
||||
; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1]
|
||||
; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
|
||||
|
@ -268,8 +268,8 @@ define amdgpu_ps <2 x float> @s_floor_f64_fabs(double inreg %x) {
|
|||
define amdgpu_ps <2 x float> @s_floor_f64_fneg_fabs(double inreg %x) {
|
||||
; GFX6-LABEL: s_floor_f64_fneg_fabs:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: v_fract_f64_e64 v[0:1], -|s[2:3]|
|
||||
; GFX6-NEXT: s_mov_b32 s0, -1
|
||||
; GFX6-NEXT: v_fract_f64_e64 v[0:1], -|s[2:3]|
|
||||
; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff
|
||||
; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1]
|
||||
; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
|
||||
|
|
|
@ -4703,8 +4703,7 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
|
|||
define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) {
|
||||
; GFX6-LABEL: s_fshl_i128:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_movk_i32 s10, 0x7f
|
||||
; GFX6-NEXT: s_mov_b32 s11, 0
|
||||
; GFX6-NEXT: s_mov_b64 s[10:11], 0x7f
|
||||
; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
|
||||
; GFX6-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
|
||||
; GFX6-NEXT: s_sub_i32 s9, s12, 64
|
||||
|
@ -4751,8 +4750,7 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
|
|||
;
|
||||
; GFX8-LABEL: s_fshl_i128:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_movk_i32 s10, 0x7f
|
||||
; GFX8-NEXT: s_mov_b32 s11, 0
|
||||
; GFX8-NEXT: s_mov_b64 s[10:11], 0x7f
|
||||
; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
|
||||
; GFX8-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
|
||||
; GFX8-NEXT: s_sub_i32 s9, s12, 64
|
||||
|
@ -4799,8 +4797,7 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
|
|||
;
|
||||
; GFX9-LABEL: s_fshl_i128:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_movk_i32 s10, 0x7f
|
||||
; GFX9-NEXT: s_mov_b32 s11, 0
|
||||
; GFX9-NEXT: s_mov_b64 s[10:11], 0x7f
|
||||
; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
|
||||
; GFX9-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
|
||||
; GFX9-NEXT: s_sub_i32 s9, s12, 64
|
||||
|
@ -4847,8 +4844,7 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
|
|||
;
|
||||
; GFX10-LABEL: s_fshl_i128:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_movk_i32 s10, 0x7f
|
||||
; GFX10-NEXT: s_mov_b32 s11, 0
|
||||
; GFX10-NEXT: s_mov_b64 s[10:11], 0x7f
|
||||
; GFX10-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
|
||||
; GFX10-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
|
||||
; GFX10-NEXT: s_sub_i32 s9, s12, 64
|
||||
|
@ -5321,8 +5317,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
|
|||
define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) {
|
||||
; GFX6-LABEL: v_fshl_i128_svs:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_movk_i32 s6, 0x7f
|
||||
; GFX6-NEXT: s_mov_b32 s7, 0
|
||||
; GFX6-NEXT: s_mov_b64 s[6:7], 0x7f
|
||||
; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
|
||||
; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
|
||||
; GFX6-NEXT: s_sub_i32 s5, s8, 64
|
||||
|
@ -5379,8 +5374,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
|
|||
;
|
||||
; GFX8-LABEL: v_fshl_i128_svs:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_movk_i32 s6, 0x7f
|
||||
; GFX8-NEXT: s_mov_b32 s7, 0
|
||||
; GFX8-NEXT: s_mov_b64 s[6:7], 0x7f
|
||||
; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
|
||||
; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
|
||||
; GFX8-NEXT: s_sub_i32 s5, s8, 64
|
||||
|
@ -5437,8 +5431,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
|
|||
;
|
||||
; GFX9-LABEL: v_fshl_i128_svs:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_movk_i32 s6, 0x7f
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0
|
||||
; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
|
||||
; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
|
||||
; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
|
||||
; GFX9-NEXT: s_sub_i32 s5, s8, 64
|
||||
|
@ -5495,8 +5488,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
|
|||
;
|
||||
; GFX10-LABEL: v_fshl_i128_svs:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_movk_i32 s6, 0x7f
|
||||
; GFX10-NEXT: s_mov_b32 s7, 0
|
||||
; GFX10-NEXT: s_mov_b64 s[6:7], 0x7f
|
||||
; GFX10-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1]
|
||||
; GFX10-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
|
||||
; GFX10-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
|
||||
|
@ -5556,8 +5548,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
|
|||
define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) {
|
||||
; GFX6-LABEL: v_fshl_i128_vss:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_movk_i32 s6, 0x7f
|
||||
; GFX6-NEXT: s_mov_b32 s7, 0
|
||||
; GFX6-NEXT: s_mov_b64 s[6:7], 0x7f
|
||||
; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
|
||||
; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
|
||||
; GFX6-NEXT: s_sub_i32 s6, 64, s8
|
||||
|
@ -5612,8 +5603,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
|
|||
;
|
||||
; GFX8-LABEL: v_fshl_i128_vss:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_movk_i32 s6, 0x7f
|
||||
; GFX8-NEXT: s_mov_b32 s7, 0
|
||||
; GFX8-NEXT: s_mov_b64 s[6:7], 0x7f
|
||||
; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
|
||||
; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
|
||||
; GFX8-NEXT: s_sub_i32 s6, 64, s8
|
||||
|
@ -5668,8 +5658,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
|
|||
;
|
||||
; GFX9-LABEL: v_fshl_i128_vss:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_movk_i32 s6, 0x7f
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0
|
||||
; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
|
||||
; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
|
||||
; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
|
||||
; GFX9-NEXT: s_sub_i32 s6, 64, s8
|
||||
|
@ -5724,8 +5713,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
|
|||
;
|
||||
; GFX10-LABEL: v_fshl_i128_vss:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_movk_i32 s6, 0x7f
|
||||
; GFX10-NEXT: s_mov_b32 s7, 0
|
||||
; GFX10-NEXT: s_mov_b64 s[6:7], 0x7f
|
||||
; GFX10-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
|
||||
; GFX10-NEXT: s_andn2_b64 s[6:7], s[6:7], s[4:5]
|
||||
; GFX10-NEXT: s_sub_i32 s4, 64, s8
|
||||
|
@ -5902,8 +5890,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
|
|||
define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) {
|
||||
; GFX6-LABEL: s_fshl_v2i128:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_movk_i32 s18, 0x7f
|
||||
; GFX6-NEXT: s_mov_b32 s19, 0
|
||||
; GFX6-NEXT: s_mov_b64 s[18:19], 0x7f
|
||||
; GFX6-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
|
||||
; GFX6-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
|
||||
; GFX6-NEXT: s_sub_i32 s17, s22, 64
|
||||
|
@ -5991,8 +5978,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
|
|||
;
|
||||
; GFX8-LABEL: s_fshl_v2i128:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_movk_i32 s18, 0x7f
|
||||
; GFX8-NEXT: s_mov_b32 s19, 0
|
||||
; GFX8-NEXT: s_mov_b64 s[18:19], 0x7f
|
||||
; GFX8-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
|
||||
; GFX8-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
|
||||
; GFX8-NEXT: s_sub_i32 s17, s22, 64
|
||||
|
@ -6080,8 +6066,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
|
|||
;
|
||||
; GFX9-LABEL: s_fshl_v2i128:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_movk_i32 s18, 0x7f
|
||||
; GFX9-NEXT: s_mov_b32 s19, 0
|
||||
; GFX9-NEXT: s_mov_b64 s[18:19], 0x7f
|
||||
; GFX9-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
|
||||
; GFX9-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
|
||||
; GFX9-NEXT: s_sub_i32 s17, s22, 64
|
||||
|
@ -6169,8 +6154,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
|
|||
;
|
||||
; GFX10-LABEL: s_fshl_v2i128:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_movk_i32 s18, 0x7f
|
||||
; GFX10-NEXT: s_mov_b32 s19, 0
|
||||
; GFX10-NEXT: s_mov_b64 s[18:19], 0x7f
|
||||
; GFX10-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
|
||||
; GFX10-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
|
||||
; GFX10-NEXT: s_sub_i32 s17, s22, 64
|
||||
|
|
|
@ -4840,8 +4840,7 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
|
|||
define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) {
|
||||
; GFX6-LABEL: s_fshr_i128:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_movk_i32 s10, 0x7f
|
||||
; GFX6-NEXT: s_mov_b32 s11, 0
|
||||
; GFX6-NEXT: s_mov_b64 s[10:11], 0x7f
|
||||
; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
|
||||
; GFX6-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
|
||||
; GFX6-NEXT: s_sub_i32 s9, 64, 1
|
||||
|
@ -4888,8 +4887,7 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
|
|||
;
|
||||
; GFX8-LABEL: s_fshr_i128:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_movk_i32 s10, 0x7f
|
||||
; GFX8-NEXT: s_mov_b32 s11, 0
|
||||
; GFX8-NEXT: s_mov_b64 s[10:11], 0x7f
|
||||
; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
|
||||
; GFX8-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
|
||||
; GFX8-NEXT: s_sub_i32 s9, 64, 1
|
||||
|
@ -4936,8 +4934,7 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
|
|||
;
|
||||
; GFX9-LABEL: s_fshr_i128:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_movk_i32 s10, 0x7f
|
||||
; GFX9-NEXT: s_mov_b32 s11, 0
|
||||
; GFX9-NEXT: s_mov_b64 s[10:11], 0x7f
|
||||
; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
|
||||
; GFX9-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
|
||||
; GFX9-NEXT: s_sub_i32 s9, 64, 1
|
||||
|
@ -4984,8 +4981,7 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
|
|||
;
|
||||
; GFX10-LABEL: s_fshr_i128:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_movk_i32 s10, 0x7f
|
||||
; GFX10-NEXT: s_mov_b32 s11, 0
|
||||
; GFX10-NEXT: s_mov_b64 s[10:11], 0x7f
|
||||
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
|
||||
; GFX10-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
|
||||
; GFX10-NEXT: s_sub_i32 s13, 64, 1
|
||||
|
@ -5458,8 +5454,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
|
|||
define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) {
|
||||
; GFX6-LABEL: v_fshr_i128_svs:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_movk_i32 s6, 0x7f
|
||||
; GFX6-NEXT: s_mov_b32 s7, 0
|
||||
; GFX6-NEXT: s_mov_b64 s[6:7], 0x7f
|
||||
; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
|
||||
; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
|
||||
; GFX6-NEXT: s_sub_i32 s5, 64, 1
|
||||
|
@ -5515,8 +5510,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
|
|||
;
|
||||
; GFX8-LABEL: v_fshr_i128_svs:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_movk_i32 s6, 0x7f
|
||||
; GFX8-NEXT: s_mov_b32 s7, 0
|
||||
; GFX8-NEXT: s_mov_b64 s[6:7], 0x7f
|
||||
; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
|
||||
; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
|
||||
; GFX8-NEXT: s_sub_i32 s5, 64, 1
|
||||
|
@ -5572,8 +5566,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
|
|||
;
|
||||
; GFX9-LABEL: v_fshr_i128_svs:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_movk_i32 s6, 0x7f
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0
|
||||
; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
|
||||
; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
|
||||
; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
|
||||
; GFX9-NEXT: s_sub_i32 s5, 64, 1
|
||||
|
@ -5629,8 +5622,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
|
|||
;
|
||||
; GFX10-LABEL: v_fshr_i128_svs:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_movk_i32 s6, 0x7f
|
||||
; GFX10-NEXT: s_mov_b32 s7, 0
|
||||
; GFX10-NEXT: s_mov_b64 s[6:7], 0x7f
|
||||
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
|
||||
; GFX10-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
|
||||
; GFX10-NEXT: s_sub_i32 s9, 64, 1
|
||||
|
@ -5689,8 +5681,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
|
|||
define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) {
|
||||
; GFX6-LABEL: v_fshr_i128_vss:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_movk_i32 s6, 0x7f
|
||||
; GFX6-NEXT: s_mov_b32 s7, 0
|
||||
; GFX6-NEXT: s_mov_b64 s[6:7], 0x7f
|
||||
; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
|
||||
; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
|
||||
; GFX6-NEXT: s_sub_i32 s5, 64, 1
|
||||
|
@ -5746,8 +5737,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
|
|||
;
|
||||
; GFX8-LABEL: v_fshr_i128_vss:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_movk_i32 s6, 0x7f
|
||||
; GFX8-NEXT: s_mov_b32 s7, 0
|
||||
; GFX8-NEXT: s_mov_b64 s[6:7], 0x7f
|
||||
; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
|
||||
; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
|
||||
; GFX8-NEXT: s_sub_i32 s5, 64, 1
|
||||
|
@ -5803,8 +5793,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
|
|||
;
|
||||
; GFX9-LABEL: v_fshr_i128_vss:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_movk_i32 s6, 0x7f
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0
|
||||
; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
|
||||
; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
|
||||
; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
|
||||
; GFX9-NEXT: s_sub_i32 s5, 64, 1
|
||||
|
@ -5863,19 +5852,18 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
|
|||
; GFX10-NEXT: s_sub_i32 s6, 64, 1
|
||||
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
|
||||
; GFX10-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1]
|
||||
; GFX10-NEXT: s_movk_i32 s6, 0x7f
|
||||
; GFX10-NEXT: s_mov_b32 s7, 0
|
||||
; GFX10-NEXT: s_mov_b64 s[6:7], 0x7f
|
||||
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
|
||||
; GFX10-NEXT: s_andn2_b64 s[8:9], s[6:7], s[4:5]
|
||||
; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7]
|
||||
; GFX10-NEXT: s_sub_i32 s4, 64, s8
|
||||
; GFX10-NEXT: v_or_b32_e32 v2, v4, v2
|
||||
; GFX10-NEXT: v_or_b32_e32 v3, v5, v3
|
||||
; GFX10-NEXT: s_sub_i32 s4, 64, s8
|
||||
; GFX10-NEXT: s_sub_i32 s5, s8, 64
|
||||
; GFX10-NEXT: s_cmp_lt_u32 s8, 64
|
||||
; GFX10-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1]
|
||||
; GFX10-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3]
|
||||
; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0
|
||||
; GFX10-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3]
|
||||
; GFX10-NEXT: s_cmp_eq_u32 s8, 0
|
||||
; GFX10-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1]
|
||||
; GFX10-NEXT: s_cselect_b32 s7, 1, 0
|
||||
|
@ -6044,8 +6032,7 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) {
|
|||
define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) {
|
||||
; GFX6-LABEL: s_fshr_v2i128:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_movk_i32 s18, 0x7f
|
||||
; GFX6-NEXT: s_mov_b32 s19, 0
|
||||
; GFX6-NEXT: s_mov_b64 s[18:19], 0x7f
|
||||
; GFX6-NEXT: s_sub_i32 s28, 64, 1
|
||||
; GFX6-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
|
||||
; GFX6-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
|
||||
|
@ -6133,8 +6120,7 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
|
|||
;
|
||||
; GFX8-LABEL: s_fshr_v2i128:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_movk_i32 s18, 0x7f
|
||||
; GFX8-NEXT: s_mov_b32 s19, 0
|
||||
; GFX8-NEXT: s_mov_b64 s[18:19], 0x7f
|
||||
; GFX8-NEXT: s_sub_i32 s28, 64, 1
|
||||
; GFX8-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
|
||||
; GFX8-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
|
||||
|
@ -6222,8 +6208,7 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
|
|||
;
|
||||
; GFX9-LABEL: s_fshr_v2i128:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_movk_i32 s18, 0x7f
|
||||
; GFX9-NEXT: s_mov_b32 s19, 0
|
||||
; GFX9-NEXT: s_mov_b64 s[18:19], 0x7f
|
||||
; GFX9-NEXT: s_sub_i32 s28, 64, 1
|
||||
; GFX9-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
|
||||
; GFX9-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
|
||||
|
@ -6311,13 +6296,12 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
|
|||
;
|
||||
; GFX10-LABEL: s_fshr_v2i128:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_mov_b64 s[18:19], 0x7f
|
||||
; GFX10-NEXT: s_sub_i32 s28, 64, 1
|
||||
; GFX10-NEXT: s_movk_i32 s18, 0x7f
|
||||
; GFX10-NEXT: s_mov_b32 s19, 0
|
||||
; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s28
|
||||
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
|
||||
; GFX10-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
|
||||
; GFX10-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
|
||||
; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s28
|
||||
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
|
||||
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
|
||||
; GFX10-NEXT: s_or_b64 s[2:3], s[24:25], s[2:3]
|
||||
; GFX10-NEXT: s_sub_i32 s23, s16, 64
|
||||
|
|
|
@ -19,8 +19,7 @@ define i32 @global_atomic_csub_offset(i32 addrspace(1)* %ptr, i32 %data) {
|
|||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GCN-NEXT: s_movk_i32 s4, 0x1000
|
||||
; GCN-NEXT: s_mov_b32 s5, 0
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], 0x1000
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s5
|
||||
; GCN-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
|
||||
|
@ -50,8 +49,7 @@ define void @global_atomic_csub_offset_nortn(i32 addrspace(1)* %ptr, i32 %data)
|
|||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GCN-NEXT: s_movk_i32 s4, 0x1000
|
||||
; GCN-NEXT: s_mov_b32 s5, 0
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], 0x1000
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s5
|
||||
; GCN-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
|
||||
|
|
|
@ -336,8 +336,7 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset_offset256(i32 addrspace(
|
|||
define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset256_offset(i32 addrspace(1)* %ptr, i32 inreg %soffset) {
|
||||
; GFX6-LABEL: mubuf_store_vgpr_ptr_sgpr_offset256_offset:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_movk_i32 s4, 0x400
|
||||
; GFX6-NEXT: s_mov_b32 s5, 0
|
||||
; GFX6-NEXT: s_mov_b64 s[4:5], 0x400
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
|
||||
|
@ -352,8 +351,7 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset256_offset(i32 addrspace(
|
|||
;
|
||||
; GFX7-LABEL: mubuf_store_vgpr_ptr_sgpr_offset256_offset:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_movk_i32 s4, 0x400
|
||||
; GFX7-NEXT: s_mov_b32 s5, 0
|
||||
; GFX7-NEXT: s_mov_b64 s[4:5], 0x400
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
|
||||
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
|
||||
|
@ -792,8 +790,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset_offset256(float addrspac
|
|||
define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(float addrspace(1)* %ptr, i32 inreg %soffset) {
|
||||
; GFX6-LABEL: mubuf_load_vgpr_ptr_sgpr_offset256_offset:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_movk_i32 s4, 0x400
|
||||
; GFX6-NEXT: s_mov_b32 s5, 0
|
||||
; GFX6-NEXT: s_mov_b64 s[4:5], 0x400
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s5
|
||||
|
@ -808,8 +805,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(float addrspac
|
|||
;
|
||||
; GFX7-LABEL: mubuf_load_vgpr_ptr_sgpr_offset256_offset:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_movk_i32 s4, 0x400
|
||||
; GFX7-NEXT: s_mov_b32 s5, 0
|
||||
; GFX7-NEXT: s_mov_b64 s[4:5], 0x400
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s5
|
||||
|
|
|
@ -568,8 +568,8 @@ define double @v_roundeven_f64_fneg(double %x) {
|
|||
; GFX6-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX6-NEXT: v_or_b32_e32 v4, 0x43300000, v4
|
||||
; GFX6-NEXT: v_add_f64 v[5:6], -v[0:1], v[3:4]
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX6-NEXT: s_mov_b32 s4, -1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX6-NEXT: s_mov_b32 s5, 0x432fffff
|
||||
; GFX6-NEXT: v_add_f64 v[3:4], v[5:6], -v[3:4]
|
||||
; GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[1:2]|, s[4:5]
|
||||
|
|
|
@ -2509,8 +2509,7 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
|
|||
; CHECK-LABEL: v_sdiv_i64_pow2_shl_denom:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: s_movk_i32 s4, 0x1000
|
||||
; CHECK-NEXT: s_mov_b32 s5, 0
|
||||
; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000
|
||||
; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, 0
|
||||
; CHECK-NEXT: v_or_b32_e32 v3, v1, v5
|
||||
|
@ -2703,8 +2702,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
|
|||
; GISEL-LABEL: v_sdiv_v2i64_pow2_shl_denom:
|
||||
; GISEL: ; %bb.0:
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-NEXT: s_movk_i32 s6, 0x1000
|
||||
; GISEL-NEXT: s_mov_b32 s7, 0
|
||||
; GISEL-NEXT: s_mov_b64 s[6:7], 0x1000
|
||||
; GISEL-NEXT: v_lshl_b64 v[4:5], s[6:7], v4
|
||||
; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v1
|
||||
; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v5
|
||||
|
@ -2996,8 +2994,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
|
|||
; CGP-LABEL: v_sdiv_v2i64_pow2_shl_denom:
|
||||
; CGP: ; %bb.0:
|
||||
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CGP-NEXT: s_movk_i32 s4, 0x1000
|
||||
; CGP-NEXT: s_mov_b32 s5, 0
|
||||
; CGP-NEXT: s_mov_b64 s[4:5], 0x1000
|
||||
; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4
|
||||
; CGP-NEXT: v_mov_b32_e32 v7, v1
|
||||
; CGP-NEXT: v_mov_b32_e32 v5, v0
|
||||
|
|
|
@ -2473,8 +2473,7 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
|
|||
; CHECK-LABEL: v_srem_i64_pow2_shl_denom:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: s_movk_i32 s4, 0x1000
|
||||
; CHECK-NEXT: s_mov_b32 s5, 0
|
||||
; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000
|
||||
; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, 0
|
||||
; CHECK-NEXT: v_or_b32_e32 v3, v1, v5
|
||||
|
@ -2663,8 +2662,7 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
|
|||
; GISEL-LABEL: v_srem_v2i64_pow2_shl_denom:
|
||||
; GISEL: ; %bb.0:
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-NEXT: s_movk_i32 s6, 0x1000
|
||||
; GISEL-NEXT: s_mov_b32 s7, 0
|
||||
; GISEL-NEXT: s_mov_b64 s[6:7], 0x1000
|
||||
; GISEL-NEXT: v_lshl_b64 v[4:5], s[6:7], v4
|
||||
; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1
|
||||
; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v5
|
||||
|
@ -2952,8 +2950,7 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
|
|||
; CGP-LABEL: v_srem_v2i64_pow2_shl_denom:
|
||||
; CGP: ; %bb.0:
|
||||
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CGP-NEXT: s_movk_i32 s4, 0x1000
|
||||
; CGP-NEXT: s_mov_b32 s5, 0
|
||||
; CGP-NEXT: s_mov_b64 s[4:5], 0x1000
|
||||
; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4
|
||||
; CGP-NEXT: v_mov_b32_e32 v7, v1
|
||||
; CGP-NEXT: v_mov_b32_e32 v5, v0
|
||||
|
|
|
@ -2291,8 +2291,7 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
|
|||
; CHECK-LABEL: v_udiv_i64_pow2_shl_denom:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: s_movk_i32 s4, 0x1000
|
||||
; CHECK-NEXT: s_mov_b32 s5, 0
|
||||
; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000
|
||||
; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2
|
||||
; CHECK-NEXT: v_or_b32_e32 v3, v1, v5
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, 0
|
||||
|
@ -2470,8 +2469,7 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
|
|||
; GISEL-LABEL: v_udiv_v2i64_pow2_shl_denom:
|
||||
; GISEL: ; %bb.0:
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-NEXT: s_movk_i32 s4, 0x1000
|
||||
; GISEL-NEXT: s_mov_b32 s5, 0
|
||||
; GISEL-NEXT: s_mov_b64 s[4:5], 0x1000
|
||||
; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v4
|
||||
; GISEL-NEXT: v_lshl_b64 v[6:7], s[4:5], v6
|
||||
; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4
|
||||
|
@ -2735,8 +2733,7 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
|
|||
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CGP-NEXT: v_mov_b32_e32 v5, v0
|
||||
; CGP-NEXT: v_mov_b32_e32 v7, v1
|
||||
; CGP-NEXT: s_movk_i32 s4, 0x1000
|
||||
; CGP-NEXT: s_mov_b32 s5, 0
|
||||
; CGP-NEXT: s_mov_b64 s[4:5], 0x1000
|
||||
; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4
|
||||
; CGP-NEXT: v_lshl_b64 v[8:9], s[4:5], v6
|
||||
; CGP-NEXT: v_or_b32_e32 v1, v7, v11
|
||||
|
|
|
@ -1651,8 +1651,7 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) {
|
|||
; CHECK-LABEL: v_urem_i64_pow2_shl_denom:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: s_movk_i32 s4, 0x1000
|
||||
; CHECK-NEXT: s_mov_b32 s5, 0
|
||||
; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000
|
||||
; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2
|
||||
; CHECK-NEXT: v_or_b32_e32 v3, v1, v5
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, 0
|
||||
|
@ -1827,8 +1826,7 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
|
|||
; GISEL-LABEL: v_urem_v2i64_pow2_shl_denom:
|
||||
; GISEL: ; %bb.0:
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-NEXT: s_movk_i32 s4, 0x1000
|
||||
; GISEL-NEXT: s_mov_b32 s5, 0
|
||||
; GISEL-NEXT: s_mov_b64 s[4:5], 0x1000
|
||||
; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v4
|
||||
; GISEL-NEXT: v_lshl_b64 v[6:7], s[4:5], v6
|
||||
; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4
|
||||
|
@ -2090,8 +2088,7 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
|
|||
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CGP-NEXT: v_mov_b32_e32 v5, v0
|
||||
; CGP-NEXT: v_mov_b32_e32 v7, v1
|
||||
; CGP-NEXT: s_movk_i32 s4, 0x1000
|
||||
; CGP-NEXT: s_mov_b32 s5, 0
|
||||
; CGP-NEXT: s_mov_b64 s[4:5], 0x1000
|
||||
; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4
|
||||
; CGP-NEXT: v_lshl_b64 v[8:9], s[4:5], v6
|
||||
; CGP-NEXT: v_or_b32_e32 v1, v7, v11
|
||||
|
|
|
@ -67,6 +67,7 @@ define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
|
|||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: udiv_i32:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
|
@ -157,6 +158,7 @@ define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
|
|||
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: urem_i32:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
|
@ -268,6 +270,7 @@ define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
|
|||
; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: sdiv_i32:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
|
@ -383,6 +386,7 @@ define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
|
|||
; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: srem_i32:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
|
@ -464,6 +468,7 @@ define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
|
|||
; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
|
||||
; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: udiv_i16:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
|
||||
|
@ -534,6 +539,7 @@ define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
|
|||
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
|
||||
; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: urem_i16:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
|
||||
|
@ -612,6 +618,7 @@ define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
|
|||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
|
||||
; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: sdiv_i16:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -696,6 +703,7 @@ define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
|
|||
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
|
||||
; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: srem_i16:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
|
||||
|
@ -768,6 +776,7 @@ define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
|
|||
; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
|
||||
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: udiv_i8:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
|
||||
|
@ -835,6 +844,7 @@ define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
|
|||
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
|
||||
; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: urem_i8:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
|
||||
|
@ -912,6 +922,7 @@ define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
|
|||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
|
||||
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: sdiv_i8:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -997,6 +1008,7 @@ define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
|
|||
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
|
||||
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: srem_i8:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
|
||||
|
@ -1249,6 +1261,7 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
|
|||
; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
|
||||
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: udiv_v4i32:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
|
||||
|
@ -1542,6 +1555,7 @@ define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
|
|||
; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
|
||||
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: urem_v4i32:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
|
||||
|
@ -1915,6 +1929,7 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
|
|||
; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v3
|
||||
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: sdiv_v4i32:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
|
||||
|
@ -2308,6 +2323,7 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
|
|||
; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s5, v3
|
||||
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: srem_v4i32:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
|
||||
|
@ -2567,6 +2583,7 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
|
|||
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: udiv_v4i16:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -2791,6 +2808,7 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
|
|||
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: urem_v4i16:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -3043,6 +3061,7 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
|
|||
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: sdiv_v4i16:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -3322,6 +3341,7 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
|
|||
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: srem_v4i16:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -3455,6 +3475,7 @@ define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
|
|||
; GFX6-NEXT: v_and_b32_e32 v0, 7, v0
|
||||
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: udiv_i3:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -3528,6 +3549,7 @@ define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
|
|||
; GFX6-NEXT: v_and_b32_e32 v0, 7, v0
|
||||
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: urem_i3:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
|
||||
|
@ -3609,6 +3631,7 @@ define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
|
|||
; GFX6-NEXT: v_and_b32_e32 v0, 7, v0
|
||||
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: sdiv_i3:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -3696,6 +3719,7 @@ define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
|
|||
; GFX6-NEXT: v_and_b32_e32 v0, 7, v0
|
||||
; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: srem_i3:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
|
||||
|
@ -3843,6 +3867,7 @@ define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
|
|||
; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: udiv_v3i16:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -4023,6 +4048,7 @@ define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
|
|||
; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: urem_v3i16:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -4219,6 +4245,7 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
|
|||
; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: sdiv_v3i16:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -4438,6 +4465,7 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
|
|||
; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: srem_v3i16:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
|
||||
|
@ -4631,6 +4659,7 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
|
|||
; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1
|
||||
; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: udiv_v3i15:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -4827,6 +4856,7 @@ define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
|
|||
; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1
|
||||
; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: urem_v3i15:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -5043,6 +5073,7 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
|
|||
; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1
|
||||
; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: sdiv_v3i15:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -5285,6 +5316,7 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
|
|||
; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1
|
||||
; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: srem_v3i15:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -5394,6 +5426,7 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
|
|||
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: udiv_i32_oddk_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -5430,6 +5463,7 @@ define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
|
|||
; GFX6-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: udiv_i32_pow2k_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -5464,6 +5498,7 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
|
|||
; GFX6-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: udiv_i32_pow2_shl_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -5505,6 +5540,7 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
|
|||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: udiv_v2i32_pow2k_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -5550,6 +5586,7 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)*
|
|||
; GFX6-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -5694,6 +5731,7 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
|
|||
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: udiv_v2i32_pow2_shl_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
|
||||
|
@ -5776,6 +5814,7 @@ define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
|
|||
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: urem_i32_oddk_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -5814,6 +5853,7 @@ define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
|
|||
; GFX6-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: urem_i32_pow2k_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -5849,6 +5889,7 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
|
|||
; GFX6-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: urem_i32_pow2_shl_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -5892,6 +5933,7 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
|
|||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: urem_v2i32_pow2k_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -6025,6 +6067,7 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
|
|||
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: urem_v2i32_pow2_shl_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
|
||||
|
@ -6100,6 +6143,7 @@ define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
|
|||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: sdiv_i32_oddk_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -6139,6 +6183,7 @@ define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
|
|||
; GFX6-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: sdiv_i32_pow2k_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -6202,6 +6247,7 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
|
|||
; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: sdiv_i32_pow2_shl_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
|
@ -6275,6 +6321,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
|
|||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: sdiv_v2i32_pow2k_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -6329,6 +6376,7 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)*
|
|||
; GFX6-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -6511,6 +6559,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
|
|||
; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
|
||||
|
@ -6610,6 +6659,7 @@ define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
|
|||
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: srem_i32_oddk_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -6652,6 +6702,7 @@ define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
|
|||
; GFX6-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: srem_i32_pow2k_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -6713,6 +6764,7 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
|
|||
; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: srem_i32_pow2_shl_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
|
@ -6787,6 +6839,7 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
|
|||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: srem_v2i32_pow2k_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -6960,6 +7013,7 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
|
|||
; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: srem_v2i32_pow2_shl_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -7159,6 +7213,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
|
|||
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: udiv_i64_oddk_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f176a73
|
||||
|
@ -7303,6 +7358,7 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
|
|||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: udiv_i64_pow2k_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
|
@ -7340,6 +7396,7 @@ define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
|
|||
; GFX6-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: udiv_i64_pow2_shl_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
||||
|
@ -7384,6 +7441,7 @@ define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
|
|||
; GFX6-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: udiv_v2i64_pow2k_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -7525,6 +7583,7 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
|
|||
; GFX6-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000
|
||||
|
@ -7672,6 +7731,7 @@ define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
|
|||
; GFX6-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: udiv_v2i64_pow2_shl_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -7824,6 +7884,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
|
|||
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: urem_i64_oddk_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f1761f8
|
||||
|
@ -7967,6 +8028,7 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
|
|||
; GFX6-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: urem_i64_pow2k_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
|
@ -7997,8 +8059,7 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s0, s4
; GFX6-NEXT: s_mov_b32 s1, s5
; GFX6-NEXT: s_mov_b32 s5, 0
; GFX6-NEXT: s_movk_i32 s4, 0x1000
; GFX6-NEXT: s_mov_b64 s[4:5], 0x1000
; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
; GFX6-NEXT: s_add_u32 s4, s4, -1
; GFX6-NEXT: s_addc_u32 s5, s5, -1
@ -8007,12 +8068,12 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: urem_i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: s_movk_i32 s0, 0x1000
; GFX9-NEXT: s_mov_b64 s[0:1], 0x1000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
@ -8056,6 +8117,7 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
|
|||
; GFX6-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: urem_v2i64_pow2k_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -8094,8 +8156,7 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
|
|||
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
|
||||
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11
|
||||
; GFX6-NEXT: s_mov_b32 s13, 0
|
||||
; GFX6-NEXT: s_movk_i32 s12, 0x1000
|
||||
; GFX6-NEXT: s_mov_b64 s[12:13], 0x1000
|
||||
; GFX6-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s6, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
|
@ -8113,13 +8174,13 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
|
|||
; GFX6-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: urem_v2i64_pow2_shl_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
|
||||
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
|
||||
; GFX9-NEXT: s_mov_b32 s1, 0
|
||||
; GFX9-NEXT: s_movk_i32 s0, 0x1000
|
||||
; GFX9-NEXT: s_mov_b64 s[0:1], 0x1000
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], s10
|
||||
|
@ -8267,6 +8328,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
|
|||
; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: sdiv_i64_oddk_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000
|
||||
|
@ -8410,6 +8472,7 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
|
|||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: sdiv_i64_pow2k_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
|
@ -8439,10 +8502,10 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
|
|||
; GFX6-LABEL: sdiv_i64_pow2_shl_denom:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0
|
||||
; GFX6-NEXT: s_movk_i32 s2, 0x1000
|
||||
; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000
|
||||
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s6, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
|
||||
; GFX6-NEXT: s_ashr_i32 s12, s3, 31
|
||||
|
@ -8458,7 +8521,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
|
|||
; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
|
||||
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
|
||||
; GFX6-NEXT: s_mov_b32 s15, s14
|
||||
; GFX6-NEXT: s_mov_b32 s6, -1
|
||||
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
|
||||
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
|
||||
|
@ -8576,11 +8638,11 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
|
|||
; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: sdiv_i64_pow2_shl_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34
|
||||
; GFX9-NEXT: s_mov_b32 s3, 0
|
||||
; GFX9-NEXT: s_movk_i32 s2, 0x1000
|
||||
; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
|
||||
|
@ -8753,6 +8815,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
|
|||
; GFX6-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: sdiv_v2i64_pow2k_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -8917,6 +8980,7 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
|
|||
; GFX6-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x457ff000
|
||||
|
@ -9064,10 +9128,10 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
|
|||
; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0
|
||||
; GFX6-NEXT: s_movk_i32 s2, 0x1000
|
||||
; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000
|
||||
; GFX6-NEXT: s_mov_b32 s18, 0x4f800000
|
||||
; GFX6-NEXT: s_mov_b32 s19, 0x5f7ffffc
|
||||
; GFX6-NEXT: s_mov_b32 s20, 0x2f800000
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_lshl_b64 s[12:13], s[2:3], s6
|
||||
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
|
||||
|
@ -9078,12 +9142,11 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
|
|||
; GFX6-NEXT: s_xor_b64 s[14:15], s[2:3], s[16:17]
|
||||
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s14
|
||||
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s15
|
||||
; GFX6-NEXT: s_mov_b32 s20, 0x2f800000
|
||||
; GFX6-NEXT: s_mov_b32 s21, 0xcf800000
|
||||
; GFX6-NEXT: s_sub_u32 s6, 0, s14
|
||||
; GFX6-NEXT: s_subb_u32 s7, 0, s15
|
||||
; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1
|
||||
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
|
||||
; GFX6-NEXT: s_subb_u32 s7, 0, s15
|
||||
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
|
||||
; GFX6-NEXT: v_mul_f32_e32 v0, s19, v0
|
||||
|
@ -9330,13 +9393,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
|
|||
; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
|
||||
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44
|
||||
; GFX9-NEXT: s_mov_b32 s3, 0
|
||||
; GFX9-NEXT: s_movk_i32 s2, 0x1000
|
||||
; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000
|
||||
; GFX9-NEXT: s_mov_b32 s18, 0x4f800000
|
||||
; GFX9-NEXT: s_mov_b32 s19, 0x5f7ffffc
|
||||
; GFX9-NEXT: s_mov_b32 s20, 0x2f800000
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s6
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
|
||||
|
@ -9347,12 +9411,11 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
|
|||
; GFX9-NEXT: s_xor_b64 s[10:11], s[2:3], s[12:13]
|
||||
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10
|
||||
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11
|
||||
; GFX9-NEXT: s_mov_b32 s20, 0x2f800000
|
||||
; GFX9-NEXT: s_mov_b32 s21, 0xcf800000
|
||||
; GFX9-NEXT: s_sub_u32 s14, 0, s10
|
||||
; GFX9-NEXT: s_subb_u32 s4, 0, s11
|
||||
; GFX9-NEXT: v_mac_f32_e32 v0, s18, v1
|
||||
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
|
||||
; GFX9-NEXT: s_subb_u32 s4, 0, s11
|
||||
; GFX9-NEXT: v_mov_b32_e32 v6, 0
|
||||
; GFX9-NEXT: v_mul_f32_e32 v0, s19, v0
|
||||
; GFX9-NEXT: v_mul_f32_e32 v1, s20, v0
|
||||
|
@ -9727,6 +9790,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
|
|||
; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: srem_i64_oddk_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000
|
||||
|
@ -9870,6 +9934,7 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
|
|||
; GFX6-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: srem_i64_pow2k_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
|
@ -9901,10 +9966,10 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
|
|||
; GFX6-LABEL: srem_i64_pow2_shl_denom:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0
|
||||
; GFX6-NEXT: s_movk_i32 s2, 0x1000
|
||||
; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000
|
||||
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s6, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
|
||||
; GFX6-NEXT: s_ashr_i32 s4, s3, 31
|
||||
|
@ -9920,15 +9985,14 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
|
|||
; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
|
||||
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
|
||||
; GFX6-NEXT: s_mov_b32 s15, s14
|
||||
; GFX6-NEXT: s_mov_b32 s6, -1
|
||||
; GFX6-NEXT: s_mov_b32 s4, s8
|
||||
; GFX6-NEXT: s_mov_b32 s5, s9
|
||||
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
|
||||
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
|
||||
; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
|
||||
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; GFX6-NEXT: s_mov_b32 s5, s9
|
||||
; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0
|
||||
; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1
|
||||
; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0
|
||||
|
@ -10036,11 +10100,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
|
|||
; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: srem_i64_pow2_shl_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34
|
||||
; GFX9-NEXT: s_mov_b32 s3, 0
|
||||
; GFX9-NEXT: s_movk_i32 s2, 0x1000
|
||||
; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
|
||||
|
@ -10216,6 +10280,7 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
|
|||
; GFX6-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: srem_v2i64_pow2k_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
|
@ -10265,10 +10330,10 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
|
|||
; GFX6-LABEL: srem_v2i64_pow2_shl_denom:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0
|
||||
; GFX6-NEXT: s_movk_i32 s2, 0x1000
|
||||
; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000
|
||||
; GFX6-NEXT: s_mov_b32 s18, 0x4f800000
|
||||
; GFX6-NEXT: s_mov_b32 s19, 0x5f7ffffc
|
||||
; GFX6-NEXT: s_mov_b32 s20, 0x2f800000
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_lshl_b64 s[14:15], s[2:3], s6
|
||||
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
|
||||
|
@ -10279,12 +10344,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
|
|||
; GFX6-NEXT: s_xor_b64 s[16:17], s[2:3], s[4:5]
|
||||
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s16
|
||||
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s17
|
||||
; GFX6-NEXT: s_mov_b32 s20, 0x2f800000
|
||||
; GFX6-NEXT: s_mov_b32 s21, 0xcf800000
|
||||
; GFX6-NEXT: s_sub_u32 s6, 0, s16
|
||||
; GFX6-NEXT: s_subb_u32 s7, 0, s17
|
||||
; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1
|
||||
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
|
||||
; GFX6-NEXT: s_subb_u32 s7, 0, s17
|
||||
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
|
||||
; GFX6-NEXT: v_mul_f32_e32 v0, s19, v0
|
||||
|
@ -10527,13 +10591,14 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
|
|||
; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
|
||||
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: srem_v2i64_pow2_shl_denom:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44
|
||||
; GFX9-NEXT: s_mov_b32 s3, 0
|
||||
; GFX9-NEXT: s_movk_i32 s2, 0x1000
|
||||
; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000
|
||||
; GFX9-NEXT: s_mov_b32 s16, 0x4f800000
|
||||
; GFX9-NEXT: s_mov_b32 s17, 0x5f7ffffc
|
||||
; GFX9-NEXT: s_mov_b32 s18, 0x2f800000
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_lshl_b64 s[12:13], s[2:3], s6
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
|
||||
|
@ -10544,12 +10609,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
|
|||
; GFX9-NEXT: s_xor_b64 s[14:15], s[2:3], s[4:5]
|
||||
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s14
|
||||
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s15
|
||||
; GFX9-NEXT: s_mov_b32 s18, 0x2f800000
|
||||
; GFX9-NEXT: s_mov_b32 s19, 0xcf800000
|
||||
; GFX9-NEXT: s_sub_u32 s4, 0, s14
|
||||
; GFX9-NEXT: s_subb_u32 s5, 0, s15
|
||||
; GFX9-NEXT: v_mac_f32_e32 v0, s16, v1
|
||||
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
|
||||
; GFX9-NEXT: s_subb_u32 s5, 0, s15
|
||||
; GFX9-NEXT: v_mov_b32_e32 v6, 0
|
||||
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
|
||||
; GFX9-NEXT: v_mul_f32_e32 v0, s17, v0

@ -0,0 +1,98 @@
# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass=liveintervals,amdgpu-pre-ra-optimizations %s -o - | FileCheck -check-prefix=GCN %s

---
# GCN-LABEL: name: combine_sreg64_inits
# GCN: %0:sgpr_64 = S_MOV_B64_IMM_PSEUDO 8589934593
# GCN: S_NOP 0
name: combine_sreg64_inits
tracksRegLiveness: true
body: |
  bb.0:
    undef %0.sub0:sgpr_64 = S_MOV_B32 1
    S_NOP 0
    %0.sub1:sgpr_64 = S_MOV_B32 2
...
---
# GCN-LABEL: name: combine_sreg64_inits_swap
# GCN: %0:sgpr_64 = S_MOV_B64_IMM_PSEUDO 8589934593
# GCN: S_NOP 0
name: combine_sreg64_inits_swap
tracksRegLiveness: true
body: |
  bb.0:
    undef %0.sub1:sgpr_64 = S_MOV_B32 2
    S_NOP 0
    %0.sub0:sgpr_64 = S_MOV_B32 1
...
---
# GCN-LABEL: name: sreg64_inits_different_blocks
# GCN: undef %0.sub0:sgpr_64 = S_MOV_B32 1
# GCN: %0.sub1:sgpr_64 = S_MOV_B32 2
name: sreg64_inits_different_blocks
tracksRegLiveness: true
body: |
  bb.0:
    undef %0.sub0:sgpr_64 = S_MOV_B32 1

  bb.1:
    %0.sub1:sgpr_64 = S_MOV_B32 2
...
---
# GCN-LABEL: name: sreg64_inits_two_defs_sub1
# GCN: undef %0.sub0:sgpr_64 = S_MOV_B32 1
# GCN: %0.sub1:sgpr_64 = S_MOV_B32 2
# GCN: %0.sub1:sgpr_64 = S_MOV_B32 3
name: sreg64_inits_two_defs_sub1
tracksRegLiveness: true
body: |
  bb.0:
    undef %0.sub0:sgpr_64 = S_MOV_B32 1
    %0.sub1:sgpr_64 = S_MOV_B32 2
    %0.sub1:sgpr_64 = S_MOV_B32 3
...
---
# GCN-LABEL: name: sreg64_inits_two_defs_sub0
# GCN: undef %0.sub0:sgpr_64 = S_MOV_B32 1
# GCN: %0.sub1:sgpr_64 = S_MOV_B32 2
# GCN: %0.sub0:sgpr_64 = S_MOV_B32 3
name: sreg64_inits_two_defs_sub0
tracksRegLiveness: true
body: |
  bb.0:
    undef %0.sub0:sgpr_64 = S_MOV_B32 1
    %0.sub1:sgpr_64 = S_MOV_B32 2
    %0.sub0:sgpr_64 = S_MOV_B32 3
...
---
# GCN-LABEL: name: sreg64_inits_full_def
# GCN: undef %1.sub0:sgpr_64 = S_MOV_B32 1
# GCN: %0:sgpr_64 = S_MOV_B64 3
name: sreg64_inits_full_def
tracksRegLiveness: true
body: |
  bb.0:
    undef %0.sub0:sgpr_64 = S_MOV_B32 1
    %0:sgpr_64 = S_MOV_B64 3
...
---
# GCN-LABEL: name: sreg64_inits_imp_use
# GCN: %0.sub0:sgpr_64 = S_MOV_B32 1, implicit $m0
# GCN: %0.sub1:sgpr_64 = S_MOV_B32 2
name: sreg64_inits_imp_use
tracksRegLiveness: true
body: |
  bb.0:
    undef %0.sub0:sgpr_64 = S_MOV_B32 1, implicit $m0
    %0.sub1:sgpr_64 = S_MOV_B32 2
...
---
# GCN-LABEL: name: sreg64_inits_imp_def
# GCN: %0.sub0:sgpr_64 = S_MOV_B32 1, implicit-def $scc
# GCN: %0.sub1:sgpr_64 = S_MOV_B32 2
name: sreg64_inits_imp_def
tracksRegLiveness: true
body: |
  bb.0:
    undef %0.sub0:sgpr_64 = S_MOV_B32 1, implicit-def $scc
    %0.sub1:sgpr_64 = S_MOV_B32 2
...

@ -844,8 +844,7 @@ define amdgpu_kernel void @store_misaligned64_constant_offsets() {
define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
; CI-LABEL: store_misaligned64_constant_large_offsets:
; CI: ; %bb.0:
; CI-NEXT: s_movk_i32 s0, 0x7b
; CI-NEXT: s_mov_b32 s1, 0
; CI-NEXT: s_mov_b64 s[0:1], 0x7b
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: v_mov_b32_e32 v1, s1
@ -856,8 +855,7 @@ define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
;
; GFX9-LABEL: store_misaligned64_constant_large_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_movk_i32 s0, 0x7b
; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: s_mov_b64 s[0:1], 0x7b
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s1

@ -205,8 +205,7 @@ entry:

; FIXME: Should not have intermediate sgprs
; CHECK-LABEL: {{^}}i64_imm_input_phys_vgpr:
; CHECK-DAG: s_mov_b32 s1, 0
; CHECK-DAG: s_mov_b32 s0, 0x1e240
; CHECK: s_mov_b64 s[0:1], 0x1e240
; CHECK: v_mov_b32_e32 v0, s0
; CHECK: v_mov_b32_e32 v1, s1
; CHECK: use v[0:1]

@ -59,20 +59,17 @@ define amdgpu_kernel void @inline_sreg_constraint_imm_f32() {
  ret void
}

; FIXME: Should be able to use s_mov_b64
; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i64:
; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], -4{{$}}
; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], -1{{$}}
; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
; GCN: s_mov_b64 [[REG:s\[[0-9:]+\]]], -4{{$}}
; GCN: ; use [[REG]]
define amdgpu_kernel void @inline_sreg_constraint_imm_i64() {
  tail call void asm sideeffect "; use $0", "s"(i64 -4)
  ret void
}

; GCN-LABEL: {{^}}inline_sreg_constraint_imm_f64:
; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], 0{{$}}
; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], 0x3ff00000{{$}}
; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
; GCN: s_mov_b64 [[REG:s\[[0-9:]+\]]], 1.0{{$}}
; GCN: ; use [[REG]]
define amdgpu_kernel void @inline_sreg_constraint_imm_f64() {
  tail call void asm sideeffect "; use $0", "s"(double 1.0)
  ret void

@ -900,12 +900,11 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %
|
|||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2
|
||||
; SI-NEXT: s_load_dword s4, s[4:5], 0x4
|
||||
; SI-NEXT: s_mov_b32 s5, 0
|
||||
; SI-NEXT: s_mov_b32 s3, 0x100f000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_lshl_b32 s8, s4, 4
|
||||
; SI-NEXT: s_mov_b32 s4, 0xffff
|
||||
; SI-NEXT: s_mov_b64 s[4:5], 0xffff
|
||||
; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
|
||||
; SI-NEXT: s_mov_b32 s8, 0x50005
|
||||
; SI-NEXT: s_and_b32 s9, s5, s8
|
||||
|
@ -923,12 +922,11 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %
|
|||
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
|
||||
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
|
||||
; VI-NEXT: s_mov_b32 s5, 0
|
||||
; VI-NEXT: s_mov_b32 s3, 0x1100f000
|
||||
; VI-NEXT: s_mov_b32 s2, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_lshl_b32 s8, s4, 4
|
||||
; VI-NEXT: s_mov_b32 s4, 0xffff
|
||||
; VI-NEXT: s_mov_b64 s[4:5], 0xffff
|
||||
; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
|
||||
; VI-NEXT: s_mov_b32 s8, 0x50005
|
||||
; VI-NEXT: s_mov_b32 s9, s8
|
||||
|
@ -1075,14 +1073,13 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %
|
|||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
|
||||
; SI-NEXT: s_load_dword s6, s[4:5], 0x4
|
||||
; SI-NEXT: s_mov_b32 s7, 0
|
||||
; SI-NEXT: s_mov_b32 s3, 0x100f000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0
|
||||
; SI-NEXT: s_mov_b32 s0, s8
|
||||
; SI-NEXT: s_lshl_b32 s8, s6, 3
|
||||
; SI-NEXT: s_mov_b32 s6, 0xffff
|
||||
; SI-NEXT: s_mov_b64 s[6:7], 0xffff
|
||||
; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
|
||||
; SI-NEXT: s_mov_b32 s8, 0x5050505
|
||||
; SI-NEXT: s_mov_b32 s1, s9
|
||||
|
@ -1100,14 +1097,13 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %
|
|||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
|
||||
; VI-NEXT: s_load_dword s6, s[4:5], 0x10
|
||||
; VI-NEXT: s_mov_b32 s7, 0
|
||||
; VI-NEXT: s_mov_b32 s3, 0x1100f000
|
||||
; VI-NEXT: s_mov_b32 s2, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0
|
||||
; VI-NEXT: s_mov_b32 s0, s8
|
||||
; VI-NEXT: s_lshl_b32 s8, s6, 3
|
||||
; VI-NEXT: s_mov_b32 s6, 0xffff
|
||||
; VI-NEXT: s_mov_b64 s[6:7], 0xffff
|
||||
; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
|
||||
; VI-NEXT: s_mov_b32 s8, 0x5050505
|
||||
; VI-NEXT: s_mov_b32 s1, s9
|
||||
|
|
|
@ -1584,8 +1584,7 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
|
|||
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
|
||||
; GFX9-NEXT: s_mov_b32 s3, 0
|
||||
; GFX9-NEXT: s_mov_b32 s2, 0xffff
|
||||
; GFX9-NEXT: s_mov_b64 s[2:3], 0xffff
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
|
||||
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3]
|
||||
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s6, s6
|
||||
|
@ -1607,9 +1606,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
|
|||
; VI-NEXT: flat_load_dword v4, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
||||
; VI-NEXT: s_mov_b32 s2, 0xffff
|
||||
; VI-NEXT: s_mov_b64 s[2:3], 0xffff
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: s_mov_b32 s3, 0
|
||||
; VI-NEXT: s_and_b32 s1, s4, s2
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
|
||||
; VI-NEXT: s_lshl_b32 s0, s1, 16
|
||||
|
@ -1635,8 +1633,7 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
|
|||
; CI-NEXT: flat_load_dword v4, v[0:1] glc
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
||||
; CI-NEXT: s_mov_b32 s3, 0
|
||||
; CI-NEXT: s_mov_b32 s2, 0xffff
|
||||
; CI-NEXT: s_mov_b64 s[2:3], 0xffff
|
||||
; CI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; CI-NEXT: s_lshl_b32 s1, s4, 16
|
||||
; CI-NEXT: s_and_b32 s4, s4, s2
|
||||
|
@ -1672,8 +1669,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
|
|||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
|
||||
; GFX9-NEXT: s_pack_ll_b32_b16 s5, s6, s6
|
||||
; GFX9-NEXT: s_mov_b32 s3, 0
|
||||
; GFX9-NEXT: s_mov_b32 s2, 0xffff
|
||||
; GFX9-NEXT: s_mov_b64 s[2:3], 0xffff
|
||||
; GFX9-NEXT: s_lshl_b32 s4, s7, 4
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s5
|
||||
|
@ -1694,9 +1690,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
|
|||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
||||
; VI-NEXT: s_mov_b32 s2, 0xffff
|
||||
; VI-NEXT: s_mov_b64 s[2:3], 0xffff
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: s_mov_b32 s3, 0
|
||||
; VI-NEXT: s_lshl_b32 s1, s5, 4
|
||||
; VI-NEXT: s_and_b32 s4, s4, s2
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
|
||||
|
@ -1722,10 +1717,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
|
|||
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
|
||||
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
||||
; CI-NEXT: s_mov_b32 s2, 0xffff
|
||||
; CI-NEXT: s_mov_b64 s[2:3], 0xffff
|
||||
; CI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; CI-NEXT: s_and_b32 s6, s4, s2
|
||||
; CI-NEXT: s_mov_b32 s3, 0
|
||||
; CI-NEXT: s_lshl_b32 s1, s5, 4
|
||||
; CI-NEXT: s_lshl_b32 s4, s4, 16
|
||||
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
|
||||

@ -7,7 +7,7 @@
; RUN: llc -O1 -mtriple=amdgcn--amdhsa -disable-verify -debug-pass=Structure < %s 2>&1 \
; RUN: | grep -v 'Verify generated machine code' | FileCheck -check-prefix=GCN-O1 %s
; RUN: llc -O1 -mtriple=amdgcn--amdhsa -disable-verify -amdgpu-scalar-ir-passes -amdgpu-sdwa-peephole \
; RUN: -amdgpu-load-store-vectorizer -debug-pass=Structure < %s 2>&1 \
; RUN: -amdgpu-load-store-vectorizer -amdgpu-enable-pre-ra-optimizations -debug-pass=Structure < %s 2>&1 \
; RUN: | grep -v 'Verify generated machine code' | FileCheck -check-prefix=GCN-O1-OPTS %s
; RUN: llc -O2 -mtriple=amdgcn--amdhsa -disable-verify -debug-pass=Structure < %s 2>&1 \
; RUN: | grep -v 'Verify generated machine code' | FileCheck -check-prefix=GCN-O2 %s
@ -619,6 +619,7 @@
; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
; GCN-O1-OPTS-NEXT: Simple Register Coalescing
; GCN-O1-OPTS-NEXT: Rename Disconnected Subregister Components
; GCN-O1-OPTS-NEXT: AMDGPU Pre-RA optimizations
; GCN-O1-OPTS-NEXT: Machine Instruction Scheduler
; GCN-O1-OPTS-NEXT: MachinePostDominator Tree Construction
; GCN-O1-OPTS-NEXT: SI Whole Quad Mode
@ -899,6 +900,7 @@
; GCN-O2-NEXT: Machine Natural Loop Construction
; GCN-O2-NEXT: Simple Register Coalescing
; GCN-O2-NEXT: Rename Disconnected Subregister Components
; GCN-O2-NEXT: AMDGPU Pre-RA optimizations
; GCN-O2-NEXT: Machine Instruction Scheduler
; GCN-O2-NEXT: MachinePostDominator Tree Construction
; GCN-O2-NEXT: SI Whole Quad Mode
@ -1193,6 +1195,7 @@
; GCN-O3-NEXT: Machine Natural Loop Construction
; GCN-O3-NEXT: Simple Register Coalescing
; GCN-O3-NEXT: Rename Disconnected Subregister Components
; GCN-O3-NEXT: AMDGPU Pre-RA optimizations
; GCN-O3-NEXT: Machine Instruction Scheduler
; GCN-O3-NEXT: MachinePostDominator Tree Construction
; GCN-O3-NEXT: SI Whole Quad Mode

@ -77,9 +77,9 @@ define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspa
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[0:1], s[6:7]
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s0, -1
; SI-NEXT: s_movk_i32 s7, 0xfc01
; SI-NEXT: s_mov_b32 s1, 0xfffff
; SI-NEXT: s_mov_b32 s0, -1
; SI-NEXT: s_brev_b32 s6, -2
; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
; SI-NEXT: s_waitcnt vmcnt(0)

@ -92,9 +92,8 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(<2 x float> addrspace(1)* %a) {
; GCN-LABEL: {{^}}fadd_v2_v_lit_hi0:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; GFX90A-DAG: s_mov_b32 s[[HI:[0-9]+]], 0
; GFX90A-DAG: s_mov_b32 s[[LO:[0-9]+]], 1.0
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s{{\[}}[[LO]]:[[HI]]]{{$}}
; GFX90A-DAG: s_mov_b64 [[K:s\[[0-9:]+\]]], 0x3f800000
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], [[K]]
define amdgpu_kernel void @fadd_v2_v_lit_hi0(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id

@ -103,8 +103,7 @@ entry:
; OPT-LABEL: define amdgpu_kernel void @half4_alloca_load4

; GCN-NOT: buffer_
; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0
; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff
; GCN: s_mov_b64 s[{{[0-9:]+}}], 0xffff

; OPT: %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
; OPT: %0 = load <4 x half>, <4 x half> addrspace(5)* %alloca
@ -163,8 +162,7 @@ entry:
; OPT-LABEL: define amdgpu_kernel void @short4_alloca_load4

; GCN-NOT: buffer_
; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0
; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff
; GCN: s_mov_b64 s[{{[0-9:]+}}], 0xffff

; OPT: %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
; OPT: %0 = load <4 x i16>, <4 x i16> addrspace(5)* %alloca

@ -0,0 +1,45 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s

; GCN-LABEL: {{^}}test_remat_sgpr:
; GCN-NOT: v_writelane_b32
; GCN: {{^}}[[LOOP:BB[0-9_]+]]:
; GCN-COUNT-6: s_mov_b32 s{{[0-9]+}}, 0x
; GCN-NOT: v_writelane_b32
; GCN: s_cbranch_{{[^ ]+}} [[LOOP]]
; GCN: .sgpr_spill_count: 0
define amdgpu_kernel void @test_remat_sgpr(double addrspace(1)* %arg, double addrspace(1)* %arg1) {
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb3

bb2: ; preds = %bb3
  ret void

bb3: ; preds = %bb3, %bb
  %i4 = phi i32 [ 0, %bb ], [ %i22, %bb3 ]
  %i5 = add nuw nsw i32 %i4, %i
  %i6 = zext i32 %i5 to i64
  %i7 = getelementptr inbounds double, double addrspace(1)* %arg, i64 %i6
  %i8 = load double, double addrspace(1)* %i7, align 8
  %i9 = fadd double %i8, 0x3EFC01997CC9E6B0
  %i10 = tail call double @llvm.fma.f64(double %i8, double %i9, double 0x3FBE25E43ABE935A)
  %i11 = tail call double @llvm.fma.f64(double %i10, double %i9, double 0x3FC110EF47E6C9C2)
  %i12 = tail call double @llvm.fma.f64(double %i11, double %i9, double 0x3FC3B13BCFA74449)
  %i13 = tail call double @llvm.fma.f64(double %i12, double %i9, double 0x3FC745D171BF3C30)
  %i14 = tail call double @llvm.fma.f64(double %i13, double %i9, double 0x3FCC71C71C7792CE)
  %i15 = tail call double @llvm.fma.f64(double %i14, double %i9, double 0x3FD24924924920DA)
  %i16 = tail call double @llvm.fma.f64(double %i15, double %i9, double 0x3FD999999999999C)
  %i17 = tail call double @llvm.fma.f64(double %i16, double %i9, double 0x3FD899999999899C)
  %i18 = tail call double @llvm.fma.f64(double %i17, double %i9, double 0x3FD799999999799C)
  %i19 = tail call double @llvm.fma.f64(double %i18, double %i9, double 0x3FD699999999699C)
  %i20 = tail call double @llvm.fma.f64(double %i19, double %i9, double 0x3FD599999999599C)
  %i21 = getelementptr inbounds double, double addrspace(1)* %arg1, i64 %i6
  store double %i19, double addrspace(1)* %i21, align 8
  %i22 = add nuw nsw i32 %i4, 1
  %i23 = icmp eq i32 %i22, 1024
  br i1 %i23, label %bb2, label %bb3
}

declare double @llvm.fma.f64(double, double, double)
declare i32 @llvm.amdgcn.workitem.id.x()

@ -1698,7 +1698,7 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5]
; GCN-IR-NEXT: s_mov_b32 s8, 0x8000
; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: v_mov_b32_e32 v6, s8
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5]
@ -1706,7 +1706,6 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GCN-IR-NEXT: v_mov_b32_e32 v3, v2
; GCN-IR-NEXT: s_mov_b32 s9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v7, v11
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
@ -1724,8 +1723,7 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB12_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: s_mov_b32 s5, 0
; GCN-IR-NEXT: s_mov_b32 s4, 0x8000
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
; GCN-IR-NEXT: v_lshr_b64 v[12:13], s[4:5], v8
; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc

@ -146,8 +146,7 @@ define i128 @v_lshr_i128_kv(i128 %rhs) {
; GCN-LABEL: v_lshr_i128_kv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_movk_i32 s4, 0x41
; GCN-NEXT: s_mov_b32 s5, 0
; GCN-NEXT: s_mov_b64 s[4:5], 0x41
; GCN-NEXT: v_lshr_b64 v[1:2], s[4:5], v0
; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0
; GCN-NEXT: v_mov_b32_e32 v3, s4

@ -1248,8 +1248,8 @@ define amdgpu_kernel void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrsp
|
|||
; SI-NEXT: s_mov_b32 s8, s6
|
||||
; SI-NEXT: s_mov_b32 s9, s7
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
|
||||
; SI-NEXT: s_movk_i32 s7, 0x11e
|
||||
; SI-NEXT: s_mov_b32 s6, 0xab19b207
|
||||
; SI-NEXT: s_movk_i32 s7, 0x11e
|
||||
; SI-NEXT: s_mov_b32 s0, s4
|
||||
; SI-NEXT: s_mov_b32 s1, s5
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
|
@ -1266,8 +1266,8 @@ define amdgpu_kernel void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrsp
|
|||
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
|
||||
; VI-NEXT: s_mov_b32 s4, s0
|
||||
; VI-NEXT: s_mov_b32 s5, s1
|
||||
; VI-NEXT: s_movk_i32 s1, 0x11e
|
||||
; VI-NEXT: s_mov_b32 s0, 0xab19b207
|
||||
; VI-NEXT: s_movk_i32 s1, 0x11e
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
|
@ -1319,8 +1319,7 @@ define amdgpu_kernel void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64
|
|||
; SI-NEXT: s_mov_b32 s8, s6
|
||||
; SI-NEXT: s_mov_b32 s9, s7
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
|
||||
; SI-NEXT: s_mov_b32 s7, 0
|
||||
; SI-NEXT: s_mov_b32 s6, 0x12d687
|
||||
; SI-NEXT: s_mov_b64 s[6:7], 0x12d687
|
||||
; SI-NEXT: s_mov_b32 s0, s4
|
||||
; SI-NEXT: s_mov_b32 s1, s5
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
|
@ -1337,8 +1336,7 @@ define amdgpu_kernel void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64
|
|||
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
|
||||
; VI-NEXT: s_mov_b32 s4, s0
|
||||
; VI-NEXT: s_mov_b32 s5, s1
|
||||
; VI-NEXT: s_mov_b32 s1, 0
|
||||
; VI-NEXT: s_mov_b32 s0, 0x12d687
|
||||
; VI-NEXT: s_mov_b64 s[0:1], 0x12d687
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
|
@ -1927,8 +1925,7 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(i64 addrspace(1)* %out,
|
|||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
|
||||
; SI-NEXT: s_load_dword s2, s[0:1], 0xd
|
||||
; SI-NEXT: s_mov_b32 s1, 0
|
||||
; SI-NEXT: s_mov_b32 s0, 4.0
|
||||
; SI-NEXT: s_mov_b64 s[0:1], 0x40800000
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
|
@ -1942,8 +1939,7 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(i64 addrspace(1)* %out,
|
|||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dword s2, s[0:1], 0x34
|
||||
; VI-NEXT: s_mov_b32 s1, 0
|
||||
; VI-NEXT: s_mov_b32 s0, 4.0
|
||||
; VI-NEXT: s_mov_b64 s[0:1], 0x40800000
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
|
@@ -2039,8 +2035,8 @@ define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(i64 addrspace(1)* %
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s1, 4.0
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_mov_b32 s1, 4.0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)

@@ -2054,8 +2050,8 @@ define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(i64 addrspace(1)* %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x34
; VI-NEXT: s_mov_b32 s1, 4.0
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_mov_b32 s1, 4.0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)

@@ -2090,8 +2086,8 @@ define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(i64 addrspace(1
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s1, -4.0
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_mov_b32 s1, -4.0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)

@@ -2105,8 +2101,8 @@ define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(i64 addrspace(1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x34
; VI-NEXT: s_mov_b32 s1, -4.0
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_mov_b32 s1, -4.0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)

@@ -589,13 +589,13 @@ endif:

; GCN-LABEL: {{^}}br_scc_eq_i64_simm16:
; VI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x4d2
; VI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0
; VI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 1
; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}

; SI: v_cmp_eq_u64_e32
define amdgpu_kernel void @br_scc_eq_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp eq i64 %cond, 1234
%cmp0 = icmp eq i64 %cond, 4294968530
br i1 %cmp0, label %endif, label %if

if:

@@ -627,13 +627,13 @@ endif:

; GCN-LABEL: {{^}}br_scc_ne_i64_simm16:
; VI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x4d2
; VI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0
; VI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 1
; VI: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}

; SI: v_cmp_ne_u64_e32
define amdgpu_kernel void @br_scc_ne_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp ne i64 %cond, 1234
%cmp0 = icmp ne i64 %cond, 4294968530
br i1 %cmp0, label %endif, label %if

if:

@@ -1876,14 +1876,13 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3]
; GCN-IR-NEXT: s_mov_b32 s8, 0x8000
; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: v_mov_b32_e32 v4, s8
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3]
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GCN-IR-NEXT: s_mov_b32 s9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, v9
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]

@@ -1901,8 +1900,7 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB12_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: s_mov_b32 s5, 0
; GCN-IR-NEXT: s_mov_b32 s4, 0x8000
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v6
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc

@@ -1249,14 +1249,13 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5]
; GCN-IR-NEXT: s_mov_b32 s8, 0x8000
; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: v_mov_b32_e32 v2, s8
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5]
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GCN-IR-NEXT: s_mov_b32 s9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v3, v9
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]

@@ -1274,8 +1273,7 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB9_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: s_mov_b32 s5, 0
; GCN-IR-NEXT: s_mov_b32 s4, 0x8000
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v6
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc

@@ -1269,14 +1269,13 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3]
; GCN-IR-NEXT: s_mov_b32 s8, 0x8000
; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: v_mov_b32_e32 v4, s8
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3]
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GCN-IR-NEXT: s_mov_b32 s9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, v9
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]

@@ -1294,8 +1293,7 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz BB8_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: s_mov_b32 s5, 0
; GCN-IR-NEXT: s_mov_b32 s4, 0x8000
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v6
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc

@@ -807,8 +807,8 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; GFX9-O3-NEXT: buffer_load_dwordx4 v[1:4], v0, s[4:7], 0 offen
; GFX9-O3-NEXT: buffer_load_dwordx2 v[5:6], v0, s[4:7], 0 offen offset:16
; GFX9-O3-NEXT: s_brev_b32 s9, -2
; GFX9-O3-NEXT: s_mov_b32 s8, -1
; GFX9-O3-NEXT: s_brev_b32 s9, -2
; GFX9-O3-NEXT: s_not_b64 exec, exec
; GFX9-O3-NEXT: s_waitcnt vmcnt(1)
; GFX9-O3-NEXT: v_mov_b32_e32 v1, s8