forked from OSchip/llvm-project
[AMDGPU] Preliminary patch for divergence-driven instruction selection. Immediate selection predicate changed.
Differential revision: https://reviews.llvm.org/D51734
Reviewers: rampitec
llvm-svn: 341928
This commit is contained in:
parent
ae3cfeb3ad
commit
db7ee7660a
|
@ -101,7 +101,7 @@ private:
|
|||
std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
|
||||
bool isNoNanSrc(SDValue N) const;
|
||||
bool isInlineImmediate(const SDNode *N) const;
|
||||
|
||||
bool isVGPRImm(const SDNode *N) const;
|
||||
bool isUniformBr(const SDNode *N) const;
|
||||
|
||||
MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
|
||||
|
@ -2068,6 +2068,56 @@ bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const {
|
|||
return isExtractHiElt(In, Src);
|
||||
}
|
||||
|
||||
// Decide whether node N has a use that strictly needs its value in a VGPR.
//
// Scans at most 10 uses of N and returns:
//   - false on pre-SOUTHERN_ISLANDS subtargets;
//   - false as soon as a use has an unknown or SGPR operand register class;
//   - true when a use is found whose operand class is not VS_32 and which
//     cannot be fixed by commuting the user so that N lands in a VS_32 slot;
//   - false otherwise (every inspected use accepts an SGPR, or the scan limit
//     was reached).
// NOTE(review): presumably this is the predicate behind the VGPRImm PatLeaf
// in the .td file - confirm against the TableGen side.
bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
|
||||
if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
|
||||
return false;
|
||||
}
|
||||
const SIRegisterInfo *SIRI =
|
||||
static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
|
||||
const SIInstrInfo * SII =
|
||||
static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
|
||||
|
||||
// Limit counts the uses inspected so far; the scan gives up after 10.
unsigned Limit = 0;
|
||||
bool AllUsesAcceptSReg = true;
|
||||
for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
|
||||
Limit < 10 && U != E; ++U, ++Limit) {
|
||||
const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
|
||||
|
||||
// If the register class is unknown, it could be an unknown
|
||||
// register class that needs to be an SGPR, e.g. an inline asm
|
||||
// constraint
|
||||
if (!RC || SIRI->isSGPRClass(RC))
|
||||
return false;
|
||||
|
||||
// An operand class other than VS_32 is treated as not accepting an SGPR,
// unless commuting the user would move N into a VS_32 operand slot.
if (RC != &AMDGPU::VS_32RegClass) {
|
||||
AllUsesAcceptSReg = false;
|
||||
SDNode * User = *U;
|
||||
if (User->isMachineOpcode()) {
|
||||
unsigned Opc = User->getMachineOpcode();
|
||||
MCInstrDesc Desc = SII->get(Opc);
|
||||
if (Desc.isCommutable()) {
|
||||
// Shift the SDNode operand number by the number of defs to get the index
// passed to findCommutedOpIndices; the reverse shift is applied below.
unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
|
||||
unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
|
||||
if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
|
||||
unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
|
||||
const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
|
||||
if (CommutedRC == &AMDGPU::VS_32RegClass)
|
||||
AllUsesAcceptSReg = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
// If "AllUsesAcceptSReg == false" so far we haven't succeeded in
|
||||
// commuting the current user. This means we have at least one use
|
||||
// that strictly requires a VGPR. Thus, we will not attempt to commute
|
||||
// other user instructions.
|
||||
if (!AllUsesAcceptSReg)
|
||||
break;
|
||||
}
|
||||
}
|
||||
// AllUsesAcceptSReg can only be false here if the loop exited via the break
// above, and on that path Limit has not yet reached 10.
return !AllUsesAcceptSReg && (Limit < 10);
|
||||
}
|
||||
|
||||
|
||||
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
|
||||
const AMDGPUTargetLowering& Lowering =
|
||||
*static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
|
||||
|
|
|
@ -1421,10 +1421,15 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
|
|||
// TargetInstrInfo::commuteInstruction uses it.
|
||||
bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
|
||||
unsigned &SrcOpIdx1) const {
|
||||
if (!MI.isCommutable())
|
||||
return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
|
||||
}
|
||||
|
||||
bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0,
|
||||
unsigned &SrcOpIdx1) const {
|
||||
if (!Desc.isCommutable())
|
||||
return false;
|
||||
|
||||
unsigned Opc = MI.getOpcode();
|
||||
unsigned Opc = Desc.getOpcode();
|
||||
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
|
||||
if (Src0Idx == -1)
|
||||
return false;
|
||||
|
|
|
@ -227,6 +227,9 @@ public:
|
|||
bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
|
||||
unsigned &SrcOpIdx2) const override;
|
||||
|
||||
bool findCommutedOpIndices(MCInstrDesc Desc, unsigned & SrcOpIdx0,
|
||||
unsigned & SrcOpIdx1) const;
|
||||
|
||||
bool isBranchOffsetInRange(unsigned BranchOpc,
|
||||
int64_t BrOffset) const override;
|
||||
|
||||
|
|
|
@ -495,24 +495,7 @@ class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{
|
|||
}]>;
|
||||
|
||||
class VGPRImm <dag frag> : PatLeaf<frag, [{
|
||||
if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
|
||||
return false;
|
||||
}
|
||||
const SIRegisterInfo *SIRI =
|
||||
static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
|
||||
unsigned Limit = 0;
|
||||
for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
|
||||
Limit < 10 && U != E; ++U, ++Limit) {
|
||||
const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
|
||||
|
||||
// If the register class is unknown, it could be an unknown
|
||||
// register class that needs to be an SGPR, e.g. an inline asm
|
||||
// constraint
|
||||
if (!RC || SIRI->isSGPRClass(RC))
|
||||
return false;
|
||||
}
|
||||
|
||||
return Limit < 10;
|
||||
return isVGPRImm(N);
|
||||
}]>;
|
||||
|
||||
def NegateImm : SDNodeXForm<imm, [{
|
||||
|
|
|
@ -232,8 +232,10 @@ for.end:
|
|||
; SI-ALLOCA: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
|
||||
|
||||
; SI-PROMOTE-VECT: s_load_dword [[IDX:s[0-9]+]]
|
||||
; SI-PROMOTE-VECT: s_mov_b32 [[SREG:s[0-9]+]], 0x10000
|
||||
; SI-PROMOTE-VECT: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 4
|
||||
; SI-PROMOTE-VECT: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[SCALED_IDX]], 16
|
||||
; SI-PROMOTE-VECT: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SCALED_IDX]]
|
||||
; SI-PROMOTE-VECT: v_bfe_u32 v{{[0-9]+}}, [[SREG]], [[VREG]], 16
|
||||
define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
|
||||
entry:
|
||||
%0 = alloca [2 x i16], addrspace(5)
|
||||
|
|
|
@ -54,7 +54,7 @@ define amdgpu_kernel void @v_clamp_negabs_f32(float addrspace(1)* %out, float ad
|
|||
; GCN-LABEL: {{^}}v_clamp_negzero_f32:
|
||||
; GCN-DAG: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, [[A]]
|
||||
; GCN-DAG: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1
|
||||
; GCN-DAG: s_brev_b32 [[SIGNBIT:s[0-9]+]], 1
|
||||
; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], [[SIGNBIT]], 1.0
|
||||
define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
|
|
|
@ -34,8 +34,8 @@ define amdgpu_kernel void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspa
|
|||
|
||||
; FIXME: Why isn't this being folded as a constant?
|
||||
; GCN-LABEL: {{^}}commute_ne_litk_i32:
|
||||
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3039
|
||||
; GCN: v_cmp_ne_u32_e32 vcc, v{{[0-9]+}}, [[K]]
|
||||
; GCN: s_movk_i32 [[K:s[0-9]+]], 0x3039
|
||||
; GCN: v_cmp_ne_u32_e32 vcc, [[K]], v{{[0-9]+}}
|
||||
define amdgpu_kernel void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
|
||||
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
|
||||
|
@ -100,8 +100,8 @@ define amdgpu_kernel void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrsp
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}commute_ule_64_i32:
|
||||
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x41{{$}}
|
||||
; GCN: v_cmp_lt_u32_e32 vcc, v{{[0-9]+}}, [[K]]
|
||||
; GCN: s_movk_i32 [[K:s[0-9]+]], 0x41{{$}}
|
||||
; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}}
|
||||
define amdgpu_kernel void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
|
||||
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
|
||||
|
|
|
@ -50,7 +50,7 @@ define amdgpu_kernel void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, fl
|
|||
; FIXME: Should use SGPR for literal.
|
||||
; FUNC-LABEL: @commute_add_lit_fabs_f32
|
||||
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000
|
||||
; SI: s_mov_b32 [[K:s[0-9]+]], 0x44800000
|
||||
; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, [[K]]
|
||||
; SI: buffer_store_dword [[REG]]
|
||||
define amdgpu_kernel void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
|
||||
|
|
|
@ -174,7 +174,8 @@ end:
|
|||
; GCN: s_mov_b32 m0, -1
|
||||
; GCN: ds_read_b32 [[LOAD0:v[0-9]+]]
|
||||
|
||||
; GCN: v_cmp_ne_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0,
|
||||
; GCN: s_mov_b32 [[ZERO:s[0-9]+]], 0
|
||||
; GCN: v_cmp_ne_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], [[ZERO]], v0
|
||||
|
||||
; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec
|
||||
; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, [[CMP0]]
|
||||
|
|
|
@ -103,7 +103,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i
|
|||
; GCN-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
|
||||
; GCN-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
|
||||
; GCN-DAG: s_flbit_i32_b32 [[FFBH_HI:s[0-9]+]], s[[HI]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[FFBH_LO]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[ADD]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[VFFBH_HI:v[0-9]+]], [[FFBH_HI]]
|
||||
; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
|
||||
; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
|
||||
|
|
|
@ -227,8 +227,8 @@ define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)*
|
|||
|
||||
; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal:
|
||||
; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
|
||||
; GCN-DAG: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
|
||||
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
|
||||
; GCN-DAG: s_mov_b32 [[LIT:s[0-9]+]], 0x1869f
|
||||
; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
|
||||
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
|
||||
; GCN: buffer_store_dword [[RESULT]],
|
||||
; GCN: s_endpgm
|
||||
|
|
|
@ -227,9 +227,9 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(i16 addrspace(1)*
|
|||
|
||||
; FUNC-LABEL: {{^}}v_ctpop_i16_add_literal:
|
||||
; GCN-DAG: {{buffer|flat}}_load_ushort [[VAL:v[0-9]+]],
|
||||
; SI-DAG: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x3e7
|
||||
; SI-DAG: s_movk_i32 [[LIT:s[0-9]+]], 0x3e7
|
||||
; VI-DAG: s_movk_i32 [[LIT:s[0-9]+]], 0x3e7
|
||||
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
|
||||
; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
|
||||
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
|
||||
; GCN: buffer_store_short [[RESULT]],
|
||||
; GCN: s_endpgm
|
||||
|
|
|
@ -2,15 +2,18 @@
|
|||
|
||||
; GCN-LABEL: {{^}}eq_t:
|
||||
; GCN-DAG: s_load_dword [[X:s[0-9]+]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
|
||||
; GCN: v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], [[ONE]]{{$}}
|
||||
; GCN-DAG: s_mov_b32 [[SONE:s[0-9]+]], 1.0
|
||||
; GCN-DAG: v_mov_b32_e32 [[VONE:v[0-9]+]], [[SONE]]
|
||||
; GCN: v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], [[VONE]]{{$}}
|
||||
; GCN-NOT: 0xddd5
|
||||
; GCN-NOT: v_cndmask_b32
|
||||
; GCN-NOT: v_cmp_eq_u32
|
||||
; GCN-NOT: v_cndmask_b32
|
||||
; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
|
||||
; GCN-DAG: v_mov_b32_e32 [[FOUR:v[0-9]+]], 4.0
|
||||
; GCN: v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], [[TWO]], [[FOUR]], [[CC]]
|
||||
; GCN-DAG: s_mov_b32 [[STWO:s[0-9]+]], 2.0
|
||||
; GCN-DAG: v_mov_b32_e32 [[VTWO:v[0-9]+]], [[STWO]]
|
||||
; GCN-DAG: s_mov_b32 [[SFOUR:s[0-9]+]], 4.0
|
||||
; GCN-DAG: v_mov_b32_e32 [[VFOUR:v[0-9]+]], [[SFOUR]]
|
||||
; GCN: v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], [[VTWO]], [[VFOUR]], [[CC]]
|
||||
; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
|
||||
define amdgpu_kernel void @eq_t(float %x) {
|
||||
%c1 = fcmp olt float %x, 1.0
|
||||
|
@ -23,15 +26,18 @@ define amdgpu_kernel void @eq_t(float %x) {
|
|||
|
||||
; GCN-LABEL: {{^}}ne_t:
|
||||
; GCN-DAG: s_load_dword [[X:s[0-9]+]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
|
||||
; GCN: v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], [[ONE]]{{$}}
|
||||
; GCN-DAG: s_mov_b32 [[SONE:s[0-9]+]], 1.0
|
||||
; GCN-DAG: v_mov_b32_e32 [[VONE:v[0-9]+]], [[SONE]]
|
||||
; GCN: v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], [[VONE]]{{$}}
|
||||
; GCN-NOT: 0xddd5
|
||||
; GCN-NOT: v_cndmask_b32
|
||||
; GCN-NOT: v_cmp_eq_u32
|
||||
; GCN-NOT: v_cndmask_b32
|
||||
; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
|
||||
; GCN-DAG: v_mov_b32_e32 [[FOUR:v[0-9]+]], 4.0
|
||||
; GCN: v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], [[FOUR]], [[TWO]], [[CC]]
|
||||
; GCN-DAG: s_mov_b32 [[STWO:s[0-9]+]], 2.0
|
||||
; GCN-DAG: v_mov_b32_e32 [[VTWO:v[0-9]+]], [[STWO]]
|
||||
; GCN-DAG: s_mov_b32 [[SFOUR:s[0-9]+]], 4.0
|
||||
; GCN-DAG: v_mov_b32_e32 [[VFOUR:v[0-9]+]], [[SFOUR]]
|
||||
; GCN: v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], [[VFOUR]], [[VTWO]], [[CC]]
|
||||
; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
|
||||
define amdgpu_kernel void @ne_t(float %x) {
|
||||
%c1 = fcmp olt float %x, 1.0
|
||||
|
@ -44,15 +50,18 @@ define amdgpu_kernel void @ne_t(float %x) {
|
|||
|
||||
; GCN-LABEL: {{^}}eq_f:
|
||||
; GCN-DAG: s_load_dword [[X:s[0-9]+]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
|
||||
; GCN: v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], [[ONE]]{{$}}
|
||||
; GCN-DAG: s_mov_b32 [[SONE:s[0-9]+]], 1.0
|
||||
; GCN-DAG: v_mov_b32_e32 [[VONE:v[0-9]+]], [[SONE]]
|
||||
; GCN: v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], [[VONE]]{{$}}
|
||||
; GCN-NOT: 0xddd5
|
||||
; GCN-NOT: v_cndmask_b32
|
||||
; GCN-NOT: v_cmp_eq_u32
|
||||
; GCN-NOT: v_cndmask_b32
|
||||
; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
|
||||
; GCN-DAG: v_mov_b32_e32 [[FOUR:v[0-9]+]], 4.0
|
||||
; GCN: v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], [[FOUR]], [[TWO]], [[CC]]
|
||||
; GCN-DAG: s_mov_b32 [[STWO:s[0-9]+]], 2.0
|
||||
; GCN-DAG: v_mov_b32_e32 [[VTWO:v[0-9]+]], [[STWO]]
|
||||
; GCN-DAG: s_mov_b32 [[SFOUR:s[0-9]+]], 4.0
|
||||
; GCN-DAG: v_mov_b32_e32 [[VFOUR:v[0-9]+]], [[SFOUR]]
|
||||
; GCN: v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], [[VFOUR]], [[VTWO]], [[CC]]
|
||||
; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
|
||||
define amdgpu_kernel void @eq_f(float %x) {
|
||||
%c1 = fcmp olt float %x, 1.0
|
||||
|
@ -65,15 +74,18 @@ define amdgpu_kernel void @eq_f(float %x) {
|
|||
|
||||
; GCN-LABEL: {{^}}ne_f:
|
||||
; GCN-DAG: s_load_dword [[X:s[0-9]+]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
|
||||
; GCN: v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], [[ONE]]{{$}}
|
||||
; GCN-DAG: s_mov_b32 [[SONE:s[0-9]+]], 1.0
|
||||
; GCN-DAG: v_mov_b32_e32 [[VONE:v[0-9]+]], [[SONE]]
|
||||
; GCN: v_cmp_lt_f32_e{{32|64}} [[CC:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], [[VONE]]{{$}}
|
||||
; GCN-NOT: 0xddd5
|
||||
; GCN-NOT: v_cndmask_b32
|
||||
; GCN-NOT: v_cmp_eq_u32
|
||||
; GCN-NOT: v_cndmask_b32
|
||||
; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
|
||||
; GCN-DAG: v_mov_b32_e32 [[FOUR:v[0-9]+]], 4.0
|
||||
; GCN: v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], [[TWO]], [[FOUR]], [[CC]]
|
||||
; GCN-DAG: s_mov_b32 [[STWO:s[0-9]+]], 2.0
|
||||
; GCN-DAG: v_mov_b32_e32 [[VTWO:v[0-9]+]], [[STWO]]
|
||||
; GCN-DAG: s_mov_b32 [[SFOUR:s[0-9]+]], 4.0
|
||||
; GCN-DAG: v_mov_b32_e32 [[VFOUR:v[0-9]+]], [[SFOUR]]
|
||||
; GCN: v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], [[VTWO]], [[VFOUR]], [[CC]]
|
||||
; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
|
||||
define amdgpu_kernel void @ne_f(float %x) {
|
||||
%c1 = fcmp olt float %x, 1.0
|
||||
|
@ -86,13 +98,16 @@ define amdgpu_kernel void @ne_f(float %x) {
|
|||
|
||||
; GCN-LABEL: {{^}}different_constants:
|
||||
; GCN-DAG: s_load_dword [[X:s[0-9]+]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
|
||||
; GCN-DAG: v_cmp_lt_f32_e{{32|64}} [[CC1:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], [[ONE]]{{$}}
|
||||
; GCN-DAG: s_mov_b32 [[SONE:s[0-9]+]], 1.0
|
||||
; GCN-DAG: v_mov_b32_e32 [[VONE:v[0-9]+]], [[SONE]]
|
||||
; GCN-DAG: v_cmp_lt_f32_e{{32|64}} [[CC1:s\[[0-9]+:[0-9]+\]|vcc]], [[X]], [[VONE]]{{$}}
|
||||
; GCN-DAG: v_cndmask_b32_e{{32|64}} [[CND1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, [[CC1]]
|
||||
; GCN-DAG: v_cmp_eq_u32_e{{32|64}} [[CC2:s\[[0-9]+:[0-9]+\]|vcc]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
|
||||
; GCN-DAG: v_mov_b32_e32 [[FOUR:v[0-9]+]], 4.0
|
||||
; GCN: v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], [[TWO]], [[FOUR]], [[CC2]]
|
||||
; GCN-DAG: v_cmp_eq_u32_e{{32|64}} [[CC2:s\[[0-9]+:[0-9]+\]|vcc]], s{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
||||
; GCN-DAG: s_mov_b32 [[STWO:s[0-9]+]], 2.0
|
||||
; GCN-DAG: v_mov_b32_e32 [[VTWO:v[0-9]+]], [[STWO]]
|
||||
; GCN-DAG: s_mov_b32 [[SFOUR:s[0-9]+]], 4.0
|
||||
; GCN-DAG: v_mov_b32_e32 [[VFOUR:v[0-9]+]], [[SFOUR]]
|
||||
; GCN: v_cndmask_b32_e{{32|64}} [[RES:v[0-9]+]], [[VTWO]], [[VFOUR]], [[CC2]]
|
||||
; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
|
||||
define amdgpu_kernel void @different_constants(float %x) {
|
||||
%c1 = fcmp olt float %x, 1.0
|
||||
|
|
|
@ -121,7 +121,8 @@ define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(float addrspac
|
|||
}
|
||||
|
||||
; GCN-LABEL: test_fold_canonicalize_fma_value_f32:
|
||||
; GCN: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN: s_mov_b32 [[SREG:s[0-9]+]], 0x41700000
|
||||
; GCN: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SREG]], [[SREG]]
|
||||
; GCN-NOT: v_mul
|
||||
; GCN-NOT: v_max
|
||||
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
|
||||
|
@ -136,7 +137,8 @@ define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(
|
|||
}
|
||||
|
||||
; GCN-LABEL: test_fold_canonicalize_fmad_ftz_value_f32:
|
||||
; GCN: v_mad_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN: s_mov_b32 [[SGPR:s[0-9]+]], 0x41700000
|
||||
; GCN: v_mad_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SGPR]], [[SGPR]]
|
||||
; GCN-NOT: v_mul
|
||||
; GCN-NOT: v_max
|
||||
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
|
||||
|
@ -152,10 +154,12 @@ define amdgpu_kernel void @test_fold_canonicalize_fmad_ftz_value_f32(float addrs
|
|||
|
||||
; GCN-LABEL: test_fold_canonicalize_fmuladd_value_f32:
|
||||
; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GFX9-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM: s_mov_b32 [[SREG:s[0-9]+]], 0x41700000
|
||||
; GCN-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SREG]], [[SREG]]
|
||||
; GCN-NOT: v_mul
|
||||
; GCN-NOT: v_max
|
||||
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}],
|
||||
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
|
||||
; GCN-NOT: 1.0
|
||||
define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(float addrspace(1)* %arg) {
|
||||
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
|
||||
|
|
|
@ -85,20 +85,20 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(float addrspace(1)* %arg
|
|||
|
||||
; GCN-LABEL: {{^}}div_v4_1_by_x_25ulp:
|
||||
; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
|
||||
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
|
||||
; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
|
||||
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
|
||||
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_rcp_f32_e32
|
||||
; GCN-DENORM-DAG: v_rcp_f32_e32
|
||||
; GCN-DENORM-DAG: v_rcp_f32_e32
|
||||
|
@ -121,20 +121,20 @@ define amdgpu_kernel void @div_v4_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}div_v4_minus_1_by_x_25ulp:
|
||||
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
|
||||
; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
|
||||
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
|
||||
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_rcp_f32_e32
|
||||
; GCN-DENORM-DAG: v_rcp_f32_e32
|
||||
; GCN-DENORM-DAG: v_rcp_f32_e32
|
||||
|
@ -156,20 +156,20 @@ define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(<4 x float> addrspace(1)* %
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}div_v4_1_by_minus_x_25ulp:
|
||||
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
|
||||
; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
|
||||
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
|
||||
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_rcp_f32_e32
|
||||
; GCN-DENORM-DAG: v_rcp_f32_e32
|
||||
; GCN-DENORM-DAG: v_rcp_f32_e32
|
||||
|
@ -194,20 +194,20 @@ define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %
|
|||
|
||||
; GCN-LABEL: {{^}}div_v4_minus_1_by_minus_x_25ulp:
|
||||
; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
|
||||
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
|
||||
; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
|
||||
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
|
||||
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
|
||||
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_rcp_f32_e32
|
||||
; GCN-DENORM-DAG: v_rcp_f32_e32
|
||||
; GCN-DENORM-DAG: v_rcp_f32_e32
|
||||
|
@ -231,7 +231,7 @@ define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(<4 x float> addrspace
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}div_v4_c_by_x_25ulp:
|
||||
; GCN-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
|
||||
; GCN-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
|
||||
; GCN-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
|
||||
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
|
||||
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
|
||||
|
@ -240,13 +240,13 @@ define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(<4 x float> addrspace
|
|||
; GCN-DENORM-DAG: v_rcp_f32_e32
|
||||
; GCN-DENORM-DAG: v_rcp_f32_e32
|
||||
|
||||
; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
|
||||
; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
|
||||
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
|
||||
; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
|
||||
; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
|
||||
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
|
||||
|
||||
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]]
|
||||
; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}}
|
||||
|
@ -273,7 +273,7 @@ define amdgpu_kernel void @div_v4_c_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}div_v4_c_by_minus_x_25ulp:
|
||||
; GCN-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
|
||||
; GCN-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
|
||||
; GCN-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
|
||||
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
|
||||
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
|
||||
|
@ -282,13 +282,13 @@ define amdgpu_kernel void @div_v4_c_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
|
|||
; GCN-DENORM-DAG: v_rcp_f32_e32
|
||||
; GCN-DENORM-DAG: v_rcp_f32_e32
|
||||
|
||||
; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
|
||||
; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
|
||||
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
|
||||
; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
|
||||
; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
|
||||
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
|
||||
|
||||
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}}
|
||||
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]]
|
||||
; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}}
|
||||
|
@ -324,7 +324,7 @@ define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(<4 x float> addrspace(1)* %
|
|||
; GCN-DENORM: v_div_fmas_f32
|
||||
; GCN-DENORM: v_div_fixup_f32 [[OUT:v[0-9]+]],
|
||||
|
||||
; GCN-FLUSF-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
|
||||
; GCN-FLUSH-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
|
||||
; GCN-FLUSH-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
|
||||
; GCN-FLUSH-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
|
||||
; GCN-FLUSH-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
;RUN: llc -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefix=SI %s
|
||||
;RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=VI %s
|
||||
;RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
|
||||
;RUN: llc -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
|
||||
;RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
|
||||
;RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s
|
||||
|
||||
define float @v_exp_f32(float %arg0) {
|
||||
; SI-LABEL: v_exp_f32:
|
||||
|
@ -29,75 +29,32 @@ define float @v_exp_f32(float %arg0) {
|
|||
}
|
||||
|
||||
define <2 x float> @v_exp_v2f32(<2 x float> %arg0) {
|
||||
; SI-LABEL: v_exp_v2f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b
|
||||
; SI-NEXT: v_mul_f32_e32 v0, v0, v2
|
||||
; SI-NEXT: v_mul_f32_e32 v1, v1, v2
|
||||
; SI-NEXT: v_exp_f32_e32 v0, v0
|
||||
; SI-NEXT: v_exp_f32_e32 v1, v1
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; VI-LABEL: v_exp_v2f32:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b
|
||||
; VI-NEXT: v_mul_f32_e32 v0, v0, v2
|
||||
; VI-NEXT: v_mul_f32_e32 v1, v1, v2
|
||||
; VI-NEXT: v_exp_f32_e32 v0, v0
|
||||
; VI-NEXT: v_exp_f32_e32 v1, v1
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_exp_v2f32:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b
|
||||
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
|
||||
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v2
|
||||
; GFX9-NEXT: v_exp_f32_e32 v0, v0
|
||||
; GFX9-NEXT: v_exp_f32_e32 v1, v1
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
; GCN-LABEL: v_exp_v2f32:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b32 [[SREG:s[0-9]+]], 0x3fb8aa3b
|
||||
; GCN-NEXT: v_mul_f32_e32 v{{[0-9]+}}, [[SREG]], v{{[0-9]+}}
|
||||
; GCN-NEXT: v_mul_f32_e32 v{{[0-9]+}}, [[SREG]], v{{[0-9]+}}
|
||||
; GCN-NEXT: v_exp_f32_e32 v0, v0
|
||||
; GCN-NEXT: v_exp_f32_e32 v1, v1
|
||||
; GCN-NEXT: s_setpc_b64 s[30:31]
|
||||
%result = call <2 x float> @llvm.exp.v2f32(<2 x float> %arg0)
|
||||
ret <2 x float> %result
|
||||
}
|
||||
|
||||
define <3 x float> @v_exp_v3f32(<3 x float> %arg0) {
|
||||
; SI-LABEL: v_exp_v3f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b
|
||||
; SI-NEXT: v_mul_f32_e32 v0, v0, v3
|
||||
; SI-NEXT: v_mul_f32_e32 v1, v1, v3
|
||||
; SI-NEXT: v_mul_f32_e32 v2, v2, v3
|
||||
; SI-NEXT: v_exp_f32_e32 v0, v0
|
||||
; SI-NEXT: v_exp_f32_e32 v1, v1
|
||||
; SI-NEXT: v_exp_f32_e32 v2, v2
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
; GCN-LABEL: v_exp_v3f32:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b32 [[SREG:s[0-9]+]], 0x3fb8aa3b
|
||||
; GCN-NEXT: v_mul_f32_e32 v{{[0-9]+}}, [[SREG]], v{{[0-9]+}}
|
||||
; GCN-NEXT: v_mul_f32_e32 v{{[0-9]+}}, [[SREG]], v{{[0-9]+}}
|
||||
; GCN-NEXT: v_mul_f32_e32 v{{[0-9]+}}, [[SREG]], v{{[0-9]+}}
|
||||
; GCN-NEXT: v_exp_f32_e32 v0, v0
|
||||
; GCN-NEXT: v_exp_f32_e32 v1, v1
|
||||
; GCN-NEXT: v_exp_f32_e32 v2, v2
|
||||
; GCN-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; VI-LABEL: v_exp_v3f32:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b
|
||||
; VI-NEXT: v_mul_f32_e32 v0, v0, v3
|
||||
; VI-NEXT: v_mul_f32_e32 v1, v1, v3
|
||||
; VI-NEXT: v_mul_f32_e32 v2, v2, v3
|
||||
; VI-NEXT: v_exp_f32_e32 v0, v0
|
||||
; VI-NEXT: v_exp_f32_e32 v1, v1
|
||||
; VI-NEXT: v_exp_f32_e32 v2, v2
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_exp_v3f32:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b
|
||||
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v3
|
||||
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
|
||||
; GFX9-NEXT: v_mul_f32_e32 v2, v2, v3
|
||||
; GFX9-NEXT: v_exp_f32_e32 v0, v0
|
||||
; GFX9-NEXT: v_exp_f32_e32 v1, v1
|
||||
; GFX9-NEXT: v_exp_f32_e32 v2, v2
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
%result = call <3 x float> @llvm.exp.v3f32(<3 x float> %arg0)
|
||||
ret <3 x float> %result
|
||||
}
|
||||
|
@ -106,44 +63,16 @@ define <4 x float> @v_exp_v4f32(<4 x float> %arg0) {
|
|||
; SI-LABEL: v_exp_v4f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v4, 0x3fb8aa3b
|
||||
; SI-NEXT: v_mul_f32_e32 v0, v0, v4
|
||||
; SI-NEXT: v_mul_f32_e32 v1, v1, v4
|
||||
; SI-NEXT: v_mul_f32_e32 v2, v2, v4
|
||||
; SI-NEXT: v_mul_f32_e32 v3, v3, v4
|
||||
; SI-NEXT: s_mov_b32 [[SREG:s[0-9]+]], 0x3fb8aa3b
|
||||
; SI-NEXT: v_mul_f32_e32 v0, [[SREG]], v0
|
||||
; SI-NEXT: v_mul_f32_e32 v1, [[SREG]], v1
|
||||
; SI-NEXT: v_mul_f32_e32 v2, [[SREG]], v2
|
||||
; SI-NEXT: v_mul_f32_e32 v3, [[SREG]], v3
|
||||
; SI-NEXT: v_exp_f32_e32 v0, v0
|
||||
; SI-NEXT: v_exp_f32_e32 v1, v1
|
||||
; SI-NEXT: v_exp_f32_e32 v2, v2
|
||||
; SI-NEXT: v_exp_f32_e32 v3, v3
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; VI-LABEL: v_exp_v4f32:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v4, 0x3fb8aa3b
|
||||
; VI-NEXT: v_mul_f32_e32 v0, v0, v4
|
||||
; VI-NEXT: v_mul_f32_e32 v1, v1, v4
|
||||
; VI-NEXT: v_mul_f32_e32 v2, v2, v4
|
||||
; VI-NEXT: v_mul_f32_e32 v3, v3, v4
|
||||
; VI-NEXT: v_exp_f32_e32 v0, v0
|
||||
; VI-NEXT: v_exp_f32_e32 v1, v1
|
||||
; VI-NEXT: v_exp_f32_e32 v2, v2
|
||||
; VI-NEXT: v_exp_f32_e32 v3, v3
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_exp_v4f32:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, 0x3fb8aa3b
|
||||
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4
|
||||
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v4
|
||||
; GFX9-NEXT: v_mul_f32_e32 v2, v2, v4
|
||||
; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4
|
||||
; GFX9-NEXT: v_exp_f32_e32 v0, v0
|
||||
; GFX9-NEXT: v_exp_f32_e32 v1, v1
|
||||
; GFX9-NEXT: v_exp_f32_e32 v2, v2
|
||||
; GFX9-NEXT: v_exp_f32_e32 v3, v3
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
%result = call <4 x float> @llvm.exp.v4f32(<4 x float> %arg0)
|
||||
ret <4 x float> %result
|
||||
}
|
||||
|
@ -181,11 +110,11 @@ define <2 x half> @v_exp_v2f16(<2 x half> %arg0) {
|
|||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b
|
||||
; SI-NEXT: s_mov_b32 [[SREG:s[0-9]+]], 0x3fb8aa3b
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; SI-NEXT: v_mul_f32_e32 v0, v0, v2
|
||||
; SI-NEXT: v_mul_f32_e32 v1, v1, v2
|
||||
; SI-NEXT: v_mul_f32_e32 v{{[0-9]+}}, [[SREG]], v{{[0-9]+}}
|
||||
; SI-NEXT: v_mul_f32_e32 v{{[0-9]+}}, [[SREG]], v{{[0-9]+}}
|
||||
; SI-NEXT: v_exp_f32_e32 v0, v0
|
||||
; SI-NEXT: v_exp_f32_e32 v1, v1
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
|
@ -193,19 +122,20 @@ define <2 x half> @v_exp_v2f16(<2 x half> %arg0) {
|
|||
; VI-LABEL: v_exp_v2f16:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, 0x3dc5
|
||||
; VI-NEXT: v_mul_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NEXT: v_mul_f16_e32 v0, v0, v1
|
||||
; VI-NEXT: v_exp_f16_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; VI-NEXT: v_exp_f16_e32 v0, v0
|
||||
; VI-NEXT: v_or_b32_e32 v0, v0, v2
|
||||
; VI-NEXT: s_movk_i32 [[SREG:s[0-9]+]], 0x3dc5
|
||||
; VI-NEXT: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
|
||||
; VI-NEXT: v_mul_f16_sdwa [[MUL1:v[0-9]+]], v{{[0-9]+}}, [[VREG]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NEXT: v_mul_f16_e32 [[MUL2:v[0-9]+]], [[SREG]], v{{[0-9]+}}
|
||||
; VI-NEXT: v_exp_f16_sdwa [[MUL1]], [[MUL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; VI-NEXT: v_exp_f16_e32 [[MUL2]], [[MUL2]]
|
||||
; VI-NEXT: v_or_b32_e32 v{{[0-9]+}}, [[MUL2]], [[MUL1]]
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_exp_v2f16:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3dc5
|
||||
; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 op_sel_hi:[1,0]
|
||||
; GFX9-NEXT: s_movk_i32 [[SREG:s[0-9]+]], 0x3dc5
|
||||
; GFX9-NEXT: v_pk_mul_f16 v0, v0, [[SREG]] op_sel_hi:[1,0]
|
||||
; GFX9-NEXT: v_exp_f16_e32 v1, v0
|
||||
; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
|
@ -228,15 +158,15 @@ define <4 x half> @v_exp_v4f16(<4 x half> %arg0) {
|
|||
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v4, 0x3fb8aa3b
|
||||
; SI-NEXT: s_mov_b32 [[SREG:s[0-9]+]], 0x3fb8aa3b
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; SI-NEXT: v_mul_f32_e32 v0, v0, v4
|
||||
; SI-NEXT: v_mul_f32_e32 v1, v1, v4
|
||||
; SI-NEXT: v_mul_f32_e32 v2, v2, v4
|
||||
; SI-NEXT: v_mul_f32_e32 v3, v3, v4
|
||||
; SI-NEXT: v_mul_f32_e32 v0, [[SREG]], v0
|
||||
; SI-NEXT: v_mul_f32_e32 v1, [[SREG]], v1
|
||||
; SI-NEXT: v_mul_f32_e32 v2, [[SREG]], v2
|
||||
; SI-NEXT: v_mul_f32_e32 v3, [[SREG]], v3
|
||||
; SI-NEXT: v_exp_f32_e32 v0, v0
|
||||
; SI-NEXT: v_exp_f32_e32 v1, v1
|
||||
; SI-NEXT: v_exp_f32_e32 v2, v2
|
||||
|
@ -246,36 +176,37 @@ define <4 x half> @v_exp_v4f16(<4 x half> %arg0) {
|
|||
; VI-LABEL: v_exp_v4f16:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v2, 0x3dc5
|
||||
; VI-NEXT: v_mul_f16_e32 v3, v1, v2
|
||||
; VI-NEXT: v_mul_f16_e32 v4, v0, v2
|
||||
; VI-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NEXT: v_exp_f16_e32 v3, v3
|
||||
; VI-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; VI-NEXT: v_exp_f16_e32 v4, v4
|
||||
; VI-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_e32 v1, v3, v1
|
||||
; VI-NEXT: v_or_b32_e32 v0, v4, v0
|
||||
; VI-NEXT: s_movk_i32 [[SREG:s[0-9]+]], 0x3dc5
|
||||
; VI-NEXT: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
|
||||
; VI-NEXT: v_mul_f16_e32 [[MUL1:v[0-9]+]], [[SREG]], v1
|
||||
; VI-NEXT: v_mul_f16_e32 [[MUL2:v[0-9]+]], [[SREG]], v0
|
||||
; VI-NEXT: v_mul_f16_sdwa [[MUL3:v[0-9]+]], v1, [[VREG]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NEXT: v_mul_f16_sdwa [[MUL4:v[0-9]+]], v0, [[VREG]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NEXT: v_exp_f16_e32 [[EXP1:v[0-9]+]], [[MUL1]]
|
||||
; VI-NEXT: v_exp_f16_sdwa [[EXP2:v[0-9]+]], v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; VI-NEXT: v_exp_f16_e32 [[EXP3:v[0-9]+]], [[MUL2]]
|
||||
; VI-NEXT: v_exp_f16_sdwa [[EXP4:v[0-9]+]], v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_e32 v1, [[EXP1]], [[EXP2]]
|
||||
; VI-NEXT: v_or_b32_e32 v0, [[EXP3]], [[EXP4]]
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_exp_v4f16:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0x3dc5
|
||||
; GFX9-NEXT: v_mul_f16_e32 v3, v1, v2
|
||||
; GFX9-NEXT: v_mul_f16_e32 v4, v0, v2
|
||||
; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GFX9-NEXT: v_exp_f16_e32 v3, v3
|
||||
; GFX9-NEXT: v_exp_f16_e32 v4, v4
|
||||
; GFX9-NEXT: v_exp_f16_e32 v0, v0
|
||||
; GFX9-NEXT: v_exp_f16_e32 v1, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX9-NEXT: v_and_b32_e32 v4, v2, v4
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, v2, v3
|
||||
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v4
|
||||
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
|
||||
; GFX9-NEXT: s_movk_i32 [[SREG:s[0-9]+]], 0x3dc5
|
||||
; GFX9-NEXT: v_mul_f16_e32 [[MUL1:v[0-9]+]], [[SREG]], v1
|
||||
; GFX9-NEXT: v_mul_f16_e32 [[MUL2:v[0-9]+]], [[SREG]], v0
|
||||
; GFX9-NEXT: v_mul_f16_sdwa [[MUL3:v[0-9]+]], v1, [[SREG]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GFX9-NEXT: v_mul_f16_sdwa [[MUL4:v[0-9]+]], v0, [[SREG]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GFX9-NEXT: v_exp_f16_e32 [[EXP1:v[0-9]+]], [[MUL1]]
|
||||
; GFX9-NEXT: v_exp_f16_e32 [[EXP2:v[0-9]+]], [[MUL2]]
|
||||
; GFX9-NEXT: v_exp_f16_e32 [[EXP3:v[0-9]+]], [[MUL4]]
|
||||
; GFX9-NEXT: v_exp_f16_e32 [[EXP4:v[0-9]+]], [[MUL3]]
|
||||
; GFX9-NEXT: v_mov_b32_e32 [[VCONST:v[0-9]+]], 0xffff
|
||||
; GFX9-NEXT: v_and_b32_e32 [[AND1:v[0-9]+]], [[VCONST]], [[EXP2]]
|
||||
; GFX9-NEXT: v_and_b32_e32 [[AND2:v[0-9]+]], [[VCONST]], [[EXP1]]
|
||||
; GFX9-NEXT: v_lshl_or_b32 v0, [[EXP3]], 16, [[AND1]]
|
||||
; GFX9-NEXT: v_lshl_or_b32 v1, [[EXP4]], 16, [[AND2]]
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
%result = call <4 x half> @llvm.exp.v4f16(<4 x half> %arg0)
|
||||
ret <4 x half> %result
|
||||
|
|
|
@ -498,7 +498,7 @@ define amdgpu_kernel void @v_fneg_neg0_minnum_f32(float addrspace(1)* %out, floa
|
|||
; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f32:
|
||||
; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
|
||||
|
||||
; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
|
||||
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xbe22f983
|
||||
; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[K]]
|
||||
|
||||
; VI: v_min_f32_e32 [[MAX:v[0-9]+]], 0.15915494, [[A]]
|
||||
|
@ -520,7 +520,7 @@ define amdgpu_kernel void @v_fneg_inv2pi_minnum_f32(float addrspace(1)* %out, fl
|
|||
; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f32:
|
||||
; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
|
||||
|
||||
; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983
|
||||
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x3e22f983
|
||||
; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[K]]
|
||||
|
||||
; VI: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0.15915494
|
||||
|
@ -660,7 +660,7 @@ define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32(float addrspace(1)*
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_foldable_use_f32:
|
||||
; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
|
||||
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xbe22f983
|
||||
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
|
||||
|
||||
|
|
|
@ -326,15 +326,17 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace
|
|||
|
||||
; GCN-LABEL: {{^}}commute_add_literal_v2f16:
|
||||
; GFX9-DAG: buffer_load_dword [[VAL:v[0-9]+]]
|
||||
; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
|
||||
; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x6400{{$}}
|
||||
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], [[K]] op_sel_hi:[1,0]{{$}}
|
||||
; GFX9: buffer_store_dword [[REG]]
|
||||
|
||||
; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
|
||||
; VI-DAG: s_movk_i32 [[K:s[0-9]+]], 0x6400{{$}}
|
||||
; VI-DAG: buffer_load_dword
|
||||
; VI-NOT: and
|
||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}
|
||||
; gfx8 does not support an SGPR or immediate operand in SDWA, so the constant is first moved to a VGPR
|
||||
; VI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
|
||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; VI: buffer_store_dword
|
||||
define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
|
||||
|
|
|
@ -446,7 +446,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac
|
|||
|
||||
; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
|
||||
; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
|
||||
; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
|
||||
|
||||
; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
|
||||
; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
|
||||
|
@ -474,7 +474,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspac
|
|||
|
||||
; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
|
||||
; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
|
||||
; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x1234
|
||||
|
||||
; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
|
||||
; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
|
||||
|
|
|
@ -127,9 +127,9 @@ define amdgpu_kernel void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, floa
|
|||
|
||||
; SI-LABEL: {{^}}v_test_class_full_mask_f32:
|
||||
; SI-DAG: buffer_load_dword [[VA:v[0-9]+]]
|
||||
; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
|
||||
; SI: v_cmp_class_f32_e32 vcc, [[VA]], [[MASK]]
|
||||
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
|
||||
; SI-DAG: s_movk_i32 [[MASK:s[0-9]+]], 0x1ff{{$}}
|
||||
; SI: v_cmp_class_f32_e64 s[{{[0-9]}}:{{[0-9]}}], [[VA]], [[MASK]]
|
||||
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, s[{{[0-9]}}:{{[0-9]}}]
|
||||
; SI: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
|
||||
|
@ -165,7 +165,7 @@ define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f32(i32 a
|
|||
; FIXME: Why isn't this using a literal constant operand?
|
||||
; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f32:
|
||||
; SI-DAG: buffer_load_dword [[VB:v[0-9]+]]
|
||||
; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
|
||||
; SI-DAG: s_mov_b32 [[VK:s[0-9]+]], 0x44800000
|
||||
; SI: v_cmp_class_f32_e32 vcc, [[VK]], [[VB]]
|
||||
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
|
||||
; SI: buffer_store_dword [[RESULT]]
|
||||
|
@ -284,10 +284,10 @@ define amdgpu_kernel void @test_class_full_mask_f64(i32 addrspace(1)* %out, [8 x
|
|||
|
||||
; SI-LABEL: {{^}}v_test_class_full_mask_f64:
|
||||
; SI-DAG: buffer_load_dwordx2 [[VA:v\[[0-9]+:[0-9]+\]]]
|
||||
; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
|
||||
; SI: v_cmp_class_f64_e32 vcc, [[VA]], [[MASK]]
|
||||
; SI-DAG: s_movk_i32 [[MASK:s[0-9]+]], 0x1ff{{$}}
|
||||
; SI: v_cmp_class_f64_e64 s[{{[0-9]}}:{{[0-9]}}], [[VA]], [[MASK]]
|
||||
; SI-NOT: vcc
|
||||
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
|
||||
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, s[{{[0-9]}}:{{[0-9]}}]
|
||||
; SI: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 {
|
||||
|
@ -377,8 +377,8 @@ define amdgpu_kernel void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, flo
|
|||
|
||||
; SI-LABEL: {{^}}test_fold_or_all_tests_class_f32_0:
|
||||
; SI-NOT: v_cmp_class
|
||||
; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}}
|
||||
; SI: v_cmp_class_f32_e32 vcc, v{{[0-9]+}}, [[MASK]]{{$}}
|
||||
; SI: s_movk_i32 [[MASK:s[0-9]+]], 0x3ff{{$}}
|
||||
; SI: v_cmp_class_f32_e64 s[0:1], v{{[0-9]+}}, [[MASK]]{{$}}
|
||||
; SI-NOT: v_cmp_class
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
|
||||
|
|
|
@ -26,8 +26,8 @@ entry:
|
|||
; GCN-LABEL: {{^}}div_fixup_f16_imm_a
|
||||
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
|
||||
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
|
||||
; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}}
|
||||
; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
|
||||
; VI: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}}
|
||||
; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], s[[A_F16]], v[[B_F16]], v[[C_F16]]
|
||||
; GCN: buffer_store_short v[[R_F16]]
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @div_fixup_f16_imm_a(
|
||||
|
@ -45,8 +45,8 @@ entry:
|
|||
; GCN-LABEL: {{^}}div_fixup_f16_imm_b
|
||||
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
|
||||
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
|
||||
; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}
|
||||
; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
|
||||
; VI: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
|
||||
; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], s[[B_F16]], v[[C_F16]]
|
||||
; GCN: buffer_store_short v[[R_F16]]
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @div_fixup_f16_imm_b(
|
||||
|
@ -64,8 +64,8 @@ entry:
|
|||
; GCN-LABEL: {{^}}div_fixup_f16_imm_c
|
||||
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
|
||||
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
|
||||
; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}
|
||||
; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
|
||||
; VI: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}
|
||||
; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], s[[C_F16]]
|
||||
; GCN: buffer_store_short v[[R_F16]]
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @div_fixup_f16_imm_c(
|
||||
|
@ -81,9 +81,9 @@ entry:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}div_fixup_f16_imm_a_imm_b
|
||||
; VI-DAG: v_mov_b32_e32 v[[AB_F16:[0-9]+]], 0x4200{{$}}
|
||||
; VI-DAG: s_movk_i32 [[AB_F16:s[0-9]+]], 0x4200{{$}}
|
||||
; GCN-DAG: buffer_load_ushort v[[C_F16:[0-9]+]]
|
||||
; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AB_F16]], v[[AB_F16]], v[[C_F16]]
|
||||
; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], [[AB_F16]], [[AB_F16]], v[[C_F16]]
|
||||
; GCN: buffer_store_short v[[R_F16]]
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @div_fixup_f16_imm_a_imm_b(
|
||||
|
@ -97,9 +97,9 @@ entry:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}div_fixup_f16_imm_b_imm_c
|
||||
; VI-DAG: v_mov_b32_e32 v[[BC_F16:[0-9]+]], 0x4200{{$}}
|
||||
; VI-DAG: s_movk_i32 [[BC_F16:s[0-9]+]], 0x4200{{$}}
|
||||
; GCN-DAG: buffer_load_ushort v[[A_F16:[0-9]+]]
|
||||
; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[BC_F16]], v[[BC_F16]]
|
||||
; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], [[BC_F16]], [[BC_F16]]
|
||||
; GCN: buffer_store_short v[[R_F16]]
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @div_fixup_f16_imm_b_imm_c(
|
||||
|
@ -113,9 +113,9 @@ entry:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}div_fixup_f16_imm_a_imm_c
|
||||
; VI-DAG: v_mov_b32_e32 v[[AC_F16:[0-9]+]], 0x4200{{$}}
|
||||
; VI-DAG: s_movk_i32 [[AC_F16:s[0-9]+]], 0x4200{{$}}
|
||||
; GCN-DAG: buffer_load_ushort v[[B_F16:[0-9]+]]
|
||||
; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AC_F16]], v[[B_F16]], v[[AC_F16]]
|
||||
; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], [[AC_F16]], v[[B_F16]], [[AC_F16]]
|
||||
; GCN: buffer_store_short v[[R_F16]]
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @div_fixup_f16_imm_a_imm_c(
|
||||
|
|
|
@ -366,8 +366,8 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out,
|
|||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_div_scale_f32_val_undef_val:
|
||||
; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x41000000
|
||||
; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[K]], [[K]]
|
||||
; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000
|
||||
; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[K]], v{{[0-9]+}}, [[K]]
|
||||
define amdgpu_kernel void @test_div_scale_f32_val_undef_val(float addrspace(1)* %out) #0 {
|
||||
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 8.0, float undef, i1 false)
|
||||
%result0 = extractvalue { float, i1 } %result, 0
|
||||
|
@ -376,8 +376,8 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(float addrspace(1)*
|
|||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_div_scale_f32_undef_val_val:
|
||||
; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x41000000
|
||||
; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[K]], [[K]]
|
||||
; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000
|
||||
; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[K]], v{{[0-9]+}}
|
||||
define amdgpu_kernel void @test_div_scale_f32_undef_val_val(float addrspace(1)* %out) #0 {
|
||||
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false)
|
||||
%result0 = extractvalue { float, i1 } %result, 0
|
||||
|
|
|
@ -274,9 +274,9 @@ define amdgpu_kernel void @v_fcmp_f16(i64 addrspace(1)* %out, half %src) {
|
|||
; GCN-LABEL: {{^}}v_fcmp_f16_oeq:
|
||||
; VI: v_cmp_eq_f16_e64
|
||||
|
||||
; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000
|
||||
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000
|
||||
; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}}
|
||||
; SI: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]]
|
||||
; SI: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]]
|
||||
define amdgpu_kernel void @v_fcmp_f16_oeq(i64 addrspace(1)* %out, half %src) {
|
||||
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 1)
|
||||
store i64 %result, i64 addrspace(1)* %out
|
||||
|
@ -286,9 +286,9 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(i64 addrspace(1)* %out, half %src) {
|
|||
; GCN-LABEL: {{^}}v_fcmp_f16_one:
|
||||
; VI: v_cmp_neq_f16_e64
|
||||
|
||||
; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000
|
||||
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000
|
||||
; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}}
|
||||
; SI: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]]
|
||||
; SI: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]]
|
||||
define amdgpu_kernel void @v_fcmp_f16_one(i64 addrspace(1)* %out, half %src) {
|
||||
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 6)
|
||||
store i64 %result, i64 addrspace(1)* %out
|
||||
|
@ -298,9 +298,9 @@ define amdgpu_kernel void @v_fcmp_f16_one(i64 addrspace(1)* %out, half %src) {
|
|||
; GCN-LABEL: {{^}}v_fcmp_f16_ogt:
|
||||
; VI: v_cmp_gt_f16_e64
|
||||
|
||||
; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000
|
||||
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000
|
||||
; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}}
|
||||
; SI: v_cmp_gt_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]]
|
||||
; SI: v_cmp_lt_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]]
|
||||
define amdgpu_kernel void @v_fcmp_f16_ogt(i64 addrspace(1)* %out, half %src) {
|
||||
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 2)
|
||||
store i64 %result, i64 addrspace(1)* %out
|
||||
|
@ -310,9 +310,9 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(i64 addrspace(1)* %out, half %src) {
|
|||
; GCN-LABEL: {{^}}v_fcmp_f16_oge:
|
||||
; VI: v_cmp_ge_f16_e64
|
||||
|
||||
; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000
|
||||
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000
|
||||
; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}}
|
||||
; SI: v_cmp_ge_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]]
|
||||
; SI: v_cmp_le_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]]
|
||||
define amdgpu_kernel void @v_fcmp_f16_oge(i64 addrspace(1)* %out, half %src) {
|
||||
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 3)
|
||||
store i64 %result, i64 addrspace(1)* %out
|
||||
|
@ -322,9 +322,9 @@ define amdgpu_kernel void @v_fcmp_f16_oge(i64 addrspace(1)* %out, half %src) {
|
|||
; GCN-LABEL: {{^}}v_fcmp_f16_olt:
|
||||
; VI: v_cmp_lt_f16_e64
|
||||
|
||||
; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000
|
||||
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000
|
||||
; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}}
|
||||
; SI: v_cmp_lt_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]]
|
||||
; SI: v_cmp_gt_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]]
|
||||
define amdgpu_kernel void @v_fcmp_f16_olt(i64 addrspace(1)* %out, half %src) {
|
||||
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 4)
|
||||
store i64 %result, i64 addrspace(1)* %out
|
||||
|
@ -334,9 +334,9 @@ define amdgpu_kernel void @v_fcmp_f16_olt(i64 addrspace(1)* %out, half %src) {
|
|||
; GCN-LABEL: {{^}}v_fcmp_f16_ole:
|
||||
; VI: v_cmp_le_f16_e64
|
||||
|
||||
; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000
|
||||
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000
|
||||
; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}}
|
||||
; SI: v_cmp_le_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]]
|
||||
; SI: v_cmp_ge_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]]
|
||||
define amdgpu_kernel void @v_fcmp_f16_ole(i64 addrspace(1)* %out, half %src) {
|
||||
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 5)
|
||||
store i64 %result, i64 addrspace(1)* %out
|
||||
|
@ -346,9 +346,9 @@ define amdgpu_kernel void @v_fcmp_f16_ole(i64 addrspace(1)* %out, half %src) {
|
|||
; GCN-LABEL: {{^}}v_fcmp_f16_ueq:
|
||||
; VI: v_cmp_nlg_f16_e64
|
||||
|
||||
; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000
|
||||
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000
|
||||
; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}}
|
||||
; SI: v_cmp_nlg_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]]
|
||||
; SI: v_cmp_nlg_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]]
|
||||
define amdgpu_kernel void @v_fcmp_f16_ueq(i64 addrspace(1)* %out, half %src) {
|
||||
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 9)
|
||||
store i64 %result, i64 addrspace(1)* %out
|
||||
|
@ -358,9 +358,9 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(i64 addrspace(1)* %out, half %src) {
|
|||
; GCN-LABEL: {{^}}v_fcmp_f16_une:
|
||||
; VI: v_cmp_neq_f16_e64
|
||||
|
||||
; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000
|
||||
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000
|
||||
; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}}
|
||||
; SI: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]]
|
||||
; SI: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]]
|
||||
define amdgpu_kernel void @v_fcmp_f16_une(i64 addrspace(1)* %out, half %src) {
|
||||
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 14)
|
||||
store i64 %result, i64 addrspace(1)* %out
|
||||
|
@ -370,9 +370,9 @@ define amdgpu_kernel void @v_fcmp_f16_une(i64 addrspace(1)* %out, half %src) {
|
|||
; GCN-LABEL: {{^}}v_fcmp_f16_ugt:
|
||||
; VI: v_cmp_nle_f16_e64
|
||||
|
||||
; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000
|
||||
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000
|
||||
; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}}
|
||||
; SI: v_cmp_nle_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]]
|
||||
; SI: v_cmp_nge_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]]
|
||||
define amdgpu_kernel void @v_fcmp_f16_ugt(i64 addrspace(1)* %out, half %src) {
|
||||
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 10)
|
||||
store i64 %result, i64 addrspace(1)* %out
|
||||
|
@ -382,9 +382,9 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(i64 addrspace(1)* %out, half %src) {
|
|||
; GCN-LABEL: {{^}}v_fcmp_f16_uge:
|
||||
; VI: v_cmp_nlt_f16_e64
|
||||
|
||||
; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000
|
||||
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000
|
||||
; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}}
|
||||
; SI: v_cmp_nlt_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]]
|
||||
; SI: v_cmp_ngt_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]]
|
||||
define amdgpu_kernel void @v_fcmp_f16_uge(i64 addrspace(1)* %out, half %src) {
|
||||
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 11)
|
||||
store i64 %result, i64 addrspace(1)* %out
|
||||
|
@ -394,9 +394,9 @@ define amdgpu_kernel void @v_fcmp_f16_uge(i64 addrspace(1)* %out, half %src) {
|
|||
; GCN-LABEL: {{^}}v_fcmp_f16_ult:
|
||||
; VI: v_cmp_nge_f16_e64
|
||||
|
||||
; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000
|
||||
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000
|
||||
; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}}
|
||||
; SI: v_cmp_nge_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]]
|
||||
; SI: v_cmp_nle_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]]
|
||||
define amdgpu_kernel void @v_fcmp_f16_ult(i64 addrspace(1)* %out, half %src) {
|
||||
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 12)
|
||||
store i64 %result, i64 addrspace(1)* %out
|
||||
|
@ -406,9 +406,9 @@ define amdgpu_kernel void @v_fcmp_f16_ult(i64 addrspace(1)* %out, half %src) {
|
|||
; GCN-LABEL: {{^}}v_fcmp_f16_ule:
|
||||
; VI: v_cmp_ngt_f16_e64
|
||||
|
||||
; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000
|
||||
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x42c80000
|
||||
; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}}
|
||||
; SI: v_cmp_ngt_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]]
|
||||
; SI: v_cmp_nlt_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[CVT]]
|
||||
define amdgpu_kernel void @v_fcmp_f16_ule(i64 addrspace(1)* %out, half %src) {
|
||||
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 13)
|
||||
store i64 %result, i64 addrspace(1)* %out
|
||||
|
|
|
@ -34,7 +34,7 @@ define amdgpu_kernel void @mad_f16_imm_a(
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}mad_f16_imm_b:
|
||||
; GCN: v_mov_b32_e32 [[KB:v[0-9]+]], 0x4800
|
||||
; GCN: s_movk_i32 [[KB:s[0-9]+]], 0x4800
|
||||
; GFX8: v_mad_f16 {{v[0-9]+}}, {{v[0-9]+}}, [[KB]],
|
||||
; GFX9: v_mad_legacy_f16 {{v[0-9]+}}, {{v[0-9]+}}, [[KB]],
|
||||
define amdgpu_kernel void @mad_f16_imm_b(
|
||||
|
|
|
@ -21,8 +21,7 @@ define amdgpu_kernel void @mad_f32(
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}mad_f32_imm_a:
|
||||
; GCN: v_mov_b32_e32 [[KA:v[0-9]+]], 0x41000000
|
||||
; GCN: v_ma{{[dc]}}_f32 {{v[0-9]+}}, [[KA]],
|
||||
; GCN: v_madmk_f32 {{v[0-9]+}}, {{v[0-9]+}}, 0x41000000,
|
||||
define amdgpu_kernel void @mad_f32_imm_a(
|
||||
float addrspace(1)* %r,
|
||||
float addrspace(1)* %b,
|
||||
|
|
|
@ -53,8 +53,8 @@ define amdgpu_kernel void @test_fabs_fmed3(float addrspace(1)* %out, float %src0
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_fneg_fmed3_rr_0:
|
||||
; GCN: v_bfrev_b32_e32 [[NEG0:v[0-9]+]], 1
|
||||
; GCN: v_med3_f32 v{{[0-9]+}}, -s{{[0-9]+}}, -v{{[0-9]+}}, [[NEG0]]
|
||||
; GCN: s_brev_b32 [[NEG0:s[0-9]+]], 1
|
||||
; GCN: v_med3_f32 v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}, [[NEG0]]
|
||||
define amdgpu_kernel void @test_fneg_fmed3_rr_0(float addrspace(1)* %out, float %src0, float %src1) #1 {
|
||||
%med3 = call float @llvm.amdgcn.fmed3.f32(float %src0, float %src1, float 0.0)
|
||||
%neg.med3 = fsub float -0.0, %med3
|
||||
|
@ -88,8 +88,8 @@ define amdgpu_kernel void @test_fneg_fmed3_r_inv2pi_0(float addrspace(1)* %out,
|
|||
|
||||
; GCN-LABEL: {{^}}test_fneg_fmed3_r_inv2pi_0_foldable_user:
|
||||
; GCN-DAG: v_bfrev_b32_e32 [[NEG0:v[0-9]+]], 1
|
||||
; GCN-DAG: v_mov_b32_e32 [[NEG_INV:v[0-9]+]], 0xbe22f983
|
||||
; GCN: v_med3_f32 [[MED3:v[0-9]+]], -s{{[0-9]+}}, [[NEG_INV]], [[NEG0]]
|
||||
; GCN-DAG: s_mov_b32 [[NEG_INV:s[0-9]+]], 0xbe22f983
|
||||
; GCN: v_med3_f32 [[MED3:v[0-9]+]], -v{{[0-9]+}}, [[NEG_INV]], [[NEG0]]
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[MED3]]
|
||||
define amdgpu_kernel void @test_fneg_fmed3_r_inv2pi_0_foldable_user(float addrspace(1)* %out, float %src0, float %mul.arg) #1 {
|
||||
%med3 = call float @llvm.amdgcn.fmed3.f32(float %src0, float 0x3FC45F3060000000, float 0.0)
|
||||
|
|
|
@ -212,7 +212,7 @@ define amdgpu_gs void @neg_olt(float %a) {
|
|||
|
||||
; SI-LABEL: {{^}}fcmp_x2:
|
||||
; FIXME: LLVM should be able to combine these fcmp opcodes.
|
||||
; SI: v_cmp_gt_f32
|
||||
; SI: v_cmp_lt_f32_e32 vcc, s{{[0-9]+}}, v0
|
||||
; SI: v_cndmask_b32
|
||||
; SI: v_cmpx_le_f32
|
||||
define amdgpu_ps void @fcmp_x2(float %a) #0 {
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
declare i32 @llvm.amdgcn.lerp(i32, i32, i32) #0
|
||||
|
||||
; GCN-LABEL: {{^}}v_lerp:
|
||||
; GCN: v_lerp_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN: v_lerp_u8 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
|
||||
define amdgpu_kernel void @v_lerp(i32 addrspace(1)* %out, i32 %src) nounwind {
|
||||
%result= call i32 @llvm.amdgcn.lerp(i32 %src, i32 100, i32 100) #0
|
||||
store i32 %result, i32 addrspace(1)* %out, align 4
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
declare i32 @llvm.amdgcn.msad.u8(i32, i32, i32) #0
|
||||
|
||||
; GCN-LABEL: {{^}}v_msad_u8:
|
||||
; GCN: v_msad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN: v_msad_u8 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
|
||||
define amdgpu_kernel void @v_msad_u8(i32 addrspace(1)* %out, i32 %src) {
|
||||
%result= call i32 @llvm.amdgcn.msad.u8(i32 %src, i32 100, i32 100) #0
|
||||
store i32 %result, i32 addrspace(1)* %out, align 4
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
declare i32 @llvm.amdgcn.sad.hi.u8(i32, i32, i32) #0
|
||||
|
||||
; GCN-LABEL: {{^}}v_sad_hi_u8:
|
||||
; GCN: v_sad_hi_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN: v_sad_hi_u8 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
|
||||
define amdgpu_kernel void @v_sad_hi_u8(i32 addrspace(1)* %out, i32 %src) {
|
||||
%result= call i32 @llvm.amdgcn.sad.hi.u8(i32 %src, i32 100, i32 100) #0
|
||||
store i32 %result, i32 addrspace(1)* %out, align 4
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
declare i32 @llvm.amdgcn.sad.u16(i32, i32, i32) #0
|
||||
|
||||
; GCN-LABEL: {{^}}v_sad_u16:
|
||||
; GCN: v_sad_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN: v_sad_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
|
||||
define amdgpu_kernel void @v_sad_u16(i32 addrspace(1)* %out, i32 %src) {
|
||||
%result= call i32 @llvm.amdgcn.sad.u16(i32 %src, i32 100, i32 100) #0
|
||||
store i32 %result, i32 addrspace(1)* %out, align 4
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
declare i32 @llvm.amdgcn.sad.u8(i32, i32, i32) #0
|
||||
|
||||
; GCN-LABEL: {{^}}v_sad_u8:
|
||||
; GCN: v_sad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN: v_sad_u8 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
|
||||
define amdgpu_kernel void @v_sad_u8(i32 addrspace(1)* %out, i32 %src) {
|
||||
%result= call i32 @llvm.amdgcn.sad.u8(i32 %src, i32 100, i32 100) #0
|
||||
store i32 %result, i32 addrspace(1)* %out, align 4
|
||||
|
|
|
@ -25,14 +25,14 @@ entry:
|
|||
|
||||
; GCN-LABEL: {{^}}cos_v2f16
|
||||
; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||
; SI-DAG: v_mov_b32_e32 v[[HALF_PI:[0-9]+]], 0x3e22f983{{$}}
|
||||
; SI-DAG: s_mov_b32 [[HALF_PI:s[0-9]+]], 0x3e22f983{{$}}
|
||||
|
||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||
; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PI]]
|
||||
; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], [[HALF_PI]], v[[A_F32_0]]
|
||||
; SI: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
|
||||
; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PI]]
|
||||
; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], [[HALF_PI]], v[[A_F32_1]]
|
||||
; SI: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
|
||||
|
||||
; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||
|
|
|
@ -33,13 +33,13 @@ define amdgpu_kernel void @fma_f16(
|
|||
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
|
||||
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
|
||||
|
||||
; SI: v_mov_b32_e32 v[[A_F32:[0-9]+]], 0x40400000{{$}}
|
||||
; SI: s_mov_b32 s[[A_F32:[0-9]+]], 0x40400000{{$}}
|
||||
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
|
||||
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
|
||||
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
|
||||
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[B_F32:[0-9]]], s[[A_F32:[0-9]]], v[[C_F32:[0-9]]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
|
||||
; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}}
|
||||
; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
|
||||
; VI: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}}
|
||||
; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], s[[A_F16]], v[[C_F16]]
|
||||
; GCN: buffer_store_short v[[R_F16]]
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @fma_f16_imm_a(
|
||||
|
@ -56,13 +56,13 @@ define amdgpu_kernel void @fma_f16_imm_a(
|
|||
; GCN-LABEL: {{^}}fma_f16_imm_b
|
||||
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
|
||||
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
|
||||
; SI: v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}}
|
||||
; SI: s_mov_b32 s[[B_F32:[0-9]+]], 0x40400000{{$}}
|
||||
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
|
||||
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
|
||||
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
|
||||
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], s[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
|
||||
; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}
|
||||
; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
|
||||
; VI: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
|
||||
; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], s[[B_F16]], v[[C_F16]]
|
||||
; GCN: buffer_store_short v[[R_F16]]
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @fma_f16_imm_b(
|
||||
|
@ -79,13 +79,13 @@ define amdgpu_kernel void @fma_f16_imm_b(
|
|||
; GCN-LABEL: {{^}}fma_f16_imm_c
|
||||
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
|
||||
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
|
||||
; SI: v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}}
|
||||
; SI: s_mov_b32 s[[C_F32:[0-9]+]], 0x40400000{{$}}
|
||||
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
|
||||
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
|
||||
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
|
||||
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], s[[C_F32:[0-9]]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
|
||||
; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}
|
||||
; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
|
||||
; VI: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}
|
||||
; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], s[[C_F16]]
|
||||
; GCN: buffer_store_short v[[R_F16]]
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @fma_f16_imm_c(
|
||||
|
@ -154,8 +154,8 @@ define amdgpu_kernel void @fma_v2f16(
|
|||
; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||
|
||||
|
||||
; SI: v_mov_b32_e32 v[[A_F32:[0-9]+]], 0x40400000{{$}}
|
||||
; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}}
|
||||
; SI: s_mov_b32 s[[A_F32:[0-9]+]], 0x40400000{{$}}
|
||||
; VI: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}}
|
||||
; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||
; GCN-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
|
||||
|
||||
|
@ -164,13 +164,13 @@ define amdgpu_kernel void @fma_v2f16(
|
|||
; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
|
||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
||||
|
||||
; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]]
|
||||
; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32]], v[[C_F32_0]]
|
||||
; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], s[[A_F32]], v[[C_F32_1]]
|
||||
; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], s[[A_F32]], v[[C_F32_0]]
|
||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
|
||||
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], v[[A_F16]], v[[B_F16_1]]
|
||||
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V2_F16]], v[[A_F16]], v[[B_V2_F16]]
|
||||
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], s[[A_F16]], v[[B_F16_1]]
|
||||
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V2_F16]], s[[A_F16]], v[[B_V2_F16]]
|
||||
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-NOT: and
|
||||
|
@ -195,8 +195,8 @@ define amdgpu_kernel void @fma_v2f16_imm_a(
|
|||
; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||
; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
||||
|
||||
; SI: v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}}
|
||||
; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}
|
||||
; SI: s_mov_b32 s[[B_F32:[0-9]+]], 0x40400000{{$}}
|
||||
; VI: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
|
||||
|
||||
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
|
@ -205,15 +205,15 @@ define amdgpu_kernel void @fma_v2f16_imm_a(
|
|||
|
||||
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
|
||||
; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32]], v[[C_F32_0]]
|
||||
; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], s[[B_F32]], v[[C_F32_0]]
|
||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32]], v[[C_F32_1]]
|
||||
; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], s[[B_F32]], v[[C_F32_1]]
|
||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
|
||||
; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
|
||||
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_F16]], v[[C_V2_F16]]
|
||||
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16]], v[[C_F16_1]]
|
||||
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], s[[B_F16]], v[[C_V2_F16]]
|
||||
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], s[[B_F16]], v[[C_F16_1]]
|
||||
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-NOT: and
|
||||
|
@ -238,8 +238,8 @@ define amdgpu_kernel void @fma_v2f16_imm_b(
|
|||
; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||
; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||
|
||||
; SI: v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}}
|
||||
; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}
|
||||
; SI: s_mov_b32 s[[C_F32:[0-9]+]], 0x40400000{{$}}
|
||||
; VI: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}
|
||||
|
||||
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
|
@ -250,8 +250,8 @@ define amdgpu_kernel void @fma_v2f16_imm_b(
|
|||
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
||||
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||
|
||||
; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]]
|
||||
; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32]]
|
||||
; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], s[[C_F32]]
|
||||
; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], s[[C_F32]]
|
||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
|
@ -260,8 +260,8 @@ define amdgpu_kernel void @fma_v2f16_imm_b(
|
|||
|
||||
; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]]
|
||||
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]]
|
||||
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], s[[C_F16]]
|
||||
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], s[[C_F16]]
|
||||
; GCN-NOT: and
|
||||
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
|
||||
|
||||
|
|
|
@ -49,7 +49,7 @@ define amdgpu_kernel void @fmuladd_f16(
|
|||
; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], 0x4200, v[[B_F16]]
|
||||
; VI-FLUSH: buffer_store_short v[[C_F16]]
|
||||
|
||||
; VI-DENORM: v_mov_b32_e32 [[KA:v[0-9]+]], 0x4200
|
||||
; VI-DENORM: s_movk_i32 [[KA:s[0-9]+]], 0x4200
|
||||
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[B_F16]], [[KA]], v[[C_F16]]
|
||||
; VI-DENORM: buffer_store_short [[RESULT]]
|
||||
|
||||
|
@ -77,7 +77,7 @@ define amdgpu_kernel void @fmuladd_f16_imm_a(
|
|||
; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], 0x4200, v[[A_F16]]
|
||||
; VI-FLUSH: buffer_store_short v[[C_F16]]
|
||||
|
||||
; VI-DENORM: v_mov_b32_e32 [[KA:v[0-9]+]], 0x4200
|
||||
; VI-DENORM: s_movk_i32 [[KA:s[0-9]+]], 0x4200
|
||||
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], [[KA]], v[[C_F16]]
|
||||
; VI-DENORM: buffer_store_short [[RESULT]]
|
||||
|
||||
|
|
|
@ -32,23 +32,24 @@ entry:
|
|||
; SI: buffer_load_dword v[[A_F16_0:[0-9]+]]
|
||||
; VI: flat_load_dword v[[A_F16_0:[0-9]+]]
|
||||
; GFX9: global_load_dword v[[A_F16_0:[0-9]+]]
|
||||
; SI: v_mov_b32_e32 v[[A_F32_2:[0-9]+]], 0x3f317218
|
||||
; VIGFX9: v_mov_b32_e32 v[[A_F32_2:[0-9]+]], 0x398c
|
||||
; SI: s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3f317218
|
||||
; VIGFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x398c
|
||||
; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], [[A_F32_2]]
|
||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]]
|
||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]]
|
||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
|
||||
; SI: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
|
||||
; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
|
||||
; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], v[[R_F32_1]], v[[A_F32_2]]
|
||||
; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], [[A_F32_2]], v[[R_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_6]]
|
||||
; SI: v_mul_f32_e32 v[[R_F32_5:[0-9]+]], v[[R_F32_0]], v[[A_F32_2]]
|
||||
; SI: v_mul_f32_e32 v[[R_F32_5:[0-9]+]], [[A_F32_2]], v[[R_F32_0]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_5]]
|
||||
; GFX9: v_log_f16_e32 v[[R_F16_2:[0-9]+]], v[[A_F16_0]]
|
||||
; VIGFX9: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]]
|
||||
; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], v[[A_F32_2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], v[[R_F16_2]], v[[A_F32_2]]
|
||||
; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], v[[R_F16_0]], v[[A_F32_2]]
|
||||
; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]]
|
||||
; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]]
|
||||
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]]
|
||||
; SI-NOT: v_and_b32_e32
|
||||
; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
|
||||
|
|
|
@ -31,12 +31,11 @@ entry:
|
|||
; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
|
||||
; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
|
||||
; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
|
||||
; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SI: v_mov_b32_e32 v[[R_F32_LOG_CONST:[0-9]+]], 0x3f317218
|
||||
; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GFX8: v_mov_b32_e32 v[[R_F32_LOG_CONST:[0-9]+]], 0x3f317218
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]]
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]]
|
||||
; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DAG: s_mov_b32 [[R_F32_LOG_CONST:s[0-9]+]], 0x3f317218
|
||||
; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
|
||||
define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
|
||||
entry:
|
||||
%res = call <2 x float> @llvm.log.v2f32(<2 x float> %in)
|
||||
|
@ -67,16 +66,15 @@ entry:
|
|||
; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
|
||||
; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
|
||||
; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
|
||||
; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SI: v_mov_b32_e32 v[[R_F32_LOG_CONST:[0-9]+]], 0x3f317218
|
||||
; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GFX8: v_mov_b32_e32 v[[R_F32_LOG_CONST:[0-9]+]], 0x3f317218
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]]
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]]
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]]
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]]
|
||||
; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DAG: s_mov_b32 [[R_F32_LOG_CONST:s[0-9]+]], 0x3f317218
|
||||
; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
|
||||
define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
|
||||
entry:
|
||||
%res = call <4 x float> @llvm.log.v4f32(<4 x float> %in)
|
||||
|
|
|
@ -32,23 +32,24 @@ entry:
|
|||
; SI: buffer_load_dword v[[A_F16_0:[0-9]+]]
|
||||
; VI: flat_load_dword v[[A_F16_0:[0-9]+]]
|
||||
; GFX9: global_load_dword v[[A_F16_0:[0-9]+]]
|
||||
; SI: v_mov_b32_e32 v[[A_F32_2:[0-9]+]], 0x3e9a209a
|
||||
; VIGFX9: v_mov_b32_e32 v[[A_F32_2:[0-9]+]], 0x34d1
|
||||
; SI: s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3e9a209a
|
||||
; VIGFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x34d1
|
||||
; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], [[A_F32_2]]
|
||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]]
|
||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]]
|
||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
|
||||
; SI: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
|
||||
; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
|
||||
; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], v[[R_F32_1]], v[[A_F32_2]]
|
||||
; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], [[A_F32_2]], v[[R_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_6]]
|
||||
; SI: v_mul_f32_e32 v[[R_F32_5:[0-9]+]], v[[R_F32_0]], v[[A_F32_2]]
|
||||
; SI: v_mul_f32_e32 v[[R_F32_5:[0-9]+]], [[A_F32_2]], v[[R_F32_0]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_5]]
|
||||
; GFX9: v_log_f16_e32 v[[R_F16_2:[0-9]+]], v[[A_F16_0]]
|
||||
; VIGFX9: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]]
|
||||
; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], v[[A_F32_2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], v[[R_F16_2]], v[[A_F32_2]]
|
||||
; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], v[[R_F16_0]], v[[A_F32_2]]
|
||||
; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]]
|
||||
; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]]
|
||||
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]]
|
||||
; SI-NOT: v_and_b32_e32
|
||||
; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
|
||||
|
|
|
@ -31,12 +31,11 @@ entry:
|
|||
; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
|
||||
; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
|
||||
; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
|
||||
; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SI: v_mov_b32_e32 v[[R_F32_LOG_CONST:[0-9]+]], 0x3e9a209a
|
||||
; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GFX8: v_mov_b32_e32 v[[R_F32_LOG_CONST:[0-9]+]], 0x3e9a209a
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]]
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]]
|
||||
; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DAG: s_mov_b32 [[R_F32_LOG_CONST:s[0-9]+]], 0x3e9a209a
|
||||
; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
|
||||
define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
|
||||
entry:
|
||||
%res = call <2 x float> @llvm.log10.v2f32(<2 x float> %in)
|
||||
|
@ -67,16 +66,15 @@ entry:
|
|||
; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
|
||||
; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
|
||||
; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
|
||||
; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SI: v_mov_b32_e32 v[[R_F32_LOG_CONST:[0-9]+]], 0x3e9a209a
|
||||
; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GFX8: v_mov_b32_e32 v[[R_F32_LOG_CONST:[0-9]+]], 0x3e9a209a
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]]
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]]
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]]
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[R_F32_LOG_CONST]]
|
||||
; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DAG: s_mov_b32 [[R_F32_LOG_CONST:s[0-9]+]], 0x3e9a209a
|
||||
; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
|
||||
; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
|
||||
define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
|
||||
entry:
|
||||
%res = call <4 x float> @llvm.log10.v4f32(<4 x float> %in)
|
||||
|
|
|
@ -25,14 +25,14 @@ entry:
|
|||
|
||||
; GCN-LABEL: {{^}}sin_v2f16:
|
||||
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||
; SI: v_mov_b32_e32 v[[HALF_PI:[0-9]+]], 0x3e22f983{{$}}
|
||||
; SI: s_mov_b32 [[HALF_PI:s[0-9]+]], 0x3e22f983{{$}}
|
||||
|
||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||
; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PI]]
|
||||
; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], [[HALF_PI]], v[[A_F32_0]]
|
||||
; SI: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
|
||||
; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PI]]
|
||||
; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], [[HALF_PI]], v[[A_F32_1]]
|
||||
; SI: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
|
||||
; SI: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
|
||||
; SI: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
|
||||
|
|
|
@ -214,9 +214,9 @@ define float @v_mad_mix_f32_f16lo_f16lo_negabsf32(half %src0, half %src1, float
|
|||
|
||||
; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32imm1:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9: v_mov_b32_e32 v2, 1.0
|
||||
; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 1.0
|
||||
; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
|
||||
|
||||
; CIVI: v_mad_f32 v0, v0, v1, 1.0
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
|
@ -229,9 +229,9 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32imm1(half %src0, half %src1) #0 {
|
|||
|
||||
; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32imminv2pi:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9: v_mov_b32_e32 v2, 0.15915494
|
||||
; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 0.15915494
|
||||
; GFX900: v_mad_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX906: v_fma_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
|
||||
; VI: v_mad_f32 v0, v0, v1, 0.15915494
|
||||
define float @v_mad_mix_f32_f16lo_f16lo_f32imminv2pi(half %src0, half %src1) #0 {
|
||||
%src0.ext = fpext half %src0 to float
|
||||
|
@ -246,9 +246,9 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32imminv2pi(half %src0, half %src1) #0
|
|||
; fpext f16 1/2pi = 0x3e230000
|
||||
; f32 1/2pi = 0x3e22f983
|
||||
; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi:
|
||||
; GFX9: v_mov_b32_e32 v2, 0x3e230000
|
||||
; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 0x3e230000
|
||||
; GFX900: v_mad_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX906: v_fma_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
|
||||
|
||||
; CIVI: v_madak_f32 v0, v0, v1, 0x3e230000
|
||||
define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi(half %src0, half %src1) #0 {
|
||||
|
@ -260,9 +260,9 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi(half %src0, half %src1)
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_cvtf16imm63:
|
||||
; GFX9: v_mov_b32_e32 v2, 0x367c0000
|
||||
; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 0x367c0000
|
||||
; GFX900: v_mad_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX906: v_fma_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
|
||||
|
||||
; CIVI: v_madak_f32 v0, v0, v1, 0x367c0000
|
||||
define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 {
|
||||
|
@ -274,13 +274,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 {
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_mad_mix_v2f32_f32imm1:
|
||||
; GFX9: v_mov_b32_e32 v3, 1.0
|
||||
; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 1.0
|
||||
; GFX900: v_mad_mix_f32 v2, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX900: v_mad_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX900: v_mov_b32_e32 v1, v2
|
||||
|
||||
; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX906: v_fma_mix_f32 v2, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX906: v_fma_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX906: v_mov_b32_e32 v1, v2
|
||||
define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) #0 {
|
||||
%src0.ext = fpext <2 x half> %src0 to <2 x float>
|
||||
|
@ -290,14 +290,14 @@ define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1)
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_mad_mix_v2f32_cvtf16imminv2pi:
|
||||
; GFX9: v_mov_b32_e32 v3, 0x3e230000
|
||||
; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 0x3e230000
|
||||
|
||||
; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX900: v_mad_mix_f32 v2, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX900: v_mad_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX900: v_mov_b32_e32 v1, v2
|
||||
|
||||
; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX906: v_fma_mix_f32 v2, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX906: v_fma_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX906: v_mov_b32_e32 v1, v2
|
||||
define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
|
||||
%src0.ext = fpext <2 x half> %src0 to <2 x float>
|
||||
|
@ -308,14 +308,14 @@ define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half>
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_mad_mix_v2f32_f32imminv2pi:
|
||||
; GFX9: v_mov_b32_e32 v3, 0.15915494
|
||||
; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 0.15915494
|
||||
|
||||
; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX900: v_mad_mix_f32 v2, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX900: v_mad_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX900: v_mov_b32_e32 v1, v2
|
||||
|
||||
; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX906: v_fma_mix_f32 v2, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX906: v_fma_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
|
||||
; GFX906: v_mov_b32_e32 v1, v2
|
||||
define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
|
||||
%src0.ext = fpext <2 x half> %src0 to <2 x float>
|
||||
|
|
|
@ -31,9 +31,9 @@ define amdgpu_kernel void @madmk_f32(float addrspace(1)* noalias %out, float add
|
|||
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
|
||||
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
|
||||
; GCN-DAG: v_mac_f32_e32 [[VB]], [[VA]], [[VK]]
|
||||
; GCN-DAG: v_mac_f32_e32 [[VC]], [[VA]], [[VK]]
|
||||
; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x41200000
|
||||
; GCN-DAG: v_mac_f32_e32 [[VB]], [[SK]], [[VA]]
|
||||
; GCN-DAG: v_mac_f32_e32 [[VC]], [[SK]], [[VA]]
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
|
||||
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
||||
|
@ -94,8 +94,10 @@ define amdgpu_kernel void @s_s_madmk_f32(float addrspace(1)* noalias %out, [8 x
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_s_madmk_f32:
|
||||
; GCN-NOT: v_madmk_f32
|
||||
; GCN: v_mad_f32
|
||||
; GCN: s_load_dword [[SREG:s[0-9]+]]
|
||||
; GCN: buffer_load_dword [[VREG1:v[0-9]+]]
|
||||
; GCN: v_mov_b32_e32 [[VREG2:v[0-9]+]], [[SREG]]
|
||||
; GCN: v_mac_f32_e32 [[VREG2]], 0x41200000, [[VREG1]]
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind {
|
||||
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
||||
|
@ -128,8 +130,8 @@ define amdgpu_kernel void @scalar_vector_madmk_f32(float addrspace(1)* noalias %
|
|||
; GCN-LABEL: {{^}}no_madmk_src0_modifier_f32:
|
||||
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
|
||||
; GCN: v_mad_f32 {{v[0-9]+}}, |[[VA]]|, [[VK]], [[VB]]
|
||||
; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x41200000
|
||||
; GCN: v_mad_f32 {{v[0-9]+}}, |[[VA]]|, [[SK]], [[VB]]
|
||||
define amdgpu_kernel void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
|
||||
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
||||
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
|
||||
|
@ -150,7 +152,7 @@ define amdgpu_kernel void @no_madmk_src0_modifier_f32(float addrspace(1)* noalia
|
|||
; GCN-LABEL: {{^}}no_madmk_src2_modifier_f32:
|
||||
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, |{{[sv][0-9]+}}|
|
||||
; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{[sv][0-9]+}}, |{{v[0-9]+}}|
|
||||
define amdgpu_kernel void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
|
||||
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
||||
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
|
||||
|
@ -170,8 +172,8 @@ define amdgpu_kernel void @no_madmk_src2_modifier_f32(float addrspace(1)* noalia
|
|||
|
||||
; GCN-LABEL: {{^}}madmk_add_inline_imm_f32:
|
||||
; GCN: buffer_load_dword [[A:v[0-9]+]]
|
||||
; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
|
||||
; GCN: v_mad_f32 {{v[0-9]+}}, [[A]], [[VK]], 2.0
|
||||
; GCN: s_mov_b32 [[SK:s[0-9]+]], 0x41200000
|
||||
; GCN: v_mad_f32 {{v[0-9]+}}, [[A]], [[SK]], 2.0
|
||||
define amdgpu_kernel void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
|
||||
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
||||
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
|
||||
|
|
|
@ -36,7 +36,7 @@ define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)*
|
|||
|
||||
; FUNC-LABEL: {{^}}slow_sdiv_i32_3435:
|
||||
; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]],
|
||||
; SI-DAG: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b
|
||||
; SI-DAG: s_mov_b32 [[MAGIC:s[0-9]+]], 0x98a1930b
|
||||
; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[VAL]], [[MAGIC]]
|
||||
; SI: v_add_{{[iu]}}32
|
||||
; SI: v_lshrrev_b32
|
||||
|
|
|
@ -197,14 +197,14 @@ entry:
|
|||
; SI: v_cvt_f32_f16_e32
|
||||
; SI: v_cvt_f32_f16_e32
|
||||
|
||||
; SI: v_cmp_gt_f32_e32
|
||||
; SI: v_cmp_lt_f32_e32
|
||||
; SI: v_cndmask_b32_e32
|
||||
; SI: v_cmp_lt_f32_e32 vcc, 0.5
|
||||
; SI: v_cmp_lt_f32_e32 vcc, 0.5
|
||||
; SI: v_cndmask_b32_e32
|
||||
|
||||
; VI: v_cmp_lt_f16_e32
|
||||
; VI: v_cndmask_b32_e32
|
||||
; VI: v_cmp_gt_f16_e32
|
||||
; VI: v_cmp_lt_f16_e32
|
||||
; VI: v_cndmask_b32_e32
|
||||
|
||||
; SI: v_cvt_f16_f32_e32
|
||||
|
@ -233,14 +233,14 @@ entry:
|
|||
; SI: v_cvt_f32_f16_e32
|
||||
; SI: v_cvt_f32_f16_e32
|
||||
|
||||
; SI: v_cmp_lt_f32_e32
|
||||
; SI: v_cmp_gt_f32_e32
|
||||
; SI: v_cndmask_b32_e32
|
||||
; SI: v_cmp_gt_f32_e32 vcc, 0.5
|
||||
; SI: v_cndmask_b32_e32
|
||||
|
||||
; VI: v_cmp_gt_f16_e32
|
||||
; VI: v_cndmask_b32_e32
|
||||
; VI: v_cmp_lt_f16_e32
|
||||
; VI: v_cmp_gt_f16_e32
|
||||
; VI: v_cndmask_b32_e32
|
||||
|
||||
; SI: v_cvt_f16_f32_e32
|
||||
|
|
|
@ -182,7 +182,7 @@ define amdgpu_kernel void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1
|
|||
|
||||
; FUNC-LABEL: {{^}}v_cmp_sext_k_neg1_i8_sext_arg:
|
||||
; GCN: v_cmp_ne_u32_e32 vcc, -1, v0
|
||||
; GCN-NEXT: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 0, 1, vcc
|
||||
; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 0, 1, vcc
|
||||
; GCN: buffer_store_byte [[SELECT]]
|
||||
define void @v_cmp_sext_k_neg1_i8_sext_arg(i8 signext %b) nounwind {
|
||||
%b.ext = sext i8 %b to i32
|
||||
|
|
|
@ -19,7 +19,7 @@ define amdgpu_kernel void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)*
|
|||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}srem_i32_7:
|
||||
; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x92492493
|
||||
; SI: s_mov_b32 [[MAGIC:s[0-9]+]], 0x92492493
|
||||
; SI: v_mul_hi_i32 {{v[0-9]+}}, {{v[0-9]+}}, [[MAGIC]]
|
||||
; SI: v_mul_lo_i32
|
||||
; SI: v_sub_{{[iu]}}32
|
||||
|
|
|
@ -4,22 +4,22 @@
|
|||
target triple="amdgcn--"
|
||||
|
||||
; CHECK-LABEL: foobar:
|
||||
; CHECK: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
||||
; CHECK: s_load_dwordx2 s[4:5], s[0:1], 0x9
|
||||
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64
|
||||
; CHECK-NEXT: s_mov_b32 s2, -1
|
||||
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, s3
|
||||
; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, s5
|
||||
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
|
||||
; CHECK: BB0_1:
|
||||
; CHECK-NEXT: ; kill: def $vgpr0_vgpr1 killed $sgpr2_sgpr3 killed $exec
|
||||
; CHECK-NEXT: ; kill: def $vgpr0_vgpr1 killed $sgpr4_sgpr5 killed $exec
|
||||
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
|
||||
|
||||
; CHECK: BB0_2:
|
||||
; CHECK: s_or_b64 exec, exec, s[2:3]
|
||||
; CHECK: s_or_b64 exec, exec, s[4:5]
|
||||
; CHECK-NEXT: s_mov_b32 s3, 0xf000
|
||||
; CHECK-NEXT: s_mov_b32 s2, -1
|
||||
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0
|
||||
; CHECK-NEXT: s_endpgm
|
||||
define amdgpu_kernel void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind {
|
||||
|
|
|
@ -73,7 +73,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspa
|
|||
|
||||
; FUNC-LABEL: {{^}}udiv_i32_div_k_even:
|
||||
; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
|
||||
; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xfabbd9c1
|
||||
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xfabbd9c1
|
||||
; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[VAL]], [[K]]
|
||||
; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 25, [[MULHI]]
|
||||
; SI: buffer_store_dword [[RESULT]]
|
||||
|
@ -87,7 +87,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrs
|
|||
|
||||
; FUNC-LABEL: {{^}}udiv_i32_div_k_odd:
|
||||
; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
|
||||
; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x7d5deca3
|
||||
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x7d5deca3
|
||||
; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[VAL]], [[K]]
|
||||
; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 24, [[MULHI]]
|
||||
; SI: buffer_store_dword [[RESULT]]
|
||||
|
|
|
@ -19,8 +19,8 @@ define amdgpu_kernel void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1
|
|||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}test_urem_i32_7:
|
||||
; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x24924925
|
||||
; SI: v_mul_hi_u32 [[MAGIC]], {{v[0-9]+}}
|
||||
; SI: s_mov_b32 [[MAGIC:s[0-9]+]], 0x24924925
|
||||
; SI: v_mul_hi_u32 {{v[0-9]+}}, {{v[0-9]+}}, [[MAGIC]]
|
||||
; SI: v_subrev_{{[iu]}}32
|
||||
; SI: v_mul_lo_i32
|
||||
; SI: v_sub_{{[iu]}}32
|
||||
|
|
|
@ -135,8 +135,9 @@ define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_kimm(float addrspa
|
|||
|
||||
; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_k_s:
|
||||
; GCN-DAG: s_load_dword [[SGPR:s[0-9]+]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
|
||||
; GCN: v_fma_f32 [[RESULT0:v[0-9]+]], [[VK]], [[VK]], [[SGPR]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[VGPR:v[0-9]+]], [[SGPR]]
|
||||
; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x44800000
|
||||
; GCN: v_fma_f32 [[RESULT0:v[0-9]+]], [[SK]], [[SK]], [[VGPR]]
|
||||
; GCN: buffer_store_dword [[RESULT0]]
|
||||
define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s(float addrspace(1)* %out, float %a) #0 {
|
||||
%fma = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1
|
||||
|
@ -145,11 +146,12 @@ define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s(float addrspa
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_k_s_x2:
|
||||
; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
||||
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
|
||||
; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VK]], [[VK]], s[[SGPR0]]
|
||||
; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VK]], [[VK]], s[[SGPR1]]
|
||||
; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]{{\:}}[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
|
||||
; GCN-DAG: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s[[SGPR0]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]]
|
||||
; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x44800000
|
||||
; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SK]], [[SK]], [[VGPR0]]
|
||||
; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SK]], [[SK]], [[VGPR1]]
|
||||
; GCN: buffer_store_dword [[RESULT0]]
|
||||
; GCN: buffer_store_dword [[RESULT1]]
|
||||
; GCN: s_endpgm
|
||||
|
@ -163,8 +165,9 @@ define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s_x2(float addr
|
|||
|
||||
; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_s_k:
|
||||
; GCN-DAG: s_load_dword [[SGPR:s[0-9]+]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
|
||||
; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[VGPR:v[0-9]+]], [[SGPR]]
|
||||
; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x44800000
|
||||
; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR]], [[SK]], [[SK]]
|
||||
; GCN: buffer_store_dword [[RESULT]]
|
||||
define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, float %a) #0 {
|
||||
%fma = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1
|
||||
|
@ -173,11 +176,12 @@ define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k(float addrspa
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_s_k_x2:
|
||||
; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
||||
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
|
||||
; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SGPR0]], [[VK]], [[VK]]
|
||||
; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SGPR1]], [[VK]], [[VK]]
|
||||
; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]{{\:}}[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
|
||||
; GCN-DAG: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s[[SGPR0]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]]
|
||||
; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x44800000
|
||||
; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VGPR0]], [[SK]], [[SK]]
|
||||
; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VGPR1]], [[SK]], [[SK]]
|
||||
; GCN: buffer_store_dword [[RESULT0]]
|
||||
; GCN: buffer_store_dword [[RESULT1]]
|
||||
; GCN: s_endpgm
|
||||
|
@ -191,8 +195,9 @@ define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k_x2(float addr
|
|||
|
||||
; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_s_k_k:
|
||||
; GCN-DAG: s_load_dword [[SGPR:s[0-9]+]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
|
||||
; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[VGPR:v[0-9]+]], [[SGPR]]
|
||||
; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x44800000
|
||||
; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR]], [[SK]], [[SK]]
|
||||
; GCN: buffer_store_dword [[RESULT]]
|
||||
define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, float %a) #0 {
|
||||
%fma = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1
|
||||
|
@ -201,11 +206,12 @@ define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k(float addrspa
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_s_k_k_x2:
|
||||
; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
||||
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
|
||||
; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SGPR0]], [[VK]], [[VK]]
|
||||
; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SGPR1]], [[VK]], [[VK]]
|
||||
; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]{{\:}}[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
|
||||
; GCN-DAG: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s[[SGPR0]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]]
|
||||
; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x44800000
|
||||
; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VGPR0]], [[SK]], [[SK]]
|
||||
; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VGPR1]], [[SK]], [[SK]]
|
||||
; GCN: buffer_store_dword [[RESULT0]]
|
||||
; GCN: buffer_store_dword [[RESULT1]]
|
||||
; GCN: s_endpgm
|
||||
|
@ -220,12 +226,13 @@ define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k_x2(float addr
|
|||
; GCN-LABEL: {{^}}test_s0_s1_k_f32:
|
||||
; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
||||
; GCN-DAG: v_mov_b32_e32 [[VK0:v[0-9]+]], 0x44800000
|
||||
; GCN-DAG: s_mov_b32 [[SK0:s[0-9]+]], 0x44800000
|
||||
; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], s[[SGPR1]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[VS0:v[0-9]+]], s[[SGPR0]]
|
||||
|
||||
; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SGPR0]], [[VS1]], [[VK0]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x45800000
|
||||
; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SGPR0]], [[VS1]], [[VK1]]
|
||||
; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VS0]], [[VS1]], [[SK0]]
|
||||
; GCN-DAG: s_mov_b32 [[SK1:s[0-9]+]], 0x45800000
|
||||
; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VS0]], [[VS1]], [[SK1]]
|
||||
|
||||
; GCN: buffer_store_dword [[RESULT0]]
|
||||
; GCN: buffer_store_dword [[RESULT1]]
|
||||
|
|
|
@ -9,6 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
|||
|
||||
; waitcnt should be inserted after exec modification
|
||||
; SI: v_cmp_lt_i32_e32 vcc, 0,
|
||||
; SI: v_mov_b32_e32 {{v[0-9]+}}, 0
|
||||
; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
|
||||
; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
|
||||
; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
|
||||
|
|
|
@ -645,11 +645,11 @@ main_body:
|
|||
; CHECK: image_store
|
||||
; CHECK: s_wqm_b64 exec, exec
|
||||
; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
|
||||
; CHECK-DAG: v_mov_b32_e32 [[SEVEN:v[0-9]+]], 0x40e00000
|
||||
; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000
|
||||
|
||||
; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
|
||||
; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
|
||||
; CHECK: v_cmp_gt_f32_e32 vcc, [[CTR]], [[SEVEN]]
|
||||
; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
|
||||
; CHECK: s_cbranch_vccz [[LOOPHDR]]
|
||||
; CHECK: ; %break
|
||||
|
||||
|
|
Loading…
Reference in New Issue