forked from OSchip/llvm-project
R600: Add 64-bit float load/store support
* Added R600_Reg64 class
* Added T#Index#.XY registers definition
* Added v2i32 register reads from parameter and global space
* Added f32 and i32 elements extraction from v2f32 and v2i32
* Added v2i32 -> v2f32 conversions

Tom Stellard:
- Mark vec2 operations as expand. The addition of a vec2 register class made them all legal.

Patch by: Dmitry Cherkassov
Signed-off-by: Dmitry Cherkassov <dcherkassov@gmail.com>
llvm-svn: 187582
This commit is contained in:
parent
53698938a4
commit
0344cdfe39
|
@ -39,7 +39,7 @@ def CC_SI : CallingConv<[
|
|||
// Calling convention for compute kernels
|
||||
def CC_AMDGPU_Kernel : CallingConv<[
|
||||
CCIfType<[v4i32, v4f32], CCAssignToStack <16, 16>>,
|
||||
CCIfType<[i64, f64], CCAssignToStack < 8, 8>>,
|
||||
CCIfType<[i64, f64, v2f32, v2i32], CCAssignToStack < 8, 8>>,
|
||||
CCIfType<[i32, f32], CCAssignToStack < 4, 4>>,
|
||||
CCIfType<[i16], CCAssignToStack < 2, 4>>,
|
||||
CCIfType<[i8], CCAssignToStack < 1, 4>>
|
||||
|
|
|
@ -260,12 +260,19 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
|
|||
if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
|
||||
break;
|
||||
}
|
||||
|
||||
unsigned RegClassID;
|
||||
switch(N->getValueType(0).getVectorNumElements()) {
|
||||
case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
|
||||
case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
|
||||
default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
|
||||
}
|
||||
// BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
|
||||
// that adds a 128 bits reg copy when going through TwoAddressInstructions
|
||||
// pass. We want to avoid 128 bits copies as much as possible because they
|
||||
// can't be bundled by our scheduler.
|
||||
SDValue RegSeqArgs[9] = {
|
||||
CurDAG->getTargetConstant(AMDGPU::R600_Reg128RegClassID, MVT::i32),
|
||||
CurDAG->getTargetConstant(RegClassID, MVT::i32),
|
||||
SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
|
||||
SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
|
||||
SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32),
|
||||
|
|
|
@ -79,6 +79,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
|
|||
setOperationAction(ISD::LOAD, MVT::f64, Promote);
|
||||
AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);
|
||||
|
||||
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Expand);
|
||||
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Expand);
|
||||
|
||||
setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
|
||||
setOperationAction(ISD::FNEG, MVT::v4f32, Expand);
|
||||
|
||||
|
|
|
@ -378,8 +378,10 @@ public:
|
|||
case AMDGPU::R600_ExportBuf:
|
||||
case AMDGPU::R600_ExportSwz:
|
||||
case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
|
||||
case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
|
||||
case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
|
||||
case AMDGPU::RAT_STORE_DWORD_cm:
|
||||
case AMDGPU::RAT_STORE_DWORD32_cm:
|
||||
case AMDGPU::RAT_STORE_DWORD64_cm:
|
||||
DEBUG(dbgs() << CfCount << ":"; MI->dump(););
|
||||
CfCount++;
|
||||
break;
|
||||
|
|
|
@ -33,17 +33,25 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
|
|||
addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
|
||||
addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
|
||||
addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
|
||||
addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
|
||||
addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
|
||||
|
||||
computeRegisterProperties();
|
||||
|
||||
setOperationAction(ISD::FADD, MVT::v4f32, Expand);
|
||||
setOperationAction(ISD::FADD, MVT::v2f32, Expand);
|
||||
setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
|
||||
setOperationAction(ISD::FMUL, MVT::v2f32, Expand);
|
||||
setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
|
||||
setOperationAction(ISD::FDIV, MVT::v2f32, Expand);
|
||||
setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
|
||||
setOperationAction(ISD::FSUB, MVT::v2f32, Expand);
|
||||
|
||||
setOperationAction(ISD::FCOS, MVT::f32, Custom);
|
||||
setOperationAction(ISD::FSIN, MVT::f32, Custom);
|
||||
|
||||
setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
|
||||
setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
|
||||
|
||||
setOperationAction(ISD::BR_CC, MVT::i32, Expand);
|
||||
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
|
||||
|
@ -66,7 +74,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
|
|||
|
||||
// Legalize loads and stores to the private address space.
|
||||
setOperationAction(ISD::LOAD, MVT::i32, Custom);
|
||||
setOperationAction(ISD::LOAD, MVT::v2i32, Expand);
|
||||
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
|
||||
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
|
||||
setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
|
||||
setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
|
||||
|
@ -74,7 +82,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
|
|||
setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
|
||||
setOperationAction(ISD::STORE, MVT::i8, Custom);
|
||||
setOperationAction(ISD::STORE, MVT::i32, Custom);
|
||||
setOperationAction(ISD::STORE, MVT::v2i32, Expand);
|
||||
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
|
||||
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
|
||||
|
||||
setOperationAction(ISD::LOAD, MVT::i32, Custom);
|
||||
|
@ -170,6 +178,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
|
|||
}
|
||||
|
||||
case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
|
||||
case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
|
||||
case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
|
||||
unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
|
||||
|
||||
|
@ -1129,7 +1138,13 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
|
|||
DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
|
||||
Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
|
||||
}
|
||||
Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
|
||||
EVT NewVT = MVT::v4i32;
|
||||
unsigned NumElements = 4;
|
||||
if (VT.isVector()) {
|
||||
NewVT = VT;
|
||||
NumElements = VT.getVectorNumElements();
|
||||
}
|
||||
Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
|
||||
} else {
|
||||
// A non-constant pointer can't be folded; keep it as a v4i32 load
|
||||
Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
|
||||
|
|
|
@ -51,9 +51,17 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
|
|||
MachineBasicBlock::iterator MI, DebugLoc DL,
|
||||
unsigned DestReg, unsigned SrcReg,
|
||||
bool KillSrc) const {
|
||||
if (AMDGPU::R600_Reg128RegClass.contains(DestReg)
|
||||
&& AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
|
||||
for (unsigned I = 0; I < 4; I++) {
|
||||
unsigned VectorComponents = 0;
|
||||
if (AMDGPU::R600_Reg128RegClass.contains(DestReg) &&
|
||||
AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
|
||||
VectorComponents = 4;
|
||||
} else if(AMDGPU::R600_Reg64RegClass.contains(DestReg) &&
|
||||
AMDGPU::R600_Reg64RegClass.contains(SrcReg)) {
|
||||
VectorComponents = 2;
|
||||
}
|
||||
|
||||
if (VectorComponents > 0) {
|
||||
for (unsigned I = 0; I < VectorComponents; I++) {
|
||||
unsigned SubRegIndex = RI.getSubRegFromChannel(I);
|
||||
buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
|
||||
RI.getSubReg(DestReg, SubRegIndex),
|
||||
|
@ -62,11 +70,6 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
|
|||
RegState::Define | RegState::Implicit);
|
||||
}
|
||||
} else {
|
||||
|
||||
// We can't copy vec4 registers
|
||||
assert(!AMDGPU::R600_Reg128RegClass.contains(DestReg)
|
||||
&& !AMDGPU::R600_Reg128RegClass.contains(SrcReg));
|
||||
|
||||
MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
|
||||
DestReg, SrcReg);
|
||||
NewMI->getOperand(getOperandIdx(*NewMI, AMDGPU::OpName::src0))
|
||||
|
|
|
@ -1290,6 +1290,13 @@ def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg <
|
|||
[(global_store i32:$rw_gpr, i32:$index_gpr)]
|
||||
>;
|
||||
|
||||
// 64-bit store
|
||||
def RAT_WRITE_CACHELESS_64_eg : RAT_WRITE_CACHELESS_eg <
|
||||
(ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
|
||||
0x3, "RAT_WRITE_CACHELESS_64_eg $rw_gpr.XY, $index_gpr, $eop",
|
||||
[(global_store v2i32:$rw_gpr, i32:$index_gpr)]
|
||||
>;
|
||||
|
||||
// 128-bit store
|
||||
def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg <
|
||||
(ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
|
||||
|
@ -1358,6 +1365,18 @@ class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern>
|
|||
let Constraints = "$src_gpr.ptr = $dst_gpr";
|
||||
}
|
||||
|
||||
class VTX_READ_64_eg <bits<8> buffer_id, list<dag> pattern>
|
||||
: VTX_READ_eg <"VTX_READ_64 $dst_gpr.XY, $src_gpr", buffer_id,
|
||||
(outs R600_Reg64:$dst_gpr), pattern> {
|
||||
|
||||
let MEGA_FETCH_COUNT = 8;
|
||||
let DST_SEL_X = 0;
|
||||
let DST_SEL_Y = 1;
|
||||
let DST_SEL_Z = 7;
|
||||
let DST_SEL_W = 7;
|
||||
let DATA_FORMAT = 0x1D; // COLOR_32_32
|
||||
}
|
||||
|
||||
class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
|
||||
: VTX_READ_eg <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id,
|
||||
(outs R600_Reg128:$dst_gpr), pattern> {
|
||||
|
@ -1391,6 +1410,10 @@ def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0,
|
|||
[(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
|
||||
>;
|
||||
|
||||
def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0,
|
||||
[(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
|
||||
>;
|
||||
|
||||
def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0,
|
||||
[(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
|
||||
>;
|
||||
|
@ -1413,6 +1436,11 @@ def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1,
|
|||
[(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
|
||||
>;
|
||||
|
||||
// 64-bit reads
|
||||
def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1,
|
||||
[(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
|
||||
>;
|
||||
|
||||
// 128-bit reads
|
||||
def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1,
|
||||
[(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
|
||||
|
@ -1744,15 +1772,23 @@ def : Pat <
|
|||
def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;
|
||||
|
||||
|
||||
def RAT_STORE_DWORD_cm : EG_CF_RAT <
|
||||
0x57, 0x14, 0x1, (outs),
|
||||
(ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr),
|
||||
"EXPORT_RAT_INST_STORE_DWORD $rw_gpr, $index_gpr",
|
||||
[(global_store i32:$rw_gpr, i32:$index_gpr)]
|
||||
class RAT_STORE_DWORD_cm <bits<4> mask, dag ins, list<dag> pat> : EG_CF_RAT <
|
||||
0x57, 0x14, mask, (outs), ins,
|
||||
"EXPORT_RAT_INST_STORE_DWORD $rw_gpr, $index_gpr", pat
|
||||
> {
|
||||
let eop = 0; // This bit is not used on Cayman.
|
||||
}
|
||||
|
||||
def RAT_STORE_DWORD32_cm : RAT_STORE_DWORD_cm <0x1,
|
||||
(ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr),
|
||||
[(global_store i32:$rw_gpr, i32:$index_gpr)]
|
||||
>;
|
||||
|
||||
def RAT_STORE_DWORD64_cm : RAT_STORE_DWORD_cm <0x3,
|
||||
(ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr),
|
||||
[(global_store v2i32:$rw_gpr, i32:$index_gpr)]
|
||||
>;
|
||||
|
||||
class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
|
||||
: VTX_WORD0_cm, VTX_READ<name, buffer_id, outs, pattern> {
|
||||
|
||||
|
@ -1815,6 +1851,17 @@ class VTX_READ_32_cm <bits<8> buffer_id, list<dag> pattern>
|
|||
let Constraints = "$src_gpr.ptr = $dst_gpr";
|
||||
}
|
||||
|
||||
class VTX_READ_64_cm <bits<8> buffer_id, list<dag> pattern>
|
||||
: VTX_READ_cm <"VTX_READ_64 $dst_gpr, $src_gpr", buffer_id,
|
||||
(outs R600_Reg64:$dst_gpr), pattern> {
|
||||
|
||||
let DST_SEL_X = 0;
|
||||
let DST_SEL_Y = 1;
|
||||
let DST_SEL_Z = 7;
|
||||
let DST_SEL_W = 7;
|
||||
let DATA_FORMAT = 0x1D; // COLOR_32_32
|
||||
}
|
||||
|
||||
class VTX_READ_128_cm <bits<8> buffer_id, list<dag> pattern>
|
||||
: VTX_READ_cm <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id,
|
||||
(outs R600_Reg128:$dst_gpr), pattern> {
|
||||
|
@ -1846,6 +1893,10 @@ def VTX_READ_PARAM_32_cm : VTX_READ_32_cm <0,
|
|||
[(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
|
||||
>;
|
||||
|
||||
def VTX_READ_PARAM_64_cm : VTX_READ_64_cm <0,
|
||||
[(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
|
||||
>;
|
||||
|
||||
def VTX_READ_PARAM_128_cm : VTX_READ_128_cm <0,
|
||||
[(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
|
||||
>;
|
||||
|
@ -1868,6 +1919,11 @@ def VTX_READ_GLOBAL_32_cm : VTX_READ_32_cm <1,
|
|||
[(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
|
||||
>;
|
||||
|
||||
// 64-bit reads
|
||||
def VTX_READ_GLOBAL_64_cm : VTX_READ_64_cm <1,
|
||||
[(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
|
||||
>;
|
||||
|
||||
// 128-bit reads
|
||||
def VTX_READ_GLOBAL_128_cm : VTX_READ_128_cm <1,
|
||||
[(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
|
||||
|
@ -2297,10 +2353,24 @@ def : Insert_Element <i32, v4i32, 3, sub3>;
|
|||
def : Vector4_Build <v4f32, f32>;
|
||||
def : Vector4_Build <v4i32, i32>;
|
||||
|
||||
def : Extract_Element <f32, v2f32, 0, sub0>;
|
||||
def : Extract_Element <f32, v2f32, 1, sub1>;
|
||||
|
||||
def : Insert_Element <f32, v2f32, 0, sub0>;
|
||||
def : Insert_Element <f32, v2f32, 1, sub1>;
|
||||
|
||||
def : Extract_Element <i32, v2i32, 0, sub0>;
|
||||
def : Extract_Element <i32, v2i32, 1, sub1>;
|
||||
|
||||
def : Insert_Element <i32, v2i32, 0, sub0>;
|
||||
def : Insert_Element <i32, v2i32, 1, sub1>;
|
||||
|
||||
// bitconvert patterns
|
||||
|
||||
def : BitConvert <i32, f32, R600_Reg32>;
|
||||
def : BitConvert <f32, i32, R600_Reg32>;
|
||||
def : BitConvert <v2f32, v2i32, R600_Reg64>;
|
||||
def : BitConvert <v2i32, v2f32, R600_Reg64>;
|
||||
def : BitConvert <v4f32, v4i32, R600_Reg128>;
|
||||
def : BitConvert <v4i32, v4f32, R600_Reg128>;
|
||||
|
||||
|
|
|
@ -23,6 +23,14 @@ class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> :
|
|||
let HWEncoding = encoding;
|
||||
}
|
||||
|
||||
class R600Reg_64<string n, list<Register> subregs, bits<16> encoding> :
|
||||
RegisterWithSubRegs<n, subregs> {
|
||||
let Namespace = "AMDGPU";
|
||||
let SubRegIndices = [sub0, sub1];
|
||||
let HWEncoding = encoding;
|
||||
}
|
||||
|
||||
|
||||
foreach Index = 0-127 in {
|
||||
foreach Chan = [ "X", "Y", "Z", "W" ] in {
|
||||
// 32-bit Temporary Registers
|
||||
|
@ -41,6 +49,11 @@ foreach Index = 0-127 in {
|
|||
!cast<Register>("T"#Index#"_Z"),
|
||||
!cast<Register>("T"#Index#"_W")],
|
||||
Index>;
|
||||
|
||||
def T#Index#_XY : R600Reg_64 <"T"#Index#"",
|
||||
[!cast<Register>("T"#Index#"_X"),
|
||||
!cast<Register>("T"#Index#"_Y")],
|
||||
Index>;
|
||||
}
|
||||
|
||||
// KCACHE_BANK0
|
||||
|
@ -186,6 +199,9 @@ def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
|
|||
let CopyCost = -1;
|
||||
}
|
||||
|
||||
def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
|
||||
(add (sequence "T%u_XY", 0, 63))>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Register classes for indirect addressing
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
|
||||
; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s --check-prefix=SI-CHECK
|
||||
|
||||
; SI-CHECK: @f64_kernel_arg
|
||||
; SI-CHECK-DAG: S_LOAD_DWORDX2 SGPR{{[0-9]}}_SGPR{{[0-9]}}, SGPR0_SGPR1, 9
|
||||
|
|
|
@ -0,0 +1,34 @@
|
|||
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
|
||||
; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
|
||||
|
||||
; R600-CHECK: @build_vector2
|
||||
; R600-CHECK: MOV
|
||||
; R600-CHECK: MOV
|
||||
; R600-CHECK-NOT: MOV
|
||||
; SI-CHECK: @build_vector2
|
||||
; SI-CHECK-DAG: V_MOV_B32_e32 [[X:VGPR[0-9]]], 5
|
||||
; SI-CHECK-DAG: V_MOV_B32_e32 [[Y:VGPR[0-9]]], 6
|
||||
; SI-CHECK: BUFFER_STORE_DWORDX2 [[X]]_[[Y]]
|
||||
define void @build_vector2 (<2 x i32> addrspace(1)* %out) {
|
||||
entry:
|
||||
store <2 x i32> <i32 5, i32 6>, <2 x i32> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; R600-CHECK: @build_vector4
|
||||
; R600-CHECK: MOV
|
||||
; R600-CHECK: MOV
|
||||
; R600-CHECK: MOV
|
||||
; R600-CHECK: MOV
|
||||
; R600-CHECK-NOT: MOV
|
||||
; SI-CHECK: @build_vector4
|
||||
; SI-CHECK-DAG: V_MOV_B32_e32 [[X:VGPR[0-9]]], 5
|
||||
; SI-CHECK-DAG: V_MOV_B32_e32 [[Y:VGPR[0-9]]], 6
|
||||
; SI-CHECK-DAG: V_MOV_B32_e32 [[Z:VGPR[0-9]]], 7
|
||||
; SI-CHECK-DAG: V_MOV_B32_e32 [[W:VGPR[0-9]]], 8
|
||||
; SI-CHECK: BUFFER_STORE_DWORDX4 [[X]]_[[Y]]_[[Z]]_[[W]]
|
||||
define void @build_vector4 (<4 x i32> addrspace(1)* %out) {
|
||||
entry:
|
||||
store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
|
@ -15,6 +15,16 @@ declare float @llvm.R600.load.input(i32) readnone
|
|||
|
||||
declare void @llvm.AMDGPU.store.output(float, i32)
|
||||
|
||||
; CHECK: @fadd_v2f32
|
||||
; CHECK-DAG: ADD * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
|
||||
; CHECK-DAG: ADD * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
|
||||
define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
|
||||
entry:
|
||||
%0 = fadd <2 x float> %a, %b
|
||||
store <2 x float> %0, <2 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: @fadd_v4f32
|
||||
; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
|
|
|
@ -1,17 +1,36 @@
|
|||
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
||||
|
||||
;CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}
|
||||
;CHECK-DAG: MUL_IEEE * T{{[0-9]+\.[XYZW]}}
|
||||
;CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}
|
||||
;CHECK-DAG: MUL_IEEE * T{{[0-9]+\.[XYZW]}}
|
||||
;CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}
|
||||
;CHECK-DAG: MUL_IEEE * T{{[0-9]+\.[XYZW]}}
|
||||
;CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}
|
||||
;CHECK-DAG: MUL_IEEE * T{{[0-9]+\.[XYZW]}}
|
||||
; These tests check that fdiv is expanded correctly and also test that the
|
||||
; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate
|
||||
; instruction groups.
|
||||
|
||||
define void @test(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) {
|
||||
; CHECK: @fdiv_v2f32
|
||||
; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
|
||||
; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
|
||||
; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
|
||||
; CHECK-DAG: MUL_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
|
||||
define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
|
||||
entry:
|
||||
%0 = fdiv <4 x float> %a, %b
|
||||
store <4 x float> %0, <4 x float> addrspace(1)* %out
|
||||
%0 = fdiv <2 x float> %a, %b
|
||||
store <2 x float> %0, <2 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: @fdiv_v4f32
|
||||
; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
|
||||
; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
|
||||
; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
|
||||
; CHECK-DAG: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
|
||||
|
||||
define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
|
||||
%b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
|
||||
%a = load <4 x float> addrspace(1) * %in
|
||||
%b = load <4 x float> addrspace(1) * %b_ptr
|
||||
%result = fdiv <4 x float> %a, %b
|
||||
store <4 x float> %result, <4 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
|
|
@ -15,6 +15,16 @@ declare float @llvm.R600.load.input(i32) readnone
|
|||
|
||||
declare void @llvm.AMDGPU.store.output(float, i32)
|
||||
|
||||
; CHECK: @fmul_v2f32
|
||||
; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW]}}
|
||||
; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW]}}
|
||||
define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
|
||||
entry:
|
||||
%0 = fmul <2 x float> %a, %b
|
||||
store <2 x float> %0, <2 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: @fmul_v4f32
|
||||
; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
|
|
|
@ -1,6 +1,18 @@
|
|||
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
|
||||
; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
|
||||
|
||||
; R600-CHECK: @fp_to_sint_v2i32
|
||||
; R600-CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
|
||||
; R600-CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
|
||||
; SI-CHECK: @fp_to_sint_v2i32
|
||||
; SI-CHECK: V_CVT_I32_F32_e32
|
||||
; SI-CHECK: V_CVT_I32_F32_e32
|
||||
define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
|
||||
%result = fptosi <2 x float> %in to <2 x i32>
|
||||
store <2 x i32> %result, <2 x i32> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; R600-CHECK: @fp_to_sint_v4i32
|
||||
; R600-CHECK: FLT_TO_INT {{[* ]*}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
|
||||
; R600-CHECK: FLT_TO_INT {{[* ]*}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
|
||||
|
@ -11,7 +23,6 @@
|
|||
; SI-CHECK: V_CVT_I32_F32_e32
|
||||
; SI-CHECK: V_CVT_I32_F32_e32
|
||||
; SI-CHECK: V_CVT_I32_F32_e32
|
||||
|
||||
define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
|
||||
%value = load <4 x float> addrspace(1) * %in
|
||||
%result = fptosi <4 x float> %value to <4 x i32>
|
||||
|
|
|
@ -1,5 +1,15 @@
|
|||
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
||||
|
||||
; CHECK: @fp_to_uint_v2i32
|
||||
; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
|
||||
; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
|
||||
|
||||
define void @fp_to_uint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
|
||||
%result = fptoui <2 x float> %in to <2 x i32>
|
||||
store <2 x i32> %result, <2 x i32> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: @fp_to_uint_v4i32
|
||||
; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
|
||||
; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
|
||||
|
|
|
@ -15,12 +15,21 @@ declare float @llvm.R600.load.input(i32) readnone
|
|||
|
||||
declare void @llvm.AMDGPU.store.output(float, i32)
|
||||
|
||||
; CHECK: @fsub_v4f32
|
||||
; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; CHECK: @fsub_v2f32
|
||||
; CHECK-DAG: ADD * T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z
|
||||
; CHECK-DAG: ADD * T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y
|
||||
define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
|
||||
entry:
|
||||
%0 = fsub <2 x float> %a, %b
|
||||
store <2 x float> %0, <2 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: @fsub_v4f32
|
||||
; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
|
||||
; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
|
||||
; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
|
||||
; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
|
||||
define void @fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
|
||||
%b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
|
||||
%a = load <4 x float> addrspace(1) * %in
|
||||
|
|
|
@ -92,8 +92,7 @@ entry:
|
|||
|
||||
; load a v2f32 value from the global address space
|
||||
; R600-CHECK: @load_v2f32
|
||||
; R600-CHECK: VTX_READ_32
|
||||
; R600-CHECK: VTX_READ_32
|
||||
; R600-CHECK: VTX_READ_64
|
||||
|
||||
; SI-CHECK: @load_v2f32
|
||||
; SI-CHECK: BUFFER_LOAD_DWORDX2
|
||||
|
|
|
@ -3,8 +3,7 @@
|
|||
|
||||
; load a v2i32 value from the global address space.
|
||||
; EG-CHECK: @load_v2i32
|
||||
; EG-CHECK-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4
|
||||
; EG-CHECK-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
|
||||
; EG-CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0
|
||||
; SI-CHECK: @load_v2i32
|
||||
; SI-CHECK: BUFFER_LOAD_DWORDX2 VGPR{{[0-9]+}}
|
||||
define void @load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
|
||||
|
|
|
@ -1,26 +1,23 @@
|
|||
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
|
||||
|
||||
;EG-CHECK: @test2
|
||||
;EG-CHECK: SETE_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
;EG-CHECK: SETE_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; EG-CHECK: @setcc_v2i32
|
||||
; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z
|
||||
; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y
|
||||
|
||||
define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
|
||||
%b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
|
||||
%a = load <2 x i32> addrspace(1) * %in
|
||||
%b = load <2 x i32> addrspace(1) * %b_ptr
|
||||
define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
|
||||
%result = icmp eq <2 x i32> %a, %b
|
||||
%sext = sext <2 x i1> %result to <2 x i32>
|
||||
store <2 x i32> %sext, <2 x i32> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
;EG-CHECK: @test4
|
||||
;EG-CHECK: SETE_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
;EG-CHECK: SETE_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
;EG-CHECK: SETE_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
;EG-CHECK: SETE_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; EG-CHECK: @setcc_v4i32
|
||||
; EG-CHECK-DAG: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
|
||||
define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
|
||||
define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
|
||||
%b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
|
||||
%a = load <4 x i32> addrspace(1) * %in
|
||||
%b = load <4 x i32> addrspace(1) * %b_ptr
|
||||
|
|
|
@ -1,6 +1,18 @@
|
|||
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
|
||||
; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
|
||||
|
||||
; R600-CHECK: @sint_to_fp_v2i32
|
||||
; R600-CHECK-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
|
||||
; R600-CHECK-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
|
||||
; SI-CHECK: @sint_to_fp_v2i32
|
||||
; SI-CHECK: V_CVT_F32_I32_e32
|
||||
; SI-CHECK: V_CVT_F32_I32_e32
|
||||
define void @sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
|
||||
%result = sitofp <2 x i32> %in to <2 x float>
|
||||
store <2 x float> %result, <2 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; R600-CHECK: @sint_to_fp_v4i32
|
||||
; R600-CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; R600-CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
|
|
|
@ -17,11 +17,9 @@ define void @store_f32(float addrspace(1)* %out, float %in) {
|
|||
|
||||
; vec2 floating-point stores
|
||||
; EG-CHECK: @store_v2f32
|
||||
; EG-CHECK: RAT_WRITE_CACHELESS_32_eg
|
||||
; EG-CHECK-NEXT: RAT_WRITE_CACHELESS_32_eg
|
||||
; EG-CHECK: RAT_WRITE_CACHELESS_64_eg
|
||||
; CM-CHECK: @store_v2f32
|
||||
; CM-CHECK: EXPORT_RAT_INST_STORE_DWORD
|
||||
; CM-CHECK-NEXT: EXPORT_RAT_INST_STORE_DWORD
|
||||
; SI-CHECK: @store_v2f32
|
||||
; SI-CHECK: BUFFER_STORE_DWORDX2
|
||||
|
||||
|
@ -41,11 +39,9 @@ entry:
|
|||
; be two 32-bit stores.
|
||||
|
||||
; EG-CHECK: @vecload2
|
||||
; EG-CHECK: RAT_WRITE_CACHELESS_32_eg
|
||||
; EG-CHECK: RAT_WRITE_CACHELESS_32_eg
|
||||
; EG-CHECK: RAT_WRITE_CACHELESS_64_eg
|
||||
; CM-CHECK: @vecload2
|
||||
; CM-CHECK: EXPORT_RAT_INST_STORE_DWORD
|
||||
; CM-CHECK: EXPORT_RAT_INST_STORE_DWORD
|
||||
; SI-CHECK: @vecload2
|
||||
; SI-CHECK: BUFFER_STORE_DWORDX2
|
||||
define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
|
||||
;EG-CHECK: @test2
|
||||
;EG-CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
;EG-CHECK: SUB_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
;EG-CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
|
||||
;SI-CHECK: @test2
|
||||
;SI-CHECK: V_SUB_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
|
||||
|
|
|
@ -1,6 +1,18 @@
|
|||
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
|
||||
; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
|
||||
|
||||
; R600-CHECK: @uint_to_fp_v2i32
|
||||
; R600-CHECK-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
|
||||
; R600-CHECK-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
|
||||
; SI-CHECK: @uint_to_fp_v2i32
|
||||
; SI-CHECK: V_CVT_F32_U32_e32
|
||||
; SI-CHECK: V_CVT_F32_U32_e32
|
||||
define void @uint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
|
||||
%result = uitofp <2 x i32> %in to <2 x float>
|
||||
store <2 x float> %result, <2 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; R600-CHECK: @uint_to_fp_v4i32
|
||||
; R600-CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
; R600-CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
|
||||
|
|
Loading…
Reference in New Issue