[AMDGPU] Add 224-bit vector types and link 192-bit types to MVTs
Add SReg_224, VReg_224, AReg_224, etc. Link 224-bit types with
v7i32/v7f32. Link existing 192-bit types to newly added
v3i64/v3f64/v6i32/v6f32.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D104622
commit 98f48723f2 (parent 6b0f98d442)
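As a rough sketch of what the change covers (illustrative IR written for this summary, not taken from the patch's test suite): a <7 x i32> value is 224 bits wide and, with this change, maps onto the new SReg_224/VReg_224/AReg_224 classes through v7i32, while <3 x i64> now uses the retyped 192-bit classes via v3i64.

; hypothetical kernel: a 224-bit vector copy, selectable through the
; new 224-bit register classes after this change
define amdgpu_kernel void @copy_v7i32(<7 x i32> addrspace(1)* %in,
                                      <7 x i32> addrspace(1)* %out) {
  %v = load <7 x i32>, <7 x i32> addrspace(1)* %in
  store <7 x i32> %v, <7 x i32> addrspace(1)* %out
  ret void
}

; hypothetical kernel: a 192-bit vector copy; v3i64 is promoted to
; v6i32 operations by the lowering changes below
define amdgpu_kernel void @copy_v3i64(<3 x i64> addrspace(1)* %in,
                                      <3 x i64> addrspace(1)* %out) {
  %v = load <3 x i64>, <3 x i64> addrspace(1)* %in
  store <3 x i64> %v, <3 x i64> addrspace(1)* %out
  ret void
}

The diff below wires these types through the lowering hooks, register banks, spill opcodes, and bitcast patterns.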
@@ -965,6 +965,16 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
       IsSGPR = false;
       IsAGPR = true;
       Width = 6;
+    } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
+      IsSGPR = false;
+      Width = 7;
+    } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
+      IsSGPR = true;
+      Width = 7;
+    } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
+      IsSGPR = false;
+      IsAGPR = true;
+      Width = 7;
     } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
       assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
              "trap handler registers should not be used");

@@ -78,6 +78,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
 
+  setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
+
+  setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
+
   setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
 
@@ -99,9 +105,15 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
 
+  setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
+
   setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
 
+  setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
+
   setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
 
@@ -173,12 +185,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 
   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
 
   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
@@ -198,6 +212,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::STORE, MVT::v5f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
 
+  setOperationAction(ISD::STORE, MVT::v6f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
+
+  setOperationAction(ISD::STORE, MVT::v7f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
+
   setOperationAction(ISD::STORE, MVT::v8f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
 
@@ -219,6 +239,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::STORE, MVT::v2f64, Promote);
   AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
 
+  setOperationAction(ISD::STORE, MVT::v3i64, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
+
+  setOperationAction(ISD::STORE, MVT::v3f64, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
+
   setOperationAction(ISD::STORE, MVT::v4i64, Promote);
   AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
 
@@ -261,6 +287,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
   setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
 
+  setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
+  setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
+  setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
+  setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
+
   setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
   setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
   setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
@@ -325,6 +356,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom);
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v6i32, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v6f32, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v7i32, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v7f32, Custom);
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom);
@@ -337,6 +372,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6i32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7i32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
@@ -345,6 +384,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f64, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i64, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i64, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f64, Custom);
@@ -414,8 +455,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
 
   static const MVT::SimpleValueType VectorIntTypes[] = {
-      MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32
-  };
+      MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32};
 
   for (MVT VT : VectorIntTypes) {
     // Expand the following operations for the current type by default.
@@ -456,8 +496,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   }
 
   static const MVT::SimpleValueType FloatVectorTypes[] = {
-      MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32
-  };
+      MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32};
 
   for (MVT VT : FloatVectorTypes) {
     setOperationAction(ISD::FABS, VT, Expand);
@@ -507,6 +546,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
   AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
 
+  setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
+  AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
+
+  setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
+  AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
+
   // There are no libcalls of any kind.
   for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
     setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);

@@ -7,16 +7,16 @@
 //===----------------------------------------------------------------------===//
 
 def SGPRRegBank : RegisterBank<"SGPR",
-  [SReg_LO16, SReg_32, SReg_64, SReg_96, SReg_128, SReg_160, SReg_192, SReg_256, SReg_512, SReg_1024]
+  [SReg_LO16, SReg_32, SReg_64, SReg_96, SReg_128, SReg_160, SReg_192, SReg_224, SReg_256, SReg_512, SReg_1024]
 >;
 
 def VGPRRegBank : RegisterBank<"VGPR",
-  [VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_256, VReg_512, VReg_1024]
+  [VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_512, VReg_1024]
 >;
 
 // It is helpful to distinguish conditions from ordinary SGPRs.
 def VCCRegBank : RegisterBank <"VCC", [SReg_1]>;
 
 def AGPRRegBank : RegisterBank <"AGPR",
-  [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_256, AReg_512, AReg_1024]
+  [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_224, AReg_256, AReg_512, AReg_1024]
 >;

@@ -2192,6 +2192,7 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
     case 4: return AMDGPU::VReg_128RegClassID;
     case 5: return AMDGPU::VReg_160RegClassID;
     case 6: return AMDGPU::VReg_192RegClassID;
+    case 7: return AMDGPU::VReg_224RegClassID;
     case 8: return AMDGPU::VReg_256RegClassID;
     case 16: return AMDGPU::VReg_512RegClassID;
     case 32: return AMDGPU::VReg_1024RegClassID;
@@ -2214,6 +2215,7 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
     case 4: return AMDGPU::SGPR_128RegClassID;
     case 5: return AMDGPU::SGPR_160RegClassID;
     case 6: return AMDGPU::SGPR_192RegClassID;
+    case 7: return AMDGPU::SGPR_224RegClassID;
     case 8: return AMDGPU::SGPR_256RegClassID;
     case 16: return AMDGPU::SGPR_512RegClassID;
     }
@@ -2226,6 +2228,7 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
     case 4: return AMDGPU::AReg_128RegClassID;
     case 5: return AMDGPU::AReg_160RegClassID;
     case 6: return AMDGPU::AReg_192RegClassID;
+    case 7: return AMDGPU::AReg_224RegClassID;
     case 8: return AMDGPU::AReg_256RegClassID;
     case 16: return AMDGPU::AReg_512RegClassID;
     case 32: return AMDGPU::AReg_1024RegClassID;

@@ -463,6 +463,7 @@ SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo,
       MRI.getRegClass(AMDGPU::AReg_128RegClassID).contains(Reg) ||
       MRI.getRegClass(AMDGPU::AReg_160RegClassID).contains(Reg) ||
       MRI.getRegClass(AMDGPU::AReg_192RegClassID).contains(Reg) ||
+      MRI.getRegClass(AMDGPU::AReg_224RegClassID).contains(Reg) ||
       MRI.getRegClass(AMDGPU::AReg_256RegClassID).contains(Reg) ||
       MRI.getRegClass(AMDGPU::AGPR_LO16RegClassID).contains(Reg))
     Enc |= 512;

@@ -102,6 +102,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
   addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
 
+  addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
+  addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
+
+  addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
+  addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
+
+  addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
+  addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
+
   addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
   addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
 
@@ -145,6 +154,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v6i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v7i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
   setOperationAction(ISD::LOAD, MVT::i1, Custom);
@@ -154,6 +165,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::STORE, MVT::v3i32, Custom);
   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
   setOperationAction(ISD::STORE, MVT::v5i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v6i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v7i32, Custom);
   setOperationAction(ISD::STORE, MVT::v8i32, Custom);
   setOperationAction(ISD::STORE, MVT::v16i32, Custom);
   setOperationAction(ISD::STORE, MVT::i1, Custom);
@@ -176,6 +189,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
   setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
 
+  setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
+  setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
   setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
   setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
   setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
@@ -203,8 +218,16 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
   setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
+  setOperationAction(ISD::TRUNCATE, MVT::v3i32, Expand);
+  setOperationAction(ISD::FP_ROUND, MVT::v3f32, Expand);
   setOperationAction(ISD::TRUNCATE, MVT::v4i32, Expand);
   setOperationAction(ISD::FP_ROUND, MVT::v4f32, Expand);
+  setOperationAction(ISD::TRUNCATE, MVT::v5i32, Expand);
+  setOperationAction(ISD::FP_ROUND, MVT::v5f32, Expand);
+  setOperationAction(ISD::TRUNCATE, MVT::v6i32, Expand);
+  setOperationAction(ISD::FP_ROUND, MVT::v6f32, Expand);
+  setOperationAction(ISD::TRUNCATE, MVT::v7i32, Expand);
+  setOperationAction(ISD::FP_ROUND, MVT::v7f32, Expand);
   setOperationAction(ISD::TRUNCATE, MVT::v8i32, Expand);
   setOperationAction(ISD::FP_ROUND, MVT::v8f32, Expand);
   setOperationAction(ISD::TRUNCATE, MVT::v16i32, Expand);
@@ -245,6 +268,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   // with > 4 elements.
   for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
                   MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
+                  MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
                   MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
                   MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32 }) {
     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
@@ -290,6 +314,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
   }
 
+  for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
+    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
+
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
+    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
+
+    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
+    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
+
+    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
+    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
+  }
+
   for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
     setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
     AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
@@ -365,9 +403,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom);
   setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom);
 
-  // Deal with vec5 vector operations when widened to vec8.
+  // Deal with vec5/6/7 vector operations when widened to vec8.
   setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom);
   setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6i32, Custom);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6f32, Custom);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7i32, Custom);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7f32, Custom);
   setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom);
   setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom);
 
@@ -11697,6 +11739,8 @@ static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
     return AMDGPU::VReg_160_Align2RegClassID;
   case AMDGPU::VReg_192RegClassID:
     return AMDGPU::VReg_192_Align2RegClassID;
+  case AMDGPU::VReg_224RegClassID:
+    return AMDGPU::VReg_224_Align2RegClassID;
   case AMDGPU::VReg_256RegClassID:
     return AMDGPU::VReg_256_Align2RegClassID;
   case AMDGPU::VReg_512RegClassID:

@@ -1340,6 +1340,8 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
     return AMDGPU::SI_SPILL_S160_SAVE;
   case 24:
     return AMDGPU::SI_SPILL_S192_SAVE;
+  case 28:
+    return AMDGPU::SI_SPILL_S224_SAVE;
   case 32:
     return AMDGPU::SI_SPILL_S256_SAVE;
   case 64:
@@ -1365,6 +1367,8 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
     return AMDGPU::SI_SPILL_V160_SAVE;
   case 24:
     return AMDGPU::SI_SPILL_V192_SAVE;
+  case 28:
+    return AMDGPU::SI_SPILL_V224_SAVE;
   case 32:
     return AMDGPU::SI_SPILL_V256_SAVE;
   case 64:
@@ -1473,6 +1477,8 @@ static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
     return AMDGPU::SI_SPILL_S160_RESTORE;
   case 24:
     return AMDGPU::SI_SPILL_S192_RESTORE;
+  case 28:
+    return AMDGPU::SI_SPILL_S224_RESTORE;
   case 32:
     return AMDGPU::SI_SPILL_S256_RESTORE;
   case 64:
@@ -1498,6 +1504,8 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
     return AMDGPU::SI_SPILL_V160_RESTORE;
   case 24:
     return AMDGPU::SI_SPILL_V192_RESTORE;
+  case 28:
+    return AMDGPU::SI_SPILL_V224_RESTORE;
   case 32:
     return AMDGPU::SI_SPILL_V256_RESTORE;
   case 64:

@@ -675,6 +675,7 @@ defm SI_SPILL_S96 : SI_SPILL_SGPR <SReg_96>;
 defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
 defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>;
 defm SI_SPILL_S192 : SI_SPILL_SGPR <SReg_192>;
+defm SI_SPILL_S224 : SI_SPILL_SGPR <SReg_224>;
 defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
 defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
 defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;
@@ -718,6 +719,7 @@ defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
 defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
 defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
 defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>;
+defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224>;
 defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
 defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
 defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;
@@ -728,6 +730,7 @@ defm SI_SPILL_A96 : SI_SPILL_VGPR <AReg_96, 1>;
 defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128, 1>;
 defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160, 1>;
 defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192, 1>;
+defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224, 1>;
 defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>;
 defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>;
 defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>;
@@ -1052,6 +1055,38 @@ foreach Index = 0-4 in {
   >;
 }
 
+foreach Index = 0-5 in {
+  def Extract_Element_v6i32_#Index : Extract_Element <
+    i32, v6i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v6i32_#Index : Insert_Element <
+    i32, v6i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Extract_Element_v6f32_#Index : Extract_Element <
+    f32, v6f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v6f32_#Index : Insert_Element <
+    f32, v6f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
+foreach Index = 0-6 in {
+  def Extract_Element_v7i32_#Index : Extract_Element <
+    i32, v7i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v7i32_#Index : Insert_Element <
+    i32, v7i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Extract_Element_v7f32_#Index : Extract_Element <
+    f32, v7f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v7f32_#Index : Insert_Element <
+    f32, v7f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
 foreach Index = 0-7 in {
   def Extract_Element_v8i32_#Index : Extract_Element <
     i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -1202,8 +1237,32 @@ def : BitConvert <v4f32, v2i64, VReg_128>;
 def : BitConvert <v2i64, v4f32, VReg_128>;
 
 // 160-bit bitcast
-def : BitConvert <v5i32, v5f32, SGPR_160>;
-def : BitConvert <v5f32, v5i32, SGPR_160>;
+def : BitConvert <v5i32, v5f32, SReg_160>;
+def : BitConvert <v5f32, v5i32, SReg_160>;
+def : BitConvert <v5i32, v5f32, VReg_160>;
+def : BitConvert <v5f32, v5i32, VReg_160>;
+
+// 192-bit bitcast
+def : BitConvert <v6i32, v6f32, SReg_192>;
+def : BitConvert <v6f32, v6i32, SReg_192>;
+def : BitConvert <v6i32, v6f32, VReg_192>;
+def : BitConvert <v6f32, v6i32, VReg_192>;
+def : BitConvert <v3i64, v3f64, VReg_192>;
+def : BitConvert <v3f64, v3i64, VReg_192>;
+def : BitConvert <v3i64, v6i32, VReg_192>;
+def : BitConvert <v3i64, v6f32, VReg_192>;
+def : BitConvert <v3f64, v6i32, VReg_192>;
+def : BitConvert <v3f64, v6f32, VReg_192>;
+def : BitConvert <v6i32, v3i64, VReg_192>;
+def : BitConvert <v6f32, v3i64, VReg_192>;
+def : BitConvert <v6i32, v3f64, VReg_192>;
+def : BitConvert <v6f32, v3f64, VReg_192>;
+
+// 224-bit bitcast
+def : BitConvert <v7i32, v7f32, SReg_224>;
+def : BitConvert <v7f32, v7i32, SReg_224>;
+def : BitConvert <v7i32, v7f32, VReg_224>;
+def : BitConvert <v7f32, v7i32, VReg_224>;
 
 // 256-bit bitcast
 def : BitConvert <v8i32, v8f32, SReg_256>;

@@ -1874,6 +1874,8 @@ getAnyVGPRClassForBitWidth(unsigned BitWidth) {
     return &AMDGPU::VReg_160RegClass;
   if (BitWidth <= 192)
     return &AMDGPU::VReg_192RegClass;
+  if (BitWidth <= 224)
+    return &AMDGPU::VReg_224RegClass;
   if (BitWidth <= 256)
     return &AMDGPU::VReg_256RegClass;
   if (BitWidth <= 512)
@@ -1896,6 +1898,8 @@ getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
     return &AMDGPU::VReg_160_Align2RegClass;
   if (BitWidth <= 192)
     return &AMDGPU::VReg_192_Align2RegClass;
+  if (BitWidth <= 224)
+    return &AMDGPU::VReg_224_Align2RegClass;
   if (BitWidth <= 256)
     return &AMDGPU::VReg_256_Align2RegClass;
   if (BitWidth <= 512)
@@ -2036,6 +2040,11 @@ SIRegisterInfo::getPhysRegClass(MCRegister Reg) const {
     &AMDGPU::SReg_192RegClass,
     &AMDGPU::AReg_192_Align2RegClass,
     &AMDGPU::AReg_192RegClass,
+    &AMDGPU::VReg_224_Align2RegClass,
+    &AMDGPU::VReg_224RegClass,
+    &AMDGPU::SReg_224RegClass,
+    &AMDGPU::AReg_224_Align2RegClass,
+    &AMDGPU::AReg_224RegClass,
     &AMDGPU::VReg_256_Align2RegClass,
     &AMDGPU::VReg_256RegClass,
     &AMDGPU::SReg_256RegClass,

@@ -58,6 +58,7 @@ class getSubRegs<int size> {
   list<SubRegIndex> ret4 = [sub0, sub1, sub2, sub3];
   list<SubRegIndex> ret5 = [sub0, sub1, sub2, sub3, sub4];
   list<SubRegIndex> ret6 = [sub0, sub1, sub2, sub3, sub4, sub5];
+  list<SubRegIndex> ret7 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6];
   list<SubRegIndex> ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7];
   list<SubRegIndex> ret16 = [sub0, sub1, sub2, sub3,
                              sub4, sub5, sub6, sub7,
@@ -77,9 +78,10 @@ class getSubRegs<int size> {
               !if(!eq(size, 4), ret4,
                   !if(!eq(size, 5), ret5,
                       !if(!eq(size, 6), ret6,
-                          !if(!eq(size, 8), ret8,
-                              !if(!eq(size, 16), ret16,
-                                  ret32)))))));
+                          !if(!eq(size, 7), ret7,
+                              !if(!eq(size, 8), ret8,
+                                  !if(!eq(size, 16), ret16,
+                                      ret32))))))));
 }
 
 // Generates list of sequential register tuple names.
@@ -350,9 +352,12 @@ def SGPR_128Regs : SIRegisterTuples<getSubRegs<4>.ret, SGPR_32, 105, 4, 4, "s">;
 // SGPR 160-bit registers. No operations use these, but for symmetry with 160-bit VGPRs.
 def SGPR_160Regs : SIRegisterTuples<getSubRegs<5>.ret, SGPR_32, 105, 4, 5, "s">;
 
-// SGPR 192-bit registers
+// SGPR 192-bit registers. No operations use these, but for symmetry with 192-bit VGPRs.
 def SGPR_192Regs : SIRegisterTuples<getSubRegs<6>.ret, SGPR_32, 105, 4, 6, "s">;
 
+// SGPR 224-bit registers. No operations use these, but for symmetry with 224-bit VGPRs.
+def SGPR_224Regs : SIRegisterTuples<getSubRegs<7>.ret, SGPR_32, 105, 4, 7, "s">;
+
 // SGPR 256-bit registers
 def SGPR_256Regs : SIRegisterTuples<getSubRegs<8>.ret, SGPR_32, 105, 4, 8, "s">;
 
@@ -508,6 +513,9 @@ def VGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, VGPR_32, 255, 1, 5, "v">;
 // VGPR 192-bit registers
 def VGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, VGPR_32, 255, 1, 6, "v">;
 
+// VGPR 224-bit registers
+def VGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, VGPR_32, 255, 1, 7, "v">;
+
 // VGPR 256-bit registers
 def VGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, VGPR_32, 255, 1, 8, "v">;
 
@@ -547,6 +555,9 @@ def AGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, AGPR_32, 255, 1, 5, "a">;
 // AGPR 192-bit registers
 def AGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, AGPR_32, 255, 1, 6, "a">;
 
+// AGPR 224-bit registers
+def AGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, AGPR_32, 255, 1, 7, "a">;
+
 // AGPR 256-bit registers
 def AGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, AGPR_32, 255, 1, 8, "a">;
 
@@ -725,20 +736,41 @@ def SReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
                             (add SGPR_160)> {
+  // FIXME: Should be isAllocatable = 0, but that causes all TableGen-generated
+  // subclasses of SGPR_160 to be marked unallocatable too.
+  // This occurs because SGPR_160 and SReg_160 classes are equivalent in size
+  // meaning their enumeration order is dependent on alphanumeric ordering of
+  // their names. The superclass for inherence is the last one in topological
+  // order (i.e. enumeration order), hence SReg_160 is selected.
+  // Potential workarounds involve renaming SGPR_160, adding another class
+  // which is ordered last and hence used for inheritance, or adding more
+  // registers to SReg_160 to cause it to be moved earlier in the superclass
+  // list.
   let CopyCost = 3;
 }
 
-def SGPR_192 : RegisterClass<"AMDGPU", [untyped], 32, (add SGPR_192Regs)> {
-  let Size = 192;
+// There are no 6-component scalar instructions, but this is needed
+// for symmetry with VGPRs.
+def SGPR_192 : RegisterClass<"AMDGPU", [v6i32, v6f32, v3i64, v3f64], 32, (add SGPR_192Regs)> {
   let AllocationPriority = 17;
 }
 
-def SReg_192 : RegisterClass<"AMDGPU", [untyped], 32, (add SGPR_192)> {
-  let Size = 192;
+def SReg_192 : RegisterClass<"AMDGPU", [v6i32, v6f32, v3i64, v3f64], 32, (add SGPR_192)> {
   let isAllocatable = 0;
   let CopyCost = 3;
 }
 
+// There are no 7-component scalar instructions, but this is needed
+// for symmetry with VGPRs.
+def SGPR_224 : RegisterClass<"AMDGPU", [v7i32, v7f32], 32, (add SGPR_224Regs)> {
+  let AllocationPriority = 18;
+}
+
+def SReg_224 : RegisterClass<"AMDGPU", [v7i32, v7f32], 32, (add SGPR_224)> {
+  let isAllocatable = 0;
+  let CopyCost = 4;
+}
+
 def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32, (add SGPR_256Regs)> {
-  let AllocationPriority = 18;
+  let AllocationPriority = 19;
 }
 
 def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32, (add TTMP_256Regs)> {
@@ -754,7 +786,7 @@ def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32,
 
 def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32,
                              (add SGPR_512Regs)> {
-  let AllocationPriority = 19;
+  let AllocationPriority = 20;
 }
 
 def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32,
@@ -776,7 +808,7 @@ def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 3
 
 def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32, v16i64, v16f64], 32,
                               (add SGPR_1024Regs)> {
-  let AllocationPriority = 20;
+  let AllocationPriority = 21;
 }
 
 def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32, v16i64, v16f64], 32,
@@ -812,7 +844,8 @@ defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
 defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64], (add VGPR_128)>;
 defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
 
-defm VReg_192 : VRegClass<6, [untyped], (add VGPR_192)>;
+defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>;
+defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>;
 defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>;
 defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>;
 defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>;
@@ -832,7 +865,8 @@ defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16],
 defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>;
 defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64], (add AGPR_128)>;
 defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
-defm AReg_192 : ARegClass<6, [untyped], (add AGPR_192)>;
+defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>;
+defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>;
 defm AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>;
 defm AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>;
 defm AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)>;

@@ -1650,6 +1650,13 @@ unsigned getRegBitWidth(unsigned RCID) {
   case AMDGPU::VReg_192_Align2RegClassID:
   case AMDGPU::AReg_192_Align2RegClassID:
     return 192;
+  case AMDGPU::SGPR_224RegClassID:
+  case AMDGPU::SReg_224RegClassID:
+  case AMDGPU::VReg_224RegClassID:
+  case AMDGPU::AReg_224RegClassID:
+  case AMDGPU::VReg_224_Align2RegClassID:
+  case AMDGPU::AReg_224_Align2RegClassID:
+    return 224;
   case AMDGPU::SGPR_256RegClassID:
   case AMDGPU::SReg_256RegClassID:
   case AMDGPU::VReg_256RegClassID:

@@ -3501,13 +3501,13 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_s(<7 x float> inreg %v
; GPRIDX-LABEL: dyn_insertelement_v7f32_s_v_s:
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: s_mov_b32 s0, s2
; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_mov_b32 s2, s4
; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 s4, s6
; GPRIDX-NEXT: s_mov_b32 s5, s7
; GPRIDX-NEXT: s_mov_b32 s6, s8
; GPRIDX-NEXT: v_mov_b32_e32 v14, s7
; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 s5, s7
; GPRIDX-NEXT: v_mov_b32_e32 v13, s6
; GPRIDX-NEXT: v_mov_b32_e32 v7, s0
; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0
; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
@@ -3526,7 +3526,6 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_s(<7 x float> inreg %v
; GPRIDX-NEXT: v_mov_b32_e32 v12, s5
; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 5
; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v12, v0, vcc
; GPRIDX-NEXT: v_mov_b32_e32 v13, s6
; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 6
; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v13, v0, vcc
; GPRIDX-NEXT: v_mov_b32_e32 v0, v7
@@ -3535,13 +3534,13 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_s(<7 x float> inreg %v
; MOVREL-LABEL: dyn_insertelement_v7f32_s_v_s:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: s_mov_b32 s0, s2
; MOVREL-NEXT: s_mov_b32 s1, s3
; MOVREL-NEXT: s_mov_b32 s2, s4
; MOVREL-NEXT: s_mov_b32 s3, s5
; MOVREL-NEXT: s_mov_b32 s4, s6
; MOVREL-NEXT: s_mov_b32 s5, s7
; MOVREL-NEXT: s_mov_b32 s6, s8
; MOVREL-NEXT: v_mov_b32_e32 v14, s7
; MOVREL-NEXT: s_mov_b32 s1, s3
; MOVREL-NEXT: s_mov_b32 s3, s5
; MOVREL-NEXT: s_mov_b32 s5, s7
; MOVREL-NEXT: v_mov_b32_e32 v13, s6
; MOVREL-NEXT: v_mov_b32_e32 v7, s0
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 0
; MOVREL-NEXT: v_mov_b32_e32 v8, s1
@@ -3551,7 +3550,6 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_s(<7 x float> inreg %v
; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 1
; MOVREL-NEXT: v_mov_b32_e32 v12, s5
; MOVREL-NEXT: v_mov_b32_e32 v13, s6
; MOVREL-NEXT: v_cndmask_b32_e32 v1, v8, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 2
; MOVREL-NEXT: v_cndmask_b32_e32 v2, v9, v0, vcc_lo
@@ -3574,13 +3572,13 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_v(<7 x float> inreg %v
; GPRIDX-LABEL: dyn_insertelement_v7f32_s_v_v:
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: s_mov_b32 s0, s2
; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_mov_b32 s2, s4
; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 s4, s6
; GPRIDX-NEXT: s_mov_b32 s5, s7
; GPRIDX-NEXT: s_mov_b32 s6, s8
; GPRIDX-NEXT: v_mov_b32_e32 v15, s7
; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 s5, s7
; GPRIDX-NEXT: v_mov_b32_e32 v14, s6
; GPRIDX-NEXT: v_mov_b32_e32 v8, s0
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
@@ -3600,7 +3598,6 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_v(<7 x float> inreg %v
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v1
; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v1
; GPRIDX-NEXT: v_mov_b32_e32 v14, s6
; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc
; GPRIDX-NEXT: v_mov_b32_e32 v0, v8
; GPRIDX-NEXT: v_mov_b32_e32 v1, v7
@@ -3609,13 +3606,13 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_v(<7 x float> inreg %v
; MOVREL-LABEL: dyn_insertelement_v7f32_s_v_v:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: s_mov_b32 s0, s2
; MOVREL-NEXT: s_mov_b32 s1, s3
; MOVREL-NEXT: s_mov_b32 s2, s4
; MOVREL-NEXT: s_mov_b32 s3, s5
; MOVREL-NEXT: s_mov_b32 s4, s6
; MOVREL-NEXT: s_mov_b32 s5, s7
; MOVREL-NEXT: s_mov_b32 s6, s8
; MOVREL-NEXT: v_mov_b32_e32 v15, s7
; MOVREL-NEXT: s_mov_b32 s1, s3
; MOVREL-NEXT: s_mov_b32 s3, s5
; MOVREL-NEXT: s_mov_b32 s5, s7
; MOVREL-NEXT: v_mov_b32_e32 v14, s6
; MOVREL-NEXT: v_mov_b32_e32 v8, s0
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; MOVREL-NEXT: v_mov_b32_e32 v9, s1
@@ -3625,7 +3622,6 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_v(<7 x float> inreg %v
; MOVREL-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; MOVREL-NEXT: v_mov_b32_e32 v13, s5
; MOVREL-NEXT: v_mov_b32_e32 v14, s6
; MOVREL-NEXT: v_cndmask_b32_e32 v7, v9, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
; MOVREL-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc_lo

@@ -15,7 +15,7 @@
 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 6
 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
@@ -33,7 +33,7 @@
 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 6
 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel

@@ -1081,32 +1081,31 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v12, v[4:5]
; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v8, vcc, 5, v0
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v10, vcc, 1, v0
; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v10, v[4:5]
; VI-NEXT: flat_load_ubyte v11, v[6:7]
; VI-NEXT: flat_load_ubyte v8, v[8:9]
; VI-NEXT: flat_load_ubyte v9, v[10:11]
; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v6, v[6:7]
; VI-NEXT: flat_load_ubyte v7, v[4:5]
; VI-NEXT: flat_load_ubyte v4, v[4:5]
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v8
; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v9
; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v8
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v6
; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v6
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v7
; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v4
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v11
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
@@ -1127,25 +1126,23 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:2
; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
; GFX10-NEXT: global_load_short_d16 v2, v0, s[2:3] offset:4
; GFX10-NEXT: global_load_ubyte v6, v0, s[2:3] offset:6
; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1
; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6
; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1
; GFX10-NEXT: global_load_ubyte v7, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(4)
; GFX10-NEXT: v_lshl_or_b32 v0, v3, 8, v1
; GFX10-NEXT: s_waitcnt vmcnt(3)
; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v2
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v6
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v4
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v2
; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v5
; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v2
; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
; GFX10-NEXT: global_store_dword v8, v6, s[0:1] offset:24
; GFX10-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16
; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16
; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()

@@ -287,7 +287,7 @@ define <2 x i64> @v2i64_func_void() #0 {
 
 ; GCN-LABEL: {{^}}v3i64_func_void:
 ; GCN-DAG: buffer_load_dwordx4 v[0:3], off
-; GCN-DAG: buffer_load_dwordx4 v[4:7], off
+; GCN-DAG: buffer_load_dwordx2 v[4:5], off
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <3 x i64> @v3i64_func_void() #0 {

@@ -15,7 +15,7 @@
 ; CHECK: .max_flat_workgroup_size: 1024
 ; CHECK: .name: test
 ; CHECK: .private_segment_fixed_size: 0
-; CHECK: .sgpr_count: 8
+; CHECK: .sgpr_count: 6
 ; CHECK: .symbol: test.kd
 ; CHECK: .vgpr_count: {{3|6}}
 ; WAVE64: .wavefront_size: 64

@@ -16,7 +16,7 @@
 ; CHECK: PrivateSegmentFixedSize: 0
 ; CHECK: KernargSegmentAlign: 8
 ; CHECK: WavefrontSize: 64
-; CHECK: NumSGPRs: 8
+; CHECK: NumSGPRs: 6
 ; CHECK: NumVGPRs: {{3|6}}
 ; CHECK: MaxFlatWorkGroupSize: 1024
 define amdgpu_kernel void @test(
@@ -39,7 +39,7 @@ entry:
 ; CHECK: PrivateSegmentFixedSize: 0
 ; CHECK: KernargSegmentAlign: 8
 ; CHECK: WavefrontSize: 64
-; CHECK: NumSGPRs: 8
+; CHECK: NumSGPRs: 6
 ; CHECK: NumVGPRs: {{3|6}}
 ; CHECK: MaxFlatWorkGroupSize: 256
 define amdgpu_kernel void @test_max_flat_workgroup_size(

@@ -1506,26 +1506,27 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %
 ; SI-LABEL: dynamic_insertelement_v3i64:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
-; SI-NEXT: s_load_dword s6, s[4:5], 0x10
+; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xc
+; SI-NEXT: s_load_dword s12, s[4:5], 0x10
 ; SI-NEXT: s_mov_b32 s3, 0x100f000
 ; SI-NEXT: s_mov_b32 s2, -1
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s13
-; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 2
-; SI-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5]
-; SI-NEXT: v_mov_b32_e32 v0, s12
-; SI-NEXT: v_cndmask_b32_e64 v4, v0, 5, s[4:5]
 ; SI-NEXT: v_mov_b32_e32 v0, s11
-; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1
+; SI-NEXT: v_mov_b32_e32 v4, s7
+; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 1
 ; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
 ; SI-NEXT: v_mov_b32_e32 v0, s10
 ; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
 ; SI-NEXT: v_mov_b32_e32 v0, s9
-; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0
+; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 0
 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
 ; SI-NEXT: v_mov_b32_e32 v0, s8
 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
+; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 2
+; SI-NEXT: v_cndmask_b32_e64 v5, v4, 0, s[4:5]
+; SI-NEXT: v_mov_b32_e32 v4, s6
+; SI-NEXT: v_cndmask_b32_e64 v4, v4, 5, s[4:5]
 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT: s_endpgm
@@ -1533,26 +1534,27 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %
 ; VI-LABEL: dynamic_insertelement_v3i64:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
-; VI-NEXT: s_load_dword s6, s[4:5], 0x40
+; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20
+; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x30
+; VI-NEXT: s_load_dword s12, s[4:5], 0x40
 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
 ; VI-NEXT: s_mov_b32 s2, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s13
-; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 2
-; VI-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v0, s12
-; VI-NEXT: v_cndmask_b32_e64 v4, v0, 5, s[4:5]
 ; VI-NEXT: v_mov_b32_e32 v0, s11
-; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1
+; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 1
 ; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
 ; VI-NEXT: v_mov_b32_e32 v0, s10
 ; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
 ; VI-NEXT: v_mov_b32_e32 v0, s9
-; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0
+; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 0
 ; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
 ; VI-NEXT: v_mov_b32_e32 v0, s8
 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 2
+; VI-NEXT: v_cndmask_b32_e64 v5, v4, 0, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: v_cndmask_b32_e64 v4, v4, 5, s[4:5]
 ; VI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT: s_endpgm

@@ -1039,10 +1039,10 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)*
 ; GFX9-LABEL: s_insertelement_v2i16_dynamic:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_lshl_b32 s2, s4, 4
@@ -1057,7 +1057,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)*
 ; VI-LABEL: s_insertelement_v2i16_dynamic:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v0, s0
 ; VI-NEXT: s_load_dword s0, s[4:5], 0x0
@@ -1076,7 +1076,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)*
 ; CI-LABEL: s_insertelement_v2i16_dynamic:
 ; CI: ; %bb.0:
 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
+; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
 ; CI-NEXT: v_mov_b32_e32 v0, s0
 ; CI-NEXT: s_load_dword s0, s[4:5], 0x0
@@ -1169,10 +1169,10 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa
 ; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
 ; GFX9-NEXT: s_waitcnt vmcnt(1)
@@ -1187,7 +1187,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa
 ; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
@@ -1214,7 +1214,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa
 ; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
 ; CI: ; %bb.0:
 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
+; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
 ; CI-NEXT: v_mov_b32_e32 v1, s3

@ -1,19 +1,19 @@
|
|||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -enable-ipra -print-regusage -o /dev/null 2>&1 < %s | FileCheck %s
|
||||
; Make sure the expected regmask is generated for sub/superregisters.
|
||||
|
||||
; CHECK-DAG: csr Clobbered Registers: $vgpr0 $vgpr0_hi16 $vgpr0_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1 $vgpr0_vgpr1_vgpr2 {{$}}
|
||||
; CHECK-DAG: csr Clobbered Registers: $vgpr0 $vgpr0_hi16 $vgpr0_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1 $vgpr0_vgpr1_vgpr2 {{$}}
|
||||
define void @csr() #0 {
|
||||
call void asm sideeffect "", "~{v0},~{v44},~{v45}"() #0
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-DAG: subregs_for_super Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
|
||||
; CHECK-DAG: subregs_for_super Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
|
||||
define void @subregs_for_super() #0 {
|
||||
call void asm sideeffect "", "~{v0},~{v1}"() #0
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-DAG: clobbered_reg_with_sub Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
|
||||
; CHECK-DAG: clobbered_reg_with_sub Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
define void @clobbered_reg_with_sub() #0 {
call void asm sideeffect "", "~{v[0:1]}"() #0
ret void
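
; Illustrative sketch, not part of the original test: with the new 224-bit
; register classes, a natural follow-on case is clobbering a whole
; seven-register tuple directly. The "~{v[0:6]}" constraint below is an
; assumed extension of the "~{v[0:1]}" syntax used above, and the function
; name is hypothetical.
define void @clobbered_vreg_224_sketch() #0 {
call void asm sideeffect "", "~{v[0:6]}"() #0
ret void
}
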
@@ -25,7 +25,8 @@ entry:
}
; FUNC-LABEL: {{^}}constant_load_v3i64:
; GCN: s_load_dwordx8 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN-DAG: s_load_dwordx4 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN-DAG: s_load_dwordx2 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x{{[0-9]+}}{{$}}

; EG-DAG: VTX_READ_128
; EG-DAG: VTX_READ_128
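
; Hypothetical sketch of the kernel shape these constant_load_v3i64 checks
; exercise (the defining IR lies outside this hunk's context): a plain
; <3 x i64> load from constant memory, stored back out. The _sketch name and
; the addrspace(4) source pointer are assumptions, not lines from the test.
define amdgpu_kernel void @constant_load_v3i64_sketch(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(4)* %in) #0 {
entry:
%ld = load <3 x i64>, <3 x i64> addrspace(4)* %in
store <3 x i64> %ld, <3 x i64> addrspace(1)* %out
ret void
}
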
@@ -25,10 +25,10 @@ entry:
}
; FUNC-LABEL: {{^}}global_load_v3f64:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-NOHSA-DAG: buffer_load_dwordx4
; GCN-NOHSA-DAG: buffer_load_dwordx2
; GCN-HSA-DAG: flat_load_dwordx4
; GCN-HSA-DAG: flat_load_dwordx2
define amdgpu_kernel void @global_load_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
entry:
%ld = load <3 x double>, <3 x double> addrspace(1)* %in
@@ -32,11 +32,11 @@ entry:
}
; FUNC-LABEL: {{^}}global_load_v3i64:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA-DAG: buffer_load_dwordx4
; GCN-NOHSA-DAG: buffer_load_dwordx2

; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA-DAG: flat_load_dwordx4
; GCN-HSA-DAG: flat_load_dwordx2

; EG: VTX_READ_128
; EG: VTX_READ_128
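
; Sizing note (added commentary, not a FileCheck line): a <3 x i64> or
; <3 x double> value is 192 bits = 6 dwords, so the dwordx4/dwordx2 pairs
; above cover it exactly as one 4-dword access plus one 2-dword access.
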
@@ -66,38 +66,38 @@ define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, <4 x i32> inreg
; CHECK-NEXT: s_mov_b32 s6, s4
; CHECK-NEXT: s_mov_b32 s5, s3
; CHECK-NEXT: s_mov_b32 s4, s2
; CHECK-NEXT: v_add_i32_e32 v0, vcc, 16, v1
; CHECK-NEXT: v_add_i32_e32 v0, vcc, 4, v1
; CHECK-NEXT: v_add_i32_e32 v5, vcc, 8, v1
; CHECK-NEXT: v_add_i32_e32 v6, vcc, 12, v1
; CHECK-NEXT: v_add_i32_e32 v4, vcc, 8, v1
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 4, v1
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 16, v1
; CHECK-NEXT: v_add_i32_e32 v8, vcc, 20, v1
; CHECK-NEXT: v_mov_b32_e32 v9, s0
; CHECK-NEXT: v_add_i32_e32 v10, vcc, 16, v2
; CHECK-NEXT: v_add_i32_e32 v11, vcc, 12, v2
; CHECK-NEXT: v_add_i32_e32 v12, vcc, 8, v2
; CHECK-NEXT: v_add_i32_e32 v10, vcc, 4, v2
; CHECK-NEXT: v_add_i32_e32 v11, vcc, 8, v2
; CHECK-NEXT: v_add_i32_e32 v12, vcc, 12, v2
; CHECK-NEXT: s_mov_b32 m0, -1
; CHECK-NEXT: ds_read_b32 v3, v1
; CHECK-NEXT: ds_read_b32 v5, v4
; CHECK-NEXT: ds_read_b32 v4, v7
; CHECK-NEXT: ds_read_b32 v1, v8
; CHECK-NEXT: ds_read_b32 v4, v0
; CHECK-NEXT: ds_read_b32 v5, v5
; CHECK-NEXT: ds_read_b32 v6, v6
; CHECK-NEXT: ds_read_b32 v0, v0
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 4, v2
; CHECK-NEXT: ds_read_b32 v0, v7
; CHECK-NEXT: ds_read_b32 v1, v8
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 16, v2
; CHECK-NEXT: v_add_i32_e32 v8, vcc, 20, v2
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc
; CHECK-NEXT: s_waitcnt expcnt(0)
; CHECK-NEXT: ds_read_b32 v0, v2
; CHECK-NEXT: ds_read_b32 v2, v12
; CHECK-NEXT: ds_read_b32 v1, v7
; CHECK-NEXT: ds_read_b32 v1, v10
; CHECK-NEXT: ds_read_b32 v2, v11
; CHECK-NEXT: ds_read_b32 v3, v12
; CHECK-NEXT: ds_read_b32 v4, v7
; CHECK-NEXT: ds_read_b32 v5, v8
; CHECK-NEXT: ds_read_b32 v3, v11
; CHECK-NEXT: ds_read_b32 v4, v10
; CHECK-NEXT: s_waitcnt lgkmcnt(5)
; CHECK-NEXT: exp mrt0 off, off, off, off
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: tbuffer_store_format_xyzw v[0:3], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: tbuffer_store_format_xy v[4:5], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc

@@ -499,7 +499,7 @@ define amdgpu_kernel void @s_test_sdiv24_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_sdiv24_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)

@@ -529,7 +529,7 @@ define amdgpu_kernel void @s_test_sdiv24_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_sdiv24_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)

@@ -671,7 +671,7 @@ define amdgpu_kernel void @s_test_sdiv31_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_sdiv31_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)

@@ -701,7 +701,7 @@ define amdgpu_kernel void @s_test_sdiv31_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_sdiv31_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)

@@ -738,7 +738,7 @@ define amdgpu_kernel void @s_test_sdiv23_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_sdiv23_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)

@@ -768,7 +768,7 @@ define amdgpu_kernel void @s_test_sdiv23_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_sdiv23_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)

@@ -805,7 +805,7 @@ define amdgpu_kernel void @s_test_sdiv25_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_sdiv25_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)

@@ -835,7 +835,7 @@ define amdgpu_kernel void @s_test_sdiv25_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_sdiv25_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)

@@ -480,7 +480,7 @@ define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_srem23_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)

@@ -512,7 +512,7 @@ define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_srem23_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)

@@ -551,7 +551,7 @@ define amdgpu_kernel void @s_test_srem24_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_srem24_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)

@@ -583,7 +583,7 @@ define amdgpu_kernel void @s_test_srem24_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_srem24_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)

@@ -676,7 +676,7 @@ define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_srem25_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)

@@ -708,7 +708,7 @@ define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_srem25_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)

@@ -747,7 +747,7 @@ define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_srem31_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)

@@ -779,7 +779,7 @@ define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_srem31_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)

@@ -1297,37 +1297,37 @@ define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readon
; GFX9-LABEL: fma_shuffle:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3]
; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[8:9]
; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
; GFX9-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[8:9]
; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: fma_shuffle:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3]
; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[8:9]
; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
; GFX10-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
; GFX10-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[8:9]
; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
entry:
%tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()