[AMDGPU] Add 224-bit vector types and link 192-bit types to MVTs

Add SReg_224, VReg_224, AReg_224, etc.
Link 224-bit types with v7i32/v7f32.
Link existing 192-bit types to newly added v3i64/v3f64/v6i32/v6f32.
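
As an illustration (a hypothetical example, not taken from this patch's tests), a <7 x float> value can now live in a single 224-bit register tuple instead of being split or widened:

define void @copy_v7f32(<7 x float> addrspace(1)* %src,
                        <7 x float> addrspace(1)* %dst) {
  ; v7f32 is now a legal type backed by VReg_224 (e.g. v[0:6])
  %v = load <7 x float>, <7 x float> addrspace(1)* %src
  store <7 x float> %v, <7 x float> addrspace(1)* %dst
  ret void
}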

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D104622
Commit: 98f48723f2 (parent 6b0f98d442)
Author: Carl Ritson
Date:   2021-06-24 09:59:55 +09:00

27 changed files with 364 additions and 148 deletions


@ -965,6 +965,16 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
IsSGPR = false;
IsAGPR = true;
Width = 6;
} else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
IsSGPR = false;
Width = 7;
} else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
IsSGPR = true;
Width = 7;
} else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 7;
} else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
"trap handler registers should not be used");


@ -78,6 +78,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
@ -99,9 +105,15 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
@ -173,12 +185,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
@ -198,6 +212,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v5f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
setOperationAction(ISD::STORE, MVT::v6f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
setOperationAction(ISD::STORE, MVT::v7f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
setOperationAction(ISD::STORE, MVT::v8f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
@ -219,6 +239,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v2f64, Promote);
AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
setOperationAction(ISD::STORE, MVT::v3i64, Promote);
AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
setOperationAction(ISD::STORE, MVT::v3f64, Promote);
AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
setOperationAction(ISD::STORE, MVT::v4i64, Promote);
AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
@ -261,6 +287,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
@ -325,6 +356,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v6i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v6f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v7i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v7f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom);
@ -337,6 +372,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
@ -345,6 +384,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f64, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i64, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i64, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f64, Custom);
@ -414,8 +455,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
static const MVT::SimpleValueType VectorIntTypes[] = {
MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32
};
MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32};
for (MVT VT : VectorIntTypes) {
// Expand the following operations for the current type by default.
@ -456,8 +496,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
}
static const MVT::SimpleValueType FloatVectorTypes[] = {
MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32
};
MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32};
for (MVT VT : FloatVectorTypes) {
setOperationAction(ISD::FABS, VT, Expand);
@ -507,6 +546,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
// There are no libcalls of any kind.
for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
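
The SELECT entries above follow the existing pattern: float vector operations on the new types are promoted to the same-width integer type, so only the integer form needs selection support. A minimal IR sketch (hypothetical function name) that exercises this:

; select on v7f32 is promoted to the v7i32 entry added above
define <7 x float> @sel_v7f32(i1 %c, <7 x float> %a, <7 x float> %b) {
  %r = select i1 %c, <7 x float> %a, <7 x float> %b
  ret <7 x float> %r
}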


@ -7,16 +7,16 @@
//===----------------------------------------------------------------------===//
def SGPRRegBank : RegisterBank<"SGPR",
[SReg_LO16, SReg_32, SReg_64, SReg_96, SReg_128, SReg_160, SReg_192, SReg_256, SReg_512, SReg_1024]
[SReg_LO16, SReg_32, SReg_64, SReg_96, SReg_128, SReg_160, SReg_192, SReg_224, SReg_256, SReg_512, SReg_1024]
>;
def VGPRRegBank : RegisterBank<"VGPR",
[VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_256, VReg_512, VReg_1024]
[VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_512, VReg_1024]
>;
// It is helpful to distinguish conditions from ordinary SGPRs.
def VCCRegBank : RegisterBank <"VCC", [SReg_1]>;
def AGPRRegBank : RegisterBank <"AGPR",
[AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_256, AReg_512, AReg_1024]
[AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_224, AReg_256, AReg_512, AReg_1024]
>;


@ -2192,6 +2192,7 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
case 4: return AMDGPU::VReg_128RegClassID;
case 5: return AMDGPU::VReg_160RegClassID;
case 6: return AMDGPU::VReg_192RegClassID;
case 7: return AMDGPU::VReg_224RegClassID;
case 8: return AMDGPU::VReg_256RegClassID;
case 16: return AMDGPU::VReg_512RegClassID;
case 32: return AMDGPU::VReg_1024RegClassID;
@ -2214,6 +2215,7 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
case 4: return AMDGPU::SGPR_128RegClassID;
case 5: return AMDGPU::SGPR_160RegClassID;
case 6: return AMDGPU::SGPR_192RegClassID;
case 7: return AMDGPU::SGPR_224RegClassID;
case 8: return AMDGPU::SGPR_256RegClassID;
case 16: return AMDGPU::SGPR_512RegClassID;
}
@ -2226,6 +2228,7 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
case 4: return AMDGPU::AReg_128RegClassID;
case 5: return AMDGPU::AReg_160RegClassID;
case 6: return AMDGPU::AReg_192RegClassID;
case 7: return AMDGPU::AReg_224RegClassID;
case 8: return AMDGPU::AReg_256RegClassID;
case 16: return AMDGPU::AReg_512RegClassID;
case 32: return AMDGPU::AReg_1024RegClassID;
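
getRegClass takes the register width in 32-bit units, so the new case-7 entries let the assembler accept 7-dword tuples such as v[0:6], s[4:10] and a[0:6]. The same tuples should also become usable in IR-level inline-asm clobber lists, since a matching register class now exists (hedged sketch, hypothetical function name):

define void @clobber_v224_tuple() {
  ; a 7-register VGPR tuple now has a matching register class
  call void asm sideeffect "", "~{v[0:6]}"()
  ret void
}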


@ -463,6 +463,7 @@ SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo,
MRI.getRegClass(AMDGPU::AReg_128RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AReg_160RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AReg_192RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AReg_224RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AReg_256RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AGPR_LO16RegClassID).contains(Reg))
Enc |= 512;


@ -102,6 +102,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
@ -145,6 +154,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
setOperationAction(ISD::LOAD, MVT::v6i32, Custom);
setOperationAction(ISD::LOAD, MVT::v7i32, Custom);
setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
setOperationAction(ISD::LOAD, MVT::i1, Custom);
@ -154,6 +165,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v3i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
setOperationAction(ISD::STORE, MVT::v5i32, Custom);
setOperationAction(ISD::STORE, MVT::v6i32, Custom);
setOperationAction(ISD::STORE, MVT::v7i32, Custom);
setOperationAction(ISD::STORE, MVT::v8i32, Custom);
setOperationAction(ISD::STORE, MVT::v16i32, Custom);
setOperationAction(ISD::STORE, MVT::i1, Custom);
@ -176,6 +189,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
@ -203,8 +218,16 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
setOperationAction(ISD::TRUNCATE, MVT::v3i32, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v3f32, Expand);
setOperationAction(ISD::TRUNCATE, MVT::v4i32, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v4f32, Expand);
setOperationAction(ISD::TRUNCATE, MVT::v5i32, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v5f32, Expand);
setOperationAction(ISD::TRUNCATE, MVT::v6i32, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v6f32, Expand);
setOperationAction(ISD::TRUNCATE, MVT::v7i32, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v7f32, Expand);
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v8f32, Expand);
setOperationAction(ISD::TRUNCATE, MVT::v16i32, Expand);
@ -245,6 +268,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// with > 4 elements.
for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32 }) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
@ -290,6 +314,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
}
for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
}
for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
@ -365,9 +403,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom);
// Deal with vec5 vector operations when widened to vec8.
// Deal with vec5/6/7 vector operations when widened to vec8.
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6i32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6f32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7i32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7f32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom);
@ -11697,6 +11739,8 @@ static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
return AMDGPU::VReg_160_Align2RegClassID;
case AMDGPU::VReg_192RegClassID:
return AMDGPU::VReg_192_Align2RegClassID;
case AMDGPU::VReg_224RegClassID:
return AMDGPU::VReg_224_Align2RegClassID;
case AMDGPU::VReg_256RegClassID:
return AMDGPU::VReg_256_Align2RegClassID;
case AMDGPU::VReg_512RegClassID:


@ -1340,6 +1340,8 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_S160_SAVE;
case 24:
return AMDGPU::SI_SPILL_S192_SAVE;
case 28:
return AMDGPU::SI_SPILL_S224_SAVE;
case 32:
return AMDGPU::SI_SPILL_S256_SAVE;
case 64:
@ -1365,6 +1367,8 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V160_SAVE;
case 24:
return AMDGPU::SI_SPILL_V192_SAVE;
case 28:
return AMDGPU::SI_SPILL_V224_SAVE;
case 32:
return AMDGPU::SI_SPILL_V256_SAVE;
case 64:
@ -1473,6 +1477,8 @@ static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_S160_RESTORE;
case 24:
return AMDGPU::SI_SPILL_S192_RESTORE;
case 28:
return AMDGPU::SI_SPILL_S224_RESTORE;
case 32:
return AMDGPU::SI_SPILL_S256_RESTORE;
case 64:
@ -1498,6 +1504,8 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V160_RESTORE;
case 24:
return AMDGPU::SI_SPILL_V192_RESTORE;
case 28:
return AMDGPU::SI_SPILL_V224_RESTORE;
case 32:
return AMDGPU::SI_SPILL_V256_RESTORE;
case 64:
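
These spill helpers switch on the spill size in bytes, so the 224-bit classes take the case-28 path (seven 32-bit registers). A hedged IR sketch of code that can force such a spill (hypothetical function; whether a spill is actually emitted depends on register pressure):

define void @force_v224_spill(<7 x float> addrspace(1)* %p) {
  %v = load <7 x float>, <7 x float> addrspace(1)* %p
  ; clobbering a wide VGPR range may push %v to a spill slot via
  ; SI_SPILL_V224_SAVE / SI_SPILL_V224_RESTORE
  call void asm sideeffect "", "~{v[0:31]}"()
  store <7 x float> %v, <7 x float> addrspace(1)* %p
  ret void
}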


@ -675,6 +675,7 @@ defm SI_SPILL_S96 : SI_SPILL_SGPR <SReg_96>;
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>;
defm SI_SPILL_S192 : SI_SPILL_SGPR <SReg_192>;
defm SI_SPILL_S224 : SI_SPILL_SGPR <SReg_224>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;
@ -718,6 +719,7 @@ defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>;
defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224>;
defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;
@ -728,6 +730,7 @@ defm SI_SPILL_A96 : SI_SPILL_VGPR <AReg_96, 1>;
defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128, 1>;
defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160, 1>;
defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192, 1>;
defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224, 1>;
defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>;
defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>;
defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>;
@ -1052,6 +1055,38 @@ foreach Index = 0-4 in {
>;
}
foreach Index = 0-5 in {
def Extract_Element_v6i32_#Index : Extract_Element <
i32, v6i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v6i32_#Index : Insert_Element <
i32, v6i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Extract_Element_v6f32_#Index : Extract_Element <
f32, v6f32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v6f32_#Index : Insert_Element <
f32, v6f32, Index, !cast<SubRegIndex>(sub#Index)
>;
}
foreach Index = 0-6 in {
def Extract_Element_v7i32_#Index : Extract_Element <
i32, v7i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v7i32_#Index : Insert_Element <
i32, v7i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Extract_Element_v7f32_#Index : Extract_Element <
f32, v7f32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v7f32_#Index : Insert_Element <
f32, v7f32, Index, !cast<SubRegIndex>(sub#Index)
>;
}
foreach Index = 0-7 in {
def Extract_Element_v8i32_#Index : Extract_Element <
i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
@ -1202,8 +1237,32 @@ def : BitConvert <v4f32, v2i64, VReg_128>;
def : BitConvert <v2i64, v4f32, VReg_128>;
// 160-bit bitcast
def : BitConvert <v5i32, v5f32, SGPR_160>;
def : BitConvert <v5f32, v5i32, SGPR_160>;
def : BitConvert <v5i32, v5f32, SReg_160>;
def : BitConvert <v5f32, v5i32, SReg_160>;
def : BitConvert <v5i32, v5f32, VReg_160>;
def : BitConvert <v5f32, v5i32, VReg_160>;
// 192-bit bitcast
def : BitConvert <v6i32, v6f32, SReg_192>;
def : BitConvert <v6f32, v6i32, SReg_192>;
def : BitConvert <v6i32, v6f32, VReg_192>;
def : BitConvert <v6f32, v6i32, VReg_192>;
def : BitConvert <v3i64, v3f64, VReg_192>;
def : BitConvert <v3f64, v3i64, VReg_192>;
def : BitConvert <v3i64, v6i32, VReg_192>;
def : BitConvert <v3i64, v6f32, VReg_192>;
def : BitConvert <v3f64, v6i32, VReg_192>;
def : BitConvert <v3f64, v6f32, VReg_192>;
def : BitConvert <v6i32, v3i64, VReg_192>;
def : BitConvert <v6f32, v3i64, VReg_192>;
def : BitConvert <v6i32, v3f64, VReg_192>;
def : BitConvert <v6f32, v3f64, VReg_192>;
// 224-bit bitcast
def : BitConvert <v7i32, v7f32, SReg_224>;
def : BitConvert <v7f32, v7i32, SReg_224>;
def : BitConvert <v7i32, v7f32, VReg_224>;
def : BitConvert <v7f32, v7i32, VReg_224>;
// 256-bit bitcast
def : BitConvert <v8i32, v8f32, SReg_256>;
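
With the 192-bit and 224-bit classes now typed, same-width reinterpretations are covered by the BitConvert patterns above; such a bitcast should lower to no instructions beyond a possible copy. A minimal sketch (hypothetical function name):

define <6 x i32> @cast_v3i64_to_v6i32(<3 x i64> %v) {
  %r = bitcast <3 x i64> %v to <6 x i32>
  ret <6 x i32> %r
}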


@ -1874,6 +1874,8 @@ getAnyVGPRClassForBitWidth(unsigned BitWidth) {
return &AMDGPU::VReg_160RegClass;
if (BitWidth <= 192)
return &AMDGPU::VReg_192RegClass;
if (BitWidth <= 224)
return &AMDGPU::VReg_224RegClass;
if (BitWidth <= 256)
return &AMDGPU::VReg_256RegClass;
if (BitWidth <= 512)
@ -1896,6 +1898,8 @@ getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
return &AMDGPU::VReg_160_Align2RegClass;
if (BitWidth <= 192)
return &AMDGPU::VReg_192_Align2RegClass;
if (BitWidth <= 224)
return &AMDGPU::VReg_224_Align2RegClass;
if (BitWidth <= 256)
return &AMDGPU::VReg_256_Align2RegClass;
if (BitWidth <= 512)
@ -2036,6 +2040,11 @@ SIRegisterInfo::getPhysRegClass(MCRegister Reg) const {
&AMDGPU::SReg_192RegClass,
&AMDGPU::AReg_192_Align2RegClass,
&AMDGPU::AReg_192RegClass,
&AMDGPU::VReg_224_Align2RegClass,
&AMDGPU::VReg_224RegClass,
&AMDGPU::SReg_224RegClass,
&AMDGPU::AReg_224_Align2RegClass,
&AMDGPU::AReg_224RegClass,
&AMDGPU::VReg_256_Align2RegClass,
&AMDGPU::VReg_256RegClass,
&AMDGPU::SReg_256RegClass,


@ -58,6 +58,7 @@ class getSubRegs<int size> {
list<SubRegIndex> ret4 = [sub0, sub1, sub2, sub3];
list<SubRegIndex> ret5 = [sub0, sub1, sub2, sub3, sub4];
list<SubRegIndex> ret6 = [sub0, sub1, sub2, sub3, sub4, sub5];
list<SubRegIndex> ret7 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6];
list<SubRegIndex> ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7];
list<SubRegIndex> ret16 = [sub0, sub1, sub2, sub3,
sub4, sub5, sub6, sub7,
@ -77,9 +78,10 @@ class getSubRegs<int size> {
!if(!eq(size, 4), ret4,
!if(!eq(size, 5), ret5,
!if(!eq(size, 6), ret6,
!if(!eq(size, 8), ret8,
!if(!eq(size, 16), ret16,
ret32)))))));
!if(!eq(size, 7), ret7,
!if(!eq(size, 8), ret8,
!if(!eq(size, 16), ret16,
ret32))))))));
}
// Generates list of sequential register tuple names.
@ -350,9 +352,12 @@ def SGPR_128Regs : SIRegisterTuples<getSubRegs<4>.ret, SGPR_32, 105, 4, 4, "s">;
// SGPR 160-bit registers. No operations use these, but for symmetry with 160-bit VGPRs.
def SGPR_160Regs : SIRegisterTuples<getSubRegs<5>.ret, SGPR_32, 105, 4, 5, "s">;
// SGPR 192-bit registers
// SGPR 192-bit registers. No operations use these, but for symmetry with 192-bit VGPRs.
def SGPR_192Regs : SIRegisterTuples<getSubRegs<6>.ret, SGPR_32, 105, 4, 6, "s">;
// SGPR 224-bit registers. No operations use these, but for symmetry with 224-bit VGPRs.
def SGPR_224Regs : SIRegisterTuples<getSubRegs<7>.ret, SGPR_32, 105, 4, 7, "s">;
// SGPR 256-bit registers
def SGPR_256Regs : SIRegisterTuples<getSubRegs<8>.ret, SGPR_32, 105, 4, 8, "s">;
@ -508,6 +513,9 @@ def VGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, VGPR_32, 255, 1, 5, "v">;
// VGPR 192-bit registers
def VGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, VGPR_32, 255, 1, 6, "v">;
// VGPR 224-bit registers
def VGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, VGPR_32, 255, 1, 7, "v">;
// VGPR 256-bit registers
def VGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, VGPR_32, 255, 1, 8, "v">;
@ -547,6 +555,9 @@ def AGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, AGPR_32, 255, 1, 5, "a">;
// AGPR 192-bit registers
def AGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, AGPR_32, 255, 1, 6, "a">;
// AGPR 224-bit registers
def AGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, AGPR_32, 255, 1, 7, "a">;
// AGPR 256-bit registers
def AGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, AGPR_32, 255, 1, 8, "a">;
@ -725,20 +736,41 @@ def SReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
(add SGPR_160)> {
// FIXME: Should be isAllocatable = 0, but that causes all TableGen-generated
// subclasses of SGPR_160 to be marked unallocatable too.
// This occurs because SGPR_160 and SReg_160 classes are equivalent in size
// meaning their enumeration order is dependent on alphanumeric ordering of
// their names. The superclass for inheritance is the last one in topological
// order (i.e. enumeration order), hence SReg_160 is selected.
// Potential workarounds involve renaming SGPR_160, adding another class
// which is ordered last and hence used for inheritance, or adding more
// registers to SReg_160 to cause it to be moved earlier in the superclass
// list.
let CopyCost = 3;
}
def SGPR_192 : RegisterClass<"AMDGPU", [untyped], 32, (add SGPR_192Regs)> {
let Size = 192;
// There are no 6-component scalar instructions, but this is needed
// for symmetry with VGPRs.
def SGPR_192 : RegisterClass<"AMDGPU", [v6i32, v6f32, v3i64, v3f64], 32, (add SGPR_192Regs)> {
let AllocationPriority = 17;
}
def SReg_192 : RegisterClass<"AMDGPU", [untyped], 32, (add SGPR_192)> {
let Size = 192;
def SReg_192 : RegisterClass<"AMDGPU", [v6i32, v6f32, v3i64, v3f64], 32, (add SGPR_192)> {
let isAllocatable = 0;
let CopyCost = 3;
}
// There are no 7-component scalar instructions, but this is needed
// for symmetry with VGPRs.
def SGPR_224 : RegisterClass<"AMDGPU", [v7i32, v7f32], 32, (add SGPR_224Regs)> {
let AllocationPriority = 18;
}
def SReg_224 : RegisterClass<"AMDGPU", [v7i32, v7f32], 32, (add SGPR_224)> {
let isAllocatable = 0;
let CopyCost = 4;
}
def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32, (add SGPR_256Regs)> {
let AllocationPriority = 18;
let AllocationPriority = 19;
}
def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32, (add TTMP_256Regs)> {
@ -754,7 +786,7 @@ def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32,
def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32,
(add SGPR_512Regs)> {
let AllocationPriority = 19;
let AllocationPriority = 20;
}
def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32,
@ -776,7 +808,7 @@ def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 3
def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32, v16i64, v16f64], 32,
(add SGPR_1024Regs)> {
let AllocationPriority = 20;
let AllocationPriority = 21;
}
def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32, v16i64, v16f64], 32,
@ -812,7 +844,8 @@ defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64], (add VGPR_128)>;
defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
defm VReg_192 : VRegClass<6, [untyped], (add VGPR_192)>;
defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>;
defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>;
defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>;
defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>;
defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>;
@ -832,7 +865,8 @@ defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16],
defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>;
defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64], (add AGPR_128)>;
defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
defm AReg_192 : ARegClass<6, [untyped], (add AGPR_192)>;
defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>;
defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>;
defm AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>;
defm AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>;
defm AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)>;


@ -1650,6 +1650,13 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::VReg_192_Align2RegClassID:
case AMDGPU::AReg_192_Align2RegClassID:
return 192;
case AMDGPU::SGPR_224RegClassID:
case AMDGPU::SReg_224RegClassID:
case AMDGPU::VReg_224RegClassID:
case AMDGPU::AReg_224RegClassID:
case AMDGPU::VReg_224_Align2RegClassID:
case AMDGPU::AReg_224_Align2RegClassID:
return 224;
case AMDGPU::SGPR_256RegClassID:
case AMDGPU::SReg_256RegClassID:
case AMDGPU::VReg_256RegClassID:


@ -3501,13 +3501,13 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_s(<7 x float> inreg %v
; GPRIDX-LABEL: dyn_insertelement_v7f32_s_v_s:
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: s_mov_b32 s0, s2
; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_mov_b32 s2, s4
; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 s4, s6
; GPRIDX-NEXT: s_mov_b32 s5, s7
; GPRIDX-NEXT: s_mov_b32 s6, s8
; GPRIDX-NEXT: v_mov_b32_e32 v14, s7
; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 s5, s7
; GPRIDX-NEXT: v_mov_b32_e32 v13, s6
; GPRIDX-NEXT: v_mov_b32_e32 v7, s0
; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0
; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
@ -3526,7 +3526,6 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_s(<7 x float> inreg %v
; GPRIDX-NEXT: v_mov_b32_e32 v12, s5
; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 5
; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v12, v0, vcc
; GPRIDX-NEXT: v_mov_b32_e32 v13, s6
; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 6
; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v13, v0, vcc
; GPRIDX-NEXT: v_mov_b32_e32 v0, v7
@ -3535,13 +3534,13 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_s(<7 x float> inreg %v
; MOVREL-LABEL: dyn_insertelement_v7f32_s_v_s:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: s_mov_b32 s0, s2
; MOVREL-NEXT: s_mov_b32 s1, s3
; MOVREL-NEXT: s_mov_b32 s2, s4
; MOVREL-NEXT: s_mov_b32 s3, s5
; MOVREL-NEXT: s_mov_b32 s4, s6
; MOVREL-NEXT: s_mov_b32 s5, s7
; MOVREL-NEXT: s_mov_b32 s6, s8
; MOVREL-NEXT: v_mov_b32_e32 v14, s7
; MOVREL-NEXT: s_mov_b32 s1, s3
; MOVREL-NEXT: s_mov_b32 s3, s5
; MOVREL-NEXT: s_mov_b32 s5, s7
; MOVREL-NEXT: v_mov_b32_e32 v13, s6
; MOVREL-NEXT: v_mov_b32_e32 v7, s0
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 0
; MOVREL-NEXT: v_mov_b32_e32 v8, s1
@ -3551,7 +3550,6 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_s(<7 x float> inreg %v
; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 1
; MOVREL-NEXT: v_mov_b32_e32 v12, s5
; MOVREL-NEXT: v_mov_b32_e32 v13, s6
; MOVREL-NEXT: v_cndmask_b32_e32 v1, v8, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 2
; MOVREL-NEXT: v_cndmask_b32_e32 v2, v9, v0, vcc_lo
@ -3574,13 +3572,13 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_v(<7 x float> inreg %v
; GPRIDX-LABEL: dyn_insertelement_v7f32_s_v_v:
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: s_mov_b32 s0, s2
; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_mov_b32 s2, s4
; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 s4, s6
; GPRIDX-NEXT: s_mov_b32 s5, s7
; GPRIDX-NEXT: s_mov_b32 s6, s8
; GPRIDX-NEXT: v_mov_b32_e32 v15, s7
; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 s5, s7
; GPRIDX-NEXT: v_mov_b32_e32 v14, s6
; GPRIDX-NEXT: v_mov_b32_e32 v8, s0
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
@ -3600,7 +3598,6 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_v(<7 x float> inreg %v
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v1
; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v1
; GPRIDX-NEXT: v_mov_b32_e32 v14, s6
; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc
; GPRIDX-NEXT: v_mov_b32_e32 v0, v8
; GPRIDX-NEXT: v_mov_b32_e32 v1, v7
@ -3609,13 +3606,13 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_v(<7 x float> inreg %v
; MOVREL-LABEL: dyn_insertelement_v7f32_s_v_v:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: s_mov_b32 s0, s2
; MOVREL-NEXT: s_mov_b32 s1, s3
; MOVREL-NEXT: s_mov_b32 s2, s4
; MOVREL-NEXT: s_mov_b32 s3, s5
; MOVREL-NEXT: s_mov_b32 s4, s6
; MOVREL-NEXT: s_mov_b32 s5, s7
; MOVREL-NEXT: s_mov_b32 s6, s8
; MOVREL-NEXT: v_mov_b32_e32 v15, s7
; MOVREL-NEXT: s_mov_b32 s1, s3
; MOVREL-NEXT: s_mov_b32 s3, s5
; MOVREL-NEXT: s_mov_b32 s5, s7
; MOVREL-NEXT: v_mov_b32_e32 v14, s6
; MOVREL-NEXT: v_mov_b32_e32 v8, s0
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; MOVREL-NEXT: v_mov_b32_e32 v9, s1
@ -3625,7 +3622,6 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_v(<7 x float> inreg %v
; MOVREL-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; MOVREL-NEXT: v_mov_b32_e32 v13, s5
; MOVREL-NEXT: v_mov_b32_e32 v14, s6
; MOVREL-NEXT: v_cndmask_b32_e32 v7, v9, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
; MOVREL-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc_lo


@ -15,7 +15,7 @@
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8
; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 6
; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
@ -33,7 +33,7 @@
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8
; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 6
; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
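
The drop from 8 to 6 SGPRs appears to follow from v3i64/v3f64 becoming legal 192-bit types: the 192-bit load is no longer widened to an s_load_dwordx8 but split into dwordx4 + dwordx2, matching the constant-load test updates later in this commit. A hypothetical kernel of the shape being tested (the actual test body is not shown here):

define amdgpu_kernel void @test(<3 x i64> addrspace(1)* %out, <3 x i64> %in) {
  store <3 x i64> %in, <3 x i64> addrspace(1)* %out
  ret void
}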


@ -1081,32 +1081,31 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v12, v[4:5]
; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v8, vcc, 5, v0
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v10, vcc, 1, v0
; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v10, v[4:5]
; VI-NEXT: flat_load_ubyte v11, v[6:7]
; VI-NEXT: flat_load_ubyte v8, v[8:9]
; VI-NEXT: flat_load_ubyte v9, v[10:11]
; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v6, v[6:7]
; VI-NEXT: flat_load_ubyte v7, v[4:5]
; VI-NEXT: flat_load_ubyte v4, v[4:5]
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v8
; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v9
; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v8
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v6
; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v6
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v7
; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v4
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v11
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
@ -1127,25 +1126,23 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:2
; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
; GFX10-NEXT: global_load_short_d16 v2, v0, s[2:3] offset:4
; GFX10-NEXT: global_load_ubyte v6, v0, s[2:3] offset:6
; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1
; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6
; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1
; GFX10-NEXT: global_load_ubyte v7, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(4)
; GFX10-NEXT: v_lshl_or_b32 v0, v3, 8, v1
; GFX10-NEXT: s_waitcnt vmcnt(3)
; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v2
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v6
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v4
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v2
; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v5
; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v2
; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
; GFX10-NEXT: global_store_dword v8, v6, s[0:1] offset:24
; GFX10-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16
; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16
; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()


@ -287,7 +287,7 @@ define <2 x i64> @v2i64_func_void() #0 {
; GCN-LABEL: {{^}}v3i64_func_void:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN-DAG: buffer_load_dwordx2 v[4:5], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <3 x i64> @v3i64_func_void() #0 {


@ -15,7 +15,7 @@
; CHECK: .max_flat_workgroup_size: 1024
; CHECK: .name: test
; CHECK: .private_segment_fixed_size: 0
; CHECK: .sgpr_count: 8
; CHECK: .sgpr_count: 6
; CHECK: .symbol: test.kd
; CHECK: .vgpr_count: {{3|6}}
; WAVE64: .wavefront_size: 64


@ -16,7 +16,7 @@
; CHECK: PrivateSegmentFixedSize: 0
; CHECK: KernargSegmentAlign: 8
; CHECK: WavefrontSize: 64
; CHECK: NumSGPRs: 8
; CHECK: NumSGPRs: 6
; CHECK: NumVGPRs: {{3|6}}
; CHECK: MaxFlatWorkGroupSize: 1024
define amdgpu_kernel void @test(
@ -39,7 +39,7 @@ entry:
; CHECK: PrivateSegmentFixedSize: 0
; CHECK: KernargSegmentAlign: 8
; CHECK: WavefrontSize: 64
; CHECK: NumSGPRs: 8
; CHECK: NumSGPRs: 6
; CHECK: NumVGPRs: {{3|6}}
; CHECK: MaxFlatWorkGroupSize: 256
define amdgpu_kernel void @test_max_flat_workgroup_size(


@ -1506,26 +1506,27 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %
; SI-LABEL: dynamic_insertelement_v3i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
; SI-NEXT: s_load_dword s6, s[4:5], 0x10
; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xc
; SI-NEXT: s_load_dword s12, s[4:5], 0x10
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s13
; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 2
; SI-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_cndmask_b32_e64 v4, v0, 5, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s11
; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1
; SI-NEXT: v_mov_b32_e32 v4, s7
; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 1
; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s10
; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0
; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 0
; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 2
; SI-NEXT: v_cndmask_b32_e64 v5, v4, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v4, s6
; SI-NEXT: v_cndmask_b32_e64 v4, v4, 5, s[4:5]
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
@ -1533,26 +1534,27 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %
; VI-LABEL: dynamic_insertelement_v3i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
; VI-NEXT: s_load_dword s6, s[4:5], 0x40
; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x30
; VI-NEXT: s_load_dword s12, s[4:5], 0x40
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s13
; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 2
; VI-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_cndmask_b32_e64 v4, v0, 5, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1
; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 1
; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0
; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 0
; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; VI-NEXT: v_mov_b32_e32 v4, s7
; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 2
; VI-NEXT: v_cndmask_b32_e64 v5, v4, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_cndmask_b32_e64 v4, v4, 5, s[4:5]
; VI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm


@ -1039,10 +1039,10 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)*
; GFX9-LABEL: s_insertelement_v2i16_dynamic:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s2, s4, 4
@ -1057,7 +1057,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)*
; VI-LABEL: s_insertelement_v2i16_dynamic:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_load_dword s0, s[4:5], 0x0
@ -1076,7 +1076,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)*
; CI-LABEL: s_insertelement_v2i16_dynamic:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_load_dword s0, s[4:5], 0x0
@ -1169,10 +1169,10 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa
; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)
@ -1187,7 +1187,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa
; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
@ -1214,7 +1214,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa
; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3


@ -1,19 +1,19 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -enable-ipra -print-regusage -o /dev/null 2>&1 < %s | FileCheck %s
; Make sure the expected regmask is generated for sub/superregisters.
; CHECK-DAG: csr Clobbered Registers: $vgpr0 $vgpr0_hi16 $vgpr0_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1 $vgpr0_vgpr1_vgpr2 {{$}}
; CHECK-DAG: csr Clobbered Registers: $vgpr0 $vgpr0_hi16 $vgpr0_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1 $vgpr0_vgpr1_vgpr2 {{$}}
define void @csr() #0 {
call void asm sideeffect "", "~{v0},~{v44},~{v45}"() #0
ret void
}
; CHECK-DAG: subregs_for_super Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
; CHECK-DAG: subregs_for_super Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
define void @subregs_for_super() #0 {
call void asm sideeffect "", "~{v0},~{v1}"() #0
ret void
}
; CHECK-DAG: clobbered_reg_with_sub Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
; CHECK-DAG: clobbered_reg_with_sub Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
define void @clobbered_reg_with_sub() #0 {
call void asm sideeffect "", "~{v[0:1]}"() #0
ret void


@ -25,7 +25,8 @@ entry:
}
; FUNC-LABEL: {{^}}constant_load_v3i64:
; GCN: s_load_dwordx8 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN-DAG: s_load_dwordx4 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN-DAG: s_load_dwordx2 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x{{[0-9]+}}{{$}}
; EG-DAG: VTX_READ_128
; EG-DAG: VTX_READ_128


@ -25,10 +25,10 @@ entry:
}
; FUNC-LABEL: {{^}}global_load_v3f64:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-NOHSA-DAG: buffer_load_dwordx4
; GCN-NOHSA-DAG: buffer_load_dwordx2
; GCN-HSA-DAG: flat_load_dwordx4
; GCN-HSA-DAG: flat_load_dwordx2
define amdgpu_kernel void @global_load_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
entry:
%ld = load <3 x double>, <3 x double> addrspace(1)* %in


@ -32,11 +32,11 @@ entry:
}
; FUNC-LABEL: {{^}}global_load_v3i64:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA-DAG: buffer_load_dwordx4
; GCN-NOHSA-DAG: buffer_load_dwordx2
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA-DAG: flat_load_dwordx4
; GCN-HSA-DAG: flat_load_dwordx2
; EG: VTX_READ_128
; EG: VTX_READ_128


@ -66,38 +66,38 @@ define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, <4 x i32> inreg
; CHECK-NEXT: s_mov_b32 s6, s4
; CHECK-NEXT: s_mov_b32 s5, s3
; CHECK-NEXT: s_mov_b32 s4, s2
; CHECK-NEXT: v_add_i32_e32 v0, vcc, 16, v1
; CHECK-NEXT: v_add_i32_e32 v0, vcc, 4, v1
; CHECK-NEXT: v_add_i32_e32 v5, vcc, 8, v1
; CHECK-NEXT: v_add_i32_e32 v6, vcc, 12, v1
; CHECK-NEXT: v_add_i32_e32 v4, vcc, 8, v1
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 4, v1
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 16, v1
; CHECK-NEXT: v_add_i32_e32 v8, vcc, 20, v1
; CHECK-NEXT: v_mov_b32_e32 v9, s0
; CHECK-NEXT: v_add_i32_e32 v10, vcc, 16, v2
; CHECK-NEXT: v_add_i32_e32 v11, vcc, 12, v2
; CHECK-NEXT: v_add_i32_e32 v12, vcc, 8, v2
; CHECK-NEXT: v_add_i32_e32 v10, vcc, 4, v2
; CHECK-NEXT: v_add_i32_e32 v11, vcc, 8, v2
; CHECK-NEXT: v_add_i32_e32 v12, vcc, 12, v2
; CHECK-NEXT: s_mov_b32 m0, -1
; CHECK-NEXT: ds_read_b32 v3, v1
; CHECK-NEXT: ds_read_b32 v5, v4
; CHECK-NEXT: ds_read_b32 v4, v7
; CHECK-NEXT: ds_read_b32 v1, v8
; CHECK-NEXT: ds_read_b32 v4, v0
; CHECK-NEXT: ds_read_b32 v5, v5
; CHECK-NEXT: ds_read_b32 v6, v6
; CHECK-NEXT: ds_read_b32 v0, v0
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 4, v2
; CHECK-NEXT: ds_read_b32 v0, v7
; CHECK-NEXT: ds_read_b32 v1, v8
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 16, v2
; CHECK-NEXT: v_add_i32_e32 v8, vcc, 20, v2
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc
; CHECK-NEXT: s_waitcnt expcnt(0)
; CHECK-NEXT: ds_read_b32 v0, v2
; CHECK-NEXT: ds_read_b32 v2, v12
; CHECK-NEXT: ds_read_b32 v1, v7
; CHECK-NEXT: ds_read_b32 v1, v10
; CHECK-NEXT: ds_read_b32 v2, v11
; CHECK-NEXT: ds_read_b32 v3, v12
; CHECK-NEXT: ds_read_b32 v4, v7
; CHECK-NEXT: ds_read_b32 v5, v8
; CHECK-NEXT: ds_read_b32 v3, v11
; CHECK-NEXT: ds_read_b32 v4, v10
; CHECK-NEXT: s_waitcnt lgkmcnt(5)
; CHECK-NEXT: exp mrt0 off, off, off, off
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: tbuffer_store_format_xyzw v[0:3], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: tbuffer_store_format_xy v[4:5], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc


@ -499,7 +499,7 @@ define amdgpu_kernel void @s_test_sdiv24_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_sdiv24_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@ -529,7 +529,7 @@ define amdgpu_kernel void @s_test_sdiv24_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_sdiv24_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
@ -671,7 +671,7 @@ define amdgpu_kernel void @s_test_sdiv31_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_sdiv31_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@ -701,7 +701,7 @@ define amdgpu_kernel void @s_test_sdiv31_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_sdiv31_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
@ -738,7 +738,7 @@ define amdgpu_kernel void @s_test_sdiv23_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_sdiv23_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@ -768,7 +768,7 @@ define amdgpu_kernel void @s_test_sdiv23_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_sdiv23_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
@ -805,7 +805,7 @@ define amdgpu_kernel void @s_test_sdiv25_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_sdiv25_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@ -835,7 +835,7 @@ define amdgpu_kernel void @s_test_sdiv25_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_sdiv25_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)


@ -480,7 +480,7 @@ define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_srem23_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@ -512,7 +512,7 @@ define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_srem23_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
@ -551,7 +551,7 @@ define amdgpu_kernel void @s_test_srem24_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_srem24_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@ -583,7 +583,7 @@ define amdgpu_kernel void @s_test_srem24_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_srem24_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
@ -676,7 +676,7 @@ define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_srem25_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@ -708,7 +708,7 @@ define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_srem25_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
@ -747,7 +747,7 @@ define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_srem31_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@ -779,7 +779,7 @@ define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_srem31_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)


@ -1297,37 +1297,37 @@ define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readon
; GFX9-LABEL: fma_shuffle:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3]
; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[8:9]
; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
; GFX9-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[8:9]
; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: fma_shuffle:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3]
; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[8:9]
; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
; GFX10-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
; GFX10-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[8:9]
; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
entry:
%tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()