AMDGPU/GlobalISel: Handle image atomics

commit bcb643c8af
parent 48eda37282
@@ -1245,7 +1245,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   const LLT AddrTy = MRI->getType(MI.getOperand(VAddrIdx).getReg());
   const bool IsA16 = AddrTy.getScalarType() == S16;
 
-  Register VData;
+  Register VDataIn, VDataOut;
   LLT VDataTy;
   int NumVDataDwords = -1;
   bool IsD16 = false;
@@ -1271,7 +1271,24 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   unsigned DMaskLanes = 0;
 
   if (BaseOpcode->Atomic) {
-    return false; // TODO
+    VDataOut = MI.getOperand(0).getReg();
+    VDataIn = MI.getOperand(2).getReg();
+    LLT Ty = MRI->getType(VDataIn);
+
+    // Be careful to allow atomic swap on 16-bit element vectors.
+    const bool Is64Bit = BaseOpcode->AtomicX2 ?
+      Ty.getSizeInBits() == 128 :
+      Ty.getSizeInBits() == 64;
+
+    if (BaseOpcode->AtomicX2) {
+      assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
+
+      DMask = Is64Bit ? 0xf : 0x3;
+      NumVDataDwords = Is64Bit ? 4 : 2;
+    } else {
+      DMask = Is64Bit ? 0x3 : 0x1;
+      NumVDataDwords = Is64Bit ? 2 : 1;
+    }
   } else {
     const int DMaskIdx = 2; // Input/output + intrinsic ID.
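In short: an image atomic returns the pre-op value through vdata, and the X2 form (cmpswap) carries two data values packed together, so both the dmask and the result dword count double. The mapping chosen above can be restated as a small standalone helper; this is a sketch for readers only, and the struct and function names are not part of the patch:

    // Illustrative restatement of the dmask/dword mapping used above.
    struct AtomicVDataLayout {
      unsigned DMask;     // lanes enabled for the vdata operand
      int NumVDataDwords; // dwords occupied by the (packed) vdata
    };

    static AtomicVDataLayout getAtomicVDataLayout(bool IsAtomicX2,
                                                  unsigned DataSizeInBits) {
      // For cmpswap the data operand is the packed pair of values, so a
      // 64-bit atomic shows up here as a 128-bit packed type.
      const bool Is64Bit = IsAtomicX2 ? DataSizeInBits == 128
                                      : DataSizeInBits == 64;
      if (IsAtomicX2)
        return {Is64Bit ? 0xfu : 0x3u, Is64Bit ? 4 : 2};
      return {Is64Bit ? 0x3u : 0x1u, Is64Bit ? 2 : 1};
    }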
@@ -1279,12 +1296,12 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
     DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
 
     if (BaseOpcode->Store) {
-      VData = MI.getOperand(1).getReg();
-      VDataTy = MRI->getType(VData);
+      VDataIn = MI.getOperand(1).getReg();
+      VDataTy = MRI->getType(VDataIn);
       NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
     } else {
-      VData = MI.getOperand(0).getReg();
-      VDataTy = MRI->getType(VData);
+      VDataOut = MI.getOperand(0).getReg();
+      VDataTy = MRI->getType(VDataOut);
       NumVDataDwords = DMaskLanes;
 
       // One memoperand is mandatory, except for getresinfo.
@@ -1386,11 +1403,25 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
     .cloneMemRefs(MI);
 
-  if (!BaseOpcode->Store || BaseOpcode->Atomic)
-    MIB.addDef(VData); // vdata output
-
-  if (BaseOpcode->Store || BaseOpcode->Atomic)
-    MIB.addReg(VData); // vdata input
+  if (VDataOut) {
+    if (BaseOpcode->AtomicX2) {
+      const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
+
+      Register TmpReg = MRI->createVirtualRegister(
+        Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
+      unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
+
+      MIB.addDef(TmpReg);
+      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
+        .addReg(TmpReg, RegState::Kill, SubReg);
+
+    } else {
+      MIB.addDef(VDataOut); // vdata output
+    }
+  }
+
+  if (VDataIn)
+    MIB.addReg(VDataIn); // vdata input
 
   for (int i = 0; i != NumVAddrRegs; ++i) {
     MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i);
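A note on the AtomicX2 (cmpswap) destination handled above: the MIMG instruction defines the whole packed pair, while the intrinsic result is only the original value, so the selector defines a scratch wide register and copies its low half into VDataOut. The register class and subregister choice, pulled out as a standalone sketch (helper names are illustrative, and this assumes the usual AMDGPU backend headers and generated register definitions):

    // Illustrative only: destination shape for the X2 (cmpswap) case above.
    // A 32-bit cmpswap defines a 64-bit pair and keeps sub0; a 64-bit
    // cmpswap defines a 128-bit pair and keeps sub0_sub1.
    static const TargetRegisterClass *getCmpSwapTmpRC(bool Is64) {
      return Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
    }

    static unsigned getCmpSwapResultSubReg(bool Is64) {
      return Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
    }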
@@ -3698,6 +3698,24 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
     MI.getOperand(DMaskIdx).setImm(DMask);
   }
 
+  if (BaseOpcode->Atomic) {
+    Register VData0 = MI.getOperand(2).getReg();
+    LLT Ty = MRI->getType(VData0);
+
+    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
+    if (Ty.isVector())
+      return false;
+
+    if (BaseOpcode->AtomicX2) {
+      Register VData1 = MI.getOperand(3).getReg();
+      // The two values are packed in one register.
+      LLT PackedTy = LLT::vector(2, Ty);
+      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
+      MI.getOperand(2).setReg(Concat.getReg(0));
+      MI.getOperand(3).setReg(AMDGPU::NoRegister);
+    }
+  }
+
   int CorrectedNumVAddrs = NumVAddrs;
 
   // Optimize _L to _LZ when _L is zero
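At the generic MIR level, the effect of the AtomicX2 block above for a 32-bit cmpswap is to merge the two scalar data operands into a single <2 x s32> value and clear the second operand, which is what the updated test checks further down look for. A minimal sketch under those assumptions, with VData0, VData1, B, and MI as in the hunk, and LLT::vector(2, ...) spelled as in LLVM of this vintage:

    // Sketch only: pack the two cmpswap data operands, then drop operand 3.
    //   before: ..., %v0:_(s32), %v1:_(s32), <coords>, <rsrc>, 0, 0
    //   after:  ..., %packed:_(<2 x s32>), $noreg, <coords>, <rsrc>, 0, 0
    LLT S32 = LLT::scalar(32);
    LLT PackedTy = LLT::vector(2, S32);
    auto Packed = B.buildBuildVector(PackedTy, {VData0, VData1});
    MI.getOperand(2).setReg(Packed.getReg(0));
    MI.getOperand(3).setReg(AMDGPU::NoRegister);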
@@ -3785,6 +3803,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
     convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
   }
 
   if (BaseOpcode->Store) { // No TFE for stores?
     // TODO: Handle dmask trim
     Register VData = MI.getOperand(1).getReg();
@@ -536,7 +536,8 @@ define amdgpu_ps float @atomic_cmpswap_1d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
 ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.1d), [[COPY8]](s32), [[COPY9]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
+; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.1d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_cmpswap_1d
@@ -555,7 +556,8 @@ define amdgpu_ps float @atomic_cmpswap_1d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
 ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
 ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.1d), [[COPY8]](s32), [[COPY9]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
+; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.1d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -1014,10 +1016,11 @@ define amdgpu_ps float @atomic_cmpswap_2d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
 ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
 ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
 ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
 ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
-; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2d), [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_cmpswap_2d
@@ -1036,10 +1039,11 @@ define amdgpu_ps float @atomic_cmpswap_2d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
 ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
 ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
 ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
 ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
-; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2d), [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -1066,6 +1070,7 @@ define amdgpu_ps float @atomic_cmpswap_3d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
 ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
 ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
 ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
 ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
@@ -1073,7 +1078,7 @@ define amdgpu_ps float @atomic_cmpswap_3d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
 ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
-; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.3d), [[COPY8]](s32), [[COPY9]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.3d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_cmpswap_3d
@@ -1093,13 +1098,14 @@ define amdgpu_ps float @atomic_cmpswap_3d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
 ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
 ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
 ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
 ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
 ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
 ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
-; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.3d), [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.3d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -1127,6 +1133,7 @@ define amdgpu_ps float @atomic_cmpswap_2darraymsaa(<8 x i32> inreg %rsrc, i32 %c
 ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
 ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
 ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
 ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
@@ -1134,7 +1141,7 @@ define amdgpu_ps float @atomic_cmpswap_2darraymsaa(<8 x i32> inreg %rsrc, i32 %c
 ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
-; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2darraymsaa), [[COPY8]](s32), [[COPY9]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2darraymsaa), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_cmpswap_2darraymsaa
@@ -1155,13 +1162,14 @@ define amdgpu_ps float @atomic_cmpswap_2darraymsaa(<8 x i32> inreg %rsrc, i32 %c
 ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
 ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
 ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
 ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
 ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
 ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
-; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2darraymsaa), [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2darraymsaa), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
(Two additional file diffs in this commit are suppressed because they are too large to display.)