AMDGPU/GlobalISel: Handle image atomics

Matt Arsenault authored on 2020-02-08 19:08:34 -05:00, committed by Matt Arsenault
parent 48eda37282
commit bcb643c8af
5 changed files with 3820 additions and 18 deletions
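For context, the IR pattern these changes now handle end-to-end looks roughly like the sketch below. It mirrors the atomic_cmpswap_1d test updated in this commit; the exact intrinsic mangling suffix and the trailing texfailctrl/cachepolicy immediates are assumed rather than copied from the diff.

define amdgpu_ps float @atomic_cmpswap_1d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %swap, i32 %s) {
main_body:
  ; The legalizer now packs %cmp and %swap into a single <2 x s32> register, the
  ; selector emits the image atomic, and sub0 of the result is copied back out.
  %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
  %out = bitcast i32 %v to float
  ret float %out
}

declare i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg)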


@@ -1245,7 +1245,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   const LLT AddrTy = MRI->getType(MI.getOperand(VAddrIdx).getReg());
   const bool IsA16 = AddrTy.getScalarType() == S16;
-  Register VData;
+  Register VDataIn, VDataOut;
   LLT VDataTy;
   int NumVDataDwords = -1;
   bool IsD16 = false;
@@ -1271,7 +1271,24 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   unsigned DMaskLanes = 0;
   if (BaseOpcode->Atomic) {
-    return false; // TODO
+    VDataOut = MI.getOperand(0).getReg();
+    VDataIn = MI.getOperand(2).getReg();
+    LLT Ty = MRI->getType(VDataIn);
+    // Be careful to allow atomic swap on 16-bit element vectors.
+    const bool Is64Bit = BaseOpcode->AtomicX2 ?
+      Ty.getSizeInBits() == 128 :
+      Ty.getSizeInBits() == 64;
+    if (BaseOpcode->AtomicX2) {
+      assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
+      DMask = Is64Bit ? 0xf : 0x3;
+      NumVDataDwords = Is64Bit ? 4 : 2;
+    } else {
+      DMask = Is64Bit ? 0x3 : 0x1;
+      NumVDataDwords = Is64Bit ? 2 : 1;
+    }
   } else {
     const int DMaskIdx = 2; // Input/output + intrinsic ID.
@@ -1279,12 +1296,12 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
     DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
     if (BaseOpcode->Store) {
-      VData = MI.getOperand(1).getReg();
-      VDataTy = MRI->getType(VData);
+      VDataIn = MI.getOperand(1).getReg();
+      VDataTy = MRI->getType(VDataIn);
       NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
     } else {
-      VData = MI.getOperand(0).getReg();
-      VDataTy = MRI->getType(VData);
+      VDataOut = MI.getOperand(0).getReg();
+      VDataTy = MRI->getType(VDataOut);
       NumVDataDwords = DMaskLanes;
       // One memoperand is mandatory, except for getresinfo.
@@ -1386,11 +1403,25 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
     .cloneMemRefs(MI);
-  if (!BaseOpcode->Store || BaseOpcode->Atomic)
-    MIB.addDef(VData); // vdata output
-  if (BaseOpcode->Store || BaseOpcode->Atomic)
-    MIB.addReg(VData); // vdata input
+  if (VDataOut) {
+    if (BaseOpcode->AtomicX2) {
+      const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
+      Register TmpReg = MRI->createVirtualRegister(
+        Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
+      unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
+      MIB.addDef(TmpReg);
+      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
+        .addReg(TmpReg, RegState::Kill, SubReg);
+    } else {
+      MIB.addDef(VDataOut); // vdata output
+    }
+  }
+  if (VDataIn)
+    MIB.addReg(VDataIn); // vdata input
   for (int i = 0; i != NumVAddrRegs; ++i) {
     MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i);


@@ -3698,6 +3698,24 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
     MI.getOperand(DMaskIdx).setImm(DMask);
   }
+  if (BaseOpcode->Atomic) {
+    Register VData0 = MI.getOperand(2).getReg();
+    LLT Ty = MRI->getType(VData0);
+    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
+    if (Ty.isVector())
+      return false;
+    if (BaseOpcode->AtomicX2) {
+      Register VData1 = MI.getOperand(3).getReg();
+      // The two values are packed in one register.
+      LLT PackedTy = LLT::vector(2, Ty);
+      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
+      MI.getOperand(2).setReg(Concat.getReg(0));
+      MI.getOperand(3).setReg(AMDGPU::NoRegister);
+    }
+  }
   int CorrectedNumVAddrs = NumVAddrs;
   // Optimize _L to _LZ when _L is zero
@@ -3785,6 +3803,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
     convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
   }
   if (BaseOpcode->Store) { // No TFE for stores?
+    // TODO: Handle dmask trim
     Register VData = MI.getOperand(1).getReg();


@@ -536,7 +536,8 @@ define amdgpu_ps float @atomic_cmpswap_1d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.1d), [[COPY8]](s32), [[COPY9]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.1d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
; GFX10NSA-LABEL: name: atomic_cmpswap_1d
@@ -555,7 +556,8 @@ define amdgpu_ps float @atomic_cmpswap_1d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.1d), [[COPY8]](s32), [[COPY9]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.1d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
main_body:
@@ -1014,10 +1016,11 @@ define amdgpu_ps float @atomic_cmpswap_2d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2d), [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
; GFX10NSA-LABEL: name: atomic_cmpswap_2d
@@ -1036,10 +1039,11 @@ define amdgpu_ps float @atomic_cmpswap_2d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2d), [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
main_body:
@@ -1066,6 +1070,7 @@ define amdgpu_ps float @atomic_cmpswap_3d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
@@ -1073,7 +1078,7 @@ define amdgpu_ps float @atomic_cmpswap_3d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.3d), [[COPY8]](s32), [[COPY9]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.3d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
; GFX10NSA-LABEL: name: atomic_cmpswap_3d
@@ -1093,13 +1098,14 @@ define amdgpu_ps float @atomic_cmpswap_3d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.3d), [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.3d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
main_body:
@@ -1127,6 +1133,7 @@ define amdgpu_ps float @atomic_cmpswap_2darraymsaa(<8 x i32> inreg %rsrc, i32 %c
; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
@@ -1134,7 +1141,7 @@ define amdgpu_ps float @atomic_cmpswap_2darraymsaa(<8 x i32> inreg %rsrc, i32 %c
; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2darraymsaa), [[COPY8]](s32), [[COPY9]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2darraymsaa), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
; GFX10NSA-LABEL: name: atomic_cmpswap_2darraymsaa
@@ -1155,13 +1162,14 @@ define amdgpu_ps float @atomic_cmpswap_2darraymsaa(<8 x i32> inreg %rsrc, i32 %c
; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2darraymsaa), [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
+ ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2darraymsaa), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (volatile dereferenceable load store 4 on custom "TargetCustom8")
; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
main_body:

File diff suppressed because it is too large.

File diff suppressed because it is too large.