AMDGPU/GlobalISel: Insert readfirstlane on SGPR returns

In case the source value ends up in a VGPR, insert a readfirstlane to
avoid producing an illegal copy later. If it turns out to be
unnecessary, it can be folded out.
Matt Arsenault, 2020-02-14 21:23:07 -05:00, committed by Matt Arsenault
parent a314050065
commit 67cfbec746
6 changed files with 693 additions and 257 deletions
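
Illustration (not part of the commit): a minimal sketch of the new IRTranslator output for a scalar return from a shader calling convention. The function name below is made up; the MIR matches the new sgpr_return_i32 test further down. The i32 argument arrives in $vgpr0, so the returned value lives in a VGPR, while an i32 return from amdgpu_ps is expected in $sgpr0. Previously the IRTranslator emitted a direct copy into $sgpr0, which risks becoming an illegal VGPR-to-SGPR copy; it now routes the value through llvm.amdgcn.readfirstlane first.

; Hypothetical example, in the style of the irtranslator tests below.
define amdgpu_ps i32 @example_sgpr_return(i32 %vgpr) {
  ret i32 %vgpr
}

; Generic MIR emitted after this change (abridged):
;   %0:_(s32) = COPY $vgpr0
;   %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0(s32)
;   $sgpr0 = COPY %1(s32)
;   SI_RETURN_TO_EPILOG implicit $sgpr0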

@@ -59,6 +59,18 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
    } else
      ExtReg = extendRegister(ValVReg, VA);
    // If this is a scalar return, insert a readfirstlane just in case the value
    // ends up in a VGPR.
    // FIXME: Assert this is a shader return.
    const SIRegisterInfo *TRI
      = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (TRI->isSGPRReg(MRI, PhysReg)) {
      auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
                                              {MRI.getType(ExtReg)}, false)
        .addReg(ExtReg);
      ExtReg = ToSGPR.getReg(0);
    }
    MIRBuilder.buildCopy(PhysReg, ExtReg);
    MIB.addUse(PhysReg, RegState::Implicit);
  }

@@ -29,8 +29,7 @@ define amdgpu_ps i32 @s_bswap_i32(i32 inreg %src) {
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
%bswap = call i32 @llvm.bswap.i32(i32 %src)
%to.sgpr = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap)
ret i32 %to.sgpr
ret i32 %bswap
}
define i32 @v_bswap_i32(i32 %src) {
@@ -96,13 +95,7 @@ define amdgpu_ps <2 x i32> @s_bswap_v2i32(<2 x i32> inreg %src) {
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: ; return to shader part epilog
%bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src)
%bswap.0 = extractelement <2 x i32> %bswap, i32 0
%bswap.1 = extractelement <2 x i32> %bswap, i32 1
%to.sgpr0 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.0)
%to.sgpr1 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.1)
%ins.0 = insertelement <2 x i32> undef, i32 %to.sgpr0, i32 0
%ins.1 = insertelement <2 x i32> %ins.0, i32 %to.sgpr1, i32 1
ret <2 x i32> %ins.1
ret <2 x i32> %bswap
}
define <2 x i32> @v_bswap_v2i32(<2 x i32> %src) {
@@ -137,7 +130,7 @@ define <2 x i32> @v_bswap_v2i32(<2 x i32> %src) {
ret <2 x i32> %bswap
}
define amdgpu_ps <2 x i32> @s_bswap_i64(i64 inreg %src) {
define amdgpu_ps i64 @s_bswap_i64(i64 inreg %src) {
; GFX7-LABEL: s_bswap_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_alignbit_b32 v0, s1, s1, 8
@@ -173,14 +166,7 @@ define amdgpu_ps <2 x i32> @s_bswap_i64(i64 inreg %src) {
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: ; return to shader part epilog
%bswap = call i64 @llvm.bswap.i64(i64 %src)
%cast = bitcast i64 %bswap to <2 x i32>
%elt0 = extractelement <2 x i32> %cast, i32 0
%elt1 = extractelement <2 x i32> %cast, i32 1
%to.sgpr0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
%to.sgpr1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
%ins.0 = insertelement <2 x i32> undef, i32 %to.sgpr0, i32 0
%ins.1 = insertelement <2 x i32> %ins.0, i32 %to.sgpr1, i32 1
ret <2 x i32> %ins.1
ret i64 %bswap
}
define i64 @v_bswap_i64(i64 %src) {
@@ -218,7 +204,7 @@ define i64 @v_bswap_i64(i64 %src) {
ret i64 %bswap
}
define amdgpu_ps <4 x i32> @s_bswap_v2i64(<2 x i64> inreg %src) {
define amdgpu_ps <2 x i64> @s_bswap_v2i64(<2 x i64> inreg %src) {
; GFX7-LABEL: s_bswap_v2i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_alignbit_b32 v0, s1, s1, 8
@@ -274,20 +260,7 @@ define amdgpu_ps <4 x i32> @s_bswap_v2i64(<2 x i64> inreg %src) {
; GFX9-NEXT: v_readfirstlane_b32 s3, v3
; GFX9-NEXT: ; return to shader part epilog
%bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %src)
%cast = bitcast <2 x i64> %bswap to <4 x i32>
%bswap.0 = extractelement <4 x i32> %cast, i32 0
%bswap.1 = extractelement <4 x i32> %cast, i32 1
%bswap.2 = extractelement <4 x i32> %cast, i32 2
%bswap.3 = extractelement <4 x i32> %cast, i32 3
%to.sgpr0 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.0)
%to.sgpr1 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.1)
%to.sgpr2 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.2)
%to.sgpr3 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.3)
%ins.0 = insertelement <4 x i32> undef, i32 %to.sgpr0, i32 0
%ins.1 = insertelement <4 x i32> %ins.0, i32 %to.sgpr1, i32 1
%ins.2 = insertelement <4 x i32> %ins.1, i32 %to.sgpr2, i32 2
%ins.3 = insertelement <4 x i32> %ins.2, i32 %to.sgpr3, i32 3
ret <4 x i32> %ins.3
ret <2 x i64> %bswap
}
define <2 x i64> @v_bswap_v2i64(<2 x i64> %src) {
@@ -345,7 +318,6 @@ define amdgpu_ps i16 @s_bswap_i16(i16 inreg %src) {
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: s_lshr_b32 s0, s0, 8
; GFX7-NEXT: s_or_b32 s0, s0, s1
; GFX7-NEXT: s_bfe_u32 s0, s0, 0x100000
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_bswap_i16:
@@ -364,10 +336,7 @@ define amdgpu_ps i16 @s_bswap_i16(i16 inreg %src) {
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
%bswap = call i16 @llvm.bswap.i16(i16 %src)
%zext = zext i16 %bswap to i32
%to.sgpr = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
%trunc = trunc i32 %to.sgpr to i16
ret i16 %trunc
ret i16 %bswap
}
define i16 @v_bswap_i16(i16 %src) {
@@ -431,9 +400,8 @@ define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) {
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
%bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src)
%cast0 = bitcast <2 x i16> %bswap to i32
%to.sgpr = call i32 @llvm.amdgcn.readfirstlane(i32 %cast0)
ret i32 %to.sgpr
%cast = bitcast <2 x i16> %bswap to i32
ret i32 %cast
}
define i32 @v_bswap_i16_zext_to_i32(i16 %src) {
@@ -574,7 +542,6 @@ define i64 @v_bswap_i48(i64 %src) {
ret i64 %zext
}
declare i32 @llvm.amdgcn.readfirstlane(i32) #0
declare i16 @llvm.bswap.i16(i16) #1
declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) #1
declare <3 x i16> @llvm.bswap.v3i16(<3 x i16>) #1

@@ -31,6 +31,77 @@ main_body:
ret void
}
define amdgpu_ps float @vgpr_return(i32 %vgpr) {
; CHECK-LABEL: name: vgpr_return
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $vgpr0
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: $vgpr0 = COPY [[COPY]](s32)
; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
%cast = bitcast i32 %vgpr to float
ret float %cast
}
define amdgpu_ps i32 @sgpr_return_i32(i32 %vgpr) {
; CHECK-LABEL: name: sgpr_return_i32
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $vgpr0
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32)
; CHECK: $sgpr0 = COPY [[INT]](s32)
; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0
ret i32 %vgpr
}
define amdgpu_ps i64 @sgpr_return_i64(i64 %vgpr) {
; CHECK-LABEL: name: sgpr_return_i64
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $vgpr0, $vgpr1
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
; CHECK: $sgpr0 = COPY [[INT]](s32)
; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
; CHECK: $sgpr1 = COPY [[INT1]](s32)
; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
ret i64 %vgpr
}
define amdgpu_ps <2 x i32> @sgpr_return_v2i32(<2 x i32> %vgpr) {
; CHECK-LABEL: name: sgpr_return_v2i32
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $vgpr0, $vgpr1
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s32>)
; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
; CHECK: $sgpr0 = COPY [[INT]](s32)
; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
; CHECK: $sgpr1 = COPY [[INT1]](s32)
; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
ret <2 x i32> %vgpr
}
define amdgpu_ps { i32, i32 } @sgpr_struct_return_i32_i32(i32 %vgpr0, i32 %vgpr1) {
; CHECK-LABEL: name: sgpr_struct_return_i32_i32
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $vgpr0, $vgpr1
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32)
; CHECK: $sgpr0 = COPY [[INT]](s32)
; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32)
; CHECK: $sgpr1 = COPY [[INT1]](s32)
; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%insertvalue0 = insertvalue { i32, i32 } undef, i32 %vgpr0, 0
%value = insertvalue { i32, i32 } %insertvalue0, i32 %vgpr1, 1
ret { i32, i32 } %value
}
declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #0
attributes #0 = { nounwind }

@@ -1,73 +1,99 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=irtranslator -global-isel %s -o - | FileCheck %s
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=irtranslator %s -o - | FileCheck %s
; CHECK-LABEL: name: test_f32_inreg
; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2
; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 32, 15, [[S0]]
define amdgpu_vs void @test_f32_inreg(float inreg %arg0) {
; CHECK-LABEL: name: test_f32_inreg
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $sgpr2
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 32, 15, [[COPY]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32), 0, 0
; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %arg0, float undef, float undef, float undef, i1 false, i1 false) #0
ret void
}
; CHECK-LABEL: name: test_f32
; CHECK: [[V0:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 32, 15, [[V0]]
define amdgpu_vs void @test_f32(float %arg0) {
; CHECK-LABEL: name: test_f32
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $vgpr0
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 32, 15, [[COPY]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32), 0, 0
; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %arg0, float undef, float undef, float undef, i1 false, i1 false) #0
ret void
}
; CHECK-LABEL: name: test_ptr2_inreg
; CHECK: [[S2:%[0-9]+]]:_(s32) = COPY $sgpr2
; CHECK: [[S3:%[0-9]+]]:_(s32) = COPY $sgpr3
; CHECK: [[PTR:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[S2]](s32), [[S3]](s32)
; CHECK: G_LOAD [[PTR]]
define amdgpu_vs void @test_ptr2_inreg(i32 addrspace(4)* inreg %arg0) {
; CHECK-LABEL: name: test_ptr2_inreg
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $sgpr2, $sgpr3
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; CHECK: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (volatile load 4 from %ir.arg0, addrspace 4)
; CHECK: S_ENDPGM 0
%tmp0 = load volatile i32, i32 addrspace(4)* %arg0
ret void
}
; CHECK-LABEL: name: test_sgpr_alignment0
; CHECK: [[S2:%[0-9]+]]:_(s32) = COPY $sgpr2
; CHECK: [[S3:%[0-9]+]]:_(s32) = COPY $sgpr3
; CHECK: [[S4:%[0-9]+]]:_(s32) = COPY $sgpr4
; CHECK: [[S34:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[S3]](s32), [[S4]](s32)
; CHECK: G_LOAD [[S34]]
; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 32, 15, [[S2]](s32)
define amdgpu_vs void @test_sgpr_alignment0(float inreg %arg0, i32 addrspace(4)* inreg %arg1) {
; CHECK-LABEL: name: test_sgpr_alignment0
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; CHECK: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (volatile load 4 from %ir.arg1, addrspace 4)
; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 32, 15, [[COPY]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32), 0, 0
; CHECK: S_ENDPGM 0
%tmp0 = load volatile i32, i32 addrspace(4)* %arg1
call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %arg0, float undef, float undef, float undef, i1 false, i1 false) #0
ret void
}
; CHECK-LABEL: name: test_order
; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2
; CHECK: [[S1:%[0-9]+]]:_(s32) = COPY $sgpr3
; CHECK: [[V0:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: [[V1:%[0-9]+]]:_(s32) = COPY $vgpr1
; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 32, 15, [[V0]](s32), [[S0]](s32), [[V1]](s32), [[S1]](s32)
define amdgpu_vs void @test_order(float inreg %arg0, float inreg %arg1, float %arg2, float %arg3) {
; CHECK-LABEL: name: test_order
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr1
; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 32, 15, [[COPY2]](s32), [[COPY]](s32), [[COPY3]](s32), [[COPY1]](s32), 0, 0
; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %arg2, float %arg0, float %arg3, float %arg1, i1 false, i1 false) #0
ret void
}
; CHECK-LABEL: name: ret_struct
; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2
; CHECK: [[S1:%[0-9]+]]:_(s32) = COPY $sgpr3
; CHECK: $sgpr0 = COPY [[S0]]
; CHECK: $sgpr1 = COPY [[S1]]
; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
define amdgpu_vs <{ i32, i32 }> @ret_struct(i32 inreg %arg0, i32 inreg %arg1) {
; CHECK-LABEL: name: ret_struct
; CHECK: bb.1.main_body:
; CHECK: liveins: $sgpr2, $sgpr3
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32)
; CHECK: $sgpr0 = COPY [[INT]](s32)
; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32)
; CHECK: $sgpr1 = COPY [[INT1]](s32)
; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
main_body:
%tmp0 = insertvalue <{ i32, i32 }> undef, i32 %arg0, 0
%tmp1 = insertvalue <{ i32, i32 }> %tmp0, i32 %arg1, 1
ret <{ i32, i32 }> %tmp1
}
; CHECK_LABEL: name: non_void_ret
; CHECK: [[ZERO:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK: $sgpr0 = COPY [[ZERO]]
; SI_RETURN_TO_EPILOG $sgpr0
define amdgpu_vs i32 @non_void_ret() {
; CHECK-LABEL: name: non_void_ret
; CHECK: bb.1 (%ir-block.0):
; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[C]](s32)
; CHECK: $sgpr0 = COPY [[INT]](s32)
; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0
ret i32 0
}

@@ -14,7 +14,9 @@ define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffse
; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 4)
; CHECK: $sgpr0 = COPY [[AMDGPU_S_BUFFER_LOAD]](s32)
; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[AMDGPU_S_BUFFER_LOAD]](s32)
; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
; CHECK: $sgpr0 = COPY [[INT]](s32)
; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0
%val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret i32 %val
@@ -32,8 +34,12 @@ define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg
; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 8, align 4)
; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<2 x s32>)
; CHECK: $sgpr0 = COPY [[UV]](s32)
; CHECK: $sgpr1 = COPY [[UV1]](s32)
; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
; CHECK: $sgpr0 = COPY [[INT]](s32)
; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
; CHECK: $sgpr1 = COPY [[INT1]](s32)
; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%val = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <2 x i32> %val
@@ -52,9 +58,15 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg
; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 12, align 4)
; CHECK: [[EXTRACT:%[0-9]+]]:sgpr(<3 x s32>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), 0
; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[EXTRACT]](<3 x s32>)
; CHECK: $sgpr0 = COPY [[UV]](s32)
; CHECK: $sgpr1 = COPY [[UV1]](s32)
; CHECK: $sgpr2 = COPY [[UV2]](s32)
; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
; CHECK: $sgpr0 = COPY [[INT]](s32)
; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
; CHECK: $sgpr1 = COPY [[INT1]](s32)
; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
; CHECK: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
; CHECK: $sgpr2 = COPY [[INT2]](s32)
; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2
%val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <3 x i32> %val
@@ -72,14 +84,30 @@ define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg
; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 32, align 4)
; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>)
; CHECK: $sgpr0 = COPY [[UV]](s32)
; CHECK: $sgpr1 = COPY [[UV1]](s32)
; CHECK: $sgpr2 = COPY [[UV2]](s32)
; CHECK: $sgpr3 = COPY [[UV3]](s32)
; CHECK: $sgpr4 = COPY [[UV4]](s32)
; CHECK: $sgpr5 = COPY [[UV5]](s32)
; CHECK: $sgpr6 = COPY [[UV6]](s32)
; CHECK: $sgpr7 = COPY [[UV7]](s32)
; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
; CHECK: $sgpr0 = COPY [[INT]](s32)
; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
; CHECK: $sgpr1 = COPY [[INT1]](s32)
; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
; CHECK: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
; CHECK: $sgpr2 = COPY [[INT2]](s32)
; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32)
; CHECK: [[INT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32)
; CHECK: $sgpr3 = COPY [[INT3]](s32)
; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32)
; CHECK: [[INT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
; CHECK: $sgpr4 = COPY [[INT4]](s32)
; CHECK: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32)
; CHECK: [[INT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32)
; CHECK: $sgpr5 = COPY [[INT5]](s32)
; CHECK: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32)
; CHECK: [[INT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32)
; CHECK: $sgpr6 = COPY [[INT6]](s32)
; CHECK: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32)
; CHECK: [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32)
; CHECK: $sgpr7 = COPY [[INT7]](s32)
; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7
%val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <8 x i32> %val
@@ -97,22 +125,54 @@ define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inr
; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 64, align 4)
; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32), [[UV8:%[0-9]+]]:sgpr(s32), [[UV9:%[0-9]+]]:sgpr(s32), [[UV10:%[0-9]+]]:sgpr(s32), [[UV11:%[0-9]+]]:sgpr(s32), [[UV12:%[0-9]+]]:sgpr(s32), [[UV13:%[0-9]+]]:sgpr(s32), [[UV14:%[0-9]+]]:sgpr(s32), [[UV15:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<16 x s32>)
; CHECK: $sgpr0 = COPY [[UV]](s32)
; CHECK: $sgpr1 = COPY [[UV1]](s32)
; CHECK: $sgpr2 = COPY [[UV2]](s32)
; CHECK: $sgpr3 = COPY [[UV3]](s32)
; CHECK: $sgpr4 = COPY [[UV4]](s32)
; CHECK: $sgpr5 = COPY [[UV5]](s32)
; CHECK: $sgpr6 = COPY [[UV6]](s32)
; CHECK: $sgpr7 = COPY [[UV7]](s32)
; CHECK: $sgpr8 = COPY [[UV8]](s32)
; CHECK: $sgpr9 = COPY [[UV9]](s32)
; CHECK: $sgpr10 = COPY [[UV10]](s32)
; CHECK: $sgpr11 = COPY [[UV11]](s32)
; CHECK: $sgpr12 = COPY [[UV12]](s32)
; CHECK: $sgpr13 = COPY [[UV13]](s32)
; CHECK: $sgpr14 = COPY [[UV14]](s32)
; CHECK: $sgpr15 = COPY [[UV15]](s32)
; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
; CHECK: $sgpr0 = COPY [[INT]](s32)
; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
; CHECK: $sgpr1 = COPY [[INT1]](s32)
; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
; CHECK: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
; CHECK: $sgpr2 = COPY [[INT2]](s32)
; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32)
; CHECK: [[INT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32)
; CHECK: $sgpr3 = COPY [[INT3]](s32)
; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32)
; CHECK: [[INT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
; CHECK: $sgpr4 = COPY [[INT4]](s32)
; CHECK: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32)
; CHECK: [[INT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32)
; CHECK: $sgpr5 = COPY [[INT5]](s32)
; CHECK: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32)
; CHECK: [[INT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32)
; CHECK: $sgpr6 = COPY [[INT6]](s32)
; CHECK: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32)
; CHECK: [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32)
; CHECK: $sgpr7 = COPY [[INT7]](s32)
; CHECK: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[UV8]](s32)
; CHECK: [[INT8:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY13]](s32)
; CHECK: $sgpr8 = COPY [[INT8]](s32)
; CHECK: [[COPY14:%[0-9]+]]:vgpr(s32) = COPY [[UV9]](s32)
; CHECK: [[INT9:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY14]](s32)
; CHECK: $sgpr9 = COPY [[INT9]](s32)
; CHECK: [[COPY15:%[0-9]+]]:vgpr(s32) = COPY [[UV10]](s32)
; CHECK: [[INT10:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY15]](s32)
; CHECK: $sgpr10 = COPY [[INT10]](s32)
; CHECK: [[COPY16:%[0-9]+]]:vgpr(s32) = COPY [[UV11]](s32)
; CHECK: [[INT11:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY16]](s32)
; CHECK: $sgpr11 = COPY [[INT11]](s32)
; CHECK: [[COPY17:%[0-9]+]]:vgpr(s32) = COPY [[UV12]](s32)
; CHECK: [[INT12:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY17]](s32)
; CHECK: $sgpr12 = COPY [[INT12]](s32)
; CHECK: [[COPY18:%[0-9]+]]:vgpr(s32) = COPY [[UV13]](s32)
; CHECK: [[INT13:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY18]](s32)
; CHECK: $sgpr13 = COPY [[INT13]](s32)
; CHECK: [[COPY19:%[0-9]+]]:vgpr(s32) = COPY [[UV14]](s32)
; CHECK: [[INT14:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY19]](s32)
; CHECK: $sgpr14 = COPY [[INT14]](s32)
; CHECK: [[COPY20:%[0-9]+]]:vgpr(s32) = COPY [[UV15]](s32)
; CHECK: [[INT15:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY20]](s32)
; CHECK: $sgpr15 = COPY [[INT15]](s32)
; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15
%val = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
ret <16 x i32> %val