AMDGPU: Stop wasting argument registers with v3i32/v3f32

SelectionDAGBuilder widens v3i32/v3f32 arguments to v4i32/v4f32,
which consumes an additional register. In addition to wasting
argument space, this produces extra instructions, since most
combines then treat the 4th vector component as holding a
meaningful value.

llvm-svn: 338197
Matt Arsenault 2018-07-28 14:11:34 +00:00
parent 3878bf83dd
commit 8f9dde94b7
7 changed files with 229 additions and 5 deletions
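
For illustration only (not part of this commit): a minimal IR sketch, using a hypothetical non-kernel function @takes_v3f32, of the argument assignment the message describes; the register comments restate the before/after behavior exercised by the wasted-reg tests below.

; Previously %v was widened to <4 x float> and occupied v0-v3, pushing %x
; into v4; with this change %v takes only v0-v2 and %x is passed in v3.
define void @takes_v3f32(<3 x float> %v, i32 %x) {
  ret void
}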


@@ -694,6 +694,52 @@ bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
return false;
}

MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                    CallingConv::ID CC,
                                                    EVT VT) const {
  if (CC != CallingConv::AMDGPU_KERNEL &&
      VT.isVector() && VT.getVectorNumElements() == 3) {
    EVT ScalarVT = VT.getScalarType();
    if (ScalarVT.getSizeInBits() == 32)
      return ScalarVT.getSimpleVT();
  }

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}

unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                         CallingConv::ID CC,
                                                         EVT VT) const {
  if (CC != CallingConv::AMDGPU_KERNEL &&
      VT.isVector() && VT.getVectorNumElements() == 3) {
    EVT ScalarVT = VT.getScalarType();
    if (ScalarVT.getSizeInBits() == 32)
      return 3;
  }

  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}

unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
  LLVMContext &Context, CallingConv::ID CC,
  EVT VT, EVT &IntermediateVT,
  unsigned &NumIntermediates, MVT &RegisterVT) const {
  if (CC != CallingConv::AMDGPU_KERNEL && VT.getVectorNumElements() == 3) {
    EVT ScalarVT = VT.getScalarType();
    if (ScalarVT.getSizeInBits() == 32 ||
        ScalarVT.getSizeInBits() == 64) {
      RegisterVT = ScalarVT.getSimpleVT();
      IntermediateVT = RegisterVT;
      NumIntermediates = 3;
      return NumIntermediates;
    }
  }

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
    Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}

bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          MachineFunction &MF,


@@ -25,6 +25,19 @@ class SITargetLowering final : public AMDGPUTargetLowering {
private:
  const GCNSubtarget *Subtarget;

public:
  MVT getRegisterTypeForCallingConv(LLVMContext &Context,
                                    CallingConv::ID CC,
                                    EVT VT) const override;

  unsigned getNumRegistersForCallingConv(LLVMContext &Context,
                                         CallingConv::ID CC,
                                         EVT VT) const override;

  unsigned getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const override;

private:
  SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL,
                                   SDValue Chain, uint64_t Offset) const;
  SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const;


@@ -17,6 +17,9 @@ declare void @external_void_func_i16_zeroext(i16 zeroext) #0
declare void @external_void_func_i32(i32) #0
declare void @external_void_func_i64(i64) #0
declare void @external_void_func_v2i64(<2 x i64>) #0
declare void @external_void_func_v3i64(<3 x i64>) #0
declare void @external_void_func_v4i64(<4 x i64>) #0
declare void @external_void_func_f16(half) #0
declare void @external_void_func_f32(float) #0
@@ -27,6 +30,7 @@ declare void @external_void_func_v2f16(<2 x half>) #0
declare void @external_void_func_v2i32(<2 x i32>) #0
declare void @external_void_func_v3i32(<3 x i32>) #0
declare void @external_void_func_v3i32_i32(<3 x i32>, i32) #0
declare void @external_void_func_v4i32(<4 x i32>) #0
declare void @external_void_func_v8i32(<8 x i32>) #0
declare void @external_void_func_v16i32(<16 x i32>) #0
@@ -255,6 +259,47 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v2i64:
; GCN: buffer_load_dwordx4 v[0:3]
; GCN: s_waitcnt
; GCN-NEXT: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
%val = load <2 x i64>, <2 x i64> addrspace(1)* null
call void @external_void_func_v2i64(<2 x i64> %val)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v3i64:
; GCN: buffer_load_dwordx4 v[0:3]
; GCN: v_mov_b32_e32 v4, s
; GCN: v_mov_b32_e32 v5, s
; GCN: s_waitcnt
; GCN-NEXT: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
%load = load <2 x i64>, <2 x i64> addrspace(1)* null
%val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 undef>, <3 x i32> <i32 0, i32 1, i32 2>
call void @external_void_func_v3i64(<3 x i64> %val)
ret void
}
; FIXME: Immediates should fold directly into v_mov_b32s
; GCN-LABEL: {{^}}test_call_external_void_func_v4i64:
; GCN: buffer_load_dwordx4 v[0:3]
; GCN: v_mov_b32_e32 v4, s
; GCN: v_mov_b32_e32 v5, s
; GCN: v_mov_b32_e32 v6, s
; GCN: v_mov_b32_e32 v7, s
; GCN: s_waitcnt
; GCN-NEXT: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
%load = load <2 x i64>, <2 x i64> addrspace(1)* null
%val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
call void @external_void_func_v4i64(<4 x i64> %val)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_f16_imm:
; VI: v_mov_b32_e32 v0, 0x4400
; CI: v_mov_b32_e32 v0, 4.0
@@ -313,15 +358,14 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
ret void
}
; FIXME: Passing 4th
; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm:
; HSA-DAG: s_mov_b32 s33, s9
; MESA-DAG: s_mov_b32 s33, s3{{$}}
; GCN-DAG: v_mov_b32_e32 v0
; GCN-DAG: v_mov_b32_e32 v1
; GCN-DAG: v_mov_b32_e32 v2
; GCN-DAG: v_mov_b32_e32 v3
; GCN-DAG: v_mov_b32_e32 v0, 3
; GCN-DAG: v_mov_b32_e32 v1, 4
; GCN-DAG: v_mov_b32_e32 v2, 5
; GCN-NOT: v3
; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
@@ -329,6 +373,16 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_i32:
; GCN-DAG: v_mov_b32_e32 v0, 3
; GCN-DAG: v_mov_b32_e32 v1, 4
; GCN-DAG: v_mov_b32_e32 v2, 5
; GCN-DAG: v_mov_b32_e32 v3, 6
define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
call void @external_void_func_v3i32_i32(<3 x i32> <i32 3, i32 4, i32 5>, i32 6)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v4i32:
; GCN: buffer_load_dwordx4 v[0:3]
; GCN: s_waitcnt


@@ -3,6 +3,7 @@
declare float @llvm.maxnum.f32(float, float) #0
declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #0
declare <3 x float> @llvm.maxnum.v3f32(<3 x float>, <3 x float>) #0
declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #0
declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>) #0
declare <16 x float> @llvm.maxnum.v16f32(<16 x float>, <16 x float>) #0
@@ -33,6 +34,17 @@ define amdgpu_kernel void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x
ret void
}
; FUNC-LABEL: {{^}}test_fmax_v3f32:
; SI: v_max_f32_e32
; SI: v_max_f32_e32
; SI: v_max_f32_e32
; SI-NOT: v_max_f32
define amdgpu_kernel void @test_fmax_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, <3 x float> %b) nounwind {
%val = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %a, <3 x float> %b) #0
store <3 x float> %val, <3 x float> addrspace(1)* %out, align 16
ret void
}
; FUNC-LABEL: @test_fmax_v4f32
; SI: v_max_f32_e32
; SI: v_max_f32_e32
@@ -280,4 +292,14 @@ define amdgpu_kernel void @fmax_literal_var_f32(float addrspace(1)* %out, float
ret void
}
; FUNC-LABEL: {{^}}test_func_fmax_v3f32:
; SI: v_max_f32_e32
; SI: v_max_f32_e32
; SI: v_max_f32_e32
; SI-NOT: v_max_f32
define <3 x float> @test_func_fmax_v3f32(<3 x float> %a, <3 x float> %b) nounwind {
%val = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %a, <3 x float> %b) #0
ret <3 x float> %val
}
attributes #0 = { nounwind readnone }


@@ -4,6 +4,7 @@
declare float @llvm.minnum.f32(float, float) #0
declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #0
declare <3 x float> @llvm.minnum.v3f32(<3 x float>, <3 x float>) #0
declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #0
declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>) #0
declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #0
@@ -278,4 +279,14 @@ define amdgpu_kernel void @fmin_literal_var_f32(float addrspace(1)* %out, float
ret void
}
; FUNC-LABEL: {{^}}test_func_fmin_v3f32:
; SI: v_min_f32_e32
; SI: v_min_f32_e32
; SI: v_min_f32_e32
; SI-NOT: v_min_f32
define <3 x float> @test_func_fmin_v3f32(<3 x float> %a, <3 x float> %b) nounwind {
%val = call <3 x float> @llvm.minnum.v3f32(<3 x float> %a, <3 x float> %b) #0
ret <3 x float> %val
}
attributes #0 = { nounwind readnone }


@@ -739,6 +739,45 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
ret void
}
; Make sure v3 isn't a wasted register because of v3 types being promoted to v4
; GCN-LABEL: {{^}}void_func_v3f32_wasted_reg:
; GCN: s_waitcnt
; GCN: ds_write_b32 v{{[0-9]+}}, v0
; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v1
; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v2
; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v3
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @void_func_v3f32_wasted_reg(<3 x float> %arg0, i32 %arg1) #0 {
%arg0.0 = extractelement <3 x float> %arg0, i32 0
%arg0.1 = extractelement <3 x float> %arg0, i32 1
%arg0.2 = extractelement <3 x float> %arg0, i32 2
store volatile float %arg0.0, float addrspace(3)* undef
store volatile float %arg0.1, float addrspace(3)* undef
store volatile float %arg0.2, float addrspace(3)* undef
store volatile i32 %arg1, i32 addrspace(3)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v3i32_wasted_reg:
; GCN: s_waitcnt
; GCN: ds_write_b32 v{{[0-9]+}}, v0
; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v1
; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v2
; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v3
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @void_func_v3i32_wasted_reg(<3 x i32> %arg0, i32 %arg1) #0 {
%arg0.0 = extractelement <3 x i32> %arg0, i32 0
%arg0.1 = extractelement <3 x i32> %arg0, i32 1
%arg0.2 = extractelement <3 x i32> %arg0, i32 2
store volatile i32 %arg0.0, i32 addrspace(3)* undef
store volatile i32 %arg0.1, i32 addrspace(3)* undef
store volatile i32 %arg0.2, i32 addrspace(3)* undef
store volatile i32 %arg1, i32 addrspace(3)* undef
ret void
}
; Check there is no crash.
; GCN-LABEL: {{^}}void_func_v16i8:
define void @void_func_v16i8(<16 x i8> %arg0) #0 {


@@ -531,4 +531,43 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
ret { i32, <32 x i32> }%val
}
; Make sure the last struct component is returned in v3, not v4.
; GCN-LABEL: {{^}}v3i32_struct_func_void_wasted_reg:
; GCN: ds_read_b32 v0,
; GCN: ds_read_b32 v1,
; GCN: ds_read_b32 v2,
; GCN: ds_read_b32 v3,
define { <3 x i32>, i32 } @v3i32_struct_func_void_wasted_reg() #0 {
%load0 = load volatile i32, i32 addrspace(3)* undef
%load1 = load volatile i32, i32 addrspace(3)* undef
%load2 = load volatile i32, i32 addrspace(3)* undef
%load3 = load volatile i32, i32 addrspace(3)* undef
%insert.0 = insertelement <3 x i32> undef, i32 %load0, i32 0
%insert.1 = insertelement <3 x i32> %insert.0, i32 %load1, i32 1
%insert.2 = insertelement <3 x i32> %insert.1, i32 %load2, i32 2
%insert.3 = insertvalue { <3 x i32>, i32 } undef, <3 x i32> %insert.2, 0
%insert.4 = insertvalue { <3 x i32>, i32 } %insert.3, i32 %load3, 1
ret { <3 x i32>, i32 } %insert.4
}
; GCN-LABEL: {{^}}v3f32_struct_func_void_wasted_reg:
; GCN: ds_read_b32 v0,
; GCN: ds_read_b32 v1,
; GCN: ds_read_b32 v2,
; GCN: ds_read_b32 v3,
define { <3 x float>, i32 } @v3f32_struct_func_void_wasted_reg() #0 {
%load0 = load volatile float, float addrspace(3)* undef
%load1 = load volatile float, float addrspace(3)* undef
%load2 = load volatile float, float addrspace(3)* undef
%load3 = load volatile i32, i32 addrspace(3)* undef
%insert.0 = insertelement <3 x float> undef, float %load0, i32 0
%insert.1 = insertelement <3 x float> %insert.0, float %load1, i32 1
%insert.2 = insertelement <3 x float> %insert.1, float %load2, i32 2
%insert.3 = insertvalue { <3 x float>, i32 } undef, <3 x float> %insert.2, 0
%insert.4 = insertvalue { <3 x float>, i32 } %insert.3, i32 %load3, 1
ret { <3 x float>, i32 } %insert.4
}
attributes #0 = { nounwind }