forked from OSchip/llvm-project
[AMDGPU] Force qsads instrs to use different dest register than source registers
The V_MQSAD_PK_U16_U8, V_QSAD_PK_U16_U8, and V_MQSAD_U32_U8 take more than 1 pass in hardware. For these three instructions, the destination registers must be different than all sources, so that the first pass does not overwrite sources for the following passes. Differential Revision: https://reviews.llvm.org/D33783 llvm-svn: 304998
This commit is contained in:
parent
1d36e833e1
commit
e5c7832311
|
@ -209,7 +209,10 @@ def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64,
|
|||
}
|
||||
|
||||
def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_msad_u8>;
|
||||
|
||||
let Constraints = "@earlyclobber $vdst" in {
|
||||
def V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_mqsad_pk_u16_u8>;
|
||||
} // End Constraints = "@earlyclobber $vdst"
|
||||
|
||||
def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUtrig_preop> {
|
||||
let SchedRW = [WriteDouble];
|
||||
|
@ -232,8 +235,10 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>;
|
|||
|
||||
let SubtargetPredicate = isCIVI in {
|
||||
|
||||
let Constraints = "@earlyclobber $vdst" in {
|
||||
def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_qsad_pk_u16_u8>;
|
||||
def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32>, int_amdgcn_mqsad_u32_u8>;
|
||||
} // End Constraints = "@earlyclobber $vdst"
|
||||
|
||||
let isCommutable = 1 in {
|
||||
def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
|
||||
|
|
|
@ -4,19 +4,29 @@
|
|||
declare i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64, i32, i64) #0
|
||||
|
||||
; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8:
|
||||
; GCN: v_mqsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
; GCN: v_mqsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
; GCN-DAG: v_mov_b32_e32 v5, v1
|
||||
; GCN-DAG: v_mov_b32_e32 v4, v0
|
||||
define amdgpu_kernel void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
|
||||
%result= call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src, i32 100, i64 100) #0
|
||||
store i64 %result, i64 addrspace(1)* %out, align 4
|
||||
%tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR4_VGPR5},v"(i64 %src) #0
|
||||
%tmp1 = call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %tmp, i32 100, i64 100) #0
|
||||
%tmp2 = call i64 asm ";; force constraint", "=v,{VGPR4_VGPR5}"(i64 %tmp1) #0
|
||||
store i64 %tmp2, i64 addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8_non_immediate:
|
||||
; GCN: v_mqsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
; GCN: v_mqsad_pk_u16_u8 v[0:1], v[2:3], v4, v[6:7]
|
||||
; GCN-DAG: v_mov_b32_e32 v3, v1
|
||||
; GCN-DAG: v_mov_b32_e32 v2, v0
|
||||
define amdgpu_kernel void @v_mqsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) {
|
||||
%result= call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src, i32 %a, i64 %b) #0
|
||||
store i64 %result, i64 addrspace(1)* %out, align 4
|
||||
%tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
|
||||
%tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
|
||||
%tmp2 = call i64 asm "v_lshlrev_b64 $0, $1, 1", "={VGPR6_VGPR7},v"(i64 %b) #0
|
||||
%tmp3 = call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %tmp, i32 %tmp1, i64 %tmp2) #0
|
||||
%tmp4 = call i64 asm ";; force constraint", "=v,{VGPR2_VGPR3}"(i64 %tmp3) #0
|
||||
store i64 %tmp4, i64 addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
attributes #0 = { nounwind readnone }
|
|
@ -3,46 +3,57 @@
|
|||
|
||||
declare <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64, i32, <4 x i32>) #0
|
||||
|
||||
; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_non_inline_constant:
|
||||
; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
define amdgpu_kernel void @v_mqsad_u32_u8_use_non_inline_constant(<4 x i32> addrspace(1)* %out, i64 %src) {
|
||||
%result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 100, <4 x i32> <i32 100, i32 100, i32 100, i32 100>) #0
|
||||
store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
|
||||
; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_integer_immediate:
|
||||
; GCN-DAG: v_mov_b32_e32 v0, v2
|
||||
; GCN-DAG: v_mov_b32_e32 v1, v3
|
||||
; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}]
|
||||
define amdgpu_kernel void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
|
||||
%tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
|
||||
%tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
|
||||
%tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> <i32 10, i32 20, i32 30, i32 40>) #0
|
||||
%tmp3 = call <4 x i32> asm ";; force constraint", "=v,{VGPR2_VGPR3_VGPR4_VGPR5}"(<4 x i32> %tmp2) #0
|
||||
store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_mqsad_u32_u8_non_immediate:
|
||||
; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
; GCN-DAG: v_mov_b32_e32 v0, v2
|
||||
; GCN-DAG: v_mov_b32_e32 v1, v3
|
||||
; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}]
|
||||
define amdgpu_kernel void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> %b) {
|
||||
%result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %b) #0
|
||||
store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_integer_immediate:
|
||||
; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
define amdgpu_kernel void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
|
||||
%result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> <i32 10, i32 20, i32 30, i32 40>) #0
|
||||
store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
|
||||
%tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
|
||||
%tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
|
||||
%tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> %b) #0
|
||||
%tmp3 = call <4 x i32> asm ";; force constraint", "=v,{VGPR2_VGPR3_VGPR4_VGPR5}"(<4 x i32> %tmp2) #0
|
||||
store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_fp_immediate:
|
||||
; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
; GCN-DAG: v_mov_b32_e32 v0, v2
|
||||
; GCN-DAG: v_mov_b32_e32 v1, v3
|
||||
; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}]
|
||||
define amdgpu_kernel void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
|
||||
%result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> <i32 1065353216, i32 0, i32 0, i32 0>) #0
|
||||
store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
|
||||
%tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
|
||||
%tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
|
||||
%tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> <i32 1065353216, i32 0, i32 0, i32 0>) #0
|
||||
%tmp3 = call <4 x i32> asm ";; force constraint", "=v,{VGPR2_VGPR3_VGPR4_VGPR5}"(<4 x i32> %tmp2) #0
|
||||
store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_sgpr_vgpr:
|
||||
; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
; GCN-DAG: v_mov_b32_e32 v0, v2
|
||||
; GCN-DAG: v_mov_b32_e32 v1, v3
|
||||
; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}]
|
||||
define amdgpu_kernel void @v_mqsad_u32_u8_use_sgpr_vgpr(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> addrspace(1)* %input) {
|
||||
%in = load <4 x i32>, <4 x i32> addrspace(1) * %input
|
||||
|
||||
%result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %in) #0
|
||||
store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
|
||||
%tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
|
||||
%tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
|
||||
%tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> %in) #0
|
||||
%tmp3 = call <4 x i32> asm ";; force constraint", "=v,{VGPR2_VGPR3_VGPR4_VGPR5}"(<4 x i32> %tmp2) #0
|
||||
store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
attributes #0 = { nounwind readnone }
|
|
@ -4,18 +4,28 @@
|
|||
declare i64 @llvm.amdgcn.qsad.pk.u16.u8(i64, i32, i64) #0
|
||||
|
||||
; GCN-LABEL: {{^}}v_qsad_pk_u16_u8:
|
||||
; GCN: v_qsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
; GCN: v_qsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
; GCN-DAG: v_mov_b32_e32 v5, v1
|
||||
; GCN-DAG: v_mov_b32_e32 v4, v0
|
||||
define amdgpu_kernel void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
|
||||
%result= call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src, i32 100, i64 100) #0
|
||||
store i64 %result, i64 addrspace(1)* %out, align 4
|
||||
%tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR4_VGPR5},v"(i64 %src) #0
|
||||
%tmp1 = call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %tmp, i32 100, i64 100) #0
|
||||
%tmp2 = call i64 asm ";; force constraint", "=v,{VGPR4_VGPR5}"(i64 %tmp1) #0
|
||||
store i64 %tmp2, i64 addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_qsad_pk_u16_u8_non_immediate:
|
||||
; GCN: v_qsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
; GCN: v_qsad_pk_u16_u8 v[0:1], v[2:3], v4, v[6:7]
|
||||
; GCN-DAG: v_mov_b32_e32 v3, v1
|
||||
; GCN-DAG: v_mov_b32_e32 v2, v0
|
||||
define amdgpu_kernel void @v_qsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) {
|
||||
%result= call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src, i32 %a, i64 %b) #0
|
||||
store i64 %result, i64 addrspace(1)* %out, align 4
|
||||
%tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
|
||||
%tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
|
||||
%tmp2 = call i64 asm "v_lshlrev_b64 $0, $1, 1", "={VGPR6_VGPR7},v"(i64 %b) #0
|
||||
%tmp3 = call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %tmp, i32 %tmp1, i64 %tmp2) #0
|
||||
%tmp4 = call i64 asm ";; force constraint", "=v,{VGPR2_VGPR3}"(i64 %tmp3) #0
|
||||
store i64 %tmp4, i64 addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue