[AMDGPU] Force qsads instrs to use different dest register than source registers

The V_MQSAD_PK_U16_U8, V_QSAD_PK_U16_U8, and V_MQSAD_U32_U8 take more than 1 pass in hardware. For these three instructions, the destination registers must be different than all sources, so that the first pass does not overwrite sources for the following passes. Differential Revision: https://reviews.llvm.org/D33783 llvm-svn: 304998
2017-06-08 18:21:19 +00:00 · 2017-06-08 18:21:19 +00:00 · e5c7832311
parent 1d36e833e1
commit e5c7832311
4 changed files with 73 additions and 37 deletions
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@ -209,7 +209,10 @@ def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64,
 }

 def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_msad_u8>;
+
+let Constraints = "@earlyclobber $vdst" in {
 def V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_mqsad_pk_u16_u8>;
+} // End Constraints = "@earlyclobber $vdst"

 def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUtrig_preop> {
  let SchedRW = [WriteDouble];
@ -232,8 +235,10 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>;

 let SubtargetPredicate = isCIVI in {

+let Constraints = "@earlyclobber $vdst" in {
 def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_qsad_pk_u16_u8>;
 def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32>, int_amdgcn_mqsad_u32_u8>;
+} // End Constraints = "@earlyclobber $vdst"

 let isCommutable = 1 in {
 def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
@ -4,19 +4,29 @@
 declare i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64, i32, i64) #0

 ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8:
-; GCN: v_mqsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN: v_mqsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN-DAG: v_mov_b32_e32 v5, v1
+; GCN-DAG: v_mov_b32_e32 v4, v0
 define amdgpu_kernel void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
-  %result= call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src, i32 100, i64 100) #0
-  store i64 %result, i64 addrspace(1)* %out, align 4
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR4_VGPR5},v"(i64 %src) #0
+  %tmp1 = call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %tmp, i32 100, i64 100) #0
+  %tmp2 = call i64 asm ";; force constraint", "=v,{VGPR4_VGPR5}"(i64 %tmp1) #0
+  store i64 %tmp2, i64 addrspace(1)* %out, align 4
  ret void
 }

 ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8_non_immediate:
-; GCN: v_mqsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN: v_mqsad_pk_u16_u8 v[0:1], v[2:3], v4, v[6:7]
+; GCN-DAG: v_mov_b32_e32 v3, v1
+; GCN-DAG: v_mov_b32_e32 v2, v0		
 define amdgpu_kernel void @v_mqsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) {
-  %result= call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src, i32 %a, i64 %b) #0
-  store i64 %result, i64 addrspace(1)* %out, align 4
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
+  %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
+  %tmp2 = call i64 asm "v_lshlrev_b64 $0, $1, 1", "={VGPR6_VGPR7},v"(i64 %b) #0
+  %tmp3 = call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %tmp, i32 %tmp1, i64 %tmp2) #0
+  %tmp4 = call i64 asm ";; force constraint", "=v,{VGPR2_VGPR3}"(i64 %tmp3) #0
+  store i64 %tmp4, i64 addrspace(1)* %out, align 4
  ret void
 }

-attributes #0 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll
@ -3,46 +3,57 @@

 declare <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64, i32, <4 x i32>) #0

-; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_non_inline_constant:
-; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @v_mqsad_u32_u8_use_non_inline_constant(<4 x i32> addrspace(1)* %out, i64 %src) {
-  %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 100, <4 x i32> <i32 100, i32 100, i32 100, i32 100>) #0
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
+; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_integer_immediate:
+; GCN-DAG: v_mov_b32_e32 v0, v2
+; GCN-DAG: v_mov_b32_e32 v1, v3		
+; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}]
+define amdgpu_kernel void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
+  %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
+  %tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> <i32 10, i32 20, i32 30, i32 40>) #0
+  %tmp3 = call <4 x i32>  asm ";; force constraint", "=v,{VGPR2_VGPR3_VGPR4_VGPR5}"(<4 x i32> %tmp2) #0
+  store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4
  ret void
 }

 ; GCN-LABEL: {{^}}v_mqsad_u32_u8_non_immediate:
-; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN-DAG: v_mov_b32_e32 v0, v2
+; GCN-DAG: v_mov_b32_e32 v1, v3		
+; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}]
 define amdgpu_kernel void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> %b) {
-  %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %b) #0
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
-  ret void
-}
-
-; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_integer_immediate:
-; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
-  %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> <i32 10, i32 20, i32 30, i32 40>) #0
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
+  %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
+  %tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> %b) #0
+  %tmp3 = call <4 x i32>  asm ";; force constraint", "=v,{VGPR2_VGPR3_VGPR4_VGPR5}"(<4 x i32> %tmp2) #0
+  store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4
  ret void
 }

 ; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_fp_immediate:
-; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN-DAG: v_mov_b32_e32 v0, v2
+; GCN-DAG: v_mov_b32_e32 v1, v3		
+; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}]
 define amdgpu_kernel void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
-  %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> <i32 1065353216, i32 0, i32 0, i32 0>) #0
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
+  %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
+  %tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> <i32 1065353216, i32 0, i32 0, i32 0>) #0
+  %tmp3 = call <4 x i32>  asm ";; force constraint", "=v,{VGPR2_VGPR3_VGPR4_VGPR5}"(<4 x i32> %tmp2) #0
+  store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4
  ret void
 }

 ; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_sgpr_vgpr:
-; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN-DAG: v_mov_b32_e32 v0, v2
+; GCN-DAG: v_mov_b32_e32 v1, v3		
+; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}]
 define amdgpu_kernel void @v_mqsad_u32_u8_use_sgpr_vgpr(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> addrspace(1)* %input) {
  %in = load <4 x i32>, <4 x i32> addrspace(1) * %input
-
-  %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %in) #0
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
+  %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
+  %tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> %in) #0
+  %tmp3 = call <4 x i32>  asm ";; force constraint", "=v,{VGPR2_VGPR3_VGPR4_VGPR5}"(<4 x i32> %tmp2) #0
+  store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4
  ret void
 }

-attributes #0 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
@ -4,18 +4,28 @@
 declare i64 @llvm.amdgcn.qsad.pk.u16.u8(i64, i32, i64) #0

 ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8:
-; GCN: v_qsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN: v_qsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN-DAG: v_mov_b32_e32 v5, v1
+; GCN-DAG: v_mov_b32_e32 v4, v0
 define amdgpu_kernel void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
-  %result= call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src, i32 100, i64 100) #0
-  store i64 %result, i64 addrspace(1)* %out, align 4
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR4_VGPR5},v"(i64 %src) #0
+  %tmp1 = call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %tmp, i32 100, i64 100) #0
+  %tmp2 = call i64 asm ";; force constraint", "=v,{VGPR4_VGPR5}"(i64 %tmp1) #0
+  store i64 %tmp2, i64 addrspace(1)* %out, align 4
  ret void
 }

 ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8_non_immediate:
-; GCN: v_qsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN: v_qsad_pk_u16_u8 v[0:1], v[2:3], v4, v[6:7]
+; GCN-DAG: v_mov_b32_e32 v3, v1
+; GCN-DAG: v_mov_b32_e32 v2, v0
 define amdgpu_kernel void @v_qsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) {
-  %result= call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src, i32 %a, i64 %b) #0
-  store i64 %result, i64 addrspace(1)* %out, align 4
+  %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={VGPR2_VGPR3},v"(i64 %src) #0
+  %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={VGPR4},v"(i32 %a) #0
+  %tmp2 = call i64 asm "v_lshlrev_b64 $0, $1, 1", "={VGPR6_VGPR7},v"(i64 %b) #0
+  %tmp3 = call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %tmp, i32 %tmp1, i64 %tmp2) #0
+  %tmp4 = call i64 asm ";; force constraint", "=v,{VGPR2_VGPR3}"(i64 %tmp3) #0
+  store i64 %tmp4, i64 addrspace(1)* %out, align 4
  ret void
 }