AMDGPU: Undo sub x, c -> add x, -c canonicalization

The canonical add form is worse when the original constant c is an inline immediate but -c is not, since -c then has to be emitted as a literal constant. This should also be done for 64-bit adds, but that requires fixing operand folding bugs first. llvm-svn: 293540
2017-01-30 19:30:24 +00:00 · 2017-01-30 19:30:24 +00:00 · af635240d5
parent 8432161f1d
commit af635240d5
6 changed files with 227 additions and 2 deletions
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -295,6 +295,19 @@ class VGPRImm <dag frag> : PatLeaf<frag, [{
  return Limit < 10;
 }]>;

+def NegateImm : SDNodeXForm<imm, [{
+  return CurDAG->getConstant(-N->getSExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+// TODO: When FP inline imm values work?
+def NegSubInlineConst32 : ImmLeaf<i32, [{
+  return Imm < -16 && Imm >= -64;
+}], NegateImm>;
+
+def NegSubInlineConst16 : ImmLeaf<i16, [{
+  return Imm < -16 && Imm >= -64;
+}], NegateImm>;
+
 //===----------------------------------------------------------------------===//
 // Custom Operands
 //===----------------------------------------------------------------------===//
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1125,6 +1125,15 @@ def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
 def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
 def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;

+
+// Undo sub x, c -> add x, -c canonicalization since c is more likely
+// an inline immediate than -c.
+// TODO: Also do for 64-bit.
+def : Pat<
+  (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
+  (S_SUB_I32 $src0, NegSubInlineConst32:$src1)
+>;
+
 //============================================================================//
 // Assembler aliases
 //============================================================================//
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -494,6 +494,14 @@ def : Pat <
  (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)
 >;

+// Undo sub x, c -> add x, -c canonicalization since c is more likely
+// an inline immediate than -c.
+// TODO: Also do for 64-bit.
+def : Pat<
+  (add i16:$src0, (i16 NegSubInlineConst16:$src1)),
+  (V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1)
+>;
+
 } // End Predicates = [isVI]

 //===----------------------------------------------------------------------===//
--- a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
@@ -37,7 +37,7 @@ define void @s_addk_i32_k1(i32 addrspace(1)* %out, i32 %b) {
 }

 ; SI-LABEL: {{^}}s_addk_i32_k2:
-; SI: s_addk_i32 {{s[0-9]+}}, 0xffef{{$}}
+; SI: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, 17
 ; SI: s_endpgm
 define void @s_addk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
  %add = add i32 %b, -17
@@ -45,6 +45,15 @@ define void @s_addk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
  ret void
 }

+; SI-LABEL: {{^}}s_addk_i32_k3:
+; SI: s_addk_i32 {{s[0-9]+}}, 0xffbf{{$}}
+; SI: s_endpgm
+define void @s_addk_i32_k3(i32 addrspace(1)* %out, i32 %b) {
+  %add = add i32 %b, -65
+  store i32 %add, i32 addrspace(1)* %out
+  ret void
+}
+
 ; SI-LABEL: {{^}}s_addk_v2i32_k0:
 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41
 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -0,0 +1,186 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; Test that add/sub with a constant is swapped to sub/add with the negated
+; constant when only the negated constant is an inline immediate, to minimize code size.
+
+; GCN-LABEL: {{^}}v_test_i32_x_sub_64:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
+define void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 %x, 64
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_x_sub_64_multi_use:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[Y:v[0-9]+]]
+; GCN-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
+; GCN-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[Y]]
+define void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load volatile i32, i32 addrspace(1)* %gep
+  %y = load volatile i32, i32 addrspace(1)* %gep
+  %result0 = sub i32 %x, 64
+  %result1 = sub i32 %y, 64
+  store volatile i32 %result0, i32 addrspace(1)* %gep.out
+  store volatile i32 %result1, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_64_sub_x:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
+define void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 64, %x
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_x_sub_65:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 0xffffffbf, [[X]]
+define void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 %x, 65
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_65_sub_x:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0x41, [[X]]
+define void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 65, %x
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_x_sub_neg16:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 16, [[X]]
+define void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 %x, -16
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_neg16_sub_x:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, -16, [[X]]
+define void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 -16, %x
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_x_sub_neg17:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 17, [[X]]
+define void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 %x, -17
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_neg17_sub_x:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0xffffffef, [[X]]
+define void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 -17, %x
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_i32_x_sub_64:
+; GCN: s_load_dword [[X:s[0-9]+]]
+; GCN: s_sub_i32 s{{[0-9]+}}, [[X]], 64
+define void @s_test_i32_x_sub_64(i32 %x) #0 {
+  %result = sub i32 %x, 64
+  call void asm sideeffect "; use $0", "s"(i32 %result)
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i16_x_sub_64:
+; VI: {{buffer|flat}}_load_ushort [[X:v[0-9]+]]
+; VI: v_subrev_u16_e32 v{{[0-9]+}}, 64, [[X]]
+define void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
+  %x = load i16, i16 addrspace(1)* %gep
+  %result = sub i16 %x, 64
+  store i16 %result, i16 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i16_x_sub_64_multi_use:
+; GCN: {{buffer|flat}}_load_ushort [[X:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort [[Y:v[0-9]+]]
+; VI-DAG: v_subrev_u16_e32 v{{[0-9]+}}, 64, [[X]]
+; VI-DAG: v_subrev_u16_e32 v{{[0-9]+}}, 64, [[Y]]
+
+; SI-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
+; SI-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[Y]]
+define void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
+  %x = load volatile i16, i16 addrspace(1)* %gep
+  %y = load volatile i16, i16 addrspace(1)* %gep
+  %result0 = sub i16 %x, 64
+  %result1 = sub i16 %y, 64
+  store volatile i16 %result0, i16 addrspace(1)* %gep.out
+  store volatile i16 %result1, i16 addrspace(1)* %gep.out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
--- a/llvm/test/CodeGen/AMDGPU/sub.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.i16.ll
@@ -52,7 +52,7 @@ define void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_sub_i16_inline_63:
 ; VI: flat_load_ushort [[A:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffffc1, [[A]]
+; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], 63, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
 define void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()