AMDGPU/GlobalISel: Match andn2/orn2 for more types

Unfortunately this ends up not working as expected on targets with 16-bit operations due to AMDGPUCodeGenPrepare's promotion of uniform 16-bit ops to i32. The vector case annoyingly requires switching the checked opcode, since constants for vectors aren't directly handled. I also need to think more carefully about whether this is valid for i1.
2020-07-31 13:48:58 -04:00 · 2020-07-31 13:48:58 -04:00 · 40a142fa57
parent b6db0a544d
commit 40a142fa57
5 changed files with 37 additions and 20 deletions
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@ -1510,6 +1510,10 @@ class getVOPSrc0ForVT<ValueType VT> {
    );
 }

+class getSOPSrcForVT<ValueType VT> {
+  RegisterOperand ret = !if(!eq(VT.Size, 64), SSrc_b64, SSrc_b32);
+}
+
 // Returns the vreg register class to use for source operand given VT
 class getVregSrcForVT<ValueType VT> {
  RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128,
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@ -541,6 +541,7 @@ def S_NOR_B64 : SOP2_64 <"s_nor_b64",
 >;
 } // End isCommutable = 1

+// There are also separate patterns for types other than i32
 def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32",
  [(set i32:$sdst, (UniformBinFrag<and> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
 >;
@ -1330,6 +1331,24 @@ def : GCNPat<
  (S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src)
 >;

+// FIXME: ValueType should have isVector field
+class ScalarNot2Pat<Instruction inst, SDPatternOperator op, ValueType vt,
+                    bit isVector = 1> : GCNPat<
+  (UniformBinFrag<op> vt:$src0, (UniformUnaryFrag<!if(isVector, vnot, not)> vt:$src1)),
+  (inst getSOPSrcForVT<vt>.ret:$src0, getSOPSrcForVT<vt>.ret:$src1)
+>;
+
+// Match these for some more types
+// TODO: i1
+def : ScalarNot2Pat<S_ANDN2_B32, and, i16, 0>;
+def : ScalarNot2Pat<S_ANDN2_B32, and, v2i16>;
+def : ScalarNot2Pat<S_ANDN2_B64, and, v4i16>;
+def : ScalarNot2Pat<S_ANDN2_B64, and, v2i32>;
+
+def : ScalarNot2Pat<S_ORN2_B32, or, i16, 0>;
+def : ScalarNot2Pat<S_ORN2_B32, or, v2i16>;
+def : ScalarNot2Pat<S_ORN2_B64, or, v4i16>;
+def : ScalarNot2Pat<S_ORN2_B64, or, v2i32>;

 //===----------------------------------------------------------------------===//
 // Target-specific instruction encodings.
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@ -204,8 +204,7 @@ define amdgpu_ps <2 x i32> @s_andn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i3
 define amdgpu_ps i16 @s_andn2_i16(i16 inreg %src0, i16 inreg %src1) {
 ; GFX6-LABEL: s_andn2_i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_xor_b32 s0, s3, -1
-; GFX6-NEXT:    s_and_b32 s0, s2, s0
+; GFX6-NEXT:    s_andn2_b32 s0, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_andn2_i16:
@ -224,8 +223,7 @@ define amdgpu_ps i16 @s_andn2_i16(i16 inreg %src0, i16 inreg %src1) {
 define amdgpu_ps i16 @s_andn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
 ; GFX6-LABEL: s_andn2_i16_commute:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_xor_b32 s0, s3, -1
-; GFX6-NEXT:    s_and_b32 s0, s0, s2
+; GFX6-NEXT:    s_andn2_b32 s0, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_andn2_i16_commute:
@ -245,7 +243,7 @@ define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg
 ; GFX6-LABEL: s_andn2_i16_multi_use:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_xor_b32 s1, s3, -1
-; GFX6-NEXT:    s_and_b32 s0, s2, s1
+; GFX6-NEXT:    s_andn2_b32 s0, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_andn2_i16_multi_use:
@ -266,9 +264,8 @@ define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg
 define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
 ; GFX6-LABEL: s_andn2_i16_multi_foldable_use:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_xor_b32 s1, s4, -1
-; GFX6-NEXT:    s_and_b32 s0, s2, s1
-; GFX6-NEXT:    s_and_b32 s1, s3, s1
+; GFX6-NEXT:    s_andn2_b32 s0, s2, s4
+; GFX6-NEXT:    s_andn2_b32 s1, s3, s4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_andn2_i16_multi_foldable_use:
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
@ -204,8 +204,7 @@ define amdgpu_ps <2 x i32> @s_orn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i32
 define amdgpu_ps i16 @s_orn2_i16(i16 inreg %src0, i16 inreg %src1) {
 ; GFX6-LABEL: s_orn2_i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_xor_b32 s0, s3, -1
-; GFX6-NEXT:    s_or_b32 s0, s2, s0
+; GFX6-NEXT:    s_orn2_b32 s0, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_orn2_i16:
@ -224,8 +223,7 @@ define amdgpu_ps i16 @s_orn2_i16(i16 inreg %src0, i16 inreg %src1) {
 define amdgpu_ps i16 @s_orn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
 ; GFX6-LABEL: s_orn2_i16_commute:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_xor_b32 s0, s3, -1
-; GFX6-NEXT:    s_or_b32 s0, s0, s2
+; GFX6-NEXT:    s_orn2_b32 s0, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_orn2_i16_commute:
@ -245,7 +243,7 @@ define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %
 ; GFX6-LABEL: s_orn2_i16_multi_use:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_xor_b32 s1, s3, -1
-; GFX6-NEXT:    s_or_b32 s0, s2, s1
+; GFX6-NEXT:    s_orn2_b32 s0, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_orn2_i16_multi_use:
@ -266,9 +264,8 @@ define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %
 define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
 ; GFX6-LABEL: s_orn2_i16_multi_foldable_use:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_xor_b32 s1, s4, -1
-; GFX6-NEXT:    s_or_b32 s0, s2, s1
-; GFX6-NEXT:    s_or_b32 s1, s3, s1
+; GFX6-NEXT:    s_orn2_b32 s0, s2, s4
+; GFX6-NEXT:    s_orn2_b32 s1, s3, s4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_orn2_i16_multi_foldable_use:
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@ -969,10 +969,10 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %ou
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshl_b32 s4, s4, 3
 ; VI-NEXT:    v_lshlrev_b16_e64 v0, s4, -1
-; VI-NEXT:    v_and_b32_e32 v1, 0x505, v0
-; VI-NEXT:    v_xor_b32_e32 v0, -1, v0
-; VI-NEXT:    v_and_b32_e32 v0, s6, v0
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    v_not_b32_e32 v1, v0
+; VI-NEXT:    v_and_b32_e32 v1, s6, v1
+; VI-NEXT:    v_and_b32_e32 v0, 0x505, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x i8> %a, i8 5, i32 %b