From 40a142fa57d648e3daadfdaa75731360e1ebab2e Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Fri, 31 Jul 2020 13:48:58 -0400
Subject: [PATCH] AMDGPU/GlobalISel: Match andn2/orn2 for more types

Unfortunately this ends up not working as expected on targets with
16-bit operations due to AMDGPUCodeGenPrepare's promotion of uniform
16-bit ops to i32.

The vector case annoyingly requires switching the checked opcode,
since constants for vectors aren't directly handled.

I also need to think more carefully about whether this is valid for i1.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.td         |  4 ++++
 llvm/lib/Target/AMDGPU/SOPInstructions.td     | 19 +++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll  | 13 +++++--------
 llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll   | 13 +++++--------
 llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll |  8 ++++----
 5 files changed, 37 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 618b0a142ee9..d5acd79760f3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1510,6 +1510,10 @@ class getVOPSrc0ForVT<ValueType VT> {
   );
 }
 
+class getSOPSrcForVT<ValueType VT> {
+  RegisterOperand ret = !if(!eq(VT.Size, 64), SSrc_b64, SSrc_b32);
+}
+
 // Returns the vreg register class to use for source operand given VT
 class getVregSrcForVT<ValueType VT> {
   RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128,
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index db8f3c9185c9..df2e18fd4414 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -541,6 +541,7 @@ def S_NOR_B64 : SOP2_64 <"s_nor_b64",
 >;
 } // End isCommutable = 1
 
+// There are also separate patterns for types other than i32
 def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32",
   [(set i32:$sdst, (UniformBinFrag<and> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
 >;
@@ -1330,6 +1331,24 @@ def : GCNPat<
   (S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src)
 >;
 
+// FIXME: ValueType should have isVector field
+class ScalarNot2Pat<Instruction inst, SDPatternOperator op, ValueType vt,
+                    bit isVector = 1> : GCNPat<
+  (UniformBinFrag<op> vt:$src0, (UniformUnaryFrag<!if(isVector, vnot, not)> vt:$src1)),
+  (inst getSOPSrcForVT<vt>.ret:$src0, getSOPSrcForVT<vt>.ret:$src1)
+>;
+
+// Match these for some more types
+// TODO: i1
+def : ScalarNot2Pat<S_ANDN2_B32, and, i16, 0>;
+def : ScalarNot2Pat<S_ANDN2_B32, and, v2i16>;
+def : ScalarNot2Pat<S_ANDN2_B64, and, v4i16>;
+def : ScalarNot2Pat<S_ANDN2_B64, and, v2i32>;
+
+def : ScalarNot2Pat<S_ORN2_B32, or, i16, 0>;
+def : ScalarNot2Pat<S_ORN2_B32, or, v2i16>;
+def : ScalarNot2Pat<S_ORN2_B64, or, v4i16>;
+def : ScalarNot2Pat<S_ORN2_B64, or, v2i32>;
 
 //===----------------------------------------------------------------------===//
 // Target-specific instruction encodings.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
index 7cc18766de98..e183ee3ed7c4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -204,8 +204,7 @@ define amdgpu_ps <2 x i32> @s_andn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i3
 define amdgpu_ps i16 @s_andn2_i16(i16 inreg %src0, i16 inreg %src1) {
 ; GFX6-LABEL: s_andn2_i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_xor_b32 s0, s3, -1
-; GFX6-NEXT:    s_and_b32 s0, s2, s0
+; GFX6-NEXT:    s_andn2_b32 s0, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_andn2_i16:
@@ -224,8 +223,7 @@ define amdgpu_ps i16 @s_andn2_i16(i16 inreg %src0, i16 inreg %src1) {
 define amdgpu_ps i16 @s_andn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
 ; GFX6-LABEL: s_andn2_i16_commute:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_xor_b32 s0, s3, -1
-; GFX6-NEXT:    s_and_b32 s0, s0, s2
+; GFX6-NEXT:    s_andn2_b32 s0, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_andn2_i16_commute:
@@ -245,7 +243,7 @@ define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg
 ; GFX6-LABEL: s_andn2_i16_multi_use:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_xor_b32 s1, s3, -1
-; GFX6-NEXT:    s_and_b32 s0, s2, s1
+; GFX6-NEXT:    s_andn2_b32 s0, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_andn2_i16_multi_use:
@@ -266,9 +264,8 @@ define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg
 define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
 ; GFX6-LABEL: s_andn2_i16_multi_foldable_use:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_xor_b32 s1, s4, -1
-; GFX6-NEXT:    s_and_b32 s0, s2, s1
-; GFX6-NEXT:    s_and_b32 s1, s3, s1
+; GFX6-NEXT:    s_andn2_b32 s0, s2, s4
+; GFX6-NEXT:    s_andn2_b32 s1, s3, s4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_andn2_i16_multi_foldable_use:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
index 0f451e43b119..c6c8febff9cc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
@@ -204,8 +204,7 @@ define amdgpu_ps <2 x i32> @s_orn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i32
 define amdgpu_ps i16 @s_orn2_i16(i16 inreg %src0, i16 inreg %src1) {
 ; GFX6-LABEL: s_orn2_i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_xor_b32 s0, s3, -1
-; GFX6-NEXT:    s_or_b32 s0, s2, s0
+; GFX6-NEXT:    s_orn2_b32 s0, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_orn2_i16:
@@ -224,8 +223,7 @@ define amdgpu_ps i16 @s_orn2_i16(i16 inreg %src0, i16 inreg %src1) {
 define amdgpu_ps i16 @s_orn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
 ; GFX6-LABEL: s_orn2_i16_commute:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_xor_b32 s0, s3, -1
-; GFX6-NEXT:    s_or_b32 s0, s0, s2
+; GFX6-NEXT:    s_orn2_b32 s0, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_orn2_i16_commute:
@@ -245,7 +243,7 @@ define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %
 ; GFX6-LABEL: s_orn2_i16_multi_use:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_xor_b32 s1, s3, -1
-; GFX6-NEXT:    s_or_b32 s0, s2, s1
+; GFX6-NEXT:    s_orn2_b32 s0, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_orn2_i16_multi_use:
@@ -266,9 +264,8 @@ define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %
 define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
 ; GFX6-LABEL: s_orn2_i16_multi_foldable_use:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_xor_b32 s1, s4, -1
-; GFX6-NEXT:    s_or_b32 s0, s2, s1
-; GFX6-NEXT:    s_or_b32 s1, s3, s1
+; GFX6-NEXT:    s_orn2_b32 s0, s2, s4
+; GFX6-NEXT:    s_orn2_b32 s1, s3, s4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_orn2_i16_multi_foldable_use:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index e5e67b1022d6..9b525585d876 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -969,10 +969,10 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %ou
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshl_b32 s4, s4, 3
 ; VI-NEXT:    v_lshlrev_b16_e64 v0, s4, -1
-; VI-NEXT:    v_and_b32_e32 v1, 0x505, v0
-; VI-NEXT:    v_xor_b32_e32 v0, -1, v0
-; VI-NEXT:    v_and_b32_e32 v0, s6, v0
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    v_not_b32_e32 v1, v0
+; VI-NEXT:    v_and_b32_e32 v1, s6, v1
+; VI-NEXT:    v_and_b32_e32 v0, 0x505, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
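
Illustrative note (not part of the patch above): a minimal IR sketch of the shape the new ScalarNot2Pat patterns are intended to match, written in the style of the existing andn2.ll tests; the function name is made up. For vector types the "not" side shows up as an xor with an all-ones splat, which is why the pattern has to check vnot instead of not, as the commit message says. A uniform v2i32 case like this should now select to a single s_andn2_b64 rather than separate not/and instructions:

; Hypothetical test function, name illustrative only.
define amdgpu_ps <2 x i32> @sample_andn2_v2i32(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
  ; xor with an all-ones splat: the vnot form the vector patterns check
  %not.src1 = xor <2 x i32> %src1, <i32 -1, i32 -1>
  ; uniform and-with-not; should fold to s_andn2_b64 under the new patterns
  %and = and <2 x i32> %src0, %not.src1
  ret <2 x i32> %and
}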