AMDGPU/GlobalISel: Match andn2/orn2 for more types

Unfortunately this ends up not working as expected on targets with
16-bit operations due to AMDGPUCodeGenPrepare's promotion of uniform
16-bit ops to i32.

The vector case annoyingly requires switching the checked opcode
(matching vnot instead of not), since constants for vectors aren't
directly handled.

I also need to think more carefully about whether this is valid for i1.
This commit is contained in:
Matt Arsenault 2020-07-31 13:48:58 -04:00 committed by Matt Arsenault
parent b6db0a544d
commit 40a142fa57
5 changed files with 37 additions and 20 deletions
llvm
lib/Target/AMDGPU
test/CodeGen/AMDGPU

View File

@ -1510,6 +1510,10 @@ class getVOPSrc0ForVT<ValueType VT> {
);
}
// Returns the register operand to use for a scalar (SOP) source operand of
// type VT: SSrc_b64 when VT is exactly 64 bits wide, SSrc_b32 for every
// other size.
class getSOPSrcForVT<ValueType VT> {
RegisterOperand ret = !if(!eq(VT.Size, 64), SSrc_b64, SSrc_b32);
}
// Returns the vreg register class to use for source operand given VT
class getVregSrcForVT<ValueType VT> {
RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128,

View File

@ -541,6 +541,7 @@ def S_NOR_B64 : SOP2_64 <"s_nor_b64",
>;
} // End isCommutable = 1
// There are also separate patterns for types other than i32 (see the
// ScalarNot2Pat instantiations).
def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32",
[(set i32:$sdst, (UniformBinFrag<and> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
>;
@ -1330,6 +1331,24 @@ def : GCNPat<
(S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src)
>;
// Selection pattern that folds a uniform binary op whose second operand is
// inverted, i.e. (op $src0, (not $src1)), into the single instruction `inst`.
//
//   inst     - instruction to emit; it receives $src0 and $src1 directly.
//   op       - the binary operator being matched (wrapped in UniformBinFrag).
//   vt       - value type of both operands; also picks the source operand
//              class through getSOPSrcForVT.
//   isVector - when set (the default), the inversion is matched with `vnot`;
//              when clear, with `not`. Callers must pass 0 for scalar types.
//
// FIXME: ValueType should have isVector field
class ScalarNot2Pat<Instruction inst, SDPatternOperator op, ValueType vt,
bit isVector = 1> : GCNPat<
(UniformBinFrag<op> vt:$src0, (UniformUnaryFrag<!if(isVector, vnot, not)> vt:$src1)),
(inst getSOPSrcForVT<vt>.ret:$src0, getSOPSrcForVT<vt>.ret:$src1)
>;
// Match andn2/orn2 for some more types. The scalar i16 cases pass
// isVector = 0 so the inversion is matched with `not`; the vector cases rely
// on the default and match `vnot`. i16 and v2i16 use the 32-bit instructions,
// v4i16 and v2i32 the 64-bit ones.
// TODO: i1
def : ScalarNot2Pat<S_ANDN2_B32, and, i16, 0>;
def : ScalarNot2Pat<S_ANDN2_B32, and, v2i16>;
def : ScalarNot2Pat<S_ANDN2_B64, and, v4i16>;
def : ScalarNot2Pat<S_ANDN2_B64, and, v2i32>;
def : ScalarNot2Pat<S_ORN2_B32, or, i16, 0>;
def : ScalarNot2Pat<S_ORN2_B32, or, v2i16>;
def : ScalarNot2Pat<S_ORN2_B64, or, v4i16>;
def : ScalarNot2Pat<S_ORN2_B64, or, v2i32>;
//===----------------------------------------------------------------------===//
// Target-specific instruction encodings.

View File

@ -204,8 +204,7 @@ define amdgpu_ps <2 x i32> @s_andn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i3
define amdgpu_ps i16 @s_andn2_i16(i16 inreg %src0, i16 inreg %src1) {
; GFX6-LABEL: s_andn2_i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_xor_b32 s0, s3, -1
; GFX6-NEXT: s_and_b32 s0, s2, s0
; GFX6-NEXT: s_andn2_b32 s0, s2, s3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_i16:
@ -224,8 +223,7 @@ define amdgpu_ps i16 @s_andn2_i16(i16 inreg %src0, i16 inreg %src1) {
define amdgpu_ps i16 @s_andn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
; GFX6-LABEL: s_andn2_i16_commute:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_xor_b32 s0, s3, -1
; GFX6-NEXT: s_and_b32 s0, s0, s2
; GFX6-NEXT: s_andn2_b32 s0, s2, s3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_i16_commute:
@ -245,7 +243,7 @@ define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg
; GFX6-LABEL: s_andn2_i16_multi_use:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_xor_b32 s1, s3, -1
; GFX6-NEXT: s_and_b32 s0, s2, s1
; GFX6-NEXT: s_andn2_b32 s0, s2, s3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_i16_multi_use:
@ -266,9 +264,8 @@ define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg
define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
; GFX6-LABEL: s_andn2_i16_multi_foldable_use:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_xor_b32 s1, s4, -1
; GFX6-NEXT: s_and_b32 s0, s2, s1
; GFX6-NEXT: s_and_b32 s1, s3, s1
; GFX6-NEXT: s_andn2_b32 s0, s2, s4
; GFX6-NEXT: s_andn2_b32 s1, s3, s4
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_i16_multi_foldable_use:

View File

@ -204,8 +204,7 @@ define amdgpu_ps <2 x i32> @s_orn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i32
define amdgpu_ps i16 @s_orn2_i16(i16 inreg %src0, i16 inreg %src1) {
; GFX6-LABEL: s_orn2_i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_xor_b32 s0, s3, -1
; GFX6-NEXT: s_or_b32 s0, s2, s0
; GFX6-NEXT: s_orn2_b32 s0, s2, s3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_i16:
@ -224,8 +223,7 @@ define amdgpu_ps i16 @s_orn2_i16(i16 inreg %src0, i16 inreg %src1) {
define amdgpu_ps i16 @s_orn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
; GFX6-LABEL: s_orn2_i16_commute:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_xor_b32 s0, s3, -1
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_orn2_b32 s0, s2, s3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_i16_commute:
@ -245,7 +243,7 @@ define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %
; GFX6-LABEL: s_orn2_i16_multi_use:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_xor_b32 s1, s3, -1
; GFX6-NEXT: s_or_b32 s0, s2, s1
; GFX6-NEXT: s_orn2_b32 s0, s2, s3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_i16_multi_use:
@ -266,9 +264,8 @@ define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %
define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
; GFX6-LABEL: s_orn2_i16_multi_foldable_use:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_xor_b32 s1, s4, -1
; GFX6-NEXT: s_or_b32 s0, s2, s1
; GFX6-NEXT: s_or_b32 s1, s3, s1
; GFX6-NEXT: s_orn2_b32 s0, s2, s4
; GFX6-NEXT: s_orn2_b32 s1, s3, s4
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_i16_multi_foldable_use:

View File

@ -969,10 +969,10 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %ou
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s4, s4, 3
; VI-NEXT: v_lshlrev_b16_e64 v0, s4, -1
; VI-NEXT: v_and_b32_e32 v1, 0x505, v0
; VI-NEXT: v_xor_b32_e32 v0, -1, v0
; VI-NEXT: v_and_b32_e32 v0, s6, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_not_b32_e32 v1, v0
; VI-NEXT: v_and_b32_e32 v1, s6, v1
; VI-NEXT: v_and_b32_e32 v0, 0x505, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%vecins = insertelement <2 x i8> %a, i8 5, i32 %b