forked from OSchip/llvm-project
AMDGPU/GlobalISel: Match andn2/orn2 for more types
Unfortunately this ends up not working as expected on targets with 16-bit operations due to AMDGPUCodeGenPrepare's promotion of uniform 16-bit ops to i32. The vector case annoyingly requires switching the checked opcode, since constants for vectors aren't directly handled. I also need to think more carefully about whether this is valid for i1.
This commit is contained in:
parent
b6db0a544d
commit
40a142fa57
|
@ -1510,6 +1510,10 @@ class getVOPSrc0ForVT<ValueType VT> {
|
|||
);
|
||||
}
|
||||
|
||||
// Returns the SOP source register operand to use for the given VT:
// SSrc_b64 when VT.Size is 64, SSrc_b32 for every other size.
class getSOPSrcForVT<ValueType VT> {
|
||||
RegisterOperand ret = !if(!eq(VT.Size, 64), SSrc_b64, SSrc_b32);
|
||||
}
|
||||
|
||||
// Returns the vreg register class to use for source operand given VT
|
||||
class getVregSrcForVT<ValueType VT> {
|
||||
RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128,
|
||||
|
|
|
@ -541,6 +541,7 @@ def S_NOR_B64 : SOP2_64 <"s_nor_b64",
|
|||
>;
|
||||
} // End isCommutable = 1
|
||||
|
||||
// There are also separate patterns for types other than i32
|
||||
// The built-in pattern only covers i32: a uniform `and` of $src0 with a
// uniform `not` of $src1.
def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32",
|
||||
[(set i32:$sdst, (UniformBinFrag<and> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
|
||||
>;
|
||||
|
@ -1330,6 +1331,24 @@ def : GCNPat<
|
|||
(S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src)
|
||||
>;
|
||||
|
||||
// FIXME: ValueType should have isVector field
|
||||
// Selects `inst` for a uniform binary `op` on type `vt` whose second operand
// is wrapped in a uniform unary fragment: `vnot` when isVector is set (the
// default), `not` otherwise. Both output operands use the operand class that
// getSOPSrcForVT picks for `vt`.
class ScalarNot2Pat<Instruction inst, SDPatternOperator op, ValueType vt,
|
||||
bit isVector = 1> : GCNPat<
|
||||
(UniformBinFrag<op> vt:$src0, (UniformUnaryFrag<!if(isVector, vnot, not)> vt:$src1)),
|
||||
(inst getSOPSrcForVT<vt>.ret:$src0, getSOPSrcForVT<vt>.ret:$src1)
|
||||
>;
|
||||
|
||||
// Match these for some more types
|
||||
// TODO: i1
|
||||
// `and` forms: the scalar i16 case passes isVector = 0; the vector types
// rely on the isVector = 1 default.
def : ScalarNot2Pat<S_ANDN2_B32, and, i16, 0>;
|
||||
def : ScalarNot2Pat<S_ANDN2_B32, and, v2i16>;
|
||||
def : ScalarNot2Pat<S_ANDN2_B64, and, v4i16>;
|
||||
def : ScalarNot2Pat<S_ANDN2_B64, and, v2i32>;
|
||||
|
||||
// `or` forms, instantiated for the same list of types as the `and` forms.
def : ScalarNot2Pat<S_ORN2_B32, or, i16, 0>;
|
||||
def : ScalarNot2Pat<S_ORN2_B32, or, v2i16>;
|
||||
def : ScalarNot2Pat<S_ORN2_B64, or, v4i16>;
|
||||
def : ScalarNot2Pat<S_ORN2_B64, or, v2i32>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Target-specific instruction encodings.
|
||||
|
|
|
@ -204,8 +204,7 @@ define amdgpu_ps <2 x i32> @s_andn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i3
|
|||
define amdgpu_ps i16 @s_andn2_i16(i16 inreg %src0, i16 inreg %src1) {
|
||||
; GFX6-LABEL: s_andn2_i16:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_xor_b32 s0, s3, -1
|
||||
; GFX6-NEXT: s_and_b32 s0, s2, s0
|
||||
; GFX6-NEXT: s_andn2_b32 s0, s2, s3
|
||||
; GFX6-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: s_andn2_i16:
|
||||
|
@ -224,8 +223,7 @@ define amdgpu_ps i16 @s_andn2_i16(i16 inreg %src0, i16 inreg %src1) {
|
|||
define amdgpu_ps i16 @s_andn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
|
||||
; GFX6-LABEL: s_andn2_i16_commute:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_xor_b32 s0, s3, -1
|
||||
; GFX6-NEXT: s_and_b32 s0, s0, s2
|
||||
; GFX6-NEXT: s_andn2_b32 s0, s2, s3
|
||||
; GFX6-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: s_andn2_i16_commute:
|
||||
|
@ -245,7 +243,7 @@ define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg
|
|||
; GFX6-LABEL: s_andn2_i16_multi_use:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_xor_b32 s1, s3, -1
|
||||
; GFX6-NEXT: s_and_b32 s0, s2, s1
|
||||
; GFX6-NEXT: s_andn2_b32 s0, s2, s3
|
||||
; GFX6-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: s_andn2_i16_multi_use:
|
||||
|
@ -266,9 +264,8 @@ define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg
|
|||
define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
|
||||
; GFX6-LABEL: s_andn2_i16_multi_foldable_use:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_xor_b32 s1, s4, -1
|
||||
; GFX6-NEXT: s_and_b32 s0, s2, s1
|
||||
; GFX6-NEXT: s_and_b32 s1, s3, s1
|
||||
; GFX6-NEXT: s_andn2_b32 s0, s2, s4
|
||||
; GFX6-NEXT: s_andn2_b32 s1, s3, s4
|
||||
; GFX6-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: s_andn2_i16_multi_foldable_use:
|
||||
|
|
|
@ -204,8 +204,7 @@ define amdgpu_ps <2 x i32> @s_orn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i32
|
|||
define amdgpu_ps i16 @s_orn2_i16(i16 inreg %src0, i16 inreg %src1) {
|
||||
; GFX6-LABEL: s_orn2_i16:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_xor_b32 s0, s3, -1
|
||||
; GFX6-NEXT: s_or_b32 s0, s2, s0
|
||||
; GFX6-NEXT: s_orn2_b32 s0, s2, s3
|
||||
; GFX6-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: s_orn2_i16:
|
||||
|
@ -224,8 +223,7 @@ define amdgpu_ps i16 @s_orn2_i16(i16 inreg %src0, i16 inreg %src1) {
|
|||
define amdgpu_ps i16 @s_orn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
|
||||
; GFX6-LABEL: s_orn2_i16_commute:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_xor_b32 s0, s3, -1
|
||||
; GFX6-NEXT: s_or_b32 s0, s0, s2
|
||||
; GFX6-NEXT: s_orn2_b32 s0, s2, s3
|
||||
; GFX6-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: s_orn2_i16_commute:
|
||||
|
@ -245,7 +243,7 @@ define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %
|
|||
; GFX6-LABEL: s_orn2_i16_multi_use:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_xor_b32 s1, s3, -1
|
||||
; GFX6-NEXT: s_or_b32 s0, s2, s1
|
||||
; GFX6-NEXT: s_orn2_b32 s0, s2, s3
|
||||
; GFX6-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: s_orn2_i16_multi_use:
|
||||
|
@ -266,9 +264,8 @@ define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %
|
|||
define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
|
||||
; GFX6-LABEL: s_orn2_i16_multi_foldable_use:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_xor_b32 s1, s4, -1
|
||||
; GFX6-NEXT: s_or_b32 s0, s2, s1
|
||||
; GFX6-NEXT: s_or_b32 s1, s3, s1
|
||||
; GFX6-NEXT: s_orn2_b32 s0, s2, s4
|
||||
; GFX6-NEXT: s_orn2_b32 s1, s3, s4
|
||||
; GFX6-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: s_orn2_i16_multi_foldable_use:
|
||||
|
|
|
@ -969,10 +969,10 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %ou
|
|||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_lshl_b32 s4, s4, 3
|
||||
; VI-NEXT: v_lshlrev_b16_e64 v0, s4, -1
|
||||
; VI-NEXT: v_and_b32_e32 v1, 0x505, v0
|
||||
; VI-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; VI-NEXT: v_and_b32_e32 v0, s6, v0
|
||||
; VI-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; VI-NEXT: v_not_b32_e32 v1, v0
|
||||
; VI-NEXT: v_and_b32_e32 v1, s6, v1
|
||||
; VI-NEXT: v_and_b32_e32 v0, 0x505, v0
|
||||
; VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
%vecins = insertelement <2 x i8> %a, i8 5, i32 %b
|
||||
|
|
Loading…
Reference in New Issue