forked from OSchip/llvm-project
AMDGPU: Apply i16 add->sub pattern with zext to i32
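Previously, only the more deeply nested zext pattern was applied to the zero-extended i16 case, so the special-case code-size fold was missed: an i16 add of a negated inline constant was not turned back into a sub, and the constant was emitted as a literal instead of an inline immediate.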
This commit is contained in:
parent
73d93617d3
commit
4844bf0fe2
|
@ -766,7 +766,22 @@ def : GCNPat <
|
|||
|
||||
let Predicates = [Has16BitInsts] in {
|
||||
|
||||
// Undo sub x, c -> add x, -c canonicalization since c is more likely
|
||||
// an inline immediate than -c.
|
||||
// TODO: Also do for 64-bit.
|
||||
def : GCNPat<
|
||||
(add i16:$src0, (i16 NegSubInlineConst16:$src1)),
|
||||
(V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1)
|
||||
>;
|
||||
|
||||
|
||||
let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
|
||||
|
||||
def : GCNPat<
|
||||
(i32 (zext (add i16:$src0, (i16 NegSubInlineConst16:$src1)))),
|
||||
(V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1)
|
||||
>;
|
||||
|
||||
defm : Arithmetic_i16_0Hi_Pats<add, V_ADD_U16_e64>;
|
||||
defm : Arithmetic_i16_0Hi_Pats<mul, V_MUL_LO_U16_e64>;
|
||||
defm : Arithmetic_i16_0Hi_Pats<sub, V_SUB_U16_e64>;
|
||||
|
@ -788,14 +803,6 @@ def : GCNPat <
|
|||
/*src1mod*/(i32 0), /*src1*/(i32 -1), $src)
|
||||
>;
|
||||
|
||||
// Undo sub x, c -> add x, -c canonicalization since c is more likely
|
||||
// an inline immediate than -c.
|
||||
// TODO: Also do for 64-bit.
|
||||
def : GCNPat<
|
||||
(add i16:$src0, (i16 NegSubInlineConst16:$src1)),
|
||||
(V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1)
|
||||
>;
|
||||
|
||||
} // End Predicates = [Has16BitInsts]
|
||||
|
||||
|
||||
|
|
|
@ -865,7 +865,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(i32 addrspace(1)* %ou
|
|||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_add_u16_e32 v2, 0xffffffc0, v3
|
||||
; VI-NEXT: v_subrev_u16_e32 v2, 64, v3
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -883,7 +883,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(i32 addrspace(1)* %ou
|
|||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_add_u16_e32 v2, 0xffffffc0, v3
|
||||
; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v3
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1045,9 +1045,9 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(<2 x i16> addrspace(1)* %out
|
|||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_add_u16_e32 v2, 0xffffffc0, v4
|
||||
; VI-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_e32 v2, v2, v3
|
||||
; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NEXT: v_subrev_u16_e32 v3, 64, v4
|
||||
; VI-NEXT: v_or_b32_e32 v2, v3, v2
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1212,9 +1212,9 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(<2 x i16> addrspace(1)* %ou
|
|||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_add_u16_e32 v2, 0xffffffc0, v4
|
||||
; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_e32 v2, v2, v3
|
||||
; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NEXT: v_subrev_u16_e32 v3, 64, v4
|
||||
; VI-NEXT: v_or_b32_e32 v2, v3, v2
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1614,9 +1614,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(<2 x i16> addrspace(1)
|
|||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_add_u16_e32 v2, 0xffffffe0, v4
|
||||
; VI-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_e32 v2, v2, v3
|
||||
; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NEXT: v_subrev_u16_e32 v3, 32, v4
|
||||
; VI-NEXT: v_or_b32_e32 v2, v3, v2
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1774,7 +1774,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(<2 x i16> addrspace(1)* %o
|
|||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
|
||||
; VI-NEXT: v_add_u16_e32 v3, 0xffffffe0, v3
|
||||
; VI-NEXT: v_subrev_u16_e32 v3, 32, v3
|
||||
; VI-NEXT: v_or_b32_e32 v2, v3, v2
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
|
|
|
@ -344,7 +344,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspac
|
|||
; VI-NEXT: s_mov_b32 s5, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
|
||||
; VI-NEXT: v_add_u16_e32 v0, 0xffffffe0, v0
|
||||
; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
|
||||
; VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
|
|
Loading…
Reference in New Issue