AMDGPU: Apply i16 add->sub pattern with zext to i32

Previously, only the more deeply nested zext pattern was being applied, so
the special-case code size fold (i16 add -> sub) was missed in that case.
This commit is contained in:
Matt Arsenault 2020-01-07 13:32:03 -05:00 committed by Matt Arsenault
parent 73d93617d3
commit 4844bf0fe2
3 changed files with 28 additions and 21 deletions

View File

@ -766,7 +766,22 @@ def : GCNPat <
let Predicates = [Has16BitInsts] in { let Predicates = [Has16BitInsts] in {
// Undo sub x, c -> add x, -c canonicalization since c is more likely
// an inline immediate than -c.
// Matches an i16 add whose second operand satisfies NegSubInlineConst16 and
// selects V_SUB_U16_e64 with the same two operands instead of an add.
// NOTE(review): presumably NegSubInlineConst16 negates the constant when it
// is emitted (e.g. add x, -64 becomes a subtract of 64) -- confirm against
// its definition, which is not visible here.
// TODO: Also do for 64-bit.
def : GCNPat<
(add i16:$src0, (i16 NegSubInlineConst16:$src1)),
(V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1)
>;
let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in { let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
// Same add -> sub fold as the plain i16 pattern, but for the case where the
// i16 add result is zero-extended to i32: the whole (zext (add ...)) tree is
// selected to a single V_SUB_U16_e64, with no separate extension.
// NOTE(review): this assumes the 16-bit VALU result already has zeroed high
// bits on the targets covered by the enclosing predicate -- confirm.
def : GCNPat<
(i32 (zext (add i16:$src0, (i16 NegSubInlineConst16:$src1)))),
(V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1)
>;
defm : Arithmetic_i16_0Hi_Pats<add, V_ADD_U16_e64>; defm : Arithmetic_i16_0Hi_Pats<add, V_ADD_U16_e64>;
defm : Arithmetic_i16_0Hi_Pats<mul, V_MUL_LO_U16_e64>; defm : Arithmetic_i16_0Hi_Pats<mul, V_MUL_LO_U16_e64>;
defm : Arithmetic_i16_0Hi_Pats<sub, V_SUB_U16_e64>; defm : Arithmetic_i16_0Hi_Pats<sub, V_SUB_U16_e64>;
@ -788,14 +803,6 @@ def : GCNPat <
/*src1mod*/(i32 0), /*src1*/(i32 -1), $src) /*src1mod*/(i32 0), /*src1*/(i32 -1), $src)
>; >;
// Undo sub x, c -> add x, -c canonicalization since c is more likely
// an inline immediate than -c.
// Matches an i16 add whose second operand satisfies NegSubInlineConst16 and
// selects V_SUB_U16_e64 with the same two operands instead of an add.
// TODO: Also do for 64-bit.
def : GCNPat<
(add i16:$src0, (i16 NegSubInlineConst16:$src1)),
(V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1)
>;
} // End Predicates = [Has16BitInsts] } // End Predicates = [Has16BitInsts]

View File

@ -865,7 +865,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(i32 addrspace(1)* %ou
; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, 0xffffffc0, v3 ; VI-NEXT: v_subrev_u16_e32 v2, 64, v3
; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm ; VI-NEXT: s_endpgm
; ;
@ -883,7 +883,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(i32 addrspace(1)* %ou
; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u16_e32 v2, 0xffffffc0, v3 ; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v3
; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm ; GFX9-NEXT: s_endpgm
; ;
@ -1045,9 +1045,9 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(<2 x i16> addrspace(1)* %out
; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, 0xffffffc0, v4 ; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_subrev_u16_e32 v3, 64, v4
; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm ; VI-NEXT: s_endpgm
; ;
@ -1212,9 +1212,9 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(<2 x i16> addrspace(1)* %ou
; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, 0xffffffc0, v4 ; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_subrev_u16_e32 v3, 64, v4
; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm ; VI-NEXT: s_endpgm
; ;
@ -1614,9 +1614,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(<2 x i16> addrspace(1)
; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, 0xffffffe0, v4 ; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_subrev_u16_e32 v3, 32, v4
; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm ; VI-NEXT: s_endpgm
; ;
@ -1774,7 +1774,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(<2 x i16> addrspace(1)* %o
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; VI-NEXT: v_add_u16_e32 v3, 0xffffffe0, v3 ; VI-NEXT: v_subrev_u16_e32 v3, 32, v3
; VI-NEXT: v_or_b32_e32 v2, v3, v2 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm ; VI-NEXT: s_endpgm

View File

@ -344,7 +344,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspac
; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT: v_add_u16_e32 v0, 0xffffffe0, v0 ; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm ; VI-NEXT: s_endpgm