forked from OSchip/llvm-project
AMDGPU: Apply i16 add->sub pattern with zext to i32
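Previously, when the i16 result was zero-extended to i32, only the more deeply nested zext pattern was applied, so the special-case code-size fold (undoing the sub x, c -> add x, -c canonicalization) was missed.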
This commit is contained in:
parent
73d93617d3
commit
4844bf0fe2
|
@ -766,7 +766,22 @@ def : GCNPat <
|
||||||
|
|
||||||
let Predicates = [Has16BitInsts] in {
|
let Predicates = [Has16BitInsts] in {
|
||||||
|
|
||||||
|
// Undo sub x, c -> add x, -c canonicalization since c is more likely
|
||||||
|
// an inline immediate than -c.
|
||||||
|
// TODO: Also do for 64-bit.
|
||||||
|
def : GCNPat<
|
||||||
|
(add i16:$src0, (i16 NegSubInlineConst16:$src1)),
|
||||||
|
(V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1)
|
||||||
|
>;
|
||||||
|
|
||||||
|
|
||||||
let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
|
let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
|
||||||
|
|
||||||
|
def : GCNPat<
|
||||||
|
(i32 (zext (add i16:$src0, (i16 NegSubInlineConst16:$src1)))),
|
||||||
|
(V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1)
|
||||||
|
>;
|
||||||
|
|
||||||
defm : Arithmetic_i16_0Hi_Pats<add, V_ADD_U16_e64>;
|
defm : Arithmetic_i16_0Hi_Pats<add, V_ADD_U16_e64>;
|
||||||
defm : Arithmetic_i16_0Hi_Pats<mul, V_MUL_LO_U16_e64>;
|
defm : Arithmetic_i16_0Hi_Pats<mul, V_MUL_LO_U16_e64>;
|
||||||
defm : Arithmetic_i16_0Hi_Pats<sub, V_SUB_U16_e64>;
|
defm : Arithmetic_i16_0Hi_Pats<sub, V_SUB_U16_e64>;
|
||||||
|
@ -788,14 +803,6 @@ def : GCNPat <
|
||||||
/*src1mod*/(i32 0), /*src1*/(i32 -1), $src)
|
/*src1mod*/(i32 0), /*src1*/(i32 -1), $src)
|
||||||
>;
|
>;
|
||||||
|
|
||||||
// Undo sub x, c -> add x, -c canonicalization since c is more likely
|
|
||||||
// an inline immediate than -c.
|
|
||||||
// TODO: Also do for 64-bit.
|
|
||||||
def : GCNPat<
|
|
||||||
(add i16:$src0, (i16 NegSubInlineConst16:$src1)),
|
|
||||||
(V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1)
|
|
||||||
>;
|
|
||||||
|
|
||||||
} // End Predicates = [Has16BitInsts]
|
} // End Predicates = [Has16BitInsts]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -865,7 +865,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(i32 addrspace(1)* %ou
|
||||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||||
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||||
; VI-NEXT: v_add_u16_e32 v2, 0xffffffc0, v3
|
; VI-NEXT: v_subrev_u16_e32 v2, 64, v3
|
||||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||||
; VI-NEXT: s_endpgm
|
; VI-NEXT: s_endpgm
|
||||||
;
|
;
|
||||||
|
@ -883,7 +883,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(i32 addrspace(1)* %ou
|
||||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
||||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||||
; GFX9-NEXT: v_add_u16_e32 v2, 0xffffffc0, v3
|
; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v3
|
||||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||||
; GFX9-NEXT: s_endpgm
|
; GFX9-NEXT: s_endpgm
|
||||||
;
|
;
|
||||||
|
@ -1045,9 +1045,9 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(<2 x i16> addrspace(1)* %out
|
||||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||||
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||||
; VI-NEXT: v_add_u16_e32 v2, 0xffffffc0, v4
|
; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||||
; VI-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
; VI-NEXT: v_subrev_u16_e32 v3, 64, v4
|
||||||
; VI-NEXT: v_or_b32_e32 v2, v2, v3
|
; VI-NEXT: v_or_b32_e32 v2, v3, v2
|
||||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||||
; VI-NEXT: s_endpgm
|
; VI-NEXT: s_endpgm
|
||||||
;
|
;
|
||||||
|
@ -1212,9 +1212,9 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(<2 x i16> addrspace(1)* %ou
|
||||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||||
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||||
; VI-NEXT: v_add_u16_e32 v2, 0xffffffc0, v4
|
; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||||
; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
; VI-NEXT: v_subrev_u16_e32 v3, 64, v4
|
||||||
; VI-NEXT: v_or_b32_e32 v2, v2, v3
|
; VI-NEXT: v_or_b32_e32 v2, v3, v2
|
||||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||||
; VI-NEXT: s_endpgm
|
; VI-NEXT: s_endpgm
|
||||||
;
|
;
|
||||||
|
@ -1614,9 +1614,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(<2 x i16> addrspace(1)
|
||||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||||
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||||
; VI-NEXT: v_add_u16_e32 v2, 0xffffffe0, v4
|
; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||||
; VI-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
; VI-NEXT: v_subrev_u16_e32 v3, 32, v4
|
||||||
; VI-NEXT: v_or_b32_e32 v2, v2, v3
|
; VI-NEXT: v_or_b32_e32 v2, v3, v2
|
||||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||||
; VI-NEXT: s_endpgm
|
; VI-NEXT: s_endpgm
|
||||||
;
|
;
|
||||||
|
@ -1774,7 +1774,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(<2 x i16> addrspace(1)* %o
|
||||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||||
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||||
; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
|
; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
|
||||||
; VI-NEXT: v_add_u16_e32 v3, 0xffffffe0, v3
|
; VI-NEXT: v_subrev_u16_e32 v3, 32, v3
|
||||||
; VI-NEXT: v_or_b32_e32 v2, v3, v2
|
; VI-NEXT: v_or_b32_e32 v2, v3, v2
|
||||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||||
; VI-NEXT: s_endpgm
|
; VI-NEXT: s_endpgm
|
||||||
|
|
|
@ -344,7 +344,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspac
|
||||||
; VI-NEXT: s_mov_b32 s5, s1
|
; VI-NEXT: s_mov_b32 s5, s1
|
||||||
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||||
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
|
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
|
||||||
; VI-NEXT: v_add_u16_e32 v0, 0xffffffe0, v0
|
; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
|
||||||
; VI-NEXT: v_or_b32_e32 v0, v0, v1
|
; VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||||
; VI-NEXT: s_endpgm
|
; VI-NEXT: s_endpgm
|
||||||
|
|
Loading…
Reference in New Issue