From 4844bf0fe2c83859cde322a2f952ac8337bdff05 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 7 Jan 2020 13:32:03 -0500 Subject: [PATCH] AMDGPU: Apply i16 add->sub pattern with zext to i32 This was only applying the deeper nested zext pattern, and missing the special case code size fold. --- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 23 +++++++++++------- .../CodeGen/AMDGPU/shrink-add-sub-constant.ll | 24 +++++++++---------- llvm/test/CodeGen/AMDGPU/sub.v2i16.ll | 2 +- 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 87a2cbad4134..064b26665542 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -766,7 +766,22 @@ def : GCNPat < let Predicates = [Has16BitInsts] in { +// Undo sub x, c -> add x, -c canonicalization since c is more likely +// an inline immediate than -c. +// TODO: Also do for 64-bit. +def : GCNPat< + (add i16:$src0, (i16 NegSubInlineConst16:$src1)), + (V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1) +>; + + let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in { + +def : GCNPat< + (i32 (zext (add i16:$src0, (i16 NegSubInlineConst16:$src1)))), + (V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1) +>; + defm : Arithmetic_i16_0Hi_Pats<add, V_ADD_U16_e64>; defm : Arithmetic_i16_0Hi_Pats<mul, V_MUL_LO_U16_e64>; defm : Arithmetic_i16_0Hi_Pats<sub, V_SUB_U16_e64>; @@ -788,14 +803,6 @@ def : GCNPat < /*src1mod*/(i32 0), /*src1*/(i32 -1), $src) >; -// Undo sub x, c -> add x, -c canonicalization since c is more likely -// an inline immediate than -c. -// TODO: Also do for 64-bit. 
-def : GCNPat< - (add i16:$src0, (i16 NegSubInlineConst16:$src1)), - (V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1) ->; - } // End Predicates = [Has16BitInsts] diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index c5a1913b354d..e73003b83e7c 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -865,7 +865,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(i32 addrspace(1)* %ou ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 0xffffffc0, v3 +; VI-NEXT: v_subrev_u16_e32 v2, 64, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -883,7 +883,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(i32 addrspace(1)* %ou ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v2, 0xffffffc0, v3 +; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1045,9 +1045,9 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(<2 x i16> addrspace(1)* %out ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 0xffffffc0, v4 -; VI-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_subrev_u16_e32 v3, 64, v4 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1212,9 +1212,9 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(<2 x i16> addrspace(1)* %ou ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, 
vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 0xffffffc0, v4 -; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_subrev_u16_e32 v3, 64, v4 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1614,9 +1614,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(<2 x i16> addrspace(1) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 0xffffffe0, v4 -; VI-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_subrev_u16_e32 v3, 32, v4 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1774,7 +1774,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(<2 x i16> addrspace(1)* %o ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_add_u16_e32 v3, 0xffffffe0, v3 +; VI-NEXT: v_subrev_u16_e32 v3, 32, v3 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index 5f540df4968d..495e79bb711d 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -344,7 +344,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspac ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; VI-NEXT: 
v_add_u16_e32 v0, 0xffffffe0, v0 +; VI-NEXT: v_subrev_u16_e32 v0, 32, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm