AMDGPU: Add flag to disable promotion of uniform i16 ops

This interferes with GlobalISel's much better handling of the situation. This should really be disabled for GlobalISel. However, the fallback only re-runs the selection passes, and doesn't go back and re-run any codegen IR passes. I haven't come up with a good solution to this problem.
2020-07-31 14:50:39 -04:00 · 2020-07-31 14:50:39 -04:00 · 75e6f0b3d4
parent 2b7a2cbb15
commit 75e6f0b3d4
3 changed files with 61 additions and 148 deletions
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@ -60,6 +60,12 @@ static cl::opt<bool> WidenLoads(
  cl::ReallyHidden,
  cl::init(false));

+static cl::opt<bool> Widen16BitOps(
+  "amdgpu-codegenprepare-widen-16-bit-ops",
+  cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
+  cl::ReallyHidden,
+  cl::init(true));
+
 static cl::opt<bool> UseMul24Intrin(
  "amdgpu-codegenprepare-mul24",
  cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
@ -269,6 +275,9 @@ bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
 }

 bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
+  if (!Widen16BitOps)
+    return false;
+
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s

 define amdgpu_ps i32 @s_andn2_i32(i32 inreg %src0, i32 inreg %src1) {
 ; GCN-LABEL: s_andn2_i32:
@ -196,58 +196,31 @@ define amdgpu_ps <2 x i32> @s_andn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i3
 }

 define amdgpu_ps i16 @s_andn2_i16(i16 inreg %src0, i16 inreg %src1) {
-; GFX6-LABEL: s_andn2_i16:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_andn2_b32 s0, s2, s3
-; GFX6-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_andn2_i16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NEXT:    s_and_b32 s1, s3, s0
-; GFX9-NEXT:    s_xor_b32 s0, s1, s0
-; GFX9-NEXT:    s_and_b32 s0, s2, s0
-; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x100000
-; GFX9-NEXT:    ; return to shader part epilog
+; GCN-LABEL: s_andn2_i16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_andn2_b32 s0, s2, s3
+; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i16 %src1, -1
  %and = and i16 %src0, %not.src1
  ret i16 %and
 }

 define amdgpu_ps i16 @s_andn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
-; GFX6-LABEL: s_andn2_i16_commute:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_andn2_b32 s0, s2, s3
-; GFX6-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_andn2_i16_commute:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NEXT:    s_and_b32 s1, s3, s0
-; GFX9-NEXT:    s_xor_b32 s0, s1, s0
-; GFX9-NEXT:    s_and_b32 s0, s0, s2
-; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x100000
-; GFX9-NEXT:    ; return to shader part epilog
+; GCN-LABEL: s_andn2_i16_commute:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_andn2_b32 s0, s2, s3
+; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i16 %src1, -1
  %and = and i16 %not.src1, %src0
  ret i16 %and
 }

 define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg %src1) {
-; GFX6-LABEL: s_andn2_i16_multi_use:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_xor_b32 s1, s3, -1
-; GFX6-NEXT:    s_andn2_b32 s0, s2, s3
-; GFX6-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_andn2_i16_multi_use:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NEXT:    s_and_b32 s1, s3, s0
-; GFX9-NEXT:    s_xor_b32 s1, s1, s0
-; GFX9-NEXT:    s_and_b32 s0, s2, s1
-; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x100000
-; GFX9-NEXT:    ; return to shader part epilog
+; GCN-LABEL: s_andn2_i16_multi_use:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_xor_b32 s1, s3, -1
+; GCN-NEXT:    s_andn2_b32 s0, s2, s3
+; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i16 %src1, -1
  %and = and i16 %src0, %not.src1
  %insert.0 = insertvalue { i16, i16 } undef, i16 %and, 0
@ -256,23 +229,11 @@ define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg
 }

 define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
-; GFX6-LABEL: s_andn2_i16_multi_foldable_use:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_andn2_b32 s0, s2, s4
-; GFX6-NEXT:    s_andn2_b32 s1, s3, s4
-; GFX6-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_andn2_i16_multi_foldable_use:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s1, 0xffff
-; GFX9-NEXT:    s_and_b32 s0, s4, s1
-; GFX9-NEXT:    s_xor_b32 s0, s0, s1
-; GFX9-NEXT:    s_and_b32 s2, s2, s1
-; GFX9-NEXT:    s_and_b32 s4, s0, s1
-; GFX9-NEXT:    s_and_b32 s1, s3, s1
-; GFX9-NEXT:    s_and_b32 s0, s2, s4
-; GFX9-NEXT:    s_and_b32 s1, s1, s4
-; GFX9-NEXT:    ; return to shader part epilog
+; GCN-LABEL: s_andn2_i16_multi_foldable_use:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_andn2_b32 s0, s2, s4
+; GCN-NEXT:    s_andn2_b32 s1, s3, s4
+; GCN-NEXT:    ; return to shader part epilog
  %not.src2 = xor i16 %src2, -1
  %and0 = and i16 %src0, %not.src2
  %and1 = and i16 %src1, %not.src2
@ -308,21 +269,12 @@ define amdgpu_ps float @v_andn2_i16_sv(i16 inreg %src0, i16 %src1) {
 }

 define amdgpu_ps float @v_andn2_i16_vs(i16 %src0, i16 inreg %src1) {
-; GFX6-LABEL: v_andn2_i16_vs:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_xor_b32 s0, s2, -1
-; GFX6-NEXT:    v_and_b32_e32 v0, s0, v0
-; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: v_andn2_i16_vs:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NEXT:    s_and_b32 s1, s2, s0
-; GFX9-NEXT:    s_xor_b32 s0, s1, s0
-; GFX9-NEXT:    v_and_b32_e32 v0, s0, v0
-; GFX9-NEXT:    v_bfe_u32 v0, v0, 0, 16
-; GFX9-NEXT:    ; return to shader part epilog
+; GCN-LABEL: v_andn2_i16_vs:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_xor_b32 s0, s2, -1
+; GCN-NEXT:    v_and_b32_e32 v0, s0, v0
+; GCN-NEXT:    v_bfe_u32 v0, v0, 0, 16
+; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i16 %src1, -1
  %and = and i16 %src0, %not.src1
  %zext = zext i16 %and to i32
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s

 define amdgpu_ps i32 @s_orn2_i32(i32 inreg %src0, i32 inreg %src1) {
 ; GCN-LABEL: s_orn2_i32:
@ -196,58 +196,31 @@ define amdgpu_ps <2 x i32> @s_orn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i32
 }

 define amdgpu_ps i16 @s_orn2_i16(i16 inreg %src0, i16 inreg %src1) {
-; GFX6-LABEL: s_orn2_i16:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_orn2_b32 s0, s2, s3
-; GFX6-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_orn2_i16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NEXT:    s_and_b32 s1, s3, s0
-; GFX9-NEXT:    s_xor_b32 s0, s1, s0
-; GFX9-NEXT:    s_or_b32 s0, s2, s0
-; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x100000
-; GFX9-NEXT:    ; return to shader part epilog
+; GCN-LABEL: s_orn2_i16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_orn2_b32 s0, s2, s3
+; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i16 %src1, -1
  %or = or i16 %src0, %not.src1
  ret i16 %or
 }

 define amdgpu_ps i16 @s_orn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
-; GFX6-LABEL: s_orn2_i16_commute:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_orn2_b32 s0, s2, s3
-; GFX6-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_orn2_i16_commute:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NEXT:    s_and_b32 s1, s3, s0
-; GFX9-NEXT:    s_xor_b32 s0, s1, s0
-; GFX9-NEXT:    s_or_b32 s0, s0, s2
-; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x100000
-; GFX9-NEXT:    ; return to shader part epilog
+; GCN-LABEL: s_orn2_i16_commute:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_orn2_b32 s0, s2, s3
+; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i16 %src1, -1
  %or = or i16 %not.src1, %src0
  ret i16 %or
 }

 define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %src1) {
-; GFX6-LABEL: s_orn2_i16_multi_use:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_xor_b32 s1, s3, -1
-; GFX6-NEXT:    s_orn2_b32 s0, s2, s3
-; GFX6-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_orn2_i16_multi_use:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NEXT:    s_and_b32 s1, s3, s0
-; GFX9-NEXT:    s_xor_b32 s1, s1, s0
-; GFX9-NEXT:    s_or_b32 s0, s2, s1
-; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x100000
-; GFX9-NEXT:    ; return to shader part epilog
+; GCN-LABEL: s_orn2_i16_multi_use:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_xor_b32 s1, s3, -1
+; GCN-NEXT:    s_orn2_b32 s0, s2, s3
+; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i16 %src1, -1
  %or = or i16 %src0, %not.src1
  %insert.0 = insertvalue { i16, i16 } undef, i16 %or, 0
@ -256,23 +229,11 @@ define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %
 }

 define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
-; GFX6-LABEL: s_orn2_i16_multi_foldable_use:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_orn2_b32 s0, s2, s4
-; GFX6-NEXT:    s_orn2_b32 s1, s3, s4
-; GFX6-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_orn2_i16_multi_foldable_use:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s1, 0xffff
-; GFX9-NEXT:    s_and_b32 s0, s4, s1
-; GFX9-NEXT:    s_xor_b32 s0, s0, s1
-; GFX9-NEXT:    s_and_b32 s2, s2, s1
-; GFX9-NEXT:    s_and_b32 s4, s0, s1
-; GFX9-NEXT:    s_and_b32 s1, s3, s1
-; GFX9-NEXT:    s_or_b32 s0, s2, s4
-; GFX9-NEXT:    s_or_b32 s1, s1, s4
-; GFX9-NEXT:    ; return to shader part epilog
+; GCN-LABEL: s_orn2_i16_multi_foldable_use:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_orn2_b32 s0, s2, s4
+; GCN-NEXT:    s_orn2_b32 s1, s3, s4
+; GCN-NEXT:    ; return to shader part epilog
  %not.src2 = xor i16 %src2, -1
  %or0 = or i16 %src0, %not.src2
  %or1 = or i16 %src1, %not.src2
@ -308,21 +269,12 @@ define amdgpu_ps float @v_orn2_i16_sv(i16 inreg %src0, i16 %src1) {
 }

 define amdgpu_ps float @v_orn2_i16_vs(i16 %src0, i16 inreg %src1) {
-; GFX6-LABEL: v_orn2_i16_vs:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_xor_b32 s0, s2, -1
-; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
-; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: v_orn2_i16_vs:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NEXT:    s_and_b32 s1, s2, s0
-; GFX9-NEXT:    s_xor_b32 s0, s1, s0
-; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
-; GFX9-NEXT:    v_bfe_u32 v0, v0, 0, 16
-; GFX9-NEXT:    ; return to shader part epilog
+; GCN-LABEL: v_orn2_i16_vs:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_xor_b32 s0, s2, -1
+; GCN-NEXT:    v_or_b32_e32 v0, s0, v0
+; GCN-NEXT:    v_bfe_u32 v0, v0, 0, 16
+; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i16 %src1, -1
  %or = or i16 %src0, %not.src1
  %zext = zext i16 %or to i32