From ce84130f8562c8c990362502f03d04187a0be581 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Thu, 22 Dec 2016 03:05:37 +0000
Subject: [PATCH] AMDGPU: Implement f16 fcanonicalize

llvm-svn: 290300
---
 llvm/lib/Target/AMDGPU/AMDGPUInstructions.td  |   1 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   3 +
 llvm/lib/Target/AMDGPU/SIInstructions.td      |   5 +
 llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 172 ++++++++++++++++++
 4 files changed, 181 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index a7a995156e2e..c49866da5d3e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -391,6 +391,7 @@ int TWO_PI = 0x40c90fdb;
 int PI = 0x40490fdb;
 int TWO_PI_INV = 0x3e22f983;
 int FP_UINT_MAX_PLUS_1 = 0x4f800000;    // 1 << 32 in floating point encoding
+int FP16_ONE = 0x3c00;                  // 1.0 in half-precision encoding
 int FP32_ONE = 0x3f800000;
 int FP32_NEG_ONE = 0xbf800000;
 int FP64_ONE = 0x3ff0000000000000;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0558c59782c1..b9302582fa89 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3648,6 +3648,9 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
 
     if (VT == MVT::f64 && !Subtarget->hasFP64Denormals())
       return DAG.getConstantFP(0.0, SDLoc(N), VT);
+
+    if (VT == MVT::f16 && !Subtarget->hasFP16Denormals())
+      return DAG.getConstantFP(0.0, SDLoc(N), VT);
   }
 
   if (C.isNaN()) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 54865956ae2f..bc35c2edc8d3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1021,6 +1021,11 @@ defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
 
 def : BFEPattern <V_BFE_U32, S_MOV_B32>;
 
+def : Pat<  // Select f16 fcanonicalize as a V_MUL_F16_e64 of $src with CONST.FP16_ONE.
+  (fcanonicalize f16:$src),
+  (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), 0, $src, 0, 0)
+>;
+
 def : Pat<
   (fcanonicalize f32:$src),
   (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), 0, $src, 0, 0)
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
new file mode 100644
index 000000000000..fb693de0e39d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -0,0 +1,172 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare half @llvm.canonicalize.f16(half) #0
+
+; GCN-LABEL: {{^}}v_test_canonicalize_var_f16:
+; GCN: v_mul_f16_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
+; GCN: buffer_store_short [[REG]]
+define void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 {
+  %loaded = load half, half addrspace(1)* %out ; value loaded from memory
+  %canon = call half @llvm.canonicalize.f16(half %loaded)
+  store half %canon, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_canonicalize_var_f16:
+; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}}
+; GCN: buffer_store_short [[REG]]
+define void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 {
+  %as.half = bitcast i16 %val.arg to half ; value comes from the i16 argument
+  %canon = call half @llvm.canonicalize.f16(half %as.half)
+  store half %canon, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 {
+  %canon = call half @llvm.canonicalize.f16(half 0.0) ; +0.0 constant input
+  store half %canon, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 {
+  %canon = call half @llvm.canonicalize.f16(half -0.0) ; -0.0: expected literal keeps the sign bit
+  store half %canon, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 {
+  %canon = call half @llvm.canonicalize.f16(half 1.0) ; 1.0 constant input
+  store half %canon, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 {
+  %canon = call half @llvm.canonicalize.f16(half -1.0) ; -1.0 constant input
+  store half %canon, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c00{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 {
+  %canon = call half @llvm.canonicalize.f16(half 16.0) ; 16.0 constant input
+  store half %canon, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @test_no_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #2 {
+  %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF) ; positive f16 denormal; #2 turns f16 denormals off explicitly
+  store half %canonicalized, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #3 {
+  %canon = call half @llvm.canonicalize.f16(half 0xH03FF) ; positive f16 denormal, denormals enabled via #3
+  store half %canon, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @test_no_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #2 {
+  %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF) ; negative f16 denormal; #2 turns f16 denormals off explicitly
+  store half %canonicalized, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #3 {
+  %canon = call half @llvm.canonicalize.f16(half 0xH83FF) ; negative f16 denormal, denormals enabled via #3
+  store half %canon, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 {
+  %canonicalized = call half @llvm.canonicalize.f16(half 0xH7E00) ; quiet NaN (0xH7C00 would be +infinity)
+  store half %canonicalized, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) #1 {
+  %canon = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half)) ; bit pattern 0xFFFF
+  store half %canon, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) #1 {
+  %canon = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half)) ; bit pattern 0xFFFE
+  store half %canon, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 {
+  %canon = call half @llvm.canonicalize.f16(half 0xH7C01) ; positive NaN, quiet bit clear, payload 0x001
+  store half %canon, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 {
+  %canon = call half @llvm.canonicalize.f16(half 0xH7DFF) ; positive NaN, quiet bit clear, payload 0x1FF
+  store half %canon, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 {
+  %canon = call half @llvm.canonicalize.f16(half 0xHFDFF) ; negative NaN, quiet bit clear, payload 0x1FF
+  store half %canon, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 {
+  %canon = call half @llvm.canonicalize.f16(half 0xHFC01) ; negative NaN, quiet bit clear, payload 0x001
+  store half %canon, half addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind "target-features"="-fp16-denormals,-fp64-denormals" } ; f16/f64 denormals explicitly off
+attributes #3 = { nounwind "target-features"="+fp16-denormals,+fp64-denormals" } ; f16/f64 denormals explicitly on