[RISCV] Fix the neutral element in vector 'fadd' reductions

Using positive zero as the neutral element in 'fadd' reductions, while
it generates better code, is incorrect. The correct neutral element is
negative zero: x + -0.0 == x for every x (in particular,
-0.0 + -0.0 == -0.0), whereas using positive zero gives
-0.0 + 0.0 == +0.0, which loses the sign of a negative-zero input.

More optimal lowerings of negative zero that avoid constant-pool loads
may be possible; they are left as future work.

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D105902
This commit is contained in:
Fraser Cormack 2021-07-13 17:08:05 +01:00
parent 4359b870b1
commit 03a4702c88
3 changed files with 162 additions and 63 deletions

View File

@ -3718,7 +3718,7 @@ getRVVFPReductionOpAndOperands(SDValue Op, SelectionDAG &DAG, EVT EltVT) {
llvm_unreachable("Unhandled reduction");
case ISD::VECREDUCE_FADD:
return std::make_tuple(RISCVISD::VECREDUCE_FADD_VL, Op.getOperand(0),
DAG.getConstantFP(0.0, DL, EltVT));
DAG.getNeutralElement(BaseOpcode, DL, EltVT, Flags));
case ISD::VECREDUCE_SEQ_FADD:
return std::make_tuple(RISCVISD::VECREDUCE_SEQ_FADD_VL, Op.getOperand(1),
Op.getOperand(0));

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
declare half @llvm.vector.reduce.fadd.v1f16(half, <1 x half>)
@ -38,10 +38,12 @@ declare half @llvm.vector.reduce.fadd.v2f16(half, <2 x half>)
define half @vreduce_fadd_v2f16(<2 x half>* %x, half %s) {
; CHECK-LABEL: vreduce_fadd_v2f16:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, %hi(.LCPI2_0)
; CHECK-NEXT: flh ft0, %lo(.LCPI2_0)(a1)
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu
; CHECK-NEXT: vle16.v v25, (a0)
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu
; CHECK-NEXT: vmv.v.i v26, 0
; CHECK-NEXT: vfmv.v.f v26, ft0
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v25, v26
; CHECK-NEXT: vfmv.f.s ft0, v25
@ -73,10 +75,12 @@ declare half @llvm.vector.reduce.fadd.v4f16(half, <4 x half>)
define half @vreduce_fadd_v4f16(<4 x half>* %x, half %s) {
; CHECK-LABEL: vreduce_fadd_v4f16:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, %hi(.LCPI4_0)
; CHECK-NEXT: flh ft0, %lo(.LCPI4_0)(a1)
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
; CHECK-NEXT: vle16.v v25, (a0)
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu
; CHECK-NEXT: vmv.v.i v26, 0
; CHECK-NEXT: vfmv.v.f v26, ft0
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v25, v26
; CHECK-NEXT: vfmv.f.s ft0, v25
@ -108,10 +112,12 @@ declare half @llvm.vector.reduce.fadd.v8f16(half, <8 x half>)
define half @vreduce_fadd_v8f16(<8 x half>* %x, half %s) {
; CHECK-LABEL: vreduce_fadd_v8f16:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, %hi(.LCPI6_0)
; CHECK-NEXT: flh ft0, %lo(.LCPI6_0)(a1)
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu
; CHECK-NEXT: vle16.v v25, (a0)
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu
; CHECK-NEXT: vmv.v.i v26, 0
; CHECK-NEXT: vfmv.v.f v26, ft0
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v25, v26
; CHECK-NEXT: vfmv.f.s ft0, v25
@ -143,10 +149,12 @@ declare half @llvm.vector.reduce.fadd.v16f16(half, <16 x half>)
define half @vreduce_fadd_v16f16(<16 x half>* %x, half %s) {
; CHECK-LABEL: vreduce_fadd_v16f16:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, %hi(.LCPI8_0)
; CHECK-NEXT: flh ft0, %lo(.LCPI8_0)(a1)
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu
; CHECK-NEXT: vle16.v v26, (a0)
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu
; CHECK-NEXT: vmv.v.i v25, 0
; CHECK-NEXT: vfmv.v.f v25, ft0
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v26, v25
; CHECK-NEXT: vfmv.f.s ft0, v25
@ -176,18 +184,35 @@ define half @vreduce_ord_fadd_v16f16(<16 x half>* %x, half %s) {
declare half @llvm.vector.reduce.fadd.v32f16(half, <32 x half>)
define half @vreduce_fadd_v32f16(<32 x half>* %x, half %s) {
; CHECK-LABEL: vreduce_fadd_v32f16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi a1, zero, 32
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu
; CHECK-NEXT: vle16.v v28, (a0)
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu
; CHECK-NEXT: vmv.v.i v25, 0
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v28, v25
; CHECK-NEXT: vfmv.f.s ft0, v25
; CHECK-NEXT: fadd.h fa0, fa0, ft0
; CHECK-NEXT: ret
; RV32-LABEL: vreduce_fadd_v32f16:
; RV32: # %bb.0:
; RV32-NEXT: addi a1, zero, 32
; RV32-NEXT: lui a2, %hi(.LCPI10_0)
; RV32-NEXT: flh ft0, %lo(.LCPI10_0)(a2)
; RV32-NEXT: vsetvli zero, a1, e16, m4, ta, mu
; RV32-NEXT: vle16.v v28, (a0)
; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu
; RV32-NEXT: vfmv.v.f v25, ft0
; RV32-NEXT: vsetvli zero, a1, e16, m4, ta, mu
; RV32-NEXT: vfredsum.vs v25, v28, v25
; RV32-NEXT: vfmv.f.s ft0, v25
; RV32-NEXT: fadd.h fa0, fa0, ft0
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_fadd_v32f16:
; RV64: # %bb.0:
; RV64-NEXT: lui a1, %hi(.LCPI10_0)
; RV64-NEXT: flh ft0, %lo(.LCPI10_0)(a1)
; RV64-NEXT: addi a1, zero, 32
; RV64-NEXT: vsetvli zero, a1, e16, m4, ta, mu
; RV64-NEXT: vle16.v v28, (a0)
; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu
; RV64-NEXT: vfmv.v.f v25, ft0
; RV64-NEXT: vsetvli zero, a1, e16, m4, ta, mu
; RV64-NEXT: vfredsum.vs v25, v28, v25
; RV64-NEXT: vfmv.f.s ft0, v25
; RV64-NEXT: fadd.h fa0, fa0, ft0
; RV64-NEXT: ret
%v = load <32 x half>, <32 x half>* %x
%red = call reassoc half @llvm.vector.reduce.fadd.v32f16(half %s, <32 x half> %v)
ret half %red
@ -213,18 +238,35 @@ define half @vreduce_ord_fadd_v32f16(<32 x half>* %x, half %s) {
declare half @llvm.vector.reduce.fadd.v64f16(half, <64 x half>)
define half @vreduce_fadd_v64f16(<64 x half>* %x, half %s) {
; CHECK-LABEL: vreduce_fadd_v64f16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi a1, zero, 64
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu
; CHECK-NEXT: vmv.v.i v25, 0
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v8, v25
; CHECK-NEXT: vfmv.f.s ft0, v25
; CHECK-NEXT: fadd.h fa0, fa0, ft0
; CHECK-NEXT: ret
; RV32-LABEL: vreduce_fadd_v64f16:
; RV32: # %bb.0:
; RV32-NEXT: addi a1, zero, 64
; RV32-NEXT: lui a2, %hi(.LCPI12_0)
; RV32-NEXT: flh ft0, %lo(.LCPI12_0)(a2)
; RV32-NEXT: vsetvli zero, a1, e16, m8, ta, mu
; RV32-NEXT: vle16.v v8, (a0)
; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu
; RV32-NEXT: vfmv.v.f v25, ft0
; RV32-NEXT: vsetvli zero, a1, e16, m8, ta, mu
; RV32-NEXT: vfredsum.vs v25, v8, v25
; RV32-NEXT: vfmv.f.s ft0, v25
; RV32-NEXT: fadd.h fa0, fa0, ft0
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_fadd_v64f16:
; RV64: # %bb.0:
; RV64-NEXT: lui a1, %hi(.LCPI12_0)
; RV64-NEXT: flh ft0, %lo(.LCPI12_0)(a1)
; RV64-NEXT: addi a1, zero, 64
; RV64-NEXT: vsetvli zero, a1, e16, m8, ta, mu
; RV64-NEXT: vle16.v v8, (a0)
; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu
; RV64-NEXT: vfmv.v.f v25, ft0
; RV64-NEXT: vsetvli zero, a1, e16, m8, ta, mu
; RV64-NEXT: vfredsum.vs v25, v8, v25
; RV64-NEXT: vfmv.f.s ft0, v25
; RV64-NEXT: fadd.h fa0, fa0, ft0
; RV64-NEXT: ret
%v = load <64 x half>, <64 x half>* %x
%red = call reassoc half @llvm.vector.reduce.fadd.v64f16(half %s, <64 x half> %v)
ret half %red
@ -257,9 +299,11 @@ define half @vreduce_fadd_v128f16(<128 x half>* %x, half %s) {
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: vle16.v v16, (a0)
; CHECK-NEXT: lui a0, %hi(.LCPI14_0)
; CHECK-NEXT: flh ft0, %lo(.LCPI14_0)(a0)
; CHECK-NEXT: vfadd.vv v8, v8, v16
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu
; CHECK-NEXT: vmv.v.i v25, 0
; CHECK-NEXT: vfmv.v.f v25, ft0
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v8, v25
; CHECK-NEXT: vfmv.f.s ft0, v25
@ -330,10 +374,12 @@ declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>)
define float @vreduce_fadd_v2f32(<2 x float>* %x, float %s) {
; CHECK-LABEL: vreduce_fadd_v2f32:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, %hi(.LCPI18_0)
; CHECK-NEXT: flw ft0, %lo(.LCPI18_0)(a1)
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
; CHECK-NEXT: vle32.v v25, (a0)
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
; CHECK-NEXT: vmv.v.i v26, 0
; CHECK-NEXT: vfmv.v.f v26, ft0
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v25, v26
; CHECK-NEXT: vfmv.f.s ft0, v25
@ -365,10 +411,12 @@ declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
define float @vreduce_fadd_v4f32(<4 x float>* %x, float %s) {
; CHECK-LABEL: vreduce_fadd_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, %hi(.LCPI20_0)
; CHECK-NEXT: flw ft0, %lo(.LCPI20_0)(a1)
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; CHECK-NEXT: vle32.v v25, (a0)
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
; CHECK-NEXT: vmv.v.i v26, 0
; CHECK-NEXT: vfmv.v.f v26, ft0
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v25, v26
; CHECK-NEXT: vfmv.f.s ft0, v25
@ -400,10 +448,12 @@ declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
define float @vreduce_fadd_v8f32(<8 x float>* %x, float %s) {
; CHECK-LABEL: vreduce_fadd_v8f32:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, %hi(.LCPI22_0)
; CHECK-NEXT: flw ft0, %lo(.LCPI22_0)(a1)
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu
; CHECK-NEXT: vle32.v v26, (a0)
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
; CHECK-NEXT: vmv.v.i v25, 0
; CHECK-NEXT: vfmv.v.f v25, ft0
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v26, v25
; CHECK-NEXT: vfmv.f.s ft0, v25
@ -435,10 +485,12 @@ declare float @llvm.vector.reduce.fadd.v16f32(float, <16 x float>)
define float @vreduce_fadd_v16f32(<16 x float>* %x, float %s) {
; CHECK-LABEL: vreduce_fadd_v16f32:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, %hi(.LCPI24_0)
; CHECK-NEXT: flw ft0, %lo(.LCPI24_0)(a1)
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
; CHECK-NEXT: vle32.v v28, (a0)
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
; CHECK-NEXT: vmv.v.i v25, 0
; CHECK-NEXT: vfmv.v.f v25, ft0
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v28, v25
; CHECK-NEXT: vfmv.f.s ft0, v25
@ -468,18 +520,35 @@ define float @vreduce_ord_fadd_v16f32(<16 x float>* %x, float %s) {
declare float @llvm.vector.reduce.fadd.v32f32(float, <32 x float>)
define float @vreduce_fadd_v32f32(<32 x float>* %x, float %s) {
; CHECK-LABEL: vreduce_fadd_v32f32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi a1, zero, 32
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
; CHECK-NEXT: vmv.v.i v25, 0
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v8, v25
; CHECK-NEXT: vfmv.f.s ft0, v25
; CHECK-NEXT: fadd.s fa0, fa0, ft0
; CHECK-NEXT: ret
; RV32-LABEL: vreduce_fadd_v32f32:
; RV32: # %bb.0:
; RV32-NEXT: addi a1, zero, 32
; RV32-NEXT: lui a2, %hi(.LCPI26_0)
; RV32-NEXT: flw ft0, %lo(.LCPI26_0)(a2)
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu
; RV32-NEXT: vle32.v v8, (a0)
; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, mu
; RV32-NEXT: vfmv.v.f v25, ft0
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu
; RV32-NEXT: vfredsum.vs v25, v8, v25
; RV32-NEXT: vfmv.f.s ft0, v25
; RV32-NEXT: fadd.s fa0, fa0, ft0
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_fadd_v32f32:
; RV64: # %bb.0:
; RV64-NEXT: lui a1, %hi(.LCPI26_0)
; RV64-NEXT: flw ft0, %lo(.LCPI26_0)(a1)
; RV64-NEXT: addi a1, zero, 32
; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu
; RV64-NEXT: vle32.v v8, (a0)
; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, mu
; RV64-NEXT: vfmv.v.f v25, ft0
; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu
; RV64-NEXT: vfredsum.vs v25, v8, v25
; RV64-NEXT: vfmv.f.s ft0, v25
; RV64-NEXT: fadd.s fa0, fa0, ft0
; RV64-NEXT: ret
%v = load <32 x float>, <32 x float>* %x
%red = call reassoc float @llvm.vector.reduce.fadd.v32f32(float %s, <32 x float> %v)
ret float %red
@ -512,9 +581,11 @@ define float @vreduce_fadd_v64f32(<64 x float>* %x, float %s) {
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: vle32.v v16, (a0)
; CHECK-NEXT: lui a0, %hi(.LCPI28_0)
; CHECK-NEXT: flw ft0, %lo(.LCPI28_0)(a0)
; CHECK-NEXT: vfadd.vv v8, v8, v16
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
; CHECK-NEXT: vmv.v.i v25, 0
; CHECK-NEXT: vfmv.v.f v25, ft0
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v8, v25
; CHECK-NEXT: vfmv.f.s ft0, v25
@ -585,10 +656,12 @@ declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
define double @vreduce_fadd_v2f64(<2 x double>* %x, double %s) {
; CHECK-LABEL: vreduce_fadd_v2f64:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, %hi(.LCPI32_0)
; CHECK-NEXT: fld ft0, %lo(.LCPI32_0)(a1)
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; CHECK-NEXT: vle64.v v25, (a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu
; CHECK-NEXT: vmv.v.i v26, 0
; CHECK-NEXT: vfmv.v.f v26, ft0
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v25, v26
; CHECK-NEXT: vfmv.f.s ft0, v25
@ -620,10 +693,12 @@ declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
define double @vreduce_fadd_v4f64(<4 x double>* %x, double %s) {
; CHECK-LABEL: vreduce_fadd_v4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, %hi(.LCPI34_0)
; CHECK-NEXT: fld ft0, %lo(.LCPI34_0)(a1)
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu
; CHECK-NEXT: vle64.v v26, (a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu
; CHECK-NEXT: vmv.v.i v25, 0
; CHECK-NEXT: vfmv.v.f v25, ft0
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v26, v25
; CHECK-NEXT: vfmv.f.s ft0, v25
@ -655,10 +730,12 @@ declare double @llvm.vector.reduce.fadd.v8f64(double, <8 x double>)
define double @vreduce_fadd_v8f64(<8 x double>* %x, double %s) {
; CHECK-LABEL: vreduce_fadd_v8f64:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, %hi(.LCPI36_0)
; CHECK-NEXT: fld ft0, %lo(.LCPI36_0)(a1)
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
; CHECK-NEXT: vle64.v v28, (a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu
; CHECK-NEXT: vmv.v.i v25, 0
; CHECK-NEXT: vfmv.v.f v25, ft0
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v28, v25
; CHECK-NEXT: vfmv.f.s ft0, v25
@ -690,10 +767,12 @@ declare double @llvm.vector.reduce.fadd.v16f64(double, <16 x double>)
define double @vreduce_fadd_v16f64(<16 x double>* %x, double %s) {
; CHECK-LABEL: vreduce_fadd_v16f64:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, %hi(.LCPI38_0)
; CHECK-NEXT: fld ft0, %lo(.LCPI38_0)(a1)
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu
; CHECK-NEXT: vle64.v v8, (a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu
; CHECK-NEXT: vmv.v.i v25, 0
; CHECK-NEXT: vfmv.v.f v25, ft0
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v8, v25
; CHECK-NEXT: vfmv.f.s ft0, v25
@ -729,9 +808,11 @@ define double @vreduce_fadd_v32f64(<32 x double>* %x, double %s) {
; CHECK-NEXT: vle64.v v8, (a0)
; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: vle64.v v16, (a0)
; CHECK-NEXT: lui a0, %hi(.LCPI40_0)
; CHECK-NEXT: fld ft0, %lo(.LCPI40_0)(a0)
; CHECK-NEXT: vfadd.vv v8, v8, v16
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu
; CHECK-NEXT: vmv.v.i v25, 0
; CHECK-NEXT: vfmv.v.f v25, ft0
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v8, v25
; CHECK-NEXT: vfmv.f.s ft0, v25

View File

@ -9,8 +9,10 @@ declare half @llvm.vector.reduce.fadd.nxv1f16(half, <vscale x 1 x half>)
define half @vreduce_fadd_nxv1f16(<vscale x 1 x half> %v, half %s) {
; CHECK-LABEL: vreduce_fadd_nxv1f16:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI0_0)
; CHECK-NEXT: flh ft0, %lo(.LCPI0_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu
; CHECK-NEXT: vmv.v.i v25, 0
; CHECK-NEXT: vfmv.v.f v25, ft0
; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v8, v25
; CHECK-NEXT: vfmv.f.s ft0, v25
@ -38,8 +40,10 @@ declare half @llvm.vector.reduce.fadd.nxv2f16(half, <vscale x 2 x half>)
define half @vreduce_fadd_nxv2f16(<vscale x 2 x half> %v, half %s) {
; CHECK-LABEL: vreduce_fadd_nxv2f16:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI2_0)
; CHECK-NEXT: flh ft0, %lo(.LCPI2_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu
; CHECK-NEXT: vmv.v.i v25, 0
; CHECK-NEXT: vfmv.v.f v25, ft0
; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v8, v25
; CHECK-NEXT: vfmv.f.s ft0, v25
@ -67,8 +71,10 @@ declare half @llvm.vector.reduce.fadd.nxv4f16(half, <vscale x 4 x half>)
define half @vreduce_fadd_nxv4f16(<vscale x 4 x half> %v, half %s) {
; CHECK-LABEL: vreduce_fadd_nxv4f16:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI4_0)
; CHECK-NEXT: flh ft0, %lo(.LCPI4_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu
; CHECK-NEXT: vmv.v.i v25, 0
; CHECK-NEXT: vfmv.v.f v25, ft0
; CHECK-NEXT: vfredsum.vs v25, v8, v25
; CHECK-NEXT: vfmv.f.s ft0, v25
; CHECK-NEXT: fadd.h fa0, fa0, ft0
@ -94,8 +100,10 @@ declare float @llvm.vector.reduce.fadd.nxv1f32(float, <vscale x 1 x float>)
define float @vreduce_fadd_nxv1f32(<vscale x 1 x float> %v, float %s) {
; CHECK-LABEL: vreduce_fadd_nxv1f32:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI6_0)
; CHECK-NEXT: flw ft0, %lo(.LCPI6_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
; CHECK-NEXT: vmv.v.i v25, 0
; CHECK-NEXT: vfmv.v.f v25, ft0
; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v8, v25
; CHECK-NEXT: vfmv.f.s ft0, v25
@ -123,8 +131,10 @@ declare float @llvm.vector.reduce.fadd.nxv2f32(float, <vscale x 2 x float>)
define float @vreduce_fadd_nxv2f32(<vscale x 2 x float> %v, float %s) {
; CHECK-LABEL: vreduce_fadd_nxv2f32:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI8_0)
; CHECK-NEXT: flw ft0, %lo(.LCPI8_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
; CHECK-NEXT: vmv.v.i v25, 0
; CHECK-NEXT: vfmv.v.f v25, ft0
; CHECK-NEXT: vfredsum.vs v25, v8, v25
; CHECK-NEXT: vfmv.f.s ft0, v25
; CHECK-NEXT: fadd.s fa0, fa0, ft0
@ -150,8 +160,10 @@ declare float @llvm.vector.reduce.fadd.nxv4f32(float, <vscale x 4 x float>)
define float @vreduce_fadd_nxv4f32(<vscale x 4 x float> %v, float %s) {
; CHECK-LABEL: vreduce_fadd_nxv4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI10_0)
; CHECK-NEXT: flw ft0, %lo(.LCPI10_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
; CHECK-NEXT: vmv.v.i v25, 0
; CHECK-NEXT: vfmv.v.f v25, ft0
; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v8, v25
; CHECK-NEXT: vfmv.f.s ft0, v25
@ -179,8 +191,10 @@ declare double @llvm.vector.reduce.fadd.nxv1f64(double, <vscale x 1 x double>)
define double @vreduce_fadd_nxv1f64(<vscale x 1 x double> %v, double %s) {
; CHECK-LABEL: vreduce_fadd_nxv1f64:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI12_0)
; CHECK-NEXT: fld ft0, %lo(.LCPI12_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu
; CHECK-NEXT: vmv.v.i v25, 0
; CHECK-NEXT: vfmv.v.f v25, ft0
; CHECK-NEXT: vfredsum.vs v25, v8, v25
; CHECK-NEXT: vfmv.f.s ft0, v25
; CHECK-NEXT: fadd.d fa0, fa0, ft0
@ -206,8 +220,10 @@ declare double @llvm.vector.reduce.fadd.nxv2f64(double, <vscale x 2 x double>)
define double @vreduce_fadd_nxv2f64(<vscale x 2 x double> %v, double %s) {
; CHECK-LABEL: vreduce_fadd_nxv2f64:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI14_0)
; CHECK-NEXT: fld ft0, %lo(.LCPI14_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu
; CHECK-NEXT: vmv.v.i v25, 0
; CHECK-NEXT: vfmv.v.f v25, ft0
; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v8, v25
; CHECK-NEXT: vfmv.f.s ft0, v25
@ -235,8 +251,10 @@ declare double @llvm.vector.reduce.fadd.nxv4f64(double, <vscale x 4 x double>)
define double @vreduce_fadd_nxv4f64(<vscale x 4 x double> %v, double %s) {
; CHECK-LABEL: vreduce_fadd_nxv4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI16_0)
; CHECK-NEXT: fld ft0, %lo(.LCPI16_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu
; CHECK-NEXT: vmv.v.i v25, 0
; CHECK-NEXT: vfmv.v.f v25, ft0
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu
; CHECK-NEXT: vfredsum.vs v25, v8, v25
; CHECK-NEXT: vfmv.f.s ft0, v25