[RISCV] Add -riscv-v-fixed-length-vector-elen-max to limit the ELEN used for fixed length vectorization.

This adds an ELEN limit for fixed length vectors. This will scalarize any elements larger than this. It will also disable some fractional LMULs. For example, if ELEN=32 then mf8 becomes illegal, i32/f32 vectors can't use any fractional LMULs, i16/f16 can only use mf2, and i8 can use mf2 and mf4. We may also need something for the scalable vectors, but that has interactions with the intrinsics and we can't scalarize a scalable vector. Longer term this should come from one of the Zve* features
2021-08-27 09:51:05 -07:00 · 2021-08-27 09:51:05 -07:00 · 0eeab8b282
parent 3ec634e65a
commit 0eeab8b282
5 changed files with 227 additions and 5 deletions
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@ -1204,8 +1204,10 @@ static bool useRVVForFixedLengthVectorVT(MVT VT,

  unsigned MinVLen = Subtarget.getMinRVVVectorSizeInBits();

+  MVT EltVT = VT.getVectorElementType();
+
  // Don't use RVV for vectors we cannot scalarize if required.
-  switch (VT.getVectorElementType().SimpleTy) {
+  switch (EltVT.SimpleTy) {
  // i1 is supported but has different rules.
  default:
    return false;
@ -1234,6 +1236,10 @@ static bool useRVVForFixedLengthVectorVT(MVT VT,
    break;
  }

+  // Reject elements larger than ELEN.
+  if (EltVT.getSizeInBits() > Subtarget.getMaxELENForFixedLengthVectors())
+    return false;
+
  unsigned LMul = divideCeil(VT.getSizeInBits(), MinVLen);
  // Don't use RVV for types that don't fit.
  if (LMul > Subtarget.getMaxLMULForFixedLengthVectors())
@ -1260,6 +1266,7 @@ static MVT getContainerForFixedLengthVector(const TargetLowering &TLI, MVT VT,
         "Expected legal fixed length vector!");

  unsigned MinVLen = Subtarget.getMinRVVVectorSizeInBits();
+  unsigned MaxELen = Subtarget.getMaxELENForFixedLengthVectors();

  MVT EltVT = VT.getVectorElementType();
  switch (EltVT.SimpleTy) {
@ -1274,10 +1281,12 @@ static MVT getContainerForFixedLengthVector(const TargetLowering &TLI, MVT VT,
  case MVT::f32:
  case MVT::f64: {
    // We prefer to use LMUL=1 for VLEN sized types. Use fractional lmuls for
-    // narrower types, but we can't have a fractional LMUL with demoninator less
-    // than 64/SEW.
+    // narrower types. The smallest fractional LMUL we support is 8/ELEN. Within
+    // each fractional LMUL we support SEW between 8 and LMUL*ELEN.
    unsigned NumElts =
-        divideCeil(VT.getVectorNumElements(), MinVLen / RISCV::RVVBitsPerBlock);
+        (VT.getVectorNumElements() * RISCV::RVVBitsPerBlock) / MinVLen;
+    NumElts = std::max(NumElts, RISCV::RVVBitsPerBlock / MaxELen);
+    assert(isPowerOf2_32(NumElts) && "Expected power of 2 NumElts");
    return MVT::getScalableVectorVT(EltVT, NumElts);
  }
  }
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
@ -45,6 +45,11 @@ static cl::opt<unsigned> RVVVectorLMULMax(
             "Fractional LMUL values are not supported."),
    cl::init(8), cl::Hidden);

+static cl::opt<unsigned> RVVVectorELENMax(
+    "riscv-v-fixed-length-vector-elen-max",
+    cl::desc("The maximum ELEN value to use for fixed length vectors."),
+    cl::init(64), cl::Hidden);
+
 void RISCVSubtarget::anchor() {}

 RISCVSubtarget &
@ -142,7 +147,18 @@ unsigned RISCVSubtarget::getMaxLMULForFixedLengthVectors() const {
         "Tried to get maximum LMUL without V extension support!");
  assert(RVVVectorLMULMax <= 8 && isPowerOf2_32(RVVVectorLMULMax) &&
         "V extension requires a LMUL to be at most 8 and a power of 2!");
-  return PowerOf2Floor(std::max<unsigned>(RVVVectorLMULMax, 1));
+  return PowerOf2Floor(
+      std::max<unsigned>(std::min<unsigned>(RVVVectorLMULMax, 8), 1));
+}
+
+unsigned RISCVSubtarget::getMaxELENForFixedLengthVectors() const {
+  assert(hasStdExtV() &&
+         "Tried to get maximum ELEN without V extension support!");
+  assert(RVVVectorELENMax <= 64 && RVVVectorELENMax >= 8 &&
+         isPowerOf2_32(RVVVectorELENMax) &&
+         "V extension requires a ELEN to be a power of 2 between 8 and 64!");
+  return PowerOf2Floor(
+      std::max<unsigned>(std::min<unsigned>(RVVVectorELENMax, 64), 8));
 }

 bool RISCVSubtarget::useRVVForFixedLengthVectors() const {
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@ -158,6 +158,7 @@ public:
  unsigned getMaxRVVVectorSizeInBits() const;
  unsigned getMinRVVVectorSizeInBits() const;
  unsigned getMaxLMULForFixedLengthVectors() const;
+  unsigned getMaxELENForFixedLengthVectors() const;
  bool useRVVForFixedLengthVectors() const;
 };
 } // End llvm namespace
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@ -104,6 +104,12 @@ public:
    if (isa<FixedVectorType>(DataType) && ST->getMinRVVVectorSizeInBits() == 0)
      return false;

+    // Don't allow elements larger than the ELEN.
+    // FIXME: How to limit for scalable vectors?
+    if (isa<FixedVectorType>(DataType) &&
+        DataType->getScalarSizeInBits() > ST->getMaxELENForFixedLengthVectors())
+      return false;
+
    if (Alignment <
        DL.getTypeStoreSize(DataType->getScalarType()).getFixedSize())
      return false;
@ -126,6 +132,12 @@ public:
    if (isa<FixedVectorType>(DataType) && ST->getMinRVVVectorSizeInBits() == 0)
      return false;

+    // Don't allow elements larger than the ELEN.
+    // FIXME: How to limit for scalable vectors?
+    if (isa<FixedVectorType>(DataType) &&
+        DataType->getScalarSizeInBits() > ST->getMaxELENForFixedLengthVectors())
+      return false;
+
    if (Alignment <
        DL.getTypeStoreSize(DataType->getScalarType()).getFixedSize())
      return false;
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
@ -0,0 +1,184 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+d,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-elen-max=32 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-elen-max=32 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+
+; Test that limiting ELEN, scalarizes elements larger than that and disables
+; some fractional LMULs.
+
+; This should use LMUL=1.
+define void @add_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
+; CHECK-LABEL: add_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
+; CHECK-NEXT:    vle32.v v25, (a0)
+; CHECK-NEXT:    vle32.v v26, (a1)
+; CHECK-NEXT:    vadd.vv v25, v25, v26
+; CHECK-NEXT:    vse32.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i32>, <4 x i32>* %x
+  %b = load <4 x i32>, <4 x i32>* %y
+  %c = add <4 x i32> %a, %b
+  store <4 x i32> %c, <4 x i32>* %x
+  ret void
+}
+
+; i64 vectors should be scalarized
+define void @add_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
+; RV32-LABEL: add_v2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a2, 8(a0)
+; RV32-NEXT:    lw a6, 12(a0)
+; RV32-NEXT:    lw a4, 0(a0)
+; RV32-NEXT:    lw a7, 4(a0)
+; RV32-NEXT:    lw a3, 4(a1)
+; RV32-NEXT:    lw a5, 0(a1)
+; RV32-NEXT:    lw t0, 8(a1)
+; RV32-NEXT:    lw a1, 12(a1)
+; RV32-NEXT:    add a3, a7, a3
+; RV32-NEXT:    add a5, a4, a5
+; RV32-NEXT:    sltu a4, a5, a4
+; RV32-NEXT:    add a3, a3, a4
+; RV32-NEXT:    add a1, a6, a1
+; RV32-NEXT:    add a4, a2, t0
+; RV32-NEXT:    sltu a2, a4, a2
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    sw a4, 8(a0)
+; RV32-NEXT:    sw a5, 0(a0)
+; RV32-NEXT:    sw a1, 12(a0)
+; RV32-NEXT:    sw a3, 4(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: add_v2i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ld a2, 8(a0)
+; RV64-NEXT:    ld a3, 0(a0)
+; RV64-NEXT:    ld a4, 0(a1)
+; RV64-NEXT:    ld a1, 8(a1)
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    sd a1, 8(a0)
+; RV64-NEXT:    sd a3, 0(a0)
+; RV64-NEXT:    ret
+  %a = load <2 x i64>, <2 x i64>* %x
+  %b = load <2 x i64>, <2 x i64>* %y
+  %c = add <2 x i64> %a, %b
+  store <2 x i64> %c, <2 x i64>* %x
+  ret void
+}
+
+; This should use LMUL=1 becuase there are no fractional i32 LMULs with ELEN=32
+define void @add_v2i32(<2 x i32>* %x, <2 x i32>* %y) {
+; CHECK-LABEL: add_v2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
+; CHECK-NEXT:    vle32.v v25, (a0)
+; CHECK-NEXT:    vle32.v v26, (a1)
+; CHECK-NEXT:    vadd.vv v25, v25, v26
+; CHECK-NEXT:    vse32.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load <2 x i32>, <2 x i32>* %y
+  %c = add <2 x i32> %a, %b
+  store <2 x i32> %c, <2 x i32>* %x
+  ret void
+}
+
+; i64 vectors should be scalarized
+define void @add_v1i64(<1 x i64>* %x, <1 x i64>* %y) {
+; RV32-LABEL: add_v1i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a2, 0(a0)
+; RV32-NEXT:    lw a3, 4(a0)
+; RV32-NEXT:    lw a4, 4(a1)
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    add a3, a3, a4
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    sltu a2, a1, a2
+; RV32-NEXT:    add a2, a3, a2
+; RV32-NEXT:    sw a1, 0(a0)
+; RV32-NEXT:    sw a2, 4(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: add_v1i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ld a2, 0(a0)
+; RV64-NEXT:    ld a1, 0(a1)
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    sd a1, 0(a0)
+; RV64-NEXT:    ret
+  %a = load <1 x i64>, <1 x i64>* %x
+  %b = load <1 x i64>, <1 x i64>* %y
+  %c = add <1 x i64> %a, %b
+  store <1 x i64> %c, <1 x i64>* %x
+  ret void
+}
+
+; This should use LMUL=1.
+define void @fadd_v4f32(<4 x float>* %x, <4 x float>* %y) {
+; CHECK-LABEL: fadd_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
+; CHECK-NEXT:    vle32.v v25, (a0)
+; CHECK-NEXT:    vle32.v v26, (a1)
+; CHECK-NEXT:    vfadd.vv v25, v25, v26
+; CHECK-NEXT:    vse32.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x float>, <4 x float>* %x
+  %b = load <4 x float>, <4 x float>* %y
+  %c = fadd <4 x float> %a, %b
+  store <4 x float> %c, <4 x float>* %x
+  ret void
+}
+
+; double vectors should be scalarized
+define void @fadd_v2f64(<2 x double>* %x, <2 x double>* %y) {
+; CHECK-LABEL: fadd_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fld ft0, 8(a0)
+; CHECK-NEXT:    fld ft1, 0(a0)
+; CHECK-NEXT:    fld ft2, 0(a1)
+; CHECK-NEXT:    fld ft3, 8(a1)
+; CHECK-NEXT:    fadd.d ft1, ft1, ft2
+; CHECK-NEXT:    fadd.d ft0, ft0, ft3
+; CHECK-NEXT:    fsd ft0, 8(a0)
+; CHECK-NEXT:    fsd ft1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x double>, <2 x double>* %x
+  %b = load <2 x double>, <2 x double>* %y
+  %c = fadd <2 x double> %a, %b
+  store <2 x double> %c, <2 x double>* %x
+  ret void
+}
+
+; This should use LMUL=1 becuase there are no fractional float LMULs with ELEN=32
+define void @fadd_v2f32(<2 x float>* %x, <2 x float>* %y) {
+; CHECK-LABEL: fadd_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
+; CHECK-NEXT:    vle32.v v25, (a0)
+; CHECK-NEXT:    vle32.v v26, (a1)
+; CHECK-NEXT:    vfadd.vv v25, v25, v26
+; CHECK-NEXT:    vse32.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x float>, <2 x float>* %x
+  %b = load <2 x float>, <2 x float>* %y
+  %c = fadd <2 x float> %a, %b
+  store <2 x float> %c, <2 x float>* %x
+  ret void
+}
+
+; double vectors should be scalarized
+define void @fadd_v1f64(<1 x double>* %x, <1 x double>* %y) {
+; CHECK-LABEL: fadd_v1f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fld ft0, 0(a0)
+; CHECK-NEXT:    fld ft1, 0(a1)
+; CHECK-NEXT:    fadd.d ft0, ft0, ft1
+; CHECK-NEXT:    fsd ft0, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <1 x double>, <1 x double>* %x
+  %b = load <1 x double>, <1 x double>* %y
+  %c = fadd <1 x double> %a, %b
+  store <1 x double> %c, <1 x double>* %x
+  ret void
+}