[VE] Packed 32/64bit broadcast isel and tests

Packed-mode broadcast of f32/i32 requires the subregister to first be
replicated across the full I64 register. Add repl_i32 and repl_f32 to
facilitate this.

Reviewed By: kaz7

Differential Revision: https://reviews.llvm.org/D117878
This commit is contained in:
Simon Moll 2022-01-26 10:32:26 +01:00
parent 0984aa70da
commit 5ceb0bc7ea
7 changed files with 136 additions and 10 deletions

View File

@ -19,6 +19,14 @@
namespace llvm {
static const int StandardVectorWidth = 256;
/// \returns true iff \p SomeVT is a vector type holding more elements than
/// StandardVectorWidth. Non-vector types are never reported as packed.
bool isPackedVectorType(EVT SomeVT) {
  // The element count is only queried once we know this is a vector.
  return SomeVT.isVector() &&
         SomeVT.getVectorNumElements() > StandardVectorWidth;
}
/// \returns the VVP_* SDNode opcode corresponding to \p Opcode.
Optional<unsigned> getVVPOpcode(unsigned Opcode) {
switch (Opcode) {
@ -51,6 +59,22 @@ SDValue VECustomDAG::getConstant(uint64_t Val, EVT VT, bool IsTarget,
/// Build a VEISD::VEC_BROADCAST node of type \p ResultVT from \p Scalar and
/// the vector length operand \p AVL.
///
/// When \p ResultVT is a packed vector type (see isPackedVectorType), an f32
/// or i32 scalar is first wrapped in a REPL_F32 / REPL_I32 node that yields an
/// i64. Scalars of any other type are forwarded unchanged.
SDValue VECustomDAG::getBroadcast(EVT ResultVT, SDValue Scalar,
                                  SDValue AVL) const {
  assert(ResultVT.isVector());
  const EVT ScalarVT = Scalar.getValueType();
  assert(ScalarVT != MVT::i1 && "TODO: Mask broadcasts");

  // Regular (non-packed) broadcasts consume the scalar as-is.
  if (!isPackedVectorType(ResultVT))
    return getNode(VEISD::VEC_BROADCAST, ResultVT, {Scalar, AVL});

  // v512x packed mode broadcast: replicate the 32-bit scalar (f32 or i32)
  // onto the opposing half of the full scalar register. An I64-typed scalar
  // is assumed to have been replicated already and is left untouched.
  SDValue Replicated = Scalar;
  if (ScalarVT == MVT::f32)
    Replicated = getNode(VEISD::REPL_F32, MVT::i64, Scalar);
  else if (ScalarVT == MVT::i32)
    Replicated = getNode(VEISD::REPL_I32, MVT::i64, Scalar);

  return getNode(VEISD::VEC_BROADCAST, ResultVT, {Replicated, AVL});
}

View File

@ -25,6 +25,8 @@ Optional<unsigned> getVVPOpcode(unsigned Opcode);
bool isVVPBinaryOp(unsigned Opcode);
bool isPackedVectorType(EVT SomeVT);
class VECustomDAG {
SelectionDAG &DAG;
SDLoc DL;

View File

@ -11,9 +11,9 @@
//
//===----------------------------------------------------------------------===//
#include "VECustomDAG.h"
#include "VEISelLowering.h"
#include "MCTargetDesc/VEMCExpr.h"
#include "VECustomDAG.h"
#include "VEInstrBuilder.h"
#include "VEMachineFunctionInfo.h"
#include "VERegisterInfo.h"
@ -899,6 +899,8 @@ const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const {
TARGET_NODE_CASE(RET_FLAG)
TARGET_NODE_CASE(TS1AM)
TARGET_NODE_CASE(VEC_BROADCAST)
TARGET_NODE_CASE(REPL_I32)
TARGET_NODE_CASE(REPL_F32)
// Register the VVP_* SDNodes.
#define ADD_VVP_OP(VVP_NAME, ...) TARGET_NODE_CASE(VVP_NAME)
@ -1642,8 +1644,7 @@ static SDValue getSplatValue(SDNode *N) {
SDValue VETargetLowering::lowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
VECustomDAG CDAG(DAG, Op);
unsigned NumEls = Op.getValueType().getVectorNumElements();
MVT ElemVT = Op.getSimpleValueType().getVectorElementType();
MVT ResultVT = Op.getSimpleValueType();
// If there is just one element, expand to INSERT_VECTOR_ELT.
unsigned UniqueIdx;
@ -1651,17 +1652,17 @@ SDValue VETargetLowering::lowerBUILD_VECTOR(SDValue Op,
SDValue AccuV = CDAG.getUNDEF(Op.getValueType());
auto ElemV = Op->getOperand(UniqueIdx);
SDValue IdxV = CDAG.getConstant(UniqueIdx, MVT::i64);
return CDAG.getNode(ISD::INSERT_VECTOR_ELT, Op.getValueType(),
{AccuV, ElemV, IdxV});
return CDAG.getNode(ISD::INSERT_VECTOR_ELT, ResultVT, {AccuV, ElemV, IdxV});
}
// Else emit a broadcast.
if (SDValue ScalarV = getSplatValue(Op.getNode())) {
// lower to VEC_BROADCAST
MVT LegalResVT = MVT::getVectorVT(ElemVT, 256);
auto AVL = CDAG.getConstant(NumEls, MVT::i32);
return CDAG.getBroadcast(LegalResVT, Op.getOperand(0), AVL);
unsigned NumEls = ResultVT.getVectorNumElements();
// TODO: Legalize packed-mode AVL.
// For now, cap the AVL at 256.
auto CappedLength = std::min<unsigned>(256, NumEls);
auto AVL = CDAG.getConstant(CappedLength, MVT::i32);
return CDAG.getBroadcast(ResultVT, Op.getOperand(0), AVL);
}
// Expand

View File

@ -40,6 +40,8 @@ enum NodeType : unsigned {
TS1AM, // A TS1AM instruction used for 1/2 bytes swap.
VEC_BROADCAST, // A vector broadcast instruction.
// 0: scalar value, 1: VL
REPL_I32,
REPL_F32, // Replicate subregister to other half.
// VVP_* nodes.
#define ADD_VVP_OP(VVP_NAME, ...) VVP_NAME,

View File

@ -1576,6 +1576,12 @@ def f2l : OutPatFrag<(ops node:$exp),
def l2f : OutPatFrag<(ops node:$exp),
(EXTRACT_SUBREG $exp, sub_f32)>;
// Zero out subregisters.
// Output fragment that ANDs $expr with the mask immediate 32 (ANDrm).
// NOTE(review): presumably this immediate selects the mask that keeps the
// upper 32 bits and clears the lower 32 bits (the i32 sub-register) --
// confirm against the VE mask-immediate encoding.
def zero_i32 : OutPatFrag<(ops node:$expr),
(ANDrm $expr, 32)>;
// Output fragment that ANDs $expr with the mask immediate 32 + 64 (ANDrm).
// NOTE(review): presumably this immediate selects the mask that keeps the
// lower 32 bits and clears the upper 32 bits (the f32 sub-register) --
// confirm against the VE mask-immediate encoding.
def zero_f32 : OutPatFrag<(ops node:$expr),
(ANDrm $expr, !add(32, 64))>;
// Small immediates.
def : Pat<(i32 simm7:$val), (EXTRACT_SUBREG (ORim (LO7 $val), 0), sub_i32)>;
def : Pat<(i64 simm7:$val), (ORim (LO7 $val), 0)>;
@ -2287,6 +2293,16 @@ class IsVLVT<int OpIdx> : SDTCisVT<OpIdx,i32>;
def vec_broadcast : SDNode<"VEISD::VEC_BROADCAST", SDTypeProfile<1, 2,
[SDTCisVec<0>, IsVLVT<2>]>>;
// f32 scalar replication: takes one floating-point operand and produces one
// integer result in which the 32-bit value is replicated to the other half.
// NOTE(review): the isel pattern for this node shifts the source right by 32
// (SRLri), so the data presumably moves from the upper 32 bits to the lower
// 32 bits -- confirm the direction.
def repl_f32 : SDNode<"VEISD::REPL_F32",
SDTypeProfile<1, 1,
[SDTCisInt<0>, SDTCisFP<1>]>>;
// i32 scalar replication: takes one integer operand and produces one integer
// result in which the 32-bit value is replicated to the other half.
// NOTE(review): the isel pattern for this node shifts the source left by 32
// (SLLri), so the data presumably moves from the lower 32 bits to the upper
// 32 bits -- confirm the direction.
def repl_i32 : SDNode<"VEISD::REPL_I32",
SDTypeProfile<1, 1,
[SDTCisInt<0>, SDTCisInt<1>]>>;
// Whether this is an all-true mask (assuming undef-bits above VL are all-true).
def true_mask : PatLeaf<
(vec_broadcast (i32 nonzero), (i32 srcvalue))>;

View File

@ -15,6 +15,17 @@
// Instruction format superclass
//===----------------------------------------------------------------------===//
// Sub-register replication for packed broadcast.
// repl_f32 on an f32 value selects to an i64 built by OR-ing (ORrr) two
// terms: the source widened via f2l and shifted right by 32 (SRLri), and the
// same widened source with zero_i32 applied.
def: Pat<(i64 (repl_f32 f32:$val)),
(ORrr
(SRLri (f2l $val), 32),
(zero_i32 (f2l $val)))>;
// repl_i32 on an i32 value selects to an i64 built by OR-ing (ORrr) two
// terms: the source widened via i2l with zero_f32 applied, and the same
// widened source shifted left by 32 (SLLri).
def: Pat<(i64 (repl_i32 i32:$val)),
(ORrr
(zero_f32 (i2l $val)),
(SLLri (i2l $val), 32))>;
multiclass vbrd_elem32<ValueType v32, ValueType s32, SDPatternOperator ImmOp,
SDNodeXForm ImmCast, OutPatFrag SuperRegCast> {
// VBRDil
@ -89,3 +100,8 @@ defm : patterns_elem32<v256f32, f32, simm7fp, LO7FP, l2f, f2l>;
defm : patterns_elem64<v256i64, i64, simm7, LO7>;
defm : patterns_elem64<v256f64, f64, simm7fp, LO7FP>;
// Packed-mode broadcasts: instantiate vbrd_elem64 for the v512i32 and v512f32
// vector types, once with an i64 scalar operand (simm7 / LO7 immediates) and
// once with an f64 scalar operand (simm7fp / LO7FP immediates).
defm : vbrd_elem64<v512i32, i64, simm7, LO7>;
defm : vbrd_elem64<v512f32, i64, simm7, LO7>;
defm : vbrd_elem64<v512i32, f64, simm7fp, LO7FP>;
defm : vbrd_elem64<v512f32, f64, simm7fp, LO7FP>;

View File

@ -0,0 +1,65 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
; Splat the i32 argument %s across all lanes of a <512 x i32> vector
; (insertelement into lane 0, then shufflevector with an all-zero mask).
define fastcc <512 x i32> @brd_v512i32(i32 %s) {
; CHECK-LABEL: brd_v512i32:
; CHECK: # %bb.0:
; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: sll %s1, %s0, 32
; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: or %s0, %s0, %s1
; CHECK-NEXT: lea %s1, 256
; CHECK-NEXT: lvl %s1
; CHECK-NEXT: vbrd %v0, %s0
; CHECK-NEXT: b.l.t (, %s10)
%val = insertelement <512 x i32> undef, i32 %s, i32 0
%ret = shufflevector <512 x i32> %val, <512 x i32> undef, <512 x i32> zeroinitializer
ret <512 x i32> %ret
}
; Splat the immediate i32 constant 17 across all lanes of a <512 x i32>
; vector (insertelement into lane 0, then shufflevector with an all-zero mask).
define fastcc <512 x i32> @brdi_v512i32() {
; CHECK-LABEL: brdi_v512i32:
; CHECK: # %bb.0:
; CHECK-NEXT: or %s0, 17, (0)1
; CHECK-NEXT: sll %s1, %s0, 32
; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: or %s0, %s0, %s1
; CHECK-NEXT: lea %s1, 256
; CHECK-NEXT: lvl %s1
; CHECK-NEXT: vbrd %v0, %s0
; CHECK-NEXT: b.l.t (, %s10)
%val = insertelement <512 x i32> undef, i32 17, i32 0
%ret = shufflevector <512 x i32> %val, <512 x i32> undef, <512 x i32> zeroinitializer
ret <512 x i32> %ret
}
; Splat the float argument %s across all lanes of a <512 x float> vector
; (insertelement into lane 0, then shufflevector with an all-zero mask).
define fastcc <512 x float> @brd_v512f32(float %s) {
; CHECK-LABEL: brd_v512f32:
; CHECK: # %bb.0:
; CHECK-NEXT: and %s1, %s0, (32)1
; CHECK-NEXT: srl %s0, %s0, 32
; CHECK-NEXT: or %s0, %s0, %s1
; CHECK-NEXT: lea %s1, 256
; CHECK-NEXT: lvl %s1
; CHECK-NEXT: vbrd %v0, %s0
; CHECK-NEXT: b.l.t (, %s10)
%val = insertelement <512 x float> undef, float %s, i32 0
%ret = shufflevector <512 x float> %val, <512 x float> undef, <512 x i32> zeroinitializer
ret <512 x float> %ret
}
; Splat the float constant 0.0 across all lanes of a <512 x float> vector
; (insertelement into lane 0, then shufflevector with an all-zero mask).
define fastcc <512 x float> @brdi_v512f32() {
; CHECK-LABEL: brdi_v512f32:
; CHECK: # %bb.0:
; CHECK-NEXT: lea.sl %s0, 0
; CHECK-NEXT: and %s1, %s0, (32)1
; CHECK-NEXT: srl %s0, %s0, 32
; CHECK-NEXT: or %s0, %s0, %s1
; CHECK-NEXT: lea %s1, 256
; CHECK-NEXT: lvl %s1
; CHECK-NEXT: vbrd %v0, %s0
; CHECK-NEXT: b.l.t (, %s10)
%val = insertelement <512 x float> undef, float 0.e+00, i32 0
%ret = shufflevector <512 x float> %val, <512 x float> undef, <512 x i32> zeroinitializer
ret <512 x float> %ret
}