forked from OSchip/llvm-project
[VE] Packed 32/64bit broadcast isel and tests
Packed-mode broadcast of f32/i32 requires the subregister to be replicated to the full I64 register beforehand. Add repl_i32 and repl_f32 to facilitate this. Reviewed By: kaz7 Differential Revision: https://reviews.llvm.org/D117878
This commit is contained in:
parent
0984aa70da
commit
5ceb0bc7ea
|
@ -19,6 +19,14 @@
|
|||
|
||||
namespace llvm {
|
||||
|
||||
static const int StandardVectorWidth = 256;
|
||||
|
||||
bool isPackedVectorType(EVT SomeVT) {
|
||||
if (!SomeVT.isVector())
|
||||
return false;
|
||||
return SomeVT.getVectorNumElements() > StandardVectorWidth;
|
||||
}
|
||||
|
||||
/// \returns the VVP_* SDNode opcode corresponding to \p OC.
|
||||
Optional<unsigned> getVVPOpcode(unsigned Opcode) {
|
||||
switch (Opcode) {
|
||||
|
@ -51,6 +59,22 @@ SDValue VECustomDAG::getConstant(uint64_t Val, EVT VT, bool IsTarget,
|
|||
|
||||
SDValue VECustomDAG::getBroadcast(EVT ResultVT, SDValue Scalar,
|
||||
SDValue AVL) const {
|
||||
assert(ResultVT.isVector());
|
||||
auto ScaVT = Scalar.getValueType();
|
||||
assert(ScaVT != MVT::i1 && "TODO: Mask broadcasts");
|
||||
|
||||
if (isPackedVectorType(ResultVT)) {
|
||||
// v512x packed mode broadcast
|
||||
// Replicate the scalar reg (f32 or i32) onto the opposing half of the full
|
||||
// scalar register. If it's an I64 type, assume that this has already
|
||||
// happened.
|
||||
if (ScaVT == MVT::f32) {
|
||||
Scalar = getNode(VEISD::REPL_F32, MVT::i64, Scalar);
|
||||
} else if (ScaVT == MVT::i32) {
|
||||
Scalar = getNode(VEISD::REPL_I32, MVT::i64, Scalar);
|
||||
}
|
||||
}
|
||||
|
||||
return getNode(VEISD::VEC_BROADCAST, ResultVT, {Scalar, AVL});
|
||||
}
|
||||
|
||||
|
|
|
@ -25,6 +25,8 @@ Optional<unsigned> getVVPOpcode(unsigned Opcode);
|
|||
|
||||
bool isVVPBinaryOp(unsigned Opcode);
|
||||
|
||||
bool isPackedVectorType(EVT SomeVT);
|
||||
|
||||
class VECustomDAG {
|
||||
SelectionDAG &DAG;
|
||||
SDLoc DL;
|
||||
|
|
|
@ -11,9 +11,9 @@
|
|||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "VECustomDAG.h"
|
||||
#include "VEISelLowering.h"
|
||||
#include "MCTargetDesc/VEMCExpr.h"
|
||||
#include "VECustomDAG.h"
|
||||
#include "VEInstrBuilder.h"
|
||||
#include "VEMachineFunctionInfo.h"
|
||||
#include "VERegisterInfo.h"
|
||||
|
@ -899,6 +899,8 @@ const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const {
|
|||
TARGET_NODE_CASE(RET_FLAG)
|
||||
TARGET_NODE_CASE(TS1AM)
|
||||
TARGET_NODE_CASE(VEC_BROADCAST)
|
||||
TARGET_NODE_CASE(REPL_I32)
|
||||
TARGET_NODE_CASE(REPL_F32)
|
||||
|
||||
// Register the VVP_* SDNodes.
|
||||
#define ADD_VVP_OP(VVP_NAME, ...) TARGET_NODE_CASE(VVP_NAME)
|
||||
|
@ -1642,8 +1644,7 @@ static SDValue getSplatValue(SDNode *N) {
|
|||
SDValue VETargetLowering::lowerBUILD_VECTOR(SDValue Op,
|
||||
SelectionDAG &DAG) const {
|
||||
VECustomDAG CDAG(DAG, Op);
|
||||
unsigned NumEls = Op.getValueType().getVectorNumElements();
|
||||
MVT ElemVT = Op.getSimpleValueType().getVectorElementType();
|
||||
MVT ResultVT = Op.getSimpleValueType();
|
||||
|
||||
// If there is just one element, expand to INSERT_VECTOR_ELT.
|
||||
unsigned UniqueIdx;
|
||||
|
@ -1651,17 +1652,17 @@ SDValue VETargetLowering::lowerBUILD_VECTOR(SDValue Op,
|
|||
SDValue AccuV = CDAG.getUNDEF(Op.getValueType());
|
||||
auto ElemV = Op->getOperand(UniqueIdx);
|
||||
SDValue IdxV = CDAG.getConstant(UniqueIdx, MVT::i64);
|
||||
return CDAG.getNode(ISD::INSERT_VECTOR_ELT, Op.getValueType(),
|
||||
{AccuV, ElemV, IdxV});
|
||||
return CDAG.getNode(ISD::INSERT_VECTOR_ELT, ResultVT, {AccuV, ElemV, IdxV});
|
||||
}
|
||||
|
||||
// Else emit a broadcast.
|
||||
if (SDValue ScalarV = getSplatValue(Op.getNode())) {
|
||||
// lower to VEC_BROADCAST
|
||||
MVT LegalResVT = MVT::getVectorVT(ElemVT, 256);
|
||||
|
||||
auto AVL = CDAG.getConstant(NumEls, MVT::i32);
|
||||
return CDAG.getBroadcast(LegalResVT, Op.getOperand(0), AVL);
|
||||
unsigned NumEls = ResultVT.getVectorNumElements();
|
||||
// TODO: Legalize packed-mode AVL.
|
||||
// For now, cap the AVL at 256.
|
||||
auto CappedLength = std::min<unsigned>(256, NumEls);
|
||||
auto AVL = CDAG.getConstant(CappedLength, MVT::i32);
|
||||
return CDAG.getBroadcast(ResultVT, Op.getOperand(0), AVL);
|
||||
}
|
||||
|
||||
// Expand
|
||||
|
|
|
@ -40,6 +40,8 @@ enum NodeType : unsigned {
|
|||
TS1AM, // A TS1AM instruction used for 1/2 bytes swap.
|
||||
VEC_BROADCAST, // A vector broadcast instruction.
|
||||
// 0: scalar value, 1: VL
|
||||
REPL_I32,
|
||||
REPL_F32, // Replicate subregister to other half.
|
||||
|
||||
// VVP_* nodes.
|
||||
#define ADD_VVP_OP(VVP_NAME, ...) VVP_NAME,
|
||||
|
|
|
@ -1576,6 +1576,12 @@ def f2l : OutPatFrag<(ops node:$exp),
|
|||
def l2f : OutPatFrag<(ops node:$exp),
|
||||
(EXTRACT_SUBREG $exp, sub_f32)>;
|
||||
|
||||
// Zero out subregisters.
|
||||
def zero_i32 : OutPatFrag<(ops node:$expr),
|
||||
(ANDrm $expr, 32)>;
|
||||
def zero_f32 : OutPatFrag<(ops node:$expr),
|
||||
(ANDrm $expr, !add(32, 64))>;
|
||||
|
||||
// Small immediates.
|
||||
def : Pat<(i32 simm7:$val), (EXTRACT_SUBREG (ORim (LO7 $val), 0), sub_i32)>;
|
||||
def : Pat<(i64 simm7:$val), (ORim (LO7 $val), 0)>;
|
||||
|
@ -2287,6 +2293,16 @@ class IsVLVT<int OpIdx> : SDTCisVT<OpIdx,i32>;
|
|||
def vec_broadcast : SDNode<"VEISD::VEC_BROADCAST", SDTypeProfile<1, 2,
|
||||
[SDTCisVec<0>, IsVLVT<2>]>>;
|
||||
|
||||
// replicate upper 32bit to lower 32bit (f32 scalar replication).
|
||||
def repl_f32 : SDNode<"VEISD::REPL_F32",
|
||||
SDTypeProfile<1, 1,
|
||||
[SDTCisInt<0>, SDTCisFP<1>]>>;
|
||||
// replicate lower 32bit to upper 32bit (i32 scalar replication).
|
||||
def repl_i32 : SDNode<"VEISD::REPL_I32",
|
||||
SDTypeProfile<1, 1,
|
||||
[SDTCisInt<0>, SDTCisInt<1>]>>;
|
||||
|
||||
|
||||
// Whether this is an all-true mask (assuming undef-bits above VL are all-true).
|
||||
def true_mask : PatLeaf<
|
||||
(vec_broadcast (i32 nonzero), (i32 srcvalue))>;
|
||||
|
|
|
@ -15,6 +15,17 @@
|
|||
// Instruction format superclass
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// Sub-register replication for packed broadcast.
|
||||
def: Pat<(i64 (repl_f32 f32:$val)),
|
||||
(ORrr
|
||||
(SRLri (f2l $val), 32),
|
||||
(zero_i32 (f2l $val)))>;
|
||||
def: Pat<(i64 (repl_i32 i32:$val)),
|
||||
(ORrr
|
||||
(zero_f32 (i2l $val)),
|
||||
(SLLri (i2l $val), 32))>;
|
||||
|
||||
|
||||
multiclass vbrd_elem32<ValueType v32, ValueType s32, SDPatternOperator ImmOp,
|
||||
SDNodeXForm ImmCast, OutPatFrag SuperRegCast> {
|
||||
// VBRDil
|
||||
|
@ -89,3 +100,8 @@ defm : patterns_elem32<v256f32, f32, simm7fp, LO7FP, l2f, f2l>;
|
|||
|
||||
defm : patterns_elem64<v256i64, i64, simm7, LO7>;
|
||||
defm : patterns_elem64<v256f64, f64, simm7fp, LO7FP>;
|
||||
|
||||
defm : vbrd_elem64<v512i32, i64, simm7, LO7>;
|
||||
defm : vbrd_elem64<v512f32, i64, simm7, LO7>;
|
||||
defm : vbrd_elem64<v512i32, f64, simm7fp, LO7FP>;
|
||||
defm : vbrd_elem64<v512f32, f64, simm7fp, LO7FP>;
|
||||
|
|
|
@ -0,0 +1,65 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
|
||||
|
||||
define fastcc <512 x i32> @brd_v512i32(i32 %s) {
|
||||
; CHECK-LABEL: brd_v512i32:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: and %s0, %s0, (32)0
|
||||
; CHECK-NEXT: sll %s1, %s0, 32
|
||||
; CHECK-NEXT: and %s0, %s0, (32)0
|
||||
; CHECK-NEXT: or %s0, %s0, %s1
|
||||
; CHECK-NEXT: lea %s1, 256
|
||||
; CHECK-NEXT: lvl %s1
|
||||
; CHECK-NEXT: vbrd %v0, %s0
|
||||
; CHECK-NEXT: b.l.t (, %s10)
|
||||
%val = insertelement <512 x i32> undef, i32 %s, i32 0
|
||||
%ret = shufflevector <512 x i32> %val, <512 x i32> undef, <512 x i32> zeroinitializer
|
||||
ret <512 x i32> %ret
|
||||
}
|
||||
|
||||
define fastcc <512 x i32> @brdi_v512i32() {
|
||||
; CHECK-LABEL: brdi_v512i32:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: or %s0, 17, (0)1
|
||||
; CHECK-NEXT: sll %s1, %s0, 32
|
||||
; CHECK-NEXT: and %s0, %s0, (32)0
|
||||
; CHECK-NEXT: or %s0, %s0, %s1
|
||||
; CHECK-NEXT: lea %s1, 256
|
||||
; CHECK-NEXT: lvl %s1
|
||||
; CHECK-NEXT: vbrd %v0, %s0
|
||||
; CHECK-NEXT: b.l.t (, %s10)
|
||||
%val = insertelement <512 x i32> undef, i32 17, i32 0
|
||||
%ret = shufflevector <512 x i32> %val, <512 x i32> undef, <512 x i32> zeroinitializer
|
||||
ret <512 x i32> %ret
|
||||
}
|
||||
|
||||
define fastcc <512 x float> @brd_v512f32(float %s) {
|
||||
; CHECK-LABEL: brd_v512f32:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: and %s1, %s0, (32)1
|
||||
; CHECK-NEXT: srl %s0, %s0, 32
|
||||
; CHECK-NEXT: or %s0, %s0, %s1
|
||||
; CHECK-NEXT: lea %s1, 256
|
||||
; CHECK-NEXT: lvl %s1
|
||||
; CHECK-NEXT: vbrd %v0, %s0
|
||||
; CHECK-NEXT: b.l.t (, %s10)
|
||||
%val = insertelement <512 x float> undef, float %s, i32 0
|
||||
%ret = shufflevector <512 x float> %val, <512 x float> undef, <512 x i32> zeroinitializer
|
||||
ret <512 x float> %ret
|
||||
}
|
||||
|
||||
define fastcc <512 x float> @brdi_v512f32() {
|
||||
; CHECK-LABEL: brdi_v512f32:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: lea.sl %s0, 0
|
||||
; CHECK-NEXT: and %s1, %s0, (32)1
|
||||
; CHECK-NEXT: srl %s0, %s0, 32
|
||||
; CHECK-NEXT: or %s0, %s0, %s1
|
||||
; CHECK-NEXT: lea %s1, 256
|
||||
; CHECK-NEXT: lvl %s1
|
||||
; CHECK-NEXT: vbrd %v0, %s0
|
||||
; CHECK-NEXT: b.l.t (, %s10)
|
||||
%val = insertelement <512 x float> undef, float 0.e+00, i32 0
|
||||
%ret = shufflevector <512 x float> %val, <512 x float> undef, <512 x i32> zeroinitializer
|
||||
ret <512 x float> %ret
|
||||
}
|
Loading…
Reference in New Issue