llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

6411 lines
260 KiB
TableGen
Raw Normal View History

//===-- ARMInstrMVE.td - MVE support for ARM ---------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the ARM MVE instruction set.
//
//===----------------------------------------------------------------------===//
class ExpandImmAsmOp<string shift> : AsmOperandClass {
let Name = !strconcat("ExpandImm", shift);
let PredicateMethod = !strconcat("isExpImm<", shift, ">");
let RenderMethod = "addImmOperands";
}
class InvertedExpandImmAsmOp<string shift, string size> : AsmOperandClass {
let Name = !strconcat("InvertedExpandImm", shift, "_", size);
let PredicateMethod = !strconcat("isInvertedExpImm<", shift, ",", size, ">");
let RenderMethod = "addImmOperands";
}
class ExpandImm<string shift> : Operand<i32> {
let ParserMatchClass = ExpandImmAsmOp<shift>;
let EncoderMethod = !strconcat("getExpandedImmOpValue<",shift,",false>");
let DecoderMethod = !strconcat("DecodeExpandedImmOperand<",shift,">");
let PrintMethod = "printExpandedImmOperand";
}
class InvertedExpandImm<string shift, string size> : Operand<i32> {
let ParserMatchClass = InvertedExpandImmAsmOp<shift, size>;
let EncoderMethod = !strconcat("getExpandedImmOpValue<",shift,",true>");
let PrintMethod = "printExpandedImmOperand";
// No decoder method needed, because this operand type is only used
// by aliases (VAND and VORN)
}
def expzero00 : ExpandImm<"0">;
def expzero08 : ExpandImm<"8">;
def expzero16 : ExpandImm<"16">;
def expzero24 : ExpandImm<"24">;
def expzero00inv16 : InvertedExpandImm<"0", "16">;
def expzero08inv16 : InvertedExpandImm<"8", "16">;
def expzero00inv32 : InvertedExpandImm<"0", "32">;
def expzero08inv32 : InvertedExpandImm<"8", "32">;
def expzero16inv32 : InvertedExpandImm<"16", "32">;
def expzero24inv32 : InvertedExpandImm<"24", "32">;
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
// VPT condition mask
def vpt_mask : Operand<i32> {
let PrintMethod = "printVPTMask";
let ParserMatchClass = it_mask_asmoperand;
let EncoderMethod = "getVPTMaskOpValue";
let DecoderMethod = "DecodeVPTMaskOperand";
}
// VPT/VCMP restricted predicate for sign invariant types
def pred_restricted_i_asmoperand : AsmOperandClass {
let Name = "CondCodeRestrictedI";
let RenderMethod = "addITCondCodeOperands";
let PredicateMethod = "isITCondCodeRestrictedI";
let ParserMethod = "parseITCondCode";
let DiagnosticString = "condition code for sign-independent integer "#
"comparison must be EQ or NE";
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
}
// VPT/VCMP restricted predicate for signed types
def pred_restricted_s_asmoperand : AsmOperandClass {
let Name = "CondCodeRestrictedS";
let RenderMethod = "addITCondCodeOperands";
let PredicateMethod = "isITCondCodeRestrictedS";
let ParserMethod = "parseITCondCode";
let DiagnosticString = "condition code for signed integer "#
"comparison must be EQ, NE, LT, GT, LE or GE";
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
}
// VPT/VCMP restricted predicate for unsigned types
def pred_restricted_u_asmoperand : AsmOperandClass {
let Name = "CondCodeRestrictedU";
let RenderMethod = "addITCondCodeOperands";
let PredicateMethod = "isITCondCodeRestrictedU";
let ParserMethod = "parseITCondCode";
let DiagnosticString = "condition code for unsigned integer "#
"comparison must be EQ, NE, HS or HI";
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
}
// VPT/VCMP restricted predicate for floating point
def pred_restricted_fp_asmoperand : AsmOperandClass {
let Name = "CondCodeRestrictedFP";
let RenderMethod = "addITCondCodeOperands";
let PredicateMethod = "isITCondCodeRestrictedFP";
let ParserMethod = "parseITCondCode";
let DiagnosticString = "condition code for floating-point "#
"comparison must be EQ, NE, LT, GT, LE or GE";
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
}
class VCMPPredicateOperand : Operand<i32>;
def pred_basic_i : VCMPPredicateOperand {
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
let PrintMethod = "printMandatoryRestrictedPredicateOperand";
let ParserMatchClass = pred_restricted_i_asmoperand;
let DecoderMethod = "DecodeRestrictedIPredicateOperand";
let EncoderMethod = "getRestrictedCondCodeOpValue";
}
def pred_basic_u : VCMPPredicateOperand {
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
let PrintMethod = "printMandatoryRestrictedPredicateOperand";
let ParserMatchClass = pred_restricted_u_asmoperand;
let DecoderMethod = "DecodeRestrictedUPredicateOperand";
let EncoderMethod = "getRestrictedCondCodeOpValue";
}
def pred_basic_s : VCMPPredicateOperand {
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
let PrintMethod = "printMandatoryRestrictedPredicateOperand";
let ParserMatchClass = pred_restricted_s_asmoperand;
let DecoderMethod = "DecodeRestrictedSPredicateOperand";
let EncoderMethod = "getRestrictedCondCodeOpValue";
}
def pred_basic_fp : VCMPPredicateOperand {
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
let PrintMethod = "printMandatoryRestrictedPredicateOperand";
let ParserMatchClass = pred_restricted_fp_asmoperand;
let DecoderMethod = "DecodeRestrictedFPPredicateOperand";
let EncoderMethod = "getRestrictedCondCodeOpValue";
}
// Register list operands for interleaving load/stores
def VecList2QAsmOperand : AsmOperandClass {
let Name = "VecListTwoMQ";
let ParserMethod = "parseVectorList";
let RenderMethod = "addMVEVecListOperands";
let DiagnosticString = "operand must be a list of two consecutive "#
"q-registers in range [q0,q7]";
}
def VecList2Q : RegisterOperand<QQPR, "printMVEVectorListTwoQ"> {
let ParserMatchClass = VecList2QAsmOperand;
let PrintMethod = "printMVEVectorList<2>";
}
def VecList4QAsmOperand : AsmOperandClass {
let Name = "VecListFourMQ";
let ParserMethod = "parseVectorList";
let RenderMethod = "addMVEVecListOperands";
let DiagnosticString = "operand must be a list of four consecutive "#
"q-registers in range [q0,q7]";
}
def VecList4Q : RegisterOperand<QQQQPR, "printMVEVectorListFourQ"> {
let ParserMatchClass = VecList4QAsmOperand;
let PrintMethod = "printMVEVectorList<4>";
}
[ARM] Add MVE vector load/store instructions. This adds the rest of the vector memory access instructions. It includes contiguous loads/stores, with an ordinary addressing mode such as [r0,#offset] (plus writeback variants); gather loads and scatter stores with a scalar base address register and a vector of offsets from it (written [r0,q1] or similar); and gather/scatters with a vector of base addresses (written [q0,#offset], again with writeback). Additionally, some of the loads can widen each loaded value into a larger vector lane, and the corresponding stores narrow them again. To implement these, we also have to add the addressing modes they need. Also, in AsmParser, the `isMem` query function now has subqueries `isGPRMem` and `isMVEMem`, according to which kind of base register is used by a given memory access operand. I've also had to add an extra check in `checkTargetMatchPredicate` in the AsmParser, without which our last-minute check of `rGPR` register operands against SP and PC was failing an assertion because Tablegen had inserted an immediate 0 in place of one of a pair of tied register operands. (This matches the way the corresponding check for `MCK_rGPR` in `validateTargetOperandClass` is guarded.) Apparently the MVE load instructions were the first to have ever triggered this assertion, but I think only because they were the first to have a combination of the usual Arm pre/post writeback system and the `rGPR` class in particular. Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62680 llvm-svn: 364291
2019-06-25 19:24:18 +08:00
// taddrmode_imm7 := reg[r0-r7] +/- (imm7 << shift)
class TMemImm7ShiftOffsetAsmOperand<int shift> : AsmOperandClass {
let Name = "TMemImm7Shift"#shift#"Offset";
let PredicateMethod = "isMemImm7ShiftedOffset<"#shift#",ARM::tGPRRegClassID>";
let RenderMethod = "addMemImmOffsetOperands";
}
class taddrmode_imm7<int shift> : MemOperand,
ComplexPattern<i32, 2, "SelectTAddrModeImm7<"#shift#">", []> {
[ARM] Add MVE vector load/store instructions. This adds the rest of the vector memory access instructions. It includes contiguous loads/stores, with an ordinary addressing mode such as [r0,#offset] (plus writeback variants); gather loads and scatter stores with a scalar base address register and a vector of offsets from it (written [r0,q1] or similar); and gather/scatters with a vector of base addresses (written [q0,#offset], again with writeback). Additionally, some of the loads can widen each loaded value into a larger vector lane, and the corresponding stores narrow them again. To implement these, we also have to add the addressing modes they need. Also, in AsmParser, the `isMem` query function now has subqueries `isGPRMem` and `isMVEMem`, according to which kind of base register is used by a given memory access operand. I've also had to add an extra check in `checkTargetMatchPredicate` in the AsmParser, without which our last-minute check of `rGPR` register operands against SP and PC was failing an assertion because Tablegen had inserted an immediate 0 in place of one of a pair of tied register operands. (This matches the way the corresponding check for `MCK_rGPR` in `validateTargetOperandClass` is guarded.) Apparently the MVE load instructions were the first to have ever triggered this assertion, but I think only because they were the first to have a combination of the usual Arm pre/post writeback system and the `rGPR` class in particular. Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62680 llvm-svn: 364291
2019-06-25 19:24:18 +08:00
let ParserMatchClass = TMemImm7ShiftOffsetAsmOperand<shift>;
// They are printed the same way as the T2 imm8 version
let PrintMethod = "printT2AddrModeImm8Operand<false>";
// This can also be the same as the T2 version.
let EncoderMethod = "getT2AddrModeImmOpValue<7,"#shift#">";
let DecoderMethod = "DecodeTAddrModeImm7<"#shift#">";
let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
}
// t2addrmode_imm7 := reg +/- (imm7)
class MemImm7ShiftOffsetAsmOperand<int shift> : AsmOperandClass {
let Name = "MemImm7Shift"#shift#"Offset";
let PredicateMethod = "isMemImm7ShiftedOffset<" # shift #
",ARM::GPRnopcRegClassID>";
let RenderMethod = "addMemImmOffsetOperands";
}
def MemImm7Shift0OffsetAsmOperand : MemImm7ShiftOffsetAsmOperand<0>;
def MemImm7Shift1OffsetAsmOperand : MemImm7ShiftOffsetAsmOperand<1>;
def MemImm7Shift2OffsetAsmOperand : MemImm7ShiftOffsetAsmOperand<2>;
class T2AddrMode_Imm7<int shift> : MemOperand,
ComplexPattern<i32, 2, "SelectT2AddrModeImm7<"#shift#">", []> {
let EncoderMethod = "getT2AddrModeImmOpValue<7,"#shift#">";
let DecoderMethod = "DecodeT2AddrModeImm7<"#shift#", 0>";
let ParserMatchClass =
!cast<AsmOperandClass>("MemImm7Shift"#shift#"OffsetAsmOperand");
let MIOperandInfo = (ops GPRnopc:$base, i32imm:$offsimm);
}
class t2addrmode_imm7<int shift> : T2AddrMode_Imm7<shift> {
// They are printed the same way as the imm8 version
let PrintMethod = "printT2AddrModeImm8Operand<false>";
}
class MemImm7ShiftOffsetWBAsmOperand<int shift> : AsmOperandClass {
let Name = "MemImm7Shift"#shift#"OffsetWB";
let PredicateMethod = "isMemImm7ShiftedOffset<" # shift #
",ARM::rGPRRegClassID>";
let RenderMethod = "addMemImmOffsetOperands";
}
def MemImm7Shift0OffsetWBAsmOperand : MemImm7ShiftOffsetWBAsmOperand<0>;
def MemImm7Shift1OffsetWBAsmOperand : MemImm7ShiftOffsetWBAsmOperand<1>;
def MemImm7Shift2OffsetWBAsmOperand : MemImm7ShiftOffsetWBAsmOperand<2>;
class t2addrmode_imm7_pre<int shift> : T2AddrMode_Imm7<shift> {
// They are printed the same way as the imm8 version
let PrintMethod = "printT2AddrModeImm8Operand<true>";
let ParserMatchClass =
!cast<AsmOperandClass>("MemImm7Shift"#shift#"OffsetWBAsmOperand");
let DecoderMethod = "DecodeT2AddrModeImm7<"#shift#", 1>";
let MIOperandInfo = (ops rGPR:$base, i32imm:$offsim);
}
class t2am_imm7shiftOffsetAsmOperand<int shift>
: AsmOperandClass { let Name = "Imm7Shift"#shift; }
def t2am_imm7shift0OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<0>;
def t2am_imm7shift1OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<1>;
def t2am_imm7shift2OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<2>;
class t2am_imm7_offset<int shift> : MemOperand,
ComplexPattern<i32, 1, "SelectT2AddrModeImm7Offset<"#shift#">",
[], [SDNPWantRoot]> {
[ARM] Add MVE vector load/store instructions. This adds the rest of the vector memory access instructions. It includes contiguous loads/stores, with an ordinary addressing mode such as [r0,#offset] (plus writeback variants); gather loads and scatter stores with a scalar base address register and a vector of offsets from it (written [r0,q1] or similar); and gather/scatters with a vector of base addresses (written [q0,#offset], again with writeback). Additionally, some of the loads can widen each loaded value into a larger vector lane, and the corresponding stores narrow them again. To implement these, we also have to add the addressing modes they need. Also, in AsmParser, the `isMem` query function now has subqueries `isGPRMem` and `isMVEMem`, according to which kind of base register is used by a given memory access operand. I've also had to add an extra check in `checkTargetMatchPredicate` in the AsmParser, without which our last-minute check of `rGPR` register operands against SP and PC was failing an assertion because Tablegen had inserted an immediate 0 in place of one of a pair of tied register operands. (This matches the way the corresponding check for `MCK_rGPR` in `validateTargetOperandClass` is guarded.) Apparently the MVE load instructions were the first to have ever triggered this assertion, but I think only because they were the first to have a combination of the usual Arm pre/post writeback system and the `rGPR` class in particular. Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62680 llvm-svn: 364291
2019-06-25 19:24:18 +08:00
// They are printed the same way as the imm8 version
let PrintMethod = "printT2AddrModeImm8OffsetOperand";
let ParserMatchClass =
!cast<AsmOperandClass>("t2am_imm7shift"#shift#"OffsetAsmOperand");
let EncoderMethod = "getT2ScaledImmOpValue<7,"#shift#">";
let DecoderMethod = "DecodeT2Imm7<"#shift#">";
}
// Operands for gather/scatter loads of the form [Rbase, Qoffsets]
class MemRegRQOffsetAsmOperand<int shift> : AsmOperandClass {
let Name = "MemRegRQS"#shift#"Offset";
let PredicateMethod = "isMemRegRQOffset<"#shift#">";
let RenderMethod = "addMemRegRQOffsetOperands";
}
def MemRegRQS0OffsetAsmOperand : MemRegRQOffsetAsmOperand<0>;
def MemRegRQS1OffsetAsmOperand : MemRegRQOffsetAsmOperand<1>;
def MemRegRQS2OffsetAsmOperand : MemRegRQOffsetAsmOperand<2>;
def MemRegRQS3OffsetAsmOperand : MemRegRQOffsetAsmOperand<3>;
// mve_addr_rq_shift := reg + vreg{ << UXTW #shift}
class mve_addr_rq_shift<int shift> : MemOperand {
let EncoderMethod = "getMveAddrModeRQOpValue";
let PrintMethod = "printMveAddrModeRQOperand<"#shift#">";
let ParserMatchClass =
!cast<AsmOperandClass>("MemRegRQS"#shift#"OffsetAsmOperand");
let DecoderMethod = "DecodeMveAddrModeRQ";
let MIOperandInfo = (ops GPRnopc:$base, MQPR:$offsreg);
}
class MemRegQOffsetAsmOperand<int shift> : AsmOperandClass {
let Name = "MemRegQS"#shift#"Offset";
let PredicateMethod = "isMemRegQOffset<"#shift#">";
let RenderMethod = "addMemImmOffsetOperands";
}
def MemRegQS2OffsetAsmOperand : MemRegQOffsetAsmOperand<2>;
def MemRegQS3OffsetAsmOperand : MemRegQOffsetAsmOperand<3>;
// mve_addr_q_shift := vreg {+ #imm7s2/4}
class mve_addr_q_shift<int shift> : MemOperand {
let EncoderMethod = "getMveAddrModeQOpValue<"#shift#">";
// Can be printed same way as other reg + imm operands
let PrintMethod = "printT2AddrModeImm8Operand<false>";
let ParserMatchClass =
!cast<AsmOperandClass>("MemRegQS"#shift#"OffsetAsmOperand");
let DecoderMethod = "DecodeMveAddrModeQ<"#shift#">";
let MIOperandInfo = (ops MQPR:$base, i32imm:$imm);
}
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
// A family of classes wrapping up information about the vector types
// used by MVE.
class MVEVectorVTInfo<ValueType vec, ValueType dblvec, ValueType pred,
bits<2> size, string suffixletter, bit unsigned> {
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
// The LLVM ValueType representing the vector, so we can use it in
// ISel patterns.
ValueType Vec = vec;
// The LLVM ValueType representing a vector with elements double the size
// of those in Vec, so we can use it in ISel patterns. It is up to the
// invoker of this class to ensure that this is a correct choice.
ValueType DblVec = dblvec;
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
// An LLVM ValueType representing a corresponding vector of
// predicate bits, for use in ISel patterns that handle an IR
// intrinsic describing the predicated form of the instruction.
//
// Usually, for a vector of N things, this will be vNi1. But for
// vectors of 2 values, we make an exception, and use v4i1 instead
// of v2i1. Rationale: MVE codegen doesn't support doing all the
// auxiliary operations on v2i1 (vector shuffles etc), and also,
// there's no MVE compare instruction that will _generate_ v2i1
// directly.
ValueType Pred = pred;
// The most common representation of the vector element size in MVE
// instruction encodings: a 2-bit value V representing an (8<<V)-bit
// vector element.
bits<2> Size = size;
// For vectors explicitly mentioning a signedness of integers: 0 for
// signed and 1 for unsigned. For anything else, undefined.
bit Unsigned = unsigned;
[ARM,MVE] Add intrinsics for gather/scatter load/stores. This patch adds two new families of intrinsics, both of which are memory accesses taking a vector of locations to load from / store to. The vldrq_gather_base / vstrq_scatter_base intrinsics take a vector of base addresses, and an immediate offset to be added consistently to each one. vldrq_gather_offset / vstrq_scatter_offset take a scalar base address, and a vector of offsets to add to it. The 'shifted_offset' variants also multiply each offset by the element size type, so that the vector is effectively of array indices. At the IR level, these operations are represented by a single set of four IR intrinsics: {gather,scatter} × {base,offset}. The other details (signed/unsigned, shift, and memory element size as opposed to vector element size) are all specified by IR intrinsic polymorphism and immediate operands, because that made the selection job easier than making a huge family of similarly named intrinsics. I considered using the standard IR representations such as llvm.masked.gather, but they're not a good fit. In order to use llvm.masked.gather to represent a gather_offset load with element size smaller than a pointer, you'd have to expand the <8 x i16> vector of offsets into an <8 x i16*> vector of pointers, which would be split up during legalization, so you'd spend most of your time undoing the mess it had made. Also, ISel support for llvm.masked.gather would be easy enough in a trivial way (you can expand it into a gather-base load with a zero immediate offset), but instruction-selecting lots of fiddly idioms back into all the _other_ MVE load instructions would be much more work. So I think dedicated IR intrinsics are the more sensible approach, at least for the moment. On the clang tablegen side, I've added two new features to the Tablegen source accepted by MveEmitter: a 'CopyKind' type node for defining a type that varies with the parameter type (it lets you ask for an unsigned integer type of the same width as the parameter), and an 'unsignedflag' value node for passing an immediate IR operand which is 0 for a signed integer type or 1 for an unsigned one. That lets me write each kind of intrinsic just once and get all its subtypes and immediate arguments generated automatically. Also I've tweaked the handling of pointer-typed values in the code generation part of MveEmitter: they're generated as Address rather than Value (i.e. including an alignment) so that they can be given to the ordinary IR load and store operations, but I'd omitted the code to convert them back to Value when they're going to be used as an argument to an IR intrinsic. On the MC side, I've enhanced MVEVectorVTInfo so that it can tell you not only the full assembly-language suffix for a given vector type (like 's32' or 'u16') but also the numeric-only one used by store instructions (just '32' or '16'). Reviewers: dmgreen Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D69791
2019-11-01 01:02:07 +08:00
// The number of bits in a vector element, in integer form.
int LaneBits = !shl(8, Size);
// The suffix used in assembly language on an instruction operating
// on this lane if it only cares about number of bits.
string BitsSuffix = !if(!eq(suffixletter, "p"),
!if(!eq(unsigned, 0b0), "8", "16"),
!cast<string>(LaneBits));
[ARM,MVE] Add intrinsics for gather/scatter load/stores. This patch adds two new families of intrinsics, both of which are memory accesses taking a vector of locations to load from / store to. The vldrq_gather_base / vstrq_scatter_base intrinsics take a vector of base addresses, and an immediate offset to be added consistently to each one. vldrq_gather_offset / vstrq_scatter_offset take a scalar base address, and a vector of offsets to add to it. The 'shifted_offset' variants also multiply each offset by the element size type, so that the vector is effectively of array indices. At the IR level, these operations are represented by a single set of four IR intrinsics: {gather,scatter} × {base,offset}. The other details (signed/unsigned, shift, and memory element size as opposed to vector element size) are all specified by IR intrinsic polymorphism and immediate operands, because that made the selection job easier than making a huge family of similarly named intrinsics. I considered using the standard IR representations such as llvm.masked.gather, but they're not a good fit. In order to use llvm.masked.gather to represent a gather_offset load with element size smaller than a pointer, you'd have to expand the <8 x i16> vector of offsets into an <8 x i16*> vector of pointers, which would be split up during legalization, so you'd spend most of your time undoing the mess it had made. Also, ISel support for llvm.masked.gather would be easy enough in a trivial way (you can expand it into a gather-base load with a zero immediate offset), but instruction-selecting lots of fiddly idioms back into all the _other_ MVE load instructions would be much more work. So I think dedicated IR intrinsics are the more sensible approach, at least for the moment. On the clang tablegen side, I've added two new features to the Tablegen source accepted by MveEmitter: a 'CopyKind' type node for defining a type that varies with the parameter type (it lets you ask for an unsigned integer type of the same width as the parameter), and an 'unsignedflag' value node for passing an immediate IR operand which is 0 for a signed integer type or 1 for an unsigned one. That lets me write each kind of intrinsic just once and get all its subtypes and immediate arguments generated automatically. Also I've tweaked the handling of pointer-typed values in the code generation part of MveEmitter: they're generated as Address rather than Value (i.e. including an alignment) so that they can be given to the ordinary IR load and store operations, but I'd omitted the code to convert them back to Value when they're going to be used as an argument to an IR intrinsic. On the MC side, I've enhanced MVEVectorVTInfo so that it can tell you not only the full assembly-language suffix for a given vector type (like 's32' or 'u16') but also the numeric-only one used by store instructions (just '32' or '16'). Reviewers: dmgreen Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D69791
2019-11-01 01:02:07 +08:00
// The suffix used on an instruction that mentions the whole type.
string Suffix = suffixletter ## BitsSuffix;
// The letter part of the suffix only.
string SuffixLetter = suffixletter;
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
}
// Integer vector types that don't treat signed and unsigned differently.
def MVE_v16i8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b00, "i", ?>;
def MVE_v8i16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b01, "i", ?>;
def MVE_v4i32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, 0b10, "i", ?>;
def MVE_v2i64 : MVEVectorVTInfo<v2i64, ?, v4i1, 0b11, "i", ?>;
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
// Explicitly signed and unsigned integer vectors. They map to the
// same set of LLVM ValueTypes as above, but are represented
// differently in assembly and instruction encodings.
def MVE_v16s8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b00, "s", 0b0>;
def MVE_v8s16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b01, "s", 0b0>;
def MVE_v4s32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, 0b10, "s", 0b0>;
def MVE_v2s64 : MVEVectorVTInfo<v2i64, ?, v4i1, 0b11, "s", 0b0>;
def MVE_v16u8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b00, "u", 0b1>;
def MVE_v8u16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b01, "u", 0b1>;
def MVE_v4u32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, 0b10, "u", 0b1>;
def MVE_v2u64 : MVEVectorVTInfo<v2i64, ?, v4i1, 0b11, "u", 0b1>;
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
// FP vector types.
def MVE_v8f16 : MVEVectorVTInfo<v8f16, v4f32, v8i1, 0b01, "f", ?>;
def MVE_v4f32 : MVEVectorVTInfo<v4f32, v2f64, v4i1, 0b10, "f", ?>;
def MVE_v2f64 : MVEVectorVTInfo<v2f64, ?, v4i1, 0b11, "f", ?>;
// Polynomial vector types.
def MVE_v16p8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b11, "p", 0b0>;
def MVE_v8p16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b11, "p", 0b1>;
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
[ARM] Add MVE vector load/store instructions. This adds the rest of the vector memory access instructions. It includes contiguous loads/stores, with an ordinary addressing mode such as [r0,#offset] (plus writeback variants); gather loads and scatter stores with a scalar base address register and a vector of offsets from it (written [r0,q1] or similar); and gather/scatters with a vector of base addresses (written [q0,#offset], again with writeback). Additionally, some of the loads can widen each loaded value into a larger vector lane, and the corresponding stores narrow them again. To implement these, we also have to add the addressing modes they need. Also, in AsmParser, the `isMem` query function now has subqueries `isGPRMem` and `isMVEMem`, according to which kind of base register is used by a given memory access operand. I've also had to add an extra check in `checkTargetMatchPredicate` in the AsmParser, without which our last-minute check of `rGPR` register operands against SP and PC was failing an assertion because Tablegen had inserted an immediate 0 in place of one of a pair of tied register operands. (This matches the way the corresponding check for `MCK_rGPR` in `validateTargetOperandClass` is guarded.) Apparently the MVE load instructions were the first to have ever triggered this assertion, but I think only because they were the first to have a combination of the usual Arm pre/post writeback system and the `rGPR` class in particular. Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62680 llvm-svn: 364291
2019-06-25 19:24:18 +08:00
// --------- Start of base classes for the instructions themselves
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
class MVE_MI<dag oops, dag iops, InstrItinClass itin, string asm,
string ops, string cstr, list<dag> pattern>
: Thumb2XI<oops, iops, AddrModeNone, 4, itin, !strconcat(asm, "\t", ops), cstr,
pattern>,
Requires<[HasMVEInt]> {
let D = MVEDomain;
let DecoderNamespace = "MVE";
}
// MVE_p is used for most predicated instructions, to add the cluster
// of input operands that provides the VPT suffix (none, T or E) and
// the input predicate register.
class MVE_p<dag oops, dag iops, InstrItinClass itin, string iname,
string suffix, string ops, vpred_ops vpred, string cstr,
list<dag> pattern=[]>
: MVE_MI<oops, !con(iops, (ins vpred:$vp)), itin,
// If the instruction has a suffix, like vadd.f32, then the
// VPT predication suffix goes before the dot, so the full
// name has to be "vadd${vp}.f32".
!strconcat(iname, "${vp}",
!if(!eq(suffix, ""), "", !strconcat(".", suffix))),
ops, !strconcat(cstr, vpred.vpred_constraint), pattern> {
let Inst{31-29} = 0b111;
let Inst{27-26} = 0b11;
}
class MVE_f<dag oops, dag iops, InstrItinClass itin, string iname,
string suffix, string ops, vpred_ops vpred, string cstr,
list<dag> pattern=[]>
: MVE_p<oops, iops, itin, iname, suffix, ops, vpred, cstr, pattern> {
let Predicates = [HasMVEFloat];
}
class MVE_MI_with_pred<dag oops, dag iops, InstrItinClass itin, string asm,
string ops, string cstr, list<dag> pattern>
: Thumb2I<oops, iops, AddrModeNone, 4, itin, asm, !strconcat("\t", ops), cstr,
pattern>,
Requires<[HasV8_1MMainline, HasMVEInt]> {
let D = MVEDomain;
let DecoderNamespace = "MVE";
}
class MVE_VMOV_lane_base<dag oops, dag iops, InstrItinClass itin, string asm,
string suffix, string ops, string cstr,
list<dag> pattern>
: Thumb2I<oops, iops, AddrModeNone, 4, itin, asm,
!if(!eq(suffix, ""), "", "." # suffix) # "\t" # ops,
cstr, pattern>,
Requires<[HasV8_1MMainline, HasMVEInt]> {
let D = MVEDomain;
let DecoderNamespace = "MVE";
}
class MVE_ScalarShift<string iname, dag oops, dag iops, string asm, string cstr,
list<dag> pattern=[]>
: MVE_MI_with_pred<oops, iops, NoItinerary, iname, asm, cstr, pattern> {
let Inst{31-20} = 0b111010100101;
let Inst{8} = 0b1;
}
class MVE_ScalarShiftSingleReg<string iname, dag iops, string asm, string cstr,
list<dag> pattern=[]>
: MVE_ScalarShift<iname, (outs rGPR:$RdaDest), iops, asm, cstr, pattern> {
bits<4> RdaDest;
let Inst{19-16} = RdaDest{3-0};
}
2019-11-19 22:47:07 +08:00
class MVE_ScalarShiftSRegImm<string iname, bits<2> op5_4>
: MVE_ScalarShiftSingleReg<iname, (ins rGPR:$RdaSrc, long_shift:$imm),
2019-11-19 22:47:07 +08:00
"$RdaSrc, $imm", "$RdaDest = $RdaSrc",
[(set rGPR:$RdaDest,
(i32 (!cast<Intrinsic>("int_arm_mve_" # iname)
(i32 rGPR:$RdaSrc), (i32 imm:$imm))))]> {
bits<5> imm;
let Inst{15} = 0b0;
let Inst{14-12} = imm{4-2};
let Inst{11-8} = 0b1111;
let Inst{7-6} = imm{1-0};
let Inst{5-4} = op5_4{1-0};
let Inst{3-0} = 0b1111;
}
def MVE_SQSHL : MVE_ScalarShiftSRegImm<"sqshl", 0b11>;
def MVE_SRSHR : MVE_ScalarShiftSRegImm<"srshr", 0b10>;
def MVE_UQSHL : MVE_ScalarShiftSRegImm<"uqshl", 0b00>;
def MVE_URSHR : MVE_ScalarShiftSRegImm<"urshr", 0b01>;
2019-11-19 22:47:07 +08:00
class MVE_ScalarShiftSRegReg<string iname, bits<2> op5_4>
: MVE_ScalarShiftSingleReg<iname, (ins rGPR:$RdaSrc, rGPR:$Rm),
2019-11-19 22:47:07 +08:00
"$RdaSrc, $Rm", "$RdaDest = $RdaSrc",
[(set rGPR:$RdaDest,
(i32 (!cast<Intrinsic>("int_arm_mve_" # iname)
(i32 rGPR:$RdaSrc), (i32 rGPR:$Rm))))]> {
bits<4> Rm;
let Inst{15-12} = Rm{3-0};
let Inst{11-8} = 0b1111;
let Inst{7-6} = 0b00;
let Inst{5-4} = op5_4{1-0};
let Inst{3-0} = 0b1101;
let Unpredictable{8-6} = 0b111;
}
def MVE_SQRSHR : MVE_ScalarShiftSRegReg<"sqrshr", 0b10>;
def MVE_UQRSHL : MVE_ScalarShiftSRegReg<"uqrshl", 0b00>;
class MVE_ScalarShiftDoubleReg<string iname, dag iops, string asm,
string cstr, list<dag> pattern=[]>
: MVE_ScalarShift<iname, (outs tGPREven:$RdaLo, tGPROdd:$RdaHi),
iops, asm, cstr, pattern> {
bits<4> RdaLo;
bits<4> RdaHi;
let Inst{19-17} = RdaLo{3-1};
let Inst{11-9} = RdaHi{3-1};
}
class MVE_ScalarShiftDRegImm<string iname, bits<2> op5_4, bit op16,
list<dag> pattern=[]>
: MVE_ScalarShiftDoubleReg<
iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, long_shift:$imm),
"$RdaLo, $RdaHi, $imm", "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src",
pattern> {
bits<5> imm;
let Inst{16} = op16;
let Inst{15} = 0b0;
let Inst{14-12} = imm{4-2};
let Inst{7-6} = imm{1-0};
let Inst{5-4} = op5_4{1-0};
let Inst{3-0} = 0b1111;
}
class MVE_ScalarShiftDRegRegBase<string iname, dag iops, string asm,
bit op5, bit op16, list<dag> pattern=[]>
: MVE_ScalarShiftDoubleReg<
iname, iops, asm, "@earlyclobber $RdaHi,@earlyclobber $RdaLo,"
"$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src",
pattern> {
bits<4> Rm;
let Inst{16} = op16;
let Inst{15-12} = Rm{3-0};
let Inst{6} = 0b0;
let Inst{5} = op5;
let Inst{4} = 0b0;
let Inst{3-0} = 0b1101;
// Custom decoder method because of the following overlapping encodings:
// ASRL and SQRSHR
// LSLL and UQRSHL
// SQRSHRL and SQRSHR
// UQRSHLL and UQRSHL
let DecoderMethod = "DecodeMVEOverlappingLongShift";
}
class MVE_ScalarShiftDRegReg<string iname, bit op5, list<dag> pattern=[]>
: MVE_ScalarShiftDRegRegBase<
iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm),
"$RdaLo, $RdaHi, $Rm", op5, 0b0, pattern> {
let Inst{7} = 0b0;
}
class MVE_ScalarShiftDRegRegWithSat<string iname, bit op5, list<dag> pattern=[]>
: MVE_ScalarShiftDRegRegBase<
iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm, saturateop:$sat),
"$RdaLo, $RdaHi, $sat, $Rm", op5, 0b1, pattern> {
bit sat;
let Inst{7} = sat;
}
def MVE_ASRLr : MVE_ScalarShiftDRegReg<"asrl", 0b1, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
(ARMasrl tGPREven:$RdaLo_src,
tGPROdd:$RdaHi_src, rGPR:$Rm))]>;
def MVE_ASRLi : MVE_ScalarShiftDRegImm<"asrl", 0b10, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
(ARMasrl tGPREven:$RdaLo_src,
tGPROdd:$RdaHi_src, (i32 long_shift:$imm)))]>;
def MVE_LSLLr : MVE_ScalarShiftDRegReg<"lsll", 0b0, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
(ARMlsll tGPREven:$RdaLo_src,
tGPROdd:$RdaHi_src, rGPR:$Rm))]>;
def MVE_LSLLi : MVE_ScalarShiftDRegImm<"lsll", 0b00, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
(ARMlsll tGPREven:$RdaLo_src,
tGPROdd:$RdaHi_src, (i32 long_shift:$imm)))]>;
def MVE_LSRL : MVE_ScalarShiftDRegImm<"lsrl", 0b01, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
(ARMlsrl tGPREven:$RdaLo_src,
tGPROdd:$RdaHi_src, (i32 long_shift:$imm)))]>;
def MVE_SQRSHRL : MVE_ScalarShiftDRegRegWithSat<"sqrshrl", 0b1>;
def MVE_SQSHLL : MVE_ScalarShiftDRegImm<"sqshll", 0b11, 0b1>;
def MVE_SRSHRL : MVE_ScalarShiftDRegImm<"srshrl", 0b10, 0b1>;
def MVE_UQRSHLL : MVE_ScalarShiftDRegRegWithSat<"uqrshll", 0b0>;
def MVE_UQSHLL : MVE_ScalarShiftDRegImm<"uqshll", 0b00, 0b1>;
def MVE_URSHRL : MVE_ScalarShiftDRegImm<"urshrl", 0b01, 0b1>;
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
// start of mve_rDest instructions
class MVE_rDest<dag oops, dag iops, InstrItinClass itin,
string iname, string suffix,
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
string ops, string cstr, list<dag> pattern=[]>
// Always use vpred_n and not vpred_r: with the output register being
// a GPR and not a vector register, there can't be any question of
// what to put in its inactive lanes.
: MVE_p<oops, iops, itin, iname, suffix, ops, vpred_n, cstr, pattern> {
let Inst{25-23} = 0b101;
let Inst{11-9} = 0b111;
let Inst{4} = 0b0;
}
class MVE_VABAV<string suffix, bit U, bits<2> size>
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
: MVE_rDest<(outs rGPR:$Rda), (ins rGPR:$Rda_src, MQPR:$Qn, MQPR:$Qm),
NoItinerary, "vabav", suffix, "$Rda, $Qn, $Qm", "$Rda = $Rda_src",
[]> {
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
bits<4> Qm;
bits<4> Qn;
bits<4> Rda;
let Inst{28} = U;
let Inst{22} = 0b0;
let Inst{21-20} = size{1-0};
let Inst{19-17} = Qn{2-0};
let Inst{16} = 0b0;
let Inst{15-12} = Rda{3-0};
let Inst{8} = 0b1;
let Inst{7} = Qn{3};
let Inst{6} = 0b0;
let Inst{5} = Qm{3};
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b1;
}
multiclass MVE_VABAV_m<MVEVectorVTInfo VTI> {
def "" : MVE_VABAV<VTI.Suffix, VTI.Unsigned, VTI.Size>;
let Predicates = [HasMVEInt] in {
def : Pat<(i32 (int_arm_mve_vabav
(i32 VTI.Unsigned),
(i32 rGPR:$Rda_src),
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))),
(i32 (!cast<Instruction>(NAME)
(i32 rGPR:$Rda_src),
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm)))>;
def : Pat<(i32 (int_arm_mve_vabav_predicated
(i32 VTI.Unsigned),
(i32 rGPR:$Rda_src),
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
(VTI.Pred VCCR:$mask))),
(i32 (!cast<Instruction>(NAME)
(i32 rGPR:$Rda_src),
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
}
}
defm MVE_VABAVs8 : MVE_VABAV_m<MVE_v16s8>;
defm MVE_VABAVs16 : MVE_VABAV_m<MVE_v8s16>;
defm MVE_VABAVs32 : MVE_VABAV_m<MVE_v4s32>;
defm MVE_VABAVu8 : MVE_VABAV_m<MVE_v16u8>;
defm MVE_VABAVu16 : MVE_VABAV_m<MVE_v8u16>;
defm MVE_VABAVu32 : MVE_VABAV_m<MVE_v4u32>;
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
class MVE_VADDV<string iname, string suffix, dag iops, string cstr,
bit A, bit U, bits<2> size, list<dag> pattern=[]>
: MVE_rDest<(outs tGPREven:$Rda), iops, NoItinerary,
iname, suffix, "$Rda, $Qm", cstr, pattern> {
bits<3> Qm;
bits<4> Rda;
let Inst{28} = U;
let Inst{22-20} = 0b111;
let Inst{19-18} = size{1-0};
let Inst{17-16} = 0b01;
let Inst{15-13} = Rda{3-1};
let Inst{12} = 0b0;
let Inst{8-6} = 0b100;
let Inst{5} = A;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b0;
}
multiclass MVE_VADDV_A<string suffix, bit U, bits<2> size,
list<dag> pattern=[]> {
def acc : MVE_VADDV<"vaddva", suffix,
(ins tGPREven:$Rda_src, MQPR:$Qm), "$Rda = $Rda_src",
0b1, U, size, pattern>;
def no_acc : MVE_VADDV<"vaddv", suffix,
(ins MQPR:$Qm), "",
0b0, U, size, pattern>;
}
defm MVE_VADDVs8 : MVE_VADDV_A<"s8", 0b0, 0b00>;
defm MVE_VADDVs16 : MVE_VADDV_A<"s16", 0b0, 0b01>;
defm MVE_VADDVs32 : MVE_VADDV_A<"s32", 0b0, 0b10>;
defm MVE_VADDVu8 : MVE_VADDV_A<"u8", 0b1, 0b00>;
defm MVE_VADDVu16 : MVE_VADDV_A<"u16", 0b1, 0b01>;
defm MVE_VADDVu32 : MVE_VADDV_A<"u32", 0b1, 0b10>;
let Predicates = [HasMVEInt] in {
def : Pat<(i32 (vecreduce_add (v4i32 MQPR:$src))), (i32 (MVE_VADDVu32no_acc $src))>;
def : Pat<(i32 (vecreduce_add (v8i16 MQPR:$src))), (i32 (MVE_VADDVu16no_acc $src))>;
def : Pat<(i32 (vecreduce_add (v16i8 MQPR:$src))), (i32 (MVE_VADDVu8no_acc $src))>;
def : Pat<(i32 (add (i32 (vecreduce_add (v4i32 MQPR:$src1))), (i32 tGPR:$src2))),
(i32 (MVE_VADDVu32acc $src2, $src1))>;
def : Pat<(i32 (add (i32 (vecreduce_add (v8i16 MQPR:$src1))), (i32 tGPR:$src2))),
(i32 (MVE_VADDVu16acc $src2, $src1))>;
def : Pat<(i32 (add (i32 (vecreduce_add (v16i8 MQPR:$src1))), (i32 tGPR:$src2))),
(i32 (MVE_VADDVu8acc $src2, $src1))>;
}
class MVE_VADDLV<string iname, string suffix, dag iops, string cstr,
bit A, bit U, list<dag> pattern=[]>
: MVE_rDest<(outs tGPREven:$RdaLo, tGPROdd:$RdaHi), iops, NoItinerary, iname,
suffix, "$RdaLo, $RdaHi, $Qm", cstr, pattern> {
bits<3> Qm;
bits<4> RdaLo;
bits<4> RdaHi;
let Inst{28} = U;
let Inst{22-20} = RdaHi{3-1};
let Inst{19-18} = 0b10;
let Inst{17-16} = 0b01;
let Inst{15-13} = RdaLo{3-1};
let Inst{12} = 0b0;
let Inst{8-6} = 0b100;
let Inst{5} = A;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b0;
}
multiclass MVE_VADDLV_A<string suffix, bit U, list<dag> pattern=[]> {
def acc : MVE_VADDLV<"vaddlva", suffix,
(ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, MQPR:$Qm),
"$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src",
0b1, U, pattern>;
def no_acc : MVE_VADDLV<"vaddlv", suffix,
(ins MQPR:$Qm), "",
0b0, U, pattern>;
}
defm MVE_VADDLVs32 : MVE_VADDLV_A<"s32", 0b0>;
defm MVE_VADDLVu32 : MVE_VADDLV_A<"u32", 0b1>;
class MVE_VMINMAXNMV<string iname, string suffix, bit sz,
bit bit_17, bit bit_7, list<dag> pattern=[]>
: MVE_rDest<(outs rGPR:$RdaDest), (ins rGPR:$RdaSrc, MQPR:$Qm),
NoItinerary, iname, suffix, "$RdaSrc, $Qm",
"$RdaDest = $RdaSrc", pattern> {
bits<3> Qm;
bits<4> RdaDest;
let Inst{28} = sz;
let Inst{22-20} = 0b110;
let Inst{19-18} = 0b11;
let Inst{17} = bit_17;
let Inst{16} = 0b0;
let Inst{15-12} = RdaDest{3-0};
let Inst{8} = 0b1;
let Inst{7} = bit_7;
let Inst{6-5} = 0b00;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b0;
let Predicates = [HasMVEFloat];
}
multiclass MVE_VMINMAXNMV_fty<string iname, bit bit_7, list<dag> pattern=[]> {
def f32 : MVE_VMINMAXNMV<iname, "f32", 0b0, 0b1, bit_7, pattern>;
def f16 : MVE_VMINMAXNMV<iname, "f16", 0b1, 0b1, bit_7, pattern>;
}
defm MVE_VMINNMV : MVE_VMINMAXNMV_fty<"vminnmv", 0b1>;
defm MVE_VMAXNMV : MVE_VMINMAXNMV_fty<"vmaxnmv", 0b0>;
multiclass MVE_VMINMAXNMAV_fty<string iname, bit bit_7, list<dag> pattern=[]> {
def f32 : MVE_VMINMAXNMV<iname, "f32", 0b0, 0b0, bit_7, pattern>;
def f16 : MVE_VMINMAXNMV<iname, "f16", 0b1, 0b0, bit_7, pattern>;
}
defm MVE_VMINNMAV : MVE_VMINMAXNMAV_fty<"vminnmav", 0b1>;
defm MVE_VMAXNMAV : MVE_VMINMAXNMAV_fty<"vmaxnmav", 0b0>;
class MVE_VMINMAXV<string iname, string suffix, bit U, bits<2> size,
bit bit_17, bit bit_7, list<dag> pattern=[]>
: MVE_rDest<(outs rGPR:$RdaDest), (ins rGPR:$RdaSrc, MQPR:$Qm), NoItinerary,
iname, suffix, "$RdaSrc, $Qm", "$RdaDest = $RdaSrc", pattern> {
bits<3> Qm;
bits<4> RdaDest;
let Inst{28} = U;
let Inst{22-20} = 0b110;
let Inst{19-18} = size{1-0};
let Inst{17} = bit_17;
let Inst{16} = 0b0;
let Inst{15-12} = RdaDest{3-0};
let Inst{8} = 0b1;
let Inst{7} = bit_7;
let Inst{6-5} = 0b00;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b0;
}
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
multiclass MVE_VMINMAXV_p<string iname, bit bit_17, bit bit_7,
MVEVectorVTInfo VTI, Intrinsic intr> {
def "": MVE_VMINMAXV<iname, VTI.Suffix, VTI.Unsigned, VTI.Size,
bit_17, bit_7>;
let Predicates = [HasMVEInt] in
def _pat : Pat<(i32 (intr (i32 rGPR:$prev), (VTI.Vec MQPR:$vec))),
(i32 (!cast<Instruction>(NAME)
(i32 rGPR:$prev), (VTI.Vec MQPR:$vec)))>;
}
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
multiclass MVE_VMINMAXV_ty<string iname, bit bit_7,
Intrinsic intr_s, Intrinsic intr_u> {
defm s8 : MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v16s8, intr_s>;
defm s16: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v8s16, intr_s>;
defm s32: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v4s32, intr_s>;
defm u8 : MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v16u8, intr_u>;
defm u16: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v8u16, intr_u>;
defm u32: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v4u32, intr_u>;
}
defm MVE_VMINV : MVE_VMINMAXV_ty<
"vminv", 0b1, int_arm_mve_minv_s, int_arm_mve_minv_u>;
defm MVE_VMAXV : MVE_VMINMAXV_ty<
"vmaxv", 0b0, int_arm_mve_maxv_s, int_arm_mve_maxv_u>;
let Predicates = [HasMVEInt] in {
def : Pat<(i32 (vecreduce_smax (v16i8 MQPR:$src))),
(i32 (MVE_VMAXVs8 (t2MVNi (i32 127)), $src))>;
def : Pat<(i32 (vecreduce_smax (v8i16 MQPR:$src))),
(i32 (MVE_VMAXVs16 (t2MOVi32imm (i32 -32768)), $src))>;
def : Pat<(i32 (vecreduce_smax (v4i32 MQPR:$src))),
(i32 (MVE_VMAXVs32 (t2MOVi (i32 -2147483648)), $src))>;
def : Pat<(i32 (vecreduce_umax (v16i8 MQPR:$src))),
(i32 (MVE_VMAXVu8 (t2MOVi (i32 0)), $src))>;
def : Pat<(i32 (vecreduce_umax (v8i16 MQPR:$src))),
(i32 (MVE_VMAXVu16 (t2MOVi (i32 0)), $src))>;
def : Pat<(i32 (vecreduce_umax (v4i32 MQPR:$src))),
(i32 (MVE_VMAXVu32 (t2MOVi (i32 0)), $src))>;
def : Pat<(i32 (vecreduce_smin (v16i8 MQPR:$src))),
(i32 (MVE_VMINVs8 (t2MOVi (i32 127)), $src))>;
def : Pat<(i32 (vecreduce_smin (v8i16 MQPR:$src))),
(i32 (MVE_VMINVs16 (t2MOVi16 (i32 32767)), $src))>;
def : Pat<(i32 (vecreduce_smin (v4i32 MQPR:$src))),
(i32 (MVE_VMINVs32 (t2MVNi (i32 -2147483648)), $src))>;
def : Pat<(i32 (vecreduce_umin (v16i8 MQPR:$src))),
(i32 (MVE_VMINVu8 (t2MOVi (i32 255)), $src))>;
def : Pat<(i32 (vecreduce_umin (v8i16 MQPR:$src))),
(i32 (MVE_VMINVu16 (t2MOVi16 (i32 65535)), $src))>;
def : Pat<(i32 (vecreduce_umin (v4i32 MQPR:$src))),
(i32 (MVE_VMINVu32 (t2MOVi (i32 4294967295)), $src))>;
}
multiclass MVE_VMINMAXAV_ty<string iname, bit bit_7, list<dag> pattern=[]> {
def s8 : MVE_VMINMAXV<iname, "s8", 0b0, 0b00, 0b0, bit_7>;
def s16 : MVE_VMINMAXV<iname, "s16", 0b0, 0b01, 0b0, bit_7>;
def s32 : MVE_VMINMAXV<iname, "s32", 0b0, 0b10, 0b0, bit_7>;
}
defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 0b1>;
defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0b0>;
class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr,
bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0>
: MVE_rDest<(outs tGPREven:$RdaDest), iops, NoItinerary, iname, suffix,
"$RdaDest, $Qn, $Qm", cstr, []> {
bits<4> RdaDest;
bits<3> Qm;
bits<3> Qn;
let Inst{28} = bit_28;
let Inst{22-20} = 0b111;
let Inst{19-17} = Qn{2-0};
let Inst{16} = sz;
let Inst{15-13} = RdaDest{3-1};
let Inst{12} = X;
let Inst{8} = bit_8;
let Inst{7-6} = 0b00;
let Inst{5} = A;
let Inst{3-1} = Qm{2-0};
let Inst{0} = bit_0;
}
multiclass MVE_VMLAMLSDAV_A<string iname, string x, MVEVectorVTInfo VTI,
bit sz, bit bit_28, bit X, bit bit_8, bit bit_0> {
def ""#x#VTI.Suffix : MVE_VMLAMLSDAV<iname # x, VTI.Suffix,
[ARM] Remove some spurious MVE reduction instructions. The family of 'dual-accumulating' vector multiply-add instructions (VMLADAV, VMLALDAV and VRMLALDAVH) can all operate on both signed and unsigned integer types, and they all have an 'exchange' variant (with an X in the name) that modifies which pairs of vector lanes in the two inputs are multiplied together. But there's a clause in the spec that says that the X variants //don't// operate on unsigned integer types, only signed. You can have X, or unsigned, or neither, but not both. We didn't notice that clause when we implemented the MC support for these instructions, so LLVM believes that things like VMLADAVX.U8 do exist, contradicting the spec. Here I fix that by conditioning them out in Tablegen. In order to do that, I've reversed the nesting order of the Tablegen multiclasses for those instructions. Previously, the innermost multiclass generated the X and not-X variants, and the one outside that generated the A and not-A variants. Now X is done by the outer multiclass, which allows me to bypass that one when I only want the two not-X variants. Changing the multiclass nesting order also changes the names of the instruction ids unless I make a special effort not to. I decided that while I was changing them anyway I'd make them look nicer; so now the instructions have names like MVE_VMLADAVs32 or MVE_VMLADAVaxs32, instead of cumbersome _noacc_noexch suffixes. The corresponding multiply-subtract instructions are unaffected. Those don't accept unsigned types at all, either in the spec or in LLVM. Reviewers: ostannard, dmgreen Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67214 llvm-svn: 371405
2019-09-09 23:17:26 +08:00
(ins MQPR:$Qn, MQPR:$Qm), "",
sz, bit_28, 0b0, X, bit_8, bit_0>;
def "a"#x#VTI.Suffix : MVE_VMLAMLSDAV<iname # "a" # x, VTI.Suffix,
[ARM] Remove some spurious MVE reduction instructions. The family of 'dual-accumulating' vector multiply-add instructions (VMLADAV, VMLALDAV and VRMLALDAVH) can all operate on both signed and unsigned integer types, and they all have an 'exchange' variant (with an X in the name) that modifies which pairs of vector lanes in the two inputs are multiplied together. But there's a clause in the spec that says that the X variants //don't// operate on unsigned integer types, only signed. You can have X, or unsigned, or neither, but not both. We didn't notice that clause when we implemented the MC support for these instructions, so LLVM believes that things like VMLADAVX.U8 do exist, contradicting the spec. Here I fix that by conditioning them out in Tablegen. In order to do that, I've reversed the nesting order of the Tablegen multiclasses for those instructions. Previously, the innermost multiclass generated the X and not-X variants, and the one outside that generated the A and not-A variants. Now X is done by the outer multiclass, which allows me to bypass that one when I only want the two not-X variants. Changing the multiclass nesting order also changes the names of the instruction ids unless I make a special effort not to. I decided that while I was changing them anyway I'd make them look nicer; so now the instructions have names like MVE_VMLADAVs32 or MVE_VMLADAVaxs32, instead of cumbersome _noacc_noexch suffixes. The corresponding multiply-subtract instructions are unaffected. Those don't accept unsigned types at all, either in the spec or in LLVM. Reviewers: ostannard, dmgreen Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67214 llvm-svn: 371405
2019-09-09 23:17:26 +08:00
(ins tGPREven:$RdaSrc, MQPR:$Qn, MQPR:$Qm),
"$RdaDest = $RdaSrc",
sz, bit_28, 0b1, X, bit_8, bit_0>;
let Predicates = [HasMVEInt] in {
def : Pat<(i32 (int_arm_mve_vmldava
(i32 VTI.Unsigned),
(i32 bit_0) /* subtract */,
(i32 X) /* exchange */,
(i32 0) /* accumulator */,
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))),
(i32 (!cast<Instruction>(NAME # x # VTI.Suffix)
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm)))>;
def : Pat<(i32 (int_arm_mve_vmldava_predicated
(i32 VTI.Unsigned),
(i32 bit_0) /* subtract */,
(i32 X) /* exchange */,
(i32 0) /* accumulator */,
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
(VTI.Pred VCCR:$mask))),
(i32 (!cast<Instruction>(NAME # x # VTI.Suffix)
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
def : Pat<(i32 (int_arm_mve_vmldava
(i32 VTI.Unsigned),
(i32 bit_0) /* subtract */,
(i32 X) /* exchange */,
(i32 tGPREven:$RdaSrc),
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))),
(i32 (!cast<Instruction>(NAME # "a" # x # VTI.Suffix)
(i32 tGPREven:$RdaSrc),
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm)))>;
def : Pat<(i32 (int_arm_mve_vmldava_predicated
(i32 VTI.Unsigned),
(i32 bit_0) /* subtract */,
(i32 X) /* exchange */,
(i32 tGPREven:$RdaSrc),
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
(VTI.Pred VCCR:$mask))),
(i32 (!cast<Instruction>(NAME # "a" # x # VTI.Suffix)
(i32 tGPREven:$RdaSrc),
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
}
[ARM] Remove some spurious MVE reduction instructions. The family of 'dual-accumulating' vector multiply-add instructions (VMLADAV, VMLALDAV and VRMLALDAVH) can all operate on both signed and unsigned integer types, and they all have an 'exchange' variant (with an X in the name) that modifies which pairs of vector lanes in the two inputs are multiplied together. But there's a clause in the spec that says that the X variants //don't// operate on unsigned integer types, only signed. You can have X, or unsigned, or neither, but not both. We didn't notice that clause when we implemented the MC support for these instructions, so LLVM believes that things like VMLADAVX.U8 do exist, contradicting the spec. Here I fix that by conditioning them out in Tablegen. In order to do that, I've reversed the nesting order of the Tablegen multiclasses for those instructions. Previously, the innermost multiclass generated the X and not-X variants, and the one outside that generated the A and not-A variants. Now X is done by the outer multiclass, which allows me to bypass that one when I only want the two not-X variants. Changing the multiclass nesting order also changes the names of the instruction ids unless I make a special effort not to. I decided that while I was changing them anyway I'd make them look nicer; so now the instructions have names like MVE_VMLADAVs32 or MVE_VMLADAVaxs32, instead of cumbersome _noacc_noexch suffixes. The corresponding multiply-subtract instructions are unaffected. Those don't accept unsigned types at all, either in the spec or in LLVM. Reviewers: ostannard, dmgreen Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67214 llvm-svn: 371405
2019-09-09 23:17:26 +08:00
}
multiclass MVE_VMLAMLSDAV_AX<string iname, MVEVectorVTInfo VTI, bit sz,
bit bit_28, bit bit_8, bit bit_0> {
defm "" : MVE_VMLAMLSDAV_A<iname, "", VTI, sz, bit_28,
0b0, bit_8, bit_0>;
defm "" : MVE_VMLAMLSDAV_A<iname, "x", VTI, sz, bit_28,
0b1, bit_8, bit_0>;
}
multiclass MVE_VMLADAV_multi<MVEVectorVTInfo SVTI, MVEVectorVTInfo UVTI,
bit sz, bit bit_8> {
defm "" : MVE_VMLAMLSDAV_AX<"vmladav", SVTI,
sz, 0b0, bit_8, 0b0>;
defm "" : MVE_VMLAMLSDAV_A<"vmladav", "", UVTI,
sz, 0b1, 0b0, bit_8, 0b0>;
}
multiclass MVE_VMLSDAV_multi<MVEVectorVTInfo VTI, bit sz, bit bit_28> {
defm "" : MVE_VMLAMLSDAV_AX<"vmlsdav", VTI,
sz, bit_28, 0b0, 0b1>;
}
defm MVE_VMLADAV : MVE_VMLADAV_multi<MVE_v16s8, MVE_v16u8, 0b0, 0b1>;
defm MVE_VMLADAV : MVE_VMLADAV_multi<MVE_v8s16, MVE_v8u16, 0b0, 0b0>;
defm MVE_VMLADAV : MVE_VMLADAV_multi<MVE_v4s32, MVE_v4u32, 0b1, 0b0>;
defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v16s8, 0b0, 0b1>;
defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v8s16, 0b0, 0b0>;
defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v4s32, 0b1, 0b0>;
// vmlav aliases vmladav
[ARM] Remove some spurious MVE reduction instructions. The family of 'dual-accumulating' vector multiply-add instructions (VMLADAV, VMLALDAV and VRMLALDAVH) can all operate on both signed and unsigned integer types, and they all have an 'exchange' variant (with an X in the name) that modifies which pairs of vector lanes in the two inputs are multiplied together. But there's a clause in the spec that says that the X variants //don't// operate on unsigned integer types, only signed. You can have X, or unsigned, or neither, but not both. We didn't notice that clause when we implemented the MC support for these instructions, so LLVM believes that things like VMLADAVX.U8 do exist, contradicting the spec. Here I fix that by conditioning them out in Tablegen. In order to do that, I've reversed the nesting order of the Tablegen multiclasses for those instructions. Previously, the innermost multiclass generated the X and not-X variants, and the one outside that generated the A and not-A variants. Now X is done by the outer multiclass, which allows me to bypass that one when I only want the two not-X variants. Changing the multiclass nesting order also changes the names of the instruction ids unless I make a special effort not to. I decided that while I was changing them anyway I'd make them look nicer; so now the instructions have names like MVE_VMLADAVs32 or MVE_VMLADAVaxs32, instead of cumbersome _noacc_noexch suffixes. The corresponding multiply-subtract instructions are unaffected. Those don't accept unsigned types at all, either in the spec or in LLVM. Reviewers: ostannard, dmgreen Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67214 llvm-svn: 371405
2019-09-09 23:17:26 +08:00
foreach acc = ["", "a"] in {
foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32"] in {
[ARM] Remove some spurious MVE reduction instructions. The family of 'dual-accumulating' vector multiply-add instructions (VMLADAV, VMLALDAV and VRMLALDAVH) can all operate on both signed and unsigned integer types, and they all have an 'exchange' variant (with an X in the name) that modifies which pairs of vector lanes in the two inputs are multiplied together. But there's a clause in the spec that says that the X variants //don't// operate on unsigned integer types, only signed. You can have X, or unsigned, or neither, but not both. We didn't notice that clause when we implemented the MC support for these instructions, so LLVM believes that things like VMLADAVX.U8 do exist, contradicting the spec. Here I fix that by conditioning them out in Tablegen. In order to do that, I've reversed the nesting order of the Tablegen multiclasses for those instructions. Previously, the innermost multiclass generated the X and not-X variants, and the one outside that generated the A and not-A variants. Now X is done by the outer multiclass, which allows me to bypass that one when I only want the two not-X variants. Changing the multiclass nesting order also changes the names of the instruction ids unless I make a special effort not to. I decided that while I was changing them anyway I'd make them look nicer; so now the instructions have names like MVE_VMLADAVs32 or MVE_VMLADAVaxs32, instead of cumbersome _noacc_noexch suffixes. The corresponding multiply-subtract instructions are unaffected. Those don't accept unsigned types at all, either in the spec or in LLVM. Reviewers: ostannard, dmgreen Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67214 llvm-svn: 371405
2019-09-09 23:17:26 +08:00
def : MVEInstAlias<"vmlav"#acc#"${vp}."#suffix#"\t$RdaDest, $Qn, $Qm",
(!cast<Instruction>("MVE_VMLADAV"#acc#suffix)
tGPREven:$RdaDest, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
}
}
// Base class for VMLALDAV and VMLSLDAV, VRMLALDAVH, VRMLSLDAVH
class MVE_VMLALDAVBase<string iname, string suffix, dag iops, string cstr,
bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0,
list<dag> pattern=[]>
: MVE_rDest<(outs tGPREven:$RdaLoDest, tGPROdd:$RdaHiDest), iops, NoItinerary,
iname, suffix, "$RdaLoDest, $RdaHiDest, $Qn, $Qm", cstr, pattern> {
bits<4> RdaLoDest;
bits<4> RdaHiDest;
bits<3> Qm;
bits<3> Qn;
let Inst{28} = bit_28;
let Inst{22-20} = RdaHiDest{3-1};
let Inst{19-17} = Qn{2-0};
let Inst{16} = sz;
let Inst{15-13} = RdaLoDest{3-1};
let Inst{12} = X;
let Inst{8} = bit_8;
let Inst{7-6} = 0b00;
let Inst{5} = A;
let Inst{3-1} = Qm{2-0};
let Inst{0} = bit_0;
}
[ARM] Remove some spurious MVE reduction instructions. The family of 'dual-accumulating' vector multiply-add instructions (VMLADAV, VMLALDAV and VRMLALDAVH) can all operate on both signed and unsigned integer types, and they all have an 'exchange' variant (with an X in the name) that modifies which pairs of vector lanes in the two inputs are multiplied together. But there's a clause in the spec that says that the X variants //don't// operate on unsigned integer types, only signed. You can have X, or unsigned, or neither, but not both. We didn't notice that clause when we implemented the MC support for these instructions, so LLVM believes that things like VMLADAVX.U8 do exist, contradicting the spec. Here I fix that by conditioning them out in Tablegen. In order to do that, I've reversed the nesting order of the Tablegen multiclasses for those instructions. Previously, the innermost multiclass generated the X and not-X variants, and the one outside that generated the A and not-A variants. Now X is done by the outer multiclass, which allows me to bypass that one when I only want the two not-X variants. Changing the multiclass nesting order also changes the names of the instruction ids unless I make a special effort not to. I decided that while I was changing them anyway I'd make them look nicer; so now the instructions have names like MVE_VMLADAVs32 or MVE_VMLADAVaxs32, instead of cumbersome _noacc_noexch suffixes. The corresponding multiply-subtract instructions are unaffected. Those don't accept unsigned types at all, either in the spec or in LLVM. Reviewers: ostannard, dmgreen Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67214 llvm-svn: 371405
2019-09-09 23:17:26 +08:00
multiclass MVE_VMLALDAVBase_A<string iname, string x, string suffix,
bit sz, bit bit_28, bit X, bit bit_8, bit bit_0,
list<dag> pattern=[]> {
def ""#x#suffix : MVE_VMLALDAVBase<
iname # x, suffix, (ins MQPR:$Qn, MQPR:$Qm), "",
sz, bit_28, 0b0, X, bit_8, bit_0, pattern>;
def "a"#x#suffix : MVE_VMLALDAVBase<
iname # "a" # x, suffix,
(ins tGPREven:$RdaLoSrc, tGPROdd:$RdaHiSrc, MQPR:$Qn, MQPR:$Qm),
"$RdaLoDest = $RdaLoSrc,$RdaHiDest = $RdaHiSrc",
sz, bit_28, 0b1, X, bit_8, bit_0, pattern>;
}
[ARM] Remove some spurious MVE reduction instructions. The family of 'dual-accumulating' vector multiply-add instructions (VMLADAV, VMLALDAV and VRMLALDAVH) can all operate on both signed and unsigned integer types, and they all have an 'exchange' variant (with an X in the name) that modifies which pairs of vector lanes in the two inputs are multiplied together. But there's a clause in the spec that says that the X variants //don't// operate on unsigned integer types, only signed. You can have X, or unsigned, or neither, but not both. We didn't notice that clause when we implemented the MC support for these instructions, so LLVM believes that things like VMLADAVX.U8 do exist, contradicting the spec. Here I fix that by conditioning them out in Tablegen. In order to do that, I've reversed the nesting order of the Tablegen multiclasses for those instructions. Previously, the innermost multiclass generated the X and not-X variants, and the one outside that generated the A and not-A variants. Now X is done by the outer multiclass, which allows me to bypass that one when I only want the two not-X variants. Changing the multiclass nesting order also changes the names of the instruction ids unless I make a special effort not to. I decided that while I was changing them anyway I'd make them look nicer; so now the instructions have names like MVE_VMLADAVs32 or MVE_VMLADAVaxs32, instead of cumbersome _noacc_noexch suffixes. The corresponding multiply-subtract instructions are unaffected. Those don't accept unsigned types at all, either in the spec or in LLVM. Reviewers: ostannard, dmgreen Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67214 llvm-svn: 371405
2019-09-09 23:17:26 +08:00
multiclass MVE_VMLALDAVBase_AX<string iname, string suffix, bit sz, bit bit_28,
bit bit_8, bit bit_0, list<dag> pattern=[]> {
defm "" : MVE_VMLALDAVBase_A<iname, "", suffix, sz,
bit_28, 0b0, bit_8, bit_0, pattern>;
defm "" : MVE_VMLALDAVBase_A<iname, "x", suffix, sz,
bit_28, 0b1, bit_8, bit_0, pattern>;
}
[ARM] Remove some spurious MVE reduction instructions. The family of 'dual-accumulating' vector multiply-add instructions (VMLADAV, VMLALDAV and VRMLALDAVH) can all operate on both signed and unsigned integer types, and they all have an 'exchange' variant (with an X in the name) that modifies which pairs of vector lanes in the two inputs are multiplied together. But there's a clause in the spec that says that the X variants //don't// operate on unsigned integer types, only signed. You can have X, or unsigned, or neither, but not both. We didn't notice that clause when we implemented the MC support for these instructions, so LLVM believes that things like VMLADAVX.U8 do exist, contradicting the spec. Here I fix that by conditioning them out in Tablegen. In order to do that, I've reversed the nesting order of the Tablegen multiclasses for those instructions. Previously, the innermost multiclass generated the X and not-X variants, and the one outside that generated the A and not-A variants. Now X is done by the outer multiclass, which allows me to bypass that one when I only want the two not-X variants. Changing the multiclass nesting order also changes the names of the instruction ids unless I make a special effort not to. I decided that while I was changing them anyway I'd make them look nicer; so now the instructions have names like MVE_VMLADAVs32 or MVE_VMLADAVaxs32, instead of cumbersome _noacc_noexch suffixes. The corresponding multiply-subtract instructions are unaffected. Those don't accept unsigned types at all, either in the spec or in LLVM. Reviewers: ostannard, dmgreen Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67214 llvm-svn: 371405
2019-09-09 23:17:26 +08:00
multiclass MVE_VRMLALDAVH_multi<string suffix, list<dag> pattern=[]> {
defm "" : MVE_VMLALDAVBase_AX<"vrmlaldavh", "s"#suffix,
0b0, 0b0, 0b1, 0b0, pattern>;
defm "" : MVE_VMLALDAVBase_A<"vrmlaldavh", "", "u"#suffix,
0b0, 0b1, 0b0, 0b1, 0b0, pattern>;
}
[ARM] Remove some spurious MVE reduction instructions. The family of 'dual-accumulating' vector multiply-add instructions (VMLADAV, VMLALDAV and VRMLALDAVH) can all operate on both signed and unsigned integer types, and they all have an 'exchange' variant (with an X in the name) that modifies which pairs of vector lanes in the two inputs are multiplied together. But there's a clause in the spec that says that the X variants //don't// operate on unsigned integer types, only signed. You can have X, or unsigned, or neither, but not both. We didn't notice that clause when we implemented the MC support for these instructions, so LLVM believes that things like VMLADAVX.U8 do exist, contradicting the spec. Here I fix that by conditioning them out in Tablegen. In order to do that, I've reversed the nesting order of the Tablegen multiclasses for those instructions. Previously, the innermost multiclass generated the X and not-X variants, and the one outside that generated the A and not-A variants. Now X is done by the outer multiclass, which allows me to bypass that one when I only want the two not-X variants. Changing the multiclass nesting order also changes the names of the instruction ids unless I make a special effort not to. I decided that while I was changing them anyway I'd make them look nicer; so now the instructions have names like MVE_VMLADAVs32 or MVE_VMLADAVaxs32, instead of cumbersome _noacc_noexch suffixes. The corresponding multiply-subtract instructions are unaffected. Those don't accept unsigned types at all, either in the spec or in LLVM. Reviewers: ostannard, dmgreen Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67214 llvm-svn: 371405
2019-09-09 23:17:26 +08:00
defm MVE_VRMLALDAVH : MVE_VRMLALDAVH_multi<"32">;
// vrmlalvh aliases for vrmlaldavh
def : MVEInstAlias<"vrmlalvh${vp}.s32\t$RdaLo, $RdaHi, $Qn, $Qm",
[ARM] Remove some spurious MVE reduction instructions. The family of 'dual-accumulating' vector multiply-add instructions (VMLADAV, VMLALDAV and VRMLALDAVH) can all operate on both signed and unsigned integer types, and they all have an 'exchange' variant (with an X in the name) that modifies which pairs of vector lanes in the two inputs are multiplied together. But there's a clause in the spec that says that the X variants //don't// operate on unsigned integer types, only signed. You can have X, or unsigned, or neither, but not both. We didn't notice that clause when we implemented the MC support for these instructions, so LLVM believes that things like VMLADAVX.U8 do exist, contradicting the spec. Here I fix that by conditioning them out in Tablegen. In order to do that, I've reversed the nesting order of the Tablegen multiclasses for those instructions. Previously, the innermost multiclass generated the X and not-X variants, and the one outside that generated the A and not-A variants. Now X is done by the outer multiclass, which allows me to bypass that one when I only want the two not-X variants. Changing the multiclass nesting order also changes the names of the instruction ids unless I make a special effort not to. I decided that while I was changing them anyway I'd make them look nicer; so now the instructions have names like MVE_VMLADAVs32 or MVE_VMLADAVaxs32, instead of cumbersome _noacc_noexch suffixes. The corresponding multiply-subtract instructions are unaffected. Those don't accept unsigned types at all, either in the spec or in LLVM. Reviewers: ostannard, dmgreen Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67214 llvm-svn: 371405
2019-09-09 23:17:26 +08:00
(MVE_VRMLALDAVHs32
tGPREven:$RdaLo, tGPROdd:$RdaHi,
MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
def : MVEInstAlias<"vrmlalvha${vp}.s32\t$RdaLo, $RdaHi, $Qn, $Qm",
[ARM] Remove some spurious MVE reduction instructions. The family of 'dual-accumulating' vector multiply-add instructions (VMLADAV, VMLALDAV and VRMLALDAVH) can all operate on both signed and unsigned integer types, and they all have an 'exchange' variant (with an X in the name) that modifies which pairs of vector lanes in the two inputs are multiplied together. But there's a clause in the spec that says that the X variants //don't// operate on unsigned integer types, only signed. You can have X, or unsigned, or neither, but not both. We didn't notice that clause when we implemented the MC support for these instructions, so LLVM believes that things like VMLADAVX.U8 do exist, contradicting the spec. Here I fix that by conditioning them out in Tablegen. In order to do that, I've reversed the nesting order of the Tablegen multiclasses for those instructions. Previously, the innermost multiclass generated the X and not-X variants, and the one outside that generated the A and not-A variants. Now X is done by the outer multiclass, which allows me to bypass that one when I only want the two not-X variants. Changing the multiclass nesting order also changes the names of the instruction ids unless I make a special effort not to. I decided that while I was changing them anyway I'd make them look nicer; so now the instructions have names like MVE_VMLADAVs32 or MVE_VMLADAVaxs32, instead of cumbersome _noacc_noexch suffixes. The corresponding multiply-subtract instructions are unaffected. Those don't accept unsigned types at all, either in the spec or in LLVM. Reviewers: ostannard, dmgreen Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67214 llvm-svn: 371405
2019-09-09 23:17:26 +08:00
(MVE_VRMLALDAVHas32
tGPREven:$RdaLo, tGPROdd:$RdaHi,
MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
def : MVEInstAlias<"vrmlalvh${vp}.u32\t$RdaLo, $RdaHi, $Qn, $Qm",
[ARM] Remove some spurious MVE reduction instructions. The family of 'dual-accumulating' vector multiply-add instructions (VMLADAV, VMLALDAV and VRMLALDAVH) can all operate on both signed and unsigned integer types, and they all have an 'exchange' variant (with an X in the name) that modifies which pairs of vector lanes in the two inputs are multiplied together. But there's a clause in the spec that says that the X variants //don't// operate on unsigned integer types, only signed. You can have X, or unsigned, or neither, but not both. We didn't notice that clause when we implemented the MC support for these instructions, so LLVM believes that things like VMLADAVX.U8 do exist, contradicting the spec. Here I fix that by conditioning them out in Tablegen. In order to do that, I've reversed the nesting order of the Tablegen multiclasses for those instructions. Previously, the innermost multiclass generated the X and not-X variants, and the one outside that generated the A and not-A variants. Now X is done by the outer multiclass, which allows me to bypass that one when I only want the two not-X variants. Changing the multiclass nesting order also changes the names of the instruction ids unless I make a special effort not to. I decided that while I was changing them anyway I'd make them look nicer; so now the instructions have names like MVE_VMLADAVs32 or MVE_VMLADAVaxs32, instead of cumbersome _noacc_noexch suffixes. The corresponding multiply-subtract instructions are unaffected. Those don't accept unsigned types at all, either in the spec or in LLVM. Reviewers: ostannard, dmgreen Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67214 llvm-svn: 371405
2019-09-09 23:17:26 +08:00
(MVE_VRMLALDAVHu32
tGPREven:$RdaLo, tGPROdd:$RdaHi,
MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
def : MVEInstAlias<"vrmlalvha${vp}.u32\t$RdaLo, $RdaHi, $Qn, $Qm",
[ARM] Remove some spurious MVE reduction instructions. The family of 'dual-accumulating' vector multiply-add instructions (VMLADAV, VMLALDAV and VRMLALDAVH) can all operate on both signed and unsigned integer types, and they all have an 'exchange' variant (with an X in the name) that modifies which pairs of vector lanes in the two inputs are multiplied together. But there's a clause in the spec that says that the X variants //don't// operate on unsigned integer types, only signed. You can have X, or unsigned, or neither, but not both. We didn't notice that clause when we implemented the MC support for these instructions, so LLVM believes that things like VMLADAVX.U8 do exist, contradicting the spec. Here I fix that by conditioning them out in Tablegen. In order to do that, I've reversed the nesting order of the Tablegen multiclasses for those instructions. Previously, the innermost multiclass generated the X and not-X variants, and the one outside that generated the A and not-A variants. Now X is done by the outer multiclass, which allows me to bypass that one when I only want the two not-X variants. Changing the multiclass nesting order also changes the names of the instruction ids unless I make a special effort not to. I decided that while I was changing them anyway I'd make them look nicer; so now the instructions have names like MVE_VMLADAVs32 or MVE_VMLADAVaxs32, instead of cumbersome _noacc_noexch suffixes. The corresponding multiply-subtract instructions are unaffected. Those don't accept unsigned types at all, either in the spec or in LLVM. Reviewers: ostannard, dmgreen Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67214 llvm-svn: 371405
2019-09-09 23:17:26 +08:00
(MVE_VRMLALDAVHau32
tGPREven:$RdaLo, tGPROdd:$RdaHi,
MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
[ARM] Remove some spurious MVE reduction instructions. The family of 'dual-accumulating' vector multiply-add instructions (VMLADAV, VMLALDAV and VRMLALDAVH) can all operate on both signed and unsigned integer types, and they all have an 'exchange' variant (with an X in the name) that modifies which pairs of vector lanes in the two inputs are multiplied together. But there's a clause in the spec that says that the X variants //don't// operate on unsigned integer types, only signed. You can have X, or unsigned, or neither, but not both. We didn't notice that clause when we implemented the MC support for these instructions, so LLVM believes that things like VMLADAVX.U8 do exist, contradicting the spec. Here I fix that by conditioning them out in Tablegen. In order to do that, I've reversed the nesting order of the Tablegen multiclasses for those instructions. Previously, the innermost multiclass generated the X and not-X variants, and the one outside that generated the A and not-A variants. Now X is done by the outer multiclass, which allows me to bypass that one when I only want the two not-X variants. Changing the multiclass nesting order also changes the names of the instruction ids unless I make a special effort not to. I decided that while I was changing them anyway I'd make them look nicer; so now the instructions have names like MVE_VMLADAVs32 or MVE_VMLADAVaxs32, instead of cumbersome _noacc_noexch suffixes. The corresponding multiply-subtract instructions are unaffected. Those don't accept unsigned types at all, either in the spec or in LLVM. Reviewers: ostannard, dmgreen Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67214 llvm-svn: 371405
2019-09-09 23:17:26 +08:00
multiclass MVE_VMLALDAV_multi<string suffix, bit sz, list<dag> pattern=[]> {
defm "" : MVE_VMLALDAVBase_AX<"vmlaldav", "s"#suffix, sz, 0b0, 0b0, 0b0, pattern>;
defm "" : MVE_VMLALDAVBase_A<"vmlaldav", "", "u"#suffix,
sz, 0b1, 0b0, 0b0, 0b0, pattern>;
}
[ARM] Remove some spurious MVE reduction instructions. The family of 'dual-accumulating' vector multiply-add instructions (VMLADAV, VMLALDAV and VRMLALDAVH) can all operate on both signed and unsigned integer types, and they all have an 'exchange' variant (with an X in the name) that modifies which pairs of vector lanes in the two inputs are multiplied together. But there's a clause in the spec that says that the X variants //don't// operate on unsigned integer types, only signed. You can have X, or unsigned, or neither, but not both. We didn't notice that clause when we implemented the MC support for these instructions, so LLVM believes that things like VMLADAVX.U8 do exist, contradicting the spec. Here I fix that by conditioning them out in Tablegen. In order to do that, I've reversed the nesting order of the Tablegen multiclasses for those instructions. Previously, the innermost multiclass generated the X and not-X variants, and the one outside that generated the A and not-A variants. Now X is done by the outer multiclass, which allows me to bypass that one when I only want the two not-X variants. Changing the multiclass nesting order also changes the names of the instruction ids unless I make a special effort not to. I decided that while I was changing them anyway I'd make them look nicer; so now the instructions have names like MVE_VMLADAVs32 or MVE_VMLADAVaxs32, instead of cumbersome _noacc_noexch suffixes. The corresponding multiply-subtract instructions are unaffected. Those don't accept unsigned types at all, either in the spec or in LLVM. Reviewers: ostannard, dmgreen Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67214 llvm-svn: 371405
2019-09-09 23:17:26 +08:00
defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"16", 0b0>;
defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"32", 0b1>;
// vmlalv aliases vmlaldav
[ARM] Remove some spurious MVE reduction instructions. The family of 'dual-accumulating' vector multiply-add instructions (VMLADAV, VMLALDAV and VRMLALDAVH) can all operate on both signed and unsigned integer types, and they all have an 'exchange' variant (with an X in the name) that modifies which pairs of vector lanes in the two inputs are multiplied together. But there's a clause in the spec that says that the X variants //don't// operate on unsigned integer types, only signed. You can have X, or unsigned, or neither, but not both. We didn't notice that clause when we implemented the MC support for these instructions, so LLVM believes that things like VMLADAVX.U8 do exist, contradicting the spec. Here I fix that by conditioning them out in Tablegen. In order to do that, I've reversed the nesting order of the Tablegen multiclasses for those instructions. Previously, the innermost multiclass generated the X and not-X variants, and the one outside that generated the A and not-A variants. Now X is done by the outer multiclass, which allows me to bypass that one when I only want the two not-X variants. Changing the multiclass nesting order also changes the names of the instruction ids unless I make a special effort not to. I decided that while I was changing them anyway I'd make them look nicer; so now the instructions have names like MVE_VMLADAVs32 or MVE_VMLADAVaxs32, instead of cumbersome _noacc_noexch suffixes. The corresponding multiply-subtract instructions are unaffected. Those don't accept unsigned types at all, either in the spec or in LLVM. Reviewers: ostannard, dmgreen Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67214 llvm-svn: 371405
2019-09-09 23:17:26 +08:00
foreach acc = ["", "a"] in {
foreach suffix = ["s16", "s32", "u16", "u32"] in {
[ARM] Remove some spurious MVE reduction instructions. The family of 'dual-accumulating' vector multiply-add instructions (VMLADAV, VMLALDAV and VRMLALDAVH) can all operate on both signed and unsigned integer types, and they all have an 'exchange' variant (with an X in the name) that modifies which pairs of vector lanes in the two inputs are multiplied together. But there's a clause in the spec that says that the X variants //don't// operate on unsigned integer types, only signed. You can have X, or unsigned, or neither, but not both. We didn't notice that clause when we implemented the MC support for these instructions, so LLVM believes that things like VMLADAVX.U8 do exist, contradicting the spec. Here I fix that by conditioning them out in Tablegen. In order to do that, I've reversed the nesting order of the Tablegen multiclasses for those instructions. Previously, the innermost multiclass generated the X and not-X variants, and the one outside that generated the A and not-A variants. Now X is done by the outer multiclass, which allows me to bypass that one when I only want the two not-X variants. Changing the multiclass nesting order also changes the names of the instruction ids unless I make a special effort not to. I decided that while I was changing them anyway I'd make them look nicer; so now the instructions have names like MVE_VMLADAVs32 or MVE_VMLADAVaxs32, instead of cumbersome _noacc_noexch suffixes. The corresponding multiply-subtract instructions are unaffected. Those don't accept unsigned types at all, either in the spec or in LLVM. Reviewers: ostannard, dmgreen Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67214 llvm-svn: 371405
2019-09-09 23:17:26 +08:00
def : MVEInstAlias<"vmlalv" # acc # "${vp}." # suffix #
"\t$RdaLoDest, $RdaHiDest, $Qn, $Qm",
(!cast<Instruction>("MVE_VMLALDAV"#acc#suffix)
tGPREven:$RdaLoDest, tGPROdd:$RdaHiDest,
MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
}
}
multiclass MVE_VMLSLDAV_multi<string iname, string suffix, bit sz,
[ARM] Remove some spurious MVE reduction instructions. The family of 'dual-accumulating' vector multiply-add instructions (VMLADAV, VMLALDAV and VRMLALDAVH) can all operate on both signed and unsigned integer types, and they all have an 'exchange' variant (with an X in the name) that modifies which pairs of vector lanes in the two inputs are multiplied together. But there's a clause in the spec that says that the X variants //don't// operate on unsigned integer types, only signed. You can have X, or unsigned, or neither, but not both. We didn't notice that clause when we implemented the MC support for these instructions, so LLVM believes that things like VMLADAVX.U8 do exist, contradicting the spec. Here I fix that by conditioning them out in Tablegen. In order to do that, I've reversed the nesting order of the Tablegen multiclasses for those instructions. Previously, the innermost multiclass generated the X and not-X variants, and the one outside that generated the A and not-A variants. Now X is done by the outer multiclass, which allows me to bypass that one when I only want the two not-X variants. Changing the multiclass nesting order also changes the names of the instruction ids unless I make a special effort not to. I decided that while I was changing them anyway I'd make them look nicer; so now the instructions have names like MVE_VMLADAVs32 or MVE_VMLADAVaxs32, instead of cumbersome _noacc_noexch suffixes. The corresponding multiply-subtract instructions are unaffected. Those don't accept unsigned types at all, either in the spec or in LLVM. Reviewers: ostannard, dmgreen Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67214 llvm-svn: 371405
2019-09-09 23:17:26 +08:00
bit bit_28, list<dag> pattern=[]> {
defm "" : MVE_VMLALDAVBase_AX<iname, suffix, sz, bit_28, 0b0, 0b1, pattern>;
}
[ARM] Remove some spurious MVE reduction instructions. The family of 'dual-accumulating' vector multiply-add instructions (VMLADAV, VMLALDAV and VRMLALDAVH) can all operate on both signed and unsigned integer types, and they all have an 'exchange' variant (with an X in the name) that modifies which pairs of vector lanes in the two inputs are multiplied together. But there's a clause in the spec that says that the X variants //don't// operate on unsigned integer types, only signed. You can have X, or unsigned, or neither, but not both. We didn't notice that clause when we implemented the MC support for these instructions, so LLVM believes that things like VMLADAVX.U8 do exist, contradicting the spec. Here I fix that by conditioning them out in Tablegen. In order to do that, I've reversed the nesting order of the Tablegen multiclasses for those instructions. Previously, the innermost multiclass generated the X and not-X variants, and the one outside that generated the A and not-A variants. Now X is done by the outer multiclass, which allows me to bypass that one when I only want the two not-X variants. Changing the multiclass nesting order also changes the names of the instruction ids unless I make a special effort not to. I decided that while I was changing them anyway I'd make them look nicer; so now the instructions have names like MVE_VMLADAVs32 or MVE_VMLADAVaxs32, instead of cumbersome _noacc_noexch suffixes. The corresponding multiply-subtract instructions are unaffected. Those don't accept unsigned types at all, either in the spec or in LLVM. Reviewers: ostannard, dmgreen Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67214 llvm-svn: 371405
2019-09-09 23:17:26 +08:00
defm MVE_VMLSLDAV : MVE_VMLSLDAV_multi<"vmlsldav", "s16", 0b0, 0b0>;
defm MVE_VMLSLDAV : MVE_VMLSLDAV_multi<"vmlsldav", "s32", 0b1, 0b0>;
defm MVE_VRMLSLDAVH : MVE_VMLSLDAV_multi<"vrmlsldavh", "s32", 0b0, 0b1>;
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
// end of mve_rDest instructions
// start of mve_comp instructions
class MVE_comp<InstrItinClass itin, string iname, string suffix,
string cstr, list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), itin, iname, suffix,
"$Qd, $Qn, $Qm", vpred_r, cstr, pattern> {
bits<4> Qd;
bits<4> Qn;
bits<4> Qm;
let Inst{22} = Qd{3};
let Inst{19-17} = Qn{2-0};
let Inst{16} = 0b0;
let Inst{15-13} = Qd{2-0};
let Inst{12} = 0b0;
let Inst{10-9} = 0b11;
let Inst{7} = Qn{3};
let Inst{5} = Qm{3};
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b0;
}
class MVE_VMINMAXNM<string iname, string suffix, bit sz, bit bit_21,
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
list<dag> pattern=[]>
: MVE_comp<NoItinerary, iname, suffix, "", pattern> {
let Inst{28} = 0b1;
let Inst{25-24} = 0b11;
let Inst{23} = 0b0;
let Inst{21} = bit_21;
let Inst{20} = sz;
let Inst{11} = 0b1;
let Inst{8} = 0b1;
let Inst{6} = 0b1;
let Inst{4} = 0b1;
let Predicates = [HasMVEFloat];
}
def MVE_VMAXNMf32 : MVE_VMINMAXNM<"vmaxnm", "f32", 0b0, 0b0>;
def MVE_VMAXNMf16 : MVE_VMINMAXNM<"vmaxnm", "f16", 0b1, 0b0>;
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
let Predicates = [HasMVEFloat] in {
def : Pat<(v4f32 (fmaxnum (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
(v4f32 (MVE_VMAXNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
def : Pat<(v8f16 (fmaxnum (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
(v8f16 (MVE_VMAXNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
def : Pat<(v4f32 (int_arm_mve_max_predicated (v4f32 MQPR:$val1), (v4f32 MQPR:$val2), (i32 0),
(v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))),
(v4f32 (MVE_VMAXNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2),
ARMVCCThen, (v4i1 VCCR:$mask),
(v4f32 MQPR:$inactive)))>;
def : Pat<(v8f16 (int_arm_mve_max_predicated (v8f16 MQPR:$val1), (v8f16 MQPR:$val2), (i32 0),
(v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))),
(v8f16 (MVE_VMAXNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2),
ARMVCCThen, (v8i1 VCCR:$mask),
(v8f16 MQPR:$inactive)))>;
}
def MVE_VMINNMf32 : MVE_VMINMAXNM<"vminnm", "f32", 0b0, 0b1>;
def MVE_VMINNMf16 : MVE_VMINMAXNM<"vminnm", "f16", 0b1, 0b1>;
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
let Predicates = [HasMVEFloat] in {
def : Pat<(v4f32 (fminnum (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
(v4f32 (MVE_VMINNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
def : Pat<(v8f16 (fminnum (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
(v8f16 (MVE_VMINNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
def : Pat<(v4f32 (int_arm_mve_min_predicated (v4f32 MQPR:$val1), (v4f32 MQPR:$val2),
(i32 0), (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))),
(v4f32 (MVE_VMINNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2),
ARMVCCThen, (v4i1 VCCR:$mask),
(v4f32 MQPR:$inactive)))>;
def : Pat<(v8f16 (int_arm_mve_min_predicated (v8f16 MQPR:$val1), (v8f16 MQPR:$val2),
(i32 0), (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))),
(v8f16 (MVE_VMINNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2),
ARMVCCThen, (v8i1 VCCR:$mask),
(v8f16 MQPR:$inactive)))>;
}
class MVE_VMINMAX<string iname, string suffix, bit U, bits<2> size,
bit bit_4, list<dag> pattern=[]>
: MVE_comp<NoItinerary, iname, suffix, "", pattern> {
let Inst{28} = U;
let Inst{25-24} = 0b11;
let Inst{23} = 0b0;
let Inst{21-20} = size{1-0};
let Inst{11} = 0b0;
let Inst{8} = 0b0;
let Inst{6} = 0b1;
let Inst{4} = bit_4;
}
multiclass MVE_VMINMAX_m<string iname, bit bit_4, MVEVectorVTInfo VTI,
SDNode unpred_op, Intrinsic pred_int> {
def "" : MVE_VMINMAX<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, bit_4>;
let Predicates = [HasMVEInt] in {
// Unpredicated min/max
def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
// Predicated min/max
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive)))>;
}
}
multiclass MVE_VMAX<MVEVectorVTInfo VTI>
: MVE_VMINMAX_m<"vmax", 0b0, VTI, !if(VTI.Unsigned, umax, smax), int_arm_mve_max_predicated>;
multiclass MVE_VMIN<MVEVectorVTInfo VTI>
: MVE_VMINMAX_m<"vmin", 0b1, VTI, !if(VTI.Unsigned, umin, smin), int_arm_mve_min_predicated>;
defm MVE_VMINs8 : MVE_VMIN<MVE_v16s8>;
defm MVE_VMINs16 : MVE_VMIN<MVE_v8s16>;
defm MVE_VMINs32 : MVE_VMIN<MVE_v4s32>;
defm MVE_VMINu8 : MVE_VMIN<MVE_v16u8>;
defm MVE_VMINu16 : MVE_VMIN<MVE_v8u16>;
defm MVE_VMINu32 : MVE_VMIN<MVE_v4u32>;
defm MVE_VMAXs8 : MVE_VMAX<MVE_v16s8>;
defm MVE_VMAXs16 : MVE_VMAX<MVE_v8s16>;
defm MVE_VMAXs32 : MVE_VMAX<MVE_v4s32>;
defm MVE_VMAXu8 : MVE_VMAX<MVE_v16u8>;
defm MVE_VMAXu16 : MVE_VMAX<MVE_v8u16>;
defm MVE_VMAXu32 : MVE_VMAX<MVE_v4u32>;
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
// end of mve_comp instructions
// start of mve_bit instructions
class MVE_bit_arith<dag oops, dag iops, string iname, string suffix,
string ops, string cstr, list<dag> pattern=[]>
: MVE_p<oops, iops, NoItinerary, iname, suffix, ops, vpred_r, cstr, pattern> {
bits<4> Qd;
bits<4> Qm;
let Inst{22} = Qd{3};
let Inst{15-13} = Qd{2-0};
let Inst{5} = Qm{3};
let Inst{3-1} = Qm{2-0};
}
def MVE_VBIC : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm),
"vbic", "", "$Qd, $Qn, $Qm", ""> {
bits<4> Qn;
let Inst{28} = 0b0;
let Inst{25-23} = 0b110;
let Inst{21-20} = 0b01;
let Inst{19-17} = Qn{2-0};
let Inst{16} = 0b0;
let Inst{12-8} = 0b00001;
let Inst{7} = Qn{3};
let Inst{6} = 0b1;
let Inst{4} = 0b1;
let Inst{0} = 0b0;
let validForTailPredication = 1;
}
class MVE_VREV<string iname, string suffix, bits<2> size, bits<2> bit_8_7, string cstr="">
: MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm), iname,
suffix, "$Qd, $Qm", cstr> {
let Inst{28} = 0b1;
let Inst{25-23} = 0b111;
let Inst{21-20} = 0b11;
let Inst{19-18} = size;
let Inst{17-16} = 0b00;
let Inst{12-9} = 0b0000;
let Inst{8-7} = bit_8_7;
let Inst{6} = 0b1;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
}
def MVE_VREV64_8 : MVE_VREV<"vrev64", "8", 0b00, 0b00, "@earlyclobber $Qd">;
def MVE_VREV64_16 : MVE_VREV<"vrev64", "16", 0b01, 0b00, "@earlyclobber $Qd">;
def MVE_VREV64_32 : MVE_VREV<"vrev64", "32", 0b10, 0b00, "@earlyclobber $Qd">;
def MVE_VREV32_8 : MVE_VREV<"vrev32", "8", 0b00, 0b01>;
def MVE_VREV32_16 : MVE_VREV<"vrev32", "16", 0b01, 0b01>;
def MVE_VREV16_8 : MVE_VREV<"vrev16", "8", 0b00, 0b10>;
let Predicates = [HasMVEInt] in {
def : Pat<(v8i16 (bswap (v8i16 MQPR:$src))),
(v8i16 (MVE_VREV16_8 (v8i16 MQPR:$src)))>;
def : Pat<(v4i32 (bswap (v4i32 MQPR:$src))),
(v4i32 (MVE_VREV32_8 (v4i32 MQPR:$src)))>;
}
let Predicates = [HasMVEInt] in {
def : Pat<(v4i32 (ARMvrev64 (v4i32 MQPR:$src))),
(v4i32 (MVE_VREV64_32 (v4i32 MQPR:$src)))>;
def : Pat<(v8i16 (ARMvrev64 (v8i16 MQPR:$src))),
(v8i16 (MVE_VREV64_16 (v8i16 MQPR:$src)))>;
def : Pat<(v16i8 (ARMvrev64 (v16i8 MQPR:$src))),
(v16i8 (MVE_VREV64_8 (v16i8 MQPR:$src)))>;
def : Pat<(v8i16 (ARMvrev32 (v8i16 MQPR:$src))),
(v8i16 (MVE_VREV32_16 (v8i16 MQPR:$src)))>;
def : Pat<(v16i8 (ARMvrev32 (v16i8 MQPR:$src))),
(v16i8 (MVE_VREV32_8 (v16i8 MQPR:$src)))>;
def : Pat<(v16i8 (ARMvrev16 (v16i8 MQPR:$src))),
(v16i8 (MVE_VREV16_8 (v16i8 MQPR:$src)))>;
def : Pat<(v4f32 (ARMvrev64 (v4f32 MQPR:$src))),
(v4f32 (MVE_VREV64_32 (v4f32 MQPR:$src)))>;
def : Pat<(v8f16 (ARMvrev64 (v8f16 MQPR:$src))),
(v8f16 (MVE_VREV64_16 (v8f16 MQPR:$src)))>;
def : Pat<(v8f16 (ARMvrev32 (v8f16 MQPR:$src))),
(v8f16 (MVE_VREV32_16 (v8f16 MQPR:$src)))>;
}
def MVE_VMVN : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm),
"vmvn", "", "$Qd, $Qm", ""> {
let Inst{28} = 0b1;
let Inst{25-23} = 0b111;
let Inst{21-16} = 0b110000;
let Inst{12-6} = 0b0010111;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
let validForTailPredication = 1;
}
let Predicates = [HasMVEInt] in {
def : Pat<(v16i8 (vnotq (v16i8 MQPR:$val1))),
(v16i8 (MVE_VMVN (v16i8 MQPR:$val1)))>;
def : Pat<(v8i16 (vnotq (v8i16 MQPR:$val1))),
(v8i16 (MVE_VMVN (v8i16 MQPR:$val1)))>;
def : Pat<(v4i32 (vnotq (v4i32 MQPR:$val1))),
(v4i32 (MVE_VMVN (v4i32 MQPR:$val1)))>;
def : Pat<(v2i64 (vnotq (v2i64 MQPR:$val1))),
(v2i64 (MVE_VMVN (v2i64 MQPR:$val1)))>;
}
class MVE_bit_ops<string iname, bits<2> bit_21_20, bit bit_28>
: MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm),
iname, "", "$Qd, $Qn, $Qm", ""> {
bits<4> Qn;
let Inst{28} = bit_28;
let Inst{25-23} = 0b110;
let Inst{21-20} = bit_21_20;
let Inst{19-17} = Qn{2-0};
let Inst{16} = 0b0;
let Inst{12-8} = 0b00001;
let Inst{7} = Qn{3};
let Inst{6} = 0b1;
let Inst{4} = 0b1;
let Inst{0} = 0b0;
let validForTailPredication = 1;
}
def MVE_VEOR : MVE_bit_ops<"veor", 0b00, 0b1>;
def MVE_VORN : MVE_bit_ops<"vorn", 0b11, 0b0>;
def MVE_VORR : MVE_bit_ops<"vorr", 0b10, 0b0>;
def MVE_VAND : MVE_bit_ops<"vand", 0b00, 0b0>;
// add ignored suffixes as aliases
foreach s=["s8", "s16", "s32", "u8", "u16", "u32", "i8", "i16", "i32", "f16", "f32"] in {
def : MVEInstAlias<"vbic${vp}." # s # "\t$QdSrc, $QnSrc, $QmSrc",
(MVE_VBIC MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>;
def : MVEInstAlias<"veor${vp}." # s # "\t$QdSrc, $QnSrc, $QmSrc",
(MVE_VEOR MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>;
def : MVEInstAlias<"vorn${vp}." # s # "\t$QdSrc, $QnSrc, $QmSrc",
(MVE_VORN MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>;
def : MVEInstAlias<"vorr${vp}." # s # "\t$QdSrc, $QnSrc, $QmSrc",
(MVE_VORR MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>;
def : MVEInstAlias<"vand${vp}." # s # "\t$QdSrc, $QnSrc, $QmSrc",
(MVE_VAND MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>;
}
multiclass MVE_bit_op<MVEVectorVTInfo VTI, SDNode unpred_op, Intrinsic pred_int, MVE_bit_ops instruction> {
let Predicates = [HasMVEInt] in {
// Unpredicated operation
def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
(VTI.Vec (instruction (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
// Predicated operation
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
(VTI.Vec (instruction
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive)))>;
}
}
defm : MVE_bit_op<MVE_v16i8, and, int_arm_mve_and_predicated, MVE_VAND>;
defm : MVE_bit_op<MVE_v8i16, and, int_arm_mve_and_predicated, MVE_VAND>;
defm : MVE_bit_op<MVE_v4i32, and, int_arm_mve_and_predicated, MVE_VAND>;
defm : MVE_bit_op<MVE_v2i64, and, int_arm_mve_and_predicated, MVE_VAND>;
defm : MVE_bit_op<MVE_v16i8, or, int_arm_mve_orr_predicated, MVE_VORR>;
defm : MVE_bit_op<MVE_v8i16, or, int_arm_mve_orr_predicated, MVE_VORR>;
defm : MVE_bit_op<MVE_v4i32, or, int_arm_mve_orr_predicated, MVE_VORR>;
defm : MVE_bit_op<MVE_v2i64, or, int_arm_mve_orr_predicated, MVE_VORR>;
defm : MVE_bit_op<MVE_v16i8, xor, int_arm_mve_eor_predicated, MVE_VEOR>;
defm : MVE_bit_op<MVE_v8i16, xor, int_arm_mve_eor_predicated, MVE_VEOR>;
defm : MVE_bit_op<MVE_v4i32, xor, int_arm_mve_eor_predicated, MVE_VEOR>;
defm : MVE_bit_op<MVE_v2i64, xor, int_arm_mve_eor_predicated, MVE_VEOR>;
multiclass MVE_bit_op_with_inv<MVEVectorVTInfo VTI, SDNode unpred_op, Intrinsic pred_int, MVE_bit_ops instruction> {
let Predicates = [HasMVEInt] in {
// Unpredicated operation
def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (vnotq (VTI.Vec MQPR:$Qn)))),
(VTI.Vec (instruction (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
// Predicated operation
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
(VTI.Vec (instruction
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive)))>;
}
}
defm : MVE_bit_op_with_inv<MVE_v16i8, and, int_arm_mve_bic_predicated, MVE_VBIC>;
defm : MVE_bit_op_with_inv<MVE_v8i16, and, int_arm_mve_bic_predicated, MVE_VBIC>;
defm : MVE_bit_op_with_inv<MVE_v4i32, and, int_arm_mve_bic_predicated, MVE_VBIC>;
defm : MVE_bit_op_with_inv<MVE_v2i64, and, int_arm_mve_bic_predicated, MVE_VBIC>;
defm : MVE_bit_op_with_inv<MVE_v16i8, or, int_arm_mve_orn_predicated, MVE_VORN>;
defm : MVE_bit_op_with_inv<MVE_v8i16, or, int_arm_mve_orn_predicated, MVE_VORN>;
defm : MVE_bit_op_with_inv<MVE_v4i32, or, int_arm_mve_orn_predicated, MVE_VORN>;
defm : MVE_bit_op_with_inv<MVE_v2i64, or, int_arm_mve_orn_predicated, MVE_VORN>;
class MVE_bit_cmode<string iname, string suffix, bits<4> cmode, dag inOps>
: MVE_p<(outs MQPR:$Qd), inOps, NoItinerary,
iname, suffix, "$Qd, $imm", vpred_n, "$Qd = $Qd_src"> {
bits<8> imm;
bits<4> Qd;
let Inst{28} = imm{7};
let Inst{27-23} = 0b11111;
let Inst{22} = Qd{3};
let Inst{21-19} = 0b000;
let Inst{18-16} = imm{6-4};
let Inst{15-13} = Qd{2-0};
let Inst{12} = 0b0;
let Inst{11-8} = cmode;
let Inst{7-6} = 0b01;
let Inst{4} = 0b1;
let Inst{3-0} = imm{3-0};
}
class MVE_VORR<string suffix, bits<4> cmode, ExpandImm imm_type>
: MVE_bit_cmode<"vorr", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> {
let Inst{5} = 0b0;
let validForTailPredication = 1;
}
def MVE_VORRIZ0v4i32 : MVE_VORR<"i32", 0b0001, expzero00>;
def MVE_VORRIZ0v8i16 : MVE_VORR<"i16", 0b1001, expzero00>;
def MVE_VORRIZ8v4i32 : MVE_VORR<"i32", 0b0011, expzero08>;
def MVE_VORRIZ8v8i16 : MVE_VORR<"i16", 0b1011, expzero08>;
def MVE_VORRIZ16v4i32 : MVE_VORR<"i32", 0b0101, expzero16>;
def MVE_VORRIZ24v4i32 : MVE_VORR<"i32", 0b0111, expzero24>;
def MVE_VORNIZ0v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm",
(ins MQPR:$Qd_src, expzero00inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
def MVE_VORNIZ0v8i16 : MVEAsmPseudo<"vorn${vp}.i16\t$Qd, $imm",
(ins MQPR:$Qd_src, expzero00inv16:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
def MVE_VORNIZ8v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm",
(ins MQPR:$Qd_src, expzero08inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
def MVE_VORNIZ8v8i16 : MVEAsmPseudo<"vorn${vp}.i16\t$Qd, $imm",
(ins MQPR:$Qd_src, expzero08inv16:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
def MVE_VORNIZ16v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm",
(ins MQPR:$Qd_src, expzero16inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
def MVE_VORNIZ24v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm",
(ins MQPR:$Qd_src, expzero24inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
def MVE_VMOV : MVEInstAlias<"vmov${vp}\t$Qd, $Qm",
(MVE_VORR MQPR:$Qd, MQPR:$Qm, MQPR:$Qm, vpred_r:$vp)>;
class MVE_VBIC<string suffix, bits<4> cmode, ExpandImm imm_type>
: MVE_bit_cmode<"vbic", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> {
let Inst{5} = 0b1;
let validForTailPredication = 1;
}
def MVE_VBICIZ0v4i32 : MVE_VBIC<"i32", 0b0001, expzero00>;
def MVE_VBICIZ0v8i16 : MVE_VBIC<"i16", 0b1001, expzero00>;
def MVE_VBICIZ8v4i32 : MVE_VBIC<"i32", 0b0011, expzero08>;
def MVE_VBICIZ8v8i16 : MVE_VBIC<"i16", 0b1011, expzero08>;
def MVE_VBICIZ16v4i32 : MVE_VBIC<"i32", 0b0101, expzero16>;
def MVE_VBICIZ24v4i32 : MVE_VBIC<"i32", 0b0111, expzero24>;
def MVE_VANDIZ0v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm",
(ins MQPR:$Qda_src, expzero00inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
def MVE_VANDIZ0v8i16 : MVEAsmPseudo<"vand${vp}.i16\t$Qda, $imm",
(ins MQPR:$Qda_src, expzero00inv16:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
def MVE_VANDIZ8v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm",
(ins MQPR:$Qda_src, expzero08inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
def MVE_VANDIZ8v8i16 : MVEAsmPseudo<"vand${vp}.i16\t$Qda, $imm",
(ins MQPR:$Qda_src, expzero08inv16:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
def MVE_VANDIZ16v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm",
(ins MQPR:$Qda_src, expzero16inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
def MVE_VANDIZ24v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm",
(ins MQPR:$Qda_src, expzero24inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
class MVE_VMOV_lane_direction {
bit bit_20;
dag oops;
dag iops;
string ops;
string cstr;
}
def MVE_VMOV_from_lane : MVE_VMOV_lane_direction {
let bit_20 = 0b1;
let oops = (outs rGPR:$Rt);
let iops = (ins MQPR:$Qd);
let ops = "$Rt, $Qd$Idx";
let cstr = "";
}
def MVE_VMOV_to_lane : MVE_VMOV_lane_direction {
let bit_20 = 0b0;
let oops = (outs MQPR:$Qd);
let iops = (ins MQPR:$Qd_src, rGPR:$Rt);
let ops = "$Qd$Idx, $Rt";
let cstr = "$Qd = $Qd_src";
}
class MVE_VMOV_lane<string suffix, bit U, dag indexop,
MVE_VMOV_lane_direction dir>
: MVE_VMOV_lane_base<dir.oops, !con(dir.iops, indexop), NoItinerary,
"vmov", suffix, dir.ops, dir.cstr, []> {
bits<4> Qd;
bits<4> Rt;
let Inst{31-24} = 0b11101110;
let Inst{23} = U;
let Inst{20} = dir.bit_20;
let Inst{19-17} = Qd{2-0};
let Inst{15-12} = Rt{3-0};
let Inst{11-8} = 0b1011;
let Inst{7} = Qd{3};
let Inst{4-0} = 0b10000;
}
class MVE_VMOV_lane_32<MVE_VMOV_lane_direction dir>
: MVE_VMOV_lane<"32", 0b0, (ins MVEVectorIndex<4>:$Idx), dir> {
bits<2> Idx;
let Inst{22} = 0b0;
let Inst{6-5} = 0b00;
let Inst{16} = Idx{1};
let Inst{21} = Idx{0};
let Predicates = [HasFPRegsV8_1M];
}
class MVE_VMOV_lane_16<string suffix, bit U, MVE_VMOV_lane_direction dir>
: MVE_VMOV_lane<suffix, U, (ins MVEVectorIndex<8>:$Idx), dir> {
bits<3> Idx;
let Inst{22} = 0b0;
let Inst{5} = 0b1;
let Inst{16} = Idx{2};
let Inst{21} = Idx{1};
let Inst{6} = Idx{0};
}
class MVE_VMOV_lane_8<string suffix, bit U, MVE_VMOV_lane_direction dir>
: MVE_VMOV_lane<suffix, U, (ins MVEVectorIndex<16>:$Idx), dir> {
bits<4> Idx;
let Inst{22} = 0b1;
let Inst{16} = Idx{3};
let Inst{21} = Idx{2};
let Inst{6} = Idx{1};
let Inst{5} = Idx{0};
}
def MVE_VMOV_from_lane_32 : MVE_VMOV_lane_32< MVE_VMOV_from_lane>;
def MVE_VMOV_to_lane_32 : MVE_VMOV_lane_32< MVE_VMOV_to_lane>;
def MVE_VMOV_from_lane_s16 : MVE_VMOV_lane_16<"s16", 0b0, MVE_VMOV_from_lane>;
def MVE_VMOV_from_lane_u16 : MVE_VMOV_lane_16<"u16", 0b1, MVE_VMOV_from_lane>;
def MVE_VMOV_to_lane_16 : MVE_VMOV_lane_16< "16", 0b0, MVE_VMOV_to_lane>;
def MVE_VMOV_from_lane_s8 : MVE_VMOV_lane_8 < "s8", 0b0, MVE_VMOV_from_lane>;
def MVE_VMOV_from_lane_u8 : MVE_VMOV_lane_8 < "u8", 0b1, MVE_VMOV_from_lane>;
def MVE_VMOV_to_lane_8 : MVE_VMOV_lane_8 < "8", 0b0, MVE_VMOV_to_lane>;
let Predicates = [HasMVEInt] in {
def : Pat<(extractelt (v2f64 MQPR:$src), imm:$lane),
(f64 (EXTRACT_SUBREG MQPR:$src, (DSubReg_f64_reg imm:$lane)))>;
def : Pat<(insertelt (v2f64 MQPR:$src1), DPR:$src2, imm:$lane),
(INSERT_SUBREG (v2f64 (COPY_TO_REGCLASS MQPR:$src1, MQPR)), DPR:$src2, (DSubReg_f64_reg imm:$lane))>;
def : Pat<(extractelt (v4i32 MQPR:$src), imm:$lane),
(COPY_TO_REGCLASS
(i32 (EXTRACT_SUBREG MQPR:$src, (SSubReg_f32_reg imm:$lane))), rGPR)>;
def : Pat<(insertelt (v4i32 MQPR:$src1), rGPR:$src2, imm:$lane),
(MVE_VMOV_to_lane_32 MQPR:$src1, rGPR:$src2, imm:$lane)>;
def : Pat<(vector_insert (v16i8 MQPR:$src1), rGPR:$src2, imm:$lane),
(MVE_VMOV_to_lane_8 MQPR:$src1, rGPR:$src2, imm:$lane)>;
def : Pat<(vector_insert (v8i16 MQPR:$src1), rGPR:$src2, imm:$lane),
(MVE_VMOV_to_lane_16 MQPR:$src1, rGPR:$src2, imm:$lane)>;
def : Pat<(ARMvgetlanes (v16i8 MQPR:$src), imm:$lane),
(MVE_VMOV_from_lane_s8 MQPR:$src, imm:$lane)>;
def : Pat<(ARMvgetlanes (v8i16 MQPR:$src), imm:$lane),
(MVE_VMOV_from_lane_s16 MQPR:$src, imm:$lane)>;
def : Pat<(ARMvgetlaneu (v16i8 MQPR:$src), imm:$lane),
(MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane)>;
def : Pat<(ARMvgetlaneu (v8i16 MQPR:$src), imm:$lane),
(MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>;
def : Pat<(v16i8 (scalar_to_vector GPR:$src)),
(MVE_VMOV_to_lane_8 (v16i8 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
def : Pat<(v8i16 (scalar_to_vector GPR:$src)),
(MVE_VMOV_to_lane_16 (v8i16 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
def : Pat<(v4i32 (scalar_to_vector GPR:$src)),
(MVE_VMOV_to_lane_32 (v4i32 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
// Floating point patterns, still enabled under HasMVEInt
def : Pat<(extractelt (v4f32 MQPR:$src), imm:$lane),
(COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG MQPR:$src, (SSubReg_f32_reg imm:$lane))), SPR)>;
def : Pat<(insertelt (v4f32 MQPR:$src1), (f32 SPR:$src2), imm:$lane),
(INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$src1, MQPR)), SPR:$src2, (SSubReg_f32_reg imm:$lane))>;
def : Pat<(insertelt (v8f16 MQPR:$src1), HPR:$src2, imm:$lane),
(MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS HPR:$src2, rGPR), imm:$lane)>;
def : Pat<(extractelt (v8f16 MQPR:$src), imm_even:$lane),
(EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_even:$lane))>;
def : Pat<(extractelt (v8f16 MQPR:$src), imm_odd:$lane),
(COPY_TO_REGCLASS
(VMOVH (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_odd:$lane))),
HPR)>;
def : Pat<(v4f32 (scalar_to_vector SPR:$src)),
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>;
def : Pat<(v4f32 (scalar_to_vector GPR:$src)),
(MVE_VMOV_to_lane_32 (v4f32 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
def : Pat<(v8f16 (scalar_to_vector HPR:$src)),
(INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), HPR:$src, ssub_0)>;
def : Pat<(v8f16 (scalar_to_vector GPR:$src)),
(MVE_VMOV_to_lane_16 (v8f16 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
}
// end of mve_bit instructions
// start of MVE Integer instructions
class MVE_int<string iname, string suffix, bits<2> size, list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary,
iname, suffix, "$Qd, $Qn, $Qm", vpred_r, "", pattern> {
bits<4> Qd;
bits<4> Qn;
bits<4> Qm;
let Inst{22} = Qd{3};
let Inst{21-20} = size;
let Inst{19-17} = Qn{2-0};
let Inst{15-13} = Qd{2-0};
let Inst{7} = Qn{3};
let Inst{6} = 0b1;
let Inst{5} = Qm{3};
let Inst{3-1} = Qm{2-0};
}
class MVE_VMULt1<string iname, string suffix, bits<2> size,
list<dag> pattern=[]>
: MVE_int<iname, suffix, size, pattern> {
let Inst{28} = 0b0;
let Inst{25-23} = 0b110;
let Inst{16} = 0b0;
let Inst{12-8} = 0b01001;
let Inst{4} = 0b1;
let Inst{0} = 0b0;
let validForTailPredication = 1;
}
multiclass MVE_VMUL_m<string iname, MVEVectorVTInfo VTI,
SDNode unpred_op, Intrinsic pred_int> {
def "" : MVE_VMULt1<iname, VTI.Suffix, VTI.Size>;
let Predicates = [HasMVEInt] in {
// Unpredicated multiply
def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
// Predicated multiply
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive)))>;
}
}
multiclass MVE_VMUL<MVEVectorVTInfo VTI>
: MVE_VMUL_m<"vmul", VTI, mul, int_arm_mve_mul_predicated>;
defm MVE_VMULi8 : MVE_VMUL<MVE_v16i8>;
defm MVE_VMULi16 : MVE_VMUL<MVE_v8i16>;
defm MVE_VMULi32 : MVE_VMUL<MVE_v4i32>;
class MVE_VQxDMULH_Base<string iname, string suffix, bits<2> size, bit rounding,
list<dag> pattern=[]>
: MVE_int<iname, suffix, size, pattern> {
let Inst{28} = rounding;
let Inst{25-23} = 0b110;
let Inst{16} = 0b0;
let Inst{12-8} = 0b01011;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
}
multiclass MVE_VQxDMULH_m<string iname, MVEVectorVTInfo VTI,
SDNode unpred_op, Intrinsic pred_int,
bit rounding> {
def "" : MVE_VQxDMULH_Base<iname, VTI.Suffix, VTI.Size, rounding>;
let Predicates = [HasMVEInt] in {
// Unpredicated multiply
def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
// Predicated multiply
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive)))>;
}
}
multiclass MVE_VQxDMULH<string iname, MVEVectorVTInfo VTI, bit rounding>
: MVE_VQxDMULH_m<iname, VTI, !if(rounding, int_arm_mve_vqrdmulh,
int_arm_mve_vqdmulh),
!if(rounding, int_arm_mve_qrdmulh_predicated,
int_arm_mve_qdmulh_predicated),
rounding>;
defm MVE_VQDMULHi8 : MVE_VQxDMULH<"vqdmulh", MVE_v16s8, 0b0>;
defm MVE_VQDMULHi16 : MVE_VQxDMULH<"vqdmulh", MVE_v8s16, 0b0>;
defm MVE_VQDMULHi32 : MVE_VQxDMULH<"vqdmulh", MVE_v4s32, 0b0>;
defm MVE_VQRDMULHi8 : MVE_VQxDMULH<"vqrdmulh", MVE_v16s8, 0b1>;
defm MVE_VQRDMULHi16 : MVE_VQxDMULH<"vqrdmulh", MVE_v8s16, 0b1>;
defm MVE_VQRDMULHi32 : MVE_VQxDMULH<"vqrdmulh", MVE_v4s32, 0b1>;
class MVE_VADDSUB<string iname, string suffix, bits<2> size, bit subtract,
list<dag> pattern=[]>
: MVE_int<iname, suffix, size, pattern> {
let Inst{28} = subtract;
let Inst{25-23} = 0b110;
let Inst{16} = 0b0;
let Inst{12-8} = 0b01000;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
let validForTailPredication = 1;
}
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
multiclass MVE_VADDSUB_m<string iname, MVEVectorVTInfo VTI, bit subtract,
SDNode unpred_op, Intrinsic pred_int> {
def "" : MVE_VADDSUB<iname, VTI.Suffix, VTI.Size, subtract>;
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
let Predicates = [HasMVEInt] in {
// Unpredicated add/subtract
def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
// Predicated add/subtract
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
(VTI.Vec MQPR:$inactive)))>;
}
}
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
multiclass MVE_VADD<MVEVectorVTInfo VTI>
: MVE_VADDSUB_m<"vadd", VTI, 0b0, add, int_arm_mve_add_predicated>;
multiclass MVE_VSUB<MVEVectorVTInfo VTI>
: MVE_VADDSUB_m<"vsub", VTI, 0b1, sub, int_arm_mve_sub_predicated>;
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
defm MVE_VADDi8 : MVE_VADD<MVE_v16i8>;
defm MVE_VADDi16 : MVE_VADD<MVE_v8i16>;
defm MVE_VADDi32 : MVE_VADD<MVE_v4i32>;
defm MVE_VSUBi8 : MVE_VSUB<MVE_v16i8>;
defm MVE_VSUBi16 : MVE_VSUB<MVE_v8i16>;
defm MVE_VSUBi32 : MVE_VSUB<MVE_v4i32>;
class MVE_VQADDSUB<string iname, string suffix, bit U, bit subtract,
bits<2> size>
: MVE_int<iname, suffix, size, []> {
let Inst{28} = U;
let Inst{25-23} = 0b110;
let Inst{16} = 0b0;
let Inst{12-10} = 0b000;
let Inst{9} = subtract;
let Inst{8} = 0b0;
let Inst{4} = 0b1;
let Inst{0} = 0b0;
let validForTailPredication = 1;
}
class MVE_VQADD_<string suffix, bit U, bits<2> size>
: MVE_VQADDSUB<"vqadd", suffix, U, 0b0, size>;
class MVE_VQSUB_<string suffix, bit U, bits<2> size>
: MVE_VQADDSUB<"vqsub", suffix, U, 0b1, size>;
multiclass MVE_VQADD_m<MVEVectorVTInfo VTI,
SDNode unpred_op, Intrinsic pred_int> {
def "" : MVE_VQADD_<VTI.Suffix, VTI.Unsigned, VTI.Size>;
let Predicates = [HasMVEInt] in {
// Unpredicated saturating add
def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
// Predicated saturating add
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive)))>;
}
}
multiclass MVE_VQADD<MVEVectorVTInfo VTI, SDNode unpred_op>
: MVE_VQADD_m<VTI, unpred_op, int_arm_mve_qadd_predicated>;
defm MVE_VQADDs8 : MVE_VQADD<MVE_v16s8, saddsat>;
defm MVE_VQADDs16 : MVE_VQADD<MVE_v8s16, saddsat>;
defm MVE_VQADDs32 : MVE_VQADD<MVE_v4s32, saddsat>;
defm MVE_VQADDu8 : MVE_VQADD<MVE_v16u8, uaddsat>;
defm MVE_VQADDu16 : MVE_VQADD<MVE_v8u16, uaddsat>;
defm MVE_VQADDu32 : MVE_VQADD<MVE_v4u32, uaddsat>;
multiclass MVE_VQSUB_m<MVEVectorVTInfo VTI,
SDNode unpred_op, Intrinsic pred_int> {
def "" : MVE_VQSUB_<VTI.Suffix, VTI.Unsigned, VTI.Size>;
let Predicates = [HasMVEInt] in {
// Unpredicated saturating subtract
def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
// Predicated saturating subtract
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive)))>;
}
}
multiclass MVE_VQSUB<MVEVectorVTInfo VTI, SDNode unpred_op>
: MVE_VQSUB_m<VTI, unpred_op, int_arm_mve_qsub_predicated>;
defm MVE_VQSUBs8 : MVE_VQSUB<MVE_v16s8, ssubsat>;
defm MVE_VQSUBs16 : MVE_VQSUB<MVE_v8s16, ssubsat>;
defm MVE_VQSUBs32 : MVE_VQSUB<MVE_v4s32, ssubsat>;
defm MVE_VQSUBu8 : MVE_VQSUB<MVE_v16u8, usubsat>;
defm MVE_VQSUBu16 : MVE_VQSUB<MVE_v8u16, usubsat>;
defm MVE_VQSUBu32 : MVE_VQSUB<MVE_v4u32, usubsat>;
class MVE_VABD_int<string suffix, bit U, bits<2> size,
list<dag> pattern=[]>
: MVE_int<"vabd", suffix, size, pattern> {
let Inst{28} = U;
let Inst{25-23} = 0b110;
let Inst{16} = 0b0;
let Inst{12-8} = 0b00111;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
let validForTailPredication = 1;
}
multiclass MVE_VABD_m<MVEVectorVTInfo VTI,
Intrinsic unpred_int, Intrinsic pred_int> {
def "" : MVE_VABD_int<VTI.Suffix, VTI.Unsigned, VTI.Size>;
let Predicates = [HasMVEInt] in {
// Unpredicated absolute difference
def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(i32 VTI.Unsigned))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
// Predicated absolute difference
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive)))>;
}
}
multiclass MVE_VABD<MVEVectorVTInfo VTI>
: MVE_VABD_m<VTI, int_arm_mve_vabd, int_arm_mve_abd_predicated>;
defm MVE_VABDs8 : MVE_VABD<MVE_v16s8>;
defm MVE_VABDs16 : MVE_VABD<MVE_v8s16>;
defm MVE_VABDs32 : MVE_VABD<MVE_v4s32>;
defm MVE_VABDu8 : MVE_VABD<MVE_v16u8>;
defm MVE_VABDu16 : MVE_VABD<MVE_v8u16>;
defm MVE_VABDu32 : MVE_VABD<MVE_v4u32>;
class MVE_VRHADD_Base<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
: MVE_int<"vrhadd", suffix, size, pattern> {
let Inst{28} = U;
let Inst{25-23} = 0b110;
let Inst{16} = 0b0;
let Inst{12-8} = 0b00001;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
let validForTailPredication = 1;
}
multiclass MVE_VRHADD_m<MVEVectorVTInfo VTI,
SDNode unpred_op, Intrinsic pred_int> {
def "" : MVE_VRHADD_Base<VTI.Suffix, VTI.Unsigned, VTI.Size>;
let Predicates = [HasMVEInt] in {
// Unpredicated rounding add-with-divide-by-two
def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(i32 VTI.Unsigned))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
// Predicated add-with-divide-by-two
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive)))>;
}
}
multiclass MVE_VRHADD<MVEVectorVTInfo VTI>
: MVE_VRHADD_m<VTI, int_arm_mve_vrhadd, int_arm_mve_rhadd_predicated>;
defm MVE_VRHADDs8 : MVE_VRHADD<MVE_v16s8>;
defm MVE_VRHADDs16 : MVE_VRHADD<MVE_v8s16>;
defm MVE_VRHADDs32 : MVE_VRHADD<MVE_v4s32>;
defm MVE_VRHADDu8 : MVE_VRHADD<MVE_v16u8>;
defm MVE_VRHADDu16 : MVE_VRHADD<MVE_v8u16>;
defm MVE_VRHADDu32 : MVE_VRHADD<MVE_v4u32>;
class MVE_VHADDSUB<string iname, string suffix, bit U, bit subtract,
bits<2> size, list<dag> pattern=[]>
: MVE_int<iname, suffix, size, pattern> {
let Inst{28} = U;
let Inst{25-23} = 0b110;
let Inst{16} = 0b0;
let Inst{12-10} = 0b000;
let Inst{9} = subtract;
let Inst{8} = 0b0;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
let validForTailPredication = 1;
}
class MVE_VHADD_<string suffix, bit U, bits<2> size,
list<dag> pattern=[]>
: MVE_VHADDSUB<"vhadd", suffix, U, 0b0, size, pattern>;
class MVE_VHSUB_<string suffix, bit U, bits<2> size,
list<dag> pattern=[]>
: MVE_VHADDSUB<"vhsub", suffix, U, 0b1, size, pattern>;
multiclass MVE_VHADD_m<MVEVectorVTInfo VTI,
SDNode unpred_op, Intrinsic pred_int> {
def "" : MVE_VHADD_<VTI.Suffix, VTI.Unsigned, VTI.Size>;
let Predicates = [HasMVEInt] in {
// Unpredicated add-and-divide-by-two
def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
// Predicated add-and-divide-by-two
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned),
(VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive)))>;
}
}
multiclass MVE_VHADD<MVEVectorVTInfo VTI>
: MVE_VHADD_m<VTI, int_arm_mve_vhadd, int_arm_mve_hadd_predicated>;
defm MVE_VHADDs8 : MVE_VHADD<MVE_v16s8>;
defm MVE_VHADDs16 : MVE_VHADD<MVE_v8s16>;
defm MVE_VHADDs32 : MVE_VHADD<MVE_v4s32>;
defm MVE_VHADDu8 : MVE_VHADD<MVE_v16u8>;
defm MVE_VHADDu16 : MVE_VHADD<MVE_v8u16>;
defm MVE_VHADDu32 : MVE_VHADD<MVE_v4u32>;
multiclass MVE_VHSUB_m<MVEVectorVTInfo VTI,
SDNode unpred_op, Intrinsic pred_int> {
def "" : MVE_VHSUB_<VTI.Suffix, VTI.Unsigned, VTI.Size>;
let Predicates = [HasMVEInt] in {
// Unpredicated subtract-and-divide-by-two
def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(i32 VTI.Unsigned))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
// Predicated subtract-and-divide-by-two
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive)))>;
}
}
multiclass MVE_VHSUB<MVEVectorVTInfo VTI>
: MVE_VHSUB_m<VTI, int_arm_mve_vhsub, int_arm_mve_hsub_predicated>;
defm MVE_VHSUBs8 : MVE_VHSUB<MVE_v16s8>;
defm MVE_VHSUBs16 : MVE_VHSUB<MVE_v8s16>;
defm MVE_VHSUBs32 : MVE_VHSUB<MVE_v4s32>;
defm MVE_VHSUBu8 : MVE_VHSUB<MVE_v16u8>;
defm MVE_VHSUBu16 : MVE_VHSUB<MVE_v8u16>;
defm MVE_VHSUBu32 : MVE_VHSUB<MVE_v4u32>;
class MVE_VDUP<string suffix, bit B, bit E, list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd), (ins rGPR:$Rt), NoItinerary,
"vdup", suffix, "$Qd, $Rt", vpred_r, "", pattern> {
bits<4> Qd;
bits<4> Rt;
let Inst{28} = 0b0;
let Inst{25-23} = 0b101;
let Inst{22} = B;
let Inst{21-20} = 0b10;
let Inst{19-17} = Qd{2-0};
let Inst{16} = 0b0;
let Inst{15-12} = Rt;
let Inst{11-8} = 0b1011;
let Inst{7} = Qd{3};
let Inst{6} = 0b0;
let Inst{5} = E;
let Inst{4-0} = 0b10000;
let validForTailPredication = 1;
}
def MVE_VDUP32 : MVE_VDUP<"32", 0b0, 0b0>;
def MVE_VDUP16 : MVE_VDUP<"16", 0b0, 0b1>;
def MVE_VDUP8 : MVE_VDUP<"8", 0b1, 0b0>;
let Predicates = [HasMVEInt] in {
def : Pat<(v16i8 (ARMvdup (i32 rGPR:$elem))),
(MVE_VDUP8 rGPR:$elem)>;
def : Pat<(v8i16 (ARMvdup (i32 rGPR:$elem))),
(MVE_VDUP16 rGPR:$elem)>;
def : Pat<(v4i32 (ARMvdup (i32 rGPR:$elem))),
(MVE_VDUP32 rGPR:$elem)>;
def : Pat<(v4i32 (ARMvduplane (v4i32 MQPR:$src), imm:$lane)),
(MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>;
// For the 16-bit and 8-bit vduplanes we don't care about the signedness
// of the lane move operation as we only want the lowest 8/16 bits anyway.
def : Pat<(v8i16 (ARMvduplane (v8i16 MQPR:$src), imm:$lane)),
(MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>;
def : Pat<(v16i8 (ARMvduplane (v16i8 MQPR:$src), imm:$lane)),
(MVE_VDUP8 (MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane))>;
def : Pat<(v4f32 (ARMvdup (f32 SPR:$elem))),
(v4f32 (MVE_VDUP32 (i32 (COPY_TO_REGCLASS (f32 SPR:$elem), rGPR))))>;
def : Pat<(v8f16 (ARMvdup (f16 HPR:$elem))),
(v8f16 (MVE_VDUP16 (i32 (COPY_TO_REGCLASS (f16 HPR:$elem), rGPR))))>;
def : Pat<(v4f32 (ARMvduplane (v4f32 MQPR:$src), imm:$lane)),
(MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>;
def : Pat<(v8f16 (ARMvduplane (v8f16 MQPR:$src), imm:$lane)),
(MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>;
}
class MVEIntSingleSrc<string iname, string suffix, bits<2> size,
list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qm), NoItinerary,
iname, suffix, "$Qd, $Qm", vpred_r, "", pattern> {
bits<4> Qd;
bits<4> Qm;
let Inst{22} = Qd{3};
let Inst{19-18} = size{1-0};
let Inst{15-13} = Qd{2-0};
let Inst{5} = Qm{3};
let Inst{3-1} = Qm{2-0};
}
class MVE_VCLSCLZ<string iname, string suffix, bits<2> size,
bit count_zeroes, list<dag> pattern=[]>
: MVEIntSingleSrc<iname, suffix, size, pattern> {
let Inst{28} = 0b1;
let Inst{25-23} = 0b111;
let Inst{21-20} = 0b11;
let Inst{17-16} = 0b00;
let Inst{12-8} = 0b00100;
let Inst{7} = count_zeroes;
let Inst{6} = 0b1;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
let validForTailPredication = 1;
}
def MVE_VCLSs8 : MVE_VCLSCLZ<"vcls", "s8", 0b00, 0b0>;
def MVE_VCLSs16 : MVE_VCLSCLZ<"vcls", "s16", 0b01, 0b0>;
def MVE_VCLSs32 : MVE_VCLSCLZ<"vcls", "s32", 0b10, 0b0>;
def MVE_VCLZs8 : MVE_VCLSCLZ<"vclz", "i8", 0b00, 0b1>;
def MVE_VCLZs16 : MVE_VCLSCLZ<"vclz", "i16", 0b01, 0b1>;
def MVE_VCLZs32 : MVE_VCLSCLZ<"vclz", "i32", 0b10, 0b1>;
let Predicates = [HasMVEInt] in {
def : Pat<(v16i8 ( ctlz (v16i8 MQPR:$val1))),
(v16i8 ( MVE_VCLZs8 (v16i8 MQPR:$val1)))>;
def : Pat<(v4i32 ( ctlz (v4i32 MQPR:$val1))),
(v4i32 ( MVE_VCLZs32 (v4i32 MQPR:$val1)))>;
def : Pat<(v8i16 ( ctlz (v8i16 MQPR:$val1))),
(v8i16 ( MVE_VCLZs16 (v8i16 MQPR:$val1)))>;
}
class MVE_VABSNEG_int<string iname, string suffix, bits<2> size, bit negate,
list<dag> pattern=[]>
: MVEIntSingleSrc<iname, suffix, size, pattern> {
let Inst{28} = 0b1;
let Inst{25-23} = 0b111;
let Inst{21-20} = 0b11;
let Inst{17-16} = 0b01;
let Inst{12-8} = 0b00011;
let Inst{7} = negate;
let Inst{6} = 0b1;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
let validForTailPredication = 1;
}
def MVE_VABSs8 : MVE_VABSNEG_int<"vabs", "s8", 0b00, 0b0>;
def MVE_VABSs16 : MVE_VABSNEG_int<"vabs", "s16", 0b01, 0b0>;
def MVE_VABSs32 : MVE_VABSNEG_int<"vabs", "s32", 0b10, 0b0>;
let Predicates = [HasMVEInt] in {
def : Pat<(v16i8 (abs (v16i8 MQPR:$v))),
(v16i8 (MVE_VABSs8 $v))>;
def : Pat<(v8i16 (abs (v8i16 MQPR:$v))),
(v8i16 (MVE_VABSs16 $v))>;
def : Pat<(v4i32 (abs (v4i32 MQPR:$v))),
(v4i32 (MVE_VABSs32 $v))>;
}
def MVE_VNEGs8 : MVE_VABSNEG_int<"vneg", "s8", 0b00, 0b1>;
def MVE_VNEGs16 : MVE_VABSNEG_int<"vneg", "s16", 0b01, 0b1>;
def MVE_VNEGs32 : MVE_VABSNEG_int<"vneg", "s32", 0b10, 0b1>;
let Predicates = [HasMVEInt] in {
def : Pat<(v16i8 (vnegq (v16i8 MQPR:$v))),
(v16i8 (MVE_VNEGs8 $v))>;
def : Pat<(v8i16 (vnegq (v8i16 MQPR:$v))),
(v8i16 (MVE_VNEGs16 $v))>;
def : Pat<(v4i32 (vnegq (v4i32 MQPR:$v))),
(v4i32 (MVE_VNEGs32 $v))>;
}
class MVE_VQABSNEG<string iname, string suffix, bits<2> size,
bit negate, list<dag> pattern=[]>
: MVEIntSingleSrc<iname, suffix, size, pattern> {
let Inst{28} = 0b1;
let Inst{25-23} = 0b111;
let Inst{21-20} = 0b11;
let Inst{17-16} = 0b00;
let Inst{12-8} = 0b00111;
let Inst{7} = negate;
let Inst{6} = 0b1;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
let validForTailPredication = 1;
}
def MVE_VQABSs8 : MVE_VQABSNEG<"vqabs", "s8", 0b00, 0b0>;
def MVE_VQABSs16 : MVE_VQABSNEG<"vqabs", "s16", 0b01, 0b0>;
def MVE_VQABSs32 : MVE_VQABSNEG<"vqabs", "s32", 0b10, 0b0>;
def MVE_VQNEGs8 : MVE_VQABSNEG<"vqneg", "s8", 0b00, 0b1>;
def MVE_VQNEGs16 : MVE_VQABSNEG<"vqneg", "s16", 0b01, 0b1>;
def MVE_VQNEGs32 : MVE_VQABSNEG<"vqneg", "s32", 0b10, 0b1>;
// int_min/int_max: vector containing INT_MIN/INT_MAX VTI.Size times
// zero_vec: v4i32-initialized zero vector, potentially wrapped in a bitconvert
multiclass vqabsneg_pattern<MVEVectorVTInfo VTI, dag int_min, dag int_max,
dag zero_vec, MVE_VQABSNEG vqabs_instruction,
MVE_VQABSNEG vqneg_instruction> {
let Predicates = [HasMVEInt] in {
// The below tree can be replaced by a vqabs instruction, as it represents
// the following vectorized expression (r being the value in $reg):
// r > 0 ? r : (r == INT_MIN ? INT_MAX : -r)
def : Pat<(VTI.Vec (vselect
(VTI.Pred (ARMvcmpz (VTI.Vec MQPR:$reg), ARMCCgt)),
(VTI.Vec MQPR:$reg),
(VTI.Vec (vselect
(VTI.Pred (ARMvcmp (VTI.Vec MQPR:$reg), int_min, ARMCCeq)),
int_max,
(sub (VTI.Vec zero_vec), (VTI.Vec MQPR:$reg)))))),
(VTI.Vec (vqabs_instruction (VTI.Vec MQPR:$reg)))>;
// Similarly, this tree represents vqneg, i.e. the following vectorized expression:
// r == INT_MIN ? INT_MAX : -r
def : Pat<(VTI.Vec (vselect
(VTI.Pred (ARMvcmp (VTI.Vec MQPR:$reg), int_min, ARMCCeq)),
int_max,
(sub (VTI.Vec zero_vec), (VTI.Vec MQPR:$reg)))),
(VTI.Vec (vqneg_instruction (VTI.Vec MQPR:$reg)))>;
}
}
defm MVE_VQABSNEG_Ps8 : vqabsneg_pattern<MVE_v16i8,
(v16i8 (ARMvmovImm (i32 3712))),
(v16i8 (ARMvmovImm (i32 3711))),
(bitconvert (v4i32 (ARMvmovImm (i32 0)))),
MVE_VQABSs8, MVE_VQNEGs8>;
defm MVE_VQABSNEG_Ps16 : vqabsneg_pattern<MVE_v8i16,
(v8i16 (ARMvmovImm (i32 2688))),
(v8i16 (ARMvmvnImm (i32 2688))),
(bitconvert (v4i32 (ARMvmovImm (i32 0)))),
MVE_VQABSs16, MVE_VQNEGs16>;
defm MVE_VQABSNEG_Ps32 : vqabsneg_pattern<MVE_v4i32,
(v4i32 (ARMvmovImm (i32 1664))),
(v4i32 (ARMvmvnImm (i32 1664))),
(ARMvmovImm (i32 0)),
MVE_VQABSs32, MVE_VQNEGs32>;
class MVE_mod_imm<string iname, string suffix, bits<4> cmode, bit op,
dag iops, list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd), iops, NoItinerary, iname, suffix, "$Qd, $imm",
vpred_r, "", pattern> {
bits<13> imm;
bits<4> Qd;
let Inst{28} = imm{7};
let Inst{25-23} = 0b111;
let Inst{22} = Qd{3};
let Inst{21-19} = 0b000;
let Inst{18-16} = imm{6-4};
let Inst{15-13} = Qd{2-0};
let Inst{12} = 0b0;
let Inst{11-8} = cmode{3-0};
let Inst{7-6} = 0b01;
let Inst{5} = op;
let Inst{4} = 0b1;
let Inst{3-0} = imm{3-0};
let DecoderMethod = "DecodeMVEModImmInstruction";
let validForTailPredication = 1;
}
let isReMaterializable = 1 in {
let isAsCheapAsAMove = 1 in {
def MVE_VMOVimmi8 : MVE_mod_imm<"vmov", "i8", {1,1,1,0}, 0b0, (ins nImmSplatI8:$imm)>;
def MVE_VMOVimmi16 : MVE_mod_imm<"vmov", "i16", {1,0,?,0}, 0b0, (ins nImmSplatI16:$imm)> {
let Inst{9} = imm{9};
}
def MVE_VMOVimmi32 : MVE_mod_imm<"vmov", "i32", {?,?,?,?}, 0b0, (ins nImmVMOVI32:$imm)> {
let Inst{11-8} = imm{11-8};
}
def MVE_VMOVimmi64 : MVE_mod_imm<"vmov", "i64", {1,1,1,0}, 0b1, (ins nImmSplatI64:$imm)>;
def MVE_VMOVimmf32 : MVE_mod_imm<"vmov", "f32", {1,1,1,1}, 0b0, (ins nImmVMOVF32:$imm)>;
} // let isAsCheapAsAMove = 1
def MVE_VMVNimmi16 : MVE_mod_imm<"vmvn", "i16", {1,0,?,0}, 0b1, (ins nImmSplatI16:$imm)> {
let Inst{9} = imm{9};
}
def MVE_VMVNimmi32 : MVE_mod_imm<"vmvn", "i32", {?,?,?,?}, 0b1, (ins nImmVMOVI32:$imm)> {
let Inst{11-8} = imm{11-8};
}
} // let isReMaterializable = 1
let Predicates = [HasMVEInt] in {
def : Pat<(v16i8 (ARMvmovImm timm:$simm)),
(v16i8 (MVE_VMOVimmi8 nImmSplatI8:$simm))>;
def : Pat<(v8i16 (ARMvmovImm timm:$simm)),
(v8i16 (MVE_VMOVimmi16 nImmSplatI16:$simm))>;
def : Pat<(v4i32 (ARMvmovImm timm:$simm)),
(v4i32 (MVE_VMOVimmi32 nImmVMOVI32:$simm))>;
def : Pat<(v8i16 (ARMvmvnImm timm:$simm)),
(v8i16 (MVE_VMVNimmi16 nImmSplatI16:$simm))>;
def : Pat<(v4i32 (ARMvmvnImm timm:$simm)),
(v4i32 (MVE_VMVNimmi32 nImmVMOVI32:$simm))>;
def : Pat<(v4f32 (ARMvmovFPImm timm:$simm)),
(v4f32 (MVE_VMOVimmf32 nImmVMOVF32:$simm))>;
}
class MVE_VMINMAXA<string iname, string suffix, bits<2> size,
bit bit_12, list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm),
NoItinerary, iname, suffix, "$Qd, $Qm", vpred_n, "$Qd = $Qd_src",
pattern> {
bits<4> Qd;
bits<4> Qm;
let Inst{28} = 0b0;
let Inst{25-23} = 0b100;
let Inst{22} = Qd{3};
let Inst{21-20} = 0b11;
let Inst{19-18} = size;
let Inst{17-16} = 0b11;
let Inst{15-13} = Qd{2-0};
let Inst{12} = bit_12;
let Inst{11-6} = 0b111010;
let Inst{5} = Qm{3};
let Inst{4} = 0b0;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b1;
}
def MVE_VMAXAs8 : MVE_VMINMAXA<"vmaxa", "s8", 0b00, 0b0>;
def MVE_VMAXAs16 : MVE_VMINMAXA<"vmaxa", "s16", 0b01, 0b0>;
def MVE_VMAXAs32 : MVE_VMINMAXA<"vmaxa", "s32", 0b10, 0b0>;
def MVE_VMINAs8 : MVE_VMINMAXA<"vmina", "s8", 0b00, 0b1>;
def MVE_VMINAs16 : MVE_VMINMAXA<"vmina", "s16", 0b01, 0b1>;
def MVE_VMINAs32 : MVE_VMINMAXA<"vmina", "s32", 0b10, 0b1>;
// end of MVE Integer instructions
// start of mve_imm_shift instructions
def MVE_VSHLC : MVE_p<(outs rGPR:$RdmDest, MQPR:$Qd),
(ins MQPR:$QdSrc, rGPR:$RdmSrc, long_shift:$imm),
NoItinerary, "vshlc", "", "$QdSrc, $RdmSrc, $imm",
vpred_n, "$RdmDest = $RdmSrc,$Qd = $QdSrc"> {
bits<5> imm;
bits<4> Qd;
bits<4> RdmDest;
let Inst{28} = 0b0;
let Inst{25-23} = 0b101;
let Inst{22} = Qd{3};
let Inst{21} = 0b1;
let Inst{20-16} = imm{4-0};
let Inst{15-13} = Qd{2-0};
let Inst{12-4} = 0b011111100;
let Inst{3-0} = RdmDest{3-0};
}
class MVE_shift_imm<dag oops, dag iops, string iname, string suffix,
string ops, vpred_ops vpred, string cstr,
list<dag> pattern=[]>
: MVE_p<oops, iops, NoItinerary, iname, suffix, ops, vpred, cstr, pattern> {
bits<4> Qd;
bits<4> Qm;
let Inst{22} = Qd{3};
let Inst{15-13} = Qd{2-0};
let Inst{5} = Qm{3};
let Inst{3-1} = Qm{2-0};
}
class MVE_VMOVL<string iname, string suffix, bits<2> sz, bit U,
list<dag> pattern=[]>
: MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$Qm),
iname, suffix, "$Qd, $Qm", vpred_r, "",
pattern> {
let Inst{28} = U;
let Inst{25-23} = 0b101;
let Inst{21} = 0b1;
let Inst{20-19} = sz{1-0};
let Inst{18-16} = 0b000;
let Inst{11-6} = 0b111101;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
}
multiclass MVE_VMOVL_shift_half<string iname, string suffix, bits<2> sz, bit U,
list<dag> pattern=[]> {
def bh : MVE_VMOVL<!strconcat(iname, "b"), suffix, sz, U, pattern> {
let Inst{12} = 0b0;
}
def th : MVE_VMOVL<!strconcat(iname, "t"), suffix, sz, U, pattern> {
let Inst{12} = 0b1;
}
}
defm MVE_VMOVLs8 : MVE_VMOVL_shift_half<"vmovl", "s8", 0b01, 0b0>;
defm MVE_VMOVLu8 : MVE_VMOVL_shift_half<"vmovl", "u8", 0b01, 0b1>;
defm MVE_VMOVLs16 : MVE_VMOVL_shift_half<"vmovl", "s16", 0b10, 0b0>;
defm MVE_VMOVLu16 : MVE_VMOVL_shift_half<"vmovl", "u16", 0b10, 0b1>;
let Predicates = [HasMVEInt] in {
def : Pat<(sext_inreg (v4i32 MQPR:$src), v4i16),
(MVE_VMOVLs16bh MQPR:$src)>;
def : Pat<(sext_inreg (v8i16 MQPR:$src), v8i8),
(MVE_VMOVLs8bh MQPR:$src)>;
def : Pat<(sext_inreg (v4i32 MQPR:$src), v4i8),
(MVE_VMOVLs16bh (MVE_VMOVLs8bh MQPR:$src))>;
// zext_inreg 16 -> 32
def : Pat<(and (v4i32 MQPR:$src), (v4i32 (ARMvmovImm (i32 0xCFF)))),
(MVE_VMOVLu16bh MQPR:$src)>;
// zext_inreg 8 -> 16
def : Pat<(and (v8i16 MQPR:$src), (v8i16 (ARMvmovImm (i32 0x8FF)))),
(MVE_VMOVLu8bh MQPR:$src)>;
}
class MVE_VSHLL_imm<string iname, string suffix, bit U, bit th,
[ARM][MVE] Add intrinsics for more immediate shifts. Summary: This fills in the remaining shift operations that take a single vector input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and `vshll[bt]` families. `vshll[bt]` (which shifts each input lane left into a double-width output lane) is the most interesting one. There are separate MC instruction ids for shifting by exactly the input lane width and shifting by less than that, because the instruction encoding is so completely different for the lane-width special case. So I had to write two sets of patterns to match based on the immediate shift count, which involved adding a ComplexPattern matcher to avoid the general-case pattern accidentally matching the special case too. For that family I've made sure to add an llc codegen test for both versions of each instruction. I'm experimenting with a new strategy for parametrising the isel patterns for all these instructions: adding extra fields to the relevant `Instruction` subclass itself, which are ignored by the Tablegen backends that generate the MC data, but can be retrieved from each instance of that instruction subclass when it's passed as a template parameter to the multiclass that generates its isel patterns. A nice effect of that is that I can fill in those informational fields using `let` blocks, rather than having to type them out once per instruction at `defm` time. (As a result, quite a lot of existing instruction `def`s are reindented by this patch, so it's clearer to read with whitespace changes ignored.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71458
2019-12-13 21:05:07 +08:00
Operand immtype, list<dag> pattern=[]>
: MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$Qm, immtype:$imm),
iname, suffix, "$Qd, $Qm, $imm", vpred_r, "", pattern> {
let Inst{28} = U;
let Inst{25-23} = 0b101;
let Inst{21} = 0b1;
let Inst{12} = th;
let Inst{11-6} = 0b111101;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
[ARM][MVE] Add intrinsics for more immediate shifts. Summary: This fills in the remaining shift operations that take a single vector input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and `vshll[bt]` families. `vshll[bt]` (which shifts each input lane left into a double-width output lane) is the most interesting one. There are separate MC instruction ids for shifting by exactly the input lane width and shifting by less than that, because the instruction encoding is so completely different for the lane-width special case. So I had to write two sets of patterns to match based on the immediate shift count, which involved adding a ComplexPattern matcher to avoid the general-case pattern accidentally matching the special case too. For that family I've made sure to add an llc codegen test for both versions of each instruction. I'm experimenting with a new strategy for parametrising the isel patterns for all these instructions: adding extra fields to the relevant `Instruction` subclass itself, which are ignored by the Tablegen backends that generate the MC data, but can be retrieved from each instance of that instruction subclass when it's passed as a template parameter to the multiclass that generates its isel patterns. A nice effect of that is that I can fill in those informational fields using `let` blocks, rather than having to type them out once per instruction at `defm` time. (As a result, quite a lot of existing instruction `def`s are reindented by this patch, so it's clearer to read with whitespace changes ignored.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71458
2019-12-13 21:05:07 +08:00
// For the MVE_VSHLL_patterns multiclass to refer to
Operand immediateType = immtype;
}
// The immediate VSHLL instructions accept shift counts from 1 up to
// the lane width (8 or 16), but the full-width shifts have an
// entirely separate encoding, given below with 'lw' in the name.
class MVE_VSHLL_imm8<string iname, string suffix,
bit U, bit th, list<dag> pattern=[]>
[ARM][MVE] Add intrinsics for more immediate shifts. Summary: This fills in the remaining shift operations that take a single vector input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and `vshll[bt]` families. `vshll[bt]` (which shifts each input lane left into a double-width output lane) is the most interesting one. There are separate MC instruction ids for shifting by exactly the input lane width and shifting by less than that, because the instruction encoding is so completely different for the lane-width special case. So I had to write two sets of patterns to match based on the immediate shift count, which involved adding a ComplexPattern matcher to avoid the general-case pattern accidentally matching the special case too. For that family I've made sure to add an llc codegen test for both versions of each instruction. I'm experimenting with a new strategy for parametrising the isel patterns for all these instructions: adding extra fields to the relevant `Instruction` subclass itself, which are ignored by the Tablegen backends that generate the MC data, but can be retrieved from each instance of that instruction subclass when it's passed as a template parameter to the multiclass that generates its isel patterns. A nice effect of that is that I can fill in those informational fields using `let` blocks, rather than having to type them out once per instruction at `defm` time. (As a result, quite a lot of existing instruction `def`s are reindented by this patch, so it's clearer to read with whitespace changes ignored.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71458
2019-12-13 21:05:07 +08:00
: MVE_VSHLL_imm<iname, suffix, U, th, mve_shift_imm1_7, pattern> {
bits<3> imm;
let Inst{20-19} = 0b01;
let Inst{18-16} = imm;
}
class MVE_VSHLL_imm16<string iname, string suffix,
bit U, bit th, list<dag> pattern=[]>
[ARM][MVE] Add intrinsics for more immediate shifts. Summary: This fills in the remaining shift operations that take a single vector input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and `vshll[bt]` families. `vshll[bt]` (which shifts each input lane left into a double-width output lane) is the most interesting one. There are separate MC instruction ids for shifting by exactly the input lane width and shifting by less than that, because the instruction encoding is so completely different for the lane-width special case. So I had to write two sets of patterns to match based on the immediate shift count, which involved adding a ComplexPattern matcher to avoid the general-case pattern accidentally matching the special case too. For that family I've made sure to add an llc codegen test for both versions of each instruction. I'm experimenting with a new strategy for parametrising the isel patterns for all these instructions: adding extra fields to the relevant `Instruction` subclass itself, which are ignored by the Tablegen backends that generate the MC data, but can be retrieved from each instance of that instruction subclass when it's passed as a template parameter to the multiclass that generates its isel patterns. A nice effect of that is that I can fill in those informational fields using `let` blocks, rather than having to type them out once per instruction at `defm` time. (As a result, quite a lot of existing instruction `def`s are reindented by this patch, so it's clearer to read with whitespace changes ignored.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71458
2019-12-13 21:05:07 +08:00
: MVE_VSHLL_imm<iname, suffix, U, th, mve_shift_imm1_15, pattern> {
bits<4> imm;
let Inst{20} = 0b1;
let Inst{19-16} = imm;
}
def MVE_VSHLL_imms8bh : MVE_VSHLL_imm8 <"vshllb", "s8", 0b0, 0b0>;
def MVE_VSHLL_imms8th : MVE_VSHLL_imm8 <"vshllt", "s8", 0b0, 0b1>;
def MVE_VSHLL_immu8bh : MVE_VSHLL_imm8 <"vshllb", "u8", 0b1, 0b0>;
def MVE_VSHLL_immu8th : MVE_VSHLL_imm8 <"vshllt", "u8", 0b1, 0b1>;
def MVE_VSHLL_imms16bh : MVE_VSHLL_imm16<"vshllb", "s16", 0b0, 0b0>;
def MVE_VSHLL_imms16th : MVE_VSHLL_imm16<"vshllt", "s16", 0b0, 0b1>;
def MVE_VSHLL_immu16bh : MVE_VSHLL_imm16<"vshllb", "u16", 0b1, 0b0>;
def MVE_VSHLL_immu16th : MVE_VSHLL_imm16<"vshllt", "u16", 0b1, 0b1>;
class MVE_VSHLL_by_lane_width<string iname, string suffix, bits<2> size,
bit U, string ops, list<dag> pattern=[]>
: MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$Qm),
iname, suffix, ops, vpred_r, "", pattern> {
let Inst{28} = U;
let Inst{25-23} = 0b100;
let Inst{21-20} = 0b11;
let Inst{19-18} = size{1-0};
let Inst{17-16} = 0b01;
let Inst{11-6} = 0b111000;
let Inst{4} = 0b0;
let Inst{0} = 0b1;
}
multiclass MVE_VSHLL_lw<string iname, string suffix, bits<2> sz, bit U,
string ops, list<dag> pattern=[]> {
def bh : MVE_VSHLL_by_lane_width<iname#"b", suffix, sz, U, ops, pattern> {
let Inst{12} = 0b0;
}
def th : MVE_VSHLL_by_lane_width<iname#"t", suffix, sz, U, ops, pattern> {
let Inst{12} = 0b1;
}
}
defm MVE_VSHLL_lws8 : MVE_VSHLL_lw<"vshll", "s8", 0b00, 0b0, "$Qd, $Qm, #8">;
defm MVE_VSHLL_lws16 : MVE_VSHLL_lw<"vshll", "s16", 0b01, 0b0, "$Qd, $Qm, #16">;
defm MVE_VSHLL_lwu8 : MVE_VSHLL_lw<"vshll", "u8", 0b00, 0b1, "$Qd, $Qm, #8">;
defm MVE_VSHLL_lwu16 : MVE_VSHLL_lw<"vshll", "u16", 0b01, 0b1, "$Qd, $Qm, #16">;
[ARM][MVE] Add intrinsics for more immediate shifts. Summary: This fills in the remaining shift operations that take a single vector input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and `vshll[bt]` families. `vshll[bt]` (which shifts each input lane left into a double-width output lane) is the most interesting one. There are separate MC instruction ids for shifting by exactly the input lane width and shifting by less than that, because the instruction encoding is so completely different for the lane-width special case. So I had to write two sets of patterns to match based on the immediate shift count, which involved adding a ComplexPattern matcher to avoid the general-case pattern accidentally matching the special case too. For that family I've made sure to add an llc codegen test for both versions of each instruction. I'm experimenting with a new strategy for parametrising the isel patterns for all these instructions: adding extra fields to the relevant `Instruction` subclass itself, which are ignored by the Tablegen backends that generate the MC data, but can be retrieved from each instance of that instruction subclass when it's passed as a template parameter to the multiclass that generates its isel patterns. A nice effect of that is that I can fill in those informational fields using `let` blocks, rather than having to type them out once per instruction at `defm` time. (As a result, quite a lot of existing instruction `def`s are reindented by this patch, so it's clearer to read with whitespace changes ignored.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71458
2019-12-13 21:05:07 +08:00
multiclass MVE_VSHLL_patterns<MVEVectorVTInfo VTI, int top> {
// A succession of local variable definitions, via singleton
// foreach, to make the actual patterns legible
foreach suffix = [!strconcat(VTI.Suffix, !if(top, "th", "bh"))] in
foreach inst_imm = [!cast<MVE_VSHLL_imm>("MVE_VSHLL_imm" # suffix)] in
foreach inst_lw = [!cast<MVE_VSHLL_by_lane_width>("MVE_VSHLL_lw" # suffix)] in
foreach unpred_int = [int_arm_mve_vshll_imm] in
foreach pred_int = [int_arm_mve_vshll_imm_predicated] in
foreach imm = [inst_imm.immediateType] in {
def : Pat<(VTI.DblVec (unpred_int (VTI.Vec MQPR:$src), imm:$imm,
(i32 VTI.Unsigned), (i32 top))),
(VTI.DblVec (inst_imm (VTI.Vec MQPR:$src), imm:$imm))>;
def : Pat<(VTI.DblVec (unpred_int (VTI.Vec MQPR:$src), (i32 VTI.LaneBits),
(i32 VTI.Unsigned), (i32 top))),
(VTI.DblVec (inst_lw (VTI.Vec MQPR:$src)))>;
def : Pat<(VTI.DblVec (pred_int (VTI.Vec MQPR:$src), imm:$imm,
(i32 VTI.Unsigned), (i32 top),
(VTI.Pred VCCR:$mask),
(VTI.DblVec MQPR:$inactive))),
(VTI.DblVec (inst_imm (VTI.Vec MQPR:$src), imm:$imm,
ARMVCCThen, (VTI.Pred VCCR:$mask),
(VTI.DblVec MQPR:$inactive)))>;
def : Pat<(VTI.DblVec (pred_int (VTI.Vec MQPR:$src), (i32 VTI.LaneBits),
(i32 VTI.Unsigned), (i32 top),
(VTI.Pred VCCR:$mask),
(VTI.DblVec MQPR:$inactive))),
(VTI.DblVec (inst_lw (VTI.Vec MQPR:$src), ARMVCCThen,
(VTI.Pred VCCR:$mask),
(VTI.DblVec MQPR:$inactive)))>;
}
}
foreach VTI = [MVE_v16s8, MVE_v8s16, MVE_v16u8, MVE_v8u16] in
foreach top = [0, 1] in
defm : MVE_VSHLL_patterns<VTI, top>;
class MVE_shift_imm_partial<Operand imm, string iname, string suffix>
: MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$QdSrc, MQPR:$Qm, imm:$imm),
iname, suffix, "$Qd, $Qm, $imm", vpred_n, "$Qd = $QdSrc"> {
Operand immediateType = imm;
}
class MVE_VxSHRN<string iname, string suffix, bit bit_12, bit bit_28,
Operand imm, list<dag> pattern=[]>
: MVE_shift_imm_partial<imm, iname, suffix> {
bits<5> imm;
let Inst{28} = bit_28;
let Inst{25-23} = 0b101;
let Inst{21} = 0b0;
let Inst{20-16} = imm{4-0};
let Inst{12} = bit_12;
let Inst{11-6} = 0b111111;
let Inst{4} = 0b0;
let Inst{0} = 0b1;
}
def MVE_VRSHRNi16bh : MVE_VxSHRN<"vrshrnb", "i16", 0b0, 0b1, shr_imm8> {
let Inst{20-19} = 0b01;
}
def MVE_VRSHRNi16th : MVE_VxSHRN<"vrshrnt", "i16", 0b1, 0b1, shr_imm8> {
let Inst{20-19} = 0b01;
}
def MVE_VRSHRNi32bh : MVE_VxSHRN<"vrshrnb", "i32", 0b0, 0b1, shr_imm16> {
let Inst{20} = 0b1;
}
def MVE_VRSHRNi32th : MVE_VxSHRN<"vrshrnt", "i32", 0b1, 0b1, shr_imm16> {
let Inst{20} = 0b1;
}
def MVE_VSHRNi16bh : MVE_VxSHRN<"vshrnb", "i16", 0b0, 0b0, shr_imm8> {
let Inst{20-19} = 0b01;
}
def MVE_VSHRNi16th : MVE_VxSHRN<"vshrnt", "i16", 0b1, 0b0, shr_imm8> {
let Inst{20-19} = 0b01;
}
def MVE_VSHRNi32bh : MVE_VxSHRN<"vshrnb", "i32", 0b0, 0b0, shr_imm16> {
let Inst{20} = 0b1;
}
def MVE_VSHRNi32th : MVE_VxSHRN<"vshrnt", "i32", 0b1, 0b0, shr_imm16> {
let Inst{20} = 0b1;
}
class MVE_VxQRSHRUN<string iname, string suffix, bit bit_28, bit bit_12,
Operand imm, list<dag> pattern=[]>
: MVE_shift_imm_partial<imm, iname, suffix> {
bits<5> imm;
let Inst{28} = bit_28;
let Inst{25-23} = 0b101;
let Inst{21} = 0b0;
let Inst{20-16} = imm{4-0};
let Inst{12} = bit_12;
let Inst{11-6} = 0b111111;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
}
def MVE_VQRSHRUNs16bh : MVE_VxQRSHRUN<
"vqrshrunb", "s16", 0b1, 0b0, shr_imm8> {
let Inst{20-19} = 0b01;
}
def MVE_VQRSHRUNs16th : MVE_VxQRSHRUN<
"vqrshrunt", "s16", 0b1, 0b1, shr_imm8> {
let Inst{20-19} = 0b01;
}
def MVE_VQRSHRUNs32bh : MVE_VxQRSHRUN<
"vqrshrunb", "s32", 0b1, 0b0, shr_imm16> {
let Inst{20} = 0b1;
}
def MVE_VQRSHRUNs32th : MVE_VxQRSHRUN<
"vqrshrunt", "s32", 0b1, 0b1, shr_imm16> {
let Inst{20} = 0b1;
}
def MVE_VQSHRUNs16bh : MVE_VxQRSHRUN<
"vqshrunb", "s16", 0b0, 0b0, shr_imm8> {
let Inst{20-19} = 0b01;
}
def MVE_VQSHRUNs16th : MVE_VxQRSHRUN<
"vqshrunt", "s16", 0b0, 0b1, shr_imm8> {
let Inst{20-19} = 0b01;
}
def MVE_VQSHRUNs32bh : MVE_VxQRSHRUN<
"vqshrunb", "s32", 0b0, 0b0, shr_imm16> {
let Inst{20} = 0b1;
}
def MVE_VQSHRUNs32th : MVE_VxQRSHRUN<
"vqshrunt", "s32", 0b0, 0b1, shr_imm16> {
let Inst{20} = 0b1;
}
class MVE_VxQRSHRN<string iname, string suffix, bit bit_0, bit bit_12,
Operand imm, list<dag> pattern=[]>
: MVE_shift_imm_partial<imm, iname, suffix> {
bits<5> imm;
let Inst{25-23} = 0b101;
let Inst{21} = 0b0;
let Inst{20-16} = imm{4-0};
let Inst{12} = bit_12;
let Inst{11-6} = 0b111101;
let Inst{4} = 0b0;
let Inst{0} = bit_0;
}
multiclass MVE_VxQRSHRN_types<string iname, bit bit_0, bit bit_12> {
def s16 : MVE_VxQRSHRN<iname, "s16", bit_0, bit_12, shr_imm8> {
let Inst{28} = 0b0;
let Inst{20-19} = 0b01;
}
def u16 : MVE_VxQRSHRN<iname, "u16", bit_0, bit_12, shr_imm8> {
let Inst{28} = 0b1;
let Inst{20-19} = 0b01;
}
def s32 : MVE_VxQRSHRN<iname, "s32", bit_0, bit_12, shr_imm16> {
let Inst{28} = 0b0;
let Inst{20} = 0b1;
}
def u32 : MVE_VxQRSHRN<iname, "u32", bit_0, bit_12, shr_imm16> {
let Inst{28} = 0b1;
let Inst{20} = 0b1;
}
}
defm MVE_VQRSHRNbh : MVE_VxQRSHRN_types<"vqrshrnb", 0b1, 0b0>;
defm MVE_VQRSHRNth : MVE_VxQRSHRN_types<"vqrshrnt", 0b1, 0b1>;
defm MVE_VQSHRNbh : MVE_VxQRSHRN_types<"vqshrnb", 0b0, 0b0>;
defm MVE_VQSHRNth : MVE_VxQRSHRN_types<"vqshrnt", 0b0, 0b1>;
multiclass MVE_VSHRN_patterns<MVE_shift_imm_partial inst,
MVEVectorVTInfo OutVTI, MVEVectorVTInfo InVTI,
bit q, bit r, bit top> {
foreach inparams = [(? (OutVTI.Vec MQPR:$QdSrc), (InVTI.Vec MQPR:$Qm),
(inst.immediateType:$imm), (i32 q), (i32 r),
(i32 OutVTI.Unsigned), (i32 InVTI.Unsigned),
(i32 top))] in
foreach outparams = [(inst (OutVTI.Vec MQPR:$QdSrc), (InVTI.Vec MQPR:$Qm),
(imm:$imm))] in {
def : Pat<(OutVTI.Vec !setop(inparams, int_arm_mve_vshrn)),
(OutVTI.Vec outparams)>;
def : Pat<(OutVTI.Vec !con(inparams, (int_arm_mve_vshrn_predicated
(InVTI.Pred VCCR:$pred)))),
(OutVTI.Vec !con(outparams, (? ARMVCCThen, VCCR:$pred)))>;
}
}
defm : MVE_VSHRN_patterns<MVE_VSHRNi16bh, MVE_v16s8, MVE_v8s16, 0,0,0>;
defm : MVE_VSHRN_patterns<MVE_VSHRNi16th, MVE_v16s8, MVE_v8s16, 0,0,1>;
defm : MVE_VSHRN_patterns<MVE_VSHRNi32bh, MVE_v8s16, MVE_v4s32, 0,0,0>;
defm : MVE_VSHRN_patterns<MVE_VSHRNi32th, MVE_v8s16, MVE_v4s32, 0,0,1>;
defm : MVE_VSHRN_patterns<MVE_VSHRNi16bh, MVE_v16u8, MVE_v8u16, 0,0,0>;
defm : MVE_VSHRN_patterns<MVE_VSHRNi16th, MVE_v16u8, MVE_v8u16, 0,0,1>;
defm : MVE_VSHRN_patterns<MVE_VSHRNi32bh, MVE_v8u16, MVE_v4u32, 0,0,0>;
defm : MVE_VSHRN_patterns<MVE_VSHRNi32th, MVE_v8u16, MVE_v4u32, 0,0,1>;
defm : MVE_VSHRN_patterns<MVE_VRSHRNi16bh, MVE_v16s8, MVE_v8s16, 0,1,0>;
defm : MVE_VSHRN_patterns<MVE_VRSHRNi16th, MVE_v16s8, MVE_v8s16, 0,1,1>;
defm : MVE_VSHRN_patterns<MVE_VRSHRNi32bh, MVE_v8s16, MVE_v4s32, 0,1,0>;
defm : MVE_VSHRN_patterns<MVE_VRSHRNi32th, MVE_v8s16, MVE_v4s32, 0,1,1>;
defm : MVE_VSHRN_patterns<MVE_VRSHRNi16bh, MVE_v16u8, MVE_v8u16, 0,1,0>;
defm : MVE_VSHRN_patterns<MVE_VRSHRNi16th, MVE_v16u8, MVE_v8u16, 0,1,1>;
defm : MVE_VSHRN_patterns<MVE_VRSHRNi32bh, MVE_v8u16, MVE_v4u32, 0,1,0>;
defm : MVE_VSHRN_patterns<MVE_VRSHRNi32th, MVE_v8u16, MVE_v4u32, 0,1,1>;
defm : MVE_VSHRN_patterns<MVE_VQSHRNbhs16, MVE_v16s8, MVE_v8s16, 1,0,0>;
defm : MVE_VSHRN_patterns<MVE_VQSHRNths16, MVE_v16s8, MVE_v8s16, 1,0,1>;
defm : MVE_VSHRN_patterns<MVE_VQSHRNbhs32, MVE_v8s16, MVE_v4s32, 1,0,0>;
defm : MVE_VSHRN_patterns<MVE_VQSHRNths32, MVE_v8s16, MVE_v4s32, 1,0,1>;
defm : MVE_VSHRN_patterns<MVE_VQSHRNbhu16, MVE_v16u8, MVE_v8u16, 1,0,0>;
defm : MVE_VSHRN_patterns<MVE_VQSHRNthu16, MVE_v16u8, MVE_v8u16, 1,0,1>;
defm : MVE_VSHRN_patterns<MVE_VQSHRNbhu32, MVE_v8u16, MVE_v4u32, 1,0,0>;
defm : MVE_VSHRN_patterns<MVE_VQSHRNthu32, MVE_v8u16, MVE_v4u32, 1,0,1>;
defm : MVE_VSHRN_patterns<MVE_VQRSHRNbhs16, MVE_v16s8, MVE_v8s16, 1,1,0>;
defm : MVE_VSHRN_patterns<MVE_VQRSHRNths16, MVE_v16s8, MVE_v8s16, 1,1,1>;
defm : MVE_VSHRN_patterns<MVE_VQRSHRNbhs32, MVE_v8s16, MVE_v4s32, 1,1,0>;
defm : MVE_VSHRN_patterns<MVE_VQRSHRNths32, MVE_v8s16, MVE_v4s32, 1,1,1>;
defm : MVE_VSHRN_patterns<MVE_VQRSHRNbhu16, MVE_v16u8, MVE_v8u16, 1,1,0>;
defm : MVE_VSHRN_patterns<MVE_VQRSHRNthu16, MVE_v16u8, MVE_v8u16, 1,1,1>;
defm : MVE_VSHRN_patterns<MVE_VQRSHRNbhu32, MVE_v8u16, MVE_v4u32, 1,1,0>;
defm : MVE_VSHRN_patterns<MVE_VQRSHRNthu32, MVE_v8u16, MVE_v4u32, 1,1,1>;
defm : MVE_VSHRN_patterns<MVE_VQSHRUNs16bh, MVE_v16u8, MVE_v8s16, 1,0,0>;
defm : MVE_VSHRN_patterns<MVE_VQSHRUNs16th, MVE_v16u8, MVE_v8s16, 1,0,1>;
defm : MVE_VSHRN_patterns<MVE_VQSHRUNs32bh, MVE_v8u16, MVE_v4s32, 1,0,0>;
defm : MVE_VSHRN_patterns<MVE_VQSHRUNs32th, MVE_v8u16, MVE_v4s32, 1,0,1>;
defm : MVE_VSHRN_patterns<MVE_VQRSHRUNs16bh, MVE_v16u8, MVE_v8s16, 1,1,0>;
defm : MVE_VSHRN_patterns<MVE_VQRSHRUNs16th, MVE_v16u8, MVE_v8s16, 1,1,1>;
defm : MVE_VSHRN_patterns<MVE_VQRSHRUNs32bh, MVE_v8u16, MVE_v4s32, 1,1,0>;
defm : MVE_VSHRN_patterns<MVE_VQRSHRUNs32th, MVE_v8u16, MVE_v4s32, 1,1,1>;
// end of mve_imm_shift instructions
// start of mve_shift instructions
class MVE_shift_by_vec<string iname, string suffix, bit U,
bits<2> size, bit bit_4, bit bit_8>
: MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qm, MQPR:$Qn), NoItinerary,
iname, suffix, "$Qd, $Qm, $Qn", vpred_r, "", []> {
// Shift instructions which take a vector of shift counts
bits<4> Qd;
bits<4> Qm;
bits<4> Qn;
let Inst{28} = U;
let Inst{25-24} = 0b11;
let Inst{23} = 0b0;
let Inst{22} = Qd{3};
let Inst{21-20} = size;
let Inst{19-17} = Qn{2-0};
let Inst{16} = 0b0;
let Inst{15-13} = Qd{2-0};
let Inst{12-9} = 0b0010;
let Inst{8} = bit_8;
let Inst{7} = Qn{3};
let Inst{6} = 0b1;
let Inst{5} = Qm{3};
let Inst{4} = bit_4;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b0;
let validForTailPredication = 1;
}
multiclass MVE_shift_by_vec_p<string iname, MVEVectorVTInfo VTI, bit q, bit r> {
def "" : MVE_shift_by_vec<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, q, r>;
def : Pat<(VTI.Vec (int_arm_mve_vshl_vector
(VTI.Vec MQPR:$in), (VTI.Vec MQPR:$sh),
(i32 q), (i32 r), (i32 VTI.Unsigned))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$in), (VTI.Vec MQPR:$sh)))>;
def : Pat<(VTI.Vec (int_arm_mve_vshl_vector_predicated
(VTI.Vec MQPR:$in), (VTI.Vec MQPR:$sh),
(i32 q), (i32 r), (i32 VTI.Unsigned),
(VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$in), (VTI.Vec MQPR:$sh),
ARMVCCThen, (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive)))>;
}
multiclass mve_shift_by_vec_multi<string iname, bit bit_4, bit bit_8> {
defm s8 : MVE_shift_by_vec_p<iname, MVE_v16s8, bit_4, bit_8>;
defm s16 : MVE_shift_by_vec_p<iname, MVE_v8s16, bit_4, bit_8>;
defm s32 : MVE_shift_by_vec_p<iname, MVE_v4s32, bit_4, bit_8>;
defm u8 : MVE_shift_by_vec_p<iname, MVE_v16u8, bit_4, bit_8>;
defm u16 : MVE_shift_by_vec_p<iname, MVE_v8u16, bit_4, bit_8>;
defm u32 : MVE_shift_by_vec_p<iname, MVE_v4u32, bit_4, bit_8>;
}
defm MVE_VSHL_by_vec : mve_shift_by_vec_multi<"vshl", 0b0, 0b0>;
defm MVE_VQSHL_by_vec : mve_shift_by_vec_multi<"vqshl", 0b1, 0b0>;
defm MVE_VQRSHL_by_vec : mve_shift_by_vec_multi<"vqrshl", 0b1, 0b1>;
defm MVE_VRSHL_by_vec : mve_shift_by_vec_multi<"vrshl", 0b0, 0b1>;
let Predicates = [HasMVEInt] in {
def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn))),
(v4i32 (MVE_VSHL_by_vecu32 (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)))>;
def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn))),
(v8i16 (MVE_VSHL_by_vecu16 (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)))>;
def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn))),
(v16i8 (MVE_VSHL_by_vecu8 (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)))>;
def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn))),
(v4i32 (MVE_VSHL_by_vecs32 (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)))>;
def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn))),
(v8i16 (MVE_VSHL_by_vecs16 (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)))>;
def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn))),
(v16i8 (MVE_VSHL_by_vecs8 (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)))>;
}
class MVE_shift_with_imm<string iname, string suffix, dag oops, dag iops,
string ops, vpred_ops vpred, string cstr,
list<dag> pattern=[]>
: MVE_p<oops, iops, NoItinerary, iname, suffix, ops, vpred, cstr, pattern> {
bits<4> Qd;
bits<4> Qm;
let Inst{23} = 0b1;
let Inst{22} = Qd{3};
let Inst{15-13} = Qd{2-0};
let Inst{12-11} = 0b00;
let Inst{7-6} = 0b01;
let Inst{5} = Qm{3};
let Inst{4} = 0b1;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b0;
let validForTailPredication = 1;
[ARM][MVE] Add intrinsics for more immediate shifts. Summary: This fills in the remaining shift operations that take a single vector input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and `vshll[bt]` families. `vshll[bt]` (which shifts each input lane left into a double-width output lane) is the most interesting one. There are separate MC instruction ids for shifting by exactly the input lane width and shifting by less than that, because the instruction encoding is so completely different for the lane-width special case. So I had to write two sets of patterns to match based on the immediate shift count, which involved adding a ComplexPattern matcher to avoid the general-case pattern accidentally matching the special case too. For that family I've made sure to add an llc codegen test for both versions of each instruction. I'm experimenting with a new strategy for parametrising the isel patterns for all these instructions: adding extra fields to the relevant `Instruction` subclass itself, which are ignored by the Tablegen backends that generate the MC data, but can be retrieved from each instance of that instruction subclass when it's passed as a template parameter to the multiclass that generates its isel patterns. A nice effect of that is that I can fill in those informational fields using `let` blocks, rather than having to type them out once per instruction at `defm` time. (As a result, quite a lot of existing instruction `def`s are reindented by this patch, so it's clearer to read with whitespace changes ignored.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71458
2019-12-13 21:05:07 +08:00
// For the MVE_shift_imm_patterns multiclass to refer to
MVEVectorVTInfo VTI;
Operand immediateType;
Intrinsic unpred_int;
Intrinsic pred_int;
dag unsignedFlag = (?);
}
class MVE_VSxI_imm<string iname, string suffix, bit bit_8, Operand immType>
: MVE_shift_with_imm<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qd_src, MQPR:$Qm, immType:$imm),
"$Qd, $Qm, $imm", vpred_n, "$Qd = $Qd_src"> {
bits<6> imm;
let Inst{28} = 0b1;
let Inst{25-24} = 0b11;
let Inst{21-16} = imm;
let Inst{10-9} = 0b10;
let Inst{8} = bit_8;
let validForTailPredication = 1;
Operand immediateType = immType;
}
def MVE_VSRIimm8 : MVE_VSxI_imm<"vsri", "8", 0b0, shr_imm8> {
let Inst{21-19} = 0b001;
}
def MVE_VSRIimm16 : MVE_VSxI_imm<"vsri", "16", 0b0, shr_imm16> {
let Inst{21-20} = 0b01;
}
def MVE_VSRIimm32 : MVE_VSxI_imm<"vsri", "32", 0b0, shr_imm32> {
let Inst{21} = 0b1;
}
def MVE_VSLIimm8 : MVE_VSxI_imm<"vsli", "8", 0b1, imm0_7> {
let Inst{21-19} = 0b001;
}
def MVE_VSLIimm16 : MVE_VSxI_imm<"vsli", "16", 0b1, imm0_15> {
let Inst{21-20} = 0b01;
}
def MVE_VSLIimm32 : MVE_VSxI_imm<"vsli", "32", 0b1,imm0_31> {
let Inst{21} = 0b1;
}
multiclass MVE_VSxI_patterns<MVE_VSxI_imm inst, string name,
MVEVectorVTInfo VTI> {
foreach inparams = [(? (VTI.Vec MQPR:$QdSrc), (VTI.Vec MQPR:$Qm),
(inst.immediateType:$imm))] in
foreach outparams = [(inst (VTI.Vec MQPR:$QdSrc), (VTI.Vec MQPR:$Qm),
(inst.immediateType:$imm))] in
foreach unpred_int = [!cast<Intrinsic>("int_arm_mve_" # name)] in
foreach pred_int = [!cast<Intrinsic>("int_arm_mve_" # name # "_predicated")] in {
def : Pat<(VTI.Vec !setop(inparams, unpred_int)),
(VTI.Vec outparams)>;
def : Pat<(VTI.Vec !con(inparams, (pred_int (VTI.Pred VCCR:$pred)))),
(VTI.Vec !con(outparams, (? ARMVCCThen, VCCR:$pred)))>;
}
}
defm : MVE_VSxI_patterns<MVE_VSLIimm8, "vsli", MVE_v16i8>;
defm : MVE_VSxI_patterns<MVE_VSLIimm16, "vsli", MVE_v8i16>;
defm : MVE_VSxI_patterns<MVE_VSLIimm32, "vsli", MVE_v4i32>;
defm : MVE_VSxI_patterns<MVE_VSRIimm8, "vsri", MVE_v16i8>;
defm : MVE_VSxI_patterns<MVE_VSRIimm16, "vsri", MVE_v8i16>;
defm : MVE_VSxI_patterns<MVE_VSRIimm32, "vsri", MVE_v4i32>;
[ARM][MVE] Add intrinsics for more immediate shifts. Summary: This fills in the remaining shift operations that take a single vector input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and `vshll[bt]` families. `vshll[bt]` (which shifts each input lane left into a double-width output lane) is the most interesting one. There are separate MC instruction ids for shifting by exactly the input lane width and shifting by less than that, because the instruction encoding is so completely different for the lane-width special case. So I had to write two sets of patterns to match based on the immediate shift count, which involved adding a ComplexPattern matcher to avoid the general-case pattern accidentally matching the special case too. For that family I've made sure to add an llc codegen test for both versions of each instruction. I'm experimenting with a new strategy for parametrising the isel patterns for all these instructions: adding extra fields to the relevant `Instruction` subclass itself, which are ignored by the Tablegen backends that generate the MC data, but can be retrieved from each instance of that instruction subclass when it's passed as a template parameter to the multiclass that generates its isel patterns. A nice effect of that is that I can fill in those informational fields using `let` blocks, rather than having to type them out once per instruction at `defm` time. (As a result, quite a lot of existing instruction `def`s are reindented by this patch, so it's clearer to read with whitespace changes ignored.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71458
2019-12-13 21:05:07 +08:00
class MVE_VQSHL_imm<MVEVectorVTInfo VTI_, Operand immType>
: MVE_shift_with_imm<"vqshl", VTI_.Suffix, (outs MQPR:$Qd),
(ins MQPR:$Qm, immType:$imm), "$Qd, $Qm, $imm",
vpred_r, ""> {
bits<6> imm;
[ARM][MVE] Add intrinsics for more immediate shifts. Summary: This fills in the remaining shift operations that take a single vector input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and `vshll[bt]` families. `vshll[bt]` (which shifts each input lane left into a double-width output lane) is the most interesting one. There are separate MC instruction ids for shifting by exactly the input lane width and shifting by less than that, because the instruction encoding is so completely different for the lane-width special case. So I had to write two sets of patterns to match based on the immediate shift count, which involved adding a ComplexPattern matcher to avoid the general-case pattern accidentally matching the special case too. For that family I've made sure to add an llc codegen test for both versions of each instruction. I'm experimenting with a new strategy for parametrising the isel patterns for all these instructions: adding extra fields to the relevant `Instruction` subclass itself, which are ignored by the Tablegen backends that generate the MC data, but can be retrieved from each instance of that instruction subclass when it's passed as a template parameter to the multiclass that generates its isel patterns. A nice effect of that is that I can fill in those informational fields using `let` blocks, rather than having to type them out once per instruction at `defm` time. (As a result, quite a lot of existing instruction `def`s are reindented by this patch, so it's clearer to read with whitespace changes ignored.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71458
2019-12-13 21:05:07 +08:00
let Inst{28} = VTI_.Unsigned;
let Inst{25-24} = 0b11;
let Inst{21-16} = imm;
let Inst{10-8} = 0b111;
[ARM][MVE] Add intrinsics for more immediate shifts. Summary: This fills in the remaining shift operations that take a single vector input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and `vshll[bt]` families. `vshll[bt]` (which shifts each input lane left into a double-width output lane) is the most interesting one. There are separate MC instruction ids for shifting by exactly the input lane width and shifting by less than that, because the instruction encoding is so completely different for the lane-width special case. So I had to write two sets of patterns to match based on the immediate shift count, which involved adding a ComplexPattern matcher to avoid the general-case pattern accidentally matching the special case too. For that family I've made sure to add an llc codegen test for both versions of each instruction. I'm experimenting with a new strategy for parametrising the isel patterns for all these instructions: adding extra fields to the relevant `Instruction` subclass itself, which are ignored by the Tablegen backends that generate the MC data, but can be retrieved from each instance of that instruction subclass when it's passed as a template parameter to the multiclass that generates its isel patterns. A nice effect of that is that I can fill in those informational fields using `let` blocks, rather than having to type them out once per instruction at `defm` time. (As a result, quite a lot of existing instruction `def`s are reindented by this patch, so it's clearer to read with whitespace changes ignored.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71458
2019-12-13 21:05:07 +08:00
let VTI = VTI_;
let immediateType = immType;
let unsignedFlag = (? (i32 VTI.Unsigned));
}
[ARM][MVE] Add intrinsics for more immediate shifts. Summary: This fills in the remaining shift operations that take a single vector input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and `vshll[bt]` families. `vshll[bt]` (which shifts each input lane left into a double-width output lane) is the most interesting one. There are separate MC instruction ids for shifting by exactly the input lane width and shifting by less than that, because the instruction encoding is so completely different for the lane-width special case. So I had to write two sets of patterns to match based on the immediate shift count, which involved adding a ComplexPattern matcher to avoid the general-case pattern accidentally matching the special case too. For that family I've made sure to add an llc codegen test for both versions of each instruction. I'm experimenting with a new strategy for parametrising the isel patterns for all these instructions: adding extra fields to the relevant `Instruction` subclass itself, which are ignored by the Tablegen backends that generate the MC data, but can be retrieved from each instance of that instruction subclass when it's passed as a template parameter to the multiclass that generates its isel patterns. A nice effect of that is that I can fill in those informational fields using `let` blocks, rather than having to type them out once per instruction at `defm` time. (As a result, quite a lot of existing instruction `def`s are reindented by this patch, so it's clearer to read with whitespace changes ignored.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71458
2019-12-13 21:05:07 +08:00
let unpred_int = int_arm_mve_vqshl_imm,
pred_int = int_arm_mve_vqshl_imm_predicated in {
def MVE_VQSHLimms8 : MVE_VQSHL_imm<MVE_v16s8, imm0_7> {
let Inst{21-19} = 0b001;
}
def MVE_VQSHLimmu8 : MVE_VQSHL_imm<MVE_v16u8, imm0_7> {
let Inst{21-19} = 0b001;
}
def MVE_VQSHLimms16 : MVE_VQSHL_imm<MVE_v8s16, imm0_15> {
let Inst{21-20} = 0b01;
}
def MVE_VQSHLimmu16 : MVE_VQSHL_imm<MVE_v8u16, imm0_15> {
let Inst{21-20} = 0b01;
}
def MVE_VQSHLimms32 : MVE_VQSHL_imm<MVE_v4s32, imm0_31> {
let Inst{21} = 0b1;
}
def MVE_VQSHLimmu32 : MVE_VQSHL_imm<MVE_v4u32, imm0_31> {
let Inst{21} = 0b1;
}
}
[ARM][MVE] Add intrinsics for more immediate shifts. Summary: This fills in the remaining shift operations that take a single vector input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and `vshll[bt]` families. `vshll[bt]` (which shifts each input lane left into a double-width output lane) is the most interesting one. There are separate MC instruction ids for shifting by exactly the input lane width and shifting by less than that, because the instruction encoding is so completely different for the lane-width special case. So I had to write two sets of patterns to match based on the immediate shift count, which involved adding a ComplexPattern matcher to avoid the general-case pattern accidentally matching the special case too. For that family I've made sure to add an llc codegen test for both versions of each instruction. I'm experimenting with a new strategy for parametrising the isel patterns for all these instructions: adding extra fields to the relevant `Instruction` subclass itself, which are ignored by the Tablegen backends that generate the MC data, but can be retrieved from each instance of that instruction subclass when it's passed as a template parameter to the multiclass that generates its isel patterns. A nice effect of that is that I can fill in those informational fields using `let` blocks, rather than having to type them out once per instruction at `defm` time. (As a result, quite a lot of existing instruction `def`s are reindented by this patch, so it's clearer to read with whitespace changes ignored.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71458
2019-12-13 21:05:07 +08:00
class MVE_VQSHLU_imm<MVEVectorVTInfo VTI_, Operand immType>
: MVE_shift_with_imm<"vqshlu", VTI_.Suffix, (outs MQPR:$Qd),
(ins MQPR:$Qm, immType:$imm), "$Qd, $Qm, $imm",
vpred_r, ""> {
bits<6> imm;
let Inst{28} = 0b1;
let Inst{25-24} = 0b11;
let Inst{21-16} = imm;
let Inst{10-8} = 0b110;
[ARM][MVE] Add intrinsics for more immediate shifts. Summary: This fills in the remaining shift operations that take a single vector input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and `vshll[bt]` families. `vshll[bt]` (which shifts each input lane left into a double-width output lane) is the most interesting one. There are separate MC instruction ids for shifting by exactly the input lane width and shifting by less than that, because the instruction encoding is so completely different for the lane-width special case. So I had to write two sets of patterns to match based on the immediate shift count, which involved adding a ComplexPattern matcher to avoid the general-case pattern accidentally matching the special case too. For that family I've made sure to add an llc codegen test for both versions of each instruction. I'm experimenting with a new strategy for parametrising the isel patterns for all these instructions: adding extra fields to the relevant `Instruction` subclass itself, which are ignored by the Tablegen backends that generate the MC data, but can be retrieved from each instance of that instruction subclass when it's passed as a template parameter to the multiclass that generates its isel patterns. A nice effect of that is that I can fill in those informational fields using `let` blocks, rather than having to type them out once per instruction at `defm` time. (As a result, quite a lot of existing instruction `def`s are reindented by this patch, so it's clearer to read with whitespace changes ignored.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71458
2019-12-13 21:05:07 +08:00
let VTI = VTI_;
let immediateType = immType;
}
[ARM][MVE] Add intrinsics for more immediate shifts. Summary: This fills in the remaining shift operations that take a single vector input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and `vshll[bt]` families. `vshll[bt]` (which shifts each input lane left into a double-width output lane) is the most interesting one. There are separate MC instruction ids for shifting by exactly the input lane width and shifting by less than that, because the instruction encoding is so completely different for the lane-width special case. So I had to write two sets of patterns to match based on the immediate shift count, which involved adding a ComplexPattern matcher to avoid the general-case pattern accidentally matching the special case too. For that family I've made sure to add an llc codegen test for both versions of each instruction. I'm experimenting with a new strategy for parametrising the isel patterns for all these instructions: adding extra fields to the relevant `Instruction` subclass itself, which are ignored by the Tablegen backends that generate the MC data, but can be retrieved from each instance of that instruction subclass when it's passed as a template parameter to the multiclass that generates its isel patterns. A nice effect of that is that I can fill in those informational fields using `let` blocks, rather than having to type them out once per instruction at `defm` time. (As a result, quite a lot of existing instruction `def`s are reindented by this patch, so it's clearer to read with whitespace changes ignored.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71458
2019-12-13 21:05:07 +08:00
let unpred_int = int_arm_mve_vqshlu_imm,
pred_int = int_arm_mve_vqshlu_imm_predicated in {
def MVE_VQSHLU_imms8 : MVE_VQSHLU_imm<MVE_v16s8, imm0_7> {
let Inst{21-19} = 0b001;
}
[ARM][MVE] Add intrinsics for more immediate shifts. Summary: This fills in the remaining shift operations that take a single vector input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and `vshll[bt]` families. `vshll[bt]` (which shifts each input lane left into a double-width output lane) is the most interesting one. There are separate MC instruction ids for shifting by exactly the input lane width and shifting by less than that, because the instruction encoding is so completely different for the lane-width special case. So I had to write two sets of patterns to match based on the immediate shift count, which involved adding a ComplexPattern matcher to avoid the general-case pattern accidentally matching the special case too. For that family I've made sure to add an llc codegen test for both versions of each instruction. I'm experimenting with a new strategy for parametrising the isel patterns for all these instructions: adding extra fields to the relevant `Instruction` subclass itself, which are ignored by the Tablegen backends that generate the MC data, but can be retrieved from each instance of that instruction subclass when it's passed as a template parameter to the multiclass that generates its isel patterns. A nice effect of that is that I can fill in those informational fields using `let` blocks, rather than having to type them out once per instruction at `defm` time. (As a result, quite a lot of existing instruction `def`s are reindented by this patch, so it's clearer to read with whitespace changes ignored.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71458
2019-12-13 21:05:07 +08:00
def MVE_VQSHLU_imms16 : MVE_VQSHLU_imm<MVE_v8s16, imm0_15> {
let Inst{21-20} = 0b01;
}
def MVE_VQSHLU_imms32 : MVE_VQSHLU_imm<MVE_v4s32, imm0_31> {
let Inst{21} = 0b1;
}
}
[ARM][MVE] Add intrinsics for more immediate shifts. Summary: This fills in the remaining shift operations that take a single vector input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and `vshll[bt]` families. `vshll[bt]` (which shifts each input lane left into a double-width output lane) is the most interesting one. There are separate MC instruction ids for shifting by exactly the input lane width and shifting by less than that, because the instruction encoding is so completely different for the lane-width special case. So I had to write two sets of patterns to match based on the immediate shift count, which involved adding a ComplexPattern matcher to avoid the general-case pattern accidentally matching the special case too. For that family I've made sure to add an llc codegen test for both versions of each instruction. I'm experimenting with a new strategy for parametrising the isel patterns for all these instructions: adding extra fields to the relevant `Instruction` subclass itself, which are ignored by the Tablegen backends that generate the MC data, but can be retrieved from each instance of that instruction subclass when it's passed as a template parameter to the multiclass that generates its isel patterns. A nice effect of that is that I can fill in those informational fields using `let` blocks, rather than having to type them out once per instruction at `defm` time. (As a result, quite a lot of existing instruction `def`s are reindented by this patch, so it's clearer to read with whitespace changes ignored.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71458
2019-12-13 21:05:07 +08:00
class MVE_VRSHR_imm<MVEVectorVTInfo VTI_, Operand immType>
: MVE_shift_with_imm<"vrshr", VTI_.Suffix, (outs MQPR:$Qd),
(ins MQPR:$Qm, immType:$imm), "$Qd, $Qm, $imm",
vpred_r, ""> {
bits<6> imm;
[ARM][MVE] Add intrinsics for more immediate shifts. Summary: This fills in the remaining shift operations that take a single vector input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and `vshll[bt]` families. `vshll[bt]` (which shifts each input lane left into a double-width output lane) is the most interesting one. There are separate MC instruction ids for shifting by exactly the input lane width and shifting by less than that, because the instruction encoding is so completely different for the lane-width special case. So I had to write two sets of patterns to match based on the immediate shift count, which involved adding a ComplexPattern matcher to avoid the general-case pattern accidentally matching the special case too. For that family I've made sure to add an llc codegen test for both versions of each instruction. I'm experimenting with a new strategy for parametrising the isel patterns for all these instructions: adding extra fields to the relevant `Instruction` subclass itself, which are ignored by the Tablegen backends that generate the MC data, but can be retrieved from each instance of that instruction subclass when it's passed as a template parameter to the multiclass that generates its isel patterns. A nice effect of that is that I can fill in those informational fields using `let` blocks, rather than having to type them out once per instruction at `defm` time. (As a result, quite a lot of existing instruction `def`s are reindented by this patch, so it's clearer to read with whitespace changes ignored.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71458
2019-12-13 21:05:07 +08:00
let Inst{28} = VTI_.Unsigned;
let Inst{25-24} = 0b11;
let Inst{21-16} = imm;
let Inst{10-8} = 0b010;
[ARM][MVE] Add intrinsics for more immediate shifts. Summary: This fills in the remaining shift operations that take a single vector input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and `vshll[bt]` families. `vshll[bt]` (which shifts each input lane left into a double-width output lane) is the most interesting one. There are separate MC instruction ids for shifting by exactly the input lane width and shifting by less than that, because the instruction encoding is so completely different for the lane-width special case. So I had to write two sets of patterns to match based on the immediate shift count, which involved adding a ComplexPattern matcher to avoid the general-case pattern accidentally matching the special case too. For that family I've made sure to add an llc codegen test for both versions of each instruction. I'm experimenting with a new strategy for parametrising the isel patterns for all these instructions: adding extra fields to the relevant `Instruction` subclass itself, which are ignored by the Tablegen backends that generate the MC data, but can be retrieved from each instance of that instruction subclass when it's passed as a template parameter to the multiclass that generates its isel patterns. A nice effect of that is that I can fill in those informational fields using `let` blocks, rather than having to type them out once per instruction at `defm` time. (As a result, quite a lot of existing instruction `def`s are reindented by this patch, so it's clearer to read with whitespace changes ignored.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71458
2019-12-13 21:05:07 +08:00
let VTI = VTI_;
let immediateType = immType;
let unsignedFlag = (? (i32 VTI.Unsigned));
}
[ARM][MVE] Add intrinsics for more immediate shifts. Summary: This fills in the remaining shift operations that take a single vector input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and `vshll[bt]` families. `vshll[bt]` (which shifts each input lane left into a double-width output lane) is the most interesting one. There are separate MC instruction ids for shifting by exactly the input lane width and shifting by less than that, because the instruction encoding is so completely different for the lane-width special case. So I had to write two sets of patterns to match based on the immediate shift count, which involved adding a ComplexPattern matcher to avoid the general-case pattern accidentally matching the special case too. For that family I've made sure to add an llc codegen test for both versions of each instruction. I'm experimenting with a new strategy for parametrising the isel patterns for all these instructions: adding extra fields to the relevant `Instruction` subclass itself, which are ignored by the Tablegen backends that generate the MC data, but can be retrieved from each instance of that instruction subclass when it's passed as a template parameter to the multiclass that generates its isel patterns. A nice effect of that is that I can fill in those informational fields using `let` blocks, rather than having to type them out once per instruction at `defm` time. (As a result, quite a lot of existing instruction `def`s are reindented by this patch, so it's clearer to read with whitespace changes ignored.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71458
2019-12-13 21:05:07 +08:00
let unpred_int = int_arm_mve_vrshr_imm,
pred_int = int_arm_mve_vrshr_imm_predicated in {
def MVE_VRSHR_imms8 : MVE_VRSHR_imm<MVE_v16s8, shr_imm8> {
let Inst{21-19} = 0b001;
}
[ARM][MVE] Add intrinsics for more immediate shifts. Summary: This fills in the remaining shift operations that take a single vector input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and `vshll[bt]` families. `vshll[bt]` (which shifts each input lane left into a double-width output lane) is the most interesting one. There are separate MC instruction ids for shifting by exactly the input lane width and shifting by less than that, because the instruction encoding is so completely different for the lane-width special case. So I had to write two sets of patterns to match based on the immediate shift count, which involved adding a ComplexPattern matcher to avoid the general-case pattern accidentally matching the special case too. For that family I've made sure to add an llc codegen test for both versions of each instruction. I'm experimenting with a new strategy for parametrising the isel patterns for all these instructions: adding extra fields to the relevant `Instruction` subclass itself, which are ignored by the Tablegen backends that generate the MC data, but can be retrieved from each instance of that instruction subclass when it's passed as a template parameter to the multiclass that generates its isel patterns. A nice effect of that is that I can fill in those informational fields using `let` blocks, rather than having to type them out once per instruction at `defm` time. (As a result, quite a lot of existing instruction `def`s are reindented by this patch, so it's clearer to read with whitespace changes ignored.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71458
2019-12-13 21:05:07 +08:00
def MVE_VRSHR_immu8 : MVE_VRSHR_imm<MVE_v16u8, shr_imm8> {
let Inst{21-19} = 0b001;
}
[ARM][MVE] Add intrinsics for more immediate shifts. Summary: This fills in the remaining shift operations that take a single vector input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and `vshll[bt]` families. `vshll[bt]` (which shifts each input lane left into a double-width output lane) is the most interesting one. There are separate MC instruction ids for shifting by exactly the input lane width and shifting by less than that, because the instruction encoding is so completely different for the lane-width special case. So I had to write two sets of patterns to match based on the immediate shift count, which involved adding a ComplexPattern matcher to avoid the general-case pattern accidentally matching the special case too. For that family I've made sure to add an llc codegen test for both versions of each instruction. I'm experimenting with a new strategy for parametrising the isel patterns for all these instructions: adding extra fields to the relevant `Instruction` subclass itself, which are ignored by the Tablegen backends that generate the MC data, but can be retrieved from each instance of that instruction subclass when it's passed as a template parameter to the multiclass that generates its isel patterns. A nice effect of that is that I can fill in those informational fields using `let` blocks, rather than having to type them out once per instruction at `defm` time. (As a result, quite a lot of existing instruction `def`s are reindented by this patch, so it's clearer to read with whitespace changes ignored.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71458
2019-12-13 21:05:07 +08:00
def MVE_VRSHR_imms16 : MVE_VRSHR_imm<MVE_v8s16, shr_imm16> {
let Inst{21-20} = 0b01;
}
[ARM][MVE] Add intrinsics for more immediate shifts. Summary: This fills in the remaining shift operations that take a single vector input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and `vshll[bt]` families. `vshll[bt]` (which shifts each input lane left into a double-width output lane) is the most interesting one. There are separate MC instruction ids for shifting by exactly the input lane width and shifting by less than that, because the instruction encoding is so completely different for the lane-width special case. So I had to write two sets of patterns to match based on the immediate shift count, which involved adding a ComplexPattern matcher to avoid the general-case pattern accidentally matching the special case too. For that family I've made sure to add an llc codegen test for both versions of each instruction. I'm experimenting with a new strategy for parametrising the isel patterns for all these instructions: adding extra fields to the relevant `Instruction` subclass itself, which are ignored by the Tablegen backends that generate the MC data, but can be retrieved from each instance of that instruction subclass when it's passed as a template parameter to the multiclass that generates its isel patterns. A nice effect of that is that I can fill in those informational fields using `let` blocks, rather than having to type them out once per instruction at `defm` time. (As a result, quite a lot of existing instruction `def`s are reindented by this patch, so it's clearer to read with whitespace changes ignored.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71458
2019-12-13 21:05:07 +08:00
def MVE_VRSHR_immu16 : MVE_VRSHR_imm<MVE_v8u16, shr_imm16> {
let Inst{21-20} = 0b01;
}
[ARM][MVE] Add intrinsics for more immediate shifts. Summary: This fills in the remaining shift operations that take a single vector input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and `vshll[bt]` families. `vshll[bt]` (which shifts each input lane left into a double-width output lane) is the most interesting one. There are separate MC instruction ids for shifting by exactly the input lane width and shifting by less than that, because the instruction encoding is so completely different for the lane-width special case. So I had to write two sets of patterns to match based on the immediate shift count, which involved adding a ComplexPattern matcher to avoid the general-case pattern accidentally matching the special case too. For that family I've made sure to add an llc codegen test for both versions of each instruction. I'm experimenting with a new strategy for parametrising the isel patterns for all these instructions: adding extra fields to the relevant `Instruction` subclass itself, which are ignored by the Tablegen backends that generate the MC data, but can be retrieved from each instance of that instruction subclass when it's passed as a template parameter to the multiclass that generates its isel patterns. A nice effect of that is that I can fill in those informational fields using `let` blocks, rather than having to type them out once per instruction at `defm` time. (As a result, quite a lot of existing instruction `def`s are reindented by this patch, so it's clearer to read with whitespace changes ignored.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71458
2019-12-13 21:05:07 +08:00
def MVE_VRSHR_imms32 : MVE_VRSHR_imm<MVE_v4s32, shr_imm32> {
let Inst{21} = 0b1;
}
def MVE_VRSHR_immu32 : MVE_VRSHR_imm<MVE_v4u32, shr_imm32> {
let Inst{21} = 0b1;
}
}
[ARM][MVE] Add intrinsics for more immediate shifts. Summary: This fills in the remaining shift operations that take a single vector input and an immediate shift count: the `vqshl`, `vqshlu`, `vrshr` and `vshll[bt]` families. `vshll[bt]` (which shifts each input lane left into a double-width output lane) is the most interesting one. There are separate MC instruction ids for shifting by exactly the input lane width and shifting by less than that, because the instruction encoding is so completely different for the lane-width special case. So I had to write two sets of patterns to match based on the immediate shift count, which involved adding a ComplexPattern matcher to avoid the general-case pattern accidentally matching the special case too. For that family I've made sure to add an llc codegen test for both versions of each instruction. I'm experimenting with a new strategy for parametrising the isel patterns for all these instructions: adding extra fields to the relevant `Instruction` subclass itself, which are ignored by the Tablegen backends that generate the MC data, but can be retrieved from each instance of that instruction subclass when it's passed as a template parameter to the multiclass that generates its isel patterns. A nice effect of that is that I can fill in those informational fields using `let` blocks, rather than having to type them out once per instruction at `defm` time. (As a result, quite a lot of existing instruction `def`s are reindented by this patch, so it's clearer to read with whitespace changes ignored.) Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71458
2019-12-13 21:05:07 +08:00
multiclass MVE_shift_imm_patterns<MVE_shift_with_imm inst> {
def : Pat<(inst.VTI.Vec !con((inst.unpred_int (inst.VTI.Vec MQPR:$src),
inst.immediateType:$imm),
inst.unsignedFlag)),
(inst.VTI.Vec (inst (inst.VTI.Vec MQPR:$src),
inst.immediateType:$imm))>;
def : Pat<(inst.VTI.Vec !con((inst.pred_int (inst.VTI.Vec MQPR:$src),
inst.immediateType:$imm),
inst.unsignedFlag,
(? (inst.VTI.Pred VCCR:$mask),
(inst.VTI.Vec MQPR:$inactive)))),
(inst.VTI.Vec (inst (inst.VTI.Vec MQPR:$src),
inst.immediateType:$imm,
ARMVCCThen, (inst.VTI.Pred VCCR:$mask),
(inst.VTI.Vec MQPR:$inactive)))>;
}
defm : MVE_shift_imm_patterns<MVE_VQSHLimms8>;
defm : MVE_shift_imm_patterns<MVE_VQSHLimmu8>;
defm : MVE_shift_imm_patterns<MVE_VQSHLimms16>;
defm : MVE_shift_imm_patterns<MVE_VQSHLimmu16>;
defm : MVE_shift_imm_patterns<MVE_VQSHLimms32>;
defm : MVE_shift_imm_patterns<MVE_VQSHLimmu32>;
defm : MVE_shift_imm_patterns<MVE_VQSHLU_imms8>;
defm : MVE_shift_imm_patterns<MVE_VQSHLU_imms16>;
defm : MVE_shift_imm_patterns<MVE_VQSHLU_imms32>;
defm : MVE_shift_imm_patterns<MVE_VRSHR_imms8>;
defm : MVE_shift_imm_patterns<MVE_VRSHR_immu8>;
defm : MVE_shift_imm_patterns<MVE_VRSHR_imms16>;
defm : MVE_shift_imm_patterns<MVE_VRSHR_immu16>;
defm : MVE_shift_imm_patterns<MVE_VRSHR_imms32>;
defm : MVE_shift_imm_patterns<MVE_VRSHR_immu32>;
class MVE_VSHR_imm<string suffix, dag imm>
: MVE_shift_with_imm<"vshr", suffix, (outs MQPR:$Qd),
!con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm",
vpred_r, ""> {
bits<6> imm;
let Inst{25-24} = 0b11;
let Inst{21-16} = imm;
let Inst{10-8} = 0b000;
}
def MVE_VSHR_imms8 : MVE_VSHR_imm<"s8", (ins shr_imm8:$imm)> {
let Inst{28} = 0b0;
let Inst{21-19} = 0b001;
}
def MVE_VSHR_immu8 : MVE_VSHR_imm<"u8", (ins shr_imm8:$imm)> {
let Inst{28} = 0b1;
let Inst{21-19} = 0b001;
}
def MVE_VSHR_imms16 : MVE_VSHR_imm<"s16", (ins shr_imm16:$imm)> {
let Inst{28} = 0b0;
let Inst{21-20} = 0b01;
}
def MVE_VSHR_immu16 : MVE_VSHR_imm<"u16", (ins shr_imm16:$imm)> {
let Inst{28} = 0b1;
let Inst{21-20} = 0b01;
}
def MVE_VSHR_imms32 : MVE_VSHR_imm<"s32", (ins shr_imm32:$imm)> {
let Inst{28} = 0b0;
let Inst{21} = 0b1;
}
def MVE_VSHR_immu32 : MVE_VSHR_imm<"u32", (ins shr_imm32:$imm)> {
let Inst{28} = 0b1;
let Inst{21} = 0b1;
}
class MVE_VSHL_imm<string suffix, dag imm>
: MVE_shift_with_imm<"vshl", suffix, (outs MQPR:$Qd),
!con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm",
vpred_r, ""> {
bits<6> imm;
let Inst{28} = 0b0;
let Inst{25-24} = 0b11;
let Inst{21-16} = imm;
let Inst{10-8} = 0b101;
}
def MVE_VSHL_immi8 : MVE_VSHL_imm<"i8", (ins imm0_7:$imm)> {
let Inst{21-19} = 0b001;
}
def MVE_VSHL_immi16 : MVE_VSHL_imm<"i16", (ins imm0_15:$imm)> {
let Inst{21-20} = 0b01;
}
def MVE_VSHL_immi32 : MVE_VSHL_imm<"i32", (ins imm0_31:$imm)> {
let Inst{21} = 0b1;
}
[ARM][MVE] Add intrinsics for immediate shifts. (reland) This adds the family of `vshlq_n` and `vshrq_n` ACLE intrinsics, which shift every lane of a vector left or right by a compile-time immediate. They mostly work by expanding to the IR `shl`, `lshr` and `ashr` operations, with their second operand being a vector splat of the immediate. There's a fiddly special case, though. ACLE specifies that the immediate in `vshrq_n` can take values up to //and including// the bit size of the vector lane. But LLVM IR thinks that shifting right by the full size of the lane is UB, and feels free to replace the `lshr` with an `undef` half way through the optimization pipeline. Hence, to keep this legal in source code, I have to detect it at codegen time. Logical (unsigned) right shifts by the element size are handled by simply emitting the zero vector; arithmetic ones are converted into a shift of one bit less, which will always give the same output. In order to do that check, I also had to enhance the tablegen MveEmitter so that it can cope with converting a builtin function's operand into a bare integer to pass to a code-generating subfunction. Previously the only bare integers it knew how to handle were flags generated from within `arm_mve.td`. Reviewers: dmgreen, miyuki, MarkMurrayARM, ostannard Reviewed By: dmgreen, MarkMurrayARM Subscribers: echristo, hokein, rdhindsa, kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71065
2019-12-11 18:04:14 +08:00
multiclass MVE_immediate_shift_patterns_inner<
MVEVectorVTInfo VTI, Operand imm_operand_type, SDNode unpred_op,
Intrinsic pred_int, Instruction inst, list<int> unsignedFlag = []> {
def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$src), imm_operand_type:$imm)),
(VTI.Vec (inst (VTI.Vec MQPR:$src), imm_operand_type:$imm))>;
def : Pat<(VTI.Vec !con((pred_int (VTI.Vec MQPR:$src), imm_operand_type:$imm),
!dag(pred_int, unsignedFlag, ?),
(pred_int (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive)))),
(VTI.Vec (inst (VTI.Vec MQPR:$src), imm_operand_type:$imm,
ARMVCCThen, (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive)))>;
}
multiclass MVE_immediate_shift_patterns<MVEVectorVTInfo VTI,
Operand imm_operand_type> {
defm : MVE_immediate_shift_patterns_inner<VTI, imm_operand_type,
ARMvshlImm, int_arm_mve_shl_imm_predicated,
!cast<Instruction>("MVE_VSHL_immi" # VTI.BitsSuffix)>;
defm : MVE_immediate_shift_patterns_inner<VTI, imm_operand_type,
ARMvshruImm, int_arm_mve_shr_imm_predicated,
!cast<Instruction>("MVE_VSHR_immu" # VTI.BitsSuffix), [1]>;
defm : MVE_immediate_shift_patterns_inner<VTI, imm_operand_type,
ARMvshrsImm, int_arm_mve_shr_imm_predicated,
!cast<Instruction>("MVE_VSHR_imms" # VTI.BitsSuffix), [0]>;
}
let Predicates = [HasMVEInt] in {
[ARM][MVE] Add intrinsics for immediate shifts. (reland) This adds the family of `vshlq_n` and `vshrq_n` ACLE intrinsics, which shift every lane of a vector left or right by a compile-time immediate. They mostly work by expanding to the IR `shl`, `lshr` and `ashr` operations, with their second operand being a vector splat of the immediate. There's a fiddly special case, though. ACLE specifies that the immediate in `vshrq_n` can take values up to //and including// the bit size of the vector lane. But LLVM IR thinks that shifting right by the full size of the lane is UB, and feels free to replace the `lshr` with an `undef` half way through the optimization pipeline. Hence, to keep this legal in source code, I have to detect it at codegen time. Logical (unsigned) right shifts by the element size are handled by simply emitting the zero vector; arithmetic ones are converted into a shift of one bit less, which will always give the same output. In order to do that check, I also had to enhance the tablegen MveEmitter so that it can cope with converting a builtin function's operand into a bare integer to pass to a code-generating subfunction. Previously the only bare integers it knew how to handle were flags generated from within `arm_mve.td`. Reviewers: dmgreen, miyuki, MarkMurrayARM, ostannard Reviewed By: dmgreen, MarkMurrayARM Subscribers: echristo, hokein, rdhindsa, kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D71065
2019-12-11 18:04:14 +08:00
defm : MVE_immediate_shift_patterns<MVE_v16i8, imm0_7>;
defm : MVE_immediate_shift_patterns<MVE_v8i16, imm0_15>;
defm : MVE_immediate_shift_patterns<MVE_v4i32, imm0_31>;
}
// end of mve_shift instructions
// start of MVE Floating Point instructions
class MVE_float<string iname, string suffix, dag oops, dag iops, string ops,
vpred_ops vpred, string cstr, list<dag> pattern=[]>
: MVE_f<oops, iops, NoItinerary, iname, suffix, ops, vpred, cstr, pattern> {
bits<4> Qm;
let Inst{12} = 0b0;
let Inst{6} = 0b1;
let Inst{5} = Qm{3};
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b0;
}
class MVE_VRINT<string rmode, bits<3> op, string suffix, bits<2> size,
list<dag> pattern=[]>
: MVE_float<!strconcat("vrint", rmode), suffix, (outs MQPR:$Qd),
(ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", pattern> {
bits<4> Qd;
let Inst{28} = 0b1;
let Inst{25-23} = 0b111;
let Inst{22} = Qd{3};
let Inst{21-20} = 0b11;
let Inst{19-18} = size;
let Inst{17-16} = 0b10;
let Inst{15-13} = Qd{2-0};
let Inst{11-10} = 0b01;
let Inst{9-7} = op{2-0};
let Inst{4} = 0b0;
let validForTailPredication = 1;
}
multiclass MVE_VRINT_ops<string suffix, bits<2> size, list<dag> pattern=[]> {
def N : MVE_VRINT<"n", 0b000, suffix, size, pattern>;
def X : MVE_VRINT<"x", 0b001, suffix, size, pattern>;
def A : MVE_VRINT<"a", 0b010, suffix, size, pattern>;
def Z : MVE_VRINT<"z", 0b011, suffix, size, pattern>;
def M : MVE_VRINT<"m", 0b101, suffix, size, pattern>;
def P : MVE_VRINT<"p", 0b111, suffix, size, pattern>;
}
defm MVE_VRINTf16 : MVE_VRINT_ops<"f16", 0b01>;
defm MVE_VRINTf32 : MVE_VRINT_ops<"f32", 0b10>;
let Predicates = [HasMVEFloat] in {
def : Pat<(v4f32 (frint (v4f32 MQPR:$val1))),
(v4f32 (MVE_VRINTf32X (v4f32 MQPR:$val1)))>;
def : Pat<(v8f16 (frint (v8f16 MQPR:$val1))),
(v8f16 (MVE_VRINTf16X (v8f16 MQPR:$val1)))>;
def : Pat<(v4f32 (fround (v4f32 MQPR:$val1))),
(v4f32 (MVE_VRINTf32A (v4f32 MQPR:$val1)))>;
def : Pat<(v8f16 (fround (v8f16 MQPR:$val1))),
(v8f16 (MVE_VRINTf16A (v8f16 MQPR:$val1)))>;
def : Pat<(v4f32 (ftrunc (v4f32 MQPR:$val1))),
(v4f32 (MVE_VRINTf32Z (v4f32 MQPR:$val1)))>;
def : Pat<(v8f16 (ftrunc (v8f16 MQPR:$val1))),
(v8f16 (MVE_VRINTf16Z (v8f16 MQPR:$val1)))>;
def : Pat<(v4f32 (ffloor (v4f32 MQPR:$val1))),
(v4f32 (MVE_VRINTf32M (v4f32 MQPR:$val1)))>;
def : Pat<(v8f16 (ffloor (v8f16 MQPR:$val1))),
(v8f16 (MVE_VRINTf16M (v8f16 MQPR:$val1)))>;
def : Pat<(v4f32 (fceil (v4f32 MQPR:$val1))),
(v4f32 (MVE_VRINTf32P (v4f32 MQPR:$val1)))>;
def : Pat<(v8f16 (fceil (v8f16 MQPR:$val1))),
(v8f16 (MVE_VRINTf16P (v8f16 MQPR:$val1)))>;
}
class MVEFloatArithNeon<string iname, string suffix, bit size,
dag oops, dag iops, string ops,
vpred_ops vpred, string cstr, list<dag> pattern=[]>
: MVE_float<iname, suffix, oops, iops, ops, vpred, cstr, pattern> {
let Inst{20} = size;
let Inst{16} = 0b0;
}
class MVE_VMUL_fp<string iname, string suffix, bit size, list<dag> pattern=[]>
: MVEFloatArithNeon<iname, suffix, size, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm", vpred_r, "",
pattern> {
bits<4> Qd;
bits<4> Qn;
let Inst{28} = 0b1;
let Inst{25-23} = 0b110;
let Inst{22} = Qd{3};
let Inst{21} = 0b0;
let Inst{19-17} = Qn{2-0};
let Inst{15-13} = Qd{2-0};
let Inst{12-8} = 0b01101;
let Inst{7} = Qn{3};
let Inst{4} = 0b1;
let validForTailPredication = 1;
}
multiclass MVE_VMULT_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI,
SDNode unpred_op, Intrinsic pred_int> {
def "" : MVE_VMUL_fp<iname, VTI.Suffix, VTI.Size{0}>;
let Predicates = [HasMVEFloat] in {
def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive)))>;
}
}
multiclass MVE_VMUL_fp_m<MVEVectorVTInfo VTI>
: MVE_VMULT_fp_m<"vmul", 0, VTI, fmul, int_arm_mve_mul_predicated>;
defm MVE_VMULf32 : MVE_VMUL_fp_m<MVE_v4f32>;
defm MVE_VMULf16 : MVE_VMUL_fp_m<MVE_v8f16>;
class MVE_VCMLA<string suffix, bit size>
: MVEFloatArithNeon<"vcmla", suffix, size, (outs MQPR:$Qd),
(ins MQPR:$Qd_src, MQPR:$Qn, MQPR:$Qm, complexrotateop:$rot),
"$Qd, $Qn, $Qm, $rot", vpred_n, "$Qd = $Qd_src", []> {
bits<4> Qd;
bits<4> Qn;
bits<2> rot;
let Inst{28} = 0b1;
let Inst{25} = 0b0;
let Inst{24-23} = rot;
let Inst{22} = Qd{3};
let Inst{21} = 0b1;
let Inst{19-17} = Qn{2-0};
let Inst{15-13} = Qd{2-0};
let Inst{12-8} = 0b01000;
let Inst{7} = Qn{3};
let Inst{4} = 0b0;
}
multiclass MVE_VCMLA_m<MVEVectorVTInfo VTI, bit size> {
def "" : MVE_VCMLA<VTI.Suffix, size>;
let Predicates = [HasMVEFloat] in {
def : Pat<(VTI.Vec (int_arm_mve_vcmlaq
imm:$rot, (VTI.Vec MQPR:$Qd_src),
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qd_src),
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
imm:$rot))>;
def : Pat<(VTI.Vec (int_arm_mve_vcmlaq_predicated
imm:$rot, (VTI.Vec MQPR:$Qd_src),
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
(VTI.Pred VCCR:$mask))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qd_src), (VTI.Vec MQPR:$Qn),
(VTI.Vec MQPR:$Qm), imm:$rot,
ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
}
}
defm MVE_VCMLAf16 : MVE_VCMLA_m<MVE_v8f16, 0b0>;
defm MVE_VCMLAf32 : MVE_VCMLA_m<MVE_v4f32, 0b1>;
class MVE_VADDSUBFMA_fp<string iname, string suffix, bit size, bit bit_4,
bit bit_8, bit bit_21, dag iops=(ins),
vpred_ops vpred=vpred_r, string cstr="",
list<dag> pattern=[]>
: MVEFloatArithNeon<iname, suffix, size, (outs MQPR:$Qd),
!con(iops, (ins MQPR:$Qn, MQPR:$Qm)), "$Qd, $Qn, $Qm",
vpred, cstr, pattern> {
bits<4> Qd;
bits<4> Qn;
let Inst{28} = 0b0;
let Inst{25-23} = 0b110;
let Inst{22} = Qd{3};
let Inst{21} = bit_21;
let Inst{19-17} = Qn{2-0};
let Inst{15-13} = Qd{2-0};
let Inst{11-9} = 0b110;
let Inst{8} = bit_8;
let Inst{7} = Qn{3};
let Inst{4} = bit_4;
}
def MVE_VFMAf32 : MVE_VADDSUBFMA_fp<"vfma", "f32", 0b0, 0b1, 0b0, 0b0,
(ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
def MVE_VFMAf16 : MVE_VADDSUBFMA_fp<"vfma", "f16", 0b1, 0b1, 0b0, 0b0,
(ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
def MVE_VFMSf32 : MVE_VADDSUBFMA_fp<"vfms", "f32", 0b0, 0b1, 0b0, 0b1,
(ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
def MVE_VFMSf16 : MVE_VADDSUBFMA_fp<"vfms", "f16", 0b1, 0b1, 0b0, 0b1,
(ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
2019-08-08 16:21:01 +08:00
let Predicates = [HasMVEFloat] in {
def : Pat<(v8f16 (fma (v8f16 MQPR:$src1), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))),
(v8f16 (MVE_VFMAf16 $src3, $src1, $src2))>;
def : Pat<(v4f32 (fma (v4f32 MQPR:$src1), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))),
(v4f32 (MVE_VFMAf32 $src3, $src1, $src2))>;
def : Pat<(v8f16 (fma (fneg (v8f16 MQPR:$src1)), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))),
(v8f16 (MVE_VFMSf16 $src3, $src1, $src2))>;
def : Pat<(v4f32 (fma (fneg (v4f32 MQPR:$src1)), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))),
(v4f32 (MVE_VFMSf32 $src3, $src1, $src2))>;
2019-08-08 16:21:01 +08:00
}
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
multiclass MVE_VADDSUB_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI,
SDNode unpred_op, Intrinsic pred_int> {
def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size{0}, 0, 1, bit_21> {
let validForTailPredication = 1;
}
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
let Predicates = [HasMVEFloat] in {
def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
(VTI.Vec MQPR:$inactive)))>;
}
}
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
multiclass MVE_VADD_fp_m<MVEVectorVTInfo VTI>
: MVE_VADDSUB_fp_m<"vadd", 0, VTI, fadd, int_arm_mve_add_predicated>;
multiclass MVE_VSUB_fp_m<MVEVectorVTInfo VTI>
: MVE_VADDSUB_fp_m<"vsub", 1, VTI, fsub, int_arm_mve_sub_predicated>;
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
defm MVE_VADDf32 : MVE_VADD_fp_m<MVE_v4f32>;
defm MVE_VADDf16 : MVE_VADD_fp_m<MVE_v8f16>;
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
defm MVE_VSUBf32 : MVE_VSUB_fp_m<MVE_v4f32>;
defm MVE_VSUBf16 : MVE_VSUB_fp_m<MVE_v8f16>;
class MVE_VCADD<string suffix, bit size, string cstr="">
: MVEFloatArithNeon<"vcadd", suffix, size, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm, complexrotateopodd:$rot),
"$Qd, $Qn, $Qm, $rot", vpred_r, cstr, []> {
bits<4> Qd;
bits<4> Qn;
bit rot;
let Inst{28} = 0b1;
let Inst{25} = 0b0;
let Inst{24} = rot;
let Inst{23} = 0b1;
let Inst{22} = Qd{3};
let Inst{21} = 0b0;
let Inst{19-17} = Qn{2-0};
let Inst{15-13} = Qd{2-0};
let Inst{12-8} = 0b01000;
let Inst{7} = Qn{3};
let Inst{4} = 0b0;
}
multiclass MVE_VCADD_m<MVEVectorVTInfo VTI, bit size, string cstr=""> {
def "" : MVE_VCADD<VTI.Suffix, size, cstr>;
let Predicates = [HasMVEFloat] in {
def : Pat<(VTI.Vec (int_arm_mve_vcaddq (i32 1),
imm:$rot, (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
imm:$rot))>;
def : Pat<(VTI.Vec (int_arm_mve_vcaddq_predicated (i32 1),
imm:$rot, (VTI.Vec MQPR:$inactive),
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
(VTI.Pred VCCR:$mask))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
imm:$rot,
ARMVCCThen, (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive)))>;
}
}
defm MVE_VCADDf16 : MVE_VCADD_m<MVE_v8f16, 0b0>;
defm MVE_VCADDf32 : MVE_VCADD_m<MVE_v4f32, 0b1, "@earlyclobber $Qd">;
class MVE_VABD_fp<string suffix, bit size>
: MVE_float<"vabd", suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm),
"$Qd, $Qn, $Qm", vpred_r, ""> {
bits<4> Qd;
bits<4> Qn;
let Inst{28} = 0b1;
let Inst{25-23} = 0b110;
let Inst{22} = Qd{3};
let Inst{21} = 0b1;
let Inst{20} = size;
let Inst{19-17} = Qn{2-0};
let Inst{16} = 0b0;
let Inst{15-13} = Qd{2-0};
let Inst{11-8} = 0b1101;
let Inst{7} = Qn{3};
let Inst{4} = 0b0;
let validForTailPredication = 1;
}
multiclass MVE_VABDT_fp_m<MVEVectorVTInfo VTI,
Intrinsic unpred_int, Intrinsic pred_int> {
def "" : MVE_VABD_fp<VTI.Suffix, VTI.Size{0}>;
let Predicates = [HasMVEFloat] in {
def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(i32 0))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(i32 0), (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive)))>;
}
}
multiclass MVE_VABD_fp_m<MVEVectorVTInfo VTI>
: MVE_VABDT_fp_m<VTI, int_arm_mve_vabd, int_arm_mve_abd_predicated>;
defm MVE_VABDf32 : MVE_VABD_fp_m<MVE_v4f32>;
defm MVE_VABDf16 : MVE_VABD_fp_m<MVE_v8f16>;
class MVE_VCVT_fix<string suffix, bit fsi, bit U, bit op,
Operand imm_operand_type, list<dag> pattern=[]>
: MVE_float<"vcvt", suffix,
(outs MQPR:$Qd), (ins MQPR:$Qm, imm_operand_type:$imm6),
"$Qd, $Qm, $imm6", vpred_r, "", pattern> {
bits<4> Qd;
bits<6> imm6;
let Inst{28} = U;
let Inst{25-23} = 0b111;
let Inst{22} = Qd{3};
let Inst{21} = 0b1;
let Inst{19-16} = imm6{3-0};
let Inst{15-13} = Qd{2-0};
let Inst{11-10} = 0b11;
let Inst{9} = fsi;
let Inst{8} = op;
let Inst{7} = 0b0;
let Inst{4} = 0b1;
let DecoderMethod = "DecodeMVEVCVTt1fp";
let validForTailPredication = 1;
}
class MVE_VCVT_imm_asmop<int Bits> : AsmOperandClass {
let PredicateMethod = "isImmediate<1," # Bits # ">";
let DiagnosticString =
"MVE fixed-point immediate operand must be between 1 and " # Bits;
let Name = "MVEVcvtImm" # Bits;
let RenderMethod = "addImmOperands";
}
class MVE_VCVT_imm<int Bits>: Operand<i32> {
let ParserMatchClass = MVE_VCVT_imm_asmop<Bits>;
let EncoderMethod = "getNEONVcvtImm32OpValue";
let DecoderMethod = "DecodeVCVTImmOperand";
}
class MVE_VCVT_fix_f32<string suffix, bit U, bit op>
: MVE_VCVT_fix<suffix, 0b1, U, op, MVE_VCVT_imm<32>> {
let Inst{20} = imm6{4};
}
class MVE_VCVT_fix_f16<string suffix, bit U, bit op>
: MVE_VCVT_fix<suffix, 0b0, U, op, MVE_VCVT_imm<16>> {
let Inst{20} = 0b1;
}
def MVE_VCVTf16s16_fix : MVE_VCVT_fix_f16<"f16.s16", 0b0, 0b0>;
def MVE_VCVTs16f16_fix : MVE_VCVT_fix_f16<"s16.f16", 0b0, 0b1>;
def MVE_VCVTf16u16_fix : MVE_VCVT_fix_f16<"f16.u16", 0b1, 0b0>;
def MVE_VCVTu16f16_fix : MVE_VCVT_fix_f16<"u16.f16", 0b1, 0b1>;
def MVE_VCVTf32s32_fix : MVE_VCVT_fix_f32<"f32.s32", 0b0, 0b0>;
def MVE_VCVTs32f32_fix : MVE_VCVT_fix_f32<"s32.f32", 0b0, 0b1>;
def MVE_VCVTf32u32_fix : MVE_VCVT_fix_f32<"f32.u32", 0b1, 0b0>;
def MVE_VCVTu32f32_fix : MVE_VCVT_fix_f32<"u32.f32", 0b1, 0b1>;
class MVE_VCVT_fp_int_anpm<string suffix, bits<2> size, bit op, string anpm,
bits<2> rm, list<dag> pattern=[]>
: MVE_float<!strconcat("vcvt", anpm), suffix, (outs MQPR:$Qd),
(ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", pattern> {
bits<4> Qd;
let Inst{28} = 0b1;
let Inst{25-23} = 0b111;
let Inst{22} = Qd{3};
let Inst{21-20} = 0b11;
let Inst{19-18} = size;
let Inst{17-16} = 0b11;
let Inst{15-13} = Qd{2-0};
let Inst{12-10} = 0b000;
let Inst{9-8} = rm;
let Inst{7} = op;
let Inst{4} = 0b0;
let validForTailPredication = 1;
}
multiclass MVE_VCVT_fp_int_anpm_multi<string suffix, bits<2> size, bit op,
list<dag> pattern=[]> {
def a : MVE_VCVT_fp_int_anpm<suffix, size, op, "a", 0b00>;
def n : MVE_VCVT_fp_int_anpm<suffix, size, op, "n", 0b01>;
def p : MVE_VCVT_fp_int_anpm<suffix, size, op, "p", 0b10>;
def m : MVE_VCVT_fp_int_anpm<suffix, size, op, "m", 0b11>;
}
// This defines instructions such as MVE_VCVTu16f16a, with an explicit
// rounding-mode suffix on the mnemonic. The class below will define
// the bare MVE_VCVTu16f16 (with implied rounding toward zero).
defm MVE_VCVTs16f16 : MVE_VCVT_fp_int_anpm_multi<"s16.f16", 0b01, 0b0>;
defm MVE_VCVTu16f16 : MVE_VCVT_fp_int_anpm_multi<"u16.f16", 0b01, 0b1>;
defm MVE_VCVTs32f32 : MVE_VCVT_fp_int_anpm_multi<"s32.f32", 0b10, 0b0>;
defm MVE_VCVTu32f32 : MVE_VCVT_fp_int_anpm_multi<"u32.f32", 0b10, 0b1>;
class MVE_VCVT_fp_int<string suffix, bits<2> size, bits<2> op,
list<dag> pattern=[]>
: MVE_float<"vcvt", suffix, (outs MQPR:$Qd),
(ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", pattern> {
bits<4> Qd;
let Inst{28} = 0b1;
let Inst{25-23} = 0b111;
let Inst{22} = Qd{3};
let Inst{21-20} = 0b11;
let Inst{19-18} = size;
let Inst{17-16} = 0b11;
let Inst{15-13} = Qd{2-0};
let Inst{12-9} = 0b0011;
let Inst{8-7} = op;
let Inst{4} = 0b0;
let validForTailPredication = 1;
}
// The unsuffixed VCVT for float->int implicitly rounds toward zero,
// which I reflect here in the llvm instruction names
def MVE_VCVTs16f16z : MVE_VCVT_fp_int<"s16.f16", 0b01, 0b10>;
def MVE_VCVTu16f16z : MVE_VCVT_fp_int<"u16.f16", 0b01, 0b11>;
def MVE_VCVTs32f32z : MVE_VCVT_fp_int<"s32.f32", 0b10, 0b10>;
def MVE_VCVTu32f32z : MVE_VCVT_fp_int<"u32.f32", 0b10, 0b11>;
// Whereas VCVT for int->float rounds to nearest
def MVE_VCVTf16s16n : MVE_VCVT_fp_int<"f16.s16", 0b01, 0b00>;
def MVE_VCVTf16u16n : MVE_VCVT_fp_int<"f16.u16", 0b01, 0b01>;
def MVE_VCVTf32s32n : MVE_VCVT_fp_int<"f32.s32", 0b10, 0b00>;
def MVE_VCVTf32u32n : MVE_VCVT_fp_int<"f32.u32", 0b10, 0b01>;
let Predicates = [HasMVEFloat] in {
def : Pat<(v4i32 (fp_to_sint (v4f32 MQPR:$src))),
(v4i32 (MVE_VCVTs32f32z (v4f32 MQPR:$src)))>;
def : Pat<(v4i32 (fp_to_uint (v4f32 MQPR:$src))),
(v4i32 (MVE_VCVTu32f32z (v4f32 MQPR:$src)))>;
def : Pat<(v8i16 (fp_to_sint (v8f16 MQPR:$src))),
(v8i16 (MVE_VCVTs16f16z (v8f16 MQPR:$src)))>;
def : Pat<(v8i16 (fp_to_uint (v8f16 MQPR:$src))),
(v8i16 (MVE_VCVTu16f16z (v8f16 MQPR:$src)))>;
def : Pat<(v4f32 (sint_to_fp (v4i32 MQPR:$src))),
(v4f32 (MVE_VCVTf32s32n (v4i32 MQPR:$src)))>;
def : Pat<(v4f32 (uint_to_fp (v4i32 MQPR:$src))),
(v4f32 (MVE_VCVTf32u32n (v4i32 MQPR:$src)))>;
def : Pat<(v8f16 (sint_to_fp (v8i16 MQPR:$src))),
(v8f16 (MVE_VCVTf16s16n (v8i16 MQPR:$src)))>;
def : Pat<(v8f16 (uint_to_fp (v8i16 MQPR:$src))),
(v8f16 (MVE_VCVTf16u16n (v8i16 MQPR:$src)))>;
}
class MVE_VABSNEG_fp<string iname, string suffix, bits<2> size, bit negate,
list<dag> pattern=[]>
: MVE_float<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", pattern> {
bits<4> Qd;
let Inst{28} = 0b1;
let Inst{25-23} = 0b111;
let Inst{22} = Qd{3};
let Inst{21-20} = 0b11;
let Inst{19-18} = size;
let Inst{17-16} = 0b01;
let Inst{15-13} = Qd{2-0};
let Inst{11-8} = 0b0111;
let Inst{7} = negate;
let Inst{4} = 0b0;
let validForTailPredication = 1;
}
def MVE_VABSf16 : MVE_VABSNEG_fp<"vabs", "f16", 0b01, 0b0>;
def MVE_VABSf32 : MVE_VABSNEG_fp<"vabs", "f32", 0b10, 0b0>;
let Predicates = [HasMVEFloat] in {
def : Pat<(v8f16 (fabs MQPR:$src)),
(MVE_VABSf16 MQPR:$src)>;
def : Pat<(v4f32 (fabs MQPR:$src)),
(MVE_VABSf32 MQPR:$src)>;
}
def MVE_VNEGf16 : MVE_VABSNEG_fp<"vneg", "f16", 0b01, 0b1>;
def MVE_VNEGf32 : MVE_VABSNEG_fp<"vneg", "f32", 0b10, 0b1>;
let Predicates = [HasMVEFloat] in {
def : Pat<(v8f16 (fneg MQPR:$src)),
(MVE_VNEGf16 MQPR:$src)>;
def : Pat<(v4f32 (fneg MQPR:$src)),
(MVE_VNEGf32 MQPR:$src)>;
}
class MVE_VMAXMINNMA<string iname, string suffix, bit size, bit bit_12,
list<dag> pattern=[]>
: MVE_f<(outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm),
NoItinerary, iname, suffix, "$Qd, $Qm", vpred_n, "$Qd = $Qd_src",
pattern> {
bits<4> Qd;
bits<4> Qm;
let Inst{28} = size;
let Inst{25-23} = 0b100;
let Inst{22} = Qd{3};
let Inst{21-16} = 0b111111;
let Inst{15-13} = Qd{2-0};
let Inst{12} = bit_12;
let Inst{11-6} = 0b111010;
let Inst{5} = Qm{3};
let Inst{4} = 0b0;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b1;
}
def MVE_VMAXNMAf32 : MVE_VMAXMINNMA<"vmaxnma", "f32", 0b0, 0b0>;
def MVE_VMAXNMAf16 : MVE_VMAXMINNMA<"vmaxnma", "f16", 0b1, 0b0>;
def MVE_VMINNMAf32 : MVE_VMAXMINNMA<"vminnma", "f32", 0b0, 0b1>;
def MVE_VMINNMAf16 : MVE_VMAXMINNMA<"vminnma", "f16", 0b1, 0b1>;
// end of MVE Floating Point instructions
// start of MVE compares
class MVE_VCMPqq<string suffix, bit bit_28, bits<2> bits_21_20,
VCMPPredicateOperand predtype, list<dag> pattern=[]>
: MVE_p<(outs VCCR:$P0), (ins MQPR:$Qn, MQPR:$Qm, predtype:$fc),
NoItinerary, "vcmp", suffix, "$fc, $Qn, $Qm", vpred_n, "", pattern> {
// Base class for comparing two vector registers
bits<3> fc;
bits<4> Qn;
bits<4> Qm;
let Inst{28} = bit_28;
let Inst{25-22} = 0b1000;
let Inst{21-20} = bits_21_20;
let Inst{19-17} = Qn{2-0};
let Inst{16-13} = 0b1000;
let Inst{12} = fc{2};
let Inst{11-8} = 0b1111;
let Inst{7} = fc{0};
let Inst{6} = 0b0;
let Inst{5} = Qm{3};
let Inst{4} = 0b0;
let Inst{3-1} = Qm{2-0};
let Inst{0} = fc{1};
let Constraints = "";
// We need a custom decoder method for these instructions because of
// the output VCCR operand, which isn't encoded in the instruction
// bits anywhere (there is only one choice for it) but has to be
// included in the MC operands so that codegen will be able to track
// its data flow between instructions, spill/reload it when
// necessary, etc. There seems to be no way to get the Tablegen
// decoder to emit an operand that isn't affected by any instruction
// bit.
let DecoderMethod = "DecodeMVEVCMP<false," # predtype.DecoderMethod # ">";
let validForTailPredication = 1;
}
class MVE_VCMPqqf<string suffix, bit size>
: MVE_VCMPqq<suffix, size, 0b11, pred_basic_fp> {
let Predicates = [HasMVEFloat];
}
class MVE_VCMPqqi<string suffix, bits<2> size>
: MVE_VCMPqq<suffix, 0b1, size, pred_basic_i> {
let Inst{12} = 0b0;
let Inst{0} = 0b0;
}
class MVE_VCMPqqu<string suffix, bits<2> size>
: MVE_VCMPqq<suffix, 0b1, size, pred_basic_u> {
let Inst{12} = 0b0;
let Inst{0} = 0b1;
}
class MVE_VCMPqqs<string suffix, bits<2> size>
: MVE_VCMPqq<suffix, 0b1, size, pred_basic_s> {
let Inst{12} = 0b1;
}
def MVE_VCMPf32 : MVE_VCMPqqf<"f32", 0b0>;
def MVE_VCMPf16 : MVE_VCMPqqf<"f16", 0b1>;
def MVE_VCMPi8 : MVE_VCMPqqi<"i8", 0b00>;
def MVE_VCMPi16 : MVE_VCMPqqi<"i16", 0b01>;
def MVE_VCMPi32 : MVE_VCMPqqi<"i32", 0b10>;
def MVE_VCMPu8 : MVE_VCMPqqu<"u8", 0b00>;
def MVE_VCMPu16 : MVE_VCMPqqu<"u16", 0b01>;
def MVE_VCMPu32 : MVE_VCMPqqu<"u32", 0b10>;
def MVE_VCMPs8 : MVE_VCMPqqs<"s8", 0b00>;
def MVE_VCMPs16 : MVE_VCMPqqs<"s16", 0b01>;
def MVE_VCMPs32 : MVE_VCMPqqs<"s32", 0b10>;
class MVE_VCMPqr<string suffix, bit bit_28, bits<2> bits_21_20,
VCMPPredicateOperand predtype, list<dag> pattern=[]>
: MVE_p<(outs VCCR:$P0), (ins MQPR:$Qn, GPRwithZR:$Rm, predtype:$fc),
NoItinerary, "vcmp", suffix, "$fc, $Qn, $Rm", vpred_n, "", pattern> {
// Base class for comparing a vector register with a scalar
bits<3> fc;
bits<4> Qn;
bits<4> Rm;
let Inst{28} = bit_28;
let Inst{25-22} = 0b1000;
let Inst{21-20} = bits_21_20;
let Inst{19-17} = Qn{2-0};
let Inst{16-13} = 0b1000;
let Inst{12} = fc{2};
let Inst{11-8} = 0b1111;
let Inst{7} = fc{0};
let Inst{6} = 0b1;
let Inst{5} = fc{1};
let Inst{4} = 0b0;
let Inst{3-0} = Rm{3-0};
let Constraints = "";
// Custom decoder method, for the same reason as MVE_VCMPqq
let DecoderMethod = "DecodeMVEVCMP<true," # predtype.DecoderMethod # ">";
let validForTailPredication = 1;
}
class MVE_VCMPqrf<string suffix, bit size>
: MVE_VCMPqr<suffix, size, 0b11, pred_basic_fp> {
let Predicates = [HasMVEFloat];
}
class MVE_VCMPqri<string suffix, bits<2> size>
: MVE_VCMPqr<suffix, 0b1, size, pred_basic_i> {
let Inst{12} = 0b0;
let Inst{5} = 0b0;
}
class MVE_VCMPqru<string suffix, bits<2> size>
: MVE_VCMPqr<suffix, 0b1, size, pred_basic_u> {
let Inst{12} = 0b0;
let Inst{5} = 0b1;
}
class MVE_VCMPqrs<string suffix, bits<2> size>
: MVE_VCMPqr<suffix, 0b1, size, pred_basic_s> {
let Inst{12} = 0b1;
}
def MVE_VCMPf32r : MVE_VCMPqrf<"f32", 0b0>;
def MVE_VCMPf16r : MVE_VCMPqrf<"f16", 0b1>;
def MVE_VCMPi8r : MVE_VCMPqri<"i8", 0b00>;
def MVE_VCMPi16r : MVE_VCMPqri<"i16", 0b01>;
def MVE_VCMPi32r : MVE_VCMPqri<"i32", 0b10>;
def MVE_VCMPu8r : MVE_VCMPqru<"u8", 0b00>;
def MVE_VCMPu16r : MVE_VCMPqru<"u16", 0b01>;
def MVE_VCMPu32r : MVE_VCMPqru<"u32", 0b10>;
def MVE_VCMPs8r : MVE_VCMPqrs<"s8", 0b00>;
def MVE_VCMPs16r : MVE_VCMPqrs<"s16", 0b01>;
def MVE_VCMPs32r : MVE_VCMPqrs<"s32", 0b10>;
multiclass unpred_vcmp_z<string suffix, PatLeaf fc> {
def i8 : Pat<(v16i1 (ARMvcmpz (v16i8 MQPR:$v1), fc)),
(v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc))>;
def i16 : Pat<(v8i1 (ARMvcmpz (v8i16 MQPR:$v1), fc)),
(v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc))>;
def i32 : Pat<(v4i1 (ARMvcmpz (v4i32 MQPR:$v1), fc)),
(v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc))>;
def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmpz (v16i8 MQPR:$v1), fc)))),
(v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8i16 MQPR:$v1), fc)))),
(v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4i32 MQPR:$v1), fc)))),
(v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>;
}
multiclass unpred_vcmp_r<string suffix, PatLeaf fc> {
def i8 : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc)),
(v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc))>;
def i16 : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc)),
(v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc))>;
def i32 : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc)),
(v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc))>;
def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), fc)),
(v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc))>;
def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), fc)),
(v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc))>;
def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), fc)),
(v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc))>;
def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc)))),
(v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc)))),
(v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc)))),
(v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), fc)))),
(v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), fc)))),
(v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), fc)))),
(v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
}
multiclass unpred_vcmpf_z<PatLeaf fc> {
def f16 : Pat<(v8i1 (ARMvcmpz (v8f16 MQPR:$v1), fc)),
(v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, fc))>;
def f32 : Pat<(v4i1 (ARMvcmpz (v4f32 MQPR:$v1), fc)),
(v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc))>;
def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8f16 MQPR:$v1), fc)))),
(v8i1 (MVE_VCMPf32r (v8f16 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4f32 MQPR:$v1), fc)))),
(v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>;
}
multiclass unpred_vcmpf_r<int fc> {
def f16 : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)),
(v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>;
def f32 : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)),
(v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>;
def f16r : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), fc)),
(v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc))>;
def f32r : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), fc)),
(v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc))>;
def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)))),
(v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)))),
(v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), fc)))),
(v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), fc)))),
(v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc, ARMVCCThen, VCCR:$p1))>;
}
let Predicates = [HasMVEInt] in {
defm MVE_VCEQZ : unpred_vcmp_z<"i", ARMCCeq>;
defm MVE_VCNEZ : unpred_vcmp_z<"i", ARMCCne>;
defm MVE_VCGEZ : unpred_vcmp_z<"s", ARMCCge>;
defm MVE_VCLTZ : unpred_vcmp_z<"s", ARMCClt>;
defm MVE_VCGTZ : unpred_vcmp_z<"s", ARMCCgt>;
defm MVE_VCLEZ : unpred_vcmp_z<"s", ARMCCle>;
defm MVE_VCGTUZ : unpred_vcmp_z<"u", ARMCChi>;
defm MVE_VCGEUZ : unpred_vcmp_z<"u", ARMCChs>;
defm MVE_VCEQ : unpred_vcmp_r<"i", ARMCCeq>;
defm MVE_VCNE : unpred_vcmp_r<"i", ARMCCne>;
defm MVE_VCGE : unpred_vcmp_r<"s", ARMCCge>;
defm MVE_VCLT : unpred_vcmp_r<"s", ARMCClt>;
defm MVE_VCGT : unpred_vcmp_r<"s", ARMCCgt>;
defm MVE_VCLE : unpred_vcmp_r<"s", ARMCCle>;
defm MVE_VCGTU : unpred_vcmp_r<"u", ARMCChi>;
defm MVE_VCGEU : unpred_vcmp_r<"u", ARMCChs>;
}
let Predicates = [HasMVEFloat] in {
defm MVE_VFCEQZ : unpred_vcmpf_z<ARMCCeq>;
defm MVE_VFCNEZ : unpred_vcmpf_z<ARMCCne>;
defm MVE_VFCGEZ : unpred_vcmpf_z<ARMCCge>;
defm MVE_VFCLTZ : unpred_vcmpf_z<ARMCClt>;
defm MVE_VFCGTZ : unpred_vcmpf_z<ARMCCgt>;
defm MVE_VFCLEZ : unpred_vcmpf_z<ARMCCle>;
defm MVE_VFCEQ : unpred_vcmpf_r<ARMCCeq>;
defm MVE_VFCNE : unpred_vcmpf_r<ARMCCne>;
defm MVE_VFCGE : unpred_vcmpf_r<ARMCCge>;
defm MVE_VFCLT : unpred_vcmpf_r<ARMCClt>;
defm MVE_VFCGT : unpred_vcmpf_r<ARMCCgt>;
defm MVE_VFCLE : unpred_vcmpf_r<ARMCCle>;
}
// Extra "worst case" and/or/xor partterns, going into and out of GRP
multiclass two_predops<SDPatternOperator opnode, Instruction insn> {
def v16i1 : Pat<(v16i1 (opnode (v16i1 VCCR:$p1), (v16i1 VCCR:$p2))),
(v16i1 (COPY_TO_REGCLASS
(insn (i32 (COPY_TO_REGCLASS (v16i1 VCCR:$p1), rGPR)),
(i32 (COPY_TO_REGCLASS (v16i1 VCCR:$p2), rGPR))),
VCCR))>;
def v8i1 : Pat<(v8i1 (opnode (v8i1 VCCR:$p1), (v8i1 VCCR:$p2))),
(v8i1 (COPY_TO_REGCLASS
(insn (i32 (COPY_TO_REGCLASS (v8i1 VCCR:$p1), rGPR)),
(i32 (COPY_TO_REGCLASS (v8i1 VCCR:$p2), rGPR))),
VCCR))>;
def v4i1 : Pat<(v4i1 (opnode (v4i1 VCCR:$p1), (v4i1 VCCR:$p2))),
(v4i1 (COPY_TO_REGCLASS
(insn (i32 (COPY_TO_REGCLASS (v4i1 VCCR:$p1), rGPR)),
(i32 (COPY_TO_REGCLASS (v4i1 VCCR:$p2), rGPR))),
VCCR))>;
}
let Predicates = [HasMVEInt] in {
defm POR : two_predops<or, t2ORRrr>;
defm PAND : two_predops<and, t2ANDrr>;
defm PEOR : two_predops<xor, t2EORrr>;
}
// Occasionally we need to cast between a i32 and a boolean vector, for
// example when moving between rGPR and VPR.P0 as part of predicate vector
// shuffles. We also sometimes need to cast between different predicate
// vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles.
def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>;
let Predicates = [HasMVEInt] in {
foreach VT = [ v4i1, v8i1, v16i1 ] in {
def : Pat<(i32 (predicate_cast (VT VCCR:$src))),
(i32 (COPY_TO_REGCLASS (VT VCCR:$src), VCCR))>;
def : Pat<(VT (predicate_cast (i32 VCCR:$src))),
(VT (COPY_TO_REGCLASS (i32 VCCR:$src), VCCR))>;
foreach VT2 = [ v4i1, v8i1, v16i1 ] in
def : Pat<(VT (predicate_cast (VT2 VCCR:$src))),
(VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>;
}
}
// end of MVE compares
// start of MVE_qDest_qSrc
class MVE_qDest_qSrc<string iname, string suffix, dag oops, dag iops,
string ops, vpred_ops vpred, string cstr,
list<dag> pattern=[]>
: MVE_p<oops, iops, NoItinerary, iname, suffix,
ops, vpred, cstr, pattern> {
bits<4> Qd;
bits<4> Qm;
let Inst{25-23} = 0b100;
let Inst{22} = Qd{3};
let Inst{15-13} = Qd{2-0};
let Inst{11-9} = 0b111;
let Inst{6} = 0b0;
let Inst{5} = Qm{3};
let Inst{4} = 0b0;
let Inst{3-1} = Qm{2-0};
}
class MVE_VQxDMLxDH<string iname, bit exch, bit round, bit subtract,
string suffix, bits<2> size, string cstr="", list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qd_src, MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm",
vpred_n, "$Qd = $Qd_src"#cstr, pattern> {
bits<4> Qn;
let Inst{28} = subtract;
let Inst{21-20} = size;
let Inst{19-17} = Qn{2-0};
let Inst{16} = 0b0;
let Inst{12} = exch;
let Inst{8} = 0b0;
let Inst{7} = Qn{3};
let Inst{0} = round;
}
multiclass MVE_VQxDMLxDH_multi<string iname, bit exch,
bit round, bit subtract> {
def s8 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s8", 0b00>;
def s16 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s16", 0b01>;
def s32 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s32", 0b10, ",@earlyclobber $Qd">;
}
defm MVE_VQDMLADH : MVE_VQxDMLxDH_multi<"vqdmladh", 0b0, 0b0, 0b0>;
defm MVE_VQDMLADHX : MVE_VQxDMLxDH_multi<"vqdmladhx", 0b1, 0b0, 0b0>;
defm MVE_VQRDMLADH : MVE_VQxDMLxDH_multi<"vqrdmladh", 0b0, 0b1, 0b0>;
defm MVE_VQRDMLADHX : MVE_VQxDMLxDH_multi<"vqrdmladhx", 0b1, 0b1, 0b0>;
defm MVE_VQDMLSDH : MVE_VQxDMLxDH_multi<"vqdmlsdh", 0b0, 0b0, 0b1>;
defm MVE_VQDMLSDHX : MVE_VQxDMLxDH_multi<"vqdmlsdhx", 0b1, 0b0, 0b1>;
defm MVE_VQRDMLSDH : MVE_VQxDMLxDH_multi<"vqrdmlsdh", 0b0, 0b1, 0b1>;
defm MVE_VQRDMLSDHX : MVE_VQxDMLxDH_multi<"vqrdmlsdhx", 0b1, 0b1, 0b1>;
class MVE_VCMUL<string iname, string suffix, bit size, string cstr="">
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm, complexrotateop:$rot),
"$Qd, $Qn, $Qm, $rot", vpred_r, cstr, []> {
bits<4> Qn;
bits<2> rot;
let Inst{28} = size;
let Inst{21-20} = 0b11;
let Inst{19-17} = Qn{2-0};
let Inst{16} = 0b0;
let Inst{12} = rot{1};
let Inst{8} = 0b0;
let Inst{7} = Qn{3};
let Inst{0} = rot{0};
let Predicates = [HasMVEFloat];
}
multiclass MVE_VCMUL_m<string iname, MVEVectorVTInfo VTI,
bit size, string cstr=""> {
def "" : MVE_VCMUL<iname, VTI.Suffix, size, cstr>;
let Predicates = [HasMVEFloat] in {
def : Pat<(VTI.Vec (int_arm_mve_vcmulq
imm:$rot, (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
imm:$rot))>;
def : Pat<(VTI.Vec (int_arm_mve_vcmulq_predicated
imm:$rot, (VTI.Vec MQPR:$inactive),
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
(VTI.Pred VCCR:$mask))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
imm:$rot,
ARMVCCThen, (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive)))>;
}
}
defm MVE_VCMULf16 : MVE_VCMUL_m<"vcmul", MVE_v8f16, 0b0>;
defm MVE_VCMULf32 : MVE_VCMUL_m<"vcmul", MVE_v4f32, 0b1, "@earlyclobber $Qd">;
class MVE_VMULL<string iname, string suffix, bit bit_28, bits<2> bits_21_20,
bit T, string cstr, list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm",
vpred_r, cstr, pattern> {
bits<4> Qd;
bits<4> Qn;
bits<4> Qm;
let Inst{28} = bit_28;
let Inst{21-20} = bits_21_20;
let Inst{19-17} = Qn{2-0};
let Inst{16} = 0b1;
let Inst{12} = T;
let Inst{8} = 0b0;
let Inst{7} = Qn{3};
let Inst{0} = 0b0;
let validForTailPredication = 1;
}
multiclass MVE_VMULL_m<MVEVectorVTInfo VTI,
SDNode unpred_op, Intrinsic pred_int,
bit Top, string cstr=""> {
def "" : MVE_VMULL<"vmull" # !if(Top, "t", "b"), VTI.Suffix, VTI.Unsigned,
VTI.Size, Top, cstr>;
foreach uflag = [!if(!eq(VTI.SuffixLetter, "p"),
(?), (? (i32 VTI.Unsigned)))] in
let Predicates = [HasMVEInt] in {
// Unpredicated multiply
def : Pat<(VTI.DblVec !con((unpred_op (VTI.Vec MQPR:$Qm),
(VTI.Vec MQPR:$Qn)),
uflag, (? (i32 Top)))),
(VTI.DblVec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
// Predicated multiply
def : Pat<(VTI.DblVec !con((pred_int (VTI.Vec MQPR:$Qm),
(VTI.Vec MQPR:$Qn)),
uflag, (? (i32 Top), (VTI.Pred VCCR:$mask),
(VTI.DblVec MQPR:$inactive)))),
(VTI.DblVec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
(VTI.DblVec MQPR:$inactive)))>;
}
}
// For polynomial multiplies, the size bits take the unused value 0b11, and
// the unsigned bit switches to encoding the size.
defm MVE_VMULLBs8 : MVE_VMULL_m<MVE_v16s8, int_arm_mve_vmull,
int_arm_mve_mull_int_predicated, 0b0>;
defm MVE_VMULLTs8 : MVE_VMULL_m<MVE_v16s8, int_arm_mve_vmull,
int_arm_mve_mull_int_predicated, 0b1>;
defm MVE_VMULLBs16 : MVE_VMULL_m<MVE_v8s16, int_arm_mve_vmull,
int_arm_mve_mull_int_predicated, 0b0>;
defm MVE_VMULLTs16 : MVE_VMULL_m<MVE_v8s16, int_arm_mve_vmull,
int_arm_mve_mull_int_predicated, 0b1>;
defm MVE_VMULLBs32 : MVE_VMULL_m<MVE_v4s32, int_arm_mve_vmull,
int_arm_mve_mull_int_predicated, 0b0,
"@earlyclobber $Qd">;
defm MVE_VMULLTs32 : MVE_VMULL_m<MVE_v4s32, int_arm_mve_vmull,
int_arm_mve_mull_int_predicated, 0b1,
"@earlyclobber $Qd">;
defm MVE_VMULLBu8 : MVE_VMULL_m<MVE_v16u8, int_arm_mve_vmull,
int_arm_mve_mull_int_predicated, 0b0>;
defm MVE_VMULLTu8 : MVE_VMULL_m<MVE_v16u8, int_arm_mve_vmull,
int_arm_mve_mull_int_predicated, 0b1>;
defm MVE_VMULLBu16 : MVE_VMULL_m<MVE_v8u16, int_arm_mve_vmull,
int_arm_mve_mull_int_predicated, 0b0>;
defm MVE_VMULLTu16 : MVE_VMULL_m<MVE_v8u16, int_arm_mve_vmull,
int_arm_mve_mull_int_predicated, 0b1>;
defm MVE_VMULLBu32 : MVE_VMULL_m<MVE_v4u32, int_arm_mve_vmull,
int_arm_mve_mull_int_predicated, 0b0,
"@earlyclobber $Qd">;
defm MVE_VMULLTu32 : MVE_VMULL_m<MVE_v4u32, int_arm_mve_vmull,
int_arm_mve_mull_int_predicated, 0b1,
"@earlyclobber $Qd">;
defm MVE_VMULLBp8 : MVE_VMULL_m<MVE_v16p8, int_arm_mve_vmull_poly,
int_arm_mve_mull_poly_predicated, 0b0>;
defm MVE_VMULLTp8 : MVE_VMULL_m<MVE_v16p8, int_arm_mve_vmull_poly,
int_arm_mve_mull_poly_predicated, 0b1>;
defm MVE_VMULLBp16 : MVE_VMULL_m<MVE_v8p16, int_arm_mve_vmull_poly,
int_arm_mve_mull_poly_predicated, 0b0>;
defm MVE_VMULLTp16 : MVE_VMULL_m<MVE_v8p16, int_arm_mve_vmull_poly,
int_arm_mve_mull_poly_predicated, 0b1>;
class MVE_VxMULH<string iname, string suffix, bit U, bits<2> size, bit round,
list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm",
vpred_r, "", pattern> {
bits<4> Qn;
let Inst{28} = U;
let Inst{21-20} = size;
let Inst{19-17} = Qn{2-0};
let Inst{16} = 0b1;
let Inst{12} = round;
let Inst{8} = 0b0;
let Inst{7} = Qn{3};
let Inst{0} = 0b1;
}
multiclass MVE_VxMULH_m<string iname, MVEVectorVTInfo VTI, SDNode unpred_op,
Intrinsic pred_int, bit round> {
def "" : MVE_VxMULH<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, round>;
let Predicates = [HasMVEInt] in {
// Unpredicated multiply returning high bits
def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(i32 VTI.Unsigned))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
// Predicated multiply returning high bits
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
ARMVCCThen, (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive)))>;
}
}
multiclass MVE_VMULT<string iname, MVEVectorVTInfo VTI, bit round>
: MVE_VxMULH_m<iname, VTI, !if(round, int_arm_mve_vrmulh, int_arm_mve_vmulh),
!if(round, int_arm_mve_rmulh_predicated,
int_arm_mve_mulh_predicated),
round>;
defm MVE_VMULHs8 : MVE_VMULT<"vmulh", MVE_v16s8, 0b0>;
defm MVE_VMULHs16 : MVE_VMULT<"vmulh", MVE_v8s16, 0b0>;
defm MVE_VMULHs32 : MVE_VMULT<"vmulh", MVE_v4s32, 0b0>;
defm MVE_VMULHu8 : MVE_VMULT<"vmulh", MVE_v16u8, 0b0>;
defm MVE_VMULHu16 : MVE_VMULT<"vmulh", MVE_v8u16, 0b0>;
defm MVE_VMULHu32 : MVE_VMULT<"vmulh", MVE_v4u32, 0b0>;
defm MVE_VRMULHs8 : MVE_VMULT<"vrmulh", MVE_v16s8, 0b1>;
defm MVE_VRMULHs16 : MVE_VMULT<"vrmulh", MVE_v8s16, 0b1>;
defm MVE_VRMULHs32 : MVE_VMULT<"vrmulh", MVE_v4s32, 0b1>;
defm MVE_VRMULHu8 : MVE_VMULT<"vrmulh", MVE_v16u8, 0b1>;
defm MVE_VRMULHu16 : MVE_VMULT<"vrmulh", MVE_v8u16, 0b1>;
defm MVE_VRMULHu32 : MVE_VMULT<"vrmulh", MVE_v4u32, 0b1>;
class MVE_VxMOVxN<string iname, string suffix, bit bit_28, bit bit_17,
bits<2> size, bit T, list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qd_src, MQPR:$Qm), "$Qd, $Qm",
vpred_n, "$Qd = $Qd_src", pattern> {
let Inst{28} = bit_28;
let Inst{21-20} = 0b11;
let Inst{19-18} = size;
let Inst{17} = bit_17;
let Inst{16} = 0b1;
let Inst{12} = T;
let Inst{8} = 0b0;
let Inst{7} = !if(!eq(bit_17, 0), 1, 0);
let Inst{0} = 0b1;
}
multiclass MVE_VxMOVxN_halves<string iname, string suffix,
bit bit_28, bit bit_17, bits<2> size> {
def bh : MVE_VxMOVxN<iname # "b", suffix, bit_28, bit_17, size, 0b0>;
def th : MVE_VxMOVxN<iname # "t", suffix, bit_28, bit_17, size, 0b1>;
}
defm MVE_VMOVNi16 : MVE_VxMOVxN_halves<"vmovn", "i16", 0b1, 0b0, 0b00>;
defm MVE_VMOVNi32 : MVE_VxMOVxN_halves<"vmovn", "i32", 0b1, 0b0, 0b01>;
defm MVE_VQMOVNs16 : MVE_VxMOVxN_halves<"vqmovn", "s16", 0b0, 0b1, 0b00>;
defm MVE_VQMOVNs32 : MVE_VxMOVxN_halves<"vqmovn", "s32", 0b0, 0b1, 0b01>;
defm MVE_VQMOVNu16 : MVE_VxMOVxN_halves<"vqmovn", "u16", 0b1, 0b1, 0b00>;
defm MVE_VQMOVNu32 : MVE_VxMOVxN_halves<"vqmovn", "u32", 0b1, 0b1, 0b01>;
defm MVE_VQMOVUNs16 : MVE_VxMOVxN_halves<"vqmovun", "s16", 0b0, 0b0, 0b00>;
defm MVE_VQMOVUNs32 : MVE_VxMOVxN_halves<"vqmovun", "s32", 0b0, 0b0, 0b01>;
def MVEvmovn : SDNode<"ARMISD::VMOVN", SDTARMVEXT>;
let Predicates = [HasMVEInt] in {
def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))),
(v8i16 (MVE_VMOVNi32bh (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))),
(v8i16 (MVE_VMOVNi32th (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 0))),
(v16i8 (MVE_VMOVNi16bh (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 1))),
(v16i8 (MVE_VMOVNi16th (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
}
class MVE_VCVT_ff<string iname, string suffix, bit op, bit T,
list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm),
"$Qd, $Qm", vpred_n, "$Qd = $Qd_src", pattern> {
let Inst{28} = op;
let Inst{21-16} = 0b111111;
let Inst{12} = T;
let Inst{8-7} = 0b00;
let Inst{0} = 0b1;
let Predicates = [HasMVEFloat];
}
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
multiclass MVE_VCVT_f2h_m<string iname, int half> {
def "": MVE_VCVT_ff<iname, "f16.f32", 0b0, half>;
let Predicates = [HasMVEFloat] in {
def : Pat<(v8f16 (int_arm_mve_vcvt_narrow
(v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), (i32 half))),
(v8f16 (!cast<Instruction>(NAME)
(v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm)))>;
def : Pat<(v8f16 (int_arm_mve_vcvt_narrow_predicated
(v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), (i32 half),
(v4i1 VCCR:$mask))),
(v8f16 (!cast<Instruction>(NAME)
(v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm),
ARMVCCThen, (v4i1 VCCR:$mask)))>;
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
}
}
multiclass MVE_VCVT_h2f_m<string iname, int half> {
def "": MVE_VCVT_ff<iname, "f32.f16", 0b1, half>;
}
[ARM] Begin adding IR intrinsics for MVE instructions. This commit, together with the next few, will add a representative sample of the kind of IR intrinsics that we'll need in order to implement the user-facing ACLE intrinsics for MVE. Supporting all of them will take more work; the intention of this initial series of commits is to implement an intrinsic or two from lots of different categories, as examples and proofs of concept. This initial commit introduces a small number of IR intrinsics for instructions simple enough that they can use Tablegen ISel patterns: the predicated versions of the VADD and VSUB instructions (both integer and FP), VMIN and VMAX, and the float->half VCVT instruction (predicated and unpredicated). When using VPT-predicated instructions in automatic code generation, it will be convenient to specify the predicate value as a vector of the appropriate number of i1. To make it easy to specify all sizes of an instruction in one go and give each one the matching predicate vector type, I've added a system of Tablegen informational records describing MVE's vector types: each one gives the underlying LLVM IR ValueType (which may not be the same if the MVE vector is of explicitly signed or unsigned integers) and an appropriate vNi1 to use as the predicate vector. (Also, those info records include the usual encoding for the types, so that as we add associations between each instruction encoding and one of the new `MVEVectorVTInfo` records, we can remove some of the existing template parameters and replace them with references to the vector type info's fields.) The user-facing ACLE intrinsics will receive a predicate mask as a 16-bit integer, so I've also provided a pair of intrinsics i2v and v2i, to convert between an integer and a vector of i1 by just changing the register class. Reviewers: dmgreen, miyuki, ostannard Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67158
2019-10-08 00:00:51 +08:00
defm MVE_VCVTf16f32bh : MVE_VCVT_f2h_m<"vcvtb", 0b0>;
defm MVE_VCVTf16f32th : MVE_VCVT_f2h_m<"vcvtt", 0b1>;
defm MVE_VCVTf32f16bh : MVE_VCVT_h2f_m<"vcvtb", 0b0>;
defm MVE_VCVTf32f16th : MVE_VCVT_h2f_m<"vcvtt", 0b1>;
class MVE_VxCADD<string iname, string suffix, bits<2> size, bit halve,
string cstr="">
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm, complexrotateopodd:$rot),
"$Qd, $Qn, $Qm, $rot", vpred_r, cstr, []> {
bits<4> Qn;
bit rot;
let Inst{28} = halve;
let Inst{21-20} = size;
let Inst{19-17} = Qn{2-0};
let Inst{16} = 0b0;
let Inst{12} = rot;
let Inst{8} = 0b1;
let Inst{7} = Qn{3};
let Inst{0} = 0b0;
}
multiclass MVE_VxCADD_m<string iname, MVEVectorVTInfo VTI,
bit halve, string cstr=""> {
def "" : MVE_VxCADD<iname, VTI.Suffix, VTI.Size, halve, cstr>;
let Predicates = [HasMVEInt] in {
def : Pat<(VTI.Vec (int_arm_mve_vcaddq halve,
imm:$rot, (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
imm:$rot))>;
def : Pat<(VTI.Vec (int_arm_mve_vcaddq_predicated halve,
imm:$rot, (VTI.Vec MQPR:$inactive),
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
(VTI.Pred VCCR:$mask))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
imm:$rot,
ARMVCCThen, (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive)))>;
}
}
defm MVE_VCADDi8 : MVE_VxCADD_m<"vcadd", MVE_v16i8, 0b1>;
defm MVE_VCADDi16 : MVE_VxCADD_m<"vcadd", MVE_v8i16, 0b1>;
defm MVE_VCADDi32 : MVE_VxCADD_m<"vcadd", MVE_v4i32, 0b1, "@earlyclobber $Qd">;
defm MVE_VHCADDs8 : MVE_VxCADD_m<"vhcadd", MVE_v16s8, 0b0>;
defm MVE_VHCADDs16 : MVE_VxCADD_m<"vhcadd", MVE_v8s16, 0b0>;
defm MVE_VHCADDs32 : MVE_VxCADD_m<"vhcadd", MVE_v4s32, 0b0, "@earlyclobber $Qd">;
class MVE_VADCSBC<string iname, bit I, bit subtract,
dag carryin, list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, "i32", (outs MQPR:$Qd, cl_FPSCR_NZCV:$carryout),
!con((ins MQPR:$Qn, MQPR:$Qm), carryin),
"$Qd, $Qn, $Qm", vpred_r, "", pattern> {
bits<4> Qn;
let Inst{28} = subtract;
let Inst{21-20} = 0b11;
let Inst{19-17} = Qn{2-0};
let Inst{16} = 0b0;
let Inst{12} = I;
let Inst{8} = 0b1;
let Inst{7} = Qn{3};
let Inst{0} = 0b0;
// Custom decoder method in order to add the FPSCR operand(s), which
// Tablegen won't do right
let DecoderMethod = "DecodeMVEVADCInstruction";
}
def MVE_VADC : MVE_VADCSBC<"vadc", 0b0, 0b0, (ins cl_FPSCR_NZCV:$carryin)>;
def MVE_VADCI : MVE_VADCSBC<"vadci", 0b1, 0b0, (ins)>;
def MVE_VSBC : MVE_VADCSBC<"vsbc", 0b0, 0b1, (ins cl_FPSCR_NZCV:$carryin)>;
def MVE_VSBCI : MVE_VADCSBC<"vsbci", 0b1, 0b1, (ins)>;
class MVE_VQDMULL<string iname, string suffix, bit size, bit T,
string cstr="", list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm",
vpred_r, cstr, pattern> {
bits<4> Qn;
let Inst{28} = size;
let Inst{21-20} = 0b11;
let Inst{19-17} = Qn{2-0};
let Inst{16} = 0b0;
let Inst{12} = T;
let Inst{8} = 0b1;
let Inst{7} = Qn{3};
let Inst{0} = 0b1;
let validForTailPredication = 1;
}
multiclass MVE_VQDMULL_halves<string suffix, bit size, string cstr=""> {
def bh : MVE_VQDMULL<"vqdmullb", suffix, size, 0b0, cstr>;
def th : MVE_VQDMULL<"vqdmullt", suffix, size, 0b1, cstr>;
}
defm MVE_VQDMULLs16 : MVE_VQDMULL_halves<"s16", 0b0>;
defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<"s32", 0b1, "@earlyclobber $Qd">;
// end of mve_qDest_qSrc
// start of mve_qDest_rSrc
class MVE_qr_base<dag oops, dag iops, InstrItinClass itin, string iname,
string suffix, string ops, vpred_ops vpred, string cstr,
list<dag> pattern=[]>
: MVE_p<oops, iops, NoItinerary, iname, suffix, ops, vpred, cstr, pattern> {
bits<4> Qd;
bits<4> Qn;
bits<4> Rm;
let Inst{25-23} = 0b100;
let Inst{22} = Qd{3};
let Inst{19-17} = Qn{2-0};
let Inst{15-13} = Qd{2-0};
let Inst{11-9} = 0b111;
let Inst{7} = Qn{3};
let Inst{6} = 0b1;
let Inst{4} = 0b0;
let Inst{3-0} = Rm{3-0};
}
class MVE_qDest_rSrc<string iname, string suffix, string cstr="", list<dag> pattern=[]>
: MVE_qr_base<(outs MQPR:$Qd), (ins MQPR:$Qn, rGPR:$Rm),
NoItinerary, iname, suffix, "$Qd, $Qn, $Rm", vpred_r, cstr,
pattern>;
class MVE_qDestSrc_rSrc<string iname, string suffix, list<dag> pattern=[]>
: MVE_qr_base<(outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qn, rGPR:$Rm),
NoItinerary, iname, suffix, "$Qd, $Qn, $Rm", vpred_n, "$Qd = $Qd_src",
pattern>;
class MVE_qDest_single_rSrc<string iname, string suffix, list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qd_src, rGPR:$Rm), NoItinerary, iname,
suffix, "$Qd, $Rm", vpred_n, "$Qd = $Qd_src", pattern> {
bits<4> Qd;
bits<4> Rm;
let Inst{22} = Qd{3};
let Inst{15-13} = Qd{2-0};
let Inst{3-0} = Rm{3-0};
}
class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size,
bit bit_5, bit bit_12, bit bit_16,
bit bit_28, list<dag> pattern=[]>
: MVE_qDest_rSrc<iname, suffix, "", pattern> {
let Inst{28} = bit_28;
let Inst{21-20} = size;
let Inst{16} = bit_16;
let Inst{12} = bit_12;
let Inst{8} = 0b1;
let Inst{5} = bit_5;
let validForTailPredication = 1;
}
multiclass MVE_VADDSUB_qr_sizes<string iname, string suffix,
bit bit_5, bit bit_12, bit bit_16,
bit bit_28, list<dag> pattern=[]> {
def "8" : MVE_VADDSUB_qr<iname, suffix#"8", 0b00,
bit_5, bit_12, bit_16, bit_28>;
def "16" : MVE_VADDSUB_qr<iname, suffix#"16", 0b01,
bit_5, bit_12, bit_16, bit_28>;
def "32" : MVE_VADDSUB_qr<iname, suffix#"32", 0b10,
bit_5, bit_12, bit_16, bit_28>;
}
defm MVE_VADD_qr_i : MVE_VADDSUB_qr_sizes<"vadd", "i", 0b0, 0b0, 0b1, 0b0>;
defm MVE_VQADD_qr_s : MVE_VADDSUB_qr_sizes<"vqadd", "s", 0b1, 0b0, 0b0, 0b0>;
defm MVE_VQADD_qr_u : MVE_VADDSUB_qr_sizes<"vqadd", "u", 0b1, 0b0, 0b0, 0b1>;
defm MVE_VSUB_qr_i : MVE_VADDSUB_qr_sizes<"vsub", "i", 0b0, 0b1, 0b1, 0b0>;
defm MVE_VQSUB_qr_s : MVE_VADDSUB_qr_sizes<"vqsub", "s", 0b1, 0b1, 0b0, 0b0>;
defm MVE_VQSUB_qr_u : MVE_VADDSUB_qr_sizes<"vqsub", "u", 0b1, 0b1, 0b0, 0b1>;
let Predicates = [HasMVEInt] in {
def : Pat<(v16i8 (add (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))),
(v16i8 (MVE_VADD_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>;
def : Pat<(v8i16 (add (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))),
(v8i16 (MVE_VADD_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>;
def : Pat<(v4i32 (add (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))),
(v4i32 (MVE_VADD_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>;
}
let Predicates = [HasMVEInt] in {
def : Pat<(v16i8 (sub (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))),
(v16i8 (MVE_VSUB_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>;
def : Pat<(v8i16 (sub (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))),
(v8i16 (MVE_VSUB_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>;
def : Pat<(v4i32 (sub (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))),
(v4i32 (MVE_VSUB_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>;
}
class MVE_VQDMULL_qr<string iname, string suffix, bit size,
bit T, string cstr="", list<dag> pattern=[]>
: MVE_qDest_rSrc<iname, suffix, cstr, pattern> {
let Inst{28} = size;
let Inst{21-20} = 0b11;
let Inst{16} = 0b0;
let Inst{12} = T;
let Inst{8} = 0b1;
let Inst{5} = 0b1;
let validForTailPredication = 1;
}
multiclass MVE_VQDMULL_qr_halves<string suffix, bit size, string cstr=""> {
def bh : MVE_VQDMULL_qr<"vqdmullb", suffix, size, 0b0, cstr>;
def th : MVE_VQDMULL_qr<"vqdmullt", suffix, size, 0b1, cstr>;
}
defm MVE_VQDMULL_qr_s16 : MVE_VQDMULL_qr_halves<"s16", 0b0>;
defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<"s32", 0b1, "@earlyclobber $Qd">;
class MVE_VxADDSUB_qr<string iname, string suffix,
bit bit_28, bits<2> bits_21_20, bit subtract,
list<dag> pattern=[]>
: MVE_qDest_rSrc<iname, suffix, "", pattern> {
let Inst{28} = bit_28;
let Inst{21-20} = bits_21_20;
let Inst{16} = 0b0;
let Inst{12} = subtract;
let Inst{8} = 0b1;
let Inst{5} = 0b0;
let validForTailPredication = 1;
}
def MVE_VHADD_qr_s8 : MVE_VxADDSUB_qr<"vhadd", "s8", 0b0, 0b00, 0b0>;
def MVE_VHADD_qr_s16 : MVE_VxADDSUB_qr<"vhadd", "s16", 0b0, 0b01, 0b0>;
def MVE_VHADD_qr_s32 : MVE_VxADDSUB_qr<"vhadd", "s32", 0b0, 0b10, 0b0>;
def MVE_VHADD_qr_u8 : MVE_VxADDSUB_qr<"vhadd", "u8", 0b1, 0b00, 0b0>;
def MVE_VHADD_qr_u16 : MVE_VxADDSUB_qr<"vhadd", "u16", 0b1, 0b01, 0b0>;
def MVE_VHADD_qr_u32 : MVE_VxADDSUB_qr<"vhadd", "u32", 0b1, 0b10, 0b0>;
def MVE_VHSUB_qr_s8 : MVE_VxADDSUB_qr<"vhsub", "s8", 0b0, 0b00, 0b1>;
def MVE_VHSUB_qr_s16 : MVE_VxADDSUB_qr<"vhsub", "s16", 0b0, 0b01, 0b1>;
def MVE_VHSUB_qr_s32 : MVE_VxADDSUB_qr<"vhsub", "s32", 0b0, 0b10, 0b1>;
def MVE_VHSUB_qr_u8 : MVE_VxADDSUB_qr<"vhsub", "u8", 0b1, 0b00, 0b1>;
def MVE_VHSUB_qr_u16 : MVE_VxADDSUB_qr<"vhsub", "u16", 0b1, 0b01, 0b1>;
def MVE_VHSUB_qr_u32 : MVE_VxADDSUB_qr<"vhsub", "u32", 0b1, 0b10, 0b1>;
let Predicates = [HasMVEFloat] in {
def MVE_VADD_qr_f32 : MVE_VxADDSUB_qr<"vadd", "f32", 0b0, 0b11, 0b0>;
def MVE_VADD_qr_f16 : MVE_VxADDSUB_qr<"vadd", "f16", 0b1, 0b11, 0b0>;
def MVE_VSUB_qr_f32 : MVE_VxADDSUB_qr<"vsub", "f32", 0b0, 0b11, 0b1>;
def MVE_VSUB_qr_f16 : MVE_VxADDSUB_qr<"vsub", "f16", 0b1, 0b11, 0b1>;
}
class MVE_VxSHL_qr<string iname, string suffix, bit U, bits<2> size,
bit bit_7, bit bit_17, list<dag> pattern=[]>
: MVE_qDest_single_rSrc<iname, suffix, pattern> {
let Inst{28} = U;
let Inst{25-23} = 0b100;
let Inst{21-20} = 0b11;
let Inst{19-18} = size;
let Inst{17} = bit_17;
let Inst{16} = 0b1;
let Inst{12-8} = 0b11110;
let Inst{7} = bit_7;
let Inst{6-4} = 0b110;
let validForTailPredication = 1;
}
multiclass MVE_VxSHL_qr_p<string iname, MVEVectorVTInfo VTI, bit q, bit r> {
def "" : MVE_VxSHL_qr<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, q, r>;
def : Pat<(VTI.Vec (int_arm_mve_vshl_scalar
(VTI.Vec MQPR:$in), (i32 rGPR:$sh),
(i32 q), (i32 r), (i32 VTI.Unsigned))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$in), (i32 rGPR:$sh)))>;
def : Pat<(VTI.Vec (int_arm_mve_vshl_scalar_predicated
(VTI.Vec MQPR:$in), (i32 rGPR:$sh),
(i32 q), (i32 r), (i32 VTI.Unsigned),
(VTI.Pred VCCR:$mask))),
(VTI.Vec (!cast<Instruction>(NAME)
(VTI.Vec MQPR:$in), (i32 rGPR:$sh),
ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
}
multiclass MVE_VxSHL_qr_types<string iname, bit bit_7, bit bit_17> {
defm s8 : MVE_VxSHL_qr_p<iname, MVE_v16s8, bit_7, bit_17>;
defm s16 : MVE_VxSHL_qr_p<iname, MVE_v8s16, bit_7, bit_17>;
defm s32 : MVE_VxSHL_qr_p<iname, MVE_v4s32, bit_7, bit_17>;
defm u8 : MVE_VxSHL_qr_p<iname, MVE_v16u8, bit_7, bit_17>;
defm u16 : MVE_VxSHL_qr_p<iname, MVE_v8u16, bit_7, bit_17>;
defm u32 : MVE_VxSHL_qr_p<iname, MVE_v4u32, bit_7, bit_17>;
}
defm MVE_VSHL_qr : MVE_VxSHL_qr_types<"vshl", 0b0, 0b0>;
defm MVE_VRSHL_qr : MVE_VxSHL_qr_types<"vrshl", 0b0, 0b1>;
defm MVE_VQSHL_qr : MVE_VxSHL_qr_types<"vqshl", 0b1, 0b0>;
defm MVE_VQRSHL_qr : MVE_VxSHL_qr_types<"vqrshl", 0b1, 0b1>;
let Predicates = [HasMVEInt] in {
def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))),
(v4i32 (MVE_VSHL_qru32 (v4i32 MQPR:$Qm), GPR:$Rm))>;
def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))),
(v8i16 (MVE_VSHL_qru16 (v8i16 MQPR:$Qm), GPR:$Rm))>;
def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))),
(v16i8 (MVE_VSHL_qru8 (v16i8 MQPR:$Qm), GPR:$Rm))>;
def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))),
(v4i32 (MVE_VSHL_qrs32 (v4i32 MQPR:$Qm), GPR:$Rm))>;
def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))),
(v8i16 (MVE_VSHL_qrs16 (v8i16 MQPR:$Qm), GPR:$Rm))>;
def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))),
(v16i8 (MVE_VSHL_qrs8 (v16i8 MQPR:$Qm), GPR:$Rm))>;
}
class MVE_VBRSR<string iname, string suffix, bits<2> size, list<dag> pattern=[]>
: MVE_qDest_rSrc<iname, suffix, "", pattern> {
let Inst{28} = 0b1;
let Inst{21-20} = size;
let Inst{16} = 0b1;
let Inst{12} = 0b1;
let Inst{8} = 0b0;
let Inst{5} = 0b1;
let validForTailPredication = 1;
}
def MVE_VBRSR8 : MVE_VBRSR<"vbrsr", "8", 0b00>;
def MVE_VBRSR16 : MVE_VBRSR<"vbrsr", "16", 0b01>;
def MVE_VBRSR32 : MVE_VBRSR<"vbrsr", "32", 0b10>;
let Predicates = [HasMVEInt] in {
def : Pat<(v16i8 ( bitreverse (v16i8 MQPR:$val1))),
(v16i8 ( MVE_VBRSR8 (v16i8 MQPR:$val1), (t2MOVi (i32 8)) ))>;
def : Pat<(v4i32 ( bitreverse (v4i32 MQPR:$val1))),
(v4i32 ( MVE_VBRSR32 (v4i32 MQPR:$val1), (t2MOVi (i32 32)) ))>;
def : Pat<(v8i16 ( bitreverse (v8i16 MQPR:$val1))),
(v8i16 ( MVE_VBRSR16 (v8i16 MQPR:$val1), (t2MOVi (i32 16)) ))>;
}
class MVE_VMUL_qr_int<string iname, string suffix,
bits<2> size, list<dag> pattern=[]>
: MVE_qDest_rSrc<iname, suffix, "", pattern> {
let Inst{28} = 0b0;
let Inst{21-20} = size;
let Inst{16} = 0b1;
let Inst{12} = 0b1;
let Inst{8} = 0b0;
let Inst{5} = 0b1;
let validForTailPredication = 1;
}
def MVE_VMUL_qr_i8 : MVE_VMUL_qr_int<"vmul", "i8", 0b00>;
def MVE_VMUL_qr_i16 : MVE_VMUL_qr_int<"vmul", "i16", 0b01>;
def MVE_VMUL_qr_i32 : MVE_VMUL_qr_int<"vmul", "i32", 0b10>;
let Predicates = [HasMVEInt] in {
def : Pat<(v16i8 (mul (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))),
(v16i8 (MVE_VMUL_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>;
def : Pat<(v8i16 (mul (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))),
(v8i16 (MVE_VMUL_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>;
def : Pat<(v4i32 (mul (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))),
(v4i32 (MVE_VMUL_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>;
}
class MVE_VxxMUL_qr<string iname, string suffix,
bit bit_28, bits<2> bits_21_20, list<dag> pattern=[]>
: MVE_qDest_rSrc<iname, suffix, "", pattern> {
let Inst{28} = bit_28;
let Inst{21-20} = bits_21_20;
let Inst{16} = 0b1;
let Inst{12} = 0b0;
let Inst{8} = 0b0;
let Inst{5} = 0b1;
}
def MVE_VQDMULH_qr_s8 : MVE_VxxMUL_qr<"vqdmulh", "s8", 0b0, 0b00>;
def MVE_VQDMULH_qr_s16 : MVE_VxxMUL_qr<"vqdmulh", "s16", 0b0, 0b01>;
def MVE_VQDMULH_qr_s32 : MVE_VxxMUL_qr<"vqdmulh", "s32", 0b0, 0b10>;
def MVE_VQRDMULH_qr_s8 : MVE_VxxMUL_qr<"vqrdmulh", "s8", 0b1, 0b00>;
def MVE_VQRDMULH_qr_s16 : MVE_VxxMUL_qr<"vqrdmulh", "s16", 0b1, 0b01>;
def MVE_VQRDMULH_qr_s32 : MVE_VxxMUL_qr<"vqrdmulh", "s32", 0b1, 0b10>;
let Predicates = [HasMVEFloat], validForTailPredication = 1 in {
def MVE_VMUL_qr_f16 : MVE_VxxMUL_qr<"vmul", "f16", 0b1, 0b11>;
def MVE_VMUL_qr_f32 : MVE_VxxMUL_qr<"vmul", "f32", 0b0, 0b11>;
}
class MVE_VFMAMLA_qr<string iname, string suffix,
2019-07-31 18:08:09 +08:00
bit bit_28, bits<2> bits_21_20, bit S,
list<dag> pattern=[]>
: MVE_qDestSrc_rSrc<iname, suffix, pattern> {
let Inst{28} = bit_28;
let Inst{21-20} = bits_21_20;
let Inst{16} = 0b1;
let Inst{12} = S;
let Inst{8} = 0b0;
let Inst{5} = 0b0;
let validForTailPredication = 1;
}
def MVE_VMLA_qr_s8 : MVE_VFMAMLA_qr<"vmla", "s8", 0b0, 0b00, 0b0>;
def MVE_VMLA_qr_s16 : MVE_VFMAMLA_qr<"vmla", "s16", 0b0, 0b01, 0b0>;
def MVE_VMLA_qr_s32 : MVE_VFMAMLA_qr<"vmla", "s32", 0b0, 0b10, 0b0>;
def MVE_VMLA_qr_u8 : MVE_VFMAMLA_qr<"vmla", "u8", 0b1, 0b00, 0b0>;
def MVE_VMLA_qr_u16 : MVE_VFMAMLA_qr<"vmla", "u16", 0b1, 0b01, 0b0>;
def MVE_VMLA_qr_u32 : MVE_VFMAMLA_qr<"vmla", "u32", 0b1, 0b10, 0b0>;
def MVE_VMLAS_qr_s8 : MVE_VFMAMLA_qr<"vmlas", "s8", 0b0, 0b00, 0b1>;
def MVE_VMLAS_qr_s16 : MVE_VFMAMLA_qr<"vmlas", "s16", 0b0, 0b01, 0b1>;
def MVE_VMLAS_qr_s32 : MVE_VFMAMLA_qr<"vmlas", "s32", 0b0, 0b10, 0b1>;
def MVE_VMLAS_qr_u8 : MVE_VFMAMLA_qr<"vmlas", "u8", 0b1, 0b00, 0b1>;
def MVE_VMLAS_qr_u16 : MVE_VFMAMLA_qr<"vmlas", "u16", 0b1, 0b01, 0b1>;
def MVE_VMLAS_qr_u32 : MVE_VFMAMLA_qr<"vmlas", "u32", 0b1, 0b10, 0b1>;
let Predicates = [HasMVEInt] in {
def : Pat<(v4i32 (add (v4i32 MQPR:$src1),
(v4i32 (mul (v4i32 MQPR:$src2),
(v4i32 (ARMvdup (i32 rGPR:$x))))))),
(v4i32 (MVE_VMLA_qr_u32 $src1, $src2, $x))>;
def : Pat<(v8i16 (add (v8i16 MQPR:$src1),
(v8i16 (mul (v8i16 MQPR:$src2),
(v8i16 (ARMvdup (i32 rGPR:$x))))))),
(v8i16 (MVE_VMLA_qr_u16 $src1, $src2, $x))>;
def : Pat<(v16i8 (add (v16i8 MQPR:$src1),
(v16i8 (mul (v16i8 MQPR:$src2),
(v16i8 (ARMvdup (i32 rGPR:$x))))))),
(v16i8 (MVE_VMLA_qr_u8 $src1, $src2, $x))>;
}
let Predicates = [HasMVEFloat] in {
def MVE_VFMA_qr_f16 : MVE_VFMAMLA_qr<"vfma", "f16", 0b1, 0b11, 0b0>;
def MVE_VFMA_qr_f32 : MVE_VFMAMLA_qr<"vfma", "f32", 0b0, 0b11, 0b0>;
def MVE_VFMA_qr_Sf16 : MVE_VFMAMLA_qr<"vfmas", "f16", 0b1, 0b11, 0b1>;
def MVE_VFMA_qr_Sf32 : MVE_VFMAMLA_qr<"vfmas", "f32", 0b0, 0b11, 0b1>;
}
class MVE_VQDMLAH_qr<string iname, string suffix, bit U, bits<2> size,
bit bit_5, bit bit_12, list<dag> pattern=[]>
: MVE_qDestSrc_rSrc<iname, suffix, pattern> {
let Inst{28} = U;
let Inst{21-20} = size;
let Inst{16} = 0b0;
let Inst{12} = bit_12;
let Inst{8} = 0b0;
let Inst{5} = bit_5;
}
multiclass MVE_VQDMLAH_qr_types<string iname, bit bit_5, bit bit_12> {
def s8 : MVE_VQDMLAH_qr<iname, "s8", 0b0, 0b00, bit_5, bit_12>;
def s16 : MVE_VQDMLAH_qr<iname, "s16", 0b0, 0b01, bit_5, bit_12>;
def s32 : MVE_VQDMLAH_qr<iname, "s32", 0b0, 0b10, bit_5, bit_12>;
}
defm MVE_VQDMLAH_qr : MVE_VQDMLAH_qr_types<"vqdmlah", 0b1, 0b0>;
defm MVE_VQRDMLAH_qr : MVE_VQDMLAH_qr_types<"vqrdmlah", 0b0, 0b0>;
defm MVE_VQDMLASH_qr : MVE_VQDMLAH_qr_types<"vqdmlash", 0b1, 0b1>;
defm MVE_VQRDMLASH_qr : MVE_VQDMLAH_qr_types<"vqrdmlash", 0b0, 0b1>;
class MVE_VxDUP<string iname, string suffix, bits<2> size, bit bit_12,
list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd, tGPREven:$Rn),
(ins tGPREven:$Rn_src, MVE_VIDUP_imm:$imm), NoItinerary,
iname, suffix, "$Qd, $Rn, $imm", vpred_r, "$Rn = $Rn_src",
pattern> {
bits<4> Qd;
bits<4> Rn;
bits<2> imm;
let Inst{28} = 0b0;
let Inst{25-23} = 0b100;
let Inst{22} = Qd{3};
let Inst{21-20} = size;
let Inst{19-17} = Rn{3-1};
let Inst{16} = 0b1;
let Inst{15-13} = Qd{2-0};
let Inst{12} = bit_12;
let Inst{11-8} = 0b1111;
let Inst{7} = imm{1};
let Inst{6-1} = 0b110111;
let Inst{0} = imm{0};
let validForTailPredication = 1;
}
def MVE_VIDUPu8 : MVE_VxDUP<"vidup", "u8", 0b00, 0b0>;
def MVE_VIDUPu16 : MVE_VxDUP<"vidup", "u16", 0b01, 0b0>;
def MVE_VIDUPu32 : MVE_VxDUP<"vidup", "u32", 0b10, 0b0>;
def MVE_VDDUPu8 : MVE_VxDUP<"vddup", "u8", 0b00, 0b1>;
def MVE_VDDUPu16 : MVE_VxDUP<"vddup", "u16", 0b01, 0b1>;
def MVE_VDDUPu32 : MVE_VxDUP<"vddup", "u32", 0b10, 0b1>;
class MVE_VxWDUP<string iname, string suffix, bits<2> size, bit bit_12,
list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd, tGPREven:$Rn),
(ins tGPREven:$Rn_src, tGPROdd:$Rm, MVE_VIDUP_imm:$imm), NoItinerary,
iname, suffix, "$Qd, $Rn, $Rm, $imm", vpred_r, "$Rn = $Rn_src",
pattern> {
bits<4> Qd;
bits<4> Rm;
bits<4> Rn;
bits<2> imm;
let Inst{28} = 0b0;
let Inst{25-23} = 0b100;
let Inst{22} = Qd{3};
let Inst{21-20} = size;
let Inst{19-17} = Rn{3-1};
let Inst{16} = 0b1;
let Inst{15-13} = Qd{2-0};
let Inst{12} = bit_12;
let Inst{11-8} = 0b1111;
let Inst{7} = imm{1};
let Inst{6-4} = 0b110;
let Inst{3-1} = Rm{3-1};
let Inst{0} = imm{0};
let validForTailPredication = 1;
}
def MVE_VIWDUPu8 : MVE_VxWDUP<"viwdup", "u8", 0b00, 0b0>;
def MVE_VIWDUPu16 : MVE_VxWDUP<"viwdup", "u16", 0b01, 0b0>;
def MVE_VIWDUPu32 : MVE_VxWDUP<"viwdup", "u32", 0b10, 0b0>;
def MVE_VDWDUPu8 : MVE_VxWDUP<"vdwdup", "u8", 0b00, 0b1>;
def MVE_VDWDUPu16 : MVE_VxWDUP<"vdwdup", "u16", 0b01, 0b1>;
def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>;
let hasSideEffects = 1 in
class MVE_VCTPInst<string suffix, bits<2> size, list<dag> pattern=[]>
: MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", suffix,
"$Rn", vpred_n, "", pattern> {
bits<4> Rn;
let Inst{28-27} = 0b10;
let Inst{26-22} = 0b00000;
let Inst{21-20} = size;
let Inst{19-16} = Rn{3-0};
let Inst{15-11} = 0b11101;
let Inst{10-0} = 0b00000000001;
let Unpredictable{10-0} = 0b11111111111;
let Constraints = "";
let DecoderMethod = "DecodeMveVCTP";
let validForTailPredication = 1;
}
multiclass MVE_VCTP<MVEVectorVTInfo VTI, Intrinsic intr> {
def "": MVE_VCTPInst<VTI.BitsSuffix, VTI.Size>;
let Predicates = [HasMVEInt] in {
def : Pat<(intr rGPR:$Rn),
(VTI.Pred (!cast<Instruction>(NAME) rGPR:$Rn))>;
def : Pat<(and (intr rGPR:$Rn), (VTI.Pred VCCR:$mask)),
(VTI.Pred (!cast<Instruction>(NAME) rGPR:$Rn, ARMVCCThen, VCCR:$mask))>;
}
}
defm MVE_VCTP8 : MVE_VCTP<MVE_v16i8, int_arm_mve_vctp8>;
defm MVE_VCTP16 : MVE_VCTP<MVE_v8i16, int_arm_mve_vctp16>;
defm MVE_VCTP32 : MVE_VCTP<MVE_v4i32, int_arm_mve_vctp32>;
defm MVE_VCTP64 : MVE_VCTP<MVE_v2i64, int_arm_mve_vctp64>;
// end of mve_qDest_rSrc
// start of coproc mov
class MVE_VMOV_64bit<dag oops, dag iops, bit to_qreg, string ops, string cstr>
: MVE_VMOV_lane_base<oops, !con(iops, (ins MVEPairVectorIndex2:$idx,
MVEPairVectorIndex0:$idx2)),
NoItinerary, "vmov", "", ops, cstr, []> {
bits<5> Rt;
bits<5> Rt2;
bits<4> Qd;
bit idx;
bit idx2;
let Inst{31-23} = 0b111011000;
let Inst{22} = Qd{3};
let Inst{21} = 0b0;
let Inst{20} = to_qreg;
let Inst{19-16} = Rt2{3-0};
let Inst{15-13} = Qd{2-0};
let Inst{12-5} = 0b01111000;
let Inst{4} = idx2;
let Inst{3-0} = Rt{3-0};
}
// The assembly syntax for these instructions mentions the vector
// register name twice, e.g.
//
// vmov q2[2], q2[0], r0, r1
// vmov r0, r1, q2[2], q2[0]
//
// which needs a bit of juggling with MC operand handling.
//
// For the move _into_ a vector register, the MC operand list also has
// to mention the register name twice: once as the output, and once as
// an extra input to represent where the unchanged half of the output
// register comes from (when this instruction is used in code
// generation). So we arrange that the first mention of the vector reg
// in the instruction is considered by the AsmMatcher to be the output
// ($Qd), and the second one is the input ($QdSrc). Binding them
// together with the existing 'tie' constraint is enough to enforce at
// register allocation time that they have to be the same register.
//
// For the move _from_ a vector register, there's no way to get round
// the fact that both instances of that register name have to be
// inputs. They have to be the same register again, but this time, we
// can't use a tie constraint, because that has to be between an
// output and an input operand. So this time, we have to arrange that
// the q-reg appears just once in the MC operand list, in spite of
// being mentioned twice in the asm syntax - which needs a custom
// AsmMatchConverter.
def MVE_VMOV_q_rr : MVE_VMOV_64bit<(outs MQPR:$Qd),
(ins MQPR:$QdSrc, rGPR:$Rt, rGPR:$Rt2),
0b1, "$Qd$idx, $QdSrc$idx2, $Rt, $Rt2",
"$Qd = $QdSrc"> {
let DecoderMethod = "DecodeMVEVMOVDRegtoQ";
}
def MVE_VMOV_rr_q : MVE_VMOV_64bit<(outs rGPR:$Rt, rGPR:$Rt2), (ins MQPR:$Qd),
0b0, "$Rt, $Rt2, $Qd$idx, $Qd$idx2", ""> {
let DecoderMethod = "DecodeMVEVMOVQtoDReg";
let AsmMatchConverter = "cvtMVEVMOVQtoDReg";
}
// end of coproc mov
// start of MVE interleaving load/store
// Base class for the family of interleaving/deinterleaving
// load/stores with names like VLD20.8 and VST43.32.
class MVE_vldst24_base<bit writeback, bit fourregs, bits<2> stage, bits<2> size,
bit load, dag Oops, dag loadIops, dag wbIops,
string iname, string ops,
string cstr, list<dag> pattern=[]>
: MVE_MI<Oops, !con(loadIops, wbIops), NoItinerary, iname, ops, cstr, pattern> {
bits<4> VQd;
bits<4> Rn;
let Inst{31-22} = 0b1111110010;
let Inst{21} = writeback;
let Inst{20} = load;
let Inst{19-16} = Rn;
let Inst{15-13} = VQd{2-0};
let Inst{12-9} = 0b1111;
let Inst{8-7} = size;
let Inst{6-5} = stage;
let Inst{4-1} = 0b0000;
let Inst{0} = fourregs;
let mayLoad = load;
let mayStore = !eq(load,0);
}
// A parameter class used to encapsulate all the ways the writeback
// variants of VLD20 and friends differ from the non-writeback ones.
class MVE_vldst24_writeback<bit b, dag Oo, dag Io,
string sy="", string c="", string n=""> {
bit writeback = b;
dag Oops = Oo;
dag Iops = Io;
string syntax = sy;
string cstr = c;
string id_suffix = n;
}
// Another parameter class that encapsulates the differences between VLD2x
// and VLD4x.
class MVE_vldst24_nvecs<int n, list<int> s, bit b, RegisterOperand vl> {
int nvecs = n;
list<int> stages = s;
bit bit0 = b;
RegisterOperand VecList = vl;
}
// A third parameter class that distinguishes VLDnn.8 from .16 from .32.
class MVE_vldst24_lanesize<int i, bits<2> b> {
int lanesize = i;
bits<2> sizebits = b;
}
// A base class for each direction of transfer: one for load, one for
// store. I can't make these a fourth independent parametric tuple
// class, because they have to take the nvecs tuple class as a
// parameter, in order to find the right VecList operand type.
class MVE_vld24_base<MVE_vldst24_nvecs n, bits<2> pat, bits<2> size,
MVE_vldst24_writeback wb, string iname,
list<dag> pattern=[]>
: MVE_vldst24_base<wb.writeback, n.bit0, pat, size, 1,
!con((outs n.VecList:$VQd), wb.Oops),
(ins n.VecList:$VQdSrc), wb.Iops,
iname, "$VQd, $Rn" # wb.syntax,
wb.cstr # ",$VQdSrc = $VQd", pattern>;
class MVE_vst24_base<MVE_vldst24_nvecs n, bits<2> pat, bits<2> size,
MVE_vldst24_writeback wb, string iname,
list<dag> pattern=[]>
: MVE_vldst24_base<wb.writeback, n.bit0, pat, size, 0,
wb.Oops, (ins n.VecList:$VQd), wb.Iops,
iname, "$VQd, $Rn" # wb.syntax,
wb.cstr, pattern>;
// Actually define all the interleaving loads and stores, by a series
// of nested foreaches over number of vectors (VLD2/VLD4); stage
// within one of those series (VLDx0/VLDx1/VLDx2/VLDx3); size of
// vector lane; writeback or no writeback.
foreach n = [MVE_vldst24_nvecs<2, [0,1], 0, VecList2Q>,
MVE_vldst24_nvecs<4, [0,1,2,3], 1, VecList4Q>] in
foreach stage = n.stages in
foreach s = [MVE_vldst24_lanesize< 8, 0b00>,
MVE_vldst24_lanesize<16, 0b01>,
MVE_vldst24_lanesize<32, 0b10>] in
foreach wb = [MVE_vldst24_writeback<
1, (outs rGPR:$wb), (ins t2_nosp_addr_offset_none:$Rn),
"!", "$Rn.base = $wb", "_wb">,
MVE_vldst24_writeback<0, (outs), (ins t2_addr_offset_none:$Rn)>] in {
// For each case within all of those foreaches, define the actual
// instructions. The def names are made by gluing together pieces
// from all the parameter classes, and will end up being things like
// MVE_VLD20_8 and MVE_VST43_16_wb.
def "MVE_VLD" # n.nvecs # stage # "_" # s.lanesize # wb.id_suffix
: MVE_vld24_base<n, stage, s.sizebits, wb,
"vld" # n.nvecs # stage # "." # s.lanesize>;
def "MVE_VST" # n.nvecs # stage # "_" # s.lanesize # wb.id_suffix
: MVE_vst24_base<n, stage, s.sizebits, wb,
"vst" # n.nvecs # stage # "." # s.lanesize>;
}
[ARM] Add IR intrinsics for MVE VLD[24] and VST[24]. The VST2 and VST4 instructions take two or four vector registers as input, and store part of each register to memory in an interleaved pattern. They come in variants indicating which part of each register they store (VST20 and VST21; VST40 to VST43 inclusive); the intention is that issuing each of those variants in turn has the combined effect of loading or storing the whole set of registers to a memory block of equal size. The corresponding VLD2 and VLD4 instructions load from memory in the same interleaved format: each one overwrites only part of its output register set, and again, the idea is that if you use VLD4{0,1,2,3} or VLD2{0,1} together, you end up having written to the whole of each register. I've implemented the stores and loads quite differently. The loads were easiest to implement as a single intrinsic that expands to all four VLD4x instructions or both VLD2x, delivering four complete output registers. (Implementing each individual load as a separate instruction taking four input registers to partially overwrite is possible in theory, but pointless, and when I tried it, I found it would need extra work to get the register allocation not to be horrible.) Since that intrinsic delivers multiple outputs, it has to be instruction-selected in custom C++. But the store instructions are easier to model individually, because they don't overwrite any register at all and you can write a DAG Isel pattern in Tablegen for each one. Hence, my new intrinsic `int_arm_mve_vld4q` expands to four load instructions, delivers four full output vectors, and is handled by C++ code, whereas `int_arm_mve_vst4q` expands to just one store instruction, takes four input vectors and a constant indicating which lanes to store, and is handled entirely in Tablegen. (And similarly for vld2q/vst2q.) This is asymmetric, but it was the easiest way to do each one. Reviewers: dmgreen, miyuki, ostannard Subscribers: kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D68700
2019-10-08 00:03:46 +08:00
multiclass MVE_vst24_patterns<int lanesize, ValueType VT> {
foreach stage = [0,1] in
def : Pat<(int_arm_mve_vst2q i32:$addr,
(VT MQPR:$v0), (VT MQPR:$v1), (i32 stage)),
(!cast<Instruction>("MVE_VST2"#stage#"_"#lanesize)
(REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1),
t2_addr_offset_none:$addr)>;
foreach stage = [0,1,2,3] in
def : Pat<(int_arm_mve_vst4q i32:$addr,
(VT MQPR:$v0), (VT MQPR:$v1),
(VT MQPR:$v2), (VT MQPR:$v3), (i32 stage)),
(!cast<Instruction>("MVE_VST4"#stage#"_"#lanesize)
(REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1,
VT:$v2, qsub_2, VT:$v3, qsub_3),
t2_addr_offset_none:$addr)>;
}
defm : MVE_vst24_patterns<8, v16i8>;
defm : MVE_vst24_patterns<16, v8i16>;
defm : MVE_vst24_patterns<32, v4i32>;
defm : MVE_vst24_patterns<16, v8f16>;
defm : MVE_vst24_patterns<32, v4f32>;
// end of MVE interleaving load/store
[ARM] Add MVE vector load/store instructions. This adds the rest of the vector memory access instructions. It includes contiguous loads/stores, with an ordinary addressing mode such as [r0,#offset] (plus writeback variants); gather loads and scatter stores with a scalar base address register and a vector of offsets from it (written [r0,q1] or similar); and gather/scatters with a vector of base addresses (written [q0,#offset], again with writeback). Additionally, some of the loads can widen each loaded value into a larger vector lane, and the corresponding stores narrow them again. To implement these, we also have to add the addressing modes they need. Also, in AsmParser, the `isMem` query function now has subqueries `isGPRMem` and `isMVEMem`, according to which kind of base register is used by a given memory access operand. I've also had to add an extra check in `checkTargetMatchPredicate` in the AsmParser, without which our last-minute check of `rGPR` register operands against SP and PC was failing an assertion because Tablegen had inserted an immediate 0 in place of one of a pair of tied register operands. (This matches the way the corresponding check for `MCK_rGPR` in `validateTargetOperandClass` is guarded.) Apparently the MVE load instructions were the first to have ever triggered this assertion, but I think only because they were the first to have a combination of the usual Arm pre/post writeback system and the `rGPR` class in particular. Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62680 llvm-svn: 364291
2019-06-25 19:24:18 +08:00
// start of MVE predicable load/store
// A parameter class for the direction of transfer.
class MVE_ldst_direction<bit b, dag Oo, dag Io, string c=""> {
bit load = b;
dag Oops = Oo;
dag Iops = Io;
string cstr = c;
}
def MVE_ld: MVE_ldst_direction<1, (outs MQPR:$Qd), (ins), ",@earlyclobber $Qd">;
def MVE_st: MVE_ldst_direction<0, (outs), (ins MQPR:$Qd)>;
// A parameter class for the size of memory access in a load.
class MVE_memsz<bits<2> e, int s, AddrMode m, string mn, list<string> types> {
bits<2> encoding = e; // opcode bit(s) for encoding
int shift = s; // shift applied to immediate load offset
AddrMode AM = m;
// For instruction aliases: define the complete list of type
// suffixes at this size, and the canonical ones for loads and
// stores.
string MnemonicLetter = mn;
int TypeBits = !shl(8, s);
string CanonLoadSuffix = ".u" # TypeBits;
string CanonStoreSuffix = "." # TypeBits;
list<string> suffixes = !foreach(letter, types, "." # letter # TypeBits);
}
// Instances of MVE_memsz.
//
// (memD doesn't need an AddrMode, because those are only for
// contiguous loads, and memD is only used by gather/scatters.)
def MVE_memB: MVE_memsz<0b00, 0, AddrModeT2_i7, "b", ["", "u", "s"]>;
def MVE_memH: MVE_memsz<0b01, 1, AddrModeT2_i7s2, "h", ["", "u", "s", "f"]>;
def MVE_memW: MVE_memsz<0b10, 2, AddrModeT2_i7s4, "w", ["", "u", "s", "f"]>;
def MVE_memD: MVE_memsz<0b11, 3, ?, "d", ["", "u", "s", "f"]>;
// This is the base class for all the MVE loads and stores other than
// the interleaving ones. All the non-interleaving loads/stores share
// the characteristic that they operate on just one vector register,
// so they are VPT-predicable.
//
// The predication operand is vpred_n, for both loads and stores. For
// store instructions, the reason is obvious: if there is no output
// register, there can't be a need for an input parameter giving the
// output register's previous value. Load instructions also don't need
// that input parameter, because unlike MVE data processing
// instructions, predicated loads are defined to set the inactive
// lanes of the output register to zero, instead of preserving their
// input values.
class MVE_VLDRSTR_base<MVE_ldst_direction dir, bit U, bit P, bit W, bit opc,
dag oops, dag iops, string asm, string suffix,
string ops, string cstr, list<dag> pattern=[]>
: MVE_p<oops, iops, NoItinerary, asm, suffix, ops, vpred_n, cstr, pattern> {
bits<3> Qd;
let Inst{28} = U;
let Inst{25} = 0b0;
let Inst{24} = P;
let Inst{22} = 0b0;
let Inst{21} = W;
let Inst{20} = dir.load;
let Inst{15-13} = Qd{2-0};
let Inst{12} = opc;
let Inst{11-9} = 0b111;
let mayLoad = dir.load;
let mayStore = !eq(dir.load,0);
let validForTailPredication = 1;
[ARM] Add MVE vector load/store instructions. This adds the rest of the vector memory access instructions. It includes contiguous loads/stores, with an ordinary addressing mode such as [r0,#offset] (plus writeback variants); gather loads and scatter stores with a scalar base address register and a vector of offsets from it (written [r0,q1] or similar); and gather/scatters with a vector of base addresses (written [q0,#offset], again with writeback). Additionally, some of the loads can widen each loaded value into a larger vector lane, and the corresponding stores narrow them again. To implement these, we also have to add the addressing modes they need. Also, in AsmParser, the `isMem` query function now has subqueries `isGPRMem` and `isMVEMem`, according to which kind of base register is used by a given memory access operand. I've also had to add an extra check in `checkTargetMatchPredicate` in the AsmParser, without which our last-minute check of `rGPR` register operands against SP and PC was failing an assertion because Tablegen had inserted an immediate 0 in place of one of a pair of tied register operands. (This matches the way the corresponding check for `MCK_rGPR` in `validateTargetOperandClass` is guarded.) Apparently the MVE load instructions were the first to have ever triggered this assertion, but I think only because they were the first to have a combination of the usual Arm pre/post writeback system and the `rGPR` class in particular. Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62680 llvm-svn: 364291
2019-06-25 19:24:18 +08:00
}
// Contiguous load and store instructions. These come in two main
// categories: same-size loads/stores in which 128 bits of vector
// register is transferred to or from 128 bits of memory in the most
// obvious way, and widening loads / narrowing stores, in which the
// size of memory accessed is less than the size of a vector register,
// so the load instructions sign- or zero-extend each memory value
// into a wider vector lane, and the store instructions truncate
// correspondingly.
//
// The instruction mnemonics for these two classes look reasonably
// similar, but the actual encodings are different enough to need two
// separate base classes.
// Contiguous, same size
class MVE_VLDRSTR_cs<MVE_ldst_direction dir, MVE_memsz memsz, bit P, bit W,
dag oops, dag iops, string asm, string suffix,
IndexMode im, string ops, string cstr>
: MVE_VLDRSTR_base<dir, 0, P, W, 1, oops, iops, asm, suffix, ops, cstr> {
bits<12> addr;
let Inst{23} = addr{7};
let Inst{19-16} = addr{11-8};
let Inst{8-7} = memsz.encoding;
let Inst{6-0} = addr{6-0};
}
// Contiguous, widening/narrowing
class MVE_VLDRSTR_cw<MVE_ldst_direction dir, MVE_memsz memsz, bit U,
bit P, bit W, bits<2> size, dag oops, dag iops,
string asm, string suffix, IndexMode im,
string ops, string cstr>
: MVE_VLDRSTR_base<dir, U, P, W, 0, oops, iops, asm, suffix, ops, cstr> {
bits<11> addr;
let Inst{23} = addr{7};
let Inst{19} = memsz.encoding{0}; // enough to tell 16- from 32-bit
let Inst{18-16} = addr{10-8};
let Inst{8-7} = size;
let Inst{6-0} = addr{6-0};
let IM = im;
}
// Multiclass wrapper on each of the _cw and _cs base classes, to
// generate three writeback modes (none, preindex, postindex).
multiclass MVE_VLDRSTR_cw_m<MVE_ldst_direction dir, MVE_memsz memsz,
string asm, string suffix, bit U, bits<2> size> {
let AM = memsz.AM in {
def "" : MVE_VLDRSTR_cw<
dir, memsz, U, 1, 0, size,
dir.Oops, !con(dir.Iops, (ins taddrmode_imm7<memsz.shift>:$addr)),
asm, suffix, IndexModeNone, "$Qd, $addr", "">;
def _pre : MVE_VLDRSTR_cw<
dir, memsz, U, 1, 1, size,
!con((outs tGPR:$wb), dir.Oops),
!con(dir.Iops, (ins taddrmode_imm7<memsz.shift>:$addr)),
asm, suffix, IndexModePre, "$Qd, $addr!", "$addr.base = $wb"> {
let DecoderMethod = "DecodeMVE_MEM_1_pre<"#memsz.shift#">";
}
def _post : MVE_VLDRSTR_cw<
dir, memsz, U, 0, 1, size,
!con((outs tGPR:$wb), dir.Oops),
!con(dir.Iops, (ins t_addr_offset_none:$Rn,
t2am_imm7_offset<memsz.shift>:$addr)),
asm, suffix, IndexModePost, "$Qd, $Rn$addr", "$Rn.base = $wb"> {
bits<4> Rn;
let Inst{18-16} = Rn{2-0};
}
}
}
multiclass MVE_VLDRSTR_cs_m<MVE_ldst_direction dir, MVE_memsz memsz,
string asm, string suffix> {
let AM = memsz.AM in {
def "" : MVE_VLDRSTR_cs<
dir, memsz, 1, 0,
dir.Oops, !con(dir.Iops, (ins t2addrmode_imm7<memsz.shift>:$addr)),
asm, suffix, IndexModeNone, "$Qd, $addr", "">;
def _pre : MVE_VLDRSTR_cs<
dir, memsz, 1, 1,
!con((outs rGPR:$wb), dir.Oops),
!con(dir.Iops, (ins t2addrmode_imm7_pre<memsz.shift>:$addr)),
asm, suffix, IndexModePre, "$Qd, $addr!", "$addr.base = $wb"> {
let DecoderMethod = "DecodeMVE_MEM_2_pre<"#memsz.shift#">";
}
def _post : MVE_VLDRSTR_cs<
dir, memsz, 0, 1,
!con((outs rGPR:$wb), dir.Oops),
// We need an !if here to select the base register class,
// because it's legal to write back to SP in a load of this
// type, but not in a store.
!con(dir.Iops, (ins !if(dir.load, t2_addr_offset_none,
t2_nosp_addr_offset_none):$Rn,
t2am_imm7_offset<memsz.shift>:$addr)),
asm, suffix, IndexModePost, "$Qd, $Rn$addr", "$Rn.base = $wb"> {
bits<4> Rn;
let Inst{19-16} = Rn{3-0};
}
}
}
// Now actually declare all the contiguous load/stores, via those
// multiclasses. The instruction ids coming out of this are the bare
// names shown in the defm, with _pre or _post appended for writeback,
// e.g. MVE_VLDRBS16, MVE_VSTRB16_pre, MVE_VSTRHU16_post.
defm MVE_VLDRBS16: MVE_VLDRSTR_cw_m<MVE_ld, MVE_memB, "vldrb", "s16", 0, 0b01>;
defm MVE_VLDRBS32: MVE_VLDRSTR_cw_m<MVE_ld, MVE_memB, "vldrb", "s32", 0, 0b10>;
defm MVE_VLDRBU16: MVE_VLDRSTR_cw_m<MVE_ld, MVE_memB, "vldrb", "u16", 1, 0b01>;
defm MVE_VLDRBU32: MVE_VLDRSTR_cw_m<MVE_ld, MVE_memB, "vldrb", "u32", 1, 0b10>;
defm MVE_VLDRHS32: MVE_VLDRSTR_cw_m<MVE_ld, MVE_memH, "vldrh", "s32", 0, 0b10>;
defm MVE_VLDRHU32: MVE_VLDRSTR_cw_m<MVE_ld, MVE_memH, "vldrh", "u32", 1, 0b10>;
defm MVE_VLDRBU8: MVE_VLDRSTR_cs_m<MVE_ld, MVE_memB, "vldrb", "u8">;
defm MVE_VLDRHU16: MVE_VLDRSTR_cs_m<MVE_ld, MVE_memH, "vldrh", "u16">;
defm MVE_VLDRWU32: MVE_VLDRSTR_cs_m<MVE_ld, MVE_memW, "vldrw", "u32">;
defm MVE_VSTRB16: MVE_VLDRSTR_cw_m<MVE_st, MVE_memB, "vstrb", "16", 0, 0b01>;
defm MVE_VSTRB32: MVE_VLDRSTR_cw_m<MVE_st, MVE_memB, "vstrb", "32", 0, 0b10>;
defm MVE_VSTRH32: MVE_VLDRSTR_cw_m<MVE_st, MVE_memH, "vstrh", "32", 0, 0b10>;
defm MVE_VSTRBU8 : MVE_VLDRSTR_cs_m<MVE_st, MVE_memB, "vstrb", "8">;
defm MVE_VSTRHU16: MVE_VLDRSTR_cs_m<MVE_st, MVE_memH, "vstrh", "16">;
defm MVE_VSTRWU32: MVE_VLDRSTR_cs_m<MVE_st, MVE_memW, "vstrw", "32">;
// Gather loads / scatter stores whose address operand is of the form
// [Rn,Qm], i.e. a single GPR as the common base address, plus a
// vector of offset from it. ('Load/store this sequence of elements of
// the same array.')
//
// Like the contiguous family, these loads and stores can widen the
// loaded values / truncate the stored ones, or they can just
// load/store the same size of memory and vector lane. But unlike the
// contiguous family, there's no particular difference in encoding
// between those two cases.
//
// This family also comes with the option to scale the offset values
// in Qm by the size of the loaded memory (i.e. to treat them as array
// indices), or not to scale them (to treat them as plain byte offsets
// in memory, so that perhaps the loaded values are unaligned). The
// scaled instructions' address operand in assembly looks like
// [Rn,Qm,UXTW #2] or similar.
// Base class.
class MVE_VLDRSTR_rq<MVE_ldst_direction dir, MVE_memsz memsz, bit U,
bits<2> size, bit os, string asm, string suffix, int shift>
: MVE_VLDRSTR_base<dir, U, 0b0, 0b0, 0, dir.Oops,
!con(dir.Iops, (ins mve_addr_rq_shift<shift>:$addr)),
asm, suffix, "$Qd, $addr", dir.cstr> {
bits<7> addr;
let Inst{23} = 0b1;
let Inst{19-16} = addr{6-3};
let Inst{8-7} = size;
let Inst{6} = memsz.encoding{1};
let Inst{5} = 0;
let Inst{4} = memsz.encoding{0};
let Inst{3-1} = addr{2-0};
let Inst{0} = os;
}
// Multiclass that defines the scaled and unscaled versions of an
// instruction, when the memory size is wider than a byte. The scaled
// version gets the default name like MVE_VLDRBU16_rq; the unscaled /
// potentially unaligned version gets a "_u" suffix, e.g.
// MVE_VLDRBU16_rq_u.
multiclass MVE_VLDRSTR_rq_w<MVE_ldst_direction dir, MVE_memsz memsz,
string asm, string suffix, bit U, bits<2> size> {
def _u : MVE_VLDRSTR_rq<dir, memsz, U, size, 0, asm, suffix, 0>;
def "" : MVE_VLDRSTR_rq<dir, memsz, U, size, 1, asm, suffix, memsz.shift>;
}
// Subclass of MVE_VLDRSTR_rq with the same API as that multiclass,
// for use when the memory size is one byte, so there's no 'scaled'
// version of the instruction at all. (This is encoded as if it were
// unscaled, but named in the default way with no _u suffix.)
class MVE_VLDRSTR_rq_b<MVE_ldst_direction dir, MVE_memsz memsz,
string asm, string suffix, bit U, bits<2> size>
: MVE_VLDRSTR_rq<dir, memsz, U, size, 0, asm, suffix, 0>;
[ARM,MVE] Add intrinsics for gather/scatter load/stores. This patch adds two new families of intrinsics, both of which are memory accesses taking a vector of locations to load from / store to. The vldrq_gather_base / vstrq_scatter_base intrinsics take a vector of base addresses, and an immediate offset to be added consistently to each one. vldrq_gather_offset / vstrq_scatter_offset take a scalar base address, and a vector of offsets to add to it. The 'shifted_offset' variants also multiply each offset by the element size type, so that the vector is effectively of array indices. At the IR level, these operations are represented by a single set of four IR intrinsics: {gather,scatter} × {base,offset}. The other details (signed/unsigned, shift, and memory element size as opposed to vector element size) are all specified by IR intrinsic polymorphism and immediate operands, because that made the selection job easier than making a huge family of similarly named intrinsics. I considered using the standard IR representations such as llvm.masked.gather, but they're not a good fit. In order to use llvm.masked.gather to represent a gather_offset load with element size smaller than a pointer, you'd have to expand the <8 x i16> vector of offsets into an <8 x i16*> vector of pointers, which would be split up during legalization, so you'd spend most of your time undoing the mess it had made. Also, ISel support for llvm.masked.gather would be easy enough in a trivial way (you can expand it into a gather-base load with a zero immediate offset), but instruction-selecting lots of fiddly idioms back into all the _other_ MVE load instructions would be much more work. So I think dedicated IR intrinsics are the more sensible approach, at least for the moment. On the clang tablegen side, I've added two new features to the Tablegen source accepted by MveEmitter: a 'CopyKind' type node for defining a type that varies with the parameter type (it lets you ask for an unsigned integer type of the same width as the parameter), and an 'unsignedflag' value node for passing an immediate IR operand which is 0 for a signed integer type or 1 for an unsigned one. That lets me write each kind of intrinsic just once and get all its subtypes and immediate arguments generated automatically. Also I've tweaked the handling of pointer-typed values in the code generation part of MveEmitter: they're generated as Address rather than Value (i.e. including an alignment) so that they can be given to the ordinary IR load and store operations, but I'd omitted the code to convert them back to Value when they're going to be used as an argument to an IR intrinsic. On the MC side, I've enhanced MVEVectorVTInfo so that it can tell you not only the full assembly-language suffix for a given vector type (like 's32' or 'u16') but also the numeric-only one used by store instructions (just '32' or '16'). Reviewers: dmgreen Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D69791
2019-11-01 01:02:07 +08:00
// Multiclasses wrapping that to add ISel patterns for intrinsics.
multiclass MVE_VLDR_rq_w<MVE_memsz memsz, list<MVEVectorVTInfo> VTIs> {
defm "": MVE_VLDRSTR_rq_w<MVE_ld, memsz, "vldr" # memsz.MnemonicLetter,
VTIs[0].Suffix, VTIs[0].Unsigned, VTIs[0].Size>;
foreach VTI = VTIs in
foreach UnsignedFlag = !if(!eq(VTI.Size, memsz.encoding),
[0,1], [VTI.Unsigned]) in {
def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, 0, UnsignedFlag)),
(VTI.Vec (!cast<Instruction>(NAME#"_u") GPR:$base, MQPR:$offsets))>;
def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, memsz.shift, UnsignedFlag)),
(VTI.Vec (!cast<Instruction>(NAME) GPR:$base, MQPR:$offsets))>;
def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, 0, UnsignedFlag, (VTI.Pred VCCR:$pred))),
(VTI.Vec (!cast<Instruction>(NAME#"_u") GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred))>;
[ARM,MVE] Add intrinsics for gather/scatter load/stores. This patch adds two new families of intrinsics, both of which are memory accesses taking a vector of locations to load from / store to. The vldrq_gather_base / vstrq_scatter_base intrinsics take a vector of base addresses, and an immediate offset to be added consistently to each one. vldrq_gather_offset / vstrq_scatter_offset take a scalar base address, and a vector of offsets to add to it. The 'shifted_offset' variants also multiply each offset by the element size type, so that the vector is effectively of array indices. At the IR level, these operations are represented by a single set of four IR intrinsics: {gather,scatter} × {base,offset}. The other details (signed/unsigned, shift, and memory element size as opposed to vector element size) are all specified by IR intrinsic polymorphism and immediate operands, because that made the selection job easier than making a huge family of similarly named intrinsics. I considered using the standard IR representations such as llvm.masked.gather, but they're not a good fit. In order to use llvm.masked.gather to represent a gather_offset load with element size smaller than a pointer, you'd have to expand the <8 x i16> vector of offsets into an <8 x i16*> vector of pointers, which would be split up during legalization, so you'd spend most of your time undoing the mess it had made. Also, ISel support for llvm.masked.gather would be easy enough in a trivial way (you can expand it into a gather-base load with a zero immediate offset), but instruction-selecting lots of fiddly idioms back into all the _other_ MVE load instructions would be much more work. So I think dedicated IR intrinsics are the more sensible approach, at least for the moment. On the clang tablegen side, I've added two new features to the Tablegen source accepted by MveEmitter: a 'CopyKind' type node for defining a type that varies with the parameter type (it lets you ask for an unsigned integer type of the same width as the parameter), and an 'unsignedflag' value node for passing an immediate IR operand which is 0 for a signed integer type or 1 for an unsigned one. That lets me write each kind of intrinsic just once and get all its subtypes and immediate arguments generated automatically. Also I've tweaked the handling of pointer-typed values in the code generation part of MveEmitter: they're generated as Address rather than Value (i.e. including an alignment) so that they can be given to the ordinary IR load and store operations, but I'd omitted the code to convert them back to Value when they're going to be used as an argument to an IR intrinsic. On the MC side, I've enhanced MVEVectorVTInfo so that it can tell you not only the full assembly-language suffix for a given vector type (like 's32' or 'u16') but also the numeric-only one used by store instructions (just '32' or '16'). Reviewers: dmgreen Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D69791
2019-11-01 01:02:07 +08:00
def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, memsz.shift, UnsignedFlag, (VTI.Pred VCCR:$pred))),
(VTI.Vec (!cast<Instruction>(NAME) GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred))>;
[ARM,MVE] Add intrinsics for gather/scatter load/stores. This patch adds two new families of intrinsics, both of which are memory accesses taking a vector of locations to load from / store to. The vldrq_gather_base / vstrq_scatter_base intrinsics take a vector of base addresses, and an immediate offset to be added consistently to each one. vldrq_gather_offset / vstrq_scatter_offset take a scalar base address, and a vector of offsets to add to it. The 'shifted_offset' variants also multiply each offset by the element size type, so that the vector is effectively of array indices. At the IR level, these operations are represented by a single set of four IR intrinsics: {gather,scatter} × {base,offset}. The other details (signed/unsigned, shift, and memory element size as opposed to vector element size) are all specified by IR intrinsic polymorphism and immediate operands, because that made the selection job easier than making a huge family of similarly named intrinsics. I considered using the standard IR representations such as llvm.masked.gather, but they're not a good fit. In order to use llvm.masked.gather to represent a gather_offset load with element size smaller than a pointer, you'd have to expand the <8 x i16> vector of offsets into an <8 x i16*> vector of pointers, which would be split up during legalization, so you'd spend most of your time undoing the mess it had made. Also, ISel support for llvm.masked.gather would be easy enough in a trivial way (you can expand it into a gather-base load with a zero immediate offset), but instruction-selecting lots of fiddly idioms back into all the _other_ MVE load instructions would be much more work. So I think dedicated IR intrinsics are the more sensible approach, at least for the moment. On the clang tablegen side, I've added two new features to the Tablegen source accepted by MveEmitter: a 'CopyKind' type node for defining a type that varies with the parameter type (it lets you ask for an unsigned integer type of the same width as the parameter), and an 'unsignedflag' value node for passing an immediate IR operand which is 0 for a signed integer type or 1 for an unsigned one. That lets me write each kind of intrinsic just once and get all its subtypes and immediate arguments generated automatically. Also I've tweaked the handling of pointer-typed values in the code generation part of MveEmitter: they're generated as Address rather than Value (i.e. including an alignment) so that they can be given to the ordinary IR load and store operations, but I'd omitted the code to convert them back to Value when they're going to be used as an argument to an IR intrinsic. On the MC side, I've enhanced MVEVectorVTInfo so that it can tell you not only the full assembly-language suffix for a given vector type (like 's32' or 'u16') but also the numeric-only one used by store instructions (just '32' or '16'). Reviewers: dmgreen Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D69791
2019-11-01 01:02:07 +08:00
}
}
multiclass MVE_VLDR_rq_b<list<MVEVectorVTInfo> VTIs> {
def "": MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb",
VTIs[0].Suffix, VTIs[0].Unsigned, VTIs[0].Size>;
foreach VTI = VTIs in {
def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), 8, 0, VTI.Unsigned)),
(VTI.Vec (!cast<Instruction>(NAME) GPR:$base, MQPR:$offsets))>;
def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), 8, 0, VTI.Unsigned, (VTI.Pred VCCR:$pred))),
(VTI.Vec (!cast<Instruction>(NAME) GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred))>;
[ARM,MVE] Add intrinsics for gather/scatter load/stores. This patch adds two new families of intrinsics, both of which are memory accesses taking a vector of locations to load from / store to. The vldrq_gather_base / vstrq_scatter_base intrinsics take a vector of base addresses, and an immediate offset to be added consistently to each one. vldrq_gather_offset / vstrq_scatter_offset take a scalar base address, and a vector of offsets to add to it. The 'shifted_offset' variants also multiply each offset by the element size type, so that the vector is effectively of array indices. At the IR level, these operations are represented by a single set of four IR intrinsics: {gather,scatter} × {base,offset}. The other details (signed/unsigned, shift, and memory element size as opposed to vector element size) are all specified by IR intrinsic polymorphism and immediate operands, because that made the selection job easier than making a huge family of similarly named intrinsics. I considered using the standard IR representations such as llvm.masked.gather, but they're not a good fit. In order to use llvm.masked.gather to represent a gather_offset load with element size smaller than a pointer, you'd have to expand the <8 x i16> vector of offsets into an <8 x i16*> vector of pointers, which would be split up during legalization, so you'd spend most of your time undoing the mess it had made. Also, ISel support for llvm.masked.gather would be easy enough in a trivial way (you can expand it into a gather-base load with a zero immediate offset), but instruction-selecting lots of fiddly idioms back into all the _other_ MVE load instructions would be much more work. So I think dedicated IR intrinsics are the more sensible approach, at least for the moment. On the clang tablegen side, I've added two new features to the Tablegen source accepted by MveEmitter: a 'CopyKind' type node for defining a type that varies with the parameter type (it lets you ask for an unsigned integer type of the same width as the parameter), and an 'unsignedflag' value node for passing an immediate IR operand which is 0 for a signed integer type or 1 for an unsigned one. That lets me write each kind of intrinsic just once and get all its subtypes and immediate arguments generated automatically. Also I've tweaked the handling of pointer-typed values in the code generation part of MveEmitter: they're generated as Address rather than Value (i.e. including an alignment) so that they can be given to the ordinary IR load and store operations, but I'd omitted the code to convert them back to Value when they're going to be used as an argument to an IR intrinsic. On the MC side, I've enhanced MVEVectorVTInfo so that it can tell you not only the full assembly-language suffix for a given vector type (like 's32' or 'u16') but also the numeric-only one used by store instructions (just '32' or '16'). Reviewers: dmgreen Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D69791
2019-11-01 01:02:07 +08:00
}
}
multiclass MVE_VSTR_rq_w<MVE_memsz memsz, list<MVEVectorVTInfo> VTIs> {
defm "": MVE_VLDRSTR_rq_w<MVE_st, memsz, "vstr" # memsz.MnemonicLetter,
VTIs[0].BitsSuffix, 0, VTIs[0].Size>;
foreach VTI = VTIs in {
def : Pat<(int_arm_mve_vstr_scatter_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, 0),
(!cast<Instruction>(NAME#"_u") MQPR:$data, GPR:$base, MQPR:$offsets)>;
def : Pat<(int_arm_mve_vstr_scatter_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, memsz.shift),
(!cast<Instruction>(NAME) MQPR:$data, GPR:$base, MQPR:$offsets)>;
def : Pat<(int_arm_mve_vstr_scatter_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, 0, (VTI.Pred VCCR:$pred)),
(!cast<Instruction>(NAME#"_u") MQPR:$data, GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred)>;
[ARM,MVE] Add intrinsics for gather/scatter load/stores. This patch adds two new families of intrinsics, both of which are memory accesses taking a vector of locations to load from / store to. The vldrq_gather_base / vstrq_scatter_base intrinsics take a vector of base addresses, and an immediate offset to be added consistently to each one. vldrq_gather_offset / vstrq_scatter_offset take a scalar base address, and a vector of offsets to add to it. The 'shifted_offset' variants also multiply each offset by the element size type, so that the vector is effectively of array indices. At the IR level, these operations are represented by a single set of four IR intrinsics: {gather,scatter} × {base,offset}. The other details (signed/unsigned, shift, and memory element size as opposed to vector element size) are all specified by IR intrinsic polymorphism and immediate operands, because that made the selection job easier than making a huge family of similarly named intrinsics. I considered using the standard IR representations such as llvm.masked.gather, but they're not a good fit. In order to use llvm.masked.gather to represent a gather_offset load with element size smaller than a pointer, you'd have to expand the <8 x i16> vector of offsets into an <8 x i16*> vector of pointers, which would be split up during legalization, so you'd spend most of your time undoing the mess it had made. Also, ISel support for llvm.masked.gather would be easy enough in a trivial way (you can expand it into a gather-base load with a zero immediate offset), but instruction-selecting lots of fiddly idioms back into all the _other_ MVE load instructions would be much more work. So I think dedicated IR intrinsics are the more sensible approach, at least for the moment. On the clang tablegen side, I've added two new features to the Tablegen source accepted by MveEmitter: a 'CopyKind' type node for defining a type that varies with the parameter type (it lets you ask for an unsigned integer type of the same width as the parameter), and an 'unsignedflag' value node for passing an immediate IR operand which is 0 for a signed integer type or 1 for an unsigned one. That lets me write each kind of intrinsic just once and get all its subtypes and immediate arguments generated automatically. Also I've tweaked the handling of pointer-typed values in the code generation part of MveEmitter: they're generated as Address rather than Value (i.e. including an alignment) so that they can be given to the ordinary IR load and store operations, but I'd omitted the code to convert them back to Value when they're going to be used as an argument to an IR intrinsic. On the MC side, I've enhanced MVEVectorVTInfo so that it can tell you not only the full assembly-language suffix for a given vector type (like 's32' or 'u16') but also the numeric-only one used by store instructions (just '32' or '16'). Reviewers: dmgreen Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D69791
2019-11-01 01:02:07 +08:00
def : Pat<(int_arm_mve_vstr_scatter_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, memsz.shift, (VTI.Pred VCCR:$pred)),
(!cast<Instruction>(NAME) MQPR:$data, GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred)>;
[ARM,MVE] Add intrinsics for gather/scatter load/stores. This patch adds two new families of intrinsics, both of which are memory accesses taking a vector of locations to load from / store to. The vldrq_gather_base / vstrq_scatter_base intrinsics take a vector of base addresses, and an immediate offset to be added consistently to each one. vldrq_gather_offset / vstrq_scatter_offset take a scalar base address, and a vector of offsets to add to it. The 'shifted_offset' variants also multiply each offset by the element size type, so that the vector is effectively of array indices. At the IR level, these operations are represented by a single set of four IR intrinsics: {gather,scatter} × {base,offset}. The other details (signed/unsigned, shift, and memory element size as opposed to vector element size) are all specified by IR intrinsic polymorphism and immediate operands, because that made the selection job easier than making a huge family of similarly named intrinsics. I considered using the standard IR representations such as llvm.masked.gather, but they're not a good fit. In order to use llvm.masked.gather to represent a gather_offset load with element size smaller than a pointer, you'd have to expand the <8 x i16> vector of offsets into an <8 x i16*> vector of pointers, which would be split up during legalization, so you'd spend most of your time undoing the mess it had made. Also, ISel support for llvm.masked.gather would be easy enough in a trivial way (you can expand it into a gather-base load with a zero immediate offset), but instruction-selecting lots of fiddly idioms back into all the _other_ MVE load instructions would be much more work. So I think dedicated IR intrinsics are the more sensible approach, at least for the moment. On the clang tablegen side, I've added two new features to the Tablegen source accepted by MveEmitter: a 'CopyKind' type node for defining a type that varies with the parameter type (it lets you ask for an unsigned integer type of the same width as the parameter), and an 'unsignedflag' value node for passing an immediate IR operand which is 0 for a signed integer type or 1 for an unsigned one. That lets me write each kind of intrinsic just once and get all its subtypes and immediate arguments generated automatically. Also I've tweaked the handling of pointer-typed values in the code generation part of MveEmitter: they're generated as Address rather than Value (i.e. including an alignment) so that they can be given to the ordinary IR load and store operations, but I'd omitted the code to convert them back to Value when they're going to be used as an argument to an IR intrinsic. On the MC side, I've enhanced MVEVectorVTInfo so that it can tell you not only the full assembly-language suffix for a given vector type (like 's32' or 'u16') but also the numeric-only one used by store instructions (just '32' or '16'). Reviewers: dmgreen Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D69791
2019-11-01 01:02:07 +08:00
}
}
multiclass MVE_VSTR_rq_b<list<MVEVectorVTInfo> VTIs> {
def "": MVE_VLDRSTR_rq_b<MVE_st, MVE_memB, "vstrb",
VTIs[0].BitsSuffix, 0, VTIs[0].Size>;
foreach VTI = VTIs in {
def : Pat<(int_arm_mve_vstr_scatter_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), 8, 0),
(!cast<Instruction>(NAME) MQPR:$data, GPR:$base, MQPR:$offsets)>;
def : Pat<(int_arm_mve_vstr_scatter_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), 8, 0, (VTI.Pred VCCR:$pred)),
(!cast<Instruction>(NAME) MQPR:$data, GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred)>;
[ARM,MVE] Add intrinsics for gather/scatter load/stores. This patch adds two new families of intrinsics, both of which are memory accesses taking a vector of locations to load from / store to. The vldrq_gather_base / vstrq_scatter_base intrinsics take a vector of base addresses, and an immediate offset to be added consistently to each one. vldrq_gather_offset / vstrq_scatter_offset take a scalar base address, and a vector of offsets to add to it. The 'shifted_offset' variants also multiply each offset by the element size type, so that the vector is effectively of array indices. At the IR level, these operations are represented by a single set of four IR intrinsics: {gather,scatter} × {base,offset}. The other details (signed/unsigned, shift, and memory element size as opposed to vector element size) are all specified by IR intrinsic polymorphism and immediate operands, because that made the selection job easier than making a huge family of similarly named intrinsics. I considered using the standard IR representations such as llvm.masked.gather, but they're not a good fit. In order to use llvm.masked.gather to represent a gather_offset load with element size smaller than a pointer, you'd have to expand the <8 x i16> vector of offsets into an <8 x i16*> vector of pointers, which would be split up during legalization, so you'd spend most of your time undoing the mess it had made. Also, ISel support for llvm.masked.gather would be easy enough in a trivial way (you can expand it into a gather-base load with a zero immediate offset), but instruction-selecting lots of fiddly idioms back into all the _other_ MVE load instructions would be much more work. So I think dedicated IR intrinsics are the more sensible approach, at least for the moment. On the clang tablegen side, I've added two new features to the Tablegen source accepted by MveEmitter: a 'CopyKind' type node for defining a type that varies with the parameter type (it lets you ask for an unsigned integer type of the same width as the parameter), and an 'unsignedflag' value node for passing an immediate IR operand which is 0 for a signed integer type or 1 for an unsigned one. That lets me write each kind of intrinsic just once and get all its subtypes and immediate arguments generated automatically. Also I've tweaked the handling of pointer-typed values in the code generation part of MveEmitter: they're generated as Address rather than Value (i.e. including an alignment) so that they can be given to the ordinary IR load and store operations, but I'd omitted the code to convert them back to Value when they're going to be used as an argument to an IR intrinsic. On the MC side, I've enhanced MVEVectorVTInfo so that it can tell you not only the full assembly-language suffix for a given vector type (like 's32' or 'u16') but also the numeric-only one used by store instructions (just '32' or '16'). Reviewers: dmgreen Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D69791
2019-11-01 01:02:07 +08:00
}
}
[ARM] Add MVE vector load/store instructions. This adds the rest of the vector memory access instructions. It includes contiguous loads/stores, with an ordinary addressing mode such as [r0,#offset] (plus writeback variants); gather loads and scatter stores with a scalar base address register and a vector of offsets from it (written [r0,q1] or similar); and gather/scatters with a vector of base addresses (written [q0,#offset], again with writeback). Additionally, some of the loads can widen each loaded value into a larger vector lane, and the corresponding stores narrow them again. To implement these, we also have to add the addressing modes they need. Also, in AsmParser, the `isMem` query function now has subqueries `isGPRMem` and `isMVEMem`, according to which kind of base register is used by a given memory access operand. I've also had to add an extra check in `checkTargetMatchPredicate` in the AsmParser, without which our last-minute check of `rGPR` register operands against SP and PC was failing an assertion because Tablegen had inserted an immediate 0 in place of one of a pair of tied register operands. (This matches the way the corresponding check for `MCK_rGPR` in `validateTargetOperandClass` is guarded.) Apparently the MVE load instructions were the first to have ever triggered this assertion, but I think only because they were the first to have a combination of the usual Arm pre/post writeback system and the `rGPR` class in particular. Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62680 llvm-svn: 364291
2019-06-25 19:24:18 +08:00
// Actually define all the loads and stores in this family.
[ARM,MVE] Add intrinsics for gather/scatter load/stores. This patch adds two new families of intrinsics, both of which are memory accesses taking a vector of locations to load from / store to. The vldrq_gather_base / vstrq_scatter_base intrinsics take a vector of base addresses, and an immediate offset to be added consistently to each one. vldrq_gather_offset / vstrq_scatter_offset take a scalar base address, and a vector of offsets to add to it. The 'shifted_offset' variants also multiply each offset by the element size type, so that the vector is effectively of array indices. At the IR level, these operations are represented by a single set of four IR intrinsics: {gather,scatter} × {base,offset}. The other details (signed/unsigned, shift, and memory element size as opposed to vector element size) are all specified by IR intrinsic polymorphism and immediate operands, because that made the selection job easier than making a huge family of similarly named intrinsics. I considered using the standard IR representations such as llvm.masked.gather, but they're not a good fit. In order to use llvm.masked.gather to represent a gather_offset load with element size smaller than a pointer, you'd have to expand the <8 x i16> vector of offsets into an <8 x i16*> vector of pointers, which would be split up during legalization, so you'd spend most of your time undoing the mess it had made. Also, ISel support for llvm.masked.gather would be easy enough in a trivial way (you can expand it into a gather-base load with a zero immediate offset), but instruction-selecting lots of fiddly idioms back into all the _other_ MVE load instructions would be much more work. So I think dedicated IR intrinsics are the more sensible approach, at least for the moment. On the clang tablegen side, I've added two new features to the Tablegen source accepted by MveEmitter: a 'CopyKind' type node for defining a type that varies with the parameter type (it lets you ask for an unsigned integer type of the same width as the parameter), and an 'unsignedflag' value node for passing an immediate IR operand which is 0 for a signed integer type or 1 for an unsigned one. That lets me write each kind of intrinsic just once and get all its subtypes and immediate arguments generated automatically. Also I've tweaked the handling of pointer-typed values in the code generation part of MveEmitter: they're generated as Address rather than Value (i.e. including an alignment) so that they can be given to the ordinary IR load and store operations, but I'd omitted the code to convert them back to Value when they're going to be used as an argument to an IR intrinsic. On the MC side, I've enhanced MVEVectorVTInfo so that it can tell you not only the full assembly-language suffix for a given vector type (like 's32' or 'u16') but also the numeric-only one used by store instructions (just '32' or '16'). Reviewers: dmgreen Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D69791
2019-11-01 01:02:07 +08:00
defm MVE_VLDRBU8_rq : MVE_VLDR_rq_b<[MVE_v16u8,MVE_v16s8]>;
defm MVE_VLDRBU16_rq: MVE_VLDR_rq_b<[MVE_v8u16]>;
defm MVE_VLDRBS16_rq: MVE_VLDR_rq_b<[MVE_v8s16]>;
defm MVE_VLDRBU32_rq: MVE_VLDR_rq_b<[MVE_v4u32]>;
defm MVE_VLDRBS32_rq: MVE_VLDR_rq_b<[MVE_v4s32]>;
[ARM] Add MVE vector load/store instructions. This adds the rest of the vector memory access instructions. It includes contiguous loads/stores, with an ordinary addressing mode such as [r0,#offset] (plus writeback variants); gather loads and scatter stores with a scalar base address register and a vector of offsets from it (written [r0,q1] or similar); and gather/scatters with a vector of base addresses (written [q0,#offset], again with writeback). Additionally, some of the loads can widen each loaded value into a larger vector lane, and the corresponding stores narrow them again. To implement these, we also have to add the addressing modes they need. Also, in AsmParser, the `isMem` query function now has subqueries `isGPRMem` and `isMVEMem`, according to which kind of base register is used by a given memory access operand. I've also had to add an extra check in `checkTargetMatchPredicate` in the AsmParser, without which our last-minute check of `rGPR` register operands against SP and PC was failing an assertion because Tablegen had inserted an immediate 0 in place of one of a pair of tied register operands. (This matches the way the corresponding check for `MCK_rGPR` in `validateTargetOperandClass` is guarded.) Apparently the MVE load instructions were the first to have ever triggered this assertion, but I think only because they were the first to have a combination of the usual Arm pre/post writeback system and the `rGPR` class in particular. Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62680 llvm-svn: 364291
2019-06-25 19:24:18 +08:00
[ARM,MVE] Add intrinsics for gather/scatter load/stores. This patch adds two new families of intrinsics, both of which are memory accesses taking a vector of locations to load from / store to. The vldrq_gather_base / vstrq_scatter_base intrinsics take a vector of base addresses, and an immediate offset to be added consistently to each one. vldrq_gather_offset / vstrq_scatter_offset take a scalar base address, and a vector of offsets to add to it. The 'shifted_offset' variants also multiply each offset by the element size type, so that the vector is effectively of array indices. At the IR level, these operations are represented by a single set of four IR intrinsics: {gather,scatter} × {base,offset}. The other details (signed/unsigned, shift, and memory element size as opposed to vector element size) are all specified by IR intrinsic polymorphism and immediate operands, because that made the selection job easier than making a huge family of similarly named intrinsics. I considered using the standard IR representations such as llvm.masked.gather, but they're not a good fit. In order to use llvm.masked.gather to represent a gather_offset load with element size smaller than a pointer, you'd have to expand the <8 x i16> vector of offsets into an <8 x i16*> vector of pointers, which would be split up during legalization, so you'd spend most of your time undoing the mess it had made. Also, ISel support for llvm.masked.gather would be easy enough in a trivial way (you can expand it into a gather-base load with a zero immediate offset), but instruction-selecting lots of fiddly idioms back into all the _other_ MVE load instructions would be much more work. So I think dedicated IR intrinsics are the more sensible approach, at least for the moment. On the clang tablegen side, I've added two new features to the Tablegen source accepted by MveEmitter: a 'CopyKind' type node for defining a type that varies with the parameter type (it lets you ask for an unsigned integer type of the same width as the parameter), and an 'unsignedflag' value node for passing an immediate IR operand which is 0 for a signed integer type or 1 for an unsigned one. That lets me write each kind of intrinsic just once and get all its subtypes and immediate arguments generated automatically. Also I've tweaked the handling of pointer-typed values in the code generation part of MveEmitter: they're generated as Address rather than Value (i.e. including an alignment) so that they can be given to the ordinary IR load and store operations, but I'd omitted the code to convert them back to Value when they're going to be used as an argument to an IR intrinsic. On the MC side, I've enhanced MVEVectorVTInfo so that it can tell you not only the full assembly-language suffix for a given vector type (like 's32' or 'u16') but also the numeric-only one used by store instructions (just '32' or '16'). Reviewers: dmgreen Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D69791
2019-11-01 01:02:07 +08:00
defm MVE_VLDRHU16_rq: MVE_VLDR_rq_w<MVE_memH, [MVE_v8u16,MVE_v8s16,MVE_v8f16]>;
defm MVE_VLDRHU32_rq: MVE_VLDR_rq_w<MVE_memH, [MVE_v4u32]>;
defm MVE_VLDRHS32_rq: MVE_VLDR_rq_w<MVE_memH, [MVE_v4s32]>;
defm MVE_VLDRWU32_rq: MVE_VLDR_rq_w<MVE_memW, [MVE_v4u32,MVE_v4s32,MVE_v4f32]>;
defm MVE_VLDRDU64_rq: MVE_VLDR_rq_w<MVE_memD, [MVE_v2u64,MVE_v2s64]>;
[ARM] Add MVE vector load/store instructions. This adds the rest of the vector memory access instructions. It includes contiguous loads/stores, with an ordinary addressing mode such as [r0,#offset] (plus writeback variants); gather loads and scatter stores with a scalar base address register and a vector of offsets from it (written [r0,q1] or similar); and gather/scatters with a vector of base addresses (written [q0,#offset], again with writeback). Additionally, some of the loads can widen each loaded value into a larger vector lane, and the corresponding stores narrow them again. To implement these, we also have to add the addressing modes they need. Also, in AsmParser, the `isMem` query function now has subqueries `isGPRMem` and `isMVEMem`, according to which kind of base register is used by a given memory access operand. I've also had to add an extra check in `checkTargetMatchPredicate` in the AsmParser, without which our last-minute check of `rGPR` register operands against SP and PC was failing an assertion because Tablegen had inserted an immediate 0 in place of one of a pair of tied register operands. (This matches the way the corresponding check for `MCK_rGPR` in `validateTargetOperandClass` is guarded.) Apparently the MVE load instructions were the first to have ever triggered this assertion, but I think only because they were the first to have a combination of the usual Arm pre/post writeback system and the `rGPR` class in particular. Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62680 llvm-svn: 364291
2019-06-25 19:24:18 +08:00
[ARM,MVE] Add intrinsics for gather/scatter load/stores. This patch adds two new families of intrinsics, both of which are memory accesses taking a vector of locations to load from / store to. The vldrq_gather_base / vstrq_scatter_base intrinsics take a vector of base addresses, and an immediate offset to be added consistently to each one. vldrq_gather_offset / vstrq_scatter_offset take a scalar base address, and a vector of offsets to add to it. The 'shifted_offset' variants also multiply each offset by the element size type, so that the vector is effectively of array indices. At the IR level, these operations are represented by a single set of four IR intrinsics: {gather,scatter} × {base,offset}. The other details (signed/unsigned, shift, and memory element size as opposed to vector element size) are all specified by IR intrinsic polymorphism and immediate operands, because that made the selection job easier than making a huge family of similarly named intrinsics. I considered using the standard IR representations such as llvm.masked.gather, but they're not a good fit. In order to use llvm.masked.gather to represent a gather_offset load with element size smaller than a pointer, you'd have to expand the <8 x i16> vector of offsets into an <8 x i16*> vector of pointers, which would be split up during legalization, so you'd spend most of your time undoing the mess it had made. Also, ISel support for llvm.masked.gather would be easy enough in a trivial way (you can expand it into a gather-base load with a zero immediate offset), but instruction-selecting lots of fiddly idioms back into all the _other_ MVE load instructions would be much more work. So I think dedicated IR intrinsics are the more sensible approach, at least for the moment. On the clang tablegen side, I've added two new features to the Tablegen source accepted by MveEmitter: a 'CopyKind' type node for defining a type that varies with the parameter type (it lets you ask for an unsigned integer type of the same width as the parameter), and an 'unsignedflag' value node for passing an immediate IR operand which is 0 for a signed integer type or 1 for an unsigned one. That lets me write each kind of intrinsic just once and get all its subtypes and immediate arguments generated automatically. Also I've tweaked the handling of pointer-typed values in the code generation part of MveEmitter: they're generated as Address rather than Value (i.e. including an alignment) so that they can be given to the ordinary IR load and store operations, but I'd omitted the code to convert them back to Value when they're going to be used as an argument to an IR intrinsic. On the MC side, I've enhanced MVEVectorVTInfo so that it can tell you not only the full assembly-language suffix for a given vector type (like 's32' or 'u16') but also the numeric-only one used by store instructions (just '32' or '16'). Reviewers: dmgreen Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D69791
2019-11-01 01:02:07 +08:00
defm MVE_VSTRB8_rq : MVE_VSTR_rq_b<[MVE_v16i8]>;
defm MVE_VSTRB16_rq : MVE_VSTR_rq_b<[MVE_v8i16]>;
defm MVE_VSTRB32_rq : MVE_VSTR_rq_b<[MVE_v4i32]>;
[ARM] Add MVE vector load/store instructions. This adds the rest of the vector memory access instructions. It includes contiguous loads/stores, with an ordinary addressing mode such as [r0,#offset] (plus writeback variants); gather loads and scatter stores with a scalar base address register and a vector of offsets from it (written [r0,q1] or similar); and gather/scatters with a vector of base addresses (written [q0,#offset], again with writeback). Additionally, some of the loads can widen each loaded value into a larger vector lane, and the corresponding stores narrow them again. To implement these, we also have to add the addressing modes they need. Also, in AsmParser, the `isMem` query function now has subqueries `isGPRMem` and `isMVEMem`, according to which kind of base register is used by a given memory access operand. I've also had to add an extra check in `checkTargetMatchPredicate` in the AsmParser, without which our last-minute check of `rGPR` register operands against SP and PC was failing an assertion because Tablegen had inserted an immediate 0 in place of one of a pair of tied register operands. (This matches the way the corresponding check for `MCK_rGPR` in `validateTargetOperandClass` is guarded.) Apparently the MVE load instructions were the first to have ever triggered this assertion, but I think only because they were the first to have a combination of the usual Arm pre/post writeback system and the `rGPR` class in particular. Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62680 llvm-svn: 364291
2019-06-25 19:24:18 +08:00
[ARM,MVE] Add intrinsics for gather/scatter load/stores. This patch adds two new families of intrinsics, both of which are memory accesses taking a vector of locations to load from / store to. The vldrq_gather_base / vstrq_scatter_base intrinsics take a vector of base addresses, and an immediate offset to be added consistently to each one. vldrq_gather_offset / vstrq_scatter_offset take a scalar base address, and a vector of offsets to add to it. The 'shifted_offset' variants also multiply each offset by the element size type, so that the vector is effectively of array indices. At the IR level, these operations are represented by a single set of four IR intrinsics: {gather,scatter} × {base,offset}. The other details (signed/unsigned, shift, and memory element size as opposed to vector element size) are all specified by IR intrinsic polymorphism and immediate operands, because that made the selection job easier than making a huge family of similarly named intrinsics. I considered using the standard IR representations such as llvm.masked.gather, but they're not a good fit. In order to use llvm.masked.gather to represent a gather_offset load with element size smaller than a pointer, you'd have to expand the <8 x i16> vector of offsets into an <8 x i16*> vector of pointers, which would be split up during legalization, so you'd spend most of your time undoing the mess it had made. Also, ISel support for llvm.masked.gather would be easy enough in a trivial way (you can expand it into a gather-base load with a zero immediate offset), but instruction-selecting lots of fiddly idioms back into all the _other_ MVE load instructions would be much more work. So I think dedicated IR intrinsics are the more sensible approach, at least for the moment. On the clang tablegen side, I've added two new features to the Tablegen source accepted by MveEmitter: a 'CopyKind' type node for defining a type that varies with the parameter type (it lets you ask for an unsigned integer type of the same width as the parameter), and an 'unsignedflag' value node for passing an immediate IR operand which is 0 for a signed integer type or 1 for an unsigned one. That lets me write each kind of intrinsic just once and get all its subtypes and immediate arguments generated automatically. Also I've tweaked the handling of pointer-typed values in the code generation part of MveEmitter: they're generated as Address rather than Value (i.e. including an alignment) so that they can be given to the ordinary IR load and store operations, but I'd omitted the code to convert them back to Value when they're going to be used as an argument to an IR intrinsic. On the MC side, I've enhanced MVEVectorVTInfo so that it can tell you not only the full assembly-language suffix for a given vector type (like 's32' or 'u16') but also the numeric-only one used by store instructions (just '32' or '16'). Reviewers: dmgreen Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D69791
2019-11-01 01:02:07 +08:00
defm MVE_VSTRH16_rq : MVE_VSTR_rq_w<MVE_memH, [MVE_v8i16,MVE_v8f16]>;
defm MVE_VSTRH32_rq : MVE_VSTR_rq_w<MVE_memH, [MVE_v4i32]>;
defm MVE_VSTRW32_rq : MVE_VSTR_rq_w<MVE_memW, [MVE_v4i32,MVE_v4f32]>;
defm MVE_VSTRD64_rq : MVE_VSTR_rq_w<MVE_memD, [MVE_v2i64]>;
[ARM] Add MVE vector load/store instructions. This adds the rest of the vector memory access instructions. It includes contiguous loads/stores, with an ordinary addressing mode such as [r0,#offset] (plus writeback variants); gather loads and scatter stores with a scalar base address register and a vector of offsets from it (written [r0,q1] or similar); and gather/scatters with a vector of base addresses (written [q0,#offset], again with writeback). Additionally, some of the loads can widen each loaded value into a larger vector lane, and the corresponding stores narrow them again. To implement these, we also have to add the addressing modes they need. Also, in AsmParser, the `isMem` query function now has subqueries `isGPRMem` and `isMVEMem`, according to which kind of base register is used by a given memory access operand. I've also had to add an extra check in `checkTargetMatchPredicate` in the AsmParser, without which our last-minute check of `rGPR` register operands against SP and PC was failing an assertion because Tablegen had inserted an immediate 0 in place of one of a pair of tied register operands. (This matches the way the corresponding check for `MCK_rGPR` in `validateTargetOperandClass` is guarded.) Apparently the MVE load instructions were the first to have ever triggered this assertion, but I think only because they were the first to have a combination of the usual Arm pre/post writeback system and the `rGPR` class in particular. Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62680 llvm-svn: 364291
2019-06-25 19:24:18 +08:00
// Gather loads / scatter stores whose address operand is of the form
// [Qm,#imm], i.e. a vector containing a full base address for each
// loaded item, plus an immediate offset applied consistently to all
// of them. ('Load/store the same field from this vector of pointers
// to a structure type.')
//
// This family requires the vector lane size to be at least 32 bits
// (so there's room for an address in each lane at all). It has no
// widening/narrowing variants. But it does support preindex
// writeback, in which the address vector is updated to hold the
// addresses actually loaded from.
// Base class.
class MVE_VLDRSTR_qi<MVE_ldst_direction dir, MVE_memsz memsz, bit W, dag wbops,
string asm, string wbAsm, string suffix, string cstr = "">
: MVE_VLDRSTR_base<dir, 1, 1, W, 1, !con(wbops, dir.Oops),
!con(dir.Iops, (ins mve_addr_q_shift<memsz.shift>:$addr)),
asm, suffix, "$Qd, $addr" # wbAsm, cstr # dir.cstr> {
bits<11> addr;
let Inst{23} = addr{7};
let Inst{19-17} = addr{10-8};
let Inst{16} = 0;
let Inst{8} = memsz.encoding{0}; // enough to distinguish 32- from 64-bit
let Inst{7} = 0;
let Inst{6-0} = addr{6-0};
}
// Multiclass that generates the non-writeback and writeback variants.
multiclass MVE_VLDRSTR_qi_m<MVE_ldst_direction dir, MVE_memsz memsz,
string asm, string suffix> {
def "" : MVE_VLDRSTR_qi<dir, memsz, 0, (outs), asm, "", suffix>;
def _pre : MVE_VLDRSTR_qi<dir, memsz, 1, (outs MQPR:$wb), asm, "!", suffix,
"$addr.base = $wb"> {
let DecoderMethod="DecodeMVE_MEM_3_pre<"#memsz.shift#">";
}
}
[ARM,MVE] Add intrinsics for gather/scatter load/stores. This patch adds two new families of intrinsics, both of which are memory accesses taking a vector of locations to load from / store to. The vldrq_gather_base / vstrq_scatter_base intrinsics take a vector of base addresses, and an immediate offset to be added consistently to each one. vldrq_gather_offset / vstrq_scatter_offset take a scalar base address, and a vector of offsets to add to it. The 'shifted_offset' variants also multiply each offset by the element size type, so that the vector is effectively of array indices. At the IR level, these operations are represented by a single set of four IR intrinsics: {gather,scatter} × {base,offset}. The other details (signed/unsigned, shift, and memory element size as opposed to vector element size) are all specified by IR intrinsic polymorphism and immediate operands, because that made the selection job easier than making a huge family of similarly named intrinsics. I considered using the standard IR representations such as llvm.masked.gather, but they're not a good fit. In order to use llvm.masked.gather to represent a gather_offset load with element size smaller than a pointer, you'd have to expand the <8 x i16> vector of offsets into an <8 x i16*> vector of pointers, which would be split up during legalization, so you'd spend most of your time undoing the mess it had made. Also, ISel support for llvm.masked.gather would be easy enough in a trivial way (you can expand it into a gather-base load with a zero immediate offset), but instruction-selecting lots of fiddly idioms back into all the _other_ MVE load instructions would be much more work. So I think dedicated IR intrinsics are the more sensible approach, at least for the moment. On the clang tablegen side, I've added two new features to the Tablegen source accepted by MveEmitter: a 'CopyKind' type node for defining a type that varies with the parameter type (it lets you ask for an unsigned integer type of the same width as the parameter), and an 'unsignedflag' value node for passing an immediate IR operand which is 0 for a signed integer type or 1 for an unsigned one. That lets me write each kind of intrinsic just once and get all its subtypes and immediate arguments generated automatically. Also I've tweaked the handling of pointer-typed values in the code generation part of MveEmitter: they're generated as Address rather than Value (i.e. including an alignment) so that they can be given to the ordinary IR load and store operations, but I'd omitted the code to convert them back to Value when they're going to be used as an argument to an IR intrinsic. On the MC side, I've enhanced MVEVectorVTInfo so that it can tell you not only the full assembly-language suffix for a given vector type (like 's32' or 'u16') but also the numeric-only one used by store instructions (just '32' or '16'). Reviewers: dmgreen Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D69791
2019-11-01 01:02:07 +08:00
// Multiclasses wrapping that one, adding selection patterns for the
// non-writeback loads and all the stores. (The writeback loads must
// deliver multiple output values, so they have to be selected by C++
// code.)
multiclass MVE_VLDR_qi<MVE_memsz memsz, MVEVectorVTInfo AVTI,
list<MVEVectorVTInfo> DVTIs> {
defm "" : MVE_VLDRSTR_qi_m<MVE_ld, memsz, "vldr" # memsz.MnemonicLetter,
"u" # memsz.TypeBits>;
foreach DVTI = DVTIs in {
def : Pat<(DVTI.Vec (int_arm_mve_vldr_gather_base
(AVTI.Vec MQPR:$addr), (i32 imm:$offset))),
(DVTI.Vec (!cast<Instruction>(NAME)
(AVTI.Vec MQPR:$addr), (i32 imm:$offset)))>;
def : Pat<(DVTI.Vec (int_arm_mve_vldr_gather_base_predicated
(AVTI.Vec MQPR:$addr), (i32 imm:$offset), (AVTI.Pred VCCR:$pred))),
(DVTI.Vec (!cast<Instruction>(NAME)
(AVTI.Vec MQPR:$addr), (i32 imm:$offset), ARMVCCThen, VCCR:$pred))>;
[ARM,MVE] Add intrinsics for gather/scatter load/stores. This patch adds two new families of intrinsics, both of which are memory accesses taking a vector of locations to load from / store to. The vldrq_gather_base / vstrq_scatter_base intrinsics take a vector of base addresses, and an immediate offset to be added consistently to each one. vldrq_gather_offset / vstrq_scatter_offset take a scalar base address, and a vector of offsets to add to it. The 'shifted_offset' variants also multiply each offset by the element size type, so that the vector is effectively of array indices. At the IR level, these operations are represented by a single set of four IR intrinsics: {gather,scatter} × {base,offset}. The other details (signed/unsigned, shift, and memory element size as opposed to vector element size) are all specified by IR intrinsic polymorphism and immediate operands, because that made the selection job easier than making a huge family of similarly named intrinsics. I considered using the standard IR representations such as llvm.masked.gather, but they're not a good fit. In order to use llvm.masked.gather to represent a gather_offset load with element size smaller than a pointer, you'd have to expand the <8 x i16> vector of offsets into an <8 x i16*> vector of pointers, which would be split up during legalization, so you'd spend most of your time undoing the mess it had made. Also, ISel support for llvm.masked.gather would be easy enough in a trivial way (you can expand it into a gather-base load with a zero immediate offset), but instruction-selecting lots of fiddly idioms back into all the _other_ MVE load instructions would be much more work. So I think dedicated IR intrinsics are the more sensible approach, at least for the moment. On the clang tablegen side, I've added two new features to the Tablegen source accepted by MveEmitter: a 'CopyKind' type node for defining a type that varies with the parameter type (it lets you ask for an unsigned integer type of the same width as the parameter), and an 'unsignedflag' value node for passing an immediate IR operand which is 0 for a signed integer type or 1 for an unsigned one. That lets me write each kind of intrinsic just once and get all its subtypes and immediate arguments generated automatically. Also I've tweaked the handling of pointer-typed values in the code generation part of MveEmitter: they're generated as Address rather than Value (i.e. including an alignment) so that they can be given to the ordinary IR load and store operations, but I'd omitted the code to convert them back to Value when they're going to be used as an argument to an IR intrinsic. On the MC side, I've enhanced MVEVectorVTInfo so that it can tell you not only the full assembly-language suffix for a given vector type (like 's32' or 'u16') but also the numeric-only one used by store instructions (just '32' or '16'). Reviewers: dmgreen Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D69791
2019-11-01 01:02:07 +08:00
}
}
multiclass MVE_VSTR_qi<MVE_memsz memsz, MVEVectorVTInfo AVTI,
list<MVEVectorVTInfo> DVTIs> {
defm "" : MVE_VLDRSTR_qi_m<MVE_st, memsz, "vstr" # memsz.MnemonicLetter,
!cast<string>(memsz.TypeBits)>;
foreach DVTI = DVTIs in {
def : Pat<(int_arm_mve_vstr_scatter_base
(AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data)),
(!cast<Instruction>(NAME)
(DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr), (i32 imm:$offset))>;
def : Pat<(int_arm_mve_vstr_scatter_base_predicated
(AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data), (AVTI.Pred VCCR:$pred)),
(!cast<Instruction>(NAME)
(DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr), (i32 imm:$offset), ARMVCCThen, VCCR:$pred)>;
[ARM,MVE] Add intrinsics for gather/scatter load/stores. This patch adds two new families of intrinsics, both of which are memory accesses taking a vector of locations to load from / store to. The vldrq_gather_base / vstrq_scatter_base intrinsics take a vector of base addresses, and an immediate offset to be added consistently to each one. vldrq_gather_offset / vstrq_scatter_offset take a scalar base address, and a vector of offsets to add to it. The 'shifted_offset' variants also multiply each offset by the element size type, so that the vector is effectively of array indices. At the IR level, these operations are represented by a single set of four IR intrinsics: {gather,scatter} × {base,offset}. The other details (signed/unsigned, shift, and memory element size as opposed to vector element size) are all specified by IR intrinsic polymorphism and immediate operands, because that made the selection job easier than making a huge family of similarly named intrinsics. I considered using the standard IR representations such as llvm.masked.gather, but they're not a good fit. In order to use llvm.masked.gather to represent a gather_offset load with element size smaller than a pointer, you'd have to expand the <8 x i16> vector of offsets into an <8 x i16*> vector of pointers, which would be split up during legalization, so you'd spend most of your time undoing the mess it had made. Also, ISel support for llvm.masked.gather would be easy enough in a trivial way (you can expand it into a gather-base load with a zero immediate offset), but instruction-selecting lots of fiddly idioms back into all the _other_ MVE load instructions would be much more work. So I think dedicated IR intrinsics are the more sensible approach, at least for the moment. On the clang tablegen side, I've added two new features to the Tablegen source accepted by MveEmitter: a 'CopyKind' type node for defining a type that varies with the parameter type (it lets you ask for an unsigned integer type of the same width as the parameter), and an 'unsignedflag' value node for passing an immediate IR operand which is 0 for a signed integer type or 1 for an unsigned one. That lets me write each kind of intrinsic just once and get all its subtypes and immediate arguments generated automatically. Also I've tweaked the handling of pointer-typed values in the code generation part of MveEmitter: they're generated as Address rather than Value (i.e. including an alignment) so that they can be given to the ordinary IR load and store operations, but I'd omitted the code to convert them back to Value when they're going to be used as an argument to an IR intrinsic. On the MC side, I've enhanced MVEVectorVTInfo so that it can tell you not only the full assembly-language suffix for a given vector type (like 's32' or 'u16') but also the numeric-only one used by store instructions (just '32' or '16'). Reviewers: dmgreen Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D69791
2019-11-01 01:02:07 +08:00
def : Pat<(AVTI.Vec (int_arm_mve_vstr_scatter_base_wb
(AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data))),
(AVTI.Vec (!cast<Instruction>(NAME # "_pre")
(DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr), (i32 imm:$offset)))>;
def : Pat<(AVTI.Vec (int_arm_mve_vstr_scatter_base_wb_predicated
(AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data), (AVTI.Pred VCCR:$pred))),
(AVTI.Vec (!cast<Instruction>(NAME # "_pre")
(DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr), (i32 imm:$offset), ARMVCCThen, VCCR:$pred))>;
[ARM,MVE] Add intrinsics for gather/scatter load/stores. This patch adds two new families of intrinsics, both of which are memory accesses taking a vector of locations to load from / store to. The vldrq_gather_base / vstrq_scatter_base intrinsics take a vector of base addresses, and an immediate offset to be added consistently to each one. vldrq_gather_offset / vstrq_scatter_offset take a scalar base address, and a vector of offsets to add to it. The 'shifted_offset' variants also multiply each offset by the element size type, so that the vector is effectively of array indices. At the IR level, these operations are represented by a single set of four IR intrinsics: {gather,scatter} × {base,offset}. The other details (signed/unsigned, shift, and memory element size as opposed to vector element size) are all specified by IR intrinsic polymorphism and immediate operands, because that made the selection job easier than making a huge family of similarly named intrinsics. I considered using the standard IR representations such as llvm.masked.gather, but they're not a good fit. In order to use llvm.masked.gather to represent a gather_offset load with element size smaller than a pointer, you'd have to expand the <8 x i16> vector of offsets into an <8 x i16*> vector of pointers, which would be split up during legalization, so you'd spend most of your time undoing the mess it had made. Also, ISel support for llvm.masked.gather would be easy enough in a trivial way (you can expand it into a gather-base load with a zero immediate offset), but instruction-selecting lots of fiddly idioms back into all the _other_ MVE load instructions would be much more work. So I think dedicated IR intrinsics are the more sensible approach, at least for the moment. On the clang tablegen side, I've added two new features to the Tablegen source accepted by MveEmitter: a 'CopyKind' type node for defining a type that varies with the parameter type (it lets you ask for an unsigned integer type of the same width as the parameter), and an 'unsignedflag' value node for passing an immediate IR operand which is 0 for a signed integer type or 1 for an unsigned one. That lets me write each kind of intrinsic just once and get all its subtypes and immediate arguments generated automatically. Also I've tweaked the handling of pointer-typed values in the code generation part of MveEmitter: they're generated as Address rather than Value (i.e. including an alignment) so that they can be given to the ordinary IR load and store operations, but I'd omitted the code to convert them back to Value when they're going to be used as an argument to an IR intrinsic. On the MC side, I've enhanced MVEVectorVTInfo so that it can tell you not only the full assembly-language suffix for a given vector type (like 's32' or 'u16') but also the numeric-only one used by store instructions (just '32' or '16'). Reviewers: dmgreen Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D69791
2019-11-01 01:02:07 +08:00
}
}
[ARM] Add MVE vector load/store instructions. This adds the rest of the vector memory access instructions. It includes contiguous loads/stores, with an ordinary addressing mode such as [r0,#offset] (plus writeback variants); gather loads and scatter stores with a scalar base address register and a vector of offsets from it (written [r0,q1] or similar); and gather/scatters with a vector of base addresses (written [q0,#offset], again with writeback). Additionally, some of the loads can widen each loaded value into a larger vector lane, and the corresponding stores narrow them again. To implement these, we also have to add the addressing modes they need. Also, in AsmParser, the `isMem` query function now has subqueries `isGPRMem` and `isMVEMem`, according to which kind of base register is used by a given memory access operand. I've also had to add an extra check in `checkTargetMatchPredicate` in the AsmParser, without which our last-minute check of `rGPR` register operands against SP and PC was failing an assertion because Tablegen had inserted an immediate 0 in place of one of a pair of tied register operands. (This matches the way the corresponding check for `MCK_rGPR` in `validateTargetOperandClass` is guarded.) Apparently the MVE load instructions were the first to have ever triggered this assertion, but I think only because they were the first to have a combination of the usual Arm pre/post writeback system and the `rGPR` class in particular. Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62680 llvm-svn: 364291
2019-06-25 19:24:18 +08:00
// Actual instruction definitions.
[ARM,MVE] Add intrinsics for gather/scatter load/stores. This patch adds two new families of intrinsics, both of which are memory accesses taking a vector of locations to load from / store to. The vldrq_gather_base / vstrq_scatter_base intrinsics take a vector of base addresses, and an immediate offset to be added consistently to each one. vldrq_gather_offset / vstrq_scatter_offset take a scalar base address, and a vector of offsets to add to it. The 'shifted_offset' variants also multiply each offset by the element size type, so that the vector is effectively of array indices. At the IR level, these operations are represented by a single set of four IR intrinsics: {gather,scatter} × {base,offset}. The other details (signed/unsigned, shift, and memory element size as opposed to vector element size) are all specified by IR intrinsic polymorphism and immediate operands, because that made the selection job easier than making a huge family of similarly named intrinsics. I considered using the standard IR representations such as llvm.masked.gather, but they're not a good fit. In order to use llvm.masked.gather to represent a gather_offset load with element size smaller than a pointer, you'd have to expand the <8 x i16> vector of offsets into an <8 x i16*> vector of pointers, which would be split up during legalization, so you'd spend most of your time undoing the mess it had made. Also, ISel support for llvm.masked.gather would be easy enough in a trivial way (you can expand it into a gather-base load with a zero immediate offset), but instruction-selecting lots of fiddly idioms back into all the _other_ MVE load instructions would be much more work. So I think dedicated IR intrinsics are the more sensible approach, at least for the moment. On the clang tablegen side, I've added two new features to the Tablegen source accepted by MveEmitter: a 'CopyKind' type node for defining a type that varies with the parameter type (it lets you ask for an unsigned integer type of the same width as the parameter), and an 'unsignedflag' value node for passing an immediate IR operand which is 0 for a signed integer type or 1 for an unsigned one. That lets me write each kind of intrinsic just once and get all its subtypes and immediate arguments generated automatically. Also I've tweaked the handling of pointer-typed values in the code generation part of MveEmitter: they're generated as Address rather than Value (i.e. including an alignment) so that they can be given to the ordinary IR load and store operations, but I'd omitted the code to convert them back to Value when they're going to be used as an argument to an IR intrinsic. On the MC side, I've enhanced MVEVectorVTInfo so that it can tell you not only the full assembly-language suffix for a given vector type (like 's32' or 'u16') but also the numeric-only one used by store instructions (just '32' or '16'). Reviewers: dmgreen Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D69791
2019-11-01 01:02:07 +08:00
defm MVE_VLDRWU32_qi: MVE_VLDR_qi<MVE_memW, MVE_v4i32, [MVE_v4i32,MVE_v4f32]>;
defm MVE_VLDRDU64_qi: MVE_VLDR_qi<MVE_memD, MVE_v2i64, [MVE_v2i64,MVE_v2f64]>;
defm MVE_VSTRW32_qi: MVE_VSTR_qi<MVE_memW, MVE_v4i32, [MVE_v4i32,MVE_v4f32]>;
defm MVE_VSTRD64_qi: MVE_VSTR_qi<MVE_memD, MVE_v2i64, [MVE_v2i64,MVE_v2f64]>;
[ARM] Add MVE vector load/store instructions. This adds the rest of the vector memory access instructions. It includes contiguous loads/stores, with an ordinary addressing mode such as [r0,#offset] (plus writeback variants); gather loads and scatter stores with a scalar base address register and a vector of offsets from it (written [r0,q1] or similar); and gather/scatters with a vector of base addresses (written [q0,#offset], again with writeback). Additionally, some of the loads can widen each loaded value into a larger vector lane, and the corresponding stores narrow them again. To implement these, we also have to add the addressing modes they need. Also, in AsmParser, the `isMem` query function now has subqueries `isGPRMem` and `isMVEMem`, according to which kind of base register is used by a given memory access operand. I've also had to add an extra check in `checkTargetMatchPredicate` in the AsmParser, without which our last-minute check of `rGPR` register operands against SP and PC was failing an assertion because Tablegen had inserted an immediate 0 in place of one of a pair of tied register operands. (This matches the way the corresponding check for `MCK_rGPR` in `validateTargetOperandClass` is guarded.) Apparently the MVE load instructions were the first to have ever triggered this assertion, but I think only because they were the first to have a combination of the usual Arm pre/post writeback system and the `rGPR` class in particular. Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62680 llvm-svn: 364291
2019-06-25 19:24:18 +08:00
// Define aliases for all the instructions where memory size and
// vector lane size are the same. These are mnemonic aliases, so they
// apply consistently across all of the above families - contiguous
// loads, and both the rq and qi types of gather/scatter.
//
// Rationale: As long as you're loading (for example) 16-bit memory
// values into 16-bit vector lanes, you can think of them as signed or
// unsigned integers, fp16 or just raw 16-bit blobs and it makes no
// difference. So we permit all of vldrh.16, vldrh.u16, vldrh.s16,
// vldrh.f16 and treat them all as equivalent to the canonical
// spelling (which happens to be .u16 for loads, and just .16 for
// stores).
foreach vpt_cond = ["", "t", "e"] in
foreach memsz = [MVE_memB, MVE_memH, MVE_memW, MVE_memD] in
foreach suffix = memsz.suffixes in {
// These foreaches are conceptually ifs, implemented by iterating a
// dummy variable over a list with 0 or 1 elements depending on the
// condition. The idea is to iterate over _nearly_ all the suffixes
// in memsz.suffixes, but omit the one we want all the others to alias.
foreach _ = !if(!ne(suffix, memsz.CanonLoadSuffix), [1], []<int>) in
def : MnemonicAlias<
"vldr" # memsz.MnemonicLetter # vpt_cond # suffix,
"vldr" # memsz.MnemonicLetter # vpt_cond # memsz.CanonLoadSuffix>;
foreach _ = !if(!ne(suffix, memsz.CanonStoreSuffix), [1], []<int>) in
def : MnemonicAlias<
"vstr" # memsz.MnemonicLetter # vpt_cond # suffix,
"vstr" # memsz.MnemonicLetter # vpt_cond # memsz.CanonStoreSuffix>;
}
// end of MVE predicable load/store
class MVE_VPT<string suffix, bits<2> size, dag iops, string asm, list<dag> pattern=[]>
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
: MVE_MI<(outs ), iops, NoItinerary, !strconcat("vpt", "${Mk}", ".", suffix), asm, "", pattern> {
bits<3> fc;
bits<4> Mk;
bits<3> Qn;
let Inst{31-23} = 0b111111100;
let Inst{22} = Mk{3};
let Inst{21-20} = size;
let Inst{19-17} = Qn{2-0};
let Inst{16} = 0b1;
let Inst{15-13} = Mk{2-0};
let Inst{12} = fc{2};
let Inst{11-8} = 0b1111;
let Inst{7} = fc{0};
let Inst{4} = 0b0;
let Defs = [VPR];
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
}
class MVE_VPTt1<string suffix, bits<2> size, dag iops>
: MVE_VPT<suffix, size, iops, "$fc, $Qn, $Qm"> {
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
bits<4> Qm;
bits<4> Mk;
let Inst{6} = 0b0;
let Inst{5} = Qm{3};
let Inst{3-1} = Qm{2-0};
let Inst{0} = fc{1};
}
class MVE_VPTt1i<string suffix, bits<2> size>
: MVE_VPTt1<suffix, size,
(ins vpt_mask:$Mk, MQPR:$Qn, MQPR:$Qm, pred_basic_i:$fc)> {
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
let Inst{12} = 0b0;
let Inst{0} = 0b0;
}
def MVE_VPTv4i32 : MVE_VPTt1i<"i32", 0b10>;
def MVE_VPTv8i16 : MVE_VPTt1i<"i16", 0b01>;
def MVE_VPTv16i8 : MVE_VPTt1i<"i8", 0b00>;
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
class MVE_VPTt1u<string suffix, bits<2> size>
: MVE_VPTt1<suffix, size,
(ins vpt_mask:$Mk, MQPR:$Qn, MQPR:$Qm, pred_basic_u:$fc)> {
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
let Inst{12} = 0b0;
let Inst{0} = 0b1;
}
def MVE_VPTv4u32 : MVE_VPTt1u<"u32", 0b10>;
def MVE_VPTv8u16 : MVE_VPTt1u<"u16", 0b01>;
def MVE_VPTv16u8 : MVE_VPTt1u<"u8", 0b00>;
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
class MVE_VPTt1s<string suffix, bits<2> size>
: MVE_VPTt1<suffix, size,
(ins vpt_mask:$Mk, MQPR:$Qn, MQPR:$Qm, pred_basic_s:$fc)> {
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
let Inst{12} = 0b1;
}
def MVE_VPTv4s32 : MVE_VPTt1s<"s32", 0b10>;
def MVE_VPTv8s16 : MVE_VPTt1s<"s16", 0b01>;
def MVE_VPTv16s8 : MVE_VPTt1s<"s8", 0b00>;
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
class MVE_VPTt2<string suffix, bits<2> size, dag iops>
: MVE_VPT<suffix, size, iops,
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
"$fc, $Qn, $Rm"> {
bits<4> Rm;
bits<3> fc;
bits<4> Mk;
let Inst{6} = 0b1;
let Inst{5} = fc{1};
let Inst{3-0} = Rm{3-0};
}
class MVE_VPTt2i<string suffix, bits<2> size>
: MVE_VPTt2<suffix, size,
(ins vpt_mask:$Mk, MQPR:$Qn, GPRwithZR:$Rm, pred_basic_i:$fc)> {
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
let Inst{12} = 0b0;
let Inst{5} = 0b0;
}
def MVE_VPTv4i32r : MVE_VPTt2i<"i32", 0b10>;
def MVE_VPTv8i16r : MVE_VPTt2i<"i16", 0b01>;
def MVE_VPTv16i8r : MVE_VPTt2i<"i8", 0b00>;
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
class MVE_VPTt2u<string suffix, bits<2> size>
: MVE_VPTt2<suffix, size,
(ins vpt_mask:$Mk, MQPR:$Qn, GPRwithZR:$Rm, pred_basic_u:$fc)> {
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
let Inst{12} = 0b0;
let Inst{5} = 0b1;
}
def MVE_VPTv4u32r : MVE_VPTt2u<"u32", 0b10>;
def MVE_VPTv8u16r : MVE_VPTt2u<"u16", 0b01>;
def MVE_VPTv16u8r : MVE_VPTt2u<"u8", 0b00>;
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
class MVE_VPTt2s<string suffix, bits<2> size>
: MVE_VPTt2<suffix, size,
(ins vpt_mask:$Mk, MQPR:$Qn, GPRwithZR:$Rm, pred_basic_s:$fc)> {
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
let Inst{12} = 0b1;
}
def MVE_VPTv4s32r : MVE_VPTt2s<"s32", 0b10>;
def MVE_VPTv8s16r : MVE_VPTt2s<"s16", 0b01>;
def MVE_VPTv16s8r : MVE_VPTt2s<"s8", 0b00>;
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
class MVE_VPTf<string suffix, bit size, dag iops, string asm, list<dag> pattern=[]>
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
: MVE_MI<(outs ), iops, NoItinerary, !strconcat("vpt", "${Mk}", ".", suffix), asm,
"", pattern> {
bits<3> fc;
bits<4> Mk;
bits<3> Qn;
let Inst{31-29} = 0b111;
let Inst{28} = size;
let Inst{27-23} = 0b11100;
let Inst{22} = Mk{3};
let Inst{21-20} = 0b11;
let Inst{19-17} = Qn{2-0};
let Inst{16} = 0b1;
let Inst{15-13} = Mk{2-0};
let Inst{12} = fc{2};
let Inst{11-8} = 0b1111;
let Inst{7} = fc{0};
let Inst{4} = 0b0;
let Defs = [VPR];
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
let Predicates = [HasMVEFloat];
}
class MVE_VPTft1<string suffix, bit size>
: MVE_VPTf<suffix, size, (ins vpt_mask:$Mk, MQPR:$Qn, MQPR:$Qm, pred_basic_fp:$fc),
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
"$fc, $Qn, $Qm"> {
bits<3> fc;
bits<4> Qm;
let Inst{6} = 0b0;
let Inst{5} = Qm{3};
let Inst{3-1} = Qm{2-0};
let Inst{0} = fc{1};
}
def MVE_VPTv4f32 : MVE_VPTft1<"f32", 0b0>;
def MVE_VPTv8f16 : MVE_VPTft1<"f16", 0b1>;
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
class MVE_VPTft2<string suffix, bit size>
: MVE_VPTf<suffix, size, (ins vpt_mask:$Mk, MQPR:$Qn, GPRwithZR:$Rm, pred_basic_fp:$fc),
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
"$fc, $Qn, $Rm"> {
bits<3> fc;
bits<4> Rm;
let Inst{6} = 0b1;
let Inst{5} = fc{1};
let Inst{3-0} = Rm{3-0};
}
def MVE_VPTv4f32r : MVE_VPTft2<"f32", 0b0>;
def MVE_VPTv8f16r : MVE_VPTft2<"f16", 0b1>;
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
def MVE_VPST : MVE_MI<(outs ), (ins vpt_mask:$Mk), NoItinerary,
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
!strconcat("vpst", "${Mk}"), "", "", []> {
bits<4> Mk;
let Inst{31-23} = 0b111111100;
let Inst{22} = Mk{3};
let Inst{21-16} = 0b110001;
let Inst{15-13} = Mk{2-0};
let Inst{12-0} = 0b0111101001101;
let Unpredictable{12} = 0b1;
let Unpredictable{7} = 0b1;
let Unpredictable{5} = 0b1;
let Uses = [VPR];
let validForTailPredication = 1;
[ARM] Set up infrastructure for MVE vector instructions. This commit prepares the way to start adding the main collection of MVE instructions, which operate on the 128-bit vector registers. The most obvious thing that's needed, and the simplest, is to add the MQPR register class, which is like the existing QPR except that it has fewer registers in it. The more complicated part: MVE defines a system of vector predication, in which instructions operating on 128-bit vector registers can be constrained to operate on only a subset of the lanes, using a system of prefix instructions similar to the existing Thumb IT, in that you have one prefix instruction which designates up to 4 following instructions as subject to predication, and within that sequence, the predicate can be inverted by means of T/E suffixes ('Then' / 'Else'). To support instructions of this type, we've added two new Tablegen classes `vpred_n` and `vpred_r` for standard clusters of MC operands to add to a predicated instruction. Both include a flag indicating how the instruction is predicated at all (options are T, E and 'not predicated'), and an input register field for the register controlling the set of active lanes. They differ from each other in that `vpred_r` also includes an input operand for the previous value of the output register, for instructions that leave inactive lanes unchanged. `vpred_n` lacks that extra operand; it will be used for instructions that don't preserve inactive lanes in their output register (either because inactive lanes are zeroed, as the MVE load instructions do, or because the output register isn't a vector at all). This commit also adds the family of prefix instructions themselves (VPT / VPST), and all the machinery needed to work with them in assembly and disassembly (e.g. generating the 't' and 'e' mnemonic suffixes on disassembled instructions within a predicated block) I've added a couple of demo instructions that derive from the new Tablegen base classes and use those two operand clusters. The bulk of the vector instructions will come in followup commits small enough to be manageable. (One exception is that I've added the full version of `isMnemonicVPTPredicable` in the AsmParser, because it seemed pointless to carefully split it up.) Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62669 llvm-svn: 363258
2019-06-13 21:11:13 +08:00
}
def MVE_VPSEL : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary,
"vpsel", "", "$Qd, $Qn, $Qm", vpred_n, "", []> {
bits<4> Qn;
bits<4> Qd;
bits<4> Qm;
let Inst{28} = 0b1;
let Inst{25-23} = 0b100;
let Inst{22} = Qd{3};
let Inst{21-20} = 0b11;
let Inst{19-17} = Qn{2-0};
let Inst{16} = 0b1;
let Inst{15-13} = Qd{2-0};
let Inst{12-9} = 0b0111;
let Inst{8} = 0b1;
let Inst{7} = Qn{3};
let Inst{6} = 0b0;
let Inst{5} = Qm{3};
let Inst{4} = 0b0;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b1;
let validForTailPredication = 1;
}
foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32",
"i8", "i16", "i32", "f16", "f32"] in
def : MVEInstAlias<"vpsel${vp}." # suffix # "\t$Qd, $Qn, $Qm",
(MVE_VPSEL MQPR:$Qd, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
let Predicates = [HasMVEInt] in {
def : Pat<(v16i8 (vselect (v16i1 VCCR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))),
(v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>;
def : Pat<(v8i16 (vselect (v8i1 VCCR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))),
(v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>;
def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))),
(v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>;
def : Pat<(v8f16 (vselect (v8i1 VCCR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))),
(v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>;
def : Pat<(v4f32 (vselect (v4i1 VCCR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))),
(v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>;
def : Pat<(v16i8 (vselect (v16i8 MQPR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))),
(v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone,
(MVE_VCMPi8 (v16i8 MQPR:$pred), (MVE_VMOVimmi8 0), ARMCCne)))>;
def : Pat<(v8i16 (vselect (v8i16 MQPR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))),
(v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone,
(MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), ARMCCne)))>;
def : Pat<(v4i32 (vselect (v4i32 MQPR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))),
(v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone,
(MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), ARMCCne)))>;
def : Pat<(v8f16 (vselect (v8i16 MQPR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))),
(v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone,
(MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), ARMCCne)))>;
def : Pat<(v4f32 (vselect (v4i32 MQPR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))),
(v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone,
(MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), ARMCCne)))>;
// Pred <-> Int
def : Pat<(v16i8 (zext (v16i1 VCCR:$pred))),
(v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred))>;
def : Pat<(v8i16 (zext (v8i1 VCCR:$pred))),
(v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred))>;
def : Pat<(v4i32 (zext (v4i1 VCCR:$pred))),
(v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred))>;
def : Pat<(v16i8 (sext (v16i1 VCCR:$pred))),
(v16i8 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred))>;
def : Pat<(v8i16 (sext (v8i1 VCCR:$pred))),
(v8i16 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred))>;
def : Pat<(v4i32 (sext (v4i1 VCCR:$pred))),
(v4i32 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred))>;
def : Pat<(v16i8 (anyext (v16i1 VCCR:$pred))),
(v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred))>;
def : Pat<(v8i16 (anyext (v8i1 VCCR:$pred))),
(v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred))>;
def : Pat<(v4i32 (anyext (v4i1 VCCR:$pred))),
(v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred))>;
def : Pat<(v16i1 (trunc (v16i8 MQPR:$v1))),
(v16i1 (MVE_VCMPi32r (v16i8 MQPR:$v1), ZR, ARMCCne))>;
def : Pat<(v8i1 (trunc (v8i16 MQPR:$v1))),
(v8i1 (MVE_VCMPi32r (v8i16 MQPR:$v1), ZR, ARMCCne))>;
def : Pat<(v4i1 (trunc (v4i32 MQPR:$v1))),
(v4i1 (MVE_VCMPi32r (v4i32 MQPR:$v1), ZR, ARMCCne))>;
}
let Predicates = [HasMVEFloat] in {
// Pred <-> Float
// 112 is 1.0 in float
def : Pat<(v4f32 (uint_to_fp (v4i1 VCCR:$pred))),
(v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 112)), (v4f32 (MVE_VMOVimmi32 0)), ARMVCCNone, VCCR:$pred))>;
// 2620 in 1.0 in half
def : Pat<(v8f16 (uint_to_fp (v8i1 VCCR:$pred))),
(v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2620)), (v8f16 (MVE_VMOVimmi16 0)), ARMVCCNone, VCCR:$pred))>;
// 240 is -1.0 in float
def : Pat<(v4f32 (sint_to_fp (v4i1 VCCR:$pred))),
(v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 240)), (v4f32 (MVE_VMOVimmi32 0)), ARMVCCNone, VCCR:$pred))>;
// 2748 is -1.0 in half
def : Pat<(v8f16 (sint_to_fp (v8i1 VCCR:$pred))),
(v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2748)), (v8f16 (MVE_VMOVimmi16 0)), ARMVCCNone, VCCR:$pred))>;
def : Pat<(v4i1 (fp_to_uint (v4f32 MQPR:$v1))),
(v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, ARMCCne))>;
def : Pat<(v8i1 (fp_to_uint (v8f16 MQPR:$v1))),
(v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, ARMCCne))>;
def : Pat<(v4i1 (fp_to_sint (v4f32 MQPR:$v1))),
(v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, ARMCCne))>;
def : Pat<(v8i1 (fp_to_sint (v8f16 MQPR:$v1))),
(v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, ARMCCne))>;
}
def MVE_VPNOT : MVE_p<(outs VCCR:$P0), (ins VCCR:$P0_in), NoItinerary,
"vpnot", "", "", vpred_n, "", []> {
let Inst{31-0} = 0b11111110001100010000111101001101;
let Unpredictable{19-17} = 0b111;
let Unpredictable{12} = 0b1;
let Unpredictable{7} = 0b1;
let Unpredictable{5} = 0b1;
let Constraints = "";
let DecoderMethod = "DecodeMVEVPNOT";
}
let Predicates = [HasMVEInt] in {
def : Pat<(v4i1 (xor (v4i1 VCCR:$pred), (v4i1 (predicate_cast (i32 65535))))),
(v4i1 (MVE_VPNOT (v4i1 VCCR:$pred)))>;
def : Pat<(v8i1 (xor (v8i1 VCCR:$pred), (v8i1 (predicate_cast (i32 65535))))),
(v8i1 (MVE_VPNOT (v8i1 VCCR:$pred)))>;
def : Pat<(v16i1 (xor (v16i1 VCCR:$pred), (v16i1 (predicate_cast (i32 65535))))),
(v16i1 (MVE_VPNOT (v16i1 VCCR:$pred)))>;
}
class MVE_loltp_start<dag iops, string asm, string ops, bits<2> size>
: t2LOL<(outs GPRlr:$LR), iops, asm, ops> {
bits<4> Rn;
let Predicates = [HasMVEInt];
let Inst{22} = 0b0;
let Inst{21-20} = size;
let Inst{19-16} = Rn{3-0};
let Inst{12} = 0b0;
}
class MVE_DLSTP<string asm, bits<2> size>
: MVE_loltp_start<(ins rGPR:$Rn), asm, "$LR, $Rn", size> {
let Inst{13} = 0b1;
let Inst{11-1} = 0b00000000000;
let Unpredictable{10-1} = 0b1111111111;
}
class MVE_WLSTP<string asm, bits<2> size>
: MVE_loltp_start<(ins rGPR:$Rn, wlslabel_u11:$label),
asm, "$LR, $Rn, $label", size> {
bits<11> label;
let Inst{13} = 0b0;
let Inst{11} = label{0};
let Inst{10-1} = label{10-1};
let isBranch = 1;
let isTerminator = 1;
}
def MVE_DLSTP_8 : MVE_DLSTP<"dlstp.8", 0b00>;
def MVE_DLSTP_16 : MVE_DLSTP<"dlstp.16", 0b01>;
def MVE_DLSTP_32 : MVE_DLSTP<"dlstp.32", 0b10>;
def MVE_DLSTP_64 : MVE_DLSTP<"dlstp.64", 0b11>;
def MVE_WLSTP_8 : MVE_WLSTP<"wlstp.8", 0b00>;
def MVE_WLSTP_16 : MVE_WLSTP<"wlstp.16", 0b01>;
def MVE_WLSTP_32 : MVE_WLSTP<"wlstp.32", 0b10>;
def MVE_WLSTP_64 : MVE_WLSTP<"wlstp.64", 0b11>;
class MVE_loltp_end<dag oops, dag iops, string asm, string ops>
: t2LOL<oops, iops, asm, ops> {
let Predicates = [HasMVEInt];
let Inst{22-21} = 0b00;
let Inst{19-16} = 0b1111;
let Inst{12} = 0b0;
}
def MVE_LETP : MVE_loltp_end<(outs GPRlr:$LRout),
(ins GPRlr:$LRin, lelabel_u11:$label),
"letp", "$LRin, $label"> {
bits<11> label;
let Inst{20} = 0b1;
let Inst{13} = 0b0;
let Inst{11} = label{0};
let Inst{10-1} = label{10-1};
let isBranch = 1;
let isTerminator = 1;
}
def MVE_LCTP : MVE_loltp_end<(outs), (ins pred:$p), "lctp${p}", ""> {
let Inst{20} = 0b0;
let Inst{13} = 0b1;
let Inst{11-1} = 0b00000000000;
let Unpredictable{21-20} = 0b11;
let Unpredictable{11-1} = 0b11111111111;
}
//===----------------------------------------------------------------------===//
// Patterns
//===----------------------------------------------------------------------===//
// PatFrags for loads and stores. Often trying to keep semi-consistent names.
def aligned32_pre_store : PatFrag<(ops node:$val, node:$ptr, node:$offset),
(pre_store node:$val, node:$ptr, node:$offset), [{
return cast<StoreSDNode>(N)->getAlignment() >= 4;
}]>;
def aligned32_post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset),
(post_store node:$val, node:$ptr, node:$offset), [{
return cast<StoreSDNode>(N)->getAlignment() >= 4;
}]>;
def aligned16_pre_store : PatFrag<(ops node:$val, node:$ptr, node:$offset),
(pre_store node:$val, node:$ptr, node:$offset), [{
return cast<StoreSDNode>(N)->getAlignment() >= 2;
}]>;
def aligned16_post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset),
(post_store node:$val, node:$ptr, node:$offset), [{
return cast<StoreSDNode>(N)->getAlignment() >= 2;
}]>;
def aligned_maskedloadvi8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
(masked_ld node:$ptr, undef, node:$pred, node:$passthru), [{
auto *Ld = cast<MaskedLoadSDNode>(N);
return Ld->getMemoryVT().getScalarType() == MVT::i8;
}]>;
def aligned_sextmaskedloadvi8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
(aligned_maskedloadvi8 node:$ptr, node:$pred, node:$passthru), [{
return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
}]>;
def aligned_zextmaskedloadvi8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
(aligned_maskedloadvi8 node:$ptr, node:$pred, node:$passthru), [{
return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD;
}]>;
def aligned_extmaskedloadvi8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
(aligned_maskedloadvi8 node:$ptr, node:$pred, node:$passthru), [{
auto *Ld = cast<MaskedLoadSDNode>(N);
EVT ScalarVT = Ld->getMemoryVT().getScalarType();
return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD;
}]>;
def aligned_maskedloadvi16: PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
(masked_ld node:$ptr, undef, node:$pred, node:$passthru), [{
auto *Ld = cast<MaskedLoadSDNode>(N);
EVT ScalarVT = Ld->getMemoryVT().getScalarType();
return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && Ld->getAlignment() >= 2;
}]>;
def aligned_sextmaskedloadvi16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
(aligned_maskedloadvi16 node:$ptr, node:$pred, node:$passthru), [{
return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
}]>;
def aligned_zextmaskedloadvi16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
(aligned_maskedloadvi16 node:$ptr, node:$pred, node:$passthru), [{
return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD;
}]>;
def aligned_extmaskedloadvi16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
(aligned_maskedloadvi16 node:$ptr, node:$pred, node:$passthru), [{
auto *Ld = cast<MaskedLoadSDNode>(N);
EVT ScalarVT = Ld->getMemoryVT().getScalarType();
return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD;
}]>;
def aligned_maskedloadvi32: PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
(masked_ld node:$ptr, undef, node:$pred, node:$passthru), [{
auto *Ld = cast<MaskedLoadSDNode>(N);
EVT ScalarVT = Ld->getMemoryVT().getScalarType();
return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && Ld->getAlignment() >= 4;
}]>;
def aligned_maskedstvi8 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
(masked_st node:$val, node:$ptr, undef, node:$pred), [{
return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
}]>;
def aligned_maskedstvi16 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
(masked_st node:$val, node:$ptr, undef, node:$pred), [{
auto *St = cast<MaskedStoreSDNode>(N);
EVT ScalarVT = St->getMemoryVT().getScalarType();
return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2;
}]>;
def aligned_maskedstvi32 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
(masked_st node:$val, node:$ptr, undef, node:$pred), [{
auto *St = cast<MaskedStoreSDNode>(N);
EVT ScalarVT = St->getMemoryVT().getScalarType();
return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4;
}]>;
def pre_maskedstore : PatFrag<(ops node:$val, node:$base, node:$offset, node:$mask),
(masked_st node:$val, node:$base, node:$offset, node:$mask), [{
ISD::MemIndexedMode AM = cast<MaskedStoreSDNode>(N)->getAddressingMode();
return AM == ISD::PRE_INC || AM == ISD::PRE_DEC;
}]>;
def post_maskedstore : PatFrag<(ops node:$val, node:$base, node:$offset, node:$mask),
(masked_st node:$val, node:$base, node:$offset, node:$mask), [{
ISD::MemIndexedMode AM = cast<MaskedStoreSDNode>(N)->getAddressingMode();
return AM == ISD::POST_INC || AM == ISD::POST_DEC;
}]>;
def aligned_pre_maskedstorevi8 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask),
(pre_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{
return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
}]>;
def aligned_post_maskedstorevi8 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask),
(post_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{
return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
}]>;
def aligned_pre_maskedstorevi16 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask),
(pre_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{
auto *St = cast<MaskedStoreSDNode>(N);
EVT ScalarVT = St->getMemoryVT().getScalarType();
return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2;
}]>;
def aligned_post_maskedstorevi16 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask),
(post_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{
auto *St = cast<MaskedStoreSDNode>(N);
EVT ScalarVT = St->getMemoryVT().getScalarType();
return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2;
}]>;
def aligned_pre_maskedstorevi32 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask),
(pre_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{
auto *St = cast<MaskedStoreSDNode>(N);
EVT ScalarVT = St->getMemoryVT().getScalarType();
return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4;
}]>;
def aligned_post_maskedstorevi32 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask),
(post_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{
auto *St = cast<MaskedStoreSDNode>(N);
EVT ScalarVT = St->getMemoryVT().getScalarType();
return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4;
}]>;
// PatFrags for "Aligned" extending / truncating
def aligned_extloadvi8 : PatFrag<(ops node:$ptr), (extloadvi8 node:$ptr)>;
def aligned_sextloadvi8 : PatFrag<(ops node:$ptr), (sextloadvi8 node:$ptr)>;
def aligned_zextloadvi8 : PatFrag<(ops node:$ptr), (zextloadvi8 node:$ptr)>;
def aligned_truncstvi8 : PatFrag<(ops node:$val, node:$ptr),
(truncstorevi8 node:$val, node:$ptr)>;
def aligned_post_truncstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset),
(post_truncstvi8 node:$val, node:$base, node:$offset)>;
def aligned_pre_truncstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset),
(pre_truncstvi8 node:$val, node:$base, node:$offset)>;
let MinAlignment = 2 in {
def aligned_extloadvi16 : PatFrag<(ops node:$ptr), (extloadvi16 node:$ptr)>;
def aligned_sextloadvi16 : PatFrag<(ops node:$ptr), (sextloadvi16 node:$ptr)>;
def aligned_zextloadvi16 : PatFrag<(ops node:$ptr), (zextloadvi16 node:$ptr)>;
def aligned_truncstvi16 : PatFrag<(ops node:$val, node:$ptr),
(truncstorevi16 node:$val, node:$ptr)>;
def aligned_post_truncstvi16 : PatFrag<(ops node:$val, node:$base, node:$offset),
(post_truncstvi16 node:$val, node:$base, node:$offset)>;
def aligned_pre_truncstvi16 : PatFrag<(ops node:$val, node:$base, node:$offset),
(pre_truncstvi16 node:$val, node:$base, node:$offset)>;
}
def truncmaskedst : PatFrag<(ops node:$val, node:$base, node:$pred),
(masked_st node:$val, node:$base, undef, node:$pred), [{
return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
}]>;
def aligned_truncmaskedstvi8 : PatFrag<(ops node:$val, node:$base, node:$pred),
(truncmaskedst node:$val, node:$base, node:$pred), [{
return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
}]>;
def aligned_truncmaskedstvi16 : PatFrag<(ops node:$val, node:$base, node:$pred),
(truncmaskedst node:$val, node:$base, node:$pred), [{
auto *St = cast<MaskedStoreSDNode>(N);
EVT ScalarVT = St->getMemoryVT().getScalarType();
return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2;
}]>;
def pre_truncmaskedst : PatFrag<(ops node:$val, node:$base, node:$offset, node:$pred),
(masked_st node:$val, node:$base, node:$offset, node:$pred), [{
ISD::MemIndexedMode AM = cast<MaskedStoreSDNode>(N)->getAddressingMode();
return cast<MaskedStoreSDNode>(N)->isTruncatingStore() && (AM == ISD::PRE_INC || AM == ISD::PRE_DEC);
}]>;
def aligned_pre_truncmaskedstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$pred),
(pre_truncmaskedst node:$val, node:$base, node:$offset, node:$pred), [{
return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
}]>;
def aligned_pre_truncmaskedstvi16 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$pred),
(pre_truncmaskedst node:$val, node:$base, node:$offset, node:$pred), [{
auto *St = cast<MaskedStoreSDNode>(N);
EVT ScalarVT = St->getMemoryVT().getScalarType();
return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2;
}]>;
def post_truncmaskedst : PatFrag<(ops node:$val, node:$base, node:$offset, node:$postd),
(masked_st node:$val, node:$base, node:$offset, node:$postd), [{
ISD::MemIndexedMode AM = cast<MaskedStoreSDNode>(N)->getAddressingMode();
return cast<MaskedStoreSDNode>(N)->isTruncatingStore() && (AM == ISD::POST_INC || AM == ISD::POST_DEC);
}]>;
def aligned_post_truncmaskedstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$postd),
(post_truncmaskedst node:$val, node:$base, node:$offset, node:$postd), [{
return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
}]>;
def aligned_post_truncmaskedstvi16 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$postd),
(post_truncmaskedst node:$val, node:$base, node:$offset, node:$postd), [{
auto *St = cast<MaskedStoreSDNode>(N);
EVT ScalarVT = St->getMemoryVT().getScalarType();
return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2;
}]>;
// Load/store patterns
class MVE_vector_store_typed<ValueType Ty, Instruction RegImmInst,
PatFrag StoreKind, int shift>
: Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr),
(RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr)>;
class MVE_vector_maskedstore_typed<ValueType Ty, Instruction RegImmInst,
PatFrag StoreKind, int shift>
: Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, VCCR:$pred),
(RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, ARMVCCThen, VCCR:$pred)>;
multiclass MVE_vector_store<Instruction RegImmInst, PatFrag StoreKind,
int shift> {
def : MVE_vector_store_typed<v16i8, RegImmInst, StoreKind, shift>;
def : MVE_vector_store_typed<v8i16, RegImmInst, StoreKind, shift>;
def : MVE_vector_store_typed<v8f16, RegImmInst, StoreKind, shift>;
def : MVE_vector_store_typed<v4i32, RegImmInst, StoreKind, shift>;
def : MVE_vector_store_typed<v4f32, RegImmInst, StoreKind, shift>;
def : MVE_vector_store_typed<v2i64, RegImmInst, StoreKind, shift>;
def : MVE_vector_store_typed<v2f64, RegImmInst, StoreKind, shift>;
}
class MVE_vector_load_typed<ValueType Ty, Instruction RegImmInst,
PatFrag LoadKind, int shift>
: Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr)),
(Ty (RegImmInst t2addrmode_imm7<shift>:$addr))>;
class MVE_vector_maskedload_typed<ValueType Ty, Instruction RegImmInst,
PatFrag LoadKind, int shift>
: Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr, VCCR:$pred, (Ty NEONimmAllZerosV))),
(Ty (RegImmInst t2addrmode_imm7<shift>:$addr, ARMVCCThen, VCCR:$pred))>;
multiclass MVE_vector_load<Instruction RegImmInst, PatFrag LoadKind,
int shift> {
def : MVE_vector_load_typed<v16i8, RegImmInst, LoadKind, shift>;
def : MVE_vector_load_typed<v8i16, RegImmInst, LoadKind, shift>;
def : MVE_vector_load_typed<v8f16, RegImmInst, LoadKind, shift>;
def : MVE_vector_load_typed<v4i32, RegImmInst, LoadKind, shift>;
def : MVE_vector_load_typed<v4f32, RegImmInst, LoadKind, shift>;
def : MVE_vector_load_typed<v2i64, RegImmInst, LoadKind, shift>;
def : MVE_vector_load_typed<v2f64, RegImmInst, LoadKind, shift>;
}
class MVE_vector_offset_store_typed<ValueType Ty, Instruction Opcode,
PatFrag StoreKind, int shift>
: Pat<(StoreKind (Ty MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<shift>:$addr),
(Opcode MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<shift>:$addr)>;
class MVE_vector_offset_maskedstore_typed<ValueType Ty, Instruction Opcode,
PatFrag StoreKind, int shift>
: Pat<(StoreKind (Ty MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<shift>:$addr, VCCR:$pred),
(Opcode MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<shift>:$addr, ARMVCCThen, VCCR:$pred)>;
multiclass MVE_vector_offset_store<Instruction RegImmInst, PatFrag StoreKind,
int shift> {
def : MVE_vector_offset_store_typed<v16i8, RegImmInst, StoreKind, shift>;
def : MVE_vector_offset_store_typed<v8i16, RegImmInst, StoreKind, shift>;
def : MVE_vector_offset_store_typed<v8f16, RegImmInst, StoreKind, shift>;
def : MVE_vector_offset_store_typed<v4i32, RegImmInst, StoreKind, shift>;
def : MVE_vector_offset_store_typed<v4f32, RegImmInst, StoreKind, shift>;
def : MVE_vector_offset_store_typed<v2i64, RegImmInst, StoreKind, shift>;
def : MVE_vector_offset_store_typed<v2f64, RegImmInst, StoreKind, shift>;
}
let Predicates = [HasMVEInt, IsLE] in {
// Stores
defm : MVE_vector_store<MVE_VSTRBU8, byte_alignedstore, 0>;
defm : MVE_vector_store<MVE_VSTRHU16, hword_alignedstore, 1>;
defm : MVE_vector_store<MVE_VSTRWU32, alignedstore32, 2>;
// Loads
defm : MVE_vector_load<MVE_VLDRBU8, byte_alignedload, 0>;
defm : MVE_vector_load<MVE_VLDRHU16, hword_alignedload, 1>;
defm : MVE_vector_load<MVE_VLDRWU32, alignedload32, 2>;
// Pre/post inc stores
defm : MVE_vector_offset_store<MVE_VSTRBU8_pre, pre_store, 0>;
defm : MVE_vector_offset_store<MVE_VSTRBU8_post, post_store, 0>;
defm : MVE_vector_offset_store<MVE_VSTRHU16_pre, aligned16_pre_store, 1>;
defm : MVE_vector_offset_store<MVE_VSTRHU16_post, aligned16_post_store, 1>;
defm : MVE_vector_offset_store<MVE_VSTRWU32_pre, aligned32_pre_store, 2>;
defm : MVE_vector_offset_store<MVE_VSTRWU32_post, aligned32_post_store, 2>;
}
let Predicates = [HasMVEInt, IsBE] in {
// Aligned Stores
def : MVE_vector_store_typed<v16i8, MVE_VSTRBU8, store, 0>;
def : MVE_vector_store_typed<v8i16, MVE_VSTRHU16, alignedstore16, 1>;
def : MVE_vector_store_typed<v8f16, MVE_VSTRHU16, alignedstore16, 1>;
def : MVE_vector_store_typed<v4i32, MVE_VSTRWU32, alignedstore32, 2>;
def : MVE_vector_store_typed<v4f32, MVE_VSTRWU32, alignedstore32, 2>;
// Aligned Loads
def : MVE_vector_load_typed<v16i8, MVE_VLDRBU8, load, 0>;
def : MVE_vector_load_typed<v8i16, MVE_VLDRHU16, alignedload16, 1>;
def : MVE_vector_load_typed<v8f16, MVE_VLDRHU16, alignedload16, 1>;
def : MVE_vector_load_typed<v4i32, MVE_VLDRWU32, alignedload32, 2>;
def : MVE_vector_load_typed<v4f32, MVE_VLDRWU32, alignedload32, 2>;
// Other unaligned loads/stores need to go though a VREV
def : Pat<(v2f64 (load t2addrmode_imm7<0>:$addr)),
(v2f64 (MVE_VREV64_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
def : Pat<(v2i64 (load t2addrmode_imm7<0>:$addr)),
(v2i64 (MVE_VREV64_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
def : Pat<(v4i32 (load t2addrmode_imm7<0>:$addr)),
(v4i32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
def : Pat<(v4f32 (load t2addrmode_imm7<0>:$addr)),
(v4f32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
def : Pat<(v8i16 (load t2addrmode_imm7<0>:$addr)),
(v8i16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
def : Pat<(v8f16 (load t2addrmode_imm7<0>:$addr)),
(v8f16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
def : Pat<(store (v2f64 MQPR:$val), t2addrmode_imm7<0>:$addr),
(MVE_VSTRBU8 (MVE_VREV64_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
def : Pat<(store (v2i64 MQPR:$val), t2addrmode_imm7<0>:$addr),
(MVE_VSTRBU8 (MVE_VREV64_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
def : Pat<(store (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr),
(MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
def : Pat<(store (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr),
(MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
def : Pat<(store (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr),
(MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
def : Pat<(store (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr),
(MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
// Pre/Post inc stores
def : MVE_vector_offset_store_typed<v16i8, MVE_VSTRBU8_pre, pre_store, 0>;
def : MVE_vector_offset_store_typed<v16i8, MVE_VSTRBU8_post, post_store, 0>;
def : MVE_vector_offset_store_typed<v8i16, MVE_VSTRHU16_pre, aligned16_pre_store, 1>;
def : MVE_vector_offset_store_typed<v8i16, MVE_VSTRHU16_post, aligned16_post_store, 1>;
def : MVE_vector_offset_store_typed<v8f16, MVE_VSTRHU16_pre, aligned16_pre_store, 1>;
def : MVE_vector_offset_store_typed<v8f16, MVE_VSTRHU16_post, aligned16_post_store, 1>;
def : MVE_vector_offset_store_typed<v4i32, MVE_VSTRWU32_pre, aligned32_pre_store, 2>;
def : MVE_vector_offset_store_typed<v4i32, MVE_VSTRWU32_post, aligned32_post_store, 2>;
def : MVE_vector_offset_store_typed<v4f32, MVE_VSTRWU32_pre, aligned32_pre_store, 2>;
def : MVE_vector_offset_store_typed<v4f32, MVE_VSTRWU32_post, aligned32_post_store, 2>;
}
let Predicates = [HasMVEInt] in {
// Aligned masked store, shared between LE and BE
def : MVE_vector_maskedstore_typed<v16i8, MVE_VSTRBU8, aligned_maskedstvi8, 0>;
def : MVE_vector_maskedstore_typed<v8i16, MVE_VSTRHU16, aligned_maskedstvi16, 1>;
def : MVE_vector_maskedstore_typed<v8f16, MVE_VSTRHU16, aligned_maskedstvi16, 1>;
def : MVE_vector_maskedstore_typed<v4i32, MVE_VSTRWU32, aligned_maskedstvi32, 2>;
def : MVE_vector_maskedstore_typed<v4f32, MVE_VSTRWU32, aligned_maskedstvi32, 2>;
// Pre/Post inc masked stores
def : MVE_vector_offset_maskedstore_typed<v16i8, MVE_VSTRBU8_pre, aligned_pre_maskedstorevi8, 0>;
def : MVE_vector_offset_maskedstore_typed<v16i8, MVE_VSTRBU8_post, aligned_post_maskedstorevi8, 0>;
def : MVE_vector_offset_maskedstore_typed<v8i16, MVE_VSTRHU16_pre, aligned_pre_maskedstorevi16, 1>;
def : MVE_vector_offset_maskedstore_typed<v8i16, MVE_VSTRHU16_post, aligned_post_maskedstorevi16, 1>;
def : MVE_vector_offset_maskedstore_typed<v8f16, MVE_VSTRHU16_pre, aligned_pre_maskedstorevi16, 1>;
def : MVE_vector_offset_maskedstore_typed<v8f16, MVE_VSTRHU16_post, aligned_post_maskedstorevi16, 1>;
def : MVE_vector_offset_maskedstore_typed<v4i32, MVE_VSTRWU32_pre, aligned_pre_maskedstorevi32, 2>;
def : MVE_vector_offset_maskedstore_typed<v4i32, MVE_VSTRWU32_post, aligned_post_maskedstorevi32, 2>;
def : MVE_vector_offset_maskedstore_typed<v4f32, MVE_VSTRWU32_pre, aligned_pre_maskedstorevi32, 2>;
def : MVE_vector_offset_maskedstore_typed<v4f32, MVE_VSTRWU32_post, aligned_post_maskedstorevi32, 2>;
// Aligned masked loads
def : MVE_vector_maskedload_typed<v16i8, MVE_VLDRBU8, aligned_maskedloadvi8, 0>;
def : MVE_vector_maskedload_typed<v8i16, MVE_VLDRHU16, aligned_maskedloadvi16, 1>;
def : MVE_vector_maskedload_typed<v8f16, MVE_VLDRHU16, aligned_maskedloadvi16, 1>;
def : MVE_vector_maskedload_typed<v4i32, MVE_VLDRWU32, aligned_maskedloadvi32, 2>;
def : MVE_vector_maskedload_typed<v4f32, MVE_VLDRWU32, aligned_maskedloadvi32, 2>;
}
// Widening/Narrowing Loads/Stores
multiclass MVEExtLoadStore<Instruction LoadSInst, Instruction LoadUInst, string StoreInst,
string Amble, ValueType VT, int Shift> {
// Trunc stores
def : Pat<(!cast<PatFrag>("aligned_truncst"#Amble) (VT MQPR:$val), taddrmode_imm7<Shift>:$addr),
(!cast<Instruction>(StoreInst) MQPR:$val, taddrmode_imm7<Shift>:$addr)>;
def : Pat<(!cast<PatFrag>("aligned_post_truncst"#Amble) (VT MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<Shift>:$addr),
(!cast<Instruction>(StoreInst#"_post") MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<Shift>:$addr)>;
def : Pat<(!cast<PatFrag>("aligned_pre_truncst"#Amble) (VT MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<Shift>:$addr),
(!cast<Instruction>(StoreInst#"_pre") MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<Shift>:$addr)>;
// Masked trunc stores
def : Pat<(!cast<PatFrag>("aligned_truncmaskedst"#Amble) (VT MQPR:$val), taddrmode_imm7<Shift>:$addr, VCCR:$pred),
(!cast<Instruction>(StoreInst) MQPR:$val, taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred)>;
def : Pat<(!cast<PatFrag>("aligned_post_truncmaskedst"#Amble) (VT MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<Shift>:$addr, VCCR:$pred),
(!cast<Instruction>(StoreInst#"_post") MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<Shift>:$addr, ARMVCCThen, VCCR:$pred)>;
def : Pat<(!cast<PatFrag>("aligned_pre_truncmaskedst"#Amble) (VT MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<Shift>:$addr, VCCR:$pred),
(!cast<Instruction>(StoreInst#"_pre") MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<Shift>:$addr, ARMVCCThen, VCCR:$pred)>;
// Ext loads
def : Pat<(VT (!cast<PatFrag>("aligned_extload"#Amble) taddrmode_imm7<Shift>:$addr)),
(VT (LoadUInst taddrmode_imm7<Shift>:$addr))>;
def : Pat<(VT (!cast<PatFrag>("aligned_sextload"#Amble) taddrmode_imm7<Shift>:$addr)),
(VT (LoadSInst taddrmode_imm7<Shift>:$addr))>;
def : Pat<(VT (!cast<PatFrag>("aligned_zextload"#Amble) taddrmode_imm7<Shift>:$addr)),
(VT (LoadUInst taddrmode_imm7<Shift>:$addr))>;
// Masked ext loads
def : Pat<(VT (!cast<PatFrag>("aligned_extmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT NEONimmAllZerosV))),
(VT (LoadUInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred))>;
def : Pat<(VT (!cast<PatFrag>("aligned_sextmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT NEONimmAllZerosV))),
(VT (LoadSInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred))>;
def : Pat<(VT (!cast<PatFrag>("aligned_zextmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT NEONimmAllZerosV))),
(VT (LoadUInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred))>;
}
let Predicates = [HasMVEInt] in {
defm : MVEExtLoadStore<MVE_VLDRBS16, MVE_VLDRBU16, "MVE_VSTRB16", "vi8", v8i16, 0>;
defm : MVEExtLoadStore<MVE_VLDRBS32, MVE_VLDRBU32, "MVE_VSTRB32", "vi8", v4i32, 0>;
defm : MVEExtLoadStore<MVE_VLDRHS32, MVE_VLDRHU32, "MVE_VSTRH32", "vi16", v4i32, 1>;
}
// Bit convert patterns
let Predicates = [HasMVEInt] in {
def : Pat<(v2f64 (bitconvert (v2i64 MQPR:$src))), (v2f64 MQPR:$src)>;
def : Pat<(v2i64 (bitconvert (v2f64 MQPR:$src))), (v2i64 MQPR:$src)>;
def : Pat<(v4i32 (bitconvert (v4f32 MQPR:$src))), (v4i32 MQPR:$src)>;
def : Pat<(v4f32 (bitconvert (v4i32 MQPR:$src))), (v4f32 MQPR:$src)>;
def : Pat<(v8i16 (bitconvert (v8f16 MQPR:$src))), (v8i16 MQPR:$src)>;
def : Pat<(v8f16 (bitconvert (v8i16 MQPR:$src))), (v8f16 MQPR:$src)>;
}
let Predicates = [IsLE,HasMVEInt] in {
def : Pat<(v2f64 (bitconvert (v4f32 MQPR:$src))), (v2f64 MQPR:$src)>;
def : Pat<(v2f64 (bitconvert (v4i32 MQPR:$src))), (v2f64 MQPR:$src)>;
def : Pat<(v2f64 (bitconvert (v8f16 MQPR:$src))), (v2f64 MQPR:$src)>;
def : Pat<(v2f64 (bitconvert (v8i16 MQPR:$src))), (v2f64 MQPR:$src)>;
def : Pat<(v2f64 (bitconvert (v16i8 MQPR:$src))), (v2f64 MQPR:$src)>;
def : Pat<(v2i64 (bitconvert (v4f32 MQPR:$src))), (v2i64 MQPR:$src)>;
def : Pat<(v2i64 (bitconvert (v4i32 MQPR:$src))), (v2i64 MQPR:$src)>;
def : Pat<(v2i64 (bitconvert (v8f16 MQPR:$src))), (v2i64 MQPR:$src)>;
def : Pat<(v2i64 (bitconvert (v8i16 MQPR:$src))), (v2i64 MQPR:$src)>;
def : Pat<(v2i64 (bitconvert (v16i8 MQPR:$src))), (v2i64 MQPR:$src)>;
def : Pat<(v4f32 (bitconvert (v2f64 MQPR:$src))), (v4f32 MQPR:$src)>;
def : Pat<(v4f32 (bitconvert (v2i64 MQPR:$src))), (v4f32 MQPR:$src)>;
def : Pat<(v4f32 (bitconvert (v8f16 MQPR:$src))), (v4f32 MQPR:$src)>;
def : Pat<(v4f32 (bitconvert (v8i16 MQPR:$src))), (v4f32 MQPR:$src)>;
def : Pat<(v4f32 (bitconvert (v16i8 MQPR:$src))), (v4f32 MQPR:$src)>;
def : Pat<(v4i32 (bitconvert (v2f64 MQPR:$src))), (v4i32 MQPR:$src)>;
def : Pat<(v4i32 (bitconvert (v2i64 MQPR:$src))), (v4i32 MQPR:$src)>;
def : Pat<(v4i32 (bitconvert (v8f16 MQPR:$src))), (v4i32 MQPR:$src)>;
def : Pat<(v4i32 (bitconvert (v8i16 MQPR:$src))), (v4i32 MQPR:$src)>;
def : Pat<(v4i32 (bitconvert (v16i8 MQPR:$src))), (v4i32 MQPR:$src)>;
def : Pat<(v8f16 (bitconvert (v2f64 MQPR:$src))), (v8f16 MQPR:$src)>;
def : Pat<(v8f16 (bitconvert (v2i64 MQPR:$src))), (v8f16 MQPR:$src)>;
def : Pat<(v8f16 (bitconvert (v4f32 MQPR:$src))), (v8f16 MQPR:$src)>;
def : Pat<(v8f16 (bitconvert (v4i32 MQPR:$src))), (v8f16 MQPR:$src)>;
def : Pat<(v8f16 (bitconvert (v16i8 MQPR:$src))), (v8f16 MQPR:$src)>;
def : Pat<(v8i16 (bitconvert (v2f64 MQPR:$src))), (v8i16 MQPR:$src)>;
def : Pat<(v8i16 (bitconvert (v2i64 MQPR:$src))), (v8i16 MQPR:$src)>;
def : Pat<(v8i16 (bitconvert (v4f32 MQPR:$src))), (v8i16 MQPR:$src)>;
def : Pat<(v8i16 (bitconvert (v4i32 MQPR:$src))), (v8i16 MQPR:$src)>;
def : Pat<(v8i16 (bitconvert (v16i8 MQPR:$src))), (v8i16 MQPR:$src)>;
def : Pat<(v16i8 (bitconvert (v2f64 MQPR:$src))), (v16i8 MQPR:$src)>;
def : Pat<(v16i8 (bitconvert (v2i64 MQPR:$src))), (v16i8 MQPR:$src)>;
def : Pat<(v16i8 (bitconvert (v4f32 MQPR:$src))), (v16i8 MQPR:$src)>;
def : Pat<(v16i8 (bitconvert (v4i32 MQPR:$src))), (v16i8 MQPR:$src)>;
def : Pat<(v16i8 (bitconvert (v8f16 MQPR:$src))), (v16i8 MQPR:$src)>;
def : Pat<(v16i8 (bitconvert (v8i16 MQPR:$src))), (v16i8 MQPR:$src)>;
}
let Predicates = [IsBE,HasMVEInt] in {
def : Pat<(v2f64 (bitconvert (v4f32 MQPR:$src))), (v2f64 (MVE_VREV64_32 MQPR:$src))>;
def : Pat<(v2f64 (bitconvert (v4i32 MQPR:$src))), (v2f64 (MVE_VREV64_32 MQPR:$src))>;
def : Pat<(v2f64 (bitconvert (v8f16 MQPR:$src))), (v2f64 (MVE_VREV64_16 MQPR:$src))>;
def : Pat<(v2f64 (bitconvert (v8i16 MQPR:$src))), (v2f64 (MVE_VREV64_16 MQPR:$src))>;
def : Pat<(v2f64 (bitconvert (v16i8 MQPR:$src))), (v2f64 (MVE_VREV64_8 MQPR:$src))>;
def : Pat<(v2i64 (bitconvert (v4f32 MQPR:$src))), (v2i64 (MVE_VREV64_32 MQPR:$src))>;
def : Pat<(v2i64 (bitconvert (v4i32 MQPR:$src))), (v2i64 (MVE_VREV64_32 MQPR:$src))>;
def : Pat<(v2i64 (bitconvert (v8f16 MQPR:$src))), (v2i64 (MVE_VREV64_16 MQPR:$src))>;
def : Pat<(v2i64 (bitconvert (v8i16 MQPR:$src))), (v2i64 (MVE_VREV64_16 MQPR:$src))>;
def : Pat<(v2i64 (bitconvert (v16i8 MQPR:$src))), (v2i64 (MVE_VREV64_8 MQPR:$src))>;
def : Pat<(v4f32 (bitconvert (v2f64 MQPR:$src))), (v4f32 (MVE_VREV64_32 MQPR:$src))>;
def : Pat<(v4f32 (bitconvert (v2i64 MQPR:$src))), (v4f32 (MVE_VREV64_32 MQPR:$src))>;
def : Pat<(v4f32 (bitconvert (v8f16 MQPR:$src))), (v4f32 (MVE_VREV32_16 MQPR:$src))>;
def : Pat<(v4f32 (bitconvert (v8i16 MQPR:$src))), (v4f32 (MVE_VREV32_16 MQPR:$src))>;
def : Pat<(v4f32 (bitconvert (v16i8 MQPR:$src))), (v4f32 (MVE_VREV32_8 MQPR:$src))>;
def : Pat<(v4i32 (bitconvert (v2f64 MQPR:$src))), (v4i32 (MVE_VREV64_32 MQPR:$src))>;
def : Pat<(v4i32 (bitconvert (v2i64 MQPR:$src))), (v4i32 (MVE_VREV64_32 MQPR:$src))>;
def : Pat<(v4i32 (bitconvert (v8f16 MQPR:$src))), (v4i32 (MVE_VREV32_16 MQPR:$src))>;
def : Pat<(v4i32 (bitconvert (v8i16 MQPR:$src))), (v4i32 (MVE_VREV32_16 MQPR:$src))>;
def : Pat<(v4i32 (bitconvert (v16i8 MQPR:$src))), (v4i32 (MVE_VREV32_8 MQPR:$src))>;
def : Pat<(v8f16 (bitconvert (v2f64 MQPR:$src))), (v8f16 (MVE_VREV64_16 MQPR:$src))>;
def : Pat<(v8f16 (bitconvert (v2i64 MQPR:$src))), (v8f16 (MVE_VREV64_16 MQPR:$src))>;
def : Pat<(v8f16 (bitconvert (v4f32 MQPR:$src))), (v8f16 (MVE_VREV32_16 MQPR:$src))>;
def : Pat<(v8f16 (bitconvert (v4i32 MQPR:$src))), (v8f16 (MVE_VREV32_16 MQPR:$src))>;
def : Pat<(v8f16 (bitconvert (v16i8 MQPR:$src))), (v8f16 (MVE_VREV16_8 MQPR:$src))>;
def : Pat<(v8i16 (bitconvert (v2f64 MQPR:$src))), (v8i16 (MVE_VREV64_16 MQPR:$src))>;
def : Pat<(v8i16 (bitconvert (v2i64 MQPR:$src))), (v8i16 (MVE_VREV64_16 MQPR:$src))>;
def : Pat<(v8i16 (bitconvert (v4f32 MQPR:$src))), (v8i16 (MVE_VREV32_16 MQPR:$src))>;
def : Pat<(v8i16 (bitconvert (v4i32 MQPR:$src))), (v8i16 (MVE_VREV32_16 MQPR:$src))>;
def : Pat<(v8i16 (bitconvert (v16i8 MQPR:$src))), (v8i16 (MVE_VREV16_8 MQPR:$src))>;
def : Pat<(v16i8 (bitconvert (v2f64 MQPR:$src))), (v16i8 (MVE_VREV64_8 MQPR:$src))>;
def : Pat<(v16i8 (bitconvert (v2i64 MQPR:$src))), (v16i8 (MVE_VREV64_8 MQPR:$src))>;
def : Pat<(v16i8 (bitconvert (v4f32 MQPR:$src))), (v16i8 (MVE_VREV32_8 MQPR:$src))>;
def : Pat<(v16i8 (bitconvert (v4i32 MQPR:$src))), (v16i8 (MVE_VREV32_8 MQPR:$src))>;
def : Pat<(v16i8 (bitconvert (v8f16 MQPR:$src))), (v16i8 (MVE_VREV16_8 MQPR:$src))>;
def : Pat<(v16i8 (bitconvert (v8i16 MQPR:$src))), (v16i8 (MVE_VREV16_8 MQPR:$src))>;
}