[RISCV] Transform unaligned RVV vector loads/stores to aligned ones

This patch adds support for loading and storing unaligned vectors via an
equivalently-sized i8 vector type, which the RVV specification allows to be
accessed with only byte alignment.
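
Concretely, the new lowering re-expresses the access through an equally-sized
i8 vector and bitcasts back to the requested type. A minimal sketch of the
type mapping the code below relies on (the helper name here is illustrative
only, not part of the patch):

  // Map a vector type to the equally-sized byte vector used for the
  // unaligned access, e.g. v4i32 -> v16i8 and nxv2i32 -> nxv8i8.
  static MVT getEquivalentByteVT(MVT VT) {
    unsigned EltBytes = VT.getScalarSizeInBits() / 8;
    return MVT::getVectorVT(MVT::i8, VT.getVectorElementCount() * EltBytes);
  }

The bitcast back to the original element type only reinterprets the same
register contents, so it adds no extra instructions.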

This offers a more efficient path for handling unaligned fixed-length
vector accesses, which are currently scalarized. It also prevents
`LegalizeDAG` from crashing when it sees an unaligned scalable-vector
load/store operation.

Future work could investigate loading/storing via the largest vector element
type supported by the given alignment, in case that is more optimal on
hardware. For instance, a 4-byte-aligned nxv2i64 vector load could be loaded
as nxv4i32 instead of nxv16i8.
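
A rough sketch of that idea, assuming a hypothetical helper along these lines
(neither the name nor the logic below is part of this patch):

  // Pick the widest element type the access's alignment allows, rather than
  // always falling back to i8. A 4-byte-aligned nxv2i64 load would then use
  // nxv4i32, while a byte-aligned one would still use nxv16i8.
  static MVT getLargestAlignedVT(MVT VT, Align Alignment) {
    unsigned EltBits =
        std::min<uint64_t>(VT.getScalarSizeInBits(), Alignment.value() * 8);
    unsigned Scale = VT.getScalarSizeInBits() / EltBits;
    return MVT::getVectorVT(MVT::getIntegerVT(EltBits),
                            VT.getVectorElementCount() * Scale);
  }

Whether wider-element accesses actually beat the i8 fallback is
hardware-dependent, which is why the patch keeps the simple i8 approach for
now.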

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D104032
Fraser Cormack 2021-06-09 15:17:21 +01:00
parent c58cf692f4
commit c75e454cb9
4 changed files with 295 additions and 289 deletions


@@ -523,6 +523,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       for (unsigned VPOpc : IntegerVPOps)
         setOperationAction(VPOpc, VT, Custom);
 
+      setOperationAction(ISD::LOAD, VT, Custom);
+      setOperationAction(ISD::STORE, VT, Custom);
+
       setOperationAction(ISD::MLOAD, VT, Custom);
       setOperationAction(ISD::MSTORE, VT, Custom);
       setOperationAction(ISD::MGATHER, VT, Custom);
@@ -584,6 +587,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
       setOperationAction(ISD::FCOPYSIGN, VT, Legal);
 
+      setOperationAction(ISD::LOAD, VT, Custom);
+      setOperationAction(ISD::STORE, VT, Custom);
+
       setOperationAction(ISD::MLOAD, VT, Custom);
       setOperationAction(ISD::MSTORE, VT, Custom);
       setOperationAction(ISD::MGATHER, VT, Custom);
@@ -1891,6 +1897,66 @@ static SDValue getRVVFPExtendOrRound(SDValue Op, MVT VT, MVT ContainerVT,
   return DAG.getNode(RVVOpc, DL, ContainerVT, Op, Mask, VL);
 }
 
+// While RVV has alignment restrictions, we should always be able to load as a
+// legal equivalently-sized byte-typed vector instead. This method is
+// responsible for re-expressing a ISD::LOAD via a correctly-aligned type. If
+// the load is already correctly-aligned, it returns SDValue().
+SDValue RISCVTargetLowering::expandUnalignedRVVLoad(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  auto *Load = cast<LoadSDNode>(Op);
+  assert(Load && Load->getMemoryVT().isVector() && "Expected vector load");
+
+  if (allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+                                     Load->getMemoryVT(),
+                                     *Load->getMemOperand()))
+    return SDValue();
+
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+  unsigned EltSizeBits = VT.getScalarSizeInBits();
+  assert((EltSizeBits == 16 || EltSizeBits == 32 || EltSizeBits == 64) &&
+         "Unexpected unaligned RVV load type");
+  MVT NewVT =
+      MVT::getVectorVT(MVT::i8, VT.getVectorElementCount() * (EltSizeBits / 8));
+  assert(NewVT.isValid() &&
+         "Expecting equally-sized RVV vector types to be legal");
+  SDValue L = DAG.getLoad(NewVT, DL, Load->getChain(), Load->getBasePtr(),
+                          Load->getPointerInfo(), Load->getOriginalAlign(),
+                          Load->getMemOperand()->getFlags());
+  return DAG.getMergeValues({DAG.getBitcast(VT, L), L.getValue(1)}, DL);
+}
+
+// While RVV has alignment restrictions, we should always be able to store as a
+// legal equivalently-sized byte-typed vector instead. This method is
+// responsible for re-expressing a ISD::STORE via a correctly-aligned type. It
+// returns SDValue() if the store is already correctly aligned.
+SDValue RISCVTargetLowering::expandUnalignedRVVStore(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  auto *Store = cast<StoreSDNode>(Op);
+  assert(Store && Store->getValue().getValueType().isVector() &&
+         "Expected vector store");
+
+  if (allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+                                     Store->getMemoryVT(),
+                                     *Store->getMemOperand()))
+    return SDValue();
+
+  SDLoc DL(Op);
+  SDValue StoredVal = Store->getValue();
+  MVT VT = StoredVal.getSimpleValueType();
+  unsigned EltSizeBits = VT.getScalarSizeInBits();
+  assert((EltSizeBits == 16 || EltSizeBits == 32 || EltSizeBits == 64) &&
+         "Unexpected unaligned RVV store type");
+  MVT NewVT =
+      MVT::getVectorVT(MVT::i8, VT.getVectorElementCount() * (EltSizeBits / 8));
+  assert(NewVT.isValid() &&
+         "Expecting equally-sized RVV vector types to be legal");
+  StoredVal = DAG.getBitcast(NewVT, StoredVal);
+  return DAG.getStore(Store->getChain(), DL, StoredVal, Store->getBasePtr(),
+                      Store->getPointerInfo(), Store->getOriginalAlign(),
+                      Store->getMemOperand()->getFlags());
+}
+
 SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
@@ -2310,9 +2376,17 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     return Vec;
   }
   case ISD::LOAD:
-    return lowerFixedLengthVectorLoadToRVV(Op, DAG);
+    if (auto V = expandUnalignedRVVLoad(Op, DAG))
+      return V;
+    if (Op.getValueType().isFixedLengthVector())
+      return lowerFixedLengthVectorLoadToRVV(Op, DAG);
+    return Op;
   case ISD::STORE:
-    return lowerFixedLengthVectorStoreToRVV(Op, DAG);
+    if (auto V = expandUnalignedRVVStore(Op, DAG))
+      return V;
+    if (Op.getOperand(1).getValueType().isFixedLengthVector())
+      return lowerFixedLengthVectorStoreToRVV(Op, DAG);
+    return Op;
   case ISD::MLOAD:
     return lowerMLOAD(Op, DAG);
   case ISD::MSTORE:
@@ -4031,13 +4105,10 @@ RISCVTargetLowering::lowerFixedLengthVectorLoadToRVV(SDValue Op,
   SDLoc DL(Op);
   auto *Load = cast<LoadSDNode>(Op);
 
-  if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
-                                      Load->getMemoryVT(),
-                                      *Load->getMemOperand())) {
-    SDValue Result, Chain;
-    std::tie(Result, Chain) = expandUnalignedLoad(Load, DAG);
-    return DAG.getMergeValues({Result, Chain}, DL);
-  }
+  assert(allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+                                        Load->getMemoryVT(),
+                                        *Load->getMemOperand()) &&
+         "Expecting a correctly-aligned load");
 
   MVT VT = Op.getSimpleValueType();
   MVT ContainerVT = getContainerForFixedLengthVector(VT);
@@ -4060,10 +4131,10 @@ RISCVTargetLowering::lowerFixedLengthVectorStoreToRVV(SDValue Op,
   SDLoc DL(Op);
   auto *Store = cast<StoreSDNode>(Op);
 
-  if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
-                                      Store->getMemoryVT(),
-                                      *Store->getMemOperand()))
-    return expandUnalignedStore(Store, DAG);
+  assert(allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+                                        Store->getMemoryVT(),
+                                        *Store->getMemOperand()) &&
+         "Expecting a correctly-aligned store");
 
   SDValue StoreVal = Store->getValue();
   MVT VT = StoreVal.getSimpleValueType();


@@ -562,6 +562,9 @@ private:
   SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue expandUnalignedRVVLoad(SDValue Op, SelectionDAG &DAG) const;
+  SDValue expandUnalignedRVVStore(SDValue Op, SelectionDAG &DAG) const;
+
   bool isEligibleForTailCallOptimization(
       CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
       const SmallVector<CCValAssign, 16> &ArgLocs) const;


@@ -7,112 +7,14 @@
 define <4 x i32> @load_v4i32_align1(<4 x i32>* %ptr) {
 ; RV32-LABEL: load_v4i32_align1:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    lbu a1, 13(a0)
-; RV32-NEXT:    lbu a2, 12(a0)
-; RV32-NEXT:    lbu a3, 15(a0)
-; RV32-NEXT:    lbu a4, 14(a0)
-; RV32-NEXT:    slli a1, a1, 8
-; RV32-NEXT:    or a1, a1, a2
-; RV32-NEXT:    slli a2, a3, 8
-; RV32-NEXT:    or a2, a2, a4
-; RV32-NEXT:    slli a2, a2, 16
-; RV32-NEXT:    or a1, a2, a1
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    lbu a1, 9(a0)
-; RV32-NEXT:    lbu a2, 8(a0)
-; RV32-NEXT:    lbu a3, 11(a0)
-; RV32-NEXT:    lbu a4, 10(a0)
-; RV32-NEXT:    slli a1, a1, 8
-; RV32-NEXT:    or a1, a1, a2
-; RV32-NEXT:    slli a2, a3, 8
-; RV32-NEXT:    or a2, a2, a4
-; RV32-NEXT:    slli a2, a2, 16
-; RV32-NEXT:    or a1, a2, a1
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lbu a1, 5(a0)
-; RV32-NEXT:    lbu a2, 4(a0)
-; RV32-NEXT:    lbu a3, 7(a0)
-; RV32-NEXT:    lbu a4, 6(a0)
-; RV32-NEXT:    slli a1, a1, 8
-; RV32-NEXT:    or a1, a1, a2
-; RV32-NEXT:    slli a2, a3, 8
-; RV32-NEXT:    or a2, a2, a4
-; RV32-NEXT:    slli a2, a2, 16
-; RV32-NEXT:    or a1, a2, a1
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    lbu a1, 1(a0)
-; RV32-NEXT:    lbu a2, 0(a0)
-; RV32-NEXT:    lbu a3, 3(a0)
-; RV32-NEXT:    lbu a0, 2(a0)
-; RV32-NEXT:    slli a1, a1, 8
-; RV32-NEXT:    or a1, a1, a2
-; RV32-NEXT:    slli a2, a3, 8
-; RV32-NEXT:    or a0, a2, a0
-; RV32-NEXT:    slli a0, a0, 16
-; RV32-NEXT:    or a0, a0, a1
-; RV32-NEXT:    sw a0, 16(sp)
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
+; RV32-NEXT:    vle8.v v8, (a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: load_v4i32_align1:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -32
-; RV64-NEXT:    .cfi_def_cfa_offset 32
-; RV64-NEXT:    lbu a1, 9(a0)
-; RV64-NEXT:    lbu a2, 8(a0)
-; RV64-NEXT:    lbu a3, 11(a0)
-; RV64-NEXT:    lbu a4, 10(a0)
-; RV64-NEXT:    slli a1, a1, 8
-; RV64-NEXT:    or a1, a1, a2
-; RV64-NEXT:    slli a2, a3, 8
-; RV64-NEXT:    or a2, a2, a4
-; RV64-NEXT:    slli a2, a2, 16
-; RV64-NEXT:    or a1, a2, a1
-; RV64-NEXT:    lbu a2, 13(a0)
-; RV64-NEXT:    lbu a3, 12(a0)
-; RV64-NEXT:    lbu a4, 15(a0)
-; RV64-NEXT:    lbu a5, 14(a0)
-; RV64-NEXT:    slli a2, a2, 8
-; RV64-NEXT:    or a2, a2, a3
-; RV64-NEXT:    slli a3, a4, 8
-; RV64-NEXT:    or a3, a3, a5
-; RV64-NEXT:    slli a3, a3, 16
-; RV64-NEXT:    or a2, a3, a2
-; RV64-NEXT:    slli a2, a2, 32
-; RV64-NEXT:    or a1, a2, a1
-; RV64-NEXT:    sd a1, 24(sp)
-; RV64-NEXT:    lbu a1, 1(a0)
-; RV64-NEXT:    lbu a2, 0(a0)
-; RV64-NEXT:    lbu a3, 3(a0)
-; RV64-NEXT:    lbu a4, 2(a0)
-; RV64-NEXT:    slli a1, a1, 8
-; RV64-NEXT:    or a1, a1, a2
-; RV64-NEXT:    slli a2, a3, 8
-; RV64-NEXT:    or a2, a2, a4
-; RV64-NEXT:    slli a2, a2, 16
-; RV64-NEXT:    or a1, a2, a1
-; RV64-NEXT:    lbu a2, 5(a0)
-; RV64-NEXT:    lbu a3, 4(a0)
-; RV64-NEXT:    lbu a4, 7(a0)
-; RV64-NEXT:    lbu a0, 6(a0)
-; RV64-NEXT:    slli a2, a2, 8
-; RV64-NEXT:    or a2, a2, a3
-; RV64-NEXT:    slli a3, a4, 8
-; RV64-NEXT:    or a0, a3, a0
-; RV64-NEXT:    slli a0, a0, 16
-; RV64-NEXT:    or a0, a0, a2
-; RV64-NEXT:    slli a0, a0, 32
-; RV64-NEXT:    or a0, a0, a1
-; RV64-NEXT:    sd a0, 16(sp)
-; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    addi sp, sp, 32
+; RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
+; RV64-NEXT:    vle8.v v8, (a0)
 ; RV64-NEXT:    ret
   %z = load <4 x i32>, <4 x i32>* %ptr, align 1
   ret <4 x i32> %z
@@ -121,64 +23,14 @@ define <4 x i32> @load_v4i32_align1(<4 x i32>* %ptr) {
 define <4 x i32> @load_v4i32_align2(<4 x i32>* %ptr) {
 ; RV32-LABEL: load_v4i32_align2:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    lhu a1, 14(a0)
-; RV32-NEXT:    lhu a2, 12(a0)
-; RV32-NEXT:    slli a1, a1, 16
-; RV32-NEXT:    or a1, a1, a2
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    lhu a1, 10(a0)
-; RV32-NEXT:    lhu a2, 8(a0)
-; RV32-NEXT:    slli a1, a1, 16
-; RV32-NEXT:    or a1, a1, a2
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    lhu a1, 6(a0)
-; RV32-NEXT:    lhu a2, 4(a0)
-; RV32-NEXT:    slli a1, a1, 16
-; RV32-NEXT:    or a1, a1, a2
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    lhu a1, 2(a0)
-; RV32-NEXT:    lhu a0, 0(a0)
-; RV32-NEXT:    slli a1, a1, 16
-; RV32-NEXT:    or a0, a1, a0
-; RV32-NEXT:    sw a0, 16(sp)
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
+; RV32-NEXT:    vle8.v v8, (a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: load_v4i32_align2:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -32
-; RV64-NEXT:    .cfi_def_cfa_offset 32
-; RV64-NEXT:    lhu a1, 10(a0)
-; RV64-NEXT:    lhu a2, 8(a0)
-; RV64-NEXT:    lhu a3, 14(a0)
-; RV64-NEXT:    lhu a4, 12(a0)
-; RV64-NEXT:    slli a1, a1, 16
-; RV64-NEXT:    or a1, a1, a2
-; RV64-NEXT:    slli a2, a3, 16
-; RV64-NEXT:    or a2, a2, a4
-; RV64-NEXT:    slli a2, a2, 32
-; RV64-NEXT:    or a1, a2, a1
-; RV64-NEXT:    sd a1, 24(sp)
-; RV64-NEXT:    lhu a1, 2(a0)
-; RV64-NEXT:    lhu a2, 0(a0)
-; RV64-NEXT:    lhu a3, 6(a0)
-; RV64-NEXT:    lhu a0, 4(a0)
-; RV64-NEXT:    slli a1, a1, 16
-; RV64-NEXT:    or a1, a1, a2
-; RV64-NEXT:    slli a2, a3, 16
-; RV64-NEXT:    or a0, a2, a0
-; RV64-NEXT:    slli a0, a0, 32
-; RV64-NEXT:    or a0, a0, a1
-; RV64-NEXT:    sd a0, 16(sp)
-; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    addi sp, sp, 32
+; RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
+; RV64-NEXT:    vle8.v v8, (a0)
 ; RV64-NEXT:    ret
   %z = load <4 x i32>, <4 x i32>* %ptr, align 2
   ret <4 x i32> %z
@@ -187,86 +39,14 @@ define <4 x i32> @load_v4i32_align2(<4 x i32>* %ptr) {
 define void @store_v4i32_align1(<4 x i32> %x, <4 x i32>* %ptr) {
 ; RV32-LABEL: store_v4i32_align1:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vse32.v v8, (a1)
-; RV32-NEXT:    lw a1, 28(sp)
-; RV32-NEXT:    sb a1, 12(a0)
-; RV32-NEXT:    lw a2, 24(sp)
-; RV32-NEXT:    sb a2, 8(a0)
-; RV32-NEXT:    lw a3, 20(sp)
-; RV32-NEXT:    sb a3, 4(a0)
-; RV32-NEXT:    lw a4, 16(sp)
-; RV32-NEXT:    sb a4, 0(a0)
-; RV32-NEXT:    srli a5, a1, 24
-; RV32-NEXT:    sb a5, 15(a0)
-; RV32-NEXT:    srli a5, a1, 16
-; RV32-NEXT:    sb a5, 14(a0)
-; RV32-NEXT:    srli a1, a1, 8
-; RV32-NEXT:    sb a1, 13(a0)
-; RV32-NEXT:    srli a1, a2, 24
-; RV32-NEXT:    sb a1, 11(a0)
-; RV32-NEXT:    srli a1, a2, 16
-; RV32-NEXT:    sb a1, 10(a0)
-; RV32-NEXT:    srli a1, a2, 8
-; RV32-NEXT:    sb a1, 9(a0)
-; RV32-NEXT:    srli a1, a3, 24
-; RV32-NEXT:    sb a1, 7(a0)
-; RV32-NEXT:    srli a1, a3, 16
-; RV32-NEXT:    sb a1, 6(a0)
-; RV32-NEXT:    srli a1, a3, 8
-; RV32-NEXT:    sb a1, 5(a0)
-; RV32-NEXT:    srli a1, a4, 24
-; RV32-NEXT:    sb a1, 3(a0)
-; RV32-NEXT:    srli a1, a4, 16
-; RV32-NEXT:    sb a1, 2(a0)
-; RV32-NEXT:    srli a1, a4, 8
-; RV32-NEXT:    sb a1, 1(a0)
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
+; RV32-NEXT:    vse8.v v8, (a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: store_v4i32_align1:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -32
-; RV64-NEXT:    .cfi_def_cfa_offset 32
-; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vse32.v v8, (a1)
-; RV64-NEXT:    ld a1, 24(sp)
-; RV64-NEXT:    sb a1, 8(a0)
-; RV64-NEXT:    ld a2, 16(sp)
-; RV64-NEXT:    sb a2, 0(a0)
-; RV64-NEXT:    srli a3, a1, 56
-; RV64-NEXT:    sb a3, 15(a0)
-; RV64-NEXT:    srli a3, a1, 48
-; RV64-NEXT:    sb a3, 14(a0)
-; RV64-NEXT:    srli a3, a1, 40
-; RV64-NEXT:    sb a3, 13(a0)
-; RV64-NEXT:    srli a3, a1, 32
-; RV64-NEXT:    sb a3, 12(a0)
-; RV64-NEXT:    srli a3, a1, 24
-; RV64-NEXT:    sb a3, 11(a0)
-; RV64-NEXT:    srli a3, a1, 16
-; RV64-NEXT:    sb a3, 10(a0)
-; RV64-NEXT:    srli a1, a1, 8
-; RV64-NEXT:    sb a1, 9(a0)
-; RV64-NEXT:    srli a1, a2, 40
-; RV64-NEXT:    sb a1, 5(a0)
-; RV64-NEXT:    srli a1, a2, 32
-; RV64-NEXT:    sb a1, 4(a0)
-; RV64-NEXT:    srli a1, a2, 56
-; RV64-NEXT:    sb a1, 7(a0)
-; RV64-NEXT:    srli a1, a2, 48
-; RV64-NEXT:    sb a1, 6(a0)
-; RV64-NEXT:    srli a1, a2, 24
-; RV64-NEXT:    sb a1, 3(a0)
-; RV64-NEXT:    srli a1, a2, 16
-; RV64-NEXT:    sb a1, 2(a0)
-; RV64-NEXT:    srli a1, a2, 8
-; RV64-NEXT:    sb a1, 1(a0)
-; RV64-NEXT:    addi sp, sp, 32
+; RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
+; RV64-NEXT:    vse8.v v8, (a0)
 ; RV64-NEXT:    ret
   store <4 x i32> %x, <4 x i32>* %ptr, align 1
   ret void
@@ -275,54 +55,14 @@ define void @store_v4i32_align1(<4 x i32> %x, <4 x i32>* %ptr) {
 define void @store_v4i32_align2(<4 x i32> %x, <4 x i32>* %ptr) {
 ; RV32-LABEL: store_v4i32_align2:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -32
-; RV32-NEXT:    .cfi_def_cfa_offset 32
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vse32.v v8, (a1)
-; RV32-NEXT:    lw a1, 28(sp)
-; RV32-NEXT:    sh a1, 12(a0)
-; RV32-NEXT:    lw a2, 24(sp)
-; RV32-NEXT:    sh a2, 8(a0)
-; RV32-NEXT:    lw a3, 20(sp)
-; RV32-NEXT:    sh a3, 4(a0)
-; RV32-NEXT:    lw a4, 16(sp)
-; RV32-NEXT:    sh a4, 0(a0)
-; RV32-NEXT:    srli a1, a1, 16
-; RV32-NEXT:    sh a1, 14(a0)
-; RV32-NEXT:    srli a1, a2, 16
-; RV32-NEXT:    sh a1, 10(a0)
-; RV32-NEXT:    srli a1, a3, 16
-; RV32-NEXT:    sh a1, 6(a0)
-; RV32-NEXT:    srli a1, a4, 16
-; RV32-NEXT:    sh a1, 2(a0)
-; RV32-NEXT:    addi sp, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
+; RV32-NEXT:    vse8.v v8, (a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: store_v4i32_align2:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -32
-; RV64-NEXT:    .cfi_def_cfa_offset 32
-; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vse32.v v8, (a1)
-; RV64-NEXT:    ld a1, 24(sp)
-; RV64-NEXT:    sh a1, 8(a0)
-; RV64-NEXT:    ld a2, 16(sp)
-; RV64-NEXT:    sh a2, 0(a0)
-; RV64-NEXT:    srli a3, a1, 48
-; RV64-NEXT:    sh a3, 14(a0)
-; RV64-NEXT:    srli a3, a1, 32
-; RV64-NEXT:    sh a3, 12(a0)
-; RV64-NEXT:    srli a1, a1, 16
-; RV64-NEXT:    sh a1, 10(a0)
-; RV64-NEXT:    srli a1, a2, 48
-; RV64-NEXT:    sh a1, 6(a0)
-; RV64-NEXT:    srli a1, a2, 32
-; RV64-NEXT:    sh a1, 4(a0)
-; RV64-NEXT:    srli a1, a2, 16
-; RV64-NEXT:    sh a1, 2(a0)
-; RV64-NEXT:    addi sp, sp, 32
+; RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
+; RV64-NEXT:    vse8.v v8, (a0)
 ; RV64-NEXT:    ret
   store <4 x i32> %x, <4 x i32>* %ptr, align 2
   ret void


@@ -0,0 +1,192 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple riscv32 -mattr=+d,+experimental-zfh,+experimental-v < %s \
; RUN: -verify-machineinstrs | FileCheck %s
; RUN: llc -mtriple riscv64 -mattr=+d,+experimental-zfh,+experimental-v < %s \
; RUN: -verify-machineinstrs | FileCheck %s
define <vscale x 1 x i32> @unaligned_load_nxv1i32_a1(<vscale x 1 x i32>* %ptr) {
; CHECK-LABEL: unaligned_load_nxv1i32_a1:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, mu
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: ret
%v = load <vscale x 1 x i32>, <vscale x 1 x i32>* %ptr, align 1
ret <vscale x 1 x i32> %v
}
define <vscale x 1 x i32> @unaligned_load_nxv1i32_a2(<vscale x 1 x i32>* %ptr) {
; CHECK-LABEL: unaligned_load_nxv1i32_a2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, mu
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: ret
%v = load <vscale x 1 x i32>, <vscale x 1 x i32>* %ptr, align 2
ret <vscale x 1 x i32> %v
}
define <vscale x 1 x i32> @aligned_load_nxv1i32_a4(<vscale x 1 x i32>* %ptr) {
; CHECK-LABEL: aligned_load_nxv1i32_a4:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, mu
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: ret
%v = load <vscale x 1 x i32>, <vscale x 1 x i32>* %ptr, align 4
ret <vscale x 1 x i32> %v
}
define <vscale x 1 x i64> @unaligned_load_nxv1i64_a1(<vscale x 1 x i64>* %ptr) {
; CHECK-LABEL: unaligned_load_nxv1i64_a1:
; CHECK: # %bb.0:
; CHECK-NEXT: vl1r.v v8, (a0)
; CHECK-NEXT: ret
%v = load <vscale x 1 x i64>, <vscale x 1 x i64>* %ptr, align 1
ret <vscale x 1 x i64> %v
}
define <vscale x 1 x i64> @unaligned_load_nxv1i64_a4(<vscale x 1 x i64>* %ptr) {
; CHECK-LABEL: unaligned_load_nxv1i64_a4:
; CHECK: # %bb.0:
; CHECK-NEXT: vl1r.v v8, (a0)
; CHECK-NEXT: ret
%v = load <vscale x 1 x i64>, <vscale x 1 x i64>* %ptr, align 4
ret <vscale x 1 x i64> %v
}
define <vscale x 1 x i64> @aligned_load_nxv1i64_a8(<vscale x 1 x i64>* %ptr) {
; CHECK-LABEL: aligned_load_nxv1i64_a8:
; CHECK: # %bb.0:
; CHECK-NEXT: vl1re64.v v8, (a0)
; CHECK-NEXT: ret
%v = load <vscale x 1 x i64>, <vscale x 1 x i64>* %ptr, align 8
ret <vscale x 1 x i64> %v
}
define <vscale x 2 x i64> @unaligned_load_nxv2i64_a1(<vscale x 2 x i64>* %ptr) {
; CHECK-LABEL: unaligned_load_nxv2i64_a1:
; CHECK: # %bb.0:
; CHECK-NEXT: vl2r.v v8, (a0)
; CHECK-NEXT: ret
%v = load <vscale x 2 x i64>, <vscale x 2 x i64>* %ptr, align 1
ret <vscale x 2 x i64> %v
}
define <vscale x 2 x i64> @unaligned_load_nxv2i64_a4(<vscale x 2 x i64>* %ptr) {
; CHECK-LABEL: unaligned_load_nxv2i64_a4:
; CHECK: # %bb.0:
; CHECK-NEXT: vl2r.v v8, (a0)
; CHECK-NEXT: ret
%v = load <vscale x 2 x i64>, <vscale x 2 x i64>* %ptr, align 4
ret <vscale x 2 x i64> %v
}
define <vscale x 2 x i64> @aligned_load_nxv2i64_a8(<vscale x 2 x i64>* %ptr) {
; CHECK-LABEL: aligned_load_nxv2i64_a8:
; CHECK: # %bb.0:
; CHECK-NEXT: vl2re64.v v8, (a0)
; CHECK-NEXT: ret
%v = load <vscale x 2 x i64>, <vscale x 2 x i64>* %ptr, align 8
ret <vscale x 2 x i64> %v
}
; Masks should always be aligned
define <vscale x 1 x i1> @unaligned_load_nxv1i1_a1(<vscale x 1 x i1>* %ptr) {
; CHECK-LABEL: unaligned_load_nxv1i1_a1:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, mu
; CHECK-NEXT: vle1.v v0, (a0)
; CHECK-NEXT: ret
%v = load <vscale x 1 x i1>, <vscale x 1 x i1>* %ptr, align 1
ret <vscale x 1 x i1> %v
}
define <vscale x 4 x float> @unaligned_load_nxv4f32_a1(<vscale x 4 x float>* %ptr) {
; CHECK-LABEL: unaligned_load_nxv4f32_a1:
; CHECK: # %bb.0:
; CHECK-NEXT: vl2r.v v8, (a0)
; CHECK-NEXT: ret
%v = load <vscale x 4 x float>, <vscale x 4 x float>* %ptr, align 1
ret <vscale x 4 x float> %v
}
define <vscale x 4 x float> @unaligned_load_nxv4f32_a2(<vscale x 4 x float>* %ptr) {
; CHECK-LABEL: unaligned_load_nxv4f32_a2:
; CHECK: # %bb.0:
; CHECK-NEXT: vl2r.v v8, (a0)
; CHECK-NEXT: ret
%v = load <vscale x 4 x float>, <vscale x 4 x float>* %ptr, align 2
ret <vscale x 4 x float> %v
}
define <vscale x 4 x float> @aligned_load_nxv4f32_a4(<vscale x 4 x float>* %ptr) {
; CHECK-LABEL: aligned_load_nxv4f32_a4:
; CHECK: # %bb.0:
; CHECK-NEXT: vl2re32.v v8, (a0)
; CHECK-NEXT: ret
%v = load <vscale x 4 x float>, <vscale x 4 x float>* %ptr, align 4
ret <vscale x 4 x float> %v
}
define <vscale x 8 x half> @unaligned_load_nxv8f16_a1(<vscale x 8 x half>* %ptr) {
; CHECK-LABEL: unaligned_load_nxv8f16_a1:
; CHECK: # %bb.0:
; CHECK-NEXT: vl2r.v v8, (a0)
; CHECK-NEXT: ret
%v = load <vscale x 8 x half>, <vscale x 8 x half>* %ptr, align 1
ret <vscale x 8 x half> %v
}
define <vscale x 8 x half> @aligned_load_nxv8f16_a2(<vscale x 8 x half>* %ptr) {
; CHECK-LABEL: aligned_load_nxv8f16_a2:
; CHECK: # %bb.0:
; CHECK-NEXT: vl2re16.v v8, (a0)
; CHECK-NEXT: ret
%v = load <vscale x 8 x half>, <vscale x 8 x half>* %ptr, align 2
ret <vscale x 8 x half> %v
}
define void @unaligned_store_nxv4i32_a1(<vscale x 4 x i32> %x, <vscale x 4 x i32>* %ptr) {
; CHECK-LABEL: unaligned_store_nxv4i32_a1:
; CHECK: # %bb.0:
; CHECK-NEXT: vs2r.v v8, (a0)
; CHECK-NEXT: ret
store <vscale x 4 x i32> %x, <vscale x 4 x i32>* %ptr, align 1
ret void
}
define void @unaligned_store_nxv4i32_a2(<vscale x 4 x i32> %x, <vscale x 4 x i32>* %ptr) {
; CHECK-LABEL: unaligned_store_nxv4i32_a2:
; CHECK: # %bb.0:
; CHECK-NEXT: vs2r.v v8, (a0)
; CHECK-NEXT: ret
store <vscale x 4 x i32> %x, <vscale x 4 x i32>* %ptr, align 2
ret void
}
define void @aligned_store_nxv4i32_a4(<vscale x 4 x i32> %x, <vscale x 4 x i32>* %ptr) {
; CHECK-LABEL: aligned_store_nxv4i32_a4:
; CHECK: # %bb.0:
; CHECK-NEXT: vs2r.v v8, (a0)
; CHECK-NEXT: ret
store <vscale x 4 x i32> %x, <vscale x 4 x i32>* %ptr, align 4
ret void
}
define void @unaligned_store_nxv1i16_a1(<vscale x 1 x i16> %x, <vscale x 1 x i16>* %ptr) {
; CHECK-LABEL: unaligned_store_nxv1i16_a1:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, mu
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: ret
store <vscale x 1 x i16> %x, <vscale x 1 x i16>* %ptr, align 1
ret void
}
define void @aligned_store_nxv1i16_a2(<vscale x 1 x i16> %x, <vscale x 1 x i16>* %ptr) {
; CHECK-LABEL: aligned_store_nxv1i16_a2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, mu
; CHECK-NEXT: vse16.v v8, (a0)
; CHECK-NEXT: ret
store <vscale x 1 x i16> %x, <vscale x 1 x i16>* %ptr, align 2
ret void
}