From 3ec3fcb97a6b5a42d89032d44d81bbe711d188a4 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Tue, 14 Jul 2020 13:50:21 +0100 Subject: [PATCH] [CodeGen] In narrowExtractedVectorLoad bail out for scalable vectors In narrowExtractedVectorLoad there is an optimisation that tries to combine extract_subvector with a narrowing vector load. At the moment this produces warnings due to the incorrect calls to getVectorNumElements() for scalable vector types. I've got this working for scalable vectors too when the extract subvector index is a multiple of the minimum number of elements. I have added a new variant of the function: MachineFunction::getMachineMemOperand that copies an existing MachineMemOperand, but replaces the pointer info with a null version since we cannot currently represent scaled offsets. I've added a new test for this particular case in: CodeGen/AArch64/sve-extract-subvector.ll Differential Revision: https://reviews.llvm.org/D83950 --- llvm/include/llvm/CodeGen/MachineFunction.h | 8 ++++ llvm/lib/CodeGen/MachineFunction.cpp | 7 ++++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 37 ++++++++++--------- .../CodeGen/AArch64/sve-extract-subvector.ll | 12 ++++++ 4 files changed, 47 insertions(+), 17 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h index 809c21dd26fc..3f8568facd73 100644 --- a/llvm/include/llvm/CodeGen/MachineFunction.h +++ b/llvm/include/llvm/CodeGen/MachineFunction.h @@ -815,6 +815,14 @@ public: MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO, int64_t Offset, uint64_t Size); + /// getMachineMemOperand - Allocate a new MachineMemOperand by copying + /// an existing one, replacing only the MachinePointerInfo and size. + /// MachineMemOperands are owned by the MachineFunction and need not be + /// explicitly deallocated. + MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO, + MachinePointerInfo &PtrInfo, + uint64_t Size); + /// Allocate a new MachineMemOperand by copying an existing one, /// replacing only AliasAnalysis information. MachineMemOperands are owned /// by the MachineFunction and need not be explicitly deallocated. diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index 6d45f08804ed..464f71a4fd53 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -474,6 +474,13 @@ MachineMemOperand *MachineFunction::getMachineMemOperand( SSID, Ordering, FailureOrdering); } +MachineMemOperand *MachineFunction::getMachineMemOperand( + const MachineMemOperand *MMO, MachinePointerInfo &PtrInfo, uint64_t Size) { + return new (Allocator) MachineMemOperand( + PtrInfo, MMO->getFlags(), Size, Alignment, AAMDNodes(), nullptr, + MMO->getSyncScopeID(), MMO->getOrdering(), MMO->getFailureOrdering()); +} + MachineMemOperand * MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO, int64_t Offset, uint64_t Size) { diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 5c5e121a0883..9d3a5fec5713 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -19338,19 +19338,15 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) { return SDValue(); unsigned Index = ExtIdx->getZExtValue(); - unsigned NumElts = VT.getVectorNumElements(); + unsigned NumElts = VT.getVectorMinNumElements(); - // If the index is a multiple of the extract element count, we can offset the - // address by the store size multiplied by the subvector index. Otherwise if - // the scalar type is byte sized, we can just use the index multiplied by - // the element size in bytes as the offset. - unsigned Offset; - if (Index % NumElts == 0) - Offset = (Index / NumElts) * VT.getStoreSize(); - else if (VT.getScalarType().isByteSized()) - Offset = Index * VT.getScalarType().getStoreSize(); - else - return SDValue(); + // The definition of EXTRACT_SUBVECTOR states that the index must be a + // multiple of the minimum number of elements in the result type. + assert(Index % NumElts == 0 && "The extract subvector index is not a " + "multiple of the result's element count"); + + // It's fine to use TypeSize here as we know the offset will not be negative. + TypeSize Offset = VT.getStoreSize() * (Index / NumElts); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.shouldReduceLoadWidth(Ld, Ld->getExtensionType(), VT)) @@ -19359,14 +19355,21 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) { // The narrow load will be offset from the base address of the old load if // we are extracting from something besides index 0 (little-endian). SDLoc DL(Extract); - SDValue BaseAddr = Ld->getBasePtr(); // TODO: Use "BaseIndexOffset" to make this more effective. - SDValue NewAddr = - DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL); + SDValue NewAddr = DAG.getMemBasePlusOffset(Ld->getBasePtr(), Offset, DL); + + uint64_t StoreSize = MemoryLocation::getSizeOrUnknown(VT.getStoreSize()); MachineFunction &MF = DAG.getMachineFunction(); - MachineMemOperand *MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset, - VT.getStoreSize()); + MachineMemOperand *MMO; + if (Offset.isScalable()) { + MachinePointerInfo MPI = + MachinePointerInfo(Ld->getPointerInfo().getAddrSpace()); + MMO = MF.getMachineMemOperand(Ld->getMemOperand(), MPI, StoreSize); + } else + MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset.getFixedSize(), + StoreSize); + SDValue NewLd = DAG.getLoad(VT, DL, Ld->getChain(), NewAddr, MMO); DAG.makeEquivalentMemoryOrdering(Ld, NewLd); return NewLd; diff --git a/llvm/test/CodeGen/AArch64/sve-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-extract-subvector.ll index 5b0bdfc37b1a..9bcaaadd83bd 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-subvector.ll @@ -64,7 +64,19 @@ define @extract_hi_nxv2f32_nxv4f32( %z0 ret %ext } +define @load_extract_nxv4f32_nxv8f32(* %p) { +; CHECK-LABEL: load_extract_nxv4f32_nxv8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ret + %tmp1 = load , * %p, align 16 + %tmp2 = call @llvm.aarch64.sve.tuple.get.nxv8f32( %tmp1, i32 1) + ret %tmp2 +} + declare @llvm.aarch64.sve.tuple.get.nxv4i64(, i32) declare @llvm.aarch64.sve.tuple.get.nxv32i8(, i32) declare @llvm.aarch64.sve.tuple.get.nxv4f32(, i32) declare @llvm.aarch64.sve.tuple.get.nxv8f16(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv8f32(, i32)