forked from OSchip/llvm-project
Revert "Revert "[GlobalISel] Add legalization support for non-power-2 loads and stores""
We were shifting the wrong component of a split load when trying to combine the components back into a single value. llvm-svn: 358800
This commit is contained in:
parent
d5c69e0836
commit
eac69e9377
|
@ -639,6 +639,10 @@ public:
|
|||
return actionIf(LegalizeAction::Unsupported,
|
||||
LegalityPredicates::memSizeInBytesNotPow2(0));
|
||||
}
|
||||
/// Registers a rule that applies LegalizeAction::Lower whenever
/// LegalityPredicates::memSizeInBytesNotPow2(0) matches, and returns the
/// result of actionIf() so that further rules can be chained onto it.
/// NOTE(review): the 0 is presumably the memory-operand index -- confirm
/// against the predicate's declaration.
LegalizeRuleSet &lowerIfMemSizeNotPow2() {
|
||||
return actionIf(LegalizeAction::Lower,
|
||||
LegalityPredicates::memSizeInBytesNotPow2(0));
|
||||
}
|
||||
|
||||
LegalizeRuleSet &customIf(LegalityPredicate Predicate) {
|
||||
// We have no choice but conservatively assume that a custom action with a
|
||||
|
|
|
@ -1483,11 +1483,57 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
|
|||
LLT DstTy = MRI.getType(DstReg);
|
||||
auto &MMO = **MI.memoperands_begin();
|
||||
|
||||
if (DstTy.getSizeInBits() == MMO.getSize() /* in bytes */ * 8) {
|
||||
// In the case of G_LOAD, this was a non-extending load already and we're
|
||||
// about to lower to the same instruction.
|
||||
if (MI.getOpcode() == TargetOpcode::G_LOAD)
|
||||
if (DstTy.getSizeInBits() == MMO.getSizeInBits()) {
|
||||
if (MI.getOpcode() == TargetOpcode::G_LOAD) {
|
||||
// This load needs splitting into power of 2 sized loads.
|
||||
if (DstTy.isVector())
|
||||
return UnableToLegalize;
|
||||
if (isPowerOf2_32(DstTy.getSizeInBits()))
|
||||
return UnableToLegalize; // Don't know what we're being asked to do.
|
||||
|
||||
// Our strategy here is to generate anyextending loads for the smaller
|
||||
// types up to the next power-2 result type, and then combine the two larger
|
||||
// result values together, before truncating back down to the non-pow-2
|
||||
// type.
|
||||
// E.g. v1 = i24 load =>
|
||||
// v2 = i32 load (2 byte)
|
||||
// v3 = i32 load (1 byte)
|
||||
// v4 = i32 shl v3, 16
|
||||
// v5 = i32 or v4, v2
|
||||
// v1 = i24 trunc v5
|
||||
// By doing this we generate the correct truncate which should get
|
||||
// combined away as an artifact with a matching extend.
|
||||
uint64_t LargeSplitSize = PowerOf2Floor(DstTy.getSizeInBits());
|
||||
uint64_t SmallSplitSize = DstTy.getSizeInBits() - LargeSplitSize;
|
||||
|
||||
MachineFunction &MF = MIRBuilder.getMF();
|
||||
MachineMemOperand *LargeMMO =
|
||||
MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
|
||||
MachineMemOperand *SmallMMO = MF.getMachineMemOperand(
|
||||
&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
|
||||
|
||||
LLT PtrTy = MRI.getType(PtrReg);
|
||||
unsigned AnyExtSize = NextPowerOf2(DstTy.getSizeInBits());
|
||||
LLT AnyExtTy = LLT::scalar(AnyExtSize);
|
||||
unsigned LargeLdReg = MRI.createGenericVirtualRegister(AnyExtTy);
|
||||
unsigned SmallLdReg = MRI.createGenericVirtualRegister(AnyExtTy);
|
||||
auto LargeLoad =
|
||||
MIRBuilder.buildLoad(LargeLdReg, PtrReg, *LargeMMO);
|
||||
|
||||
auto OffsetCst =
|
||||
MIRBuilder.buildConstant(LLT::scalar(64), LargeSplitSize / 8);
|
||||
unsigned GEPReg = MRI.createGenericVirtualRegister(PtrTy);
|
||||
auto SmallPtr = MIRBuilder.buildGEP(GEPReg, PtrReg, OffsetCst.getReg(0));
|
||||
auto SmallLoad = MIRBuilder.buildLoad(SmallLdReg, SmallPtr.getReg(0),
|
||||
*SmallMMO);
|
||||
|
||||
auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
|
||||
auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);
|
||||
auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
|
||||
MIRBuilder.buildTrunc(DstReg, {Or.getReg(0)});
|
||||
MI.eraseFromParent();
|
||||
return Legalized;
|
||||
}
|
||||
MIRBuilder.buildLoad(DstReg, PtrReg, MMO);
|
||||
MI.eraseFromParent();
|
||||
return Legalized;
|
||||
|
@ -1516,6 +1562,51 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
|
|||
|
||||
return UnableToLegalize;
|
||||
}
|
||||
case TargetOpcode::G_STORE: {
|
||||
// Lower a non-power of 2 store into multiple pow-2 stores.
|
||||
// E.g. split an i24 store into an i16 store + i8 store.
|
||||
// We do this by first extending the stored value to the next largest power
|
||||
// of 2 type, and then using truncating stores to store the components.
|
||||
// By doing this, as with G_LOAD, we generate an extend that can be
|
||||
// artifact-combined away instead of leaving behind extracts.
|
||||
unsigned SrcReg = MI.getOperand(0).getReg();
|
||||
unsigned PtrReg = MI.getOperand(1).getReg();
|
||||
LLT SrcTy = MRI.getType(SrcReg);
|
||||
MachineMemOperand &MMO = **MI.memoperands_begin();
|
||||
if (SrcTy.getSizeInBits() != MMO.getSizeInBits())
|
||||
return UnableToLegalize;
|
||||
if (SrcTy.isVector())
|
||||
return UnableToLegalize;
|
||||
if (isPowerOf2_32(SrcTy.getSizeInBits()))
|
||||
return UnableToLegalize; // Don't know what we're being asked to do.
|
||||
|
||||
// Extend to the next pow-2.
|
||||
const LLT ExtendTy = LLT::scalar(NextPowerOf2(SrcTy.getSizeInBits()));
|
||||
auto ExtVal = MIRBuilder.buildAnyExt(ExtendTy, SrcReg);
|
||||
|
||||
// Obtain the smaller value by shifting away the larger value.
|
||||
uint64_t LargeSplitSize = PowerOf2Floor(SrcTy.getSizeInBits());
|
||||
uint64_t SmallSplitSize = SrcTy.getSizeInBits() - LargeSplitSize;
|
||||
auto ShiftAmt = MIRBuilder.buildConstant(ExtendTy, LargeSplitSize);
|
||||
auto SmallVal = MIRBuilder.buildLShr(ExtendTy, ExtVal, ShiftAmt);
|
||||
|
||||
// Generate the GEP and truncating stores.
|
||||
LLT PtrTy = MRI.getType(PtrReg);
|
||||
auto OffsetCst =
|
||||
MIRBuilder.buildConstant(LLT::scalar(64), LargeSplitSize / 8);
|
||||
unsigned GEPReg = MRI.createGenericVirtualRegister(PtrTy);
|
||||
auto SmallPtr = MIRBuilder.buildGEP(GEPReg, PtrReg, OffsetCst.getReg(0));
|
||||
|
||||
MachineFunction &MF = MIRBuilder.getMF();
|
||||
MachineMemOperand *LargeMMO =
|
||||
MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
|
||||
MachineMemOperand *SmallMMO =
|
||||
MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
|
||||
MIRBuilder.buildStore(ExtVal.getReg(0), PtrReg, *LargeMMO);
|
||||
MIRBuilder.buildStore(SmallVal.getReg(0), SmallPtr.getReg(0), *SmallMMO);
|
||||
MI.eraseFromParent();
|
||||
return Legalized;
|
||||
}
|
||||
case TargetOpcode::G_CTLZ_ZERO_UNDEF:
|
||||
case TargetOpcode::G_CTTZ_ZERO_UNDEF:
|
||||
case TargetOpcode::G_CTLZ:
|
||||
|
|
|
@ -235,14 +235,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
|
|||
.legalForTypesWithMemDesc({{s32, p0, 8, 8},
|
||||
{s32, p0, 16, 8}})
|
||||
.clampScalar(0, s8, s64)
|
||||
.widenScalarToNextPow2(0)
|
||||
// TODO: We could support sum-of-pow2's but the lowering code doesn't know
|
||||
// how to do that yet.
|
||||
.unsupportedIfMemSizeNotPow2()
|
||||
.lowerIfMemSizeNotPow2()
|
||||
// Lower any remaining any-extending loads into G_ANYEXT and G_LOAD
|
||||
.lowerIf([=](const LegalityQuery &Query) {
|
||||
return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits;
|
||||
})
|
||||
.widenScalarToNextPow2(0)
|
||||
.clampMaxNumElements(0, s32, 2)
|
||||
.clampMaxNumElements(0, s64, 1)
|
||||
.customIf(IsPtrVecPred);
|
||||
|
@ -250,6 +248,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
|
|||
getActionDefinitionsBuilder(G_STORE)
|
||||
.legalForTypesWithMemDesc({{s8, p0, 8, 8},
|
||||
{s16, p0, 16, 8},
|
||||
{s32, p0, 8, 8},
|
||||
{s32, p0, 16, 8},
|
||||
{s32, p0, 32, 8},
|
||||
{s64, p0, 64, 8},
|
||||
{p0, p0, 64, 8},
|
||||
|
@ -260,10 +260,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
|
|||
{v4s32, p0, 128, 8},
|
||||
{v2s64, p0, 128, 8}})
|
||||
.clampScalar(0, s8, s64)
|
||||
.widenScalarToNextPow2(0)
|
||||
// TODO: We could support sum-of-pow2's but the lowering code doesn't know
|
||||
// how to do that yet.
|
||||
.unsupportedIfMemSizeNotPow2()
|
||||
.lowerIfMemSizeNotPow2()
|
||||
.lowerIf([=](const LegalityQuery &Query) {
|
||||
return Query.Types[0].isScalar() &&
|
||||
Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits;
|
||||
|
|
|
@ -54,26 +54,6 @@ false:
|
|||
|
||||
}
|
||||
|
||||
; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %3:_(s32) = G_LOAD %1:_(p0) :: (load 3 from `i24* undef`, align 1) (in function: odd_type_load)
|
||||
; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for odd_type_load
|
||||
; FALLBACK-WITH-REPORT-OUT-LABEL: odd_type_load
|
||||
define i32 @odd_type_load() {
|
||||
entry:
|
||||
%ld = load i24, i24* undef, align 1
|
||||
%cst = zext i24 %ld to i32
|
||||
ret i32 %cst
|
||||
}
|
||||
|
||||
; General legalizer inability to handle types whose size wasn't a power of 2.
|
||||
; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: G_STORE %1:_(s42), %0:_(p0) :: (store 6 into %ir.addr, align 8) (in function: odd_type)
|
||||
; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for odd_type
|
||||
; FALLBACK-WITH-REPORT-OUT-LABEL: odd_type:
|
||||
define void @odd_type(i42* %addr) {
|
||||
%val42 = load i42, i42* %addr
|
||||
store i42 %val42, i42* %addr
|
||||
ret void
|
||||
}
|
||||
|
||||
; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: G_STORE %1:_(<7 x s32>), %0:_(p0) :: (store 28 into %ir.addr, align 32) (in function: odd_vector)
|
||||
; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for odd_vector
|
||||
; FALLBACK-WITH-REPORT-OUT-LABEL: odd_vector:
|
||||
|
|
|
@ -0,0 +1,49 @@
|
|||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - -verify-machineinstrs | FileCheck %s
|
||||
--- |
|
||||
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
|
||||
target triple = "aarch64"
|
||||
|
||||
define i32 @load_store_test(i24* %ptr, i24* %ptr2) {
|
||||
%val = load i24, i24* %ptr
|
||||
store i24 %val, i24* %ptr2
|
||||
ret i32 0
|
||||
}
|
||||
|
||||
...
|
||||
---
|
||||
name: load_store_test
|
||||
alignment: 2
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.1 (%ir-block.0):
|
||||
liveins: $x0, $x1
|
||||
|
||||
; CHECK-LABEL: name: load_store_test
|
||||
; CHECK: liveins: $x0, $x1
|
||||
; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1
|
||||
; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
|
||||
; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2 from %ir.ptr, align 4)
|
||||
; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
|
||||
; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64)
|
||||
; CHECK: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 1 from %ir.ptr + 2, align 4)
|
||||
; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
|
||||
; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C2]](s32)
|
||||
; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[LOAD]]
|
||||
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[OR]](s32)
|
||||
; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C2]](s32)
|
||||
; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C1]](s64)
|
||||
; CHECK: G_STORE [[COPY2]](s32), [[COPY1]](p0) :: (store 2 into %ir.ptr2, align 4)
|
||||
; CHECK: G_STORE [[LSHR]](s32), [[GEP1]](p0) :: (store 1 into %ir.ptr2 + 2, align 4)
|
||||
; CHECK: $w0 = COPY [[C]](s32)
|
||||
; CHECK: RET_ReallyLR implicit $w0
|
||||
%0:_(p0) = COPY $x0
|
||||
%1:_(p0) = COPY $x1
|
||||
%3:_(s32) = G_CONSTANT i32 0
|
||||
%2:_(s24) = G_LOAD %0(p0) :: (load 3 from %ir.ptr, align 4)
|
||||
G_STORE %2(s24), %1(p0) :: (store 3 into %ir.ptr2, align 4)
|
||||
$w0 = COPY %3(s32)
|
||||
RET_ReallyLR implicit $w0
|
||||
|
||||
...
|
Loading…
Reference in New Issue