[CodeGen][ARM] Implement atomicrmw as pseudo operations at -O0
atomicrmw instructions are expanded by AtomicExpandPass before register allocation into cmpxchg loops. Register allocation can insert spills between the exclusive loads and stores, which invalidates the exclusive monitor and can lead to infinite loops. To avoid this, reimplement atomicrmw operations as pseudo-instructions and expand them after register allocation.

Floating point legalisation:
f16 ATOMIC_LOAD_FADD(*f16, f16) is legalised to
f32 ATOMIC_LOAD_FADD(*i16, f32) and then eventually
f32 ATOMIC_LOAD_FADD_16(*i16, f32)

Differential Revision: https://reviews.llvm.org/D101164
parent 985ab6e1fa
commit 3338290c18
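For orientation, here is a minimal, hypothetical example of the kind of input this change targets (the function and value names are illustrative and not taken from the patch or its tests). At -O0 on ARM, an atomicrmw like the one below is now selected to an ATOMIC_LOAD_FADD_32 pseudo-instruction and only turned into an exclusive-load/store loop after register allocation, so no spill can land between the ldrex and the strex:

define float @fadd_example(float* %ptr, float %val) {
  ; With this patch at -O0, the pseudo expands after RA to roughly:
  ;   ldrex, vmov (GPR -> SPR), vadd.f32, vmov (SPR -> GPR), strex, cmp, bne
  %old = atomicrmw fadd float* %ptr, float %val monotonic
  ret float %old
}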
@@ -2255,6 +2255,11 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
     case ISD::FREM:
     case ISD::FSUB: R = PromoteFloatRes_BinOp(N); break;

+    case ISD::ATOMIC_LOAD_FADD:
+    case ISD::ATOMIC_LOAD_FSUB:
+      R = PromoteFloatRes_ATOMIC_LOAD_FXXX(N);
+      break;
+
     case ISD::FMA: // FMA is same as FMAD
     case ISD::FMAD: R = PromoteFloatRes_FMAD(N); break;
@@ -2453,6 +2458,21 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_FP_ROUND(SDNode *N) {
   return DAG.getNode(GetPromotionOpcode(VT, NVT), DL, NVT, Round);
 }

+SDValue DAGTypeLegalizer::PromoteFloatRes_ATOMIC_LOAD_FXXX(SDNode *N) {
+  AtomicSDNode *A = cast<AtomicSDNode>(N);
+  EVT VT = N->getValueType(0);
+
+  // Load the value as an integer value with the same number of bits.
+  EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+  SDValue PromotedVal = GetPromotedFloat(A->getVal());
+  SDValue NewA =
+      DAG.getAtomic(A->getOpcode(), SDLoc(N), IVT, A->getChain(),
+                    A->getBasePtr(), PromotedVal, A->getMemOperand());
+  ReplaceValueWith(SDValue(A, 1), NewA.getValue(1));
+
+  return NewA;
+}
+
 SDValue DAGTypeLegalizer::PromoteFloatRes_LOAD(SDNode *N) {
   LoadSDNode *L = cast<LoadSDNode>(N);
   EVT VT = N->getValueType(0);
@@ -671,6 +671,7 @@ private:
   SDValue PromoteFloatRes_FMAD(SDNode *N);
   SDValue PromoteFloatRes_FPOWI(SDNode *N);
   SDValue PromoteFloatRes_FP_ROUND(SDNode *N);
+  SDValue PromoteFloatRes_ATOMIC_LOAD_FXXX(SDNode *N);
   SDValue PromoteFloatRes_LOAD(SDNode *N);
   SDValue PromoteFloatRes_SELECT(SDNode *N);
   SDValue PromoteFloatRes_SELECT_CC(SDNode *N);
@@ -928,6 +928,25 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     return;
   }

+  // Handle DPR to/from GPRPair
+  const auto *TRI = &getRegisterInfo();
+  if (ARM::DPRRegClass.contains(SrcReg) &&
+      ARM::GPRPairRegClass.contains(DestReg)) {
+    BuildMI(MBB, I, DL, get(ARM::VMOVRRD))
+        .addReg(TRI->getSubReg(DestReg, ARM::gsub_0), RegState::Define)
+        .addReg(TRI->getSubReg(DestReg, ARM::gsub_1), RegState::Define)
+        .addReg(SrcReg, getKillRegState(KillSrc))
+        .add(predOps(ARMCC::AL));
+    return;
+  } else if (ARM::GPRPairRegClass.contains(SrcReg) &&
+             ARM::DPRRegClass.contains(DestReg)) {
+    BuildMI(MBB, I, DL, get(ARM::VMOVDRR), DestReg)
+        .addReg(TRI->getSubReg(SrcReg, ARM::gsub_0), getKillRegState(KillSrc))
+        .addReg(TRI->getSubReg(SrcReg, ARM::gsub_1), getKillRegState(KillSrc))
+        .add(predOps(ARMCC::AL));
+    return;
+  }
+
   // Handle register classes that require multiple instructions.
   unsigned BeginIdx = 0;
   unsigned SubRegs = 0;
@@ -1013,7 +1032,6 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,

   assert(Opc && "Impossible reg-to-reg copy");

-  const TargetRegisterInfo *TRI = &getRegisterInfo();
   MachineInstrBuilder Mov;

   // Copy register tuples backward when the first Dest reg overlaps with SrcReg.
@@ -107,6 +107,10 @@ namespace {
                          MachineBasicBlock::iterator MBBI, unsigned LdrexOp,
                          unsigned StrexOp, unsigned UxtOp,
                          MachineBasicBlock::iterator &NextMBBI);
+    bool ExpandAtomicOp(MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator MBBI, const int Size,
+                        unsigned PseudoOp,
+                        MachineBasicBlock::iterator &NextMBBI);

     bool ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MBBI,
@@ -1657,16 +1661,270 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
 /// ARM's ldrexd/strexd take a consecutive register pair (represented as a
 /// single GPRPair register), Thumb's take two separate registers so we need to
 /// extract the subregs from the pair.
-static void addExclusiveRegPair(MachineInstrBuilder &MIB, MachineOperand &Reg,
+static void addExclusiveRegPair(MachineInstrBuilder &MIB, Register Reg,
                                 unsigned Flags, bool IsThumb,
                                 const TargetRegisterInfo *TRI) {
   if (IsThumb) {
-    Register RegLo = TRI->getSubReg(Reg.getReg(), ARM::gsub_0);
-    Register RegHi = TRI->getSubReg(Reg.getReg(), ARM::gsub_1);
+    Register RegLo = TRI->getSubReg(Reg, ARM::gsub_0);
+    Register RegHi = TRI->getSubReg(Reg, ARM::gsub_1);
     MIB.addReg(RegLo, Flags);
     MIB.addReg(RegHi, Flags);
   } else
-    MIB.addReg(Reg.getReg(), Flags);
+    MIB.addReg(Reg, Flags);
 }

+static void
+makeAtomicUpdateInstrs(const unsigned PseudoOp, MachineBasicBlock *LoadStoreBB,
+                       const DebugLoc &DL, const ARMBaseInstrInfo *TII,
+                       const Register DestReg, const Register ValReg) {
+
+  auto BasicOp = [&](unsigned Opcode) {
+    auto MIB = BuildMI(LoadStoreBB, DL, TII->get(Opcode), DestReg)
+                   .addReg(DestReg, RegState::Kill)
+                   .addReg(ValReg)
+                   .add(predOps(ARMCC::AL));
+    if (Opcode != ARM::VADDS && Opcode != ARM::VSUBS && Opcode != ARM::VADDD &&
+        Opcode != ARM::VSUBD)
+      // Floating point operations don't have this.
+      // Add 's' bit operand (always reg0 for this)
+      MIB.addReg(0);
+  };
+  auto MinMax = [&](ARMCC::CondCodes Condition) {
+    BuildMI(LoadStoreBB, DL, TII->get(ARM::CMPrr), DestReg)
+        .addReg(ValReg)
+        .add(predOps(ARMCC::AL));
+    BuildMI(LoadStoreBB, DL, TII->get(ARM::MOVr), DestReg)
+        .addReg(ValReg)
+        .add(predOps(Condition))
+        .add(condCodeOp()); // 's' bit
+  };
+
+  switch (PseudoOp) {
+  // No operations (swaps)
+  case ARM::ATOMIC_SWAP_8:
+  case ARM::ATOMIC_SWAP_16:
+  case ARM::ATOMIC_SWAP_32:
+  case ARM::ATOMIC_SWAP_64:
+    llvm_unreachable("Swap should be handled at call site.");
+    return;
+
+  // Basic binary operation
+  case ARM::ATOMIC_LOAD_ADD_8:
+  case ARM::ATOMIC_LOAD_ADD_16:
+  case ARM::ATOMIC_LOAD_ADD_32:
+  case ARM::ATOMIC_LOAD_ADD_64:
+    return BasicOp(ARM::ADDrr);
+  case ARM::ATOMIC_LOAD_SUB_8:
+  case ARM::ATOMIC_LOAD_SUB_16:
+  case ARM::ATOMIC_LOAD_SUB_32:
+  case ARM::ATOMIC_LOAD_SUB_64:
+    return BasicOp(ARM::SUBrr);
+  case ARM::ATOMIC_LOAD_AND_8:
+  case ARM::ATOMIC_LOAD_AND_16:
+  case ARM::ATOMIC_LOAD_AND_32:
+  case ARM::ATOMIC_LOAD_AND_64:
+    return BasicOp(ARM::ANDrr);
+  case ARM::ATOMIC_LOAD_OR_8:
+  case ARM::ATOMIC_LOAD_OR_16:
+  case ARM::ATOMIC_LOAD_OR_32:
+  case ARM::ATOMIC_LOAD_OR_64:
+    return BasicOp(ARM::ORRrr);
+  case ARM::ATOMIC_LOAD_XOR_8:
+  case ARM::ATOMIC_LOAD_XOR_16:
+  case ARM::ATOMIC_LOAD_XOR_32:
+  case ARM::ATOMIC_LOAD_XOR_64:
+    return BasicOp(ARM::EORrr);
+  case ARM::ATOMIC_LOAD_FADD_16:
+  case ARM::ATOMIC_LOAD_FADD_32:
+    return BasicOp(ARM::VADDS);
+  case ARM::ATOMIC_LOAD_FADD_64:
+    return BasicOp(ARM::VADDD);
+  case ARM::ATOMIC_LOAD_FSUB_16:
+  case ARM::ATOMIC_LOAD_FSUB_32:
+    return BasicOp(ARM::VSUBS);
+  case ARM::ATOMIC_LOAD_FSUB_64:
+    return BasicOp(ARM::VSUBD);
+
+  // Minimum or maximum operations
+  case ARM::ATOMIC_LOAD_MAX_8:
+  case ARM::ATOMIC_LOAD_MAX_16:
+  case ARM::ATOMIC_LOAD_MAX_32:
+  case ARM::ATOMIC_LOAD_MAX_64:
+  case ARM::ATOMIC_LOAD_UMAX_8:
+  case ARM::ATOMIC_LOAD_UMAX_16:
+  case ARM::ATOMIC_LOAD_UMAX_32:
+  case ARM::ATOMIC_LOAD_UMAX_64:
+    return MinMax(ARMCC::LE);
+  case ARM::ATOMIC_LOAD_MIN_8:
+  case ARM::ATOMIC_LOAD_MIN_16:
+  case ARM::ATOMIC_LOAD_MIN_32:
+  case ARM::ATOMIC_LOAD_MIN_64:
+  case ARM::ATOMIC_LOAD_UMIN_8:
+  case ARM::ATOMIC_LOAD_UMIN_16:
+  case ARM::ATOMIC_LOAD_UMIN_32:
+  case ARM::ATOMIC_LOAD_UMIN_64:
+    return MinMax(ARMCC::GE);
+
+  // NAND
+  case ARM::ATOMIC_LOAD_NAND_8:
+  case ARM::ATOMIC_LOAD_NAND_16:
+  case ARM::ATOMIC_LOAD_NAND_32:
+  case ARM::ATOMIC_LOAD_NAND_64:
+    BuildMI(LoadStoreBB, DL, TII->get(ARM::ANDrr), DestReg)
+        .addReg(DestReg, RegState::Kill)
+        .addReg(ValReg)
+        .add(predOps(ARMCC::AL))
+        .addReg(0); // 's' bit
+    BuildMI(LoadStoreBB, DL, TII->get(ARM::MVNr), DestReg)
+        .addReg(DestReg, RegState::Kill)
+        .add(predOps(ARMCC::AL))
+        .addReg(0); // 's' bit
+    return;
+  }
+
+  llvm_unreachable("unexpected opcode");
+}
+
+bool ARMExpandPseudo::ExpandAtomicOp(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI,
+                                     const int Size, const unsigned PseudoOp,
+                                     MachineBasicBlock::iterator &NextMBBI) {
+  assert(!STI->isThumb() && "atomic pseudo-instructions are ARM only");
+
+  unsigned LdrexOp;
+  unsigned StrexOp;
+  switch (Size) {
+  case 8:
+    LdrexOp = ARM::LDREXB;
+    StrexOp = ARM::STREXB;
+    break;
+  case 16:
+    LdrexOp = ARM::LDREXH;
+    StrexOp = ARM::STREXH;
+    break;
+  case 32:
+    LdrexOp = ARM::LDREX;
+    StrexOp = ARM::STREX;
+    break;
+  case 64:
+    LdrexOp = ARM::LDREXD;
+    StrexOp = ARM::STREXD;
+    break;
+  default:
+    llvm_unreachable("Invalid Size");
+  }
+
+  MachineInstr &MI = *MBBI;
+  DebugLoc DL = MI.getDebugLoc();
+  MachineOperand &Dest = MI.getOperand(0);
+  MachineOperand &Temp = MI.getOperand(1);
+  // If Temp is a GPRPair, MiniTempReg is the first of the pair
+  Register MiniTempReg =
+      ARM::GPRPairRegClass.contains(Temp.getReg())
+          ? (Register)TRI->getSubReg(Temp.getReg(), ARM::gsub_0)
+          : Temp.getReg();
+  assert(ARM::GPRRegClass.contains(MiniTempReg));
+  Register AddrReg = MI.getOperand(2).getReg();
+  Register ValReg = MI.getOperand(3).getReg();
+
+  // TempReg is GPR and is used for load/store operations.
+  // DestReg is either GPR or DPR and is used for arithmetic operations.
+
+  // LoadStoreBB:
+  //   TempReg = LoadExclusive [AddrReg]
+  //   DestReg = mov TempReg
+  //   if xchg:
+  //     TempReg = mov ValReg
+  //   else:
+  //     DestReg = Operation DestReg, ValReg
+  //     TempReg = mov DestReg
+  //   MiniTempReg = StoreExclusive TempReg, [AddrReg]
+  //   cmp MiniTempReg, #0
+  //   bne LoadStoreBB
+  //   b DoneBB
+  // DoneBB:
+  //   bx lr
+
+  MachineFunction *MF = MBB.getParent();
+  auto *LoadStoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+  auto *DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+  MF->insert(++MBB.getIterator(), LoadStoreBB);
+  MF->insert(++LoadStoreBB->getIterator(), DoneBB);
+
+  MachineInstrBuilder MIB;
+  // LoadExclusive into temporary general purpose register (pair)
+  MIB = BuildMI(LoadStoreBB, DL, TII->get(LdrexOp));
+  addExclusiveRegPair(MIB, Temp.getReg(), RegState::Define, STI->isThumb(),
+                      TRI);
+  MIB.addReg(AddrReg);
+  MIB.add(predOps(ARMCC::AL));
+
+  // Copy Temp into Dest. For floating point operations this is GPR -> DPR.
+  TII->copyPhysReg(*LoadStoreBB, LoadStoreBB->end(), DL, Dest.getReg(),
+                   Temp.getReg(), true /* KillSrc */);
+
+  const bool IsXchg =
+      PseudoOp == ARM::ATOMIC_SWAP_8 || PseudoOp == ARM::ATOMIC_SWAP_16 ||
+      PseudoOp == ARM::ATOMIC_SWAP_32 || PseudoOp == ARM::ATOMIC_SWAP_64;
+
+  if (IsXchg) {
+    // Copy ValReg into Temp. For floating point operations this is DPR -> GPR.
+    TII->copyPhysReg(*LoadStoreBB, LoadStoreBB->end(), DL, Temp.getReg(),
+                     ValReg, false /* KillSrc */);
+  } else {
+    // Update the value in Dest with the results of the operation
+    makeAtomicUpdateInstrs(PseudoOp, LoadStoreBB, DL, TII, Dest.getReg(),
+                           ValReg);
+
+    // Copy Dest into Temp. For floating point operations this is DPR -> GPR.
+    TII->copyPhysReg(*LoadStoreBB, LoadStoreBB->end(), DL, Temp.getReg(),
+                     Dest.getReg(), false /* KillSrc */);
+  }
+
+  // StoreExclusive Temp to Addr, store success in Temp (or MiniTempReg)
+  MIB = BuildMI(LoadStoreBB, DL, TII->get(StrexOp));
+  addExclusiveRegPair(MIB, MiniTempReg, RegState::Define, STI->isThumb(), TRI);
+  MIB.addReg(Temp.getReg(), RegState::Kill);
+  MIB.addReg(AddrReg);
+  MIB.add(predOps(ARMCC::AL));
+
+  // Compare to zero
+  BuildMI(LoadStoreBB, DL, TII->get(ARM::CMPri))
+      .addReg(MiniTempReg, RegState::Kill)
+      .addImm(0)
+      .add(predOps(ARMCC::AL));
+
+  // Branch to LoadStoreBB if failed
+  BuildMI(LoadStoreBB, DL, TII->get(ARM::Bcc))
+      .addMBB(LoadStoreBB)
+      .addImm(ARMCC::NE)
+      .addReg(ARM::CPSR, RegState::Kill);
+
+  // Branch to DoneBB if success
+  BuildMI(LoadStoreBB, DL, TII->get(ARM::B)).addMBB(DoneBB);
+
+  LoadStoreBB->addSuccessor(LoadStoreBB);
+  LoadStoreBB->addSuccessor(DoneBB);
+
+  // Copy remaining instructions in MBB into DoneBB
+  DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
+  DoneBB->transferSuccessors(&MBB);
+
+  MBB.addSuccessor(LoadStoreBB);
+
+  NextMBBI = MBB.end();
+  MI.eraseFromParent();
+
+  // Recompute livein lists.
+  LivePhysRegs LiveRegs;
+  computeAndAddLiveIns(LiveRegs, *DoneBB);
+  computeAndAddLiveIns(LiveRegs, *LoadStoreBB);
+  // Do an extra pass around the loop to get loop carried registers right.
+  LoadStoreBB->clearLiveIns();
+  computeAndAddLiveIns(LiveRegs, *LoadStoreBB);
+
+  return true;
+}
+
 /// Expand a 64-bit CMP_SWAP to an ldrexd/strexd loop.
@@ -1708,7 +1966,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
   unsigned LDREXD = IsThumb ? ARM::t2LDREXD : ARM::LDREXD;
   MachineInstrBuilder MIB;
   MIB = BuildMI(LoadCmpBB, DL, TII->get(LDREXD));
-  addExclusiveRegPair(MIB, Dest, RegState::Define, IsThumb, TRI);
+  addExclusiveRegPair(MIB, Dest.getReg(), RegState::Define, IsThumb, TRI);
   MIB.addReg(AddrReg).add(predOps(ARMCC::AL));

   unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr;
@@ -1737,7 +1995,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
   unsigned STREXD = IsThumb ? ARM::t2STREXD : ARM::STREXD;
   MIB = BuildMI(StoreBB, DL, TII->get(STREXD), TempReg);
   unsigned Flags = getKillRegState(New.isDead());
-  addExclusiveRegPair(MIB, New, Flags, IsThumb, TRI);
+  addExclusiveRegPair(MIB, New.getReg(), Flags, IsThumb, TRI);
   MIB.addReg(AddrReg).add(predOps(ARMCC::AL));

   unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
@@ -2803,6 +3061,64 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
     case ARM::CMP_SWAP_64:
       return ExpandCMP_SWAP_64(MBB, MBBI, NextMBBI);

+    case ARM::ATOMIC_LOAD_ADD_8:
+    case ARM::ATOMIC_LOAD_AND_8:
+    case ARM::ATOMIC_LOAD_MAX_8:
+    case ARM::ATOMIC_LOAD_MIN_8:
+    case ARM::ATOMIC_LOAD_NAND_8:
+    case ARM::ATOMIC_LOAD_OR_8:
+    case ARM::ATOMIC_LOAD_SUB_8:
+    case ARM::ATOMIC_LOAD_UMAX_8:
+    case ARM::ATOMIC_LOAD_UMIN_8:
+    case ARM::ATOMIC_LOAD_XOR_8:
+    case ARM::ATOMIC_SWAP_8:
+      return ExpandAtomicOp(MBB, MBBI, 8, Opcode, NextMBBI);
+
+    case ARM::ATOMIC_LOAD_ADD_16:
+    case ARM::ATOMIC_LOAD_AND_16:
+    case ARM::ATOMIC_LOAD_FADD_16:
+    case ARM::ATOMIC_LOAD_FSUB_16:
+    case ARM::ATOMIC_LOAD_MAX_16:
+    case ARM::ATOMIC_LOAD_MIN_16:
+    case ARM::ATOMIC_LOAD_NAND_16:
+    case ARM::ATOMIC_LOAD_OR_16:
+    case ARM::ATOMIC_LOAD_SUB_16:
+    case ARM::ATOMIC_LOAD_UMAX_16:
+    case ARM::ATOMIC_LOAD_UMIN_16:
+    case ARM::ATOMIC_LOAD_XOR_16:
+    case ARM::ATOMIC_SWAP_16:
+      return ExpandAtomicOp(MBB, MBBI, 16, Opcode, NextMBBI);
+
+    case ARM::ATOMIC_LOAD_ADD_32:
+    case ARM::ATOMIC_LOAD_AND_32:
+    case ARM::ATOMIC_LOAD_FADD_32:
+    case ARM::ATOMIC_LOAD_FSUB_32:
+    case ARM::ATOMIC_LOAD_MAX_32:
+    case ARM::ATOMIC_LOAD_MIN_32:
+    case ARM::ATOMIC_LOAD_NAND_32:
+    case ARM::ATOMIC_LOAD_OR_32:
+    case ARM::ATOMIC_LOAD_SUB_32:
+    case ARM::ATOMIC_LOAD_UMAX_32:
+    case ARM::ATOMIC_LOAD_UMIN_32:
+    case ARM::ATOMIC_LOAD_XOR_32:
+    case ARM::ATOMIC_SWAP_32:
+      return ExpandAtomicOp(MBB, MBBI, 32, Opcode, NextMBBI);
+
+    case ARM::ATOMIC_LOAD_ADD_64:
+    case ARM::ATOMIC_LOAD_AND_64:
+    case ARM::ATOMIC_LOAD_FADD_64:
+    case ARM::ATOMIC_LOAD_FSUB_64:
+    case ARM::ATOMIC_LOAD_MAX_64:
+    case ARM::ATOMIC_LOAD_MIN_64:
+    case ARM::ATOMIC_LOAD_NAND_64:
+    case ARM::ATOMIC_LOAD_OR_64:
+    case ARM::ATOMIC_LOAD_SUB_64:
+    case ARM::ATOMIC_LOAD_UMAX_64:
+    case ARM::ATOMIC_LOAD_UMIN_64:
+    case ARM::ATOMIC_LOAD_XOR_64:
+    case ARM::ATOMIC_SWAP_64:
+      return ExpandAtomicOp(MBB, MBBI, 64, Opcode, NextMBBI);
+
     case ARM::tBL_PUSHLR:
     case ARM::BL_PUSHLR: {
       const bool Thumb = Opcode == ARM::tBL_PUSHLR;
@@ -310,6 +310,7 @@ private:
   void SelectCMPZ(SDNode *N, bool &SwitchEQNEToPLMI);

   void SelectCMP_SWAP(SDNode *N);
+  void SelectAtomicOp(SDNode *N);

   /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
   /// inline asm expressions.
@@ -3318,6 +3319,142 @@ void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
   CurDAG->RemoveDeadNode(N);
 }

+/// Expand atomic operations to size- and type-specific pseudo-instructions
+void ARMDAGToDAGISel::SelectAtomicOp(SDNode *N) {
+  EVT MemTy = cast<MemSDNode>(N)->getMemoryVT();
+  const unsigned Opcode = [&]() {
+    switch (N->getOpcode()) {
+    case ISD::ATOMIC_SWAP:
+      if (MemTy == MVT::i8)
+        return ARM::ATOMIC_SWAP_8;
+      if (MemTy == MVT::i16)
+        return ARM::ATOMIC_SWAP_16;
+      if (MemTy == MVT::i32)
+        return ARM::ATOMIC_SWAP_32;
+      break;
+    case ISD::ATOMIC_LOAD_ADD:
+      if (MemTy == MVT::i8)
+        return ARM::ATOMIC_LOAD_ADD_8;
+      if (MemTy == MVT::i16)
+        return ARM::ATOMIC_LOAD_ADD_16;
+      if (MemTy == MVT::i32)
+        return ARM::ATOMIC_LOAD_ADD_32;
+      break;
+    case ISD::ATOMIC_LOAD_SUB:
+      if (MemTy == MVT::i8)
+        return ARM::ATOMIC_LOAD_SUB_8;
+      if (MemTy == MVT::i16)
+        return ARM::ATOMIC_LOAD_SUB_16;
+      if (MemTy == MVT::i32)
+        return ARM::ATOMIC_LOAD_SUB_32;
+      break;
+    case ISD::ATOMIC_LOAD_AND:
+      if (MemTy == MVT::i8)
+        return ARM::ATOMIC_LOAD_AND_8;
+      if (MemTy == MVT::i16)
+        return ARM::ATOMIC_LOAD_AND_16;
+      if (MemTy == MVT::i32)
+        return ARM::ATOMIC_LOAD_AND_32;
+      break;
+    case ISD::ATOMIC_LOAD_CLR:
+      llvm_unreachable("ATOMIC_LOAD_CLR in SelectAtomicOp");
+      break;
+    case ISD::ATOMIC_LOAD_OR:
+      if (MemTy == MVT::i8)
+        return ARM::ATOMIC_LOAD_OR_8;
+      if (MemTy == MVT::i16)
+        return ARM::ATOMIC_LOAD_OR_16;
+      if (MemTy == MVT::i32)
+        return ARM::ATOMIC_LOAD_OR_32;
+      break;
+    case ISD::ATOMIC_LOAD_XOR:
+      if (MemTy == MVT::i8)
+        return ARM::ATOMIC_LOAD_XOR_8;
+      if (MemTy == MVT::i16)
+        return ARM::ATOMIC_LOAD_XOR_16;
+      if (MemTy == MVT::i32)
+        return ARM::ATOMIC_LOAD_XOR_32;
+      break;
+    case ISD::ATOMIC_LOAD_NAND:
+      if (MemTy == MVT::i8)
+        return ARM::ATOMIC_LOAD_NAND_8;
+      if (MemTy == MVT::i16)
+        return ARM::ATOMIC_LOAD_NAND_16;
+      if (MemTy == MVT::i32)
+        return ARM::ATOMIC_LOAD_NAND_32;
+      break;
+    case ISD::ATOMIC_LOAD_MIN:
+      if (MemTy == MVT::i8)
+        return ARM::ATOMIC_LOAD_MIN_8;
+      if (MemTy == MVT::i16)
+        return ARM::ATOMIC_LOAD_MIN_16;
+      if (MemTy == MVT::i32)
+        return ARM::ATOMIC_LOAD_MIN_32;
+      break;
+    case ISD::ATOMIC_LOAD_MAX:
+      if (MemTy == MVT::i8)
+        return ARM::ATOMIC_LOAD_MAX_8;
+      if (MemTy == MVT::i16)
+        return ARM::ATOMIC_LOAD_MAX_16;
+      if (MemTy == MVT::i32)
+        return ARM::ATOMIC_LOAD_MAX_32;
+      break;
+    case ISD::ATOMIC_LOAD_UMIN:
+      if (MemTy == MVT::i8)
+        return ARM::ATOMIC_LOAD_UMIN_8;
+      if (MemTy == MVT::i16)
+        return ARM::ATOMIC_LOAD_UMIN_16;
+      if (MemTy == MVT::i32)
+        return ARM::ATOMIC_LOAD_UMIN_32;
+      break;
+    case ISD::ATOMIC_LOAD_UMAX:
+      if (MemTy == MVT::i8)
+        return ARM::ATOMIC_LOAD_UMAX_8;
+      if (MemTy == MVT::i16)
+        return ARM::ATOMIC_LOAD_UMAX_16;
+      if (MemTy == MVT::i32)
+        return ARM::ATOMIC_LOAD_UMAX_32;
+      break;
+    case ISD::ATOMIC_LOAD_FADD:
+      if (MemTy == MVT::i16)
+        return ARM::ATOMIC_LOAD_FADD_16; // f16 promoted to f32
+      if (MemTy == MVT::f16)
+        return ARM::ATOMIC_LOAD_FADD_16;
+      if (MemTy == MVT::f32)
+        return ARM::ATOMIC_LOAD_FADD_32;
+      if (MemTy == MVT::f64)
+        return ARM::ATOMIC_LOAD_FADD_64;
+      break;
+    case ISD::ATOMIC_LOAD_FSUB:
+      if (MemTy == MVT::i16)
+        return ARM::ATOMIC_LOAD_FSUB_16; // f16 promoted to f32
+      if (MemTy == MVT::f16)
+        return ARM::ATOMIC_LOAD_FSUB_16;
+      if (MemTy == MVT::f32)
+        return ARM::ATOMIC_LOAD_FSUB_32;
+      if (MemTy == MVT::f64)
+        return ARM::ATOMIC_LOAD_FSUB_64;
+      break;
+    }
+    llvm_unreachable("Unknown AtomicOp type");
+    return ARM::INSTRUCTION_LIST_END;
+  }();
+
+  SDValue Chain = N->getOperand(0);
+  SDValue Addr = N->getOperand(1);
+  SDValue Value = N->getOperand(2);
+  SDNode *Swap = CurDAG->getMachineNode(
+      Opcode, SDLoc(N), CurDAG->getVTList(Value.getValueType(), MVT::Other),
+      {Chain, Addr, Value});
+
+  MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
+  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Swap), {MemOp});
+
+  ReplaceUses(SDValue(N, 0), SDValue(Swap, 0)); // Result
+  ReplaceUses(SDValue(N, 1), SDValue(Swap, 1)); // Chain
+  CurDAG->RemoveDeadNode(N);
+}
+
 static Optional<std::pair<unsigned, unsigned>>
 getContiguousRangeOfSetBits(const APInt &A) {
   unsigned FirstOne = A.getBitWidth() - A.countLeadingZeros() - 1;
@@ -5028,6 +5165,23 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
   case ISD::ATOMIC_CMP_SWAP:
     SelectCMP_SWAP(N);
     return;
+
+  case ISD::ATOMIC_LOAD_ADD:
+  case ISD::ATOMIC_LOAD_SUB:
+  case ISD::ATOMIC_LOAD_AND:
+  case ISD::ATOMIC_LOAD_CLR:
+  case ISD::ATOMIC_LOAD_OR:
+  case ISD::ATOMIC_LOAD_XOR:
+  case ISD::ATOMIC_LOAD_NAND:
+  case ISD::ATOMIC_LOAD_MIN:
+  case ISD::ATOMIC_LOAD_MAX:
+  case ISD::ATOMIC_LOAD_UMIN:
+  case ISD::ATOMIC_LOAD_UMAX:
+  case ISD::ATOMIC_LOAD_FADD:
+  case ISD::ATOMIC_LOAD_FSUB:
+  case ISD::ATOMIC_SWAP:
+    SelectAtomicOp(N);
+    return;
   }

   SelectCode(N);
@@ -19057,6 +19057,11 @@ ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
 // and up to 64 bits on the non-M profiles
 TargetLowering::AtomicExpansionKind
 ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+  // At -O0 expand pseudo-instructions after register allocation to avoid
+  // inserting spills between ldrex/strex.
+  if (getTargetMachine().getOptLevel() == 0 && !Subtarget->isThumb())
+    return AtomicExpansionKind::None;
+
   if (AI->isFloatingPointOperation())
     return AtomicExpansionKind::CmpXChg;

@@ -6428,6 +6428,37 @@ def CMP_SWAP_64 : PseudoInst<(outs GPRPair:$Rd, GPR:$temp),
                              NoItinerary, []>, Sched<[]>;
 }

+let Constraints = "@earlyclobber $Rd,@earlyclobber $temp",
+    mayLoad = 1, mayStore = 1 in
+multiclass AtomicRMW {
+  def _8 : PseudoInst<(outs GPR:$Rd, GPR:$temp), (ins GPR:$addr, GPR:$new), NoItinerary, []>, Sched<[]>;
+  def _16 : PseudoInst<(outs GPR:$Rd, GPR:$temp), (ins GPR:$addr, GPR:$new), NoItinerary, []>, Sched<[]>;
+  def _32 : PseudoInst<(outs GPR:$Rd, GPR:$temp), (ins GPR:$addr, GPR:$new), NoItinerary, []>, Sched<[]>;
+  def _64 : PseudoInst<(outs GPRPair:$Rd, GPR:$temp), (ins GPR:$addr, GPRPair:$new), NoItinerary, []>, Sched<[]>;
+}
+defm ATOMIC_SWAP : AtomicRMW;
+defm ATOMIC_LOAD_ADD : AtomicRMW;
+defm ATOMIC_LOAD_SUB : AtomicRMW;
+defm ATOMIC_LOAD_AND : AtomicRMW;
+defm ATOMIC_LOAD_CLR : AtomicRMW;
+defm ATOMIC_LOAD_OR : AtomicRMW;
+defm ATOMIC_LOAD_XOR : AtomicRMW;
+defm ATOMIC_LOAD_NAND : AtomicRMW;
+defm ATOMIC_LOAD_MIN : AtomicRMW;
+defm ATOMIC_LOAD_MAX : AtomicRMW;
+defm ATOMIC_LOAD_UMIN : AtomicRMW;
+defm ATOMIC_LOAD_UMAX : AtomicRMW;
+// FADD and FSUB have GPRPair temporary for ldrexd/strexd and the return value of strexd, but DPR for result.
+let Constraints = "@earlyclobber $Rd,@earlyclobber $temp",
+    mayLoad = 1, mayStore = 1 in
+multiclass AtomicRMWFloat {
+  def _16 : PseudoInst<(outs SPR:$Rd, GPR:$temp), (ins GPR:$addr, SPR:$new), NoItinerary, []>, Sched<[]>;
+  def _32 : PseudoInst<(outs SPR:$Rd, GPR:$temp), (ins GPR:$addr, SPR:$new), NoItinerary, []>, Sched<[]>;
+  def _64 : PseudoInst<(outs DPR:$Rd, GPRPair:$temp), (ins GPR:$addr, DPR:$new), NoItinerary, []>, Sched<[]>;
+}
+defm ATOMIC_LOAD_FADD : AtomicRMWFloat;
+defm ATOMIC_LOAD_FSUB : AtomicRMWFloat;
+
 def CompilerBarrier : PseudoInst<(outs), (ins i32imm:$ordering), NoItinerary,
                                  [(atomic_fence timm:$ordering, 0)]> {
   let hasSideEffects = 1;
(One file's diff is suppressed because it is too large.)
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=armv7-apple-ios7.0 -atomic-expand %s | FileCheck %s
+; RUN: opt -O1 -S -mtriple=armv7-apple-ios7.0 -atomic-expand %s | FileCheck %s

 define float @test_atomicrmw_fadd_f32(float* %ptr, float %value) {
 ; CHECK-LABEL: @test_atomicrmw_fadd_f32(