[CodeGen][ARM] Implement atomicrmw as pseudo operations at -O0

atomicrmw instructions are expanded by AtomicExpandPass into cmpxchg loops before register
allocation. Register allocation can insert spills between the exclusive loads and stores, which
invalidates the exclusive monitor and can lead to infinite loops.

To avoid this, reimplement atomicrmw operations as pseudo-instructions and expand them
after register allocation.
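
For context, a minimal IR sketch of the kind of input affected (illustrative only; the function and
value names are not from the commit's tests, and the suggested llc flags are an assumption). At -O0
on an ARM-mode target this atomicrmw is now left alone in IR, reaches instruction selection, becomes
an ATOMIC_LOAD_ADD_32 pseudo, and is only expanded into an ldrex/strex loop after register
allocation:

; Hypothetical example, e.g. compiled with llc -O0 -mtriple=armv7-- (flags assumed).
define i32 @rmw_add(i32* %ptr, i32 %val) {
  %old = atomicrmw add i32* %ptr, i32 %val seq_cst
  ret i32 %old
}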

Floating point legalisation:
f16 ATOMIC_LOAD_FADD(*f16, f16) is legalised to
f32 ATOMIC_LOAD_FADD(*i16, f32) and then eventually
f32 ATOMIC_LOAD_FADD_16(*i16, f32)
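
A hedged sketch of an input that exercises this path (hypothetical, not taken from the commit's
tests): the half result is promoted to f32 during float result promotion while the in-memory type
stays 16 bits wide, so selection ends up on the ATOMIC_LOAD_FADD_16 pseudo.

; Hypothetical f16 example; names are made up.
define half @rmw_fadd_f16(half* %ptr, half %val) {
  %old = atomicrmw fadd half* %ptr, half %val seq_cst
  ret half %old
}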

Differential Revision: https://reviews.llvm.org/D101164
Tomas Matheson 2021-03-31 17:45:45 +01:00
parent 985ab6e1fa
commit 3338290c18
9 changed files with 1577 additions and 8 deletions


@@ -2255,6 +2255,11 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
case ISD::FREM:
case ISD::FSUB: R = PromoteFloatRes_BinOp(N); break;
case ISD::ATOMIC_LOAD_FADD:
case ISD::ATOMIC_LOAD_FSUB:
R = PromoteFloatRes_ATOMIC_LOAD_FXXX(N);
break;
case ISD::FMA: // FMA is same as FMAD
case ISD::FMAD: R = PromoteFloatRes_FMAD(N); break;
@@ -2453,6 +2458,21 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_FP_ROUND(SDNode *N) {
return DAG.getNode(GetPromotionOpcode(VT, NVT), DL, NVT, Round);
}
SDValue DAGTypeLegalizer::PromoteFloatRes_ATOMIC_LOAD_FXXX(SDNode *N) {
AtomicSDNode *A = cast<AtomicSDNode>(N);
EVT VT = N->getValueType(0);
// Load the value as an integer value with the same number of bits.
EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
SDValue PromotedVal = GetPromotedFloat(A->getVal());
SDValue NewA =
DAG.getAtomic(A->getOpcode(), SDLoc(N), IVT, A->getChain(),
A->getBasePtr(), PromotedVal, A->getMemOperand());
ReplaceValueWith(SDValue(A, 1), NewA.getValue(1));
return NewA;
}
SDValue DAGTypeLegalizer::PromoteFloatRes_LOAD(SDNode *N) {
LoadSDNode *L = cast<LoadSDNode>(N);
EVT VT = N->getValueType(0);


@@ -671,6 +671,7 @@ private:
SDValue PromoteFloatRes_FMAD(SDNode *N);
SDValue PromoteFloatRes_FPOWI(SDNode *N);
SDValue PromoteFloatRes_FP_ROUND(SDNode *N);
SDValue PromoteFloatRes_ATOMIC_LOAD_FXXX(SDNode *N);
SDValue PromoteFloatRes_LOAD(SDNode *N);
SDValue PromoteFloatRes_SELECT(SDNode *N);
SDValue PromoteFloatRes_SELECT_CC(SDNode *N);


@@ -928,6 +928,25 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
// Handle DPR to/from GPRPair
const auto *TRI = &getRegisterInfo();
if (ARM::DPRRegClass.contains(SrcReg) &&
ARM::GPRPairRegClass.contains(DestReg)) {
BuildMI(MBB, I, DL, get(ARM::VMOVRRD))
.addReg(TRI->getSubReg(DestReg, ARM::gsub_0), RegState::Define)
.addReg(TRI->getSubReg(DestReg, ARM::gsub_1), RegState::Define)
.addReg(SrcReg, getKillRegState(KillSrc))
.add(predOps(ARMCC::AL));
return;
} else if (ARM::GPRPairRegClass.contains(SrcReg) &&
ARM::DPRRegClass.contains(DestReg)) {
BuildMI(MBB, I, DL, get(ARM::VMOVDRR), DestReg)
.addReg(TRI->getSubReg(SrcReg, ARM::gsub_0), getKillRegState(KillSrc))
.addReg(TRI->getSubReg(SrcReg, ARM::gsub_1), getKillRegState(KillSrc))
.add(predOps(ARMCC::AL));
return;
}
// Handle register classes that require multiple instructions.
unsigned BeginIdx = 0;
unsigned SubRegs = 0;
@@ -1013,7 +1032,6 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
assert(Opc && "Impossible reg-to-reg copy");
const TargetRegisterInfo *TRI = &getRegisterInfo();
MachineInstrBuilder Mov;
// Copy register tuples backward when the first Dest reg overlaps with SrcReg.
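
An illustrative IR input (hypothetical, not from the commit's tests) that exercises the new
GPRPair<->DPR copies above: a double-precision atomicrmw fadd is selected to ATOMIC_LOAD_FADD_64,
whose post-RA expansion copies between the DPR destination and the GPRPair temporary used by
ldrexd/strexd.

; Hypothetical example; at -O0 on ARM mode the ATOMIC_LOAD_FADD_64 expansion
; uses the VMOVRRD/VMOVDRR copies added in this hunk.
define double @rmw_fadd_f64(double* %ptr, double %val) {
  %old = atomicrmw fadd double* %ptr, double %val seq_cst
  ret double %old
}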


@@ -107,6 +107,10 @@ namespace {
MachineBasicBlock::iterator MBBI, unsigned LdrexOp,
unsigned StrexOp, unsigned UxtOp,
MachineBasicBlock::iterator &NextMBBI);
bool ExpandAtomicOp(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, const int Size,
unsigned PseudoOp,
MachineBasicBlock::iterator &NextMBBI);
bool ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
@@ -1657,16 +1661,270 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
/// ARM's ldrexd/strexd take a consecutive register pair (represented as a
/// single GPRPair register), Thumb's take two separate registers so we need to
/// extract the subregs from the pair.
static void addExclusiveRegPair(MachineInstrBuilder &MIB, MachineOperand &Reg,
static void addExclusiveRegPair(MachineInstrBuilder &MIB, Register Reg,
unsigned Flags, bool IsThumb,
const TargetRegisterInfo *TRI) {
if (IsThumb) {
Register RegLo = TRI->getSubReg(Reg.getReg(), ARM::gsub_0);
Register RegHi = TRI->getSubReg(Reg.getReg(), ARM::gsub_1);
Register RegLo = TRI->getSubReg(Reg, ARM::gsub_0);
Register RegHi = TRI->getSubReg(Reg, ARM::gsub_1);
MIB.addReg(RegLo, Flags);
MIB.addReg(RegHi, Flags);
} else
MIB.addReg(Reg.getReg(), Flags);
MIB.addReg(Reg, Flags);
}
static void
makeAtomicUpdateInstrs(const unsigned PseudoOp, MachineBasicBlock *LoadStoreBB,
const DebugLoc &DL, const ARMBaseInstrInfo *TII,
const Register DestReg, const Register ValReg) {
auto BasicOp = [&](unsigned Opcode) {
auto MIB = BuildMI(LoadStoreBB, DL, TII->get(Opcode), DestReg)
.addReg(DestReg, RegState::Kill)
.addReg(ValReg)
.add(predOps(ARMCC::AL));
if (Opcode != ARM::VADDS && Opcode != ARM::VSUBS && Opcode != ARM::VADDD &&
Opcode != ARM::VSUBD)
// Add the 's' bit operand (always reg0 here); the floating point
// operations excluded above don't have it.
MIB.addReg(0);
};
auto MinMax = [&](ARMCC::CondCodes Condition) {
BuildMI(LoadStoreBB, DL, TII->get(ARM::CMPrr), DestReg)
.addReg(ValReg)
.add(predOps(ARMCC::AL));
BuildMI(LoadStoreBB, DL, TII->get(ARM::MOVr), DestReg)
.addReg(ValReg)
.add(predOps(Condition))
.add(condCodeOp()); // 's' bit
};
switch (PseudoOp) {
// No operations (swaps)
case ARM::ATOMIC_SWAP_8:
case ARM::ATOMIC_SWAP_16:
case ARM::ATOMIC_SWAP_32:
case ARM::ATOMIC_SWAP_64:
llvm_unreachable("Swap should be handled at call site.");
return;
// Basic binary operation
case ARM::ATOMIC_LOAD_ADD_8:
case ARM::ATOMIC_LOAD_ADD_16:
case ARM::ATOMIC_LOAD_ADD_32:
case ARM::ATOMIC_LOAD_ADD_64:
return BasicOp(ARM::ADDrr);
case ARM::ATOMIC_LOAD_SUB_8:
case ARM::ATOMIC_LOAD_SUB_16:
case ARM::ATOMIC_LOAD_SUB_32:
case ARM::ATOMIC_LOAD_SUB_64:
return BasicOp(ARM::SUBrr);
case ARM::ATOMIC_LOAD_AND_8:
case ARM::ATOMIC_LOAD_AND_16:
case ARM::ATOMIC_LOAD_AND_32:
case ARM::ATOMIC_LOAD_AND_64:
return BasicOp(ARM::ANDrr);
case ARM::ATOMIC_LOAD_OR_8:
case ARM::ATOMIC_LOAD_OR_16:
case ARM::ATOMIC_LOAD_OR_32:
case ARM::ATOMIC_LOAD_OR_64:
return BasicOp(ARM::ORRrr);
case ARM::ATOMIC_LOAD_XOR_8:
case ARM::ATOMIC_LOAD_XOR_16:
case ARM::ATOMIC_LOAD_XOR_32:
case ARM::ATOMIC_LOAD_XOR_64:
return BasicOp(ARM::EORrr);
case ARM::ATOMIC_LOAD_FADD_16:
case ARM::ATOMIC_LOAD_FADD_32:
return BasicOp(ARM::VADDS);
case ARM::ATOMIC_LOAD_FADD_64:
return BasicOp(ARM::VADDD);
case ARM::ATOMIC_LOAD_FSUB_16:
case ARM::ATOMIC_LOAD_FSUB_32:
return BasicOp(ARM::VSUBS);
case ARM::ATOMIC_LOAD_FSUB_64:
return BasicOp(ARM::VSUBD);
// Minimum or maximum operations
case ARM::ATOMIC_LOAD_MAX_8:
case ARM::ATOMIC_LOAD_MAX_16:
case ARM::ATOMIC_LOAD_MAX_32:
case ARM::ATOMIC_LOAD_MAX_64:
case ARM::ATOMIC_LOAD_UMAX_8:
case ARM::ATOMIC_LOAD_UMAX_16:
case ARM::ATOMIC_LOAD_UMAX_32:
case ARM::ATOMIC_LOAD_UMAX_64:
return MinMax(ARMCC::LE);
case ARM::ATOMIC_LOAD_MIN_8:
case ARM::ATOMIC_LOAD_MIN_16:
case ARM::ATOMIC_LOAD_MIN_32:
case ARM::ATOMIC_LOAD_MIN_64:
case ARM::ATOMIC_LOAD_UMIN_8:
case ARM::ATOMIC_LOAD_UMIN_16:
case ARM::ATOMIC_LOAD_UMIN_32:
case ARM::ATOMIC_LOAD_UMIN_64:
return MinMax(ARMCC::GE);
// NAND
case ARM::ATOMIC_LOAD_NAND_8:
case ARM::ATOMIC_LOAD_NAND_16:
case ARM::ATOMIC_LOAD_NAND_32:
case ARM::ATOMIC_LOAD_NAND_64:
BuildMI(LoadStoreBB, DL, TII->get(ARM::ANDrr), DestReg)
.addReg(DestReg, RegState::Kill)
.addReg(ValReg)
.add(predOps(ARMCC::AL))
.addReg(0); // 's' bit
BuildMI(LoadStoreBB, DL, TII->get(ARM::MVNr), DestReg)
.addReg(DestReg, RegState::Kill)
.add(predOps(ARMCC::AL))
.addReg(0); // 's' bit
return;
}
llvm_unreachable("unexpected opcode");
}
bool ARMExpandPseudo::ExpandAtomicOp(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const int Size, const unsigned PseudoOp,
MachineBasicBlock::iterator &NextMBBI) {
assert(!STI->isThumb() && "atomic pseudo-instructions are ARM only");
unsigned LdrexOp;
unsigned StrexOp;
switch (Size) {
case 8:
LdrexOp = ARM::LDREXB;
StrexOp = ARM::STREXB;
break;
case 16:
LdrexOp = ARM::LDREXH;
StrexOp = ARM::STREXH;
break;
case 32:
LdrexOp = ARM::LDREX;
StrexOp = ARM::STREX;
break;
case 64:
LdrexOp = ARM::LDREXD;
StrexOp = ARM::STREXD;
break;
default:
llvm_unreachable("Invalid Size");
}
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
MachineOperand &Dest = MI.getOperand(0);
MachineOperand &Temp = MI.getOperand(1);
// If Temp is a GPRPair, MiniTempReg is the first of the pair
Register MiniTempReg =
ARM::GPRPairRegClass.contains(Temp.getReg())
? (Register)TRI->getSubReg(Temp.getReg(), ARM::gsub_0)
: Temp.getReg();
assert(ARM::GPRRegClass.contains(MiniTempReg));
Register AddrReg = MI.getOperand(2).getReg();
Register ValReg = MI.getOperand(3).getReg();
// TempReg is GPR and is used for load/store operations.
// DestReg is either GPR or DPR and is used for arithmetic operations.
// LoadStoreBB:
//   TempReg = LoadExclusive [AddrReg]
//   DestReg = mov TempReg
//   if xchg:
//     TempReg = mov ValReg
//   else:
//     DestReg = Operation DestReg, ValReg
//     TempReg = mov DestReg
//   MiniTempReg = StoreExclusive TempReg, [AddrReg]
//   cmp MiniTempReg, #0
//   bne LoadStoreBB
//   b DoneBB
// DoneBB:
//   bx lr
MachineFunction *MF = MBB.getParent();
auto *LoadStoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
auto *DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
MF->insert(++MBB.getIterator(), LoadStoreBB);
MF->insert(++LoadStoreBB->getIterator(), DoneBB);
MachineInstrBuilder MIB;
// LoadExclusive into temporary general purpose register (pair)
MIB = BuildMI(LoadStoreBB, DL, TII->get(LdrexOp));
addExclusiveRegPair(MIB, Temp.getReg(), RegState::Define, STI->isThumb(),
TRI);
MIB.addReg(AddrReg);
MIB.add(predOps(ARMCC::AL));
// Copy Temp into Dest. For floating point operations this is GPR -> DPR.
TII->copyPhysReg(*LoadStoreBB, LoadStoreBB->end(), DL, Dest.getReg(),
Temp.getReg(), true /* KillSrc */);
const bool IsXchg =
PseudoOp == ARM::ATOMIC_SWAP_8 || PseudoOp == ARM::ATOMIC_SWAP_16 ||
PseudoOp == ARM::ATOMIC_SWAP_32 || PseudoOp == ARM::ATOMIC_SWAP_64;
if (IsXchg) {
// Copy ValReg into Temp. For floating point operations this is DPR -> GPR.
TII->copyPhysReg(*LoadStoreBB, LoadStoreBB->end(), DL, Temp.getReg(),
ValReg, false /* KillSrc */);
} else {
// Update the value in Dest with the results of the operation
makeAtomicUpdateInstrs(PseudoOp, LoadStoreBB, DL, TII, Dest.getReg(),
ValReg);
// Copy Dest into Temp. For floating point operations this is DPR -> GPR.
TII->copyPhysReg(*LoadStoreBB, LoadStoreBB->end(), DL, Temp.getReg(),
Dest.getReg(), false /* KillSrc */);
}
// StoreExclusive Temp to Addr, store success in Temp (or MiniTempReg)
MIB = BuildMI(LoadStoreBB, DL, TII->get(StrexOp));
addExclusiveRegPair(MIB, MiniTempReg, RegState::Define, STI->isThumb(), TRI);
MIB.addReg(Temp.getReg(), RegState::Kill);
MIB.addReg(AddrReg);
MIB.add(predOps(ARMCC::AL));
// Compare to zero
BuildMI(LoadStoreBB, DL, TII->get(ARM::CMPri))
.addReg(MiniTempReg, RegState::Kill)
.addImm(0)
.add(predOps(ARMCC::AL));
// Branch to LoadStoreBB if failed
BuildMI(LoadStoreBB, DL, TII->get(ARM::Bcc))
.addMBB(LoadStoreBB)
.addImm(ARMCC::NE)
.addReg(ARM::CPSR, RegState::Kill);
// Branch to DoneBB if success
BuildMI(LoadStoreBB, DL, TII->get(ARM::B)).addMBB(DoneBB);
LoadStoreBB->addSuccessor(LoadStoreBB);
LoadStoreBB->addSuccessor(DoneBB);
// Copy remaining instructions in MBB into DoneBB
DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
DoneBB->transferSuccessors(&MBB);
MBB.addSuccessor(LoadStoreBB);
NextMBBI = MBB.end();
MI.eraseFromParent();
// Recompute livein lists.
LivePhysRegs LiveRegs;
computeAndAddLiveIns(LiveRegs, *DoneBB);
computeAndAddLiveIns(LiveRegs, *LoadStoreBB);
// Do an extra pass around the loop to get loop carried registers right.
LoadStoreBB->clearLiveIns();
computeAndAddLiveIns(LiveRegs, *LoadStoreBB);
return true;
}
/// Expand a 64-bit CMP_SWAP to an ldrexd/strexd loop.
@@ -1708,7 +1966,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
unsigned LDREXD = IsThumb ? ARM::t2LDREXD : ARM::LDREXD;
MachineInstrBuilder MIB;
MIB = BuildMI(LoadCmpBB, DL, TII->get(LDREXD));
addExclusiveRegPair(MIB, Dest, RegState::Define, IsThumb, TRI);
addExclusiveRegPair(MIB, Dest.getReg(), RegState::Define, IsThumb, TRI);
MIB.addReg(AddrReg).add(predOps(ARMCC::AL));
unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr;
@@ -1737,7 +1995,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
unsigned STREXD = IsThumb ? ARM::t2STREXD : ARM::STREXD;
MIB = BuildMI(StoreBB, DL, TII->get(STREXD), TempReg);
unsigned Flags = getKillRegState(New.isDead());
addExclusiveRegPair(MIB, New, Flags, IsThumb, TRI);
addExclusiveRegPair(MIB, New.getReg(), Flags, IsThumb, TRI);
MIB.addReg(AddrReg).add(predOps(ARMCC::AL));
unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
@@ -2803,6 +3061,64 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::CMP_SWAP_64:
return ExpandCMP_SWAP_64(MBB, MBBI, NextMBBI);
case ARM::ATOMIC_LOAD_ADD_8:
case ARM::ATOMIC_LOAD_AND_8:
case ARM::ATOMIC_LOAD_MAX_8:
case ARM::ATOMIC_LOAD_MIN_8:
case ARM::ATOMIC_LOAD_NAND_8:
case ARM::ATOMIC_LOAD_OR_8:
case ARM::ATOMIC_LOAD_SUB_8:
case ARM::ATOMIC_LOAD_UMAX_8:
case ARM::ATOMIC_LOAD_UMIN_8:
case ARM::ATOMIC_LOAD_XOR_8:
case ARM::ATOMIC_SWAP_8:
return ExpandAtomicOp(MBB, MBBI, 8, Opcode, NextMBBI);
case ARM::ATOMIC_LOAD_ADD_16:
case ARM::ATOMIC_LOAD_AND_16:
case ARM::ATOMIC_LOAD_FADD_16:
case ARM::ATOMIC_LOAD_FSUB_16:
case ARM::ATOMIC_LOAD_MAX_16:
case ARM::ATOMIC_LOAD_MIN_16:
case ARM::ATOMIC_LOAD_NAND_16:
case ARM::ATOMIC_LOAD_OR_16:
case ARM::ATOMIC_LOAD_SUB_16:
case ARM::ATOMIC_LOAD_UMAX_16:
case ARM::ATOMIC_LOAD_UMIN_16:
case ARM::ATOMIC_LOAD_XOR_16:
case ARM::ATOMIC_SWAP_16:
return ExpandAtomicOp(MBB, MBBI, 16, Opcode, NextMBBI);
case ARM::ATOMIC_LOAD_ADD_32:
case ARM::ATOMIC_LOAD_AND_32:
case ARM::ATOMIC_LOAD_FADD_32:
case ARM::ATOMIC_LOAD_FSUB_32:
case ARM::ATOMIC_LOAD_MAX_32:
case ARM::ATOMIC_LOAD_MIN_32:
case ARM::ATOMIC_LOAD_NAND_32:
case ARM::ATOMIC_LOAD_OR_32:
case ARM::ATOMIC_LOAD_SUB_32:
case ARM::ATOMIC_LOAD_UMAX_32:
case ARM::ATOMIC_LOAD_UMIN_32:
case ARM::ATOMIC_LOAD_XOR_32:
case ARM::ATOMIC_SWAP_32:
return ExpandAtomicOp(MBB, MBBI, 32, Opcode, NextMBBI);
case ARM::ATOMIC_LOAD_ADD_64:
case ARM::ATOMIC_LOAD_AND_64:
case ARM::ATOMIC_LOAD_FADD_64:
case ARM::ATOMIC_LOAD_FSUB_64:
case ARM::ATOMIC_LOAD_MAX_64:
case ARM::ATOMIC_LOAD_MIN_64:
case ARM::ATOMIC_LOAD_NAND_64:
case ARM::ATOMIC_LOAD_OR_64:
case ARM::ATOMIC_LOAD_SUB_64:
case ARM::ATOMIC_LOAD_UMAX_64:
case ARM::ATOMIC_LOAD_UMIN_64:
case ARM::ATOMIC_LOAD_XOR_64:
case ARM::ATOMIC_SWAP_64:
return ExpandAtomicOp(MBB, MBBI, 64, Opcode, NextMBBI);
case ARM::tBL_PUSHLR:
case ARM::BL_PUSHLR: {
const bool Thumb = Opcode == ARM::tBL_PUSHLR;


@@ -310,6 +310,7 @@ private:
void SelectCMPZ(SDNode *N, bool &SwitchEQNEToPLMI);
void SelectCMP_SWAP(SDNode *N);
void SelectAtomicOp(SDNode *N);
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions.
@@ -3318,6 +3319,142 @@ void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
CurDAG->RemoveDeadNode(N);
}
/// Expand atomic operations to size- and type-specific pseudo-instructions
void ARMDAGToDAGISel::SelectAtomicOp(SDNode *N) {
EVT MemTy = cast<MemSDNode>(N)->getMemoryVT();
const unsigned Opcode = [&]() {
switch (N->getOpcode()) {
case ISD::ATOMIC_SWAP:
if (MemTy == MVT::i8)
return ARM::ATOMIC_SWAP_8;
if (MemTy == MVT::i16)
return ARM::ATOMIC_SWAP_16;
if (MemTy == MVT::i32)
return ARM::ATOMIC_SWAP_32;
break;
case ISD::ATOMIC_LOAD_ADD:
if (MemTy == MVT::i8)
return ARM::ATOMIC_LOAD_ADD_8;
if (MemTy == MVT::i16)
return ARM::ATOMIC_LOAD_ADD_16;
if (MemTy == MVT::i32)
return ARM::ATOMIC_LOAD_ADD_32;
break;
case ISD::ATOMIC_LOAD_SUB:
if (MemTy == MVT::i8)
return ARM::ATOMIC_LOAD_SUB_8;
if (MemTy == MVT::i16)
return ARM::ATOMIC_LOAD_SUB_16;
if (MemTy == MVT::i32)
return ARM::ATOMIC_LOAD_SUB_32;
break;
case ISD::ATOMIC_LOAD_AND:
if (MemTy == MVT::i8)
return ARM::ATOMIC_LOAD_AND_8;
if (MemTy == MVT::i16)
return ARM::ATOMIC_LOAD_AND_16;
if (MemTy == MVT::i32)
return ARM::ATOMIC_LOAD_AND_32;
break;
case ISD::ATOMIC_LOAD_CLR:
llvm_unreachable("ATOMIC_LOAD_CLR in SelectAtomicOp");
break;
case ISD::ATOMIC_LOAD_OR:
if (MemTy == MVT::i8)
return ARM::ATOMIC_LOAD_OR_8;
if (MemTy == MVT::i16)
return ARM::ATOMIC_LOAD_OR_16;
if (MemTy == MVT::i32)
return ARM::ATOMIC_LOAD_OR_32;
break;
case ISD::ATOMIC_LOAD_XOR:
if (MemTy == MVT::i8)
return ARM::ATOMIC_LOAD_XOR_8;
if (MemTy == MVT::i16)
return ARM::ATOMIC_LOAD_XOR_16;
if (MemTy == MVT::i32)
return ARM::ATOMIC_LOAD_XOR_32;
break;
case ISD::ATOMIC_LOAD_NAND:
if (MemTy == MVT::i8)
return ARM::ATOMIC_LOAD_NAND_8;
if (MemTy == MVT::i16)
return ARM::ATOMIC_LOAD_NAND_16;
if (MemTy == MVT::i32)
return ARM::ATOMIC_LOAD_NAND_32;
break;
case ISD::ATOMIC_LOAD_MIN:
if (MemTy == MVT::i8)
return ARM::ATOMIC_LOAD_MIN_8;
if (MemTy == MVT::i16)
return ARM::ATOMIC_LOAD_MIN_16;
if (MemTy == MVT::i32)
return ARM::ATOMIC_LOAD_MIN_32;
break;
case ISD::ATOMIC_LOAD_MAX:
if (MemTy == MVT::i8)
return ARM::ATOMIC_LOAD_MAX_8;
if (MemTy == MVT::i16)
return ARM::ATOMIC_LOAD_MAX_16;
if (MemTy == MVT::i32)
return ARM::ATOMIC_LOAD_MAX_32;
break;
case ISD::ATOMIC_LOAD_UMIN:
if (MemTy == MVT::i8)
return ARM::ATOMIC_LOAD_UMIN_8;
if (MemTy == MVT::i16)
return ARM::ATOMIC_LOAD_UMIN_16;
if (MemTy == MVT::i32)
return ARM::ATOMIC_LOAD_UMIN_32;
break;
case ISD::ATOMIC_LOAD_UMAX:
if (MemTy == MVT::i8)
return ARM::ATOMIC_LOAD_UMAX_8;
if (MemTy == MVT::i16)
return ARM::ATOMIC_LOAD_UMAX_16;
if (MemTy == MVT::i32)
return ARM::ATOMIC_LOAD_UMAX_32;
break;
case ISD::ATOMIC_LOAD_FADD:
if (MemTy == MVT::i16)
return ARM::ATOMIC_LOAD_FADD_16; // f16 promoted to f32
if (MemTy == MVT::f16)
return ARM::ATOMIC_LOAD_FADD_16;
if (MemTy == MVT::f32)
return ARM::ATOMIC_LOAD_FADD_32;
if (MemTy == MVT::f64)
return ARM::ATOMIC_LOAD_FADD_64;
break;
case ISD::ATOMIC_LOAD_FSUB:
if (MemTy == MVT::i16)
return ARM::ATOMIC_LOAD_FSUB_16; // f16 promoted to f32
if (MemTy == MVT::f16)
return ARM::ATOMIC_LOAD_FSUB_16;
if (MemTy == MVT::f32)
return ARM::ATOMIC_LOAD_FSUB_32;
if (MemTy == MVT::f64)
return ARM::ATOMIC_LOAD_FSUB_64;
break;
}
llvm_unreachable("Unknown AtomicOp type");
return ARM::INSTRUCTION_LIST_END;
}();
SDValue Chain = N->getOperand(0);
SDValue Addr = N->getOperand(1);
SDValue Value = N->getOperand(2);
SDNode *Swap = CurDAG->getMachineNode(
Opcode, SDLoc(N), CurDAG->getVTList(Value.getValueType(), MVT::Other),
{Chain, Addr, Value});
MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Swap), {MemOp});
ReplaceUses(SDValue(N, 0), SDValue(Swap, 0)); // Result
ReplaceUses(SDValue(N, 1), SDValue(Swap, 1)); // Chain
CurDAG->RemoveDeadNode(N);
}
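
As a worked illustration of the mapping above (hypothetical IR, not from the commit's tests): an i8
exchange has memory type MVT::i8, so SelectAtomicOp picks ATOMIC_SWAP_8, which ExpandAtomicOp later
turns into an ldrexb/strexb loop.

; Hypothetical example; names are made up.
define i8 @swap_i8(i8* %ptr, i8 %val) {
  %old = atomicrmw xchg i8* %ptr, i8 %val seq_cst
  ret i8 %old
}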
static Optional<std::pair<unsigned, unsigned>>
getContiguousRangeOfSetBits(const APInt &A) {
unsigned FirstOne = A.getBitWidth() - A.countLeadingZeros() - 1;
@@ -5028,6 +5165,23 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
case ISD::ATOMIC_CMP_SWAP:
SelectCMP_SWAP(N);
return;
case ISD::ATOMIC_LOAD_ADD:
case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_AND:
case ISD::ATOMIC_LOAD_CLR:
case ISD::ATOMIC_LOAD_OR:
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_NAND:
case ISD::ATOMIC_LOAD_MIN:
case ISD::ATOMIC_LOAD_MAX:
case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_LOAD_UMAX:
case ISD::ATOMIC_LOAD_FADD:
case ISD::ATOMIC_LOAD_FSUB:
case ISD::ATOMIC_SWAP:
SelectAtomicOp(N);
return;
}
SelectCode(N);


@@ -19057,6 +19057,11 @@ ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
// and up to 64 bits on the non-M profiles
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
// At -O0 expand pseudo-instructions after register allocation to avoid
// inserting spills between ldrex/strex.
if (getTargetMachine().getOptLevel() == 0 && !Subtarget->isThumb())
return AtomicExpansionKind::None;
if (AI->isFloatingPointOperation())
return AtomicExpansionKind::CmpXChg;


@@ -6428,6 +6428,37 @@ def CMP_SWAP_64 : PseudoInst<(outs GPRPair:$Rd, GPR:$temp),
NoItinerary, []>, Sched<[]>;
}
let Constraints = "@earlyclobber $Rd,@earlyclobber $temp",
mayLoad = 1, mayStore = 1 in
multiclass AtomicRMW {
def _8 : PseudoInst<(outs GPR:$Rd, GPR:$temp), (ins GPR:$addr, GPR:$new), NoItinerary, []>, Sched<[]>;
def _16 : PseudoInst<(outs GPR:$Rd, GPR:$temp), (ins GPR:$addr, GPR:$new), NoItinerary, []>, Sched<[]>;
def _32 : PseudoInst<(outs GPR:$Rd, GPR:$temp), (ins GPR:$addr, GPR:$new), NoItinerary, []>, Sched<[]>;
def _64 : PseudoInst<(outs GPRPair:$Rd, GPR:$temp), (ins GPR:$addr, GPRPair:$new), NoItinerary, []>, Sched<[]>;
}
defm ATOMIC_SWAP : AtomicRMW;
defm ATOMIC_LOAD_ADD : AtomicRMW;
defm ATOMIC_LOAD_SUB : AtomicRMW;
defm ATOMIC_LOAD_AND : AtomicRMW;
defm ATOMIC_LOAD_CLR : AtomicRMW;
defm ATOMIC_LOAD_OR : AtomicRMW;
defm ATOMIC_LOAD_XOR : AtomicRMW;
defm ATOMIC_LOAD_NAND : AtomicRMW;
defm ATOMIC_LOAD_MIN : AtomicRMW;
defm ATOMIC_LOAD_MAX : AtomicRMW;
defm ATOMIC_LOAD_UMIN : AtomicRMW;
defm ATOMIC_LOAD_UMAX : AtomicRMW;
// Unlike the integer pseudos, FADD and FSUB return their result in an SPR or DPR;
// the 64-bit variant still uses a GPRPair temporary for ldrexd/strexd and the strexd status.
let Constraints = "@earlyclobber $Rd,@earlyclobber $temp",
mayLoad = 1, mayStore = 1 in
multiclass AtomicRMWFloat {
def _16 : PseudoInst<(outs SPR:$Rd, GPR:$temp), (ins GPR:$addr, SPR:$new), NoItinerary, []>, Sched<[]>;
def _32 : PseudoInst<(outs SPR:$Rd, GPR:$temp), (ins GPR:$addr, SPR:$new), NoItinerary, []>, Sched<[]>;
def _64 : PseudoInst<(outs DPR:$Rd, GPRPair:$temp), (ins GPR:$addr, DPR:$new), NoItinerary, []>, Sched<[]>;
}
defm ATOMIC_LOAD_FADD : AtomicRMWFloat;
defm ATOMIC_LOAD_FSUB : AtomicRMWFloat;
def CompilerBarrier : PseudoInst<(outs), (ins i32imm:$ordering), NoItinerary,
[(atomic_fence timm:$ordering, 0)]> {
let hasSideEffects = 1;

File diff suppressed because it is too large


@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=armv7-apple-ios7.0 -atomic-expand %s | FileCheck %s
; RUN: opt -O1 -S -mtriple=armv7-apple-ios7.0 -atomic-expand %s | FileCheck %s
define float @test_atomicrmw_fadd_f32(float* %ptr, float %value) {
; CHECK-LABEL: @test_atomicrmw_fadd_f32(