From bbcf1c3496ce2bd1ed87e8fb15ad896e279633ce Mon Sep 17 00:00:00 2001 From: Victor Campos Date: Mon, 16 Dec 2019 14:22:15 +0000 Subject: [PATCH] [ARM] Improve codegen of volatile load/store of i64 Summary: Instead of generating two i32 instructions for each load or store of a volatile i64 value (two LDRs or STRs), now emit LDRD/STRD. These improvements cover architectures implementing ARMv5TE or Thumb-2. Reviewers: dmgreen, efriedma, john.brawn Reviewed By: efriedma Subscribers: kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D70072 --- llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp | 18 +++ llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 49 ++++++ llvm/lib/Target/ARM/ARMISelLowering.cpp | 57 ++++++- llvm/lib/Target/ARM/ARMISelLowering.h | 8 +- llvm/lib/Target/ARM/ARMInstrInfo.td | 23 +++ llvm/lib/Target/ARM/ARMInstrThumb2.td | 9 +- .../CodeGen/ARM/i64_volatile_load_store.ll | 153 ++++++++++++++++++ 7 files changed, 311 insertions(+), 6 deletions(-) create mode 100644 llvm/test/CodeGen/ARM/i64_volatile_load_store.ll diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index de4377ec5a47..2c3ac816219f 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -1952,6 +1952,24 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MI.eraseFromParent(); return true; } + case ARM::LOADDUAL: + case ARM::STOREDUAL: { + Register PairReg = MI.getOperand(0).getReg(); + + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == ARM::LOADDUAL ? ARM::LDRD : ARM::STRD)) + .addReg(TRI->getSubReg(PairReg, ARM::gsub_0), + Opcode == ARM::LOADDUAL ? RegState::Define : 0) + .addReg(TRI->getSubReg(PairReg, ARM::gsub_1), + Opcode == ARM::LOADDUAL ? RegState::Define : 0); + for (unsigned i = 1; i < MI.getNumOperands(); i++) + MIB.add(MI.getOperand(i)); + MIB.add(predOps(ARMCC::AL)); + MIB.cloneMemRefs(MI); + MI.eraseFromParent(); + return true; + } } } diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 1f998defbd1b..dcdec779bbec 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -145,6 +145,8 @@ public: // Thumb 2 Addressing Modes: bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm); + template + bool SelectT2AddrModeImm8(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectT2AddrModeImm8(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N, @@ -1294,6 +1296,33 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, return true; } +template +bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N, SDValue &Base, + SDValue &OffImm) { + if (N.getOpcode() == ISD::SUB || CurDAG->isBaseWithConstantOffset(N)) { + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), 1 << Shift, -255, 256, RHSC)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); + } + + if (N.getOpcode() == ISD::SUB) + RHSC = -RHSC; + OffImm = + CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32); + return true; + } + } + + // Base only. + Base = N; + OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); + return true; +} + bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N, SDValue &Base, SDValue &OffImm) { // Match simple R - imm8 operands. @@ -3486,6 +3515,26 @@ void ARMDAGToDAGISel::Select(SDNode *N) { CurDAG->RemoveDeadNode(N); return; } + case ARMISD::LDRD: { + if (Subtarget->isThumb2()) + break; // TableGen handles isel in this case. + SDValue Base, RegOffset, ImmOffset; + const SDValue &Chain = N->getOperand(0); + const SDValue &Addr = N->getOperand(1); + SelectAddrMode3(Addr, Base, RegOffset, ImmOffset); + SDValue Ops[] = {Base, RegOffset, ImmOffset, Chain}; + SDNode *New = CurDAG->getMachineNode(ARM::LOADDUAL, dl, + {MVT::Untyped, MVT::Other}, Ops); + SDValue Lo = CurDAG->getTargetExtractSubreg(ARM::gsub_0, dl, MVT::i32, + SDValue(New, 0)); + SDValue Hi = CurDAG->getTargetExtractSubreg(ARM::gsub_1, dl, MVT::i32, + SDValue(New, 0)); + ReplaceUses(SDValue(N, 0), Lo); + ReplaceUses(SDValue(N, 1), Hi); + ReplaceUses(SDValue(N, 2), SDValue(New, 1)); + CurDAG->RemoveDeadNode(N); + return; + } case ARMISD::LOOP_DEC: { SDValue Ops[] = { N->getOperand(1), N->getOperand(2), diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 0b4d39ec3080..67d436fb8faa 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1070,6 +1070,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SRA, MVT::i64, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); + setOperationAction(ISD::LOAD, MVT::i64, Custom); + setOperationAction(ISD::STORE, MVT::i64, Custom); // MVE lowers 64 bit shifts to lsll and lsrl // assuming that ISD::SRL and SRA of i64 are already marked custom @@ -1593,6 +1595,9 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::PRELOAD: return "ARMISD::PRELOAD"; + case ARMISD::LDRD: return "ARMISD::LDRD"; + case ARMISD::STRD: return "ARMISD::STRD"; + case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK"; case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK"; @@ -9081,6 +9086,24 @@ static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) { return DAG.getMergeValues({Pred, Load.getValue(1)}, dl); } +void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl &Results, + SelectionDAG &DAG) const { + LoadSDNode *LD = cast(N); + EVT MemVT = LD->getMemoryVT(); + assert(LD->isUnindexed() && "Loads should be unindexed at this point."); + + if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() && + !Subtarget->isThumb1Only() && LD->isVolatile()) { + SDLoc dl(N); + SDValue Result = DAG.getMemIntrinsicNode( + ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}), + {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand()); + SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, + Result.getValue(0), Result.getValue(1)); + Results.append({Pair, Result.getValue(2)}); + } +} + static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) { StoreSDNode *ST = cast(Op.getNode()); EVT MemVT = ST->getMemoryVT(); @@ -9110,6 +9133,34 @@ static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) { ST->getMemOperand()); } +static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { + StoreSDNode *ST = cast(Op.getNode()); + EVT MemVT = ST->getMemoryVT(); + assert(ST->isUnindexed() && "Stores should be unindexed at this point."); + + if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() && + !Subtarget->isThumb1Only() && ST->isVolatile()) { + SDNode *N = Op.getNode(); + SDLoc dl(N); + + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), + DAG.getTargetConstant(0, dl, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), + DAG.getTargetConstant(1, dl, MVT::i32)); + + return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other), + {ST->getChain(), Lo, Hi, ST->getBasePtr()}, + MemVT, ST->getMemOperand()); + } else if (Subtarget->hasMVEIntegerOps() && + ((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || + MemVT == MVT::v16i1))) { + return LowerPredicateStore(Op, DAG); + } + + return SDValue(); +} + static bool isZeroVector(SDValue N) { return (ISD::isBuildVectorAllZeros(N.getNode()) || (N->getOpcode() == ARMISD::VMOVIMM && @@ -9297,7 +9348,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::LOAD: return LowerPredicateLoad(Op, DAG); case ISD::STORE: - return LowerPredicateStore(Op, DAG); + return LowerSTORE(Op, DAG, Subtarget); case ISD::MLOAD: return LowerMLOAD(Op, DAG); case ISD::ATOMIC_LOAD: @@ -9399,7 +9450,9 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, case ISD::ABS: lowerABS(N, Results, DAG); return ; - + case ISD::LOAD: + LowerLOAD(N, Results, DAG); + break; } if (Res.getNode()) Results.push_back(Res); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index afb4750ee35a..a63217ed87b5 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -278,7 +278,11 @@ class VectorType; VST4_UPD, VST2LN_UPD, VST3LN_UPD, - VST4LN_UPD + VST4LN_UPD, + + // Load/Store of dual registers + LDRD, + STRD }; } // end namespace ARMISD @@ -731,6 +735,8 @@ class VectorType; SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; void lowerABS(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const; + void LowerLOAD(SDNode *N, SmallVectorImpl &Results, + SelectionDAG &DAG) const; Register getRegisterByName(const char* RegName, EVT VT, const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 1cab1747ff4e..ba374e8f07d7 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -243,6 +243,12 @@ def ARMqsub8b : SDNode<"ARMISD::QSUB8b", SDT_ARMAnd, []>; def ARMqadd16b : SDNode<"ARMISD::QADD16b", SDT_ARMAnd, []>; def ARMqsub16b : SDNode<"ARMISD::QSUB16b", SDT_ARMAnd, []>; +def SDT_ARMldrd : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; +def ARMldrd : SDNode<"ARMISD::LDRD", SDT_ARMldrd, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +def SDT_ARMstrd : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; +def ARMstrd : SDNode<"ARMISD::STRD", SDT_ARMstrd, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + // Vector operations shared between NEON and MVE def ARMvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>; @@ -2695,6 +2701,12 @@ let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in { Requires<[IsARM, HasV5TE]>; } +let mayLoad = 1, hasSideEffects = 0, hasNoSchedulingInfo = 1 in { +def LOADDUAL : ARMPseudoInst<(outs GPRPairOp:$Rt), (ins addrmode3:$addr), + 64, IIC_iLoad_d_r, []>, + Requires<[IsARM, HasV5TE]>; +} + def LDA : AIldracq<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr), NoItinerary, "lda", "\t$Rt, $addr", []>; def LDAB : AIldracq<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr), @@ -2970,6 +2982,17 @@ let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in { } } +let mayStore = 1, hasSideEffects = 0, hasNoSchedulingInfo = 1 in { +def STOREDUAL : ARMPseudoInst<(outs), (ins GPRPairOp:$Rt, addrmode3:$addr), + 64, IIC_iStore_d_r, []>, + Requires<[IsARM, HasV5TE]>; +} + +let Predicates = [IsARM, HasV5TE] in { +def : Pat<(ARMstrd GPR:$Rt, GPR:$Rt2, addrmode3:$addr), + (STOREDUAL (REG_SEQUENCE GPRPair, GPR:$Rt, gsub_0, GPR:$Rt2, gsub_1), addrmode3:$addr)>; +} + // Indexed stores multiclass AI2_stridx { diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td index 7044c92a7f11..00921930e71a 100644 --- a/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -270,7 +270,8 @@ def t2am_imm8_offset : MemOperand, // t2addrmode_imm8s4 := reg +/- (imm8 << 2) def MemImm8s4OffsetAsmOperand : AsmOperandClass {let Name = "MemImm8s4Offset";} -class T2AddrMode_Imm8s4 : MemOperand { +class T2AddrMode_Imm8s4 : MemOperand, + ComplexPattern", []> { let EncoderMethod = "getT2AddrModeImm8s4OpValue"; let DecoderMethod = "DecodeT2AddrModeImm8s4"; let ParserMatchClass = MemImm8s4OffsetAsmOperand; @@ -1412,7 +1413,8 @@ let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in { // Load doubleword def t2LDRDi8 : T2Ii8s4<1, 0, 1, (outs rGPR:$Rt, rGPR:$Rt2), (ins t2addrmode_imm8s4:$addr), - IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", []>, + IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", + [(set rGPR:$Rt, rGPR:$Rt2, (ARMldrd t2addrmode_imm8s4:$addr))]>, Sched<[WriteLd]>; } // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 @@ -1593,7 +1595,8 @@ defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si, let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs), (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr), - IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", []>, + IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", + [(ARMstrd rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr)]>, Sched<[WriteST]>; // Indexed stores diff --git a/llvm/test/CodeGen/ARM/i64_volatile_load_store.ll b/llvm/test/CodeGen/ARM/i64_volatile_load_store.ll new file mode 100644 index 000000000000..7461d3651eb3 --- /dev/null +++ b/llvm/test/CodeGen/ARM/i64_volatile_load_store.ll @@ -0,0 +1,153 @@ +; RUN: llc -mtriple=armv5e-arm-none-eabi %s -o - | FileCheck %s --check-prefixes=CHECK-ARMV5TE,CHECK +; RUN: llc -mtriple=thumbv6t2-arm-none-eabi %s -o - | FileCheck %s --check-prefixes=CHECK-T2,CHECK +; RUN: llc -mtriple=armv4t-arm-none-eabi %s -o - | FileCheck %s --check-prefixes=CHECK-ARMV4T,CHECK + +@x = common dso_local global i64 0, align 8 +@y = common dso_local global i64 0, align 8 + +define void @test() { +entry: +; CHECK-LABEL: test: +; CHECK-ARMV5TE: ldr [[ADDR0:r[0-9]+]] +; CHECK-ARMV5TE-NEXT: ldr [[ADDR1:r[0-9]+]] +; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]]] +; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]]] +; CHECK-T2: movw [[ADDR0:r[0-9]+]], :lower16:x +; CHECK-T2-NEXT: movw [[ADDR1:r[0-9]+]], :lower16:y +; CHECK-T2-NEXT: movt [[ADDR0]], :upper16:x +; CHECK-T2-NEXT: movt [[ADDR1]], :upper16:y +; CHECK-T2-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]]] +; CHECK-T2-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]]] +; CHECK-ARMV4T: ldr [[ADDR0:r[0-9]+]] +; CHECK-ARMV4T-NEXT: ldr [[ADDR1:r[0-9]+]] +; CHECK-ARMV4T-NEXT: ldr [[R1:r[0-9]+]], {{\[}}[[ADDR0]]] +; CHECK-ARMV4T-NEXT: ldr [[R0:r[0-9]+]], {{\[}}[[ADDR0]], #4] +; CHECK-ARMV4T-NEXT: str [[R0]], {{\[}}[[ADDR1]], #4] +; CHECK-ARMV4T-NEXT: str [[R1]], {{\[}}[[ADDR1]]] + %0 = load volatile i64, i64* @x, align 8 + store volatile i64 %0, i64* @y, align 8 + ret void +} + +define void @test_offset() { +entry: +; CHECK-LABEL: test_offset: +; CHECK-ARMV5TE: ldr [[ADDR0:r[0-9]+]] +; CHECK-ARMV5TE-NEXT: ldr [[ADDR1:r[0-9]+]] +; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #-4] +; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]], #-4] +; CHECK-T2: movw [[ADDR0:r[0-9]+]], :lower16:x +; CHECK-T2-NEXT: movw [[ADDR1:r[0-9]+]], :lower16:y +; CHECK-T2-NEXT: movt [[ADDR0]], :upper16:x +; CHECK-T2-NEXT: movt [[ADDR1]], :upper16:y +; CHECK-T2-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #-4] +; CHECK-T2-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]], #-4] +; CHECK-ARMV4T: ldr [[ADDR0:r[0-9]+]] +; CHECK-ARMV4T-NEXT: ldr [[ADDR1:r[0-9]+]] +; CHECK-ARMV4T-NEXT: ldr [[R0:r[0-9]+]], {{\[}}[[ADDR0]], #-4] +; CHECK-ARMV4T-NEXT: ldr [[R1:r[0-9]+]], {{\[}}[[ADDR0]]] +; CHECK-ARMV4T-NEXT: str [[R1]], {{\[}}[[ADDR1]]] +; CHECK-ARMV4T-NEXT: str [[R0]], {{\[}}[[ADDR1]], #-4] + %0 = load volatile i64, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @x to i8*), i32 -4) to i64*), align 8 + store volatile i64 %0, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @y to i8*), i32 -4) to i64*), align 8 + ret void +} + +define void @test_offset_1() { +; CHECK-LABEL: test_offset_1: +; CHECK-ARMV5TE: ldr [[ADDR0:r[0-9]+]] +; CHECK-ARMV5TE-NEXT: ldr [[ADDR1:r[0-9]+]] +; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #255] +; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]], #255] +; CHECK-T2: adds [[ADDR0:r[0-9]+]], #255 +; CHECK-T2-NEXT: adds [[ADDR1:r[0-9]+]], #255 +; CHECK-T2-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]]] +; CHECK-T2-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]]] +; CHECK-ARMV4T: ldr [[ADDR0:r[0-9]+]] +; CHECK-ARMV4T-NEXT: ldr [[ADDR1:r[0-9]+]] +; CHECK-ARMV4T-NEXT: ldr [[R0:r[0-9]+]], {{\[}}[[ADDR0]], #255] +; CHECK-ARMV4T-NEXT: ldr [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #259] +; CHECK-ARMV4T-NEXT: str [[R1]], {{\[}}[[ADDR1]], #259] +; CHECK-ARMV4T-NEXT: str [[R0]], {{\[}}[[ADDR1]], #255] +entry: + %0 = load volatile i64, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @x to i8*), i32 255) to i64*), align 8 + store volatile i64 %0, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @y to i8*), i32 255) to i64*), align 8 + ret void +} + +define void @test_offset_2() { +; CHECK-LABEL: test_offset_2: +; CHECK-ARMV5TE: ldr [[ADDR0:r[0-9]+]] +; CHECK-ARMV5TE-NEXT: mov [[OFFSET0:r[0-9]+]], #256 +; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]], [[OFFSET0]]] +; CHECK-ARMV5TE-NEXT: ldr [[ADDR1:r[0-9]+]] +; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]], [[OFFSET0]]] +; CHECK-T2: movw [[ADDR0:r[0-9]+]], :lower16:x +; CHECK-T2-NEXT: movw [[ADDR1:r[0-9]+]], :lower16:y +; CHECK-T2-NEXT: movt [[ADDR0]], :upper16:x +; CHECK-T2-NEXT: movt [[ADDR1]], :upper16:y +; CHECK-T2-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #256] +; CHECK-T2-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]], #256] +; CHECK-ARMV4T: ldr [[ADDR0:r[0-9]+]] +; CHECK-ARMV4T-NEXT: ldr [[ADDR1:r[0-9]+]] +; CHECK-ARMV4T-NEXT: ldr [[R0:r[0-9]+]], {{\[}}[[ADDR0]], #256] +; CHECK-ARMV4T-NEXT: ldr [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #260] +; CHECK-ARMV4T-NEXT: str [[R1]], {{\[}}[[ADDR1]], #260] +; CHECK-ARMV4T-NEXT: str [[R0]], {{\[}}[[ADDR1]], #256] +entry: + %0 = load volatile i64, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @x to i8*), i32 256) to i64*), align 8 + store volatile i64 %0, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @y to i8*), i32 256) to i64*), align 8 + ret void +} + +define void @test_offset_3() { +; CHECK-LABEL: test_offset_3: +; CHECK-ARMV5TE: ldr [[ADDR0:r[0-9]+]] +; CHECK-ARMV5TE-NEXT: mov [[OFFSET0:r[0-9]+]], #1020 +; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]], [[OFFSET0]]] +; CHECK-ARMV5TE-NEXT: ldr [[ADDR1:r[0-9]+]] +; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]], [[OFFSET0]]] +; CHECK-T2: movw [[ADDR0:r[0-9]+]], :lower16:x +; CHECK-T2-NEXT: movw [[ADDR1:r[0-9]+]], :lower16:y +; CHECK-T2-NEXT: movt [[ADDR0]], :upper16:x +; CHECK-T2-NEXT: movt [[ADDR1]], :upper16:y +; CHECK-T2-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #1020] +; CHECK-T2-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]], #1020] +; CHECK-ARMV4T: ldr [[ADDR0:r[0-9]+]] +; CHECK-ARMV4T-NEXT: ldr [[ADDR1:r[0-9]+]] +; CHECK-ARMV4T-NEXT: ldr [[R0:r[0-9]+]], {{\[}}[[ADDR0]], #1020] +; CHECK-ARMV4T-NEXT: ldr [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #1024] +; CHECK-ARMV4T-NEXT: str [[R1]], {{\[}}[[ADDR1]], #1024] +; CHECK-ARMV4T-NEXT: str [[R0]], {{\[}}[[ADDR1]], #1020] +entry: + %0 = load volatile i64, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @x to i8*), i32 1020) to i64*), align 8 + store volatile i64 %0, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @y to i8*), i32 1020) to i64*), align 8 + ret void +} + +define void @test_offset_4() { +; CHECK-LABEL: test_offset_4: +; CHECK-ARMV5TE: ldr [[ADDR0:r[0-9]+]] +; CHECK-ARMV5TE-NEXT: mov [[OFFSET0:r[0-9]+]], #1024 +; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]], [[OFFSET0]]] +; CHECK-ARMV5TE: ldr [[ADDR1:r[0-9]+]] +; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]], [[OFFSET0]]] +; CHECK-T2: movw [[ADDR1:r[0-9]+]], :lower16:y +; CHECK-T2-NEXT: movw [[ADDR0:r[0-9]+]], :lower16:x +; CHECK-T2-NEXT: movt [[ADDR1]], :upper16:y +; CHECK-T2-NEXT: movt [[ADDR0]], :upper16:x +; CHECK-T2-NEXT: add.w [[ADDR0]], [[ADDR0]], #1024 +; CHECK-T2-NEXT: add.w [[ADDR1]], [[ADDR1]], #1024 +; CHECK-T2-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]]] +; CHECK-T2-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]]] +; CHECK-ARMV4T: ldr [[ADDR0:r[0-9]+]] +; CHECK-ARMV4T-NEXT: ldr [[ADDR1:r[0-9]+]] +; CHECK-ARMV4T-NEXT: ldr [[R0:r[0-9]+]], {{\[}}[[ADDR0]], #1024] +; CHECK-ARMV4T-NEXT: ldr [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #1028] +; CHECK-ARMV4T-NEXT: str [[R1]], {{\[}}[[ADDR1]], #1028] +; CHECK-ARMV4T-NEXT: str [[R0]], {{\[}}[[ADDR1]], #1024] +entry: + %0 = load volatile i64, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @x to i8*), i32 1024) to i64*), align 8 + store volatile i64 %0, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @y to i8*), i32 1024) to i64*), align 8 + ret void +}