[Thumb1] Re-write emitThumbRegPlusImmediate

This was motivated by a bug which caused code like this to be
miscompiled:
  declare void @take_ptr(i8*)
  define void @test() {
    %addr1 = alloca i8
    %addr2 = alloca i32, i32 1028
    call void @take_ptr(i8* %addr1)
    ret void
  }

This was emitting the following assembly to get the value of %addr1:
  add r0, sp, #1020
  add r0, r0, #8
However, "add r0, r0, #8" is not a valid Thumb1 instruction, and this
could not be assembled. The generated object file instead contained
this, resulting in r0 holding SP+8 rather than SP+1028:
  add r0, sp, #1020
  add r0, sp, #8
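
With the rewritten function, the second instruction is emitted in the
two-address tADDi8 form instead, which does exist in Thumb1 (the updated
test_local_var_big_offset test below checks for exactly this sequence):
  add r0, sp, #1020
  adds r0, #8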

This function looked like it could have caused miscompilations for
other combinations of registers and offsets (though I don't think it is
currently called with these), and the heuristic it used did not match
the emitted code in all cases.

llvm-svn: 222125
Oliver Stannard 2014-11-17 11:18:10 +00:00
parent 236b0ca790
commit 970b0d576c
4 changed files with 213 additions and 160 deletions

include/llvm/Support/MathExtras.h (View File)

@@ -595,10 +595,10 @@ inline uint64_t PowerOf2Floor(uint64_t A) {
 /// RoundUpToAlignment(5, 8) = 8
 /// RoundUpToAlignment(17, 8) = 24
 /// RoundUpToAlignment(~0LL, 8) = 0
+/// RoundUpToAlignment(321, 255) = 510
 /// \endcode
 inline uint64_t RoundUpToAlignment(uint64_t Value, uint64_t Align) {
-  assert(isPowerOf2_64(Align) && "Alignment must be power of 2!");
-  return (Value + Align - 1) & ~uint64_t(Align - 1);
+  return (Value + Align - 1) / Align * Align;
 }

 /// Returns the offset to the next integer (mod 2**64) that is greater than
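
This is the enabling change for the new cost model below: RoundUpToAlignment
no longer requires a power-of-2 alignment, because the rewritten
emitThumbRegPlusImmediate rounds by per-instruction immediate ranges such as
255 (tADDi8) and 508 (tADDspi), which are not powers of 2. A standalone
sketch contrasting the two formulas (hypothetical helper names, not part of
this commit):

  #include <cstdint>
  #include <cstdio>

  // Old formula: a bit mask, only correct when Align is a power of 2.
  static uint64_t roundUpMask(uint64_t Value, uint64_t Align) {
    return (Value + Align - 1) & ~uint64_t(Align - 1);
  }

  // New formula: divide then multiply, correct for any non-zero Align.
  static uint64_t roundUpDiv(uint64_t Value, uint64_t Align) {
    return (Value + Align - 1) / Align * Align;
  }

  int main() {
    // Power-of-2 alignment: both print 24.
    printf("%llu %llu\n", (unsigned long long)roundUpMask(17, 8),
                          (unsigned long long)roundUpDiv(17, 8));
    // Non-power-of-2 alignment: the mask version prints 513, the
    // division version prints the correct 510 (the new doc example).
    printf("%llu %llu\n", (unsigned long long)roundUpMask(321, 255),
                          (unsigned long long)roundUpDiv(321, 255));
    return 0;
  }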

lib/Target/ARM/Thumb1RegisterInfo.cpp (View File)

@@ -66,6 +66,10 @@ Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB,
                                       int Val,
                                       ARMCC::CondCodes Pred, unsigned PredReg,
                                       unsigned MIFlags) const {
+  assert((isARMLowRegister(DestReg) ||
+          isVirtualRegister(DestReg)) &&
+         "Thumb1 does not have ldr to high register");
+
   MachineFunction &MF = *MBB.getParent();
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   MachineConstantPool *ConstantPool = MF.getConstantPool();
@@ -106,15 +110,15 @@ void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB,
     NumBytes = -NumBytes;
   }
   unsigned LdReg = DestReg;
-  if (DestReg == ARM::SP) {
+  if (DestReg == ARM::SP)
     assert(BaseReg == ARM::SP && "Unexpected!");
+  if (!isARMLowRegister(DestReg) && !MRI.isVirtualRegister(DestReg))
     LdReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass);
-  }

-  if (NumBytes <= 255 && NumBytes >= 0)
+  if (NumBytes <= 255 && NumBytes >= 0 && CanChangeCC) {
     AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
       .addImm(NumBytes).setMIFlags(MIFlags);
-  else if (NumBytes < 0 && NumBytes >= -255) {
+  } else if (NumBytes < 0 && NumBytes >= -255 && CanChangeCC) {
     AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
       .addImm(NumBytes).setMIFlags(MIFlags);
     AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg))
@@ -124,7 +128,8 @@ void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB,
                        ARMCC::AL, 0, MIFlags);

   // Emit add / sub.
-  int Opc = (isSub) ? ARM::tSUBrr : (isHigh ? ARM::tADDhirr : ARM::tADDrr);
+  int Opc = (isSub) ? ARM::tSUBrr : ((isHigh || !CanChangeCC) ? ARM::tADDhirr
+                                                              : ARM::tADDrr);
   MachineInstrBuilder MIB =
     BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
   if (Opc != ARM::tADDhirr)
@@ -136,32 +141,10 @@ void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB,
   AddDefaultPred(MIB);
 }

-/// calcNumMI - Returns the number of instructions required to materialize
-/// the specific add / sub r, c instruction.
-static unsigned calcNumMI(int Opc, int ExtraOpc, unsigned Bytes,
-                          unsigned NumBits, unsigned Scale) {
-  unsigned NumMIs = 0;
-  unsigned Chunk = ((1 << NumBits) - 1) * Scale;
-  if (Opc == ARM::tADDrSPi) {
-    unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
-    Bytes -= ThisVal;
-    NumMIs++;
-    NumBits = 8;
-    Scale = 1;  // Followed by a number of tADDi8.
-    Chunk = ((1 << NumBits) - 1) * Scale;
-  }
-  NumMIs += Bytes / Chunk;
-  if ((Bytes % Chunk) != 0)
-    NumMIs++;
-  if (ExtraOpc)
-    NumMIs++;
-  return NumMIs;
-}
-
 /// emitThumbRegPlusImmediate - Emits a series of instructions to materialize
-/// a destreg = basereg + immediate in Thumb code.
+/// a destreg = basereg + immediate in Thumb code. Tries a series of ADDs or
+/// SUBs first, and uses a constant pool value if the instruction sequence would
+/// be too long. This is allowed to modify the condition flags.
 void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator &MBBI,
                                      DebugLoc dl,
@@ -172,131 +155,146 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
   bool isSub = NumBytes < 0;
   unsigned Bytes = (unsigned)NumBytes;
   if (isSub) Bytes = -NumBytes;
-  bool isMul4 = (Bytes & 3) == 0;
-  bool isTwoAddr = false;
-  bool DstNotEqBase = false;
-  unsigned NumBits = 1;
-  unsigned Scale = 1;
-  int Opc = 0;
-  int ExtraOpc = 0;
-  bool NeedCC = false;
-
-  if (DestReg == BaseReg && BaseReg == ARM::SP) {
-    assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!");
-    NumBits = 7;
-    Scale = 4;
-    Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
-    isTwoAddr = true;
-  } else if (!isSub && BaseReg == ARM::SP) {
-    // r1 = add sp, 403
-    // =>
-    // r1 = add sp, 100 * 4
-    // r1 = add r1, 3
-    if (!isMul4) {
-      Bytes &= ~3;
-      ExtraOpc = ARM::tADDi3;
-    }
-    DstNotEqBase = true;
-    NumBits = 8;
-    Scale = 4;
-    Opc = ARM::tADDrSPi;
-  } else {
-    // sp = sub sp, c
-    // r1 = sub sp, c
-    // r8 = sub sp, c
-    if (DestReg != BaseReg)
-      DstNotEqBase = true;
-    if (DestReg == ARM::SP) {
-      Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
-      assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!");
-      NumBits = 7;
-      Scale = 4;
+  int CopyOpc = 0;
+  unsigned CopyBits = 0;
+  unsigned CopyScale = 1;
+  bool CopyNeedsCC = false;
+  int ExtraOpc = 0;
+  unsigned ExtraBits = 0;
+  unsigned ExtraScale = 1;
+  bool ExtraNeedsCC = false;
+
+  // Strategy:
+  // We need to select two types of instruction, maximizing the available
+  // immediate range of each. The instructions we use will depend on whether
+  // DestReg and BaseReg are low, high or the stack pointer.
+  // * CopyOpc  - DestReg = BaseReg + imm
+  //              This will be emitted once if DestReg != BaseReg, and never if
+  //              DestReg == BaseReg.
+  // * ExtraOpc - DestReg = DestReg + imm
+  //              This will be emitted as many times as necessary to add the
+  //              full immediate.
+  // If the immediate ranges of these instructions are not large enough to cover
+  // NumBytes with a reasonable number of instructions, we fall back to using a
+  // value loaded from a constant pool.
+  if (DestReg == ARM::SP) {
+    if (BaseReg == ARM::SP) {
+      // sp -> sp
+      // Already in right reg, no copy needed
     } else {
-      Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
-      NumBits = 8;
-      NeedCC = true;
+      // low -> sp or high -> sp
+      CopyOpc = ARM::tMOVr;
+      CopyBits = 0;
     }
-    isTwoAddr = true;
+    ExtraOpc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+    ExtraBits = 7;
+    ExtraScale = 4;
+  } else if (isARMLowRegister(DestReg)) {
+    if (BaseReg == ARM::SP) {
+      // sp -> low
+      assert(!isSub && "Thumb1 does not have tSUBrSPi");
+      CopyOpc = ARM::tADDrSPi;
+      CopyBits = 8;
+      CopyScale = 4;
+    } else if (DestReg == BaseReg) {
+      // low -> same low
+      // Already in right reg, no copy needed
+    } else if (isARMLowRegister(BaseReg)) {
+      // low -> different low
+      CopyOpc = isSub ? ARM::tSUBi3 : ARM::tADDi3;
+      CopyBits = 3;
+      CopyNeedsCC = true;
+    } else {
+      // high -> low
+      CopyOpc = ARM::tMOVr;
+      CopyBits = 0;
+    }
+    ExtraOpc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
+    ExtraBits = 8;
+    ExtraNeedsCC = true;
+  } else /* DestReg is high */ {
+    if (DestReg == BaseReg) {
+      // high -> same high
+      // Already in right reg, no copy needed
+    } else {
+      // {low,high,sp} -> high
+      CopyOpc = ARM::tMOVr;
+      CopyBits = 0;
+    }
+    ExtraOpc = 0;
   }
-  unsigned NumMIs = calcNumMI(Opc, ExtraOpc, Bytes, NumBits, Scale);
+
+  // We could handle an unaligned immediate with an unaligned copy instruction
+  // and an aligned extra instruction, but this case is not currently needed.
+  assert(((Bytes & 3) == 0 || ExtraScale == 1) &&
+         "Unaligned offset, but all instructions require alignment");
+
+  unsigned CopyRange = ((1 << CopyBits) - 1) * CopyScale;
+  // If we would emit the copy with an immediate of 0, just use tMOVr.
+  if (CopyOpc && Bytes < CopyScale) {
+    CopyOpc = ARM::tMOVr;
+    CopyBits = 0;
+    CopyScale = 1;
+    CopyNeedsCC = false;
+    CopyRange = 0;
+  }
+  unsigned ExtraRange = ((1 << ExtraBits) - 1) * ExtraScale; // per instruction
+  unsigned RequiredCopyInstrs = CopyOpc ? 1 : 0;
+  unsigned RangeAfterCopy = (CopyRange > Bytes) ? 0 : (Bytes - CopyRange);
+
+  // We could handle this case when the copy instruction does not require an
+  // aligned immediate, but we do not currently do this.
+  assert(RangeAfterCopy % ExtraScale == 0 &&
+         "Extra instruction requires immediate to be aligned");
+
+  unsigned RequiredExtraInstrs;
+  if (ExtraRange)
+    RequiredExtraInstrs = RoundUpToAlignment(RangeAfterCopy, ExtraRange) / ExtraRange;
+  else if (RangeAfterCopy > 0)
+    // We need an extra instruction but none is available
+    RequiredExtraInstrs = 1000000;
+  else
+    RequiredExtraInstrs = 0;
+  unsigned RequiredInstrs = RequiredCopyInstrs + RequiredExtraInstrs;
   unsigned Threshold = (DestReg == ARM::SP) ? 3 : 2;
-  if (NumMIs > Threshold) {
-    // This will expand into too many instructions. Load the immediate from a
-    // constpool entry.
+
+  // Use a constant pool, if the sequence of ADDs/SUBs is too expensive.
+  if (RequiredInstrs > Threshold) {
     emitThumbRegPlusImmInReg(MBB, MBBI, dl,
                              DestReg, BaseReg, NumBytes, true,
                              TII, MRI, MIFlags);
     return;
   }
-  if (DstNotEqBase) {
-    if (isARMLowRegister(DestReg) && isARMLowRegister(BaseReg)) {
-      // If both are low registers, emit DestReg = add BaseReg, max(Imm, 7)
-      unsigned Chunk = (1 << 3) - 1;
-      unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
-      Bytes -= ThisVal;
-      const MCInstrDesc &MCID = TII.get(isSub ? ARM::tSUBi3 : ARM::tADDi3);
-      const MachineInstrBuilder MIB =
-        AddDefaultT1CC(BuildMI(MBB, MBBI, dl, MCID, DestReg)
-                         .setMIFlags(MIFlags));
-      AddDefaultPred(MIB.addReg(BaseReg, RegState::Kill).addImm(ThisVal));
-    } else if (isARMLowRegister(DestReg) && BaseReg == ARM::SP && Bytes > 0) {
-      unsigned ThisVal = std::min(1020U, Bytes / 4 * 4);
-      Bytes -= ThisVal;
-      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), DestReg)
-                       .addReg(BaseReg, RegState::Kill).addImm(ThisVal / 4))
-        .setMIFlags(MIFlags);
-    } else {
-      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg)
-                       .addReg(BaseReg, RegState::Kill))
-        .setMIFlags(MIFlags);
+  // Emit zero or one copy instructions
+  if (CopyOpc) {
+    unsigned CopyImm = std::min(Bytes, CopyRange) / CopyScale;
+    Bytes -= CopyImm * CopyScale;
+
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(CopyOpc), DestReg);
+    if (CopyNeedsCC)
+      MIB = AddDefaultT1CC(MIB);
+    MIB.addReg(BaseReg, RegState::Kill);
+    if (CopyOpc != ARM::tMOVr) {
+      MIB.addImm(CopyImm);
     }
+    AddDefaultPred(MIB.setMIFlags(MIFlags));

     BaseReg = DestReg;
   }
-  unsigned Chunk = ((1 << NumBits) - 1) * Scale;
+
+  // Emit zero or more in-place add/sub instructions
   while (Bytes) {
-    unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
-    Bytes -= ThisVal;
-    ThisVal /= Scale;
-    // Build the new tADD / tSUB.
-    if (isTwoAddr) {
-      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
-      if (NeedCC)
-        MIB = AddDefaultT1CC(MIB);
-      MIB.addReg(DestReg).addImm(ThisVal);
-      MIB = AddDefaultPred(MIB);
-      MIB.setMIFlags(MIFlags);
-    } else {
-      bool isKill = BaseReg != ARM::SP;
-      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
-      if (NeedCC)
-        MIB = AddDefaultT1CC(MIB);
-      MIB.addReg(BaseReg, getKillRegState(isKill)).addImm(ThisVal);
-      MIB = AddDefaultPred(MIB);
-      MIB.setMIFlags(MIFlags);
+    unsigned ExtraImm = std::min(Bytes, ExtraRange) / ExtraScale;
+    Bytes -= ExtraImm * ExtraScale;

-      BaseReg = DestReg;
-      if (Opc == ARM::tADDrSPi) {
-        // r4 = add sp, imm
-        // r4 = add r4, imm
-        // ...
-        NumBits = 8;
-        Scale = 1;
-        Chunk = ((1 << NumBits) - 1) * Scale;
-        Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
-        NeedCC = isTwoAddr = true;
-      }
-    }
-  }
-  if (ExtraOpc) {
-    const MCInstrDesc &MCID = TII.get(ExtraOpc);
-    AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, MCID, DestReg))
-                   .addReg(DestReg, RegState::Kill)
-                   .addImm(((unsigned)NumBytes) & 3)
-                   .setMIFlags(MIFlags));
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(ExtraOpc), DestReg);
+    if (ExtraNeedsCC)
+      MIB = AddDefaultT1CC(MIB);
+    MIB.addReg(BaseReg).addImm(ExtraImm);
+    MIB = AddDefaultPred(MIB);
+    MIB.setMIFlags(MIFlags);
   }
 }
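
To see how the new selection logic reproduces the motivating example, here is
a hypothetical standalone model of the cost computation (requiredInstrs is not
a function in the patch) for one case only: DestReg is a low register and
BaseReg is SP, so the copy is tADDrSPi (8-bit immediate, scaled by 4) and the
extra instruction is tADDi8 (8-bit immediate, unscaled):

  #include <cstdio>

  static unsigned requiredInstrs(unsigned Bytes) {
    unsigned CopyRange = ((1u << 8) - 1) * 4;  // tADDrSPi reaches 1020
    unsigned ExtraRange = ((1u << 8) - 1) * 1; // each tADDi8 adds up to 255
    if (Bytes < 4)
      CopyRange = 0; // the copy degenerates to tMOVr and covers 0 bytes
    unsigned RangeAfterCopy = (CopyRange > Bytes) ? 0 : (Bytes - CopyRange);
    // One copy plus ceil(RangeAfterCopy / ExtraRange) extra instructions;
    // the ceiling division is what RoundUpToAlignment(...) / ExtraRange does.
    return 1 + (RangeAfterCopy + ExtraRange - 1) / ExtraRange;
  }

  int main() {
    printf("%u\n", requiredInstrs(1020)); // 1: add r0, sp, #1020
    printf("%u\n", requiredInstrs(1028)); // 2: add r0, sp, #1020; adds r0, #8
    printf("%u\n", requiredInstrs(1275)); // 2: add r0, sp, #1020; adds r0, #255
    printf("%u\n", requiredInstrs(1276)); // 3: over the non-SP threshold of 2
    return 0;
  }

For 1028 bytes this yields the two-instruction sequence from the commit
message; 1276 bytes is the first offset over the threshold, at which point the
real function falls back to emitThumbRegPlusImmInReg and a constant-pool load.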

View File

@@ -1,5 +1,7 @@
 ; RUN: llc < %s -mtriple=thumbv6-apple-darwin | FileCheck %s
 ; RUN: llc < %s -mtriple=thumbv6-apple-darwin -regalloc=basic | FileCheck %s
+; RUN: llc < %s -o %t -filetype=obj -mtriple=thumbv6-apple-darwin
+; RUN: llvm-objdump -triple=thumbv6-apple-darwin -d %t | FileCheck %s

 @__bar = external hidden global i8*
 @__baz = external hidden global i8*
@@ -49,13 +51,13 @@ define void @test_local_var_addr() {
   %addr2 = alloca i8
 ; CHECK: mov r0, sp
-; CHECK: adds r0, r0, #{{[0-9]+}}
-; CHECK: blx _take_ptr
+; CHECK: adds r0, #{{[0-9]+}}
+; CHECK: blx
   call void @take_ptr(i8* %addr1)
 ; CHECK: mov r0, sp
-; CHECK: adds r0, r0, #{{[0-9]+}}
-; CHECK: blx _take_ptr
+; CHECK: adds r0, #{{[0-9]+}}
+; CHECK: blx
   call void @take_ptr(i8* %addr2)
   ret void
@@ -70,7 +72,7 @@ define void @test_simple_var() {
 ; CHECK: mov r0, sp
 ; CHECK-NOT: adds r0
-; CHECK: blx _take_ptr
+; CHECK: blx
   call void @take_ptr(i8* %addr8)
   ret void
 }
@@ -85,12 +87,12 @@ define void @test_local_var_addr_aligned() {
   %addr2 = bitcast i32* %addr2.32 to i8*
 ; CHECK: add r0, sp, #{{[0-9]+}}
-; CHECK: blx _take_ptr
+; CHECK: blx
   call void @take_ptr(i8* %addr1)
 ; CHECK: mov r0, sp
 ; CHECK-NOT: add r0
-; CHECK: blx _take_ptr
+; CHECK: blx
   call void @take_ptr(i8* %addr2)
   ret void
@@ -104,8 +106,35 @@ define void @test_local_var_big_offset() {
   %addr2.32 = alloca i32, i32 257
 ; CHECK: add [[RTMP:r[0-9]+]], sp, #1020
-; CHECL: add r0, [[RTMP]], #8
-; CHECK: blx _take_ptr
+; CHECK: adds [[RTMP]], #8
+; CHECK: blx
   call void @take_ptr(i8* %addr1)
   ret void
 }
+
+; Max range addressable with tADDrSPi
+define void @test_local_var_offset_1020() {
+; CHECK-LABEL: test_local_var_offset_1020
+  %addr1 = alloca i8, i32 4
+  %addr2 = alloca i8, i32 1020
+; CHECK: add r0, sp, #1020
+; CHECK-NEXT: blx
+  call void @take_ptr(i8* %addr1)
+  ret void
+}
+
+; Max range addressable with tADDrSPi + tADDi8
+define void @test_local_var_offset_1275() {
+; CHECK-LABEL: test_local_var_offset_1275
+  %addr1 = alloca i8, i32 1
+  %addr2 = alloca i8, i32 1275
+; CHECK: add r0, sp, #1020
+; CHECK: adds r0, #255
+; CHECK-NEXT: blx
+  call void @take_ptr(i8* %addr1)
+  ret void
+}
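
A natural companion test, not part of this commit, would pin down the first
offset that exceeds the two-instruction budget, where the rewritten function
falls back to a constant-pool load. The check lines below are illustrative
only, modelled on the ldr/add checks in the next test file:

  ; Offset 1276 would need three ADDs, so a constant pool is used instead
  define void @test_local_var_offset_1276() {
  ; CHECK-LABEL: test_local_var_offset_1276
    %addr1 = alloca i8, i32 4
    %addr2 = alloca i8, i32 1276
  ; CHECK: ldr r0,
  ; CHECK: add r0, sp
    call void @take_ptr(i8* %addr1)
    ret void
  }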

test/CodeGen/Thumb/large-stack.ll (View File)

@@ -1,31 +1,57 @@
-; RUN: llc < %s -mtriple=thumb-apple-ios | FileCheck %s
+; RUN: llc < %s -mtriple=thumb-apple-ios | FileCheck %s --check-prefix=CHECK --check-prefix=IOS
+; RUN: llc < %s -mtriple=thumb-none-eabi | FileCheck %s --check-prefix=CHECK --check-prefix=EABI
+; RUN: llc < %s -o %t -filetype=obj -mtriple=thumbv6-apple-ios
+; RUN: llvm-objdump -triple=thumbv6-apple-ios -d %t | FileCheck %s --check-prefix=CHECK --check-prefix=IOS
+; RUN: llc < %s -o %t -filetype=obj -mtriple=thumbv6-none-eabi
+; RUN: llvm-objdump -triple=thumbv6-none-eabi -d %t | FileCheck %s --check-prefix=CHECK --check-prefix=EABI

+; Largest stack for which a single tADDspi/tSUBspi is enough
 define void @test1() {
 ; CHECK-LABEL: test1:
-; CHECK: sub sp, #256
-; CHECK: add sp, #256
-    %tmp = alloca [ 64 x i32 ] , align 4
+; CHECK: sub sp, #508
+; CHECK: add sp, #508
+    %tmp = alloca [ 508 x i8 ] , align 4
     ret void
 }
+
+; Largest stack for which three tADDspi/tSUBspis are enough
+define void @test100() {
+; CHECK-LABEL: test100:
+; CHECK: sub sp, #508
+; CHECK: sub sp, #508
+; CHECK: sub sp, #508
+; EABI: add sp, #508
+; EABI: add sp, #508
+; EABI: add sp, #508
+; IOS: subs r4, r7, #4
+; IOS: mov sp, r4
+    %tmp = alloca [ 1524 x i8 ] , align 4
+    ret void
+}
+
+; Smallest stack for which we use a constant pool
 define void @test2() {
 ; CHECK-LABEL: test2:
-; CHECK: ldr r0, LCPI
+; CHECK: ldr r0,
 ; CHECK: add sp, r0
-; CHECK: subs r4, r7, #4
-; CHECK: mov sp, r4
-    %tmp = alloca [ 4168 x i8 ] , align 4
+; EABI: ldr r0,
+; EABI: add sp, r0
+; IOS: subs r4, r7, #4
+; IOS: mov sp, r4
+    %tmp = alloca [ 1528 x i8 ] , align 4
     ret void
 }
+
 define i32 @test3() {
 ; CHECK-LABEL: test3:
-; CHECK: ldr r1, LCPI
+; CHECK: ldr r1,
 ; CHECK: add sp, r1
-; CHECK: ldr r1, LCPI
+; CHECK: ldr r1,
 ; CHECK: add r1, sp
-; CHECK: subs r4, r7, #4
-; CHECK: mov sp, r4
+; EABI: ldr r1,
+; EABI: add sp, r1
+; IOS: subs r4, r7, #4
+; IOS: mov sp, r4
     %retval = alloca i32, align 4
     %tmp = alloca i32, align 4
     %a = alloca [805306369 x i8], align 16