[Thumb-1] Synthesize TBB/TBH instructions to make use of compressed jump tables
[Reapplying r284580 and r285917 with fix and testing to ensure emitted jump tables for Thumb-1 have 4-byte alignment]
The TBB and TBH instructions in Thumb-2 allow jump tables to be compressed into sequences of bytes or shorts respectively. These instructions do not exist in Thumb-1; however, it is possible to synthesize them out of a sequence of other instructions.
It turns out this sequence is so short that it's almost never a loss for performance and is ALWAYS a significant win for code size.
TBB example:
Before:                  After:
  lsls r0, r0, #2          add  r0, pc
  adr  r1, .LJTI0_0        ldrb r0, [r0, #6]
  ldr  r0, [r0, r1]        lsls r0, r0, #1
  mov  pc, r0              add  pc, r0
=> No change in prologue code size or dynamic instruction count. Jump table shrunk by a factor of 4.
The only case that can increase dynamic instruction count is the TBH case:
Before:                  After:
  lsls r0, r4, #2          lsls r4, r4, #1
  adr  r1, .LJTI0_0        add  r4, pc
  ldr  r0, [r0, r1]        ldrh r4, [r4, #6]
  mov  pc, r0              lsls r4, r4, #1
                           add  pc, r4
=> 1 more instruction in prologue. Jump table shrunk by a factor of 2.
So there is an argument that this should be disabled when optimizing for performance (and a TBH needs to be generated). I'm not so sure about that in practice, because on small cores with Thumb-1 performance is often tied to code size. But I'm willing to turn it off when optimizing for performance if people want (also note that TBHs are fairly rare in practice!)
llvm-svn: 285690
2016-11-01 21:37:41 +08:00
|
|
|
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -relocation-model=pic | FileCheck %s --check-prefix=CHECK --check-prefix=T2
|
|
|
|
; RUN: llc < %s -mtriple=thumbv6m-apple-darwin -relocation-model=pic | FileCheck %s --check-prefix=CHECK --check-prefix=T1
|
|
|
|
; RUN: llc < %s -mtriple=thumbv6m-apple-darwin -relocation-model=static | FileCheck %s --check-prefix=CHECK --check-prefix=T1
|
2009-07-29 10:18:14 +08:00
|
|
|
|
2009-07-30 07:20:20 +08:00
|
|
|
; Thumb2 target should reorder the bb's in order to use tbb / tbh.
|
|
|
|
|
2009-07-29 10:18:14 +08:00
|
|
|
%struct.R_flstr = type { i32, i32, i8* }
|
|
|
|
%struct._T_tstr = type { i32, %struct.R_flstr*, %struct._T_tstr* }
|
|
|
|
@_C_nextcmd = external global i32 ; <i32*> [#uses=3]
|
|
|
|
@.str31 = external constant [28 x i8], align 1 ; <[28 x i8]*> [#uses=1]
|
|
|
|
@_T_gtol = external global %struct._T_tstr* ; <%struct._T_tstr**> [#uses=2]
|
|
|
|
|
2010-06-17 23:18:27 +08:00
|
|
|
; External declaration of @strlen; @main calls it in block bb23.i.
declare i32 @strlen(i8* nocapture) nounwind readonly
|
2009-07-29 10:18:14 +08:00
|
|
|
|
2010-06-17 23:18:27 +08:00
|
|
|
; External declaration of @Z_fatal (marked noreturn); @main calls it in block
; bb39.i, the switch's default destination, with the address of @.str31.
declare void @Z_fatal(i8*) noreturn nounwind
|
2009-07-29 10:18:14 +08:00
|
|
|
|
2010-06-17 23:18:27 +08:00
|
|
|
; External declaration of @calloc; @main calls it in blocks bb33.i, bb34.i,
; bb35.i and bb37.i.
declare noalias i8* @calloc(i32, i32) nounwind
|
2009-07-29 10:18:14 +08:00
|
|
|
|
2015-06-01 03:22:07 +08:00
|
|
|
; Jump tables are not anchored next to the TBB/TBH any more. Make sure the
|
|
|
|
; correct address is still calculated (i.e. via a PC-relative symbol *at* the
|
|
|
|
; TBB/TBH).
|
2010-06-17 23:18:27 +08:00
|
|
|
define i32 @main(i32 %argc, i8** nocapture %argv) nounwind {
|
2013-07-14 14:24:09 +08:00
|
|
|
; CHECK-LABEL: main:
|
2015-06-01 03:22:07 +08:00
|
|
|
; CHECK-NOT: adr {{r[0-9]+}}, LJTI
|
[Thumb-1] Synthesize TBB/TBH instructions to make use of compressed jump tables
[Reapplying r284580 and r285917 with fix and testing to ensure emitted jump tables for Thumb-1 have 4-byte alignment]
The TBB and TBH instructions in Thumb-2 allow jump tables to be compressed into sequences of bytes or shorts respectively. These instructions do not exist in Thumb-1; however, it is possible to synthesize them out of a sequence of other instructions.
It turns out this sequence is so short that it's almost never a loss for performance and is ALWAYS a significant win for code size.
TBB example:
Before:                  After:
  lsls r0, r0, #2          add  r0, pc
  adr  r1, .LJTI0_0        ldrb r0, [r0, #6]
  ldr  r0, [r0, r1]        lsls r0, r0, #1
  mov  pc, r0              add  pc, r0
=> No change in prologue code size or dynamic instruction count. Jump table shrunk by a factor of 4.
The only case that can increase dynamic instruction count is the TBH case:
Before:                  After:
  lsls r0, r4, #2          lsls r4, r4, #1
  adr  r1, .LJTI0_0        add  r4, pc
  ldr  r0, [r0, r1]        ldrh r4, [r4, #6]
  mov  pc, r0              lsls r4, r4, #1
                           add  pc, r4
=> 1 more instruction in prologue. Jump table shrunk by a factor of 2.
So there is an argument that this should be disabled when optimizing for performance (and a TBH needs to be generated). I'm not so sure about that in practice, because on small cores with Thumb-1 performance is often tied to code size. But I'm willing to turn it off when optimizing for performance if people want (also note that TBHs are fairly rare in practice!)
llvm-svn: 285690
2016-11-01 21:37:41 +08:00
|
|
|
; T1: lsls r[[x:[0-9]+]], {{r[0-9]+}}, #1
|
2015-06-01 03:22:07 +08:00
|
|
|
; CHECK: [[PCREL_ANCHOR:LCPI[0-9]+_[0-9]+]]:
|
[Thumb-1] Synthesize TBB/TBH instructions to make use of compressed jump tables
[Reapplying r284580 and r285917 with fix and testing to ensure emitted jump tables for Thumb-1 have 4-byte alignment]
The TBB and TBH instructions in Thumb-2 allow jump tables to be compressed into sequences of bytes or shorts respectively. These instructions do not exist in Thumb-1; however, it is possible to synthesize them out of a sequence of other instructions.
It turns out this sequence is so short that it's almost never a loss for performance and is ALWAYS a significant win for code size.
TBB example:
Before:                  After:
  lsls r0, r0, #2          add  r0, pc
  adr  r1, .LJTI0_0        ldrb r0, [r0, #6]
  ldr  r0, [r0, r1]        lsls r0, r0, #1
  mov  pc, r0              add  pc, r0
=> No change in prologue code size or dynamic instruction count. Jump table shrunk by a factor of 4.
The only case that can increase dynamic instruction count is the TBH case:
Before:                  After:
  lsls r0, r4, #2          lsls r4, r4, #1
  adr  r1, .LJTI0_0        add  r4, pc
  ldr  r0, [r0, r1]        ldrh r4, [r4, #6]
  mov  pc, r0              lsls r4, r4, #1
                           add  pc, r4
=> 1 more instruction in prologue. Jump table shrunk by a factor of 2.
So there is an argument that this should be disabled when optimizing for performance (and a TBH needs to be generated). I'm not so sure about that in practice, because on small cores with Thumb-1 performance is often tied to code size. But I'm willing to turn it off when optimizing for performance if people want (also note that TBHs are fairly rare in practice!)
llvm-svn: 285690
2016-11-01 21:37:41 +08:00
|
|
|
; T2-NEXT: tbb [pc, {{r[0-9]+}}]
|
|
|
|
; T1-NEXT: add pc, r[[x]]
|
2015-06-01 03:22:07 +08:00
|
|
|
|
|
|
|
; CHECK: LJTI0_0:
|
|
|
|
; CHECK-NEXT: .data_region jt8
|
|
|
|
; CHECK-NEXT: .byte (LBB{{[0-9]+_[0-9]+}}-([[PCREL_ANCHOR]]+4))/2
|
|
|
|
|
2009-07-29 10:18:14 +08:00
|
|
|
entry:
|
|
|
|
br label %bb42.i
|
|
|
|
|
|
|
|
bb1.i2: ; preds = %bb42.i
|
|
|
|
br label %bb40.i
|
|
|
|
|
|
|
|
bb5.i: ; preds = %bb42.i
|
2009-11-13 01:19:09 +08:00
|
|
|
%0 = or i32 %argc, 32 ; <i32> [#uses=1]
|
2009-07-29 10:18:14 +08:00
|
|
|
br label %bb40.i
|
|
|
|
|
|
|
|
bb7.i: ; preds = %bb42.i
|
2010-06-17 23:18:27 +08:00
|
|
|
call void @_T_addtol(%struct._T_tstr** @_T_gtol, i32 0, i8* null) nounwind
|
2009-07-29 10:18:14 +08:00
|
|
|
unreachable
|
|
|
|
|
|
|
|
bb15.i: ; preds = %bb42.i
|
2010-06-17 23:18:27 +08:00
|
|
|
call void @_T_addtol(%struct._T_tstr** @_T_gtol, i32 2, i8* null) nounwind
|
2009-07-29 10:18:14 +08:00
|
|
|
unreachable
|
|
|
|
|
|
|
|
bb23.i: ; preds = %bb42.i
|
2010-06-17 23:18:27 +08:00
|
|
|
%1 = call i32 @strlen(i8* null) nounwind readonly ; <i32> [#uses=0]
|
2009-07-29 10:18:14 +08:00
|
|
|
unreachable
|
|
|
|
|
|
|
|
bb33.i: ; preds = %bb42.i
|
|
|
|
store i32 0, i32* @_C_nextcmd, align 4
|
2010-06-17 23:18:27 +08:00
|
|
|
%2 = call noalias i8* @calloc(i32 21, i32 1) nounwind ; <i8*> [#uses=0]
|
2009-07-29 10:18:14 +08:00
|
|
|
unreachable
|
|
|
|
|
|
|
|
bb34.i: ; preds = %bb42.i
|
2015-02-28 05:17:42 +08:00
|
|
|
%3 = load i32, i32* @_C_nextcmd, align 4 ; <i32> [#uses=1]
|
2009-07-29 10:18:14 +08:00
|
|
|
%4 = add i32 %3, 1 ; <i32> [#uses=1]
|
|
|
|
store i32 %4, i32* @_C_nextcmd, align 4
|
2010-06-17 23:18:27 +08:00
|
|
|
%5 = call noalias i8* @calloc(i32 22, i32 1) nounwind ; <i8*> [#uses=0]
|
2009-07-29 10:18:14 +08:00
|
|
|
unreachable
|
|
|
|
|
|
|
|
bb35.i: ; preds = %bb42.i
|
2010-06-17 23:18:27 +08:00
|
|
|
%6 = call noalias i8* @calloc(i32 20, i32 1) nounwind ; <i8*> [#uses=0]
|
2009-07-29 10:18:14 +08:00
|
|
|
unreachable
|
|
|
|
|
|
|
|
bb37.i: ; preds = %bb42.i
|
2010-06-17 23:18:27 +08:00
|
|
|
%7 = call noalias i8* @calloc(i32 14, i32 1) nounwind ; <i8*> [#uses=0]
|
2009-07-29 10:18:14 +08:00
|
|
|
unreachable
|
|
|
|
|
|
|
|
bb39.i: ; preds = %bb42.i
|
2015-03-14 02:20:45 +08:00
|
|
|
call void @Z_fatal(i8* getelementptr ([28 x i8], [28 x i8]* @.str31, i32 0, i32 0)) nounwind
|
2009-07-29 10:18:14 +08:00
|
|
|
unreachable
|
|
|
|
|
|
|
|
bb40.i: ; preds = %bb42.i, %bb5.i, %bb1.i2
|
|
|
|
br label %bb42.i
|
|
|
|
|
|
|
|
bb42.i: ; preds = %bb40.i, %entry
|
2009-11-13 01:19:09 +08:00
|
|
|
switch i32 %argc, label %bb39.i [
|
2009-07-29 10:18:14 +08:00
|
|
|
i32 67, label %bb33.i
|
|
|
|
i32 70, label %bb35.i
|
|
|
|
i32 77, label %bb37.i
|
|
|
|
i32 83, label %bb34.i
|
|
|
|
i32 97, label %bb7.i
|
|
|
|
i32 100, label %bb5.i
|
|
|
|
i32 101, label %bb40.i
|
|
|
|
i32 102, label %bb23.i
|
|
|
|
i32 105, label %bb15.i
|
|
|
|
i32 116, label %bb1.i2
|
|
|
|
]
|
|
|
|
}
|
|
|
|
|
2010-06-17 23:18:27 +08:00
|
|
|
; External declaration of @_T_addtol; @main calls it in blocks bb7.i and
; bb15.i, passing @_T_gtol as the first argument.
declare void @_T_addtol(%struct._T_tstr** nocapture, i32, i8*) nounwind
|