2019-10-25 13:57:06 +08:00
; RUN: llc -march=bpfel -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK,CHECK-ALU64 %s
; RUN: llc -march=bpfel -mattr=+alu32 -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK,CHECK-ALU32 %s
2019-09-18 11:49:07 +08:00
;
; Source Code:
; #define _(x) (__builtin_preserve_access_index(x))
; struct s {int a; int b;};
; int test(struct s *arg) { return *(const int *)_(&arg->b); }
; Compiler flag to generate IR:
; clang -target bpf -S -O2 -g -emit-llvm test.c
%struct.s = type { i32 , i32 }
; Function Attrs: nounwind readonly
define d s o _ l o c a l i32 @test ( %struct.s * readonly %arg ) local_unnamed_addr #0 !dbg !11 {
entry:
call void @llvm.dbg.value ( metadata %struct.s * %arg , metadata !20 , metadata !DIExpression ( ) ) , !dbg !21
%0 = tail call i32 * @llvm.preserve.struct.access.index.p0i32.p0s_struct.ss ( %struct.s * %arg , i32 1 , i32 1 ) , !dbg !22 , !llvm.preserve.access.index !15
%1 = load i32 , i32 * %0 , align 4 , !dbg !23 , !tbaa !24
ret i32 %1 , !dbg !28
}
; CHECK-LABEL: test
[BPF] Enable relocation location for load/store/shifts
Previous btf field relocation is always at assignment like
r1 = 4
which is converted from an ld_imm64 instruction.
This patch did an optimization such that relocation
instruction might be load/store/shift. Specically, the
following insns may also have relocation, except BPF_MOV:
LDB, LDH, LDW, LDD, STB, STH, STW, STD,
LDB32, LDH32, LDW32, STB32, STH32, STW32,
SLL, SRL, SRA
To accomplish this, a few BPF target specific
codegen only instructions are invented. They
are generated at backend BPF SimplifyPatchable phase,
which is at early llc phase when SSA form is available.
The new codegen only instructions will be converted to
real proper instructions at the codegen and BTF emission stage.
Note that, as revealed by a few tests, this optimization might
be actual generating more relocations:
Scenario 1:
if (...) {
... __builtin_preserve_field_info(arg->b2, 0) ...
} else {
... __builtin_preserve_field_info(arg->b2, 0) ...
}
Compiler could do CSE to only have one relocation. But if both
of the above is translated into codegen internal instructions,
the compiler will not be able to do that.
Scenario 2:
offset = ... __builtin_preserve_field_info(arg->b2, 0) ...
...
... offset ...
... offset ...
... offset ...
For whatever reason, the compiler might be temporarily do copy
propagation of the righthand of "offset" assignment like
... __builtin_preserve_field_info(arg->b2, 0) ...
... __builtin_preserve_field_info(arg->b2, 0) ...
and CSE will be able to deduplicate later.
But if these intrinsics are converted to BPF pseudo instructions,
they will not be able to get deduplicated.
I do not expect we have big instruction count difference.
It may actually reduce instruction count since now relocation
is in deeper insn dependency chain.
For example, for test offset-reloc-fieldinfo-2.ll, this patch
generates 7 instead of 6 relocations for non-alu32 mode, but it
actually reduced instruction count from 29 to 26.
Differential Revision: https://reviews.llvm.org/D71790
2019-12-20 07:21:53 +08:00
; CHECK-ALU64: r0 = *(u32 *)(r1 + 4)
; CHECK-ALU32: w0 = *(u32 *)(r1 + 4)
2019-09-18 11:49:07 +08:00
; CHECK: exit
;
; CHECK: .long 1 # BTF_KIND_STRUCT(id = 2)
;
; CHECK: .byte 115 # string offset=1
; CHECK: .ascii ".text" # string offset=20
; CHECK: .ascii "0:1" # string offset=26
;
[BPF] do compile-once run-everywhere relocation for bitfields
A bpf specific clang intrinsic is introduced:
u32 __builtin_preserve_field_info(member_access, info_kind)
Depending on info_kind, different information will
be returned to the program. A relocation is also
recorded for this builtin so that bpf loader can
patch the instruction on the target host.
This clang intrinsic is used to get certain information
to facilitate struct/union member relocations.
The offset relocation is extended by 4 bytes to
include relocation kind.
Currently supported relocation kinds are
enum {
FIELD_BYTE_OFFSET = 0,
FIELD_BYTE_SIZE,
FIELD_EXISTENCE,
FIELD_SIGNEDNESS,
FIELD_LSHIFT_U64,
FIELD_RSHIFT_U64,
};
for __builtin_preserve_field_info. The old
access offset relocation is covered by
FIELD_BYTE_OFFSET = 0.
An example:
struct s {
int a;
int b1:9;
int b2:4;
};
enum {
FIELD_BYTE_OFFSET = 0,
FIELD_BYTE_SIZE,
FIELD_EXISTENCE,
FIELD_SIGNEDNESS,
FIELD_LSHIFT_U64,
FIELD_RSHIFT_U64,
};
void bpf_probe_read(void *, unsigned, const void *);
int field_read(struct s *arg) {
unsigned long long ull = 0;
unsigned offset = __builtin_preserve_field_info(arg->b2, FIELD_BYTE_OFFSET);
unsigned size = __builtin_preserve_field_info(arg->b2, FIELD_BYTE_SIZE);
#ifdef USE_PROBE_READ
bpf_probe_read(&ull, size, (const void *)arg + offset);
unsigned lshift = __builtin_preserve_field_info(arg->b2, FIELD_LSHIFT_U64);
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
lshift = lshift + (size << 3) - 64;
#endif
#else
switch(size) {
case 1:
ull = *(unsigned char *)((void *)arg + offset); break;
case 2:
ull = *(unsigned short *)((void *)arg + offset); break;
case 4:
ull = *(unsigned int *)((void *)arg + offset); break;
case 8:
ull = *(unsigned long long *)((void *)arg + offset); break;
}
unsigned lshift = __builtin_preserve_field_info(arg->b2, FIELD_LSHIFT_U64);
#endif
ull <<= lshift;
if (__builtin_preserve_field_info(arg->b2, FIELD_SIGNEDNESS))
return (long long)ull >> __builtin_preserve_field_info(arg->b2, FIELD_RSHIFT_U64);
return ull >> __builtin_preserve_field_info(arg->b2, FIELD_RSHIFT_U64);
}
There is a minor overhead for bpf_probe_read() on big endian.
The code and relocation generated for field_read where bpf_probe_read() is
used to access argument data on little endian mode:
r3 = r1
r1 = 0
r1 = 4 <=== relocation (FIELD_BYTE_OFFSET)
r3 += r1
r1 = r10
r1 += -8
r2 = 4 <=== relocation (FIELD_BYTE_SIZE)
call bpf_probe_read
r2 = 51 <=== relocation (FIELD_LSHIFT_U64)
r1 = *(u64 *)(r10 - 8)
r1 <<= r2
r2 = 60 <=== relocation (FIELD_RSHIFT_U64)
r0 = r1
r0 >>= r2
r3 = 1 <=== relocation (FIELD_SIGNEDNESS)
if r3 == 0 goto LBB0_2
r1 s>>= r2
r0 = r1
LBB0_2:
exit
Compare to the above code between relocations FIELD_LSHIFT_U64 and
FIELD_LSHIFT_U64, the code with big endian mode has four more
instructions.
r1 = 41 <=== relocation (FIELD_LSHIFT_U64)
r6 += r1
r6 += -64
r6 <<= 32
r6 >>= 32
r1 = *(u64 *)(r10 - 8)
r1 <<= r6
r2 = 60 <=== relocation (FIELD_RSHIFT_U64)
The code and relocation generated when using direct load.
r2 = 0
r3 = 4
r4 = 4
if r4 s> 3 goto LBB0_3
if r4 == 1 goto LBB0_5
if r4 == 2 goto LBB0_6
goto LBB0_9
LBB0_6: # %sw.bb1
r1 += r3
r2 = *(u16 *)(r1 + 0)
goto LBB0_9
LBB0_3: # %entry
if r4 == 4 goto LBB0_7
if r4 == 8 goto LBB0_8
goto LBB0_9
LBB0_8: # %sw.bb9
r1 += r3
r2 = *(u64 *)(r1 + 0)
goto LBB0_9
LBB0_5: # %sw.bb
r1 += r3
r2 = *(u8 *)(r1 + 0)
goto LBB0_9
LBB0_7: # %sw.bb5
r1 += r3
r2 = *(u32 *)(r1 + 0)
LBB0_9: # %sw.epilog
r1 = 51
r2 <<= r1
r1 = 60
r0 = r2
r0 >>= r1
r3 = 1
if r3 == 0 goto LBB0_11
r2 s>>= r1
r0 = r2
LBB0_11: # %sw.epilog
exit
Considering verifier is able to do limited constant
propogation following branches. The following is the
code actually traversed.
r2 = 0
r3 = 4 <=== relocation
r4 = 4 <=== relocation
if r4 s> 3 goto LBB0_3
LBB0_3: # %entry
if r4 == 4 goto LBB0_7
LBB0_7: # %sw.bb5
r1 += r3
r2 = *(u32 *)(r1 + 0)
LBB0_9: # %sw.epilog
r1 = 51 <=== relocation
r2 <<= r1
r1 = 60 <=== relocation
r0 = r2
r0 >>= r1
r3 = 1
if r3 == 0 goto LBB0_11
r2 s>>= r1
r0 = r2
LBB0_11: # %sw.epilog
exit
For native load case, the load size is calculated to be the
same as the size of load width LLVM otherwise used to load
the value which is then used to extract the bitfield value.
Differential Revision: https://reviews.llvm.org/D67980
llvm-svn: 374099
2019-10-09 02:23:17 +08:00
; CHECK: .long 16 # FieldReloc
; CHECK-NEXT: .long 20 # Field reloc section string offset=20
2019-09-18 11:49:07 +08:00
; CHECK-NEXT: .long 1
; CHECK-NEXT: .long .Ltmp{{[0-9]+}}
; CHECK-NEXT: .long 2
; CHECK-NEXT: .long 26
[BPF] do compile-once run-everywhere relocation for bitfields
A bpf specific clang intrinsic is introduced:
u32 __builtin_preserve_field_info(member_access, info_kind)
Depending on info_kind, different information will
be returned to the program. A relocation is also
recorded for this builtin so that bpf loader can
patch the instruction on the target host.
This clang intrinsic is used to get certain information
to facilitate struct/union member relocations.
The offset relocation is extended by 4 bytes to
include relocation kind.
Currently supported relocation kinds are
enum {
FIELD_BYTE_OFFSET = 0,
FIELD_BYTE_SIZE,
FIELD_EXISTENCE,
FIELD_SIGNEDNESS,
FIELD_LSHIFT_U64,
FIELD_RSHIFT_U64,
};
for __builtin_preserve_field_info. The old
access offset relocation is covered by
FIELD_BYTE_OFFSET = 0.
An example:
struct s {
int a;
int b1:9;
int b2:4;
};
enum {
FIELD_BYTE_OFFSET = 0,
FIELD_BYTE_SIZE,
FIELD_EXISTENCE,
FIELD_SIGNEDNESS,
FIELD_LSHIFT_U64,
FIELD_RSHIFT_U64,
};
void bpf_probe_read(void *, unsigned, const void *);
int field_read(struct s *arg) {
unsigned long long ull = 0;
unsigned offset = __builtin_preserve_field_info(arg->b2, FIELD_BYTE_OFFSET);
unsigned size = __builtin_preserve_field_info(arg->b2, FIELD_BYTE_SIZE);
#ifdef USE_PROBE_READ
bpf_probe_read(&ull, size, (const void *)arg + offset);
unsigned lshift = __builtin_preserve_field_info(arg->b2, FIELD_LSHIFT_U64);
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
lshift = lshift + (size << 3) - 64;
#endif
#else
switch(size) {
case 1:
ull = *(unsigned char *)((void *)arg + offset); break;
case 2:
ull = *(unsigned short *)((void *)arg + offset); break;
case 4:
ull = *(unsigned int *)((void *)arg + offset); break;
case 8:
ull = *(unsigned long long *)((void *)arg + offset); break;
}
unsigned lshift = __builtin_preserve_field_info(arg->b2, FIELD_LSHIFT_U64);
#endif
ull <<= lshift;
if (__builtin_preserve_field_info(arg->b2, FIELD_SIGNEDNESS))
return (long long)ull >> __builtin_preserve_field_info(arg->b2, FIELD_RSHIFT_U64);
return ull >> __builtin_preserve_field_info(arg->b2, FIELD_RSHIFT_U64);
}
There is a minor overhead for bpf_probe_read() on big endian.
The code and relocation generated for field_read where bpf_probe_read() is
used to access argument data on little endian mode:
r3 = r1
r1 = 0
r1 = 4 <=== relocation (FIELD_BYTE_OFFSET)
r3 += r1
r1 = r10
r1 += -8
r2 = 4 <=== relocation (FIELD_BYTE_SIZE)
call bpf_probe_read
r2 = 51 <=== relocation (FIELD_LSHIFT_U64)
r1 = *(u64 *)(r10 - 8)
r1 <<= r2
r2 = 60 <=== relocation (FIELD_RSHIFT_U64)
r0 = r1
r0 >>= r2
r3 = 1 <=== relocation (FIELD_SIGNEDNESS)
if r3 == 0 goto LBB0_2
r1 s>>= r2
r0 = r1
LBB0_2:
exit
Compare to the above code between relocations FIELD_LSHIFT_U64 and
FIELD_LSHIFT_U64, the code with big endian mode has four more
instructions.
r1 = 41 <=== relocation (FIELD_LSHIFT_U64)
r6 += r1
r6 += -64
r6 <<= 32
r6 >>= 32
r1 = *(u64 *)(r10 - 8)
r1 <<= r6
r2 = 60 <=== relocation (FIELD_RSHIFT_U64)
The code and relocation generated when using direct load.
r2 = 0
r3 = 4
r4 = 4
if r4 s> 3 goto LBB0_3
if r4 == 1 goto LBB0_5
if r4 == 2 goto LBB0_6
goto LBB0_9
LBB0_6: # %sw.bb1
r1 += r3
r2 = *(u16 *)(r1 + 0)
goto LBB0_9
LBB0_3: # %entry
if r4 == 4 goto LBB0_7
if r4 == 8 goto LBB0_8
goto LBB0_9
LBB0_8: # %sw.bb9
r1 += r3
r2 = *(u64 *)(r1 + 0)
goto LBB0_9
LBB0_5: # %sw.bb
r1 += r3
r2 = *(u8 *)(r1 + 0)
goto LBB0_9
LBB0_7: # %sw.bb5
r1 += r3
r2 = *(u32 *)(r1 + 0)
LBB0_9: # %sw.epilog
r1 = 51
r2 <<= r1
r1 = 60
r0 = r2
r0 >>= r1
r3 = 1
if r3 == 0 goto LBB0_11
r2 s>>= r1
r0 = r2
LBB0_11: # %sw.epilog
exit
Considering verifier is able to do limited constant
propogation following branches. The following is the
code actually traversed.
r2 = 0
r3 = 4 <=== relocation
r4 = 4 <=== relocation
if r4 s> 3 goto LBB0_3
LBB0_3: # %entry
if r4 == 4 goto LBB0_7
LBB0_7: # %sw.bb5
r1 += r3
r2 = *(u32 *)(r1 + 0)
LBB0_9: # %sw.epilog
r1 = 51 <=== relocation
r2 <<= r1
r1 = 60 <=== relocation
r0 = r2
r0 >>= r1
r3 = 1
if r3 == 0 goto LBB0_11
r2 s>>= r1
r0 = r2
LBB0_11: # %sw.epilog
exit
For native load case, the load size is calculated to be the
same as the size of load width LLVM otherwise used to load
the value which is then used to extract the bitfield value.
Differential Revision: https://reviews.llvm.org/D67980
llvm-svn: 374099
2019-10-09 02:23:17 +08:00
; CHECK-NEXT: .long 0
2019-09-18 11:49:07 +08:00
; Function Attrs: nounwind readnone
declare i32 * @llvm.preserve.struct.access.index.p0i32.p0s_struct.ss ( %struct.s * , i32 , i32 ) #1
; Function Attrs: nounwind readnone speculatable willreturn
declare void @llvm.dbg.value ( metadata , metadata , metadata ) #2
attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math" = "false" "disable-tail-calls" = "false" "frame-pointer" = "all" "less-precise-fpmad" = "false" "min-legal-vector-width" = "0" "no-infs-fp-math" = "false" "no-jump-tables" = "false" "no-nans-fp-math" = "false" "no-signed-zeros-fp-math" = "false" "no-trapping-math" = "false" "stack-protector-buffer-size" = "8" "unsafe-fp-math" = "false" "use-soft-float" = "false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind readnone s p e c u l a t a b l e w i l l r e t u r n }
!llvm.dbg.cu = ! { !0 }
!llvm.module.flags = ! { !7 , !8 , !9 }
!llvm.ident = ! { !10 }
!0 = distinct !DICompileUnit ( language: D W _ L A N G _ C 99 , file: !1 , producer: "clang version 10.0.0 (https://github.com/llvm/llvm-project.git 6e353b4df3aa452ed4741a5e5caea02b1a876d8c)" , isOptimized: true , runtimeVersion: 0 , emissionKind: F u l l D e b u g , enums: !2 , retainedTypes: !3 , nameTableKind: N one )
!1 = !DIFile ( filename: "test.c" , directory: "/tmp/home/yhs/work/tests/core" )
!2 = ! { }
!3 = ! { !4 }
!4 = !DIDerivedType ( tag: D W _ T A G _ p o i n t e r _ type , baseType: !5 , size: 64 )
!5 = !DIDerivedType ( tag: D W _ T A G _ c o n s t _ type , baseType: !6 )
!6 = !DIBasicType ( name: "int" , size: 32 , encoding: D W _ A T E _ s i g n e d )
!7 = ! { i32 2 , !"Dwarf Version" , i32 4 }
!8 = ! { i32 2 , !"Debug Info Version" , i32 3 }
!9 = ! { i32 1 , !"wchar_size" , i32 4 }
!10 = ! { !"clang version 10.0.0 (https://github.com/llvm/llvm-project.git 6e353b4df3aa452ed4741a5e5caea02b1a876d8c)" }
!11 = distinct !DISubprogram ( name: "test" , scope: !1 , file: !1 , line: 3 , type: !12 , scopeLine: 3 , flags: D I F l a g P r o t o t y p e d , isDefinition: true , isOptimized: true , unit: !0 , retainedNodes: !19 )
!12 = !DISubroutineType ( types: !13 )
!13 = ! { !6 , !14 }
!14 = !DIDerivedType ( tag: D W _ T A G _ p o i n t e r _ type , baseType: !15 , size: 64 )
!15 = distinct !DICompositeType ( tag: D W _ T A G _ s t r u c t u r e _ type , name: "s" , file: !1 , line: 2 , size: 64 , elements: !16 )
!16 = ! { !17 , !18 }
!17 = !DIDerivedType ( tag: D W _ T A G _ m e m b e r , name: "a" , scope: !15 , file: !1 , line: 2 , baseType: !6 , size: 32 )
!18 = !DIDerivedType ( tag: D W _ T A G _ m e m b e r , name: "b" , scope: !15 , file: !1 , line: 2 , baseType: !6 , size: 32 , offset: 32 )
!19 = ! { !20 }
!20 = !DILocalVariable ( name: "arg" , arg: 1 , scope: !11 , file: !1 , line: 3 , type: !14 )
!21 = !DILocation ( line: 0 , scope: !11 )
!22 = !DILocation ( line: 3 , column: 48 , scope: !11 )
!23 = !DILocation ( line: 3 , column: 34 , scope: !11 )
!24 = ! { !25 , !25 , i64 0 }
!25 = ! { !"int" , !26 , i64 0 }
!26 = ! { !"omnipotent char" , !27 , i64 0 }
!27 = ! { !"Simple C/C++ TBAA" }
!28 = !DILocation ( line: 3 , column: 27 , scope: !11 )