Merge branch 'BPF_and_RT'
Thomas Gleixner says:

====================
This is the third version of the BPF/RT patch set which makes both coexist
nicely. The long explanation can be found in the cover letter of the V1
submission:

  https://lore.kernel.org/r/20200214133917.304937432@linutronix.de

V2 is here:

  https://lore.kernel.org/r/20200220204517.863202864@linutronix.de

The following changes vs. V2 have been made:

  - Rebased to bpf-next, adjusted to the lock changes in the hashmap code.

  - Split the preallocation enforcement patch for instrumentation type BPF
    programs into two pieces:

    1) Emit a one-time warning on !RT kernels when any instrumentation type
       BPF program uses run-time allocation. Emit also a corresponding
       warning in the verifier log. But allow the program to run for
       backward compatibility sake. After a grace period this should be
       enforced.

    2) On RT reject such programs because on RT the memory allocator cannot
       be called from truly atomic contexts.

  - Fixed the fallout from V2 as reported by Alexei and 0-day

  - Removed the redundant preempt_disable() from trace_call_bpf()

  - Removed the unused export of trace_call_bpf()
====================

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
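For orientation, the two helpers the series is built around (both added by the hunks below) boil down to the following condensed sketch. This restates code that already appears in the diff, it does not introduce anything new:

    /* Run a BPF program with migration disabled so it stays on one CPU.
     * On !RT kernels migrate_disable() maps to preempt_disable(), so the
     * behaviour is unchanged there; on RT the section stays preemptible.
     */
    static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog,
                                              const void *ctx)
    {
            u32 ret;

            migrate_disable();
            ret = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nopfunc);
            migrate_enable();
            return ret;
    }

    /* Recursion protection for instrumentation type programs; replaces the
     * open-coded preempt_disable()/__this_cpu_inc(bpf_prog_active) pairs.
     */
    static inline void bpf_disable_instrumentation(void)
    {
            migrate_disable();
            if (IS_ENABLED(CONFIG_PREEMPT_RT))
                    this_cpu_inc(bpf_prog_active);   /* preemption safe RMW on RT */
            else
                    __this_cpu_inc(bpf_prog_active); /* cheaper raw variant on !RT */
    }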
commit 80a836c250
@@ -885,7 +885,7 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
         struct bpf_prog *_prog; \
         struct bpf_prog_array *_array; \
         u32 _ret = 1; \
-        preempt_disable(); \
+        migrate_disable(); \
         rcu_read_lock(); \
         _array = rcu_dereference(array); \
         if (unlikely(check_non_null && !_array))\
@@ -898,7 +898,7 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
         } \
 _out: \
         rcu_read_unlock(); \
-        preempt_enable(); \
+        migrate_enable(); \
         _ret; \
         })

@@ -932,7 +932,7 @@ _out: \
         u32 ret; \
         u32 _ret = 1; \
         u32 _cn = 0; \
-        preempt_disable(); \
+        migrate_disable(); \
         rcu_read_lock(); \
         _array = rcu_dereference(array); \
         _item = &_array->items[0]; \
@@ -944,7 +944,7 @@ _out: \
                 _item++; \
         } \
         rcu_read_unlock(); \
-        preempt_enable(); \
+        migrate_enable(); \
         if (_ret) \
                 _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \
         else \
@@ -961,6 +961,36 @@ _out: \
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);

+/*
+ * Block execution of BPF programs attached to instrumentation (perf,
+ * kprobes, tracepoints) to prevent deadlocks on map operations as any of
+ * these events can happen inside a region which holds a map bucket lock
+ * and can deadlock on it.
+ *
+ * Use the preemption safe inc/dec variants on RT because migrate disable
+ * is preemptible on RT and preemption in the middle of the RMW operation
+ * might lead to inconsistent state. Use the raw variants for non RT
+ * kernels as migrate_disable() maps to preempt_disable() so the slightly
+ * more expensive save operation can be avoided.
+ */
+static inline void bpf_disable_instrumentation(void)
+{
+        migrate_disable();
+        if (IS_ENABLED(CONFIG_PREEMPT_RT))
+                this_cpu_inc(bpf_prog_active);
+        else
+                __this_cpu_inc(bpf_prog_active);
+}
+
+static inline void bpf_enable_instrumentation(void)
+{
+        if (IS_ENABLED(CONFIG_PREEMPT_RT))
+                this_cpu_dec(bpf_prog_active);
+        else
+                __this_cpu_dec(bpf_prog_active);
+        migrate_enable();
+}
+
 extern const struct file_operations bpf_map_fops;
 extern const struct file_operations bpf_prog_fops;

@@ -561,7 +561,7 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);

 #define __BPF_PROG_RUN(prog, ctx, dfunc) ({ \
         u32 ret; \
-        cant_sleep(); \
+        cant_migrate(); \
         if (static_branch_unlikely(&bpf_stats_enabled_key)) { \
                 struct bpf_prog_stats *stats; \
                 u64 start = sched_clock(); \
@@ -576,8 +576,30 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
         } \
         ret; })

-#define BPF_PROG_RUN(prog, ctx) __BPF_PROG_RUN(prog, ctx, \
-                                               bpf_dispatcher_nopfunc)
+#define BPF_PROG_RUN(prog, ctx) \
+        __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nopfunc)
+
+/*
+ * Use in preemptible and therefore migratable context to make sure that
+ * the execution of the BPF program runs on one CPU.
+ *
+ * This uses migrate_disable/enable() explicitly to document that the
+ * invocation of a BPF program does not require reentrancy protection
+ * against a BPF program which is invoked from a preempting task.
+ *
+ * For non RT enabled kernels migrate_disable/enable() maps to
+ * preempt_disable/enable(), i.e. it disables also preemption.
+ */
+static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog,
+                                          const void *ctx)
+{
+        u32 ret;
+
+        migrate_disable();
+        ret = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nopfunc);
+        migrate_enable();
+        return ret;
+}

 #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN

@@ -655,6 +677,7 @@ static inline u8 *bpf_skb_cb(struct sk_buff *skb)
         return qdisc_skb_cb(skb)->data;
 }

+/* Must be invoked with migration disabled */
 static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog,
                                          struct sk_buff *skb)
 {
@@ -680,9 +703,9 @@ static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
 {
         u32 res;

-        preempt_disable();
+        migrate_disable();
         res = __bpf_prog_run_save_cb(prog, skb);
-        preempt_enable();
+        migrate_enable();
         return res;
 }

@@ -695,9 +718,7 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
         if (unlikely(prog->cb_access))
                 memset(cb_data, 0, BPF_SKB_CB_LEN);

-        preempt_disable();
-        res = BPF_PROG_RUN(prog, skb);
-        preempt_enable();
+        res = bpf_prog_run_pin_on_cpu(prog, skb);
         return res;
 }

@@ -27,9 +27,62 @@
         .map_delete_batch = \
         generic_map_delete_batch

+/*
+ * The bucket lock has two protection scopes:
+ *
+ * 1) Serializing concurrent operations from BPF programs on differrent
+ *    CPUs
+ *
+ * 2) Serializing concurrent operations from BPF programs and sys_bpf()
+ *
+ * BPF programs can execute in any context including perf, kprobes and
+ * tracing. As there are almost no limits where perf, kprobes and tracing
+ * can be invoked from the lock operations need to be protected against
+ * deadlocks. Deadlocks can be caused by recursion and by an invocation in
+ * the lock held section when functions which acquire this lock are invoked
+ * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
+ * variable bpf_prog_active, which prevents BPF programs attached to perf
+ * events, kprobes and tracing to be invoked before the prior invocation
+ * from one of these contexts completed. sys_bpf() uses the same mechanism
+ * by pinning the task to the current CPU and incrementing the recursion
+ * protection accross the map operation.
+ *
+ * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
+ * operations like memory allocations (even with GFP_ATOMIC) from atomic
+ * contexts. This is required because even with GFP_ATOMIC the memory
+ * allocator calls into code pathes which acquire locks with long held lock
+ * sections. To ensure the deterministic behaviour these locks are regular
+ * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
+ * true atomic contexts on an RT kernel are the low level hardware
+ * handling, scheduling, low level interrupt handling, NMIs etc. None of
+ * these contexts should ever do memory allocations.
+ *
+ * As regular device interrupt handlers and soft interrupts are forced into
+ * thread context, the existing code which does
+ *   spin_lock*(); alloc(GPF_ATOMIC); spin_unlock*();
+ * just works.
+ *
+ * In theory the BPF locks could be converted to regular spinlocks as well,
+ * but the bucket locks and percpu_freelist locks can be taken from
+ * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
+ * atomic contexts even on RT. These mechanisms require preallocated maps,
+ * so there is no need to invoke memory allocations within the lock held
+ * sections.
+ *
+ * BPF maps which need dynamic allocation are only used from (forced)
+ * thread context on RT and can therefore use regular spinlocks which in
+ * turn allows to invoke memory allocations from the lock held section.
+ *
+ * On a non RT kernel this distinction is neither possible nor required.
+ * spinlock maps to raw_spinlock and the extra code is optimized out by the
+ * compiler.
+ */
 struct bucket {
         struct hlist_nulls_head head;
-        raw_spinlock_t lock;
+        union {
+                raw_spinlock_t raw_lock;
+                spinlock_t lock;
+        };
 };

 struct bpf_htab {
@@ -68,6 +121,51 @@ struct htab_elem {
         char key[0] __aligned(8);
 };

+static inline bool htab_is_prealloc(const struct bpf_htab *htab)
+{
+        return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
+}
+
+static inline bool htab_use_raw_lock(const struct bpf_htab *htab)
+{
+        return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab));
+}
+
+static void htab_init_buckets(struct bpf_htab *htab)
+{
+        unsigned i;
+
+        for (i = 0; i < htab->n_buckets; i++) {
+                INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
+                if (htab_use_raw_lock(htab))
+                        raw_spin_lock_init(&htab->buckets[i].raw_lock);
+                else
+                        spin_lock_init(&htab->buckets[i].lock);
+        }
+}
+
+static inline unsigned long htab_lock_bucket(const struct bpf_htab *htab,
+                                             struct bucket *b)
+{
+        unsigned long flags;
+
+        if (htab_use_raw_lock(htab))
+                raw_spin_lock_irqsave(&b->raw_lock, flags);
+        else
+                spin_lock_irqsave(&b->lock, flags);
+        return flags;
+}
+
+static inline void htab_unlock_bucket(const struct bpf_htab *htab,
+                                      struct bucket *b,
+                                      unsigned long flags)
+{
+        if (htab_use_raw_lock(htab))
+                raw_spin_unlock_irqrestore(&b->raw_lock, flags);
+        else
+                spin_unlock_irqrestore(&b->lock, flags);
+}
+
 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);

 static bool htab_is_lru(const struct bpf_htab *htab)
@@ -82,11 +180,6 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
                 htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
 }

-static bool htab_is_prealloc(const struct bpf_htab *htab)
-{
-        return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
-}
-
 static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
                                      void __percpu *pptr)
 {
@@ -328,8 +421,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
         bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
         bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
         struct bpf_htab *htab;
-        int err, i;
         u64 cost;
+        int err;

         htab = kzalloc(sizeof(*htab), GFP_USER);
         if (!htab)
@@ -391,10 +484,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
         else
                 htab->hashrnd = get_random_int();

-        for (i = 0; i < htab->n_buckets; i++) {
-                INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
-                raw_spin_lock_init(&htab->buckets[i].lock);
-        }
+        htab_init_buckets(htab);

         if (prealloc) {
                 err = prealloc_init(htab);
@@ -602,7 +692,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
         b = __select_bucket(htab, tgt_l->hash);
         head = &b->head;

-        raw_spin_lock_irqsave(&b->lock, flags);
+        flags = htab_lock_bucket(htab, b);

         hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
                 if (l == tgt_l) {
@@ -610,7 +700,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
                         break;
                 }

-        raw_spin_unlock_irqrestore(&b->lock, flags);
+        htab_unlock_bucket(htab, b, flags);

         return l == tgt_l;
 }
@@ -686,15 +776,7 @@ static void htab_elem_free_rcu(struct rcu_head *head)
         struct htab_elem *l = container_of(head, struct htab_elem, rcu);
         struct bpf_htab *htab = l->htab;

-        /* must increment bpf_prog_active to avoid kprobe+bpf triggering while
-         * we're calling kfree, otherwise deadlock is possible if kprobes
-         * are placed somewhere inside of slub
-         */
-        preempt_disable();
-        __this_cpu_inc(bpf_prog_active);
         htab_elem_free(htab, l);
-        __this_cpu_dec(bpf_prog_active);
-        preempt_enable();
 }

 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
@@ -884,8 +966,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
                  */
         }

-        /* bpf_map_update_elem() can be called in_irq() */
-        raw_spin_lock_irqsave(&b->lock, flags);
+        flags = htab_lock_bucket(htab, b);

         l_old = lookup_elem_raw(head, hash, key, key_size);

@@ -926,7 +1007,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
         }
         ret = 0;
 err:
-        raw_spin_unlock_irqrestore(&b->lock, flags);
+        htab_unlock_bucket(htab, b, flags);
         return ret;
 }

@@ -964,8 +1045,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
                 return -ENOMEM;
         memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);

-        /* bpf_map_update_elem() can be called in_irq() */
-        raw_spin_lock_irqsave(&b->lock, flags);
+        flags = htab_lock_bucket(htab, b);

         l_old = lookup_elem_raw(head, hash, key, key_size);

@@ -984,7 +1064,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
         ret = 0;

 err:
-        raw_spin_unlock_irqrestore(&b->lock, flags);
+        htab_unlock_bucket(htab, b, flags);

         if (ret)
                 bpf_lru_push_free(&htab->lru, &l_new->lru_node);
@@ -1019,8 +1099,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
         b = __select_bucket(htab, hash);
         head = &b->head;

-        /* bpf_map_update_elem() can be called in_irq() */
-        raw_spin_lock_irqsave(&b->lock, flags);
+        flags = htab_lock_bucket(htab, b);

         l_old = lookup_elem_raw(head, hash, key, key_size);

@@ -1043,7 +1122,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
         }
         ret = 0;
 err:
-        raw_spin_unlock_irqrestore(&b->lock, flags);
+        htab_unlock_bucket(htab, b, flags);
         return ret;
 }

@@ -1083,8 +1162,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
                 return -ENOMEM;
         }

-        /* bpf_map_update_elem() can be called in_irq() */
-        raw_spin_lock_irqsave(&b->lock, flags);
+        flags = htab_lock_bucket(htab, b);

         l_old = lookup_elem_raw(head, hash, key, key_size);

@@ -1106,7 +1184,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
         }
         ret = 0;
 err:
-        raw_spin_unlock_irqrestore(&b->lock, flags);
+        htab_unlock_bucket(htab, b, flags);
         if (l_new)
                 bpf_lru_push_free(&htab->lru, &l_new->lru_node);
         return ret;
@@ -1144,7 +1222,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
         b = __select_bucket(htab, hash);
         head = &b->head;

-        raw_spin_lock_irqsave(&b->lock, flags);
+        flags = htab_lock_bucket(htab, b);

         l = lookup_elem_raw(head, hash, key, key_size);

@@ -1154,7 +1232,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
                 ret = 0;
         }

-        raw_spin_unlock_irqrestore(&b->lock, flags);
+        htab_unlock_bucket(htab, b, flags);
         return ret;
 }

@@ -1176,7 +1254,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
         b = __select_bucket(htab, hash);
         head = &b->head;

-        raw_spin_lock_irqsave(&b->lock, flags);
+        flags = htab_lock_bucket(htab, b);

         l = lookup_elem_raw(head, hash, key, key_size);

@@ -1185,7 +1263,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
                 ret = 0;
         }

-        raw_spin_unlock_irqrestore(&b->lock, flags);
+        htab_unlock_bucket(htab, b, flags);
         if (l)
                 bpf_lru_push_free(&htab->lru, &l->lru_node);
         return ret;
@@ -1325,8 +1403,7 @@ alloc:
         }

 again:
-        preempt_disable();
-        this_cpu_inc(bpf_prog_active);
+        bpf_disable_instrumentation();
         rcu_read_lock();
 again_nocopy:
         dst_key = keys;
@@ -1335,7 +1412,7 @@ again_nocopy:
         head = &b->head;
         /* do not grab the lock unless need it (bucket_cnt > 0). */
         if (locked)
-                raw_spin_lock_irqsave(&b->lock, flags);
+                flags = htab_lock_bucket(htab, b);

         bucket_cnt = 0;
         hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
@@ -1352,10 +1429,9 @@ again_nocopy:
                 /* Note that since bucket_cnt > 0 here, it is implicit
                  * that the locked was grabbed, so release it.
                  */
-                raw_spin_unlock_irqrestore(&b->lock, flags);
+                htab_unlock_bucket(htab, b, flags);
                 rcu_read_unlock();
-                this_cpu_dec(bpf_prog_active);
-                preempt_enable();
+                bpf_enable_instrumentation();
                 goto after_loop;
         }

@@ -1364,10 +1440,9 @@ again_nocopy:
                 /* Note that since bucket_cnt > 0 here, it is implicit
                  * that the locked was grabbed, so release it.
                  */
-                raw_spin_unlock_irqrestore(&b->lock, flags);
+                htab_unlock_bucket(htab, b, flags);
                 rcu_read_unlock();
-                this_cpu_dec(bpf_prog_active);
-                preempt_enable();
+                bpf_enable_instrumentation();
                 kvfree(keys);
                 kvfree(values);
                 goto alloc;
@@ -1418,7 +1493,7 @@ again_nocopy:
                 dst_val += value_size;
         }

-        raw_spin_unlock_irqrestore(&b->lock, flags);
+        htab_unlock_bucket(htab, b, flags);
         locked = false;

         while (node_to_free) {
@@ -1437,8 +1512,7 @@ next_batch:
         }

         rcu_read_unlock();
-        this_cpu_dec(bpf_prog_active);
-        preempt_enable();
+        bpf_enable_instrumentation();
         if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
                            key_size * bucket_cnt) ||
                            copy_to_user(uvalues + total * value_size, values,
@@ -34,7 +34,7 @@ struct lpm_trie {
         size_t n_entries;
         size_t max_prefixlen;
         size_t data_size;
-        raw_spinlock_t lock;
+        spinlock_t lock;
 };

 /* This trie implements a longest prefix match algorithm that can be used to
@@ -315,7 +315,7 @@ static int trie_update_elem(struct bpf_map *map,
         if (key->prefixlen > trie->max_prefixlen)
                 return -EINVAL;

-        raw_spin_lock_irqsave(&trie->lock, irq_flags);
+        spin_lock_irqsave(&trie->lock, irq_flags);

         /* Allocate and fill a new node */

@@ -422,7 +422,7 @@ out:
                 kfree(im_node);
         }

-        raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+        spin_unlock_irqrestore(&trie->lock, irq_flags);

         return ret;
 }
@@ -442,7 +442,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
         if (key->prefixlen > trie->max_prefixlen)
                 return -EINVAL;

-        raw_spin_lock_irqsave(&trie->lock, irq_flags);
+        spin_lock_irqsave(&trie->lock, irq_flags);

         /* Walk the tree looking for an exact key/length match and keeping
          * track of the path we traverse. We will need to know the node
@@ -518,7 +518,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
         kfree_rcu(node, rcu);

 out:
-        raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+        spin_unlock_irqrestore(&trie->lock, irq_flags);

         return ret;
 }
@@ -575,7 +575,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
         if (ret)
                 goto out_err;

-        raw_spin_lock_init(&trie->lock);
+        spin_lock_init(&trie->lock);

         return &trie->map;
 out_err:
@@ -25,12 +25,18 @@ void pcpu_freelist_destroy(struct pcpu_freelist *s)
         free_percpu(s->freelist);
 }

+static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head,
+                                           struct pcpu_freelist_node *node)
+{
+        node->next = head->first;
+        head->first = node;
+}
+
 static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
                                          struct pcpu_freelist_node *node)
 {
         raw_spin_lock(&head->lock);
-        node->next = head->first;
-        head->first = node;
+        pcpu_freelist_push_node(head, node);
         raw_spin_unlock(&head->lock);
 }

@@ -56,21 +62,16 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
                             u32 nr_elems)
 {
         struct pcpu_freelist_head *head;
-        unsigned long flags;
         int i, cpu, pcpu_entries;

         pcpu_entries = nr_elems / num_possible_cpus() + 1;
         i = 0;

-        /* disable irq to workaround lockdep false positive
-         * in bpf usage pcpu_freelist_populate() will never race
-         * with pcpu_freelist_push()
-         */
-        local_irq_save(flags);
         for_each_possible_cpu(cpu) {
 again:
                 head = per_cpu_ptr(s->freelist, cpu);
-                ___pcpu_freelist_push(head, buf);
+                /* No locking required as this is not visible yet. */
+                pcpu_freelist_push_node(head, buf);
                 i++;
                 buf += elem_size;
                 if (i == nr_elems)
@@ -78,7 +79,6 @@ again:
                 if (i % pcpu_entries)
                         goto again;
         }
-        local_irq_restore(flags);
 }

 struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
@@ -40,6 +40,9 @@ static void do_up_read(struct irq_work *entry)
 {
         struct stack_map_irq_work *work;

+        if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
+                return;
+
         work = container_of(entry, struct stack_map_irq_work, irq_work);
         up_read_non_owner(work->sem);
         work->sem = NULL;
@@ -288,10 +291,19 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
         struct stack_map_irq_work *work = NULL;

         if (irqs_disabled()) {
-                work = this_cpu_ptr(&up_read_work);
-                if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY)
-                        /* cannot queue more up_read, fallback */
+                if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+                        work = this_cpu_ptr(&up_read_work);
+                        if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) {
+                                /* cannot queue more up_read, fallback */
+                                irq_work_busy = true;
+                        }
+                } else {
+                        /*
+                         * PREEMPT_RT does not allow to trylock mmap sem in
+                         * interrupt disabled context. Force the fallback code.
+                         */
                         irq_work_busy = true;
+                }
         }

         /*
@@ -171,11 +171,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
                                                     flags);
         }

-        /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
-         * inside bpf map update or delete otherwise deadlocks are possible
-         */
-        preempt_disable();
-        __this_cpu_inc(bpf_prog_active);
+        bpf_disable_instrumentation();
         if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
             map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
                 err = bpf_percpu_hash_update(map, key, value, flags);
@@ -206,8 +202,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
                 err = map->ops->map_update_elem(map, key, value, flags);
                 rcu_read_unlock();
         }
-        __this_cpu_dec(bpf_prog_active);
-        preempt_enable();
+        bpf_enable_instrumentation();
         maybe_wait_bpf_programs(map);

         return err;
@@ -222,8 +217,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
         if (bpf_map_is_dev_bound(map))
                 return bpf_map_offload_lookup_elem(map, key, value);

-        preempt_disable();
-        this_cpu_inc(bpf_prog_active);
+        bpf_disable_instrumentation();
         if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
             map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
                 err = bpf_percpu_hash_copy(map, key, value);
@@ -268,8 +262,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
                 rcu_read_unlock();
         }

-        this_cpu_dec(bpf_prog_active);
-        preempt_enable();
+        bpf_enable_instrumentation();
         maybe_wait_bpf_programs(map);

         return err;
@@ -1136,13 +1129,11 @@ static int map_delete_elem(union bpf_attr *attr)
                 goto out;
         }

-        preempt_disable();
-        __this_cpu_inc(bpf_prog_active);
+        bpf_disable_instrumentation();
         rcu_read_lock();
         err = map->ops->map_delete_elem(map, key);
         rcu_read_unlock();
-        __this_cpu_dec(bpf_prog_active);
-        preempt_enable();
+        bpf_enable_instrumentation();
         maybe_wait_bpf_programs(map);
 out:
         kfree(key);
@@ -1254,13 +1245,11 @@ int generic_map_delete_batch(struct bpf_map *map,
                         break;
                 }

-                preempt_disable();
-                __this_cpu_inc(bpf_prog_active);
+                bpf_disable_instrumentation();
                 rcu_read_lock();
                 err = map->ops->map_delete_elem(map, key);
                 rcu_read_unlock();
-                __this_cpu_dec(bpf_prog_active);
-                preempt_enable();
+                bpf_enable_instrumentation();
                 maybe_wait_bpf_programs(map);
                 if (err)
                         break;
@@ -367,8 +367,9 @@ out:
         mutex_unlock(&trampoline_mutex);
 }

-/* The logic is similar to BPF_PROG_RUN, but with explicit rcu and preempt that
- * are needed for trampoline. The macro is split into
+/* The logic is similar to BPF_PROG_RUN, but with an explicit
+ * rcu_read_lock() and migrate_disable() which are required
+ * for the trampoline. The macro is split into
  * call _bpf_prog_enter
  * call prog->bpf_func
  * call __bpf_prog_exit
@@ -378,7 +379,7 @@ u64 notrace __bpf_prog_enter(void)
         u64 start = 0;

         rcu_read_lock();
-        preempt_disable();
+        migrate_disable();
         if (static_branch_unlikely(&bpf_stats_enabled_key))
                 start = sched_clock();
         return start;
@@ -401,7 +402,7 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
                 stats->nsecs += sched_clock() - start;
                 u64_stats_update_end(&stats->syncp);
         }
-        preempt_enable();
+        migrate_enable();
         rcu_read_unlock();
 }

@@ -8143,26 +8143,48 @@ static bool is_tracing_prog_type(enum bpf_prog_type type)
         }
 }

+static bool is_preallocated_map(struct bpf_map *map)
+{
+        if (!check_map_prealloc(map))
+                return false;
+        if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta))
+                return false;
+        return true;
+}
+
 static int check_map_prog_compatibility(struct bpf_verifier_env *env,
                                         struct bpf_map *map,
                                         struct bpf_prog *prog)

 {
-        /* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use
-         * preallocated hash maps, since doing memory allocation
-         * in overflow_handler can crash depending on where nmi got
-         * triggered.
+        /*
+         * Validate that trace type programs use preallocated hash maps.
+         *
+         * For programs attached to PERF events this is mandatory as the
+         * perf NMI can hit any arbitrary code sequence.
+         *
+         * All other trace types using preallocated hash maps are unsafe as
+         * well because tracepoint or kprobes can be inside locked regions
+         * of the memory allocator or at a place where a recursion into the
+         * memory allocator would see inconsistent state.
+         *
+         * On RT enabled kernels run-time allocation of all trace type
+         * programs is strictly prohibited due to lock type constraints. On
+         * !RT kernels it is allowed for backwards compatibility reasons for
+         * now, but warnings are emitted so developers are made aware of
+         * the unsafety and can fix their programs before this is enforced.
          */
-        if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
-                if (!check_map_prealloc(map)) {
+        if (is_tracing_prog_type(prog->type) && !is_preallocated_map(map)) {
+                if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
                         verbose(env, "perf_event programs can only use preallocated hash map\n");
                         return -EINVAL;
                 }
-                if (map->inner_map_meta &&
-                    !check_map_prealloc(map->inner_map_meta)) {
-                        verbose(env, "perf_event programs can only use preallocated inner hash map\n");
+                if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+                        verbose(env, "trace type programs can only use preallocated hash map\n");
                         return -EINVAL;
                 }
+                WARN_ONCE(1, "trace type BPF program uses run-time allocation\n");
+                verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n");
         }

         if ((is_tracing_prog_type(prog->type) ||
@@ -9206,7 +9206,6 @@ static void bpf_overflow_handler(struct perf_event *event,
         int ret = 0;

         ctx.regs = perf_arch_bpf_user_pt_regs(regs);
-        preempt_disable();
         if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
                 goto out;
         rcu_read_lock();
@@ -9214,7 +9213,6 @@ static void bpf_overflow_handler(struct perf_event *event,
         rcu_read_unlock();
 out:
         __this_cpu_dec(bpf_prog_active);
-        preempt_enable();
         if (!ret)
                 return;

@@ -268,16 +268,14 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
          * All filters in the list are evaluated and the lowest BPF return
          * value always takes priority (ignoring the DATA).
          */
-        preempt_disable();
         for (; f; f = f->prev) {
-                u32 cur_ret = BPF_PROG_RUN(f->prog, sd);
+                u32 cur_ret = bpf_prog_run_pin_on_cpu(f->prog, sd);

                 if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
                         ret = cur_ret;
                         *match = f;
                 }
         }
-        preempt_enable();
         return ret;
 }
 #endif /* CONFIG_SECCOMP_FILTER */
@@ -83,7 +83,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
         if (in_nmi()) /* not supported yet */
                 return 1;

-        preempt_disable();
+        cant_sleep();

         if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
                 /*
@@ -115,11 +115,9 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)

 out:
         __this_cpu_dec(bpf_prog_active);
-        preempt_enable();

         return ret;
 }
-EXPORT_SYMBOL_GPL(trace_call_bpf);

 #ifdef CONFIG_BPF_KPROBE_OVERRIDE
 BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
@@ -1516,10 +1514,9 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
 static __always_inline
 void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
 {
+        cant_sleep();
         rcu_read_lock();
-        preempt_disable();
         (void) BPF_PROG_RUN(prog, args);
-        preempt_enable();
         rcu_read_unlock();
 }

@@ -1333,8 +1333,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
         int size, esize;
         int rctx;

-        if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
-                return;
+        if (bpf_prog_array_valid(call)) {
+                u32 ret;
+
+                preempt_disable();
+                ret = trace_call_bpf(call, regs);
+                preempt_enable();
+                if (!ret)
+                        return;
+        }

         esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));

@@ -6660,14 +6660,14 @@ static int __run_one(const struct bpf_prog *fp, const void *data,
         u64 start, finish;
         int ret = 0, i;

-        preempt_disable();
+        migrate_disable();
         start = ktime_get_ns();

         for (i = 0; i < runs; i++)
                 ret = BPF_PROG_RUN(fp, data);

         finish = ktime_get_ns();
-        preempt_enable();
+        migrate_enable();

         *duration = finish - start;
         do_div(*duration, runs);
@@ -37,7 +37,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
                 repeat = 1;

         rcu_read_lock();
-        preempt_disable();
+        migrate_disable();
         time_start = ktime_get_ns();
         for (i = 0; i < repeat; i++) {
                 bpf_cgroup_storage_set(storage);
@@ -54,18 +54,18 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,

                 if (need_resched()) {
                         time_spent += ktime_get_ns() - time_start;
-                        preempt_enable();
+                        migrate_enable();
                         rcu_read_unlock();

                         cond_resched();

                         rcu_read_lock();
-                        preempt_disable();
+                        migrate_disable();
                         time_start = ktime_get_ns();
                 }
         }
         time_spent += ktime_get_ns() - time_start;
-        preempt_enable();
+        migrate_enable();
         rcu_read_unlock();

         do_div(time_spent, repeat);
@@ -920,9 +920,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
                                            (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP);
         flow_keys->flags = flags;

-        preempt_disable();
-        result = BPF_PROG_RUN(prog, ctx);
-        preempt_enable();
+        result = bpf_prog_run_pin_on_cpu(prog, ctx);

         flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen);
         flow_keys->thoff = clamp_t(u16, flow_keys->thoff,
@@ -628,7 +628,6 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
         struct bpf_prog *prog;
         int ret;

-        preempt_disable();
         rcu_read_lock();
         prog = READ_ONCE(psock->progs.msg_parser);
         if (unlikely(!prog)) {
@@ -638,7 +637,7 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,

         sk_msg_compute_data_pointers(msg);
         msg->sk = sk;
-        ret = BPF_PROG_RUN(prog, msg);
+        ret = bpf_prog_run_pin_on_cpu(prog, msg);
         ret = sk_psock_map_verd(ret, msg->sk_redir);
         psock->apply_bytes = msg->apply_bytes;
         if (ret == __SK_REDIRECT) {
@@ -653,7 +652,6 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
         }
 out:
         rcu_read_unlock();
-        preempt_enable();
         return ret;
 }
 EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
@@ -665,9 +663,7 @@ static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,

         skb->sk = psock->sk;
         bpf_compute_data_end_sk_skb(skb);
-        preempt_disable();
-        ret = BPF_PROG_RUN(prog, skb);
-        preempt_enable();
+        ret = bpf_prog_run_pin_on_cpu(prog, skb);
         /* strparser clones the skb before handing it to a upper layer,
          * meaning skb_orphan has been called. We NULL sk on the way out
          * to ensure we don't trigger a BUG_ON() in skb/sk operations
@@ -380,9 +380,7 @@ static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb)
         struct bpf_prog *prog = psock->bpf_prog;
         int res;

-        preempt_disable();
-        res = BPF_PROG_RUN(prog, skb);
-        preempt_enable();
+        res = bpf_prog_run_pin_on_cpu(prog, skb);
         return res;
 }

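For quick reference, the caller-side conversion applied repeatedly in the seccomp, flow dissector, skmsg, kcm and test-run hunks above is, condensed from the diff:

    /* Before: correctness relied on disabling preemption around the run. */
    preempt_disable();
    ret = BPF_PROG_RUN(prog, ctx);
    preempt_enable();

    /* After: only migration is disabled; the section stays preemptible on
     * PREEMPT_RT and behaves as before on !RT kernels, where
     * migrate_disable() maps to preempt_disable().
     */
    ret = bpf_prog_run_pin_on_cpu(prog, ctx);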