bpf: introduce BPF syscall and maps
BPF syscall is a multiplexor for a range of different operations on eBPF. This patch introduces syscall with single command to create a map. Next patch adds commands to access maps. 'maps' is a generic storage of different types for sharing data between kernel and userspace. Userspace example: /* this syscall wrapper creates a map with given type and attributes * and returns map_fd on success. * use close(map_fd) to delete the map */ int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, int max_entries) { union bpf_attr attr = { .map_type = map_type, .key_size = key_size, .value_size = value_size, .max_entries = max_entries }; return bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } 'union bpf_attr' is backwards compatible with future extensions. More details in Documentation/networking/filter.txt and in manpage Signed-off-by: Alexei Starovoitov <ast@plumgrid.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
4a8e320c92
commit
99c55f7d47
|
@ -1001,6 +1001,45 @@ instruction that loads 64-bit immediate value into a dst_reg.
|
||||||
Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads
|
Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads
|
||||||
32-bit immediate value into a register.
|
32-bit immediate value into a register.
|
||||||
|
|
||||||
|
eBPF maps
|
||||||
|
---------
|
||||||
|
'maps' is a generic storage of different types for sharing data between kernel
|
||||||
|
and userspace.
|
||||||
|
|
||||||
|
The maps are accessed from user space via BPF syscall, which has commands:
|
||||||
|
- create a map with given type and attributes
|
||||||
|
map_fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
|
||||||
|
using attr->map_type, attr->key_size, attr->value_size, attr->max_entries
|
||||||
|
returns process-local file descriptor or negative error
|
||||||
|
|
||||||
|
- lookup key in a given map
|
||||||
|
err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
|
||||||
|
using attr->map_fd, attr->key, attr->value
|
||||||
|
returns zero and stores found elem into value or negative error
|
||||||
|
|
||||||
|
- create or update key/value pair in a given map
|
||||||
|
err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
|
||||||
|
using attr->map_fd, attr->key, attr->value
|
||||||
|
returns zero or negative error
|
||||||
|
|
||||||
|
- find and delete element by key in a given map
|
||||||
|
err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
|
||||||
|
using attr->map_fd, attr->key
|
||||||
|
|
||||||
|
- to delete map: close(fd)
|
||||||
|
Exiting process will delete maps automatically
|
||||||
|
|
||||||
|
userspace programs use this syscall to create/access maps that eBPF programs
|
||||||
|
are concurrently updating.
|
||||||
|
|
||||||
|
maps can have different types: hash, array, bloom filter, radix-tree, etc.
|
||||||
|
|
||||||
|
The map is defined by:
|
||||||
|
. type
|
||||||
|
. max number of elements
|
||||||
|
. key size in bytes
|
||||||
|
. value size in bytes
|
||||||
|
|
||||||
Testing
|
Testing
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,41 @@
|
||||||
|
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of version 2 of the GNU General Public
|
||||||
|
* License as published by the Free Software Foundation.
|
||||||
|
*/
|
||||||
|
#ifndef _LINUX_BPF_H
|
||||||
|
#define _LINUX_BPF_H 1
|
||||||
|
|
||||||
|
#include <uapi/linux/bpf.h>
|
||||||
|
#include <linux/workqueue.h>
|
||||||
|
|
||||||
|
struct bpf_map;
|
||||||
|
|
||||||
|
/*
 * A map is generic key/value storage, optionally accessible by eBPF programs.
 *
 * struct bpf_map_ops is the per-map-type callback table:
 *   map_alloc - create a map from the attributes passed to the bpf() syscall;
 *               the syscall code checks the result with IS_ERR().
 *   map_free  - implementation-specific destruction of a map.
 */
struct bpf_map_ops {
	/* funcs callable from userspace (via syscall) */
	struct bpf_map *(*map_alloc)(union bpf_attr *attr);
	void (*map_free)(struct bpf_map *);
};
|
||||||
|
|
||||||
|
struct bpf_map {
|
||||||
|
atomic_t refcnt;
|
||||||
|
enum bpf_map_type map_type;
|
||||||
|
u32 key_size;
|
||||||
|
u32 value_size;
|
||||||
|
u32 max_entries;
|
||||||
|
struct bpf_map_ops *ops;
|
||||||
|
struct work_struct work;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct bpf_map_type_list {
|
||||||
|
struct list_head list_node;
|
||||||
|
struct bpf_map_ops *ops;
|
||||||
|
enum bpf_map_type type;
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Add a map implementation to the list searched when a map is created. */
void bpf_register_map_type(struct bpf_map_type_list *tl);

/* Drop one reference on @map; the last put schedules the map for freeing. */
void bpf_map_put(struct bpf_map *map);
|
||||||
|
|
||||||
|
#endif /* _LINUX_BPF_H */
|
|
@ -62,4 +62,27 @@ struct bpf_insn {
|
||||||
__s32 imm; /* signed immediate constant */
|
__s32 imm; /* signed immediate constant */
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* Commands understood by the bpf() syscall (its first argument). */
enum bpf_cmd {
	/*
	 * BPF_MAP_CREATE: create a map with the given type and attributes.
	 *
	 *   fd = bpf(BPF_MAP_CREATE, union bpf_attr *, u32 size)
	 *
	 * Returns the new fd, or a negative error.
	 * The map is deleted when the fd is closed.
	 */
	BPF_MAP_CREATE,
};
|
||||||
|
|
||||||
|
/* Map type selector carried in bpf_attr.map_type. */
enum bpf_map_type {
	BPF_MAP_TYPE_UNSPEC,	/* only value defined so far */
};
|
||||||
|
|
||||||
|
union bpf_attr {
|
||||||
|
struct { /* anonymous struct used by BPF_MAP_CREATE command */
|
||||||
|
__u32 map_type; /* one of enum bpf_map_type */
|
||||||
|
__u32 key_size; /* size of key in bytes */
|
||||||
|
__u32 value_size; /* size of value in bytes */
|
||||||
|
__u32 max_entries; /* max number of entries in a map */
|
||||||
|
};
|
||||||
|
} __attribute__((aligned(8)));
|
||||||
|
|
||||||
#endif /* _UAPI__LINUX_BPF_H__ */
|
#endif /* _UAPI__LINUX_BPF_H__ */
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
obj-y := core.o
|
obj-y := core.o syscall.o
|
||||||
|
|
|
@ -0,0 +1,169 @@
|
||||||
|
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of version 2 of the GNU General Public
|
||||||
|
* License as published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but
|
||||||
|
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* General Public License for more details.
|
||||||
|
*/
|
||||||
|
#include <linux/bpf.h>
|
||||||
|
#include <linux/syscalls.h>
|
||||||
|
#include <linux/slab.h>
|
||||||
|
#include <linux/anon_inodes.h>
|
||||||
|
|
||||||
|
static LIST_HEAD(bpf_map_types);
|
||||||
|
|
||||||
|
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
|
||||||
|
{
|
||||||
|
struct bpf_map_type_list *tl;
|
||||||
|
struct bpf_map *map;
|
||||||
|
|
||||||
|
list_for_each_entry(tl, &bpf_map_types, list_node) {
|
||||||
|
if (tl->type == attr->map_type) {
|
||||||
|
map = tl->ops->map_alloc(attr);
|
||||||
|
if (IS_ERR(map))
|
||||||
|
return map;
|
||||||
|
map->ops = tl->ops;
|
||||||
|
map->map_type = attr->map_type;
|
||||||
|
return map;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ERR_PTR(-EINVAL);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* boot time registration of different map implementations */
|
||||||
|
void bpf_register_map_type(struct bpf_map_type_list *tl)
|
||||||
|
{
|
||||||
|
list_add(&tl->list_node, &bpf_map_types);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* called from workqueue */
|
||||||
|
static void bpf_map_free_deferred(struct work_struct *work)
|
||||||
|
{
|
||||||
|
struct bpf_map *map = container_of(work, struct bpf_map, work);
|
||||||
|
|
||||||
|
/* implementation dependent freeing */
|
||||||
|
map->ops->map_free(map);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* decrement map refcnt and schedule it for freeing via workqueue
|
||||||
|
* (unrelying map implementation ops->map_free() might sleep)
|
||||||
|
*/
|
||||||
|
void bpf_map_put(struct bpf_map *map)
|
||||||
|
{
|
||||||
|
if (atomic_dec_and_test(&map->refcnt)) {
|
||||||
|
INIT_WORK(&map->work, bpf_map_free_deferred);
|
||||||
|
schedule_work(&map->work);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static int bpf_map_release(struct inode *inode, struct file *filp)
|
||||||
|
{
|
||||||
|
struct bpf_map *map = filp->private_data;
|
||||||
|
|
||||||
|
bpf_map_put(map);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static const struct file_operations bpf_map_fops = {
|
||||||
|
.release = bpf_map_release,
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
 * Helper macro to check that the unused fields of 'union bpf_attr' are zero.
 *
 * Expects a 'union bpf_attr *attr' in the caller's scope and a
 * CMD##_LAST_FIELD macro naming the last field command CMD uses.
 * Evaluates to non-zero when any byte between the end of that field and
 * the end of the union is non-zero.
 */
#define CHECK_ATTR(CMD) \
	(memchr_inv((void *) attr + \
		    offsetof(union bpf_attr, CMD##_LAST_FIELD) + \
		    sizeof(attr->CMD##_LAST_FIELD), \
		    0, \
		    sizeof(*attr) - \
		    (offsetof(union bpf_attr, CMD##_LAST_FIELD) + \
		     sizeof(attr->CMD##_LAST_FIELD))) != NULL)
|
||||||
|
|
||||||
|
#define BPF_MAP_CREATE_LAST_FIELD max_entries
|
||||||
|
/* called via syscall */
|
||||||
|
static int map_create(union bpf_attr *attr)
|
||||||
|
{
|
||||||
|
struct bpf_map *map;
|
||||||
|
int err;
|
||||||
|
|
||||||
|
err = CHECK_ATTR(BPF_MAP_CREATE);
|
||||||
|
if (err)
|
||||||
|
return -EINVAL;
|
||||||
|
|
||||||
|
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
|
||||||
|
map = find_and_alloc_map(attr);
|
||||||
|
if (IS_ERR(map))
|
||||||
|
return PTR_ERR(map);
|
||||||
|
|
||||||
|
atomic_set(&map->refcnt, 1);
|
||||||
|
|
||||||
|
err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
|
||||||
|
|
||||||
|
if (err < 0)
|
||||||
|
/* failed to allocate fd */
|
||||||
|
goto free_map;
|
||||||
|
|
||||||
|
return err;
|
||||||
|
|
||||||
|
free_map:
|
||||||
|
map->ops->map_free(map);
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * bpf() syscall entry point: validate and copy the user attribute block,
 * then dispatch on @cmd.
 *
 * @cmd:   one of enum bpf_cmd
 * @uattr: user pointer to the attributes
 * @size:  number of bytes userspace provides at @uattr
 *
 * Returns the command's result, or
 *   -EPERM  when the caller lacks CAP_SYS_ADMIN,
 *   -EFAULT when @uattr cannot be read,
 *   -E2BIG  when @size exceeds PAGE_SIZE or bytes beyond the kernel's
 *           union bpf_attr are non-zero,
 *   -EINVAL for an unknown @cmd.
 */
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
	union bpf_attr attr = {};
	int err;

	/* the syscall is limited to root temporarily. This restriction will be
	 * lifted when security audit is clean. Note that eBPF+tracing must have
	 * this restriction, since it may pass kernel data to user space
	 */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!access_ok(VERIFY_READ, uattr, 1))
		return -EFAULT;

	/* silly large */
	if (size > PAGE_SIZE)
		return -E2BIG;

	/* If we're handed a bigger struct than we know of, ensure all the
	 * unknown bits are 0 - i.e. new user-space does not rely on any
	 * kernel feature extensions we don't know about yet.
	 */
	if (size > sizeof(attr)) {
		unsigned char __user *cur = (void __user *)uattr + sizeof(attr);
		unsigned char __user *end = (void __user *)uattr + size;
		unsigned char byte;

		while (cur < end) {
			err = get_user(byte, cur);
			if (err)
				return err;
			if (byte)
				return -E2BIG;
			cur++;
		}

		/* the tail is all zeroes, only the known part needs copying */
		size = sizeof(attr);
	}

	/* copy attributes from user space, may be less than sizeof(bpf_attr);
	 * the remainder of attr keeps its zero initialization
	 */
	if (copy_from_user(&attr, uattr, size) != 0)
		return -EFAULT;

	switch (cmd) {
	case BPF_MAP_CREATE:
		return map_create(&attr);
	default:
		return -EINVAL;
	}
}
|
Loading…
Reference in New Issue