bpf, sockmap: convert to generic sk_msg interface

Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.

This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.

Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.

The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.

Joint work with John.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
This commit is contained in:
Daniel Borkmann 2018-10-13 02:45:58 +02:00 committed by Alexei Starovoitov
parent 1243a51f6c
commit 604326b41a
17 changed files with 2926 additions and 2861 deletions

View File

@ -737,33 +737,18 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map)
} }
#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET) #if defined(CONFIG_BPF_STREAM_PARSER)
struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, u32 which);
struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key); int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
int sockmap_get_from_fd(const union bpf_attr *attr, int type,
struct bpf_prog *prog);
#else #else
static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) static inline int sock_map_prog_update(struct bpf_map *map,
{ struct bpf_prog *prog, u32 which)
return NULL;
}
static inline struct sock *__sock_hash_lookup_elem(struct bpf_map *map,
void *key)
{
return NULL;
}
static inline int sock_map_prog(struct bpf_map *map,
struct bpf_prog *prog,
u32 type)
{ {
return -EOPNOTSUPP; return -EOPNOTSUPP;
} }
static inline int sockmap_get_from_fd(const union bpf_attr *attr, int type, static inline int sock_map_get_from_fd(const union bpf_attr *attr,
struct bpf_prog *prog) struct bpf_prog *prog)
{ {
return -EINVAL; return -EINVAL;
} }
@ -839,6 +824,10 @@ extern const struct bpf_func_proto bpf_get_stack_proto;
extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto;
extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto;
extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
extern const struct bpf_func_proto bpf_msg_redirect_hash_proto;
extern const struct bpf_func_proto bpf_msg_redirect_map_proto;
extern const struct bpf_func_proto bpf_sk_redirect_hash_proto;
extern const struct bpf_func_proto bpf_sk_redirect_map_proto;
extern const struct bpf_func_proto bpf_get_local_storage_proto; extern const struct bpf_func_proto bpf_get_local_storage_proto;

View File

@ -57,7 +57,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
#ifdef CONFIG_NET #ifdef CONFIG_NET
BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET) #if defined(CONFIG_BPF_STREAM_PARSER)
BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
#endif #endif

View File

@ -520,24 +520,6 @@ struct bpf_skb_data_end {
void *data_end; void *data_end;
}; };
struct sk_msg_buff {
void *data;
void *data_end;
__u32 apply_bytes;
__u32 cork_bytes;
int sg_copybreak;
int sg_start;
int sg_curr;
int sg_end;
struct scatterlist sg_data[MAX_SKB_FRAGS];
bool sg_copy[MAX_SKB_FRAGS];
__u32 flags;
struct sock *sk_redir;
struct sock *sk;
struct sk_buff *skb;
struct list_head list;
};
struct bpf_redirect_info { struct bpf_redirect_info {
u32 ifindex; u32 ifindex;
u32 flags; u32 flags;
@ -833,9 +815,6 @@ void xdp_do_flush_map(void);
void bpf_warn_invalid_xdp_action(u32 act); void bpf_warn_invalid_xdp_action(u32 act);
struct sock *do_sk_redirect_map(struct sk_buff *skb);
struct sock *do_msg_redirect_map(struct sk_msg_buff *md);
#ifdef CONFIG_INET #ifdef CONFIG_INET
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
struct bpf_prog *prog, struct sk_buff *skb, struct bpf_prog *prog, struct sk_buff *skb,

371
include/linux/skmsg.h Normal file
View File

@ -0,0 +1,371 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
#ifndef _LINUX_SKMSG_H
#define _LINUX_SKMSG_H
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/scatterlist.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <net/strparser.h>
#define MAX_MSG_FRAGS MAX_SKB_FRAGS
enum __sk_action {
__SK_DROP = 0,
__SK_PASS,
__SK_REDIRECT,
__SK_NONE,
};
struct sk_msg_sg {
u32 start;
u32 curr;
u32 end;
u32 size;
u32 copybreak;
bool copy[MAX_MSG_FRAGS];
struct scatterlist data[MAX_MSG_FRAGS];
};
struct sk_msg {
struct sk_msg_sg sg;
void *data;
void *data_end;
u32 apply_bytes;
u32 cork_bytes;
u32 flags;
struct sk_buff *skb;
struct sock *sk_redir;
struct sock *sk;
struct list_head list;
};
struct sk_psock_progs {
struct bpf_prog *msg_parser;
struct bpf_prog *skb_parser;
struct bpf_prog *skb_verdict;
};
enum sk_psock_state_bits {
SK_PSOCK_TX_ENABLED,
};
struct sk_psock_link {
struct list_head list;
struct bpf_map *map;
void *link_raw;
};
struct sk_psock_parser {
struct strparser strp;
bool enabled;
void (*saved_data_ready)(struct sock *sk);
};
struct sk_psock_work_state {
struct sk_buff *skb;
u32 len;
u32 off;
};
struct sk_psock {
struct sock *sk;
struct sock *sk_redir;
u32 apply_bytes;
u32 cork_bytes;
u32 eval;
struct sk_msg *cork;
struct sk_psock_progs progs;
struct sk_psock_parser parser;
struct sk_buff_head ingress_skb;
struct list_head ingress_msg;
unsigned long state;
struct list_head link;
spinlock_t link_lock;
refcount_t refcnt;
void (*saved_unhash)(struct sock *sk);
void (*saved_close)(struct sock *sk, long timeout);
void (*saved_write_space)(struct sock *sk);
struct proto *sk_proto;
struct sk_psock_work_state work_state;
struct work_struct work;
union {
struct rcu_head rcu;
struct work_struct gc;
};
};
int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
int elem_first_coalesce);
void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len);
int sk_msg_free(struct sock *sk, struct sk_msg *msg);
int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg);
void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes);
void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg,
u32 bytes);
void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes);
int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
struct sk_msg *msg, u32 bytes);
int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
struct sk_msg *msg, u32 bytes);
static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes)
{
WARN_ON(i == msg->sg.end && bytes);
}
static inline void sk_msg_apply_bytes(struct sk_psock *psock, u32 bytes)
{
if (psock->apply_bytes) {
if (psock->apply_bytes < bytes)
psock->apply_bytes = 0;
else
psock->apply_bytes -= bytes;
}
}
#define sk_msg_iter_var_prev(var) \
do { \
if (var == 0) \
var = MAX_MSG_FRAGS - 1; \
else \
var--; \
} while (0)
#define sk_msg_iter_var_next(var) \
do { \
var++; \
if (var == MAX_MSG_FRAGS) \
var = 0; \
} while (0)
#define sk_msg_iter_prev(msg, which) \
sk_msg_iter_var_prev(msg->sg.which)
#define sk_msg_iter_next(msg, which) \
sk_msg_iter_var_next(msg->sg.which)
static inline void sk_msg_clear_meta(struct sk_msg *msg)
{
memset(&msg->sg, 0, offsetofend(struct sk_msg_sg, copy));
}
static inline void sk_msg_init(struct sk_msg *msg)
{
memset(msg, 0, sizeof(*msg));
sg_init_marker(msg->sg.data, ARRAY_SIZE(msg->sg.data));
}
static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src,
int which, u32 size)
{
dst->sg.data[which] = src->sg.data[which];
dst->sg.data[which].length = size;
src->sg.data[which].length -= size;
src->sg.data[which].offset += size;
}
static inline u32 sk_msg_elem_used(const struct sk_msg *msg)
{
return msg->sg.end >= msg->sg.start ?
msg->sg.end - msg->sg.start :
msg->sg.end + (MAX_MSG_FRAGS - msg->sg.start);
}
static inline bool sk_msg_full(const struct sk_msg *msg)
{
return (msg->sg.end == msg->sg.start) && msg->sg.size;
}
static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which)
{
return &msg->sg.data[which];
}
static inline struct page *sk_msg_page(struct sk_msg *msg, int which)
{
return sg_page(sk_msg_elem(msg, which));
}
static inline bool sk_msg_to_ingress(const struct sk_msg *msg)
{
return msg->flags & BPF_F_INGRESS;
}
static inline void sk_msg_compute_data_pointers(struct sk_msg *msg)
{
struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start);
if (msg->sg.copy[msg->sg.start]) {
msg->data = NULL;
msg->data_end = NULL;
} else {
msg->data = sg_virt(sge);
msg->data_end = msg->data + sge->length;
}
}
static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page,
u32 len, u32 offset)
{
struct scatterlist *sge;
get_page(page);
sge = sk_msg_elem(msg, msg->sg.end);
sg_set_page(sge, page, len, offset);
sg_unmark_end(sge);
msg->sg.copy[msg->sg.end] = true;
msg->sg.size += len;
sk_msg_iter_next(msg, end);
}
static inline struct sk_psock *sk_psock(const struct sock *sk)
{
return rcu_dereference_sk_user_data(sk);
}
static inline bool sk_has_psock(struct sock *sk)
{
return sk_psock(sk) != NULL && sk->sk_prot->recvmsg == tcp_bpf_recvmsg;
}
static inline void sk_psock_queue_msg(struct sk_psock *psock,
struct sk_msg *msg)
{
list_add_tail(&msg->list, &psock->ingress_msg);
}
static inline void sk_psock_report_error(struct sk_psock *psock, int err)
{
struct sock *sk = psock->sk;
sk->sk_err = err;
sk->sk_error_report(sk);
}
struct sk_psock *sk_psock_init(struct sock *sk, int node);
int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock);
void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock);
void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock);
int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
struct sk_msg *msg);
static inline struct sk_psock_link *sk_psock_init_link(void)
{
return kzalloc(sizeof(struct sk_psock_link),
GFP_ATOMIC | __GFP_NOWARN);
}
static inline void sk_psock_free_link(struct sk_psock_link *link)
{
kfree(link);
}
struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock);
#if defined(CONFIG_BPF_STREAM_PARSER)
void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link);
#else
static inline void sk_psock_unlink(struct sock *sk,
struct sk_psock_link *link)
{
}
#endif
void __sk_psock_purge_ingress_msg(struct sk_psock *psock);
static inline void sk_psock_cork_free(struct sk_psock *psock)
{
if (psock->cork) {
sk_msg_free(psock->sk, psock->cork);
kfree(psock->cork);
psock->cork = NULL;
}
}
static inline void sk_psock_update_proto(struct sock *sk,
struct sk_psock *psock,
struct proto *ops)
{
psock->saved_unhash = sk->sk_prot->unhash;
psock->saved_close = sk->sk_prot->close;
psock->saved_write_space = sk->sk_write_space;
psock->sk_proto = sk->sk_prot;
sk->sk_prot = ops;
}
static inline void sk_psock_restore_proto(struct sock *sk,
struct sk_psock *psock)
{
if (psock->sk_proto) {
sk->sk_prot = psock->sk_proto;
psock->sk_proto = NULL;
}
}
static inline void sk_psock_set_state(struct sk_psock *psock,
enum sk_psock_state_bits bit)
{
set_bit(bit, &psock->state);
}
static inline void sk_psock_clear_state(struct sk_psock *psock,
enum sk_psock_state_bits bit)
{
clear_bit(bit, &psock->state);
}
static inline bool sk_psock_test_state(const struct sk_psock *psock,
enum sk_psock_state_bits bit)
{
return test_bit(bit, &psock->state);
}
static inline struct sk_psock *sk_psock_get(struct sock *sk)
{
struct sk_psock *psock;
rcu_read_lock();
psock = sk_psock(sk);
if (psock && !refcount_inc_not_zero(&psock->refcnt))
psock = NULL;
rcu_read_unlock();
return psock;
}
void sk_psock_stop(struct sock *sk, struct sk_psock *psock);
void sk_psock_destroy(struct rcu_head *rcu);
void sk_psock_drop(struct sock *sk, struct sk_psock *psock);
static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock)
{
if (refcount_dec_and_test(&psock->refcnt))
sk_psock_drop(sk, psock);
}
static inline void psock_set_prog(struct bpf_prog **pprog,
struct bpf_prog *prog)
{
prog = xchg(pprog, prog);
if (prog)
bpf_prog_put(prog);
}
static inline void psock_progs_drop(struct sk_psock_progs *progs)
{
psock_set_prog(&progs->msg_parser, NULL);
psock_set_prog(&progs->skb_parser, NULL);
psock_set_prog(&progs->skb_verdict, NULL);
}
#endif /* _LINUX_SKMSG_H */

View File

@ -858,6 +858,21 @@ static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
} }
static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb)
{
return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS;
}
static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb)
{
return TCP_SKB_CB(skb)->bpf.sk_redir;
}
static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb)
{
TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
}
#if IS_ENABLED(CONFIG_IPV6) #if IS_ENABLED(CONFIG_IPV6)
/* This is the variant of inet6_iif() that must be used by TCP, /* This is the variant of inet6_iif() that must be used by TCP,
* as TCP moves IP6CB into a different location in skb->cb[] * as TCP moves IP6CB into a different location in skb->cb[]
@ -2064,6 +2079,18 @@ void tcp_cleanup_ulp(struct sock *sk);
__MODULE_INFO(alias, alias_userspace, name); \ __MODULE_INFO(alias, alias_userspace, name); \
__MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name) __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name)
struct sk_msg;
struct sk_psock;
int tcp_bpf_init(struct sock *sk);
void tcp_bpf_reinit(struct sock *sk);
int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes,
int flags);
int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int nonblock, int flags, int *addr_len);
int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
struct msghdr *msg, int len);
/* Call BPF_SOCK_OPS program that returns an int. If the return value /* Call BPF_SOCK_OPS program that returns an int. If the return value
* is < 0, then the BPF op failed (for example if the loaded BPF * is < 0, then the BPF op failed (for example if the loaded BPF
* program does not support the chosen operation or there is no BPF * program does not support the chosen operation or there is no BPF

View File

@ -13,11 +13,6 @@ ifeq ($(CONFIG_XDP_SOCKETS),y)
obj-$(CONFIG_BPF_SYSCALL) += xskmap.o obj-$(CONFIG_BPF_SYSCALL) += xskmap.o
endif endif
obj-$(CONFIG_BPF_SYSCALL) += offload.o obj-$(CONFIG_BPF_SYSCALL) += offload.o
ifeq ($(CONFIG_STREAM_PARSER),y)
ifeq ($(CONFIG_INET),y)
obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
endif
endif
endif endif
ifeq ($(CONFIG_PERF_EVENTS),y) ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o obj-$(CONFIG_BPF_SYSCALL) += stackmap.o

View File

@ -1792,8 +1792,6 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak;
const struct bpf_func_proto bpf_sock_map_update_proto __weak;
const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak;

File diff suppressed because it is too large Load Diff

View File

@ -1664,7 +1664,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
switch (ptype) { switch (ptype) {
case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_SKB:
case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_SK_MSG:
ret = sockmap_get_from_fd(attr, ptype, prog); ret = sock_map_get_from_fd(attr, prog);
break; break;
case BPF_PROG_TYPE_LIRC_MODE2: case BPF_PROG_TYPE_LIRC_MODE2:
ret = lirc_prog_attach(attr, prog); ret = lirc_prog_attach(attr, prog);
@ -1718,10 +1718,10 @@ static int bpf_prog_detach(const union bpf_attr *attr)
ptype = BPF_PROG_TYPE_CGROUP_DEVICE; ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
break; break;
case BPF_SK_MSG_VERDICT: case BPF_SK_MSG_VERDICT:
return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL); return sock_map_get_from_fd(attr, NULL);
case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT: case BPF_SK_SKB_STREAM_VERDICT:
return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL); return sock_map_get_from_fd(attr, NULL);
case BPF_LIRC_MODE2: case BPF_LIRC_MODE2:
return lirc_prog_detach(attr); return lirc_prog_detach(attr);
case BPF_FLOW_DISSECTOR: case BPF_FLOW_DISSECTOR:

View File

@ -300,8 +300,11 @@ config BPF_JIT
config BPF_STREAM_PARSER config BPF_STREAM_PARSER
bool "enable BPF STREAM_PARSER" bool "enable BPF STREAM_PARSER"
depends on INET
depends on BPF_SYSCALL depends on BPF_SYSCALL
depends on CGROUP_BPF
select STREAM_PARSER select STREAM_PARSER
select NET_SOCK_MSG
---help--- ---help---
Enabling this allows a stream parser to be used with Enabling this allows a stream parser to be used with
BPF_MAP_TYPE_SOCKMAP. BPF_MAP_TYPE_SOCKMAP.
@ -413,6 +416,14 @@ config GRO_CELLS
config SOCK_VALIDATE_XMIT config SOCK_VALIDATE_XMIT
bool bool
config NET_SOCK_MSG
bool
default n
help
The NET_SOCK_MSG provides a framework for plain sockets (e.g. TCP) or
ULPs (upper layer modules, e.g. TLS) to process L7 application data
with the help of BPF programs.
config NET_DEVLINK config NET_DEVLINK
tristate "Network physical/parent device Netlink interface" tristate "Network physical/parent device Netlink interface"
help help

View File

@ -16,6 +16,7 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
obj-y += net-sysfs.o obj-y += net-sysfs.o
obj-$(CONFIG_PAGE_POOL) += page_pool.o obj-$(CONFIG_PAGE_POOL) += page_pool.o
obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o
obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_NETPOLL) += netpoll.o
obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_FIB_RULES) += fib_rules.o
@ -27,6 +28,7 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
obj-$(CONFIG_LWTUNNEL) += lwtunnel.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o
obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_DST_CACHE) += dst_cache.o
obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_HWBM) += hwbm.o
obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_NET_DEVLINK) += devlink.o

View File

@ -38,6 +38,7 @@
#include <net/protocol.h> #include <net/protocol.h>
#include <net/netlink.h> #include <net/netlink.h>
#include <linux/skbuff.h> #include <linux/skbuff.h>
#include <linux/skmsg.h>
#include <net/sock.h> #include <net/sock.h>
#include <net/flow_dissector.h> #include <net/flow_dissector.h>
#include <linux/errno.h> #include <linux/errno.h>
@ -2142,123 +2143,7 @@ static const struct bpf_func_proto bpf_redirect_proto = {
.arg2_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING,
}; };
BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
struct bpf_map *, map, void *, key, u64, flags)
{
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
/* If user passes invalid input drop the packet. */
if (unlikely(flags & ~(BPF_F_INGRESS)))
return SK_DROP;
tcb->bpf.flags = flags;
tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
if (!tcb->bpf.sk_redir)
return SK_DROP;
return SK_PASS;
}
static const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
.func = bpf_sk_redirect_hash,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_PTR_TO_MAP_KEY,
.arg4_type = ARG_ANYTHING,
};
BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
struct bpf_map *, map, u32, key, u64, flags)
{
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
/* If user passes invalid input drop the packet. */
if (unlikely(flags & ~(BPF_F_INGRESS)))
return SK_DROP;
tcb->bpf.flags = flags;
tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
if (!tcb->bpf.sk_redir)
return SK_DROP;
return SK_PASS;
}
struct sock *do_sk_redirect_map(struct sk_buff *skb)
{
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
return tcb->bpf.sk_redir;
}
static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
.func = bpf_sk_redirect_map,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_ANYTHING,
};
BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg,
struct bpf_map *, map, void *, key, u64, flags)
{
/* If user passes invalid input drop the packet. */
if (unlikely(flags & ~(BPF_F_INGRESS)))
return SK_DROP;
msg->flags = flags;
msg->sk_redir = __sock_hash_lookup_elem(map, key);
if (!msg->sk_redir)
return SK_DROP;
return SK_PASS;
}
static const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
.func = bpf_msg_redirect_hash,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_PTR_TO_MAP_KEY,
.arg4_type = ARG_ANYTHING,
};
BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
struct bpf_map *, map, u32, key, u64, flags)
{
/* If user passes invalid input drop the packet. */
if (unlikely(flags & ~(BPF_F_INGRESS)))
return SK_DROP;
msg->flags = flags;
msg->sk_redir = __sock_map_lookup_elem(map, key);
if (!msg->sk_redir)
return SK_DROP;
return SK_PASS;
}
struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
{
return msg->sk_redir;
}
static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
.func = bpf_msg_redirect_map,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_ANYTHING,
};
BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes)
{ {
msg->apply_bytes = bytes; msg->apply_bytes = bytes;
return 0; return 0;
@ -2272,7 +2157,7 @@ static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
.arg2_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING,
}; };
BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes) BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes)
{ {
msg->cork_bytes = bytes; msg->cork_bytes = bytes;
return 0; return 0;
@ -2286,45 +2171,37 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
.arg2_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING,
}; };
#define sk_msg_iter_var(var) \ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
do { \ u32, end, u64, flags)
var++; \
if (var == MAX_SKB_FRAGS) \
var = 0; \
} while (0)
BPF_CALL_4(bpf_msg_pull_data,
struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
{ {
unsigned int len = 0, offset = 0, copy = 0, poffset = 0; u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start;
int bytes = end - start, bytes_sg_total; u32 first_sge, last_sge, i, shift, bytes_sg_total;
struct scatterlist *sg = msg->sg_data; struct scatterlist *sge;
int first_sg, last_sg, i, shift; u8 *raw, *to, *from;
unsigned char *p, *to, *from;
struct page *page; struct page *page;
if (unlikely(flags || end <= start)) if (unlikely(flags || end <= start))
return -EINVAL; return -EINVAL;
/* First find the starting scatterlist element */ /* First find the starting scatterlist element */
i = msg->sg_start; i = msg->sg.start;
do { do {
len = sg[i].length; len = sk_msg_elem(msg, i)->length;
if (start < offset + len) if (start < offset + len)
break; break;
offset += len; offset += len;
sk_msg_iter_var(i); sk_msg_iter_var_next(i);
} while (i != msg->sg_end); } while (i != msg->sg.end);
if (unlikely(start >= offset + len)) if (unlikely(start >= offset + len))
return -EINVAL; return -EINVAL;
first_sg = i; first_sge = i;
/* The start may point into the sg element so we need to also /* The start may point into the sg element so we need to also
* account for the headroom. * account for the headroom.
*/ */
bytes_sg_total = start - offset + bytes; bytes_sg_total = start - offset + bytes;
if (!msg->sg_copy[i] && bytes_sg_total <= len) if (!msg->sg.copy[i] && bytes_sg_total <= len)
goto out; goto out;
/* At this point we need to linearize multiple scatterlist /* At this point we need to linearize multiple scatterlist
@ -2338,12 +2215,12 @@ BPF_CALL_4(bpf_msg_pull_data,
* will copy the entire sg entry. * will copy the entire sg entry.
*/ */
do { do {
copy += sg[i].length; copy += sk_msg_elem(msg, i)->length;
sk_msg_iter_var(i); sk_msg_iter_var_next(i);
if (bytes_sg_total <= copy) if (bytes_sg_total <= copy)
break; break;
} while (i != msg->sg_end); } while (i != msg->sg.end);
last_sg = i; last_sge = i;
if (unlikely(bytes_sg_total > copy)) if (unlikely(bytes_sg_total > copy))
return -EINVAL; return -EINVAL;
@ -2352,63 +2229,61 @@ BPF_CALL_4(bpf_msg_pull_data,
get_order(copy)); get_order(copy));
if (unlikely(!page)) if (unlikely(!page))
return -ENOMEM; return -ENOMEM;
p = page_address(page);
i = first_sg; raw = page_address(page);
i = first_sge;
do { do {
from = sg_virt(&sg[i]); sge = sk_msg_elem(msg, i);
len = sg[i].length; from = sg_virt(sge);
to = p + poffset; len = sge->length;
to = raw + poffset;
memcpy(to, from, len); memcpy(to, from, len);
poffset += len; poffset += len;
sg[i].length = 0; sge->length = 0;
put_page(sg_page(&sg[i])); put_page(sg_page(sge));
sk_msg_iter_var(i); sk_msg_iter_var_next(i);
} while (i != last_sg); } while (i != last_sge);
sg[first_sg].length = copy; sg_set_page(&msg->sg.data[first_sge], page, copy, 0);
sg_set_page(&sg[first_sg], page, copy, 0);
/* To repair sg ring we need to shift entries. If we only /* To repair sg ring we need to shift entries. If we only
* had a single entry though we can just replace it and * had a single entry though we can just replace it and
* be done. Otherwise walk the ring and shift the entries. * be done. Otherwise walk the ring and shift the entries.
*/ */
WARN_ON_ONCE(last_sg == first_sg); WARN_ON_ONCE(last_sge == first_sge);
shift = last_sg > first_sg ? shift = last_sge > first_sge ?
last_sg - first_sg - 1 : last_sge - first_sge - 1 :
MAX_SKB_FRAGS - first_sg + last_sg - 1; MAX_SKB_FRAGS - first_sge + last_sge - 1;
if (!shift) if (!shift)
goto out; goto out;
i = first_sg; i = first_sge;
sk_msg_iter_var(i); sk_msg_iter_var_next(i);
do { do {
int move_from; u32 move_from;
if (i + shift >= MAX_SKB_FRAGS) if (i + shift >= MAX_MSG_FRAGS)
move_from = i + shift - MAX_SKB_FRAGS; move_from = i + shift - MAX_MSG_FRAGS;
else else
move_from = i + shift; move_from = i + shift;
if (move_from == msg->sg.end)
if (move_from == msg->sg_end)
break; break;
sg[i] = sg[move_from]; msg->sg.data[i] = msg->sg.data[move_from];
sg[move_from].length = 0; msg->sg.data[move_from].length = 0;
sg[move_from].page_link = 0; msg->sg.data[move_from].page_link = 0;
sg[move_from].offset = 0; msg->sg.data[move_from].offset = 0;
sk_msg_iter_var_next(i);
sk_msg_iter_var(i);
} while (1); } while (1);
msg->sg_end -= shift;
if (msg->sg_end < 0)
msg->sg_end += MAX_SKB_FRAGS;
out:
msg->data = sg_virt(&sg[first_sg]) + start - offset;
msg->data_end = msg->data + bytes;
msg->sg.end = msg->sg.end - shift > msg->sg.end ?
msg->sg.end - shift + MAX_MSG_FRAGS :
msg->sg.end - shift;
out:
msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
msg->data_end = msg->data + bytes;
return 0; return 0;
} }
@ -5203,6 +5078,9 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
} }
} }
const struct bpf_func_proto bpf_sock_map_update_proto __weak;
const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
static const struct bpf_func_proto * static const struct bpf_func_proto *
sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{ {
@ -5226,6 +5104,9 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
} }
} }
const struct bpf_func_proto bpf_msg_redirect_map_proto __weak;
const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak;
static const struct bpf_func_proto * static const struct bpf_func_proto *
sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{ {
@ -5247,6 +5128,9 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
} }
} }
const struct bpf_func_proto bpf_sk_redirect_map_proto __weak;
const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak;
static const struct bpf_func_proto * static const struct bpf_func_proto *
sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{ {
@ -7001,22 +6885,22 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
switch (si->off) { switch (si->off) {
case offsetof(struct sk_msg_md, data): case offsetof(struct sk_msg_md, data):
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data), *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data),
si->dst_reg, si->src_reg, si->dst_reg, si->src_reg,
offsetof(struct sk_msg_buff, data)); offsetof(struct sk_msg, data));
break; break;
case offsetof(struct sk_msg_md, data_end): case offsetof(struct sk_msg_md, data_end):
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end), *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end),
si->dst_reg, si->src_reg, si->dst_reg, si->src_reg,
offsetof(struct sk_msg_buff, data_end)); offsetof(struct sk_msg, data_end));
break; break;
case offsetof(struct sk_msg_md, family): case offsetof(struct sk_msg_md, family):
BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct sk_msg_buff, sk), struct sk_msg, sk),
si->dst_reg, si->src_reg, si->dst_reg, si->src_reg,
offsetof(struct sk_msg_buff, sk)); offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
offsetof(struct sock_common, skc_family)); offsetof(struct sock_common, skc_family));
break; break;
@ -7025,9 +6909,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct sk_msg_buff, sk), struct sk_msg, sk),
si->dst_reg, si->src_reg, si->dst_reg, si->src_reg,
offsetof(struct sk_msg_buff, sk)); offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
offsetof(struct sock_common, skc_daddr)); offsetof(struct sock_common, skc_daddr));
break; break;
@ -7037,9 +6921,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
skc_rcv_saddr) != 4); skc_rcv_saddr) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct sk_msg_buff, sk), struct sk_msg, sk),
si->dst_reg, si->src_reg, si->dst_reg, si->src_reg,
offsetof(struct sk_msg_buff, sk)); offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
offsetof(struct sock_common, offsetof(struct sock_common,
skc_rcv_saddr)); skc_rcv_saddr));
@ -7054,9 +6938,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
off = si->off; off = si->off;
off -= offsetof(struct sk_msg_md, remote_ip6[0]); off -= offsetof(struct sk_msg_md, remote_ip6[0]);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct sk_msg_buff, sk), struct sk_msg, sk),
si->dst_reg, si->src_reg, si->dst_reg, si->src_reg,
offsetof(struct sk_msg_buff, sk)); offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
offsetof(struct sock_common, offsetof(struct sock_common,
skc_v6_daddr.s6_addr32[0]) + skc_v6_daddr.s6_addr32[0]) +
@ -7075,9 +6959,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
off = si->off; off = si->off;
off -= offsetof(struct sk_msg_md, local_ip6[0]); off -= offsetof(struct sk_msg_md, local_ip6[0]);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct sk_msg_buff, sk), struct sk_msg, sk),
si->dst_reg, si->src_reg, si->dst_reg, si->src_reg,
offsetof(struct sk_msg_buff, sk)); offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
offsetof(struct sock_common, offsetof(struct sock_common,
skc_v6_rcv_saddr.s6_addr32[0]) + skc_v6_rcv_saddr.s6_addr32[0]) +
@ -7091,9 +6975,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct sk_msg_buff, sk), struct sk_msg, sk),
si->dst_reg, si->src_reg, si->dst_reg, si->src_reg,
offsetof(struct sk_msg_buff, sk)); offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
offsetof(struct sock_common, skc_dport)); offsetof(struct sock_common, skc_dport));
#ifndef __BIG_ENDIAN_BITFIELD #ifndef __BIG_ENDIAN_BITFIELD
@ -7105,9 +6989,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct sk_msg_buff, sk), struct sk_msg, sk),
si->dst_reg, si->src_reg, si->dst_reg, si->src_reg,
offsetof(struct sk_msg_buff, sk)); offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
offsetof(struct sock_common, skc_num)); offsetof(struct sock_common, skc_num));
break; break;

763
net/core/skmsg.c Normal file
View File

@ -0,0 +1,763 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
#include <linux/skmsg.h>
#include <linux/skbuff.h>
#include <linux/scatterlist.h>
#include <net/sock.h>
#include <net/tcp.h>
static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce)
{
if (msg->sg.end > msg->sg.start &&
elem_first_coalesce < msg->sg.end)
return true;
if (msg->sg.end < msg->sg.start &&
(elem_first_coalesce > msg->sg.start ||
elem_first_coalesce < msg->sg.end))
return true;
return false;
}
int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
int elem_first_coalesce)
{
struct page_frag *pfrag = sk_page_frag(sk);
int ret = 0;
len -= msg->sg.size;
while (len > 0) {
struct scatterlist *sge;
u32 orig_offset;
int use, i;
if (!sk_page_frag_refill(sk, pfrag))
return -ENOMEM;
orig_offset = pfrag->offset;
use = min_t(int, len, pfrag->size - orig_offset);
if (!sk_wmem_schedule(sk, use))
return -ENOMEM;
i = msg->sg.end;
sk_msg_iter_var_prev(i);
sge = &msg->sg.data[i];
if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) &&
sg_page(sge) == pfrag->page &&
sge->offset + sge->length == orig_offset) {
sge->length += use;
} else {
if (sk_msg_full(msg)) {
ret = -ENOSPC;
break;
}
sge = &msg->sg.data[msg->sg.end];
sg_unmark_end(sge);
sg_set_page(sge, pfrag->page, use, orig_offset);
get_page(pfrag->page);
sk_msg_iter_next(msg, end);
}
sk_mem_charge(sk, use);
msg->sg.size += use;
pfrag->offset += use;
len -= use;
}
return ret;
}
EXPORT_SYMBOL_GPL(sk_msg_alloc);
void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes)
{
int i = msg->sg.start;
do {
struct scatterlist *sge = sk_msg_elem(msg, i);
if (bytes < sge->length) {
sge->length -= bytes;
sge->offset += bytes;
sk_mem_uncharge(sk, bytes);
break;
}
sk_mem_uncharge(sk, sge->length);
bytes -= sge->length;
sge->length = 0;
sge->offset = 0;
sk_msg_iter_var_next(i);
} while (bytes && i != msg->sg.end);
msg->sg.start = i;
}
EXPORT_SYMBOL_GPL(sk_msg_return_zero);
void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes)
{
int i = msg->sg.start;
do {
struct scatterlist *sge = &msg->sg.data[i];
int uncharge = (bytes < sge->length) ? bytes : sge->length;
sk_mem_uncharge(sk, uncharge);
bytes -= uncharge;
sk_msg_iter_var_next(i);
} while (i != msg->sg.end);
}
EXPORT_SYMBOL_GPL(sk_msg_return);
static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i,
bool charge)
{
struct scatterlist *sge = sk_msg_elem(msg, i);
u32 len = sge->length;
if (charge)
sk_mem_uncharge(sk, len);
if (!msg->skb)
put_page(sg_page(sge));
memset(sge, 0, sizeof(*sge));
return len;
}
static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i,
bool charge)
{
struct scatterlist *sge = sk_msg_elem(msg, i);
int freed = 0;
while (msg->sg.size) {
msg->sg.size -= sge->length;
freed += sk_msg_free_elem(sk, msg, i, charge);
sk_msg_iter_var_next(i);
sk_msg_check_to_free(msg, i, msg->sg.size);
sge = sk_msg_elem(msg, i);
}
if (msg->skb)
consume_skb(msg->skb);
sk_msg_init(msg);
return freed;
}
int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg)
{
return __sk_msg_free(sk, msg, msg->sg.start, false);
}
EXPORT_SYMBOL_GPL(sk_msg_free_nocharge);
int sk_msg_free(struct sock *sk, struct sk_msg *msg)
{
return __sk_msg_free(sk, msg, msg->sg.start, true);
}
EXPORT_SYMBOL_GPL(sk_msg_free);
static void __sk_msg_free_partial(struct sock *sk, struct sk_msg *msg,
u32 bytes, bool charge)
{
struct scatterlist *sge;
u32 i = msg->sg.start;
while (bytes) {
sge = sk_msg_elem(msg, i);
if (!sge->length)
break;
if (bytes < sge->length) {
if (charge)
sk_mem_uncharge(sk, bytes);
sge->length -= bytes;
sge->offset += bytes;
msg->sg.size -= bytes;
break;
}
msg->sg.size -= sge->length;
bytes -= sge->length;
sk_msg_free_elem(sk, msg, i, charge);
sk_msg_iter_var_next(i);
sk_msg_check_to_free(msg, i, bytes);
}
msg->sg.start = i;
}
void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes)
{
__sk_msg_free_partial(sk, msg, bytes, true);
}
EXPORT_SYMBOL_GPL(sk_msg_free_partial);
void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg,
u32 bytes)
{
__sk_msg_free_partial(sk, msg, bytes, false);
}
void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len)
{
int trim = msg->sg.size - len;
u32 i = msg->sg.end;
if (trim <= 0) {
WARN_ON(trim < 0);
return;
}
sk_msg_iter_var_prev(i);
msg->sg.size = len;
while (msg->sg.data[i].length &&
trim >= msg->sg.data[i].length) {
trim -= msg->sg.data[i].length;
sk_msg_free_elem(sk, msg, i, true);
sk_msg_iter_var_prev(i);
if (!trim)
goto out;
}
msg->sg.data[i].length -= trim;
sk_mem_uncharge(sk, trim);
out:
/* If we trim data before curr pointer update copybreak and current
* so that any future copy operations start at new copy location.
* However trimed data that has not yet been used in a copy op
* does not require an update.
*/
if (msg->sg.curr >= i) {
msg->sg.curr = i;
msg->sg.copybreak = msg->sg.data[i].length;
}
sk_msg_iter_var_next(i);
msg->sg.end = i;
}
EXPORT_SYMBOL_GPL(sk_msg_trim);
int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
struct sk_msg *msg, u32 bytes)
{
int i, maxpages, ret = 0, num_elems = sk_msg_elem_used(msg);
const int to_max_pages = MAX_MSG_FRAGS;
struct page *pages[MAX_MSG_FRAGS];
ssize_t orig, copied, use, offset;
orig = msg->sg.size;
while (bytes > 0) {
i = 0;
maxpages = to_max_pages - num_elems;
if (maxpages == 0) {
ret = -EFAULT;
goto out;
}
copied = iov_iter_get_pages(from, pages, bytes, maxpages,
&offset);
if (copied <= 0) {
ret = -EFAULT;
goto out;
}
iov_iter_advance(from, copied);
bytes -= copied;
msg->sg.size += copied;
while (copied) {
use = min_t(int, copied, PAGE_SIZE - offset);
sg_set_page(&msg->sg.data[msg->sg.end],
pages[i], use, offset);
sg_unmark_end(&msg->sg.data[msg->sg.end]);
sk_mem_charge(sk, use);
offset = 0;
copied -= use;
sk_msg_iter_next(msg, end);
num_elems++;
i++;
}
/* When zerocopy is mixed with sk_msg_*copy* operations we
* may have a copybreak set in this case clear and prefer
* zerocopy remainder when possible.
*/
msg->sg.copybreak = 0;
msg->sg.curr = msg->sg.end;
}
out:
/* Revert iov_iter updates, msg will need to use 'trim' later if it
* also needs to be cleared.
*/
if (ret)
iov_iter_revert(from, msg->sg.size - orig);
return ret;
}
EXPORT_SYMBOL_GPL(sk_msg_zerocopy_from_iter);
int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
struct sk_msg *msg, u32 bytes)
{
int ret = -ENOSPC, i = msg->sg.curr;
struct scatterlist *sge;
u32 copy, buf_size;
void *to;
do {
sge = sk_msg_elem(msg, i);
/* This is possible if a trim operation shrunk the buffer */
if (msg->sg.copybreak >= sge->length) {
msg->sg.copybreak = 0;
sk_msg_iter_var_next(i);
if (i == msg->sg.end)
break;
sge = sk_msg_elem(msg, i);
}
buf_size = sge->length - msg->sg.copybreak;
copy = (buf_size > bytes) ? bytes : buf_size;
to = sg_virt(sge) + msg->sg.copybreak;
msg->sg.copybreak += copy;
if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
ret = copy_from_iter_nocache(to, copy, from);
else
ret = copy_from_iter(to, copy, from);
if (ret != copy) {
ret = -EFAULT;
goto out;
}
bytes -= copy;
if (!bytes)
break;
msg->sg.copybreak = 0;
sk_msg_iter_var_next(i);
} while (i != msg->sg.end);
out:
msg->sg.curr = i;
return ret;
}
EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
{
struct sock *sk = psock->sk;
int copied = 0, num_sge;
struct sk_msg *msg;
msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
if (unlikely(!msg))
return -EAGAIN;
if (!sk_rmem_schedule(sk, skb, skb->len)) {
kfree(msg);
return -EAGAIN;
}
sk_msg_init(msg);
num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
if (unlikely(num_sge < 0)) {
kfree(msg);
return num_sge;
}
sk_mem_charge(sk, skb->len);
copied = skb->len;
msg->sg.start = 0;
msg->sg.end = num_sge == MAX_MSG_FRAGS ? 0 : num_sge;
msg->skb = skb;
sk_psock_queue_msg(psock, msg);
sk->sk_data_ready(sk);
return copied;
}
static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
u32 off, u32 len, bool ingress)
{
if (ingress)
return sk_psock_skb_ingress(psock, skb);
else
return skb_send_sock_locked(psock->sk, skb, off, len);
}
static void sk_psock_backlog(struct work_struct *work)
{
struct sk_psock *psock = container_of(work, struct sk_psock, work);
struct sk_psock_work_state *state = &psock->work_state;
struct sk_buff *skb;
bool ingress;
u32 len, off;
int ret;
/* Lock sock to avoid losing sk_socket during loop. */
lock_sock(psock->sk);
if (state->skb) {
skb = state->skb;
len = state->len;
off = state->off;
state->skb = NULL;
goto start;
}
while ((skb = skb_dequeue(&psock->ingress_skb))) {
len = skb->len;
off = 0;
start:
ingress = tcp_skb_bpf_ingress(skb);
do {
ret = -EIO;
if (likely(psock->sk->sk_socket))
ret = sk_psock_handle_skb(psock, skb, off,
len, ingress);
if (ret <= 0) {
if (ret == -EAGAIN) {
state->skb = skb;
state->len = len;
state->off = off;
goto end;
}
/* Hard errors break pipe and stop xmit. */
sk_psock_report_error(psock, ret ? -ret : EPIPE);
sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
kfree_skb(skb);
goto end;
}
off += ret;
len -= ret;
} while (len);
if (!ingress)
kfree_skb(skb);
}
end:
release_sock(psock->sk);
}
struct sk_psock *sk_psock_init(struct sock *sk, int node)
{
struct sk_psock *psock = kzalloc_node(sizeof(*psock),
GFP_ATOMIC | __GFP_NOWARN,
node);
if (!psock)
return NULL;
psock->sk = sk;
psock->eval = __SK_NONE;
INIT_LIST_HEAD(&psock->link);
spin_lock_init(&psock->link_lock);
INIT_WORK(&psock->work, sk_psock_backlog);
INIT_LIST_HEAD(&psock->ingress_msg);
skb_queue_head_init(&psock->ingress_skb);
sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
refcount_set(&psock->refcnt, 1);
rcu_assign_sk_user_data(sk, psock);
sock_hold(sk);
return psock;
}
EXPORT_SYMBOL_GPL(sk_psock_init);
struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock)
{
struct sk_psock_link *link;
spin_lock_bh(&psock->link_lock);
link = list_first_entry_or_null(&psock->link, struct sk_psock_link,
list);
if (link)
list_del(&link->list);
spin_unlock_bh(&psock->link_lock);
return link;
}
void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
{
struct sk_msg *msg, *tmp;
list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) {
list_del(&msg->list);
sk_msg_free(psock->sk, msg);
kfree(msg);
}
}
static void sk_psock_zap_ingress(struct sk_psock *psock)
{
__skb_queue_purge(&psock->ingress_skb);
__sk_psock_purge_ingress_msg(psock);
}
static void sk_psock_link_destroy(struct sk_psock *psock)
{
struct sk_psock_link *link, *tmp;
list_for_each_entry_safe(link, tmp, &psock->link, list) {
list_del(&link->list);
sk_psock_free_link(link);
}
}
static void sk_psock_destroy_deferred(struct work_struct *gc)
{
struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
/* No sk_callback_lock since already detached. */
if (psock->parser.enabled)
strp_done(&psock->parser.strp);
cancel_work_sync(&psock->work);
psock_progs_drop(&psock->progs);
sk_psock_link_destroy(psock);
sk_psock_cork_free(psock);
sk_psock_zap_ingress(psock);
if (psock->sk_redir)
sock_put(psock->sk_redir);
sock_put(psock->sk);
kfree(psock);
}
void sk_psock_destroy(struct rcu_head *rcu)
{
struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu);
INIT_WORK(&psock->gc, sk_psock_destroy_deferred);
schedule_work(&psock->gc);
}
EXPORT_SYMBOL_GPL(sk_psock_destroy);
void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
{
rcu_assign_sk_user_data(sk, NULL);
sk_psock_cork_free(psock);
sk_psock_restore_proto(sk, psock);
write_lock_bh(&sk->sk_callback_lock);
if (psock->progs.skb_parser)
sk_psock_stop_strp(sk, psock);
write_unlock_bh(&sk->sk_callback_lock);
sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
call_rcu_sched(&psock->rcu, sk_psock_destroy);
}
EXPORT_SYMBOL_GPL(sk_psock_drop);
static int sk_psock_map_verd(int verdict, bool redir)
{
switch (verdict) {
case SK_PASS:
return redir ? __SK_REDIRECT : __SK_PASS;
case SK_DROP:
default:
break;
}
return __SK_DROP;
}
int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
struct sk_msg *msg)
{
struct bpf_prog *prog;
int ret;
preempt_disable();
rcu_read_lock();
prog = READ_ONCE(psock->progs.msg_parser);
if (unlikely(!prog)) {
ret = __SK_PASS;
goto out;
}
sk_msg_compute_data_pointers(msg);
msg->sk = sk;
ret = BPF_PROG_RUN(prog, msg);
ret = sk_psock_map_verd(ret, msg->sk_redir);
psock->apply_bytes = msg->apply_bytes;
if (ret == __SK_REDIRECT) {
if (psock->sk_redir)
sock_put(psock->sk_redir);
psock->sk_redir = msg->sk_redir;
if (!psock->sk_redir) {
ret = __SK_DROP;
goto out;
}
sock_hold(psock->sk_redir);
}
out:
rcu_read_unlock();
preempt_enable();
return ret;
}
EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
struct sk_buff *skb)
{
int ret;
skb->sk = psock->sk;
bpf_compute_data_end_sk_skb(skb);
preempt_disable();
ret = BPF_PROG_RUN(prog, skb);
preempt_enable();
/* strparser clones the skb before handing it to a upper layer,
* meaning skb_orphan has been called. We NULL sk on the way out
* to ensure we don't trigger a BUG_ON() in skb/sk operations
* later and because we are not charging the memory of this skb
* to any socket yet.
*/
skb->sk = NULL;
return ret;
}
static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
{
struct sk_psock_parser *parser;
parser = container_of(strp, struct sk_psock_parser, strp);
return container_of(parser, struct sk_psock, parser);
}
static void sk_psock_verdict_apply(struct sk_psock *psock,
struct sk_buff *skb, int verdict)
{
struct sk_psock *psock_other;
struct sock *sk_other;
bool ingress;
switch (verdict) {
case __SK_REDIRECT:
sk_other = tcp_skb_bpf_redirect_fetch(skb);
if (unlikely(!sk_other))
goto out_free;
psock_other = sk_psock(sk_other);
if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED))
goto out_free;
ingress = tcp_skb_bpf_ingress(skb);
if ((!ingress && sock_writeable(sk_other)) ||
(ingress &&
atomic_read(&sk_other->sk_rmem_alloc) <=
sk_other->sk_rcvbuf)) {
if (!ingress)
skb_set_owner_w(skb, sk_other);
skb_queue_tail(&psock_other->ingress_skb, skb);
schedule_work(&psock_other->work);
break;
}
/* fall-through */
case __SK_DROP:
/* fall-through */
default:
out_free:
kfree_skb(skb);
}
}
static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
{
struct sk_psock *psock = sk_psock_from_strp(strp);
struct bpf_prog *prog;
int ret = __SK_DROP;
rcu_read_lock();
prog = READ_ONCE(psock->progs.skb_verdict);
if (likely(prog)) {
skb_orphan(skb);
tcp_skb_bpf_redirect_clear(skb);
ret = sk_psock_bpf_run(psock, prog, skb);
ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
}
rcu_read_unlock();
sk_psock_verdict_apply(psock, skb, ret);
}
static int sk_psock_strp_read_done(struct strparser *strp, int err)
{
return err;
}
static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
{
struct sk_psock *psock = sk_psock_from_strp(strp);
struct bpf_prog *prog;
int ret = skb->len;
rcu_read_lock();
prog = READ_ONCE(psock->progs.skb_parser);
if (likely(prog))
ret = sk_psock_bpf_run(psock, prog, skb);
rcu_read_unlock();
return ret;
}
/* Called with socket lock held. */
static void sk_psock_data_ready(struct sock *sk)
{
struct sk_psock *psock;
rcu_read_lock();
psock = sk_psock(sk);
if (likely(psock)) {
write_lock_bh(&sk->sk_callback_lock);
strp_data_ready(&psock->parser.strp);
write_unlock_bh(&sk->sk_callback_lock);
}
rcu_read_unlock();
}
static void sk_psock_write_space(struct sock *sk)
{
struct sk_psock *psock;
void (*write_space)(struct sock *sk);
rcu_read_lock();
psock = sk_psock(sk);
if (likely(psock && sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)))
schedule_work(&psock->work);
write_space = psock->saved_write_space;
rcu_read_unlock();
write_space(sk);
}
int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
{
static const struct strp_callbacks cb = {
.rcv_msg = sk_psock_strp_read,
.read_sock_done = sk_psock_strp_read_done,
.parse_msg = sk_psock_strp_parse,
};
psock->parser.enabled = false;
return strp_init(&psock->parser.strp, sk, &cb);
}
void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
{
struct sk_psock_parser *parser = &psock->parser;
if (parser->enabled)
return;
parser->saved_data_ready = sk->sk_data_ready;
sk->sk_data_ready = sk_psock_data_ready;
sk->sk_write_space = sk_psock_write_space;
parser->enabled = true;
}
void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
{
struct sk_psock_parser *parser = &psock->parser;
if (!parser->enabled)
return;
sk->sk_data_ready = parser->saved_data_ready;
parser->saved_data_ready = NULL;
strp_stop(&parser->strp);
parser->enabled = false;
}

1002
net/core/sock_map.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -63,6 +63,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \

655
net/ipv4/tcp_bpf.c Normal file
View File

@ -0,0 +1,655 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
#include <linux/skmsg.h>
#include <linux/filter.h>
#include <linux/bpf.h>
#include <linux/init.h>
#include <linux/wait.h>
#include <net/inet_common.h>
static bool tcp_bpf_stream_read(const struct sock *sk)
{
struct sk_psock *psock;
bool empty = true;
rcu_read_lock();
psock = sk_psock(sk);
if (likely(psock))
empty = list_empty(&psock->ingress_msg);
rcu_read_unlock();
return !empty;
}
static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
int flags, long timeo, int *err)
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
int ret;
add_wait_queue(sk_sleep(sk), &wait);
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
ret = sk_wait_event(sk, &timeo,
!list_empty(&psock->ingress_msg) ||
!skb_queue_empty(&sk->sk_receive_queue), &wait);
sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
remove_wait_queue(sk_sleep(sk), &wait);
return ret;
}
int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
struct msghdr *msg, int len)
{
struct iov_iter *iter = &msg->msg_iter;
int i, ret, copied = 0;
while (copied != len) {
struct scatterlist *sge;
struct sk_msg *msg_rx;
msg_rx = list_first_entry_or_null(&psock->ingress_msg,
struct sk_msg, list);
if (unlikely(!msg_rx))
break;
i = msg_rx->sg.start;
do {
struct page *page;
int copy;
sge = sk_msg_elem(msg_rx, i);
copy = sge->length;
page = sg_page(sge);
if (copied + copy > len)
copy = len - copied;
ret = copy_page_to_iter(page, sge->offset, copy, iter);
if (ret != copy) {
msg_rx->sg.start = i;
return -EFAULT;
}
copied += copy;
sge->offset += copy;
sge->length -= copy;
sk_mem_uncharge(sk, copy);
if (!sge->length) {
i++;
if (i == MAX_SKB_FRAGS)
i = 0;
if (!msg_rx->skb)
put_page(page);
}
if (copied == len)
break;
} while (i != msg_rx->sg.end);
msg_rx->sg.start = i;
if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
list_del(&msg_rx->list);
if (msg_rx->skb)
consume_skb(msg_rx->skb);
kfree(msg_rx);
}
}
return copied;
}
EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg);
int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int nonblock, int flags, int *addr_len)
{
struct sk_psock *psock;
int copied, ret;
if (unlikely(flags & MSG_ERRQUEUE))
return inet_recv_error(sk, msg, len, addr_len);
if (!skb_queue_empty(&sk->sk_receive_queue))
return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
psock = sk_psock_get(sk);
if (unlikely(!psock))
return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
lock_sock(sk);
msg_bytes_ready:
copied = __tcp_bpf_recvmsg(sk, psock, msg, len);
if (!copied) {
int data, err = 0;
long timeo;
timeo = sock_rcvtimeo(sk, nonblock);
data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err);
if (data) {
if (skb_queue_empty(&sk->sk_receive_queue))
goto msg_bytes_ready;
release_sock(sk);
sk_psock_put(sk, psock);
return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
}
if (err) {
ret = err;
goto out;
}
}
ret = copied;
out:
release_sock(sk);
sk_psock_put(sk, psock);
return ret;
}
static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
struct sk_msg *msg, u32 apply_bytes, int flags)
{
bool apply = apply_bytes;
struct scatterlist *sge;
u32 size, copied = 0;
struct sk_msg *tmp;
int i, ret = 0;
tmp = kzalloc(sizeof(*tmp), __GFP_NOWARN | GFP_KERNEL);
if (unlikely(!tmp))
return -ENOMEM;
lock_sock(sk);
tmp->sg.start = msg->sg.start;
i = msg->sg.start;
do {
sge = sk_msg_elem(msg, i);
size = (apply && apply_bytes < sge->length) ?
apply_bytes : sge->length;
if (!sk_wmem_schedule(sk, size)) {
if (!copied)
ret = -ENOMEM;
break;
}
sk_mem_charge(sk, size);
sk_msg_xfer(tmp, msg, i, size);
copied += size;
if (sge->length)
get_page(sk_msg_page(tmp, i));
sk_msg_iter_var_next(i);
tmp->sg.end = i;
if (apply) {
apply_bytes -= size;
if (!apply_bytes)
break;
}
} while (i != msg->sg.end);
if (!ret) {
msg->sg.start = i;
msg->sg.size -= apply_bytes;
sk_psock_queue_msg(psock, tmp);
sk->sk_data_ready(sk);
} else {
sk_msg_free(sk, tmp);
kfree(tmp);
}
release_sock(sk);
return ret;
}
static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,
int flags, bool uncharge)
{
bool apply = apply_bytes;
struct scatterlist *sge;
struct page *page;
int size, ret = 0;
u32 off;
while (1) {
sge = sk_msg_elem(msg, msg->sg.start);
size = (apply && apply_bytes < sge->length) ?
apply_bytes : sge->length;
off = sge->offset;
page = sg_page(sge);
tcp_rate_check_app_limited(sk);
retry:
ret = do_tcp_sendpages(sk, page, off, size, flags);
if (ret <= 0)
return ret;
if (apply)
apply_bytes -= ret;
msg->sg.size -= ret;
sge->offset += ret;
sge->length -= ret;
if (uncharge)
sk_mem_uncharge(sk, ret);
if (ret != size) {
size -= ret;
off += ret;
goto retry;
}
if (!sge->length) {
put_page(page);
sk_msg_iter_next(msg, start);
sg_init_table(sge, 1);
if (msg->sg.start == msg->sg.end)
break;
}
if (apply && !apply_bytes)
break;
}
return 0;
}
static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg,
u32 apply_bytes, int flags, bool uncharge)
{
int ret;
lock_sock(sk);
ret = tcp_bpf_push(sk, msg, apply_bytes, flags, uncharge);
release_sock(sk);
return ret;
}
int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
u32 bytes, int flags)
{
bool ingress = sk_msg_to_ingress(msg);
struct sk_psock *psock = sk_psock_get(sk);
int ret;
if (unlikely(!psock)) {
sk_msg_free(sk, msg);
return 0;
}
ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes, flags) :
tcp_bpf_push_locked(sk, msg, bytes, flags, false);
sk_psock_put(sk, psock);
return ret;
}
EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
struct sk_msg *msg, int *copied, int flags)
{
bool cork = false, enospc = msg->sg.start == msg->sg.end;
struct sock *sk_redir;
u32 tosend;
int ret;
more_data:
if (psock->eval == __SK_NONE)
psock->eval = sk_psock_msg_verdict(sk, psock, msg);
if (msg->cork_bytes &&
msg->cork_bytes > msg->sg.size && !enospc) {
psock->cork_bytes = msg->cork_bytes - msg->sg.size;
if (!psock->cork) {
psock->cork = kzalloc(sizeof(*psock->cork),
GFP_ATOMIC | __GFP_NOWARN);
if (!psock->cork)
return -ENOMEM;
}
memcpy(psock->cork, msg, sizeof(*msg));
return 0;
}
tosend = msg->sg.size;
if (psock->apply_bytes && psock->apply_bytes < tosend)
tosend = psock->apply_bytes;
switch (psock->eval) {
case __SK_PASS:
ret = tcp_bpf_push(sk, msg, tosend, flags, true);
if (unlikely(ret)) {
*copied -= sk_msg_free(sk, msg);
break;
}
sk_msg_apply_bytes(psock, tosend);
break;
case __SK_REDIRECT:
sk_redir = psock->sk_redir;
sk_msg_apply_bytes(psock, tosend);
if (psock->cork) {
cork = true;
psock->cork = NULL;
}
sk_msg_return(sk, msg, tosend);
release_sock(sk);
ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags);
lock_sock(sk);
if (unlikely(ret < 0)) {
int free = sk_msg_free_nocharge(sk, msg);
if (!cork)
*copied -= free;
}
if (cork) {
sk_msg_free(sk, msg);
kfree(msg);
msg = NULL;
ret = 0;
}
break;
case __SK_DROP:
default:
sk_msg_free_partial(sk, msg, tosend);
sk_msg_apply_bytes(psock, tosend);
*copied -= tosend;
return -EACCES;
}
if (likely(!ret)) {
if (!psock->apply_bytes) {
psock->eval = __SK_NONE;
if (psock->sk_redir) {
sock_put(psock->sk_redir);
psock->sk_redir = NULL;
}
}
if (msg &&
msg->sg.data[msg->sg.start].page_link &&
msg->sg.data[msg->sg.start].length)
goto more_data;
}
return ret;
}
static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
struct sk_msg tmp, *msg_tx = NULL;
int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS;
int copied = 0, err = 0;
struct sk_psock *psock;
long timeo;
psock = sk_psock_get(sk);
if (unlikely(!psock))
return tcp_sendmsg(sk, msg, size);
lock_sock(sk);
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
while (msg_data_left(msg)) {
bool enospc = false;
u32 copy, osize;
if (sk->sk_err) {
err = -sk->sk_err;
goto out_err;
}
copy = msg_data_left(msg);
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
if (psock->cork) {
msg_tx = psock->cork;
} else {
msg_tx = &tmp;
sk_msg_init(msg_tx);
}
osize = msg_tx->sg.size;
err = sk_msg_alloc(sk, msg_tx, msg_tx->sg.size + copy, msg_tx->sg.end - 1);
if (err) {
if (err != -ENOSPC)
goto wait_for_memory;
enospc = true;
copy = msg_tx->sg.size - osize;
}
err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx,
copy);
if (err < 0) {
sk_msg_trim(sk, msg_tx, osize);
goto out_err;
}
copied += copy;
if (psock->cork_bytes) {
if (size > psock->cork_bytes)
psock->cork_bytes = 0;
else
psock->cork_bytes -= size;
if (psock->cork_bytes && !enospc)
goto out_err;
/* All cork bytes are accounted, rerun the prog. */
psock->eval = __SK_NONE;
psock->cork_bytes = 0;
}
err = tcp_bpf_send_verdict(sk, psock, msg_tx, &copied, flags);
if (unlikely(err < 0))
goto out_err;
continue;
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
err = sk_stream_wait_memory(sk, &timeo);
if (err) {
if (msg_tx && msg_tx != psock->cork)
sk_msg_free(sk, msg_tx);
goto out_err;
}
}
out_err:
if (err < 0)
err = sk_stream_error(sk, msg->msg_flags, err);
release_sock(sk);
sk_psock_put(sk, psock);
return copied ? copied : err;
}
static int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset,
size_t size, int flags)
{
struct sk_msg tmp, *msg = NULL;
int err = 0, copied = 0;
struct sk_psock *psock;
bool enospc = false;
psock = sk_psock_get(sk);
if (unlikely(!psock))
return tcp_sendpage(sk, page, offset, size, flags);
lock_sock(sk);
if (psock->cork) {
msg = psock->cork;
} else {
msg = &tmp;
sk_msg_init(msg);
}
/* Catch case where ring is full and sendpage is stalled. */
if (unlikely(sk_msg_full(msg)))
goto out_err;
sk_msg_page_add(msg, page, size, offset);
sk_mem_charge(sk, size);
copied = size;
if (sk_msg_full(msg))
enospc = true;
if (psock->cork_bytes) {
if (size > psock->cork_bytes)
psock->cork_bytes = 0;
else
psock->cork_bytes -= size;
if (psock->cork_bytes && !enospc)
goto out_err;
/* All cork bytes are accounted, rerun the prog. */
psock->eval = __SK_NONE;
psock->cork_bytes = 0;
}
err = tcp_bpf_send_verdict(sk, psock, msg, &copied, flags);
out_err:
release_sock(sk);
sk_psock_put(sk, psock);
return copied ? copied : err;
}
static void tcp_bpf_remove(struct sock *sk, struct sk_psock *psock)
{
struct sk_psock_link *link;
sk_psock_cork_free(psock);
__sk_psock_purge_ingress_msg(psock);
while ((link = sk_psock_link_pop(psock))) {
sk_psock_unlink(sk, link);
sk_psock_free_link(link);
}
}
static void tcp_bpf_unhash(struct sock *sk)
{
void (*saved_unhash)(struct sock *sk);
struct sk_psock *psock;
rcu_read_lock();
psock = sk_psock(sk);
if (unlikely(!psock)) {
rcu_read_unlock();
if (sk->sk_prot->unhash)
sk->sk_prot->unhash(sk);
return;
}
saved_unhash = psock->saved_unhash;
tcp_bpf_remove(sk, psock);
rcu_read_unlock();
saved_unhash(sk);
}
static void tcp_bpf_close(struct sock *sk, long timeout)
{
void (*saved_close)(struct sock *sk, long timeout);
struct sk_psock *psock;
lock_sock(sk);
rcu_read_lock();
psock = sk_psock(sk);
if (unlikely(!psock)) {
rcu_read_unlock();
release_sock(sk);
return sk->sk_prot->close(sk, timeout);
}
saved_close = psock->saved_close;
tcp_bpf_remove(sk, psock);
rcu_read_unlock();
release_sock(sk);
saved_close(sk, timeout);
}
enum {
TCP_BPF_IPV4,
TCP_BPF_IPV6,
TCP_BPF_NUM_PROTS,
};
enum {
TCP_BPF_BASE,
TCP_BPF_TX,
TCP_BPF_NUM_CFGS,
};
static struct proto *tcpv6_prot_saved __read_mostly;
static DEFINE_SPINLOCK(tcpv6_prot_lock);
static struct proto tcp_bpf_prots[TCP_BPF_NUM_PROTS][TCP_BPF_NUM_CFGS];
static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
struct proto *base)
{
prot[TCP_BPF_BASE] = *base;
prot[TCP_BPF_BASE].unhash = tcp_bpf_unhash;
prot[TCP_BPF_BASE].close = tcp_bpf_close;
prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg;
prot[TCP_BPF_BASE].stream_memory_read = tcp_bpf_stream_read;
prot[TCP_BPF_TX] = prot[TCP_BPF_BASE];
prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg;
prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage;
}
static void tcp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops)
{
if (sk->sk_family == AF_INET6 &&
unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) {
spin_lock_bh(&tcpv6_prot_lock);
if (likely(ops != tcpv6_prot_saved)) {
tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops);
smp_store_release(&tcpv6_prot_saved, ops);
}
spin_unlock_bh(&tcpv6_prot_lock);
}
}
static int __init tcp_bpf_v4_build_proto(void)
{
tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV4], &tcp_prot);
return 0;
}
core_initcall(tcp_bpf_v4_build_proto);
static void tcp_bpf_update_sk_prot(struct sock *sk, struct sk_psock *psock)
{
int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
sk_psock_update_proto(sk, psock, &tcp_bpf_prots[family][config]);
}
static void tcp_bpf_reinit_sk_prot(struct sock *sk, struct sk_psock *psock)
{
int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
/* Reinit occurs when program types change e.g. TCP_BPF_TX is removed
* or added requiring sk_prot hook updates. We keep original saved
* hooks in this case.
*/
sk->sk_prot = &tcp_bpf_prots[family][config];
}
static int tcp_bpf_assert_proto_ops(struct proto *ops)
{
/* In order to avoid retpoline, we make assumptions when we call
* into ops if e.g. a psock is not present. Make sure they are
* indeed valid assumptions.
*/
return ops->recvmsg == tcp_recvmsg &&
ops->sendmsg == tcp_sendmsg &&
ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP;
}
void tcp_bpf_reinit(struct sock *sk)
{
struct sk_psock *psock;
sock_owned_by_me(sk);
rcu_read_lock();
psock = sk_psock(sk);
tcp_bpf_reinit_sk_prot(sk, psock);
rcu_read_unlock();
}
int tcp_bpf_init(struct sock *sk)
{
struct proto *ops = READ_ONCE(sk->sk_prot);
struct sk_psock *psock;
sock_owned_by_me(sk);
rcu_read_lock();
psock = sk_psock(sk);
if (unlikely(!psock || psock->sk_proto ||
tcp_bpf_assert_proto_ops(ops))) {
rcu_read_unlock();
return -EINVAL;
}
tcp_bpf_check_v6_needs_rebuild(sk, ops);
tcp_bpf_update_sk_prot(sk, psock);
rcu_read_unlock();
return 0;
}

View File

@ -1,4 +1,2 @@
config STREAM_PARSER config STREAM_PARSER
tristate def_bool n
default n