tun: add eBPF based queue selection method
This patch introduces an eBPF-based queue selection method. With this, the queue selection policy can be offloaded to userspace completely through a new ioctl, TUNSETSTEERINGEBPF. Signed-off-by: Jason Wang <jasowang@redhat.com> Acked-by: Willem de Bruijn <willemb@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
f520957dc2
commit
96f8406162
|
@ -195,6 +195,11 @@ struct tun_flow_entry {
|
||||||
|
|
||||||
#define TUN_NUM_FLOW_ENTRIES 1024
|
#define TUN_NUM_FLOW_ENTRIES 1024
|
||||||
|
|
||||||
|
/* Wrapper around the eBPF steering program attached to a tun device; the
 * wrapper is what tun_struct::steering_prog points to and what is released
 * through call_rcu() when the program is replaced or removed.
 */
struct tun_steering_prog {
|
||||||
|
struct rcu_head rcu; /* handed to call_rcu() to defer freeing of this wrapper */
|
||||||
|
struct bpf_prog *prog; /* program run on each skb to pick a queue */
|
||||||
|
};
|
||||||
|
|
||||||
/* Since the socket were moved to tun_file, to preserve the behavior of persist
|
/* Since the socket were moved to tun_file, to preserve the behavior of persist
|
||||||
* device, socket filter, sndbuf and vnet header size were restore when the
|
* device, socket filter, sndbuf and vnet header size were restore when the
|
||||||
* file were attached to a persist device.
|
* file were attached to a persist device.
|
||||||
|
@ -232,6 +237,7 @@ struct tun_struct {
|
||||||
u32 rx_batched;
|
u32 rx_batched;
|
||||||
struct tun_pcpu_stats __percpu *pcpu_stats;
|
struct tun_pcpu_stats __percpu *pcpu_stats;
|
||||||
struct bpf_prog __rcu *xdp_prog;
|
struct bpf_prog __rcu *xdp_prog;
|
||||||
|
struct tun_steering_prog __rcu *steering_prog;
|
||||||
};
|
};
|
||||||
|
|
||||||
static int tun_napi_receive(struct napi_struct *napi, int budget)
|
static int tun_napi_receive(struct napi_struct *napi, int budget)
|
||||||
|
@ -537,15 +543,12 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
|
||||||
* different rxq no. here. If we could not get rxhash, then we would
|
* different rxq no. here. If we could not get rxhash, then we would
|
||||||
* hope the rxq no. may help here.
|
* hope the rxq no. may help here.
|
||||||
*/
|
*/
|
||||||
static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
|
static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
|
||||||
void *accel_priv, select_queue_fallback_t fallback)
|
|
||||||
{
|
{
|
||||||
struct tun_struct *tun = netdev_priv(dev);
|
|
||||||
struct tun_flow_entry *e;
|
struct tun_flow_entry *e;
|
||||||
u32 txq = 0;
|
u32 txq = 0;
|
||||||
u32 numqueues = 0;
|
u32 numqueues = 0;
|
||||||
|
|
||||||
rcu_read_lock();
|
|
||||||
numqueues = READ_ONCE(tun->numqueues);
|
numqueues = READ_ONCE(tun->numqueues);
|
||||||
|
|
||||||
txq = __skb_get_hash_symmetric(skb);
|
txq = __skb_get_hash_symmetric(skb);
|
||||||
|
@ -563,10 +566,37 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
|
||||||
txq -= numqueues;
|
txq -= numqueues;
|
||||||
}
|
}
|
||||||
|
|
||||||
rcu_read_unlock();
|
|
||||||
return txq;
|
return txq;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
|
||||||
|
{
|
||||||
|
struct tun_steering_prog *prog;
|
||||||
|
u16 ret = 0;
|
||||||
|
|
||||||
|
prog = rcu_dereference(tun->steering_prog);
|
||||||
|
if (prog)
|
||||||
|
ret = bpf_prog_run_clear_cb(prog->prog, skb);
|
||||||
|
|
||||||
|
return ret % tun->numqueues;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * ndo_select_queue handler: choose the transmit queue for @skb.
 *
 * When an eBPF steering program is attached it decides the queue;
 * otherwise the automatic flow-based selection is used.  @accel_priv and
 * @fallback are unused.
 */
static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
			    void *accel_priv, select_queue_fallback_t fallback)
{
	struct tun_struct *tun = netdev_priv(dev);
	u16 queue;

	/* Both selectors run under the same RCU read-side section. */
	rcu_read_lock();
	queue = rcu_dereference(tun->steering_prog) ?
		tun_ebpf_select_queue(tun, skb) :
		tun_automq_select_queue(tun, skb);
	rcu_read_unlock();

	return queue;
}
|
||||||
|
|
||||||
static inline bool tun_not_capable(struct tun_struct *tun)
|
static inline bool tun_not_capable(struct tun_struct *tun)
|
||||||
{
|
{
|
||||||
const struct cred *cred = current_cred();
|
const struct cred *cred = current_cred();
|
||||||
|
@ -933,23 +963,10 @@ static int tun_net_close(struct net_device *dev)
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Net device start xmit */
|
/* Net device start xmit */
|
||||||
static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
|
static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
|
||||||
{
|
{
|
||||||
struct tun_struct *tun = netdev_priv(dev);
|
|
||||||
int txq = skb->queue_mapping;
|
|
||||||
struct tun_file *tfile;
|
|
||||||
u32 numqueues = 0;
|
|
||||||
|
|
||||||
rcu_read_lock();
|
|
||||||
tfile = rcu_dereference(tun->tfiles[txq]);
|
|
||||||
numqueues = READ_ONCE(tun->numqueues);
|
|
||||||
|
|
||||||
/* Drop packet if interface is not attached */
|
|
||||||
if (txq >= numqueues)
|
|
||||||
goto drop;
|
|
||||||
|
|
||||||
#ifdef CONFIG_RPS
|
#ifdef CONFIG_RPS
|
||||||
if (numqueues == 1 && static_key_false(&rps_needed)) {
|
if (tun->numqueues == 1 && static_key_false(&rps_needed)) {
|
||||||
/* Select queue was not called for the skbuff, so we extract the
|
/* Select queue was not called for the skbuff, so we extract the
|
||||||
* RPS hash and save it into the flow_table here.
|
* RPS hash and save it into the flow_table here.
|
||||||
*/
|
*/
|
||||||
|
@ -965,6 +982,26 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Net device start xmit */
|
||||||
|
static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
|
||||||
|
{
|
||||||
|
struct tun_struct *tun = netdev_priv(dev);
|
||||||
|
int txq = skb->queue_mapping;
|
||||||
|
struct tun_file *tfile;
|
||||||
|
u32 numqueues = 0;
|
||||||
|
|
||||||
|
rcu_read_lock();
|
||||||
|
tfile = rcu_dereference(tun->tfiles[txq]);
|
||||||
|
numqueues = READ_ONCE(tun->numqueues);
|
||||||
|
|
||||||
|
/* Drop packet if interface is not attached */
|
||||||
|
if (txq >= numqueues)
|
||||||
|
goto drop;
|
||||||
|
|
||||||
|
if (!rcu_dereference(tun->steering_prog))
|
||||||
|
tun_automq_xmit(tun, skb);
|
||||||
|
|
||||||
tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
|
tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
|
||||||
|
|
||||||
|
@ -1547,7 +1584,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
|
||||||
int copylen;
|
int copylen;
|
||||||
bool zerocopy = false;
|
bool zerocopy = false;
|
||||||
int err;
|
int err;
|
||||||
u32 rxhash;
|
u32 rxhash = 0;
|
||||||
int skb_xdp = 1;
|
int skb_xdp = 1;
|
||||||
bool frags = tun_napi_frags_enabled(tun);
|
bool frags = tun_napi_frags_enabled(tun);
|
||||||
|
|
||||||
|
@ -1735,7 +1772,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
|
||||||
rcu_read_unlock();
|
rcu_read_unlock();
|
||||||
}
|
}
|
||||||
|
|
||||||
rxhash = __skb_get_hash_symmetric(skb);
|
rcu_read_lock();
|
||||||
|
if (!rcu_dereference(tun->steering_prog))
|
||||||
|
rxhash = __skb_get_hash_symmetric(skb);
|
||||||
|
rcu_read_unlock();
|
||||||
|
|
||||||
if (frags) {
|
if (frags) {
|
||||||
/* Exercise flow dissector code path. */
|
/* Exercise flow dissector code path. */
|
||||||
|
@ -1779,7 +1819,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
|
||||||
u64_stats_update_end(&stats->syncp);
|
u64_stats_update_end(&stats->syncp);
|
||||||
put_cpu_ptr(stats);
|
put_cpu_ptr(stats);
|
||||||
|
|
||||||
tun_flow_update(tun, rxhash, tfile);
|
if (rxhash)
|
||||||
|
tun_flow_update(tun, rxhash, tfile);
|
||||||
|
|
||||||
return total_len;
|
return total_len;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1987,6 +2029,36 @@ static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void tun_steering_prog_free(struct rcu_head *rcu)
|
||||||
|
{
|
||||||
|
struct tun_steering_prog *prog = container_of(rcu,
|
||||||
|
struct tun_steering_prog, rcu);
|
||||||
|
|
||||||
|
bpf_prog_destroy(prog->prog);
|
||||||
|
kfree(prog);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int __tun_set_steering_ebpf(struct tun_struct *tun,
|
||||||
|
struct bpf_prog *prog)
|
||||||
|
{
|
||||||
|
struct tun_steering_prog *old, *new = NULL;
|
||||||
|
|
||||||
|
if (prog) {
|
||||||
|
new = kmalloc(sizeof(*new), GFP_KERNEL);
|
||||||
|
if (!new)
|
||||||
|
return -ENOMEM;
|
||||||
|
new->prog = prog;
|
||||||
|
}
|
||||||
|
|
||||||
|
old = rtnl_dereference(tun->steering_prog);
|
||||||
|
rcu_assign_pointer(tun->steering_prog, new);
|
||||||
|
|
||||||
|
if (old)
|
||||||
|
call_rcu(&old->rcu, tun_steering_prog_free);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
static void tun_free_netdev(struct net_device *dev)
|
static void tun_free_netdev(struct net_device *dev)
|
||||||
{
|
{
|
||||||
struct tun_struct *tun = netdev_priv(dev);
|
struct tun_struct *tun = netdev_priv(dev);
|
||||||
|
@ -1995,6 +2067,9 @@ static void tun_free_netdev(struct net_device *dev)
|
||||||
free_percpu(tun->pcpu_stats);
|
free_percpu(tun->pcpu_stats);
|
||||||
tun_flow_uninit(tun);
|
tun_flow_uninit(tun);
|
||||||
security_tun_dev_free_security(tun->security);
|
security_tun_dev_free_security(tun->security);
|
||||||
|
rtnl_lock();
|
||||||
|
__tun_set_steering_ebpf(tun, NULL);
|
||||||
|
rtnl_unlock();
|
||||||
}
|
}
|
||||||
|
|
||||||
static void tun_setup(struct net_device *dev)
|
static void tun_setup(struct net_device *dev)
|
||||||
|
@ -2283,6 +2358,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
|
||||||
tun->filter_attached = false;
|
tun->filter_attached = false;
|
||||||
tun->sndbuf = tfile->socket.sk->sk_sndbuf;
|
tun->sndbuf = tfile->socket.sk->sk_sndbuf;
|
||||||
tun->rx_batched = 0;
|
tun->rx_batched = 0;
|
||||||
|
RCU_INIT_POINTER(tun->steering_prog, NULL);
|
||||||
|
|
||||||
tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats);
|
tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats);
|
||||||
if (!tun->pcpu_stats) {
|
if (!tun->pcpu_stats) {
|
||||||
|
@ -2475,6 +2551,25 @@ unlock:
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * TUNSETSTEERINGEBPF handler: attach or detach the eBPF steering program.
 *
 * @data points to a user-space int holding a BPF program fd.  An fd of -1
 * detaches the current program; any other value must refer to a
 * BPF_PROG_TYPE_SOCKET_FILTER program.
 *
 * Returns 0 on success, -EFAULT if the fd cannot be copied from user
 * space, the error from bpf_prog_get_type() if the fd is not a suitable
 * program, or the error from __tun_set_steering_ebpf().
 */
static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
{
	struct bpf_prog *prog;
	int fd;
	int ret;

	if (copy_from_user(&fd, data, sizeof(fd)))
		return -EFAULT;

	if (fd == -1) {
		prog = NULL;
	} else {
		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
		if (IS_ERR(prog))
			return PTR_ERR(prog);
	}

	ret = __tun_set_steering_ebpf(tun, prog);
	/* On failure the program was not installed, so drop the reference
	 * taken by bpf_prog_get_type() instead of leaking it.
	 */
	if (ret && prog)
		bpf_prog_destroy(prog);

	return ret;
}
|
||||||
|
|
||||||
static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
|
static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
|
||||||
unsigned long arg, int ifreq_len)
|
unsigned long arg, int ifreq_len)
|
||||||
{
|
{
|
||||||
|
@ -2751,6 +2846,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
|
||||||
ret = 0;
|
ret = 0;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case TUNSETSTEERINGEBPF:
|
||||||
|
ret = tun_set_steering_ebpf(tun, argp);
|
||||||
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
ret = -EINVAL;
|
ret = -EINVAL;
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -57,6 +57,7 @@
|
||||||
*/
|
*/
|
||||||
#define TUNSETVNETBE _IOW('T', 222, int)
|
#define TUNSETVNETBE _IOW('T', 222, int)
|
||||||
#define TUNGETVNETBE _IOR('T', 223, int)
|
#define TUNGETVNETBE _IOR('T', 223, int)
|
||||||
|
#define TUNSETSTEERINGEBPF _IOR('T', 224, int)
|
||||||
|
|
||||||
/* TUNSETIFF ifr flags */
|
/* TUNSETIFF ifr flags */
|
||||||
#define IFF_TUN 0x0001
|
#define IFF_TUN 0x0001
|
||||||
|
|
Loading…
Reference in New Issue