net: add low latency socket poll
Adds an ndo_ll_poll method and the code that supports it. This method can be used by low latency applications to busy-poll Ethernet device queues directly from the socket code. sysctl_net_ll_poll controls how many microseconds to poll. Default is zero (disabled). Individual protocol support will be added by subsequent patches. Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com> Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com> Signed-off-by: Eliezer Tamir <eliezer.tamir@linux.intel.com> Acked-by: Eric Dumazet <edumazet@google.com> Tested-by: Willem de Bruijn <willemb@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
af12fa6e46
commit
0602129286
|
@ -50,6 +50,13 @@ The maximum number of packets that kernel can handle on a NAPI interrupt,
|
|||
it's a Per-CPU variable.
|
||||
Default: 64
|
||||
|
||||
low_latency_poll
|
||||
----------------
|
||||
Low latency busy poll timeout. (needs CONFIG_NET_LL_RX_POLL)
|
||||
Approximate time in us to spin waiting for packets on the device queue.
|
||||
Recommended value is 50. May increase power usage.
|
||||
Default: 0 (off)
|
||||
|
||||
rmem_default
|
||||
------------
|
||||
|
||||
|
|
|
@ -971,6 +971,9 @@ struct net_device_ops {
|
|||
struct netpoll_info *info,
|
||||
gfp_t gfp);
|
||||
void (*ndo_netpoll_cleanup)(struct net_device *dev);
|
||||
#endif
|
||||
#ifdef CONFIG_NET_LL_RX_POLL
|
||||
int (*ndo_ll_poll)(struct napi_struct *dev);
|
||||
#endif
|
||||
int (*ndo_set_vf_mac)(struct net_device *dev,
|
||||
int queue, u8 *mac);
|
||||
|
|
|
@ -386,6 +386,7 @@ typedef unsigned char *sk_buff_data_t;
|
|||
* @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS
|
||||
* @dma_cookie: a cookie to one of several possible DMA operations
|
||||
* done by skb DMA functions
|
||||
* @napi_id: id of the NAPI struct this skb came from
|
||||
* @secmark: security marking
|
||||
* @mark: Generic packet mark
|
||||
* @dropcount: total number of sk_receive_queue overflows
|
||||
|
@ -500,8 +501,11 @@ struct sk_buff {
|
|||
/* 7/9 bit hole (depending on ndisc_nodetype presence) */
|
||||
kmemcheck_bitfield_end(flags2);
|
||||
|
||||
#ifdef CONFIG_NET_DMA
|
||||
#if defined CONFIG_NET_DMA || defined CONFIG_NET_LL_RX_POLL
|
||||
union {
|
||||
unsigned int napi_id;
|
||||
dma_cookie_t dma_cookie;
|
||||
};
|
||||
#endif
|
||||
#ifdef CONFIG_NETWORK_SECMARK
|
||||
__u32 secmark;
|
||||
|
|
|
@ -0,0 +1,148 @@
|
|||
/*
|
||||
* Low Latency Sockets
|
||||
* Copyright(c) 2013 Intel Corporation.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with
|
||||
* this program; if not, write to the Free Software Foundation, Inc.,
|
||||
* 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Author: Eliezer Tamir
|
||||
*
|
||||
* Contact Information:
|
||||
* e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
|
||||
*/
|
||||
|
||||
/*
|
||||
* For now this depends on CONFIG_X86_TSC
|
||||
*/
|
||||
|
||||
#ifndef _LINUX_NET_LL_POLL_H
|
||||
#define _LINUX_NET_LL_POLL_H
|
||||
|
||||
#include <linux/netdevice.h>
|
||||
#include <net/ip.h>
|
||||
|
||||
#ifdef CONFIG_NET_LL_RX_POLL
|
||||
|
||||
struct napi_struct;
|
||||
extern unsigned long sysctl_net_ll_poll __read_mostly;
|
||||
|
||||
/* return values from ndo_ll_poll */
|
||||
#define LL_FLUSH_FAILED -1
|
||||
#define LL_FLUSH_BUSY -2
|
||||
|
||||
/* we don't mind a ~2.5% imprecision */
|
||||
#define TSC_MHZ (tsc_khz >> 10)
|
||||
|
||||
static inline cycles_t ll_end_time(void)
|
||||
{
|
||||
return TSC_MHZ * ACCESS_ONCE(sysctl_net_ll_poll) + get_cycles();
|
||||
}
|
||||
|
||||
static inline bool sk_valid_ll(struct sock *sk)
|
||||
{
|
||||
return sysctl_net_ll_poll && sk->sk_napi_id &&
|
||||
!need_resched() && !signal_pending(current);
|
||||
}
|
||||
|
||||
/*
 * Check whether the busy-poll deadline has been reached.
 *
 * @end_time: deadline in cycles, as produced by ll_end_time().
 *
 * Both the current cycle count and @end_time are narrowed to
 * unsigned long and compared with time_after().
 *
 * Returns true while the current cycle count is not after @end_time.
 */
static inline bool can_poll_ll(cycles_t end_time)
{
	return !time_after((unsigned long)get_cycles(),
			   (unsigned long)end_time);
}
|
||||
|
||||
static inline bool sk_poll_ll(struct sock *sk, int nonblock)
|
||||
{
|
||||
cycles_t end_time = ll_end_time();
|
||||
const struct net_device_ops *ops;
|
||||
struct napi_struct *napi;
|
||||
int rc = false;
|
||||
|
||||
/*
|
||||
* rcu read lock for napi hash
|
||||
* bh so we don't race with net_rx_action
|
||||
*/
|
||||
rcu_read_lock_bh();
|
||||
|
||||
napi = napi_by_id(sk->sk_napi_id);
|
||||
if (!napi)
|
||||
goto out;
|
||||
|
||||
ops = napi->dev->netdev_ops;
|
||||
if (!ops->ndo_ll_poll)
|
||||
goto out;
|
||||
|
||||
do {
|
||||
|
||||
rc = ops->ndo_ll_poll(napi);
|
||||
|
||||
if (rc == LL_FLUSH_FAILED)
|
||||
break; /* permanent failure */
|
||||
|
||||
if (rc > 0)
|
||||
/* local bh are disabled so it is ok to use _BH */
|
||||
NET_ADD_STATS_BH(sock_net(sk),
|
||||
LINUX_MIB_LOWLATENCYRXPACKETS, rc);
|
||||
|
||||
} while (skb_queue_empty(&sk->sk_receive_queue)
|
||||
&& can_poll_ll(end_time) && !nonblock);
|
||||
|
||||
rc = !skb_queue_empty(&sk->sk_receive_queue);
|
||||
out:
|
||||
rcu_read_unlock_bh();
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* used in the NIC receive handler to mark the skb */
|
||||
static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi)
|
||||
{
|
||||
skb->napi_id = napi->napi_id;
|
||||
}
|
||||
|
||||
/* used in the protocol handler to propagate the napi_id to the socket */
|
||||
static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
|
||||
{
|
||||
sk->sk_napi_id = skb->napi_id;
|
||||
}
|
||||
|
||||
#else /* CONFIG_NET_LL_RX_POLL */
|
||||
|
||||
/* Stub for builds without CONFIG_NET_LL_RX_POLL: there is no deadline. */
static inline cycles_t ll_end_time(void)
{
	return 0;
}
|
||||
|
||||
/* Stub for builds without CONFIG_NET_LL_RX_POLL: never eligible to poll. */
static inline bool sk_valid_ll(struct sock *sk)
{
	return false;
}
|
||||
|
||||
/*
 * Stub for builds without CONFIG_NET_LL_RX_POLL: performs no polling and
 * always reports that no data was found.
 */
static inline bool sk_poll_ll(struct sock *sk, int nonblock)
{
	return false;
}
|
||||
|
||||
/* Stub for builds without CONFIG_NET_LL_RX_POLL: nothing to record. */
static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi)
{
}
|
||||
|
||||
/* Stub for builds without CONFIG_NET_LL_RX_POLL: nothing to propagate. */
static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
{
}
|
||||
|
||||
/* Stub for builds without CONFIG_NET_LL_RX_POLL: polling is never allowed. */
static inline bool can_poll_ll(cycles_t end_time)
{
	return false;
}
|
||||
|
||||
#endif /* CONFIG_NET_LL_RX_POLL */
|
||||
#endif /* _LINUX_NET_LL_POLL_H */
|
|
@ -229,6 +229,7 @@ struct cg_proto;
|
|||
* @sk_omem_alloc: "o" is "option" or "other"
|
||||
* @sk_wmem_queued: persistent queue size
|
||||
* @sk_forward_alloc: space allocated forward
|
||||
* @sk_napi_id: id of the last napi context to receive data for sk
|
||||
* @sk_allocation: allocation mode
|
||||
* @sk_sndbuf: size of send buffer in bytes
|
||||
* @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
|
||||
|
@ -324,6 +325,9 @@ struct sock {
|
|||
int sk_forward_alloc;
|
||||
#ifdef CONFIG_RPS
|
||||
__u32 sk_rxhash;
|
||||
#endif
|
||||
#ifdef CONFIG_NET_LL_RX_POLL
|
||||
unsigned int sk_napi_id;
|
||||
#endif
|
||||
atomic_t sk_drops;
|
||||
int sk_rcvbuf;
|
||||
|
|
|
@ -253,6 +253,7 @@ enum
|
|||
LINUX_MIB_TCPFASTOPENLISTENOVERFLOW, /* TCPFastOpenListenOverflow */
|
||||
LINUX_MIB_TCPFASTOPENCOOKIEREQD, /* TCPFastOpenCookieReqd */
|
||||
LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES, /* TCPSpuriousRtxHostQueues */
|
||||
LINUX_MIB_LOWLATENCYRXPACKETS, /* LowLatencyRxPackets */
|
||||
__LINUX_MIB_MAX
|
||||
};
|
||||
|
||||
|
|
12
net/Kconfig
12
net/Kconfig
|
@ -243,6 +243,18 @@ config NETPRIO_CGROUP
|
|||
Cgroup subsystem for use in assigning processes to network priorities on
|
||||
a per-interface basis
|
||||
|
||||
config NET_LL_RX_POLL
|
||||
bool "Low Latency Receive Poll"
|
||||
depends on X86_TSC
|
||||
default n
|
||||
---help---
|
||||
Support Low Latency Receive Queue Poll.
|
||||
(For network card drivers which support this option.)
|
||||
When waiting for data in read or poll call directly into the device driver
|
||||
to flush packets which may be pending on the device queues into the stack.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config BQL
|
||||
boolean
|
||||
depends on SYSFS
|
||||
|
|
|
@ -733,6 +733,10 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
|
|||
new->vlan_tci = old->vlan_tci;
|
||||
|
||||
skb_copy_secmark(new, old);
|
||||
|
||||
#ifdef CONFIG_NET_LL_RX_POLL
|
||||
new->napi_id = old->napi_id;
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -139,6 +139,8 @@
|
|||
#include <net/tcp.h>
|
||||
#endif
|
||||
|
||||
#include <net/ll_poll.h>
|
||||
|
||||
static DEFINE_MUTEX(proto_list_mutex);
|
||||
static LIST_HEAD(proto_list);
|
||||
|
||||
|
@ -2284,6 +2286,10 @@ void sock_init_data(struct socket *sock, struct sock *sk)
|
|||
|
||||
sk->sk_stamp = ktime_set(-1L, 0);
|
||||
|
||||
#ifdef CONFIG_NET_LL_RX_POLL
|
||||
sk->sk_napi_id = 0;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Before updating sk_refcnt, we must commit prior changes to memory
|
||||
* (Documentation/RCU/rculist_nulls.txt for details)
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#include <net/ip.h>
|
||||
#include <net/sock.h>
|
||||
#include <net/net_ratelimit.h>
|
||||
#include <net/ll_poll.h>
|
||||
|
||||
static int one = 1;
|
||||
|
||||
|
@ -284,6 +285,15 @@ static struct ctl_table net_core_table[] = {
|
|||
.proc_handler = flow_limit_table_len_sysctl
|
||||
},
|
||||
#endif /* CONFIG_NET_FLOW_LIMIT */
|
||||
#ifdef CONFIG_NET_LL_RX_POLL
|
||||
{
|
||||
.procname = "low_latency_poll",
|
||||
.data = &sysctl_net_ll_poll,
|
||||
.maxlen = sizeof(unsigned long),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_doulongvec_minmax
|
||||
},
|
||||
#endif
|
||||
#endif /* CONFIG_NET */
|
||||
{
|
||||
.procname = "netdev_budget",
|
||||
|
|
|
@ -273,6 +273,7 @@ static const struct snmp_mib snmp4_net_list[] = {
|
|||
SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
|
||||
SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
|
||||
SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
|
||||
SNMP_MIB_ITEM("LowLatencyRxPackets", LINUX_MIB_LOWLATENCYRXPACKETS),
|
||||
SNMP_MIB_SENTINEL
|
||||
};
|
||||
|
||||
|
|
|
@ -104,6 +104,12 @@
|
|||
#include <linux/route.h>
|
||||
#include <linux/sockios.h>
|
||||
#include <linux/atalk.h>
|
||||
#include <net/ll_poll.h>
|
||||
|
||||
#ifdef CONFIG_NET_LL_RX_POLL
|
||||
unsigned long sysctl_net_ll_poll __read_mostly;
|
||||
EXPORT_SYMBOL_GPL(sysctl_net_ll_poll);
|
||||
#endif
|
||||
|
||||
static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
|
||||
static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
|
||||
|
|
Loading…
Reference in New Issue