diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 570ec72266f3..b24c2e21db8e 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -264,11 +264,17 @@ static int efx_check_disabled(struct efx_nic *efx) static int efx_process_channel(struct efx_channel *channel, int budget) { struct efx_tx_queue *tx_queue; + struct list_head rx_list; int spent; if (unlikely(!channel->enabled)) return 0; + /* Prepare the batch receive list */ + EFX_WARN_ON_PARANOID(channel->rx_list != NULL); + INIT_LIST_HEAD(&rx_list); + channel->rx_list = &rx_list; + efx_for_each_channel_tx_queue(tx_queue, channel) { tx_queue->pkts_compl = 0; tx_queue->bytes_compl = 0; @@ -291,6 +297,10 @@ static int efx_process_channel(struct efx_channel *channel, int budget) } } + /* Receive any packets we queued up */ + netif_receive_skb_list(channel->rx_list); + channel->rx_list = NULL; + return spent; } @@ -555,6 +565,8 @@ static int efx_probe_channel(struct efx_channel *channel) goto fail; } + channel->rx_list = NULL; + return 0; fail: diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h index 65568925c3ef..961b92979640 100644 --- a/drivers/net/ethernet/sfc/net_driver.h +++ b/drivers/net/ethernet/sfc/net_driver.h @@ -448,6 +448,7 @@ enum efx_sync_events_state { * __efx_rx_packet(), or zero if there is none * @rx_pkt_index: Ring index of first buffer for next packet to be delivered * by __efx_rx_packet(), if @rx_pkt_n_frags != 0 + * @rx_list: list of SKBs from current RX, awaiting processing * @rx_queue: RX queue for this channel * @tx_queue: TX queues for this channel * @sync_events_state: Current state of sync events on this channel @@ -500,6 +501,8 @@ struct efx_channel { unsigned int rx_pkt_n_frags; unsigned int rx_pkt_index; + struct list_head *rx_list; + struct efx_rx_queue rx_queue; struct efx_tx_queue tx_queue[EFX_TXQ_TYPES]; diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index d2e254f2f72b..396ff01298cd 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -634,7 +634,12 @@ static void efx_rx_deliver(struct efx_channel *channel, u8 *eh, return; /* Pass the packet up */ - netif_receive_skb(skb); + if (channel->rx_list != NULL) + /* Add to list, will pass up later */ + list_add_tail(&skb->list, channel->rx_list); + else + /* No list, so pass it up now */ + netif_receive_skb(skb); } /* Handle a received packet. Second half: Touches packet payload. */ diff --git a/include/linux/list.h b/include/linux/list.h index 4b129df4d46b..de04cc5ed536 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -285,6 +285,36 @@ static inline void list_cut_position(struct list_head *list, __list_cut_position(list, head, entry); } +/** + * list_cut_before - cut a list into two, before given entry + * @list: a new list to add all removed entries + * @head: a list with entries + * @entry: an entry within head, could be the head itself + * + * This helper moves the initial part of @head, up to but + * excluding @entry, from @head to @list. You should pass + * in @entry an element you know is on @head. @list should + * be an empty list or a list you do not care about losing + * its data. + * If @entry == @head, all entries on @head are moved to + * @list. + */ +static inline void list_cut_before(struct list_head *list, + struct list_head *head, + struct list_head *entry) +{ + if (head->next == entry) { + INIT_LIST_HEAD(list); + return; + } + list->next = head->next; + list->next->prev = list; + list->prev = entry->prev; + list->prev->next = list; + head->next = entry; + entry->prev = head; +} + static inline void __list_splice(const struct list_head *list, struct list_head *prev, struct list_head *next) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 64480a0f2c16..c1ef749b6f9f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2297,6 +2297,9 @@ struct packet_type { struct net_device *, struct packet_type *, struct net_device *); + void (*list_func) (struct list_head *, + struct packet_type *, + struct net_device *); bool (*id_match)(struct packet_type *ptype, struct sock *sk); void *af_packet_priv; @@ -3477,6 +3480,7 @@ int netif_rx(struct sk_buff *skb); int netif_rx_ni(struct sk_buff *skb); int netif_receive_skb(struct sk_buff *skb); int netif_receive_skb_core(struct sk_buff *skb); +void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index dd2052f0efb7..5a5e0a2ab2a3 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -288,6 +288,20 @@ NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct return ret; } +static inline void +NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, + struct list_head *head, struct net_device *in, struct net_device *out, + int (*okfn)(struct net *, struct sock *, struct sk_buff *)) +{ + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) { + int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn); + if (ret != 1) + list_del(&skb->list); + } +} + /* Call setsockopt() */ int nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, unsigned int len); @@ -369,6 +383,14 @@ NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, return okfn(net, sk, skb); } +static inline void +NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, + struct list_head *head, struct net_device *in, struct net_device *out, + int (*okfn)(struct net *, struct sock *, struct sk_buff *)) +{ + /* nothing to do */ +} + static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, diff --git a/include/net/ip.h b/include/net/ip.h index 09da79d8ceea..99d1b835d2aa 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -138,6 +138,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, struct ip_options_rcu *opt); int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); +void ip_list_rcv(struct list_head *head, struct packet_type *pt, + struct net_device *orig_dev); int ip_local_deliver(struct sk_buff *skb); int ip_mr_input(struct sk_buff *skb); int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb); diff --git a/include/trace/events/net.h b/include/trace/events/net.h index 9c886739246a..00aa72ce0e7c 100644 --- a/include/trace/events/net.h +++ b/include/trace/events/net.h @@ -223,6 +223,13 @@ DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_entry, TP_ARGS(skb) ); +DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_list_entry, + + TP_PROTO(const struct sk_buff *skb), + + TP_ARGS(skb) +); + DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_entry, TP_PROTO(const struct sk_buff *skb), diff --git a/net/core/dev.c b/net/core/dev.c index 08d58e0debe5..7e6a2f66db5c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4608,7 +4608,8 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, return 0; } -static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) +static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc, + struct packet_type **ppt_prev) { struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; @@ -4738,8 +4739,7 @@ skip_classify: if (pt_prev) { if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) goto drop; - else - ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + *ppt_prev = pt_prev; } else { drop: if (!deliver_exact) @@ -4757,6 +4757,18 @@ out: return ret; } +static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) +{ + struct net_device *orig_dev = skb->dev; + struct packet_type *pt_prev = NULL; + int ret; + + ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev); + if (pt_prev) + ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + return ret; +} + /** * netif_receive_skb_core - special purpose version of netif_receive_skb * @skb: buffer to process @@ -4777,13 +4789,67 @@ int netif_receive_skb_core(struct sk_buff *skb) int ret; rcu_read_lock(); - ret = __netif_receive_skb_core(skb, false); + ret = __netif_receive_skb_one_core(skb, false); rcu_read_unlock(); return ret; } EXPORT_SYMBOL(netif_receive_skb_core); +static inline void __netif_receive_skb_list_ptype(struct list_head *head, + struct packet_type *pt_prev, + struct net_device *orig_dev) +{ + struct sk_buff *skb, *next; + + if (!pt_prev) + return; + if (list_empty(head)) + return; + if (pt_prev->list_func != NULL) + pt_prev->list_func(head, pt_prev, orig_dev); + else + list_for_each_entry_safe(skb, next, head, list) + pt_prev->func(skb, skb->dev, pt_prev, orig_dev); +} + +static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) +{ + /* Fast-path assumptions: + * - There is no RX handler. + * - Only one packet_type matches. + * If either of these fails, we will end up doing some per-packet + * processing in-line, then handling the 'last ptype' for the whole + * sublist. This can't cause out-of-order delivery to any single ptype, + * because the 'last ptype' must be constant across the sublist, and all + * other ptypes are handled per-packet. + */ + /* Current (common) ptype of sublist */ + struct packet_type *pt_curr = NULL; + /* Current (common) orig_dev of sublist */ + struct net_device *od_curr = NULL; + struct list_head sublist; + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) { + struct net_device *orig_dev = skb->dev; + struct packet_type *pt_prev = NULL; + + __netif_receive_skb_core(skb, pfmemalloc, &pt_prev); + if (pt_curr != pt_prev || od_curr != orig_dev) { + /* dispatch old sublist */ + list_cut_before(&sublist, head, &skb->list); + __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); + /* start new sublist */ + pt_curr = pt_prev; + od_curr = orig_dev; + } + } + + /* dispatch final sublist */ + __netif_receive_skb_list_ptype(head, pt_curr, od_curr); +} + static int __netif_receive_skb(struct sk_buff *skb) { int ret; @@ -4801,14 +4867,44 @@ static int __netif_receive_skb(struct sk_buff *skb) * context down to all allocation sites. */ noreclaim_flag = memalloc_noreclaim_save(); - ret = __netif_receive_skb_core(skb, true); + ret = __netif_receive_skb_one_core(skb, true); memalloc_noreclaim_restore(noreclaim_flag); } else - ret = __netif_receive_skb_core(skb, false); + ret = __netif_receive_skb_one_core(skb, false); return ret; } +static void __netif_receive_skb_list(struct list_head *head) +{ + unsigned long noreclaim_flag = 0; + struct sk_buff *skb, *next; + bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */ + + list_for_each_entry_safe(skb, next, head, list) { + if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) { + struct list_head sublist; + + /* Handle the previous sublist */ + list_cut_before(&sublist, head, &skb->list); + if (!list_empty(&sublist)) + __netif_receive_skb_list_core(&sublist, pfmemalloc); + pfmemalloc = !pfmemalloc; + /* See comments in __netif_receive_skb */ + if (pfmemalloc) + noreclaim_flag = memalloc_noreclaim_save(); + else + memalloc_noreclaim_restore(noreclaim_flag); + } + } + /* Handle the remaining sublist */ + if (!list_empty(head)) + __netif_receive_skb_list_core(head, pfmemalloc); + /* Restore pflags */ + if (pfmemalloc) + memalloc_noreclaim_restore(noreclaim_flag); +} + static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) { struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); @@ -4883,6 +4979,50 @@ static int netif_receive_skb_internal(struct sk_buff *skb) return ret; } +static void netif_receive_skb_list_internal(struct list_head *head) +{ + struct bpf_prog *xdp_prog = NULL; + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) { + net_timestamp_check(netdev_tstamp_prequeue, skb); + if (skb_defer_rx_timestamp(skb)) + /* Handled, remove from list */ + list_del(&skb->list); + } + + if (static_branch_unlikely(&generic_xdp_needed_key)) { + preempt_disable(); + rcu_read_lock(); + list_for_each_entry_safe(skb, next, head, list) { + xdp_prog = rcu_dereference(skb->dev->xdp_prog); + if (do_xdp_generic(xdp_prog, skb) != XDP_PASS) + /* Dropped, remove from list */ + list_del(&skb->list); + } + rcu_read_unlock(); + preempt_enable(); + } + + rcu_read_lock(); +#ifdef CONFIG_RPS + if (static_key_false(&rps_needed)) { + list_for_each_entry_safe(skb, next, head, list) { + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu = get_rps_cpu(skb->dev, skb, &rflow); + + if (cpu >= 0) { + enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + /* Handled, remove from list */ + list_del(&skb->list); + } + } + } +#endif + __netif_receive_skb_list(head); + rcu_read_unlock(); +} + /** * netif_receive_skb - process receive buffer from network * @skb: buffer to process @@ -4906,6 +5046,28 @@ int netif_receive_skb(struct sk_buff *skb) } EXPORT_SYMBOL(netif_receive_skb); +/** + * netif_receive_skb_list - process many receive buffers from network + * @head: list of skbs to process. + * + * Since return value of netif_receive_skb() is normally ignored, and + * wouldn't be meaningful for a list, this function returns void. + * + * This function may only be called from softirq context and interrupts + * should be enabled. + */ +void netif_receive_skb_list(struct list_head *head) +{ + struct sk_buff *skb; + + if (list_empty(head)) + return; + list_for_each_entry(skb, head, list) + trace_netif_receive_skb_list_entry(skb); + netif_receive_skb_list_internal(head); +} +EXPORT_SYMBOL(netif_receive_skb_list); + DEFINE_PER_CPU(struct work_struct, flush_works); /* Network device is going away, flush any packets still pending */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 9263a2c114e0..c716be13d58c 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1882,6 +1882,7 @@ fs_initcall(ipv4_offload_init); static struct packet_type ip_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IP), .func = ip_rcv, + .list_func = ip_list_rcv, }; static int __init inet_init(void) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 7582713dd18f..24b9b0210aeb 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -307,7 +307,8 @@ drop: return true; } -static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +static int ip_rcv_finish_core(struct net *net, struct sock *sk, + struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); int (*edemux)(struct sk_buff *skb); @@ -393,7 +394,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; } - return dst_input(skb); + return NET_RX_SUCCESS; drop: kfree_skb(skb); @@ -405,13 +406,21 @@ drop_error: goto drop; } +static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + int ret = ip_rcv_finish_core(net, sk, skb); + + if (ret != NET_RX_DROP) + ret = dst_input(skb); + return ret; +} + /* * Main IP Receive routine. */ -int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) { const struct iphdr *iph; - struct net *net; u32 len; /* When the interface is in promisc. mode, drop all the crap @@ -421,7 +430,6 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, goto drop; - net = dev_net(dev); __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); @@ -489,9 +497,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, /* Must drop socket now because of tproxy. */ skb_orphan(skb); - return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, - net, NULL, skb, dev, NULL, - ip_rcv_finish); + return skb; csum_error: __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); @@ -500,5 +506,95 @@ inhdr_error: drop: kfree_skb(skb); out: - return NET_RX_DROP; + return NULL; +} + +/* + * IP receive entry point + */ +int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net *net = dev_net(dev); + + skb = ip_rcv_core(skb, net); + if (skb == NULL) + return NET_RX_DROP; + return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, + net, NULL, skb, dev, NULL, + ip_rcv_finish); +} + +static void ip_sublist_rcv_finish(struct list_head *head) +{ + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) + dst_input(skb); +} + +static void ip_list_rcv_finish(struct net *net, struct sock *sk, + struct list_head *head) +{ + struct dst_entry *curr_dst = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + list_for_each_entry_safe(skb, next, head, list) { + struct dst_entry *dst; + + if (ip_rcv_finish_core(net, sk, skb) == NET_RX_DROP) + continue; + + dst = skb_dst(skb); + if (curr_dst != dst) { + /* dispatch old sublist */ + list_cut_before(&sublist, head, &skb->list); + if (!list_empty(&sublist)) + ip_sublist_rcv_finish(&sublist); + /* start new sublist */ + curr_dst = dst; + } + } + /* dispatch final sublist */ + ip_sublist_rcv_finish(head); +} + +static void ip_sublist_rcv(struct list_head *head, struct net_device *dev, + struct net *net) +{ + NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, + head, dev, NULL, ip_rcv_finish); + ip_list_rcv_finish(net, NULL, head); +} + +/* Receive a list of IP packets */ +void ip_list_rcv(struct list_head *head, struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net_device *curr_dev = NULL; + struct net *curr_net = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + list_for_each_entry_safe(skb, next, head, list) { + struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); + + skb = ip_rcv_core(skb, net); + if (skb == NULL) + continue; + + if (curr_dev != dev || curr_net != net) { + /* dispatch old sublist */ + list_cut_before(&sublist, head, &skb->list); + if (!list_empty(&sublist)) + ip_sublist_rcv(&sublist, dev, net); + /* start new sublist */ + curr_dev = dev; + curr_net = net; + } + } + /* dispatch final sublist */ + ip_sublist_rcv(head, curr_dev, curr_net); }