packet: rollover lock contention avoidance

Rollover has to call packet_rcv_has_room on sockets in the fanout
group to find a socket to migrate to. This operation is expensive,
especially if the packet sockets use rings, in which case a lock
has to be acquired.

Avoid having every socket pound on the lock by temporarily marking a
socket as "under memory pressure" when such pressure is detected.
While the flag is set, only the socket owner may call
packet_rcv_has_room on the socket. Once the owner detects normal
conditions, it clears the flag. In the meantime, no other socket
uses the marked socket as a victim.

Under reasonably balanced load, each socket writer frequently calls
packet_rcv_has_room and clears its own pressure field. As a backup
for when the socket is rarely written to, also clear the flag on
reading (packet_recvmsg, packet_poll) if this can be done cheaply
(i.e., without calling packet_rcv_has_room). This is only for
edge cases.

Tested:
  Ran bench_rollover: a process with 8 sockets in a single fanout
  group, each pinned to a single cpu that receives one nic recv
  interrupt. RPS and RFS are disabled. The benchmark uses packet
  rx_ring, which has to take a lock when determining whether a
  socket has room.

  Sent 3.5 Mpps of UDP traffic with sufficient entropy to spread
  uniformly across the packet sockets (and inserted an iptables
  rule to drop in PREROUTING to avoid protocol stack processing).

  Without this patch, all sockets try to migrate traffic to
  neighbors, causing lock contention when searching for a neighbor
  that still has room. The lock accounts for the top 9 entries.

    perf record -a -g sleep 5

    -  17.82%   bench_rollover  [kernel.kallsyms]    [k] _raw_spin_lock
       - _raw_spin_lock
          - 99.00% spin_lock
             + 81.77% packet_rcv_has_room.isra.41
             + 18.23% tpacket_rcv
          + 0.84% packet_rcv_has_room.isra.41
    +   5.20%      ksoftirqd/6  [kernel.kallsyms]    [k] _raw_spin_lock
    +   5.15%      ksoftirqd/1  [kernel.kallsyms]    [k] _raw_spin_lock
    +   5.14%      ksoftirqd/2  [kernel.kallsyms]    [k] _raw_spin_lock
    +   5.12%      ksoftirqd/7  [kernel.kallsyms]    [k] _raw_spin_lock
    +   5.12%      ksoftirqd/5  [kernel.kallsyms]    [k] _raw_spin_lock
    +   5.10%      ksoftirqd/4  [kernel.kallsyms]    [k] _raw_spin_lock
    +   4.66%      ksoftirqd/0  [kernel.kallsyms]    [k] _raw_spin_lock
    +   4.45%      ksoftirqd/3  [kernel.kallsyms]    [k] _raw_spin_lock
    +   1.55%   bench_rollover  [kernel.kallsyms]    [k] packet_rcv_has_room.isra.41

  On net-next with this patch, this lock contention is no longer a
  top entry. Most time is spent in the actual read function. Next up
  are other locks:

    +  15.52%  bench_rollover  bench_rollover     [.] reader
    +   4.68%         swapper  [kernel.kallsyms]  [k] memcpy_erms
    +   2.77%         swapper  [kernel.kallsyms]  [k] packet_lookup_frame.isra.51
    +   2.56%     ksoftirqd/1  [kernel.kallsyms]  [k] memcpy_erms
    +   2.16%         swapper  [kernel.kallsyms]  [k] tpacket_rcv
    +   1.93%         swapper  [kernel.kallsyms]  [k] mlx4_en_process_rx_cq

  Looking closer at the remaining _raw_spin_lock, the cost of probing
  in rollover is now comparable to the cost of taking the lock later
  in tpacket_rcv.

    -   1.51%         swapper  [kernel.kallsyms]  [k] _raw_spin_lock
       - _raw_spin_lock
          + 33.41% packet_rcv_has_room
          + 28.15% tpacket_rcv
          + 19.54% enqueue_to_backlog
          + 6.45% __free_pages_ok
          + 2.78% packet_rcv_fanout
          + 2.13% fanout_demux_rollover
          + 2.01% netif_receive_skb_internal

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Willem de Bruijn 2015-05-12 11:56:48 -04:00 committed by David S. Miller
parent 9954729bc3
commit 2ccdbaa6d5
2 changed files with 32 additions and 7 deletions

View File

@ -1265,14 +1265,14 @@ static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL); return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
} }
static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{ {
struct sock *sk = &po->sk; struct sock *sk = &po->sk;
int ret = ROOM_NONE; int ret = ROOM_NONE;
if (po->prot_hook.func != tpacket_rcv) { if (po->prot_hook.func != tpacket_rcv) {
int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc) int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
- skb->truesize; - (skb ? skb->truesize : 0);
if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF)) if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
return ROOM_NORMAL; return ROOM_NORMAL;
else if (avail > 0) else if (avail > 0)
@ -1281,7 +1281,6 @@ static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
return ROOM_NONE; return ROOM_NONE;
} }
spin_lock(&sk->sk_receive_queue.lock);
if (po->tp_version == TPACKET_V3) { if (po->tp_version == TPACKET_V3) {
if (__tpacket_v3_has_room(po, ROOM_POW_OFF)) if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
ret = ROOM_NORMAL; ret = ROOM_NORMAL;
@ -1293,7 +1292,26 @@ static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
else if (__tpacket_has_room(po, 0)) else if (__tpacket_has_room(po, 0))
ret = ROOM_LOW; ret = ROOM_LOW;
} }
spin_unlock(&sk->sk_receive_queue.lock);
return ret;
}
static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
int ret;
bool has_room;
if (po->prot_hook.func == tpacket_rcv) {
spin_lock(&po->sk.sk_receive_queue.lock);
ret = __packet_rcv_has_room(po, skb);
spin_unlock(&po->sk.sk_receive_queue.lock);
} else {
ret = __packet_rcv_has_room(po, skb);
}
has_room = ret == ROOM_NORMAL;
if (po->pressure == has_room)
xchg(&po->pressure, !has_room);
return ret; return ret;
} }
@ -1362,7 +1380,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f,
unsigned int idx, bool try_self, unsigned int idx, bool try_self,
unsigned int num) unsigned int num)
{ {
struct packet_sock *po; struct packet_sock *po, *po_next;
unsigned int i, j; unsigned int i, j;
po = pkt_sk(f->arr[idx]); po = pkt_sk(f->arr[idx]);
@ -1371,8 +1389,9 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f,
i = j = min_t(int, po->rollover->sock, num - 1); i = j = min_t(int, po->rollover->sock, num - 1);
do { do {
if (i != idx && po_next = pkt_sk(f->arr[i]);
packet_rcv_has_room(pkt_sk(f->arr[i]), skb) == ROOM_NORMAL) { if (po_next != po && !po_next->pressure &&
packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
if (i != j) if (i != j)
po->rollover->sock = i; po->rollover->sock = i;
return i; return i;
@ -3000,6 +3019,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
if (skb == NULL) if (skb == NULL)
goto out; goto out;
if (pkt_sk(sk)->pressure)
packet_rcv_has_room(pkt_sk(sk), NULL);
if (pkt_sk(sk)->has_vnet_hdr) { if (pkt_sk(sk)->has_vnet_hdr) {
struct virtio_net_hdr vnet_hdr = { 0 }; struct virtio_net_hdr vnet_hdr = { 0 };
@ -3755,6 +3777,8 @@ static unsigned int packet_poll(struct file *file, struct socket *sock,
TP_STATUS_KERNEL)) TP_STATUS_KERNEL))
mask |= POLLIN | POLLRDNORM; mask |= POLLIN | POLLRDNORM;
} }
if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
xchg(&po->pressure, 0);
spin_unlock_bh(&sk->sk_receive_queue.lock); spin_unlock_bh(&sk->sk_receive_queue.lock);
spin_lock_bh(&sk->sk_write_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock);
if (po->tx_ring.pg_vec) { if (po->tx_ring.pg_vec) {

View File

@ -105,6 +105,7 @@ struct packet_sock {
auxdata:1, auxdata:1,
origdev:1, origdev:1,
has_vnet_hdr:1; has_vnet_hdr:1;
int pressure;
int ifindex; /* bound device */ int ifindex; /* bound device */
__be16 num; __be16 num;
struct packet_rollover *rollover; struct packet_rollover *rollover;