Merge branch 'icmp-reply-optimize'
Jesper Dangaard Brouer says: ==================== net: optimize ICMP-reply code path This patchset optimizes the ICMP-reply code path for ICMP packets that get rate limited. A remote party can easily trigger this code path by sending packets to a port number with no listening service. Generally, the patchset moves the sysctl_icmp_msgs_per_sec ratelimit checking to earlier in the code path and removes an allocation. Use-case: The specific case where I experienced this being a bottleneck is sending UDP packets to a port with no listener, which obviously results in the kernel replying with ICMP Destination Unreachable (type:3), Port Unreachable (code:3), which causes the bottleneck. After Eric and Paolo optimized the UDP socket code, the kernel's PPS processing capability is lower for no-listen ports than for normal UDP sockets. This is bad for capacity planning when restarting a service. UDP no-listen benchmark 8xCPUs using pktgen_sample04_many_flows.sh: Baseline: 6.6 Mpps Patch: 14.7 Mpps Driver mlx5 at 50Gbit/s. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
commit
9f2f27a9a5
125
net/ipv4/icmp.c
125
net/ipv4/icmp.c
|
@ -209,19 +209,17 @@ static struct sock *icmp_sk(struct net *net)
|
|||
return *this_cpu_ptr(net->ipv4.icmp_sk);
|
||||
}
|
||||
|
||||
/* Called with BH disabled */
|
||||
static inline struct sock *icmp_xmit_lock(struct net *net)
|
||||
{
|
||||
struct sock *sk;
|
||||
|
||||
local_bh_disable();
|
||||
|
||||
sk = icmp_sk(net);
|
||||
|
||||
if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
|
||||
/* This can happen if the output path signals a
|
||||
* dst_link_failure() for an outgoing ICMP packet.
|
||||
*/
|
||||
local_bh_enable();
|
||||
return NULL;
|
||||
}
|
||||
return sk;
|
||||
|
@ -229,7 +227,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
|
|||
|
||||
static inline void icmp_xmit_unlock(struct sock *sk)
|
||||
{
|
||||
spin_unlock_bh(&sk->sk_lock.slock);
|
||||
spin_unlock(&sk->sk_lock.slock);
|
||||
}
|
||||
|
||||
int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
|
||||
|
@ -282,6 +280,33 @@ bool icmp_global_allow(void)
|
|||
}
|
||||
EXPORT_SYMBOL(icmp_global_allow);
|
||||
|
||||
static bool icmpv4_mask_allow(struct net *net, int type, int code)
|
||||
{
|
||||
if (type > NR_ICMP_TYPES)
|
||||
return true;
|
||||
|
||||
/* Don't limit PMTU discovery. */
|
||||
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
|
||||
return true;
|
||||
|
||||
/* Limit if icmp type is enabled in ratemask. */
|
||||
if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
 * Early rate limit check for outgoing ICMPv4 messages.
 *
 * Returns true when the message is exempt according to
 * icmpv4_mask_allow(), or otherwise when icmp_global_allow() permits
 * it.  Returns false when both checks refuse.
 */
static bool icmpv4_global_allow(struct net *net, int type, int code)
{
	/*
	 * Short-circuit evaluation: icmp_global_allow() is only called
	 * when the mask check did not already allow the message.
	 */
	return icmpv4_mask_allow(net, type, code) || icmp_global_allow();
}
|
||||
|
||||
/*
|
||||
* Send an ICMP frame.
|
||||
*/
|
||||
|
@ -290,34 +315,22 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
|
|||
struct flowi4 *fl4, int type, int code)
|
||||
{
|
||||
struct dst_entry *dst = &rt->dst;
|
||||
struct inet_peer *peer;
|
||||
bool rc = true;
|
||||
int vif;
|
||||
|
||||
if (type > NR_ICMP_TYPES)
|
||||
goto out;
|
||||
|
||||
/* Don't limit PMTU discovery. */
|
||||
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
|
||||
if (icmpv4_mask_allow(net, type, code))
|
||||
goto out;
|
||||
|
||||
/* No rate limit on loopback */
|
||||
if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
|
||||
goto out;
|
||||
|
||||
/* Limit if icmp type is enabled in ratemask. */
|
||||
if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
|
||||
goto out;
|
||||
|
||||
rc = false;
|
||||
if (icmp_global_allow()) {
|
||||
int vif = l3mdev_master_ifindex(dst->dev);
|
||||
struct inet_peer *peer;
|
||||
|
||||
peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
|
||||
rc = inet_peer_xrlim_allow(peer,
|
||||
net->ipv4.sysctl_icmp_ratelimit);
|
||||
if (peer)
|
||||
inet_putpeer(peer);
|
||||
}
|
||||
vif = l3mdev_master_ifindex(dst->dev);
|
||||
peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
|
||||
rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit);
|
||||
if (peer)
|
||||
inet_putpeer(peer);
|
||||
out:
|
||||
return rc;
|
||||
}
|
||||
|
@ -396,13 +409,22 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
|
|||
struct inet_sock *inet;
|
||||
__be32 daddr, saddr;
|
||||
u32 mark = IP4_REPLY_MARK(net, skb->mark);
|
||||
int type = icmp_param->data.icmph.type;
|
||||
int code = icmp_param->data.icmph.code;
|
||||
|
||||
if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
|
||||
return;
|
||||
|
||||
/* Needed by both icmp_global_allow and icmp_xmit_lock */
|
||||
local_bh_disable();
|
||||
|
||||
/* global icmp_msgs_per_sec */
|
||||
if (!icmpv4_global_allow(net, type, code))
|
||||
goto out_bh_enable;
|
||||
|
||||
sk = icmp_xmit_lock(net);
|
||||
if (!sk)
|
||||
return;
|
||||
goto out_bh_enable;
|
||||
inet = inet_sk(sk);
|
||||
|
||||
icmp_param->data.icmph.checksum = 0;
|
||||
|
@ -433,12 +455,13 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
|
|||
rt = ip_route_output_key(net, &fl4);
|
||||
if (IS_ERR(rt))
|
||||
goto out_unlock;
|
||||
if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type,
|
||||
icmp_param->data.icmph.code))
|
||||
if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
|
||||
icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
|
||||
ip_rt_put(rt);
|
||||
out_unlock:
|
||||
icmp_xmit_unlock(sk);
|
||||
out_bh_enable:
|
||||
local_bh_enable();
|
||||
}
|
||||
|
||||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
|
||||
|
@ -571,7 +594,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
|
|||
{
|
||||
struct iphdr *iph;
|
||||
int room;
|
||||
struct icmp_bxm *icmp_param;
|
||||
struct icmp_bxm icmp_param;
|
||||
struct rtable *rt = skb_rtable(skb_in);
|
||||
struct ipcm_cookie ipc;
|
||||
struct flowi4 fl4;
|
||||
|
@ -648,13 +671,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
|
|||
}
|
||||
}
|
||||
|
||||
icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC);
|
||||
if (!icmp_param)
|
||||
return;
|
||||
/* Needed by both icmp_global_allow and icmp_xmit_lock */
|
||||
local_bh_disable();
|
||||
|
||||
/* Check global sysctl_icmp_msgs_per_sec ratelimit */
|
||||
if (!icmpv4_global_allow(net, type, code))
|
||||
goto out_bh_enable;
|
||||
|
||||
sk = icmp_xmit_lock(net);
|
||||
if (!sk)
|
||||
goto out_free;
|
||||
goto out_bh_enable;
|
||||
|
||||
/*
|
||||
* Construct source address and options.
|
||||
|
@ -681,7 +707,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
|
|||
iph->tos;
|
||||
mark = IP4_REPLY_MARK(net, skb_in->mark);
|
||||
|
||||
if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))
|
||||
if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in))
|
||||
goto out_unlock;
|
||||
|
||||
|
||||
|
@ -689,25 +715,26 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
|
|||
* Prepare data for ICMP header.
|
||||
*/
|
||||
|
||||
icmp_param->data.icmph.type = type;
|
||||
icmp_param->data.icmph.code = code;
|
||||
icmp_param->data.icmph.un.gateway = info;
|
||||
icmp_param->data.icmph.checksum = 0;
|
||||
icmp_param->skb = skb_in;
|
||||
icmp_param->offset = skb_network_offset(skb_in);
|
||||
icmp_param.data.icmph.type = type;
|
||||
icmp_param.data.icmph.code = code;
|
||||
icmp_param.data.icmph.un.gateway = info;
|
||||
icmp_param.data.icmph.checksum = 0;
|
||||
icmp_param.skb = skb_in;
|
||||
icmp_param.offset = skb_network_offset(skb_in);
|
||||
inet_sk(sk)->tos = tos;
|
||||
sk->sk_mark = mark;
|
||||
ipc.addr = iph->saddr;
|
||||
ipc.opt = &icmp_param->replyopts.opt;
|
||||
ipc.opt = &icmp_param.replyopts.opt;
|
||||
ipc.tx_flags = 0;
|
||||
ipc.ttl = 0;
|
||||
ipc.tos = -1;
|
||||
|
||||
rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
|
||||
type, code, icmp_param);
|
||||
type, code, &icmp_param);
|
||||
if (IS_ERR(rt))
|
||||
goto out_unlock;
|
||||
|
||||
/* peer icmp_ratelimit */
|
||||
if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
|
||||
goto ende;
|
||||
|
||||
|
@ -716,21 +743,21 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
|
|||
room = dst_mtu(&rt->dst);
|
||||
if (room > 576)
|
||||
room = 576;
|
||||
room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen;
|
||||
room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
|
||||
room -= sizeof(struct icmphdr);
|
||||
|
||||
icmp_param->data_len = skb_in->len - icmp_param->offset;
|
||||
if (icmp_param->data_len > room)
|
||||
icmp_param->data_len = room;
|
||||
icmp_param->head_len = sizeof(struct icmphdr);
|
||||
icmp_param.data_len = skb_in->len - icmp_param.offset;
|
||||
if (icmp_param.data_len > room)
|
||||
icmp_param.data_len = room;
|
||||
icmp_param.head_len = sizeof(struct icmphdr);
|
||||
|
||||
icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
|
||||
icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
|
||||
ende:
|
||||
ip_rt_put(rt);
|
||||
out_unlock:
|
||||
icmp_xmit_unlock(sk);
|
||||
out_free:
|
||||
kfree(icmp_param);
|
||||
out_bh_enable:
|
||||
local_bh_enable();
|
||||
out:;
|
||||
}
|
||||
EXPORT_SYMBOL(icmp_send);
|
||||
|
|
|
@ -110,19 +110,17 @@ static const struct inet6_protocol icmpv6_protocol = {
|
|||
.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
|
||||
};
|
||||
|
||||
/* Called with BH disabled */
|
||||
static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
|
||||
{
|
||||
struct sock *sk;
|
||||
|
||||
local_bh_disable();
|
||||
|
||||
sk = icmpv6_sk(net);
|
||||
if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
|
||||
/* This can happen if the output path (f.e. SIT or
|
||||
* ip6ip6 tunnel) signals dst_link_failure() for an
|
||||
* outgoing ICMP6 packet.
|
||||
*/
|
||||
local_bh_enable();
|
||||
return NULL;
|
||||
}
|
||||
return sk;
|
||||
|
@ -130,7 +128,7 @@ static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
|
|||
|
||||
static __inline__ void icmpv6_xmit_unlock(struct sock *sk)
|
||||
{
|
||||
spin_unlock_bh(&sk->sk_lock.slock);
|
||||
spin_unlock(&sk->sk_lock.slock);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -168,6 +166,30 @@ static bool is_ineligible(const struct sk_buff *skb)
|
|||
return false;
|
||||
}
|
||||
|
||||
static bool icmpv6_mask_allow(int type)
|
||||
{
|
||||
/* Informational messages are not limited. */
|
||||
if (type & ICMPV6_INFOMSG_MASK)
|
||||
return true;
|
||||
|
||||
/* Do not limit pmtu discovery, it would break it. */
|
||||
if (type == ICMPV6_PKT_TOOBIG)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
 * Early rate limit check for outgoing ICMPv6 messages.
 *
 * Returns true when @type is exempt according to icmpv6_mask_allow(),
 * or otherwise when icmp_global_allow() permits the message.  Returns
 * false when both checks refuse.
 */
static bool icmpv6_global_allow(int type)
{
	/*
	 * Short-circuit evaluation: icmp_global_allow() is only called
	 * when the mask check did not already allow the message.
	 */
	return icmpv6_mask_allow(type) || icmp_global_allow();
}
|
||||
|
||||
/*
|
||||
* Check the ICMP output rate limit
|
||||
*/
|
||||
|
@ -178,12 +200,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
|
|||
struct dst_entry *dst;
|
||||
bool res = false;
|
||||
|
||||
/* Informational messages are not limited. */
|
||||
if (type & ICMPV6_INFOMSG_MASK)
|
||||
return true;
|
||||
|
||||
/* Do not limit pmtu discovery, it would break it. */
|
||||
if (type == ICMPV6_PKT_TOOBIG)
|
||||
if (icmpv6_mask_allow(type))
|
||||
return true;
|
||||
|
||||
/*
|
||||
|
@ -200,20 +217,16 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
|
|||
} else {
|
||||
struct rt6_info *rt = (struct rt6_info *)dst;
|
||||
int tmo = net->ipv6.sysctl.icmpv6_time;
|
||||
struct inet_peer *peer;
|
||||
|
||||
/* Give more bandwidth to wider prefixes. */
|
||||
if (rt->rt6i_dst.plen < 128)
|
||||
tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
|
||||
|
||||
if (icmp_global_allow()) {
|
||||
struct inet_peer *peer;
|
||||
|
||||
peer = inet_getpeer_v6(net->ipv6.peers,
|
||||
&fl6->daddr, 1);
|
||||
res = inet_peer_xrlim_allow(peer, tmo);
|
||||
if (peer)
|
||||
inet_putpeer(peer);
|
||||
}
|
||||
peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1);
|
||||
res = inet_peer_xrlim_allow(peer, tmo);
|
||||
if (peer)
|
||||
inet_putpeer(peer);
|
||||
}
|
||||
dst_release(dst);
|
||||
return res;
|
||||
|
@ -474,6 +487,13 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
|
|||
return;
|
||||
}
|
||||
|
||||
/* Needed by both icmp_global_allow and icmpv6_xmit_lock */
|
||||
local_bh_disable();
|
||||
|
||||
/* Check global sysctl_icmp_msgs_per_sec ratelimit */
|
||||
if (!icmpv6_global_allow(type))
|
||||
goto out_bh_enable;
|
||||
|
||||
mip6_addr_swap(skb);
|
||||
|
||||
memset(&fl6, 0, sizeof(fl6));
|
||||
|
@ -492,7 +512,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
|
|||
|
||||
sk = icmpv6_xmit_lock(net);
|
||||
if (!sk)
|
||||
return;
|
||||
goto out_bh_enable;
|
||||
|
||||
sk->sk_mark = mark;
|
||||
np = inet6_sk(sk);
|
||||
|
||||
|
@ -552,6 +573,8 @@ out_dst_release:
|
|||
dst_release(dst);
|
||||
out:
|
||||
icmpv6_xmit_unlock(sk);
|
||||
out_bh_enable:
|
||||
local_bh_enable();
|
||||
}
|
||||
|
||||
/* Slightly more convenient version of icmp6_send.
|
||||
|
@ -665,9 +688,10 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
|
|||
fl6.flowi6_uid = sock_net_uid(net, NULL);
|
||||
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
|
||||
|
||||
local_bh_disable();
|
||||
sk = icmpv6_xmit_lock(net);
|
||||
if (!sk)
|
||||
return;
|
||||
goto out_bh_enable;
|
||||
sk->sk_mark = mark;
|
||||
np = inet6_sk(sk);
|
||||
|
||||
|
@ -709,6 +733,8 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
|
|||
dst_release(dst);
|
||||
out:
|
||||
icmpv6_xmit_unlock(sk);
|
||||
out_bh_enable:
|
||||
local_bh_enable();
|
||||
}
|
||||
|
||||
void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
|
||||
|
|
Loading…
Reference in New Issue