2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* UDP over IPv6
|
2007-02-09 22:24:49 +08:00
|
|
|
* Linux INET6 implementation
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* Authors:
|
2007-02-09 22:24:49 +08:00
|
|
|
* Pedro Roque <roque@di.fc.ul.pt>
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* Based on linux/ipv4/udp.c
|
|
|
|
*
|
|
|
|
* Fixes:
|
|
|
|
* Hideaki YOSHIFUJI : sin6_scope_id support
|
|
|
|
* YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
|
|
|
|
* Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
|
|
|
|
* a single port at the same time.
|
|
|
|
* Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data
|
|
|
|
* YOSHIFUJI Hideaki @USAGI: convert /proc/net/udp6 to seq_file.
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
* as published by the Free Software Foundation; either version
|
|
|
|
* 2 of the License, or (at your option) any later version.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/socket.h>
|
|
|
|
#include <linux/sockios.h>
|
|
|
|
#include <linux/net.h>
|
|
|
|
#include <linux/in6.h>
|
|
|
|
#include <linux/netdevice.h>
|
|
|
|
#include <linux/if_arp.h>
|
|
|
|
#include <linux/ipv6.h>
|
|
|
|
#include <linux/icmpv6.h>
|
|
|
|
#include <linux/init.h>
|
2007-12-12 03:30:32 +08:00
|
|
|
#include <linux/module.h>
|
2005-12-14 15:16:37 +08:00
|
|
|
#include <linux/skbuff.h>
|
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and try to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
wildly available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
|
|
|
#include <linux/slab.h>
|
2016-12-25 03:46:01 +08:00
|
|
|
#include <linux/uaccess.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2016-02-11 00:50:36 +08:00
|
|
|
#include <net/addrconf.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <net/ndisc.h>
|
|
|
|
#include <net/protocol.h>
|
|
|
|
#include <net/transp_v6.h>
|
|
|
|
#include <net/ip6_route.h>
|
|
|
|
#include <net/raw.h>
|
2005-08-10 11:08:28 +08:00
|
|
|
#include <net/tcp_states.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <net/ip6_checksum.h>
|
|
|
|
#include <net/xfrm.h>
|
2017-04-19 01:39:41 +08:00
|
|
|
#include <net/inet_hashtables.h>
|
2013-01-22 17:50:44 +08:00
|
|
|
#include <net/inet6_hashtables.h>
|
2013-07-10 22:13:17 +08:00
|
|
|
#include <net/busy_poll.h>
|
2016-01-05 06:41:46 +08:00
|
|
|
#include <net/sock_reuseport.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#include <linux/proc_fs.h>
|
|
|
|
#include <linux/seq_file.h>
|
2012-06-27 08:23:44 +08:00
|
|
|
#include <trace/events/skb.h>
|
2006-11-28 03:10:57 +08:00
|
|
|
#include "udp_impl.h"
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2017-01-27 02:02:24 +08:00
|
|
|
static bool udp6_lib_exact_dif_match(struct net *net, struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
#if defined(CONFIG_NET_L3_MASTER_DEV)
|
|
|
|
if (!net->ipv4.sysctl_udp_l3mdev_accept &&
|
|
|
|
skb && ipv6_l3mdev_skb(IP6CB(skb)->flags))
|
|
|
|
return true;
|
|
|
|
#endif
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2015-03-19 05:05:33 +08:00
|
|
|
static u32 udp6_ehashfn(const struct net *net,
|
|
|
|
const struct in6_addr *laddr,
|
|
|
|
const u16 lport,
|
|
|
|
const struct in6_addr *faddr,
|
|
|
|
const __be16 fport)
|
2013-10-20 03:48:52 +08:00
|
|
|
{
|
2013-10-20 03:48:57 +08:00
|
|
|
static u32 udp6_ehash_secret __read_mostly;
|
|
|
|
static u32 udp_ipv6_hash_secret __read_mostly;
|
|
|
|
|
|
|
|
u32 lhash, fhash;
|
|
|
|
|
|
|
|
net_get_random_once(&udp6_ehash_secret,
|
|
|
|
sizeof(udp6_ehash_secret));
|
|
|
|
net_get_random_once(&udp_ipv6_hash_secret,
|
|
|
|
sizeof(udp_ipv6_hash_secret));
|
|
|
|
|
|
|
|
lhash = (__force u32)laddr->s6_addr32[3];
|
|
|
|
fhash = __ipv6_addr_jhash(faddr, udp_ipv6_hash_secret);
|
|
|
|
|
2013-10-20 03:48:52 +08:00
|
|
|
return __inet6_ehashfn(lhash, lport, fhash, fport,
|
2013-10-20 03:48:57 +08:00
|
|
|
udp_ipv6_hash_secret + net_hash_mix(net));
|
2013-10-20 03:48:52 +08:00
|
|
|
}
|
|
|
|
|
2008-03-23 07:51:21 +08:00
|
|
|
int udp_v6_get_port(struct sock *sk, unsigned short snum)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2009-11-09 13:26:33 +08:00
|
|
|
unsigned int hash2_nulladdr =
|
2017-12-02 04:52:30 +08:00
|
|
|
ipv6_portaddr_hash(sock_net(sk), &in6addr_any, snum);
|
2012-05-19 09:45:21 +08:00
|
|
|
unsigned int hash2_partial =
|
2017-12-02 04:52:30 +08:00
|
|
|
ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0);
|
2009-11-09 13:26:33 +08:00
|
|
|
|
2009-11-08 18:17:30 +08:00
|
|
|
/* precompute partial secondary hash */
|
2009-11-09 13:26:33 +08:00
|
|
|
udp_sk(sk)->udp_portaddr_hash = hash2_partial;
|
2017-01-17 23:51:01 +08:00
|
|
|
return udp_lib_get_port(sk, snum, hash2_nulladdr);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
udp: add rehash on connect()
commit 30fff923 introduced in linux-2.6.33 (udp: bind() optimisation)
added a secondary hash on UDP, hashed on (local addr, local port).
Problem is that following sequence :
fd = socket(...)
connect(fd, &remote, ...)
not only selects remote end point (address and port), but also sets
local address, while UDP stack stored in secondary hash table the socket
while its local address was INADDR_ANY (or ipv6 equivalent)
Sequence is :
- autobind() : choose a random local port, insert socket in hash tables
[while local address is INADDR_ANY]
- connect() : set remote address and port, change local address to IP
given by a route lookup.
When an incoming UDP frame comes, if more than 10 sockets are found in
primary hash table, we switch to secondary table, and fail to find
socket because its local address changed.
One solution to this problem is to rehash datagram socket if needed.
We add a new rehash(struct socket *) method in "struct proto", and
implement this method for UDP v4 & v6, using a common helper.
This rehashing only takes care of secondary hash table, since primary
hash (based on local port only) is not changed.
Reported-by: Krzysztof Piotr Oledzki <ole@ans.pl>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Tested-by: Krzysztof Piotr Oledzki <ole@ans.pl>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-09-08 13:08:44 +08:00
|
|
|
static void udp_v6_rehash(struct sock *sk)
|
|
|
|
{
|
2017-12-02 04:52:30 +08:00
|
|
|
u16 new_hash = ipv6_portaddr_hash(sock_net(sk),
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 06:42:29 +08:00
|
|
|
&sk->sk_v6_rcv_saddr,
|
udp: add rehash on connect()
commit 30fff923 introduced in linux-2.6.33 (udp: bind() optimisation)
added a secondary hash on UDP, hashed on (local addr, local port).
Problem is that following sequence :
fd = socket(...)
connect(fd, &remote, ...)
not only selects remote end point (address and port), but also sets
local address, while UDP stack stored in secondary hash table the socket
while its local address was INADDR_ANY (or ipv6 equivalent)
Sequence is :
- autobind() : choose a random local port, insert socket in hash tables
[while local address is INADDR_ANY]
- connect() : set remote address and port, change local address to IP
given by a route lookup.
When an incoming UDP frame comes, if more than 10 sockets are found in
primary hash table, we switch to secondary table, and fail to find
socket because its local address changed.
One solution to this problem is to rehash datagram socket if needed.
We add a new rehash(struct socket *) method in "struct proto", and
implement this method for UDP v4 & v6, using a common helper.
This rehashing only takes care of secondary hash table, since primary
hash (based on local port only) is not changed.
Reported-by: Krzysztof Piotr Oledzki <ole@ans.pl>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Tested-by: Krzysztof Piotr Oledzki <ole@ans.pl>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-09-08 13:08:44 +08:00
|
|
|
inet_sk(sk)->inet_num);
|
|
|
|
|
|
|
|
udp_lib_rehash(sk, new_hash);
|
|
|
|
}
|
|
|
|
|
udp reuseport: fix packet of same flow hashed to different socket
There is a corner case in which udp packets belonging to a same
flow are hashed to different socket when hslot->count changes from 10
to 11:
1) When hslot->count <= 10, __udp_lib_lookup() searches udp_table->hash,
and always passes 'daddr' to udp_ehashfn().
2) When hslot->count > 10, __udp_lib_lookup() searches udp_table->hash2,
but may pass 'INADDR_ANY' to udp_ehashfn() if the sockets are bound to
INADDR_ANY instead of some specific addr.
That means when hslot->count changes from 10 to 11, the hash calculated by
udp_ehashfn() is also changed, and the udp packets belonging to a same
flow will be hashed to different socket.
This is easily reproduced:
1) Create 10 udp sockets and bind all of them to 0.0.0.0:40000.
2) From the same host send udp packets to 127.0.0.1:40000, record the
socket index which receives the packets.
3) Create 1 more udp socket and bind it to 0.0.0.0:44096. The number 44096
is 40000 + UDP_HASH_SIZE(4096), this makes the new socket put into the
same hslot as the aforementioned 10 sockets, and makes the hslot->count
change from 10 to 11.
4) From the same host send udp packets to 127.0.0.1:40000, and the socket
index which receives the packets will be different from the one received
in step 2.
This should not happen as the socket bound to 0.0.0.0:44096 should not
change the behavior of the sockets bound to 0.0.0.0:40000.
It's the same case for IPv6, and this patch also fixes that.
Signed-off-by: Su, Xuemin <suxm@chinanetcenter.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-13 11:02:50 +08:00
|
|
|
/*
 * Rate how well socket @sk matches an incoming datagram.
 *
 * @sk:        candidate socket
 * @net:       namespace the lookup runs in
 * @saddr:     source address of the datagram
 * @sport:     source port of the datagram (network byte order)
 * @daddr:     destination address of the datagram
 * @hnum:      destination port, compared with the socket's udp_port_hash
 * @dif:       device index compared with the socket's bound device
 * @sdif:      second device index accepted for the same comparison
 * @exact_dif: when true, the device check also applies to unbound sockets
 *
 * Returns -1 when the socket cannot receive the datagram. Otherwise the
 * score starts at 0 and gains one point for each of: a matching connected
 * destination port, a matching non-wildcard receive address, a matching
 * non-wildcard destination address, a bound device, and sk_incoming_cpu
 * being equal to the current CPU.
 */
static int compute_score(struct sock *sk, struct net *net,
			 const struct in6_addr *saddr, __be16 sport,
			 const struct in6_addr *daddr, unsigned short hnum,
			 int dif, int sdif, bool exact_dif)
{
	struct inet_sock *inet;
	int bound_if;
	int score = 0;

	/* mandatory criteria: namespace, port and address family */
	if (!net_eq(sock_net(sk), net))
		return -1;
	if (udp_sk(sk)->udp_port_hash != hnum)
		return -1;
	if (sk->sk_family != PF_INET6)
		return -1;

	inet = inet_sk(sk);

	/* a destination port set on the socket must equal the packet's source port */
	if (inet->inet_dport) {
		if (sport != inet->inet_dport)
			return -1;
		score += 1;
	}

	/* a non-wildcard receive address must equal the destination address */
	if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
		if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
			return -1;
		score += 1;
	}

	/* a non-wildcard peer address must equal the source address */
	if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
		if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr))
			return -1;
		score += 1;
	}

	bound_if = sk->sk_bound_dev_if;
	if (bound_if || exact_dif) {
		/* the bound index has to be one of the two offered indices */
		if (bound_if != dif && bound_if != sdif)
			return -1;
		/* only an actually bound socket earns the extra point */
		if (bound_if)
			score += 1;
	}

	if (sk->sk_incoming_cpu == raw_smp_processor_id())
		score += 1;

	return score;
}
|
|
|
|
|
udp reuseport: fix packet of same flow hashed to different socket
There is a corner case in which udp packets belonging to a same
flow are hashed to different socket when hslot->count changes from 10
to 11:
1) When hslot->count <= 10, __udp_lib_lookup() searches udp_table->hash,
and always passes 'daddr' to udp_ehashfn().
2) When hslot->count > 10, __udp_lib_lookup() searches udp_table->hash2,
but may pass 'INADDR_ANY' to udp_ehashfn() if the sockets are bound to
INADDR_ANY instead of some specific addr.
That means when hslot->count changes from 10 to 11, the hash calculated by
udp_ehashfn() is also changed, and the udp packets belonging to a same
flow will be hashed to different socket.
This is easily reproduced:
1) Create 10 udp sockets and bind all of them to 0.0.0.0:40000.
2) From the same host send udp packets to 127.0.0.1:40000, record the
socket index which receives the packets.
3) Create 1 more udp socket and bind it to 0.0.0.0:44096. The number 44096
is 40000 + UDP_HASH_SIZE(4096), this makes the new socket put into the
same hslot as the aforementioned 10 sockets, and makes the hslot->count
change from 10 to 11.
4) From the same host send udp packets to 127.0.0.1:40000, and the socket
index which receives the packets will be different from the one received
in step 2.
This should not happen as the socket bound to 0.0.0.0:44096 should not
change the behavior of the sockets bound to 0.0.0.0:40000.
It's the same case for IPv6, and this patch also fixes that.
Signed-off-by: Su, Xuemin <suxm@chinanetcenter.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-13 11:02:50 +08:00
|
|
|
/* called with rcu_read_lock() */
|
ipv6: udp: optimize unicast RX path
We first locate the (local port) hash chain head
If few sockets are in this chain, we proceed with previous lookup algo.
If too many sockets are listed, we take a look at the secondary
(port, address) hash chain.
We choose the shortest chain and proceed with a RCU lookup on the elected chain.
But, if we chose (port, address) chain, and fail to find a socket on given address,
we must try another lookup on (port, in6addr_any) chain to find sockets not bound
to a particular IP.
-> No extra cost for typical setups, where the first lookup will probably
be performed.
RCU lookups everywhere, we dont acquire spinlock.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-08 18:18:30 +08:00
|
|
|
/*
 * Walk the secondary hash chain @hslot2 and return the socket with the
 * highest compute_score() for the given datagram, or NULL when no socket
 * on the chain scores above -1.
 *
 * Whenever a socket beats the best score so far and has sk_reuseport set,
 * reuseport_select_sock() is consulted with the flow hash from
 * udp6_ehashfn(); a non-NULL answer is returned immediately. Otherwise
 * the socket itself becomes the current best candidate.
 *
 * Iterates with udp_portaddr_for_each_entry_rcu(); the comment preceding
 * this function says it is called with rcu_read_lock() held.
 */
static struct sock *udp6_lib_lookup2(struct net *net,
		const struct in6_addr *saddr, __be16 sport,
		const struct in6_addr *daddr, unsigned int hnum,
		int dif, int sdif, bool exact_dif,
		struct udp_hslot *hslot2, struct sk_buff *skb)
{
	struct sock *best = NULL;
	struct sock *sk;
	int best_score = -1;

	udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
		struct sock *picked;
		u32 hash;
		int score;

		score = compute_score(sk, net, saddr, sport,
				      daddr, hnum, dif, sdif, exact_dif);
		/* only a strictly better score replaces the candidate */
		if (score <= best_score)
			continue;

		if (sk->sk_reuseport) {
			hash = udp6_ehashfn(net, daddr, hnum, saddr, sport);
			picked = reuseport_select_sock(sk, hash, skb,
						       sizeof(struct udphdr));
			if (picked)
				return picked;
		}

		best = sk;
		best_score = score;
	}

	return best;
}
|
|
|
|
|
2016-04-01 23:52:13 +08:00
|
|
|
/* rcu_read_lock() must be held */
|
2011-12-09 14:23:34 +08:00
|
|
|
struct sock *__udp6_lib_lookup(struct net *net,
|
2017-08-07 23:44:20 +08:00
|
|
|
const struct in6_addr *saddr, __be16 sport,
|
|
|
|
const struct in6_addr *daddr, __be16 dport,
|
|
|
|
int dif, int sdif, struct udp_table *udptable,
|
|
|
|
struct sk_buff *skb)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
udp: RCU handling for Unicast packets.
Goals are :
1) Optimizing handling of incoming Unicast UDP frames, so that no memory
writes should happen in the fast path.
Note: Multicasts and broadcasts still will need to take a lock,
because doing a full lockless lookup in this case is difficult.
2) No expensive operations in the socket bind/unhash phases :
- No expensive synchronize_rcu() calls.
- No added rcu_head in socket structure, increasing memory needs,
but more important, forcing us to use call_rcu() calls,
that have the bad property of making sockets structure cold.
(rcu grace period between socket freeing and its potential reuse
make this socket being cold in CPU cache).
David did a previous patch using call_rcu() and noticed a 20%
impact on TCP connection rates.
Quoting Cristopher Lameter :
"Right. That results in cacheline cooldown. You'd want to recycle
the object as they are cache hot on a per cpu basis. That is screwed
up by the delayed regular rcu processing. We have seen multiple
regressions due to cacheline cooldown.
The only choice in cacheline hot sensitive areas is to deal with the
complexity that comes with SLAB_DESTROY_BY_RCU or give up on RCU."
- Because udp sockets are allocated from dedicated kmem_cache,
use of SLAB_DESTROY_BY_RCU can help here.
Theory of operation :
---------------------
As the lookup is lockfree (using rcu_read_lock()/rcu_read_unlock()),
special attention must be taken by readers and writers.
Use of SLAB_DESTROY_BY_RCU is tricky too, because a socket can be freed,
reused, inserted in a different chain or in worst case in the same chain
while readers could do lookups in the same time.
In order to avoid loops, a reader must check each socket found in a chain
really belongs to the chain the reader was traversing. If it finds a
mismatch, lookup must start again at the beginning. This *restart* loop
is the reason we had to use rdlock for the multicast case, because
we dont want to send same message several times to the same socket.
We use RCU only for fast path.
Thus, /proc/net/udp still takes spinlocks.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-10-29 17:11:14 +08:00
|
|
|
struct sock *sk, *result;
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned short hnum = ntohs(dport);
|
ipv6: udp: optimize unicast RX path
We first locate the (local port) hash chain head
If few sockets are in this chain, we proceed with previous lookup algo.
If too many sockets are listed, we take a look at the secondary
(port, address) hash chain.
We choose the shortest chain and proceed with a RCU lookup on the elected chain.
But, if we chose (port, address) chain, and fail to find a socket on given address,
we must try another lookup on (port, in6addr_any) chain to find sockets not bound
to a particular IP.
-> No extra cost for typical setups, where the first lookup will probabbly
be performed.
RCU lookups everywhere, we dont acquire spinlock.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-08 18:18:30 +08:00
|
|
|
unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
|
|
|
|
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
|
2017-01-27 02:02:24 +08:00
|
|
|
bool exact_dif = udp6_lib_exact_dif_match(net, skb);
|
2017-11-30 22:39:34 +08:00
|
|
|
int score, badness;
|
2013-01-22 17:50:44 +08:00
|
|
|
u32 hash = 0;
|
2008-10-29 16:41:45 +08:00
|
|
|
|
ipv6: udp: optimize unicast RX path
We first locate the (local port) hash chain head
If few sockets are in this chain, we proceed with previous lookup algo.
If too many sockets are listed, we take a look at the secondary
(port, address) hash chain.
We choose the shortest chain and proceed with a RCU lookup on the elected chain.
But, if we chose (port, address) chain, and fail to find a socket on given address,
we must try another lookup on (port, in6addr_any) chain to find sockets not bound
to a particular IP.
-> No extra cost for typical setups, where the first lookup will probabbly
be performed.
RCU lookups everywhere, we dont acquire spinlock.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-08 18:18:30 +08:00
|
|
|
if (hslot->count > 10) {
|
2017-12-02 04:52:30 +08:00
|
|
|
hash2 = ipv6_portaddr_hash(net, daddr, hnum);
|
ipv6: udp: optimize unicast RX path
We first locate the (local port) hash chain head
If few sockets are in this chain, we proceed with previous lookup algo.
If too many sockets are listed, we take a look at the secondary
(port, address) hash chain.
We choose the shortest chain and proceed with a RCU lookup on the elected chain.
But, if we chose (port, address) chain, and fail to find a socket on given address,
we must try another lookup on (port, in6addr_any) chain to find sockets not bound
to a particular IP.
-> No extra cost for typical setups, where the first lookup will probabbly
be performed.
RCU lookups everywhere, we dont acquire spinlock.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-08 18:18:30 +08:00
|
|
|
slot2 = hash2 & udptable->mask;
|
|
|
|
hslot2 = &udptable->hash2[slot2];
|
|
|
|
if (hslot->count < hslot2->count)
|
|
|
|
goto begin;
|
|
|
|
|
|
|
|
result = udp6_lib_lookup2(net, saddr, sport,
|
2017-08-07 23:44:20 +08:00
|
|
|
daddr, hnum, dif, sdif, exact_dif,
|
udp reuseport: fix packet of same flow hashed to different socket
There is a corner case in which udp packets belonging to a same
flow are hashed to different socket when hslot->count changes from 10
to 11:
1) When hslot->count <= 10, __udp_lib_lookup() searches udp_table->hash,
and always passes 'daddr' to udp_ehashfn().
2) When hslot->count > 10, __udp_lib_lookup() searches udp_table->hash2,
but may pass 'INADDR_ANY' to udp_ehashfn() if the sockets are bound to
INADDR_ANY instead of some specific addr.
That means when hslot->count changes from 10 to 11, the hash calculated by
udp_ehashfn() is also changed, and the udp packets belonging to a same
flow will be hashed to different socket.
This is easily reproduced:
1) Create 10 udp sockets and bind all of them to 0.0.0.0:40000.
2) From the same host send udp packets to 127.0.0.1:40000, record the
socket index which receives the packets.
3) Create 1 more udp socket and bind it to 0.0.0.0:44096. The number 44096
is 40000 + UDP_HASH_SIZE(4096), this makes the new socket put into the
same hslot as the aforementioned 10 sockets, and makes the hslot->count
change from 10 to 11.
4) From the same host send udp packets to 127.0.0.1:40000, and the socket
index which receives the packets will be different from the one received
in step 2.
This should not happen as the socket bound to 0.0.0.0:44096 should not
change the behavior of the sockets bound to 0.0.0.0:40000.
It's the same case for IPv6, and this patch also fixes that.
Signed-off-by: Su, Xuemin <suxm@chinanetcenter.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-13 11:02:50 +08:00
|
|
|
hslot2, skb);
|
ipv6: udp: optimize unicast RX path
We first locate the (local port) hash chain head
If few sockets are in this chain, we proceed with previous lookup algo.
If too many sockets are listed, we take a look at the secondary
(port, address) hash chain.
We choose the shortest chain and proceed with a RCU lookup on the elected chain.
But, if we chose (port, address) chain, and fail to find a socket on given address,
we must try another lookup on (port, in6addr_any) chain to find sockets not bound
to a particular IP.
-> No extra cost for typical setups, where the first lookup will probabbly
be performed.
RCU lookups everywhere, we dont acquire spinlock.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-08 18:18:30 +08:00
|
|
|
if (!result) {
|
udp reuseport: fix packet of same flow hashed to different socket
There is a corner case in which udp packets belonging to a same
flow are hashed to different socket when hslot->count changes from 10
to 11:
1) When hslot->count <= 10, __udp_lib_lookup() searches udp_table->hash,
and always passes 'daddr' to udp_ehashfn().
2) When hslot->count > 10, __udp_lib_lookup() searches udp_table->hash2,
but may pass 'INADDR_ANY' to udp_ehashfn() if the sockets are bound to
INADDR_ANY instead of some specific addr.
That means when hslot->count changes from 10 to 11, the hash calculated by
udp_ehashfn() is also changed, and the udp packets belonging to a same
flow will be hashed to different socket.
This is easily reproduced:
1) Create 10 udp sockets and bind all of them to 0.0.0.0:40000.
2) From the same host send udp packets to 127.0.0.1:40000, record the
socket index which receives the packets.
3) Create 1 more udp socket and bind it to 0.0.0.0:44096. The number 44096
is 40000 + UDP_HASH_SIZE(4096), this makes the new socket put into the
same hslot as the aformentioned 10 sockets, and makes the hslot->count
change from 10 to 11.
4) From the same host send udp packets to 127.0.0.1:40000, and the socket
index which receives the packets will be different from the one received
in step 2.
This should not happen as the socket bound to 0.0.0.0:44096 should not
change the behavior of the sockets bound to 0.0.0.0:40000.
It's the same case for IPv6, and this patch also fixes that.
Signed-off-by: Su, Xuemin <suxm@chinanetcenter.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-13 11:02:50 +08:00
|
|
|
unsigned int old_slot2 = slot2;
|
2017-12-02 04:52:30 +08:00
|
|
|
hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
|
ipv6: udp: optimize unicast RX path
We first locate the (local port) hash chain head
If few sockets are in this chain, we proceed with previous lookup algo.
If too many sockets are listed, we take a look at the secondary
(port, address) hash chain.
We choose the shortest chain and proceed with a RCU lookup on the elected chain.
But, if we chose (port, address) chain, and fail to find a socket on given address,
we must try another lookup on (port, in6addr_any) chain to find sockets not bound
to a particular IP.
-> No extra cost for typical setups, where the first lookup will probabbly
be performed.
RCU lookups everywhere, we dont acquire spinlock.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-08 18:18:30 +08:00
|
|
|
slot2 = hash2 & udptable->mask;
|
udp reuseport: fix packet of same flow hashed to different socket
There is a corner case in which udp packets belonging to a same
flow are hashed to different socket when hslot->count changes from 10
to 11:
1) When hslot->count <= 10, __udp_lib_lookup() searches udp_table->hash,
and always passes 'daddr' to udp_ehashfn().
2) When hslot->count > 10, __udp_lib_lookup() searches udp_table->hash2,
but may pass 'INADDR_ANY' to udp_ehashfn() if the sockets are bound to
INADDR_ANY instead of some specific addr.
That means when hslot->count changes from 10 to 11, the hash calculated by
udp_ehashfn() is also changed, and the udp packets belonging to a same
flow will be hashed to different socket.
This is easily reproduced:
1) Create 10 udp sockets and bind all of them to 0.0.0.0:40000.
2) From the same host send udp packets to 127.0.0.1:40000, record the
socket index which receives the packets.
3) Create 1 more udp socket and bind it to 0.0.0.0:44096. The number 44096
is 40000 + UDP_HASH_SIZE(4096), this makes the new socket put into the
same hslot as the aformentioned 10 sockets, and makes the hslot->count
change from 10 to 11.
4) From the same host send udp packets to 127.0.0.1:40000, and the socket
index which receives the packets will be different from the one received
in step 2.
This should not happen as the socket bound to 0.0.0.0:44096 should not
change the behavior of the sockets bound to 0.0.0.0:40000.
It's the same case for IPv6, and this patch also fixes that.
Signed-off-by: Su, Xuemin <suxm@chinanetcenter.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-13 11:02:50 +08:00
|
|
|
/* avoid searching the same slot again. */
|
|
|
|
if (unlikely(slot2 == old_slot2))
|
|
|
|
return result;
|
|
|
|
|
ipv6: udp: optimize unicast RX path
We first locate the (local port) hash chain head
If few sockets are in this chain, we proceed with previous lookup algo.
If too many sockets are listed, we take a look at the secondary
(port, address) hash chain.
We choose the shortest chain and proceed with a RCU lookup on the elected chain.
But, if we chose (port, address) chain, and fail to find a socket on given address,
we must try another lookup on (port, in6addr_any) chain to find sockets not bound
to a particular IP.
-> No extra cost for typical setups, where the first lookup will probabbly
be performed.
RCU lookups everywhere, we dont acquire spinlock.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-08 18:18:30 +08:00
|
|
|
hslot2 = &udptable->hash2[slot2];
|
|
|
|
if (hslot->count < hslot2->count)
|
|
|
|
goto begin;
|
|
|
|
|
2010-04-08 12:56:48 +08:00
|
|
|
result = udp6_lib_lookup2(net, saddr, sport,
|
2017-08-07 23:44:20 +08:00
|
|
|
daddr, hnum, dif, sdif,
|
2017-01-27 02:02:24 +08:00
|
|
|
exact_dif, hslot2,
|
|
|
|
skb);
|
ipv6: udp: optimize unicast RX path
We first locate the (local port) hash chain head
If few sockets are in this chain, we proceed with previous lookup algo.
If too many sockets are listed, we take a look at the secondary
(port, address) hash chain.
We choose the shortest chain and proceed with a RCU lookup on the elected chain.
But, if we chose (port, address) chain, and fail to find a socket on given address,
we must try another lookup on (port, in6addr_any) chain to find sockets not bound
to a particular IP.
-> No extra cost for typical setups, where the first lookup will probabbly
be performed.
RCU lookups everywhere, we dont acquire spinlock.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-08 18:18:30 +08:00
|
|
|
}
|
2018-08-08 16:01:26 +08:00
|
|
|
if (unlikely(IS_ERR(result)))
|
|
|
|
return NULL;
|
ipv6: udp: optimize unicast RX path
We first locate the (local port) hash chain head
If few sockets are in this chain, we proceed with previous lookup algo.
If too many sockets are listed, we take a look at the secondary
(port, address) hash chain.
We choose the shortest chain and proceed with a RCU lookup on the elected chain.
But, if we chose (port, address) chain, and fail to find a socket on given address,
we must try another lookup on (port, in6addr_any) chain to find sockets not bound
to a particular IP.
-> No extra cost for typical setups, where the first lookup will probabbly
be performed.
RCU lookups everywhere, we dont acquire spinlock.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-08 18:18:30 +08:00
|
|
|
return result;
|
|
|
|
}
|
udp: RCU handling for Unicast packets.
Goals are :
1) Optimizing handling of incoming Unicast UDP frames, so that no memory
writes should happen in the fast path.
Note: Multicasts and broadcasts still will need to take a lock,
because doing a full lockless lookup in this case is difficult.
2) No expensive operations in the socket bind/unhash phases :
- No expensive synchronize_rcu() calls.
- No added rcu_head in socket structure, increasing memory needs,
but more important, forcing us to use call_rcu() calls,
that have the bad property of making sockets structure cold.
(rcu grace period between socket freeing and its potential reuse
make this socket being cold in CPU cache).
David did a previous patch using call_rcu() and noticed a 20%
impact on TCP connection rates.
Quoting Cristopher Lameter :
"Right. That results in cacheline cooldown. You'd want to recycle
the object as they are cache hot on a per cpu basis. That is screwed
up by the delayed regular rcu processing. We have seen multiple
regressions due to cacheline cooldown.
The only choice in cacheline hot sensitive areas is to deal with the
complexity that comes with SLAB_DESTROY_BY_RCU or give up on RCU."
- Because udp sockets are allocated from dedicated kmem_cache,
use of SLAB_DESTROY_BY_RCU can help here.
Theory of operation :
---------------------
As the lookup is lockfree (using rcu_read_lock()/rcu_read_unlock()),
special attention must be taken by readers and writers.
Use of SLAB_DESTROY_BY_RCU is tricky too, because a socket can be freed,
reused, inserted in a different chain or in worst case in the same chain
while readers could do lookups in the same time.
In order to avoid loops, a reader must check each socket found in a chain
really belongs to the chain the reader was traversing. If it finds a
mismatch, lookup must start again at the begining. This *restart* loop
is the reason we had to use rdlock for the multicast case, because
we dont want to send same message several times to the same socket.
We use RCU only for fast path.
Thus, /proc/net/udp still takes spinlocks.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-10-29 17:11:14 +08:00
|
|
|
begin:
|
|
|
|
result = NULL;
|
|
|
|
badness = -1;
|
2016-04-01 23:52:13 +08:00
|
|
|
sk_for_each_rcu(sk, &hslot->head) {
|
2017-01-27 02:02:24 +08:00
|
|
|
score = compute_score(sk, net, saddr, sport, daddr, hnum, dif,
|
2017-08-07 23:44:20 +08:00
|
|
|
sdif, exact_dif);
|
2008-10-29 16:41:45 +08:00
|
|
|
if (score > badness) {
|
2017-11-30 22:39:34 +08:00
|
|
|
if (sk->sk_reuseport) {
|
2013-10-20 03:48:52 +08:00
|
|
|
hash = udp6_ehashfn(net, daddr, hnum,
|
|
|
|
saddr, sport);
|
2016-04-01 23:52:13 +08:00
|
|
|
result = reuseport_select_sock(sk, hash, skb,
|
2016-01-05 06:41:47 +08:00
|
|
|
sizeof(struct udphdr));
|
2018-08-08 16:01:26 +08:00
|
|
|
if (unlikely(IS_ERR(result)))
|
|
|
|
return NULL;
|
2016-04-01 23:52:13 +08:00
|
|
|
if (result)
|
|
|
|
return result;
|
2013-01-22 17:50:44 +08:00
|
|
|
}
|
2016-04-01 23:52:13 +08:00
|
|
|
result = sk;
|
|
|
|
badness = score;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
2011-12-09 14:23:34 +08:00
|
|
|
EXPORT_SYMBOL_GPL(__udp6_lib_lookup);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-10-08 03:38:32 +08:00
|
|
|
/*
 * Look up a UDPv6 socket for @skb in @udptable.
 *
 * The source/destination addresses come from the skb's IPv6 header, the
 * ports are supplied by the caller, the net namespace is that of skb->dev,
 * and the interface indices are the ones reported by inet6_iif() and
 * inet6_sdif().  The result of __udp6_lib_lookup() is returned unchanged.
 */
static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
					  __be16 sport, __be16 dport,
					  struct udp_table *udptable)
{
	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int iif = inet6_iif(skb);
	int sdif = inet6_sdif(skb);

	return __udp6_lib_lookup(net, &ip6h->saddr, sport,
				 &ip6h->daddr, dport, iif, sdif,
				 udptable, skb);
}
|
|
|
|
|
2016-04-05 23:22:50 +08:00
|
|
|
/*
 * Exported variant of the skb-based lookup: always searches the global
 * udp_table.  Addresses are taken from the skb's IPv6 header, ports from
 * the caller; the result of __udp6_lib_lookup() is returned unchanged.
 */
struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
				 __be16 sport, __be16 dport)
{
	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int iif = inet6_iif(skb);
	int sdif = inet6_sdif(skb);

	return __udp6_lib_lookup(net, &ip6h->saddr, sport,
				 &ip6h->daddr, dport, iif, sdif,
				 &udp_table, skb);
}
EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb);
|
|
|
|
|
2016-04-01 23:52:13 +08:00
|
|
|
/*
 * Caller must hold rcu_read_lock().
 *
 * Looks the socket up in the global udp_table (sdif 0, no skb) and, when one
 * is found, takes a reference on it with refcount_inc_not_zero().  Returns
 * NULL when nothing matches or when the reference could not be taken.
 */
#if IS_ENABLED(CONFIG_NF_TPROXY_IPV6) || IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
			     const struct in6_addr *daddr, __be16 dport, int dif)
{
	struct sock *found = __udp6_lib_lookup(net, saddr, sport, daddr,
					       dport, dif, 0, &udp_table,
					       NULL);

	if (!found)
		return NULL;

	/* The refcount already dropped to zero: treat it as not found. */
	if (!refcount_inc_not_zero(&found->sk_refcnt))
		return NULL;

	return found;
}
EXPORT_SYMBOL_GPL(udp6_lib_lookup);
#endif
|
2010-10-21 22:05:41 +08:00
|
|
|
|
2017-07-31 22:52:36 +08:00
|
|
|
/* do not use the scratch area len for jumbogram: their length execeeds the
|
|
|
|
* scratch area space; note that the IP6CB flags is still in the first
|
|
|
|
* cacheline, so checking for jumbograms is cheap
|
|
|
|
*/
|
|
|
|
static int udp6_skb_len(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
return unlikely(inet6_is_jumbogram(skb)) ? skb->len : udp_skb_len(skb);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2014-08-25 04:53:10 +08:00
|
|
|
* This should be easy, if there is something there we
|
|
|
|
* return it, otherwise we block.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
|
2015-03-02 15:37:48 +08:00
|
|
|
int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
|
2005-04-17 06:20:36 +08:00
|
|
|
int noblock, int flags, int *addr_len)
|
|
|
|
{
|
|
|
|
struct ipv6_pinfo *np = inet6_sk(sk);
|
|
|
|
struct inet_sock *inet = inet_sk(sk);
|
2007-02-09 22:24:49 +08:00
|
|
|
struct sk_buff *skb;
|
2011-12-02 03:12:55 +08:00
|
|
|
unsigned int ulen, copied;
|
2016-04-06 00:41:16 +08:00
|
|
|
int peeked, peeking, off;
|
2007-03-26 11:10:56 +08:00
|
|
|
int err;
|
|
|
|
int is_udplite = IS_UDPLITE(sk);
|
udp: properly support MSG_PEEK with truncated buffers
Backport of this upstream commit into stable kernels :
89c22d8c3b27 ("net: Fix skb csum races when peeking")
exposed a bug in udp stack vs MSG_PEEK support, when user provides
a buffer smaller than skb payload.
In this case,
skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr),
msg->msg_iov);
returns -EFAULT.
This bug does not happen in upstream kernels since Al Viro did a great
job to replace this into :
skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr), msg);
This variant is safe vs short buffers.
For the time being, instead reverting Herbert Xu patch and add back
skb->ip_summed invalid changes, simply store the result of
udp_lib_checksum_complete() so that we avoid computing the checksum a
second time, and avoid the problematic
skb_copy_and_csum_datagram_iovec() call.
This patch can be applied on recent kernels as it avoids a double
checksumming, then backported to stable kernels as a bug fix.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-12-30 21:51:12 +08:00
|
|
|
bool checksum_valid = false;
|
2008-11-03 00:11:01 +08:00
|
|
|
int is_udp4;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (flags & MSG_ERRQUEUE)
|
2013-11-23 07:46:12 +08:00
|
|
|
return ipv6_recv_error(sk, msg, len, addr_len);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-04-23 19:26:09 +08:00
|
|
|
if (np->rxpmtu && np->rxopt.bits.rxpmtu)
|
2013-11-23 07:46:12 +08:00
|
|
|
return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
|
2010-04-23 19:26:09 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
try_again:
|
datagram: When peeking datagrams with offset < 0 don't skip empty skbs
Due to commit e6afc8ace6dd5cef5e812f26c72579da8806f5ac ("udp: remove
headers from UDP packets before queueing"), when udp packets are being
peeked the requested extra offset is always 0 as there is no need to skip
the udp header. However, when the offset is 0 and the next skb is
of length 0, it is only returned once. The behaviour can be seen with
the following python script:
from socket import *;
f=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0);
g=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0);
f.bind(('::', 0));
addr=('::1', f.getsockname()[1]);
g.sendto(b'', addr)
g.sendto(b'b', addr)
print(f.recvfrom(10, MSG_PEEK));
print(f.recvfrom(10, MSG_PEEK));
Where the expected output should be the empty string twice.
Instead, make sk_peek_offset return negative values, and pass those values
to __skb_try_recv_datagram/__skb_try_recv_from_queue. If the passed offset
to __skb_try_recv_from_queue is negative, the checked skb is never skipped.
__skb_try_recv_from_queue will then ensure the offset is reset back to 0
if a peek is requested without an offset, unless no packets are found.
Also simplify the if condition in __skb_try_recv_from_queue. If _off is
greater then 0, and off is greater then or equal to skb->len, then
(_off || skb->len) must always be true assuming skb->len >= 0 is always
true.
Also remove a redundant check around a call to sk_peek_offset in af_unix.c,
as it double checked if MSG_PEEK was set in the flags.
V2:
- Moved the negative fixup into __skb_try_recv_from_queue, and remove now
redundant checks
- Fix peeking in udp{,v6}_recvmsg to report the right value when the
offset is 0
V3:
- Marked new branch in __skb_try_recv_from_queue as unlikely.
Signed-off-by: Matthew Dawson <matthew@mjdsystems.ca>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-08-19 03:04:54 +08:00
|
|
|
peeking = flags & MSG_PEEK;
|
|
|
|
off = sk_peek_offset(sk, flags);
|
2016-11-04 18:28:59 +08:00
|
|
|
skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!skb)
|
2016-04-06 00:41:16 +08:00
|
|
|
return err;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2017-07-31 22:52:36 +08:00
|
|
|
ulen = udp6_skb_len(skb);
|
2011-12-02 03:12:55 +08:00
|
|
|
copied = len;
|
2016-04-06 00:41:16 +08:00
|
|
|
if (copied > ulen - off)
|
|
|
|
copied = ulen - off;
|
2011-12-02 03:12:55 +08:00
|
|
|
else if (copied < ulen)
|
2007-02-09 22:24:49 +08:00
|
|
|
msg->msg_flags |= MSG_TRUNC;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-11-03 00:11:01 +08:00
|
|
|
is_udp4 = (skb->protocol == htons(ETH_P_IP));
|
|
|
|
|
2006-11-28 03:10:57 +08:00
|
|
|
/*
|
2007-03-26 11:10:56 +08:00
|
|
|
* If checksum is needed at all, try to do it while copying the
|
|
|
|
* data. If the data is truncated, or if we only want a partial
|
|
|
|
* coverage checksum (UDP-Lite), do it before the copy.
|
2006-11-28 03:10:57 +08:00
|
|
|
*/
|
|
|
|
|
2016-11-19 09:18:03 +08:00
|
|
|
if (copied < ulen || peeking ||
|
|
|
|
(is_udplite && UDP_SKB_CB(skb)->partial_cov)) {
|
2017-06-27 01:01:51 +08:00
|
|
|
checksum_valid = udp_skb_csum_unnecessary(skb) ||
|
|
|
|
!__udp_lib_checksum_complete(skb);
|
udp: properly support MSG_PEEK with truncated buffers
Backport of this upstream commit into stable kernels :
89c22d8c3b27 ("net: Fix skb csum races when peeking")
exposed a bug in udp stack vs MSG_PEEK support, when user provides
a buffer smaller than skb payload.
In this case,
skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr),
msg->msg_iov);
returns -EFAULT.
This bug does not happen in upstream kernels since Al Viro did a great
job to replace this into :
skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr), msg);
This variant is safe vs short buffers.
For the time being, instead reverting Herbert Xu patch and add back
skb->ip_summed invalid changes, simply store the result of
udp_lib_checksum_complete() so that we avoid computing the checksum a
second time, and avoid the problematic
skb_copy_and_csum_datagram_iovec() call.
This patch can be applied on recent kernels as it avoids a double
checksumming, then backported to stable kernels as a bug fix.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-12-30 21:51:12 +08:00
|
|
|
if (!checksum_valid)
|
2005-04-17 06:20:36 +08:00
|
|
|
goto csum_copy_err;
|
2006-11-28 03:10:57 +08:00
|
|
|
}
|
|
|
|
|
2017-06-27 01:01:51 +08:00
|
|
|
if (checksum_valid || udp_skb_csum_unnecessary(skb)) {
|
|
|
|
if (udp_skb_is_linear(skb))
|
|
|
|
err = copy_linear_skb(skb, copied, off, &msg->msg_iter);
|
|
|
|
else
|
|
|
|
err = skb_copy_datagram_msg(skb, off, msg, copied);
|
|
|
|
} else {
|
2016-04-06 00:41:16 +08:00
|
|
|
err = skb_copy_and_csum_datagram_msg(skb, off, msg);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (err == -EINVAL)
|
|
|
|
goto csum_copy_err;
|
|
|
|
}
|
2012-06-27 08:23:44 +08:00
|
|
|
if (unlikely(err)) {
|
2012-09-06 07:34:44 +08:00
|
|
|
if (!peeked) {
|
|
|
|
atomic_inc(&sk->sk_drops);
|
|
|
|
if (is_udp4)
|
net: snmp: kill various STATS_USER() helpers
In the old days (before linux-3.0), SNMP counters were duplicated,
one for user context, and one for BH context.
After commit 8f0ea0fe3a03 ("snmp: reduce percpu needs by 50%")
we have a single copy, and what really matters is preemption being
enabled or disabled, since we use this_cpu_inc() or __this_cpu_inc()
respectively.
We therefore kill SNMP_INC_STATS_USER(), SNMP_ADD_STATS_USER(),
NET_INC_STATS_USER(), NET_ADD_STATS_USER(), SCTP_INC_STATS_USER(),
SNMP_INC_STATS64_USER(), SNMP_ADD_STATS64_USER(), TCP_ADD_STATS_USER(),
UDP_INC_STATS_USER(), UDP6_INC_STATS_USER(), and XFRM_INC_STATS_USER()
Following patches will rename __BH helpers to make clear their
usage is not tied to BH being disabled.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-28 07:44:27 +08:00
|
|
|
UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
|
|
|
|
is_udplite);
|
2012-09-06 07:34:44 +08:00
|
|
|
else
|
net: snmp: kill various STATS_USER() helpers
In the old days (before linux-3.0), SNMP counters were duplicated,
one for user context, and one for BH context.
After commit 8f0ea0fe3a03 ("snmp: reduce percpu needs by 50%")
we have a single copy, and what really matters is preemption being
enabled or disabled, since we use this_cpu_inc() or __this_cpu_inc()
respectively.
We therefore kill SNMP_INC_STATS_USER(), SNMP_ADD_STATS_USER(),
NET_INC_STATS_USER(), NET_ADD_STATS_USER(), SCTP_INC_STATS_USER(),
SNMP_INC_STATS64_USER(), SNMP_ADD_STATS64_USER(), TCP_ADD_STATS_USER(),
UDP_INC_STATS_USER(), UDP6_INC_STATS_USER(), and XFRM_INC_STATS_USER()
Following patches will rename __BH helpers to make clear their
usage is not tied to BH being disabled.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-28 07:44:27 +08:00
|
|
|
UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
|
|
|
|
is_udplite);
|
2012-09-06 07:34:44 +08:00
|
|
|
}
|
2016-10-21 19:55:47 +08:00
|
|
|
kfree_skb(skb);
|
2016-04-06 00:41:16 +08:00
|
|
|
return err;
|
2012-06-27 08:23:44 +08:00
|
|
|
}
|
2008-11-03 00:11:01 +08:00
|
|
|
if (!peeked) {
|
|
|
|
if (is_udp4)
|
net: snmp: kill various STATS_USER() helpers
In the old days (before linux-3.0), SNMP counters were duplicated,
one for user context, and one for BH context.
After commit 8f0ea0fe3a03 ("snmp: reduce percpu needs by 50%")
we have a single copy, and what really matters is preemption being
enabled or disabled, since we use this_cpu_inc() or __this_cpu_inc()
respectively.
We therefore kill SNMP_INC_STATS_USER(), SNMP_ADD_STATS_USER(),
NET_INC_STATS_USER(), NET_ADD_STATS_USER(), SCTP_INC_STATS_USER(),
SNMP_INC_STATS64_USER(), SNMP_ADD_STATS64_USER(), TCP_ADD_STATS_USER(),
UDP_INC_STATS_USER(), UDP6_INC_STATS_USER(), and XFRM_INC_STATS_USER()
Following patches will rename __BH helpers to make clear their
usage is not tied to BH being disabled.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-28 07:44:27 +08:00
|
|
|
UDP_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS,
|
|
|
|
is_udplite);
|
2008-11-03 00:11:01 +08:00
|
|
|
else
|
net: snmp: kill various STATS_USER() helpers
In the old days (before linux-3.0), SNMP counters were duplicated,
one for user context, and one for BH context.
After commit 8f0ea0fe3a03 ("snmp: reduce percpu needs by 50%")
we have a single copy, and what really matters is preemption being
enabled or disabled, since we use this_cpu_inc() or __this_cpu_inc()
respectively.
We therefore kill SNMP_INC_STATS_USER(), SNMP_ADD_STATS_USER(),
NET_INC_STATS_USER(), NET_ADD_STATS_USER(), SCTP_INC_STATS_USER(),
SNMP_INC_STATS64_USER(), SNMP_ADD_STATS64_USER(), TCP_ADD_STATS_USER(),
UDP_INC_STATS_USER(), UDP6_INC_STATS_USER(), and XFRM_INC_STATS_USER()
Following patches will rename __BH helpers to make clear their
usage is not tied to BH being disabled.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-28 07:44:27 +08:00
|
|
|
UDP6_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS,
|
|
|
|
is_udplite);
|
2008-11-03 00:11:01 +08:00
|
|
|
}
|
2007-12-03 19:33:28 +08:00
|
|
|
|
net: Generalize socket rx gap / receive queue overflow cmsg
Create a new socket level option to report number of queue overflows
Recently I augmented the AF_PACKET protocol to report the number of frames lost
on the socket receive queue between any two enqueued frames. This value was
exported via a SOL_PACKET level cmsg. After I completed that work it was
requested that this feature be generalized so that any datagram oriented socket
could make use of this option. As such I've created this patch, It creates a
new SOL_SOCKET level option called SO_RXQ_OVFL, which when enabled exports a
SOL_SOCKET level cmsg that reports the number of times the sk_receive_queue
overflowed between any two given frames. It also augments the AF_PACKET
protocol to take advantage of this new feature (as it previously did not touch
sk->sk_drops, which this patch uses to record the overflow count). Tested
successfully by me.
Notes:
1) Unlike my previous patch, this patch simply records the sk_drops value, which
is not a number of drops between packets, but rather a total number of drops.
Deltas must be computed in user space.
2) While this patch currently works with datagram oriented protocols, it will
also be accepted by non-datagram oriented protocols. I'm not sure if that's
agreeable to everyone, but my argument in favor of doing so is that, for those
protocols which aren't applicable to this option, sk_drops will always be zero,
and reporting no drops on a receive queue that isn't used for those
non-participating protocols seems reasonable to me. This also saves us having
to code in a per-protocol opt in mechanism.
3) This applies cleanly to net-next assuming that commit
977750076d98c7ff6cbda51858bb5a5894a9d9ab (my af packet cmsg patch) is reverted
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-10-13 04:26:31 +08:00
|
|
|
sock_recv_ts_and_drops(msg, sk, skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Copy the address. */
|
|
|
|
if (msg->msg_name) {
|
2014-01-18 05:53:15 +08:00
|
|
|
DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
|
2005-04-17 06:20:36 +08:00
|
|
|
sin6->sin6_family = AF_INET6;
|
2007-03-14 01:28:48 +08:00
|
|
|
sin6->sin6_port = udp_hdr(skb)->source;
|
2005-04-17 06:20:36 +08:00
|
|
|
sin6->sin6_flowinfo = 0;
|
|
|
|
|
2013-03-08 10:07:19 +08:00
|
|
|
if (is_udp4) {
|
2009-10-08 04:58:25 +08:00
|
|
|
ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr,
|
|
|
|
&sin6->sin6_addr);
|
2013-03-08 10:07:19 +08:00
|
|
|
sin6->sin6_scope_id = 0;
|
|
|
|
} else {
|
2011-11-21 11:39:03 +08:00
|
|
|
sin6->sin6_addr = ipv6_hdr(skb)->saddr;
|
2013-03-08 10:07:19 +08:00
|
|
|
sin6->sin6_scope_id =
|
|
|
|
ipv6_iface_scope_id(&sin6->sin6_addr,
|
2014-08-01 09:52:58 +08:00
|
|
|
inet6_iif(skb));
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2013-11-18 11:20:45 +08:00
|
|
|
*addr_len = sizeof(*sin6);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2014-01-20 10:43:08 +08:00
|
|
|
|
|
|
|
if (np->rxopt.all)
|
|
|
|
ip6_datagram_recv_common_ctl(sk, msg, skb);
|
|
|
|
|
2008-11-03 00:11:01 +08:00
|
|
|
if (is_udp4) {
|
2005-04-17 06:20:36 +08:00
|
|
|
if (inet->cmsg_flags)
|
2016-11-04 18:28:58 +08:00
|
|
|
ip_cmsg_recv_offset(msg, sk, skb,
|
2016-10-24 09:03:06 +08:00
|
|
|
sizeof(struct udphdr), off);
|
2005-04-17 06:20:36 +08:00
|
|
|
} else {
|
|
|
|
if (np->rxopt.all)
|
2014-01-20 10:43:08 +08:00
|
|
|
ip6_datagram_recv_specific_ctl(sk, msg, skb);
|
2007-02-09 22:24:49 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-12-02 03:12:55 +08:00
|
|
|
err = copied;
|
2005-04-17 06:20:36 +08:00
|
|
|
if (flags & MSG_TRUNC)
|
2007-03-26 11:10:56 +08:00
|
|
|
err = ulen;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2016-10-21 19:55:47 +08:00
|
|
|
skb_consume_udp(sk, skb, peeking ? -err : err);
|
2005-04-17 06:20:36 +08:00
|
|
|
return err;
|
|
|
|
|
|
|
|
csum_copy_err:
|
2017-05-16 17:20:14 +08:00
|
|
|
if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags,
|
|
|
|
udp_skb_destructor)) {
|
2013-04-29 16:39:56 +08:00
|
|
|
if (is_udp4) {
|
net: snmp: kill various STATS_USER() helpers
In the old days (before linux-3.0), SNMP counters were duplicated,
one for user context, and one for BH context.
After commit 8f0ea0fe3a03 ("snmp: reduce percpu needs by 50%")
we have a single copy, and what really matters is preemption being
enabled or disabled, since we use this_cpu_inc() or __this_cpu_inc()
respectively.
We therefore kill SNMP_INC_STATS_USER(), SNMP_ADD_STATS_USER(),
NET_INC_STATS_USER(), NET_ADD_STATS_USER(), SCTP_INC_STATS_USER(),
SNMP_INC_STATS64_USER(), SNMP_ADD_STATS64_USER(), TCP_ADD_STATS_USER(),
UDP_INC_STATS_USER(), UDP6_INC_STATS_USER(), and XFRM_INC_STATS_USER()
Following patches will rename __BH helpers to make clear their
usage is not tied to BH being disabled.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-28 07:44:27 +08:00
|
|
|
UDP_INC_STATS(sock_net(sk),
|
|
|
|
UDP_MIB_CSUMERRORS, is_udplite);
|
|
|
|
UDP_INC_STATS(sock_net(sk),
|
|
|
|
UDP_MIB_INERRORS, is_udplite);
|
2013-04-29 16:39:56 +08:00
|
|
|
} else {
|
net: snmp: kill various STATS_USER() helpers
In the old days (before linux-3.0), SNMP counters were duplicated,
one for user context, and one for BH context.
After commit 8f0ea0fe3a03 ("snmp: reduce percpu needs by 50%")
we have a single copy, and what really matters is preemption being
enabled or disabled, since we use this_cpu_inc() or __this_cpu_inc()
respectively.
We therefore kill SNMP_INC_STATS_USER(), SNMP_ADD_STATS_USER(),
NET_INC_STATS_USER(), NET_ADD_STATS_USER(), SCTP_INC_STATS_USER(),
SNMP_INC_STATS64_USER(), SNMP_ADD_STATS64_USER(), TCP_ADD_STATS_USER(),
UDP_INC_STATS_USER(), UDP6_INC_STATS_USER(), and XFRM_INC_STATS_USER()
Following patches will rename __BH helpers to make clear their
usage is not tied to BH being disabled.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-28 07:44:27 +08:00
|
|
|
UDP6_INC_STATS(sock_net(sk),
|
|
|
|
UDP_MIB_CSUMERRORS, is_udplite);
|
|
|
|
UDP6_INC_STATS(sock_net(sk),
|
|
|
|
UDP_MIB_INERRORS, is_udplite);
|
2013-04-29 16:39:56 +08:00
|
|
|
}
|
2008-11-03 00:14:27 +08:00
|
|
|
}
|
2016-10-21 19:55:47 +08:00
|
|
|
kfree_skb(skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2015-05-31 00:16:53 +08:00
|
|
|
/* starting over for a new packet, but check if we need to yield */
|
|
|
|
cond_resched();
|
2011-06-21 18:43:40 +08:00
|
|
|
msg->msg_flags &= ~MSG_TRUNC;
|
2005-04-17 06:20:36 +08:00
|
|
|
goto try_again;
|
|
|
|
}
|
|
|
|
|
2006-11-28 03:10:57 +08:00
|
|
|
/*
 * Deliver an ICMPv6 error to the UDP socket it concerns.
 *
 * @skb:      error packet; skb->data is read as the IPv6 header of the
 *            offending datagram and skb->data + @offset as its UDP header
 * @opt:      not used by this function
 * @type:     ICMPv6 type
 * @code:     ICMPv6 code
 * @offset:   byte offset of the UDP header from skb->data
 * @info:     ICMPv6 info word (network byte order)
 * @udptable: hash table to search for the socket
 *
 * The socket is looked up with the embedded header's destination/source
 * swapped into the local/remote roles the way the code below passes them.
 * If no socket is found only ICMP6_MIB_INERRORS is bumped.
 */
void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
		    u8 type, u8 code, int offset, __be32 info,
		    struct udp_table *udptable)
{
	const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data;
	struct udphdr *uh = (struct udphdr *)(skb->data + offset);
	struct net *net = dev_net(skb->dev);
	struct ipv6_pinfo *np;
	struct sock *sk;
	int harderr;
	int err;

	sk = __udp6_lib_lookup(net, &ip6h->daddr, uh->dest,
			       &ip6h->saddr, uh->source,
			       inet6_iif(skb), 0, udptable, skb);
	if (!sk) {
		__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
				  ICMP6_MIB_INERRORS);
		return;
	}

	harderr = icmpv6_err_convert(type, code, &err);
	np = inet6_sk(sk);

	if (type == ICMPV6_PKT_TOOBIG) {
		if (!ip6_sk_accept_pmtu(sk))
			return;

		ip6_sk_update_pmtu(skb, sk, info);
		/* Treated as a hard error unless PMTU discovery is "dont" */
		if (np->pmtudisc != IPV6_PMTUDISC_DONT)
			harderr = 1;
	}

	if (type == NDISC_REDIRECT) {
		/* Redirects update routing only; nothing is reported */
		ip6_sk_redirect(skb, sk);
		return;
	}

	if (np->recverr) {
		/* Payload pointer is the first byte after the UDP header */
		ipv6_icmp_error(sk, skb, err, uh->dest, ntohl(info),
				(u8 *)(uh + 1));
	} else if (!harderr || sk->sk_state != TCP_ESTABLISHED) {
		/* Without recverr, only hard errors on established sockets count */
		return;
	}

	sk->sk_err = err;
	sk->sk_error_report(sk);
}
|
|
|
|
|
2017-05-17 20:52:16 +08:00
|
|
|
static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
|
2012-04-27 16:23:21 +08:00
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 06:42:29 +08:00
|
|
|
if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
|
2012-04-27 16:23:21 +08:00
|
|
|
sock_rps_save_rxhash(sk, skb);
|
2013-10-08 00:01:38 +08:00
|
|
|
sk_mark_napi_id(sk, skb);
|
net: introduce SO_INCOMING_CPU
Alternative to RPS/RFS is to use hardware support for multiple
queues.
Then split a set of million of sockets into worker threads, each
one using epoll() to manage events on its own socket pool.
Ideally, we want one thread per RX/TX queue/cpu, but we have no way to
know after accept() or connect() on which queue/cpu a socket is managed.
We normally use one cpu per RX queue (IRQ smp_affinity being properly
set), so remembering on socket structure which cpu delivered last packet
is enough to solve the problem.
After accept(), connect(), or even file descriptor passing around
processes, applications can use :
int cpu;
socklen_t len = sizeof(cpu);
getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len);
And use this information to put the socket into the right silo
for optimal performance, as all networking stack should run
on the appropriate cpu, without need to send IPI (RPS/RFS).
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-11-11 21:54:28 +08:00
|
|
|
sk_incoming_cpu_update(sk);
|
2016-11-17 01:10:42 +08:00
|
|
|
} else {
|
|
|
|
sk_mark_napi_id_once(sk, skb);
|
2013-10-08 00:01:38 +08:00
|
|
|
}
|
2012-04-27 16:23:21 +08:00
|
|
|
|
2016-10-21 19:55:47 +08:00
|
|
|
rc = __udp_enqueue_schedule_skb(sk, skb);
|
2012-04-27 16:23:21 +08:00
|
|
|
if (rc < 0) {
|
|
|
|
int is_udplite = IS_UDPLITE(sk);
|
|
|
|
|
|
|
|
/* Note that an ENOMEM error is charged twice */
|
|
|
|
if (rc == -ENOMEM)
|
2016-04-30 05:16:50 +08:00
|
|
|
UDP6_INC_STATS(sock_net(sk),
|
2016-04-28 07:44:30 +08:00
|
|
|
UDP_MIB_RCVBUFERRORS, is_udplite);
|
2016-04-30 05:16:50 +08:00
|
|
|
UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
|
2012-04-27 16:23:21 +08:00
|
|
|
kfree_skb(skb);
|
|
|
|
return -1;
|
|
|
|
}
|
2016-10-21 19:55:47 +08:00
|
|
|
|
2012-04-27 16:23:21 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2006-11-28 03:10:57 +08:00
|
|
|
/*
 * ICMPv6 error entry point for plain UDP: forwards every argument to
 * __udp6_lib_err() and selects the global udp_table for the lookup.
 */
static __inline__ void udpv6_err(struct sk_buff *skb,
				 struct inet6_skb_parm *opt, u8 type,
				 u8 code, int offset, __be32 info)
{
	struct udp_table *table = &udp_table;

	__udp6_lib_err(skb, opt, type, code, offset, info, table);
}
|
|
|
|
|
2018-05-09 00:07:03 +08:00
|
|
|
static DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
|
2012-04-27 16:24:08 +08:00
|
|
|
void udpv6_encap_enable(void)
|
|
|
|
{
|
2018-05-09 00:07:03 +08:00
|
|
|
static_branch_enable(&udpv6_encap_needed_key);
|
2012-04-27 16:24:08 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(udpv6_encap_enable);
|
|
|
|
|
2017-05-17 20:52:16 +08:00
|
|
|
static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2006-11-28 03:10:57 +08:00
|
|
|
struct udp_sock *up = udp_sk(sk);
|
2007-12-03 19:34:16 +08:00
|
|
|
int is_udplite = IS_UDPLITE(sk);
|
2006-08-15 15:00:09 +08:00
|
|
|
|
2006-11-28 03:10:57 +08:00
|
|
|
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
|
|
|
|
goto drop;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2018-05-09 00:07:03 +08:00
|
|
|
if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) {
|
2012-04-27 16:24:08 +08:00
|
|
|
int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is an encapsulation socket so pass the skb to
|
|
|
|
* the socket's udp_encap_rcv() hook. Otherwise, just
|
|
|
|
* fall through and pass this up the UDP socket.
|
|
|
|
* up->encap_rcv() returns the following value:
|
|
|
|
* =0 if skb was successfully passed to the encap
|
|
|
|
* handler or was discarded by it.
|
|
|
|
* >0 if skb should be passed on to UDP.
|
|
|
|
* <0 if skb should be resubmitted as proto -N
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* if we're overly short, let UDP handle it */
|
locking/atomics: COCCINELLE/treewide: Convert trivial ACCESS_ONCE() patterns to READ_ONCE()/WRITE_ONCE()
Please do not apply this to mainline directly, instead please re-run the
coccinelle script shown below and apply its output.
For several reasons, it is desirable to use {READ,WRITE}_ONCE() in
preference to ACCESS_ONCE(), and new code is expected to use one of the
former. So far, there's been no reason to change most existing uses of
ACCESS_ONCE(), as these aren't harmful, and changing them results in
churn.
However, for some features, the read/write distinction is critical to
correct operation. To distinguish these cases, separate read/write
accessors must be used. This patch migrates (most) remaining
ACCESS_ONCE() instances to {READ,WRITE}_ONCE(), using the following
coccinelle script:
----
// Convert trivial ACCESS_ONCE() uses to equivalent READ_ONCE() and
// WRITE_ONCE()
// $ make coccicheck COCCI=/home/mark/once.cocci SPFLAGS="--include-headers" MODE=patch
virtual patch
@ depends on patch @
expression E1, E2;
@@
- ACCESS_ONCE(E1) = E2
+ WRITE_ONCE(E1, E2)
@ depends on patch @
expression E;
@@
- ACCESS_ONCE(E)
+ READ_ONCE(E)
----
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: davem@davemloft.net
Cc: linux-arch@vger.kernel.org
Cc: mpe@ellerman.id.au
Cc: shuah@kernel.org
Cc: snitzer@redhat.com
Cc: thor.thayer@linux.intel.com
Cc: tj@kernel.org
Cc: viro@zeniv.linux.org.uk
Cc: will.deacon@arm.com
Link: http://lkml.kernel.org/r/1508792849-3115-19-git-send-email-paulmck@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-24 05:07:29 +08:00
|
|
|
encap_rcv = READ_ONCE(up->encap_rcv);
|
2016-05-19 21:58:33 +08:00
|
|
|
if (encap_rcv) {
|
2012-04-27 16:24:08 +08:00
|
|
|
int ret;
|
|
|
|
|
2014-05-08 07:52:39 +08:00
|
|
|
/* Verify checksum before giving to encap */
|
|
|
|
if (udp_lib_checksum_complete(skb))
|
|
|
|
goto csum_error;
|
|
|
|
|
2012-04-27 16:24:08 +08:00
|
|
|
ret = encap_rcv(sk, skb);
|
|
|
|
if (ret <= 0) {
|
2016-04-28 07:44:30 +08:00
|
|
|
__UDP_INC_STATS(sock_net(sk),
|
|
|
|
UDP_MIB_INDATAGRAMS,
|
|
|
|
is_udplite);
|
2012-04-27 16:24:08 +08:00
|
|
|
return -ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* FALLTHROUGH -- it's a UDP Packet */
|
|
|
|
}
|
|
|
|
|
2006-11-28 03:10:57 +08:00
|
|
|
/*
|
|
|
|
* UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c).
|
|
|
|
*/
|
2007-12-03 19:34:16 +08:00
|
|
|
if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {
|
2006-11-28 03:10:57 +08:00
|
|
|
|
|
|
|
if (up->pcrlen == 0) { /* full coverage was set */
|
2014-11-12 02:59:17 +08:00
|
|
|
net_dbg_ratelimited("UDPLITE6: partial coverage %d while full coverage %d requested\n",
|
|
|
|
UDP_SKB_CB(skb)->cscov, skb->len);
|
2006-11-28 03:10:57 +08:00
|
|
|
goto drop;
|
|
|
|
}
|
|
|
|
if (UDP_SKB_CB(skb)->cscov < up->pcrlen) {
|
2014-11-12 02:59:17 +08:00
|
|
|
net_dbg_ratelimited("UDPLITE6: coverage %d too small, need min %d\n",
|
|
|
|
UDP_SKB_CB(skb)->cscov, up->pcrlen);
|
2006-11-28 03:10:57 +08:00
|
|
|
goto drop;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2017-06-22 21:01:22 +08:00
|
|
|
prefetch(&sk->sk_rmem_alloc);
|
2016-06-03 05:52:43 +08:00
|
|
|
if (rcu_access_pointer(sk->sk_filter) &&
|
|
|
|
udp_lib_checksum_complete(skb))
|
|
|
|
goto csum_error;
|
|
|
|
|
2016-07-26 00:06:12 +08:00
|
|
|
if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr)))
|
2016-07-08 23:52:33 +08:00
|
|
|
goto drop;
|
2006-11-28 03:10:57 +08:00
|
|
|
|
2016-04-06 00:41:15 +08:00
|
|
|
udp_csum_pull_header(skb);
|
2012-04-27 16:23:59 +08:00
|
|
|
|
ipv4: PKTINFO doesnt need dst reference
Le lundi 07 novembre 2011 à 15:33 +0100, Eric Dumazet a écrit :
> At least, in recent kernels we dont change dst->refcnt in forwarding
> patch (usinf NOREF skb->dst)
>
> One particular point is the atomic_inc(dst->refcnt) we have to perform
> when queuing an UDP packet if socket asked PKTINFO stuff (for example a
> typical DNS server has to setup this option)
>
> I have one patch somewhere that stores the information in skb->cb[] and
> avoid the atomic_{inc|dec}(dst->refcnt).
>
OK I found it, I did some extra tests and believe its ready.
[PATCH net-next] ipv4: IP_PKTINFO doesnt need dst reference
When a socket uses IP_PKTINFO notifications, we currently force a dst
reference for each received skb. Reader has to access dst to get needed
information (rt_iif & rt_spec_dst) and must release dst reference.
We also forced a dst reference if skb was put in socket backlog, even
without IP_PKTINFO handling. This happens under stress/load.
We can instead store the needed information in skb->cb[], so that only
softirq handler really access dst, improving cache hit ratios.
This removes two atomic operations per packet, and false sharing as
well.
On a benchmark using a mono threaded receiver (doing only recvmsg()
calls), I can reach 720.000 pps instead of 570.000 pps.
IP_PKTINFO is typically used by DNS servers, and any multihomed aware
UDP application.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 15:24:35 +08:00
|
|
|
skb_dst_drop(skb);
|
2007-12-03 19:33:28 +08:00
|
|
|
|
2016-10-21 19:55:47 +08:00
|
|
|
return __udpv6_queue_rcv_skb(sk, skb);
|
2014-06-26 05:38:13 +08:00
|
|
|
|
2013-04-29 16:39:56 +08:00
|
|
|
csum_error:
|
2016-04-28 07:44:30 +08:00
|
|
|
__UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
|
2006-11-28 03:10:57 +08:00
|
|
|
drop:
|
2016-04-28 07:44:30 +08:00
|
|
|
__UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
|
2012-04-27 16:23:59 +08:00
|
|
|
atomic_inc(&sk->sk_drops);
|
2006-11-28 03:10:57 +08:00
|
|
|
kfree_skb(skb);
|
|
|
|
return -1;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2014-07-16 11:28:31 +08:00
|
|
|
static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
|
|
|
|
__be16 loc_port, const struct in6_addr *loc_addr,
|
|
|
|
__be16 rmt_port, const struct in6_addr *rmt_addr,
|
|
|
|
int dif, unsigned short hnum)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2014-07-16 11:28:31 +08:00
|
|
|
struct inet_sock *inet = inet_sk(sk);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2014-07-16 11:28:31 +08:00
|
|
|
if (!net_eq(sock_net(sk), net))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (udp_sk(sk)->udp_port_hash != hnum ||
|
|
|
|
sk->sk_family != PF_INET6 ||
|
|
|
|
(inet->inet_dport && inet->inet_dport != rmt_port) ||
|
|
|
|
(!ipv6_addr_any(&sk->sk_v6_daddr) &&
|
|
|
|
!ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) ||
|
2015-05-19 03:08:49 +08:00
|
|
|
(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) ||
|
|
|
|
(!ipv6_addr_any(&sk->sk_v6_rcv_saddr) &&
|
|
|
|
!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr)))
|
2014-07-16 11:28:31 +08:00
|
|
|
return false;
|
|
|
|
if (!inet6_mc_check(sk, loc_addr, rmt_addr))
|
|
|
|
return false;
|
|
|
|
return true;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2014-05-03 07:29:58 +08:00
|
|
|
static void udp6_csum_zero_error(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
/* RFC 2460 section 8.1 says that we SHOULD log
|
|
|
|
* this error. Well, it is reasonable.
|
|
|
|
*/
|
2014-11-12 02:59:17 +08:00
|
|
|
net_dbg_ratelimited("IPv6: udp checksum is 0 for [%pI6c]:%u->[%pI6c]:%u\n",
|
|
|
|
&ipv6_hdr(skb)->saddr, ntohs(udp_hdr(skb)->source),
|
|
|
|
&ipv6_hdr(skb)->daddr, ntohs(udp_hdr(skb)->dest));
|
2014-05-03 07:29:58 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Note: called only from the BH handler context,
|
|
|
|
* so we don't need to lock the hashes.
|
|
|
|
*/
|
2008-06-17 08:12:11 +08:00
|
|
|
static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
|
2011-04-22 12:53:02 +08:00
|
|
|
const struct in6_addr *saddr, const struct in6_addr *daddr,
|
2014-11-07 02:37:54 +08:00
|
|
|
struct udp_table *udptable, int proto)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2016-04-01 23:52:13 +08:00
|
|
|
struct sock *sk, *first = NULL;
|
2007-03-14 01:28:48 +08:00
|
|
|
const struct udphdr *uh = udp_hdr(skb);
|
2014-07-16 11:28:31 +08:00
|
|
|
unsigned short hnum = ntohs(uh->dest);
|
|
|
|
struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
|
2016-04-01 23:52:13 +08:00
|
|
|
unsigned int offset = offsetof(typeof(*sk), sk_node);
|
2014-07-16 11:28:32 +08:00
|
|
|
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
|
2016-04-01 23:52:13 +08:00
|
|
|
int dif = inet6_iif(skb);
|
|
|
|
struct hlist_node *node;
|
|
|
|
struct sk_buff *nskb;
|
2014-07-16 11:28:32 +08:00
|
|
|
|
|
|
|
if (use_hash2) {
|
2017-12-02 04:52:30 +08:00
|
|
|
hash2_any = ipv6_portaddr_hash(net, &in6addr_any, hnum) &
|
2016-11-15 06:40:30 +08:00
|
|
|
udptable->mask;
|
2017-12-02 04:52:30 +08:00
|
|
|
hash2 = ipv6_portaddr_hash(net, daddr, hnum) & udptable->mask;
|
2014-07-16 11:28:32 +08:00
|
|
|
start_lookup:
|
2016-11-15 06:40:30 +08:00
|
|
|
hslot = &udptable->hash2[hash2];
|
2014-07-16 11:28:32 +08:00
|
|
|
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2016-04-01 23:52:13 +08:00
|
|
|
sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
|
|
|
|
if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr,
|
|
|
|
uh->source, saddr, dif, hnum))
|
|
|
|
continue;
|
|
|
|
/* If zero checksum and no_check is not on for
|
|
|
|
* the socket then skip it.
|
|
|
|
*/
|
|
|
|
if (!uh->check && !udp_sk(sk)->no_check6_rx)
|
|
|
|
continue;
|
|
|
|
if (!first) {
|
|
|
|
first = sk;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
nskb = skb_clone(skb, GFP_ATOMIC);
|
|
|
|
if (unlikely(!nskb)) {
|
|
|
|
atomic_inc(&sk->sk_drops);
|
2016-04-28 07:44:30 +08:00
|
|
|
__UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
|
|
|
|
IS_UDPLITE(sk));
|
|
|
|
__UDP6_INC_STATS(net, UDP_MIB_INERRORS,
|
|
|
|
IS_UDPLITE(sk));
|
2016-04-01 23:52:13 +08:00
|
|
|
continue;
|
2007-12-31 16:29:24 +08:00
|
|
|
}
|
2009-11-08 18:18:52 +08:00
|
|
|
|
2016-04-01 23:52:13 +08:00
|
|
|
if (udpv6_queue_rcv_skb(sk, nskb) > 0)
|
|
|
|
consume_skb(nskb);
|
|
|
|
}
|
2009-11-08 18:18:52 +08:00
|
|
|
|
2014-07-16 11:28:32 +08:00
|
|
|
/* Also lookup *:port if we are using hash2 and haven't done so yet. */
|
|
|
|
if (use_hash2 && hash2 != hash2_any) {
|
|
|
|
hash2 = hash2_any;
|
|
|
|
goto start_lookup;
|
|
|
|
}
|
|
|
|
|
2016-04-01 23:52:13 +08:00
|
|
|
if (first) {
|
|
|
|
if (udpv6_queue_rcv_skb(first, skb) > 0)
|
|
|
|
consume_skb(skb);
|
2009-11-08 18:18:52 +08:00
|
|
|
} else {
|
2016-04-01 23:52:13 +08:00
|
|
|
kfree_skb(skb);
|
2016-04-28 07:44:30 +08:00
|
|
|
__UDP6_INC_STATS(net, UDP_MIB_IGNOREDMULTI,
|
|
|
|
proto == IPPROTO_UDPLITE);
|
2009-11-08 18:18:52 +08:00
|
|
|
}
|
2006-11-28 03:10:57 +08:00
|
|
|
return 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2017-08-25 20:31:01 +08:00
|
|
|
static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
|
|
|
|
{
|
|
|
|
if (udp_sk_rx_dst_set(sk, dst)) {
|
|
|
|
const struct rt6_info *rt = (const struct rt6_info *)dst;
|
|
|
|
|
|
|
|
inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-09-13 22:27:21 +08:00
|
|
|
/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and
|
|
|
|
* return code conversion for ip layer consumption
|
|
|
|
*/
|
|
|
|
static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
|
|
|
|
struct udphdr *uh)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
|
|
|
|
skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
|
|
|
|
ip6_compute_pseudo);
|
|
|
|
|
|
|
|
ret = udpv6_queue_rcv_skb(sk, skb);
|
|
|
|
|
|
|
|
/* a return value > 0 means to resubmit the input, but
|
|
|
|
* it wants the return to be -protocol, or 0
|
|
|
|
*/
|
|
|
|
if (ret > 0)
|
|
|
|
return -ret;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-10-29 16:41:45 +08:00
|
|
|
int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
|
2007-03-26 11:10:56 +08:00
|
|
|
int proto)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2016-04-01 23:52:13 +08:00
|
|
|
const struct in6_addr *saddr, *daddr;
|
2010-02-18 16:25:24 +08:00
|
|
|
struct net *net = dev_net(skb->dev);
|
2007-02-09 22:24:49 +08:00
|
|
|
struct udphdr *uh;
|
2016-04-01 23:52:13 +08:00
|
|
|
struct sock *sk;
|
2005-04-17 06:20:36 +08:00
|
|
|
u32 ulen = 0;
|
|
|
|
|
|
|
|
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
|
2010-05-06 11:44:35 +08:00
|
|
|
goto discard;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-04-26 08:54:47 +08:00
|
|
|
saddr = &ipv6_hdr(skb)->saddr;
|
|
|
|
daddr = &ipv6_hdr(skb)->daddr;
|
2007-03-14 01:28:48 +08:00
|
|
|
uh = udp_hdr(skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
ulen = ntohs(uh->len);
|
2006-11-28 03:10:57 +08:00
|
|
|
if (ulen > skb->len)
|
|
|
|
goto short_packet;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-03-26 11:10:56 +08:00
|
|
|
if (proto == IPPROTO_UDP) {
|
|
|
|
/* UDP validates ulen. */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-11-28 03:10:57 +08:00
|
|
|
/* Check for jumbo payload */
|
|
|
|
if (ulen == 0)
|
|
|
|
ulen = skb->len;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-11-28 03:10:57 +08:00
|
|
|
if (ulen < sizeof(*uh))
|
|
|
|
goto short_packet;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-11-28 03:10:57 +08:00
|
|
|
if (ulen < skb->len) {
|
|
|
|
if (pskb_trim_rcsum(skb, ulen))
|
|
|
|
goto short_packet;
|
2007-04-26 08:54:47 +08:00
|
|
|
saddr = &ipv6_hdr(skb)->saddr;
|
|
|
|
daddr = &ipv6_hdr(skb)->daddr;
|
2007-03-14 01:28:48 +08:00
|
|
|
uh = udp_hdr(skb);
|
2006-11-28 03:10:57 +08:00
|
|
|
}
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-03-26 11:10:56 +08:00
|
|
|
if (udp6_csum_init(skb, uh, proto))
|
2013-04-29 16:39:56 +08:00
|
|
|
goto csum_error;
|
2007-03-26 11:10:56 +08:00
|
|
|
|
2017-07-27 20:45:09 +08:00
|
|
|
/* Check if the socket is already available, e.g. due to early demux */
|
|
|
|
sk = skb_steal_sock(skb);
|
|
|
|
if (sk) {
|
|
|
|
struct dst_entry *dst = skb_dst(skb);
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (unlikely(sk->sk_rx_dst != dst))
|
2017-08-25 20:31:01 +08:00
|
|
|
udp6_sk_rx_dst_set(sk, dst);
|
2017-07-27 20:45:09 +08:00
|
|
|
|
2018-09-13 22:27:21 +08:00
|
|
|
if (!uh->check && !udp_sk(sk)->no_check6_rx) {
|
|
|
|
sock_put(sk);
|
|
|
|
goto report_csum_error;
|
|
|
|
}
|
2017-07-27 20:45:09 +08:00
|
|
|
|
2018-09-13 22:27:21 +08:00
|
|
|
ret = udp6_unicast_rcv_skb(sk, skb, uh);
|
|
|
|
sock_put(sk);
|
|
|
|
return ret;
|
2017-07-27 20:45:09 +08:00
|
|
|
}
|
|
|
|
|
2007-02-09 22:24:49 +08:00
|
|
|
/*
|
|
|
|
* Multicast receive code
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2006-11-28 03:10:57 +08:00
|
|
|
if (ipv6_addr_is_multicast(daddr))
|
2008-06-17 08:12:11 +08:00
|
|
|
return __udp6_lib_mcast_deliver(net, skb,
|
2014-11-07 02:37:54 +08:00
|
|
|
saddr, daddr, udptable, proto);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Unicast */
|
2008-10-08 03:38:32 +08:00
|
|
|
sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
|
2015-03-29 21:00:05 +08:00
|
|
|
if (sk) {
|
2018-09-13 22:27:21 +08:00
|
|
|
if (!uh->check && !udp_sk(sk)->no_check6_rx)
|
|
|
|
goto report_csum_error;
|
|
|
|
return udp6_unicast_rcv_skb(sk, skb, uh);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2007-02-09 22:24:49 +08:00
|
|
|
|
2018-09-13 22:27:21 +08:00
|
|
|
if (!uh->check)
|
|
|
|
goto report_csum_error;
|
2014-05-03 07:29:58 +08:00
|
|
|
|
2012-04-27 16:23:59 +08:00
|
|
|
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
|
2010-04-28 06:13:20 +08:00
|
|
|
goto discard;
|
2012-04-27 16:23:59 +08:00
|
|
|
|
|
|
|
if (udp_lib_checksum_complete(skb))
|
2013-04-29 16:39:56 +08:00
|
|
|
goto csum_error;
|
2012-04-27 16:23:59 +08:00
|
|
|
|
2016-04-28 07:44:30 +08:00
|
|
|
__UDP6_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
|
2012-04-27 16:23:59 +08:00
|
|
|
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
|
|
|
|
|
|
|
|
kfree_skb(skb);
|
2007-03-09 12:42:35 +08:00
|
|
|
return 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-02-09 22:24:49 +08:00
|
|
|
short_packet:
|
2014-11-12 02:59:17 +08:00
|
|
|
net_dbg_ratelimited("UDP%sv6: short packet: From [%pI6c]:%u %d/%d to [%pI6c]:%u\n",
|
|
|
|
proto == IPPROTO_UDPLITE ? "-Lite" : "",
|
|
|
|
saddr, ntohs(uh->source),
|
|
|
|
ulen, skb->len,
|
|
|
|
daddr, ntohs(uh->dest));
|
2013-04-29 16:39:56 +08:00
|
|
|
goto discard;
|
2018-09-13 22:27:21 +08:00
|
|
|
|
|
|
|
report_csum_error:
|
|
|
|
udp6_csum_zero_error(skb);
|
2013-04-29 16:39:56 +08:00
|
|
|
csum_error:
|
2016-04-28 07:44:30 +08:00
|
|
|
__UDP6_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
|
2005-04-17 06:20:36 +08:00
|
|
|
discard:
|
2016-04-28 07:44:30 +08:00
|
|
|
__UDP6_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
|
2005-04-17 06:20:36 +08:00
|
|
|
kfree_skb(skb);
|
2007-03-09 12:42:35 +08:00
|
|
|
return 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2006-11-28 03:10:57 +08:00
|
|
|
|
2017-04-19 01:39:41 +08:00
|
|
|
|
2017-03-09 07:36:49 +08:00
|
|
|
static struct sock *__udp6_lib_demux_lookup(struct net *net,
|
|
|
|
__be16 loc_port, const struct in6_addr *loc_addr,
|
|
|
|
__be16 rmt_port, const struct in6_addr *rmt_addr,
|
2017-08-07 23:44:21 +08:00
|
|
|
int dif, int sdif)
|
2017-03-09 07:36:49 +08:00
|
|
|
{
|
2017-04-19 01:39:41 +08:00
|
|
|
unsigned short hnum = ntohs(loc_port);
|
2017-12-02 04:52:30 +08:00
|
|
|
unsigned int hash2 = ipv6_portaddr_hash(net, loc_addr, hnum);
|
2017-04-19 01:39:41 +08:00
|
|
|
unsigned int slot2 = hash2 & udp_table.mask;
|
|
|
|
struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
|
|
|
|
const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
|
2017-03-09 07:36:49 +08:00
|
|
|
struct sock *sk;
|
|
|
|
|
2017-04-19 01:39:41 +08:00
|
|
|
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
|
2017-06-24 06:25:37 +08:00
|
|
|
if (sk->sk_state == TCP_ESTABLISHED &&
|
2017-08-07 23:44:21 +08:00
|
|
|
INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif, sdif))
|
2017-04-19 01:39:41 +08:00
|
|
|
return sk;
|
|
|
|
/* Only check first socket in chain */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return NULL;
|
2017-03-09 07:36:49 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void udp_v6_early_demux(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
struct net *net = dev_net(skb->dev);
|
|
|
|
const struct udphdr *uh;
|
|
|
|
struct sock *sk;
|
|
|
|
struct dst_entry *dst;
|
|
|
|
int dif = skb->dev->ifindex;
|
2017-08-07 23:44:21 +08:00
|
|
|
int sdif = inet6_sdif(skb);
|
2017-03-09 07:36:49 +08:00
|
|
|
|
|
|
|
if (!pskb_may_pull(skb, skb_transport_offset(skb) +
|
|
|
|
sizeof(struct udphdr)))
|
|
|
|
return;
|
|
|
|
|
|
|
|
uh = udp_hdr(skb);
|
|
|
|
|
|
|
|
if (skb->pkt_type == PACKET_HOST)
|
|
|
|
sk = __udp6_lib_demux_lookup(net, uh->dest,
|
|
|
|
&ipv6_hdr(skb)->daddr,
|
|
|
|
uh->source, &ipv6_hdr(skb)->saddr,
|
2017-08-07 23:44:21 +08:00
|
|
|
dif, sdif);
|
2017-03-09 07:36:49 +08:00
|
|
|
else
|
|
|
|
return;
|
|
|
|
|
2017-06-30 18:08:01 +08:00
|
|
|
if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
|
2017-03-09 07:36:49 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
skb->sk = sk;
|
|
|
|
skb->destructor = sock_efree;
|
|
|
|
dst = READ_ONCE(sk->sk_rx_dst);
|
|
|
|
|
|
|
|
if (dst)
|
|
|
|
dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie);
|
|
|
|
if (dst) {
|
2017-06-18 01:42:25 +08:00
|
|
|
/* set noref for now.
|
|
|
|
* any place which wants to hold dst has to call
|
|
|
|
* dst_hold_safe()
|
|
|
|
*/
|
|
|
|
skb_dst_set_noref(skb, dst);
|
2017-03-09 07:36:49 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2007-10-16 03:50:28 +08:00
|
|
|
static __inline__ int udpv6_rcv(struct sk_buff *skb)
|
2006-11-28 03:10:57 +08:00
|
|
|
{
|
2008-10-29 16:41:45 +08:00
|
|
|
return __udp6_lib_rcv(skb, &udp_table, IPPROTO_UDP);
|
2006-11-28 03:10:57 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Throw away all pending data and cancel the corking. Socket is locked.
|
|
|
|
*/
|
|
|
|
static void udp_v6_flush_pending_frames(struct sock *sk)
|
|
|
|
{
|
|
|
|
struct udp_sock *up = udp_sk(sk);
|
|
|
|
|
2008-06-04 19:49:07 +08:00
|
|
|
if (up->pending == AF_INET)
|
|
|
|
udp_flush_pending_frames(sk);
|
|
|
|
else if (up->pending) {
|
2005-04-17 06:20:36 +08:00
|
|
|
up->len = 0;
|
|
|
|
up->pending = 0;
|
|
|
|
ip6_flush_pending_frames(sk);
|
2007-02-09 22:24:49 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2018-03-31 06:08:05 +08:00
|
|
|
static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
|
|
|
|
int addr_len)
|
|
|
|
{
|
|
|
|
/* The following checks are replicated from __ip6_datagram_connect()
|
|
|
|
* and intended to prevent BPF program called below from accessing
|
|
|
|
* bytes that are out of the bound specified by user in addr_len.
|
|
|
|
*/
|
|
|
|
if (uaddr->sa_family == AF_INET) {
|
|
|
|
if (__ipv6_only_sock(sk))
|
|
|
|
return -EAFNOSUPPORT;
|
|
|
|
return udp_pre_connect(sk, uaddr, addr_len);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (addr_len < SIN6_LEN_RFC2133)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr);
|
|
|
|
}
|
|
|
|
|
2009-07-09 16:09:54 +08:00
|
|
|
/**
|
2014-08-25 04:53:10 +08:00
|
|
|
* udp6_hwcsum_outgoing - handle outgoing HW checksumming
|
|
|
|
* @sk: socket we are sending on
|
|
|
|
* @skb: sk_buff containing the filled-in UDP header
|
|
|
|
* (checksum field must be zeroed out)
|
2009-07-09 16:09:54 +08:00
|
|
|
*/
|
|
|
|
static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
|
|
|
|
const struct in6_addr *saddr,
|
|
|
|
const struct in6_addr *daddr, int len)
|
|
|
|
{
|
|
|
|
unsigned int offset;
|
|
|
|
struct udphdr *uh = udp_hdr(skb);
|
2015-01-31 23:40:16 +08:00
|
|
|
struct sk_buff *frags = skb_shinfo(skb)->frag_list;
|
2009-07-09 16:09:54 +08:00
|
|
|
__wsum csum = 0;
|
|
|
|
|
2015-01-31 23:40:16 +08:00
|
|
|
if (!frags) {
|
2009-07-09 16:09:54 +08:00
|
|
|
/* Only one fragment on the socket. */
|
|
|
|
skb->csum_start = skb_transport_header(skb) - skb->head;
|
|
|
|
skb->csum_offset = offsetof(struct udphdr, check);
|
|
|
|
uh->check = ~csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, 0);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* HW-checksum won't work as there are two or more
|
|
|
|
* fragments on the socket so that all csums of sk_buffs
|
|
|
|
* should be together
|
|
|
|
*/
|
|
|
|
offset = skb_transport_offset(skb);
|
|
|
|
skb->csum = skb_checksum(skb, offset, skb->len - offset, 0);
|
2017-09-14 09:30:51 +08:00
|
|
|
csum = skb->csum;
|
2009-07-09 16:09:54 +08:00
|
|
|
|
|
|
|
skb->ip_summed = CHECKSUM_NONE;
|
|
|
|
|
2015-01-31 23:40:16 +08:00
|
|
|
do {
|
|
|
|
csum = csum_add(csum, frags->csum);
|
|
|
|
} while ((frags = frags->next));
|
2009-07-09 16:09:54 +08:00
|
|
|
|
|
|
|
uh->check = csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP,
|
|
|
|
csum);
|
|
|
|
if (uh->check == 0)
|
|
|
|
uh->check = CSUM_MANGLED_0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Sending
|
|
|
|
*/
|
|
|
|
|
udp: generate gso with UDP_SEGMENT
Support generic segmentation offload for udp datagrams. Callers can
concatenate and send at once the payload of multiple datagrams with
the same destination.
To set segment size, the caller sets socket option UDP_SEGMENT to the
length of each discrete payload. This value must be smaller than or
equal to the relevant MTU.
A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a
per send call basis.
Total byte length may then exceed MTU. If not an exact multiple of
segment size, the last segment will be shorter.
The implementation adds a gso_size field to the udp socket, ip(v6)
cmsg cookie and inet_cork structure to be able to set the value at
setsockopt or cmsg time and to work with both lockless and corked
paths.
Initial benchmark numbers show UDP GSO about as expensive as TCP GSO.
tcp tso
3197 MB/s 54232 msg/s 54232 calls/s
6,457,754,262 cycles
tcp gso
1765 MB/s 29939 msg/s 29939 calls/s
11,203,021,806 cycles
tcp without tso/gso *
739 MB/s 12548 msg/s 12548 calls/s
11,205,483,630 cycles
udp
876 MB/s 14873 msg/s 624666 calls/s
11,205,777,429 cycles
udp gso
2139 MB/s 36282 msg/s 36282 calls/s
11,204,374,561 cycles
[*] after reverting commit 0a6b2a1dc2a2
("tcp: switch to GSO being always on")
Measured total system cycles ('-a') for one core while pinning both
the network receive path and benchmark process to that core:
perf stat -a -C 12 -e cycles \
./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4
Note the reduction in calls/s with GSO. Bytes per syscall
increases from 1470 to 61818.
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-27 01:42:17 +08:00
|
|
|
static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
|
|
|
|
struct inet_cork *cork)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2015-01-31 23:40:16 +08:00
|
|
|
struct sock *sk = skb->sk;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct udphdr *uh;
|
|
|
|
int err = 0;
|
2007-12-03 19:34:16 +08:00
|
|
|
int is_udplite = IS_UDPLITE(sk);
|
2006-11-15 13:35:48 +08:00
|
|
|
__wsum csum = 0;
|
2015-01-31 23:40:16 +08:00
|
|
|
int offset = skb_transport_offset(skb);
|
|
|
|
int len = skb->len - offset;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Create a UDP header
|
|
|
|
*/
|
2007-03-14 01:28:48 +08:00
|
|
|
uh = udp_hdr(skb);
|
2011-03-13 05:36:19 +08:00
|
|
|
uh->source = fl6->fl6_sport;
|
|
|
|
uh->dest = fl6->fl6_dport;
|
2015-01-31 23:40:16 +08:00
|
|
|
uh->len = htons(len);
|
2005-04-17 06:20:36 +08:00
|
|
|
uh->check = 0;
|
|
|
|
|
udp: generate gso with UDP_SEGMENT
Support generic segmentation offload for udp datagrams. Callers can
concatenate and send at once the payload of multiple datagrams with
the same destination.
To set segment size, the caller sets socket option UDP_SEGMENT to the
length of each discrete payload. This value must be smaller than or
equal to the relevant MTU.
A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a
per send call basis.
Total byte length may then exceed MTU. If not an exact multiple of
segment size, the last segment will be shorter.
The implementation adds a gso_size field to the udp socket, ip(v6)
cmsg cookie and inet_cork structure to be able to set the value at
setsockopt or cmsg time and to work with both lockless and corked
paths.
Initial benchmark numbers show UDP GSO about as expensive as TCP GSO.
tcp tso
3197 MB/s 54232 msg/s 54232 calls/s
6,457,754,262 cycles
tcp gso
1765 MB/s 29939 msg/s 29939 calls/s
11,203,021,806 cycles
tcp without tso/gso *
739 MB/s 12548 msg/s 12548 calls/s
11,205,483,630 cycles
udp
876 MB/s 14873 msg/s 624666 calls/s
11,205,777,429 cycles
udp gso
2139 MB/s 36282 msg/s 36282 calls/s
11,204,374,561 cycles
[*] after reverting commit 0a6b2a1dc2a2
("tcp: switch to GSO being always on")
Measured total system cycles ('-a') for one core while pinning both
the network receive path and benchmark process to that core:
perf stat -a -C 12 -e cycles \
./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4
Note the reduction in calls/s with GSO. Bytes per syscall drops
increases from 1470 to 61818.
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-27 01:42:17 +08:00
|
|
|
if (cork->gso_size) {
|
|
|
|
const int hlen = skb_network_header_len(skb) +
|
|
|
|
sizeof(struct udphdr);
|
|
|
|
|
|
|
|
if (hlen + cork->gso_size > cork->fragsize)
|
|
|
|
return -EINVAL;
|
|
|
|
if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
|
|
|
|
return -EINVAL;
|
2018-05-01 03:58:36 +08:00
|
|
|
if (udp_sk(sk)->no_check6_tx)
|
|
|
|
return -EINVAL;
|
2018-05-22 23:34:39 +08:00
|
|
|
if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite ||
|
|
|
|
dst_xfrm(skb_dst(skb)))
|
udp: generate gso with UDP_SEGMENT
Support generic segmentation offload for udp datagrams. Callers can
concatenate and send at once the payload of multiple datagrams with
the same destination.
To set segment size, the caller sets socket option UDP_SEGMENT to the
length of each discrete payload. This value must be smaller than or
equal to the relevant MTU.
A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a
per send call basis.
Total byte length may then exceed MTU. If not an exact multiple of
segment size, the last segment will be shorter.
The implementation adds a gso_size field to the udp socket, ip(v6)
cmsg cookie and inet_cork structure to be able to set the value at
setsockopt or cmsg time and to work with both lockless and corked
paths.
Initial benchmark numbers show UDP GSO about as expensive as TCP GSO.
tcp tso
3197 MB/s 54232 msg/s 54232 calls/s
6,457,754,262 cycles
tcp gso
1765 MB/s 29939 msg/s 29939 calls/s
11,203,021,806 cycles
tcp without tso/gso *
739 MB/s 12548 msg/s 12548 calls/s
11,205,483,630 cycles
udp
876 MB/s 14873 msg/s 624666 calls/s
11,205,777,429 cycles
udp gso
2139 MB/s 36282 msg/s 36282 calls/s
11,204,374,561 cycles
[*] after reverting commit 0a6b2a1dc2a2
("tcp: switch to GSO being always on")
Measured total system cycles ('-a') for one core while pinning both
the network receive path and benchmark process to that core:
perf stat -a -C 12 -e cycles \
./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4
Note the reduction in calls/s with GSO. Bytes per syscall drops
increases from 1470 to 61818.
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-27 01:42:17 +08:00
|
|
|
return -EIO;
|
|
|
|
|
|
|
|
skb_shinfo(skb)->gso_size = cork->gso_size;
|
|
|
|
skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
|
2018-05-01 03:58:36 +08:00
|
|
|
goto csum_partial;
|
udp: generate gso with UDP_SEGMENT
Support generic segmentation offload for udp datagrams. Callers can
concatenate and send at once the payload of multiple datagrams with
the same destination.
To set segment size, the caller sets socket option UDP_SEGMENT to the
length of each discrete payload. This value must be smaller than or
equal to the relevant MTU.
A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a
per send call basis.
Total byte length may then exceed MTU. If not an exact multiple of
segment size, the last segment will be shorter.
The implementation adds a gso_size field to the udp socket, ip(v6)
cmsg cookie and inet_cork structure to be able to set the value at
setsockopt or cmsg time and to work with both lockless and corked
paths.
Initial benchmark numbers show UDP GSO about as expensive as TCP GSO.
tcp tso
3197 MB/s 54232 msg/s 54232 calls/s
6,457,754,262 cycles
tcp gso
1765 MB/s 29939 msg/s 29939 calls/s
11,203,021,806 cycles
tcp without tso/gso *
739 MB/s 12548 msg/s 12548 calls/s
11,205,483,630 cycles
udp
876 MB/s 14873 msg/s 624666 calls/s
11,205,777,429 cycles
udp gso
2139 MB/s 36282 msg/s 36282 calls/s
11,204,374,561 cycles
[*] after reverting commit 0a6b2a1dc2a2
("tcp: switch to GSO being always on")
Measured total system cycles ('-a') for one core while pinning both
the network receive path and benchmark process to that core:
perf stat -a -C 12 -e cycles \
./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4
Note the reduction in calls/s with GSO. Bytes per syscall drops
increases from 1470 to 61818.
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-27 01:42:17 +08:00
|
|
|
}
|
|
|
|
|
2007-12-03 19:34:16 +08:00
|
|
|
if (is_udplite)
|
2015-01-31 23:40:16 +08:00
|
|
|
csum = udplite_csum(skb);
|
|
|
|
else if (udp_sk(sk)->no_check6_tx) { /* UDP csum disabled */
|
2014-05-03 07:29:58 +08:00
|
|
|
skb->ip_summed = CHECKSUM_NONE;
|
|
|
|
goto send;
|
|
|
|
} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
|
2018-05-01 03:58:36 +08:00
|
|
|
csum_partial:
|
2015-01-31 23:40:16 +08:00
|
|
|
udp6_hwcsum_outgoing(sk, skb, &fl6->saddr, &fl6->daddr, len);
|
2009-07-09 16:09:54 +08:00
|
|
|
goto send;
|
|
|
|
} else
|
2015-01-31 23:40:16 +08:00
|
|
|
csum = udp_csum(skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-11-28 03:10:57 +08:00
|
|
|
/* add protocol-dependent pseudo-header */
|
2011-03-13 05:22:43 +08:00
|
|
|
uh->check = csum_ipv6_magic(&fl6->saddr, &fl6->daddr,
|
2015-01-31 23:40:16 +08:00
|
|
|
len, fl6->flowi6_proto, csum);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (uh->check == 0)
|
2006-11-16 18:36:50 +08:00
|
|
|
uh->check = CSUM_MANGLED_0;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-07-09 16:09:54 +08:00
|
|
|
send:
|
2015-01-31 23:40:16 +08:00
|
|
|
err = ip6_send_skb(skb);
|
ip: Report qdisc packet drops
Christoph Lameter pointed out that packet drops at qdisc level where not
accounted in SNMP counters. Only if application sets IP_RECVERR, drops
are reported to user (-ENOBUFS errors) and SNMP counters updated.
IP_RECVERR is used to enable extended reliable error message passing,
but these are not needed to update system wide SNMP stats.
This patch changes things a bit to allow SNMP counters to be updated,
regardless of IP_RECVERR being set or not on the socket.
Example after an UDP tx flood
# netstat -s
...
IP:
1487048 outgoing packets dropped
...
Udp:
...
SndbufErrors: 1487048
send() syscalls, do however still return an OK status, to not
break applications.
Note : send() manual page explicitly says for -ENOBUFS error :
"The output queue for a network interface was full.
This generally indicates that the interface has stopped sending,
but may be caused by transient congestion.
(Normally, this does not occur in Linux. Packets are just silently
dropped when a device queue overflows.) "
This is not true for IP_RECVERR enabled sockets : a send() syscall
that hit a qdisc drop returns an ENOBUFS error.
Many thanks to Christoph, David, and last but not least, Alexey !
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-09-03 09:05:33 +08:00
|
|
|
if (err) {
|
|
|
|
if (err == -ENOBUFS && !inet6_sk(sk)->recverr) {
|
net: snmp: kill various STATS_USER() helpers
In the old days (before linux-3.0), SNMP counters were duplicated,
one for user context, and one for BH context.
After commit 8f0ea0fe3a03 ("snmp: reduce percpu needs by 50%")
we have a single copy, and what really matters is preemption being
enabled or disabled, since we use this_cpu_inc() or __this_cpu_inc()
respectively.
We therefore kill SNMP_INC_STATS_USER(), SNMP_ADD_STATS_USER(),
NET_INC_STATS_USER(), NET_ADD_STATS_USER(), SCTP_INC_STATS_USER(),
SNMP_INC_STATS64_USER(), SNMP_ADD_STATS64_USER(), TCP_ADD_STATS_USER(),
UDP_INC_STATS_USER(), UDP6_INC_STATS_USER(), and XFRM_INC_STATS_USER()
Following patches will rename __BH helpers to make clear their
usage is not tied to BH being disabled.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-28 07:44:27 +08:00
|
|
|
UDP6_INC_STATS(sock_net(sk),
|
|
|
|
UDP_MIB_SNDBUFERRORS, is_udplite);
|
ip: Report qdisc packet drops
Christoph Lameter pointed out that packet drops at qdisc level where not
accounted in SNMP counters. Only if application sets IP_RECVERR, drops
are reported to user (-ENOBUFS errors) and SNMP counters updated.
IP_RECVERR is used to enable extended reliable error message passing,
but these are not needed to update system wide SNMP stats.
This patch changes things a bit to allow SNMP counters to be updated,
regardless of IP_RECVERR being set or not on the socket.
Example after an UDP tx flood
# netstat -s
...
IP:
1487048 outgoing packets dropped
...
Udp:
...
SndbufErrors: 1487048
send() syscalls, do however still return an OK status, to not
break applications.
Note : send() manual page explicitly says for -ENOBUFS error :
"The output queue for a network interface was full.
This generally indicates that the interface has stopped sending,
but may be caused by transient congestion.
(Normally, this does not occur in Linux. Packets are just silently
dropped when a device queue overflows.) "
This is not true for IP_RECVERR enabled sockets : a send() syscall
that hit a qdisc drop returns an ENOBUFS error.
Many thanks to Christoph, David, and last but not least, Alexey !
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-09-03 09:05:33 +08:00
|
|
|
err = 0;
|
|
|
|
}
|
net: snmp: kill various STATS_USER() helpers
In the old days (before linux-3.0), SNMP counters were duplicated,
one for user context, and one for BH context.
After commit 8f0ea0fe3a03 ("snmp: reduce percpu needs by 50%")
we have a single copy, and what really matters is preemption being
enabled or disabled, since we use this_cpu_inc() or __this_cpu_inc()
respectively.
We therefore kill SNMP_INC_STATS_USER(), SNMP_ADD_STATS_USER(),
NET_INC_STATS_USER(), NET_ADD_STATS_USER(), SCTP_INC_STATS_USER(),
SNMP_INC_STATS64_USER(), SNMP_ADD_STATS64_USER(), TCP_ADD_STATS_USER(),
UDP_INC_STATS_USER(), UDP6_INC_STATS_USER(), and XFRM_INC_STATS_USER()
Following patches will rename __BH helpers to make clear their
usage is not tied to BH being disabled.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-28 07:44:27 +08:00
|
|
|
} else {
|
|
|
|
UDP6_INC_STATS(sock_net(sk),
|
|
|
|
UDP_MIB_OUTDATAGRAMS, is_udplite);
|
|
|
|
}
|
2015-01-31 23:40:16 +08:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int udp_v6_push_pending_frames(struct sock *sk)
|
|
|
|
{
|
|
|
|
struct sk_buff *skb;
|
|
|
|
struct udp_sock *up = udp_sk(sk);
|
|
|
|
struct flowi6 fl6;
|
|
|
|
int err = 0;
|
|
|
|
|
|
|
|
if (up->pending == AF_INET)
|
|
|
|
return udp_push_pending_frames(sk);
|
|
|
|
|
|
|
|
/* ip6_finish_skb will release the cork, so make a copy of
|
|
|
|
* fl6 here.
|
|
|
|
*/
|
|
|
|
fl6 = inet_sk(sk)->cork.fl.u.ip6;
|
|
|
|
|
|
|
|
skb = ip6_finish_skb(sk);
|
|
|
|
if (!skb)
|
|
|
|
goto out;
|
|
|
|
|
udp: generate gso with UDP_SEGMENT
Support generic segmentation offload for udp datagrams. Callers can
concatenate and send at once the payload of multiple datagrams with
the same destination.
To set segment size, the caller sets socket option UDP_SEGMENT to the
length of each discrete payload. This value must be smaller than or
equal to the relevant MTU.
A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a
per send call basis.
Total byte length may then exceed MTU. If not an exact multiple of
segment size, the last segment will be shorter.
The implementation adds a gso_size field to the udp socket, ip(v6)
cmsg cookie and inet_cork structure to be able to set the value at
setsockopt or cmsg time and to work with both lockless and corked
paths.
Initial benchmark numbers show UDP GSO about as expensive as TCP GSO.
tcp tso
3197 MB/s 54232 msg/s 54232 calls/s
6,457,754,262 cycles
tcp gso
1765 MB/s 29939 msg/s 29939 calls/s
11,203,021,806 cycles
tcp without tso/gso *
739 MB/s 12548 msg/s 12548 calls/s
11,205,483,630 cycles
udp
876 MB/s 14873 msg/s 624666 calls/s
11,205,777,429 cycles
udp gso
2139 MB/s 36282 msg/s 36282 calls/s
11,204,374,561 cycles
[*] after reverting commit 0a6b2a1dc2a2
("tcp: switch to GSO being always on")
Measured total system cycles ('-a') for one core while pinning both
the network receive path and benchmark process to that core:
perf stat -a -C 12 -e cycles \
./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4
Note the reduction in calls/s with GSO. Bytes per syscall drops
increases from 1470 to 61818.
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-27 01:42:17 +08:00
|
|
|
err = udp_v6_send_skb(skb, &fl6, &inet_sk(sk)->cork.base);
|
2015-01-31 23:40:16 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
out:
|
|
|
|
up->len = 0;
|
|
|
|
up->pending = 0;
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2015-03-02 15:37:48 +08:00
|
|
|
int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct ipv6_txoptions opt_space;
|
|
|
|
struct udp_sock *up = udp_sk(sk);
|
|
|
|
struct inet_sock *inet = inet_sk(sk);
|
|
|
|
struct ipv6_pinfo *np = inet6_sk(sk);
|
2014-01-18 05:53:15 +08:00
|
|
|
DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
|
2010-06-02 05:35:01 +08:00
|
|
|
struct in6_addr *daddr, *final_p, final;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct ipv6_txoptions *opt = NULL;
|
2015-11-30 11:37:57 +08:00
|
|
|
struct ipv6_txoptions *opt_to_free = NULL;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct ip6_flowlabel *flowlabel = NULL;
|
2011-03-13 05:22:43 +08:00
|
|
|
struct flowi6 fl6;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct dst_entry *dst;
|
2016-05-03 12:40:07 +08:00
|
|
|
struct ipcm6_cookie ipc6;
|
2005-04-17 06:20:36 +08:00
|
|
|
int addr_len = msg->msg_namelen;
|
2018-04-03 20:00:09 +08:00
|
|
|
bool connected = false;
|
2005-04-17 06:20:36 +08:00
|
|
|
int ulen = len;
|
|
|
|
int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
|
|
|
|
int err;
|
2007-12-03 19:34:16 +08:00
|
|
|
int is_udplite = IS_UDPLITE(sk);
|
2006-11-28 03:10:57 +08:00
|
|
|
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2018-07-06 22:12:55 +08:00
|
|
|
ipcm6_init(&ipc6);
|
udp: generate gso with UDP_SEGMENT
Support generic segmentation offload for udp datagrams. Callers can
concatenate and send at once the payload of multiple datagrams with
the same destination.
To set segment size, the caller sets socket option UDP_SEGMENT to the
length of each discrete payload. This value must be smaller than or
equal to the relevant MTU.
A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a
per send call basis.
Total byte length may then exceed MTU. If not an exact multiple of
segment size, the last segment will be shorter.
The implementation adds a gso_size field to the udp socket, ip(v6)
cmsg cookie and inet_cork structure to be able to set the value at
setsockopt or cmsg time and to work with both lockless and corked
paths.
Initial benchmark numbers show UDP GSO about as expensive as TCP GSO.
tcp tso
3197 MB/s 54232 msg/s 54232 calls/s
6,457,754,262 cycles
tcp gso
1765 MB/s 29939 msg/s 29939 calls/s
11,203,021,806 cycles
tcp without tso/gso *
739 MB/s 12548 msg/s 12548 calls/s
11,205,483,630 cycles
udp
876 MB/s 14873 msg/s 624666 calls/s
11,205,777,429 cycles
udp gso
2139 MB/s 36282 msg/s 36282 calls/s
11,204,374,561 cycles
[*] after reverting commit 0a6b2a1dc2a2
("tcp: switch to GSO being always on")
Measured total system cycles ('-a') for one core while pinning both
the network receive path and benchmark process to that core:
perf stat -a -C 12 -e cycles \
./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4
Note the reduction in calls/s with GSO. Bytes per syscall drops
increases from 1470 to 61818.
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-27 01:42:17 +08:00
|
|
|
ipc6.gso_size = up->gso_size;
|
2018-07-06 22:12:57 +08:00
|
|
|
ipc6.sockc.tsflags = sk->sk_tsflags;
|
2016-05-03 12:40:07 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* destination address check */
|
|
|
|
if (sin6) {
|
|
|
|
if (addr_len < offsetof(struct sockaddr, sa_data))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
switch (sin6->sin6_family) {
|
|
|
|
case AF_INET6:
|
|
|
|
if (addr_len < SIN6_LEN_RFC2133)
|
|
|
|
return -EINVAL;
|
|
|
|
daddr = &sin6->sin6_addr;
|
2017-02-13 06:26:07 +08:00
|
|
|
if (ipv6_addr_any(daddr) &&
|
|
|
|
ipv6_addr_v4mapped(&np->saddr))
|
|
|
|
ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
|
|
|
|
daddr);
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
|
|
|
case AF_INET:
|
|
|
|
goto do_udp_sendmsg;
|
|
|
|
case AF_UNSPEC:
|
|
|
|
msg->msg_name = sin6 = NULL;
|
|
|
|
msg->msg_namelen = addr_len = 0;
|
|
|
|
daddr = NULL;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
} else if (!up->pending) {
|
|
|
|
if (sk->sk_state != TCP_ESTABLISHED)
|
|
|
|
return -EDESTADDRREQ;
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 06:42:29 +08:00
|
|
|
daddr = &sk->sk_v6_daddr;
|
2007-02-09 22:24:49 +08:00
|
|
|
} else
|
2005-04-17 06:20:36 +08:00
|
|
|
daddr = NULL;
|
|
|
|
|
|
|
|
if (daddr) {
|
2007-08-25 14:16:08 +08:00
|
|
|
if (ipv6_addr_v4mapped(daddr)) {
|
2005-04-17 06:20:36 +08:00
|
|
|
struct sockaddr_in sin;
|
|
|
|
sin.sin_family = AF_INET;
|
2009-10-15 14:30:45 +08:00
|
|
|
sin.sin_port = sin6 ? sin6->sin6_port : inet->inet_dport;
|
2005-04-17 06:20:36 +08:00
|
|
|
sin.sin_addr.s_addr = daddr->s6_addr32[3];
|
|
|
|
msg->msg_name = &sin;
|
|
|
|
msg->msg_namelen = sizeof(sin);
|
|
|
|
do_udp_sendmsg:
|
|
|
|
if (__ipv6_only_sock(sk))
|
|
|
|
return -ENETUNREACH;
|
2015-03-02 15:37:48 +08:00
|
|
|
return udp_sendmsg(sk, msg, len);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (up->pending == AF_INET)
|
2015-03-02 15:37:48 +08:00
|
|
|
return udp_sendmsg(sk, msg, len);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Rough check on arithmetic overflow,
|
[IPv6]: Fix incorrect length check in rawv6_sendmsg()
In article <20070329.142644.70222545.davem@davemloft.net> (at Thu, 29 Mar 2007 14:26:44 -0700 (PDT)), David Miller <davem@davemloft.net> says:
> From: Sridhar Samudrala <sri@us.ibm.com>
> Date: Thu, 29 Mar 2007 14:17:28 -0700
>
> > The check for length in rawv6_sendmsg() is incorrect.
> > As len is an unsigned int, (len < 0) will never be TRUE.
> > I think checking for IPV6_MAXPLEN(65535) is better.
> >
> > Is it possible to send ipv6 jumbo packets using raw
> > sockets? If so, we can remove this check.
>
> I don't see why such a limitation against jumbo would exist,
> does anyone else?
>
> Thanks for catching this Sridhar. A good compiler should simply
> fail to compile "if (x < 0)" when 'x' is an unsigned type, don't
> you think :-)
Dave, we use "int" for returning value,
so we should fix this anyway, IMHO;
we should not allow len > INT_MAX.
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Acked-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2007-03-31 05:45:35 +08:00
|
|
|
better check is made in ip6_append_data().
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
if (len > INT_MAX - sizeof(struct udphdr))
|
|
|
|
return -EMSGSIZE;
|
2007-02-09 22:24:49 +08:00
|
|
|
|
2015-01-31 23:40:17 +08:00
|
|
|
getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
|
2005-04-17 06:20:36 +08:00
|
|
|
if (up->pending) {
|
|
|
|
/*
|
|
|
|
* There are pending frames.
|
|
|
|
* The socket lock must be held while it's corked.
|
|
|
|
*/
|
|
|
|
lock_sock(sk);
|
|
|
|
if (likely(up->pending)) {
|
|
|
|
if (unlikely(up->pending != AF_INET6)) {
|
|
|
|
release_sock(sk);
|
|
|
|
return -EAFNOSUPPORT;
|
|
|
|
}
|
|
|
|
dst = NULL;
|
|
|
|
goto do_append_data;
|
|
|
|
}
|
|
|
|
release_sock(sk);
|
|
|
|
}
|
|
|
|
ulen += sizeof(struct udphdr);
|
|
|
|
|
2011-03-13 05:22:43 +08:00
|
|
|
memset(&fl6, 0, sizeof(fl6));
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (sin6) {
|
|
|
|
if (sin6->sin6_port == 0)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2011-03-13 05:36:19 +08:00
|
|
|
fl6.fl6_dport = sin6->sin6_port;
|
2005-04-17 06:20:36 +08:00
|
|
|
daddr = &sin6->sin6_addr;
|
|
|
|
|
|
|
|
if (np->sndflow) {
|
2011-03-13 05:22:43 +08:00
|
|
|
fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
|
|
|
|
if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
|
|
|
|
flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
|
2015-03-29 21:00:04 +08:00
|
|
|
if (!flowlabel)
|
2005-04-17 06:20:36 +08:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Otherwise it will be difficult to maintain
|
|
|
|
* sk->sk_dst_cache.
|
|
|
|
*/
|
|
|
|
if (sk->sk_state == TCP_ESTABLISHED &&
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 06:42:29 +08:00
|
|
|
ipv6_addr_equal(daddr, &sk->sk_v6_daddr))
|
|
|
|
daddr = &sk->sk_v6_daddr;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (addr_len >= sizeof(struct sockaddr_in6) &&
|
|
|
|
sin6->sin6_scope_id &&
|
2013-03-08 10:07:19 +08:00
|
|
|
__ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr)))
|
2011-03-13 05:22:43 +08:00
|
|
|
fl6.flowi6_oif = sin6->sin6_scope_id;
|
2005-04-17 06:20:36 +08:00
|
|
|
} else {
|
|
|
|
if (sk->sk_state != TCP_ESTABLISHED)
|
|
|
|
return -EDESTADDRREQ;
|
|
|
|
|
2011-03-13 05:36:19 +08:00
|
|
|
fl6.fl6_dport = inet->inet_dport;
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 06:42:29 +08:00
|
|
|
daddr = &sk->sk_v6_daddr;
|
2011-03-13 05:22:43 +08:00
|
|
|
fl6.flowlabel = np->flow_label;
|
2018-04-03 20:00:09 +08:00
|
|
|
connected = true;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2011-03-13 05:22:43 +08:00
|
|
|
if (!fl6.flowi6_oif)
|
|
|
|
fl6.flowi6_oif = sk->sk_bound_dev_if;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-03-13 05:22:43 +08:00
|
|
|
if (!fl6.flowi6_oif)
|
|
|
|
fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
|
2008-12-16 18:08:29 +08:00
|
|
|
|
2011-03-13 05:22:43 +08:00
|
|
|
fl6.flowi6_mark = sk->sk_mark;
|
2016-11-04 01:23:43 +08:00
|
|
|
fl6.flowi6_uid = sk->sk_uid;
|
2009-10-05 16:24:16 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
if (msg->msg_controllen) {
|
|
|
|
opt = &opt_space;
|
|
|
|
memset(opt, 0, sizeof(struct ipv6_txoptions));
|
|
|
|
opt->tot_len = sizeof(*opt);
|
2016-05-03 12:40:07 +08:00
|
|
|
ipc6.opt = opt;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2018-04-27 01:42:20 +08:00
|
|
|
err = udp_cmsg_send(sk, msg, &ipc6.gso_size);
|
|
|
|
if (err > 0)
|
|
|
|
err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6,
|
2018-07-06 22:12:57 +08:00
|
|
|
&ipc6);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (err < 0) {
|
|
|
|
fl6_sock_release(flowlabel);
|
|
|
|
return err;
|
|
|
|
}
|
2011-03-13 05:22:43 +08:00
|
|
|
if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
|
|
|
|
flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
|
2015-03-29 21:00:04 +08:00
|
|
|
if (!flowlabel)
|
2005-04-17 06:20:36 +08:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
if (!(opt->opt_nflen|opt->opt_flen))
|
|
|
|
opt = NULL;
|
2018-04-03 20:00:09 +08:00
|
|
|
connected = false;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2015-11-30 11:37:57 +08:00
|
|
|
if (!opt) {
|
|
|
|
opt = txopt_get(np);
|
|
|
|
opt_to_free = opt;
|
|
|
|
}
|
2005-11-20 11:23:18 +08:00
|
|
|
if (flowlabel)
|
|
|
|
opt = fl6_merge_options(&opt_space, flowlabel, opt);
|
|
|
|
opt = ipv6_fixup_options(&opt_space, opt);
|
2016-05-03 12:40:07 +08:00
|
|
|
ipc6.opt = opt;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-03-13 05:22:43 +08:00
|
|
|
fl6.flowi6_proto = sk->sk_protocol;
|
2008-04-11 12:38:24 +08:00
|
|
|
if (!ipv6_addr_any(daddr))
|
2011-11-21 11:39:03 +08:00
|
|
|
fl6.daddr = *daddr;
|
2008-04-11 12:38:24 +08:00
|
|
|
else
|
2011-03-13 05:22:43 +08:00
|
|
|
fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
|
|
|
|
if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr))
|
2011-11-21 11:39:03 +08:00
|
|
|
fl6.saddr = np->saddr;
|
2011-03-13 05:36:19 +08:00
|
|
|
fl6.fl6_sport = inet->inet_sport;
|
2007-02-09 22:24:49 +08:00
|
|
|
|
2018-05-25 23:55:23 +08:00
|
|
|
if (cgroup_bpf_enabled && !connected) {
|
|
|
|
err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
|
|
|
|
(struct sockaddr *)sin6, &fl6.saddr);
|
|
|
|
if (err)
|
|
|
|
goto out_no_dst;
|
|
|
|
if (sin6) {
|
|
|
|
if (ipv6_addr_v4mapped(&sin6->sin6_addr)) {
|
|
|
|
/* BPF program rewrote IPv6-only by IPv4-mapped
|
|
|
|
* IPv6. It's currently unsupported.
|
|
|
|
*/
|
|
|
|
err = -ENOTSUPP;
|
|
|
|
goto out_no_dst;
|
|
|
|
}
|
|
|
|
if (sin6->sin6_port == 0) {
|
|
|
|
/* BPF program set invalid port. Reject it. */
|
|
|
|
err = -EINVAL;
|
|
|
|
goto out_no_dst;
|
|
|
|
}
|
|
|
|
fl6.fl6_dport = sin6->sin6_port;
|
|
|
|
fl6.daddr = sin6->sin6_addr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-03-13 05:22:43 +08:00
|
|
|
final_p = fl6_update_dst(&fl6, opt, &final);
|
2010-06-02 05:35:01 +08:00
|
|
|
if (final_p)
|
2018-04-03 20:00:09 +08:00
|
|
|
connected = false;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-03-13 05:22:43 +08:00
|
|
|
if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) {
|
|
|
|
fl6.flowi6_oif = np->mcast_oif;
|
2018-04-03 20:00:09 +08:00
|
|
|
connected = false;
|
2012-02-08 17:11:08 +08:00
|
|
|
} else if (!fl6.flowi6_oif)
|
|
|
|
fl6.flowi6_oif = np->ucast_oif;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-03-13 05:22:43 +08:00
|
|
|
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
|
2006-08-05 14:12:42 +08:00
|
|
|
|
2016-06-12 02:08:19 +08:00
|
|
|
if (ipc6.tclass < 0)
|
|
|
|
ipc6.tclass = np->tclass;
|
|
|
|
|
|
|
|
fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
|
|
|
|
|
ipv6: udp: set dst cache for a connected sk if current not valid
A new RTF_CACHE route can be created between ip6_sk_dst_lookup_flow()
and ip6_dst_store() calls in udpv6_sendmsg(), when sending a datagram
results in an ICMPV6_PKT_TOOBIG error:
udp_v6_send_skb(), for example with vti6 tunnel:
vti6_xmit(), get ICMPV6_PKT_TOOBIG error
skb_dst_update_pmtu(), can create a RTF_CACHE clone
icmpv6_send()
...
udpv6_err()
ip6_sk_update_pmtu()
ip6_update_pmtu(), can create a RTF_CACHE clone
...
ip6_datagram_dst_update()
ip6_dst_store()
And after commit 33c162a980fe ("ipv6: datagram: Update dst cache of
a connected datagram sk during pmtu update"), the UDPv6 error handler
can update socket's dst cache, but it can happen before the update in
the end of udpv6_sendmsg(), preventing getting the new dst cache on
the next udpv6_sendmsg() calls.
In order to fix it, save dst in a connected socket only if the current
socket's dst cache is invalid.
The previous patch prepared ip6_sk_dst_lookup_flow() to do that with
the new argument, and this patch enables it in udpv6_sendmsg().
Fixes: 33c162a980fe ("ipv6: datagram: Update dst cache of a connected datagram sk during pmtu update")
Fixes: 45e4fd26683c ("ipv6: Only create RTF_CACHE routes after encountering pmtu exception")
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-03 20:00:10 +08:00
|
|
|
dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p, connected);
|
2011-03-02 05:19:07 +08:00
|
|
|
if (IS_ERR(dst)) {
|
|
|
|
err = PTR_ERR(dst);
|
|
|
|
dst = NULL;
|
2005-04-17 06:20:36 +08:00
|
|
|
goto out;
|
2007-05-25 09:17:54 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2016-05-03 12:40:07 +08:00
|
|
|
if (ipc6.hlimit < 0)
|
|
|
|
ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (msg->msg_flags&MSG_CONFIRM)
|
|
|
|
goto do_confirm;
|
|
|
|
back_from_confirm:
|
|
|
|
|
2015-01-31 23:40:17 +08:00
|
|
|
/* Lockless fast path for the non-corking case */
|
|
|
|
if (!corkreq) {
|
2018-04-27 01:42:15 +08:00
|
|
|
struct inet_cork_full cork;
|
2015-01-31 23:40:17 +08:00
|
|
|
struct sk_buff *skb;
|
|
|
|
|
|
|
|
skb = ip6_make_skb(sk, getfrag, msg, ulen,
|
2016-05-03 12:40:07 +08:00
|
|
|
sizeof(struct udphdr), &ipc6,
|
2015-01-31 23:40:17 +08:00
|
|
|
&fl6, (struct rt6_info *)dst,
|
2018-07-06 22:12:57 +08:00
|
|
|
msg->msg_flags, &cork);
|
2015-01-31 23:40:17 +08:00
|
|
|
err = PTR_ERR(skb);
|
|
|
|
if (!IS_ERR_OR_NULL(skb))
|
udp: generate gso with UDP_SEGMENT
Support generic segmentation offload for udp datagrams. Callers can
concatenate and send at once the payload of multiple datagrams with
the same destination.
To set segment size, the caller sets socket option UDP_SEGMENT to the
length of each discrete payload. This value must be smaller than or
equal to the relevant MTU.
A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a
per send call basis.
Total byte length may then exceed MTU. If not an exact multiple of
segment size, the last segment will be shorter.
The implementation adds a gso_size field to the udp socket, ip(v6)
cmsg cookie and inet_cork structure to be able to set the value at
setsockopt or cmsg time and to work with both lockless and corked
paths.
Initial benchmark numbers show UDP GSO about as expensive as TCP GSO.
tcp tso
3197 MB/s 54232 msg/s 54232 calls/s
6,457,754,262 cycles
tcp gso
1765 MB/s 29939 msg/s 29939 calls/s
11,203,021,806 cycles
tcp without tso/gso *
739 MB/s 12548 msg/s 12548 calls/s
11,205,483,630 cycles
udp
876 MB/s 14873 msg/s 624666 calls/s
11,205,777,429 cycles
udp gso
2139 MB/s 36282 msg/s 36282 calls/s
11,204,374,561 cycles
[*] after reverting commit 0a6b2a1dc2a2
("tcp: switch to GSO being always on")
Measured total system cycles ('-a') for one core while pinning both
the network receive path and benchmark process to that core:
perf stat -a -C 12 -e cycles \
./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4
Note the reduction in calls/s with GSO. Bytes per syscall
increases from 1470 to 61818.
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-27 01:42:17 +08:00
|
|
|
err = udp_v6_send_skb(skb, &fl6, &cork.base);
|
ipv6: udp: set dst cache for a connected sk if current not valid
A new RTF_CACHE route can be created between ip6_sk_dst_lookup_flow()
and ip6_dst_store() calls in udpv6_sendmsg(), when sending a datagram
results in an ICMPV6_PKT_TOOBIG error:
udp_v6_send_skb(), for example with vti6 tunnel:
vti6_xmit(), get ICMPV6_PKT_TOOBIG error
skb_dst_update_pmtu(), can create a RTF_CACHE clone
icmpv6_send()
...
udpv6_err()
ip6_sk_update_pmtu()
ip6_update_pmtu(), can create a RTF_CACHE clone
...
ip6_datagram_dst_update()
ip6_dst_store()
And after commit 33c162a980fe ("ipv6: datagram: Update dst cache of
a connected datagram sk during pmtu update"), the UDPv6 error handler
can update socket's dst cache, but it can happen before the update in
the end of udpv6_sendmsg(), preventing getting the new dst cache on
the next udpv6_sendmsg() calls.
In order to fix it, save dst in a connected socket only if the current
socket's dst cache is invalid.
The previous patch prepared ip6_sk_dst_lookup_flow() to do that with
the new argument, and this patch enables it in udpv6_sendmsg().
Fixes: 33c162a980fe ("ipv6: datagram: Update dst cache of a connected datagram sk during pmtu update")
Fixes: 45e4fd26683c ("ipv6: Only create RTF_CACHE routes after encountering pmtu exception")
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-03 20:00:10 +08:00
|
|
|
goto out;
|
2015-01-31 23:40:17 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
lock_sock(sk);
|
|
|
|
if (unlikely(up->pending)) {
|
|
|
|
/* The socket is already corked while preparing it. */
|
|
|
|
/* ... which is an evident application bug. --ANK */
|
|
|
|
release_sock(sk);
|
|
|
|
|
2014-11-12 02:59:17 +08:00
|
|
|
net_dbg_ratelimited("udp cork app bug 2\n");
|
2005-04-17 06:20:36 +08:00
|
|
|
err = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
up->pending = AF_INET6;
|
|
|
|
|
|
|
|
do_append_data:
|
2016-05-03 12:40:07 +08:00
|
|
|
if (ipc6.dontfrag < 0)
|
|
|
|
ipc6.dontfrag = np->dontfrag;
|
2005-04-17 06:20:36 +08:00
|
|
|
up->len += ulen;
|
2016-05-03 12:40:07 +08:00
|
|
|
err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr),
|
|
|
|
&ipc6, &fl6, (struct rt6_info *)dst,
|
2018-07-06 22:12:57 +08:00
|
|
|
corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (err)
|
|
|
|
udp_v6_flush_pending_frames(sk);
|
|
|
|
else if (!corkreq)
|
2006-11-28 01:29:59 +08:00
|
|
|
err = udp_v6_push_pending_frames(sk);
|
2006-10-04 05:35:49 +08:00
|
|
|
else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
|
|
|
|
up->pending = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2015-01-31 23:40:17 +08:00
|
|
|
if (err > 0)
|
|
|
|
err = np->recverr ? net_xmit_errno(err) : 0;
|
|
|
|
release_sock(sk);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
out:
|
2008-06-04 00:30:25 +08:00
|
|
|
dst_release(dst);
|
2018-05-25 23:55:23 +08:00
|
|
|
out_no_dst:
|
2005-04-17 06:20:36 +08:00
|
|
|
fl6_sock_release(flowlabel);
|
2015-11-30 11:37:57 +08:00
|
|
|
txopt_put(opt_to_free);
|
2007-09-15 08:15:01 +08:00
|
|
|
if (!err)
|
2005-04-17 06:20:36 +08:00
|
|
|
return len;
|
2006-08-15 15:00:09 +08:00
|
|
|
/*
|
|
|
|
* ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting
|
|
|
|
* ENOBUFS might not be good (it's not tunable per se), but otherwise
|
|
|
|
* we don't have a good statistic (IpOutDiscards but it can be too many
|
|
|
|
* things). We could add another new stat but at least for now that
|
|
|
|
* seems like overkill.
|
|
|
|
*/
|
|
|
|
if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
|
net: snmp: kill various STATS_USER() helpers
In the old days (before linux-3.0), SNMP counters were duplicated,
one for user context, and one for BH context.
After commit 8f0ea0fe3a03 ("snmp: reduce percpu needs by 50%")
we have a single copy, and what really matters is preemption being
enabled or disabled, since we use this_cpu_inc() or __this_cpu_inc()
respectively.
We therefore kill SNMP_INC_STATS_USER(), SNMP_ADD_STATS_USER(),
NET_INC_STATS_USER(), NET_ADD_STATS_USER(), SCTP_INC_STATS_USER(),
SNMP_INC_STATS64_USER(), SNMP_ADD_STATS64_USER(), TCP_ADD_STATS_USER(),
UDP_INC_STATS_USER(), UDP6_INC_STATS_USER(), and XFRM_INC_STATS_USER()
Following patches will rename __BH helpers to make clear their
usage is not tied to BH being disabled.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-28 07:44:27 +08:00
|
|
|
UDP6_INC_STATS(sock_net(sk),
|
|
|
|
UDP_MIB_SNDBUFERRORS, is_udplite);
|
2006-08-15 15:00:09 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
return err;
|
|
|
|
|
|
|
|
do_confirm:
|
2017-02-07 05:14:16 +08:00
|
|
|
if (msg->msg_flags & MSG_PROBE)
|
|
|
|
dst_confirm_neigh(dst, &fl6.daddr);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!(msg->msg_flags&MSG_PROBE) || len)
|
|
|
|
goto back_from_confirm;
|
|
|
|
err = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2008-06-15 08:04:49 +08:00
|
|
|
/*
 * udpv6_destroy_sock - tear down a UDPv6 socket.
 * @sk: socket being destroyed
 *
 * Flushes any pending (corked) frames under the socket lock, then, if
 * encapsulation is in use on this socket, invokes the optional
 * encap_destroy callback, and finally calls inet6_destroy_sock().
 */
void udpv6_destroy_sock(struct sock *sk)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2013-03-19 14:11:12 +08:00
|
|
|
struct udp_sock *up = udp_sk(sk);
|
2005-04-17 06:20:36 +08:00
|
|
|
lock_sock(sk);
|
|
|
|
udp_v6_flush_pending_frames(sk);
|
|
|
|
release_sock(sk);
|
|
|
|
|
2018-05-09 00:07:03 +08:00
|
|
|
/*
 * Only look at the encap callback when the udpv6_encap_needed_key static
 * branch is enabled and this socket has a non-zero encap_type.
 */
if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) {
|
2013-03-19 14:11:12 +08:00
|
|
|
void (*encap_destroy)(struct sock *sk);
|
locking/atomics: COCCINELLE/treewide: Convert trivial ACCESS_ONCE() patterns to READ_ONCE()/WRITE_ONCE()
Please do not apply this to mainline directly, instead please re-run the
coccinelle script shown below and apply its output.
For several reasons, it is desirable to use {READ,WRITE}_ONCE() in
preference to ACCESS_ONCE(), and new code is expected to use one of the
former. So far, there's been no reason to change most existing uses of
ACCESS_ONCE(), as these aren't harmful, and changing them results in
churn.
However, for some features, the read/write distinction is critical to
correct operation. To distinguish these cases, separate read/write
accessors must be used. This patch migrates (most) remaining
ACCESS_ONCE() instances to {READ,WRITE}_ONCE(), using the following
coccinelle script:
----
// Convert trivial ACCESS_ONCE() uses to equivalent READ_ONCE() and
// WRITE_ONCE()
// $ make coccicheck COCCI=/home/mark/once.cocci SPFLAGS="--include-headers" MODE=patch
virtual patch
@ depends on patch @
expression E1, E2;
@@
- ACCESS_ONCE(E1) = E2
+ WRITE_ONCE(E1, E2)
@ depends on patch @
expression E;
@@
- ACCESS_ONCE(E)
+ READ_ONCE(E)
----
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: davem@davemloft.net
Cc: linux-arch@vger.kernel.org
Cc: mpe@ellerman.id.au
Cc: shuah@kernel.org
Cc: snitzer@redhat.com
Cc: thor.thayer@linux.intel.com
Cc: tj@kernel.org
Cc: viro@zeniv.linux.org.uk
Cc: will.deacon@arm.com
Link: http://lkml.kernel.org/r/1508792849-3115-19-git-send-email-paulmck@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-24 05:07:29 +08:00
|
|
|
/* Read the pointer once into a local, so the NULL test and the call use the same value. */
encap_destroy = READ_ONCE(up->encap_destroy);
|
2013-03-19 14:11:12 +08:00
|
|
|
if (encap_destroy)
|
|
|
|
encap_destroy(sk);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
inet6_destroy_sock(sk);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Socket option code for UDP
|
|
|
|
*/
|
2006-11-28 03:10:57 +08:00
|
|
|
int udpv6_setsockopt(struct sock *sk, int level, int optname,
|
2009-10-01 07:12:20 +08:00
|
|
|
char __user *optval, unsigned int optlen)
|
2006-03-21 14:45:21 +08:00
|
|
|
{
|
2008-03-07 08:22:02 +08:00
|
|
|
if (level == SOL_UDP || level == SOL_UDPLITE)
|
2006-11-28 01:29:59 +08:00
|
|
|
return udp_lib_setsockopt(sk, level, optname, optval, optlen,
|
|
|
|
udp_v6_push_pending_frames);
|
2006-11-28 03:10:57 +08:00
|
|
|
return ipv6_setsockopt(sk, level, optname, optval, optlen);
|
2006-03-21 14:45:21 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_COMPAT
|
2006-11-28 03:10:57 +08:00
|
|
|
int compat_udpv6_setsockopt(struct sock *sk, int level, int optname,
|
2009-10-01 07:12:20 +08:00
|
|
|
char __user *optval, unsigned int optlen)
|
2006-03-21 14:45:21 +08:00
|
|
|
{
|
2008-03-07 08:22:02 +08:00
|
|
|
if (level == SOL_UDP || level == SOL_UDPLITE)
|
2006-11-28 01:29:59 +08:00
|
|
|
return udp_lib_setsockopt(sk, level, optname, optval, optlen,
|
|
|
|
udp_v6_push_pending_frames);
|
2006-11-28 03:10:57 +08:00
|
|
|
return compat_ipv6_setsockopt(sk, level, optname, optval, optlen);
|
2006-03-21 14:45:21 +08:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2006-11-28 03:10:57 +08:00
|
|
|
/*
 * udpv6_getsockopt - read a socket option from a UDPv6 socket.
 *
 * SOL_UDP / SOL_UDPLITE options are answered by udp_lib_getsockopt();
 * every other level is forwarded to ipv6_getsockopt().
 * Returns whatever the chosen handler returns.
 */
int udpv6_getsockopt(struct sock *sk, int level, int optname,
		     char __user *optval, int __user *optlen)
{
	const bool udp_level = (level == SOL_UDP || level == SOL_UDPLITE);

	if (!udp_level)
		return ipv6_getsockopt(sk, level, optname, optval, optlen);

	return udp_lib_getsockopt(sk, level, optname, optval, optlen);
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_COMPAT
|
2006-11-28 03:10:57 +08:00
|
|
|
/*
 * compat_udpv6_getsockopt - CONFIG_COMPAT variant of udpv6_getsockopt().
 *
 * SOL_UDP / SOL_UDPLITE options are answered by udp_lib_getsockopt();
 * all other levels are forwarded to compat_ipv6_getsockopt().
 */
int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
			    char __user *optval, int __user *optlen)
{
	const bool udp_level = (level == SOL_UDP || level == SOL_UDPLITE);

	if (!udp_level)
		return compat_ipv6_getsockopt(sk, level, optname, optval,
					      optlen);

	return udp_lib_getsockopt(sk, level, optname, optval, optlen);
}
|
|
|
|
#endif
|
|
|
|
|
2017-08-29 06:14:20 +08:00
|
|
|
/* thinking of making this const? Don't.
|
|
|
|
* early_demux can change based on sysctl.
|
|
|
|
*/
|
2017-03-24 03:34:16 +08:00
|
|
|
static struct inet6_protocol udpv6_protocol = {
|
2017-03-09 07:36:49 +08:00
|
|
|
.early_demux = udp_v6_early_demux,
|
2017-03-24 03:34:16 +08:00
|
|
|
.early_demux_handler = udp_v6_early_demux,
|
2005-04-17 06:20:36 +08:00
|
|
|
.handler = udpv6_rcv,
|
|
|
|
.err_handler = udpv6_err,
|
|
|
|
.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
|
|
|
|
};
|
|
|
|
|
|
|
|
/* ------------------------------------------------------------------------ */
|
|
|
|
#ifdef CONFIG_PROC_FS
|
2006-11-28 03:10:57 +08:00
|
|
|
int udp6_seq_show(struct seq_file *seq, void *v)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2013-05-31 23:05:48 +08:00
|
|
|
if (v == SEQ_START_TOKEN) {
|
|
|
|
seq_puts(seq, IPV6_SEQ_DGRAM_HEADER);
|
|
|
|
} else {
|
|
|
|
int bucket = ((struct udp_iter_state *)seq->private)->bucket;
|
|
|
|
struct inet_sock *inet = inet_sk(v);
|
|
|
|
__u16 srcp = ntohs(inet->inet_sport);
|
|
|
|
__u16 destp = ntohs(inet->inet_dport);
|
2018-06-08 17:35:40 +08:00
|
|
|
__ip6_dgram_sock_seq_show(seq, v, srcp, destp,
|
|
|
|
udp_rqueue_get(v), bucket);
|
2013-05-31 23:05:48 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-04-11 01:42:55 +08:00
|
|
|
const struct seq_operations udp6_seq_ops = {
|
2018-04-11 03:31:50 +08:00
|
|
|
.start = udp_seq_start,
|
|
|
|
.next = udp_seq_next,
|
|
|
|
.stop = udp_seq_stop,
|
|
|
|
.show = udp6_seq_show,
|
|
|
|
};
|
2018-04-11 01:42:55 +08:00
|
|
|
EXPORT_SYMBOL(udp6_seq_ops);
|
2011-10-30 14:46:30 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
static struct udp_seq_afinfo udp6_seq_afinfo = {
|
|
|
|
.family = AF_INET6,
|
2008-10-29 16:41:45 +08:00
|
|
|
.udp_table = &udp_table,
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
2010-01-17 11:35:32 +08:00
|
|
|
/*
 * udp6_proc_init - create the "udp6" entry under @net's proc_net directory.
 *
 * The entry is read-only (0444), uses udp6_seq_ops, carries a
 * struct udp_iter_state as per-open state and udp6_seq_afinfo as data.
 * Returns 0 on success, -ENOMEM if the entry could not be created.
 */
int __net_init udp6_proc_init(struct net *net)
{
	return proc_create_net_data("udp6", 0444, net->proc_net,
				    &udp6_seq_ops,
				    sizeof(struct udp_iter_state),
				    &udp6_seq_afinfo) ? 0 : -ENOMEM;
}
|
|
|
|
|
2015-08-15 05:43:38 +08:00
|
|
|
void udp6_proc_exit(struct net *net)
|
|
|
|
{
|
2018-04-11 03:31:50 +08:00
|
|
|
remove_proc_entry("udp6", net->proc_net);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
#endif /* CONFIG_PROC_FS */
|
|
|
|
|
|
|
|
/* ------------------------------------------------------------------------ */
|
|
|
|
|
|
|
|
struct proto udpv6_prot = {
|
2018-03-14 12:57:16 +08:00
|
|
|
.name = "UDPv6",
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.close = udp_lib_close,
|
2018-03-31 06:08:05 +08:00
|
|
|
.pre_connect = udpv6_pre_connect,
|
2018-03-14 12:57:16 +08:00
|
|
|
.connect = ip6_datagram_connect,
|
|
|
|
.disconnect = udp_disconnect,
|
|
|
|
.ioctl = udp_ioctl,
|
|
|
|
.init = udp_init_sock,
|
|
|
|
.destroy = udpv6_destroy_sock,
|
|
|
|
.setsockopt = udpv6_setsockopt,
|
|
|
|
.getsockopt = udpv6_getsockopt,
|
|
|
|
.sendmsg = udpv6_sendmsg,
|
|
|
|
.recvmsg = udpv6_recvmsg,
|
|
|
|
.release_cb = ip6_datagram_release_cb,
|
|
|
|
.hash = udp_lib_hash,
|
|
|
|
.unhash = udp_lib_unhash,
|
|
|
|
.rehash = udp_v6_rehash,
|
|
|
|
.get_port = udp_v6_get_port,
|
|
|
|
.memory_allocated = &udp_memory_allocated,
|
|
|
|
.sysctl_mem = sysctl_udp_mem,
|
|
|
|
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
|
|
|
|
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
|
|
|
|
.obj_size = sizeof(struct udp6_sock),
|
|
|
|
.h.udp_table = &udp_table,
|
2006-03-21 14:45:21 +08:00
|
|
|
#ifdef CONFIG_COMPAT
|
2018-03-14 12:57:16 +08:00
|
|
|
.compat_setsockopt = compat_udpv6_setsockopt,
|
|
|
|
.compat_getsockopt = compat_udpv6_getsockopt,
|
2006-03-21 14:45:21 +08:00
|
|
|
#endif
|
2018-03-14 12:57:16 +08:00
|
|
|
.diag_destroy = udp_abort,
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
static struct inet_protosw udpv6_protosw = {
|
|
|
|
.type = SOCK_DGRAM,
|
|
|
|
.protocol = IPPROTO_UDP,
|
|
|
|
.prot = &udpv6_prot,
|
|
|
|
.ops = &inet6_dgram_ops,
|
|
|
|
.flags = INET_PROTOSW_PERMANENT,
|
|
|
|
};
|
|
|
|
|
2007-12-11 18:25:35 +08:00
|
|
|
/*
 * udpv6_init - register UDP with the IPv6 stack.
 *
 * Adds udpv6_protocol as the IPPROTO_UDP handler, then registers
 * udpv6_protosw.  If the second step fails, the protocol handler added in
 * the first step is removed again.  Returns 0 on success or the error
 * code of the step that failed.
 */
int __init udpv6_init(void)
{
	int err = inet6_add_protocol(&udpv6_protocol, IPPROTO_UDP);

	if (err)
		return err;

	err = inet6_register_protosw(&udpv6_protosw);
	if (err)
		inet6_del_protocol(&udpv6_protocol, IPPROTO_UDP);

	return err;
}
|
|
|
|
|
2007-12-13 21:34:58 +08:00
|
|
|
void udpv6_exit(void)
|
2007-12-11 18:25:35 +08:00
|
|
|
{
|
|
|
|
inet6_unregister_protosw(&udpv6_protosw);
|
|
|
|
inet6_del_protocol(&udpv6_protocol, IPPROTO_UDP);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|