inet: add IP_BIND_ADDRESS_NO_PORT to overcome bind(0) limitations
When an application needs to force a source IP on an active TCP socket it has to use bind(IP, port=x). As most applications do not want to deal with already used ports, x is often set to 0, meaning the kernel is in charge to find an available port. But kernel does not know yet if this socket is going to be a listener or be connected. It has very limited choices (no full knowledge of final 4-tuple for a connect()) With limited ephemeral port range (about 32K ports), it is very easy to fill the space. This patch adds a new SOL_IP socket option, asking kernel to ignore the 0 port provided by application in bind(IP, port=0) and only remember the given IP address. The port will be automatically chosen at connect() time, in a way that allows sharing a source port as long as the 4-tuples are unique. This new feature is available for both IPv4 and IPv6 (Thanks Neal) Tested: Wrote a test program and checked its behavior on IPv4 and IPv6. strace(1) shows sequences of bind(IP=127.0.0.2, port=0) followed by connect(). Also getsockname() show that the port is still 0 right after bind() but properly allocated after connect(). socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 5 setsockopt(5, SOL_IP, IP_BIND_ADDRESS_NO_PORT, [1], 4) = 0 bind(5, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, 16) = 0 getsockname(5, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0 connect(5, {sa_family=AF_INET, sin_port=htons(53174), sin_addr=inet_addr("127.0.0.3")}, 16) = 0 getsockname(5, {sa_family=AF_INET, sin_port=htons(38050), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0 IPv6 test : socket(PF_INET6, SOCK_STREAM, IPPROTO_IP) = 7 setsockopt(7, SOL_IP, IP_BIND_ADDRESS_NO_PORT, [1], 4) = 0 bind(7, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = 0 getsockname(7, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, [28]) = 0 connect(7, {sa_family=AF_INET6, sin6_port=htons(57300), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = 0 getsockname(7, {sa_family=AF_INET6, sin6_port=htons(60964), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, [28]) = 0 I was able to bind()/connect() a million concurrent IPv4 sockets, instead of ~32000 before patch. lpaa23:~# ulimit -n 1000010 lpaa23:~# ./bind --connect --num-flows=1000000 & 1000000 sockets lpaa23:~# grep TCP /proc/net/sockstat TCP: inuse 2000063 orphan 0 tw 47 alloc 2000157 mem 66 Check that a given source port is indeed used by many different connections : lpaa23:~# ss -t src :40000 | head -10 State Recv-Q Send-Q Local Address:Port Peer Address:Port ESTAB 0 0 127.0.0.2:40000 127.0.202.33:44983 ESTAB 0 0 127.0.0.2:40000 127.2.27.240:44983 ESTAB 0 0 127.0.0.2:40000 127.2.98.5:44983 ESTAB 0 0 127.0.0.2:40000 127.0.124.196:44983 ESTAB 0 0 127.0.0.2:40000 127.2.139.38:44983 ESTAB 0 0 127.0.0.2:40000 127.1.59.80:44983 ESTAB 0 0 127.0.0.2:40000 127.3.6.228:44983 ESTAB 0 0 127.0.0.2:40000 127.0.38.53:44983 ESTAB 0 0 127.0.0.2:40000 127.1.197.10:44983 Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
513d1a1d1c
commit
90c337da15
|
@ -187,6 +187,7 @@ struct inet_sock {
|
|||
transparent:1,
|
||||
mc_all:1,
|
||||
nodefrag:1;
|
||||
__u8 bind_address_no_port:1;
|
||||
__u8 rcv_tos;
|
||||
__u8 convert_csum;
|
||||
int uc_index;
|
||||
|
|
|
@ -112,6 +112,7 @@ struct in_addr {
|
|||
#define IP_MINTTL 21
|
||||
#define IP_NODEFRAG 22
|
||||
#define IP_CHECKSUM 23
|
||||
#define IP_BIND_ADDRESS_NO_PORT 24
|
||||
|
||||
/* IP_MTU_DISCOVER values */
|
||||
#define IP_PMTUDISC_DONT 0 /* Never send DF frames */
|
||||
|
|
|
@ -488,7 +488,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
|
|||
inet->inet_saddr = 0; /* Use device */
|
||||
|
||||
/* Make sure we are allowed to bind here. */
|
||||
if (sk->sk_prot->get_port(sk, snum)) {
|
||||
if ((snum || !inet->bind_address_no_port) &&
|
||||
sk->sk_prot->get_port(sk, snum)) {
|
||||
inet->inet_saddr = inet->inet_rcv_saddr = 0;
|
||||
err = -EADDRINUSE;
|
||||
goto out_release_sock;
|
||||
|
|
|
@ -582,6 +582,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
|
|||
case IP_TRANSPARENT:
|
||||
case IP_MINTTL:
|
||||
case IP_NODEFRAG:
|
||||
case IP_BIND_ADDRESS_NO_PORT:
|
||||
case IP_UNICAST_IF:
|
||||
case IP_MULTICAST_TTL:
|
||||
case IP_MULTICAST_ALL:
|
||||
|
@ -732,6 +733,9 @@ static int do_ip_setsockopt(struct sock *sk, int level,
|
|||
}
|
||||
inet->nodefrag = val ? 1 : 0;
|
||||
break;
|
||||
case IP_BIND_ADDRESS_NO_PORT:
|
||||
inet->bind_address_no_port = val ? 1 : 0;
|
||||
break;
|
||||
case IP_MTU_DISCOVER:
|
||||
if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
|
||||
goto e_inval;
|
||||
|
@ -1324,6 +1328,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
|
|||
case IP_NODEFRAG:
|
||||
val = inet->nodefrag;
|
||||
break;
|
||||
case IP_BIND_ADDRESS_NO_PORT:
|
||||
val = inet->bind_address_no_port;
|
||||
break;
|
||||
case IP_MTU_DISCOVER:
|
||||
val = inet->pmtudisc;
|
||||
break;
|
||||
|
|
|
@ -362,7 +362,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
|
|||
np->saddr = addr->sin6_addr;
|
||||
|
||||
/* Make sure we are allowed to bind here. */
|
||||
if (sk->sk_prot->get_port(sk, snum)) {
|
||||
if ((snum || !inet->bind_address_no_port) &&
|
||||
sk->sk_prot->get_port(sk, snum)) {
|
||||
inet_reset_saddr(sk);
|
||||
err = -EADDRINUSE;
|
||||
goto out;
|
||||
|
|
Loading…
Reference in New Issue