OpenCloudOS-Kernel/net/unix/af_unix.c

2945 lines
68 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* NET4: Implementation of BSD Unix domain sockets.
*
* Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk>
*
* Fixes:
* Linus Torvalds : Assorted bug cures.
* Niibe Yutaka : async I/O support.
* Carsten Paeth : PF_UNIX check, address fixes.
* Alan Cox : Limit size of allocated blocks.
* Alan Cox : Fixed the stupid socketpair bug.
* Alan Cox : BSD compatibility fine tuning.
* Alan Cox : Fixed a bug in connect when interrupted.
* Alan Cox : Sorted out a proper draft version of
* file descriptor passing hacked up from
* Mike Shaver's work.
* Marty Leisner : Fixes to fd passing
* Nick Nevin : recvmsg bugfix.
* Alan Cox : Started proper garbage collector
* Heiko EiBfeldt : Missing verify_area check
* Alan Cox : Started POSIXisms
* Andreas Schwab : Replace inode by dentry for proper
* reference counting
* Kirk Petersen : Made this a module
* Christoph Rohland : Elegant non-blocking accept/connect algorithm.
* Lots of bug fixes.
* Alexey Kuznetosv : Repaired (I hope) bugs introduces
* by above two patches.
* Andrea Arcangeli : If possible we block in connect(2)
* if the max backlog of the listen socket
* is been reached. This won't break
* old apps and it will avoid huge amount
* of socks hashed (this for unix_gc()
* performances reasons).
* Security fix that limits the max
* number of socks to 2*max_files and
* the number of skb queueable in the
* dgram receiver.
* Artur Skawina : Hash function optimizations
* Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8)
* Malcolm Beattie : Set peercred for socketpair
* Michal Ostrowski : Module initialization cleanup.
* Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT,
* the core infrastructure is doing that
* for all net proto families now (2.5.69+)
*
* Known differences from reference BSD that was tested:
*
* [TO FIX]
* ECONNREFUSED is not returned from one end of a connected() socket to the
* other the moment one end closes.
* fstat() doesn't return st_dev=0, and give the blksize as high water mark
* and a fake inode identifier (nor the BSD first socket fstat twice bug).
* [NOT TO FIX]
* accept() returns a path name even if the connecting socket has closed
* in the meantime (BSD loses the path and gives up).
* accept() returns 0 length path for an unbound connector. BSD returns 16
* and a null first byte in the path (but not for gethost/peername - BSD bug ??)
* socketpair(...SOCK_RAW..) doesn't panic the kernel.
* BSD af_unix apparently has connect forgetting to block properly.
* (need to check this with the POSIX spec in detail)
*
* Differences from 2.0.0-11-... (ANK)
* Bug fixes and improvements.
* - client shutdown killed server socket.
* - removed all useless cli/sti pairs.
*
* Semantic changes/extensions.
* - generic control message passing.
* - SCM_CREDENTIALS control message.
* - "Abstract" (not FS based) socket bindings.
* Abstract names are sequences of bytes (not zero terminated)
* started by 0, so that this name space does not intersect
* with BSD names.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/freezer.h>
#include <linux/file.h>
#include "scm.h"
struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
DEFINE_SPINLOCK(unix_table_lock);
EXPORT_SYMBOL_GPL(unix_table_lock);
fs: allow for more than 2^31 files Robin Holt tried to boot a 16TB system and found af_unix was overflowing a 32bit value : <quote> We were seeing a failure which prevented boot. The kernel was incapable of creating either a named pipe or unix domain socket. This comes down to a common kernel function called unix_create1() which does: atomic_inc(&unix_nr_socks); if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) goto out; The function get_max_files() is a simple return of files_stat.max_files. files_stat.max_files is a signed integer and is computed in fs/file_table.c's files_init(). n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = n; In our case, mempages (total_ram_pages) is approx 3,758,096,384 (0xe0000000). That leaves max_files at approximately 1,503,238,553. This causes 2 * get_max_files() to integer overflow. </quote> Fix is to let /proc/sys/fs/file-nr & /proc/sys/fs/file-max use long integers, and change af_unix to use an atomic_long_t instead of atomic_t. get_max_files() is changed to return an unsigned long. get_nr_files() is changed to return a long. unix_nr_socks is changed from atomic_t to atomic_long_t, while not strictly needed to address Robin problem. Before patch (on a 64bit kernel) : # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max -18446744071562067968 After patch: # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max 2147483648 # cat /proc/sys/fs/file-nr 704 0 2147483648 Reported-by: Robin Holt <holt@sgi.com> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Acked-by: David Miller <davem@davemloft.net> Reviewed-by: Robin Holt <holt@sgi.com> Tested-by: Robin Holt <holt@sgi.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Christoph Hellwig <hch@lst.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-10-27 05:22:44 +08:00
static atomic_long_t unix_nr_socks;
static struct hlist_head *unix_sockets_unbound(void *addr)
{
unsigned long hash = (unsigned long)addr;
hash ^= hash >> 16;
hash ^= hash >> 8;
hash %= UNIX_HASH_SIZE;
return &unix_socket_table[UNIX_HASH_SIZE + hash];
}
#define UNIX_ABSTRACT(sk) (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
[AF_UNIX]: Datagram getpeersec This patch implements an API whereby an application can determine the label of its peer's Unix datagram sockets via the auxiliary data mechanism of recvmsg. Patch purpose: This patch enables a security-aware application to retrieve the security context of the peer of a Unix datagram socket. The application can then use this security context to determine the security context for processing on behalf of the peer who sent the packet. Patch design and implementation: The design and implementation is very similar to the UDP case for INET sockets. Basically we build upon the existing Unix domain socket API for retrieving user credentials. Linux offers the API for obtaining user credentials via ancillary messages (i.e., out of band/control messages that are bundled together with a normal message). To retrieve the security context, the application first indicates to the kernel such desire by setting the SO_PASSSEC option via getsockopt. Then the application retrieves the security context using the auxiliary data mechanism. An example server application for Unix datagram socket should look like this: toggle = 1; toggle_len = sizeof(toggle); setsockopt(sockfd, SOL_SOCKET, SO_PASSSEC, &toggle, &toggle_len); recvmsg(sockfd, &msg_hdr, 0); if (msg_hdr.msg_controllen > sizeof(struct cmsghdr)) { cmsg_hdr = CMSG_FIRSTHDR(&msg_hdr); if (cmsg_hdr->cmsg_len <= CMSG_LEN(sizeof(scontext)) && cmsg_hdr->cmsg_level == SOL_SOCKET && cmsg_hdr->cmsg_type == SCM_SECURITY) { memcpy(&scontext, CMSG_DATA(cmsg_hdr), sizeof(scontext)); } } sock_setsockopt is enhanced with a new socket option SOCK_PASSSEC to allow a server socket to receive security context of the peer. Testing: We have tested the patch by setting up Unix datagram client and server applications. We verified that the server can retrieve the security context using the auxiliary data mechanism of recvmsg. Signed-off-by: Catherine Zhang <cxzhang@watson.ibm.com> Acked-by: Acked-by: James Morris <jmorris@namei.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2006-06-30 03:27:47 +08:00
#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
[AF_UNIX]: Datagram getpeersec This patch implements an API whereby an application can determine the label of its peer's Unix datagram sockets via the auxiliary data mechanism of recvmsg. Patch purpose: This patch enables a security-aware application to retrieve the security context of the peer of a Unix datagram socket. The application can then use this security context to determine the security context for processing on behalf of the peer who sent the packet. Patch design and implementation: The design and implementation is very similar to the UDP case for INET sockets. Basically we build upon the existing Unix domain socket API for retrieving user credentials. Linux offers the API for obtaining user credentials via ancillary messages (i.e., out of band/control messages that are bundled together with a normal message). To retrieve the security context, the application first indicates to the kernel such desire by setting the SO_PASSSEC option via getsockopt. Then the application retrieves the security context using the auxiliary data mechanism. An example server application for Unix datagram socket should look like this: toggle = 1; toggle_len = sizeof(toggle); setsockopt(sockfd, SOL_SOCKET, SO_PASSSEC, &toggle, &toggle_len); recvmsg(sockfd, &msg_hdr, 0); if (msg_hdr.msg_controllen > sizeof(struct cmsghdr)) { cmsg_hdr = CMSG_FIRSTHDR(&msg_hdr); if (cmsg_hdr->cmsg_len <= CMSG_LEN(sizeof(scontext)) && cmsg_hdr->cmsg_level == SOL_SOCKET && cmsg_hdr->cmsg_type == SCM_SECURITY) { memcpy(&scontext, CMSG_DATA(cmsg_hdr), sizeof(scontext)); } } sock_setsockopt is enhanced with a new socket option SOCK_PASSSEC to allow a server socket to receive security context of the peer. Testing: We have tested the patch by setting up Unix datagram client and server applications. We verified that the server can retrieve the security context using the auxiliary data mechanism of recvmsg. Signed-off-by: Catherine Zhang <cxzhang@watson.ibm.com> Acked-by: Acked-by: James Morris <jmorris@namei.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2006-06-30 03:27:47 +08:00
{
UNIXCB(skb).secid = scm->secid;
[AF_UNIX]: Datagram getpeersec This patch implements an API whereby an application can determine the label of its peer's Unix datagram sockets via the auxiliary data mechanism of recvmsg. Patch purpose: This patch enables a security-aware application to retrieve the security context of the peer of a Unix datagram socket. The application can then use this security context to determine the security context for processing on behalf of the peer who sent the packet. Patch design and implementation: The design and implementation is very similar to the UDP case for INET sockets. Basically we build upon the existing Unix domain socket API for retrieving user credentials. Linux offers the API for obtaining user credentials via ancillary messages (i.e., out of band/control messages that are bundled together with a normal message). To retrieve the security context, the application first indicates to the kernel such desire by setting the SO_PASSSEC option via getsockopt. Then the application retrieves the security context using the auxiliary data mechanism. An example server application for Unix datagram socket should look like this: toggle = 1; toggle_len = sizeof(toggle); setsockopt(sockfd, SOL_SOCKET, SO_PASSSEC, &toggle, &toggle_len); recvmsg(sockfd, &msg_hdr, 0); if (msg_hdr.msg_controllen > sizeof(struct cmsghdr)) { cmsg_hdr = CMSG_FIRSTHDR(&msg_hdr); if (cmsg_hdr->cmsg_len <= CMSG_LEN(sizeof(scontext)) && cmsg_hdr->cmsg_level == SOL_SOCKET && cmsg_hdr->cmsg_type == SCM_SECURITY) { memcpy(&scontext, CMSG_DATA(cmsg_hdr), sizeof(scontext)); } } sock_setsockopt is enhanced with a new socket option SOCK_PASSSEC to allow a server socket to receive security context of the peer. Testing: We have tested the patch by setting up Unix datagram client and server applications. We verified that the server can retrieve the security context using the auxiliary data mechanism of recvmsg. Signed-off-by: Catherine Zhang <cxzhang@watson.ibm.com> Acked-by: Acked-by: James Morris <jmorris@namei.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2006-06-30 03:27:47 +08:00
}
static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
scm->secid = UNIXCB(skb).secid;
}
static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
return (scm->secid == UNIXCB(skb).secid);
[AF_UNIX]: Datagram getpeersec This patch implements an API whereby an application can determine the label of its peer's Unix datagram sockets via the auxiliary data mechanism of recvmsg. Patch purpose: This patch enables a security-aware application to retrieve the security context of the peer of a Unix datagram socket. The application can then use this security context to determine the security context for processing on behalf of the peer who sent the packet. Patch design and implementation: The design and implementation is very similar to the UDP case for INET sockets. Basically we build upon the existing Unix domain socket API for retrieving user credentials. Linux offers the API for obtaining user credentials via ancillary messages (i.e., out of band/control messages that are bundled together with a normal message). To retrieve the security context, the application first indicates to the kernel such desire by setting the SO_PASSSEC option via getsockopt. Then the application retrieves the security context using the auxiliary data mechanism. An example server application for Unix datagram socket should look like this: toggle = 1; toggle_len = sizeof(toggle); setsockopt(sockfd, SOL_SOCKET, SO_PASSSEC, &toggle, &toggle_len); recvmsg(sockfd, &msg_hdr, 0); if (msg_hdr.msg_controllen > sizeof(struct cmsghdr)) { cmsg_hdr = CMSG_FIRSTHDR(&msg_hdr); if (cmsg_hdr->cmsg_len <= CMSG_LEN(sizeof(scontext)) && cmsg_hdr->cmsg_level == SOL_SOCKET && cmsg_hdr->cmsg_type == SCM_SECURITY) { memcpy(&scontext, CMSG_DATA(cmsg_hdr), sizeof(scontext)); } } sock_setsockopt is enhanced with a new socket option SOCK_PASSSEC to allow a server socket to receive security context of the peer. Testing: We have tested the patch by setting up Unix datagram client and server applications. We verified that the server can retrieve the security context using the auxiliary data mechanism of recvmsg. Signed-off-by: Catherine Zhang <cxzhang@watson.ibm.com> Acked-by: Acked-by: James Morris <jmorris@namei.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2006-06-30 03:27:47 +08:00
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
[AF_UNIX]: Datagram getpeersec This patch implements an API whereby an application can determine the label of its peer's Unix datagram sockets via the auxiliary data mechanism of recvmsg. Patch purpose: This patch enables a security-aware application to retrieve the security context of the peer of a Unix datagram socket. The application can then use this security context to determine the security context for processing on behalf of the peer who sent the packet. Patch design and implementation: The design and implementation is very similar to the UDP case for INET sockets. Basically we build upon the existing Unix domain socket API for retrieving user credentials. Linux offers the API for obtaining user credentials via ancillary messages (i.e., out of band/control messages that are bundled together with a normal message). To retrieve the security context, the application first indicates to the kernel such desire by setting the SO_PASSSEC option via getsockopt. Then the application retrieves the security context using the auxiliary data mechanism. An example server application for Unix datagram socket should look like this: toggle = 1; toggle_len = sizeof(toggle); setsockopt(sockfd, SOL_SOCKET, SO_PASSSEC, &toggle, &toggle_len); recvmsg(sockfd, &msg_hdr, 0); if (msg_hdr.msg_controllen > sizeof(struct cmsghdr)) { cmsg_hdr = CMSG_FIRSTHDR(&msg_hdr); if (cmsg_hdr->cmsg_len <= CMSG_LEN(sizeof(scontext)) && cmsg_hdr->cmsg_level == SOL_SOCKET && cmsg_hdr->cmsg_type == SCM_SECURITY) { memcpy(&scontext, CMSG_DATA(cmsg_hdr), sizeof(scontext)); } } sock_setsockopt is enhanced with a new socket option SOCK_PASSSEC to allow a server socket to receive security context of the peer. Testing: We have tested the patch by setting up Unix datagram client and server applications. We verified that the server can retrieve the security context using the auxiliary data mechanism of recvmsg. Signed-off-by: Catherine Zhang <cxzhang@watson.ibm.com> Acked-by: Acked-by: James Morris <jmorris@namei.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2006-06-30 03:27:47 +08:00
{ }
static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }
static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
return true;
}
[AF_UNIX]: Datagram getpeersec This patch implements an API whereby an application can determine the label of its peer's Unix datagram sockets via the auxiliary data mechanism of recvmsg. Patch purpose: This patch enables a security-aware application to retrieve the security context of the peer of a Unix datagram socket. The application can then use this security context to determine the security context for processing on behalf of the peer who sent the packet. Patch design and implementation: The design and implementation is very similar to the UDP case for INET sockets. Basically we build upon the existing Unix domain socket API for retrieving user credentials. Linux offers the API for obtaining user credentials via ancillary messages (i.e., out of band/control messages that are bundled together with a normal message). To retrieve the security context, the application first indicates to the kernel such desire by setting the SO_PASSSEC option via getsockopt. Then the application retrieves the security context using the auxiliary data mechanism. An example server application for Unix datagram socket should look like this: toggle = 1; toggle_len = sizeof(toggle); setsockopt(sockfd, SOL_SOCKET, SO_PASSSEC, &toggle, &toggle_len); recvmsg(sockfd, &msg_hdr, 0); if (msg_hdr.msg_controllen > sizeof(struct cmsghdr)) { cmsg_hdr = CMSG_FIRSTHDR(&msg_hdr); if (cmsg_hdr->cmsg_len <= CMSG_LEN(sizeof(scontext)) && cmsg_hdr->cmsg_level == SOL_SOCKET && cmsg_hdr->cmsg_type == SCM_SECURITY) { memcpy(&scontext, CMSG_DATA(cmsg_hdr), sizeof(scontext)); } } sock_setsockopt is enhanced with a new socket option SOCK_PASSSEC to allow a server socket to receive security context of the peer. Testing: We have tested the patch by setting up Unix datagram client and server applications. We verified that the server can retrieve the security context using the auxiliary data mechanism of recvmsg. Signed-off-by: Catherine Zhang <cxzhang@watson.ibm.com> Acked-by: Acked-by: James Morris <jmorris@namei.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2006-06-30 03:27:47 +08:00
#endif /* CONFIG_SECURITY_NETWORK */
/*
* SMP locking strategy:
* hash table is protected with spinlock unix_table_lock
* each socket state is protected by separate spin lock.
*/
static inline unsigned int unix_hash_fold(__wsum n)
{
unsigned int hash = (__force unsigned int)csum_fold(n);
hash ^= hash>>8;
return hash&(UNIX_HASH_SIZE-1);
}
#define unix_peer(sk) (unix_sk(sk)->peer)
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
return unix_peer(osk) == sk;
}
static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}
static inline int unix_recvq_full(const struct sock *sk)
af_unix: fix 'poll for write'/ connected DGRAM sockets The unix_dgram_sendmsg routine implements a (somewhat crude) form of receiver-imposed flow control by comparing the length of the receive queue of the 'peer socket' with the max_ack_backlog value stored in the corresponding sock structure, either blocking the thread which caused the send-routine to be called or returning EAGAIN. This routine is used by both SOCK_DGRAM and SOCK_SEQPACKET sockets. The poll-implementation for these socket types is datagram_poll from core/datagram.c. A socket is deemed to be writeable by this routine when the memory presently consumed by datagrams owned by it is less than the configured socket send buffer size. This is always wrong for connected PF_UNIX non-stream sockets when the abovementioned receive queue is currently considered to be full. 'poll' will then return, indicating that the socket is writeable, but a subsequent write result in EAGAIN, effectively causing an (usual) application to 'poll for writeability by repeated send request with O_NONBLOCK set' until it has consumed its time quantum. The change below uses a suitably modified variant of the datagram_poll routines for both type of PF_UNIX sockets, which tests if the recv-queue of the peer a socket is connected to is presently considered to be 'full' as part of the 'is this socket writeable'-checking code. The socket being polled is additionally put onto the peer_wait wait queue associated with its peer, because the unix_dgram_sendmsg routine does a wake up on this queue after a datagram was received and the 'other wakeup call' is done implicitly as part of skb destruction, meaning, a process blocked in poll because of a full peer receive queue could otherwise sleep forever if no datagram owned by its socket was already sitting on this queue. Among this change is a small (inline) helper routine named 'unix_recvq_full', which consolidates the actual testing code (in three different places) into a single location. Signed-off-by: Rainer Weikusat <rweikusat@mssgmbh.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-06-18 13:28:05 +08:00
{
return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}
static inline int unix_recvq_full_lockless(const struct sock *sk)
{
return skb_queue_len_lockless(&sk->sk_receive_queue) >
READ_ONCE(sk->sk_max_ack_backlog);
}
struct sock *unix_peer_get(struct sock *s)
{
struct sock *peer;
unix_state_lock(s);
peer = unix_peer(s);
if (peer)
sock_hold(peer);
unix_state_unlock(s);
return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);
static inline void unix_release_addr(struct unix_address *addr)
{
if (refcount_dec_and_test(&addr->refcnt))
kfree(addr);
}
/*
* Check unix socket name:
* - should be not zero length.
* - if started by not zero, should be NULL terminated (FS object)
* - if started by zero, it is abstract name.
*/
static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
{
*hashp = 0;
if (len <= sizeof(short) || len > sizeof(*sunaddr))
return -EINVAL;
if (!sunaddr || sunaddr->sun_family != AF_UNIX)
return -EINVAL;
if (sunaddr->sun_path[0]) {
/*
* This may look like an off by one error but it is a bit more
* subtle. 108 is the longest valid AF_UNIX path for a binding.
* sun_path[108] doesn't as such exist. However in kernel space
* we are guaranteed that it is a valid memory location in our
* kernel address buffer.
*/
((char *)sunaddr)[len] = 0;
len = strlen(sunaddr->sun_path)+1+sizeof(short);
return len;
}
*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
return len;
}
static void __unix_remove_socket(struct sock *sk)
{
sk_del_node_init(sk);
}
static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
WARN_ON(!sk_unhashed(sk));
sk_add_node(sk, list);
}
static inline void unix_remove_socket(struct sock *sk)
{
spin_lock(&unix_table_lock);
__unix_remove_socket(sk);
spin_unlock(&unix_table_lock);
}
static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
spin_lock(&unix_table_lock);
__unix_insert_socket(list, sk);
spin_unlock(&unix_table_lock);
}
static struct sock *__unix_find_socket_byname(struct net *net,
struct sockaddr_un *sunname,
int len, int type, unsigned int hash)
{
struct sock *s;
hlist: drop the node parameter from iterators I'm not sure why, but the hlist for each entry iterators were conceived list_for_each_entry(pos, head, member) The hlist ones were greedy and wanted an extra parameter: hlist_for_each_entry(tpos, pos, head, member) Why did they need an extra pos parameter? I'm not quite sure. Not only they don't really need it, it also prevents the iterator from looking exactly like the list iterator, which is unfortunate. Besides the semantic patch, there was some manual work required: - Fix up the actual hlist iterators in linux/list.h - Fix up the declaration of other iterators based on the hlist ones. - A very small amount of places were using the 'node' parameter, this was modified to use 'obj->member' instead. - Coccinelle didn't handle the hlist_for_each_entry_safe iterator properly, so those had to be fixed up manually. The semantic patch which is mostly the work of Peter Senna Tschudin is here: @@ iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host; type T; expression a,c,d,e; identifier b; statement S; @@ -T b; <+... when != b ( hlist_for_each_entry(a, - b, c, d) S | hlist_for_each_entry_continue(a, - b, c) S | hlist_for_each_entry_from(a, - b, c) S | hlist_for_each_entry_rcu(a, - b, c, d) S | hlist_for_each_entry_rcu_bh(a, - b, c, d) S | hlist_for_each_entry_continue_rcu_bh(a, - b, c) S | for_each_busy_worker(a, c, - b, d) S | ax25_uid_for_each(a, - b, c) S | ax25_for_each(a, - b, c) S | inet_bind_bucket_for_each(a, - b, c) S | sctp_for_each_hentry(a, - b, c) S | sk_for_each(a, - b, c) S | sk_for_each_rcu(a, - b, c) S | sk_for_each_from -(a, b) +(a) S + sk_for_each_from(a) S | sk_for_each_safe(a, - b, c, d) S | sk_for_each_bound(a, - b, c) S | hlist_for_each_entry_safe(a, - b, c, d, e) S | hlist_for_each_entry_continue_rcu(a, - b, c) S | nr_neigh_for_each(a, - b, c) S | nr_neigh_for_each_safe(a, - b, c, d) S | nr_node_for_each(a, - b, c) S | nr_node_for_each_safe(a, - b, c, d) S | - for_each_gfn_sp(a, c, d, b) S + for_each_gfn_sp(a, c, d) S | - for_each_gfn_indirect_valid_sp(a, c, d, b) S + for_each_gfn_indirect_valid_sp(a, c, d) S | for_each_host(a, - b, c) S | for_each_host_safe(a, - b, c, d) S | for_each_mesh_entry(a, - b, c, d) S ) ...+> [akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c] [akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c] [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: fix warnings] [akpm@linux-foudnation.org: redo intrusive kvm changes] Tested-by: Peter Senna Tschudin <peter.senna@gmail.com> Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Sasha Levin <sasha.levin@oracle.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Gleb Natapov <gleb@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 09:06:00 +08:00
sk_for_each(s, &unix_socket_table[hash ^ type]) {
struct unix_sock *u = unix_sk(s);
if (!net_eq(sock_net(s), net))
continue;
if (u->addr->len == len &&
!memcmp(u->addr->name, sunname, len))
return s;
}
return NULL;
}
static inline struct sock *unix_find_socket_byname(struct net *net,
struct sockaddr_un *sunname,
int len, int type,
unsigned int hash)
{
struct sock *s;
spin_lock(&unix_table_lock);
s = __unix_find_socket_byname(net, sunname, len, type, hash);
if (s)
sock_hold(s);
spin_unlock(&unix_table_lock);
return s;
}
af_unix: Allow connecting to sockets in other network namespaces. Remove the restriction that only allows connecting to a unix domain socket identified by unix path that is in the same network namespace. Crossing network namespaces is always tricky and we did not support this at first, because of a strict policy of don't mix the namespaces. Later after Pavel proposed this we did not support this because no one had performed the audit to make certain using unix domain sockets across namespaces is safe. What fundamentally makes connecting to af_unix sockets in other namespaces is safe is that you have to have the proper permissions on the unix domain socket inode that lives in the filesystem. If you want strict isolation you just don't create inodes where unfriendlys can get at them, or with permissions that allow unfriendlys to open them. All nicely handled for us by the mount namespace and other standard file system facilities. I looked through unix domain sockets and they are a very controlled environment so none of the work that goes on in dev_forward_skb to make crossing namespaces safe appears needed, we are not loosing controll of the skb and so do not need to set up the skb to look like it is comming in fresh from the outside world. Further the fields in struct unix_skb_parms should not have any problems crossing network namespaces. Now that we handle SCM_CREDENTIALS in a way that gives useable values across namespaces. There does not appear to be any operational problems with encouraging the use of unix domain sockets across containers either. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Acked-by: Daniel Lezcano <daniel.lezcano@free.fr> Acked-by: Pavel Emelyanov <xemul@openvz.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-13 11:35:48 +08:00
static struct sock *unix_find_socket_byinode(struct inode *i)
{
struct sock *s;
spin_lock(&unix_table_lock);
hlist: drop the node parameter from iterators I'm not sure why, but the hlist for each entry iterators were conceived list_for_each_entry(pos, head, member) The hlist ones were greedy and wanted an extra parameter: hlist_for_each_entry(tpos, pos, head, member) Why did they need an extra pos parameter? I'm not quite sure. Not only they don't really need it, it also prevents the iterator from looking exactly like the list iterator, which is unfortunate. Besides the semantic patch, there was some manual work required: - Fix up the actual hlist iterators in linux/list.h - Fix up the declaration of other iterators based on the hlist ones. - A very small amount of places were using the 'node' parameter, this was modified to use 'obj->member' instead. - Coccinelle didn't handle the hlist_for_each_entry_safe iterator properly, so those had to be fixed up manually. The semantic patch which is mostly the work of Peter Senna Tschudin is here: @@ iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host; type T; expression a,c,d,e; identifier b; statement S; @@ -T b; <+... when != b ( hlist_for_each_entry(a, - b, c, d) S | hlist_for_each_entry_continue(a, - b, c) S | hlist_for_each_entry_from(a, - b, c) S | hlist_for_each_entry_rcu(a, - b, c, d) S | hlist_for_each_entry_rcu_bh(a, - b, c, d) S | hlist_for_each_entry_continue_rcu_bh(a, - b, c) S | for_each_busy_worker(a, c, - b, d) S | ax25_uid_for_each(a, - b, c) S | ax25_for_each(a, - b, c) S | inet_bind_bucket_for_each(a, - b, c) S | sctp_for_each_hentry(a, - b, c) S | sk_for_each(a, - b, c) S | sk_for_each_rcu(a, - b, c) S | sk_for_each_from -(a, b) +(a) S + sk_for_each_from(a) S | sk_for_each_safe(a, - b, c, d) S | sk_for_each_bound(a, - b, c) S | hlist_for_each_entry_safe(a, - b, c, d, e) S | hlist_for_each_entry_continue_rcu(a, - b, c) S | nr_neigh_for_each(a, - b, c) S | nr_neigh_for_each_safe(a, - b, c, d) S | nr_node_for_each(a, - b, c) S | nr_node_for_each_safe(a, - b, c, d) S | - for_each_gfn_sp(a, c, d, b) S + for_each_gfn_sp(a, c, d) S | - for_each_gfn_indirect_valid_sp(a, c, d, b) S + for_each_gfn_indirect_valid_sp(a, c, d) S | for_each_host(a, - b, c) S | for_each_host_safe(a, - b, c, d) S | for_each_mesh_entry(a, - b, c, d) S ) ...+> [akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c] [akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c] [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: fix warnings] [akpm@linux-foudnation.org: redo intrusive kvm changes] Tested-by: Peter Senna Tschudin <peter.senna@gmail.com> Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Sasha Levin <sasha.levin@oracle.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Gleb Natapov <gleb@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 09:06:00 +08:00
sk_for_each(s,
&unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
struct dentry *dentry = unix_sk(s)->path.dentry;
if (dentry && d_backing_inode(dentry) == i) {
sock_hold(s);
goto found;
}
}
s = NULL;
found:
spin_unlock(&unix_table_lock);
return s;
}
unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat <rweikusat@mobileactivedefense.com> writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron <jbaron@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-21 06:07:23 +08:00
/* Support code for asymmetrically connected dgram sockets
*
* If a datagram socket is connected to a socket not itself connected
* to the first socket (eg, /dev/log), clients may only enqueue more
* messages if the present receive queue of the server socket is not
* "too large". This means there's a second writeability condition
* poll and sendmsg need to test. The dgram recv code will do a wake
* up on the peer_wait wait queue of a socket upon reception of a
* datagram which needs to be propagated to sleeping would-be writers
* since these might not have sent anything so far. This can't be
* accomplished via poll_wait because the lifetime of the server
* socket might be less than that of its clients if these break their
* association with it or if the server socket is closed while clients
* are still connected to it and there's no way to inform "a polling
* implementation" that it should let go of a certain wait queue
*
* In order to propagate a wake up, a wait_queue_entry_t of the client
unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat <rweikusat@mobileactivedefense.com> writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron <jbaron@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-21 06:07:23 +08:00
* socket is enqueued on the peer_wait queue of the server socket
* whose wake function does a wake_up on the ordinary client socket
* wait queue. This connection is established whenever a write (or
* poll for write) hit the flow control condition and broken when the
* association to the server socket is dissolved or after a wake up
* was relayed.
*/
static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat <rweikusat@mobileactivedefense.com> writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron <jbaron@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-21 06:07:23 +08:00
void *key)
{
struct unix_sock *u;
wait_queue_head_t *u_sleep;
u = container_of(q, struct unix_sock, peer_wake);
__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
q);
u->peer_wake.private = NULL;
/* relaying can only happen while the wq still exists */
u_sleep = sk_sleep(&u->sk);
if (u_sleep)
wake_up_interruptible_poll(u_sleep, key_to_poll(key));
unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat <rweikusat@mobileactivedefense.com> writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron <jbaron@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-21 06:07:23 +08:00
return 0;
}
static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
struct unix_sock *u, *u_other;
int rc;
u = unix_sk(sk);
u_other = unix_sk(other);
rc = 0;
spin_lock(&u_other->peer_wait.lock);
if (!u->peer_wake.private) {
u->peer_wake.private = other;
__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
rc = 1;
}
spin_unlock(&u_other->peer_wait.lock);
return rc;
}
static void unix_dgram_peer_wake_disconnect(struct sock *sk,
struct sock *other)
{
struct unix_sock *u, *u_other;
u = unix_sk(sk);
u_other = unix_sk(other);
spin_lock(&u_other->peer_wait.lock);
if (u->peer_wake.private == other) {
__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
u->peer_wake.private = NULL;
}
spin_unlock(&u_other->peer_wait.lock);
}
static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
struct sock *other)
{
unix_dgram_peer_wake_disconnect(sk, other);
wake_up_interruptible_poll(sk_sleep(sk),
EPOLLOUT |
EPOLLWRNORM |
EPOLLWRBAND);
unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat <rweikusat@mobileactivedefense.com> writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron <jbaron@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-21 06:07:23 +08:00
}
/* preconditions:
* - unix_peer(sk) == other
* - association is stable
*/
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
int connected;
connected = unix_dgram_peer_wake_connect(sk, other);
2018-08-04 05:24:53 +08:00
/* If other is SOCK_DEAD, we want to make sure we signal
* POLLOUT, such that a subsequent write() can get a
* -ECONNREFUSED. Otherwise, if we haven't queued any skbs
* to other and its full, we will hang waiting for POLLOUT.
*/
if (unix_recvq_full(other) && !sock_flag(other, SOCK_DEAD))
unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat <rweikusat@mobileactivedefense.com> writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron <jbaron@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-21 06:07:23 +08:00
return 1;
if (connected)
unix_dgram_peer_wake_disconnect(sk, other);
return 0;
}
static int unix_writable(const struct sock *sk)
{
return sk->sk_state != TCP_LISTEN &&
(refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}
static void unix_write_space(struct sock *sk)
{
net: sock_def_readable() and friends RCU conversion sk_callback_lock rwlock actually protects sk->sk_sleep pointer, so we need two atomic operations (and associated dirtying) per incoming packet. RCU conversion is pretty much needed : 1) Add a new structure, called "struct socket_wq" to hold all fields that will need rcu_read_lock() protection (currently: a wait_queue_head_t and a struct fasync_struct pointer). [Future patch will add a list anchor for wakeup coalescing] 2) Attach one of such structure to each "struct socket" created in sock_alloc_inode(). 3) Respect RCU grace period when freeing a "struct socket_wq" 4) Change sk_sleep pointer in "struct sock" by sk_wq, pointer to "struct socket_wq" 5) Change sk_sleep() function to use new sk->sk_wq instead of sk->sk_sleep 6) Change sk_has_sleeper() to wq_has_sleeper() that must be used inside a rcu_read_lock() section. 7) Change all sk_has_sleeper() callers to : - Use rcu_read_lock() instead of read_lock(&sk->sk_callback_lock) - Use wq_has_sleeper() to eventually wakeup tasks. - Use rcu_read_unlock() instead of read_unlock(&sk->sk_callback_lock) 8) sock_wake_async() is modified to use rcu protection as well. 9) Exceptions : macvtap, drivers/net/tun.c, af_unix use integrated "struct socket_wq" instead of dynamically allocated ones. They dont need rcu freeing. Some cleanups or followups are probably needed, (possible sk_callback_lock conversion to a spinlock for example...). Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2010-04-29 19:01:49 +08:00
struct socket_wq *wq;
rcu_read_lock();
if (unix_writable(sk)) {
net: sock_def_readable() and friends RCU conversion sk_callback_lock rwlock actually protects sk->sk_sleep pointer, so we need two atomic operations (and associated dirtying) per incoming packet. RCU conversion is pretty much needed : 1) Add a new structure, called "struct socket_wq" to hold all fields that will need rcu_read_lock() protection (currently: a wait_queue_head_t and a struct fasync_struct pointer). [Future patch will add a list anchor for wakeup coalescing] 2) Attach one of such structure to each "struct socket" created in sock_alloc_inode(). 3) Respect RCU grace period when freeing a "struct socket_wq" 4) Change sk_sleep pointer in "struct sock" by sk_wq, pointer to "struct socket_wq" 5) Change sk_sleep() function to use new sk->sk_wq instead of sk->sk_sleep 6) Change sk_has_sleeper() to wq_has_sleeper() that must be used inside a rcu_read_lock() section. 7) Change all sk_has_sleeper() callers to : - Use rcu_read_lock() instead of read_lock(&sk->sk_callback_lock) - Use wq_has_sleeper() to eventually wakeup tasks. - Use rcu_read_unlock() instead of read_unlock(&sk->sk_callback_lock) 8) sock_wake_async() is modified to use rcu protection as well. 9) Exceptions : macvtap, drivers/net/tun.c, af_unix use integrated "struct socket_wq" instead of dynamically allocated ones. They dont need rcu freeing. Some cleanups or followups are probably needed, (possible sk_callback_lock conversion to a spinlock for example...). Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2010-04-29 19:01:49 +08:00
wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait,
EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
}
net: sock_def_readable() and friends RCU conversion sk_callback_lock rwlock actually protects sk->sk_sleep pointer, so we need two atomic operations (and associated dirtying) per incoming packet. RCU conversion is pretty much needed : 1) Add a new structure, called "struct socket_wq" to hold all fields that will need rcu_read_lock() protection (currently: a wait_queue_head_t and a struct fasync_struct pointer). [Future patch will add a list anchor for wakeup coalescing] 2) Attach one of such structure to each "struct socket" created in sock_alloc_inode(). 3) Respect RCU grace period when freeing a "struct socket_wq" 4) Change sk_sleep pointer in "struct sock" by sk_wq, pointer to "struct socket_wq" 5) Change sk_sleep() function to use new sk->sk_wq instead of sk->sk_sleep 6) Change sk_has_sleeper() to wq_has_sleeper() that must be used inside a rcu_read_lock() section. 7) Change all sk_has_sleeper() callers to : - Use rcu_read_lock() instead of read_lock(&sk->sk_callback_lock) - Use wq_has_sleeper() to eventually wakeup tasks. - Use rcu_read_unlock() instead of read_unlock(&sk->sk_callback_lock) 8) sock_wake_async() is modified to use rcu protection as well. 9) Exceptions : macvtap, drivers/net/tun.c, af_unix use integrated "struct socket_wq" instead of dynamically allocated ones. They dont need rcu freeing. Some cleanups or followups are probably needed, (possible sk_callback_lock conversion to a spinlock for example...). Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2010-04-29 19:01:49 +08:00
rcu_read_unlock();
}
/* When dgram socket disconnects (or changes its peer), we clear its receive
* queue of packets arrived from previous peer. First, it allows to do
* flow control based only on wmem_alloc; second, sk connected to peer
* may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
if (!skb_queue_empty(&sk->sk_receive_queue)) {
skb_queue_purge(&sk->sk_receive_queue);
wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
/* If one link of bidirectional dgram pipe is disconnected,
* we signal error. Messages are lost. Do not make this,
* when peer was not connected to us.
*/
if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
other->sk_err = ECONNRESET;
other->sk_error_report(other);
}
}
}
static void unix_sock_destructor(struct sock *sk)
{
struct unix_sock *u = unix_sk(sk);
skb_queue_purge(&sk->sk_receive_queue);
WARN_ON(refcount_read(&sk->sk_wmem_alloc));
WARN_ON(!sk_unhashed(sk));
WARN_ON(sk->sk_socket);
if (!sock_flag(sk, SOCK_DEAD)) {
pr_info("Attempt to release alive unix socket: %p\n", sk);
return;
}
if (u->addr)
unix_release_addr(u->addr);
fs: allow for more than 2^31 files Robin Holt tried to boot a 16TB system and found af_unix was overflowing a 32bit value : <quote> We were seeing a failure which prevented boot. The kernel was incapable of creating either a named pipe or unix domain socket. This comes down to a common kernel function called unix_create1() which does: atomic_inc(&unix_nr_socks); if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) goto out; The function get_max_files() is a simple return of files_stat.max_files. files_stat.max_files is a signed integer and is computed in fs/file_table.c's files_init(). n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = n; In our case, mempages (total_ram_pages) is approx 3,758,096,384 (0xe0000000). That leaves max_files at approximately 1,503,238,553. This causes 2 * get_max_files() to integer overflow. </quote> Fix is to let /proc/sys/fs/file-nr & /proc/sys/fs/file-max use long integers, and change af_unix to use an atomic_long_t instead of atomic_t. get_max_files() is changed to return an unsigned long. get_nr_files() is changed to return a long. unix_nr_socks is changed from atomic_t to atomic_long_t, while not strictly needed to address Robin problem. Before patch (on a 64bit kernel) : # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max -18446744071562067968 After patch: # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max 2147483648 # cat /proc/sys/fs/file-nr 704 0 2147483648 Reported-by: Robin Holt <holt@sgi.com> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Acked-by: David Miller <davem@davemloft.net> Reviewed-by: Robin Holt <holt@sgi.com> Tested-by: Robin Holt <holt@sgi.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Christoph Hellwig <hch@lst.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-10-27 05:22:44 +08:00
atomic_long_dec(&unix_nr_socks);
local_bh_disable();
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
local_bh_enable();
#ifdef UNIX_REFCNT_DEBUG
pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
fs: allow for more than 2^31 files Robin Holt tried to boot a 16TB system and found af_unix was overflowing a 32bit value : <quote> We were seeing a failure which prevented boot. The kernel was incapable of creating either a named pipe or unix domain socket. This comes down to a common kernel function called unix_create1() which does: atomic_inc(&unix_nr_socks); if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) goto out; The function get_max_files() is a simple return of files_stat.max_files. files_stat.max_files is a signed integer and is computed in fs/file_table.c's files_init(). n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = n; In our case, mempages (total_ram_pages) is approx 3,758,096,384 (0xe0000000). That leaves max_files at approximately 1,503,238,553. This causes 2 * get_max_files() to integer overflow. </quote> Fix is to let /proc/sys/fs/file-nr & /proc/sys/fs/file-max use long integers, and change af_unix to use an atomic_long_t instead of atomic_t. get_max_files() is changed to return an unsigned long. get_nr_files() is changed to return a long. unix_nr_socks is changed from atomic_t to atomic_long_t, while not strictly needed to address Robin problem. Before patch (on a 64bit kernel) : # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max -18446744071562067968 After patch: # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max 2147483648 # cat /proc/sys/fs/file-nr 704 0 2147483648 Reported-by: Robin Holt <holt@sgi.com> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Acked-by: David Miller <davem@davemloft.net> Reviewed-by: Robin Holt <holt@sgi.com> Tested-by: Robin Holt <holt@sgi.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Christoph Hellwig <hch@lst.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-10-27 05:22:44 +08:00
atomic_long_read(&unix_nr_socks));
#endif
}
static void unix_release_sock(struct sock *sk, int embrion)
{
struct unix_sock *u = unix_sk(sk);
struct path path;
struct sock *skpair;
struct sk_buff *skb;
int state;
unix_remove_socket(sk);
/* Clear state */
unix_state_lock(sk);
sock_orphan(sk);
sk->sk_shutdown = SHUTDOWN_MASK;
path = u->path;
u->path.dentry = NULL;
u->path.mnt = NULL;
state = sk->sk_state;
sk->sk_state = TCP_CLOSE;
unix_state_unlock(sk);
wake_up_interruptible_all(&u->peer_wait);
skpair = unix_peer(sk);
if (skpair != NULL) {
if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
unix_state_lock(skpair);
/* No more writes */
skpair->sk_shutdown = SHUTDOWN_MASK;
if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
skpair->sk_err = ECONNRESET;
unix_state_unlock(skpair);
skpair->sk_state_change(skpair);
sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
}
unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat <rweikusat@mobileactivedefense.com> writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron <jbaron@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-21 06:07:23 +08:00
unix_dgram_peer_wake_disconnect(sk, skpair);
sock_put(skpair); /* It may now die */
unix_peer(sk) = NULL;
}
/* Try to flush out this socket. Throw out buffers at least */
while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
if (state == TCP_LISTEN)
unix_release_sock(skb->sk, 1);
/* passed fds are erased in the kfree_skb hook */
UNIXCB(skb).consumed = skb->len;
kfree_skb(skb);
}
if (path.dentry)
path_put(&path);
sock_put(sk);
/* ---- Socket is dead now and most probably destroyed ---- */
/*
* Fixme: BSD difference: In BSD all sockets connected to us get
* ECONNRESET and we die on the spot. In Linux we behave
* like files and pipes do and wait for the last
* dereference.
*
* Can't we simply set sock->err?
*
* What the above comment does talk about? --ANK(980817)
*/
if (unix_tot_inflight)
unix_gc(); /* Garbage collect fds */
}
static void init_peercred(struct sock *sk)
{
put_pid(sk->sk_peer_pid);
if (sk->sk_peer_cred)
put_cred(sk->sk_peer_cred);
sk->sk_peer_pid = get_pid(task_tgid(current));
sk->sk_peer_cred = get_current_cred();
}
static void copy_peercred(struct sock *sk, struct sock *peersk)
{
put_pid(sk->sk_peer_pid);
if (sk->sk_peer_cred)
put_cred(sk->sk_peer_cred);
sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
}
static int unix_listen(struct socket *sock, int backlog)
{
int err;
struct sock *sk = sock->sk;
struct unix_sock *u = unix_sk(sk);
struct pid *old_pid = NULL;
err = -EOPNOTSUPP;
if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
goto out; /* Only stream/seqpacket sockets accept */
err = -EINVAL;
if (!u->addr)
goto out; /* No listens on an unbound socket */
unix_state_lock(sk);
if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
goto out_unlock;
if (backlog > sk->sk_max_ack_backlog)
wake_up_interruptible_all(&u->peer_wait);
sk->sk_max_ack_backlog = backlog;
sk->sk_state = TCP_LISTEN;
/* set credentials so connect can copy them */
init_peercred(sk);
err = 0;
out_unlock:
unix_state_unlock(sk);
put_pid(old_pid);
out:
return err;
}
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
net: Work around lockdep limitation in sockets that use sockets Lockdep issues a circular dependency warning when AFS issues an operation through AF_RXRPC from a context in which the VFS/VM holds the mmap_sem. The theory lockdep comes up with is as follows: (1) If the pagefault handler decides it needs to read pages from AFS, it calls AFS with mmap_sem held and AFS begins an AF_RXRPC call, but creating a call requires the socket lock: mmap_sem must be taken before sk_lock-AF_RXRPC (2) afs_open_socket() opens an AF_RXRPC socket and binds it. rxrpc_bind() binds the underlying UDP socket whilst holding its socket lock. inet_bind() takes its own socket lock: sk_lock-AF_RXRPC must be taken before sk_lock-AF_INET (3) Reading from a TCP socket into a userspace buffer might cause a fault and thus cause the kernel to take the mmap_sem, but the TCP socket is locked whilst doing this: sk_lock-AF_INET must be taken before mmap_sem However, lockdep's theory is wrong in this instance because it deals only with lock classes and not individual locks. The AF_INET lock in (2) isn't really equivalent to the AF_INET lock in (3) as the former deals with a socket entirely internal to the kernel that never sees userspace. This is a limitation in the design of lockdep. Fix the general case by: (1) Double up all the locking keys used in sockets so that one set are used if the socket is created by userspace and the other set is used if the socket is created by the kernel. (2) Store the kern parameter passed to sk_alloc() in a variable in the sock struct (sk_kern_sock). This informs sock_lock_init(), sock_init_data() and sk_clone_lock() as to the lock keys to be used. Note that the child created by sk_clone_lock() inherits the parent's kern setting. (3) Add a 'kern' parameter to ->accept() that is analogous to the one passed in to ->create() that distinguishes whether kernel_accept() or sys_accept4() was the caller and can be passed to sk_alloc(). Note that a lot of accept functions merely dequeue an already allocated socket. I haven't touched these as the new socket already exists before we get the parameter. Note also that there are a couple of places where I've made the accepted socket unconditionally kernel-based: irda_accept() rds_rcp_accept_one() tcp_accept_from_sock() because they follow a sock_create_kern() and accept off of that. Whilst creating this, I noticed that lustre and ocfs don't create sockets through sock_create_kern() and thus they aren't marked as for-kernel, though they appear to be internal. I wonder if these should do that so that they use the new set of lock keys. Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-09 16:09:05 +08:00
static int unix_accept(struct socket *, struct socket *, int, bool);
net: make getname() functions return length rather than use int* parameter Changes since v1: Added changes in these files: drivers/infiniband/hw/usnic/usnic_transport.c drivers/staging/lustre/lnet/lnet/lib-socket.c drivers/target/iscsi/iscsi_target_login.c drivers/vhost/net.c fs/dlm/lowcomms.c fs/ocfs2/cluster/tcp.c security/tomoyo/network.c Before: All these functions either return a negative error indicator, or store length of sockaddr into "int *socklen" parameter and return zero on success. "int *socklen" parameter is awkward. For example, if caller does not care, it still needs to provide on-stack storage for the value it does not need. None of the many FOO_getname() functions of various protocols ever used old value of *socklen. They always just overwrite it. This change drops this parameter, and makes all these functions, on success, return length of sockaddr. It's always >= 0 and can be differentiated from an error. Tests in callers are changed from "if (err)" to "if (err < 0)", where needed. rpc_sockname() lost "int buflen" parameter, since its only use was to be passed to kernel_getsockname() as &buflen and subsequently not used in any way. Userspace API is not changed. text data bss dec hex filename 30108430 2633624 873672 33615726 200ef6e vmlinux.before.o 30108109 2633612 873672 33615393 200ee21 vmlinux.o Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com> CC: David S. Miller <davem@davemloft.net> CC: linux-kernel@vger.kernel.org CC: netdev@vger.kernel.org CC: linux-bluetooth@vger.kernel.org CC: linux-decnet-user@lists.sourceforge.net CC: linux-wireless@vger.kernel.org CC: linux-rdma@vger.kernel.org CC: linux-sctp@vger.kernel.org CC: linux-nfs@vger.kernel.org CC: linux-x25@vger.kernel.org Signed-off-by: David S. Miller <davem@davemloft.net>
2018-02-13 03:00:20 +08:00
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
size_t size, int flags);
static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
struct pipe_inode_info *, size_t size,
unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
int);
static int unix_set_peek_off(struct sock *sk, int val)
{
struct unix_sock *u = unix_sk(sk);
if (mutex_lock_interruptible(&u->iolock))
return -EINTR;
sk->sk_peek_off = val;
mutex_unlock(&u->iolock);
return 0;
}
#ifdef CONFIG_PROC_FS
static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
struct sock *sk = sock->sk;
struct unix_sock *u;
if (sk) {
u = unix_sk(sock->sk);
seq_printf(m, "scm_fds: %u\n",
atomic_read(&u->scm_stat.nr_fds));
}
}
#else
#define unix_show_fdinfo NULL
#endif
static const struct proto_ops unix_stream_ops = {
.family = PF_UNIX,
.owner = THIS_MODULE,
.release = unix_release,
.bind = unix_bind,
.connect = unix_stream_connect,
.socketpair = unix_socketpair,
.accept = unix_accept,
.getname = unix_getname,
.poll = unix_poll,
.ioctl = unix_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = unix_compat_ioctl,
#endif
.listen = unix_listen,
.shutdown = unix_shutdown,
.sendmsg = unix_stream_sendmsg,
.recvmsg = unix_stream_recvmsg,
.mmap = sock_no_mmap,
.sendpage = unix_stream_sendpage,
.splice_read = unix_stream_splice_read,
.set_peek_off = unix_set_peek_off,
.show_fdinfo = unix_show_fdinfo,
};
static const struct proto_ops unix_dgram_ops = {
.family = PF_UNIX,
.owner = THIS_MODULE,
.release = unix_release,
.bind = unix_bind,
.connect = unix_dgram_connect,
.socketpair = unix_socketpair,
.accept = sock_no_accept,
.getname = unix_getname,
.poll = unix_dgram_poll,
.ioctl = unix_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = unix_compat_ioctl,
#endif
.listen = sock_no_listen,
.shutdown = unix_shutdown,
.sendmsg = unix_dgram_sendmsg,
.recvmsg = unix_dgram_recvmsg,
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
.set_peek_off = unix_set_peek_off,
.show_fdinfo = unix_show_fdinfo,
};
static const struct proto_ops unix_seqpacket_ops = {
.family = PF_UNIX,
.owner = THIS_MODULE,
.release = unix_release,
.bind = unix_bind,
.connect = unix_stream_connect,
.socketpair = unix_socketpair,
.accept = unix_accept,
.getname = unix_getname,
.poll = unix_dgram_poll,
.ioctl = unix_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = unix_compat_ioctl,
#endif
.listen = unix_listen,
.shutdown = unix_shutdown,
.sendmsg = unix_seqpacket_sendmsg,
.recvmsg = unix_seqpacket_recvmsg,
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
.set_peek_off = unix_set_peek_off,
.show_fdinfo = unix_show_fdinfo,
};
static struct proto unix_proto = {
.name = "UNIX",
.owner = THIS_MODULE,
.obj_size = sizeof(struct unix_sock),
};
static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
{
struct sock *sk = NULL;
struct unix_sock *u;
fs: allow for more than 2^31 files Robin Holt tried to boot a 16TB system and found af_unix was overflowing a 32bit value : <quote> We were seeing a failure which prevented boot. The kernel was incapable of creating either a named pipe or unix domain socket. This comes down to a common kernel function called unix_create1() which does: atomic_inc(&unix_nr_socks); if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) goto out; The function get_max_files() is a simple return of files_stat.max_files. files_stat.max_files is a signed integer and is computed in fs/file_table.c's files_init(). n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = n; In our case, mempages (total_ram_pages) is approx 3,758,096,384 (0xe0000000). That leaves max_files at approximately 1,503,238,553. This causes 2 * get_max_files() to integer overflow. </quote> Fix is to let /proc/sys/fs/file-nr & /proc/sys/fs/file-max use long integers, and change af_unix to use an atomic_long_t instead of atomic_t. get_max_files() is changed to return an unsigned long. get_nr_files() is changed to return a long. unix_nr_socks is changed from atomic_t to atomic_long_t, while not strictly needed to address Robin problem. Before patch (on a 64bit kernel) : # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max -18446744071562067968 After patch: # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max 2147483648 # cat /proc/sys/fs/file-nr 704 0 2147483648 Reported-by: Robin Holt <holt@sgi.com> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Acked-by: David Miller <davem@davemloft.net> Reviewed-by: Robin Holt <holt@sgi.com> Tested-by: Robin Holt <holt@sgi.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Christoph Hellwig <hch@lst.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-10-27 05:22:44 +08:00
atomic_long_inc(&unix_nr_socks);
if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
goto out;
sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
if (!sk)
goto out;
sock_init_data(sock, sk);
af_unix: charge buffers to kmemcg Unix sockets can consume a significant amount of system memory, hence they should be accounted to kmemcg. Since unix socket buffers are always allocated from process context, all we need to do to charge them to kmemcg is set __GFP_ACCOUNT in sock->sk_allocation mask. Eric asked: > 1) What happens when a buffer, allocated from socket <A> lands in a > different socket <B>, maybe owned by another user/process. > > Who owns it now, in term of kmemcg accounting ? We never move memcg charges. E.g. if two processes from different cgroups are sharing a memory region, each page will be charged to the process which touched it first. Or if two processes are working with the same directory tree, inodes and dentries will be charged to the first user. The same is fair for unix socket buffers - they will be charged to the sender. > 2) Has performance impact been evaluated ? I ran netperf STREAM_STREAM with default options in a kmemcg on a 4 core x2 HT box. The results are below: # clients bandwidth (10^6bits/sec) base patched 1 67643 +- 725 64874 +- 353 - 4.0 % 4 193585 +- 2516 186715 +- 1460 - 3.5 % 8 194820 +- 377 187443 +- 1229 - 3.7 % So the accounting doesn't come for free - it takes ~4% of performance. I believe we could optimize it by using per cpu batching not only on charge, but also on uncharge in memcg core, but that's beyond the scope of this patch set - I'll take a look at this later. Anyway, if performance impact is found to be unacceptable, it is always possible to disable kmem accounting at boot time (cgroup.memory=nokmem) or not use memory cgroups at runtime at all (thanks to jump labels there'll be no overhead even if they are compiled in). Link: http://lkml.kernel.org/r/fcfe6cae27a59fbc5e40145664b3cf085a560c68.1464079538.git.vdavydov@virtuozzo.com Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com> Cc: "David S. Miller" <davem@davemloft.net> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Michal Hocko <mhocko@kernel.org> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Minchan Kim <minchan@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-07-27 06:24:36 +08:00
sk->sk_allocation = GFP_KERNEL_ACCOUNT;
sk->sk_write_space = unix_write_space;
sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen;
sk->sk_destruct = unix_sock_destructor;
u = unix_sk(sk);
u->path.dentry = NULL;
u->path.mnt = NULL;
spin_lock_init(&u->lock);
atomic_long_set(&u->inflight, 0);
INIT_LIST_HEAD(&u->link);
mutex_init(&u->iolock); /* single task reading lock */
mutex_init(&u->bindlock); /* single task binding lock */
init_waitqueue_head(&u->peer_wait);
unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat <rweikusat@mobileactivedefense.com> writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron <jbaron@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-21 06:07:23 +08:00
init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
memset(&u->scm_stat, 0, sizeof(struct scm_stat));
unix_insert_socket(unix_sockets_unbound(sk), sk);
out:
if (sk == NULL)
fs: allow for more than 2^31 files Robin Holt tried to boot a 16TB system and found af_unix was overflowing a 32bit value : <quote> We were seeing a failure which prevented boot. The kernel was incapable of creating either a named pipe or unix domain socket. This comes down to a common kernel function called unix_create1() which does: atomic_inc(&unix_nr_socks); if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) goto out; The function get_max_files() is a simple return of files_stat.max_files. files_stat.max_files is a signed integer and is computed in fs/file_table.c's files_init(). n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = n; In our case, mempages (total_ram_pages) is approx 3,758,096,384 (0xe0000000). That leaves max_files at approximately 1,503,238,553. This causes 2 * get_max_files() to integer overflow. </quote> Fix is to let /proc/sys/fs/file-nr & /proc/sys/fs/file-max use long integers, and change af_unix to use an atomic_long_t instead of atomic_t. get_max_files() is changed to return an unsigned long. get_nr_files() is changed to return a long. unix_nr_socks is changed from atomic_t to atomic_long_t, while not strictly needed to address Robin problem. Before patch (on a 64bit kernel) : # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max -18446744071562067968 After patch: # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max 2147483648 # cat /proc/sys/fs/file-nr 704 0 2147483648 Reported-by: Robin Holt <holt@sgi.com> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Acked-by: David Miller <davem@davemloft.net> Reviewed-by: Robin Holt <holt@sgi.com> Tested-by: Robin Holt <holt@sgi.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Christoph Hellwig <hch@lst.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-10-27 05:22:44 +08:00
atomic_long_dec(&unix_nr_socks);
else {
local_bh_disable();
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
local_bh_enable();
}
return sk;
}
static int unix_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
if (protocol && protocol != PF_UNIX)
return -EPROTONOSUPPORT;
sock->state = SS_UNCONNECTED;
switch (sock->type) {
case SOCK_STREAM:
sock->ops = &unix_stream_ops;
break;
/*
* Believe it or not BSD has AF_UNIX, SOCK_RAW though
* nothing uses it.
*/
case SOCK_RAW:
sock->type = SOCK_DGRAM;
/* fall through */
case SOCK_DGRAM:
sock->ops = &unix_dgram_ops;
break;
case SOCK_SEQPACKET:
sock->ops = &unix_seqpacket_ops;
break;
default:
return -ESOCKTNOSUPPORT;
}
return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
}
static int unix_release(struct socket *sock)
{
struct sock *sk = sock->sk;
if (!sk)
return 0;
unix_release_sock(sk, 0);
sock->sk = NULL;
return 0;
}
static int unix_autobind(struct socket *sock)
{
struct sock *sk = sock->sk;
struct net *net = sock_net(sk);
struct unix_sock *u = unix_sk(sk);
static u32 ordernum = 1;
struct unix_address *addr;
int err;
unsigned int retries = 0;
err = mutex_lock_interruptible(&u->bindlock);
if (err)
return err;
err = 0;
if (u->addr)
goto out;
err = -ENOMEM;
addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
if (!addr)
goto out;
addr->name->sun_family = AF_UNIX;
refcount_set(&addr->refcnt, 1);
retry:
addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
spin_lock(&unix_table_lock);
ordernum = (ordernum+1)&0xFFFFF;
if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
addr->hash)) {
spin_unlock(&unix_table_lock);
/*
* __unix_find_socket_byname() may take long time if many names
* are already in use.
*/
cond_resched();
/* Give up if all names seems to be in use. */
if (retries++ == 0xFFFFF) {
err = -ENOSPC;
kfree(addr);
goto out;
}
goto retry;
}
addr->hash ^= sk->sk_type;
__unix_remove_socket(sk);
missing barriers in some of unix_sock ->addr and ->path accesses Several u->addr and u->path users are not holding any locks in common with unix_bind(). unix_state_lock() is useless for those purposes. u->addr is assign-once and *(u->addr) is fully set up by the time we set u->addr (all under unix_table_lock). u->path is also set in the same critical area, also before setting u->addr, and any unix_sock with ->path filled will have non-NULL ->addr. So setting ->addr with smp_store_release() is all we need for those "lockless" users - just have them fetch ->addr with smp_load_acquire() and don't even bother looking at ->path if they see NULL ->addr. Users of ->addr and ->path fall into several classes now: 1) ones that do smp_load_acquire(u->addr) and access *(u->addr) and u->path only if smp_load_acquire() has returned non-NULL. 2) places holding unix_table_lock. These are guaranteed that *(u->addr) is seen fully initialized. If unix_sock is in one of the "bound" chains, so's ->path. 3) unix_sock_destructor() using ->addr is safe. All places that set u->addr are guaranteed to have seen all stores *(u->addr) while holding a reference to u and unix_sock_destructor() is called when (atomic) refcount hits zero. 4) unix_release_sock() using ->path is safe. unix_bind() is serialized wrt unix_release() (normally - by struct file refcount), and for the instances that had ->path set by unix_bind() unix_release_sock() comes from unix_release(), so they are fine. Instances that had it set in unix_stream_connect() either end up attached to a socket (in unix_accept()), in which case the call chain to unix_release_sock() and serialization are the same as in the previous case, or they never get accept'ed and unix_release_sock() is called when the listener is shut down and its queue gets purged. In that case the listener's queue lock provides the barriers needed - unix_stream_connect() shoves our unix_sock into listener's queue under that lock right after having set ->path and eventual unix_release_sock() caller picks them from that queue under the same lock right before calling unix_release_sock(). 5) unix_find_other() use of ->path is pointless, but safe - it happens with successful lookup by (abstract) name, so ->path.dentry is guaranteed to be NULL there. earlier-variant-reviewed-by: "Paul E. McKenney" <paulmck@linux.ibm.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-02-16 04:09:35 +08:00
smp_store_release(&u->addr, addr);
__unix_insert_socket(&unix_socket_table[addr->hash], sk);
spin_unlock(&unix_table_lock);
err = 0;
out: mutex_unlock(&u->bindlock);
return err;
}
static struct sock *unix_find_other(struct net *net,
struct sockaddr_un *sunname, int len,
int type, unsigned int hash, int *error)
{
struct sock *u;
struct path path;
int err = 0;
if (sunname->sun_path[0]) {
struct inode *inode;
err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
if (err)
goto fail;
inode = d_backing_inode(path.dentry);
err = inode_permission(inode, MAY_WRITE);
if (err)
goto put_fail;
err = -ECONNREFUSED;
if (!S_ISSOCK(inode->i_mode))
goto put_fail;
af_unix: Allow connecting to sockets in other network namespaces. Remove the restriction that only allows connecting to a unix domain socket identified by unix path that is in the same network namespace. Crossing network namespaces is always tricky and we did not support this at first, because of a strict policy of don't mix the namespaces. Later after Pavel proposed this we did not support this because no one had performed the audit to make certain using unix domain sockets across namespaces is safe. What fundamentally makes connecting to af_unix sockets in other namespaces is safe is that you have to have the proper permissions on the unix domain socket inode that lives in the filesystem. If you want strict isolation you just don't create inodes where unfriendlys can get at them, or with permissions that allow unfriendlys to open them. All nicely handled for us by the mount namespace and other standard file system facilities. I looked through unix domain sockets and they are a very controlled environment so none of the work that goes on in dev_forward_skb to make crossing namespaces safe appears needed, we are not loosing controll of the skb and so do not need to set up the skb to look like it is comming in fresh from the outside world. Further the fields in struct unix_skb_parms should not have any problems crossing network namespaces. Now that we handle SCM_CREDENTIALS in a way that gives useable values across namespaces. There does not appear to be any operational problems with encouraging the use of unix domain sockets across containers either. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Acked-by: Daniel Lezcano <daniel.lezcano@free.fr> Acked-by: Pavel Emelyanov <xemul@openvz.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-13 11:35:48 +08:00
u = unix_find_socket_byinode(inode);
if (!u)
goto put_fail;
if (u->sk_type == type)
touch_atime(&path);
path_put(&path);
err = -EPROTOTYPE;
if (u->sk_type != type) {
sock_put(u);
goto fail;
}
} else {
err = -ECONNREFUSED;
u = unix_find_socket_byname(net, sunname, len, type, hash);
if (u) {
struct dentry *dentry;
dentry = unix_sk(u)->path.dentry;
if (dentry)
touch_atime(&unix_sk(u)->path);
} else
goto fail;
}
return u;
put_fail:
path_put(&path);
fail:
*error = err;
return NULL;
}
static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
{
struct dentry *dentry;
struct path path;
int err = 0;
/*
* Get the parent directory, calculate the hash for last
* component.
*/
dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
err = PTR_ERR(dentry);
if (IS_ERR(dentry))
return err;
/*
* All right, let's create it.
*/
err = security_path_mknod(&path, dentry, mode, 0);
if (!err) {
err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
if (!err) {
res->mnt = mntget(path.mnt);
res->dentry = dget(dentry);
}
}
done_path_create(&path, dentry);
return err;
}
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct sock *sk = sock->sk;
struct net *net = sock_net(sk);
struct unix_sock *u = unix_sk(sk);
struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
char *sun_path = sunaddr->sun_path;
int err;
unsigned int hash;
struct unix_address *addr;
struct hlist_head *list;
struct path path = { };
err = -EINVAL;
if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
sunaddr->sun_family != AF_UNIX)
goto out;
if (addr_len == sizeof(short)) {
err = unix_autobind(sock);
goto out;
}
err = unix_mkname(sunaddr, addr_len, &hash);
if (err < 0)
goto out;
addr_len = err;
if (sun_path[0]) {
umode_t mode = S_IFSOCK |
(SOCK_INODE(sock)->i_mode & ~current_umask());
err = unix_mknod(sun_path, mode, &path);
if (err) {
if (err == -EEXIST)
err = -EADDRINUSE;
goto out;
}
}
err = mutex_lock_interruptible(&u->bindlock);
if (err)
goto out_put;
err = -EINVAL;
if (u->addr)
goto out_up;
err = -ENOMEM;
addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
if (!addr)
goto out_up;
memcpy(addr->name, sunaddr, addr_len);
addr->len = addr_len;
addr->hash = hash ^ sk->sk_type;
refcount_set(&addr->refcnt, 1);
if (sun_path[0]) {
addr->hash = UNIX_HASH_SIZE;
hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
spin_lock(&unix_table_lock);
u->path = path;
list = &unix_socket_table[hash];
} else {
spin_lock(&unix_table_lock);
err = -EADDRINUSE;
if (__unix_find_socket_byname(net, sunaddr, addr_len,
sk->sk_type, hash)) {
unix_release_addr(addr);
goto out_unlock;
}
list = &unix_socket_table[addr->hash];
}
err = 0;
__unix_remove_socket(sk);
missing barriers in some of unix_sock ->addr and ->path accesses Several u->addr and u->path users are not holding any locks in common with unix_bind(). unix_state_lock() is useless for those purposes. u->addr is assign-once and *(u->addr) is fully set up by the time we set u->addr (all under unix_table_lock). u->path is also set in the same critical area, also before setting u->addr, and any unix_sock with ->path filled will have non-NULL ->addr. So setting ->addr with smp_store_release() is all we need for those "lockless" users - just have them fetch ->addr with smp_load_acquire() and don't even bother looking at ->path if they see NULL ->addr. Users of ->addr and ->path fall into several classes now: 1) ones that do smp_load_acquire(u->addr) and access *(u->addr) and u->path only if smp_load_acquire() has returned non-NULL. 2) places holding unix_table_lock. These are guaranteed that *(u->addr) is seen fully initialized. If unix_sock is in one of the "bound" chains, so's ->path. 3) unix_sock_destructor() using ->addr is safe. All places that set u->addr are guaranteed to have seen all stores *(u->addr) while holding a reference to u and unix_sock_destructor() is called when (atomic) refcount hits zero. 4) unix_release_sock() using ->path is safe. unix_bind() is serialized wrt unix_release() (normally - by struct file refcount), and for the instances that had ->path set by unix_bind() unix_release_sock() comes from unix_release(), so they are fine. Instances that had it set in unix_stream_connect() either end up attached to a socket (in unix_accept()), in which case the call chain to unix_release_sock() and serialization are the same as in the previous case, or they never get accept'ed and unix_release_sock() is called when the listener is shut down and its queue gets purged. In that case the listener's queue lock provides the barriers needed - unix_stream_connect() shoves our unix_sock into listener's queue under that lock right after having set ->path and eventual unix_release_sock() caller picks them from that queue under the same lock right before calling unix_release_sock(). 5) unix_find_other() use of ->path is pointless, but safe - it happens with successful lookup by (abstract) name, so ->path.dentry is guaranteed to be NULL there. earlier-variant-reviewed-by: "Paul E. McKenney" <paulmck@linux.ibm.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-02-16 04:09:35 +08:00
smp_store_release(&u->addr, addr);
__unix_insert_socket(list, sk);
out_unlock:
spin_unlock(&unix_table_lock);
out_up:
mutex_unlock(&u->bindlock);
out_put:
if (err)
path_put(&path);
out:
return err;
}
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
if (unlikely(sk1 == sk2) || !sk2) {
unix_state_lock(sk1);
return;
}
if (sk1 < sk2) {
unix_state_lock(sk1);
unix_state_lock_nested(sk2);
} else {
unix_state_lock(sk2);
unix_state_lock_nested(sk1);
}
}
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
if (unlikely(sk1 == sk2) || !sk2) {
unix_state_unlock(sk1);
return;
}
unix_state_unlock(sk1);
unix_state_unlock(sk2);
}
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
int alen, int flags)
{
struct sock *sk = sock->sk;
struct net *net = sock_net(sk);
struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
struct sock *other;
unsigned int hash;
int err;
err = -EINVAL;
if (alen < offsetofend(struct sockaddr, sa_family))
goto out;
if (addr->sa_family != AF_UNSPEC) {
err = unix_mkname(sunaddr, alen, &hash);
if (err < 0)
goto out;
alen = err;
if (test_bit(SOCK_PASSCRED, &sock->flags) &&
!unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
goto out;
restart:
other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
if (!other)
goto out;
unix_state_double_lock(sk, other);
/* Apparently VFS overslept socket death. Retry. */
if (sock_flag(other, SOCK_DEAD)) {
unix_state_double_unlock(sk, other);
sock_put(other);
goto restart;
}
err = -EPERM;
if (!unix_may_send(sk, other))
goto out_unlock;
err = security_unix_may_send(sk->sk_socket, other->sk_socket);
if (err)
goto out_unlock;
} else {
/*
* 1003.1g breaking connected state with AF_UNSPEC
*/
other = NULL;
unix_state_double_lock(sk, other);
}
/*
* If it was connected, reconnect.
*/
if (unix_peer(sk)) {
struct sock *old_peer = unix_peer(sk);
unix_peer(sk) = other;
unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat <rweikusat@mobileactivedefense.com> writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron <jbaron@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-21 06:07:23 +08:00
unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
unix_state_double_unlock(sk, other);
if (other != old_peer)
unix_dgram_disconnected(sk, old_peer);
sock_put(old_peer);
} else {
unix_peer(sk) = other;
unix_state_double_unlock(sk, other);
}
return 0;
out_unlock:
unix_state_double_unlock(sk, other);
sock_put(other);
out:
return err;
}
static long unix_wait_for_peer(struct sock *other, long timeo)
__releases(&unix_sk(other)->lock)
{
struct unix_sock *u = unix_sk(other);
int sched;
DEFINE_WAIT(wait);
prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
sched = !sock_flag(other, SOCK_DEAD) &&
!(other->sk_shutdown & RCV_SHUTDOWN) &&
af_unix: fix 'poll for write'/ connected DGRAM sockets The unix_dgram_sendmsg routine implements a (somewhat crude) form of receiver-imposed flow control by comparing the length of the receive queue of the 'peer socket' with the max_ack_backlog value stored in the corresponding sock structure, either blocking the thread which caused the send-routine to be called or returning EAGAIN. This routine is used by both SOCK_DGRAM and SOCK_SEQPACKET sockets. The poll-implementation for these socket types is datagram_poll from core/datagram.c. A socket is deemed to be writeable by this routine when the memory presently consumed by datagrams owned by it is less than the configured socket send buffer size. This is always wrong for connected PF_UNIX non-stream sockets when the abovementioned receive queue is currently considered to be full. 'poll' will then return, indicating that the socket is writeable, but a subsequent write result in EAGAIN, effectively causing an (usual) application to 'poll for writeability by repeated send request with O_NONBLOCK set' until it has consumed its time quantum. The change below uses a suitably modified variant of the datagram_poll routines for both type of PF_UNIX sockets, which tests if the recv-queue of the peer a socket is connected to is presently considered to be 'full' as part of the 'is this socket writeable'-checking code. The socket being polled is additionally put onto the peer_wait wait queue associated with its peer, because the unix_dgram_sendmsg routine does a wake up on this queue after a datagram was received and the 'other wakeup call' is done implicitly as part of skb destruction, meaning, a process blocked in poll because of a full peer receive queue could otherwise sleep forever if no datagram owned by its socket was already sitting on this queue. Among this change is a small (inline) helper routine named 'unix_recvq_full', which consolidates the actual testing code (in three different places) into a single location. Signed-off-by: Rainer Weikusat <rweikusat@mssgmbh.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-06-18 13:28:05 +08:00
unix_recvq_full(other);
unix_state_unlock(other);
if (sched)
timeo = schedule_timeout(timeo);
finish_wait(&u->peer_wait, &wait);
return timeo;
}
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
struct sock *sk = sock->sk;
struct net *net = sock_net(sk);
struct unix_sock *u = unix_sk(sk), *newu, *otheru;
struct sock *newsk = NULL;
struct sock *other = NULL;
struct sk_buff *skb = NULL;
unsigned int hash;
int st;
int err;
long timeo;
err = unix_mkname(sunaddr, addr_len, &hash);
if (err < 0)
goto out;
addr_len = err;
if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
(err = unix_autobind(sock)) != 0)
goto out;
timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
/* First of all allocate resources.
If we will make it after state is locked,
we will have to recheck all again in any case.
*/
err = -ENOMEM;
/* create new sock for complete connection */
newsk = unix_create1(sock_net(sk), NULL, 0);
if (newsk == NULL)
goto out;
/* Allocate skb for sending to listening sock */
skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
if (skb == NULL)
goto out;
restart:
/* Find listening sock. */
other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
if (!other)
goto out;
/* Latch state of peer */
unix_state_lock(other);
/* Apparently VFS overslept socket death. Retry. */
if (sock_flag(other, SOCK_DEAD)) {
unix_state_unlock(other);
sock_put(other);
goto restart;
}
err = -ECONNREFUSED;
if (other->sk_state != TCP_LISTEN)
goto out_unlock;
AF_UNIX: Fix deadlock on connecting to shutdown socket I found a deadlock bug in UNIX domain socket, which makes able to DoS attack against the local machine by non-root users. How to reproduce: 1. Make a listening AF_UNIX/SOCK_STREAM socket with an abstruct namespace(*), and shutdown(2) it. 2. Repeat connect(2)ing to the listening socket from the other sockets until the connection backlog is full-filled. 3. connect(2) takes the CPU forever. If every core is taken, the system hangs. PoC code: (Run as many times as cores on SMP machines.) int main(void) { int ret; int csd; int lsd; struct sockaddr_un sun; /* make an abstruct name address (*) */ memset(&sun, 0, sizeof(sun)); sun.sun_family = PF_UNIX; sprintf(&sun.sun_path[1], "%d", getpid()); /* create the listening socket and shutdown */ lsd = socket(AF_UNIX, SOCK_STREAM, 0); bind(lsd, (struct sockaddr *)&sun, sizeof(sun)); listen(lsd, 1); shutdown(lsd, SHUT_RDWR); /* connect loop */ alarm(15); /* forcely exit the loop after 15 sec */ for (;;) { csd = socket(AF_UNIX, SOCK_STREAM, 0); ret = connect(csd, (struct sockaddr *)&sun, sizeof(sun)); if (-1 == ret) { perror("connect()"); break; } puts("Connection OK"); } return 0; } (*) Make sun_path[0] = 0 to use the abstruct namespace. If a file-based socket is used, the system doesn't deadlock because of context switches in the file system layer. Why this happens: Error checks between unix_socket_connect() and unix_wait_for_peer() are inconsistent. The former calls the latter to wait until the backlog is processed. Despite the latter returns without doing anything when the socket is shutdown, the former doesn't check the shutdown state and just retries calling the latter forever. Patch: The patch below adds shutdown check into unix_socket_connect(), so connect(2) to the shutdown socket will return -ECONREFUSED. Signed-off-by: Tomoki Sekiyama <tomoki.sekiyama.qu@hitachi.com> Signed-off-by: Masanori Yoshida <masanori.yoshida.tv@hitachi.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2009-10-19 14:17:37 +08:00
if (other->sk_shutdown & RCV_SHUTDOWN)
goto out_unlock;
af_unix: fix 'poll for write'/ connected DGRAM sockets The unix_dgram_sendmsg routine implements a (somewhat crude) form of receiver-imposed flow control by comparing the length of the receive queue of the 'peer socket' with the max_ack_backlog value stored in the corresponding sock structure, either blocking the thread which caused the send-routine to be called or returning EAGAIN. This routine is used by both SOCK_DGRAM and SOCK_SEQPACKET sockets. The poll-implementation for these socket types is datagram_poll from core/datagram.c. A socket is deemed to be writeable by this routine when the memory presently consumed by datagrams owned by it is less than the configured socket send buffer size. This is always wrong for connected PF_UNIX non-stream sockets when the abovementioned receive queue is currently considered to be full. 'poll' will then return, indicating that the socket is writeable, but a subsequent write result in EAGAIN, effectively causing an (usual) application to 'poll for writeability by repeated send request with O_NONBLOCK set' until it has consumed its time quantum. The change below uses a suitably modified variant of the datagram_poll routines for both type of PF_UNIX sockets, which tests if the recv-queue of the peer a socket is connected to is presently considered to be 'full' as part of the 'is this socket writeable'-checking code. The socket being polled is additionally put onto the peer_wait wait queue associated with its peer, because the unix_dgram_sendmsg routine does a wake up on this queue after a datagram was received and the 'other wakeup call' is done implicitly as part of skb destruction, meaning, a process blocked in poll because of a full peer receive queue could otherwise sleep forever if no datagram owned by its socket was already sitting on this queue. Among this change is a small (inline) helper routine named 'unix_recvq_full', which consolidates the actual testing code (in three different places) into a single location. Signed-off-by: Rainer Weikusat <rweikusat@mssgmbh.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-06-18 13:28:05 +08:00
if (unix_recvq_full(other)) {
err = -EAGAIN;
if (!timeo)
goto out_unlock;
timeo = unix_wait_for_peer(other, timeo);
err = sock_intr_errno(timeo);
if (signal_pending(current))
goto out;
sock_put(other);
goto restart;
}
/* Latch our state.
It is tricky place. We need to grab our state lock and cannot
drop lock on peer. It is dangerous because deadlock is
possible. Connect to self case and simultaneous
attempt to connect are eliminated by checking socket
state. other is TCP_LISTEN, if sk is TCP_LISTEN we
check this before attempt to grab lock.
Well, and we have to recheck the state after socket locked.
*/
st = sk->sk_state;
switch (st) {
case TCP_CLOSE:
/* This is ok... continue with connect */
break;
case TCP_ESTABLISHED:
/* Socket is already connected */
err = -EISCONN;
goto out_unlock;
default:
err = -EINVAL;
goto out_unlock;
}
unix_state_lock_nested(sk);
if (sk->sk_state != st) {
unix_state_unlock(sk);
unix_state_unlock(other);
sock_put(other);
goto restart;
}
err = security_unix_stream_connect(sk, other, newsk);
if (err) {
unix_state_unlock(sk);
goto out_unlock;
}
/* The way is open! Fastly set all the necessary fields... */
sock_hold(sk);
unix_peer(newsk) = sk;
newsk->sk_state = TCP_ESTABLISHED;
newsk->sk_type = sk->sk_type;
init_peercred(newsk);
newu = unix_sk(newsk);
RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
otheru = unix_sk(other);
missing barriers in some of unix_sock ->addr and ->path accesses Several u->addr and u->path users are not holding any locks in common with unix_bind(). unix_state_lock() is useless for those purposes. u->addr is assign-once and *(u->addr) is fully set up by the time we set u->addr (all under unix_table_lock). u->path is also set in the same critical area, also before setting u->addr, and any unix_sock with ->path filled will have non-NULL ->addr. So setting ->addr with smp_store_release() is all we need for those "lockless" users - just have them fetch ->addr with smp_load_acquire() and don't even bother looking at ->path if they see NULL ->addr. Users of ->addr and ->path fall into several classes now: 1) ones that do smp_load_acquire(u->addr) and access *(u->addr) and u->path only if smp_load_acquire() has returned non-NULL. 2) places holding unix_table_lock. These are guaranteed that *(u->addr) is seen fully initialized. If unix_sock is in one of the "bound" chains, so's ->path. 3) unix_sock_destructor() using ->addr is safe. All places that set u->addr are guaranteed to have seen all stores *(u->addr) while holding a reference to u and unix_sock_destructor() is called when (atomic) refcount hits zero. 4) unix_release_sock() using ->path is safe. unix_bind() is serialized wrt unix_release() (normally - by struct file refcount), and for the instances that had ->path set by unix_bind() unix_release_sock() comes from unix_release(), so they are fine. Instances that had it set in unix_stream_connect() either end up attached to a socket (in unix_accept()), in which case the call chain to unix_release_sock() and serialization are the same as in the previous case, or they never get accept'ed and unix_release_sock() is called when the listener is shut down and its queue gets purged. In that case the listener's queue lock provides the barriers needed - unix_stream_connect() shoves our unix_sock into listener's queue under that lock right after having set ->path and eventual unix_release_sock() caller picks them from that queue under the same lock right before calling unix_release_sock(). 5) unix_find_other() use of ->path is pointless, but safe - it happens with successful lookup by (abstract) name, so ->path.dentry is guaranteed to be NULL there. earlier-variant-reviewed-by: "Paul E. McKenney" <paulmck@linux.ibm.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-02-16 04:09:35 +08:00
/* copy address information from listening to new sock
*
* The contents of *(otheru->addr) and otheru->path
* are seen fully set up here, since we have found
* otheru in hash under unix_table_lock. Insertion
* into the hash chain we'd found it in had been done
* in an earlier critical area protected by unix_table_lock,
* the same one where we'd set *(otheru->addr) contents,
* as well as otheru->path and otheru->addr itself.
*
* Using smp_store_release() here to set newu->addr
* is enough to make those stores, as well as stores
* to newu->path visible to anyone who gets newu->addr
* by smp_load_acquire(). IOW, the same warranties
* as for unix_sock instances bound in unix_bind() or
* in unix_autobind().
*/
if (otheru->path.dentry) {
path_get(&otheru->path);
newu->path = otheru->path;
}
missing barriers in some of unix_sock ->addr and ->path accesses Several u->addr and u->path users are not holding any locks in common with unix_bind(). unix_state_lock() is useless for those purposes. u->addr is assign-once and *(u->addr) is fully set up by the time we set u->addr (all under unix_table_lock). u->path is also set in the same critical area, also before setting u->addr, and any unix_sock with ->path filled will have non-NULL ->addr. So setting ->addr with smp_store_release() is all we need for those "lockless" users - just have them fetch ->addr with smp_load_acquire() and don't even bother looking at ->path if they see NULL ->addr. Users of ->addr and ->path fall into several classes now: 1) ones that do smp_load_acquire(u->addr) and access *(u->addr) and u->path only if smp_load_acquire() has returned non-NULL. 2) places holding unix_table_lock. These are guaranteed that *(u->addr) is seen fully initialized. If unix_sock is in one of the "bound" chains, so's ->path. 3) unix_sock_destructor() using ->addr is safe. All places that set u->addr are guaranteed to have seen all stores *(u->addr) while holding a reference to u and unix_sock_destructor() is called when (atomic) refcount hits zero. 4) unix_release_sock() using ->path is safe. unix_bind() is serialized wrt unix_release() (normally - by struct file refcount), and for the instances that had ->path set by unix_bind() unix_release_sock() comes from unix_release(), so they are fine. Instances that had it set in unix_stream_connect() either end up attached to a socket (in unix_accept()), in which case the call chain to unix_release_sock() and serialization are the same as in the previous case, or they never get accept'ed and unix_release_sock() is called when the listener is shut down and its queue gets purged. In that case the listener's queue lock provides the barriers needed - unix_stream_connect() shoves our unix_sock into listener's queue under that lock right after having set ->path and eventual unix_release_sock() caller picks them from that queue under the same lock right before calling unix_release_sock(). 5) unix_find_other() use of ->path is pointless, but safe - it happens with successful lookup by (abstract) name, so ->path.dentry is guaranteed to be NULL there. earlier-variant-reviewed-by: "Paul E. McKenney" <paulmck@linux.ibm.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-02-16 04:09:35 +08:00
refcount_inc(&otheru->addr->refcnt);
smp_store_release(&newu->addr, otheru->addr);
/* Set credentials */
copy_peercred(sk, other);
sock->state = SS_CONNECTED;
sk->sk_state = TCP_ESTABLISHED;
sock_hold(newsk);
smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
unix_peer(sk) = newsk;
unix_state_unlock(sk);
/* take ten and and send info to listening sock */
spin_lock(&other->sk_receive_queue.lock);
__skb_queue_tail(&other->sk_receive_queue, skb);
spin_unlock(&other->sk_receive_queue.lock);
unix_state_unlock(other);
other->sk_data_ready(other);
sock_put(other);
return 0;
out_unlock:
if (other)
unix_state_unlock(other);
out:
kfree_skb(skb);
if (newsk)
unix_release_sock(newsk, 0);
if (other)
sock_put(other);
return err;
}
static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
struct sock *ska = socka->sk, *skb = sockb->sk;
/* Join our sockets back to back */
sock_hold(ska);
sock_hold(skb);
unix_peer(ska) = skb;
unix_peer(skb) = ska;
init_peercred(ska);
init_peercred(skb);
if (ska->sk_type != SOCK_DGRAM) {
ska->sk_state = TCP_ESTABLISHED;
skb->sk_state = TCP_ESTABLISHED;
socka->state = SS_CONNECTED;
sockb->state = SS_CONNECTED;
}
return 0;
}
net: unix: inherit SOCK_PASS{CRED, SEC} flags from socket to fix race In the case of credentials passing in unix stream sockets (dgram sockets seem not affected), we get a rather sparse race after commit 16e5726 ("af_unix: dont send SCM_CREDENTIALS by default"). We have a stream server on receiver side that requests credential passing from senders (e.g. nc -U). Since we need to set SO_PASSCRED on each spawned/accepted socket on server side to 1 first (as it's not inherited), it can happen that in the time between accept() and setsockopt() we get interrupted, the sender is being scheduled and continues with passing data to our receiver. At that time SO_PASSCRED is neither set on sender nor receiver side, hence in cmsg's SCM_CREDENTIALS we get eventually pid:0, uid:65534, gid:65534 (== overflow{u,g}id) instead of what we actually would like to see. On the sender side, here nc -U, the tests in maybe_add_creds() invoked through unix_stream_sendmsg() would fail, as at that exact time, as mentioned, the sender has neither SO_PASSCRED on his side nor sees it on the server side, and we have a valid 'other' socket in place. Thus, sender believes it would just look like a normal connection, not needing/requesting SO_PASSCRED at that time. As reverting 16e5726 would not be an option due to the significant performance regression reported when having creds always passed, one way/trade-off to prevent that would be to set SO_PASSCRED on the listener socket and allow inheriting these flags to the spawned socket on server side in accept(). It seems also logical to do so if we'd tell the listener socket to pass those flags onwards, and would fix the race. Before, strace: recvmsg(4, {msg_name(0)=NULL, msg_iov(1)=[{"blub\n", 4096}], msg_controllen=32, {cmsg_len=28, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS{pid=0, uid=65534, gid=65534}}, msg_flags=0}, 0) = 5 After, strace: recvmsg(4, {msg_name(0)=NULL, msg_iov(1)=[{"blub\n", 4096}], msg_controllen=32, {cmsg_len=28, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS{pid=11580, uid=1000, gid=1000}}, msg_flags=0}, 0) = 5 Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Cc: Eric Dumazet <edumazet@google.com> Cc: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-18 04:51:31 +08:00
static void unix_sock_inherit_flags(const struct socket *old,
struct socket *new)
{
if (test_bit(SOCK_PASSCRED, &old->flags))
set_bit(SOCK_PASSCRED, &new->flags);
if (test_bit(SOCK_PASSSEC, &old->flags))
set_bit(SOCK_PASSSEC, &new->flags);
}
net: Work around lockdep limitation in sockets that use sockets Lockdep issues a circular dependency warning when AFS issues an operation through AF_RXRPC from a context in which the VFS/VM holds the mmap_sem. The theory lockdep comes up with is as follows: (1) If the pagefault handler decides it needs to read pages from AFS, it calls AFS with mmap_sem held and AFS begins an AF_RXRPC call, but creating a call requires the socket lock: mmap_sem must be taken before sk_lock-AF_RXRPC (2) afs_open_socket() opens an AF_RXRPC socket and binds it. rxrpc_bind() binds the underlying UDP socket whilst holding its socket lock. inet_bind() takes its own socket lock: sk_lock-AF_RXRPC must be taken before sk_lock-AF_INET (3) Reading from a TCP socket into a userspace buffer might cause a fault and thus cause the kernel to take the mmap_sem, but the TCP socket is locked whilst doing this: sk_lock-AF_INET must be taken before mmap_sem However, lockdep's theory is wrong in this instance because it deals only with lock classes and not individual locks. The AF_INET lock in (2) isn't really equivalent to the AF_INET lock in (3) as the former deals with a socket entirely internal to the kernel that never sees userspace. This is a limitation in the design of lockdep. Fix the general case by: (1) Double up all the locking keys used in sockets so that one set are used if the socket is created by userspace and the other set is used if the socket is created by the kernel. (2) Store the kern parameter passed to sk_alloc() in a variable in the sock struct (sk_kern_sock). This informs sock_lock_init(), sock_init_data() and sk_clone_lock() as to the lock keys to be used. Note that the child created by sk_clone_lock() inherits the parent's kern setting. (3) Add a 'kern' parameter to ->accept() that is analogous to the one passed in to ->create() that distinguishes whether kernel_accept() or sys_accept4() was the caller and can be passed to sk_alloc(). Note that a lot of accept functions merely dequeue an already allocated socket. I haven't touched these as the new socket already exists before we get the parameter. Note also that there are a couple of places where I've made the accepted socket unconditionally kernel-based: irda_accept() rds_rcp_accept_one() tcp_accept_from_sock() because they follow a sock_create_kern() and accept off of that. Whilst creating this, I noticed that lustre and ocfs don't create sockets through sock_create_kern() and thus they aren't marked as for-kernel, though they appear to be internal. I wonder if these should do that so that they use the new set of lock keys. Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-09 16:09:05 +08:00
static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
bool kern)
{
struct sock *sk = sock->sk;
struct sock *tsk;
struct sk_buff *skb;
int err;
err = -EOPNOTSUPP;
if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
goto out;
err = -EINVAL;
if (sk->sk_state != TCP_LISTEN)
goto out;
/* If socket state is TCP_LISTEN it cannot change (for now...),
* so that no locks are necessary.
*/
skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
if (!skb) {
/* This means receive shutdown. */
if (err == 0)
err = -EINVAL;
goto out;
}
tsk = skb->sk;
skb_free_datagram(sk, skb);
wake_up_interruptible(&unix_sk(sk)->peer_wait);
/* attach accepted sock to socket */
unix_state_lock(tsk);
newsock->state = SS_CONNECTED;
net: unix: inherit SOCK_PASS{CRED, SEC} flags from socket to fix race In the case of credentials passing in unix stream sockets (dgram sockets seem not affected), we get a rather sparse race after commit 16e5726 ("af_unix: dont send SCM_CREDENTIALS by default"). We have a stream server on receiver side that requests credential passing from senders (e.g. nc -U). Since we need to set SO_PASSCRED on each spawned/accepted socket on server side to 1 first (as it's not inherited), it can happen that in the time between accept() and setsockopt() we get interrupted, the sender is being scheduled and continues with passing data to our receiver. At that time SO_PASSCRED is neither set on sender nor receiver side, hence in cmsg's SCM_CREDENTIALS we get eventually pid:0, uid:65534, gid:65534 (== overflow{u,g}id) instead of what we actually would like to see. On the sender side, here nc -U, the tests in maybe_add_creds() invoked through unix_stream_sendmsg() would fail, as at that exact time, as mentioned, the sender has neither SO_PASSCRED on his side nor sees it on the server side, and we have a valid 'other' socket in place. Thus, sender believes it would just look like a normal connection, not needing/requesting SO_PASSCRED at that time. As reverting 16e5726 would not be an option due to the significant performance regression reported when having creds always passed, one way/trade-off to prevent that would be to set SO_PASSCRED on the listener socket and allow inheriting these flags to the spawned socket on server side in accept(). It seems also logical to do so if we'd tell the listener socket to pass those flags onwards, and would fix the race. Before, strace: recvmsg(4, {msg_name(0)=NULL, msg_iov(1)=[{"blub\n", 4096}], msg_controllen=32, {cmsg_len=28, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS{pid=0, uid=65534, gid=65534}}, msg_flags=0}, 0) = 5 After, strace: recvmsg(4, {msg_name(0)=NULL, msg_iov(1)=[{"blub\n", 4096}], msg_controllen=32, {cmsg_len=28, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS{pid=11580, uid=1000, gid=1000}}, msg_flags=0}, 0) = 5 Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Cc: Eric Dumazet <edumazet@google.com> Cc: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-18 04:51:31 +08:00
unix_sock_inherit_flags(sock, newsock);
sock_graft(tsk, newsock);
unix_state_unlock(tsk);
return 0;
out:
return err;
}
net: make getname() functions return length rather than use int* parameter Changes since v1: Added changes in these files: drivers/infiniband/hw/usnic/usnic_transport.c drivers/staging/lustre/lnet/lnet/lib-socket.c drivers/target/iscsi/iscsi_target_login.c drivers/vhost/net.c fs/dlm/lowcomms.c fs/ocfs2/cluster/tcp.c security/tomoyo/network.c Before: All these functions either return a negative error indicator, or store length of sockaddr into "int *socklen" parameter and return zero on success. "int *socklen" parameter is awkward. For example, if caller does not care, it still needs to provide on-stack storage for the value it does not need. None of the many FOO_getname() functions of various protocols ever used old value of *socklen. They always just overwrite it. This change drops this parameter, and makes all these functions, on success, return length of sockaddr. It's always >= 0 and can be differentiated from an error. Tests in callers are changed from "if (err)" to "if (err < 0)", where needed. rpc_sockname() lost "int buflen" parameter, since its only use was to be passed to kernel_getsockname() as &buflen and subsequently not used in any way. Userspace API is not changed. text data bss dec hex filename 30108430 2633624 873672 33615726 200ef6e vmlinux.before.o 30108109 2633612 873672 33615393 200ee21 vmlinux.o Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com> CC: David S. Miller <davem@davemloft.net> CC: linux-kernel@vger.kernel.org CC: netdev@vger.kernel.org CC: linux-bluetooth@vger.kernel.org CC: linux-decnet-user@lists.sourceforge.net CC: linux-wireless@vger.kernel.org CC: linux-rdma@vger.kernel.org CC: linux-sctp@vger.kernel.org CC: linux-nfs@vger.kernel.org CC: linux-x25@vger.kernel.org Signed-off-by: David S. Miller <davem@davemloft.net>
2018-02-13 03:00:20 +08:00
static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
struct sock *sk = sock->sk;
missing barriers in some of unix_sock ->addr and ->path accesses Several u->addr and u->path users are not holding any locks in common with unix_bind(). unix_state_lock() is useless for those purposes. u->addr is assign-once and *(u->addr) is fully set up by the time we set u->addr (all under unix_table_lock). u->path is also set in the same critical area, also before setting u->addr, and any unix_sock with ->path filled will have non-NULL ->addr. So setting ->addr with smp_store_release() is all we need for those "lockless" users - just have them fetch ->addr with smp_load_acquire() and don't even bother looking at ->path if they see NULL ->addr. Users of ->addr and ->path fall into several classes now: 1) ones that do smp_load_acquire(u->addr) and access *(u->addr) and u->path only if smp_load_acquire() has returned non-NULL. 2) places holding unix_table_lock. These are guaranteed that *(u->addr) is seen fully initialized. If unix_sock is in one of the "bound" chains, so's ->path. 3) unix_sock_destructor() using ->addr is safe. All places that set u->addr are guaranteed to have seen all stores *(u->addr) while holding a reference to u and unix_sock_destructor() is called when (atomic) refcount hits zero. 4) unix_release_sock() using ->path is safe. unix_bind() is serialized wrt unix_release() (normally - by struct file refcount), and for the instances that had ->path set by unix_bind() unix_release_sock() comes from unix_release(), so they are fine. Instances that had it set in unix_stream_connect() either end up attached to a socket (in unix_accept()), in which case the call chain to unix_release_sock() and serialization are the same as in the previous case, or they never get accept'ed and unix_release_sock() is called when the listener is shut down and its queue gets purged. In that case the listener's queue lock provides the barriers needed - unix_stream_connect() shoves our unix_sock into listener's queue under that lock right after having set ->path and eventual unix_release_sock() caller picks them from that queue under the same lock right before calling unix_release_sock(). 5) unix_find_other() use of ->path is pointless, but safe - it happens with successful lookup by (abstract) name, so ->path.dentry is guaranteed to be NULL there. earlier-variant-reviewed-by: "Paul E. McKenney" <paulmck@linux.ibm.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-02-16 04:09:35 +08:00
struct unix_address *addr;
DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
int err = 0;
if (peer) {
sk = unix_peer_get(sk);
err = -ENOTCONN;
if (!sk)
goto out;
err = 0;
} else {
sock_hold(sk);
}
missing barriers in some of unix_sock ->addr and ->path accesses Several u->addr and u->path users are not holding any locks in common with unix_bind(). unix_state_lock() is useless for those purposes. u->addr is assign-once and *(u->addr) is fully set up by the time we set u->addr (all under unix_table_lock). u->path is also set in the same critical area, also before setting u->addr, and any unix_sock with ->path filled will have non-NULL ->addr. So setting ->addr with smp_store_release() is all we need for those "lockless" users - just have them fetch ->addr with smp_load_acquire() and don't even bother looking at ->path if they see NULL ->addr. Users of ->addr and ->path fall into several classes now: 1) ones that do smp_load_acquire(u->addr) and access *(u->addr) and u->path only if smp_load_acquire() has returned non-NULL. 2) places holding unix_table_lock. These are guaranteed that *(u->addr) is seen fully initialized. If unix_sock is in one of the "bound" chains, so's ->path. 3) unix_sock_destructor() using ->addr is safe. All places that set u->addr are guaranteed to have seen all stores *(u->addr) while holding a reference to u and unix_sock_destructor() is called when (atomic) refcount hits zero. 4) unix_release_sock() using ->path is safe. unix_bind() is serialized wrt unix_release() (normally - by struct file refcount), and for the instances that had ->path set by unix_bind() unix_release_sock() comes from unix_release(), so they are fine. Instances that had it set in unix_stream_connect() either end up attached to a socket (in unix_accept()), in which case the call chain to unix_release_sock() and serialization are the same as in the previous case, or they never get accept'ed and unix_release_sock() is called when the listener is shut down and its queue gets purged. In that case the listener's queue lock provides the barriers needed - unix_stream_connect() shoves our unix_sock into listener's queue under that lock right after having set ->path and eventual unix_release_sock() caller picks them from that queue under the same lock right before calling unix_release_sock(). 5) unix_find_other() use of ->path is pointless, but safe - it happens with successful lookup by (abstract) name, so ->path.dentry is guaranteed to be NULL there. earlier-variant-reviewed-by: "Paul E. McKenney" <paulmck@linux.ibm.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-02-16 04:09:35 +08:00
addr = smp_load_acquire(&unix_sk(sk)->addr);
if (!addr) {
sunaddr->sun_family = AF_UNIX;
sunaddr->sun_path[0] = 0;
net: make getname() functions return length rather than use int* parameter Changes since v1: Added changes in these files: drivers/infiniband/hw/usnic/usnic_transport.c drivers/staging/lustre/lnet/lnet/lib-socket.c drivers/target/iscsi/iscsi_target_login.c drivers/vhost/net.c fs/dlm/lowcomms.c fs/ocfs2/cluster/tcp.c security/tomoyo/network.c Before: All these functions either return a negative error indicator, or store length of sockaddr into "int *socklen" parameter and return zero on success. "int *socklen" parameter is awkward. For example, if caller does not care, it still needs to provide on-stack storage for the value it does not need. None of the many FOO_getname() functions of various protocols ever used old value of *socklen. They always just overwrite it. This change drops this parameter, and makes all these functions, on success, return length of sockaddr. It's always >= 0 and can be differentiated from an error. Tests in callers are changed from "if (err)" to "if (err < 0)", where needed. rpc_sockname() lost "int buflen" parameter, since its only use was to be passed to kernel_getsockname() as &buflen and subsequently not used in any way. Userspace API is not changed. text data bss dec hex filename 30108430 2633624 873672 33615726 200ef6e vmlinux.before.o 30108109 2633612 873672 33615393 200ee21 vmlinux.o Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com> CC: David S. Miller <davem@davemloft.net> CC: linux-kernel@vger.kernel.org CC: netdev@vger.kernel.org CC: linux-bluetooth@vger.kernel.org CC: linux-decnet-user@lists.sourceforge.net CC: linux-wireless@vger.kernel.org CC: linux-rdma@vger.kernel.org CC: linux-sctp@vger.kernel.org CC: linux-nfs@vger.kernel.org CC: linux-x25@vger.kernel.org Signed-off-by: David S. Miller <davem@davemloft.net>
2018-02-13 03:00:20 +08:00
err = sizeof(short);
} else {
net: make getname() functions return length rather than use int* parameter Changes since v1: Added changes in these files: drivers/infiniband/hw/usnic/usnic_transport.c drivers/staging/lustre/lnet/lnet/lib-socket.c drivers/target/iscsi/iscsi_target_login.c drivers/vhost/net.c fs/dlm/lowcomms.c fs/ocfs2/cluster/tcp.c security/tomoyo/network.c Before: All these functions either return a negative error indicator, or store length of sockaddr into "int *socklen" parameter and return zero on success. "int *socklen" parameter is awkward. For example, if caller does not care, it still needs to provide on-stack storage for the value it does not need. None of the many FOO_getname() functions of various protocols ever used old value of *socklen. They always just overwrite it. This change drops this parameter, and makes all these functions, on success, return length of sockaddr. It's always >= 0 and can be differentiated from an error. Tests in callers are changed from "if (err)" to "if (err < 0)", where needed. rpc_sockname() lost "int buflen" parameter, since its only use was to be passed to kernel_getsockname() as &buflen and subsequently not used in any way. Userspace API is not changed. text data bss dec hex filename 30108430 2633624 873672 33615726 200ef6e vmlinux.before.o 30108109 2633612 873672 33615393 200ee21 vmlinux.o Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com> CC: David S. Miller <davem@davemloft.net> CC: linux-kernel@vger.kernel.org CC: netdev@vger.kernel.org CC: linux-bluetooth@vger.kernel.org CC: linux-decnet-user@lists.sourceforge.net CC: linux-wireless@vger.kernel.org CC: linux-rdma@vger.kernel.org CC: linux-sctp@vger.kernel.org CC: linux-nfs@vger.kernel.org CC: linux-x25@vger.kernel.org Signed-off-by: David S. Miller <davem@davemloft.net>
2018-02-13 03:00:20 +08:00
err = addr->len;
memcpy(sunaddr, addr->name, addr->len);
}
sock_put(sk);
out:
return err;
}
static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
int err = 0;
af_unix: dont send SCM_CREDENTIALS by default Since commit 7361c36c5224 (af_unix: Allow credentials to work across user and pid namespaces) af_unix performance dropped a lot. This is because we now take a reference on pid and cred in each write(), and release them in read(), usually done from another process, eventually from another cpu. This triggers false sharing. # Events: 154K cycles # # Overhead Command Shared Object Symbol # ........ ....... .................. ......................... # 10.40% hackbench [kernel.kallsyms] [k] put_pid 8.60% hackbench [kernel.kallsyms] [k] unix_stream_recvmsg 7.87% hackbench [kernel.kallsyms] [k] unix_stream_sendmsg 6.11% hackbench [kernel.kallsyms] [k] do_raw_spin_lock 4.95% hackbench [kernel.kallsyms] [k] unix_scm_to_skb 4.87% hackbench [kernel.kallsyms] [k] pid_nr_ns 4.34% hackbench [kernel.kallsyms] [k] cred_to_ucred 2.39% hackbench [kernel.kallsyms] [k] unix_destruct_scm 2.24% hackbench [kernel.kallsyms] [k] sub_preempt_count 1.75% hackbench [kernel.kallsyms] [k] fget_light 1.51% hackbench [kernel.kallsyms] [k] __mutex_lock_interruptible_slowpath 1.42% hackbench [kernel.kallsyms] [k] sock_alloc_send_pskb This patch includes SCM_CREDENTIALS information in a af_unix message/skb only if requested by the sender, [man 7 unix for details how to include ancillary data using sendmsg() system call] Note: This might break buggy applications that expected SCM_CREDENTIAL from an unaware write() system call, and receiver not using SO_PASSCRED socket option. If SOCK_PASSCRED is set on source or destination socket, we still include credentials for mere write() syscalls. Performance boost in hackbench : more than 50% gain on a 16 thread machine (2 quad-core cpus, 2 threads per core) hackbench 20 thread 2000 4.228 sec instead of 9.102 sec Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Acked-by: Tim Chen <tim.c.chen@linux.intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-09-19 13:52:27 +08:00
UNIXCB(skb).pid = get_pid(scm->pid);
UNIXCB(skb).uid = scm->creds.uid;
UNIXCB(skb).gid = scm->creds.gid;
UNIXCB(skb).fp = NULL;
unix_get_secdata(scm, skb);
if (scm->fp && send_fds)
err = unix_attach_fds(scm, skb);
skb->destructor = unix_destruct_scm;
return err;
}
static bool unix_passcred_enabled(const struct socket *sock,
const struct sock *other)
{
return test_bit(SOCK_PASSCRED, &sock->flags) ||
!other->sk_socket ||
test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
}
af_unix: dont send SCM_CREDENTIALS by default Since commit 7361c36c5224 (af_unix: Allow credentials to work across user and pid namespaces) af_unix performance dropped a lot. This is because we now take a reference on pid and cred in each write(), and release them in read(), usually done from another process, eventually from another cpu. This triggers false sharing. # Events: 154K cycles # # Overhead Command Shared Object Symbol # ........ ....... .................. ......................... # 10.40% hackbench [kernel.kallsyms] [k] put_pid 8.60% hackbench [kernel.kallsyms] [k] unix_stream_recvmsg 7.87% hackbench [kernel.kallsyms] [k] unix_stream_sendmsg 6.11% hackbench [kernel.kallsyms] [k] do_raw_spin_lock 4.95% hackbench [kernel.kallsyms] [k] unix_scm_to_skb 4.87% hackbench [kernel.kallsyms] [k] pid_nr_ns 4.34% hackbench [kernel.kallsyms] [k] cred_to_ucred 2.39% hackbench [kernel.kallsyms] [k] unix_destruct_scm 2.24% hackbench [kernel.kallsyms] [k] sub_preempt_count 1.75% hackbench [kernel.kallsyms] [k] fget_light 1.51% hackbench [kernel.kallsyms] [k] __mutex_lock_interruptible_slowpath 1.42% hackbench [kernel.kallsyms] [k] sock_alloc_send_pskb This patch includes SCM_CREDENTIALS information in a af_unix message/skb only if requested by the sender, [man 7 unix for details how to include ancillary data using sendmsg() system call] Note: This might break buggy applications that expected SCM_CREDENTIAL from an unaware write() system call, and receiver not using SO_PASSCRED socket option. If SOCK_PASSCRED is set on source or destination socket, we still include credentials for mere write() syscalls. Performance boost in hackbench : more than 50% gain on a 16 thread machine (2 quad-core cpus, 2 threads per core) hackbench 20 thread 2000 4.228 sec instead of 9.102 sec Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Acked-by: Tim Chen <tim.c.chen@linux.intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-09-19 13:52:27 +08:00
/*
* Some apps rely on write() giving SCM_CREDENTIALS
* We include credentials if source or destination socket
* asserted SOCK_PASSCRED.
*/
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
const struct sock *other)
{
if (UNIXCB(skb).pid)
af_unix: dont send SCM_CREDENTIALS by default Since commit 7361c36c5224 (af_unix: Allow credentials to work across user and pid namespaces) af_unix performance dropped a lot. This is because we now take a reference on pid and cred in each write(), and release them in read(), usually done from another process, eventually from another cpu. This triggers false sharing. # Events: 154K cycles # # Overhead Command Shared Object Symbol # ........ ....... .................. ......................... # 10.40% hackbench [kernel.kallsyms] [k] put_pid 8.60% hackbench [kernel.kallsyms] [k] unix_stream_recvmsg 7.87% hackbench [kernel.kallsyms] [k] unix_stream_sendmsg 6.11% hackbench [kernel.kallsyms] [k] do_raw_spin_lock 4.95% hackbench [kernel.kallsyms] [k] unix_scm_to_skb 4.87% hackbench [kernel.kallsyms] [k] pid_nr_ns 4.34% hackbench [kernel.kallsyms] [k] cred_to_ucred 2.39% hackbench [kernel.kallsyms] [k] unix_destruct_scm 2.24% hackbench [kernel.kallsyms] [k] sub_preempt_count 1.75% hackbench [kernel.kallsyms] [k] fget_light 1.51% hackbench [kernel.kallsyms] [k] __mutex_lock_interruptible_slowpath 1.42% hackbench [kernel.kallsyms] [k] sock_alloc_send_pskb This patch includes SCM_CREDENTIALS information in a af_unix message/skb only if requested by the sender, [man 7 unix for details how to include ancillary data using sendmsg() system call] Note: This might break buggy applications that expected SCM_CREDENTIAL from an unaware write() system call, and receiver not using SO_PASSCRED socket option. If SOCK_PASSCRED is set on source or destination socket, we still include credentials for mere write() syscalls. Performance boost in hackbench : more than 50% gain on a 16 thread machine (2 quad-core cpus, 2 threads per core) hackbench 20 thread 2000 4.228 sec instead of 9.102 sec Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Acked-by: Tim Chen <tim.c.chen@linux.intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-09-19 13:52:27 +08:00
return;
if (unix_passcred_enabled(sock, other)) {
af_unix: dont send SCM_CREDENTIALS by default Since commit 7361c36c5224 (af_unix: Allow credentials to work across user and pid namespaces) af_unix performance dropped a lot. This is because we now take a reference on pid and cred in each write(), and release them in read(), usually done from another process, eventually from another cpu. This triggers false sharing. # Events: 154K cycles # # Overhead Command Shared Object Symbol # ........ ....... .................. ......................... # 10.40% hackbench [kernel.kallsyms] [k] put_pid 8.60% hackbench [kernel.kallsyms] [k] unix_stream_recvmsg 7.87% hackbench [kernel.kallsyms] [k] unix_stream_sendmsg 6.11% hackbench [kernel.kallsyms] [k] do_raw_spin_lock 4.95% hackbench [kernel.kallsyms] [k] unix_scm_to_skb 4.87% hackbench [kernel.kallsyms] [k] pid_nr_ns 4.34% hackbench [kernel.kallsyms] [k] cred_to_ucred 2.39% hackbench [kernel.kallsyms] [k] unix_destruct_scm 2.24% hackbench [kernel.kallsyms] [k] sub_preempt_count 1.75% hackbench [kernel.kallsyms] [k] fget_light 1.51% hackbench [kernel.kallsyms] [k] __mutex_lock_interruptible_slowpath 1.42% hackbench [kernel.kallsyms] [k] sock_alloc_send_pskb This patch includes SCM_CREDENTIALS information in a af_unix message/skb only if requested by the sender, [man 7 unix for details how to include ancillary data using sendmsg() system call] Note: This might break buggy applications that expected SCM_CREDENTIAL from an unaware write() system call, and receiver not using SO_PASSCRED socket option. If SOCK_PASSCRED is set on source or destination socket, we still include credentials for mere write() syscalls. Performance boost in hackbench : more than 50% gain on a 16 thread machine (2 quad-core cpus, 2 threads per core) hackbench 20 thread 2000 4.228 sec instead of 9.102 sec Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Acked-by: Tim Chen <tim.c.chen@linux.intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-09-19 13:52:27 +08:00
UNIXCB(skb).pid = get_pid(task_tgid(current));
current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
af_unix: dont send SCM_CREDENTIALS by default Since commit 7361c36c5224 (af_unix: Allow credentials to work across user and pid namespaces) af_unix performance dropped a lot. This is because we now take a reference on pid and cred in each write(), and release them in read(), usually done from another process, eventually from another cpu. This triggers false sharing. # Events: 154K cycles # # Overhead Command Shared Object Symbol # ........ ....... .................. ......................... # 10.40% hackbench [kernel.kallsyms] [k] put_pid 8.60% hackbench [kernel.kallsyms] [k] unix_stream_recvmsg 7.87% hackbench [kernel.kallsyms] [k] unix_stream_sendmsg 6.11% hackbench [kernel.kallsyms] [k] do_raw_spin_lock 4.95% hackbench [kernel.kallsyms] [k] unix_scm_to_skb 4.87% hackbench [kernel.kallsyms] [k] pid_nr_ns 4.34% hackbench [kernel.kallsyms] [k] cred_to_ucred 2.39% hackbench [kernel.kallsyms] [k] unix_destruct_scm 2.24% hackbench [kernel.kallsyms] [k] sub_preempt_count 1.75% hackbench [kernel.kallsyms] [k] fget_light 1.51% hackbench [kernel.kallsyms] [k] __mutex_lock_interruptible_slowpath 1.42% hackbench [kernel.kallsyms] [k] sock_alloc_send_pskb This patch includes SCM_CREDENTIALS information in a af_unix message/skb only if requested by the sender, [man 7 unix for details how to include ancillary data using sendmsg() system call] Note: This might break buggy applications that expected SCM_CREDENTIAL from an unaware write() system call, and receiver not using SO_PASSCRED socket option. If SOCK_PASSCRED is set on source or destination socket, we still include credentials for mere write() syscalls. Performance boost in hackbench : more than 50% gain on a 16 thread machine (2 quad-core cpus, 2 threads per core) hackbench 20 thread 2000 4.228 sec instead of 9.102 sec Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Acked-by: Tim Chen <tim.c.chen@linux.intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-09-19 13:52:27 +08:00
}
}
static int maybe_init_creds(struct scm_cookie *scm,
struct socket *socket,
const struct sock *other)
{
int err;
struct msghdr msg = { .msg_controllen = 0 };
err = scm_send(socket, &msg, scm, false);
if (err)
return err;
if (unix_passcred_enabled(socket, other)) {
scm->pid = get_pid(task_tgid(current));
current_uid_gid(&scm->creds.uid, &scm->creds.gid);
}
return err;
}
static bool unix_skb_scm_eq(struct sk_buff *skb,
struct scm_cookie *scm)
{
const struct unix_skb_parms *u = &UNIXCB(skb);
return u->pid == scm->pid &&
uid_eq(u->uid, scm->creds.uid) &&
gid_eq(u->gid, scm->creds.gid) &&
unix_secdata_eq(scm, skb);
}
static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
{
struct scm_fp_list *fp = UNIXCB(skb).fp;
struct unix_sock *u = unix_sk(sk);
if (unlikely(fp && fp->count))
atomic_add(fp->count, &u->scm_stat.nr_fds);
}
static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
{
struct scm_fp_list *fp = UNIXCB(skb).fp;
struct unix_sock *u = unix_sk(sk);
if (unlikely(fp && fp->count))
atomic_sub(fp->count, &u->scm_stat.nr_fds);
}
/*
* Send AF_UNIX data.
*/
static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
size_t len)
{
struct sock *sk = sock->sk;
struct net *net = sock_net(sk);
struct unix_sock *u = unix_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
struct sock *other = NULL;
int namelen = 0; /* fake GCC */
int err;
unsigned int hash;
struct sk_buff *skb;
long timeo;
struct scm_cookie scm;
int data_len = 0;
unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat <rweikusat@mobileactivedefense.com> writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron <jbaron@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-21 06:07:23 +08:00
int sk_locked;
wait_for_unix_gc();
err = scm_send(sock, msg, &scm, false);
if (err < 0)
return err;
err = -EOPNOTSUPP;
if (msg->msg_flags&MSG_OOB)
goto out;
if (msg->msg_namelen) {
err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
if (err < 0)
goto out;
namelen = err;
} else {
sunaddr = NULL;
err = -ENOTCONN;
other = unix_peer_get(sk);
if (!other)
goto out;
}
if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
&& (err = unix_autobind(sock)) != 0)
goto out;
err = -EMSGSIZE;
if (len > sk->sk_sndbuf - 32)
goto out;
if (len > SKB_MAX_ALLOC) {
data_len = min_t(size_t,
len - SKB_MAX_ALLOC,
MAX_SKB_FRAGS * PAGE_SIZE);
data_len = PAGE_ALIGN(data_len);
BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
}
skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
msg->msg_flags & MSG_DONTWAIT, &err,
PAGE_ALLOC_COSTLY_ORDER);
if (skb == NULL)
goto out;
err = unix_scm_to_skb(&scm, skb, true);
if (err < 0)
goto out_free;
[AF_UNIX]: Datagram getpeersec This patch implements an API whereby an application can determine the label of its peer's Unix datagram sockets via the auxiliary data mechanism of recvmsg. Patch purpose: This patch enables a security-aware application to retrieve the security context of the peer of a Unix datagram socket. The application can then use this security context to determine the security context for processing on behalf of the peer who sent the packet. Patch design and implementation: The design and implementation is very similar to the UDP case for INET sockets. Basically we build upon the existing Unix domain socket API for retrieving user credentials. Linux offers the API for obtaining user credentials via ancillary messages (i.e., out of band/control messages that are bundled together with a normal message). To retrieve the security context, the application first indicates to the kernel such desire by setting the SO_PASSSEC option via getsockopt. Then the application retrieves the security context using the auxiliary data mechanism. An example server application for Unix datagram socket should look like this: toggle = 1; toggle_len = sizeof(toggle); setsockopt(sockfd, SOL_SOCKET, SO_PASSSEC, &toggle, &toggle_len); recvmsg(sockfd, &msg_hdr, 0); if (msg_hdr.msg_controllen > sizeof(struct cmsghdr)) { cmsg_hdr = CMSG_FIRSTHDR(&msg_hdr); if (cmsg_hdr->cmsg_len <= CMSG_LEN(sizeof(scontext)) && cmsg_hdr->cmsg_level == SOL_SOCKET && cmsg_hdr->cmsg_type == SCM_SECURITY) { memcpy(&scontext, CMSG_DATA(cmsg_hdr), sizeof(scontext)); } } sock_setsockopt is enhanced with a new socket option SOCK_PASSSEC to allow a server socket to receive security context of the peer. Testing: We have tested the patch by setting up Unix datagram client and server applications. We verified that the server can retrieve the security context using the auxiliary data mechanism of recvmsg. Signed-off-by: Catherine Zhang <cxzhang@watson.ibm.com> Acked-by: Acked-by: James Morris <jmorris@namei.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2006-06-30 03:27:47 +08:00
skb_put(skb, len - data_len);
skb->data_len = data_len;
skb->len = len;
err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
if (err)
goto out_free;
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
restart:
if (!other) {
err = -ECONNRESET;
if (sunaddr == NULL)
goto out_free;
other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
hash, &err);
if (other == NULL)
goto out_free;
}
2011-01-18 14:39:15 +08:00
if (sk_filter(other, skb) < 0) {
/* Toss the packet but do not return any error to the sender */
err = len;
goto out_free;
}
unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat <rweikusat@mobileactivedefense.com> writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron <jbaron@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-21 06:07:23 +08:00
sk_locked = 0;
unix_state_lock(other);
unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat <rweikusat@mobileactivedefense.com> writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron <jbaron@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-21 06:07:23 +08:00
restart_locked:
err = -EPERM;
if (!unix_may_send(sk, other))
goto out_unlock;
unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat <rweikusat@mobileactivedefense.com> writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron <jbaron@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-21 06:07:23 +08:00
if (unlikely(sock_flag(other, SOCK_DEAD))) {
/*
* Check with 1003.1g - what should
* datagram error
*/
unix_state_unlock(other);
sock_put(other);
unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat <rweikusat@mobileactivedefense.com> writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron <jbaron@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-21 06:07:23 +08:00
if (!sk_locked)
unix_state_lock(sk);
err = 0;
if (unix_peer(sk) == other) {
unix_peer(sk) = NULL;
unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat <rweikusat@mobileactivedefense.com> writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron <jbaron@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-21 06:07:23 +08:00
unix_dgram_peer_wake_disconnect_wakeup(sk, other);
unix_state_unlock(sk);
unix_dgram_disconnected(sk, other);
sock_put(other);
err = -ECONNREFUSED;
} else {
unix_state_unlock(sk);
}
other = NULL;
if (err)
goto out_free;
goto restart;
}
err = -EPIPE;
if (other->sk_shutdown & RCV_SHUTDOWN)
goto out_unlock;
if (sk->sk_type != SOCK_SEQPACKET) {
err = security_unix_may_send(sk->sk_socket, other->sk_socket);
if (err)
goto out_unlock;
}
/* other == sk && unix_peer(other) != sk if
* - unix_peer(sk) == NULL, destination address bound to sk
* - unix_peer(sk) == sk by time of get but disconnected before lock
*/
if (other != sk &&
unlikely(unix_peer(other) != sk &&
unix_recvq_full_lockless(other))) {
unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat <rweikusat@mobileactivedefense.com> writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron <jbaron@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-21 06:07:23 +08:00
if (timeo) {
timeo = unix_wait_for_peer(other, timeo);
err = sock_intr_errno(timeo);
if (signal_pending(current))
goto out_free;
goto restart;
}
unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat <rweikusat@mobileactivedefense.com> writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron <jbaron@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-21 06:07:23 +08:00
if (!sk_locked) {
unix_state_unlock(other);
unix_state_double_lock(sk, other);
}
unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat <rweikusat@mobileactivedefense.com> writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron <jbaron@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-21 06:07:23 +08:00
if (unix_peer(sk) != other ||
unix_dgram_peer_wake_me(sk, other)) {
err = -EAGAIN;
sk_locked = 1;
goto out_unlock;
}
unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat <rweikusat@mobileactivedefense.com> writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron <jbaron@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-21 06:07:23 +08:00
if (!sk_locked) {
sk_locked = 1;
goto restart_locked;
}
}
unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat <rweikusat@mobileactivedefense.com> writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron <jbaron@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-21 06:07:23 +08:00
if (unlikely(sk_locked))
unix_state_unlock(sk);
if (sock_flag(other, SOCK_RCVTSTAMP))
__net_timestamp(skb);
af_unix: dont send SCM_CREDENTIALS by default Since commit 7361c36c5224 (af_unix: Allow credentials to work across user and pid namespaces) af_unix performance dropped a lot. This is because we now take a reference on pid and cred in each write(), and release them in read(), usually done from another process, eventually from another cpu. This triggers false sharing. # Events: 154K cycles # # Overhead Command Shared Object Symbol # ........ ....... .................. ......................... # 10.40% hackbench [kernel.kallsyms] [k] put_pid 8.60% hackbench [kernel.kallsyms] [k] unix_stream_recvmsg 7.87% hackbench [kernel.kallsyms] [k] unix_stream_sendmsg 6.11% hackbench [kernel.kallsyms] [k] do_raw_spin_lock 4.95% hackbench [kernel.kallsyms] [k] unix_scm_to_skb 4.87% hackbench [kernel.kallsyms] [k] pid_nr_ns 4.34% hackbench [kernel.kallsyms] [k] cred_to_ucred 2.39% hackbench [kernel.kallsyms] [k] unix_destruct_scm 2.24% hackbench [kernel.kallsyms] [k] sub_preempt_count 1.75% hackbench [kernel.kallsyms] [k] fget_light 1.51% hackbench [kernel.kallsyms] [k] __mutex_lock_interruptible_slowpath 1.42% hackbench [kernel.kallsyms] [k] sock_alloc_send_pskb This patch includes SCM_CREDENTIALS information in a af_unix message/skb only if requested by the sender, [man 7 unix for details how to include ancillary data using sendmsg() system call] Note: This might break buggy applications that expected SCM_CREDENTIAL from an unaware write() system call, and receiver not using SO_PASSCRED socket option. If SOCK_PASSCRED is set on source or destination socket, we still include credentials for mere write() syscalls. Performance boost in hackbench : more than 50% gain on a 16 thread machine (2 quad-core cpus, 2 threads per core) hackbench 20 thread 2000 4.228 sec instead of 9.102 sec Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Acked-by: Tim Chen <tim.c.chen@linux.intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-09-19 13:52:27 +08:00
maybe_add_creds(skb, sock, other);
scm_stat_add(other, skb);
skb_queue_tail(&other->sk_receive_queue, skb);
unix_state_unlock(other);
other->sk_data_ready(other);
sock_put(other);
scm_destroy(&scm);
return len;
out_unlock:
unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat <rweikusat@mobileactivedefense.com> writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron <jbaron@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-21 06:07:23 +08:00
if (sk_locked)
unix_state_unlock(sk);
unix_state_unlock(other);
out_free:
kfree_skb(skb);
out:
if (other)
sock_put(other);
scm_destroy(&scm);
return err;
}
/* We use paged skbs for stream sockets, and limit occupancy to 32768
* bytes, and a minimum of a full page.
*/
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
size_t len)
{
struct sock *sk = sock->sk;
struct sock *other = NULL;
int err, size;
struct sk_buff *skb;
int sent = 0;
struct scm_cookie scm;
bool fds_sent = false;
int data_len;
wait_for_unix_gc();
err = scm_send(sock, msg, &scm, false);
if (err < 0)
return err;
err = -EOPNOTSUPP;
if (msg->msg_flags&MSG_OOB)
goto out_err;
if (msg->msg_namelen) {
err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
goto out_err;
} else {
err = -ENOTCONN;
other = unix_peer(sk);
if (!other)
goto out_err;
}
if (sk->sk_shutdown & SEND_SHUTDOWN)
goto pipe_err;
while (sent < len) {
size = len - sent;
/* Keep two messages in the pipe so it schedules better */
size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
/* allow fallback to order-0 allocations */
size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
msg->msg_flags & MSG_DONTWAIT, &err,
get_order(UNIX_SKB_FRAGS_SZ));
if (!skb)
goto out_err;
/* Only send the fds in the first buffer */
err = unix_scm_to_skb(&scm, skb, !fds_sent);
if (err < 0) {
kfree_skb(skb);
goto out_err;
}
fds_sent = true;
skb_put(skb, size - data_len);
skb->data_len = data_len;
skb->len = size;
err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
if (err) {
kfree_skb(skb);
goto out_err;
}
unix_state_lock(other);
if (sock_flag(other, SOCK_DEAD) ||
(other->sk_shutdown & RCV_SHUTDOWN))
goto pipe_err_free;
af_unix: dont send SCM_CREDENTIALS by default Since commit 7361c36c5224 (af_unix: Allow credentials to work across user and pid namespaces) af_unix performance dropped a lot. This is because we now take a reference on pid and cred in each write(), and release them in read(), usually done from another process, eventually from another cpu. This triggers false sharing. # Events: 154K cycles # # Overhead Command Shared Object Symbol # ........ ....... .................. ......................... # 10.40% hackbench [kernel.kallsyms] [k] put_pid 8.60% hackbench [kernel.kallsyms] [k] unix_stream_recvmsg 7.87% hackbench [kernel.kallsyms] [k] unix_stream_sendmsg 6.11% hackbench [kernel.kallsyms] [k] do_raw_spin_lock 4.95% hackbench [kernel.kallsyms] [k] unix_scm_to_skb 4.87% hackbench [kernel.kallsyms] [k] pid_nr_ns 4.34% hackbench [kernel.kallsyms] [k] cred_to_ucred 2.39% hackbench [kernel.kallsyms] [k] unix_destruct_scm 2.24% hackbench [kernel.kallsyms] [k] sub_preempt_count 1.75% hackbench [kernel.kallsyms] [k] fget_light 1.51% hackbench [kernel.kallsyms] [k] __mutex_lock_interruptible_slowpath 1.42% hackbench [kernel.kallsyms] [k] sock_alloc_send_pskb This patch includes SCM_CREDENTIALS information in a af_unix message/skb only if requested by the sender, [man 7 unix for details how to include ancillary data using sendmsg() system call] Note: This might break buggy applications that expected SCM_CREDENTIAL from an unaware write() system call, and receiver not using SO_PASSCRED socket option. If SOCK_PASSCRED is set on source or destination socket, we still include credentials for mere write() syscalls. Performance boost in hackbench : more than 50% gain on a 16 thread machine (2 quad-core cpus, 2 threads per core) hackbench 20 thread 2000 4.228 sec instead of 9.102 sec Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Acked-by: Tim Chen <tim.c.chen@linux.intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-09-19 13:52:27 +08:00
maybe_add_creds(skb, sock, other);
scm_stat_add(other, skb);
skb_queue_tail(&other->sk_receive_queue, skb);
unix_state_unlock(other);
other->sk_data_ready(other);
sent += size;
}
scm_destroy(&scm);
return sent;
pipe_err_free:
unix_state_unlock(other);
kfree_skb(skb);
pipe_err:
if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
send_sig(SIGPIPE, current, 0);
err = -EPIPE;
out_err:
scm_destroy(&scm);
return sent ? : err;
}
static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
int offset, size_t size, int flags)
{
int err;
bool send_sigpipe = false;
bool init_scm = true;
struct scm_cookie scm;
struct sock *other, *sk = socket->sk;
struct sk_buff *skb, *newskb = NULL, *tail = NULL;
if (flags & MSG_OOB)
return -EOPNOTSUPP;
other = unix_peer(sk);
if (!other || sk->sk_state != TCP_ESTABLISHED)
return -ENOTCONN;
if (false) {
alloc_skb:
unix_state_unlock(other);
mutex_unlock(&unix_sk(other)->iolock);
newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
&err, 0);
if (!newskb)
goto err;
}
/* we must acquire iolock as we modify already present
* skbs in the sk_receive_queue and mess with skb->len
*/
err = mutex_lock_interruptible(&unix_sk(other)->iolock);
if (err) {
err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
goto err;
}
if (sk->sk_shutdown & SEND_SHUTDOWN) {
err = -EPIPE;
send_sigpipe = true;
goto err_unlock;
}
unix_state_lock(other);
if (sock_flag(other, SOCK_DEAD) ||
other->sk_shutdown & RCV_SHUTDOWN) {
err = -EPIPE;
send_sigpipe = true;
goto err_state_unlock;
}
if (init_scm) {
err = maybe_init_creds(&scm, socket, other);
if (err)
goto err_state_unlock;
init_scm = false;
}
skb = skb_peek_tail(&other->sk_receive_queue);
if (tail && tail == skb) {
skb = newskb;
} else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
if (newskb) {
skb = newskb;
} else {
tail = skb;
goto alloc_skb;
}
} else if (newskb) {
/* this is fast path, we don't necessarily need to
* call to kfree_skb even though with newskb == NULL
* this - does no harm
*/
consume_skb(newskb);
newskb = NULL;
}
if (skb_append_pagefrags(skb, page, offset, size)) {
tail = skb;
goto alloc_skb;
}
skb->len += size;
skb->data_len += size;
skb->truesize += size;
refcount_add(size, &sk->sk_wmem_alloc);
if (newskb) {
err = unix_scm_to_skb(&scm, skb, false);
if (err)
goto err_state_unlock;
spin_lock(&other->sk_receive_queue.lock);
__skb_queue_tail(&other->sk_receive_queue, newskb);
spin_unlock(&other->sk_receive_queue.lock);
}
unix_state_unlock(other);
mutex_unlock(&unix_sk(other)->iolock);
other->sk_data_ready(other);
scm_destroy(&scm);
return size;
err_state_unlock:
unix_state_unlock(other);
err_unlock:
mutex_unlock(&unix_sk(other)->iolock);
err:
kfree_skb(newskb);
if (send_sigpipe && !(flags & MSG_NOSIGNAL))
send_sig(SIGPIPE, current, 0);
if (!init_scm)
scm_destroy(&scm);
return err;
}
static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
size_t len)
{
int err;
struct sock *sk = sock->sk;
err = sock_error(sk);
if (err)
return err;
if (sk->sk_state != TCP_ESTABLISHED)
return -ENOTCONN;
if (msg->msg_namelen)
msg->msg_namelen = 0;
return unix_dgram_sendmsg(sock, msg, len);
}
static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
size_t size, int flags)
{
struct sock *sk = sock->sk;
if (sk->sk_state != TCP_ESTABLISHED)
return -ENOTCONN;
return unix_dgram_recvmsg(sock, msg, size, flags);
}
static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
{
missing barriers in some of unix_sock ->addr and ->path accesses Several u->addr and u->path users are not holding any locks in common with unix_bind(). unix_state_lock() is useless for those purposes. u->addr is assign-once and *(u->addr) is fully set up by the time we set u->addr (all under unix_table_lock). u->path is also set in the same critical area, also before setting u->addr, and any unix_sock with ->path filled will have non-NULL ->addr. So setting ->addr with smp_store_release() is all we need for those "lockless" users - just have them fetch ->addr with smp_load_acquire() and don't even bother looking at ->path if they see NULL ->addr. Users of ->addr and ->path fall into several classes now: 1) ones that do smp_load_acquire(u->addr) and access *(u->addr) and u->path only if smp_load_acquire() has returned non-NULL. 2) places holding unix_table_lock. These are guaranteed that *(u->addr) is seen fully initialized. If unix_sock is in one of the "bound" chains, so's ->path. 3) unix_sock_destructor() using ->addr is safe. All places that set u->addr are guaranteed to have seen all stores *(u->addr) while holding a reference to u and unix_sock_destructor() is called when (atomic) refcount hits zero. 4) unix_release_sock() using ->path is safe. unix_bind() is serialized wrt unix_release() (normally - by struct file refcount), and for the instances that had ->path set by unix_bind() unix_release_sock() comes from unix_release(), so they are fine. Instances that had it set in unix_stream_connect() either end up attached to a socket (in unix_accept()), in which case the call chain to unix_release_sock() and serialization are the same as in the previous case, or they never get accept'ed and unix_release_sock() is called when the listener is shut down and its queue gets purged. In that case the listener's queue lock provides the barriers needed - unix_stream_connect() shoves our unix_sock into listener's queue under that lock right after having set ->path and eventual unix_release_sock() caller picks them from that queue under the same lock right before calling unix_release_sock(). 5) unix_find_other() use of ->path is pointless, but safe - it happens with successful lookup by (abstract) name, so ->path.dentry is guaranteed to be NULL there. earlier-variant-reviewed-by: "Paul E. McKenney" <paulmck@linux.ibm.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-02-16 04:09:35 +08:00
struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
missing barriers in some of unix_sock ->addr and ->path accesses Several u->addr and u->path users are not holding any locks in common with unix_bind(). unix_state_lock() is useless for those purposes. u->addr is assign-once and *(u->addr) is fully set up by the time we set u->addr (all under unix_table_lock). u->path is also set in the same critical area, also before setting u->addr, and any unix_sock with ->path filled will have non-NULL ->addr. So setting ->addr with smp_store_release() is all we need for those "lockless" users - just have them fetch ->addr with smp_load_acquire() and don't even bother looking at ->path if they see NULL ->addr. Users of ->addr and ->path fall into several classes now: 1) ones that do smp_load_acquire(u->addr) and access *(u->addr) and u->path only if smp_load_acquire() has returned non-NULL. 2) places holding unix_table_lock. These are guaranteed that *(u->addr) is seen fully initialized. If unix_sock is in one of the "bound" chains, so's ->path. 3) unix_sock_destructor() using ->addr is safe. All places that set u->addr are guaranteed to have seen all stores *(u->addr) while holding a reference to u and unix_sock_destructor() is called when (atomic) refcount hits zero. 4) unix_release_sock() using ->path is safe. unix_bind() is serialized wrt unix_release() (normally - by struct file refcount), and for the instances that had ->path set by unix_bind() unix_release_sock() comes from unix_release(), so they are fine. Instances that had it set in unix_stream_connect() either end up attached to a socket (in unix_accept()), in which case the call chain to unix_release_sock() and serialization are the same as in the previous case, or they never get accept'ed and unix_release_sock() is called when the listener is shut down and its queue gets purged. In that case the listener's queue lock provides the barriers needed - unix_stream_connect() shoves our unix_sock into listener's queue under that lock right after having set ->path and eventual unix_release_sock() caller picks them from that queue under the same lock right before calling unix_release_sock(). 5) unix_find_other() use of ->path is pointless, but safe - it happens with successful lookup by (abstract) name, so ->path.dentry is guaranteed to be NULL there. earlier-variant-reviewed-by: "Paul E. McKenney" <paulmck@linux.ibm.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-02-16 04:09:35 +08:00
if (addr) {
msg->msg_namelen = addr->len;
memcpy(msg->msg_name, addr->name, addr->len);
}
}
static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
size_t size, int flags)
{
struct scm_cookie scm;
struct sock *sk = sock->sk;
struct unix_sock *u = unix_sk(sk);
af_unix: fix unix_dgram_recvmsg entry locking The current unix_dgram_recvsmg code acquires the u->readlock mutex in order to protect access to the peek offset prior to calling __skb_recv_datagram for actually receiving data. This implies that a blocking reader will go to sleep with this mutex held if there's presently no data to return to userspace. Two non-desirable side effects of this are that a later non-blocking read call on the same socket will block on the ->readlock mutex until the earlier blocking call releases it (or the readers is interrupted) and that later blocking read calls will wait longer than the effective socket read timeout says they should: The timeout will only start 'ticking' once such a reader hits the schedule_timeout in wait_for_more_packets (core.c) while the time it already had to wait until it could acquire the mutex is unaccounted for. The patch avoids both by using the __skb_try_recv_datagram and __skb_wait_for_more packets functions created by the first patch to implement a unix_dgram_recvmsg read loop which releases the readlock mutex prior to going to sleep and reacquires it as needed afterwards. Non-blocking readers will thus immediately return with -EAGAIN if there's no data available regardless of any concurrent blocking readers and all blocking readers will end up sleeping via schedule_timeout, thus honouring the configured socket receive timeout. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-12-07 05:11:38 +08:00
struct sk_buff *skb, *last;
long timeo;
int skip;
int err;
err = -EOPNOTSUPP;
if (flags&MSG_OOB)
goto out;
af_unix: fix unix_dgram_recvmsg entry locking The current unix_dgram_recvsmg code acquires the u->readlock mutex in order to protect access to the peek offset prior to calling __skb_recv_datagram for actually receiving data. This implies that a blocking reader will go to sleep with this mutex held if there's presently no data to return to userspace. Two non-desirable side effects of this are that a later non-blocking read call on the same socket will block on the ->readlock mutex until the earlier blocking call releases it (or the readers is interrupted) and that later blocking read calls will wait longer than the effective socket read timeout says they should: The timeout will only start 'ticking' once such a reader hits the schedule_timeout in wait_for_more_packets (core.c) while the time it already had to wait until it could acquire the mutex is unaccounted for. The patch avoids both by using the __skb_try_recv_datagram and __skb_wait_for_more packets functions created by the first patch to implement a unix_dgram_recvmsg read loop which releases the readlock mutex prior to going to sleep and reacquires it as needed afterwards. Non-blocking readers will thus immediately return with -EAGAIN if there's no data available regardless of any concurrent blocking readers and all blocking readers will end up sleeping via schedule_timeout, thus honouring the configured socket receive timeout. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-12-07 05:11:38 +08:00
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
af_unix: fix unix_dgram_recvmsg entry locking The current unix_dgram_recvsmg code acquires the u->readlock mutex in order to protect access to the peek offset prior to calling __skb_recv_datagram for actually receiving data. This implies that a blocking reader will go to sleep with this mutex held if there's presently no data to return to userspace. Two non-desirable side effects of this are that a later non-blocking read call on the same socket will block on the ->readlock mutex until the earlier blocking call releases it (or the readers is interrupted) and that later blocking read calls will wait longer than the effective socket read timeout says they should: The timeout will only start 'ticking' once such a reader hits the schedule_timeout in wait_for_more_packets (core.c) while the time it already had to wait until it could acquire the mutex is unaccounted for. The patch avoids both by using the __skb_try_recv_datagram and __skb_wait_for_more packets functions created by the first patch to implement a unix_dgram_recvmsg read loop which releases the readlock mutex prior to going to sleep and reacquires it as needed afterwards. Non-blocking readers will thus immediately return with -EAGAIN if there's no data available regardless of any concurrent blocking readers and all blocking readers will end up sleeping via schedule_timeout, thus honouring the configured socket receive timeout. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-12-07 05:11:38 +08:00
do {
mutex_lock(&u->iolock);
af_unix: fix unix_dgram_recvmsg entry locking The current unix_dgram_recvsmg code acquires the u->readlock mutex in order to protect access to the peek offset prior to calling __skb_recv_datagram for actually receiving data. This implies that a blocking reader will go to sleep with this mutex held if there's presently no data to return to userspace. Two non-desirable side effects of this are that a later non-blocking read call on the same socket will block on the ->readlock mutex until the earlier blocking call releases it (or the readers is interrupted) and that later blocking read calls will wait longer than the effective socket read timeout says they should: The timeout will only start 'ticking' once such a reader hits the schedule_timeout in wait_for_more_packets (core.c) while the time it already had to wait until it could acquire the mutex is unaccounted for. The patch avoids both by using the __skb_try_recv_datagram and __skb_wait_for_more packets functions created by the first patch to implement a unix_dgram_recvmsg read loop which releases the readlock mutex prior to going to sleep and reacquires it as needed afterwards. Non-blocking readers will thus immediately return with -EAGAIN if there's no data available regardless of any concurrent blocking readers and all blocking readers will end up sleeping via schedule_timeout, thus honouring the configured socket receive timeout. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-12-07 05:11:38 +08:00
skip = sk_peek_offset(sk, flags);
skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
&skip, &err, &last);
if (skb) {
if (!(flags & MSG_PEEK))
scm_stat_del(sk, skb);
af_unix: fix unix_dgram_recvmsg entry locking The current unix_dgram_recvsmg code acquires the u->readlock mutex in order to protect access to the peek offset prior to calling __skb_recv_datagram for actually receiving data. This implies that a blocking reader will go to sleep with this mutex held if there's presently no data to return to userspace. Two non-desirable side effects of this are that a later non-blocking read call on the same socket will block on the ->readlock mutex until the earlier blocking call releases it (or the readers is interrupted) and that later blocking read calls will wait longer than the effective socket read timeout says they should: The timeout will only start 'ticking' once such a reader hits the schedule_timeout in wait_for_more_packets (core.c) while the time it already had to wait until it could acquire the mutex is unaccounted for. The patch avoids both by using the __skb_try_recv_datagram and __skb_wait_for_more packets functions created by the first patch to implement a unix_dgram_recvmsg read loop which releases the readlock mutex prior to going to sleep and reacquires it as needed afterwards. Non-blocking readers will thus immediately return with -EAGAIN if there's no data available regardless of any concurrent blocking readers and all blocking readers will end up sleeping via schedule_timeout, thus honouring the configured socket receive timeout. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-12-07 05:11:38 +08:00
break;
}
af_unix: fix unix_dgram_recvmsg entry locking The current unix_dgram_recvsmg code acquires the u->readlock mutex in order to protect access to the peek offset prior to calling __skb_recv_datagram for actually receiving data. This implies that a blocking reader will go to sleep with this mutex held if there's presently no data to return to userspace. Two non-desirable side effects of this are that a later non-blocking read call on the same socket will block on the ->readlock mutex until the earlier blocking call releases it (or the readers is interrupted) and that later blocking read calls will wait longer than the effective socket read timeout says they should: The timeout will only start 'ticking' once such a reader hits the schedule_timeout in wait_for_more_packets (core.c) while the time it already had to wait until it could acquire the mutex is unaccounted for. The patch avoids both by using the __skb_try_recv_datagram and __skb_wait_for_more packets functions created by the first patch to implement a unix_dgram_recvmsg read loop which releases the readlock mutex prior to going to sleep and reacquires it as needed afterwards. Non-blocking readers will thus immediately return with -EAGAIN if there's no data available regardless of any concurrent blocking readers and all blocking readers will end up sleeping via schedule_timeout, thus honouring the configured socket receive timeout. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-12-07 05:11:38 +08:00
mutex_unlock(&u->iolock);
af_unix: fix unix_dgram_recvmsg entry locking The current unix_dgram_recvsmg code acquires the u->readlock mutex in order to protect access to the peek offset prior to calling __skb_recv_datagram for actually receiving data. This implies that a blocking reader will go to sleep with this mutex held if there's presently no data to return to userspace. Two non-desirable side effects of this are that a later non-blocking read call on the same socket will block on the ->readlock mutex until the earlier blocking call releases it (or the readers is interrupted) and that later blocking read calls will wait longer than the effective socket read timeout says they should: The timeout will only start 'ticking' once such a reader hits the schedule_timeout in wait_for_more_packets (core.c) while the time it already had to wait until it could acquire the mutex is unaccounted for. The patch avoids both by using the __skb_try_recv_datagram and __skb_wait_for_more packets functions created by the first patch to implement a unix_dgram_recvmsg read loop which releases the readlock mutex prior to going to sleep and reacquires it as needed afterwards. Non-blocking readers will thus immediately return with -EAGAIN if there's no data available regardless of any concurrent blocking readers and all blocking readers will end up sleeping via schedule_timeout, thus honouring the configured socket receive timeout. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-12-07 05:11:38 +08:00
if (err != -EAGAIN)
break;
} while (timeo &&
!__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
&err, &timeo, last));
af_unix: fix unix_dgram_recvmsg entry locking The current unix_dgram_recvsmg code acquires the u->readlock mutex in order to protect access to the peek offset prior to calling __skb_recv_datagram for actually receiving data. This implies that a blocking reader will go to sleep with this mutex held if there's presently no data to return to userspace. Two non-desirable side effects of this are that a later non-blocking read call on the same socket will block on the ->readlock mutex until the earlier blocking call releases it (or the readers is interrupted) and that later blocking read calls will wait longer than the effective socket read timeout says they should: The timeout will only start 'ticking' once such a reader hits the schedule_timeout in wait_for_more_packets (core.c) while the time it already had to wait until it could acquire the mutex is unaccounted for. The patch avoids both by using the __skb_try_recv_datagram and __skb_wait_for_more packets functions created by the first patch to implement a unix_dgram_recvmsg read loop which releases the readlock mutex prior to going to sleep and reacquires it as needed afterwards. Non-blocking readers will thus immediately return with -EAGAIN if there's no data available regardless of any concurrent blocking readers and all blocking readers will end up sleeping via schedule_timeout, thus honouring the configured socket receive timeout. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-12-07 05:11:38 +08:00
if (!skb) { /* implies iolock unlocked */
[UNIX]: EOF on non-blocking SOCK_SEQPACKET I am not absolutely sure whether this actually is a bug (as in: I've got no clue what the standards say or what other implementations do), but at least I was pretty surprised when I noticed that a recv() on a non-blocking unix domain socket of type SOCK_SEQPACKET (which is connection oriented, after all) where the remote end has closed the connection returned -1 (EAGAIN) rather than 0 to indicate end of file. This is a test case: | #include <sys/types.h> | #include <unistd.h> | #include <sys/socket.h> | #include <sys/un.h> | #include <fcntl.h> | #include <string.h> | #include <stdlib.h> | | int main(){ | int sock; | struct sockaddr_un addr; | char buf[4096]; | int pfds[2]; | | pipe(pfds); | sock=socket(PF_UNIX,SOCK_SEQPACKET,0); | addr.sun_family=AF_UNIX; | strcpy(addr.sun_path,"/tmp/foobar_testsock"); | bind(sock,(struct sockaddr *)&addr,sizeof(addr)); | listen(sock,1); | if(fork()){ | close(sock); | sock=socket(PF_UNIX,SOCK_SEQPACKET,0); | connect(sock,(struct sockaddr *)&addr,sizeof(addr)); | fcntl(sock,F_SETFL,fcntl(sock,F_GETFL)|O_NONBLOCK); | close(pfds[1]); | read(pfds[0],buf,sizeof(buf)); | recv(sock,buf,sizeof(buf),0); // <-- this one | }else accept(sock,NULL,NULL); | exit(0); | } If you try it, make sure /tmp/foobar_testsock doesn't exist. The marked recv() returns -1 (EAGAIN) on 2.6.23.9. Below you find a patch that fixes that. Signed-off-by: Florian Zumbiehl <florz@florz.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2007-11-29 20:19:23 +08:00
unix_state_lock(sk);
/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
(sk->sk_shutdown & RCV_SHUTDOWN))
err = 0;
unix_state_unlock(sk);
af_unix: fix unix_dgram_recvmsg entry locking The current unix_dgram_recvsmg code acquires the u->readlock mutex in order to protect access to the peek offset prior to calling __skb_recv_datagram for actually receiving data. This implies that a blocking reader will go to sleep with this mutex held if there's presently no data to return to userspace. Two non-desirable side effects of this are that a later non-blocking read call on the same socket will block on the ->readlock mutex until the earlier blocking call releases it (or the readers is interrupted) and that later blocking read calls will wait longer than the effective socket read timeout says they should: The timeout will only start 'ticking' once such a reader hits the schedule_timeout in wait_for_more_packets (core.c) while the time it already had to wait until it could acquire the mutex is unaccounted for. The patch avoids both by using the __skb_try_recv_datagram and __skb_wait_for_more packets functions created by the first patch to implement a unix_dgram_recvmsg read loop which releases the readlock mutex prior to going to sleep and reacquires it as needed afterwards. Non-blocking readers will thus immediately return with -EAGAIN if there's no data available regardless of any concurrent blocking readers and all blocking readers will end up sleeping via schedule_timeout, thus honouring the configured socket receive timeout. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-12-07 05:11:38 +08:00
goto out;
[UNIX]: EOF on non-blocking SOCK_SEQPACKET I am not absolutely sure whether this actually is a bug (as in: I've got no clue what the standards say or what other implementations do), but at least I was pretty surprised when I noticed that a recv() on a non-blocking unix domain socket of type SOCK_SEQPACKET (which is connection oriented, after all) where the remote end has closed the connection returned -1 (EAGAIN) rather than 0 to indicate end of file. This is a test case: | #include <sys/types.h> | #include <unistd.h> | #include <sys/socket.h> | #include <sys/un.h> | #include <fcntl.h> | #include <string.h> | #include <stdlib.h> | | int main(){ | int sock; | struct sockaddr_un addr; | char buf[4096]; | int pfds[2]; | | pipe(pfds); | sock=socket(PF_UNIX,SOCK_SEQPACKET,0); | addr.sun_family=AF_UNIX; | strcpy(addr.sun_path,"/tmp/foobar_testsock"); | bind(sock,(struct sockaddr *)&addr,sizeof(addr)); | listen(sock,1); | if(fork()){ | close(sock); | sock=socket(PF_UNIX,SOCK_SEQPACKET,0); | connect(sock,(struct sockaddr *)&addr,sizeof(addr)); | fcntl(sock,F_SETFL,fcntl(sock,F_GETFL)|O_NONBLOCK); | close(pfds[1]); | read(pfds[0],buf,sizeof(buf)); | recv(sock,buf,sizeof(buf),0); // <-- this one | }else accept(sock,NULL,NULL); | exit(0); | } If you try it, make sure /tmp/foobar_testsock doesn't exist. The marked recv() returns -1 (EAGAIN) on 2.6.23.9. Below you find a patch that fixes that. Signed-off-by: Florian Zumbiehl <florz@florz.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2007-11-29 20:19:23 +08:00
}
if (wq_has_sleeper(&u->peer_wait))
wake_up_interruptible_sync_poll(&u->peer_wait,
EPOLLOUT | EPOLLWRNORM |
EPOLLWRBAND);
if (msg->msg_name)
unix_copy_addr(msg, skb->sk);
if (size > skb->len - skip)
size = skb->len - skip;
else if (size < skb->len - skip)
msg->msg_flags |= MSG_TRUNC;
err = skb_copy_datagram_msg(skb, skip, msg, size);
if (err)
goto out_free;
if (sock_flag(sk, SOCK_RCVTSTAMP))
__sock_recv_timestamp(msg, sk, skb);
memset(&scm, 0, sizeof(scm));
scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
unix_set_secdata(&scm, skb);
if (!(flags & MSG_PEEK)) {
if (UNIXCB(skb).fp)
unix_detach_fds(&scm, skb);
sk_peek_offset_bwd(sk, skb->len);
} else {
/* It is questionable: on PEEK we could:
- do not return fds - good, but too simple 8)
- return fds, and do not return them on read (old strategy,
apparently wrong)
- clone fds (I chose it for now, it is the most universal
solution)
POSIX 1003.1g does not actually define this clearly
at all. POSIX 1003.1g doesn't define a lot of things
clearly however!
*/
sk_peek_offset_fwd(sk, size);
if (UNIXCB(skb).fp)
scm.fp = scm_fp_dup(UNIXCB(skb).fp);
}
err = (flags & MSG_TRUNC) ? skb->len - skip : size;
scm_recv(sock, msg, &scm, flags);
out_free:
skb_free_datagram(sk, skb);
mutex_unlock(&u->iolock);
out:
return err;
}
/*
* Sleep until more data has arrived. But check for races..
*/
static long unix_stream_data_wait(struct sock *sk, long timeo,
struct sk_buff *last, unsigned int last_len,
bool freezable)
{
struct sk_buff *tail;
DEFINE_WAIT(wait);
unix_state_lock(sk);
for (;;) {
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
tail = skb_peek_tail(&sk->sk_receive_queue);
if (tail != last ||
(tail && tail->len != last_len) ||
sk->sk_err ||
(sk->sk_shutdown & RCV_SHUTDOWN) ||
signal_pending(current) ||
!timeo)
break;
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
unix_state_unlock(sk);
if (freezable)
timeo = freezable_schedule_timeout(timeo);
else
timeo = schedule_timeout(timeo);
unix_state_lock(sk);
if (sock_flag(sk, SOCK_DEAD))
break;
sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
}
finish_wait(sk_sleep(sk), &wait);
unix_state_unlock(sk);
return timeo;
}
static unsigned int unix_skb_len(const struct sk_buff *skb)
{
return skb->len - UNIXCB(skb).consumed;
}
struct unix_stream_read_state {
int (*recv_actor)(struct sk_buff *, int, int,
struct unix_stream_read_state *);
struct socket *socket;
struct msghdr *msg;
struct pipe_inode_info *pipe;
size_t size;
int flags;
unsigned int splice_flags;
};
static int unix_stream_read_generic(struct unix_stream_read_state *state,
bool freezable)
{
struct scm_cookie scm;
struct socket *sock = state->socket;
struct sock *sk = sock->sk;
struct unix_sock *u = unix_sk(sk);
int copied = 0;
int flags = state->flags;
int noblock = flags & MSG_DONTWAIT;
bool check_creds = false;
int target;
int err = 0;
long timeo;
int skip;
size_t size = state->size;
unsigned int last_len;
if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
err = -EINVAL;
goto out;
}
if (unlikely(flags & MSG_OOB)) {
err = -EOPNOTSUPP;
goto out;
}
target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
timeo = sock_rcvtimeo(sk, noblock);
memset(&scm, 0, sizeof(scm));
/* Lock the socket to prevent queue disordering
* while sleeps in memcpy_tomsg
*/
mutex_lock(&u->iolock);
datagram: When peeking datagrams with offset < 0 don't skip empty skbs Due to commit e6afc8ace6dd5cef5e812f26c72579da8806f5ac ("udp: remove headers from UDP packets before queueing"), when udp packets are being peeked the requested extra offset is always 0 as there is no need to skip the udp header. However, when the offset is 0 and the next skb is of length 0, it is only returned once. The behaviour can be seen with the following python script: from socket import *; f=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0); g=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0); f.bind(('::', 0)); addr=('::1', f.getsockname()[1]); g.sendto(b'', addr) g.sendto(b'b', addr) print(f.recvfrom(10, MSG_PEEK)); print(f.recvfrom(10, MSG_PEEK)); Where the expected output should be the empty string twice. Instead, make sk_peek_offset return negative values, and pass those values to __skb_try_recv_datagram/__skb_try_recv_from_queue. If the passed offset to __skb_try_recv_from_queue is negative, the checked skb is never skipped. __skb_try_recv_from_queue will then ensure the offset is reset back to 0 if a peek is requested without an offset, unless no packets are found. Also simplify the if condition in __skb_try_recv_from_queue. If _off is greater then 0, and off is greater then or equal to skb->len, then (_off || skb->len) must always be true assuming skb->len >= 0 is always true. Also remove a redundant check around a call to sk_peek_offset in af_unix.c, as it double checked if MSG_PEEK was set in the flags. V2: - Moved the negative fixup into __skb_try_recv_from_queue, and remove now redundant checks - Fix peeking in udp{,v6}_recvmsg to report the right value when the offset is 0 V3: - Marked new branch in __skb_try_recv_from_queue as unlikely. Signed-off-by: Matthew Dawson <matthew@mjdsystems.ca> Acked-by: Willem de Bruijn <willemb@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-08-19 03:04:54 +08:00
skip = max(sk_peek_offset(sk, flags), 0);
do {
int chunk;
bool drop_skb;
struct sk_buff *skb, *last;
redo:
unix_state_lock(sk);
if (sock_flag(sk, SOCK_DEAD)) {
err = -ECONNRESET;
goto unlock;
}
last = skb = skb_peek(&sk->sk_receive_queue);
last_len = last ? last->len : 0;
again:
if (skb == NULL) {
if (copied >= target)
goto unlock;
/*
* POSIX 1003.1g mandates this order.
*/
err = sock_error(sk);
if (err)
goto unlock;
if (sk->sk_shutdown & RCV_SHUTDOWN)
goto unlock;
unix_state_unlock(sk);
if (!timeo) {
err = -EAGAIN;
break;
}
mutex_unlock(&u->iolock);
timeo = unix_stream_data_wait(sk, timeo, last,
last_len, freezable);
if (signal_pending(current)) {
err = sock_intr_errno(timeo);
scm_destroy(&scm);
goto out;
}
mutex_lock(&u->iolock);
goto redo;
unlock:
unix_state_unlock(sk);
break;
}
while (skip >= unix_skb_len(skb)) {
skip -= unix_skb_len(skb);
last = skb;
last_len = skb->len;
skb = skb_peek_next(skb, &sk->sk_receive_queue);
if (!skb)
goto again;
}
unix_state_unlock(sk);
if (check_creds) {
/* Never glue messages from different writers */
if (!unix_skb_scm_eq(skb, &scm))
break;
} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
/* Copy credentials */
scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
unix_set_secdata(&scm, skb);
check_creds = true;
}
/* Copy address just once */
if (state->msg && state->msg->msg_name) {
DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
state->msg->msg_name);
unix_copy_addr(state->msg, skb->sk);
sunaddr = NULL;
}
chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
skb_get(skb);
chunk = state->recv_actor(skb, skip, chunk, state);
drop_skb = !unix_skb_len(skb);
/* skb is only safe to use if !drop_skb */
consume_skb(skb);
if (chunk < 0) {
if (copied == 0)
copied = -EFAULT;
break;
}
copied += chunk;
size -= chunk;
if (drop_skb) {
/* the skb was touched by a concurrent reader;
* we should not expect anything from this skb
* anymore and assume it invalid - we can be
* sure it was dropped from the socket queue
*
* let's report a short read
*/
err = 0;
break;
}
/* Mark read part of skb as used */
if (!(flags & MSG_PEEK)) {
UNIXCB(skb).consumed += chunk;
sk_peek_offset_bwd(sk, chunk);
if (UNIXCB(skb).fp) {
scm_stat_del(sk, skb);
unix_detach_fds(&scm, skb);
}
if (unix_skb_len(skb))
break;
skb_unlink(skb, &sk->sk_receive_queue);
consume_skb(skb);
if (scm.fp)
break;
} else {
/* It is questionable, see note in unix_dgram_recvmsg.
*/
if (UNIXCB(skb).fp)
scm.fp = scm_fp_dup(UNIXCB(skb).fp);
sk_peek_offset_fwd(sk, chunk);
if (UNIXCB(skb).fp)
break;
skip = 0;
last = skb;
last_len = skb->len;
unix_state_lock(sk);
skb = skb_peek_next(skb, &sk->sk_receive_queue);
if (skb)
goto again;
unix_state_unlock(sk);
break;
}
} while (size);
mutex_unlock(&u->iolock);
if (state->msg)
scm_recv(sock, state->msg, &scm, flags);
else
scm_destroy(&scm);
out:
return copied ? : err;
}
static int unix_stream_read_actor(struct sk_buff *skb,
int skip, int chunk,
struct unix_stream_read_state *state)
{
int ret;
ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
state->msg, chunk);
return ret ?: chunk;
}
static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
size_t size, int flags)
{
struct unix_stream_read_state state = {
.recv_actor = unix_stream_read_actor,
.socket = sock,
.msg = msg,
.size = size,
.flags = flags
};
return unix_stream_read_generic(&state, true);
}
static int unix_stream_splice_actor(struct sk_buff *skb,
int skip, int chunk,
struct unix_stream_read_state *state)
{
return skb_splice_bits(skb, state->socket->sk,
UNIXCB(skb).consumed + skip,
state->pipe, chunk, state->splice_flags);
}
static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
struct pipe_inode_info *pipe,
size_t size, unsigned int flags)
{
struct unix_stream_read_state state = {
.recv_actor = unix_stream_splice_actor,
.socket = sock,
.pipe = pipe,
.size = size,
.splice_flags = flags,
};
if (unlikely(*ppos))
return -ESPIPE;
if (sock->file->f_flags & O_NONBLOCK ||
flags & SPLICE_F_NONBLOCK)
state.flags = MSG_DONTWAIT;
return unix_stream_read_generic(&state, false);
}
static int unix_shutdown(struct socket *sock, int mode)
{
struct sock *sk = sock->sk;
struct sock *other;
if (mode < SHUT_RD || mode > SHUT_RDWR)
return -EINVAL;
/* This maps:
* SHUT_RD (0) -> RCV_SHUTDOWN (1)
* SHUT_WR (1) -> SEND_SHUTDOWN (2)
* SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
*/
++mode;
unix_state_lock(sk);
sk->sk_shutdown |= mode;
other = unix_peer(sk);
if (other)
sock_hold(other);
unix_state_unlock(sk);
sk->sk_state_change(sk);
if (other &&
(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
int peer_mode = 0;
if (mode&RCV_SHUTDOWN)
peer_mode |= SEND_SHUTDOWN;
if (mode&SEND_SHUTDOWN)
peer_mode |= RCV_SHUTDOWN;
unix_state_lock(other);
other->sk_shutdown |= peer_mode;
unix_state_unlock(other);
other->sk_state_change(other);
if (peer_mode == SHUTDOWN_MASK)
sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
else if (peer_mode & RCV_SHUTDOWN)
sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
}
if (other)
sock_put(other);
return 0;
}
long unix_inq_len(struct sock *sk)
{
struct sk_buff *skb;
long amount = 0;
if (sk->sk_state == TCP_LISTEN)
return -EINVAL;
spin_lock(&sk->sk_receive_queue.lock);
if (sk->sk_type == SOCK_STREAM ||
sk->sk_type == SOCK_SEQPACKET) {
skb_queue_walk(&sk->sk_receive_queue, skb)
amount += unix_skb_len(skb);
} else {
skb = skb_peek(&sk->sk_receive_queue);
if (skb)
amount = skb->len;
}
spin_unlock(&sk->sk_receive_queue.lock);
return amount;
}
EXPORT_SYMBOL_GPL(unix_inq_len);
long unix_outq_len(struct sock *sk)
{
return sk_wmem_alloc_get(sk);
}
EXPORT_SYMBOL_GPL(unix_outq_len);
static int unix_open_file(struct sock *sk)
{
struct path path;
struct file *f;
int fd;
if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
missing barriers in some of unix_sock ->addr and ->path accesses Several u->addr and u->path users are not holding any locks in common with unix_bind(). unix_state_lock() is useless for those purposes. u->addr is assign-once and *(u->addr) is fully set up by the time we set u->addr (all under unix_table_lock). u->path is also set in the same critical area, also before setting u->addr, and any unix_sock with ->path filled will have non-NULL ->addr. So setting ->addr with smp_store_release() is all we need for those "lockless" users - just have them fetch ->addr with smp_load_acquire() and don't even bother looking at ->path if they see NULL ->addr. Users of ->addr and ->path fall into several classes now: 1) ones that do smp_load_acquire(u->addr) and access *(u->addr) and u->path only if smp_load_acquire() has returned non-NULL. 2) places holding unix_table_lock. These are guaranteed that *(u->addr) is seen fully initialized. If unix_sock is in one of the "bound" chains, so's ->path. 3) unix_sock_destructor() using ->addr is safe. All places that set u->addr are guaranteed to have seen all stores *(u->addr) while holding a reference to u and unix_sock_destructor() is called when (atomic) refcount hits zero. 4) unix_release_sock() using ->path is safe. unix_bind() is serialized wrt unix_release() (normally - by struct file refcount), and for the instances that had ->path set by unix_bind() unix_release_sock() comes from unix_release(), so they are fine. Instances that had it set in unix_stream_connect() either end up attached to a socket (in unix_accept()), in which case the call chain to unix_release_sock() and serialization are the same as in the previous case, or they never get accept'ed and unix_release_sock() is called when the listener is shut down and its queue gets purged. In that case the listener's queue lock provides the barriers needed - unix_stream_connect() shoves our unix_sock into listener's queue under that lock right after having set ->path and eventual unix_release_sock() caller picks them from that queue under the same lock right before calling unix_release_sock(). 5) unix_find_other() use of ->path is pointless, but safe - it happens with successful lookup by (abstract) name, so ->path.dentry is guaranteed to be NULL there. earlier-variant-reviewed-by: "Paul E. McKenney" <paulmck@linux.ibm.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-02-16 04:09:35 +08:00
if (!smp_load_acquire(&unix_sk(sk)->addr))
return -ENOENT;
path = unix_sk(sk)->path;
missing barriers in some of unix_sock ->addr and ->path accesses Several u->addr and u->path users are not holding any locks in common with unix_bind(). unix_state_lock() is useless for those purposes. u->addr is assign-once and *(u->addr) is fully set up by the time we set u->addr (all under unix_table_lock). u->path is also set in the same critical area, also before setting u->addr, and any unix_sock with ->path filled will have non-NULL ->addr. So setting ->addr with smp_store_release() is all we need for those "lockless" users - just have them fetch ->addr with smp_load_acquire() and don't even bother looking at ->path if they see NULL ->addr. Users of ->addr and ->path fall into several classes now: 1) ones that do smp_load_acquire(u->addr) and access *(u->addr) and u->path only if smp_load_acquire() has returned non-NULL. 2) places holding unix_table_lock. These are guaranteed that *(u->addr) is seen fully initialized. If unix_sock is in one of the "bound" chains, so's ->path. 3) unix_sock_destructor() using ->addr is safe. All places that set u->addr are guaranteed to have seen all stores *(u->addr) while holding a reference to u and unix_sock_destructor() is called when (atomic) refcount hits zero. 4) unix_release_sock() using ->path is safe. unix_bind() is serialized wrt unix_release() (normally - by struct file refcount), and for the instances that had ->path set by unix_bind() unix_release_sock() comes from unix_release(), so they are fine. Instances that had it set in unix_stream_connect() either end up attached to a socket (in unix_accept()), in which case the call chain to unix_release_sock() and serialization are the same as in the previous case, or they never get accept'ed and unix_release_sock() is called when the listener is shut down and its queue gets purged. In that case the listener's queue lock provides the barriers needed - unix_stream_connect() shoves our unix_sock into listener's queue under that lock right after having set ->path and eventual unix_release_sock() caller picks them from that queue under the same lock right before calling unix_release_sock(). 5) unix_find_other() use of ->path is pointless, but safe - it happens with successful lookup by (abstract) name, so ->path.dentry is guaranteed to be NULL there. earlier-variant-reviewed-by: "Paul E. McKenney" <paulmck@linux.ibm.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-02-16 04:09:35 +08:00
if (!path.dentry)
return -ENOENT;
path_get(&path);
fd = get_unused_fd_flags(O_CLOEXEC);
if (fd < 0)
goto out;
f = dentry_open(&path, O_PATH, current_cred());
if (IS_ERR(f)) {
put_unused_fd(fd);
fd = PTR_ERR(f);
goto out;
}
fd_install(fd, f);
out:
path_put(&path);
return fd;
}
static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
struct sock *sk = sock->sk;
long amount = 0;
int err;
switch (cmd) {
case SIOCOUTQ:
amount = unix_outq_len(sk);
err = put_user(amount, (int __user *)arg);
break;
case SIOCINQ:
amount = unix_inq_len(sk);
if (amount < 0)
err = amount;
else
err = put_user(amount, (int __user *)arg);
break;
case SIOCUNIXFILE:
err = unix_open_file(sk);
break;
default:
err = -ENOIOCTLCMD;
break;
}
return err;
}
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
}
#endif
static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
struct sock *sk = sock->sk;
__poll_t mask;
sock_poll_wait(file, sock, wait);
mask = 0;
/* exceptional events? */
if (sk->sk_err)
mask |= EPOLLERR;
if (sk->sk_shutdown == SHUTDOWN_MASK)
mask |= EPOLLHUP;
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
/* readable? */
if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
mask |= EPOLLIN | EPOLLRDNORM;
/* Connection-based need to check for termination and startup */
if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
sk->sk_state == TCP_CLOSE)
mask |= EPOLLHUP;
/*
* we set writable also when the other side has shut down the
* connection. This prevents stuck sockets.
*/
if (unix_writable(sk))
mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
return mask;
}
static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
poll_table *wait)
af_unix: fix 'poll for write'/ connected DGRAM sockets The unix_dgram_sendmsg routine implements a (somewhat crude) form of receiver-imposed flow control by comparing the length of the receive queue of the 'peer socket' with the max_ack_backlog value stored in the corresponding sock structure, either blocking the thread which caused the send-routine to be called or returning EAGAIN. This routine is used by both SOCK_DGRAM and SOCK_SEQPACKET sockets. The poll-implementation for these socket types is datagram_poll from core/datagram.c. A socket is deemed to be writeable by this routine when the memory presently consumed by datagrams owned by it is less than the configured socket send buffer size. This is always wrong for connected PF_UNIX non-stream sockets when the abovementioned receive queue is currently considered to be full. 'poll' will then return, indicating that the socket is writeable, but a subsequent write result in EAGAIN, effectively causing an (usual) application to 'poll for writeability by repeated send request with O_NONBLOCK set' until it has consumed its time quantum. The change below uses a suitably modified variant of the datagram_poll routines for both type of PF_UNIX sockets, which tests if the recv-queue of the peer a socket is connected to is presently considered to be 'full' as part of the 'is this socket writeable'-checking code. The socket being polled is additionally put onto the peer_wait wait queue associated with its peer, because the unix_dgram_sendmsg routine does a wake up on this queue after a datagram was received and the 'other wakeup call' is done implicitly as part of skb destruction, meaning, a process blocked in poll because of a full peer receive queue could otherwise sleep forever if no datagram owned by its socket was already sitting on this queue. Among this change is a small (inline) helper routine named 'unix_recvq_full', which consolidates the actual testing code (in three different places) into a single location. Signed-off-by: Rainer Weikusat <rweikusat@mssgmbh.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-06-18 13:28:05 +08:00
{
af_unix: fix 'poll for write'/connected DGRAM sockets For n:1 'datagram connections' (eg /dev/log), the unix_dgram_sendmsg routine implements a form of receiver-imposed flow control by comparing the length of the receive queue of the 'peer socket' with the max_ack_backlog value stored in the corresponding sock structure, either blocking the thread which caused the send-routine to be called or returning EAGAIN. This routine is used by both SOCK_DGRAM and SOCK_SEQPACKET sockets. The poll-implementation for these socket types is datagram_poll from core/datagram.c. A socket is deemed to be writeable by this routine when the memory presently consumed by datagrams owned by it is less than the configured socket send buffer size. This is always wrong for PF_UNIX non-stream sockets connected to server sockets dealing with (potentially) multiple clients if the abovementioned receive queue is currently considered to be full. 'poll' will then return, indicating that the socket is writeable, but a subsequent write result in EAGAIN, effectively causing an (usual) application to 'poll for writeability by repeated send request with O_NONBLOCK set' until it has consumed its time quantum. The change below uses a suitably modified variant of the datagram_poll routines for both type of PF_UNIX sockets, which tests if the recv-queue of the peer a socket is connected to is presently considered to be 'full' as part of the 'is this socket writeable'-checking code. The socket being polled is additionally put onto the peer_wait wait queue associated with its peer, because the unix_dgram_recvmsg routine does a wake up on this queue after a datagram was received and the 'other wakeup call' is done implicitly as part of skb destruction, meaning, a process blocked in poll because of a full peer receive queue could otherwise sleep forever if no datagram owned by its socket was already sitting on this queue. Among this change is a small (inline) helper routine named 'unix_recvq_full', which consolidates the actual testing code (in three different places) into a single location. Signed-off-by: Rainer Weikusat <rweikusat@mssgmbh.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-06-28 10:34:18 +08:00
struct sock *sk = sock->sk, *other;
unsigned int writable;
__poll_t mask;
sock_poll_wait(file, sock, wait);
mask = 0;
af_unix: fix 'poll for write'/ connected DGRAM sockets The unix_dgram_sendmsg routine implements a (somewhat crude) form of receiver-imposed flow control by comparing the length of the receive queue of the 'peer socket' with the max_ack_backlog value stored in the corresponding sock structure, either blocking the thread which caused the send-routine to be called or returning EAGAIN. This routine is used by both SOCK_DGRAM and SOCK_SEQPACKET sockets. The poll-implementation for these socket types is datagram_poll from core/datagram.c. A socket is deemed to be writeable by this routine when the memory presently consumed by datagrams owned by it is less than the configured socket send buffer size. This is always wrong for connected PF_UNIX non-stream sockets when the abovementioned receive queue is currently considered to be full. 'poll' will then return, indicating that the socket is writeable, but a subsequent write result in EAGAIN, effectively causing an (usual) application to 'poll for writeability by repeated send request with O_NONBLOCK set' until it has consumed its time quantum. The change below uses a suitably modified variant of the datagram_poll routines for both type of PF_UNIX sockets, which tests if the recv-queue of the peer a socket is connected to is presently considered to be 'full' as part of the 'is this socket writeable'-checking code. The socket being polled is additionally put onto the peer_wait wait queue associated with its peer, because the unix_dgram_sendmsg routine does a wake up on this queue after a datagram was received and the 'other wakeup call' is done implicitly as part of skb destruction, meaning, a process blocked in poll because of a full peer receive queue could otherwise sleep forever if no datagram owned by its socket was already sitting on this queue. Among this change is a small (inline) helper routine named 'unix_recvq_full', which consolidates the actual testing code (in three different places) into a single location. Signed-off-by: Rainer Weikusat <rweikusat@mssgmbh.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-06-18 13:28:05 +08:00
/* exceptional events? */
if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
mask |= EPOLLERR |
(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
af_unix: fix 'poll for write'/ connected DGRAM sockets The unix_dgram_sendmsg routine implements a (somewhat crude) form of receiver-imposed flow control by comparing the length of the receive queue of the 'peer socket' with the max_ack_backlog value stored in the corresponding sock structure, either blocking the thread which caused the send-routine to be called or returning EAGAIN. This routine is used by both SOCK_DGRAM and SOCK_SEQPACKET sockets. The poll-implementation for these socket types is datagram_poll from core/datagram.c. A socket is deemed to be writeable by this routine when the memory presently consumed by datagrams owned by it is less than the configured socket send buffer size. This is always wrong for connected PF_UNIX non-stream sockets when the abovementioned receive queue is currently considered to be full. 'poll' will then return, indicating that the socket is writeable, but a subsequent write result in EAGAIN, effectively causing an (usual) application to 'poll for writeability by repeated send request with O_NONBLOCK set' until it has consumed its time quantum. The change below uses a suitably modified variant of the datagram_poll routines for both type of PF_UNIX sockets, which tests if the recv-queue of the peer a socket is connected to is presently considered to be 'full' as part of the 'is this socket writeable'-checking code. The socket being polled is additionally put onto the peer_wait wait queue associated with its peer, because the unix_dgram_sendmsg routine does a wake up on this queue after a datagram was received and the 'other wakeup call' is done implicitly as part of skb destruction, meaning, a process blocked in poll because of a full peer receive queue could otherwise sleep forever if no datagram owned by its socket was already sitting on this queue. Among this change is a small (inline) helper routine named 'unix_recvq_full', which consolidates the actual testing code (in three different places) into a single location. Signed-off-by: Rainer Weikusat <rweikusat@mssgmbh.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-06-18 13:28:05 +08:00
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
af_unix: fix 'poll for write'/ connected DGRAM sockets The unix_dgram_sendmsg routine implements a (somewhat crude) form of receiver-imposed flow control by comparing the length of the receive queue of the 'peer socket' with the max_ack_backlog value stored in the corresponding sock structure, either blocking the thread which caused the send-routine to be called or returning EAGAIN. This routine is used by both SOCK_DGRAM and SOCK_SEQPACKET sockets. The poll-implementation for these socket types is datagram_poll from core/datagram.c. A socket is deemed to be writeable by this routine when the memory presently consumed by datagrams owned by it is less than the configured socket send buffer size. This is always wrong for connected PF_UNIX non-stream sockets when the abovementioned receive queue is currently considered to be full. 'poll' will then return, indicating that the socket is writeable, but a subsequent write result in EAGAIN, effectively causing an (usual) application to 'poll for writeability by repeated send request with O_NONBLOCK set' until it has consumed its time quantum. The change below uses a suitably modified variant of the datagram_poll routines for both type of PF_UNIX sockets, which tests if the recv-queue of the peer a socket is connected to is presently considered to be 'full' as part of the 'is this socket writeable'-checking code. The socket being polled is additionally put onto the peer_wait wait queue associated with its peer, because the unix_dgram_sendmsg routine does a wake up on this queue after a datagram was received and the 'other wakeup call' is done implicitly as part of skb destruction, meaning, a process blocked in poll because of a full peer receive queue could otherwise sleep forever if no datagram owned by its socket was already sitting on this queue. Among this change is a small (inline) helper routine named 'unix_recvq_full', which consolidates the actual testing code (in three different places) into a single location. Signed-off-by: Rainer Weikusat <rweikusat@mssgmbh.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-06-18 13:28:05 +08:00
if (sk->sk_shutdown == SHUTDOWN_MASK)
mask |= EPOLLHUP;
af_unix: fix 'poll for write'/ connected DGRAM sockets The unix_dgram_sendmsg routine implements a (somewhat crude) form of receiver-imposed flow control by comparing the length of the receive queue of the 'peer socket' with the max_ack_backlog value stored in the corresponding sock structure, either blocking the thread which caused the send-routine to be called or returning EAGAIN. This routine is used by both SOCK_DGRAM and SOCK_SEQPACKET sockets. The poll-implementation for these socket types is datagram_poll from core/datagram.c. A socket is deemed to be writeable by this routine when the memory presently consumed by datagrams owned by it is less than the configured socket send buffer size. This is always wrong for connected PF_UNIX non-stream sockets when the abovementioned receive queue is currently considered to be full. 'poll' will then return, indicating that the socket is writeable, but a subsequent write result in EAGAIN, effectively causing an (usual) application to 'poll for writeability by repeated send request with O_NONBLOCK set' until it has consumed its time quantum. The change below uses a suitably modified variant of the datagram_poll routines for both type of PF_UNIX sockets, which tests if the recv-queue of the peer a socket is connected to is presently considered to be 'full' as part of the 'is this socket writeable'-checking code. The socket being polled is additionally put onto the peer_wait wait queue associated with its peer, because the unix_dgram_sendmsg routine does a wake up on this queue after a datagram was received and the 'other wakeup call' is done implicitly as part of skb destruction, meaning, a process blocked in poll because of a full peer receive queue could otherwise sleep forever if no datagram owned by its socket was already sitting on this queue. Among this change is a small (inline) helper routine named 'unix_recvq_full', which consolidates the actual testing code (in three different places) into a single location. Signed-off-by: Rainer Weikusat <rweikusat@mssgmbh.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-06-18 13:28:05 +08:00
/* readable? */
if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
mask |= EPOLLIN | EPOLLRDNORM;
af_unix: fix 'poll for write'/ connected DGRAM sockets The unix_dgram_sendmsg routine implements a (somewhat crude) form of receiver-imposed flow control by comparing the length of the receive queue of the 'peer socket' with the max_ack_backlog value stored in the corresponding sock structure, either blocking the thread which caused the send-routine to be called or returning EAGAIN. This routine is used by both SOCK_DGRAM and SOCK_SEQPACKET sockets. The poll-implementation for these socket types is datagram_poll from core/datagram.c. A socket is deemed to be writeable by this routine when the memory presently consumed by datagrams owned by it is less than the configured socket send buffer size. This is always wrong for connected PF_UNIX non-stream sockets when the abovementioned receive queue is currently considered to be full. 'poll' will then return, indicating that the socket is writeable, but a subsequent write result in EAGAIN, effectively causing an (usual) application to 'poll for writeability by repeated send request with O_NONBLOCK set' until it has consumed its time quantum. The change below uses a suitably modified variant of the datagram_poll routines for both type of PF_UNIX sockets, which tests if the recv-queue of the peer a socket is connected to is presently considered to be 'full' as part of the 'is this socket writeable'-checking code. The socket being polled is additionally put onto the peer_wait wait queue associated with its peer, because the unix_dgram_sendmsg routine does a wake up on this queue after a datagram was received and the 'other wakeup call' is done implicitly as part of skb destruction, meaning, a process blocked in poll because of a full peer receive queue could otherwise sleep forever if no datagram owned by its socket was already sitting on this queue. Among this change is a small (inline) helper routine named 'unix_recvq_full', which consolidates the actual testing code (in three different places) into a single location. Signed-off-by: Rainer Weikusat <rweikusat@mssgmbh.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-06-18 13:28:05 +08:00
/* Connection-based need to check for termination and startup */
if (sk->sk_type == SOCK_SEQPACKET) {
if (sk->sk_state == TCP_CLOSE)
mask |= EPOLLHUP;
af_unix: fix 'poll for write'/ connected DGRAM sockets The unix_dgram_sendmsg routine implements a (somewhat crude) form of receiver-imposed flow control by comparing the length of the receive queue of the 'peer socket' with the max_ack_backlog value stored in the corresponding sock structure, either blocking the thread which caused the send-routine to be called or returning EAGAIN. This routine is used by both SOCK_DGRAM and SOCK_SEQPACKET sockets. The poll-implementation for these socket types is datagram_poll from core/datagram.c. A socket is deemed to be writeable by this routine when the memory presently consumed by datagrams owned by it is less than the configured socket send buffer size. This is always wrong for connected PF_UNIX non-stream sockets when the abovementioned receive queue is currently considered to be full. 'poll' will then return, indicating that the socket is writeable, but a subsequent write result in EAGAIN, effectively causing an (usual) application to 'poll for writeability by repeated send request with O_NONBLOCK set' until it has consumed its time quantum. The change below uses a suitably modified variant of the datagram_poll routines for both type of PF_UNIX sockets, which tests if the recv-queue of the peer a socket is connected to is presently considered to be 'full' as part of the 'is this socket writeable'-checking code. The socket being polled is additionally put onto the peer_wait wait queue associated with its peer, because the unix_dgram_sendmsg routine does a wake up on this queue after a datagram was received and the 'other wakeup call' is done implicitly as part of skb destruction, meaning, a process blocked in poll because of a full peer receive queue could otherwise sleep forever if no datagram owned by its socket was already sitting on this queue. Among this change is a small (inline) helper routine named 'unix_recvq_full', which consolidates the actual testing code (in three different places) into a single location. Signed-off-by: Rainer Weikusat <rweikusat@mssgmbh.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-06-18 13:28:05 +08:00
/* connection hasn't started yet? */
if (sk->sk_state == TCP_SYN_SENT)
return mask;
}
/* No write status requested, avoid expensive OUT tests. */
if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
return mask;
af_unix: fix 'poll for write'/connected DGRAM sockets For n:1 'datagram connections' (eg /dev/log), the unix_dgram_sendmsg routine implements a form of receiver-imposed flow control by comparing the length of the receive queue of the 'peer socket' with the max_ack_backlog value stored in the corresponding sock structure, either blocking the thread which caused the send-routine to be called or returning EAGAIN. This routine is used by both SOCK_DGRAM and SOCK_SEQPACKET sockets. The poll-implementation for these socket types is datagram_poll from core/datagram.c. A socket is deemed to be writeable by this routine when the memory presently consumed by datagrams owned by it is less than the configured socket send buffer size. This is always wrong for PF_UNIX non-stream sockets connected to server sockets dealing with (potentially) multiple clients if the abovementioned receive queue is currently considered to be full. 'poll' will then return, indicating that the socket is writeable, but a subsequent write result in EAGAIN, effectively causing an (usual) application to 'poll for writeability by repeated send request with O_NONBLOCK set' until it has consumed its time quantum. The change below uses a suitably modified variant of the datagram_poll routines for both type of PF_UNIX sockets, which tests if the recv-queue of the peer a socket is connected to is presently considered to be 'full' as part of the 'is this socket writeable'-checking code. The socket being polled is additionally put onto the peer_wait wait queue associated with its peer, because the unix_dgram_recvmsg routine does a wake up on this queue after a datagram was received and the 'other wakeup call' is done implicitly as part of skb destruction, meaning, a process blocked in poll because of a full peer receive queue could otherwise sleep forever if no datagram owned by its socket was already sitting on this queue. Among this change is a small (inline) helper routine named 'unix_recvq_full', which consolidates the actual testing code (in three different places) into a single location. Signed-off-by: Rainer Weikusat <rweikusat@mssgmbh.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-06-28 10:34:18 +08:00
writable = unix_writable(sk);
unix: avoid use-after-free in ep_remove_wait_queue Rainer Weikusat <rweikusat@mobileactivedefense.com> writes: An AF_UNIX datagram socket being the client in an n:1 association with some server socket is only allowed to send messages to the server if the receive queue of this socket contains at most sk_max_ack_backlog datagrams. This implies that prospective writers might be forced to go to sleep despite none of the message presently enqueued on the server receive queue were sent by them. In order to ensure that these will be woken up once space becomes again available, the present unix_dgram_poll routine does a second sock_poll_wait call with the peer_wait wait queue of the server socket as queue argument (unix_dgram_recvmsg does a wake up on this queue after a datagram was received). This is inherently problematic because the server socket is only guaranteed to remain alive for as long as the client still holds a reference to it. In case the connection is dissolved via connect or by the dead peer detection logic in unix_dgram_sendmsg, the server socket may be freed despite "the polling mechanism" (in particular, epoll) still has a pointer to the corresponding peer_wait queue. There's no way to forcibly deregister a wait queue with epoll. Based on an idea by Jason Baron, the patch below changes the code such that a wait_queue_t belonging to the client socket is enqueued on the peer_wait queue of the server whenever the peer receive queue full condition is detected by either a sendmsg or a poll. A wake up on the peer queue is then relayed to the ordinary wait queue of the client socket via wake function. The connection to the peer wait queue is again dissolved if either a wake up is about to be relayed or the client socket reconnects or a dead peer is detected or the client socket is itself closed. This enables removing the second sock_poll_wait from unix_dgram_poll, thus avoiding the use-after-free, while still ensuring that no blocked writer sleeps forever. Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com> Fixes: ec0d215f9420 ("af_unix: fix 'poll for write'/connected DGRAM sockets") Reviewed-by: Jason Baron <jbaron@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-21 06:07:23 +08:00
if (writable) {
unix_state_lock(sk);
other = unix_peer(sk);
if (other && unix_peer(other) != sk &&
unix_recvq_full(other) &&
unix_dgram_peer_wake_me(sk, other))
writable = 0;
unix_state_unlock(sk);
af_unix: fix 'poll for write'/connected DGRAM sockets For n:1 'datagram connections' (eg /dev/log), the unix_dgram_sendmsg routine implements a form of receiver-imposed flow control by comparing the length of the receive queue of the 'peer socket' with the max_ack_backlog value stored in the corresponding sock structure, either blocking the thread which caused the send-routine to be called or returning EAGAIN. This routine is used by both SOCK_DGRAM and SOCK_SEQPACKET sockets. The poll-implementation for these socket types is datagram_poll from core/datagram.c. A socket is deemed to be writeable by this routine when the memory presently consumed by datagrams owned by it is less than the configured socket send buffer size. This is always wrong for PF_UNIX non-stream sockets connected to server sockets dealing with (potentially) multiple clients if the abovementioned receive queue is currently considered to be full. 'poll' will then return, indicating that the socket is writeable, but a subsequent write result in EAGAIN, effectively causing an (usual) application to 'poll for writeability by repeated send request with O_NONBLOCK set' until it has consumed its time quantum. The change below uses a suitably modified variant of the datagram_poll routines for both type of PF_UNIX sockets, which tests if the recv-queue of the peer a socket is connected to is presently considered to be 'full' as part of the 'is this socket writeable'-checking code. The socket being polled is additionally put onto the peer_wait wait queue associated with its peer, because the unix_dgram_recvmsg routine does a wake up on this queue after a datagram was received and the 'other wakeup call' is done implicitly as part of skb destruction, meaning, a process blocked in poll because of a full peer receive queue could otherwise sleep forever if no datagram owned by its socket was already sitting on this queue. Among this change is a small (inline) helper routine named 'unix_recvq_full', which consolidates the actual testing code (in three different places) into a single location. Signed-off-by: Rainer Weikusat <rweikusat@mssgmbh.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-06-28 10:34:18 +08:00
}
if (writable)
mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
af_unix: fix 'poll for write'/ connected DGRAM sockets The unix_dgram_sendmsg routine implements a (somewhat crude) form of receiver-imposed flow control by comparing the length of the receive queue of the 'peer socket' with the max_ack_backlog value stored in the corresponding sock structure, either blocking the thread which caused the send-routine to be called or returning EAGAIN. This routine is used by both SOCK_DGRAM and SOCK_SEQPACKET sockets. The poll-implementation for these socket types is datagram_poll from core/datagram.c. A socket is deemed to be writeable by this routine when the memory presently consumed by datagrams owned by it is less than the configured socket send buffer size. This is always wrong for connected PF_UNIX non-stream sockets when the abovementioned receive queue is currently considered to be full. 'poll' will then return, indicating that the socket is writeable, but a subsequent write result in EAGAIN, effectively causing an (usual) application to 'poll for writeability by repeated send request with O_NONBLOCK set' until it has consumed its time quantum. The change below uses a suitably modified variant of the datagram_poll routines for both type of PF_UNIX sockets, which tests if the recv-queue of the peer a socket is connected to is presently considered to be 'full' as part of the 'is this socket writeable'-checking code. The socket being polled is additionally put onto the peer_wait wait queue associated with its peer, because the unix_dgram_sendmsg routine does a wake up on this queue after a datagram was received and the 'other wakeup call' is done implicitly as part of skb destruction, meaning, a process blocked in poll because of a full peer receive queue could otherwise sleep forever if no datagram owned by its socket was already sitting on this queue. Among this change is a small (inline) helper routine named 'unix_recvq_full', which consolidates the actual testing code (in three different places) into a single location. Signed-off-by: Rainer Weikusat <rweikusat@mssgmbh.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-06-18 13:28:05 +08:00
else
sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
af_unix: fix 'poll for write'/ connected DGRAM sockets The unix_dgram_sendmsg routine implements a (somewhat crude) form of receiver-imposed flow control by comparing the length of the receive queue of the 'peer socket' with the max_ack_backlog value stored in the corresponding sock structure, either blocking the thread which caused the send-routine to be called or returning EAGAIN. This routine is used by both SOCK_DGRAM and SOCK_SEQPACKET sockets. The poll-implementation for these socket types is datagram_poll from core/datagram.c. A socket is deemed to be writeable by this routine when the memory presently consumed by datagrams owned by it is less than the configured socket send buffer size. This is always wrong for connected PF_UNIX non-stream sockets when the abovementioned receive queue is currently considered to be full. 'poll' will then return, indicating that the socket is writeable, but a subsequent write result in EAGAIN, effectively causing an (usual) application to 'poll for writeability by repeated send request with O_NONBLOCK set' until it has consumed its time quantum. The change below uses a suitably modified variant of the datagram_poll routines for both type of PF_UNIX sockets, which tests if the recv-queue of the peer a socket is connected to is presently considered to be 'full' as part of the 'is this socket writeable'-checking code. The socket being polled is additionally put onto the peer_wait wait queue associated with its peer, because the unix_dgram_sendmsg routine does a wake up on this queue after a datagram was received and the 'other wakeup call' is done implicitly as part of skb destruction, meaning, a process blocked in poll because of a full peer receive queue could otherwise sleep forever if no datagram owned by its socket was already sitting on this queue. Among this change is a small (inline) helper routine named 'unix_recvq_full', which consolidates the actual testing code (in three different places) into a single location. Signed-off-by: Rainer Weikusat <rweikusat@mssgmbh.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-06-18 13:28:05 +08:00
return mask;
}
#ifdef CONFIG_PROC_FS
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
{
unsigned long offset = get_offset(*pos);
unsigned long bucket = get_bucket(*pos);
struct sock *sk;
unsigned long count = 0;
for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
if (sock_net(sk) != seq_file_net(seq))
continue;
if (++count == offset)
break;
}
return sk;
}
static struct sock *unix_next_socket(struct seq_file *seq,
struct sock *sk,
loff_t *pos)
{
unsigned long bucket;
while (sk > (struct sock *)SEQ_START_TOKEN) {
sk = sk_next(sk);
if (!sk)
goto next_bucket;
if (sock_net(sk) == seq_file_net(seq))
return sk;
}
do {
sk = unix_from_bucket(seq, pos);
if (sk)
return sk;
next_bucket:
bucket = get_bucket(*pos) + 1;
*pos = set_bucket_offset(bucket, 1);
} while (bucket < ARRAY_SIZE(unix_socket_table));
return NULL;
}
static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(unix_table_lock)
{
spin_lock(&unix_table_lock);
if (!*pos)
return SEQ_START_TOKEN;
if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
return NULL;
return unix_next_socket(seq, NULL, pos);
}
static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
++*pos;
return unix_next_socket(seq, v, pos);
}
static void unix_seq_stop(struct seq_file *seq, void *v)
__releases(unix_table_lock)
{
spin_unlock(&unix_table_lock);
}
static int unix_seq_show(struct seq_file *seq, void *v)
{
if (v == SEQ_START_TOKEN)
seq_puts(seq, "Num RefCount Protocol Flags Type St "
"Inode Path\n");
else {
struct sock *s = v;
struct unix_sock *u = unix_sk(s);
unix_state_lock(s);
net: convert %p usage to %pK The %pK format specifier is designed to hide exposed kernel pointers, specifically via /proc interfaces. Exposing these pointers provides an easy target for kernel write vulnerabilities, since they reveal the locations of writable structures containing easily triggerable function pointers. The behavior of %pK depends on the kptr_restrict sysctl. If kptr_restrict is set to 0, no deviation from the standard %p behavior occurs. If kptr_restrict is set to 1, the default, if the current user (intended to be a reader via seq_printf(), etc.) does not have CAP_SYSLOG (currently in the LSM tree), kernel pointers using %pK are printed as 0's. If kptr_restrict is set to 2, kernel pointers using %pK are printed as 0's regardless of privileges. Replacing with 0's was chosen over the default "(null)", which cannot be parsed by userland %p, which expects "(nil)". The supporting code for kptr_restrict and %pK are currently in the -mm tree. This patch converts users of %p in net/ to %pK. Cases of printing pointers to the syslog are not covered, since this would eliminate useful information for postmortem debugging and the reading of the syslog is already optionally protected by the dmesg_restrict sysctl. Signed-off-by: Dan Rosenberg <drosenberg@vsecurity.com> Cc: James Morris <jmorris@namei.org> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Thomas Graf <tgraf@infradead.org> Cc: Eugene Teo <eugeneteo@kernel.org> Cc: Kees Cook <kees.cook@canonical.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: David S. Miller <davem@davemloft.net> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Eric Paris <eparis@parisplace.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-05-23 20:17:35 +08:00
seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
s,
refcount_read(&s->sk_refcnt),
0,
s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
s->sk_type,
s->sk_socket ?
(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
sock_i_ino(s));
missing barriers in some of unix_sock ->addr and ->path accesses Several u->addr and u->path users are not holding any locks in common with unix_bind(). unix_state_lock() is useless for those purposes. u->addr is assign-once and *(u->addr) is fully set up by the time we set u->addr (all under unix_table_lock). u->path is also set in the same critical area, also before setting u->addr, and any unix_sock with ->path filled will have non-NULL ->addr. So setting ->addr with smp_store_release() is all we need for those "lockless" users - just have them fetch ->addr with smp_load_acquire() and don't even bother looking at ->path if they see NULL ->addr. Users of ->addr and ->path fall into several classes now: 1) ones that do smp_load_acquire(u->addr) and access *(u->addr) and u->path only if smp_load_acquire() has returned non-NULL. 2) places holding unix_table_lock. These are guaranteed that *(u->addr) is seen fully initialized. If unix_sock is in one of the "bound" chains, so's ->path. 3) unix_sock_destructor() using ->addr is safe. All places that set u->addr are guaranteed to have seen all stores *(u->addr) while holding a reference to u and unix_sock_destructor() is called when (atomic) refcount hits zero. 4) unix_release_sock() using ->path is safe. unix_bind() is serialized wrt unix_release() (normally - by struct file refcount), and for the instances that had ->path set by unix_bind() unix_release_sock() comes from unix_release(), so they are fine. Instances that had it set in unix_stream_connect() either end up attached to a socket (in unix_accept()), in which case the call chain to unix_release_sock() and serialization are the same as in the previous case, or they never get accept'ed and unix_release_sock() is called when the listener is shut down and its queue gets purged. In that case the listener's queue lock provides the barriers needed - unix_stream_connect() shoves our unix_sock into listener's queue under that lock right after having set ->path and eventual unix_release_sock() caller picks them from that queue under the same lock right before calling unix_release_sock(). 5) unix_find_other() use of ->path is pointless, but safe - it happens with successful lookup by (abstract) name, so ->path.dentry is guaranteed to be NULL there. earlier-variant-reviewed-by: "Paul E. McKenney" <paulmck@linux.ibm.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-02-16 04:09:35 +08:00
if (u->addr) { // under unix_table_lock here
int i, len;
seq_putc(seq, ' ');
i = 0;
len = u->addr->len - sizeof(short);
if (!UNIX_ABSTRACT(s))
len--;
else {
seq_putc(seq, '@');
i++;
}
for ( ; i < len; i++)
seq_putc(seq, u->addr->name->sun_path[i] ?:
'@');
}
unix_state_unlock(s);
seq_putc(seq, '\n');
}
return 0;
}
static const struct seq_operations unix_seq_ops = {
.start = unix_seq_start,
.next = unix_seq_next,
.stop = unix_seq_stop,
.show = unix_seq_show,
};
#endif
static const struct net_proto_family unix_family_ops = {
.family = PF_UNIX,
.create = unix_create,
.owner = THIS_MODULE,
};
static int __net_init unix_net_init(struct net *net)
{
int error = -ENOMEM;
net->unx.sysctl_max_dgram_qlen = 10;
if (unix_sysctl_register(net))
goto out;
#ifdef CONFIG_PROC_FS
if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
sizeof(struct seq_net_private))) {
unix_sysctl_unregister(net);
goto out;
}
#endif
error = 0;
out:
return error;
}
static void __net_exit unix_net_exit(struct net *net)
{
unix_sysctl_unregister(net);
remove_proc_entry("unix", net->proc_net);
}
static struct pernet_operations unix_net_ops = {
.init = unix_net_init,
.exit = unix_net_exit,
};
static int __init af_unix_init(void)
{
int rc = -1;
BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
rc = proto_register(&unix_proto, 1);
if (rc != 0) {
pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
goto out;
}
sock_register(&unix_family_ops);
register_pernet_subsys(&unix_net_ops);
out:
return rc;
}
static void __exit af_unix_exit(void)
{
sock_unregister(PF_UNIX);
proto_unregister(&unix_proto);
unregister_pernet_subsys(&unix_net_ops);
}
/* Earlier than device_initcall() so that other drivers invoking
request_module() don't end up in a loop when modprobe tries
to use a UNIX socket. But later than subsys_initcall() because
we depend on stuff initialised there */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);