OpenCloudOS-Kernel/net/compat.c

859 lines
24 KiB
C
Raw Normal View History

/*
* 32bit Socket syscall emulation. Based on arch/sparc64/kernel/sys_sparc32.c.
*
* Copyright (C) 2000 VA Linux Co
* Copyright (C) 2000 Don Dugger <n0ano@valinux.com>
* Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com>
* Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
* Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
* Copyright (C) 2000 Hewlett-Packard Co.
* Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
* Copyright (C) 2000,2001 Andi Kleen, SuSE Labs
*/
#include <linux/kernel.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
#include <linux/gfp.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/file.h>
#include <linux/icmpv6.h>
#include <linux/socket.h>
#include <linux/syscalls.h>
#include <linux/filter.h>
#include <linux/compat.h>
#include <linux/security.h>
#include <net/scm.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <asm/uaccess.h>
#include <net/compat.h>
static inline int iov_from_user_compat_to_kern(struct iovec *kiov,
struct compat_iovec __user *uiov32,
int niov)
{
int tot_len = 0;
while (niov > 0) {
compat_uptr_t buf;
compat_size_t len;
if (get_user(len, &uiov32->iov_len) ||
get_user(buf, &uiov32->iov_base)) {
tot_len = -EFAULT;
break;
}
tot_len += len;
kiov->iov_base = compat_ptr(buf);
kiov->iov_len = (__kernel_size_t) len;
uiov32++;
kiov++;
niov--;
}
return tot_len;
}
int get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg)
{
compat_uptr_t tmp1, tmp2, tmp3;
if (!access_ok(VERIFY_READ, umsg, sizeof(*umsg)) ||
__get_user(tmp1, &umsg->msg_name) ||
__get_user(kmsg->msg_namelen, &umsg->msg_namelen) ||
__get_user(tmp2, &umsg->msg_iov) ||
__get_user(kmsg->msg_iovlen, &umsg->msg_iovlen) ||
__get_user(tmp3, &umsg->msg_control) ||
__get_user(kmsg->msg_controllen, &umsg->msg_controllen) ||
__get_user(kmsg->msg_flags, &umsg->msg_flags))
return -EFAULT;
kmsg->msg_name = compat_ptr(tmp1);
kmsg->msg_iov = compat_ptr(tmp2);
kmsg->msg_control = compat_ptr(tmp3);
return 0;
}
/* I've named the args so it is easy to tell whose space the pointers are in. */
int verify_compat_iovec(struct msghdr *kern_msg, struct iovec *kern_iov,
struct sockaddr *kern_address, int mode)
{
int tot_len;
if (kern_msg->msg_namelen) {
if (mode==VERIFY_READ) {
int err = move_addr_to_kernel(kern_msg->msg_name,
kern_msg->msg_namelen,
kern_address);
if (err < 0)
return err;
}
kern_msg->msg_name = kern_address;
} else
kern_msg->msg_name = NULL;
tot_len = iov_from_user_compat_to_kern(kern_iov,
(struct compat_iovec __user *)kern_msg->msg_iov,
kern_msg->msg_iovlen);
if (tot_len >= 0)
kern_msg->msg_iov = kern_iov;
return tot_len;
}
/* Bleech... */
#define CMSG_COMPAT_ALIGN(len) ALIGN((len), sizeof(s32))
#define CMSG_COMPAT_DATA(cmsg) \
((void __user *)((char __user *)(cmsg) + CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr))))
#define CMSG_COMPAT_SPACE(len) \
(CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + CMSG_COMPAT_ALIGN(len))
#define CMSG_COMPAT_LEN(len) \
(CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + (len))
#define CMSG_COMPAT_FIRSTHDR(msg) \
(((msg)->msg_controllen) >= sizeof(struct compat_cmsghdr) ? \
(struct compat_cmsghdr __user *)((msg)->msg_control) : \
(struct compat_cmsghdr __user *)NULL)
#define CMSG_COMPAT_OK(ucmlen, ucmsg, mhdr) \
((ucmlen) >= sizeof(struct compat_cmsghdr) && \
(ucmlen) <= (unsigned long) \
((mhdr)->msg_controllen - \
((char *)(ucmsg) - (char *)(mhdr)->msg_control)))
static inline struct compat_cmsghdr __user *cmsg_compat_nxthdr(struct msghdr *msg,
struct compat_cmsghdr __user *cmsg, int cmsg_len)
{
char __user *ptr = (char __user *)cmsg + CMSG_COMPAT_ALIGN(cmsg_len);
if ((unsigned long)(ptr + 1 - (char __user *)msg->msg_control) >
msg->msg_controllen)
return NULL;
return (struct compat_cmsghdr __user *)ptr;
}
/* There is a lot of hair here because the alignment rules (and
* thus placement) of cmsg headers and length are different for
* 32-bit apps. -DaveM
*/
int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
unsigned char *stackbuf, int stackbuf_size)
{
struct compat_cmsghdr __user *ucmsg;
struct cmsghdr *kcmsg, *kcmsg_base;
compat_size_t ucmlen;
__kernel_size_t kcmlen, tmp;
int err = -EFAULT;
kcmlen = 0;
kcmsg_base = kcmsg = (struct cmsghdr *)stackbuf;
ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg);
while (ucmsg != NULL) {
if (get_user(ucmlen, &ucmsg->cmsg_len))
return -EFAULT;
/* Catch bogons. */
if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg))
return -EINVAL;
tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) +
CMSG_ALIGN(sizeof(struct cmsghdr)));
tmp = CMSG_ALIGN(tmp);
kcmlen += tmp;
ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen);
}
if (kcmlen == 0)
return -EINVAL;
/* The kcmlen holds the 64-bit version of the control length.
* It may not be modified as we do not stick it into the kmsg
* until we have successfully copied over all of the data
* from the user.
*/
if (kcmlen > stackbuf_size)
kcmsg_base = kcmsg = sock_kmalloc(sk, kcmlen, GFP_KERNEL);
if (kcmsg == NULL)
return -ENOBUFS;
/* Now copy them over neatly. */
memset(kcmsg, 0, kcmlen);
ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg);
while (ucmsg != NULL) {
if (__get_user(ucmlen, &ucmsg->cmsg_len))
goto Efault;
if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg))
goto Einval;
tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) +
CMSG_ALIGN(sizeof(struct cmsghdr)));
if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp))
goto Einval;
kcmsg->cmsg_len = tmp;
tmp = CMSG_ALIGN(tmp);
if (__get_user(kcmsg->cmsg_level, &ucmsg->cmsg_level) ||
__get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type) ||
copy_from_user(CMSG_DATA(kcmsg),
CMSG_COMPAT_DATA(ucmsg),
(ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg)))))
goto Efault;
/* Advance. */
kcmsg = (struct cmsghdr *)((char *)kcmsg + tmp);
ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen);
}
/* Ok, looks like we made it. Hook it up and return success. */
kmsg->msg_control = kcmsg_base;
kmsg->msg_controllen = kcmlen;
return 0;
Einval:
err = -EINVAL;
Efault:
if (kcmsg_base != (struct cmsghdr *)stackbuf)
sock_kfree_s(sk, kcmsg_base, kcmlen);
return err;
}
int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *data)
{
struct compat_timeval ctv;
struct compat_timespec cts[3];
struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) kmsg->msg_control;
struct compat_cmsghdr cmhdr;
int cmlen;
if (cm == NULL || kmsg->msg_controllen < sizeof(*cm)) {
kmsg->msg_flags |= MSG_CTRUNC;
return 0; /* XXX: return error? check spec. */
}
if (level == SOL_SOCKET && type == SCM_TIMESTAMP) {
struct timeval *tv = (struct timeval *)data;
ctv.tv_sec = tv->tv_sec;
ctv.tv_usec = tv->tv_usec;
data = &ctv;
len = sizeof(ctv);
}
if (level == SOL_SOCKET &&
(type == SCM_TIMESTAMPNS || type == SCM_TIMESTAMPING)) {
int count = type == SCM_TIMESTAMPNS ? 1 : 3;
int i;
struct timespec *ts = (struct timespec *)data;
for (i = 0; i < count; i++) {
cts[i].tv_sec = ts[i].tv_sec;
cts[i].tv_nsec = ts[i].tv_nsec;
}
data = &cts;
len = sizeof(cts[0]) * count;
}
cmlen = CMSG_COMPAT_LEN(len);
if (kmsg->msg_controllen < cmlen) {
kmsg->msg_flags |= MSG_CTRUNC;
cmlen = kmsg->msg_controllen;
}
cmhdr.cmsg_level = level;
cmhdr.cmsg_type = type;
cmhdr.cmsg_len = cmlen;
if (copy_to_user(cm, &cmhdr, sizeof cmhdr))
return -EFAULT;
if (copy_to_user(CMSG_COMPAT_DATA(cm), data, cmlen - sizeof(struct compat_cmsghdr)))
return -EFAULT;
cmlen = CMSG_COMPAT_SPACE(len);
[NET]: Fix function put_cmsg() which may cause usr application memory overflow When used function put_cmsg() to copy kernel information to user application memory, if the memory length given by user application is not enough, by the bad length calculate of msg.msg_controllen, put_cmsg() function may cause the msg.msg_controllen to be a large value, such as 0xFFFFFFF0, so the following put_cmsg() can also write data to usr application memory even usr has no valid memory to store this. This may cause usr application memory overflow. int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) { struct cmsghdr __user *cm = (__force struct cmsghdr __user *)msg->msg_control; struct cmsghdr cmhdr; int cmlen = CMSG_LEN(len); ~~~~~~~~~~~~~~~~~~~~~ int err; if (MSG_CMSG_COMPAT & msg->msg_flags) return put_cmsg_compat(msg, level, type, len, data); if (cm==NULL || msg->msg_controllen < sizeof(*cm)) { msg->msg_flags |= MSG_CTRUNC; return 0; /* XXX: return error? check spec. */ } if (msg->msg_controllen < cmlen) { ~~~~~~~~~~~~~~~~~~~~~~~~ msg->msg_flags |= MSG_CTRUNC; cmlen = msg->msg_controllen; } cmhdr.cmsg_level = level; cmhdr.cmsg_type = type; cmhdr.cmsg_len = cmlen; err = -EFAULT; if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) goto out; if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr))) goto out; cmlen = CMSG_SPACE(len); ~~~~~~~~~~~~~~~~~~~~~~~~~~~ If MSG_CTRUNC flags is set, msg->msg_controllen is less than CMSG_SPACE(len), "msg->msg_controllen -= cmlen" will cause unsinged int type msg->msg_controllen to be a large value. ~~~~~~~~~~~~~~~~~~~~~~~~~~~ msg->msg_control += cmlen; msg->msg_controllen -= cmlen; ~~~~~~~~~~~~~~~~~~~~~ err = 0; out: return err; } The same promble exists in put_cmsg_compat(). This patch can fix this problem. Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2007-12-21 06:36:44 +08:00
if (kmsg->msg_controllen < cmlen)
cmlen = kmsg->msg_controllen;
kmsg->msg_control += cmlen;
kmsg->msg_controllen -= cmlen;
return 0;
}
void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm)
{
struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) kmsg->msg_control;
int fdmax = (kmsg->msg_controllen - sizeof(struct compat_cmsghdr)) / sizeof(int);
int fdnum = scm->fp->count;
struct file **fp = scm->fp->fp;
int __user *cmfptr;
int err = 0, i;
if (fdnum < fdmax)
fdmax = fdnum;
for (i = 0, cmfptr = (int __user *) CMSG_COMPAT_DATA(cm); i < fdmax; i++, cmfptr++) {
int new_fd;
err = security_file_receive(fp[i]);
if (err)
break;
O_CLOEXEC for SCM_RIGHTS Part two in the O_CLOEXEC saga: adding support for file descriptors received through Unix domain sockets. The patch is once again pretty minimal, it introduces a new flag for recvmsg and passes it just like the existing MSG_CMSG_COMPAT flag. I think this bit is not used otherwise but the networking people will know better. This new flag is not recognized by recvfrom and recv. These functions cannot be used for that purpose and the asymmetry this introduces is not worse than the already existing MSG_CMSG_COMPAT situations. The patch must be applied on the patch which introduced O_CLOEXEC. It has to remove static from the new get_unused_fd_flags function but since scm.c cannot live in a module the function still hasn't to be exported. Here's a test program to make sure the code works. It's so much longer than the actual patch... #include <errno.h> #include <error.h> #include <fcntl.h> #include <stdio.h> #include <string.h> #include <unistd.h> #include <sys/socket.h> #include <sys/un.h> #ifndef O_CLOEXEC # define O_CLOEXEC 02000000 #endif #ifndef MSG_CMSG_CLOEXEC # define MSG_CMSG_CLOEXEC 0x40000000 #endif int main (int argc, char *argv[]) { if (argc > 1) { int fd = atol (argv[1]); printf ("child: fd = %d\n", fd); if (fcntl (fd, F_GETFD) == 0 || errno != EBADF) { puts ("file descriptor valid in child"); return 1; } return 0; } struct sockaddr_un sun; strcpy (sun.sun_path, "./testsocket"); sun.sun_family = AF_UNIX; char databuf[] = "hello"; struct iovec iov[1]; iov[0].iov_base = databuf; iov[0].iov_len = sizeof (databuf); union { struct cmsghdr hdr; char bytes[CMSG_SPACE (sizeof (int))]; } buf; struct msghdr msg = { .msg_iov = iov, .msg_iovlen = 1, .msg_control = buf.bytes, .msg_controllen = sizeof (buf) }; struct cmsghdr *cmsg = CMSG_FIRSTHDR (&msg); cmsg->cmsg_level = SOL_SOCKET; cmsg->cmsg_type = SCM_RIGHTS; cmsg->cmsg_len = CMSG_LEN (sizeof (int)); msg.msg_controllen = cmsg->cmsg_len; pid_t child = fork (); if (child == -1) error (1, errno, "fork"); if (child == 0) { int sock = socket (PF_UNIX, SOCK_STREAM, 0); if (sock < 0) error (1, errno, "socket"); if (bind (sock, (struct sockaddr *) &sun, sizeof (sun)) < 0) error (1, errno, "bind"); if (listen (sock, SOMAXCONN) < 0) error (1, errno, "listen"); int conn = accept (sock, NULL, NULL); if (conn == -1) error (1, errno, "accept"); *(int *) CMSG_DATA (cmsg) = sock; if (sendmsg (conn, &msg, MSG_NOSIGNAL) < 0) error (1, errno, "sendmsg"); return 0; } /* For a test suite this should be more robust like a barrier in shared memory. */ sleep (1); int sock = socket (PF_UNIX, SOCK_STREAM, 0); if (sock < 0) error (1, errno, "socket"); if (connect (sock, (struct sockaddr *) &sun, sizeof (sun)) < 0) error (1, errno, "connect"); unlink (sun.sun_path); *(int *) CMSG_DATA (cmsg) = -1; if (recvmsg (sock, &msg, MSG_CMSG_CLOEXEC) < 0) error (1, errno, "recvmsg"); int fd = *(int *) CMSG_DATA (cmsg); if (fd == -1) error (1, 0, "no descriptor received"); char fdname[20]; snprintf (fdname, sizeof (fdname), "%d", fd); execl ("/proc/self/exe", argv[0], fdname, NULL); puts ("execl failed"); return 1; } [akpm@linux-foundation.org: Fix fastcall inconsistency noted by Michael Buesch] [akpm@linux-foundation.org: build fix] Signed-off-by: Ulrich Drepper <drepper@redhat.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Michael Buesch <mb@bu3sch.de> Cc: Michael Kerrisk <mtk-manpages@gmx.net> Acked-by: David S. Miller <davem@davemloft.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-16 14:40:34 +08:00
err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & kmsg->msg_flags
? O_CLOEXEC : 0);
if (err < 0)
break;
new_fd = err;
err = put_user(new_fd, cmfptr);
if (err) {
put_unused_fd(new_fd);
break;
}
/* Bump the usage count and install the file. */
get_file(fp[i]);
fd_install(new_fd, fp[i]);
}
if (i > 0) {
int cmlen = CMSG_COMPAT_LEN(i * sizeof(int));
err = put_user(SOL_SOCKET, &cm->cmsg_level);
if (!err)
err = put_user(SCM_RIGHTS, &cm->cmsg_type);
if (!err)
err = put_user(cmlen, &cm->cmsg_len);
if (!err) {
cmlen = CMSG_COMPAT_SPACE(i * sizeof(int));
kmsg->msg_control += cmlen;
kmsg->msg_controllen -= cmlen;
}
}
if (i < fdnum)
kmsg->msg_flags |= MSG_CTRUNC;
/*
* All of the files that fit in the message have had their
* usage counts incremented, so we just free the list.
*/
__scm_destroy(scm);
}
/*
* A struct sock_filter is architecture independent.
*/
struct compat_sock_fprog {
u16 len;
compat_uptr_t filter; /* struct sock_filter * */
};
static int do_set_attach_filter(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
struct compat_sock_fprog __user *fprog32 = (struct compat_sock_fprog __user *)optval;
struct sock_fprog __user *kfprog = compat_alloc_user_space(sizeof(struct sock_fprog));
compat_uptr_t ptr;
u16 len;
if (!access_ok(VERIFY_READ, fprog32, sizeof(*fprog32)) ||
!access_ok(VERIFY_WRITE, kfprog, sizeof(struct sock_fprog)) ||
__get_user(len, &fprog32->len) ||
__get_user(ptr, &fprog32->filter) ||
__put_user(len, &kfprog->len) ||
__put_user(compat_ptr(ptr), &kfprog->filter))
return -EFAULT;
return sock_setsockopt(sock, level, optname, (char __user *)kfprog,
sizeof(struct sock_fprog));
}
static int do_set_sock_timeout(struct socket *sock, int level,
int optname, char __user *optval, unsigned int optlen)
{
struct compat_timeval __user *up = (struct compat_timeval __user *) optval;
struct timeval ktime;
mm_segment_t old_fs;
int err;
if (optlen < sizeof(*up))
return -EINVAL;
if (!access_ok(VERIFY_READ, up, sizeof(*up)) ||
__get_user(ktime.tv_sec, &up->tv_sec) ||
__get_user(ktime.tv_usec, &up->tv_usec))
return -EFAULT;
old_fs = get_fs();
set_fs(KERNEL_DS);
err = sock_setsockopt(sock, level, optname, (char *) &ktime, sizeof(ktime));
set_fs(old_fs);
return err;
}
static int compat_sock_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
if (optname == SO_ATTACH_FILTER)
return do_set_attach_filter(sock, level, optname,
optval, optlen);
if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)
return do_set_sock_timeout(sock, level, optname, optval, optlen);
return sock_setsockopt(sock, level, optname, optval, optlen);
}
asmlinkage long compat_sys_setsockopt(int fd, int level, int optname,
char __user *optval, unsigned int optlen)
{
int err;
struct socket *sock;
if ((sock = sockfd_lookup(fd, &err))!=NULL)
{
err = security_socket_setsockopt(sock,level,optname);
if (err) {
sockfd_put(sock);
return err;
}
if (level == SOL_SOCKET)
err = compat_sock_setsockopt(sock, level,
optname, optval, optlen);
else if (sock->ops->compat_setsockopt)
err = sock->ops->compat_setsockopt(sock, level,
optname, optval, optlen);
else
err = sock->ops->setsockopt(sock, level,
optname, optval, optlen);
sockfd_put(sock);
}
return err;
}
static int do_get_sock_timeout(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
struct compat_timeval __user *up;
struct timeval ktime;
mm_segment_t old_fs;
int len, err;
up = (struct compat_timeval __user *) optval;
if (get_user(len, optlen))
return -EFAULT;
if (len < sizeof(*up))
return -EINVAL;
len = sizeof(ktime);
old_fs = get_fs();
set_fs(KERNEL_DS);
err = sock_getsockopt(sock, level, optname, (char *) &ktime, &len);
set_fs(old_fs);
if (!err) {
if (put_user(sizeof(*up), optlen) ||
!access_ok(VERIFY_WRITE, up, sizeof(*up)) ||
__put_user(ktime.tv_sec, &up->tv_sec) ||
__put_user(ktime.tv_usec, &up->tv_usec))
err = -EFAULT;
}
return err;
}
static int compat_sock_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)
return do_get_sock_timeout(sock, level, optname, optval, optlen);
return sock_getsockopt(sock, level, optname, optval, optlen);
}
int compat_sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
struct compat_timeval __user *ctv =
(struct compat_timeval __user*) userstamp;
int err = -ENOENT;
struct timeval tv;
if (!sock_flag(sk, SOCK_TIMESTAMP))
sock_enable_timestamp(sk, SOCK_TIMESTAMP);
tv = ktime_to_timeval(sk->sk_stamp);
if (tv.tv_sec == -1)
return err;
if (tv.tv_sec == 0) {
sk->sk_stamp = ktime_get_real();
tv = ktime_to_timeval(sk->sk_stamp);
}
err = 0;
if (put_user(tv.tv_sec, &ctv->tv_sec) ||
put_user(tv.tv_usec, &ctv->tv_usec))
err = -EFAULT;
return err;
}
EXPORT_SYMBOL(compat_sock_get_timestamp);
int compat_sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
struct compat_timespec __user *ctv =
(struct compat_timespec __user*) userstamp;
int err = -ENOENT;
struct timespec ts;
if (!sock_flag(sk, SOCK_TIMESTAMP))
sock_enable_timestamp(sk, SOCK_TIMESTAMP);
ts = ktime_to_timespec(sk->sk_stamp);
if (ts.tv_sec == -1)
return err;
if (ts.tv_sec == 0) {
sk->sk_stamp = ktime_get_real();
ts = ktime_to_timespec(sk->sk_stamp);
}
err = 0;
if (put_user(ts.tv_sec, &ctv->tv_sec) ||
put_user(ts.tv_nsec, &ctv->tv_nsec))
err = -EFAULT;
return err;
}
EXPORT_SYMBOL(compat_sock_get_timestampns);
asmlinkage long compat_sys_getsockopt(int fd, int level, int optname,
char __user *optval, int __user *optlen)
{
int err;
struct socket *sock;
if ((sock = sockfd_lookup(fd, &err))!=NULL)
{
err = security_socket_getsockopt(sock, level,
optname);
if (err) {
sockfd_put(sock);
return err;
}
if (level == SOL_SOCKET)
err = compat_sock_getsockopt(sock, level,
optname, optval, optlen);
else if (sock->ops->compat_getsockopt)
err = sock->ops->compat_getsockopt(sock, level,
optname, optval, optlen);
else
err = sock->ops->getsockopt(sock, level,
optname, optval, optlen);
sockfd_put(sock);
}
return err;
}
struct compat_group_req {
__u32 gr_interface;
struct __kernel_sockaddr_storage gr_group
__attribute__ ((aligned(4)));
} __attribute__ ((packed));
struct compat_group_source_req {
__u32 gsr_interface;
struct __kernel_sockaddr_storage gsr_group
__attribute__ ((aligned(4)));
struct __kernel_sockaddr_storage gsr_source
__attribute__ ((aligned(4)));
} __attribute__ ((packed));
struct compat_group_filter {
__u32 gf_interface;
struct __kernel_sockaddr_storage gf_group
__attribute__ ((aligned(4)));
__u32 gf_fmode;
__u32 gf_numsrc;
struct __kernel_sockaddr_storage gf_slist[1]
__attribute__ ((aligned(4)));
} __attribute__ ((packed));
#define __COMPAT_GF0_SIZE (sizeof(struct compat_group_filter) - \
sizeof(struct __kernel_sockaddr_storage))
int compat_mc_setsockopt(struct sock *sock, int level, int optname,
char __user *optval, unsigned int optlen,
int (*setsockopt)(struct sock *,int,int,char __user *,unsigned int))
{
char __user *koptval = optval;
int koptlen = optlen;
switch (optname) {
case MCAST_JOIN_GROUP:
case MCAST_LEAVE_GROUP:
{
struct compat_group_req __user *gr32 = (void *)optval;
struct group_req __user *kgr =
compat_alloc_user_space(sizeof(struct group_req));
u32 interface;
if (!access_ok(VERIFY_READ, gr32, sizeof(*gr32)) ||
!access_ok(VERIFY_WRITE, kgr, sizeof(struct group_req)) ||
__get_user(interface, &gr32->gr_interface) ||
__put_user(interface, &kgr->gr_interface) ||
copy_in_user(&kgr->gr_group, &gr32->gr_group,
sizeof(kgr->gr_group)))
return -EFAULT;
koptval = (char __user *)kgr;
koptlen = sizeof(struct group_req);
break;
}
case MCAST_JOIN_SOURCE_GROUP:
case MCAST_LEAVE_SOURCE_GROUP:
case MCAST_BLOCK_SOURCE:
case MCAST_UNBLOCK_SOURCE:
{
struct compat_group_source_req __user *gsr32 = (void *)optval;
struct group_source_req __user *kgsr = compat_alloc_user_space(
sizeof(struct group_source_req));
u32 interface;
if (!access_ok(VERIFY_READ, gsr32, sizeof(*gsr32)) ||
!access_ok(VERIFY_WRITE, kgsr,
sizeof(struct group_source_req)) ||
__get_user(interface, &gsr32->gsr_interface) ||
__put_user(interface, &kgsr->gsr_interface) ||
copy_in_user(&kgsr->gsr_group, &gsr32->gsr_group,
sizeof(kgsr->gsr_group)) ||
copy_in_user(&kgsr->gsr_source, &gsr32->gsr_source,
sizeof(kgsr->gsr_source)))
return -EFAULT;
koptval = (char __user *)kgsr;
koptlen = sizeof(struct group_source_req);
break;
}
case MCAST_MSFILTER:
{
struct compat_group_filter __user *gf32 = (void *)optval;
struct group_filter __user *kgf;
u32 interface, fmode, numsrc;
if (!access_ok(VERIFY_READ, gf32, __COMPAT_GF0_SIZE) ||
__get_user(interface, &gf32->gf_interface) ||
__get_user(fmode, &gf32->gf_fmode) ||
__get_user(numsrc, &gf32->gf_numsrc))
return -EFAULT;
koptlen = optlen + sizeof(struct group_filter) -
sizeof(struct compat_group_filter);
if (koptlen < GROUP_FILTER_SIZE(numsrc))
return -EINVAL;
kgf = compat_alloc_user_space(koptlen);
if (!access_ok(VERIFY_WRITE, kgf, koptlen) ||
__put_user(interface, &kgf->gf_interface) ||
__put_user(fmode, &kgf->gf_fmode) ||
__put_user(numsrc, &kgf->gf_numsrc) ||
copy_in_user(&kgf->gf_group, &gf32->gf_group,
sizeof(kgf->gf_group)) ||
(numsrc && copy_in_user(kgf->gf_slist, gf32->gf_slist,
numsrc * sizeof(kgf->gf_slist[0]))))
return -EFAULT;
koptval = (char __user *)kgf;
break;
}
default:
break;
}
return setsockopt(sock, level, optname, koptval, koptlen);
}
EXPORT_SYMBOL(compat_mc_setsockopt);
int compat_mc_getsockopt(struct sock *sock, int level, int optname,
char __user *optval, int __user *optlen,
int (*getsockopt)(struct sock *,int,int,char __user *,int __user *))
{
struct compat_group_filter __user *gf32 = (void *)optval;
struct group_filter __user *kgf;
int __user *koptlen;
u32 interface, fmode, numsrc;
int klen, ulen, err;
if (optname != MCAST_MSFILTER)
return getsockopt(sock, level, optname, optval, optlen);
koptlen = compat_alloc_user_space(sizeof(*koptlen));
if (!access_ok(VERIFY_READ, optlen, sizeof(*optlen)) ||
__get_user(ulen, optlen))
return -EFAULT;
/* adjust len for pad */
klen = ulen + sizeof(*kgf) - sizeof(*gf32);
if (klen < GROUP_FILTER_SIZE(0))
return -EINVAL;
if (!access_ok(VERIFY_WRITE, koptlen, sizeof(*koptlen)) ||
__put_user(klen, koptlen))
return -EFAULT;
/* have to allow space for previous compat_alloc_user_space, too */
kgf = compat_alloc_user_space(klen+sizeof(*optlen));
if (!access_ok(VERIFY_READ, gf32, __COMPAT_GF0_SIZE) ||
__get_user(interface, &gf32->gf_interface) ||
__get_user(fmode, &gf32->gf_fmode) ||
__get_user(numsrc, &gf32->gf_numsrc) ||
__put_user(interface, &kgf->gf_interface) ||
__put_user(fmode, &kgf->gf_fmode) ||
__put_user(numsrc, &kgf->gf_numsrc) ||
copy_in_user(&kgf->gf_group,&gf32->gf_group,sizeof(kgf->gf_group)))
return -EFAULT;
err = getsockopt(sock, level, optname, (char __user *)kgf, koptlen);
if (err)
return err;
if (!access_ok(VERIFY_READ, koptlen, sizeof(*koptlen)) ||
__get_user(klen, koptlen))
return -EFAULT;
ulen = klen - (sizeof(*kgf)-sizeof(*gf32));
if (!access_ok(VERIFY_WRITE, optlen, sizeof(*optlen)) ||
__put_user(ulen, optlen))
return -EFAULT;
if (!access_ok(VERIFY_READ, kgf, klen) ||
!access_ok(VERIFY_WRITE, gf32, ulen) ||
__get_user(interface, &kgf->gf_interface) ||
__get_user(fmode, &kgf->gf_fmode) ||
__get_user(numsrc, &kgf->gf_numsrc) ||
__put_user(interface, &gf32->gf_interface) ||
__put_user(fmode, &gf32->gf_fmode) ||
__put_user(numsrc, &gf32->gf_numsrc))
return -EFAULT;
if (numsrc) {
int copylen;
klen -= GROUP_FILTER_SIZE(0);
copylen = numsrc * sizeof(gf32->gf_slist[0]);
if (copylen > klen)
copylen = klen;
if (copy_in_user(gf32->gf_slist, kgf->gf_slist, copylen))
return -EFAULT;
}
return err;
}
EXPORT_SYMBOL(compat_mc_getsockopt);
/* Argument list sizes for compat_sys_socketcall */
#define AL(x) ((x) * sizeof(u32))
static unsigned char nas[20]={AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
AL(3),AL(3),AL(4),AL(4),AL(4),AL(6),
flag parameters: paccept This patch is by far the most complex in the series. It adds a new syscall paccept. This syscall differs from accept in that it adds (at the userlevel) two additional parameters: - a signal mask - a flags value The flags parameter can be used to set flag like SOCK_CLOEXEC. This is imlpemented here as well. Some people argued that this is a property which should be inherited from the file desriptor for the server but this is against POSIX. Additionally, we really want the signal mask parameter as well (similar to pselect, ppoll, etc). So an interface change in inevitable. The flag value is the same as for socket and socketpair. I think diverging here will only create confusion. Similar to the filesystem interfaces where the use of the O_* constants differs, it is acceptable here. The signal mask is handled as for pselect etc. The mask is temporarily installed for the thread and removed before the call returns. I modeled the code after pselect. If there is a problem it's likely also in pselect. For architectures which use socketcall I maintained this interface instead of adding a system call. The symmetry shouldn't be broken. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include <errno.h> #include <fcntl.h> #include <pthread.h> #include <signal.h> #include <stdio.h> #include <unistd.h> #include <netinet/in.h> #include <sys/socket.h> #include <sys/syscall.h> #ifndef __NR_paccept # ifdef __x86_64__ # define __NR_paccept 288 # elif defined __i386__ # define SYS_PACCEPT 18 # define USE_SOCKETCALL 1 # else # error "need __NR_paccept" # endif #endif #ifdef USE_SOCKETCALL # define paccept(fd, addr, addrlen, mask, flags) \ ({ long args[6] = { \ (long) fd, (long) addr, (long) addrlen, (long) mask, 8, (long) flags }; \ syscall (__NR_socketcall, SYS_PACCEPT, args); }) #else # define paccept(fd, addr, addrlen, mask, flags) \ syscall (__NR_paccept, fd, addr, addrlen, mask, 8, flags) #endif #define PORT 57392 #define SOCK_CLOEXEC O_CLOEXEC static pthread_barrier_t b; static void * tf (void *arg) { pthread_barrier_wait (&b); int s = socket (AF_INET, SOCK_STREAM, 0); struct sockaddr_in sin; sin.sin_family = AF_INET; sin.sin_addr.s_addr = htonl (INADDR_LOOPBACK); sin.sin_port = htons (PORT); connect (s, (const struct sockaddr *) &sin, sizeof (sin)); close (s); pthread_barrier_wait (&b); s = socket (AF_INET, SOCK_STREAM, 0); sin.sin_port = htons (PORT); connect (s, (const struct sockaddr *) &sin, sizeof (sin)); close (s); pthread_barrier_wait (&b); pthread_barrier_wait (&b); sleep (2); pthread_kill ((pthread_t) arg, SIGUSR1); return NULL; } static void handler (int s) { } int main (void) { pthread_barrier_init (&b, NULL, 2); struct sockaddr_in sin; pthread_t th; if (pthread_create (&th, NULL, tf, (void *) pthread_self ()) != 0) { puts ("pthread_create failed"); return 1; } int s = socket (AF_INET, SOCK_STREAM, 0); int reuse = 1; setsockopt (s, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof (reuse)); sin.sin_family = AF_INET; sin.sin_addr.s_addr = htonl (INADDR_LOOPBACK); sin.sin_port = htons (PORT); bind (s, (struct sockaddr *) &sin, sizeof (sin)); listen (s, SOMAXCONN); pthread_barrier_wait (&b); int s2 = paccept (s, NULL, 0, NULL, 0); if (s2 < 0) { puts ("paccept(0) failed"); return 1; } int coe = fcntl (s2, F_GETFD); if (coe & FD_CLOEXEC) { puts ("paccept(0) set close-on-exec-flag"); return 1; } close (s2); pthread_barrier_wait (&b); s2 = paccept (s, NULL, 0, NULL, SOCK_CLOEXEC); if (s2 < 0) { puts ("paccept(SOCK_CLOEXEC) failed"); return 1; } coe = fcntl (s2, F_GETFD); if ((coe & FD_CLOEXEC) == 0) { puts ("paccept(SOCK_CLOEXEC) does not set close-on-exec flag"); return 1; } close (s2); pthread_barrier_wait (&b); struct sigaction sa; sa.sa_handler = handler; sa.sa_flags = 0; sigemptyset (&sa.sa_mask); sigaction (SIGUSR1, &sa, NULL); sigset_t ss; pthread_sigmask (SIG_SETMASK, NULL, &ss); sigaddset (&ss, SIGUSR1); pthread_sigmask (SIG_SETMASK, &ss, NULL); sigdelset (&ss, SIGUSR1); alarm (4); pthread_barrier_wait (&b); errno = 0 ; s2 = paccept (s, NULL, 0, &ss, 0); if (s2 != -1 || errno != EINTR) { puts ("paccept did not fail with EINTR"); return 1; } close (s); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [akpm@linux-foundation.org: make it compile] [akpm@linux-foundation.org: add sys_ni stub] Signed-off-by: Ulrich Drepper <drepper@redhat.com> Acked-by: Davide Libenzi <davidel@xmailserver.org> Cc: Michael Kerrisk <mtk.manpages@googlemail.com> Cc: <linux-arch@vger.kernel.org> Cc: "David S. Miller" <davem@davemloft.net> Cc: Roland McGrath <roland@redhat.com> Cc: Kyle McMartin <kyle@mcmartin.ca> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 12:29:20 +08:00
AL(6),AL(2),AL(5),AL(5),AL(3),AL(3),
AL(4),AL(5)};
#undef AL
asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned flags)
{
return sys_sendmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT);
}
asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, unsigned int flags)
{
return sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT);
}
net/compat/wext: send different messages to compat tasks Wireless extensions have the unfortunate problem that events are multicast netlink messages, and are not independent of pointer size. Thus, currently 32-bit tasks on 64-bit platforms cannot properly receive events and fail with all kinds of strange problems, for instance wpa_supplicant never notices disassociations, due to the way the 64-bit event looks (to a 32-bit process), the fact that the address is all zeroes is lost, it thinks instead it is 00:00:00:00:01:00. The same problem existed with the ioctls, until David Miller fixed those some time ago in an heroic effort. A different problem caused by this is that we cannot send the ASSOCREQIE/ASSOCRESPIE events because sending them causes a 32-bit wpa_supplicant on a 64-bit system to overwrite its internal information, which is worse than it not getting the information at all -- so we currently resort to sending a custom string event that it then parses. This, however, has a severe size limitation we are frequently hitting with modern access points; this limitation would can be lifted after this patch by sending the correct binary, not custom, event. A similar problem apparently happens for some other netlink users on x86_64 with 32-bit tasks due to the alignment for 64-bit quantities. In order to fix these problems, I have implemented a way to send compat messages to tasks. When sending an event, we send the non-compat event data together with a compat event data in skb_shinfo(main_skb)->frag_list. Then, when the event is read from the socket, the netlink code makes sure to pass out only the skb that is compatible with the task. This approach was suggested by David Miller, my original approach required always sending two skbs but that had various small problems. To determine whether compat is needed or not, I have used the MSG_CMSG_COMPAT flag, and adjusted the call path for recv and recvfrom to include it, even if those calls do not have a cmsg parameter. I have not solved one small part of the problem, and I don't think it is necessary to: if a 32-bit application uses read() rather than any form of recvmsg() it will still get the wrong (64-bit) event. However, neither do applications actually do this, nor would it be a regression. Signed-off-by: Johannes Berg <johannes@sipsolutions.net> Signed-off-by: David S. Miller <davem@davemloft.net>
2009-07-01 19:26:02 +08:00
asmlinkage long compat_sys_recv(int fd, void __user *buf, size_t len, unsigned flags)
{
return sys_recv(fd, buf, len, flags | MSG_CMSG_COMPAT);
}
asmlinkage long compat_sys_recvfrom(int fd, void __user *buf, size_t len,
unsigned flags, struct sockaddr __user *addr,
int __user *addrlen)
{
return sys_recvfrom(fd, buf, len, flags | MSG_CMSG_COMPAT, addr, addrlen);
}
asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg,
unsigned vlen, unsigned int flags,
struct compat_timespec __user *timeout)
{
int datagrams;
struct timespec ktspec;
if (timeout == NULL)
return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
flags | MSG_CMSG_COMPAT, NULL);
if (get_compat_timespec(&ktspec, timeout))
return -EFAULT;
datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
flags | MSG_CMSG_COMPAT, &ktspec);
if (datagrams > 0 && put_compat_timespec(&ktspec, timeout))
datagrams = -EFAULT;
return datagrams;
}
asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
{
int ret;
u32 a[6];
u32 a0, a1;
if (call < SYS_SOCKET || call > SYS_RECVMMSG)
return -EINVAL;
if (copy_from_user(a, args, nas[call]))
return -EFAULT;
a0 = a[0];
a1 = a[1];
switch (call) {
case SYS_SOCKET:
ret = sys_socket(a0, a1, a[2]);
break;
case SYS_BIND:
ret = sys_bind(a0, compat_ptr(a1), a[2]);
break;
case SYS_CONNECT:
ret = sys_connect(a0, compat_ptr(a1), a[2]);
break;
case SYS_LISTEN:
ret = sys_listen(a0, a1);
break;
case SYS_ACCEPT:
reintroduce accept4 Introduce a new accept4() system call. The addition of this system call matches analogous changes in 2.6.27 (dup3(), evenfd2(), signalfd4(), inotify_init1(), epoll_create1(), pipe2()) which added new system calls that differed from analogous traditional system calls in adding a flags argument that can be used to access additional functionality. The accept4() system call is exactly the same as accept(), except that it adds a flags bit-mask argument. Two flags are initially implemented. (Most of the new system calls in 2.6.27 also had both of these flags.) SOCK_CLOEXEC causes the close-on-exec (FD_CLOEXEC) flag to be enabled for the new file descriptor returned by accept4(). This is a useful security feature to avoid leaking information in a multithreaded program where one thread is doing an accept() at the same time as another thread is doing a fork() plus exec(). More details here: http://udrepper.livejournal.com/20407.html "Secure File Descriptor Handling", Ulrich Drepper). The other flag is SOCK_NONBLOCK, which causes the O_NONBLOCK flag to be enabled on the new open file description created by accept4(). (This flag is merely a convenience, saving the use of additional calls fcntl(F_GETFL) and fcntl (F_SETFL) to achieve the same result. Here's a test program. Works on x86-32. Should work on x86-64, but I (mtk) don't have a system to hand to test with. It tests accept4() with each of the four possible combinations of SOCK_CLOEXEC and SOCK_NONBLOCK set/clear in 'flags', and verifies that the appropriate flags are set on the file descriptor/open file description returned by accept4(). I tested Ulrich's patch in this thread by applying against 2.6.28-rc2, and it passes according to my test program. /* test_accept4.c Copyright (C) 2008, Linux Foundation, written by Michael Kerrisk <mtk.manpages@gmail.com> Licensed under the GNU GPLv2 or later. */ #define _GNU_SOURCE #include <unistd.h> #include <sys/syscall.h> #include <sys/socket.h> #include <netinet/in.h> #include <stdlib.h> #include <fcntl.h> #include <stdio.h> #include <string.h> #define PORT_NUM 33333 #define die(msg) do { perror(msg); exit(EXIT_FAILURE); } while (0) /**********************************************************************/ /* The following is what we need until glibc gets a wrapper for accept4() */ /* Flags for socket(), socketpair(), accept4() */ #ifndef SOCK_CLOEXEC #define SOCK_CLOEXEC O_CLOEXEC #endif #ifndef SOCK_NONBLOCK #define SOCK_NONBLOCK O_NONBLOCK #endif #ifdef __x86_64__ #define SYS_accept4 288 #elif __i386__ #define USE_SOCKETCALL 1 #define SYS_ACCEPT4 18 #else #error "Sorry -- don't know the syscall # on this architecture" #endif static int accept4(int fd, struct sockaddr *sockaddr, socklen_t *addrlen, int flags) { printf("Calling accept4(): flags = %x", flags); if (flags != 0) { printf(" ("); if (flags & SOCK_CLOEXEC) printf("SOCK_CLOEXEC"); if ((flags & SOCK_CLOEXEC) && (flags & SOCK_NONBLOCK)) printf(" "); if (flags & SOCK_NONBLOCK) printf("SOCK_NONBLOCK"); printf(")"); } printf("\n"); #if USE_SOCKETCALL long args[6]; args[0] = fd; args[1] = (long) sockaddr; args[2] = (long) addrlen; args[3] = flags; return syscall(SYS_socketcall, SYS_ACCEPT4, args); #else return syscall(SYS_accept4, fd, sockaddr, addrlen, flags); #endif } /**********************************************************************/ static int do_test(int lfd, struct sockaddr_in *conn_addr, int closeonexec_flag, int nonblock_flag) { int connfd, acceptfd; int fdf, flf, fdf_pass, flf_pass; struct sockaddr_in claddr; socklen_t addrlen; printf("=======================================\n"); connfd = socket(AF_INET, SOCK_STREAM, 0); if (connfd == -1) die("socket"); if (connect(connfd, (struct sockaddr *) conn_addr, sizeof(struct sockaddr_in)) == -1) die("connect"); addrlen = sizeof(struct sockaddr_in); acceptfd = accept4(lfd, (struct sockaddr *) &claddr, &addrlen, closeonexec_flag | nonblock_flag); if (acceptfd == -1) { perror("accept4()"); close(connfd); return 0; } fdf = fcntl(acceptfd, F_GETFD); if (fdf == -1) die("fcntl:F_GETFD"); fdf_pass = ((fdf & FD_CLOEXEC) != 0) == ((closeonexec_flag & SOCK_CLOEXEC) != 0); printf("Close-on-exec flag is %sset (%s); ", (fdf & FD_CLOEXEC) ? "" : "not ", fdf_pass ? "OK" : "failed"); flf = fcntl(acceptfd, F_GETFL); if (flf == -1) die("fcntl:F_GETFD"); flf_pass = ((flf & O_NONBLOCK) != 0) == ((nonblock_flag & SOCK_NONBLOCK) !=0); printf("nonblock flag is %sset (%s)\n", (flf & O_NONBLOCK) ? "" : "not ", flf_pass ? "OK" : "failed"); close(acceptfd); close(connfd); printf("Test result: %s\n", (fdf_pass && flf_pass) ? "PASS" : "FAIL"); return fdf_pass && flf_pass; } static int create_listening_socket(int port_num) { struct sockaddr_in svaddr; int lfd; int optval; memset(&svaddr, 0, sizeof(struct sockaddr_in)); svaddr.sin_family = AF_INET; svaddr.sin_addr.s_addr = htonl(INADDR_ANY); svaddr.sin_port = htons(port_num); lfd = socket(AF_INET, SOCK_STREAM, 0); if (lfd == -1) die("socket"); optval = 1; if (setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval)) == -1) die("setsockopt"); if (bind(lfd, (struct sockaddr *) &svaddr, sizeof(struct sockaddr_in)) == -1) die("bind"); if (listen(lfd, 5) == -1) die("listen"); return lfd; } int main(int argc, char *argv[]) { struct sockaddr_in conn_addr; int lfd; int port_num; int passed; passed = 1; port_num = (argc > 1) ? atoi(argv[1]) : PORT_NUM; memset(&conn_addr, 0, sizeof(struct sockaddr_in)); conn_addr.sin_family = AF_INET; conn_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); conn_addr.sin_port = htons(port_num); lfd = create_listening_socket(port_num); if (!do_test(lfd, &conn_addr, 0, 0)) passed = 0; if (!do_test(lfd, &conn_addr, SOCK_CLOEXEC, 0)) passed = 0; if (!do_test(lfd, &conn_addr, 0, SOCK_NONBLOCK)) passed = 0; if (!do_test(lfd, &conn_addr, SOCK_CLOEXEC, SOCK_NONBLOCK)) passed = 0; close(lfd); exit(passed ? EXIT_SUCCESS : EXIT_FAILURE); } [mtk.manpages@gmail.com: rewrote changelog, updated test program] Signed-off-by: Ulrich Drepper <drepper@redhat.com> Tested-by: Michael Kerrisk <mtk.manpages@gmail.com> Acked-by: Michael Kerrisk <mtk.manpages@gmail.com> Cc: <linux-api@vger.kernel.org> Cc: <linux-arch@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-11-20 07:36:14 +08:00
ret = sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), 0);
break;
case SYS_GETSOCKNAME:
ret = sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]));
break;
case SYS_GETPEERNAME:
ret = sys_getpeername(a0, compat_ptr(a1), compat_ptr(a[2]));
break;
case SYS_SOCKETPAIR:
ret = sys_socketpair(a0, a1, a[2], compat_ptr(a[3]));
break;
case SYS_SEND:
ret = sys_send(a0, compat_ptr(a1), a[2], a[3]);
break;
case SYS_SENDTO:
ret = sys_sendto(a0, compat_ptr(a1), a[2], a[3], compat_ptr(a[4]), a[5]);
break;
case SYS_RECV:
net/compat/wext: send different messages to compat tasks Wireless extensions have the unfortunate problem that events are multicast netlink messages, and are not independent of pointer size. Thus, currently 32-bit tasks on 64-bit platforms cannot properly receive events and fail with all kinds of strange problems, for instance wpa_supplicant never notices disassociations, due to the way the 64-bit event looks (to a 32-bit process), the fact that the address is all zeroes is lost, it thinks instead it is 00:00:00:00:01:00. The same problem existed with the ioctls, until David Miller fixed those some time ago in an heroic effort. A different problem caused by this is that we cannot send the ASSOCREQIE/ASSOCRESPIE events because sending them causes a 32-bit wpa_supplicant on a 64-bit system to overwrite its internal information, which is worse than it not getting the information at all -- so we currently resort to sending a custom string event that it then parses. This, however, has a severe size limitation we are frequently hitting with modern access points; this limitation would can be lifted after this patch by sending the correct binary, not custom, event. A similar problem apparently happens for some other netlink users on x86_64 with 32-bit tasks due to the alignment for 64-bit quantities. In order to fix these problems, I have implemented a way to send compat messages to tasks. When sending an event, we send the non-compat event data together with a compat event data in skb_shinfo(main_skb)->frag_list. Then, when the event is read from the socket, the netlink code makes sure to pass out only the skb that is compatible with the task. This approach was suggested by David Miller, my original approach required always sending two skbs but that had various small problems. To determine whether compat is needed or not, I have used the MSG_CMSG_COMPAT flag, and adjusted the call path for recv and recvfrom to include it, even if those calls do not have a cmsg parameter. I have not solved one small part of the problem, and I don't think it is necessary to: if a 32-bit application uses read() rather than any form of recvmsg() it will still get the wrong (64-bit) event. However, neither do applications actually do this, nor would it be a regression. Signed-off-by: Johannes Berg <johannes@sipsolutions.net> Signed-off-by: David S. Miller <davem@davemloft.net>
2009-07-01 19:26:02 +08:00
ret = compat_sys_recv(a0, compat_ptr(a1), a[2], a[3]);
break;
case SYS_RECVFROM:
net/compat/wext: send different messages to compat tasks Wireless extensions have the unfortunate problem that events are multicast netlink messages, and are not independent of pointer size. Thus, currently 32-bit tasks on 64-bit platforms cannot properly receive events and fail with all kinds of strange problems, for instance wpa_supplicant never notices disassociations, due to the way the 64-bit event looks (to a 32-bit process), the fact that the address is all zeroes is lost, it thinks instead it is 00:00:00:00:01:00. The same problem existed with the ioctls, until David Miller fixed those some time ago in an heroic effort. A different problem caused by this is that we cannot send the ASSOCREQIE/ASSOCRESPIE events because sending them causes a 32-bit wpa_supplicant on a 64-bit system to overwrite its internal information, which is worse than it not getting the information at all -- so we currently resort to sending a custom string event that it then parses. This, however, has a severe size limitation we are frequently hitting with modern access points; this limitation would can be lifted after this patch by sending the correct binary, not custom, event. A similar problem apparently happens for some other netlink users on x86_64 with 32-bit tasks due to the alignment for 64-bit quantities. In order to fix these problems, I have implemented a way to send compat messages to tasks. When sending an event, we send the non-compat event data together with a compat event data in skb_shinfo(main_skb)->frag_list. Then, when the event is read from the socket, the netlink code makes sure to pass out only the skb that is compatible with the task. This approach was suggested by David Miller, my original approach required always sending two skbs but that had various small problems. To determine whether compat is needed or not, I have used the MSG_CMSG_COMPAT flag, and adjusted the call path for recv and recvfrom to include it, even if those calls do not have a cmsg parameter. I have not solved one small part of the problem, and I don't think it is necessary to: if a 32-bit application uses read() rather than any form of recvmsg() it will still get the wrong (64-bit) event. However, neither do applications actually do this, nor would it be a regression. Signed-off-by: Johannes Berg <johannes@sipsolutions.net> Signed-off-by: David S. Miller <davem@davemloft.net>
2009-07-01 19:26:02 +08:00
ret = compat_sys_recvfrom(a0, compat_ptr(a1), a[2], a[3],
compat_ptr(a[4]), compat_ptr(a[5]));
break;
case SYS_SHUTDOWN:
ret = sys_shutdown(a0,a1);
break;
case SYS_SETSOCKOPT:
ret = compat_sys_setsockopt(a0, a1, a[2],
compat_ptr(a[3]), a[4]);
break;
case SYS_GETSOCKOPT:
ret = compat_sys_getsockopt(a0, a1, a[2],
compat_ptr(a[3]), compat_ptr(a[4]));
break;
case SYS_SENDMSG:
ret = compat_sys_sendmsg(a0, compat_ptr(a1), a[2]);
break;
case SYS_RECVMSG:
ret = compat_sys_recvmsg(a0, compat_ptr(a1), a[2]);
break;
case SYS_RECVMMSG:
ret = compat_sys_recvmmsg(a0, compat_ptr(a1), a[2], a[3],
compat_ptr(a[4]));
break;
reintroduce accept4 Introduce a new accept4() system call. The addition of this system call matches analogous changes in 2.6.27 (dup3(), evenfd2(), signalfd4(), inotify_init1(), epoll_create1(), pipe2()) which added new system calls that differed from analogous traditional system calls in adding a flags argument that can be used to access additional functionality. The accept4() system call is exactly the same as accept(), except that it adds a flags bit-mask argument. Two flags are initially implemented. (Most of the new system calls in 2.6.27 also had both of these flags.) SOCK_CLOEXEC causes the close-on-exec (FD_CLOEXEC) flag to be enabled for the new file descriptor returned by accept4(). This is a useful security feature to avoid leaking information in a multithreaded program where one thread is doing an accept() at the same time as another thread is doing a fork() plus exec(). More details here: http://udrepper.livejournal.com/20407.html "Secure File Descriptor Handling", Ulrich Drepper). The other flag is SOCK_NONBLOCK, which causes the O_NONBLOCK flag to be enabled on the new open file description created by accept4(). (This flag is merely a convenience, saving the use of additional calls fcntl(F_GETFL) and fcntl (F_SETFL) to achieve the same result. Here's a test program. Works on x86-32. Should work on x86-64, but I (mtk) don't have a system to hand to test with. It tests accept4() with each of the four possible combinations of SOCK_CLOEXEC and SOCK_NONBLOCK set/clear in 'flags', and verifies that the appropriate flags are set on the file descriptor/open file description returned by accept4(). I tested Ulrich's patch in this thread by applying against 2.6.28-rc2, and it passes according to my test program. /* test_accept4.c Copyright (C) 2008, Linux Foundation, written by Michael Kerrisk <mtk.manpages@gmail.com> Licensed under the GNU GPLv2 or later. */ #define _GNU_SOURCE #include <unistd.h> #include <sys/syscall.h> #include <sys/socket.h> #include <netinet/in.h> #include <stdlib.h> #include <fcntl.h> #include <stdio.h> #include <string.h> #define PORT_NUM 33333 #define die(msg) do { perror(msg); exit(EXIT_FAILURE); } while (0) /**********************************************************************/ /* The following is what we need until glibc gets a wrapper for accept4() */ /* Flags for socket(), socketpair(), accept4() */ #ifndef SOCK_CLOEXEC #define SOCK_CLOEXEC O_CLOEXEC #endif #ifndef SOCK_NONBLOCK #define SOCK_NONBLOCK O_NONBLOCK #endif #ifdef __x86_64__ #define SYS_accept4 288 #elif __i386__ #define USE_SOCKETCALL 1 #define SYS_ACCEPT4 18 #else #error "Sorry -- don't know the syscall # on this architecture" #endif static int accept4(int fd, struct sockaddr *sockaddr, socklen_t *addrlen, int flags) { printf("Calling accept4(): flags = %x", flags); if (flags != 0) { printf(" ("); if (flags & SOCK_CLOEXEC) printf("SOCK_CLOEXEC"); if ((flags & SOCK_CLOEXEC) && (flags & SOCK_NONBLOCK)) printf(" "); if (flags & SOCK_NONBLOCK) printf("SOCK_NONBLOCK"); printf(")"); } printf("\n"); #if USE_SOCKETCALL long args[6]; args[0] = fd; args[1] = (long) sockaddr; args[2] = (long) addrlen; args[3] = flags; return syscall(SYS_socketcall, SYS_ACCEPT4, args); #else return syscall(SYS_accept4, fd, sockaddr, addrlen, flags); #endif } /**********************************************************************/ static int do_test(int lfd, struct sockaddr_in *conn_addr, int closeonexec_flag, int nonblock_flag) { int connfd, acceptfd; int fdf, flf, fdf_pass, flf_pass; struct sockaddr_in claddr; socklen_t addrlen; printf("=======================================\n"); connfd = socket(AF_INET, SOCK_STREAM, 0); if (connfd == -1) die("socket"); if (connect(connfd, (struct sockaddr *) conn_addr, sizeof(struct sockaddr_in)) == -1) die("connect"); addrlen = sizeof(struct sockaddr_in); acceptfd = accept4(lfd, (struct sockaddr *) &claddr, &addrlen, closeonexec_flag | nonblock_flag); if (acceptfd == -1) { perror("accept4()"); close(connfd); return 0; } fdf = fcntl(acceptfd, F_GETFD); if (fdf == -1) die("fcntl:F_GETFD"); fdf_pass = ((fdf & FD_CLOEXEC) != 0) == ((closeonexec_flag & SOCK_CLOEXEC) != 0); printf("Close-on-exec flag is %sset (%s); ", (fdf & FD_CLOEXEC) ? "" : "not ", fdf_pass ? "OK" : "failed"); flf = fcntl(acceptfd, F_GETFL); if (flf == -1) die("fcntl:F_GETFD"); flf_pass = ((flf & O_NONBLOCK) != 0) == ((nonblock_flag & SOCK_NONBLOCK) !=0); printf("nonblock flag is %sset (%s)\n", (flf & O_NONBLOCK) ? "" : "not ", flf_pass ? "OK" : "failed"); close(acceptfd); close(connfd); printf("Test result: %s\n", (fdf_pass && flf_pass) ? "PASS" : "FAIL"); return fdf_pass && flf_pass; } static int create_listening_socket(int port_num) { struct sockaddr_in svaddr; int lfd; int optval; memset(&svaddr, 0, sizeof(struct sockaddr_in)); svaddr.sin_family = AF_INET; svaddr.sin_addr.s_addr = htonl(INADDR_ANY); svaddr.sin_port = htons(port_num); lfd = socket(AF_INET, SOCK_STREAM, 0); if (lfd == -1) die("socket"); optval = 1; if (setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval)) == -1) die("setsockopt"); if (bind(lfd, (struct sockaddr *) &svaddr, sizeof(struct sockaddr_in)) == -1) die("bind"); if (listen(lfd, 5) == -1) die("listen"); return lfd; } int main(int argc, char *argv[]) { struct sockaddr_in conn_addr; int lfd; int port_num; int passed; passed = 1; port_num = (argc > 1) ? atoi(argv[1]) : PORT_NUM; memset(&conn_addr, 0, sizeof(struct sockaddr_in)); conn_addr.sin_family = AF_INET; conn_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); conn_addr.sin_port = htons(port_num); lfd = create_listening_socket(port_num); if (!do_test(lfd, &conn_addr, 0, 0)) passed = 0; if (!do_test(lfd, &conn_addr, SOCK_CLOEXEC, 0)) passed = 0; if (!do_test(lfd, &conn_addr, 0, SOCK_NONBLOCK)) passed = 0; if (!do_test(lfd, &conn_addr, SOCK_CLOEXEC, SOCK_NONBLOCK)) passed = 0; close(lfd); exit(passed ? EXIT_SUCCESS : EXIT_FAILURE); } [mtk.manpages@gmail.com: rewrote changelog, updated test program] Signed-off-by: Ulrich Drepper <drepper@redhat.com> Tested-by: Michael Kerrisk <mtk.manpages@gmail.com> Acked-by: Michael Kerrisk <mtk.manpages@gmail.com> Cc: <linux-api@vger.kernel.org> Cc: <linux-arch@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-11-20 07:36:14 +08:00
case SYS_ACCEPT4:
ret = sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), a[3]);
flag parameters: paccept This patch is by far the most complex in the series. It adds a new syscall paccept. This syscall differs from accept in that it adds (at the userlevel) two additional parameters: - a signal mask - a flags value The flags parameter can be used to set flag like SOCK_CLOEXEC. This is imlpemented here as well. Some people argued that this is a property which should be inherited from the file desriptor for the server but this is against POSIX. Additionally, we really want the signal mask parameter as well (similar to pselect, ppoll, etc). So an interface change in inevitable. The flag value is the same as for socket and socketpair. I think diverging here will only create confusion. Similar to the filesystem interfaces where the use of the O_* constants differs, it is acceptable here. The signal mask is handled as for pselect etc. The mask is temporarily installed for the thread and removed before the call returns. I modeled the code after pselect. If there is a problem it's likely also in pselect. For architectures which use socketcall I maintained this interface instead of adding a system call. The symmetry shouldn't be broken. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include <errno.h> #include <fcntl.h> #include <pthread.h> #include <signal.h> #include <stdio.h> #include <unistd.h> #include <netinet/in.h> #include <sys/socket.h> #include <sys/syscall.h> #ifndef __NR_paccept # ifdef __x86_64__ # define __NR_paccept 288 # elif defined __i386__ # define SYS_PACCEPT 18 # define USE_SOCKETCALL 1 # else # error "need __NR_paccept" # endif #endif #ifdef USE_SOCKETCALL # define paccept(fd, addr, addrlen, mask, flags) \ ({ long args[6] = { \ (long) fd, (long) addr, (long) addrlen, (long) mask, 8, (long) flags }; \ syscall (__NR_socketcall, SYS_PACCEPT, args); }) #else # define paccept(fd, addr, addrlen, mask, flags) \ syscall (__NR_paccept, fd, addr, addrlen, mask, 8, flags) #endif #define PORT 57392 #define SOCK_CLOEXEC O_CLOEXEC static pthread_barrier_t b; static void * tf (void *arg) { pthread_barrier_wait (&b); int s = socket (AF_INET, SOCK_STREAM, 0); struct sockaddr_in sin; sin.sin_family = AF_INET; sin.sin_addr.s_addr = htonl (INADDR_LOOPBACK); sin.sin_port = htons (PORT); connect (s, (const struct sockaddr *) &sin, sizeof (sin)); close (s); pthread_barrier_wait (&b); s = socket (AF_INET, SOCK_STREAM, 0); sin.sin_port = htons (PORT); connect (s, (const struct sockaddr *) &sin, sizeof (sin)); close (s); pthread_barrier_wait (&b); pthread_barrier_wait (&b); sleep (2); pthread_kill ((pthread_t) arg, SIGUSR1); return NULL; } static void handler (int s) { } int main (void) { pthread_barrier_init (&b, NULL, 2); struct sockaddr_in sin; pthread_t th; if (pthread_create (&th, NULL, tf, (void *) pthread_self ()) != 0) { puts ("pthread_create failed"); return 1; } int s = socket (AF_INET, SOCK_STREAM, 0); int reuse = 1; setsockopt (s, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof (reuse)); sin.sin_family = AF_INET; sin.sin_addr.s_addr = htonl (INADDR_LOOPBACK); sin.sin_port = htons (PORT); bind (s, (struct sockaddr *) &sin, sizeof (sin)); listen (s, SOMAXCONN); pthread_barrier_wait (&b); int s2 = paccept (s, NULL, 0, NULL, 0); if (s2 < 0) { puts ("paccept(0) failed"); return 1; } int coe = fcntl (s2, F_GETFD); if (coe & FD_CLOEXEC) { puts ("paccept(0) set close-on-exec-flag"); return 1; } close (s2); pthread_barrier_wait (&b); s2 = paccept (s, NULL, 0, NULL, SOCK_CLOEXEC); if (s2 < 0) { puts ("paccept(SOCK_CLOEXEC) failed"); return 1; } coe = fcntl (s2, F_GETFD); if ((coe & FD_CLOEXEC) == 0) { puts ("paccept(SOCK_CLOEXEC) does not set close-on-exec flag"); return 1; } close (s2); pthread_barrier_wait (&b); struct sigaction sa; sa.sa_handler = handler; sa.sa_flags = 0; sigemptyset (&sa.sa_mask); sigaction (SIGUSR1, &sa, NULL); sigset_t ss; pthread_sigmask (SIG_SETMASK, NULL, &ss); sigaddset (&ss, SIGUSR1); pthread_sigmask (SIG_SETMASK, &ss, NULL); sigdelset (&ss, SIGUSR1); alarm (4); pthread_barrier_wait (&b); errno = 0 ; s2 = paccept (s, NULL, 0, &ss, 0); if (s2 != -1 || errno != EINTR) { puts ("paccept did not fail with EINTR"); return 1; } close (s); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [akpm@linux-foundation.org: make it compile] [akpm@linux-foundation.org: add sys_ni stub] Signed-off-by: Ulrich Drepper <drepper@redhat.com> Acked-by: Davide Libenzi <davidel@xmailserver.org> Cc: Michael Kerrisk <mtk.manpages@googlemail.com> Cc: <linux-arch@vger.kernel.org> Cc: "David S. Miller" <davem@davemloft.net> Cc: Roland McGrath <roland@redhat.com> Cc: Kyle McMartin <kyle@mcmartin.ca> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 12:29:20 +08:00
break;
default:
ret = -EINVAL;
break;
}
return ret;
}