Second round of 9p patches for the 3.11 merge window.

Several of these patches were rebased in order to correct style issues.
 Only stylistic changes were made versus the patches which were in linux-next
 for two weeks.  The rebases have been in linux-next for 3 days and have
 passed my regressions.
 
 The bulk of these are RDMA fixes and improvements.  There are also some
 additions on the extended attributes front to support some additional
 namespaces and a new option for TCP to force allocation of mount requests
 from a privileged port.
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v1.4.12 (GNU/Linux)
 Comment: GPGTools - http://gpgtools.org
 
 iQIcBAABAgAGBQJR3rWXAAoJEDZk62b0Tg6xabIP/12I+SkQ57wRN03EQy5fqUdX
 gK/YMHKQ9QuDnZPBvrZ2lypesQNqVU0KINay6VEA86JG1gwzPyUd2MnpQ7F0vV3N
 XwVD54IoflV/M74xUnrgGWB8YxaPcdacQQ8yazX+mEgOgYGdWmDAl7FHmAkdKAFB
 gSl25f3PNJX1Rjay0dssNVXrVPXuJY/fZXKnNQZKtRwXffRWKsWHd8FU0Eq7F30A
 kNQB8tmMSfHBBjP+tzR0My6/kQ09jzHdtZOkH9IgVpNzqrd8tfy0l6tEvFypxqGT
 5oQFoxHHL/tUW05V0P3gYany2A7lEhSUifPKS6omqHO+vPlw+pDJw+xWlNq9fnDt
 8S8znqVuEHhvqRQW7zFdb9ac2MZi8CHHhC2wGIZ7GYjNG2q5XwE8b/QhdXQeFin7
 ibugvoW7+ZdcDewpQW27oO0g7B/8hRt8KC+1lc/8rITKIfGxbNJkGzTDl0F4Co7v
 IH7Ew5PHPe6ZiuU0QSdU+NBuvk8g8sWGxx04Xvzl3WicwOg7XvN3ivrKB9oN2U1x
 50KZRnYpwQQv/9AxyhroYU+Ufje8SF4v++zsq1eMzUcHsC/C73eatw2m764t+X4S
 8yMLrgqY1Nzif4nAMi/SDMnB/R1bXeuc8kXD9xT6XD9d2tf6e+zCHhQklVeC0tuK
 RiVRJqGrfanbKMnWIG0Y
 =n9rI
 -----END PGP SIGNATURE-----

Merge tag 'for-linus-3.11-merge-window-part-2' of git://git.kernel.org/pub/scm/linux/kernel/git/ericvh/v9fs

Pull second round of 9p patches from Eric Van Hensbergen:
 "Several of these patches were rebased in order to correct style
  issues.  Only stylistic changes were made versus the patches which
  were in linux-next for two weeks.  The rebases have been in linux-next
  for 3 days and have passed my regressions.

  The bulk of these are RDMA fixes and improvements.  There are also some
  additions on the extended attributes front to support some additional
  namespaces and a new option for TCP to force allocation of mount
  requests from a privileged port"

* tag 'for-linus-3.11-merge-window-part-2' of git://git.kernel.org/pub/scm/linux/kernel/git/ericvh/v9fs:
  fs/9p: Remove the unused variable "err" in v9fs_vfs_getattr()
  9P: Add cancelled() to the transport functions.
  9P/RDMA: count posted buffers without a pending request
  9P/RDMA: Improve error handling in rdma_request
  9P/RDMA: Do not free req->rc in error handling in rdma_request()
  9P/RDMA: Use a semaphore to protect the RQ
  9P/RDMA: Protect against duplicate replies
  9P/RDMA: increase P9_RDMA_MAXSIZE to 1MB
  9pnet: refactor struct p9_fcall alloc code
  9P/RDMA: rdma_request() needs not allocate req->rc
  9P: Fix fcall allocation for rdma
  fs/9p: xattr: add trusted and security namespaces
  net/9p: add privport option to 9p tcp transport
This commit is contained in:
Linus Torvalds 2013-07-11 10:21:23 -07:00
commit 19d2f8e0fb
11 changed files with 355 additions and 79 deletions

View File

@ -31,3 +31,16 @@ config 9P_FS_POSIX_ACL
If you don't know what Access Control Lists are, say N If you don't know what Access Control Lists are, say N
endif endif
config 9P_FS_SECURITY
bool "9P Security Labels"
depends on 9P_FS
help
Security labels support alternative access control models
implemented by security modules like SELinux. This option
enables an extended attribute handler for file security
labels in the 9P filesystem.
If you are not using a security module that requires using
extended attributes for file security labels, say N.

View File

@ -11,7 +11,9 @@ obj-$(CONFIG_9P_FS) := 9p.o
v9fs.o \ v9fs.o \
fid.o \ fid.o \
xattr.o \ xattr.o \
xattr_user.o xattr_user.o \
xattr_trusted.o
9p-$(CONFIG_9P_FSCACHE) += cache.o 9p-$(CONFIG_9P_FSCACHE) += cache.o
9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o 9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
9p-$(CONFIG_9P_FS_SECURITY) += xattr_security.o

View File

@ -1054,13 +1054,11 @@ static int
v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat) struct kstat *stat)
{ {
int err;
struct v9fs_session_info *v9ses; struct v9fs_session_info *v9ses;
struct p9_fid *fid; struct p9_fid *fid;
struct p9_wstat *st; struct p9_wstat *st;
p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
err = -EPERM;
v9ses = v9fs_dentry2v9ses(dentry); v9ses = v9fs_dentry2v9ses(dentry);
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
generic_fillattr(dentry->d_inode, stat); generic_fillattr(dentry->d_inode, stat);

View File

@ -167,9 +167,13 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
const struct xattr_handler *v9fs_xattr_handlers[] = { const struct xattr_handler *v9fs_xattr_handlers[] = {
&v9fs_xattr_user_handler, &v9fs_xattr_user_handler,
&v9fs_xattr_trusted_handler,
#ifdef CONFIG_9P_FS_POSIX_ACL #ifdef CONFIG_9P_FS_POSIX_ACL
&v9fs_xattr_acl_access_handler, &v9fs_xattr_acl_access_handler,
&v9fs_xattr_acl_default_handler, &v9fs_xattr_acl_default_handler,
#endif
#ifdef CONFIG_9P_FS_SECURITY
&v9fs_xattr_security_handler,
#endif #endif
NULL NULL
}; };

View File

@ -20,6 +20,8 @@
extern const struct xattr_handler *v9fs_xattr_handlers[]; extern const struct xattr_handler *v9fs_xattr_handlers[];
extern struct xattr_handler v9fs_xattr_user_handler; extern struct xattr_handler v9fs_xattr_user_handler;
extern struct xattr_handler v9fs_xattr_trusted_handler;
extern struct xattr_handler v9fs_xattr_security_handler;
extern const struct xattr_handler v9fs_xattr_acl_access_handler; extern const struct xattr_handler v9fs_xattr_acl_access_handler;
extern const struct xattr_handler v9fs_xattr_acl_default_handler; extern const struct xattr_handler v9fs_xattr_acl_default_handler;

80
fs/9p/xattr_security.c Normal file
View File

@ -0,0 +1,80 @@
/*
* Copyright IBM Corporation, 2010
* Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of version 2.1 of the GNU Lesser General Public License
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it would be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
*/
#include <linux/module.h>
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include "xattr.h"
/*
 * v9fs_xattr_security_get - fetch an attribute from the security namespace
 * @dentry: dentry whose attribute is read
 * @name: attribute name without the namespace prefix
 * @buffer: destination passed through to v9fs_xattr_get()
 * @size: size of @buffer, passed through to v9fs_xattr_get()
 * @type: unused here
 *
 * Builds XATTR_SECURITY_PREFIX followed by @name in a temporary buffer and
 * hands it to v9fs_xattr_get().
 *
 * Returns -EINVAL for a NULL or empty @name, -ENOMEM if the temporary name
 * cannot be allocated, otherwise whatever v9fs_xattr_get() returns.
 */
static int v9fs_xattr_security_get(struct dentry *dentry, const char *name,
				   void *buffer, size_t size, int type)
{
	const size_t plen = XATTR_SECURITY_PREFIX_LEN;
	size_t nlen;
	char *key;
	int rc;

	/* A missing or empty suffix cannot name an attribute. */
	if (!name || *name == '\0')
		return -EINVAL;

	nlen = strlen(name);
	key = kmalloc(plen + nlen + 1, GFP_KERNEL);
	if (key == NULL)
		return -ENOMEM;

	/* Prefix first; the second copy also carries the NUL terminator. */
	memcpy(key, XATTR_SECURITY_PREFIX, plen);
	memcpy(key + plen, name, nlen + 1);

	rc = v9fs_xattr_get(dentry, key, buffer, size);
	kfree(key);
	return rc;
}
/*
 * v9fs_xattr_security_set - store an attribute in the security namespace
 * @dentry: dentry whose attribute is written
 * @name: attribute name without the namespace prefix
 * @value: attribute value, passed through to v9fs_xattr_set()
 * @size: length of @value, passed through to v9fs_xattr_set()
 * @flags: passed through to v9fs_xattr_set()
 * @type: unused here
 *
 * Builds XATTR_SECURITY_PREFIX followed by @name in a temporary buffer and
 * hands it to v9fs_xattr_set().
 *
 * Returns -EINVAL for a NULL or empty @name, -ENOMEM if the temporary name
 * cannot be allocated, otherwise whatever v9fs_xattr_set() returns.
 */
static int v9fs_xattr_security_set(struct dentry *dentry, const char *name,
				   const void *value, size_t size, int flags,
				   int type)
{
	const size_t plen = XATTR_SECURITY_PREFIX_LEN;
	size_t nlen;
	char *key;
	int rc;

	/* A missing or empty suffix cannot name an attribute. */
	if (!name || *name == '\0')
		return -EINVAL;

	nlen = strlen(name);
	key = kmalloc(plen + nlen + 1, GFP_KERNEL);
	if (key == NULL)
		return -ENOMEM;

	/* Prefix first; the second copy also carries the NUL terminator. */
	memcpy(key, XATTR_SECURITY_PREFIX, plen);
	memcpy(key + plen, name, nlen + 1);

	rc = v9fs_xattr_set(dentry, key, value, size, flags);
	kfree(key);
	return rc;
}
/*
 * xattr handler for names beginning with XATTR_SECURITY_PREFIX; routes
 * get/set to the security helpers defined in this file.
 */
struct xattr_handler v9fs_xattr_security_handler = {
	.prefix	= XATTR_SECURITY_PREFIX,
	.get	= v9fs_xattr_security_get,
	.set	= v9fs_xattr_security_set,
};

80
fs/9p/xattr_trusted.c Normal file
View File

@ -0,0 +1,80 @@
/*
* Copyright IBM Corporation, 2010
* Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of version 2.1 of the GNU Lesser General Public License
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it would be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
*/
#include <linux/module.h>
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include "xattr.h"
/*
 * v9fs_xattr_trusted_get - fetch an attribute from the trusted namespace
 * @dentry: dentry whose attribute is read
 * @name: attribute name without the namespace prefix
 * @buffer: destination passed through to v9fs_xattr_get()
 * @size: size of @buffer, passed through to v9fs_xattr_get()
 * @type: unused here
 *
 * Builds XATTR_TRUSTED_PREFIX followed by @name in a temporary buffer and
 * hands it to v9fs_xattr_get().
 *
 * Returns -EINVAL for a NULL or empty @name, -ENOMEM if the temporary name
 * cannot be allocated, otherwise whatever v9fs_xattr_get() returns.
 */
static int v9fs_xattr_trusted_get(struct dentry *dentry, const char *name,
				  void *buffer, size_t size, int type)
{
	const size_t plen = XATTR_TRUSTED_PREFIX_LEN;
	size_t nlen;
	char *key;
	int rc;

	/* A missing or empty suffix cannot name an attribute. */
	if (!name || *name == '\0')
		return -EINVAL;

	nlen = strlen(name);
	key = kmalloc(plen + nlen + 1, GFP_KERNEL);
	if (key == NULL)
		return -ENOMEM;

	/* Prefix first; the second copy also carries the NUL terminator. */
	memcpy(key, XATTR_TRUSTED_PREFIX, plen);
	memcpy(key + plen, name, nlen + 1);

	rc = v9fs_xattr_get(dentry, key, buffer, size);
	kfree(key);
	return rc;
}
/*
 * v9fs_xattr_trusted_set - store an attribute in the trusted namespace
 * @dentry: dentry whose attribute is written
 * @name: attribute name without the namespace prefix
 * @value: attribute value, passed through to v9fs_xattr_set()
 * @size: length of @value, passed through to v9fs_xattr_set()
 * @flags: passed through to v9fs_xattr_set()
 * @type: unused here
 *
 * Builds XATTR_TRUSTED_PREFIX followed by @name in a temporary buffer and
 * hands it to v9fs_xattr_set().
 *
 * Returns -EINVAL for a NULL or empty @name, -ENOMEM if the temporary name
 * cannot be allocated, otherwise whatever v9fs_xattr_set() returns.
 */
static int v9fs_xattr_trusted_set(struct dentry *dentry, const char *name,
				  const void *value, size_t size, int flags,
				  int type)
{
	const size_t plen = XATTR_TRUSTED_PREFIX_LEN;
	size_t nlen;
	char *key;
	int rc;

	/* A missing or empty suffix cannot name an attribute. */
	if (!name || *name == '\0')
		return -EINVAL;

	nlen = strlen(name);
	key = kmalloc(plen + nlen + 1, GFP_KERNEL);
	if (key == NULL)
		return -ENOMEM;

	/* Prefix first; the second copy also carries the NUL terminator. */
	memcpy(key, XATTR_TRUSTED_PREFIX, plen);
	memcpy(key + plen, name, nlen + 1);

	rc = v9fs_xattr_set(dentry, key, value, size, flags);
	kfree(key);
	return rc;
}
/*
 * xattr handler for names beginning with XATTR_TRUSTED_PREFIX; routes
 * get/set to the trusted helpers defined in this file.
 */
struct xattr_handler v9fs_xattr_trusted_handler = {
	.prefix	= XATTR_TRUSTED_PREFIX,
	.get	= v9fs_xattr_trusted_get,
	.set	= v9fs_xattr_trusted_set,
};

View File

@ -26,6 +26,9 @@
#ifndef NET_9P_TRANSPORT_H #ifndef NET_9P_TRANSPORT_H
#define NET_9P_TRANSPORT_H #define NET_9P_TRANSPORT_H
#define P9_DEF_MIN_RESVPORT (665U)
#define P9_DEF_MAX_RESVPORT (1023U)
/** /**
* struct p9_trans_module - transport module interface * struct p9_trans_module - transport module interface
* @list: used to maintain a list of currently available transports * @list: used to maintain a list of currently available transports
@ -37,6 +40,8 @@
* @close: member function to discard a connection on this transport * @close: member function to discard a connection on this transport
* @request: member function to issue a request to the transport * @request: member function to issue a request to the transport
* @cancel: member function to cancel a request (if it hasn't been sent) * @cancel: member function to cancel a request (if it hasn't been sent)
* @cancelled: member function to notify that a cancelled request will not
* receive a reply
* *
* This is the basic API for a transport module which is registered by the * This is the basic API for a transport module which is registered by the
* transport module with the 9P core network module and used by the client * transport module with the 9P core network module and used by the client
@ -55,6 +60,7 @@ struct p9_trans_module {
void (*close) (struct p9_client *); void (*close) (struct p9_client *);
int (*request) (struct p9_client *, struct p9_req_t *req); int (*request) (struct p9_client *, struct p9_req_t *req);
int (*cancel) (struct p9_client *, struct p9_req_t *req); int (*cancel) (struct p9_client *, struct p9_req_t *req);
int (*cancelled)(struct p9_client *, struct p9_req_t *req);
int (*zc_request)(struct p9_client *, struct p9_req_t *, int (*zc_request)(struct p9_client *, struct p9_req_t *,
char *, char *, int , int, int, int); char *, char *, int , int, int, int);
}; };

View File

@ -204,6 +204,17 @@ free_and_return:
return ret; return ret;
} }
/**
 * p9_fcall_alloc - allocate a p9_fcall header together with its data area
 * @alloc_msize: number of data bytes to reserve directly after the header
 *
 * Performs one GFP_NOFS allocation of sizeof(struct p9_fcall) + @alloc_msize
 * bytes.  On success ->sdata points at the first byte following the header
 * and ->capacity is set to @alloc_msize; no other field is initialised here.
 *
 * Returns the new fcall, or NULL when the allocation fails.
 */
struct p9_fcall *p9_fcall_alloc(int alloc_msize)
{
	struct p9_fcall *fcall;

	fcall = kmalloc(sizeof(struct p9_fcall) + alloc_msize, GFP_NOFS);
	if (fcall) {
		/* The data area lives in the same allocation, after the header. */
		fcall->sdata = (char *) fcall + sizeof(struct p9_fcall);
		fcall->capacity = alloc_msize;
	}
	return fcall;
}
/** /**
* p9_tag_alloc - lookup/allocate a request by tag * p9_tag_alloc - lookup/allocate a request by tag
* @c: client session to lookup tag within * @c: client session to lookup tag within
@ -256,39 +267,36 @@ p9_tag_alloc(struct p9_client *c, u16 tag, unsigned int max_size)
col = tag % P9_ROW_MAXTAG; col = tag % P9_ROW_MAXTAG;
req = &c->reqs[row][col]; req = &c->reqs[row][col];
if (!req->tc) { if (!req->wq) {
req->wq = kmalloc(sizeof(wait_queue_head_t), GFP_NOFS); req->wq = kmalloc(sizeof(wait_queue_head_t), GFP_NOFS);
if (!req->wq) { if (!req->wq)
pr_err("Couldn't grow tag array\n"); goto grow_failed;
return ERR_PTR(-ENOMEM);
}
init_waitqueue_head(req->wq); init_waitqueue_head(req->wq);
req->tc = kmalloc(sizeof(struct p9_fcall) + alloc_msize,
GFP_NOFS);
req->rc = kmalloc(sizeof(struct p9_fcall) + alloc_msize,
GFP_NOFS);
if ((!req->tc) || (!req->rc)) {
pr_err("Couldn't grow tag array\n");
kfree(req->tc);
kfree(req->rc);
kfree(req->wq);
req->tc = req->rc = NULL;
req->wq = NULL;
return ERR_PTR(-ENOMEM);
}
req->tc->capacity = alloc_msize;
req->rc->capacity = alloc_msize;
req->tc->sdata = (char *) req->tc + sizeof(struct p9_fcall);
req->rc->sdata = (char *) req->rc + sizeof(struct p9_fcall);
} }
if (!req->tc)
req->tc = p9_fcall_alloc(alloc_msize);
if (!req->rc)
req->rc = p9_fcall_alloc(alloc_msize);
if (!req->tc || !req->rc)
goto grow_failed;
p9pdu_reset(req->tc); p9pdu_reset(req->tc);
p9pdu_reset(req->rc); p9pdu_reset(req->rc);
req->tc->tag = tag-1; req->tc->tag = tag-1;
req->status = REQ_STATUS_ALLOC; req->status = REQ_STATUS_ALLOC;
return &c->reqs[row][col]; return req;
grow_failed:
pr_err("Couldn't grow tag array\n");
kfree(req->tc);
kfree(req->rc);
kfree(req->wq);
req->tc = req->rc = NULL;
req->wq = NULL;
return ERR_PTR(-ENOMEM);
} }
/** /**
@ -648,12 +656,20 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq)
return PTR_ERR(req); return PTR_ERR(req);
/* if we haven't received a response for oldreq, /*
remove it from the list. */ * if we haven't received a response for oldreq,
* remove it from the list, and notify the transport
* layer that the reply will never arrive.
*/
spin_lock(&c->lock); spin_lock(&c->lock);
if (oldreq->status == REQ_STATUS_FLSH) if (oldreq->status == REQ_STATUS_FLSH) {
list_del(&oldreq->req_list); list_del(&oldreq->req_list);
spin_unlock(&c->lock); spin_unlock(&c->lock);
if (c->trans_mod->cancelled)
c->trans_mod->cancelled(c, req);
} else {
spin_unlock(&c->lock);
}
p9_free_req(c, req); p9_free_req(c, req);
return 0; return 0;

View File

@ -63,6 +63,7 @@ struct p9_fd_opts {
int rfd; int rfd;
int wfd; int wfd;
u16 port; u16 port;
int privport;
}; };
/** /**
@ -87,12 +88,15 @@ struct p9_trans_fd {
enum { enum {
/* Options that take integer arguments */ /* Options that take integer arguments */
Opt_port, Opt_rfdno, Opt_wfdno, Opt_err, Opt_port, Opt_rfdno, Opt_wfdno, Opt_err,
/* Options that take no arguments */
Opt_privport,
}; };
static const match_table_t tokens = { static const match_table_t tokens = {
{Opt_port, "port=%u"}, {Opt_port, "port=%u"},
{Opt_rfdno, "rfdno=%u"}, {Opt_rfdno, "rfdno=%u"},
{Opt_wfdno, "wfdno=%u"}, {Opt_wfdno, "wfdno=%u"},
{Opt_privport, "privport"},
{Opt_err, NULL}, {Opt_err, NULL},
}; };
@ -161,6 +165,9 @@ static DEFINE_SPINLOCK(p9_poll_lock);
static LIST_HEAD(p9_poll_pending_list); static LIST_HEAD(p9_poll_pending_list);
static DECLARE_WORK(p9_poll_work, p9_poll_workfn); static DECLARE_WORK(p9_poll_work, p9_poll_workfn);
static unsigned int p9_ipport_resv_min = P9_DEF_MIN_RESVPORT;
static unsigned int p9_ipport_resv_max = P9_DEF_MAX_RESVPORT;
static void p9_mux_poll_stop(struct p9_conn *m) static void p9_mux_poll_stop(struct p9_conn *m)
{ {
unsigned long flags; unsigned long flags;
@ -741,7 +748,7 @@ static int parse_opts(char *params, struct p9_fd_opts *opts)
if (!*p) if (!*p)
continue; continue;
token = match_token(p, tokens, args); token = match_token(p, tokens, args);
if (token != Opt_err) { if ((token != Opt_err) && (token != Opt_privport)) {
r = match_int(&args[0], &option); r = match_int(&args[0], &option);
if (r < 0) { if (r < 0) {
p9_debug(P9_DEBUG_ERROR, p9_debug(P9_DEBUG_ERROR,
@ -759,6 +766,9 @@ static int parse_opts(char *params, struct p9_fd_opts *opts)
case Opt_wfdno: case Opt_wfdno:
opts->wfd = option; opts->wfd = option;
break; break;
case Opt_privport:
opts->privport = 1;
break;
default: default:
continue; continue;
} }
@ -898,6 +908,24 @@ static inline int valid_ipaddr4(const char *buf)
return 0; return 0;
} }
static int p9_bind_privport(struct socket *sock)
{
struct sockaddr_in cl;
int port, err = -EINVAL;
memset(&cl, 0, sizeof(cl));
cl.sin_family = AF_INET;
cl.sin_addr.s_addr = INADDR_ANY;
for (port = p9_ipport_resv_max; port >= p9_ipport_resv_min; port--) {
cl.sin_port = htons((ushort)port);
err = kernel_bind(sock, (struct sockaddr *)&cl, sizeof(cl));
if (err != -EADDRINUSE)
break;
}
return err;
}
static int static int
p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args)
{ {
@ -926,6 +954,16 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args)
return err; return err;
} }
if (opts.privport) {
err = p9_bind_privport(csocket);
if (err < 0) {
pr_err("%s (%d): problem binding to privport\n",
__func__, task_pid_nr(current));
sock_release(csocket);
return err;
}
}
err = csocket->ops->connect(csocket, err = csocket->ops->connect(csocket,
(struct sockaddr *)&sin_server, (struct sockaddr *)&sin_server,
sizeof(struct sockaddr_in), 0); sizeof(struct sockaddr_in), 0);

View File

@ -57,9 +57,7 @@
#define P9_RDMA_IRD 0 #define P9_RDMA_IRD 0
#define P9_RDMA_ORD 0 #define P9_RDMA_ORD 0
#define P9_RDMA_TIMEOUT 30000 /* 30 seconds */ #define P9_RDMA_TIMEOUT 30000 /* 30 seconds */
#define P9_RDMA_MAXSIZE (4*4096) /* Min SGE is 4, so we can #define P9_RDMA_MAXSIZE (1024*1024) /* 1MB */
* safely advertise a maxsize
* of 64k */
/** /**
* struct p9_trans_rdma - RDMA transport instance * struct p9_trans_rdma - RDMA transport instance
@ -75,7 +73,9 @@
* @sq_depth: The depth of the Send Queue * @sq_depth: The depth of the Send Queue
* @sq_sem: Semaphore for the SQ * @sq_sem: Semaphore for the SQ
* @rq_depth: The depth of the Receive Queue. * @rq_depth: The depth of the Receive Queue.
* @rq_count: Count of requests in the Receive Queue. * @rq_sem: Semaphore for the RQ
* @excess_rc : Amount of posted Receive Contexts without a pending request.
* See rdma_request()
* @addr: The remote peer's address * @addr: The remote peer's address
* @req_lock: Protects the active request list * @req_lock: Protects the active request list
* @cm_done: Completion event for connection management tracking * @cm_done: Completion event for connection management tracking
@ -100,7 +100,8 @@ struct p9_trans_rdma {
int sq_depth; int sq_depth;
struct semaphore sq_sem; struct semaphore sq_sem;
int rq_depth; int rq_depth;
atomic_t rq_count; struct semaphore rq_sem;
atomic_t excess_rc;
struct sockaddr_in addr; struct sockaddr_in addr;
spinlock_t req_lock; spinlock_t req_lock;
@ -296,6 +297,13 @@ handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma,
if (!req) if (!req)
goto err_out; goto err_out;
/* Check that we have not yet received a reply for this request.
*/
if (unlikely(req->rc)) {
pr_err("Duplicate reply for request %d", tag);
goto err_out;
}
req->rc = c->rc; req->rc = c->rc;
req->status = REQ_STATUS_RCVD; req->status = REQ_STATUS_RCVD;
p9_client_cb(client, req); p9_client_cb(client, req);
@ -336,8 +344,8 @@ static void cq_comp_handler(struct ib_cq *cq, void *cq_context)
switch (c->wc_op) { switch (c->wc_op) {
case IB_WC_RECV: case IB_WC_RECV:
atomic_dec(&rdma->rq_count);
handle_recv(client, rdma, c, wc.status, wc.byte_len); handle_recv(client, rdma, c, wc.status, wc.byte_len);
up(&rdma->rq_sem);
break; break;
case IB_WC_SEND: case IB_WC_SEND:
@ -421,32 +429,33 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
struct p9_rdma_context *c = NULL; struct p9_rdma_context *c = NULL;
struct p9_rdma_context *rpl_context = NULL; struct p9_rdma_context *rpl_context = NULL;
/* When an error occurs between posting the recv and the send,
* there will be a receive context posted without a pending request.
* Since there is no way to "un-post" it, we remember it and skip
* post_recv() for the next request.
* So here,
* see if we are this `next request' and need to absorb an excess rc.
* If yes, then drop and free our own, and do not recv_post().
**/
if (unlikely(atomic_read(&rdma->excess_rc) > 0)) {
if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) {
/* Got one ! */
kfree(req->rc);
req->rc = NULL;
goto dont_need_post_recv;
} else {
/* We raced and lost. */
atomic_inc(&rdma->excess_rc);
}
}
/* Allocate an fcall for the reply */ /* Allocate an fcall for the reply */
rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS); rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS);
if (!rpl_context) { if (!rpl_context) {
err = -ENOMEM; err = -ENOMEM;
goto err_close; goto recv_error;
}
/*
* If the request has a buffer, steal it, otherwise
* allocate a new one. Typically, requests should already
* have receive buffers allocated and just swap them around
*/
if (!req->rc) {
req->rc = kmalloc(sizeof(struct p9_fcall)+client->msize,
GFP_NOFS);
if (req->rc) {
req->rc->sdata = (char *) req->rc +
sizeof(struct p9_fcall);
req->rc->capacity = client->msize;
}
} }
rpl_context->rc = req->rc; rpl_context->rc = req->rc;
if (!rpl_context->rc) {
err = -ENOMEM;
goto err_free2;
}
/* /*
* Post a receive buffer for this request. We need to ensure * Post a receive buffer for this request. We need to ensure
@ -455,29 +464,35 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
* outstanding request, so we must keep a count to avoid * outstanding request, so we must keep a count to avoid
* overflowing the RQ. * overflowing the RQ.
*/ */
if (atomic_inc_return(&rdma->rq_count) <= rdma->rq_depth) { if (down_interruptible(&rdma->rq_sem)) {
err = post_recv(client, rpl_context); err = -EINTR;
if (err) goto recv_error;
goto err_free1; }
} else
atomic_dec(&rdma->rq_count);
err = post_recv(client, rpl_context);
if (err) {
p9_debug(P9_DEBUG_FCALL, "POST RECV failed\n");
goto recv_error;
}
/* remove posted receive buffer from request structure */ /* remove posted receive buffer from request structure */
req->rc = NULL; req->rc = NULL;
dont_need_post_recv:
/* Post the request */ /* Post the request */
c = kmalloc(sizeof *c, GFP_NOFS); c = kmalloc(sizeof *c, GFP_NOFS);
if (!c) { if (!c) {
err = -ENOMEM; err = -ENOMEM;
goto err_free1; goto send_error;
} }
c->req = req; c->req = req;
c->busa = ib_dma_map_single(rdma->cm_id->device, c->busa = ib_dma_map_single(rdma->cm_id->device,
c->req->tc->sdata, c->req->tc->size, c->req->tc->sdata, c->req->tc->size,
DMA_TO_DEVICE); DMA_TO_DEVICE);
if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) {
goto error; err = -EIO;
goto send_error;
}
sge.addr = c->busa; sge.addr = c->busa;
sge.length = c->req->tc->size; sge.length = c->req->tc->size;
@ -491,22 +506,32 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
wr.sg_list = &sge; wr.sg_list = &sge;
wr.num_sge = 1; wr.num_sge = 1;
if (down_interruptible(&rdma->sq_sem)) if (down_interruptible(&rdma->sq_sem)) {
goto error; err = -EINTR;
goto send_error;
}
return ib_post_send(rdma->qp, &wr, &bad_wr); err = ib_post_send(rdma->qp, &wr, &bad_wr);
if (err)
goto send_error;
error: /* Success */
return 0;
/* Handle errors that happened during or while preparing the send: */
send_error:
kfree(c); kfree(c);
kfree(rpl_context->rc); p9_debug(P9_DEBUG_ERROR, "Error %d in rdma_request()\n", err);
/* Ach.
* We did recv_post(), but not send. We have one recv_post in excess.
*/
atomic_inc(&rdma->excess_rc);
return err;
/* Handle errors that happened during or while preparing post_recv(): */
recv_error:
kfree(rpl_context); kfree(rpl_context);
p9_debug(P9_DEBUG_ERROR, "EIO\n");
return -EIO;
err_free1:
kfree(rpl_context->rc);
err_free2:
kfree(rpl_context);
err_close:
spin_lock_irqsave(&rdma->req_lock, flags); spin_lock_irqsave(&rdma->req_lock, flags);
if (rdma->state < P9_RDMA_CLOSING) { if (rdma->state < P9_RDMA_CLOSING) {
rdma->state = P9_RDMA_CLOSING; rdma->state = P9_RDMA_CLOSING;
@ -551,7 +576,8 @@ static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts)
spin_lock_init(&rdma->req_lock); spin_lock_init(&rdma->req_lock);
init_completion(&rdma->cm_done); init_completion(&rdma->cm_done);
sema_init(&rdma->sq_sem, rdma->sq_depth); sema_init(&rdma->sq_sem, rdma->sq_depth);
atomic_set(&rdma->rq_count, 0); sema_init(&rdma->rq_sem, rdma->rq_depth);
atomic_set(&rdma->excess_rc, 0);
return rdma; return rdma;
} }
@ -562,6 +588,17 @@ static int rdma_cancel(struct p9_client *client, struct p9_req_t *req)
return 1; return 1;
} }
/**
 * rdma_cancelled - note that a flushed request will never get its reply
 * @client: client instance; its ->trans is the RDMA transport state
 * @req: the request concerned (not examined here)
 *
 * A request has been fully flushed without a reply, which means one
 * buffer has been posted in excess.  Record it in excess_rc.
 *
 * Always returns 0.
 */
static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req)
{
	struct p9_trans_rdma *trans = client->trans;

	atomic_inc(&trans->excess_rc);

	return 0;
}
/** /**
* trans_create_rdma - Transport method for creating atransport instance * trans_create_rdma - Transport method for creating atransport instance
* @client: client instance * @client: client instance