2010-04-07 06:14:15 +08:00
|
|
|
#include <linux/ceph/ceph_debug.h>
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
#include <linux/crc32c.h>
|
|
|
|
#include <linux/ctype.h>
|
|
|
|
#include <linux/highmem.h>
|
|
|
|
#include <linux/inet.h>
|
|
|
|
#include <linux/kthread.h>
|
|
|
|
#include <linux/net.h>
|
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities to include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and tries to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
|
|
|
#include <linux/slab.h>
|
2009-10-07 02:31:13 +08:00
|
|
|
#include <linux/socket.h>
|
|
|
|
#include <linux/string.h>
|
2013-02-01 06:02:01 +08:00
|
|
|
#ifdef CONFIG_BLOCK
|
2010-04-07 06:01:27 +08:00
|
|
|
#include <linux/bio.h>
|
2013-02-01 06:02:01 +08:00
|
|
|
#endif /* CONFIG_BLOCK */
|
2011-09-24 02:48:42 +08:00
|
|
|
#include <linux/dns_resolver.h>
|
2009-10-07 02:31:13 +08:00
|
|
|
#include <net/tcp.h>
|
|
|
|
|
2010-04-07 06:14:15 +08:00
|
|
|
#include <linux/ceph/libceph.h>
|
|
|
|
#include <linux/ceph/messenger.h>
|
|
|
|
#include <linux/ceph/decode.h>
|
|
|
|
#include <linux/ceph/pagelist.h>
|
2011-07-15 23:47:34 +08:00
|
|
|
#include <linux/export.h>
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2013-03-07 13:39:39 +08:00
|
|
|
/*
 * Return the entry that follows @pos on the list it is linked into
 * through its @member field.
 *
 * @pos is parenthesized so that any pointer-valued expression (for
 * example "&obj" or "p + 1") expands correctly.  It is expanded twice,
 * so it must not have side effects.
 */
#define list_entry_next(pos, member)					\
	list_entry((pos)->member.next, typeof(*(pos)), member)
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
/*
|
|
|
|
* Ceph uses the messenger to exchange ceph_msg messages with other
|
|
|
|
* hosts in the system. The messenger provides ordered and reliable
|
|
|
|
* delivery. We tolerate TCP disconnects by reconnecting (with
|
|
|
|
* exponential backoff) in the case of a fault (disconnection, bad
|
|
|
|
* crc, protocol error). Acks allow sent messages to be discarded by
|
|
|
|
* the sender.
|
|
|
|
*/
|
|
|
|
|
2012-06-21 10:53:53 +08:00
|
|
|
/*
|
|
|
|
* We track the state of the socket on a given connection using
|
|
|
|
* values defined below. The transition to a new socket state is
|
|
|
|
* handled by a function which verifies we aren't coming from an
|
|
|
|
* unexpected state.
|
|
|
|
*
|
|
|
|
* --------
|
|
|
|
* | NEW* | transient initial state
|
|
|
|
* --------
|
|
|
|
* | con_sock_state_init()
|
|
|
|
* v
|
|
|
|
* ----------
|
|
|
|
* | CLOSED | initialized, but no socket (and no
|
|
|
|
* ---------- TCP connection)
|
|
|
|
* ^ \
|
|
|
|
* | \ con_sock_state_connecting()
|
|
|
|
* | ----------------------
|
|
|
|
* | \
|
|
|
|
* + con_sock_state_closed() \
|
2012-06-28 03:31:02 +08:00
|
|
|
* |+--------------------------- \
|
|
|
|
* | \ \ \
|
|
|
|
* | ----------- \ \
|
|
|
|
* | | CLOSING | socket event; \ \
|
|
|
|
* | ----------- await close \ \
|
|
|
|
* | ^ \ |
|
|
|
|
* | | \ |
|
|
|
|
* | + con_sock_state_closing() \ |
|
|
|
|
* | / \ | |
|
|
|
|
* | / --------------- | |
|
|
|
|
* | / \ v v
|
2012-06-21 10:53:53 +08:00
|
|
|
* | / --------------
|
|
|
|
* | / -----------------| CONNECTING | socket created, TCP
|
|
|
|
* | | / -------------- connect initiated
|
|
|
|
* | | | con_sock_state_connected()
|
|
|
|
* | | v
|
|
|
|
* -------------
|
|
|
|
* | CONNECTED | TCP connection established
|
|
|
|
* -------------
|
|
|
|
*
|
|
|
|
* State values for ceph_connection->sock_state; NEW is assumed to be 0.
|
|
|
|
*/
|
2012-05-23 11:15:49 +08:00
|
|
|
|
|
|
|
#define CON_SOCK_STATE_NEW 0 /* -> CLOSED */
|
|
|
|
#define CON_SOCK_STATE_CLOSED 1 /* -> CONNECTING */
|
|
|
|
#define CON_SOCK_STATE_CONNECTING 2 /* -> CONNECTED or -> CLOSING */
|
|
|
|
#define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */
|
|
|
|
#define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */
|
|
|
|
|
2012-07-21 08:24:40 +08:00
|
|
|
/*
|
|
|
|
* connection states
|
|
|
|
*/
|
|
|
|
#define CON_STATE_CLOSED 1 /* -> PREOPEN */
|
|
|
|
#define CON_STATE_PREOPEN 2 /* -> CONNECTING, CLOSED */
|
|
|
|
#define CON_STATE_CONNECTING 3 /* -> NEGOTIATING, CLOSED */
|
|
|
|
#define CON_STATE_NEGOTIATING 4 /* -> OPEN, CLOSED */
|
|
|
|
#define CON_STATE_OPEN 5 /* -> STANDBY, CLOSED */
|
|
|
|
#define CON_STATE_STANDBY 6 /* -> PREOPEN, CLOSED */
|
|
|
|
|
2012-07-21 08:29:55 +08:00
|
|
|
/*
|
|
|
|
* ceph_connection flag bits
|
|
|
|
*/
|
|
|
|
#define CON_FLAG_LOSSYTX 0 /* we can close channel or drop
|
|
|
|
* messages on errors */
|
|
|
|
#define CON_FLAG_KEEPALIVE_PENDING 1 /* we need to send a keepalive */
|
|
|
|
#define CON_FLAG_WRITE_PENDING 2 /* we have data ready to send */
|
|
|
|
#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */
|
|
|
|
#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */
|
2012-07-21 08:24:40 +08:00
|
|
|
|
2013-02-21 00:25:12 +08:00
|
|
|
static bool con_flag_valid(unsigned long con_flag)
|
|
|
|
{
|
|
|
|
switch (con_flag) {
|
|
|
|
case CON_FLAG_LOSSYTX:
|
|
|
|
case CON_FLAG_KEEPALIVE_PENDING:
|
|
|
|
case CON_FLAG_WRITE_PENDING:
|
|
|
|
case CON_FLAG_SOCK_CLOSED:
|
|
|
|
case CON_FLAG_BACKOFF:
|
|
|
|
return true;
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void con_flag_clear(struct ceph_connection *con, unsigned long con_flag)
|
|
|
|
{
|
|
|
|
BUG_ON(!con_flag_valid(con_flag));
|
|
|
|
|
|
|
|
clear_bit(con_flag, &con->flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void con_flag_set(struct ceph_connection *con, unsigned long con_flag)
|
|
|
|
{
|
|
|
|
BUG_ON(!con_flag_valid(con_flag));
|
|
|
|
|
|
|
|
set_bit(con_flag, &con->flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool con_flag_test(struct ceph_connection *con, unsigned long con_flag)
|
|
|
|
{
|
|
|
|
BUG_ON(!con_flag_valid(con_flag));
|
|
|
|
|
|
|
|
return test_bit(con_flag, &con->flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool con_flag_test_and_clear(struct ceph_connection *con,
|
|
|
|
unsigned long con_flag)
|
|
|
|
{
|
|
|
|
BUG_ON(!con_flag_valid(con_flag));
|
|
|
|
|
|
|
|
return test_and_clear_bit(con_flag, &con->flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool con_flag_test_and_set(struct ceph_connection *con,
|
|
|
|
unsigned long con_flag)
|
|
|
|
{
|
|
|
|
BUG_ON(!con_flag_valid(con_flag));
|
|
|
|
|
|
|
|
return test_and_set_bit(con_flag, &con->flags);
|
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
/* static tag bytes (protocol control messages) */
|
|
|
|
static char tag_msg = CEPH_MSGR_TAG_MSG;
|
|
|
|
static char tag_ack = CEPH_MSGR_TAG_ACK;
|
|
|
|
static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
|
|
|
|
|
2010-04-14 05:07:07 +08:00
|
|
|
#ifdef CONFIG_LOCKDEP
|
|
|
|
static struct lock_class_key socket_class;
|
|
|
|
#endif
|
|
|
|
|
2012-02-15 21:43:55 +08:00
|
|
|
/*
|
|
|
|
* When skipping (ignoring) a block of input we read it into a "skip
|
|
|
|
* buffer," which is this many bytes in size.
|
|
|
|
*/
|
|
|
|
#define SKIP_BUF_SIZE 1024
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
static void queue_con(struct ceph_connection *con);
|
|
|
|
static void con_work(struct work_struct *);
|
2013-02-20 02:25:57 +08:00
|
|
|
static void con_fault(struct ceph_connection *con);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
/*
|
2012-01-24 05:49:27 +08:00
|
|
|
* Nicely render a sockaddr as a string. An array of formatted
|
|
|
|
* strings is used, to approximate reentrancy.
|
2009-10-07 02:31:13 +08:00
|
|
|
*/
|
2012-01-24 05:49:27 +08:00
|
|
|
#define ADDR_STR_COUNT_LOG 5 /* log2(# address strings in array) */
|
|
|
|
#define ADDR_STR_COUNT (1 << ADDR_STR_COUNT_LOG)
|
|
|
|
#define ADDR_STR_COUNT_MASK (ADDR_STR_COUNT - 1)
|
|
|
|
#define MAX_ADDR_STR_LEN 64 /* 54 is enough */
|
|
|
|
|
|
|
|
static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN];
|
|
|
|
static atomic_t addr_str_seq = ATOMIC_INIT(0);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2012-01-24 05:49:27 +08:00
|
|
|
static struct page *zero_page; /* used in certain error cases */
|
|
|
|
|
2010-04-07 06:14:15 +08:00
|
|
|
const char *ceph_pr_addr(const struct sockaddr_storage *ss)
|
2009-10-07 02:31:13 +08:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
char *s;
|
2012-01-24 05:49:27 +08:00
|
|
|
struct sockaddr_in *in4 = (struct sockaddr_in *) ss;
|
|
|
|
struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss;
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2012-01-24 05:49:27 +08:00
|
|
|
i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK;
|
2009-10-07 02:31:13 +08:00
|
|
|
s = addr_str[i];
|
|
|
|
|
|
|
|
switch (ss->ss_family) {
|
|
|
|
case AF_INET:
|
2012-01-24 05:49:27 +08:00
|
|
|
snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%hu", &in4->sin_addr,
|
|
|
|
ntohs(in4->sin_port));
|
2009-10-07 02:31:13 +08:00
|
|
|
break;
|
|
|
|
|
|
|
|
case AF_INET6:
|
2012-01-24 05:49:27 +08:00
|
|
|
snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%hu", &in6->sin6_addr,
|
|
|
|
ntohs(in6->sin6_port));
|
2009-10-07 02:31:13 +08:00
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
2012-02-15 04:05:33 +08:00
|
|
|
snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)",
|
|
|
|
ss->ss_family);
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
2010-04-07 06:14:15 +08:00
|
|
|
EXPORT_SYMBOL(ceph_pr_addr);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2009-11-04 07:17:56 +08:00
|
|
|
static void encode_my_addr(struct ceph_messenger *msgr)
|
|
|
|
{
|
|
|
|
memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
|
|
|
|
ceph_encode_addr(&msgr->my_enc_addr);
|
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
/*
|
|
|
|
* work queue for all reading and writing to/from the socket.
|
|
|
|
*/
|
2012-02-15 04:05:33 +08:00
|
|
|
static struct workqueue_struct *ceph_msgr_wq;
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2013-02-20 02:25:56 +08:00
|
|
|
static void _ceph_msgr_exit(void)
|
2012-02-15 04:05:33 +08:00
|
|
|
{
|
2012-02-15 04:05:33 +08:00
|
|
|
if (ceph_msgr_wq) {
|
2012-02-15 04:05:33 +08:00
|
|
|
destroy_workqueue(ceph_msgr_wq);
|
2012-02-15 04:05:33 +08:00
|
|
|
ceph_msgr_wq = NULL;
|
|
|
|
}
|
2012-02-15 04:05:33 +08:00
|
|
|
|
|
|
|
BUG_ON(zero_page == NULL);
|
|
|
|
kunmap(zero_page);
|
|
|
|
page_cache_release(zero_page);
|
|
|
|
zero_page = NULL;
|
|
|
|
}
|
|
|
|
|
2010-04-07 06:14:15 +08:00
|
|
|
int ceph_msgr_init(void)
|
2009-10-07 02:31:13 +08:00
|
|
|
{
|
2012-01-24 05:49:27 +08:00
|
|
|
BUG_ON(zero_page != NULL);
|
|
|
|
zero_page = ZERO_PAGE(0);
|
|
|
|
page_cache_get(zero_page);
|
|
|
|
|
2011-01-03 21:49:46 +08:00
|
|
|
ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0);
|
2012-02-15 04:05:33 +08:00
|
|
|
if (ceph_msgr_wq)
|
|
|
|
return 0;
|
2012-01-24 05:49:27 +08:00
|
|
|
|
2012-02-15 04:05:33 +08:00
|
|
|
pr_err("msgr_init failed to create workqueue\n");
|
|
|
|
_ceph_msgr_exit();
|
2012-01-24 05:49:27 +08:00
|
|
|
|
2012-02-15 04:05:33 +08:00
|
|
|
return -ENOMEM;
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
2010-04-07 06:14:15 +08:00
|
|
|
EXPORT_SYMBOL(ceph_msgr_init);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
void ceph_msgr_exit(void)
|
|
|
|
{
|
2012-01-24 05:49:27 +08:00
|
|
|
BUG_ON(ceph_msgr_wq == NULL);
|
|
|
|
|
2012-02-15 04:05:33 +08:00
|
|
|
_ceph_msgr_exit();
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
2010-04-07 06:14:15 +08:00
|
|
|
EXPORT_SYMBOL(ceph_msgr_exit);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2010-06-12 07:58:48 +08:00
|
|
|
void ceph_msgr_flush(void)
|
2010-05-30 00:41:23 +08:00
|
|
|
{
|
|
|
|
flush_workqueue(ceph_msgr_wq);
|
|
|
|
}
|
2010-04-07 06:14:15 +08:00
|
|
|
EXPORT_SYMBOL(ceph_msgr_flush);
|
2010-05-30 00:41:23 +08:00
|
|
|
|
2012-05-23 11:15:49 +08:00
|
|
|
/* Connection socket state transition functions */
|
|
|
|
|
|
|
|
static void con_sock_state_init(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
int old_state;
|
|
|
|
|
|
|
|
old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED);
|
|
|
|
if (WARN_ON(old_state != CON_SOCK_STATE_NEW))
|
|
|
|
printk("%s: unexpected old state %d\n", __func__, old_state);
|
2012-07-31 09:16:16 +08:00
|
|
|
dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
|
|
|
|
CON_SOCK_STATE_CLOSED);
|
2012-05-23 11:15:49 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void con_sock_state_connecting(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
int old_state;
|
|
|
|
|
|
|
|
old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTING);
|
|
|
|
if (WARN_ON(old_state != CON_SOCK_STATE_CLOSED))
|
|
|
|
printk("%s: unexpected old state %d\n", __func__, old_state);
|
2012-07-31 09:16:16 +08:00
|
|
|
dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
|
|
|
|
CON_SOCK_STATE_CONNECTING);
|
2012-05-23 11:15:49 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void con_sock_state_connected(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
int old_state;
|
|
|
|
|
|
|
|
old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTED);
|
|
|
|
if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING))
|
|
|
|
printk("%s: unexpected old state %d\n", __func__, old_state);
|
2012-07-31 09:16:16 +08:00
|
|
|
dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
|
|
|
|
CON_SOCK_STATE_CONNECTED);
|
2012-05-23 11:15:49 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void con_sock_state_closing(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
int old_state;
|
|
|
|
|
|
|
|
old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSING);
|
|
|
|
if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING &&
|
|
|
|
old_state != CON_SOCK_STATE_CONNECTED &&
|
|
|
|
old_state != CON_SOCK_STATE_CLOSING))
|
|
|
|
printk("%s: unexpected old state %d\n", __func__, old_state);
|
2012-07-31 09:16:16 +08:00
|
|
|
dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
|
|
|
|
CON_SOCK_STATE_CLOSING);
|
2012-05-23 11:15:49 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void con_sock_state_closed(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
int old_state;
|
|
|
|
|
|
|
|
old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED);
|
|
|
|
if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED &&
|
2012-06-28 03:31:02 +08:00
|
|
|
old_state != CON_SOCK_STATE_CLOSING &&
|
2012-07-31 09:16:16 +08:00
|
|
|
old_state != CON_SOCK_STATE_CONNECTING &&
|
|
|
|
old_state != CON_SOCK_STATE_CLOSED))
|
2012-05-23 11:15:49 +08:00
|
|
|
printk("%s: unexpected old state %d\n", __func__, old_state);
|
2012-07-31 09:16:16 +08:00
|
|
|
dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
|
|
|
|
CON_SOCK_STATE_CLOSED);
|
2012-05-23 11:15:49 +08:00
|
|
|
}
|
2010-05-30 00:41:23 +08:00
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
/*
|
|
|
|
* socket callback functions
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* data available on socket, or listen socket received a connect */
|
2012-05-23 00:41:43 +08:00
|
|
|
static void ceph_sock_data_ready(struct sock *sk, int count_unused)
|
2009-10-07 02:31:13 +08:00
|
|
|
{
|
2012-01-24 05:49:27 +08:00
|
|
|
struct ceph_connection *con = sk->sk_user_data;
|
2012-07-09 10:50:33 +08:00
|
|
|
if (atomic_read(&con->msgr->stopping)) {
|
|
|
|
return;
|
|
|
|
}
|
2012-01-24 05:49:27 +08:00
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
if (sk->sk_state != TCP_CLOSE_WAIT) {
|
2012-05-23 00:41:43 +08:00
|
|
|
dout("%s on %p state = %lu, queueing work\n", __func__,
|
2009-10-07 02:31:13 +08:00
|
|
|
con, con->state);
|
|
|
|
queue_con(con);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* socket has buffer space for writing */
|
2012-05-23 00:41:43 +08:00
|
|
|
static void ceph_sock_write_space(struct sock *sk)
|
2009-10-07 02:31:13 +08:00
|
|
|
{
|
2012-02-15 04:05:33 +08:00
|
|
|
struct ceph_connection *con = sk->sk_user_data;
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2012-02-29 23:30:58 +08:00
|
|
|
/* only queue to workqueue if there is data we want to write,
|
|
|
|
* and there is sufficient space in the socket buffer to accept
|
2012-05-23 00:41:43 +08:00
|
|
|
* more data. clear SOCK_NOSPACE so that ceph_sock_write_space()
|
2012-02-29 23:30:58 +08:00
|
|
|
* doesn't get called again until try_write() fills the socket
|
|
|
|
* buffer. See net/ipv4/tcp_input.c:tcp_check_space()
|
|
|
|
* and net/core/stream.c:sk_stream_write_space().
|
|
|
|
*/
|
2013-02-21 00:25:12 +08:00
|
|
|
if (con_flag_test(con, CON_FLAG_WRITE_PENDING)) {
|
2012-02-29 23:30:58 +08:00
|
|
|
if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
|
2012-05-23 00:41:43 +08:00
|
|
|
dout("%s %p queueing write work\n", __func__, con);
|
2012-02-29 23:30:58 +08:00
|
|
|
clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
|
|
|
|
queue_con(con);
|
|
|
|
}
|
2009-10-07 02:31:13 +08:00
|
|
|
} else {
|
2012-05-23 00:41:43 +08:00
|
|
|
dout("%s %p nothing to write\n", __func__, con);
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* socket's state has changed */
|
2012-05-23 00:41:43 +08:00
|
|
|
static void ceph_sock_state_change(struct sock *sk)
|
2009-10-07 02:31:13 +08:00
|
|
|
{
|
2012-01-24 05:49:27 +08:00
|
|
|
struct ceph_connection *con = sk->sk_user_data;
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2012-05-23 00:41:43 +08:00
|
|
|
dout("%s %p state = %lu sk_state = %u\n", __func__,
|
2009-10-07 02:31:13 +08:00
|
|
|
con, con->state, sk->sk_state);
|
|
|
|
|
|
|
|
switch (sk->sk_state) {
|
|
|
|
case TCP_CLOSE:
|
2012-05-23 00:41:43 +08:00
|
|
|
dout("%s TCP_CLOSE\n", __func__);
|
2009-10-07 02:31:13 +08:00
|
|
|
case TCP_CLOSE_WAIT:
|
2012-05-23 00:41:43 +08:00
|
|
|
dout("%s TCP_CLOSE_WAIT\n", __func__);
|
2012-05-23 11:15:49 +08:00
|
|
|
con_sock_state_closing(con);
|
2013-02-21 00:25:12 +08:00
|
|
|
con_flag_set(con, CON_FLAG_SOCK_CLOSED);
|
2012-06-21 10:53:53 +08:00
|
|
|
queue_con(con);
|
2009-10-07 02:31:13 +08:00
|
|
|
break;
|
|
|
|
case TCP_ESTABLISHED:
|
2012-05-23 00:41:43 +08:00
|
|
|
dout("%s TCP_ESTABLISHED\n", __func__);
|
2012-05-23 11:15:49 +08:00
|
|
|
con_sock_state_connected(con);
|
2009-10-07 02:31:13 +08:00
|
|
|
queue_con(con);
|
|
|
|
break;
|
2012-02-15 04:05:33 +08:00
|
|
|
default: /* Everything else is uninteresting */
|
|
|
|
break;
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* set up socket callbacks
|
|
|
|
*/
|
|
|
|
static void set_sock_callbacks(struct socket *sock,
|
|
|
|
struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
struct sock *sk = sock->sk;
|
2012-01-24 05:49:27 +08:00
|
|
|
sk->sk_user_data = con;
|
2012-05-23 00:41:43 +08:00
|
|
|
sk->sk_data_ready = ceph_sock_data_ready;
|
|
|
|
sk->sk_write_space = ceph_sock_write_space;
|
|
|
|
sk->sk_state_change = ceph_sock_state_change;
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* socket helpers
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* initiate connection to a remote socket.
|
|
|
|
*/
|
2012-02-15 04:05:33 +08:00
|
|
|
static int ceph_tcp_connect(struct ceph_connection *con)
|
2009-10-07 02:31:13 +08:00
|
|
|
{
|
2010-07-02 06:18:31 +08:00
|
|
|
struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
|
2009-10-07 02:31:13 +08:00
|
|
|
struct socket *sock;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
BUG_ON(con->sock);
|
2010-07-02 06:18:31 +08:00
|
|
|
ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
|
|
|
|
IPPROTO_TCP, &sock);
|
2009-10-07 02:31:13 +08:00
|
|
|
if (ret)
|
2012-02-15 04:05:33 +08:00
|
|
|
return ret;
|
2009-10-07 02:31:13 +08:00
|
|
|
sock->sk->sk_allocation = GFP_NOFS;
|
|
|
|
|
2010-04-14 05:07:07 +08:00
|
|
|
#ifdef CONFIG_LOCKDEP
|
|
|
|
lockdep_set_class(&sock->sk->sk_lock, &socket_class);
|
|
|
|
#endif
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
set_sock_callbacks(sock, con);
|
|
|
|
|
2010-04-07 06:14:15 +08:00
|
|
|
dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr));
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2012-06-10 05:19:21 +08:00
|
|
|
con_sock_state_connecting(con);
|
2010-07-02 06:18:31 +08:00
|
|
|
ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
|
|
|
|
O_NONBLOCK);
|
2009-10-07 02:31:13 +08:00
|
|
|
if (ret == -EINPROGRESS) {
|
|
|
|
dout("connect %s EINPROGRESS sk_state = %u\n",
|
2010-04-07 06:14:15 +08:00
|
|
|
ceph_pr_addr(&con->peer_addr.in_addr),
|
2009-10-07 02:31:13 +08:00
|
|
|
sock->sk->sk_state);
|
2012-01-24 05:49:27 +08:00
|
|
|
} else if (ret < 0) {
|
2009-10-07 02:31:13 +08:00
|
|
|
pr_err("connect %s error %d\n",
|
2010-04-07 06:14:15 +08:00
|
|
|
ceph_pr_addr(&con->peer_addr.in_addr), ret);
|
2009-10-07 02:31:13 +08:00
|
|
|
sock_release(sock);
|
|
|
|
con->error_msg = "connect error";
|
|
|
|
|
2012-02-15 04:05:33 +08:00
|
|
|
return ret;
|
2012-01-24 05:49:27 +08:00
|
|
|
}
|
|
|
|
con->sock = sock;
|
2012-02-15 04:05:33 +08:00
|
|
|
return 0;
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
|
|
|
|
{
|
|
|
|
struct kvec iov = {buf, len};
|
|
|
|
struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
|
2011-01-26 00:17:48 +08:00
|
|
|
int r;
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2011-01-26 00:17:48 +08:00
|
|
|
r = kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
|
|
|
|
if (r == -EAGAIN)
|
|
|
|
r = 0;
|
|
|
|
return r;
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
|
2013-03-09 10:58:59 +08:00
|
|
|
static int ceph_tcp_recvpage(struct socket *sock, struct page *page,
|
|
|
|
int page_offset, size_t length)
|
|
|
|
{
|
|
|
|
void *kaddr;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
BUG_ON(page_offset + length > PAGE_SIZE);
|
|
|
|
|
|
|
|
kaddr = kmap(page);
|
|
|
|
BUG_ON(!kaddr);
|
|
|
|
ret = ceph_tcp_recvmsg(sock, kaddr + page_offset, length);
|
|
|
|
kunmap(page);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
/*
|
|
|
|
* write something. @more is true if caller will be sending more data
|
|
|
|
* shortly.
|
|
|
|
*/
|
|
|
|
static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
|
|
|
|
size_t kvlen, size_t len, int more)
|
|
|
|
{
|
|
|
|
struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
|
2011-01-26 00:19:34 +08:00
|
|
|
int r;
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
if (more)
|
|
|
|
msg.msg_flags |= MSG_MORE;
|
|
|
|
else
|
|
|
|
msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
|
|
|
|
|
2011-01-26 00:19:34 +08:00
|
|
|
r = kernel_sendmsg(sock, &msg, iov, kvlen, len);
|
|
|
|
if (r == -EAGAIN)
|
|
|
|
r = 0;
|
|
|
|
return r;
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
|
2012-03-08 01:40:08 +08:00
|
|
|
static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
|
2013-03-07 13:39:38 +08:00
|
|
|
int offset, size_t size, bool more)
|
2012-03-08 01:40:08 +08:00
|
|
|
{
|
|
|
|
int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR);
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = kernel_sendpage(sock, page, offset, size, flags);
|
|
|
|
if (ret == -EAGAIN)
|
|
|
|
ret = 0;
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Shutdown/close the socket for the given connection.
|
|
|
|
*/
|
|
|
|
static int con_close_socket(struct ceph_connection *con)
|
|
|
|
{
|
2012-07-31 09:16:16 +08:00
|
|
|
int rc = 0;
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
dout("con_close_socket on %p sock %p\n", con, con->sock);
|
2012-07-31 09:16:16 +08:00
|
|
|
if (con->sock) {
|
|
|
|
rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
|
|
|
|
sock_release(con->sock);
|
|
|
|
con->sock = NULL;
|
|
|
|
}
|
2012-06-21 10:53:53 +08:00
|
|
|
|
|
|
|
/*
|
2012-07-21 08:29:55 +08:00
|
|
|
* Forcibly clear the SOCK_CLOSED flag. It gets set
|
2012-06-21 10:53:53 +08:00
|
|
|
* independent of the connection mutex, and we could have
|
|
|
|
* received a socket close event before we had the chance to
|
|
|
|
* shut the socket down.
|
|
|
|
*/
|
2013-02-21 00:25:12 +08:00
|
|
|
con_flag_clear(con, CON_FLAG_SOCK_CLOSED);
|
2012-07-31 09:16:16 +08:00
|
|
|
|
2012-05-23 11:15:49 +08:00
|
|
|
con_sock_state_closed(con);
|
2009-10-07 02:31:13 +08:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Reset a connection. Discard all incoming and outgoing messages
|
|
|
|
* and clear *_seq state.
|
|
|
|
*/
|
|
|
|
static void ceph_msg_remove(struct ceph_msg *msg)
|
|
|
|
{
|
|
|
|
list_del_init(&msg->list_head);
|
2012-06-02 03:56:43 +08:00
|
|
|
BUG_ON(msg->con == NULL);
|
2012-06-22 03:47:08 +08:00
|
|
|
msg->con->ops->put(msg->con);
|
2012-06-02 03:56:43 +08:00
|
|
|
msg->con = NULL;
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
ceph_msg_put(msg);
|
|
|
|
}
|
|
|
|
static void ceph_msg_remove_list(struct list_head *head)
|
|
|
|
{
|
|
|
|
while (!list_empty(head)) {
|
|
|
|
struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
|
|
|
|
list_head);
|
|
|
|
ceph_msg_remove(msg);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void reset_connection(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
/* reset connection, out_queue, msg_ and connect_seq */
|
|
|
|
/* discard existing out_queue and msg_seq */
|
2012-12-28 10:27:04 +08:00
|
|
|
dout("reset_connection %p\n", con);
|
2009-10-07 02:31:13 +08:00
|
|
|
ceph_msg_remove_list(&con->out_queue);
|
|
|
|
ceph_msg_remove_list(&con->out_sent);
|
|
|
|
|
2009-12-12 01:48:05 +08:00
|
|
|
if (con->in_msg) {
|
2012-06-02 03:56:43 +08:00
|
|
|
BUG_ON(con->in_msg->con != con);
|
|
|
|
con->in_msg->con = NULL;
|
2009-12-12 01:48:05 +08:00
|
|
|
ceph_msg_put(con->in_msg);
|
|
|
|
con->in_msg = NULL;
|
2012-06-22 03:47:08 +08:00
|
|
|
con->ops->put(con);
|
2009-12-12 01:48:05 +08:00
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
con->connect_seq = 0;
|
|
|
|
con->out_seq = 0;
|
2009-12-15 06:04:30 +08:00
|
|
|
if (con->out_msg) {
|
|
|
|
ceph_msg_put(con->out_msg);
|
|
|
|
con->out_msg = NULL;
|
|
|
|
}
|
2009-10-07 02:31:13 +08:00
|
|
|
con->in_seq = 0;
|
2010-04-03 07:07:19 +08:00
|
|
|
con->in_seq_acked = 0;
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* mark a peer down. drop any open connections.
|
|
|
|
*/
|
|
|
|
void ceph_con_close(struct ceph_connection *con)
|
|
|
|
{
|
2012-07-31 07:24:37 +08:00
|
|
|
mutex_lock(&con->mutex);
|
2010-04-07 06:14:15 +08:00
|
|
|
dout("con_close %p peer %s\n", con,
|
|
|
|
ceph_pr_addr(&con->peer_addr.in_addr));
|
2012-07-21 08:24:40 +08:00
|
|
|
con->state = CON_STATE_CLOSED;
|
2012-05-30 00:04:58 +08:00
|
|
|
|
2013-02-21 00:25:12 +08:00
|
|
|
con_flag_clear(con, CON_FLAG_LOSSYTX); /* so we retry next connect */
|
|
|
|
con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING);
|
|
|
|
con_flag_clear(con, CON_FLAG_WRITE_PENDING);
|
|
|
|
con_flag_clear(con, CON_FLAG_BACKOFF);
|
2012-05-30 00:04:58 +08:00
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
reset_connection(con);
|
2010-04-03 07:16:34 +08:00
|
|
|
con->peer_global_seq = 0;
|
2010-02-16 04:05:09 +08:00
|
|
|
cancel_delayed_work(&con->work);
|
2012-07-21 07:45:49 +08:00
|
|
|
con_close_socket(con);
|
2009-12-23 02:43:42 +08:00
|
|
|
mutex_unlock(&con->mutex);
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
2010-04-07 06:14:15 +08:00
|
|
|
EXPORT_SYMBOL(ceph_con_close);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Reopen a closed connection, with a new peer address.
|
|
|
|
*/
|
2012-06-28 03:24:08 +08:00
|
|
|
void ceph_con_open(struct ceph_connection *con,
|
|
|
|
__u8 entity_type, __u64 entity_num,
|
|
|
|
struct ceph_entity_addr *addr)
|
2009-10-07 02:31:13 +08:00
|
|
|
{
|
2012-07-31 07:21:40 +08:00
|
|
|
mutex_lock(&con->mutex);
|
2010-04-07 06:14:15 +08:00
|
|
|
dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
|
2012-07-21 08:24:40 +08:00
|
|
|
|
2012-12-27 00:43:57 +08:00
|
|
|
WARN_ON(con->state != CON_STATE_CLOSED);
|
2012-07-21 08:24:40 +08:00
|
|
|
con->state = CON_STATE_PREOPEN;
|
2012-05-30 00:04:58 +08:00
|
|
|
|
2012-06-28 03:24:08 +08:00
|
|
|
con->peer_name.type = (__u8) entity_type;
|
|
|
|
con->peer_name.num = cpu_to_le64(entity_num);
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
memcpy(&con->peer_addr, addr, sizeof(*addr));
|
2009-11-21 07:14:15 +08:00
|
|
|
con->delay = 0; /* reset backoff memory */
|
2012-07-31 07:21:40 +08:00
|
|
|
mutex_unlock(&con->mutex);
|
2009-10-07 02:31:13 +08:00
|
|
|
queue_con(con);
|
|
|
|
}
|
2010-04-07 06:14:15 +08:00
|
|
|
EXPORT_SYMBOL(ceph_con_open);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
ceph: avoid reopening osd connections when address hasn't changed
We get a fault callback on _every_ tcp connection fault. Normally, we
want to reopen the connection when that happens. If the address we have
is bad, however, and connection attempts always result in a connection
refused or similar error, explicitly closing and reopening the msgr
connection just prevents the messenger's backoff logic from kicking in.
The result can be a console full of
[ 3974.417106] ceph: osd11 10.3.14.138:6800 connection failed
[ 3974.423295] ceph: osd11 10.3.14.138:6800 connection failed
[ 3974.429709] ceph: osd11 10.3.14.138:6800 connection failed
Instead, if we get a fault, and have outstanding requests, but the osd
address hasn't changed and the connection never successfully connected in
the first place, do nothing to the osd connection. The messenger layer
will back off and retry periodically, because we never connected and thus
the lossy bit is not set.
Instead, touch each request's r_stamp so that handle_timeout can tell the
request is still alive and kicking.
Signed-off-by: Sage Weil <sage@newdream.net>
2010-03-23 05:51:18 +08:00
|
|
|
/*
|
|
|
|
* return true if this connection ever successfully opened
|
|
|
|
*/
|
|
|
|
bool ceph_con_opened(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
return con->connect_seq > 0;
|
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
/*
|
|
|
|
* initialize a new connection.
|
|
|
|
*/
|
2012-05-27 12:26:43 +08:00
|
|
|
void ceph_con_init(struct ceph_connection *con, void *private,
|
|
|
|
const struct ceph_connection_operations *ops,
|
2012-06-28 03:24:08 +08:00
|
|
|
struct ceph_messenger *msgr)
|
2009-10-07 02:31:13 +08:00
|
|
|
{
|
|
|
|
dout("con_init %p\n", con);
|
|
|
|
memset(con, 0, sizeof(*con));
|
2012-05-27 12:26:43 +08:00
|
|
|
con->private = private;
|
|
|
|
con->ops = ops;
|
2009-10-07 02:31:13 +08:00
|
|
|
con->msgr = msgr;
|
2012-05-23 11:15:49 +08:00
|
|
|
|
|
|
|
con_sock_state_init(con);
|
|
|
|
|
2009-12-23 02:43:42 +08:00
|
|
|
mutex_init(&con->mutex);
|
2009-10-07 02:31:13 +08:00
|
|
|
INIT_LIST_HEAD(&con->out_queue);
|
|
|
|
INIT_LIST_HEAD(&con->out_sent);
|
|
|
|
INIT_DELAYED_WORK(&con->work, con_work);
|
2012-05-30 00:04:58 +08:00
|
|
|
|
2012-07-21 08:24:40 +08:00
|
|
|
con->state = CON_STATE_CLOSED;
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
2010-04-07 06:14:15 +08:00
|
|
|
EXPORT_SYMBOL(ceph_con_init);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We maintain a global counter to order connection attempts. Get
|
|
|
|
* a unique seq greater than @gt.
|
|
|
|
*/
|
|
|
|
static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
|
|
|
|
{
|
|
|
|
u32 ret;
|
|
|
|
|
|
|
|
spin_lock(&msgr->global_seq_lock);
|
|
|
|
if (msgr->global_seq < gt)
|
|
|
|
msgr->global_seq = gt;
|
|
|
|
ret = ++msgr->global_seq;
|
|
|
|
spin_unlock(&msgr->global_seq_lock);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-05-24 03:35:23 +08:00
|
|
|
static void con_out_kvec_reset(struct ceph_connection *con)
|
2012-02-15 04:05:33 +08:00
|
|
|
{
|
|
|
|
con->out_kvec_left = 0;
|
|
|
|
con->out_kvec_bytes = 0;
|
|
|
|
con->out_kvec_cur = &con->out_kvec[0];
|
|
|
|
}
|
|
|
|
|
2012-05-24 03:35:23 +08:00
|
|
|
static void con_out_kvec_add(struct ceph_connection *con,
|
2012-02-15 04:05:33 +08:00
|
|
|
size_t size, void *data)
|
|
|
|
{
|
|
|
|
int index;
|
|
|
|
|
|
|
|
index = con->out_kvec_left;
|
|
|
|
BUG_ON(index >= ARRAY_SIZE(con->out_kvec));
|
|
|
|
|
|
|
|
con->out_kvec[index].iov_len = size;
|
|
|
|
con->out_kvec[index].iov_base = data;
|
|
|
|
con->out_kvec_left++;
|
|
|
|
con->out_kvec_bytes += size;
|
|
|
|
}
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2012-06-12 03:57:13 +08:00
|
|
|
#ifdef CONFIG_BLOCK
|
2013-02-16 12:10:17 +08:00
|
|
|
static void init_bio_iter(struct bio *bio, struct bio **bio_iter,
|
|
|
|
unsigned int *bio_seg)
|
2012-06-12 03:57:13 +08:00
|
|
|
{
|
|
|
|
if (!bio) {
|
2013-02-16 12:10:17 +08:00
|
|
|
*bio_iter = NULL;
|
|
|
|
*bio_seg = 0;
|
2012-06-12 03:57:13 +08:00
|
|
|
return;
|
|
|
|
}
|
2013-02-16 12:10:17 +08:00
|
|
|
*bio_iter = bio;
|
|
|
|
*bio_seg = (unsigned int) bio->bi_idx;
|
2012-06-12 03:57:13 +08:00
|
|
|
}
|
|
|
|
|
2013-02-16 12:10:17 +08:00
|
|
|
static void iter_bio_next(struct bio **bio_iter, unsigned int *seg)
|
2012-06-12 03:57:13 +08:00
|
|
|
{
|
|
|
|
if (*bio_iter == NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
BUG_ON(*seg >= (*bio_iter)->bi_vcnt);
|
|
|
|
|
|
|
|
(*seg)++;
|
|
|
|
if (*seg == (*bio_iter)->bi_vcnt)
|
|
|
|
init_bio_iter((*bio_iter)->bi_next, bio_iter, seg);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2013-03-07 13:39:39 +08:00
|
|
|
/*
|
|
|
|
* Message data is handled (sent or received) in pieces, where each
|
|
|
|
* piece resides on a single page. The network layer might not
|
|
|
|
* consume an entire piece at once. A data item's cursor keeps
|
|
|
|
* track of which piece is next to process and how much remains to
|
|
|
|
* be processed in that piece. It also tracks whether the current
|
|
|
|
* piece is the last one in the data item.
|
|
|
|
*/
|
|
|
|
static void ceph_msg_data_cursor_init(struct ceph_msg_data *data)
|
|
|
|
{
|
|
|
|
struct ceph_msg_data_cursor *cursor = &data->cursor;
|
|
|
|
struct ceph_pagelist *pagelist;
|
|
|
|
struct page *page;
|
|
|
|
|
|
|
|
if (data->type != CEPH_MSG_DATA_PAGELIST)
|
|
|
|
return;
|
|
|
|
|
|
|
|
pagelist = data->pagelist;
|
|
|
|
BUG_ON(!pagelist);
|
|
|
|
if (!pagelist->length)
|
|
|
|
return; /* pagelist can be assigned but empty */
|
|
|
|
|
|
|
|
BUG_ON(list_empty(&pagelist->head));
|
|
|
|
page = list_first_entry(&pagelist->head, struct page, lru);
|
|
|
|
|
|
|
|
cursor->page = page;
|
|
|
|
cursor->offset = 0;
|
|
|
|
cursor->last_piece = pagelist->length <= PAGE_SIZE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return the page containing the next piece to process for a given
|
|
|
|
* data item, and supply the page offset and length of that piece.
|
|
|
|
* Indicate whether this is the last piece in this data item.
|
|
|
|
*/
|
|
|
|
static struct page *ceph_msg_data_next(struct ceph_msg_data *data,
|
|
|
|
size_t *page_offset,
|
|
|
|
size_t *length,
|
|
|
|
bool *last_piece)
|
|
|
|
{
|
|
|
|
struct ceph_msg_data_cursor *cursor = &data->cursor;
|
|
|
|
struct ceph_pagelist *pagelist;
|
|
|
|
size_t piece_end;
|
|
|
|
|
|
|
|
BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
|
|
|
|
|
|
|
|
pagelist = data->pagelist;
|
|
|
|
BUG_ON(!pagelist);
|
|
|
|
|
|
|
|
BUG_ON(!cursor->page);
|
|
|
|
BUG_ON(cursor->offset >= pagelist->length);
|
|
|
|
|
|
|
|
*last_piece = cursor->last_piece;
|
|
|
|
if (*last_piece) {
|
|
|
|
/* pagelist offset is always 0 */
|
|
|
|
piece_end = pagelist->length & ~PAGE_MASK;
|
|
|
|
if (!piece_end)
|
|
|
|
piece_end = PAGE_SIZE;
|
|
|
|
} else {
|
|
|
|
piece_end = PAGE_SIZE;
|
|
|
|
}
|
|
|
|
*page_offset = cursor->offset & ~PAGE_MASK;
|
|
|
|
*length = piece_end - *page_offset;
|
|
|
|
|
|
|
|
return data->cursor.page;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Returns true if the result moves the cursor on to the next piece
|
|
|
|
* (the next page) of the pagelist.
|
|
|
|
*/
|
|
|
|
static bool ceph_msg_data_advance(struct ceph_msg_data *data, size_t bytes)
|
|
|
|
{
|
|
|
|
struct ceph_msg_data_cursor *cursor = &data->cursor;
|
|
|
|
struct ceph_pagelist *pagelist;
|
|
|
|
|
|
|
|
BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
|
|
|
|
|
|
|
|
pagelist = data->pagelist;
|
|
|
|
BUG_ON(!pagelist);
|
|
|
|
BUG_ON(!cursor->page);
|
|
|
|
BUG_ON(cursor->offset + bytes > pagelist->length);
|
|
|
|
BUG_ON((cursor->offset & ~PAGE_MASK) + bytes > PAGE_SIZE);
|
|
|
|
|
|
|
|
/* Advance the cursor offset */
|
|
|
|
|
|
|
|
cursor->offset += bytes;
|
|
|
|
/* pagelist offset is always 0 */
|
|
|
|
if (!bytes || cursor->offset & ~PAGE_MASK)
|
|
|
|
return false; /* more bytes to process in the current page */
|
|
|
|
|
|
|
|
/* Move on to the next page */
|
|
|
|
|
|
|
|
BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head));
|
|
|
|
cursor->page = list_entry_next(cursor->page, lru);
|
|
|
|
|
|
|
|
/* cursor offset is at page boundary; pagelist offset is always 0 */
|
|
|
|
if (pagelist->length - cursor->offset <= PAGE_SIZE)
|
|
|
|
cursor->last_piece = true;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2013-03-07 13:39:39 +08:00
|
|
|
static void prepare_message_data(struct ceph_msg *msg,
|
|
|
|
struct ceph_msg_pos *msg_pos)
|
2012-06-12 03:57:13 +08:00
|
|
|
{
|
|
|
|
BUG_ON(!msg);
|
|
|
|
BUG_ON(!msg->hdr.data_len);
|
|
|
|
|
|
|
|
/* initialize page iterator */
|
2013-03-07 13:39:38 +08:00
|
|
|
msg_pos->page = 0;
|
2013-03-02 08:00:16 +08:00
|
|
|
if (ceph_msg_has_pages(msg))
|
2013-03-02 08:00:16 +08:00
|
|
|
msg_pos->page_pos = msg->p.alignment;
|
2012-06-12 03:57:13 +08:00
|
|
|
else
|
2013-03-07 13:39:38 +08:00
|
|
|
msg_pos->page_pos = 0;
|
2012-06-12 03:57:13 +08:00
|
|
|
#ifdef CONFIG_BLOCK
|
2013-03-02 08:00:16 +08:00
|
|
|
if (ceph_msg_has_bio(msg))
|
2013-03-02 08:00:16 +08:00
|
|
|
init_bio_iter(msg->b.bio, &msg->b.bio_iter, &msg->b.bio_seg);
|
2012-06-12 03:57:13 +08:00
|
|
|
#endif
|
2013-03-07 13:39:38 +08:00
|
|
|
msg_pos->data_pos = 0;
|
2013-03-07 13:39:39 +08:00
|
|
|
|
|
|
|
/* If there's a trail, initialize its cursor */
|
|
|
|
|
|
|
|
if (ceph_msg_has_trail(msg))
|
|
|
|
ceph_msg_data_cursor_init(&msg->t);
|
|
|
|
|
2013-03-07 13:39:38 +08:00
|
|
|
msg_pos->did_page_crc = false;
|
2012-06-12 03:57:13 +08:00
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
/*
|
|
|
|
* Prepare footer for currently outgoing message, and finish things
|
|
|
|
* off. Assumes out_kvec* are already valid.. we just add on to the end.
|
|
|
|
*/
|
2012-02-15 04:05:33 +08:00
|
|
|
static void prepare_write_message_footer(struct ceph_connection *con)
|
2009-10-07 02:31:13 +08:00
|
|
|
{
|
|
|
|
struct ceph_msg *m = con->out_msg;
|
2012-02-15 04:05:33 +08:00
|
|
|
int v = con->out_kvec_left;
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2012-06-12 03:57:13 +08:00
|
|
|
m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
dout("prepare_write_message_footer %p\n", con);
|
|
|
|
con->out_kvec_is_msg = true;
|
|
|
|
con->out_kvec[v].iov_base = &m->footer;
|
|
|
|
con->out_kvec[v].iov_len = sizeof(m->footer);
|
|
|
|
con->out_kvec_bytes += sizeof(m->footer);
|
|
|
|
con->out_kvec_left++;
|
|
|
|
con->out_more = m->more_to_follow;
|
2009-12-15 06:04:30 +08:00
|
|
|
con->out_msg_done = true;
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Prepare headers for the next outgoing message.
|
|
|
|
*/
|
|
|
|
static void prepare_write_message(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
struct ceph_msg *m;
|
2012-02-15 21:43:54 +08:00
|
|
|
u32 crc;
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2012-05-24 03:35:23 +08:00
|
|
|
con_out_kvec_reset(con);
|
2009-10-07 02:31:13 +08:00
|
|
|
con->out_kvec_is_msg = true;
|
2009-12-15 06:04:30 +08:00
|
|
|
con->out_msg_done = false;
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
/* Sneak an ack in there first? If we can get it into the same
|
|
|
|
* TCP packet that's a good thing. */
|
|
|
|
if (con->in_seq > con->in_seq_acked) {
|
|
|
|
con->in_seq_acked = con->in_seq;
|
2012-05-24 03:35:23 +08:00
|
|
|
con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
|
2009-10-07 02:31:13 +08:00
|
|
|
con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
|
2012-05-24 03:35:23 +08:00
|
|
|
con_out_kvec_add(con, sizeof (con->out_temp_ack),
|
2012-02-15 04:05:33 +08:00
|
|
|
&con->out_temp_ack);
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
|
2012-06-02 03:56:43 +08:00
|
|
|
BUG_ON(list_empty(&con->out_queue));
|
2012-02-15 04:05:33 +08:00
|
|
|
m = list_first_entry(&con->out_queue, struct ceph_msg, list_head);
|
2009-12-15 06:04:30 +08:00
|
|
|
con->out_msg = m;
|
2012-06-02 03:56:43 +08:00
|
|
|
BUG_ON(m->con != con);
|
2011-07-27 02:27:24 +08:00
|
|
|
|
|
|
|
/* put message on sent list */
|
|
|
|
ceph_msg_get(m);
|
|
|
|
list_move_tail(&m->list_head, &con->out_sent);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2010-05-12 12:20:38 +08:00
|
|
|
/*
|
|
|
|
* only assign outgoing seq # if we haven't sent this message
|
|
|
|
* yet. if it is requeued, resend with it's original seq.
|
|
|
|
*/
|
|
|
|
if (m->needs_out_seq) {
|
|
|
|
m->hdr.seq = cpu_to_le64(++con->out_seq);
|
|
|
|
m->needs_out_seq = false;
|
|
|
|
}
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2013-03-08 05:38:26 +08:00
|
|
|
dout("prepare_write_message %p seq %lld type %d len %d+%d+%d (%zd)\n",
|
2009-10-07 02:31:13 +08:00
|
|
|
m, con->out_seq, le16_to_cpu(m->hdr.type),
|
|
|
|
le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
|
2013-03-02 08:00:16 +08:00
|
|
|
le32_to_cpu(m->hdr.data_len), m->p.length);
|
2009-10-07 02:31:13 +08:00
|
|
|
BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
|
|
|
|
|
|
|
|
/* tag + hdr + front + middle */
|
2012-05-24 03:35:23 +08:00
|
|
|
con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
|
|
|
|
con_out_kvec_add(con, sizeof (m->hdr), &m->hdr);
|
|
|
|
con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
|
2012-02-15 04:05:33 +08:00
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
if (m->middle)
|
2012-05-24 03:35:23 +08:00
|
|
|
con_out_kvec_add(con, m->middle->vec.iov_len,
|
2012-02-15 04:05:33 +08:00
|
|
|
m->middle->vec.iov_base);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
/* fill in crc (except data pages), footer */
|
2012-02-15 21:43:54 +08:00
|
|
|
crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
|
|
|
|
con->out_msg->hdr.crc = cpu_to_le32(crc);
|
2012-06-12 03:57:13 +08:00
|
|
|
con->out_msg->footer.flags = 0;
|
2012-02-15 21:43:54 +08:00
|
|
|
|
|
|
|
crc = crc32c(0, m->front.iov_base, m->front.iov_len);
|
|
|
|
con->out_msg->footer.front_crc = cpu_to_le32(crc);
|
|
|
|
if (m->middle) {
|
|
|
|
crc = crc32c(0, m->middle->vec.iov_base,
|
|
|
|
m->middle->vec.iov_len);
|
|
|
|
con->out_msg->footer.middle_crc = cpu_to_le32(crc);
|
|
|
|
} else
|
2009-10-07 02:31:13 +08:00
|
|
|
con->out_msg->footer.middle_crc = 0;
|
2012-06-12 03:57:13 +08:00
|
|
|
dout("%s front_crc %u middle_crc %u\n", __func__,
|
2009-10-07 02:31:13 +08:00
|
|
|
le32_to_cpu(con->out_msg->footer.front_crc),
|
|
|
|
le32_to_cpu(con->out_msg->footer.middle_crc));
|
|
|
|
|
|
|
|
/* is there a data payload? */
|
2012-06-12 03:57:13 +08:00
|
|
|
con->out_msg->footer.data_crc = 0;
|
2013-03-07 13:39:39 +08:00
|
|
|
if (m->hdr.data_len) {
|
|
|
|
prepare_message_data(con->out_msg, &con->out_msg_pos);
|
|
|
|
con->out_more = 1; /* data + footer will follow */
|
|
|
|
} else {
|
2009-10-07 02:31:13 +08:00
|
|
|
/* no, queue up footer too and be done */
|
2012-02-15 04:05:33 +08:00
|
|
|
prepare_write_message_footer(con);
|
2013-03-07 13:39:39 +08:00
|
|
|
}
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2013-02-21 00:25:12 +08:00
|
|
|
con_flag_set(con, CON_FLAG_WRITE_PENDING);
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Prepare an ack.
|
|
|
|
*/
|
|
|
|
static void prepare_write_ack(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
dout("prepare_write_ack %p %llu -> %llu\n", con,
|
|
|
|
con->in_seq_acked, con->in_seq);
|
|
|
|
con->in_seq_acked = con->in_seq;
|
|
|
|
|
2012-05-24 03:35:23 +08:00
|
|
|
con_out_kvec_reset(con);
|
2012-02-15 04:05:33 +08:00
|
|
|
|
2012-05-24 03:35:23 +08:00
|
|
|
con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
|
2012-02-15 04:05:33 +08:00
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
|
2012-05-24 03:35:23 +08:00
|
|
|
con_out_kvec_add(con, sizeof (con->out_temp_ack),
|
2012-02-15 04:05:33 +08:00
|
|
|
&con->out_temp_ack);
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
con->out_more = 1; /* more will follow.. eventually.. */
|
2013-02-21 00:25:12 +08:00
|
|
|
con_flag_set(con, CON_FLAG_WRITE_PENDING);
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Prepare to write keepalive byte.
|
|
|
|
*/
|
|
|
|
static void prepare_write_keepalive(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
dout("prepare_write_keepalive %p\n", con);
|
2012-05-24 03:35:23 +08:00
|
|
|
con_out_kvec_reset(con);
|
|
|
|
con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive);
|
2013-02-21 00:25:12 +08:00
|
|
|
con_flag_set(con, CON_FLAG_WRITE_PENDING);
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Connection negotiation.
|
|
|
|
*/
|
|
|
|
|
2012-05-17 04:16:39 +08:00
|
|
|
static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection *con,
|
|
|
|
int *auth_proto)
|
2009-11-19 08:19:57 +08:00
|
|
|
{
|
2012-05-17 04:16:39 +08:00
|
|
|
struct ceph_auth_handshake *auth;
|
2012-05-17 04:16:38 +08:00
|
|
|
|
|
|
|
if (!con->ops->get_authorizer) {
|
|
|
|
con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
|
|
|
|
con->out_connect.authorizer_len = 0;
|
2012-05-17 04:16:39 +08:00
|
|
|
return NULL;
|
2012-05-17 04:16:38 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Can't hold the mutex while getting authorizer */
|
2009-12-23 02:43:42 +08:00
|
|
|
mutex_unlock(&con->mutex);
|
2012-05-17 04:16:39 +08:00
|
|
|
auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry);
|
2009-12-23 02:43:42 +08:00
|
|
|
mutex_lock(&con->mutex);
|
2009-11-19 08:19:57 +08:00
|
|
|
|
2012-05-17 04:16:39 +08:00
|
|
|
if (IS_ERR(auth))
|
2012-05-17 04:16:39 +08:00
|
|
|
return auth;
|
2012-07-21 08:24:40 +08:00
|
|
|
if (con->state != CON_STATE_NEGOTIATING)
|
2012-05-17 04:16:39 +08:00
|
|
|
return ERR_PTR(-EAGAIN);
|
2011-05-20 02:21:05 +08:00
|
|
|
|
2012-05-17 04:16:39 +08:00
|
|
|
con->auth_reply_buf = auth->authorizer_reply_buf;
|
|
|
|
con->auth_reply_buf_len = auth->authorizer_reply_buf_len;
|
2012-05-17 04:16:39 +08:00
|
|
|
return auth;
|
2009-11-19 08:19:57 +08:00
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
/*
|
|
|
|
* We connected to a peer and are saying hello.
|
|
|
|
*/
|
2012-05-17 04:16:38 +08:00
|
|
|
static void prepare_write_banner(struct ceph_connection *con)
|
2009-10-07 02:31:13 +08:00
|
|
|
{
|
2012-05-24 03:35:23 +08:00
|
|
|
con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER);
|
|
|
|
con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr),
|
2012-05-17 04:16:38 +08:00
|
|
|
&con->msgr->my_enc_addr);
|
2009-11-11 06:34:36 +08:00
|
|
|
|
|
|
|
con->out_more = 0;
|
2013-02-21 00:25:12 +08:00
|
|
|
con_flag_set(con, CON_FLAG_WRITE_PENDING);
|
2009-11-11 06:34:36 +08:00
|
|
|
}
|
|
|
|
|
2012-05-17 04:16:38 +08:00
|
|
|
static int prepare_write_connect(struct ceph_connection *con)
|
2009-11-11 06:34:36 +08:00
|
|
|
{
|
2012-04-15 13:58:06 +08:00
|
|
|
unsigned int global_seq = get_global_seq(con->msgr, 0);
|
2009-10-07 02:31:13 +08:00
|
|
|
int proto;
|
2012-05-17 04:16:39 +08:00
|
|
|
int auth_proto;
|
2012-05-17 04:16:39 +08:00
|
|
|
struct ceph_auth_handshake *auth;
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
switch (con->peer_name.type) {
|
|
|
|
case CEPH_ENTITY_TYPE_MON:
|
|
|
|
proto = CEPH_MONC_PROTOCOL;
|
|
|
|
break;
|
|
|
|
case CEPH_ENTITY_TYPE_OSD:
|
|
|
|
proto = CEPH_OSDC_PROTOCOL;
|
|
|
|
break;
|
|
|
|
case CEPH_ENTITY_TYPE_MDS:
|
|
|
|
proto = CEPH_MDSC_PROTOCOL;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
BUG();
|
|
|
|
}
|
|
|
|
|
|
|
|
dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
|
|
|
|
con->connect_seq, global_seq, proto);
|
2009-11-19 08:19:57 +08:00
|
|
|
|
2012-05-17 04:16:38 +08:00
|
|
|
con->out_connect.features = cpu_to_le64(con->msgr->supported_features);
|
2009-10-07 02:31:13 +08:00
|
|
|
con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
|
|
|
|
con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
|
|
|
|
con->out_connect.global_seq = cpu_to_le32(global_seq);
|
|
|
|
con->out_connect.protocol_version = cpu_to_le32(proto);
|
|
|
|
con->out_connect.flags = 0;
|
|
|
|
|
2012-05-17 04:16:39 +08:00
|
|
|
auth_proto = CEPH_AUTH_UNKNOWN;
|
|
|
|
auth = get_connect_authorizer(con, &auth_proto);
|
2012-05-17 04:16:39 +08:00
|
|
|
if (IS_ERR(auth))
|
|
|
|
return PTR_ERR(auth);
|
2012-05-17 04:16:39 +08:00
|
|
|
|
2012-05-17 04:16:39 +08:00
|
|
|
con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto);
|
2012-05-17 04:16:39 +08:00
|
|
|
con->out_connect.authorizer_len = auth ?
|
|
|
|
cpu_to_le32(auth->authorizer_buf_len) : 0;
|
|
|
|
|
2012-05-24 03:35:23 +08:00
|
|
|
con_out_kvec_add(con, sizeof (con->out_connect),
|
2012-05-17 04:16:39 +08:00
|
|
|
&con->out_connect);
|
|
|
|
if (auth && auth->authorizer_buf_len)
|
2012-05-24 03:35:23 +08:00
|
|
|
con_out_kvec_add(con, auth->authorizer_buf_len,
|
2012-05-17 04:16:39 +08:00
|
|
|
auth->authorizer_buf);
|
2012-02-15 04:05:33 +08:00
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
con->out_more = 0;
|
2013-02-21 00:25:12 +08:00
|
|
|
con_flag_set(con, CON_FLAG_WRITE_PENDING);
|
2009-11-19 08:19:57 +08:00
|
|
|
|
2012-05-17 04:16:38 +08:00
|
|
|
return 0;
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* write as much of pending kvecs to the socket as we can.
|
|
|
|
* 1 -> done
|
|
|
|
* 0 -> socket full, but more to do
|
|
|
|
* <0 -> error
|
|
|
|
*/
|
|
|
|
static int write_partial_kvec(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
|
|
|
|
while (con->out_kvec_bytes > 0) {
|
|
|
|
ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
|
|
|
|
con->out_kvec_left, con->out_kvec_bytes,
|
|
|
|
con->out_more);
|
|
|
|
if (ret <= 0)
|
|
|
|
goto out;
|
|
|
|
con->out_kvec_bytes -= ret;
|
|
|
|
if (con->out_kvec_bytes == 0)
|
|
|
|
break; /* done */
|
2012-02-15 21:43:54 +08:00
|
|
|
|
|
|
|
/* account for full iov entries consumed */
|
|
|
|
while (ret >= con->out_kvec_cur->iov_len) {
|
|
|
|
BUG_ON(!con->out_kvec_left);
|
|
|
|
ret -= con->out_kvec_cur->iov_len;
|
|
|
|
con->out_kvec_cur++;
|
|
|
|
con->out_kvec_left--;
|
|
|
|
}
|
|
|
|
/* and for a partially-consumed entry */
|
|
|
|
if (ret) {
|
|
|
|
con->out_kvec_cur->iov_len -= ret;
|
|
|
|
con->out_kvec_cur->iov_base += ret;
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
con->out_kvec_left = 0;
|
|
|
|
con->out_kvec_is_msg = false;
|
|
|
|
ret = 1;
|
|
|
|
out:
|
|
|
|
dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
|
|
|
|
con->out_kvec_bytes, con->out_kvec_left, ret);
|
|
|
|
return ret; /* done! */
|
|
|
|
}
|
|
|
|
|
2012-06-12 03:57:13 +08:00
|
|
|
/*
 * Account for @sent bytes of outgoing message data having been
 * written, out of the @len bytes that were offered from the current
 * piece.
 *
 * The data and page positions always move forward by @sent; when
 * the data came from the trail, the trail's cursor is advanced too.
 * Only when the whole piece went out (@sent == @len) does the
 * position move to the next page and the underlying pagelist or bio
 * iterator step forward.  @page is not used here.
 */
static void out_msg_pos_next(struct ceph_connection *con, struct page *page,
	size_t len, size_t sent, bool in_trail)
{
	struct ceph_msg_pos *pos = &con->out_msg_pos;
	struct ceph_msg *msg = con->out_msg;

	BUG_ON(!msg);
	BUG_ON(!sent);

	pos->data_pos += sent;
	pos->page_pos += sent;

	if (in_trail) {
		bool moved_on = ceph_msg_data_advance(&msg->t, sent);

		/* the cursor may only move on once the piece is fully sent */
		BUG_ON(moved_on && sent != len);
	}

	if (sent < len)
		return;		/* piece not finished yet */

	BUG_ON(sent != len);

	/* start of a new page: nothing consumed, no crc computed yet */
	pos->page++;
	pos->page_pos = 0;
	pos->did_page_crc = false;

	if (ceph_msg_has_pagelist(msg)) {
		list_rotate_left(&msg->l.pagelist->head);
		return;
	}
#ifdef CONFIG_BLOCK
	if (ceph_msg_has_bio(msg))
		iter_bio_next(&msg->b.bio_iter, &msg->b.bio_seg);
#endif
}
|
2010-04-07 06:01:27 +08:00
|
|
|
|
libceph: define and use in_msg_pos_next()
Define a new function in_msg_pos_next() to match out_msg_pos_next(),
and use it in place of code at the end of read_partial_message_pages()
and read_partial_message_bio().
Note that the page number is incremented and offset reset under
slightly different conditions from before. The result is
equivalent, however, as explained below.
Each time an incoming message is going to arrive, we find out how
much room is left--not surpassing the current page--and provide that
as the number of bytes to receive. So the amount we'll use is the
lesser of: all that's left of the entire request; and all that's
left in the current page.
If we received exactly how many were requested, we either reached
the end of the request or the end of the page. In the first case,
we're done, in the second, we move onto the next page in the array.
In all cases but (possibly) on the last page, after adding the
number of bytes received, page_pos == PAGE_SIZE. On the last page,
it doesn't really matter whether we increment the page number and
reset the page position, because we're done and we won't come back
here again. The code previously skipped over that last case,
basically. The new code handles that case the same as the others,
incrementing and resetting.
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
2013-03-09 08:51:04 +08:00
|
|
|
/*
 * Account for @received bytes of incoming message data having
 * arrived, out of the @len bytes that were requested for the current
 * page.
 *
 * The data and page positions always move forward by @received.
 * Only when the full request was satisfied (@received == @len) does
 * the position move on to the next page and, for bio-backed data,
 * the bio iterator step forward.
 */
static void in_msg_pos_next(struct ceph_connection *con, size_t len,
	size_t received)
{
	struct ceph_msg_pos *pos = &con->in_msg_pos;
	struct ceph_msg *msg = con->in_msg;

	BUG_ON(!msg);
	BUG_ON(!received);

	pos->data_pos += received;
	pos->page_pos += received;

	if (received < len)
		return;		/* more of this request still to arrive */

	BUG_ON(received != len);

	pos->page++;
	pos->page_pos = 0;
#ifdef CONFIG_BLOCK
	if (msg->b.bio)
		iter_bio_next(&msg->b.bio_iter, &msg->b.bio_seg);
#endif /* CONFIG_BLOCK */
}
|
|
|
|
|
2013-03-09 10:59:00 +08:00
|
|
|
/*
 * Fold @length bytes of @page, starting at @page_offset, into the
 * running crc32c value @crc and return the updated value.  The page
 * is mapped with kmap() for the duration of the calculation.
 */
static u32 ceph_crc32c_page(u32 crc, struct page *page,
	unsigned int page_offset,
	unsigned int length)
{
	char *mapped = kmap(page);
	u32 updated;

	BUG_ON(!mapped);
	updated = crc32c(crc, mapped + page_offset, length);
	kunmap(page);

	return updated;
}
|
2009-10-07 02:31:13 +08:00
|
|
|
/*
 * Write as much message data payload as we can.  If we finish, queue
 * up the footer.
 *  1 -> done, footer is now queued in out_kvec[].
 *  0 -> socket full, but more to do
 * <0 -> error
 *
 * The current position within the payload is tracked in
 * con->out_msg_pos; each successful send advances it through
 * out_msg_pos_next().
 */
static int write_partial_message_data(struct ceph_connection *con)
{
	struct ceph_msg *msg = con->out_msg;
	struct ceph_msg_pos *msg_pos = &con->out_msg_pos;
	unsigned int data_len = le32_to_cpu(msg->hdr.data_len);
	bool do_datacrc = !con->msgr->nocrc;
	int ret;
	/* only assigned (and only consumed) while we are not in the trail */
	int total_max_write;
	bool in_trail = false;
	size_t trail_len = 0;
	/* data offset at which the trail begins; data_len if no trail */
	size_t trail_off = data_len;

	if (ceph_msg_has_trail(msg)) {
		trail_len = msg->t.pagelist->length;
		trail_off -= trail_len;
	}

	dout("%s %p msg %p page %d offset %d\n", __func__,
	     con, msg, msg_pos->page, msg_pos->page_pos);

	/*
	 * Iterate through each page that contains data to be
	 * written, and send as much as possible for each.
	 *
	 * If we are calculating the data crc (the default), we will
	 * need to map the page.  If we have no pages, they have
	 * been revoked, so use the zero page.
	 */
	while (data_len > msg_pos->data_pos) {
		struct page *page = NULL;
		size_t page_offset;
		size_t length;
		int max_write = PAGE_SIZE;
		int bio_offset = 0;
		bool use_cursor = false;
		bool last_piece = true;	/* preserve existing behavior */

		/* once the position reaches the trail we stay in it */
		in_trail = in_trail || msg_pos->data_pos >= trail_off;
		if (!in_trail)
			total_max_write = trail_off - msg_pos->data_pos;

		/* pick the page that holds the next piece of data */
		if (in_trail) {
			BUG_ON(!ceph_msg_has_trail(msg));
			/* the trail is walked with the data cursor */
			use_cursor = true;
			page = ceph_msg_data_next(&msg->t, &page_offset,
							&length, &last_piece);
		} else if (ceph_msg_has_pages(msg)) {
			page = msg->p.pages[msg_pos->page];
		} else if (ceph_msg_has_pagelist(msg)) {
			page = list_first_entry(&msg->l.pagelist->head,
						struct page, lru);
#ifdef CONFIG_BLOCK
		} else if (ceph_msg_has_bio(msg)) {
			struct bio_vec *bv;

			bv = bio_iovec_idx(msg->b.bio_iter, msg->b.bio_seg);
			page = bv->bv_page;
			/* a bio segment may start and end inside its page */
			bio_offset = bv->bv_offset;
			max_write = bv->bv_len;
#endif
		} else {
			page = zero_page;
		}
		/*
		 * Without the cursor, the piece is bounded both by what is
		 * left of this page/segment and by what is left before the
		 * trail starts.
		 */
		if (!use_cursor)
			length = min_t(int, max_write - msg_pos->page_pos,
					    total_max_write);

		/*
		 * NOTE(review): this assignment also runs on the cursor
		 * path, replacing the page_offset that
		 * ceph_msg_data_next() filled in above - confirm that the
		 * two values always agree.
		 */
		page_offset = msg_pos->page_pos + bio_offset;
		/* fold this piece into the footer crc unless already done */
		if (do_datacrc && !msg_pos->did_page_crc) {
			u32 crc = le32_to_cpu(msg->footer.data_crc);

			crc = ceph_crc32c_page(crc, page, page_offset, length);
			msg->footer.data_crc = cpu_to_le32(crc);
			msg_pos->did_page_crc = true;
		}
		ret = ceph_tcp_sendpage(con->sock, page, page_offset,
					length, last_piece);
		/* nothing sent (0) or an error (<0): hand that to the caller */
		if (ret <= 0)
			goto out;

		out_msg_pos_next(con, page, length, (size_t) ret, in_trail);
	}

	dout("%s %p msg %p done\n", __func__, con, msg);

	/* prepare and queue up footer, too */
	if (!do_datacrc)
		msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
	con_out_kvec_reset(con);
	prepare_write_message_footer(con);
	ret = 1;
out:
	return ret;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* write some zeros
|
|
|
|
*/
|
|
|
|
static int write_partial_skip(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
while (con->out_skip > 0) {
|
2012-03-08 01:40:08 +08:00
|
|
|
size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2013-03-07 13:39:38 +08:00
|
|
|
ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, true);
|
2009-10-07 02:31:13 +08:00
|
|
|
if (ret <= 0)
|
|
|
|
goto out;
|
|
|
|
con->out_skip -= ret;
|
|
|
|
}
|
|
|
|
ret = 1;
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Prepare to read connection handshake, or an ack.
|
|
|
|
*/
|
2009-11-11 06:34:36 +08:00
|
|
|
static void prepare_read_banner(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
dout("prepare_read_banner %p\n", con);
|
|
|
|
con->in_base_pos = 0;
|
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
static void prepare_read_connect(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
dout("prepare_read_connect %p\n", con);
|
|
|
|
con->in_base_pos = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void prepare_read_ack(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
dout("prepare_read_ack %p\n", con);
|
|
|
|
con->in_base_pos = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void prepare_read_tag(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
dout("prepare_read_tag %p\n", con);
|
|
|
|
con->in_base_pos = 0;
|
|
|
|
con->in_tag = CEPH_MSGR_TAG_READY;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Prepare to read a message.
|
|
|
|
*/
|
|
|
|
static int prepare_read_message(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
dout("prepare_read_message %p\n", con);
|
|
|
|
BUG_ON(con->in_msg != NULL);
|
|
|
|
con->in_base_pos = 0;
|
|
|
|
con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static int read_partial(struct ceph_connection *con,
|
2012-05-10 23:29:50 +08:00
|
|
|
int end, int size, void *object)
|
2009-10-07 02:31:13 +08:00
|
|
|
{
|
2012-05-10 23:29:50 +08:00
|
|
|
while (con->in_base_pos < end) {
|
|
|
|
int left = end - con->in_base_pos;
|
2009-10-07 02:31:13 +08:00
|
|
|
int have = size - left;
|
|
|
|
int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
|
|
|
|
if (ret <= 0)
|
|
|
|
return ret;
|
|
|
|
con->in_base_pos += ret;
|
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Read all or part of the connect-side handshake on a new connection
|
|
|
|
*/
|
2009-11-11 06:34:36 +08:00
|
|
|
static int read_partial_banner(struct ceph_connection *con)
|
2009-10-07 02:31:13 +08:00
|
|
|
{
|
2012-05-10 23:29:50 +08:00
|
|
|
int size;
|
|
|
|
int end;
|
|
|
|
int ret;
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2009-11-11 06:34:36 +08:00
|
|
|
dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
/* peer's banner */
|
2012-05-10 23:29:50 +08:00
|
|
|
size = strlen(CEPH_BANNER);
|
|
|
|
end = size;
|
|
|
|
ret = read_partial(con, end, size, con->in_banner);
|
2009-10-07 02:31:13 +08:00
|
|
|
if (ret <= 0)
|
|
|
|
goto out;
|
2012-05-10 23:29:50 +08:00
|
|
|
|
|
|
|
size = sizeof (con->actual_peer_addr);
|
|
|
|
end += size;
|
|
|
|
ret = read_partial(con, end, size, &con->actual_peer_addr);
|
2009-10-07 02:31:13 +08:00
|
|
|
if (ret <= 0)
|
|
|
|
goto out;
|
2012-05-10 23:29:50 +08:00
|
|
|
|
|
|
|
size = sizeof (con->peer_addr_for_me);
|
|
|
|
end += size;
|
|
|
|
ret = read_partial(con, end, size, &con->peer_addr_for_me);
|
2009-10-07 02:31:13 +08:00
|
|
|
if (ret <= 0)
|
|
|
|
goto out;
|
2012-05-10 23:29:50 +08:00
|
|
|
|
2009-11-11 06:34:36 +08:00
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int read_partial_connect(struct ceph_connection *con)
|
|
|
|
{
|
2012-05-10 23:29:50 +08:00
|
|
|
int size;
|
|
|
|
int end;
|
|
|
|
int ret;
|
2009-11-11 06:34:36 +08:00
|
|
|
|
|
|
|
dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
|
|
|
|
|
2012-05-10 23:29:50 +08:00
|
|
|
size = sizeof (con->in_reply);
|
|
|
|
end = size;
|
|
|
|
ret = read_partial(con, end, size, &con->in_reply);
|
2009-10-07 02:31:13 +08:00
|
|
|
if (ret <= 0)
|
|
|
|
goto out;
|
2012-05-10 23:29:50 +08:00
|
|
|
|
|
|
|
size = le32_to_cpu(con->in_reply.authorizer_len);
|
|
|
|
end += size;
|
|
|
|
ret = read_partial(con, end, size, con->auth_reply_buf);
|
2009-11-19 08:19:57 +08:00
|
|
|
if (ret <= 0)
|
|
|
|
goto out;
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2009-11-19 08:19:57 +08:00
|
|
|
dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
|
|
|
|
con, (int)con->in_reply.tag,
|
|
|
|
le32_to_cpu(con->in_reply.connect_seq),
|
2009-10-07 02:31:13 +08:00
|
|
|
le32_to_cpu(con->in_reply.global_seq));
|
|
|
|
out:
|
|
|
|
return ret;
|
2009-11-11 06:34:36 +08:00
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Verify the hello banner looks okay.
|
|
|
|
*/
|
|
|
|
static int verify_hello(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
|
2009-10-10 07:36:34 +08:00
|
|
|
pr_err("connect to %s got bad banner\n",
|
2010-04-07 06:14:15 +08:00
|
|
|
ceph_pr_addr(&con->peer_addr.in_addr));
|
2009-10-07 02:31:13 +08:00
|
|
|
con->error_msg = "protocol error, bad banner";
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool addr_is_blank(struct sockaddr_storage *ss)
|
|
|
|
{
|
|
|
|
switch (ss->ss_family) {
|
|
|
|
case AF_INET:
|
|
|
|
return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
|
|
|
|
case AF_INET6:
|
|
|
|
return
|
|
|
|
((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
|
|
|
|
((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
|
|
|
|
((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
|
|
|
|
((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Return the port stored in @ss in host byte order, or 0 when the
 * address family is neither AF_INET nor AF_INET6.
 */
static int addr_port(struct sockaddr_storage *ss)
{
	int port = 0;

	if (ss->ss_family == AF_INET)
		port = ntohs(((struct sockaddr_in *)ss)->sin_port);
	else if (ss->ss_family == AF_INET6)
		port = ntohs(((struct sockaddr_in6 *)ss)->sin6_port);

	return port;
}
|
|
|
|
|
|
|
|
/*
 * Store port @p (host byte order) into @ss in network byte order.
 * Addresses that are neither AF_INET nor AF_INET6 are left untouched.
 */
static void addr_set_port(struct sockaddr_storage *ss, int p)
{
	if (ss->ss_family == AF_INET)
		((struct sockaddr_in *)ss)->sin_port = htons(p);
	else if (ss->ss_family == AF_INET6)
		((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
}
|
|
|
|
|
2011-09-24 02:48:42 +08:00
|
|
|
/*
|
|
|
|
* Unlike other *_pton function semantics, zero indicates success.
|
|
|
|
*/
|
|
|
|
static int ceph_pton(const char *str, size_t len, struct sockaddr_storage *ss,
|
|
|
|
char delim, const char **ipend)
|
|
|
|
{
|
2012-01-24 05:49:27 +08:00
|
|
|
struct sockaddr_in *in4 = (struct sockaddr_in *) ss;
|
|
|
|
struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss;
|
2011-09-24 02:48:42 +08:00
|
|
|
|
|
|
|
memset(ss, 0, sizeof(*ss));
|
|
|
|
|
|
|
|
if (in4_pton(str, len, (u8 *)&in4->sin_addr.s_addr, delim, ipend)) {
|
|
|
|
ss->ss_family = AF_INET;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (in6_pton(str, len, (u8 *)&in6->sin6_addr.s6_addr, delim, ipend)) {
|
|
|
|
ss->ss_family = AF_INET6;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Extract hostname string and resolve using kernel DNS facility.
|
|
|
|
*/
|
|
|
|
#ifdef CONFIG_CEPH_LIB_USE_DNS_RESOLVER
|
|
|
|
static int ceph_dns_resolve_name(const char *name, size_t namelen,
|
|
|
|
struct sockaddr_storage *ss, char delim, const char **ipend)
|
|
|
|
{
|
|
|
|
const char *end, *delim_p;
|
|
|
|
char *colon_p, *ip_addr = NULL;
|
|
|
|
int ip_len, ret;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The end of the hostname occurs immediately preceding the delimiter or
|
|
|
|
* the port marker (':') where the delimiter takes precedence.
|
|
|
|
*/
|
|
|
|
delim_p = memchr(name, delim, namelen);
|
|
|
|
colon_p = memchr(name, ':', namelen);
|
|
|
|
|
|
|
|
if (delim_p && colon_p)
|
|
|
|
end = delim_p < colon_p ? delim_p : colon_p;
|
|
|
|
else if (!delim_p && colon_p)
|
|
|
|
end = colon_p;
|
|
|
|
else {
|
|
|
|
end = delim_p;
|
|
|
|
if (!end) /* case: hostname:/ */
|
|
|
|
end = name + namelen;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (end <= name)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
/* do dns_resolve upcall */
|
|
|
|
ip_len = dns_query(NULL, name, end - name, NULL, &ip_addr, NULL);
|
|
|
|
if (ip_len > 0)
|
|
|
|
ret = ceph_pton(ip_addr, ip_len, ss, -1, NULL);
|
|
|
|
else
|
|
|
|
ret = -ESRCH;
|
|
|
|
|
|
|
|
kfree(ip_addr);
|
|
|
|
|
|
|
|
*ipend = end;
|
|
|
|
|
|
|
|
pr_info("resolve '%.*s' (ret=%d): %s\n", (int)(end - name), name,
|
|
|
|
ret, ret ? "failed" : ceph_pr_addr(ss));
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
static inline int ceph_dns_resolve_name(const char *name, size_t namelen,
|
|
|
|
struct sockaddr_storage *ss, char delim, const char **ipend)
|
|
|
|
{
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Parse a server name (IP or hostname). If a valid IP address is not found
|
|
|
|
* then try to extract a hostname to resolve using userspace DNS upcall.
|
|
|
|
*/
|
|
|
|
static int ceph_parse_server_name(const char *name, size_t namelen,
|
|
|
|
struct sockaddr_storage *ss, char delim, const char **ipend)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = ceph_pton(name, namelen, ss, delim, ipend);
|
|
|
|
if (ret)
|
|
|
|
ret = ceph_dns_resolve_name(name, namelen, ss, delim, ipend);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
/*
|
|
|
|
* Parse an ip[:port] list into an addr array. Use the default
|
|
|
|
* monitor port if a port isn't specified.
|
|
|
|
*/
|
|
|
|
int ceph_parse_ips(const char *c, const char *end,
|
|
|
|
struct ceph_entity_addr *addr,
|
|
|
|
int max_count, int *count)
|
|
|
|
{
|
2011-09-24 02:48:42 +08:00
|
|
|
int i, ret = -EINVAL;
|
2009-10-07 02:31:13 +08:00
|
|
|
const char *p = c;
|
|
|
|
|
|
|
|
dout("parse_ips on '%.*s'\n", (int)(end-c), c);
|
|
|
|
for (i = 0; i < max_count; i++) {
|
|
|
|
const char *ipend;
|
|
|
|
struct sockaddr_storage *ss = &addr[i].in_addr;
|
|
|
|
int port;
|
2010-07-09 00:54:52 +08:00
|
|
|
char delim = ',';
|
|
|
|
|
|
|
|
if (*p == '[') {
|
|
|
|
delim = ']';
|
|
|
|
p++;
|
|
|
|
}
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2011-09-24 02:48:42 +08:00
|
|
|
ret = ceph_parse_server_name(p, end - p, ss, delim, &ipend);
|
|
|
|
if (ret)
|
2009-10-07 02:31:13 +08:00
|
|
|
goto bad;
|
2011-09-24 02:48:42 +08:00
|
|
|
ret = -EINVAL;
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
p = ipend;
|
|
|
|
|
2010-07-09 00:54:52 +08:00
|
|
|
if (delim == ']') {
|
|
|
|
if (*p != ']') {
|
|
|
|
dout("missing matching ']'\n");
|
|
|
|
goto bad;
|
|
|
|
}
|
|
|
|
p++;
|
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
/* port? */
|
|
|
|
if (p < end && *p == ':') {
|
|
|
|
port = 0;
|
|
|
|
p++;
|
|
|
|
while (p < end && *p >= '0' && *p <= '9') {
|
|
|
|
port = (port * 10) + (*p - '0');
|
|
|
|
p++;
|
|
|
|
}
|
|
|
|
if (port > 65535 || port == 0)
|
|
|
|
goto bad;
|
|
|
|
} else {
|
|
|
|
port = CEPH_MON_PORT;
|
|
|
|
}
|
|
|
|
|
|
|
|
addr_set_port(ss, port);
|
|
|
|
|
2010-04-07 06:14:15 +08:00
|
|
|
dout("parse_ips got %s\n", ceph_pr_addr(ss));
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
if (p == end)
|
|
|
|
break;
|
|
|
|
if (*p != ',')
|
|
|
|
goto bad;
|
|
|
|
p++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (p != end)
|
|
|
|
goto bad;
|
|
|
|
|
|
|
|
if (count)
|
|
|
|
*count = i + 1;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
bad:
|
2010-07-09 00:54:52 +08:00
|
|
|
pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
|
2011-09-24 02:48:42 +08:00
|
|
|
return ret;
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
2010-04-07 06:14:15 +08:00
|
|
|
EXPORT_SYMBOL(ceph_parse_ips);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2009-11-11 06:34:36 +08:00
|
|
|
static int process_banner(struct ceph_connection *con)
|
2009-10-07 02:31:13 +08:00
|
|
|
{
|
2009-11-11 06:34:36 +08:00
|
|
|
dout("process_banner on %p\n", con);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
if (verify_hello(con) < 0)
|
|
|
|
return -1;
|
|
|
|
|
2009-11-04 07:17:56 +08:00
|
|
|
ceph_decode_addr(&con->actual_peer_addr);
|
|
|
|
ceph_decode_addr(&con->peer_addr_for_me);
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
/*
|
|
|
|
* Make sure the other end is who we wanted. note that the other
|
|
|
|
* end may not yet know their ip address, so if it's 0.0.0.0, give
|
|
|
|
* them the benefit of the doubt.
|
|
|
|
*/
|
2010-01-08 08:12:36 +08:00
|
|
|
if (memcmp(&con->peer_addr, &con->actual_peer_addr,
|
|
|
|
sizeof(con->peer_addr)) != 0 &&
|
2009-10-07 02:31:13 +08:00
|
|
|
!(addr_is_blank(&con->actual_peer_addr.in_addr) &&
|
|
|
|
con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
|
2010-06-12 07:58:48 +08:00
|
|
|
pr_warning("wrong peer, want %s/%d, got %s/%d\n",
|
2010-04-07 06:14:15 +08:00
|
|
|
ceph_pr_addr(&con->peer_addr.in_addr),
|
2010-06-12 07:58:48 +08:00
|
|
|
(int)le32_to_cpu(con->peer_addr.nonce),
|
2010-04-07 06:14:15 +08:00
|
|
|
ceph_pr_addr(&con->actual_peer_addr.in_addr),
|
2010-06-12 07:58:48 +08:00
|
|
|
(int)le32_to_cpu(con->actual_peer_addr.nonce));
|
2009-12-24 04:12:31 +08:00
|
|
|
con->error_msg = "wrong peer at address";
|
2009-10-07 02:31:13 +08:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* did we learn our address?
|
|
|
|
*/
|
|
|
|
if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
|
|
|
|
int port = addr_port(&con->msgr->inst.addr.in_addr);
|
|
|
|
|
|
|
|
memcpy(&con->msgr->inst.addr.in_addr,
|
|
|
|
&con->peer_addr_for_me.in_addr,
|
|
|
|
sizeof(con->peer_addr_for_me.in_addr));
|
|
|
|
addr_set_port(&con->msgr->inst.addr.in_addr, port);
|
2009-11-04 07:17:56 +08:00
|
|
|
encode_my_addr(con->msgr);
|
2009-11-11 06:34:36 +08:00
|
|
|
dout("process_banner learned my addr is %s\n",
|
2010-04-07 06:14:15 +08:00
|
|
|
ceph_pr_addr(&con->msgr->inst.addr.in_addr));
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
|
2009-11-11 06:34:36 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Act on the peer's reply to our connect request (con->in_reply),
 * dispatching on the reply tag.
 *
 * Returns 0 when the reply was handled (either the connection is now
 * open, or a new connect attempt has been queued), -1 with
 * con->error_msg set on a protocol or negotiation failure, the error
 * from prepare_write_connect() if queuing a retry failed, or -EAGAIN
 * if the connection left the NEGOTIATING state while the mutex was
 * dropped for the peer_reset callback.
 */
static int process_connect(struct ceph_connection *con)
{
	u64 sup_feat = con->msgr->supported_features;
	u64 req_feat = con->msgr->required_features;
	u64 server_feat = le64_to_cpu(con->in_reply.features);
	int ret;

	/*
	 * NOTE(review): this logs con->in_tag, while the switch below
	 * dispatches on con->in_reply.tag - confirm which one is meant.
	 */
	dout("process_connect on %p tag %d\n", con, (int)con->in_tag);

	switch (con->in_reply.tag) {
	case CEPH_MSGR_TAG_FEATURES:
		/* the server requires features we do not support */
		pr_err("%s%lld %s feature set mismatch,"
		       " my %llx < server's %llx, missing %llx\n",
		       ENTITY_NAME(con->peer_name),
		       ceph_pr_addr(&con->peer_addr.in_addr),
		       sup_feat, server_feat, server_feat & ~sup_feat);
		con->error_msg = "missing required protocol features";
		reset_connection(con);
		return -1;

	case CEPH_MSGR_TAG_BADPROTOVER:
		pr_err("%s%lld %s protocol version mismatch,"
		       " my %d != server's %d\n",
		       ENTITY_NAME(con->peer_name),
		       ceph_pr_addr(&con->peer_addr.in_addr),
		       le32_to_cpu(con->out_connect.protocol_version),
		       le32_to_cpu(con->in_reply.protocol_version));
		con->error_msg = "protocol version mismatch";
		reset_connection(con);
		return -1;

	case CEPH_MSGR_TAG_BADAUTHORIZER:
		/*
		 * Give up once the counter reaches 2; otherwise pin it
		 * at 1 and send the connect request again.
		 */
		con->auth_retry++;
		dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
		     con->auth_retry);
		if (con->auth_retry == 2) {
			con->error_msg = "connect authorization failure";
			return -1;
		}
		con->auth_retry = 1;
		con_out_kvec_reset(con);
		ret = prepare_write_connect(con);
		if (ret < 0)
			return ret;
		prepare_read_connect(con);
		break;

	case CEPH_MSGR_TAG_RESETSESSION:
		/*
		 * If we connected with a large connect_seq but the peer
		 * has no record of a session with us (no connection, or
		 * connect_seq == 0), they will send RESETSESSION to indicate
		 * that they must have reset their session, and may have
		 * dropped messages.
		 */
		dout("process_connect got RESET peer seq %u\n",
		     le32_to_cpu(con->in_reply.connect_seq));
		pr_err("%s%lld %s connection reset\n",
		       ENTITY_NAME(con->peer_name),
		       ceph_pr_addr(&con->peer_addr.in_addr));
		reset_connection(con);
		con_out_kvec_reset(con);
		ret = prepare_write_connect(con);
		if (ret < 0)
			return ret;
		prepare_read_connect(con);

		/* Tell ceph about it. */
		/* the callback is invoked with con->mutex released */
		mutex_unlock(&con->mutex);
		pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
		if (con->ops->peer_reset)
			con->ops->peer_reset(con);
		mutex_lock(&con->mutex);
		/* re-check the state now that the mutex is held again */
		if (con->state != CON_STATE_NEGOTIATING)
			return -EAGAIN;
		break;

	case CEPH_MSGR_TAG_RETRY_SESSION:
		/*
		 * If we sent a smaller connect_seq than the peer has, try
		 * again with a larger value.
		 */
		dout("process_connect got RETRY_SESSION my seq %u, peer %u\n",
		     le32_to_cpu(con->out_connect.connect_seq),
		     le32_to_cpu(con->in_reply.connect_seq));
		con->connect_seq = le32_to_cpu(con->in_reply.connect_seq);
		con_out_kvec_reset(con);
		ret = prepare_write_connect(con);
		if (ret < 0)
			return ret;
		prepare_read_connect(con);
		break;

	case CEPH_MSGR_TAG_RETRY_GLOBAL:
		/*
		 * If we sent a smaller global_seq than the peer has, try
		 * again with a larger value.
		 */
		dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
		     con->peer_global_seq,
		     le32_to_cpu(con->in_reply.global_seq));
		get_global_seq(con->msgr,
			       le32_to_cpu(con->in_reply.global_seq));
		con_out_kvec_reset(con);
		ret = prepare_write_connect(con);
		if (ret < 0)
			return ret;
		prepare_read_connect(con);
		break;

	case CEPH_MSGR_TAG_READY:
		/* the server must offer every feature we require */
		if (req_feat & ~server_feat) {
			pr_err("%s%lld %s protocol feature mismatch,"
			       " my required %llx > server's %llx, need %llx\n",
			       ENTITY_NAME(con->peer_name),
			       ceph_pr_addr(&con->peer_addr.in_addr),
			       req_feat, server_feat, req_feat & ~server_feat);
			con->error_msg = "missing required protocol features";
			reset_connection(con);
			return -1;
		}

		WARN_ON(con->state != CON_STATE_NEGOTIATING);
		con->state = CON_STATE_OPEN;

		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
		con->connect_seq++;
		con->peer_features = server_feat;
		dout("process_connect got READY gseq %d cseq %d (%d)\n",
		     con->peer_global_seq,
		     le32_to_cpu(con->in_reply.connect_seq),
		     con->connect_seq);
		/* our incremented connect_seq should match the peer's */
		WARN_ON(con->connect_seq !=
			le32_to_cpu(con->in_reply.connect_seq));

		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
			con_flag_set(con, CON_FLAG_LOSSYTX);

		con->delay = 0;      /* reset backoff memory */

		prepare_read_tag(con);
		break;

	case CEPH_MSGR_TAG_WAIT:
		/*
		 * If there is a connection race (we are opening
		 * connections to each other), one of us may just have
		 * to WAIT.  This shouldn't happen if we are the
		 * client.
		 */
		pr_err("process_connect got WAIT as client\n");
		con->error_msg = "protocol error, got WAIT as client";
		return -1;

	default:
		/* unrecognized reply tag */
		pr_err("connect protocol error, will retry\n");
		con->error_msg = "protocol error, garbage tag during connect";
		return -1;
	}
	return 0;
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* read (part of) an ack
|
|
|
|
*/
|
|
|
|
static int read_partial_ack(struct ceph_connection *con)
|
|
|
|
{
|
2012-05-10 23:29:50 +08:00
|
|
|
int size = sizeof (con->in_temp_ack);
|
|
|
|
int end = size;
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2012-05-10 23:29:50 +08:00
|
|
|
return read_partial(con, end, size, &con->in_temp_ack);
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We can finally discard anything that's been acked.
|
|
|
|
*/
|
|
|
|
static void process_ack(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
struct ceph_msg *m;
|
|
|
|
u64 ack = le64_to_cpu(con->in_temp_ack);
|
|
|
|
u64 seq;
|
|
|
|
|
|
|
|
while (!list_empty(&con->out_sent)) {
|
|
|
|
m = list_first_entry(&con->out_sent, struct ceph_msg,
|
|
|
|
list_head);
|
|
|
|
seq = le64_to_cpu(m->hdr.seq);
|
|
|
|
if (seq > ack)
|
|
|
|
break;
|
|
|
|
dout("got ack for seq %llu type %d at %p\n", seq,
|
|
|
|
le16_to_cpu(m->hdr.type), m);
|
2011-07-27 02:27:24 +08:00
|
|
|
m->ack_stamp = jiffies;
|
2009-10-07 02:31:13 +08:00
|
|
|
ceph_msg_remove(m);
|
|
|
|
}
|
|
|
|
prepare_read_tag(con);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2010-01-09 05:58:34 +08:00
|
|
|
/*
 * Fill @section from the socket until it holds @sec_len bytes.
 *
 * section->iov_len tracks how much has been received so far, so the
 * function can be re-entered after a short read.  Once the section is
 * complete its crc32c is stored in *@crc.
 *
 * Returns 1 when the section is complete, otherwise the non-positive
 * value returned by ceph_tcp_recvmsg().
 */
static int read_partial_message_section(struct ceph_connection *con,
					struct kvec *section,
					unsigned int sec_len, u32 *crc)
{
	BUG_ON(!section);

	while (section->iov_len < sec_len) {
		char *dest;
		int remaining;
		int nread;

		BUG_ON(section->iov_base == NULL);

		/* resume right after the bytes already received */
		remaining = sec_len - section->iov_len;
		dest = (char *)section->iov_base + section->iov_len;

		nread = ceph_tcp_recvmsg(con->sock, dest, remaining);
		if (nread <= 0)
			return nread;

		section->iov_len += nread;
	}

	if (section->iov_len == sec_len)
		*crc = crc32c(0, section->iov_base, section->iov_len);

	return 1;
}
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2012-07-31 09:19:30 +08:00
|
|
|
static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip);
|
2010-04-07 06:01:27 +08:00
|
|
|
|
|
|
|
/*
 * Receive one piece of message data into the page array @pages.
 *
 * A single call never reads past the end of the current page: the
 * amount requested is the lesser of what is left in the page and what
 * is left of the @data_len byte payload.  When @do_datacrc is set the
 * received bytes are folded into con->in_data_crc.
 *
 * Returns the number of bytes received, or the non-positive value
 * returned by ceph_tcp_recvpage().
 */
static int read_partial_message_pages(struct ceph_connection *con,
				      struct page **pages,
				      unsigned int data_len, bool do_datacrc)
{
	struct ceph_msg_pos *pos = &con->in_msg_pos;
	struct page *cur_page;
	size_t offset;
	size_t want;
	unsigned int remaining;
	int nread;

	BUG_ON(pages == NULL);
	cur_page = pages[pos->page];
	offset = pos->page_pos;

	BUG_ON(pos->data_pos >= data_len);
	remaining = data_len - pos->data_pos;

	BUG_ON(offset >= PAGE_SIZE);
	want = min_t(unsigned int, PAGE_SIZE - offset, remaining);

	nread = ceph_tcp_recvpage(con->sock, cur_page, offset, want);
	if (nread <= 0)
		return nread;

	if (do_datacrc)
		con->in_data_crc = ceph_crc32c_page(con->in_data_crc,
						    cur_page, offset, nread);

	/* record the progress made in con->in_msg_pos */
	in_msg_pos_next(con, want, nread);

	return nread;
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_BLOCK
|
|
|
|
/*
 * Receive one piece of message data into the current bio segment of
 * the incoming message.
 *
 * A single call never reads past the end of the current bio_vec: the
 * amount requested is the lesser of what is left in the segment and
 * what is left of the @data_len byte payload.  When @do_datacrc is set
 * the received bytes are folded into con->in_data_crc.
 *
 * Returns the number of bytes received, or the non-positive value
 * returned by ceph_tcp_recvpage().
 */
static int read_partial_message_bio(struct ceph_connection *con,
				    unsigned int data_len, bool do_datacrc)
{
	struct ceph_msg *in_msg = con->in_msg;
	struct ceph_msg_pos *pos = &con->in_msg_pos;
	struct bio_vec *seg;
	struct page *cur_page;
	size_t offset;
	size_t want;
	unsigned int remaining;
	int nread;

	BUG_ON(!in_msg);
	BUG_ON(!in_msg->b.bio_iter);

	seg = bio_iovec_idx(in_msg->b.bio_iter, in_msg->b.bio_seg);
	cur_page = seg->bv_page;
	/* position within the page = segment start + progress in segment */
	offset = seg->bv_offset + pos->page_pos;

	BUG_ON(pos->data_pos >= data_len);
	remaining = data_len - pos->data_pos;

	BUG_ON(pos->page_pos >= seg->bv_len);
	want = min_t(unsigned int, seg->bv_len - pos->page_pos, remaining);

	nread = ceph_tcp_recvpage(con->sock, cur_page, offset, want);
	if (nread <= 0)
		return nread;

	if (do_datacrc)
		con->in_data_crc = ceph_crc32c_page(con->in_data_crc,
						    cur_page, offset, nread);

	/* record the progress made in con->in_msg_pos */
	in_msg_pos_next(con, want, nread);

	return nread;
}
|
|
|
|
#endif
|
|
|
|
|
2013-03-09 10:58:59 +08:00
|
|
|
static int read_partial_msg_data(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
struct ceph_msg *msg = con->in_msg;
|
|
|
|
struct ceph_msg_pos *msg_pos = &con->in_msg_pos;
|
|
|
|
const bool do_datacrc = !con->msgr->nocrc;
|
|
|
|
unsigned int data_len;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
BUG_ON(!msg);
|
|
|
|
|
|
|
|
data_len = le32_to_cpu(con->in_hdr.data_len);
|
|
|
|
while (msg_pos->data_pos < data_len) {
|
2013-03-02 08:00:16 +08:00
|
|
|
if (ceph_msg_has_pages(msg)) {
|
2013-03-02 08:00:16 +08:00
|
|
|
ret = read_partial_message_pages(con, msg->p.pages,
|
2013-03-09 10:58:59 +08:00
|
|
|
data_len, do_datacrc);
|
|
|
|
if (ret <= 0)
|
|
|
|
return ret;
|
|
|
|
#ifdef CONFIG_BLOCK
|
2013-03-02 08:00:16 +08:00
|
|
|
} else if (ceph_msg_has_bio(msg)) {
|
2013-03-09 10:58:59 +08:00
|
|
|
ret = read_partial_message_bio(con,
|
|
|
|
data_len, do_datacrc);
|
|
|
|
if (ret <= 0)
|
|
|
|
return ret;
|
|
|
|
#endif
|
|
|
|
} else {
|
|
|
|
BUG_ON(1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return 1; /* must return > 0 to indicate success */
|
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
/*
|
|
|
|
* read (part of) a message.
|
|
|
|
*/
|
|
|
|
static int read_partial_message(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
struct ceph_msg *m = con->in_msg;
|
2012-05-10 23:29:50 +08:00
|
|
|
int size;
|
|
|
|
int end;
|
2009-10-07 02:31:13 +08:00
|
|
|
int ret;
|
2012-04-15 13:58:06 +08:00
|
|
|
unsigned int front_len, middle_len, data_len;
|
2012-03-08 01:40:08 +08:00
|
|
|
bool do_datacrc = !con->msgr->nocrc;
|
2010-04-22 22:47:01 +08:00
|
|
|
u64 seq;
|
2012-02-15 21:43:54 +08:00
|
|
|
u32 crc;
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
dout("read_partial_message con %p msg %p\n", con, m);
|
|
|
|
|
|
|
|
/* header */
|
2012-05-10 23:29:50 +08:00
|
|
|
size = sizeof (con->in_hdr);
|
|
|
|
end = size;
|
|
|
|
ret = read_partial(con, end, size, &con->in_hdr);
|
2012-05-10 23:29:50 +08:00
|
|
|
if (ret <= 0)
|
|
|
|
return ret;
|
2012-02-15 21:43:54 +08:00
|
|
|
|
|
|
|
crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc));
|
|
|
|
if (cpu_to_le32(crc) != con->in_hdr.crc) {
|
|
|
|
pr_err("read_partial_message bad hdr "
|
|
|
|
" crc %u != expected %u\n",
|
|
|
|
crc, con->in_hdr.crc);
|
|
|
|
return -EBADMSG;
|
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
front_len = le32_to_cpu(con->in_hdr.front_len);
|
|
|
|
if (front_len > CEPH_MSG_MAX_FRONT_LEN)
|
|
|
|
return -EIO;
|
|
|
|
middle_len = le32_to_cpu(con->in_hdr.middle_len);
|
2013-03-09 08:51:03 +08:00
|
|
|
if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN)
|
2009-10-07 02:31:13 +08:00
|
|
|
return -EIO;
|
|
|
|
data_len = le32_to_cpu(con->in_hdr.data_len);
|
|
|
|
if (data_len > CEPH_MSG_MAX_DATA_LEN)
|
|
|
|
return -EIO;
|
|
|
|
|
2010-04-22 22:47:01 +08:00
|
|
|
/* verify seq# */
|
|
|
|
seq = le64_to_cpu(con->in_hdr.seq);
|
|
|
|
if ((s64)seq - (s64)con->in_seq < 1) {
|
ceph: fix small seq message skipping
If the client gets out of sync with the server message sequence number, we
normally skip low seq messages (ones we already received). The skip code
was also incrementing the expected seq, such that all subsequent messages
also appeared old and got skipped, and an eventual timeout on the osd
connection. This resulted in some lagging requests and console messages
like
[233480.882885] ceph: skipping osd22 10.138.138.13:6804 seq 2016, expected 2017
[233480.882919] ceph: skipping osd22 10.138.138.13:6804 seq 2017, expected 2018
[233480.882963] ceph: skipping osd22 10.138.138.13:6804 seq 2018, expected 2019
[233480.883488] ceph: skipping osd22 10.138.138.13:6804 seq 2019, expected 2020
[233485.219558] ceph: skipping osd22 10.138.138.13:6804 seq 2020, expected 2021
[233485.906595] ceph: skipping osd22 10.138.138.13:6804 seq 2021, expected 2022
[233490.379536] ceph: skipping osd22 10.138.138.13:6804 seq 2022, expected 2023
[233495.523260] ceph: skipping osd22 10.138.138.13:6804 seq 2023, expected 2024
[233495.923194] ceph: skipping osd22 10.138.138.13:6804 seq 2024, expected 2025
[233500.534614] ceph: tid 6023602 timed out on osd22, will reset osd
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Sage Weil <sage@newdream.net>
2010-11-02 06:49:23 +08:00
|
|
|
pr_info("skipping %s%lld %s seq %lld expected %lld\n",
|
2010-04-22 22:47:01 +08:00
|
|
|
ENTITY_NAME(con->peer_name),
|
2010-04-07 06:14:15 +08:00
|
|
|
ceph_pr_addr(&con->peer_addr.in_addr),
|
2010-04-22 22:47:01 +08:00
|
|
|
seq, con->in_seq + 1);
|
|
|
|
con->in_base_pos = -front_len - middle_len - data_len -
|
|
|
|
sizeof(m->footer);
|
|
|
|
con->in_tag = CEPH_MSGR_TAG_READY;
|
|
|
|
return 0;
|
|
|
|
} else if ((s64)seq - (s64)con->in_seq > 1) {
|
|
|
|
pr_err("read_partial_message bad seq %lld expected %lld\n",
|
|
|
|
seq, con->in_seq + 1);
|
|
|
|
con->error_msg = "bad message sequence # for incoming message";
|
|
|
|
return -EBADMSG;
|
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
/* allocate message? */
|
|
|
|
if (!con->in_msg) {
|
2012-07-31 09:19:30 +08:00
|
|
|
int skip = 0;
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
|
2013-03-09 08:51:03 +08:00
|
|
|
front_len, data_len);
|
2012-07-31 09:19:30 +08:00
|
|
|
ret = ceph_con_in_msg_alloc(con, &skip);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
2010-01-09 05:58:34 +08:00
|
|
|
if (skip) {
|
2009-10-07 02:31:13 +08:00
|
|
|
/* skip this message */
|
2010-04-02 07:06:19 +08:00
|
|
|
dout("alloc_msg said skip message\n");
|
2010-06-14 01:30:19 +08:00
|
|
|
BUG_ON(con->in_msg);
|
2009-10-07 02:31:13 +08:00
|
|
|
con->in_base_pos = -front_len - middle_len - data_len -
|
|
|
|
sizeof(m->footer);
|
|
|
|
con->in_tag = CEPH_MSGR_TAG_READY;
|
2010-04-22 11:45:59 +08:00
|
|
|
con->in_seq++;
|
2009-10-07 02:31:13 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2012-06-02 03:56:43 +08:00
|
|
|
|
2012-07-31 09:19:30 +08:00
|
|
|
BUG_ON(!con->in_msg);
|
2012-06-02 03:56:43 +08:00
|
|
|
BUG_ON(con->in_msg->con != con);
|
2009-10-07 02:31:13 +08:00
|
|
|
m = con->in_msg;
|
|
|
|
m->front.iov_len = 0; /* haven't read it yet */
|
2010-01-09 05:58:34 +08:00
|
|
|
if (m->middle)
|
|
|
|
m->middle->vec.iov_len = 0;
|
2010-01-12 02:32:02 +08:00
|
|
|
|
2013-03-07 13:39:39 +08:00
|
|
|
/* prepare for data payload, if any */
|
2012-07-31 07:20:25 +08:00
|
|
|
|
2013-03-07 13:39:39 +08:00
|
|
|
if (data_len)
|
|
|
|
prepare_message_data(con->in_msg, &con->in_msg_pos);
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* front */
|
2010-01-09 05:58:34 +08:00
|
|
|
ret = read_partial_message_section(con, &m->front, front_len,
|
|
|
|
&con->in_front_crc);
|
|
|
|
if (ret <= 0)
|
|
|
|
return ret;
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
/* middle */
|
2010-01-09 05:58:34 +08:00
|
|
|
if (m->middle) {
|
2010-08-04 01:25:11 +08:00
|
|
|
ret = read_partial_message_section(con, &m->middle->vec,
|
|
|
|
middle_len,
|
2010-01-09 05:58:34 +08:00
|
|
|
&con->in_middle_crc);
|
2009-10-07 02:31:13 +08:00
|
|
|
if (ret <= 0)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* (page) data */
|
2013-03-09 10:58:59 +08:00
|
|
|
if (data_len) {
|
|
|
|
ret = read_partial_msg_data(con);
|
|
|
|
if (ret <= 0)
|
|
|
|
return ret;
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* footer */
|
2012-05-10 23:29:50 +08:00
|
|
|
size = sizeof (m->footer);
|
|
|
|
end += size;
|
|
|
|
ret = read_partial(con, end, size, &m->footer);
|
2012-05-10 23:29:50 +08:00
|
|
|
if (ret <= 0)
|
|
|
|
return ret;
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
|
|
|
|
m, front_len, m->footer.front_crc, middle_len,
|
|
|
|
m->footer.middle_crc, data_len, m->footer.data_crc);
|
|
|
|
|
|
|
|
/* crc ok? */
|
|
|
|
if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
|
|
|
|
pr_err("read_partial_message %p front crc %u != exp. %u\n",
|
|
|
|
m, con->in_front_crc, m->footer.front_crc);
|
|
|
|
return -EBADMSG;
|
|
|
|
}
|
|
|
|
if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
|
|
|
|
pr_err("read_partial_message %p middle crc %u != exp %u\n",
|
|
|
|
m, con->in_middle_crc, m->footer.middle_crc);
|
|
|
|
return -EBADMSG;
|
|
|
|
}
|
2012-02-15 21:43:54 +08:00
|
|
|
if (do_datacrc &&
|
2009-10-07 02:31:13 +08:00
|
|
|
(m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
|
|
|
|
con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
|
|
|
|
pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
|
|
|
|
con->in_data_crc, le32_to_cpu(m->footer.data_crc));
|
|
|
|
return -EBADMSG;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 1; /* done! */
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Process message. This happens in the worker thread. The callback should
|
|
|
|
* be careful not to do anything that waits on other incoming messages or it
|
|
|
|
* may deadlock.
|
|
|
|
*/
|
|
|
|
static void process_message(struct ceph_connection *con)
|
|
|
|
{
|
2009-12-15 06:30:34 +08:00
|
|
|
struct ceph_msg *msg;
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2012-06-02 03:56:43 +08:00
|
|
|
BUG_ON(con->in_msg->con != con);
|
|
|
|
con->in_msg->con = NULL;
|
2009-12-15 06:30:34 +08:00
|
|
|
msg = con->in_msg;
|
2009-10-07 02:31:13 +08:00
|
|
|
con->in_msg = NULL;
|
2012-06-22 03:47:08 +08:00
|
|
|
con->ops->put(con);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
/* if first message, set peer_name */
|
|
|
|
if (con->peer_name.type == 0)
|
2010-03-26 06:45:38 +08:00
|
|
|
con->peer_name = msg->hdr.src;
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
con->in_seq++;
|
2009-12-23 02:43:42 +08:00
|
|
|
mutex_unlock(&con->mutex);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
|
|
|
|
msg, le64_to_cpu(msg->hdr.seq),
|
2010-03-26 06:45:38 +08:00
|
|
|
ENTITY_NAME(msg->hdr.src),
|
2009-10-07 02:31:13 +08:00
|
|
|
le16_to_cpu(msg->hdr.type),
|
|
|
|
ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
|
|
|
|
le32_to_cpu(msg->hdr.front_len),
|
|
|
|
le32_to_cpu(msg->hdr.data_len),
|
|
|
|
con->in_front_crc, con->in_middle_crc, con->in_data_crc);
|
|
|
|
con->ops->dispatch(con, msg);
|
2009-12-23 02:43:42 +08:00
|
|
|
|
|
|
|
mutex_lock(&con->mutex);
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Write something to the socket. Called in a worker thread when the
|
|
|
|
* socket appears to be writeable and we have something ready to send.
|
|
|
|
*/
|
|
|
|
static int try_write(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
int ret = 1;
|
|
|
|
|
2012-06-22 03:49:23 +08:00
|
|
|
dout("try_write start %p state %lu\n", con, con->state);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
more:
|
|
|
|
dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
|
|
|
|
|
|
|
|
/* open the socket first? */
|
2012-07-21 08:24:40 +08:00
|
|
|
if (con->state == CON_STATE_PREOPEN) {
|
|
|
|
BUG_ON(con->sock);
|
|
|
|
con->state = CON_STATE_CONNECTING;
|
2012-05-30 00:04:58 +08:00
|
|
|
|
2012-05-24 03:35:23 +08:00
|
|
|
con_out_kvec_reset(con);
|
2012-05-17 04:16:38 +08:00
|
|
|
prepare_write_banner(con);
|
2009-11-11 06:34:36 +08:00
|
|
|
prepare_read_banner(con);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2009-12-12 01:48:05 +08:00
|
|
|
BUG_ON(con->in_msg);
|
2009-10-07 02:31:13 +08:00
|
|
|
con->in_tag = CEPH_MSGR_TAG_READY;
|
|
|
|
dout("try_write initiating connect on %p new state %lu\n",
|
|
|
|
con, con->state);
|
2012-02-15 04:05:33 +08:00
|
|
|
ret = ceph_tcp_connect(con);
|
|
|
|
if (ret < 0) {
|
2009-10-07 02:31:13 +08:00
|
|
|
con->error_msg = "connect error";
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
more_kvec:
|
|
|
|
/* kvec data queued? */
|
|
|
|
if (con->out_skip) {
|
|
|
|
ret = write_partial_skip(con);
|
|
|
|
if (ret <= 0)
|
2011-01-26 00:19:34 +08:00
|
|
|
goto out;
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
if (con->out_kvec_left) {
|
|
|
|
ret = write_partial_kvec(con);
|
|
|
|
if (ret <= 0)
|
2011-01-26 00:19:34 +08:00
|
|
|
goto out;
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* msg pages? */
|
|
|
|
if (con->out_msg) {
|
2009-12-15 06:04:30 +08:00
|
|
|
if (con->out_msg_done) {
|
|
|
|
ceph_msg_put(con->out_msg);
|
|
|
|
con->out_msg = NULL; /* we're done with this one */
|
|
|
|
goto do_next;
|
|
|
|
}
|
|
|
|
|
2013-03-09 10:58:59 +08:00
|
|
|
ret = write_partial_message_data(con);
|
2009-10-07 02:31:13 +08:00
|
|
|
if (ret == 1)
|
|
|
|
goto more_kvec; /* we need to send the footer, too! */
|
|
|
|
if (ret == 0)
|
2011-01-26 00:19:34 +08:00
|
|
|
goto out;
|
2009-10-07 02:31:13 +08:00
|
|
|
if (ret < 0) {
|
2013-03-09 10:58:59 +08:00
|
|
|
dout("try_write write_partial_message_data err %d\n",
|
2009-10-07 02:31:13 +08:00
|
|
|
ret);
|
2011-01-26 00:19:34 +08:00
|
|
|
goto out;
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-12-15 06:04:30 +08:00
|
|
|
do_next:
|
2012-07-21 08:24:40 +08:00
|
|
|
if (con->state == CON_STATE_OPEN) {
|
2009-10-07 02:31:13 +08:00
|
|
|
/* is anything else pending? */
|
|
|
|
if (!list_empty(&con->out_queue)) {
|
|
|
|
prepare_write_message(con);
|
|
|
|
goto more;
|
|
|
|
}
|
|
|
|
if (con->in_seq > con->in_seq_acked) {
|
|
|
|
prepare_write_ack(con);
|
|
|
|
goto more;
|
|
|
|
}
|
2013-02-21 00:25:12 +08:00
|
|
|
if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
|
2009-10-07 02:31:13 +08:00
|
|
|
prepare_write_keepalive(con);
|
|
|
|
goto more;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Nothing to do! */
|
2013-02-21 00:25:12 +08:00
|
|
|
con_flag_clear(con, CON_FLAG_WRITE_PENDING);
|
2009-10-07 02:31:13 +08:00
|
|
|
dout("try_write nothing else to write.\n");
|
|
|
|
ret = 0;
|
|
|
|
out:
|
2011-01-26 00:19:34 +08:00
|
|
|
dout("try_write done on %p ret %d\n", con, ret);
|
2009-10-07 02:31:13 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Read what we can from the socket.
|
|
|
|
*/
|
|
|
|
static int try_read(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
int ret = -1;
|
|
|
|
|
2012-07-21 08:24:40 +08:00
|
|
|
more:
|
|
|
|
dout("try_read start on %p state %lu\n", con, con->state);
|
|
|
|
if (con->state != CON_STATE_CONNECTING &&
|
|
|
|
con->state != CON_STATE_NEGOTIATING &&
|
|
|
|
con->state != CON_STATE_OPEN)
|
2009-10-07 02:31:13 +08:00
|
|
|
return 0;
|
|
|
|
|
2012-07-21 08:24:40 +08:00
|
|
|
BUG_ON(!con->sock);
|
2009-12-23 02:43:42 +08:00
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
|
|
|
|
con->in_base_pos);
|
2011-05-20 02:21:05 +08:00
|
|
|
|
2012-07-21 08:24:40 +08:00
|
|
|
if (con->state == CON_STATE_CONNECTING) {
|
2012-05-25 00:55:03 +08:00
|
|
|
dout("try_read connecting\n");
|
|
|
|
ret = read_partial_banner(con);
|
|
|
|
if (ret <= 0)
|
2012-06-01 00:37:29 +08:00
|
|
|
goto out;
|
2012-05-25 00:55:03 +08:00
|
|
|
ret = process_banner(con);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
2012-07-21 08:24:40 +08:00
|
|
|
con->state = CON_STATE_NEGOTIATING;
|
2012-05-25 00:55:03 +08:00
|
|
|
|
libceph: avoid truncation due to racing banners
Because the Ceph client messenger uses a non-blocking connect, it is
possible for the sending of the client banner to race with the
arrival of the banner sent by the peer.
When ceph_sock_state_change() notices the connect has completed, it
schedules work to process the socket via con_work(). During this
time the peer is writing its banner, and arrival of the peer banner
races with con_work().
If con_work() calls try_read() before the peer banner arrives, there
is nothing for it to do, after which con_work() calls try_write() to
send the client's banner. In this case Ceph's protocol negotiation
can complete succesfully.
The server-side messenger immediately sends its banner and addresses
after accepting a connect request, *before* actually attempting to
read or verify the banner from the client. As a result, it is
possible for the banner from the server to arrive before con_work()
calls try_read(). If that happens, try_read() will read the banner
and prepare protocol negotiation info via prepare_write_connect().
prepare_write_connect() calls con_out_kvec_reset(), which discards
the as-yet-unsent client banner. Next, con_work() calls
try_write(), which sends the protocol negotiation info rather than
the banner that the peer is expecting.
The result is that the peer sees an invalid banner, and the client
reports "negotiation failed".
Fix this by moving con_out_kvec_reset() out of
prepare_write_connect() to its callers at all locations except the
one where the banner might still need to be sent.
[elder@inktak.com: added note about server-side behavior]
Signed-off-by: Jim Schutt <jaschut@sandia.gov>
Reviewed-by: Alex Elder <elder@inktank.com>
2012-08-11 01:37:38 +08:00
|
|
|
/*
|
|
|
|
* Received banner is good, exchange connection info.
|
|
|
|
* Do not reset out_kvec, as sending our banner raced
|
|
|
|
* with receiving peer banner after connect completed.
|
|
|
|
*/
|
2012-05-25 00:55:03 +08:00
|
|
|
ret = prepare_write_connect(con);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
prepare_read_connect(con);
|
|
|
|
|
|
|
|
/* Send connection info before awaiting response */
|
2011-05-20 02:21:05 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2012-07-21 08:24:40 +08:00
|
|
|
if (con->state == CON_STATE_NEGOTIATING) {
|
2012-05-25 00:55:03 +08:00
|
|
|
dout("try_read negotiating\n");
|
2009-10-07 02:31:13 +08:00
|
|
|
ret = read_partial_connect(con);
|
|
|
|
if (ret <= 0)
|
|
|
|
goto out;
|
2011-01-26 00:17:48 +08:00
|
|
|
ret = process_connect(con);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2009-10-07 02:31:13 +08:00
|
|
|
goto more;
|
|
|
|
}
|
|
|
|
|
2012-12-27 00:43:57 +08:00
|
|
|
WARN_ON(con->state != CON_STATE_OPEN);
|
2012-07-21 08:24:40 +08:00
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
if (con->in_base_pos < 0) {
|
|
|
|
/*
|
|
|
|
* skipping + discarding content.
|
|
|
|
*
|
|
|
|
* FIXME: there must be a better way to do this!
|
|
|
|
*/
|
2012-02-15 21:43:55 +08:00
|
|
|
static char buf[SKIP_BUF_SIZE];
|
|
|
|
int skip = min((int) sizeof (buf), -con->in_base_pos);
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
|
|
|
|
ret = ceph_tcp_recvmsg(con->sock, buf, skip);
|
|
|
|
if (ret <= 0)
|
2011-01-26 00:17:48 +08:00
|
|
|
goto out;
|
2009-10-07 02:31:13 +08:00
|
|
|
con->in_base_pos += ret;
|
|
|
|
if (con->in_base_pos)
|
|
|
|
goto more;
|
|
|
|
}
|
|
|
|
if (con->in_tag == CEPH_MSGR_TAG_READY) {
|
|
|
|
/*
|
|
|
|
* what's next?
|
|
|
|
*/
|
|
|
|
ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
|
|
|
|
if (ret <= 0)
|
2011-01-26 00:17:48 +08:00
|
|
|
goto out;
|
2009-10-07 02:31:13 +08:00
|
|
|
dout("try_read got tag %d\n", (int)con->in_tag);
|
|
|
|
switch (con->in_tag) {
|
|
|
|
case CEPH_MSGR_TAG_MSG:
|
|
|
|
prepare_read_message(con);
|
|
|
|
break;
|
|
|
|
case CEPH_MSGR_TAG_ACK:
|
|
|
|
prepare_read_ack(con);
|
|
|
|
break;
|
|
|
|
case CEPH_MSGR_TAG_CLOSE:
|
2012-07-21 08:24:40 +08:00
|
|
|
con_close_socket(con);
|
|
|
|
con->state = CON_STATE_CLOSED;
|
2011-01-26 00:17:48 +08:00
|
|
|
goto out;
|
2009-10-07 02:31:13 +08:00
|
|
|
default:
|
|
|
|
goto bad_tag;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (con->in_tag == CEPH_MSGR_TAG_MSG) {
|
|
|
|
ret = read_partial_message(con);
|
|
|
|
if (ret <= 0) {
|
|
|
|
switch (ret) {
|
|
|
|
case -EBADMSG:
|
|
|
|
con->error_msg = "bad crc";
|
|
|
|
ret = -EIO;
|
2011-01-26 00:17:48 +08:00
|
|
|
break;
|
2009-10-07 02:31:13 +08:00
|
|
|
case -EIO:
|
|
|
|
con->error_msg = "io error";
|
2011-01-26 00:17:48 +08:00
|
|
|
break;
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
2011-01-26 00:17:48 +08:00
|
|
|
goto out;
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
if (con->in_tag == CEPH_MSGR_TAG_READY)
|
|
|
|
goto more;
|
|
|
|
process_message(con);
|
2012-07-31 09:16:56 +08:00
|
|
|
if (con->state == CON_STATE_OPEN)
|
|
|
|
prepare_read_tag(con);
|
2009-10-07 02:31:13 +08:00
|
|
|
goto more;
|
|
|
|
}
|
|
|
|
if (con->in_tag == CEPH_MSGR_TAG_ACK) {
|
|
|
|
ret = read_partial_ack(con);
|
|
|
|
if (ret <= 0)
|
2011-01-26 00:17:48 +08:00
|
|
|
goto out;
|
2009-10-07 02:31:13 +08:00
|
|
|
process_ack(con);
|
|
|
|
goto more;
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
2011-01-26 00:17:48 +08:00
|
|
|
dout("try_read done on %p ret %d\n", con, ret);
|
2009-10-07 02:31:13 +08:00
|
|
|
return ret;
|
|
|
|
|
|
|
|
bad_tag:
|
|
|
|
pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
|
|
|
|
con->error_msg = "protocol error, garbage tag";
|
|
|
|
ret = -1;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
2012-10-09 11:37:30 +08:00
|
|
|
* Atomically queue work on a connection after the specified delay.
|
|
|
|
* Bump @con reference to avoid races with connection teardown.
|
|
|
|
* Returns 0 if work was queued, or an error code otherwise.
|
2009-10-07 02:31:13 +08:00
|
|
|
*/
|
2012-10-09 11:37:30 +08:00
|
|
|
static int queue_con_delay(struct ceph_connection *con, unsigned long delay)
|
2009-10-07 02:31:13 +08:00
|
|
|
{
|
|
|
|
if (!con->ops->get(con)) {
|
2012-10-09 11:37:30 +08:00
|
|
|
dout("%s %p ref count 0\n", __func__, con);
|
|
|
|
|
|
|
|
return -ENOENT;
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
|
2012-10-09 11:37:30 +08:00
|
|
|
if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) {
|
|
|
|
dout("%s %p - already queued\n", __func__, con);
|
2009-10-07 02:31:13 +08:00
|
|
|
con->ops->put(con);
|
2012-10-09 11:37:30 +08:00
|
|
|
|
|
|
|
return -EBUSY;
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
2012-10-09 11:37:30 +08:00
|
|
|
|
|
|
|
dout("%s %p %lu\n", __func__, con, delay);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Queue work on a connection with no delay.  The result of
 * queue_con_delay() is deliberately ignored.
 */
static void queue_con(struct ceph_connection *con)
{
	const unsigned long no_delay = 0;

	(void) queue_con_delay(con, no_delay);
}
|
|
|
|
|
2012-12-08 09:50:07 +08:00
|
|
|
static bool con_sock_closed(struct ceph_connection *con)
|
|
|
|
{
|
2013-02-21 00:25:12 +08:00
|
|
|
if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED))
|
2012-12-08 09:50:07 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
#define CASE(x) \
|
|
|
|
case CON_STATE_ ## x: \
|
|
|
|
con->error_msg = "socket closed (con state " #x ")"; \
|
|
|
|
break;
|
|
|
|
|
|
|
|
switch (con->state) {
|
|
|
|
CASE(CLOSED);
|
|
|
|
CASE(PREOPEN);
|
|
|
|
CASE(CONNECTING);
|
|
|
|
CASE(NEGOTIATING);
|
|
|
|
CASE(OPEN);
|
|
|
|
CASE(STANDBY);
|
|
|
|
default:
|
|
|
|
pr_warning("%s con %p unrecognized state %lu\n",
|
|
|
|
__func__, con, con->state);
|
|
|
|
con->error_msg = "unrecognized con state";
|
|
|
|
BUG();
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
#undef CASE
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2013-02-20 02:25:57 +08:00
|
|
|
static bool con_backoff(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
ret = queue_con_delay(con, round_jiffies_relative(con->delay));
|
|
|
|
if (ret) {
|
|
|
|
dout("%s: con %p FAILED to back off %lu\n", __func__,
|
|
|
|
con, con->delay);
|
|
|
|
BUG_ON(ret == -ENOENT);
|
|
|
|
con_flag_set(con, CON_FLAG_BACKOFF);
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2013-02-20 02:25:57 +08:00
|
|
|
/* Finish fault handling; con->mutex must *not* be held here */
|
|
|
|
|
|
|
|
static void con_fault_finish(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* in case we faulted due to authentication, invalidate our
|
|
|
|
* current tickets so that we can get new ones.
|
|
|
|
*/
|
|
|
|
if (con->auth_retry && con->ops->invalidate_authorizer) {
|
|
|
|
dout("calling invalidate_authorizer()\n");
|
|
|
|
con->ops->invalidate_authorizer(con);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (con->ops->fault)
|
|
|
|
con->ops->fault(con);
|
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
/*
|
|
|
|
* Do some work on a connection. Drop a connection ref when we're done.
|
|
|
|
*/
|
|
|
|
static void con_work(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct ceph_connection *con = container_of(work, struct ceph_connection,
|
|
|
|
work.work);
|
2013-02-20 02:25:57 +08:00
|
|
|
bool fault;
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2010-04-29 04:51:50 +08:00
|
|
|
mutex_lock(&con->mutex);
|
2013-02-20 02:25:57 +08:00
|
|
|
while (true) {
|
|
|
|
int ret;
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2013-02-20 02:25:57 +08:00
|
|
|
if ((fault = con_sock_closed(con))) {
|
|
|
|
dout("%s: con %p SOCK_CLOSED\n", __func__, con);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (con_backoff(con)) {
|
|
|
|
dout("%s: con %p BACKOFF\n", __func__, con);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (con->state == CON_STATE_STANDBY) {
|
|
|
|
dout("%s: con %p STANDBY\n", __func__, con);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (con->state == CON_STATE_CLOSED) {
|
|
|
|
dout("%s: con %p CLOSED\n", __func__, con);
|
|
|
|
BUG_ON(con->sock);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (con->state == CON_STATE_PREOPEN) {
|
|
|
|
dout("%s: con %p PREOPEN\n", __func__, con);
|
|
|
|
BUG_ON(con->sock);
|
|
|
|
}
|
2011-05-20 02:21:05 +08:00
|
|
|
|
2013-02-20 02:25:57 +08:00
|
|
|
ret = try_read(con);
|
|
|
|
if (ret < 0) {
|
|
|
|
if (ret == -EAGAIN)
|
|
|
|
continue;
|
|
|
|
con->error_msg = "socket error on read";
|
|
|
|
fault = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = try_write(con);
|
|
|
|
if (ret < 0) {
|
|
|
|
if (ret == -EAGAIN)
|
|
|
|
continue;
|
|
|
|
con->error_msg = "socket error on write";
|
|
|
|
fault = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
break; /* If we make it to here, we're done */
|
2012-07-31 07:24:21 +08:00
|
|
|
}
|
2013-02-20 02:25:57 +08:00
|
|
|
if (fault)
|
|
|
|
con_fault(con);
|
2010-04-29 04:51:50 +08:00
|
|
|
mutex_unlock(&con->mutex);
|
2011-05-20 02:21:05 +08:00
|
|
|
|
2013-02-20 02:25:57 +08:00
|
|
|
if (fault)
|
|
|
|
con_fault_finish(con);
|
|
|
|
|
|
|
|
con->ops->put(con);
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Generic error/fault handler. A retry mechanism is used with
|
|
|
|
* exponential backoff
|
|
|
|
*/
|
2013-02-20 02:25:57 +08:00
|
|
|
static void con_fault(struct ceph_connection *con)
|
2009-10-07 02:31:13 +08:00
|
|
|
{
|
2012-12-15 06:47:41 +08:00
|
|
|
pr_warning("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
|
2010-04-07 06:14:15 +08:00
|
|
|
ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
|
2009-10-07 02:31:13 +08:00
|
|
|
dout("fault %p state %lu to peer %s\n",
|
2010-04-07 06:14:15 +08:00
|
|
|
con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2012-12-27 00:43:57 +08:00
|
|
|
WARN_ON(con->state != CON_STATE_CONNECTING &&
|
2012-07-21 08:24:40 +08:00
|
|
|
con->state != CON_STATE_NEGOTIATING &&
|
|
|
|
con->state != CON_STATE_OPEN);
|
2009-12-23 02:43:42 +08:00
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
con_close_socket(con);
|
2009-12-15 06:30:34 +08:00
|
|
|
|
2013-02-21 00:25:12 +08:00
|
|
|
if (con_flag_test(con, CON_FLAG_LOSSYTX)) {
|
2012-07-21 08:24:40 +08:00
|
|
|
dout("fault on LOSSYTX channel, marking CLOSED\n");
|
|
|
|
con->state = CON_STATE_CLOSED;
|
2013-02-20 02:25:57 +08:00
|
|
|
return;
|
2012-07-21 06:22:53 +08:00
|
|
|
}
|
|
|
|
|
2009-12-15 06:30:34 +08:00
|
|
|
if (con->in_msg) {
|
2012-06-02 03:56:43 +08:00
|
|
|
BUG_ON(con->in_msg->con != con);
|
|
|
|
con->in_msg->con = NULL;
|
2009-12-15 06:30:34 +08:00
|
|
|
ceph_msg_put(con->in_msg);
|
|
|
|
con->in_msg = NULL;
|
2012-06-22 03:47:08 +08:00
|
|
|
con->ops->put(con);
|
2009-12-15 06:30:34 +08:00
|
|
|
}
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2010-02-26 04:40:45 +08:00
|
|
|
/* Requeue anything that hasn't been acked */
|
|
|
|
list_splice_init(&con->out_sent, &con->out_queue);
|
2010-02-03 08:21:06 +08:00
|
|
|
|
2011-03-04 02:10:15 +08:00
|
|
|
/* If there are no messages queued or keepalive pending, place
|
|
|
|
* the connection in a STANDBY state */
|
|
|
|
if (list_empty(&con->out_queue) &&
|
2013-02-21 00:25:12 +08:00
|
|
|
!con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) {
|
2011-03-05 04:25:05 +08:00
|
|
|
dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
|
2013-02-21 00:25:12 +08:00
|
|
|
con_flag_clear(con, CON_FLAG_WRITE_PENDING);
|
2012-07-21 08:24:40 +08:00
|
|
|
con->state = CON_STATE_STANDBY;
|
2010-02-26 04:40:45 +08:00
|
|
|
} else {
|
|
|
|
/* retry after a delay. */
|
2012-07-21 08:24:40 +08:00
|
|
|
con->state = CON_STATE_PREOPEN;
|
2010-02-26 04:40:45 +08:00
|
|
|
if (con->delay == 0)
|
|
|
|
con->delay = BASE_DELAY_INTERVAL;
|
|
|
|
else if (con->delay < MAX_DELAY_INTERVAL)
|
|
|
|
con->delay *= 2;
|
2013-02-21 00:25:12 +08:00
|
|
|
con_flag_set(con, CON_FLAG_BACKOFF);
|
2012-10-09 11:37:30 +08:00
|
|
|
queue_con(con);
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
2012-05-27 12:26:43 +08:00
|
|
|
* initialize a new messenger instance
|
2009-10-07 02:31:13 +08:00
|
|
|
*/
|
2012-05-27 12:26:43 +08:00
|
|
|
void ceph_messenger_init(struct ceph_messenger *msgr,
|
|
|
|
struct ceph_entity_addr *myaddr,
|
|
|
|
u32 supported_features,
|
|
|
|
u32 required_features,
|
|
|
|
bool nocrc)
|
2009-10-07 02:31:13 +08:00
|
|
|
{
|
2010-04-07 06:14:15 +08:00
|
|
|
msgr->supported_features = supported_features;
|
|
|
|
msgr->required_features = required_features;
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
spin_lock_init(&msgr->global_seq_lock);
|
|
|
|
|
|
|
|
if (myaddr)
|
|
|
|
msgr->inst.addr = *myaddr;
|
|
|
|
|
|
|
|
/* select a random nonce */
|
2010-01-28 06:28:10 +08:00
|
|
|
msgr->inst.addr.type = 0;
|
2010-01-08 08:12:36 +08:00
|
|
|
get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
|
2009-11-04 07:17:56 +08:00
|
|
|
encode_my_addr(msgr);
|
2012-05-27 12:26:43 +08:00
|
|
|
msgr->nocrc = nocrc;
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2012-07-09 10:50:33 +08:00
|
|
|
atomic_set(&msgr->stopping, 0);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2012-05-27 12:26:43 +08:00
|
|
|
dout("%s %p\n", __func__, msgr);
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
2012-05-27 12:26:43 +08:00
|
|
|
EXPORT_SYMBOL(ceph_messenger_init);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2011-03-05 04:25:05 +08:00
|
|
|
static void clear_standby(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
/* come back from STANDBY? */
|
2012-07-21 08:24:40 +08:00
|
|
|
if (con->state == CON_STATE_STANDBY) {
|
2011-03-05 04:25:05 +08:00
|
|
|
dout("clear_standby %p and ++connect_seq\n", con);
|
2012-07-21 08:24:40 +08:00
|
|
|
con->state = CON_STATE_PREOPEN;
|
2011-03-05 04:25:05 +08:00
|
|
|
con->connect_seq++;
|
2013-02-21 00:25:12 +08:00
|
|
|
WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING));
|
|
|
|
WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING));
|
2011-03-05 04:25:05 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
/*
|
|
|
|
* Queue up an outgoing message on the given connection.
|
|
|
|
*/
|
|
|
|
void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
|
|
|
|
{
|
|
|
|
/* set src+dst */
|
2010-03-26 06:45:38 +08:00
|
|
|
msg->hdr.src = con->msgr->inst.name;
|
2010-03-02 07:25:00 +08:00
|
|
|
BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
|
2010-05-12 12:20:38 +08:00
|
|
|
msg->needs_out_seq = true;
|
|
|
|
|
2009-12-23 02:43:42 +08:00
|
|
|
mutex_lock(&con->mutex);
|
libceph: have messages take a connection reference
There are essentially two types of ceph messages: incoming and
outgoing. Outgoing messages are always allocated via ceph_msg_new(),
and at the time of their allocation they are not associated with any
particular connection. Incoming messages are always allocated via
ceph_con_in_msg_alloc(), and they are initially associated with the
connection from which incoming data will be placed into the message.
When an outgoing message gets sent, it becomes associated with a
connection and remains that way until the message is successfully
sent. The association of an incoming message goes away at the point
it is sent to an upper layer via a con->ops->dispatch method.
This patch implements reference counting for all ceph messages, such
that every message holds a reference (and a pointer) to a connection
if and only if it is associated with that connection (as described
above).
For background, here is an explanation of the ceph message
lifecycle, emphasizing when an association exists between a message
and a connection.
Outgoing Messages
An outgoing message is "owned" by its allocator, from the time it is
allocated in ceph_msg_new() up to the point it gets queued for
sending in ceph_con_send(). Prior to that point the message's
msg->con pointer is null; at the point it is queued for sending its
message pointer is assigned to refer to the connection. At that
time the message is inserted into a connection's out_queue list.
When a message on the out_queue list has been sent to the socket
layer to be put on the wire, it is transferred out of that list and
into the connection's out_sent list. At that point it is still owned
by the connection, and will remain so until an acknowledgement is
received from the recipient that indicates the message was
successfully transferred. When such an acknowledgement is received
(in process_ack()), the message is removed from its list (in
ceph_msg_remove()), at which point it is no longer associated with
the connection.
So basically, any time a message is on one of a connection's lists,
it is associated with that connection. Reference counting outgoing
messages can thus be done at the points a message is added to the
out_queue (in ceph_con_send()) and the point it is removed from
either its two lists (in ceph_msg_remove())--at which point its
connection pointer becomes null.
Incoming Messages
When an incoming message on a connection is getting read (in
read_partial_message()) and there is no message in con->in_msg,
a new one is allocated using ceph_con_in_msg_alloc(). At that
point the message is associated with the connection. Once that
message has been completely and successfully read, it is passed to
upper layer code using the connection's con->ops->dispatch method.
At that point the association between the message and the connection
no longer exists.
Reference counting of connections for incoming messages can be done
by taking a reference to the connection when the message gets
allocated, and releasing that reference when it gets handed off
using the dispatch method.
We should never fail to get a connection reference for a
message--the since the caller should already hold one.
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
2012-06-05 03:43:33 +08:00
|
|
|
|
2012-07-21 08:24:40 +08:00
|
|
|
if (con->state == CON_STATE_CLOSED) {
|
2012-07-21 06:34:04 +08:00
|
|
|
dout("con_send %p closed, dropping %p\n", con, msg);
|
|
|
|
ceph_msg_put(msg);
|
|
|
|
mutex_unlock(&con->mutex);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2012-06-02 03:56:43 +08:00
|
|
|
BUG_ON(msg->con != NULL);
|
2012-06-22 03:47:08 +08:00
|
|
|
msg->con = con->ops->get(con);
|
libceph: have messages take a connection reference
There are essentially two types of ceph messages: incoming and
outgoing. Outgoing messages are always allocated via ceph_msg_new(),
and at the time of their allocation they are not associated with any
particular connection. Incoming messages are always allocated via
ceph_con_in_msg_alloc(), and they are initially associated with the
connection from which incoming data will be placed into the message.
When an outgoing message gets sent, it becomes associated with a
connection and remains that way until the message is successfully
sent. The association of an incoming message goes away at the point
it is sent to an upper layer via a con->ops->dispatch method.
This patch implements reference counting for all ceph messages, such
that every message holds a reference (and a pointer) to a connection
if and only if it is associated with that connection (as described
above).
For background, here is an explanation of the ceph message
lifecycle, emphasizing when an association exists between a message
and a connection.
Outgoing Messages
An outgoing message is "owned" by its allocator, from the time it is
allocated in ceph_msg_new() up to the point it gets queued for
sending in ceph_con_send(). Prior to that point the message's
msg->con pointer is null; at the point it is queued for sending its
message pointer is assigned to refer to the connection. At that
time the message is inserted into a connection's out_queue list.
When a message on the out_queue list has been sent to the socket
layer to be put on the wire, it is transferred out of that list and
into the connection's out_sent list. At that point it is still owned
by the connection, and will remain so until an acknowledgement is
received from the recipient that indicates the message was
successfully transferred. When such an acknowledgement is received
(in process_ack()), the message is removed from its list (in
ceph_msg_remove()), at which point it is no longer associated with
the connection.
So basically, any time a message is on one of a connection's lists,
it is associated with that connection. Reference counting outgoing
messages can thus be done at the points a message is added to the
out_queue (in ceph_con_send()) and the point it is removed from
either its two lists (in ceph_msg_remove())--at which point its
connection pointer becomes null.
Incoming Messages
When an incoming message on a connection is getting read (in
read_partial_message()) and there is no message in con->in_msg,
a new one is allocated using ceph_con_in_msg_alloc(). At that
point the message is associated with the connection. Once that
message has been completely and successfully read, it is passed to
upper layer code using the connection's con->ops->dispatch method.
At that point the association between the message and the connection
no longer exists.
Reference counting of connections for incoming messages can be done
by taking a reference to the connection when the message gets
allocated, and releasing that reference when it gets handed off
using the dispatch method.
We should never fail to get a connection reference for a
message--the since the caller should already hold one.
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
2012-06-05 03:43:33 +08:00
|
|
|
BUG_ON(msg->con == NULL);
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
BUG_ON(!list_empty(&msg->list_head));
|
|
|
|
list_add_tail(&msg->list_head, &con->out_queue);
|
|
|
|
dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
|
|
|
|
ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
|
|
|
|
ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
|
|
|
|
le32_to_cpu(msg->hdr.front_len),
|
|
|
|
le32_to_cpu(msg->hdr.middle_len),
|
|
|
|
le32_to_cpu(msg->hdr.data_len));
|
2012-07-21 06:33:04 +08:00
|
|
|
|
|
|
|
clear_standby(con);
|
2009-12-23 02:43:42 +08:00
|
|
|
mutex_unlock(&con->mutex);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
/* if there wasn't anything waiting to send before, queue
|
|
|
|
* new work */
|
2013-02-21 00:25:12 +08:00
|
|
|
if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)
|
2009-10-07 02:31:13 +08:00
|
|
|
queue_con(con);
|
|
|
|
}
|
2010-04-07 06:14:15 +08:00
|
|
|
EXPORT_SYMBOL(ceph_con_send);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Revoke a message that was previously queued for send
|
|
|
|
*/
|
2012-06-02 03:56:43 +08:00
|
|
|
void ceph_msg_revoke(struct ceph_msg *msg)
|
2009-10-07 02:31:13 +08:00
|
|
|
{
|
2012-06-02 03:56:43 +08:00
|
|
|
struct ceph_connection *con = msg->con;
|
|
|
|
|
|
|
|
if (!con)
|
|
|
|
return; /* Message not in our possession */
|
|
|
|
|
2009-12-23 02:43:42 +08:00
|
|
|
mutex_lock(&con->mutex);
|
2009-10-07 02:31:13 +08:00
|
|
|
if (!list_empty(&msg->list_head)) {
|
2012-06-02 03:56:43 +08:00
|
|
|
dout("%s %p msg %p - was on queue\n", __func__, con, msg);
|
2009-10-07 02:31:13 +08:00
|
|
|
list_del_init(&msg->list_head);
|
2012-06-02 03:56:43 +08:00
|
|
|
BUG_ON(msg->con == NULL);
|
2012-06-22 03:47:08 +08:00
|
|
|
msg->con->ops->put(msg->con);
|
2012-06-02 03:56:43 +08:00
|
|
|
msg->con = NULL;
|
2009-10-07 02:31:13 +08:00
|
|
|
msg->hdr.seq = 0;
|
2012-06-02 03:56:43 +08:00
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
ceph_msg_put(msg);
|
2010-07-06 03:15:14 +08:00
|
|
|
}
|
|
|
|
if (con->out_msg == msg) {
|
2012-06-02 03:56:43 +08:00
|
|
|
dout("%s %p msg %p - was sending\n", __func__, con, msg);
|
2010-07-06 03:15:14 +08:00
|
|
|
con->out_msg = NULL;
|
2009-10-07 02:31:13 +08:00
|
|
|
if (con->out_kvec_is_msg) {
|
|
|
|
con->out_skip = con->out_kvec_bytes;
|
|
|
|
con->out_kvec_is_msg = false;
|
|
|
|
}
|
2010-07-06 03:15:14 +08:00
|
|
|
msg->hdr.seq = 0;
|
libceph: have messages take a connection reference
There are essentially two types of ceph messages: incoming and
outgoing. Outgoing messages are always allocated via ceph_msg_new(),
and at the time of their allocation they are not associated with any
particular connection. Incoming messages are always allocated via
ceph_con_in_msg_alloc(), and they are initially associated with the
connection from which incoming data will be placed into the message.
When an outgoing message gets sent, it becomes associated with a
connection and remains that way until the message is successfully
sent. The association of an incoming message goes away at the point
it is sent to an upper layer via a con->ops->dispatch method.
This patch implements reference counting for all ceph messages, such
that every message holds a reference (and a pointer) to a connection
if and only if it is associated with that connection (as described
above).
For background, here is an explanation of the ceph message
lifecycle, emphasizing when an association exists between a message
and a connection.
Outgoing Messages
An outgoing message is "owned" by its allocator, from the time it is
allocated in ceph_msg_new() up to the point it gets queued for
sending in ceph_con_send(). Prior to that point the message's
msg->con pointer is null; at the point it is queued for sending its
message pointer is assigned to refer to the connection. At that
time the message is inserted into a connection's out_queue list.
When a message on the out_queue list has been sent to the socket
layer to be put on the wire, it is transferred out of that list and
into the connection's out_sent list. At that point it is still owned
by the connection, and will remain so until an acknowledgement is
received from the recipient that indicates the message was
successfully transferred. When such an acknowledgement is received
(in process_ack()), the message is removed from its list (in
ceph_msg_remove()), at which point it is no longer associated with
the connection.
So basically, any time a message is on one of a connection's lists,
it is associated with that connection. Reference counting outgoing
messages can thus be done at the points a message is added to the
out_queue (in ceph_con_send()) and the point it is removed from
either its two lists (in ceph_msg_remove())--at which point its
connection pointer becomes null.
Incoming Messages
When an incoming message on a connection is getting read (in
read_partial_message()) and there is no message in con->in_msg,
a new one is allocated using ceph_con_in_msg_alloc(). At that
point the message is associated with the connection. Once that
message has been completely and successfully read, it is passed to
upper layer code using the connection's con->ops->dispatch method.
At that point the association between the message and the connection
no longer exists.
Reference counting of connections for incoming messages can be done
by taking a reference to the connection when the message gets
allocated, and releasing that reference when it gets handed off
using the dispatch method.
We should never fail to get a connection reference for a
message--the since the caller should already hold one.
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
2012-06-05 03:43:33 +08:00
|
|
|
|
|
|
|
ceph_msg_put(msg);
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
2009-12-23 02:43:42 +08:00
|
|
|
mutex_unlock(&con->mutex);
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
|
2009-12-23 02:45:45 +08:00
|
|
|
/*
|
2010-01-14 09:03:23 +08:00
|
|
|
* Revoke a message that we may be reading data into
|
2009-12-23 02:45:45 +08:00
|
|
|
*/
|
2012-06-02 03:56:43 +08:00
|
|
|
void ceph_msg_revoke_incoming(struct ceph_msg *msg)
|
2009-12-23 02:45:45 +08:00
|
|
|
{
|
2012-06-02 03:56:43 +08:00
|
|
|
struct ceph_connection *con;
|
|
|
|
|
|
|
|
BUG_ON(msg == NULL);
|
|
|
|
if (!msg->con) {
|
|
|
|
dout("%s msg %p null con\n", __func__, msg);
|
|
|
|
|
|
|
|
return; /* Message not in our possession */
|
|
|
|
}
|
|
|
|
|
|
|
|
con = msg->con;
|
2009-12-23 02:45:45 +08:00
|
|
|
mutex_lock(&con->mutex);
|
2012-06-02 03:56:43 +08:00
|
|
|
if (con->in_msg == msg) {
|
2012-04-15 13:58:06 +08:00
|
|
|
unsigned int front_len = le32_to_cpu(con->in_hdr.front_len);
|
|
|
|
unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len);
|
|
|
|
unsigned int data_len = le32_to_cpu(con->in_hdr.data_len);
|
2009-12-23 02:45:45 +08:00
|
|
|
|
|
|
|
/* skip rest of message */
|
2012-06-02 03:56:43 +08:00
|
|
|
dout("%s %p msg %p revoked\n", __func__, con, msg);
|
|
|
|
con->in_base_pos = con->in_base_pos -
|
2009-12-23 02:45:45 +08:00
|
|
|
sizeof(struct ceph_msg_header) -
|
2010-01-14 09:03:23 +08:00
|
|
|
front_len -
|
|
|
|
middle_len -
|
|
|
|
data_len -
|
2009-12-23 02:45:45 +08:00
|
|
|
sizeof(struct ceph_msg_footer);
|
|
|
|
ceph_msg_put(con->in_msg);
|
|
|
|
con->in_msg = NULL;
|
|
|
|
con->in_tag = CEPH_MSGR_TAG_READY;
|
2010-04-22 11:45:59 +08:00
|
|
|
con->in_seq++;
|
2009-12-23 02:45:45 +08:00
|
|
|
} else {
|
2012-06-02 03:56:43 +08:00
|
|
|
dout("%s %p in_msg %p msg %p no-op\n",
|
|
|
|
__func__, con, con->in_msg, msg);
|
2009-12-23 02:45:45 +08:00
|
|
|
}
|
|
|
|
mutex_unlock(&con->mutex);
|
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
/*
|
|
|
|
* Queue a keepalive byte to ensure the tcp connection is alive.
|
|
|
|
*/
|
|
|
|
void ceph_con_keepalive(struct ceph_connection *con)
|
|
|
|
{
|
2011-03-05 04:25:05 +08:00
|
|
|
dout("con_keepalive %p\n", con);
|
2012-07-21 06:33:04 +08:00
|
|
|
mutex_lock(&con->mutex);
|
2011-03-05 04:25:05 +08:00
|
|
|
clear_standby(con);
|
2012-07-21 06:33:04 +08:00
|
|
|
mutex_unlock(&con->mutex);
|
2013-02-21 00:25:12 +08:00
|
|
|
if (con_flag_test_and_set(con, CON_FLAG_KEEPALIVE_PENDING) == 0 &&
|
|
|
|
con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)
|
2009-10-07 02:31:13 +08:00
|
|
|
queue_con(con);
|
|
|
|
}
|
2010-04-07 06:14:15 +08:00
|
|
|
EXPORT_SYMBOL(ceph_con_keepalive);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2013-03-02 08:00:16 +08:00
|
|
|
static void ceph_msg_data_init(struct ceph_msg_data *data)
|
|
|
|
{
|
|
|
|
data->type = CEPH_MSG_DATA_NONE;
|
|
|
|
}
|
|
|
|
|
2013-02-15 02:16:43 +08:00
|
|
|
/*
 * Attach a page array to a message (the msg->p slot).
 *
 * @pages and @length must be non-zero and the slot must not already
 * be in use; any violation is a BUG.  Only the low bits of @alignment
 * (those cleared by PAGE_MASK) are recorded.
 */
void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages,
		size_t length, size_t alignment)
{
	BUG_ON(pages == NULL);
	BUG_ON(length == 0);
	BUG_ON(msg->p.type != CEPH_MSG_DATA_NONE);

	msg->p.type = CEPH_MSG_DATA_PAGES;
	msg->p.pages = pages;
	msg->p.length = length;
	msg->p.alignment = alignment & ~PAGE_MASK;
}
EXPORT_SYMBOL(ceph_msg_data_set_pages);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2013-02-15 02:16:43 +08:00
|
|
|
void ceph_msg_data_set_pagelist(struct ceph_msg *msg,
|
|
|
|
struct ceph_pagelist *pagelist)
|
|
|
|
{
|
2013-03-05 08:29:06 +08:00
|
|
|
BUG_ON(!pagelist);
|
|
|
|
BUG_ON(!pagelist->length);
|
2013-03-02 08:00:16 +08:00
|
|
|
BUG_ON(msg->l.type != CEPH_MSG_DATA_NONE);
|
2013-02-15 02:16:43 +08:00
|
|
|
|
2013-03-02 08:00:16 +08:00
|
|
|
msg->l.type = CEPH_MSG_DATA_PAGELIST;
|
2013-03-02 08:00:16 +08:00
|
|
|
msg->l.pagelist = pagelist;
|
2013-02-15 02:16:43 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(ceph_msg_data_set_pagelist);
|
|
|
|
|
|
|
|
void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio)
|
|
|
|
{
|
2013-03-05 08:29:06 +08:00
|
|
|
BUG_ON(!bio);
|
2013-03-02 08:00:16 +08:00
|
|
|
BUG_ON(msg->b.type != CEPH_MSG_DATA_NONE);
|
2013-02-15 02:16:43 +08:00
|
|
|
|
2013-03-02 08:00:16 +08:00
|
|
|
msg->b.type = CEPH_MSG_DATA_BIO;
|
2013-03-02 08:00:16 +08:00
|
|
|
msg->b.bio = bio;
|
2013-02-15 02:16:43 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(ceph_msg_data_set_bio);
|
|
|
|
|
|
|
|
void ceph_msg_data_set_trail(struct ceph_msg *msg, struct ceph_pagelist *trail)
|
|
|
|
{
|
2013-03-05 08:29:06 +08:00
|
|
|
BUG_ON(!trail);
|
|
|
|
BUG_ON(!trail->length);
|
2013-03-02 08:00:16 +08:00
|
|
|
BUG_ON(msg->b.type != CEPH_MSG_DATA_NONE);
|
2013-02-15 02:16:43 +08:00
|
|
|
|
2013-03-02 08:00:16 +08:00
|
|
|
msg->t.type = CEPH_MSG_DATA_PAGELIST;
|
|
|
|
msg->t.pagelist = trail;
|
2013-02-15 02:16:43 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(ceph_msg_data_set_trail);
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
/*
|
|
|
|
* construct a new message with given type, size
|
|
|
|
* the new msg has a ref count of 1.
|
|
|
|
*/
|
2011-08-10 06:03:46 +08:00
|
|
|
struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
|
|
|
|
bool can_fail)
|
2009-10-07 02:31:13 +08:00
|
|
|
{
|
|
|
|
struct ceph_msg *m;
|
|
|
|
|
2013-03-02 08:00:16 +08:00
|
|
|
m = kzalloc(sizeof(*m), flags);
|
2009-10-07 02:31:13 +08:00
|
|
|
if (m == NULL)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
m->hdr.type = cpu_to_le16(type);
|
2010-05-12 06:01:51 +08:00
|
|
|
m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
|
2009-10-07 02:31:13 +08:00
|
|
|
m->hdr.front_len = cpu_to_le32(front_len);
|
2011-05-03 10:29:56 +08:00
|
|
|
|
2013-03-02 08:00:16 +08:00
|
|
|
INIT_LIST_HEAD(&m->list_head);
|
|
|
|
kref_init(&m->kref);
|
2011-05-03 10:29:56 +08:00
|
|
|
|
2013-03-02 08:00:16 +08:00
|
|
|
ceph_msg_data_init(&m->p);
|
|
|
|
ceph_msg_data_init(&m->l);
|
|
|
|
ceph_msg_data_init(&m->b);
|
|
|
|
ceph_msg_data_init(&m->t);
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
/* front */
|
2013-03-02 08:00:16 +08:00
|
|
|
m->front_max = front_len;
|
2009-10-07 02:31:13 +08:00
|
|
|
if (front_len) {
|
|
|
|
if (front_len > PAGE_CACHE_SIZE) {
|
2010-04-07 05:33:58 +08:00
|
|
|
m->front.iov_base = __vmalloc(front_len, flags,
|
2009-10-07 02:31:13 +08:00
|
|
|
PAGE_KERNEL);
|
|
|
|
m->front_is_vmalloc = true;
|
|
|
|
} else {
|
2010-04-07 05:33:58 +08:00
|
|
|
m->front.iov_base = kmalloc(front_len, flags);
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
|
|
|
if (m->front.iov_base == NULL) {
|
2011-08-10 06:03:46 +08:00
|
|
|
dout("ceph_msg_new can't allocate %d bytes\n",
|
2009-10-07 02:31:13 +08:00
|
|
|
front_len);
|
|
|
|
goto out2;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
m->front.iov_base = NULL;
|
|
|
|
}
|
|
|
|
m->front.iov_len = front_len;
|
|
|
|
|
2010-04-02 07:07:23 +08:00
|
|
|
dout("ceph_msg_new %p front %d\n", m, front_len);
|
2009-10-07 02:31:13 +08:00
|
|
|
return m;
|
|
|
|
|
|
|
|
out2:
|
|
|
|
ceph_msg_put(m);
|
|
|
|
out:
|
2011-08-10 06:03:46 +08:00
|
|
|
if (!can_fail) {
|
|
|
|
pr_err("msg_new can't create type %d front %d\n", type,
|
|
|
|
front_len);
|
2011-08-10 06:05:07 +08:00
|
|
|
WARN_ON(1);
|
2011-08-10 06:03:46 +08:00
|
|
|
} else {
|
|
|
|
dout("msg_new can't create type %d front %d\n", type,
|
|
|
|
front_len);
|
|
|
|
}
|
2010-04-02 07:06:19 +08:00
|
|
|
return NULL;
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
2010-04-07 06:14:15 +08:00
|
|
|
EXPORT_SYMBOL(ceph_msg_new);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Allocate "middle" portion of a message, if it is needed and wasn't
|
|
|
|
* allocated by alloc_msg. This allows us to read a small fixed-size
|
|
|
|
* per-type header in the front and then gracefully fail (i.e.,
|
|
|
|
* propagate the error to the caller based on info in the front) when
|
|
|
|
* the middle is too large.
|
|
|
|
*/
|
2010-01-09 05:58:34 +08:00
|
|
|
static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
|
2009-10-07 02:31:13 +08:00
|
|
|
{
|
|
|
|
int type = le16_to_cpu(msg->hdr.type);
|
|
|
|
int middle_len = le32_to_cpu(msg->hdr.middle_len);
|
|
|
|
|
|
|
|
dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
|
|
|
|
ceph_msg_type_name(type), middle_len);
|
|
|
|
BUG_ON(!middle_len);
|
|
|
|
BUG_ON(msg->middle);
|
|
|
|
|
2009-12-08 04:17:17 +08:00
|
|
|
msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
|
2009-10-07 02:31:13 +08:00
|
|
|
if (!msg->middle)
|
|
|
|
return -ENOMEM;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2010-01-09 05:58:34 +08:00
|
|
|
/*
|
2012-06-05 03:43:32 +08:00
|
|
|
* Allocate a message for receiving an incoming message on a
|
|
|
|
* connection, and save the result in con->in_msg. Uses the
|
|
|
|
* connection's private alloc_msg op if available.
|
|
|
|
*
|
2012-07-31 09:19:30 +08:00
|
|
|
* Returns 0 on success, or a negative error code.
|
|
|
|
*
|
|
|
|
* On success, if we set *skip = 1:
|
|
|
|
* - the next message should be skipped and ignored.
|
|
|
|
* - con->in_msg == NULL
|
|
|
|
* or if we set *skip = 0:
|
|
|
|
* - con->in_msg is non-null.
|
|
|
|
* On error (ENOMEM, EAGAIN, ...),
|
|
|
|
* - con->in_msg == NULL
|
2010-01-09 05:58:34 +08:00
|
|
|
*/
|
2012-07-31 09:19:30 +08:00
|
|
|
static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
|
2010-01-09 05:58:34 +08:00
|
|
|
{
|
2012-07-31 09:19:30 +08:00
|
|
|
struct ceph_msg_header *hdr = &con->in_hdr;
|
2010-01-09 05:58:34 +08:00
|
|
|
int middle_len = le32_to_cpu(hdr->middle_len);
|
2013-03-02 08:00:14 +08:00
|
|
|
struct ceph_msg *msg;
|
2012-07-31 09:19:30 +08:00
|
|
|
int ret = 0;
|
2010-01-09 05:58:34 +08:00
|
|
|
|
2012-06-05 03:43:32 +08:00
|
|
|
BUG_ON(con->in_msg != NULL);
|
2013-03-02 08:00:14 +08:00
|
|
|
BUG_ON(!con->ops->alloc_msg);
|
2010-01-09 05:58:34 +08:00
|
|
|
|
2013-03-02 08:00:14 +08:00
|
|
|
mutex_unlock(&con->mutex);
|
|
|
|
msg = con->ops->alloc_msg(con, hdr, skip);
|
|
|
|
mutex_lock(&con->mutex);
|
|
|
|
if (con->state != CON_STATE_OPEN) {
|
|
|
|
if (msg)
|
2013-03-02 08:00:14 +08:00
|
|
|
ceph_msg_put(msg);
|
2013-03-02 08:00:14 +08:00
|
|
|
return -EAGAIN;
|
|
|
|
}
|
2013-03-05 23:25:10 +08:00
|
|
|
if (msg) {
|
|
|
|
BUG_ON(*skip);
|
|
|
|
con->in_msg = msg;
|
2012-06-22 03:47:08 +08:00
|
|
|
con->in_msg->con = con->ops->get(con);
|
libceph: have messages take a connection reference
There are essentially two types of ceph messages: incoming and
outgoing. Outgoing messages are always allocated via ceph_msg_new(),
and at the time of their allocation they are not associated with any
particular connection. Incoming messages are always allocated via
ceph_con_in_msg_alloc(), and they are initially associated with the
connection from which incoming data will be placed into the message.
When an outgoing message gets sent, it becomes associated with a
connection and remains that way until the message is successfully
sent. The association of an incoming message goes away at the point
it is sent to an upper layer via a con->ops->dispatch method.
This patch implements reference counting for all ceph messages, such
that every message holds a reference (and a pointer) to a connection
if and only if it is associated with that connection (as described
above).
For background, here is an explanation of the ceph message
lifecycle, emphasizing when an association exists between a message
and a connection.
Outgoing Messages
An outgoing message is "owned" by its allocator, from the time it is
allocated in ceph_msg_new() up to the point it gets queued for
sending in ceph_con_send(). Prior to that point the message's
msg->con pointer is null; at the point it is queued for sending its
message pointer is assigned to refer to the connection. At that
time the message is inserted into a connection's out_queue list.
When a message on the out_queue list has been sent to the socket
layer to be put on the wire, it is transferred out of that list and
into the connection's out_sent list. At that point it is still owned
by the connection, and will remain so until an acknowledgement is
received from the recipient that indicates the message was
successfully transferred. When such an acknowledgement is received
(in process_ack()), the message is removed from its list (in
ceph_msg_remove()), at which point it is no longer associated with
the connection.
So basically, any time a message is on one of a connection's lists,
it is associated with that connection. Reference counting outgoing
messages can thus be done at the points a message is added to the
out_queue (in ceph_con_send()) and the point it is removed from
either its two lists (in ceph_msg_remove())--at which point its
connection pointer becomes null.
Incoming Messages
When an incoming message on a connection is getting read (in
read_partial_message()) and there is no message in con->in_msg,
a new one is allocated using ceph_con_in_msg_alloc(). At that
point the message is associated with the connection. Once that
message has been completely and successfully read, it is passed to
upper layer code using the connection's con->ops->dispatch method.
At that point the association between the message and the connection
no longer exists.
Reference counting of connections for incoming messages can be done
by taking a reference to the connection when the message gets
allocated, and releasing that reference when it gets handed off
using the dispatch method.
We should never fail to get a connection reference for a
message--the since the caller should already hold one.
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
2012-06-05 03:43:33 +08:00
|
|
|
BUG_ON(con->in_msg->con == NULL);
|
2013-03-05 23:25:10 +08:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Null message pointer means either we should skip
|
|
|
|
* this message or we couldn't allocate memory. The
|
|
|
|
* former is not an error.
|
|
|
|
*/
|
|
|
|
if (*skip)
|
|
|
|
return 0;
|
|
|
|
con->error_msg = "error allocating memory for incoming message";
|
|
|
|
|
2013-03-02 08:00:14 +08:00
|
|
|
return -ENOMEM;
|
2010-01-09 05:58:34 +08:00
|
|
|
}
|
2012-06-05 03:43:32 +08:00
|
|
|
memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
|
2010-01-09 05:58:34 +08:00
|
|
|
|
2012-06-05 03:43:32 +08:00
|
|
|
if (middle_len && !con->in_msg->middle) {
|
|
|
|
ret = ceph_alloc_middle(con, con->in_msg);
|
2010-01-09 05:58:34 +08:00
|
|
|
if (ret < 0) {
|
2012-06-05 03:43:32 +08:00
|
|
|
ceph_msg_put(con->in_msg);
|
|
|
|
con->in_msg = NULL;
|
2010-01-09 05:58:34 +08:00
|
|
|
}
|
|
|
|
}
|
2010-01-12 02:32:02 +08:00
|
|
|
|
2012-07-31 09:19:30 +08:00
|
|
|
return ret;
|
2010-01-09 05:58:34 +08:00
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:13 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Free a generically kmalloc'd message.
|
|
|
|
*/
|
|
|
|
void ceph_msg_kfree(struct ceph_msg *m)
|
|
|
|
{
|
|
|
|
dout("msg_kfree %p\n", m);
|
|
|
|
if (m->front_is_vmalloc)
|
|
|
|
vfree(m->front.iov_base);
|
|
|
|
else
|
|
|
|
kfree(m->front.iov_base);
|
|
|
|
kfree(m);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Drop a msg ref. Destroy as needed.
|
|
|
|
*/
|
2009-12-08 07:55:05 +08:00
|
|
|
void ceph_msg_last_put(struct kref *kref)
|
|
|
|
{
|
|
|
|
struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
|
2009-10-07 02:31:13 +08:00
|
|
|
|
2009-12-08 07:55:05 +08:00
|
|
|
dout("ceph_msg_put last one on %p\n", m);
|
|
|
|
WARN_ON(!list_empty(&m->list_head));
|
|
|
|
|
|
|
|
/* drop middle, data, if any */
|
|
|
|
if (m->middle) {
|
|
|
|
ceph_buffer_put(m->middle);
|
|
|
|
m->middle = NULL;
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
2013-03-02 08:00:16 +08:00
|
|
|
if (ceph_msg_has_pages(m)) {
|
2013-03-02 08:00:16 +08:00
|
|
|
m->p.length = 0;
|
|
|
|
m->p.pages = NULL;
|
2013-03-02 08:00:16 +08:00
|
|
|
}
|
2009-12-08 07:55:05 +08:00
|
|
|
|
2013-03-02 08:00:16 +08:00
|
|
|
if (ceph_msg_has_pagelist(m)) {
|
2013-03-02 08:00:16 +08:00
|
|
|
ceph_pagelist_release(m->l.pagelist);
|
|
|
|
kfree(m->l.pagelist);
|
|
|
|
m->l.pagelist = NULL;
|
2009-12-24 04:12:31 +08:00
|
|
|
}
|
|
|
|
|
2013-03-02 08:00:16 +08:00
|
|
|
if (ceph_msg_has_trail(m))
|
2013-03-02 08:00:16 +08:00
|
|
|
m->t.pagelist = NULL;
|
2010-04-07 06:01:27 +08:00
|
|
|
|
2009-12-08 07:55:05 +08:00
|
|
|
if (m->pool)
|
|
|
|
ceph_msgpool_put(m->pool, m);
|
|
|
|
else
|
|
|
|
ceph_msg_kfree(m);
|
2009-10-07 02:31:13 +08:00
|
|
|
}
|
2010-04-07 06:14:15 +08:00
|
|
|
EXPORT_SYMBOL(ceph_msg_last_put);
|
2009-12-15 07:13:47 +08:00
|
|
|
|
|
|
|
void ceph_msg_dump(struct ceph_msg *msg)
|
|
|
|
{
|
2013-03-08 05:38:26 +08:00
|
|
|
pr_debug("msg_dump %p (front_max %d length %zd)\n", msg,
|
2013-03-02 08:00:16 +08:00
|
|
|
msg->front_max, msg->p.length);
|
2009-12-15 07:13:47 +08:00
|
|
|
print_hex_dump(KERN_DEBUG, "header: ",
|
|
|
|
DUMP_PREFIX_OFFSET, 16, 1,
|
|
|
|
&msg->hdr, sizeof(msg->hdr), true);
|
|
|
|
print_hex_dump(KERN_DEBUG, " front: ",
|
|
|
|
DUMP_PREFIX_OFFSET, 16, 1,
|
|
|
|
msg->front.iov_base, msg->front.iov_len, true);
|
|
|
|
if (msg->middle)
|
|
|
|
print_hex_dump(KERN_DEBUG, "middle: ",
|
|
|
|
DUMP_PREFIX_OFFSET, 16, 1,
|
|
|
|
msg->middle->vec.iov_base,
|
|
|
|
msg->middle->vec.iov_len, true);
|
|
|
|
print_hex_dump(KERN_DEBUG, "footer: ",
|
|
|
|
DUMP_PREFIX_OFFSET, 16, 1,
|
|
|
|
&msg->footer, sizeof(msg->footer), true);
|
|
|
|
}
|
2010-04-07 06:14:15 +08:00
|
|
|
EXPORT_SYMBOL(ceph_msg_dump);
|