RDS: add receive message trace used by application
Socket option to tap receive path latency at various stages, in nanoseconds. It can be enabled on selected sockets using the SO_RDS_MSG_RXPATH_LATENCY socket option. RDS will return the data to the application with RDS_CMSG_RXPATH_LATENCY in the defined format. Scope is left to add more trace points in the future without needing to change the interface. Reviewed-by: Sowmini Varadhan <sowmini.varadhan@oracle.com> Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
This commit is contained in:
parent
f9fb69adb6
commit
3289025aed
|
@ -52,6 +52,13 @@
|
||||||
#define RDS_GET_MR_FOR_DEST 7
|
#define RDS_GET_MR_FOR_DEST 7
|
||||||
#define SO_RDS_TRANSPORT 8
|
#define SO_RDS_TRANSPORT 8
|
||||||
|
|
||||||
|
/* Socket option to tap receive path latency
|
||||||
|
* SO_RDS: SO_RDS_MSG_RXPATH_LATENCY
|
||||||
|
* Format used: struct rds_rx_trace_so
|
||||||
|
*/
|
||||||
|
#define SO_RDS_MSG_RXPATH_LATENCY 10
|
||||||
|
|
||||||
|
|
||||||
/* supported values for SO_RDS_TRANSPORT */
|
/* supported values for SO_RDS_TRANSPORT */
|
||||||
#define RDS_TRANS_IB 0
|
#define RDS_TRANS_IB 0
|
||||||
#define RDS_TRANS_IWARP 1
|
#define RDS_TRANS_IWARP 1
|
||||||
|
@ -77,6 +84,12 @@
|
||||||
* the same as for the GET_MR setsockopt.
|
* the same as for the GET_MR setsockopt.
|
||||||
* RDS_CMSG_RDMA_STATUS (recvmsg)
|
* RDS_CMSG_RDMA_STATUS (recvmsg)
|
||||||
* Returns the status of a completed RDMA operation.
|
* Returns the status of a completed RDMA operation.
|
||||||
|
* RDS_CMSG_RXPATH_LATENCY(recvmsg)
|
||||||
|
* Returns rds message latencies in various stages of receive
|
||||||
|
* path in ns. It is set per socket using the SO_RDS_MSG_RXPATH_LATENCY
|
||||||
|
* socket option. Legitimate points are defined in
|
||||||
|
* enum rds_message_rxpath_latency. More points can be added in
|
||||||
|
* future. CMSG format is struct rds_cmsg_rx_trace.
|
||||||
*/
|
*/
|
||||||
#define RDS_CMSG_RDMA_ARGS 1
|
#define RDS_CMSG_RDMA_ARGS 1
|
||||||
#define RDS_CMSG_RDMA_DEST 2
|
#define RDS_CMSG_RDMA_DEST 2
|
||||||
|
@ -87,6 +100,7 @@
|
||||||
#define RDS_CMSG_ATOMIC_CSWP 7
|
#define RDS_CMSG_ATOMIC_CSWP 7
|
||||||
#define RDS_CMSG_MASKED_ATOMIC_FADD 8
|
#define RDS_CMSG_MASKED_ATOMIC_FADD 8
|
||||||
#define RDS_CMSG_MASKED_ATOMIC_CSWP 9
|
#define RDS_CMSG_MASKED_ATOMIC_CSWP 9
|
||||||
|
#define RDS_CMSG_RXPATH_LATENCY 11
|
||||||
|
|
||||||
#define RDS_INFO_FIRST 10000
|
#define RDS_INFO_FIRST 10000
|
||||||
#define RDS_INFO_COUNTERS 10000
|
#define RDS_INFO_COUNTERS 10000
|
||||||
|
@ -171,6 +185,25 @@ struct rds_info_rdma_connection {
|
||||||
uint32_t rdma_mr_size;
|
uint32_t rdma_mr_size;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* RDS message Receive Path Latency points */
|
||||||
|
/*
 * Intervals of the receive path whose latency can be reported.
 *
 * Each value selects the difference between two consecutive receive-path
 * timestamps: interval N is (timestamp N + 1) - (timestamp N).
 * RDS_MSG_RX_DGRAM_TRACE_MAX is the number of intervals, not a valid
 * interval itself; valid positions are 0 .. RDS_MSG_RX_DGRAM_TRACE_MAX - 1.
 */
enum rds_message_rxpath_latency {
	/* from the "header" timestamp to the "start" timestamp */
	RDS_MSG_RX_HDR_TO_DGRAM_START = 0,
	/* from the "start" timestamp to the "end" timestamp */
	RDS_MSG_RX_DGRAM_REASSEMBLE,
	/* from the "end" timestamp to the "cmsg" timestamp */
	RDS_MSG_RX_DGRAM_DELIVERED,
	/* number of intervals; keep last */
	RDS_MSG_RX_DGRAM_TRACE_MAX
};

/*
 * Argument of the SO_RDS_MSG_RXPATH_LATENCY socket option.
 *
 * Fixed-width <stdint.h>-style types are used, matching the other
 * user-visible structures in this header, instead of the kernel-internal
 * u8/u64 names.
 */
struct rds_rx_trace_so {
	/* number of valid entries in rx_trace_pos[] */
	uint8_t rx_traces;
	/* requested intervals, each an enum rds_message_rxpath_latency value */
	uint8_t rx_trace_pos[RDS_MSG_RX_DGRAM_TRACE_MAX];
};

/*
 * Payload of the RDS_CMSG_RXPATH_LATENCY control message.
 */
struct rds_cmsg_rx_trace {
	/* number of valid entries in rx_trace_pos[] and rx_trace[] */
	uint8_t rx_traces;
	/* interval reported in the rx_trace[] slot with the same index */
	uint8_t rx_trace_pos[RDS_MSG_RX_DGRAM_TRACE_MAX];
	/* measured latency of each reported interval */
	uint64_t rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Congestion monitoring.
|
* Congestion monitoring.
|
||||||
* Congestion control in RDS happens at the host connection
|
* Congestion control in RDS happens at the host connection
|
||||||
|
|
|
@ -298,6 +298,30 @@ static int rds_enable_recvtstamp(struct sock *sk, char __user *optval,
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * rds_recv_track_latency() - handle the SO_RDS_MSG_RXPATH_LATENCY option.
 * @rs:     socket whose receive-path trace configuration is updated
 * @optval: user pointer to a struct rds_rx_trace_so
 * @optlen: length of the user buffer; must equal sizeof(struct rds_rx_trace_so)
 *
 * Copies the requested trace points from user space, validates them and
 * stores them in @rs->rs_rx_traces / @rs->rs_rx_trace[].
 *
 * Return: 0 on success, -EFAULT if the length is wrong, the copy from user
 * space fails, or the request names more trace points or a higher position
 * than is supported.
 */
static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval,
				  int optlen)
{
	struct rds_rx_trace_so trace;
	int i;

	if (optlen != sizeof(struct rds_rx_trace_so))
		return -EFAULT;

	if (copy_from_user(&trace, optval, sizeof(trace)))
		return -EFAULT;

	/*
	 * The count comes straight from user space and drives the loop
	 * below; both trace.rx_trace_pos[] and rs->rs_rx_trace[] hold only
	 * RDS_MSG_RX_DGRAM_TRACE_MAX entries.
	 */
	if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX)
		return -EFAULT;

	rs->rs_rx_traces = trace.rx_traces;
	for (i = 0; i < rs->rs_rx_traces; i++) {
		/*
		 * A stored position j is later used to read
		 * i_rx_lat_trace[j] and i_rx_lat_trace[j + 1], an array of
		 * RDS_MSG_RX_DGRAM_TRACE_MAX + 1 entries, so j must be
		 * strictly below RDS_MSG_RX_DGRAM_TRACE_MAX.
		 */
		if (trace.rx_trace_pos[i] >= RDS_MSG_RX_DGRAM_TRACE_MAX) {
			/* reject the whole request and disable tracing */
			rs->rs_rx_traces = 0;
			return -EFAULT;
		}
		rs->rs_rx_trace[i] = trace.rx_trace_pos[i];
	}

	return 0;
}
|
||||||
|
|
||||||
static int rds_setsockopt(struct socket *sock, int level, int optname,
|
static int rds_setsockopt(struct socket *sock, int level, int optname,
|
||||||
char __user *optval, unsigned int optlen)
|
char __user *optval, unsigned int optlen)
|
||||||
{
|
{
|
||||||
|
@ -338,6 +362,9 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
|
||||||
ret = rds_enable_recvtstamp(sock->sk, optval, optlen);
|
ret = rds_enable_recvtstamp(sock->sk, optval, optlen);
|
||||||
release_sock(sock->sk);
|
release_sock(sock->sk);
|
||||||
break;
|
break;
|
||||||
|
case SO_RDS_MSG_RXPATH_LATENCY:
|
||||||
|
ret = rds_recv_track_latency(rs, optval, optlen);
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
ret = -ENOPROTOOPT;
|
ret = -ENOPROTOOPT;
|
||||||
}
|
}
|
||||||
|
@ -484,6 +511,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
|
||||||
INIT_LIST_HEAD(&rs->rs_cong_list);
|
INIT_LIST_HEAD(&rs->rs_cong_list);
|
||||||
spin_lock_init(&rs->rs_rdma_lock);
|
spin_lock_init(&rs->rs_rdma_lock);
|
||||||
rs->rs_rdma_keys = RB_ROOT;
|
rs->rs_rdma_keys = RB_ROOT;
|
||||||
|
rs->rs_rx_traces = 0;
|
||||||
|
|
||||||
spin_lock_bh(&rds_sock_lock);
|
spin_lock_bh(&rds_sock_lock);
|
||||||
list_add_tail(&rs->rs_item, &rds_sock_list);
|
list_add_tail(&rs->rs_item, &rds_sock_list);
|
||||||
|
|
|
@ -911,8 +911,12 @@ static void rds_ib_process_recv(struct rds_connection *conn,
|
||||||
ic->i_ibinc = ibinc;
|
ic->i_ibinc = ibinc;
|
||||||
|
|
||||||
hdr = &ibinc->ii_inc.i_hdr;
|
hdr = &ibinc->ii_inc.i_hdr;
|
||||||
|
ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
|
||||||
|
local_clock();
|
||||||
memcpy(hdr, ihdr, sizeof(*hdr));
|
memcpy(hdr, ihdr, sizeof(*hdr));
|
||||||
ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
|
ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
|
||||||
|
ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_START] =
|
||||||
|
local_clock();
|
||||||
|
|
||||||
rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc,
|
rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc,
|
||||||
ic->i_recv_data_rem, hdr->h_flags);
|
ic->i_recv_data_rem, hdr->h_flags);
|
||||||
|
|
|
@ -253,6 +253,11 @@ struct rds_ext_header_rdma_dest {
|
||||||
#define RDS_EXTHDR_GEN_NUM 6
|
#define RDS_EXTHDR_GEN_NUM 6
|
||||||
|
|
||||||
#define __RDS_EXTHDR_MAX 16 /* for now */
|
#define __RDS_EXTHDR_MAX 16 /* for now */
|
||||||
|
/*
 * Receive-path latency timestamps kept per incoming message in
 * rds_incoming.i_rx_lat_trace[].
 *
 * RDS_MSG_RX_HDR, RDS_MSG_RX_START, RDS_MSG_RX_END and RDS_MSG_RX_CMSG are
 * the array slots, each filled with local_clock() at a different point of
 * the receive path. A reported latency is the difference between two
 * consecutive slots, so RDS_MSG_RX_DGRAM_TRACE_MAX intervals need one more
 * timestamp than that; hence RDS_RX_MAX_TRACES.
 */
#define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1)
|
||||||
|
#define RDS_MSG_RX_HDR 0
|
||||||
|
#define RDS_MSG_RX_START 1
|
||||||
|
#define RDS_MSG_RX_END 2
|
||||||
|
#define RDS_MSG_RX_CMSG 3
|
||||||
|
|
||||||
struct rds_incoming {
|
struct rds_incoming {
|
||||||
atomic_t i_refcount;
|
atomic_t i_refcount;
|
||||||
|
@ -265,6 +270,7 @@ struct rds_incoming {
|
||||||
|
|
||||||
rds_rdma_cookie_t i_rdma_cookie;
|
rds_rdma_cookie_t i_rdma_cookie;
|
||||||
struct timeval i_rx_tstamp;
|
struct timeval i_rx_tstamp;
|
||||||
|
u64 i_rx_lat_trace[RDS_RX_MAX_TRACES];
|
||||||
};
|
};
|
||||||
|
|
||||||
struct rds_mr {
|
struct rds_mr {
|
||||||
|
@ -575,6 +581,10 @@ struct rds_sock {
|
||||||
unsigned char rs_recverr,
|
unsigned char rs_recverr,
|
||||||
rs_cong_monitor;
|
rs_cong_monitor;
|
||||||
u32 rs_hash_initval;
|
u32 rs_hash_initval;
|
||||||
|
|
||||||
|
/* Socket receive path trace points */
|
||||||
|
u8 rs_rx_traces;
|
||||||
|
u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
|
||||||
};
|
};
|
||||||
|
|
||||||
static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
|
static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
|
||||||
|
|
|
@ -43,6 +43,8 @@
|
||||||
void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
|
void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
|
||||||
__be32 saddr)
|
__be32 saddr)
|
||||||
{
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
atomic_set(&inc->i_refcount, 1);
|
atomic_set(&inc->i_refcount, 1);
|
||||||
INIT_LIST_HEAD(&inc->i_item);
|
INIT_LIST_HEAD(&inc->i_item);
|
||||||
inc->i_conn = conn;
|
inc->i_conn = conn;
|
||||||
|
@ -50,6 +52,9 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
|
||||||
inc->i_rdma_cookie = 0;
|
inc->i_rdma_cookie = 0;
|
||||||
inc->i_rx_tstamp.tv_sec = 0;
|
inc->i_rx_tstamp.tv_sec = 0;
|
||||||
inc->i_rx_tstamp.tv_usec = 0;
|
inc->i_rx_tstamp.tv_usec = 0;
|
||||||
|
|
||||||
|
for (i = 0; i < RDS_RX_MAX_TRACES; i++)
|
||||||
|
inc->i_rx_lat_trace[i] = 0;
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(rds_inc_init);
|
EXPORT_SYMBOL_GPL(rds_inc_init);
|
||||||
|
|
||||||
|
@ -373,6 +378,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
|
||||||
if (sock_flag(sk, SOCK_RCVTSTAMP))
|
if (sock_flag(sk, SOCK_RCVTSTAMP))
|
||||||
do_gettimeofday(&inc->i_rx_tstamp);
|
do_gettimeofday(&inc->i_rx_tstamp);
|
||||||
rds_inc_addref(inc);
|
rds_inc_addref(inc);
|
||||||
|
inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock();
|
||||||
list_add_tail(&inc->i_item, &rs->rs_recv_queue);
|
list_add_tail(&inc->i_item, &rs->rs_recv_queue);
|
||||||
__rds_wake_sk_sleep(sk);
|
__rds_wake_sk_sleep(sk);
|
||||||
} else {
|
} else {
|
||||||
|
@ -534,7 +540,7 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
|
||||||
ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
|
ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
|
||||||
sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
|
sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
|
||||||
if (ret)
|
if (ret)
|
||||||
return ret;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((inc->i_rx_tstamp.tv_sec != 0) &&
|
if ((inc->i_rx_tstamp.tv_sec != 0) &&
|
||||||
|
@ -543,10 +549,30 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
|
||||||
sizeof(struct timeval),
|
sizeof(struct timeval),
|
||||||
&inc->i_rx_tstamp);
|
&inc->i_rx_tstamp);
|
||||||
if (ret)
|
if (ret)
|
||||||
return ret;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
if (rs->rs_rx_traces) {
|
||||||
|
struct rds_cmsg_rx_trace t;
|
||||||
|
int i, j;
|
||||||
|
|
||||||
|
inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock();
|
||||||
|
t.rx_traces = rs->rs_rx_traces;
|
||||||
|
for (i = 0; i < rs->rs_rx_traces; i++) {
|
||||||
|
j = rs->rs_rx_trace[i];
|
||||||
|
t.rx_trace_pos[i] = j;
|
||||||
|
t.rx_trace[i] = inc->i_rx_lat_trace[j + 1] -
|
||||||
|
inc->i_rx_lat_trace[j];
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RXPATH_LATENCY,
|
||||||
|
sizeof(t), &t);
|
||||||
|
if (ret)
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
out:
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
|
int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
|
||||||
|
|
|
@ -180,6 +180,9 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
|
||||||
rdsdebug("alloced tinc %p\n", tinc);
|
rdsdebug("alloced tinc %p\n", tinc);
|
||||||
rds_inc_path_init(&tinc->ti_inc, cp,
|
rds_inc_path_init(&tinc->ti_inc, cp,
|
||||||
cp->cp_conn->c_faddr);
|
cp->cp_conn->c_faddr);
|
||||||
|
tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
|
||||||
|
local_clock();
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* XXX * we might be able to use the __ variants when
|
* XXX * we might be able to use the __ variants when
|
||||||
* we've already serialized at a higher level.
|
* we've already serialized at a higher level.
|
||||||
|
@ -204,6 +207,8 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
|
||||||
/* could be 0 for a 0 len message */
|
/* could be 0 for a 0 len message */
|
||||||
tc->t_tinc_data_rem =
|
tc->t_tinc_data_rem =
|
||||||
be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
|
be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
|
||||||
|
tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_START] =
|
||||||
|
local_clock();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue