Merge branch 'RDS-zerocopy-support'
Sowmini Varadhan says: ==================== RDS: zerocopy support This is version 3 of the series, following up on review comments for http://patchwork.ozlabs.org/project/netdev/list/?series=28530 Review comments addressed Patch 4 - fix fragile use of skb->cb[], do not set ee_code incorrectly. Patch 5: - remove needless bzero of skb->cb[], consolidate err cleanup A brief overview of this feature follows. This patch series provides support for MSG_ZERCOCOPY on a PF_RDS socket based on the APIs and infrastructure added by Commitf214f915e7
("tcp: enable MSG_ZEROCOPY") For single threaded rds-stress testing using rds-tcp with the ixgbe driver using 1M message sizes (-a 1M -q 1M) preliminary results show that there is a significant reduction in latency: about 90 usec with zerocopy, compared with 200 usec without zerocopy. This patchset modifies the above for zerocopy in the following manner. - if the MSG_ZEROCOPY flag is specified with rds_sendmsg(), and, - if the SO_ZEROCOPY socket option has been set on the PF_RDS socket, application pages sent down with rds_sendmsg are pinned. The pinning uses the accounting infrastructure added bya91dbff551
("sock: ulimit on MSG_ZEROCOPY pages"). The message is unpinned when all references to the message go down to 0, and the message is freed by rds_message_purge. A multithreaded application using this infrastructure must send down a unique 32 bit cookie as ancillary data with each sendmsg invocation. The format of this ancillary data is described in Patch 5 of the series. The cookie is passed up to the application on the sk_error_queue when the message is unpinned, indicating to the application that it is now safe to free/reuse the message buffer. The details of the completion notification are provided in Patch 4 of this series. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
commit
80c6d2b8d8
|
@ -466,6 +466,9 @@ struct ubuf_info {
|
|||
|
||||
#define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))
|
||||
|
||||
int mm_account_pinned_pages(struct mmpin *mmp, size_t size);
|
||||
void mm_unaccount_pinned_pages(struct mmpin *mmp);
|
||||
|
||||
struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size);
|
||||
struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
|
||||
struct ubuf_info *uarg);
|
||||
|
|
|
@ -20,11 +20,13 @@ struct sock_extended_err {
|
|||
#define SO_EE_ORIGIN_ICMP6 3
|
||||
#define SO_EE_ORIGIN_TXSTATUS 4
|
||||
#define SO_EE_ORIGIN_ZEROCOPY 5
|
||||
#define SO_EE_ORIGIN_ZCOOKIE 6
|
||||
#define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS
|
||||
|
||||
#define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1))
|
||||
|
||||
#define SO_EE_CODE_ZEROCOPY_COPIED 1
|
||||
#define SO_EE_ORIGIN_MAX_ZCOOKIES 8
|
||||
|
||||
/**
|
||||
* struct scm_timestamping - timestamps exposed through cmsg
|
||||
|
|
|
@ -103,6 +103,7 @@
|
|||
#define RDS_CMSG_MASKED_ATOMIC_FADD 8
|
||||
#define RDS_CMSG_MASKED_ATOMIC_CSWP 9
|
||||
#define RDS_CMSG_RXPATH_LATENCY 11
|
||||
#define RDS_CMSG_ZCOPY_COOKIE 12
|
||||
|
||||
#define RDS_INFO_FIRST 10000
|
||||
#define RDS_INFO_COUNTERS 10000
|
||||
|
|
|
@ -890,7 +890,7 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(skb_morph);
|
||||
|
||||
static int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
|
||||
int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
|
||||
{
|
||||
unsigned long max_pg, num_pg, new_pg, old_pg;
|
||||
struct user_struct *user;
|
||||
|
@ -919,14 +919,16 @@ static int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
|
|||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(mm_account_pinned_pages);
|
||||
|
||||
static void mm_unaccount_pinned_pages(struct mmpin *mmp)
|
||||
void mm_unaccount_pinned_pages(struct mmpin *mmp)
|
||||
{
|
||||
if (mmp->user) {
|
||||
atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
|
||||
free_uid(mmp->user);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
|
||||
|
||||
struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
|
||||
{
|
||||
|
|
|
@ -1049,18 +1049,21 @@ set_rcvbuf:
|
|||
break;
|
||||
|
||||
case SO_ZEROCOPY:
|
||||
if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
|
||||
if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
|
||||
if (sk->sk_protocol != IPPROTO_TCP)
|
||||
ret = -ENOTSUPP;
|
||||
else if (sk->sk_state != TCP_CLOSE)
|
||||
ret = -EBUSY;
|
||||
} else if (sk->sk_family != PF_RDS) {
|
||||
ret = -ENOTSUPP;
|
||||
else if (sk->sk_protocol != IPPROTO_TCP)
|
||||
ret = -ENOTSUPP;
|
||||
else if (sk->sk_state != TCP_CLOSE)
|
||||
ret = -EBUSY;
|
||||
else if (val < 0 || val > 1)
|
||||
ret = -EINVAL;
|
||||
else
|
||||
sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
|
||||
break;
|
||||
|
||||
}
|
||||
if (!ret) {
|
||||
if (val < 0 || val > 1)
|
||||
ret = -EINVAL;
|
||||
else
|
||||
sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
ret = -ENOPROTOOPT;
|
||||
break;
|
||||
|
|
|
@ -182,6 +182,8 @@ static __poll_t rds_poll(struct file *file, struct socket *sock,
|
|||
mask |= (EPOLLIN | EPOLLRDNORM);
|
||||
if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
|
||||
mask |= (EPOLLOUT | EPOLLWRNORM);
|
||||
if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
|
||||
mask |= POLLERR;
|
||||
read_unlock_irqrestore(&rs->rs_recv_lock, flags);
|
||||
|
||||
/* clear state any time we wake a seen-congested socket */
|
||||
|
|
|
@ -33,6 +33,9 @@
|
|||
#include <linux/kernel.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/export.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/errqueue.h>
|
||||
|
||||
#include "rds.h"
|
||||
|
||||
|
@ -53,20 +56,92 @@ void rds_message_addref(struct rds_message *rm)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(rds_message_addref);
|
||||
|
||||
static inline bool skb_zcookie_add(struct sk_buff *skb, u32 cookie)
|
||||
{
|
||||
struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
|
||||
int ncookies;
|
||||
u32 *ptr;
|
||||
|
||||
if (serr->ee.ee_origin != SO_EE_ORIGIN_ZCOOKIE)
|
||||
return false;
|
||||
ncookies = serr->ee.ee_data;
|
||||
if (ncookies == SO_EE_ORIGIN_MAX_ZCOOKIES)
|
||||
return false;
|
||||
ptr = skb_put(skb, sizeof(u32));
|
||||
*ptr = cookie;
|
||||
serr->ee.ee_data = ++ncookies;
|
||||
return true;
|
||||
}
|
||||
|
||||
static void rds_rm_zerocopy_callback(struct rds_sock *rs,
|
||||
struct rds_znotifier *znotif)
|
||||
{
|
||||
struct sock *sk = rds_rs_to_sk(rs);
|
||||
struct sk_buff *skb, *tail;
|
||||
struct sock_exterr_skb *serr;
|
||||
unsigned long flags;
|
||||
struct sk_buff_head *q;
|
||||
u32 cookie = znotif->z_cookie;
|
||||
|
||||
q = &sk->sk_error_queue;
|
||||
spin_lock_irqsave(&q->lock, flags);
|
||||
tail = skb_peek_tail(q);
|
||||
|
||||
if (tail && skb_zcookie_add(tail, cookie)) {
|
||||
spin_unlock_irqrestore(&q->lock, flags);
|
||||
mm_unaccount_pinned_pages(&znotif->z_mmp);
|
||||
consume_skb(rds_skb_from_znotifier(znotif));
|
||||
sk->sk_error_report(sk);
|
||||
return;
|
||||
}
|
||||
|
||||
skb = rds_skb_from_znotifier(znotif);
|
||||
serr = SKB_EXT_ERR(skb);
|
||||
memset(&serr->ee, 0, sizeof(serr->ee));
|
||||
serr->ee.ee_errno = 0;
|
||||
serr->ee.ee_origin = SO_EE_ORIGIN_ZCOOKIE;
|
||||
serr->ee.ee_info = 0;
|
||||
WARN_ON(!skb_zcookie_add(skb, cookie));
|
||||
|
||||
__skb_queue_tail(q, skb);
|
||||
|
||||
spin_unlock_irqrestore(&q->lock, flags);
|
||||
sk->sk_error_report(sk);
|
||||
|
||||
mm_unaccount_pinned_pages(&znotif->z_mmp);
|
||||
}
|
||||
|
||||
/*
|
||||
* This relies on dma_map_sg() not touching sg[].page during merging.
|
||||
*/
|
||||
static void rds_message_purge(struct rds_message *rm)
|
||||
{
|
||||
unsigned long i;
|
||||
unsigned long i, flags;
|
||||
bool zcopy = false;
|
||||
|
||||
if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
|
||||
return;
|
||||
|
||||
spin_lock_irqsave(&rm->m_rs_lock, flags);
|
||||
if (rm->m_rs) {
|
||||
struct rds_sock *rs = rm->m_rs;
|
||||
|
||||
if (rm->data.op_mmp_znotifier) {
|
||||
zcopy = true;
|
||||
rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier);
|
||||
rm->data.op_mmp_znotifier = NULL;
|
||||
}
|
||||
sock_put(rds_rs_to_sk(rs));
|
||||
rm->m_rs = NULL;
|
||||
}
|
||||
spin_unlock_irqrestore(&rm->m_rs_lock, flags);
|
||||
|
||||
for (i = 0; i < rm->data.op_nents; i++) {
|
||||
rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i]));
|
||||
/* XXX will have to put_page for page refs */
|
||||
__free_page(sg_page(&rm->data.op_sg[i]));
|
||||
if (!zcopy)
|
||||
__free_page(sg_page(&rm->data.op_sg[i]));
|
||||
else
|
||||
put_page(sg_page(&rm->data.op_sg[i]));
|
||||
}
|
||||
rm->data.op_nents = 0;
|
||||
|
||||
|
@ -266,12 +341,14 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
|
|||
return rm;
|
||||
}
|
||||
|
||||
int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from)
|
||||
int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from,
|
||||
bool zcopy)
|
||||
{
|
||||
unsigned long to_copy, nbytes;
|
||||
unsigned long sg_off;
|
||||
struct scatterlist *sg;
|
||||
int ret = 0;
|
||||
int length = iov_iter_count(from);
|
||||
|
||||
rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from));
|
||||
|
||||
|
@ -281,6 +358,53 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from)
|
|||
sg = rm->data.op_sg;
|
||||
sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
|
||||
|
||||
if (zcopy) {
|
||||
int total_copied = 0;
|
||||
struct sk_buff *skb;
|
||||
|
||||
skb = alloc_skb(SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(u32),
|
||||
GFP_KERNEL);
|
||||
if (!skb)
|
||||
return -ENOMEM;
|
||||
rm->data.op_mmp_znotifier = RDS_ZCOPY_SKB(skb);
|
||||
if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp,
|
||||
length)) {
|
||||
ret = -ENOMEM;
|
||||
goto err;
|
||||
}
|
||||
while (iov_iter_count(from)) {
|
||||
struct page *pages;
|
||||
size_t start;
|
||||
ssize_t copied;
|
||||
|
||||
copied = iov_iter_get_pages(from, &pages, PAGE_SIZE,
|
||||
1, &start);
|
||||
if (copied < 0) {
|
||||
struct mmpin *mmp;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < rm->data.op_nents; i++)
|
||||
put_page(sg_page(&rm->data.op_sg[i]));
|
||||
mmp = &rm->data.op_mmp_znotifier->z_mmp;
|
||||
mm_unaccount_pinned_pages(mmp);
|
||||
ret = -EFAULT;
|
||||
goto err;
|
||||
}
|
||||
total_copied += copied;
|
||||
iov_iter_advance(from, copied);
|
||||
length -= copied;
|
||||
sg_set_page(sg, pages, copied, start);
|
||||
rm->data.op_nents++;
|
||||
sg++;
|
||||
}
|
||||
WARN_ON_ONCE(length != 0);
|
||||
return ret;
|
||||
err:
|
||||
consume_skb(skb);
|
||||
rm->data.op_mmp_znotifier = NULL;
|
||||
return ret;
|
||||
} /* zcopy */
|
||||
|
||||
while (iov_iter_count(from)) {
|
||||
if (!sg_page(sg)) {
|
||||
ret = rds_page_remainder_alloc(sg, iov_iter_count(from),
|
||||
|
|
|
@ -356,6 +356,19 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
|
|||
#define RDS_MSG_PAGEVEC 7
|
||||
#define RDS_MSG_FLUSH 8
|
||||
|
||||
struct rds_znotifier {
|
||||
struct list_head z_list;
|
||||
struct mmpin z_mmp;
|
||||
u32 z_cookie;
|
||||
};
|
||||
|
||||
#define RDS_ZCOPY_SKB(__skb) ((struct rds_znotifier *)&((__skb)->cb[0]))
|
||||
|
||||
static inline struct sk_buff *rds_skb_from_znotifier(struct rds_znotifier *z)
|
||||
{
|
||||
return container_of((void *)z, struct sk_buff, cb);
|
||||
}
|
||||
|
||||
struct rds_message {
|
||||
refcount_t m_refcount;
|
||||
struct list_head m_sock_item;
|
||||
|
@ -436,6 +449,7 @@ struct rds_message {
|
|||
unsigned int op_count;
|
||||
unsigned int op_dmasg;
|
||||
unsigned int op_dmaoff;
|
||||
struct rds_znotifier *op_mmp_znotifier;
|
||||
struct scatterlist *op_sg;
|
||||
} data;
|
||||
};
|
||||
|
@ -771,7 +785,8 @@ rds_conn_connecting(struct rds_connection *conn)
|
|||
/* message.c */
|
||||
struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
|
||||
struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
|
||||
int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from);
|
||||
int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from,
|
||||
bool zcopy);
|
||||
struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
|
||||
void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
|
||||
__be16 dport, u64 seq);
|
||||
|
|
|
@ -594,6 +594,8 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
|
|||
|
||||
if (msg_flags & MSG_OOB)
|
||||
goto out;
|
||||
if (msg_flags & MSG_ERRQUEUE)
|
||||
return sock_recv_errqueue(sk, msg, size, SOL_IP, IP_RECVERR);
|
||||
|
||||
while (1) {
|
||||
/* If there are pending notifications, do those - and nothing else */
|
||||
|
|
|
@ -649,7 +649,6 @@ static void rds_send_remove_from_sock(struct list_head *messages, int status)
|
|||
rm->rdma.op_notifier = NULL;
|
||||
}
|
||||
was_on_sock = 1;
|
||||
rm->m_rs = NULL;
|
||||
}
|
||||
spin_unlock(&rs->rs_lock);
|
||||
|
||||
|
@ -756,9 +755,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
|
|||
*/
|
||||
if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
|
||||
spin_unlock_irqrestore(&cp->cp_lock, flags);
|
||||
spin_lock_irqsave(&rm->m_rs_lock, flags);
|
||||
rm->m_rs = NULL;
|
||||
spin_unlock_irqrestore(&rm->m_rs_lock, flags);
|
||||
continue;
|
||||
}
|
||||
list_del_init(&rm->m_conn_item);
|
||||
|
@ -774,7 +770,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
|
|||
__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
|
||||
spin_unlock(&rs->rs_lock);
|
||||
|
||||
rm->m_rs = NULL;
|
||||
spin_unlock_irqrestore(&rm->m_rs_lock, flags);
|
||||
|
||||
rds_message_put(rm);
|
||||
|
@ -798,7 +793,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
|
|||
__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
|
||||
spin_unlock(&rs->rs_lock);
|
||||
|
||||
rm->m_rs = NULL;
|
||||
spin_unlock_irqrestore(&rm->m_rs_lock, flags);
|
||||
|
||||
rds_message_put(rm);
|
||||
|
@ -849,6 +843,7 @@ static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
|
|||
list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
|
||||
set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
|
||||
rds_message_addref(rm);
|
||||
sock_hold(rds_rs_to_sk(rs));
|
||||
rm->m_rs = rs;
|
||||
|
||||
/* The code ordering is a little weird, but we're
|
||||
|
@ -880,12 +875,13 @@ out:
|
|||
* rds_message is getting to be quite complicated, and we'd like to allocate
|
||||
* it all in one go. This figures out how big it needs to be up front.
|
||||
*/
|
||||
static int rds_rm_size(struct msghdr *msg, int data_len)
|
||||
static int rds_rm_size(struct msghdr *msg, int num_sgs)
|
||||
{
|
||||
struct cmsghdr *cmsg;
|
||||
int size = 0;
|
||||
int cmsg_groups = 0;
|
||||
int retval;
|
||||
bool zcopy_cookie = false;
|
||||
|
||||
for_each_cmsghdr(cmsg, msg) {
|
||||
if (!CMSG_OK(msg, cmsg))
|
||||
|
@ -904,6 +900,8 @@ static int rds_rm_size(struct msghdr *msg, int data_len)
|
|||
|
||||
break;
|
||||
|
||||
case RDS_CMSG_ZCOPY_COOKIE:
|
||||
zcopy_cookie = true;
|
||||
case RDS_CMSG_RDMA_DEST:
|
||||
case RDS_CMSG_RDMA_MAP:
|
||||
cmsg_groups |= 2;
|
||||
|
@ -924,7 +922,10 @@ static int rds_rm_size(struct msghdr *msg, int data_len)
|
|||
|
||||
}
|
||||
|
||||
size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
|
||||
if ((msg->msg_flags & MSG_ZEROCOPY) && !zcopy_cookie)
|
||||
return -EINVAL;
|
||||
|
||||
size += num_sgs * sizeof(struct scatterlist);
|
||||
|
||||
/* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
|
||||
if (cmsg_groups == 3)
|
||||
|
@ -933,6 +934,18 @@ static int rds_rm_size(struct msghdr *msg, int data_len)
|
|||
return size;
|
||||
}
|
||||
|
||||
static int rds_cmsg_zcopy(struct rds_sock *rs, struct rds_message *rm,
|
||||
struct cmsghdr *cmsg)
|
||||
{
|
||||
u32 *cookie;
|
||||
|
||||
if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie)))
|
||||
return -EINVAL;
|
||||
cookie = CMSG_DATA(cmsg);
|
||||
rm->data.op_mmp_znotifier->z_cookie = *cookie;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
|
||||
struct msghdr *msg, int *allocated_mr)
|
||||
{
|
||||
|
@ -975,6 +988,10 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
|
|||
ret = rds_cmsg_atomic(rs, rm, cmsg);
|
||||
break;
|
||||
|
||||
case RDS_CMSG_ZCOPY_COOKIE:
|
||||
ret = rds_cmsg_zcopy(rs, rm, cmsg);
|
||||
break;
|
||||
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
@ -1045,10 +1062,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
|
|||
long timeo = sock_sndtimeo(sk, nonblock);
|
||||
struct rds_conn_path *cpath;
|
||||
size_t total_payload_len = payload_len, rdma_payload_len = 0;
|
||||
bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) &&
|
||||
sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY));
|
||||
int num_sgs = ceil(payload_len, PAGE_SIZE);
|
||||
|
||||
/* Mirror Linux UDP mirror of BSD error message compatibility */
|
||||
/* XXX: Perhaps MSG_MORE someday */
|
||||
if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) {
|
||||
if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT | MSG_ZEROCOPY)) {
|
||||
ret = -EOPNOTSUPP;
|
||||
goto out;
|
||||
}
|
||||
|
@ -1092,8 +1112,15 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
|
|||
goto out;
|
||||
}
|
||||
|
||||
if (zcopy) {
|
||||
if (rs->rs_transport->t_type != RDS_TRANS_TCP) {
|
||||
ret = -EOPNOTSUPP;
|
||||
goto out;
|
||||
}
|
||||
num_sgs = iov_iter_npages(&msg->msg_iter, INT_MAX);
|
||||
}
|
||||
/* size of rm including all sgs */
|
||||
ret = rds_rm_size(msg, payload_len);
|
||||
ret = rds_rm_size(msg, num_sgs);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
|
@ -1105,12 +1132,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
|
|||
|
||||
/* Attach data to the rm */
|
||||
if (payload_len) {
|
||||
rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
|
||||
rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
|
||||
if (!rm->data.op_sg) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
ret = rds_message_copy_from_user(rm, &msg->msg_iter);
|
||||
ret = rds_message_copy_from_user(rm, &msg->msg_iter, zcopy);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
|
|
|
@ -14,6 +14,9 @@
|
|||
* - SOCK_DGRAM
|
||||
* - SOCK_RAW
|
||||
*
|
||||
* PF_RDS
|
||||
* - SOCK_SEQPACKET
|
||||
*
|
||||
* Start this program on two connected hosts, one in send mode and
|
||||
* the other with option '-r' to put it in receiver mode.
|
||||
*
|
||||
|
@ -53,6 +56,7 @@
|
|||
#include <sys/types.h>
|
||||
#include <sys/wait.h>
|
||||
#include <unistd.h>
|
||||
#include <linux/rds.h>
|
||||
|
||||
#ifndef SO_EE_ORIGIN_ZEROCOPY
|
||||
#define SO_EE_ORIGIN_ZEROCOPY 5
|
||||
|
@ -164,17 +168,39 @@ static int do_accept(int fd)
|
|||
return fd;
|
||||
}
|
||||
|
||||
static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy)
|
||||
static void add_zcopy_cookie(struct msghdr *msg, uint32_t cookie)
|
||||
{
|
||||
struct cmsghdr *cm;
|
||||
|
||||
if (!msg->msg_control)
|
||||
error(1, errno, "NULL cookie");
|
||||
cm = (void *)msg->msg_control;
|
||||
cm->cmsg_len = CMSG_LEN(sizeof(cookie));
|
||||
cm->cmsg_level = SOL_RDS;
|
||||
cm->cmsg_type = RDS_CMSG_ZCOPY_COOKIE;
|
||||
memcpy(CMSG_DATA(cm), &cookie, sizeof(cookie));
|
||||
}
|
||||
|
||||
static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy, int domain)
|
||||
{
|
||||
int ret, len, i, flags;
|
||||
static uint32_t cookie;
|
||||
char ckbuf[CMSG_SPACE(sizeof(cookie))];
|
||||
|
||||
len = 0;
|
||||
for (i = 0; i < msg->msg_iovlen; i++)
|
||||
len += msg->msg_iov[i].iov_len;
|
||||
|
||||
flags = MSG_DONTWAIT;
|
||||
if (do_zerocopy)
|
||||
if (do_zerocopy) {
|
||||
flags |= MSG_ZEROCOPY;
|
||||
if (domain == PF_RDS) {
|
||||
memset(&msg->msg_control, 0, sizeof(msg->msg_control));
|
||||
msg->msg_controllen = CMSG_SPACE(sizeof(cookie));
|
||||
msg->msg_control = (struct cmsghdr *)ckbuf;
|
||||
add_zcopy_cookie(msg, ++cookie);
|
||||
}
|
||||
}
|
||||
|
||||
ret = sendmsg(fd, msg, flags);
|
||||
if (ret == -1 && errno == EAGAIN)
|
||||
|
@ -190,6 +216,10 @@ static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy)
|
|||
if (do_zerocopy && ret)
|
||||
expected_completions++;
|
||||
}
|
||||
if (do_zerocopy && domain == PF_RDS) {
|
||||
msg->msg_control = NULL;
|
||||
msg->msg_controllen = 0;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -216,7 +246,9 @@ static void do_sendmsg_corked(int fd, struct msghdr *msg)
|
|||
msg->msg_iov[0].iov_len = payload_len + extra_len;
|
||||
extra_len = 0;
|
||||
|
||||
do_sendmsg(fd, msg, do_zerocopy);
|
||||
do_sendmsg(fd, msg, do_zerocopy,
|
||||
(cfg_dst_addr.ss_family == AF_INET ?
|
||||
PF_INET : PF_INET6));
|
||||
}
|
||||
|
||||
do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
|
||||
|
@ -300,13 +332,38 @@ static int do_setup_tx(int domain, int type, int protocol)
|
|||
if (cfg_zerocopy)
|
||||
do_setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, 1);
|
||||
|
||||
if (domain != PF_PACKET)
|
||||
if (domain != PF_PACKET && domain != PF_RDS)
|
||||
if (connect(fd, (void *) &cfg_dst_addr, cfg_alen))
|
||||
error(1, errno, "connect");
|
||||
|
||||
if (domain == PF_RDS) {
|
||||
if (bind(fd, (void *) &cfg_src_addr, cfg_alen))
|
||||
error(1, errno, "bind");
|
||||
}
|
||||
|
||||
return fd;
|
||||
}
|
||||
|
||||
static int do_process_zerocopy_cookies(struct sock_extended_err *serr,
|
||||
uint32_t *ckbuf, size_t nbytes)
|
||||
{
|
||||
int ncookies, i;
|
||||
|
||||
if (serr->ee_errno != 0)
|
||||
error(1, 0, "serr: wrong error code: %u", serr->ee_errno);
|
||||
ncookies = serr->ee_data;
|
||||
if (ncookies > SO_EE_ORIGIN_MAX_ZCOOKIES)
|
||||
error(1, 0, "Returned %d cookies, max expected %d\n",
|
||||
ncookies, SO_EE_ORIGIN_MAX_ZCOOKIES);
|
||||
if (nbytes != ncookies * sizeof(uint32_t))
|
||||
error(1, 0, "Expected %d cookies, got %ld\n",
|
||||
ncookies, nbytes/sizeof(uint32_t));
|
||||
for (i = 0; i < ncookies; i++)
|
||||
if (cfg_verbose >= 2)
|
||||
fprintf(stderr, "%d\n", ckbuf[i]);
|
||||
return ncookies;
|
||||
}
|
||||
|
||||
static bool do_recv_completion(int fd)
|
||||
{
|
||||
struct sock_extended_err *serr;
|
||||
|
@ -315,10 +372,17 @@ static bool do_recv_completion(int fd)
|
|||
uint32_t hi, lo, range;
|
||||
int ret, zerocopy;
|
||||
char control[100];
|
||||
uint32_t ckbuf[SO_EE_ORIGIN_MAX_ZCOOKIES];
|
||||
struct iovec iov;
|
||||
|
||||
msg.msg_control = control;
|
||||
msg.msg_controllen = sizeof(control);
|
||||
|
||||
iov.iov_base = ckbuf;
|
||||
iov.iov_len = (SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(ckbuf[0]));
|
||||
msg.msg_iov = &iov;
|
||||
msg.msg_iovlen = 1;
|
||||
|
||||
ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
|
||||
if (ret == -1 && errno == EAGAIN)
|
||||
return false;
|
||||
|
@ -337,6 +401,11 @@ static bool do_recv_completion(int fd)
|
|||
cm->cmsg_level, cm->cmsg_type);
|
||||
|
||||
serr = (void *) CMSG_DATA(cm);
|
||||
|
||||
if (serr->ee_origin == SO_EE_ORIGIN_ZCOOKIE) {
|
||||
completions += do_process_zerocopy_cookies(serr, ckbuf, ret);
|
||||
return true;
|
||||
}
|
||||
if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
|
||||
error(1, 0, "serr: wrong origin: %u", serr->ee_origin);
|
||||
if (serr->ee_errno != 0)
|
||||
|
@ -444,6 +513,13 @@ static void do_tx(int domain, int type, int protocol)
|
|||
msg.msg_iovlen++;
|
||||
}
|
||||
|
||||
if (domain == PF_RDS) {
|
||||
msg.msg_name = &cfg_dst_addr;
|
||||
msg.msg_namelen = (cfg_dst_addr.ss_family == AF_INET ?
|
||||
sizeof(struct sockaddr_in) :
|
||||
sizeof(struct sockaddr_in6));
|
||||
}
|
||||
|
||||
iov[2].iov_base = payload;
|
||||
iov[2].iov_len = cfg_payload_len;
|
||||
msg.msg_iovlen++;
|
||||
|
@ -454,7 +530,7 @@ static void do_tx(int domain, int type, int protocol)
|
|||
if (cfg_cork)
|
||||
do_sendmsg_corked(fd, &msg);
|
||||
else
|
||||
do_sendmsg(fd, &msg, cfg_zerocopy);
|
||||
do_sendmsg(fd, &msg, cfg_zerocopy, domain);
|
||||
|
||||
while (!do_poll(fd, POLLOUT)) {
|
||||
if (cfg_zerocopy)
|
||||
|
@ -555,6 +631,40 @@ static void do_flush_datagram(int fd, int type)
|
|||
bytes += cfg_payload_len;
|
||||
}
|
||||
|
||||
|
||||
static void do_recvmsg(int fd)
|
||||
{
|
||||
int ret, off = 0;
|
||||
char *buf;
|
||||
struct iovec iov;
|
||||
struct msghdr msg;
|
||||
struct sockaddr_storage din;
|
||||
|
||||
buf = calloc(cfg_payload_len, sizeof(char));
|
||||
iov.iov_base = buf;
|
||||
iov.iov_len = cfg_payload_len;
|
||||
|
||||
memset(&msg, 0, sizeof(msg));
|
||||
msg.msg_name = &din;
|
||||
msg.msg_namelen = sizeof(din);
|
||||
msg.msg_iov = &iov;
|
||||
msg.msg_iovlen = 1;
|
||||
|
||||
ret = recvmsg(fd, &msg, MSG_TRUNC);
|
||||
|
||||
if (ret == -1)
|
||||
error(1, errno, "recv");
|
||||
if (ret != cfg_payload_len)
|
||||
error(1, 0, "recv: ret=%u != %u", ret, cfg_payload_len);
|
||||
|
||||
if (memcmp(buf + off, payload, ret))
|
||||
error(1, 0, "recv: data mismatch");
|
||||
|
||||
free(buf);
|
||||
packets++;
|
||||
bytes += cfg_payload_len;
|
||||
}
|
||||
|
||||
static void do_rx(int domain, int type, int protocol)
|
||||
{
|
||||
uint64_t tstop;
|
||||
|
@ -566,6 +676,8 @@ static void do_rx(int domain, int type, int protocol)
|
|||
do {
|
||||
if (type == SOCK_STREAM)
|
||||
do_flush_tcp(fd);
|
||||
else if (domain == PF_RDS)
|
||||
do_recvmsg(fd);
|
||||
else
|
||||
do_flush_datagram(fd, type);
|
||||
|
||||
|
@ -610,6 +722,7 @@ static void parse_opts(int argc, char **argv)
|
|||
40 /* max tcp options */;
|
||||
int c;
|
||||
char *daddr = NULL, *saddr = NULL;
|
||||
char *cfg_test;
|
||||
|
||||
cfg_payload_len = max_payload_len;
|
||||
|
||||
|
@ -667,6 +780,14 @@ static void parse_opts(int argc, char **argv)
|
|||
break;
|
||||
}
|
||||
}
|
||||
|
||||
cfg_test = argv[argc - 1];
|
||||
if (strcmp(cfg_test, "rds") == 0) {
|
||||
if (!daddr)
|
||||
error(1, 0, "-D <server addr> required for PF_RDS\n");
|
||||
if (!cfg_rx && !saddr)
|
||||
error(1, 0, "-S <client addr> required for PF_RDS\n");
|
||||
}
|
||||
setup_sockaddr(cfg_family, daddr, &cfg_dst_addr);
|
||||
setup_sockaddr(cfg_family, saddr, &cfg_src_addr);
|
||||
|
||||
|
@ -699,6 +820,8 @@ int main(int argc, char **argv)
|
|||
do_test(cfg_family, SOCK_STREAM, 0);
|
||||
else if (!strcmp(cfg_test, "udp"))
|
||||
do_test(cfg_family, SOCK_DGRAM, 0);
|
||||
else if (!strcmp(cfg_test, "rds"))
|
||||
do_test(PF_RDS, SOCK_SEQPACKET, 0);
|
||||
else
|
||||
error(1, 0, "unknown cfg_test %s", cfg_test);
|
||||
|
||||
|
|
Loading…
Reference in New Issue