From 518b1646f8a31904ca637b8df0c1e31c34a7a3c2 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin"
Date: Mon, 21 May 2007 15:04:59 +0300
Subject: [PATCH] IPoIB/cm: Fix SRQ WR leak

SRQ WR leakage has been observed with IPoIB/CM: e.g. flipping ports on
and off will, with time, leak out all WRs and then all connections will
start getting RNR NAKs. Fix this in the way suggested by spec: move the
QP being destroyed to the error state, wait for "Last WQE Reached"
event and then post WR on a "drain QP" connected to the same CQ. Once
we observe a completion on the drain QP, it's safe to call
ib_destroy_qp.

Signed-off-by: Michael S. Tsirkin
Signed-off-by: Roland Dreier
---
Review notes (placed below the "---", so not part of the commit message):
 - ipoib_cm_dev_stop(): the drain-wait loop tested !time_after(), so the
   "RX drain timing out" branch was taken on the first pass and every
   pending QP was destroyed without waiting for the drain completion.
   The test is now time_after().
 - ipoib_cm_dev_open(): the warning printed when ib_create_qp() fails for
   rx_drain_qp said "failed to create CM ID"; it now names the drain QP.
 - ipoib_cm_start_rx_drain(): the drain WR is posted with ib_post_recv(),
   so the comment now refers to the receive queue; the "priv" declarator
   spacing matches the other functions in this patch.
 - Because of the edits above, the post-image blob id on the ipoib_cm.c
   "index" line no longer describes the resulting file.
 - NOTE(review): the <...> targets of the #include lines in the first
   ipoib_cm.c hunk and all leading/alignment whitespace were lost in
   extraction and have been reconstructed; verify them against the tree
   before applying.

 drivers/infiniband/ulp/ipoib/ipoib.h       |  42 ++++-
 drivers/infiniband/ulp/ipoib/ipoib_cm.c    | 203 +++++++++++++++++----
 drivers/infiniband/ulp/ipoib/ipoib_verbs.c |   2 +-
 3 files changed, 211 insertions(+), 36 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 93d4a9a1e1dd..a0b3782c7625 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -132,12 +132,46 @@ struct ipoib_cm_data {
 	__be32 mtu;
 };
 
+/*
+ * Quoting 10.3.1 Queue Pair and EE Context States:
+ *
+ * Note, for QPs that are associated with an SRQ, the Consumer should take the
+ * QP through the Error State before invoking a Destroy QP or a Modify QP to the
+ * Reset State. The Consumer may invoke the Destroy QP without first performing
+ * a Modify QP to the Error State and waiting for the Affiliated Asynchronous
+ * Last WQE Reached Event. However, if the Consumer does not wait for the
+ * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment
+ * leakage may occur. Therefore, it is good programming practice to tear down a
+ * QP that is associated with an SRQ by using the following process:
+ *
+ * - Put the QP in the Error State
+ * - Wait for the Affiliated Asynchronous Last WQE Reached Event;
+ * - either:
+ *       drain the CQ by invoking the Poll CQ verb and either wait for CQ
+ *       to be empty or the number of Poll CQ operations has exceeded
+ *       CQ capacity size;
+ * - or
+ *       post another WR that completes on the same CQ and wait for this
+ *       WR to return as a WC;
+ * - and then invoke a Destroy QP or Reset QP.
+ *
+ * We use the second option and wait for a completion on the
+ * rx_drain_qp before destroying QPs attached to our SRQ.
+ */
+
+enum ipoib_cm_state {
+	IPOIB_CM_RX_LIVE,
+	IPOIB_CM_RX_ERROR, /* Ignored by stale task */
+	IPOIB_CM_RX_FLUSH  /* Last WQE Reached event observed */
+};
+
 struct ipoib_cm_rx {
 	struct ib_cm_id     *id;
 	struct ib_qp        *qp;
 	struct list_head     list;
 	struct net_device   *dev;
 	unsigned long        jiffies;
+	enum ipoib_cm_state  state;
 };
 
 struct ipoib_cm_tx {
@@ -165,10 +199,16 @@ struct ipoib_cm_dev_priv {
 	struct ib_srq  	       *srq;
 	struct ipoib_cm_rx_buf *srq_ring;
 	struct ib_cm_id        *id;
-	struct list_head        passive_ids;
+	struct ib_qp           *rx_drain_qp;   /* generates WR described in 10.3.1 */
+	struct list_head        passive_ids;   /* state: LIVE */
+	struct list_head        rx_error_list; /* state: ERROR */
+	struct list_head        rx_flush_list; /* state: FLUSH, drain not started */
+	struct list_head        rx_drain_list; /* state: FLUSH, drain started */
+	struct list_head        rx_reap_list;  /* state: FLUSH, drain done */
 	struct work_struct      start_task;
 	struct work_struct      reap_task;
 	struct work_struct      skb_task;
+	struct work_struct      rx_reap_task;
 	struct delayed_work     stale_task;
 	struct sk_buff_head     skb_queue;
 	struct list_head        start_list;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index eec833b81e9b..ffec794b7913 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -37,6 +37,7 @@
 #include <net/dst.h>
 #include <net/icmp.h>
 #include <linux/icmpv6.h>
+#include <linux/delay.h>
 
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
 static int data_debug_level;
@@ -62,6 +63,16 @@ struct ipoib_cm_id {
 	u32 remote_mtu;
 };
 
+static struct ib_qp_attr ipoib_cm_err_attr = {
+	.qp_state = IB_QPS_ERR
+};
+
+#define IPOIB_CM_RX_DRAIN_WRID 0x7fffffff
+
+static struct ib_recv_wr ipoib_cm_rx_drain_wr = {
+	.wr_id = IPOIB_CM_RX_DRAIN_WRID
+};
+
 static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
 			       struct ib_cm_event *event);
 
@@ -150,11 +161,44 @@ partial_error:
 	return NULL;
 }
 
+static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
+{
+	struct ib_recv_wr *bad_wr;
+
+	/* rx_drain_qp receive queue depth is 1, so
+	 * make sure we have at most 1 outstanding WR. */
+	if (list_empty(&priv->cm.rx_flush_list) ||
+	    !list_empty(&priv->cm.rx_drain_list))
+		return;
+
+	if (ib_post_recv(priv->cm.rx_drain_qp, &ipoib_cm_rx_drain_wr, &bad_wr))
+		ipoib_warn(priv, "failed to post rx_drain wr\n");
+
+	list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
+}
+
+static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx)
+{
+	struct ipoib_cm_rx *p = ctx;
+	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
+	unsigned long flags;
+
+	if (event->event != IB_EVENT_QP_LAST_WQE_REACHED)
+		return;
+
+	spin_lock_irqsave(&priv->lock, flags);
+	list_move(&p->list, &priv->cm.rx_flush_list);
+	p->state = IPOIB_CM_RX_FLUSH;
+	ipoib_cm_start_rx_drain(priv);
+	spin_unlock_irqrestore(&priv->lock, flags);
+}
+
 static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
 					   struct ipoib_cm_rx *p)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_qp_init_attr attr = {
+		.event_handler = ipoib_cm_rx_event_handler,
 		.send_cq = priv->cq, /* does not matter, we never send anything */
 		.recv_cq = priv->cq,
 		.srq = priv->cm.srq,
@@ -256,6 +300,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
 
 	cm_id->context = p;
 	p->jiffies = jiffies;
+	p->state = IPOIB_CM_RX_LIVE;
 	spin_lock_irq(&priv->lock);
 	if (list_empty(&priv->cm.passive_ids))
 		queue_delayed_work(ipoib_workqueue,
@@ -277,7 +322,6 @@ static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
 {
 	struct ipoib_cm_rx *p;
 	struct ipoib_dev_priv *priv;
-	int ret;
 
 	switch (event->event) {
 	case IB_CM_REQ_RECEIVED:
@@ -289,20 +333,9 @@ static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
 	case IB_CM_REJ_RECEIVED:
 		p = cm_id->context;
 		priv = netdev_priv(p->dev);
-		spin_lock_irq(&priv->lock);
-		if (list_empty(&p->list))
-			ret = 0; /* Connection is going away already. */
-		else {
-			list_del_init(&p->list);
-			ret = -ECONNRESET;
-		}
-		spin_unlock_irq(&priv->lock);
-		if (ret) {
-			ib_destroy_qp(p->qp);
-			kfree(p);
-			return ret;
-		}
-		return 0;
+		if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
+			ipoib_warn(priv, "unable to move qp to error state\n");
+		/* Fall through */
 	default:
 		return 0;
 	}
@@ -354,8 +387,15 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 		       wr_id, wc->status);
 
 	if (unlikely(wr_id >= ipoib_recvq_size)) {
-		ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
-			   wr_id, ipoib_recvq_size);
+		if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~IPOIB_CM_OP_SRQ)) {
+			spin_lock_irqsave(&priv->lock, flags);
+			list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
+			ipoib_cm_start_rx_drain(priv);
+			queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
+			spin_unlock_irqrestore(&priv->lock, flags);
+		} else
+			ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
+				   wr_id, ipoib_recvq_size);
 		return;
 	}
 
@@ -374,9 +414,9 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 	if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
 		spin_lock_irqsave(&priv->lock, flags);
 		p->jiffies = jiffies;
-		/* Move this entry to list head, but do
-		 * not re-add it if it has been removed. */
-		if (!list_empty(&p->list))
+		/* Move this entry to list head, but do not re-add it
+		 * if it has been moved out of list. */
+		if (p->state == IPOIB_CM_RX_LIVE)
 			list_move(&p->list, &priv->cm.passive_ids);
 		spin_unlock_irqrestore(&priv->lock, flags);
 	}
@@ -583,17 +623,43 @@ static void ipoib_cm_tx_completion(struct ib_cq *cq, void *tx_ptr)
 int ipoib_cm_dev_open(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_qp_init_attr qp_init_attr = {
+		.send_cq = priv->cq,   /* does not matter, we never send anything */
+		.recv_cq = priv->cq,
+		.cap.max_send_wr = 1,  /* FIXME: 0 Seems not to work */
+		.cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
+		.cap.max_recv_wr = 1,
+		.cap.max_recv_sge = 1, /* FIXME: 0 Seems not to work */
+		.sq_sig_type = IB_SIGNAL_ALL_WR,
+		.qp_type = IB_QPT_UC,
+	};
 	int ret;
 
 	if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
 		return 0;
 
+	priv->cm.rx_drain_qp = ib_create_qp(priv->pd, &qp_init_attr);
+	if (IS_ERR(priv->cm.rx_drain_qp)) {
+		printk(KERN_WARNING "%s: failed to create drain QP\n", priv->ca->name);
+		ret = PTR_ERR(priv->cm.rx_drain_qp);
+		return ret;
+	}
+
+	/*
+	 * We put the QP in error state directly. This way, a "flush
+	 * error" WC will be immediately generated for each WR we post.
+	 */
+	ret = ib_modify_qp(priv->cm.rx_drain_qp, &ipoib_cm_err_attr, IB_QP_STATE);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify drain QP to error: %d\n", ret);
+		goto err_qp;
+	}
+
 	priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
 	if (IS_ERR(priv->cm.id)) {
 		printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
 		ret = PTR_ERR(priv->cm.id);
-		priv->cm.id = NULL;
-		return ret;
+		goto err_cm;
 	}
 
 	ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
@@ -601,35 +667,79 @@ int ipoib_cm_dev_open(struct net_device *dev)
 	if (ret) {
 		printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
 		       IPOIB_CM_IETF_ID | priv->qp->qp_num);
-		ib_destroy_cm_id(priv->cm.id);
-		priv->cm.id = NULL;
-		return ret;
+		goto err_listen;
 	}
+
 	return 0;
+
+err_listen:
+	ib_destroy_cm_id(priv->cm.id);
+err_cm:
+	priv->cm.id = NULL;
+err_qp:
+	ib_destroy_qp(priv->cm.rx_drain_qp);
+	return ret;
 }
 
 void ipoib_cm_dev_stop(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct ipoib_cm_rx *p;
+	struct ipoib_cm_rx *p, *n;
+	unsigned long begin;
+	LIST_HEAD(list);
+	int ret;
 
 	if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id)
 		return;
 
 	ib_destroy_cm_id(priv->cm.id);
 	priv->cm.id = NULL;
+
 	spin_lock_irq(&priv->lock);
 	while (!list_empty(&priv->cm.passive_ids)) {
 		p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
-		list_del_init(&p->list);
+		list_move(&p->list, &priv->cm.rx_error_list);
+		p->state = IPOIB_CM_RX_ERROR;
 		spin_unlock_irq(&priv->lock);
+		ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
+		if (ret)
+			ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
+		spin_lock_irq(&priv->lock);
+	}
+
+	/* Wait for all RX to be drained */
+	begin = jiffies;
+
+	while (!list_empty(&priv->cm.rx_error_list) ||
+	       !list_empty(&priv->cm.rx_flush_list) ||
+	       !list_empty(&priv->cm.rx_drain_list)) {
+		if (time_after(jiffies, begin + 5 * HZ)) {
+			ipoib_warn(priv, "RX drain timing out\n");
+
+			/*
+			 * assume the HW is wedged and just free up everything.
+			 */
+			list_splice_init(&priv->cm.rx_flush_list, &list);
+			list_splice_init(&priv->cm.rx_error_list, &list);
+			list_splice_init(&priv->cm.rx_drain_list, &list);
+			break;
+		}
+		spin_unlock_irq(&priv->lock);
+		msleep(1);
+		spin_lock_irq(&priv->lock);
+	}
+
+	list_splice_init(&priv->cm.rx_reap_list, &list);
+
+	spin_unlock_irq(&priv->lock);
+
+	list_for_each_entry_safe(p, n, &list, list) {
 		ib_destroy_cm_id(p->id);
 		ib_destroy_qp(p->qp);
 		kfree(p);
-		spin_lock_irq(&priv->lock);
 	}
-	spin_unlock_irq(&priv->lock);
 
+	ib_destroy_qp(priv->cm.rx_drain_qp);
 	cancel_delayed_work(&priv->cm.stale_task);
 }
 
@@ -1079,24 +1189,44 @@ void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
 	queue_work(ipoib_workqueue, &priv->cm.skb_task);
 }
 
+static void ipoib_cm_rx_reap(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+						   cm.rx_reap_task);
+	struct ipoib_cm_rx *p, *n;
+	LIST_HEAD(list);
+
+	spin_lock_irq(&priv->lock);
+	list_splice_init(&priv->cm.rx_reap_list, &list);
+	spin_unlock_irq(&priv->lock);
+
+	list_for_each_entry_safe(p, n, &list, list) {
+		ib_destroy_cm_id(p->id);
+		ib_destroy_qp(p->qp);
+		kfree(p);
+	}
+}
+
 static void ipoib_cm_stale_task(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
 						   cm.stale_task.work);
 	struct ipoib_cm_rx *p;
+	int ret;
 
 	spin_lock_irq(&priv->lock);
 	while (!list_empty(&priv->cm.passive_ids)) {
-		/* List if sorted by LRU, start from tail,
+		/* List is sorted by LRU, start from tail,
 		 * stop when we see a recently used entry */
 		p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
 		if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
 			break;
-		list_del_init(&p->list);
+		list_move(&p->list, &priv->cm.rx_error_list);
+		p->state = IPOIB_CM_RX_ERROR;
 		spin_unlock_irq(&priv->lock);
-		ib_destroy_cm_id(p->id);
-		ib_destroy_qp(p->qp);
-		kfree(p);
+		ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
+		if (ret)
+			ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
 		spin_lock_irq(&priv->lock);
 	}
 
@@ -1164,9 +1294,14 @@ int ipoib_cm_dev_init(struct net_device *dev)
 	INIT_LIST_HEAD(&priv->cm.passive_ids);
 	INIT_LIST_HEAD(&priv->cm.reap_list);
 	INIT_LIST_HEAD(&priv->cm.start_list);
+	INIT_LIST_HEAD(&priv->cm.rx_error_list);
+	INIT_LIST_HEAD(&priv->cm.rx_flush_list);
+	INIT_LIST_HEAD(&priv->cm.rx_drain_list);
+	INIT_LIST_HEAD(&priv->cm.rx_reap_list);
 	INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
 	INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
 	INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap);
+	INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap);
 	INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);
 
 	skb_queue_head_init(&priv->cm.skb_queue);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 791252621b26..982eb88e27ec 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -173,7 +173,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 	size = ipoib_sendq_size + ipoib_recvq_size + 1;
 	ret = ipoib_cm_dev_init(dev);
 	if (!ret)
-		size += ipoib_recvq_size;
+		size += ipoib_recvq_size + 1 /* 1 extra for rx_drain_qp */;
 
 	priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0);
 	if (IS_ERR(priv->cq)) {