From 7668ff9c2ad7d354655e23afa836a92d54d2ea63 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Thu, 17 May 2012 20:52:20 +0100
Subject: [PATCH 01/16] sfc: Refactor struct efx_tx_buffer to use a flags field

Add a flags field to struct efx_tx_buffer, replacing the
continuation and map_single booleans.

Since a single descriptor cannot be both a TSO header and the last
descriptor for an skb, unionise efx_tx_buffer::{skb,tsoh} and add
flags for validity of these fields.

Clear all flags in free buffers (whereas previously the continuation
flag would be set).

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/net_driver.h | 27 +++++-----
 drivers/net/ethernet/sfc/nic.c        |  4 +-
 drivers/net/ethernet/sfc/tx.c         | 78 +++++++++++++--------------
 3 files changed, 53 insertions(+), 56 deletions(-)

diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index cd9c0a989692..0ac01fa6e63c 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -91,29 +91,30 @@ struct efx_special_buffer {
 };
 
 /**
- * struct efx_tx_buffer - An Efx TX buffer
- * @skb: The associated socket buffer.
- *	Set only on the final fragment of a packet; %NULL for all other
- *	fragments.  When this fragment completes, then we can free this
- *	skb.
- * @tsoh: The associated TSO header structure, or %NULL if this
- *	buffer is not a TSO header.
+ * struct efx_tx_buffer - buffer state for a TX descriptor
+ * @skb: When @flags & %EFX_TX_BUF_SKB, the associated socket buffer to be
+ *	freed when descriptor completes
+ * @tsoh: When @flags & %EFX_TX_BUF_TSOH, the associated TSO header structure.
  * @dma_addr: DMA address of the fragment.
+ * @flags: Flags for allocation and DMA mapping type
  * @len: Length of this fragment.
  *	This field is zero when the queue slot is empty.
- * @continuation: True if this fragment is not the end of a packet.
- * @unmap_single: True if dma_unmap_single should be used.
  * @unmap_len: Length of this fragment to unmap
  */
 struct efx_tx_buffer {
-	const struct sk_buff *skb;
-	struct efx_tso_header *tsoh;
+	union {
+		const struct sk_buff *skb;
+		struct efx_tso_header *tsoh;
+	};
 	dma_addr_t dma_addr;
+	unsigned short flags;
 	unsigned short len;
-	bool continuation;
-	bool unmap_single;
 	unsigned short unmap_len;
 };
+#define EFX_TX_BUF_CONT		1	/* not last descriptor of packet */
+#define EFX_TX_BUF_SKB		2	/* buffer is last part of skb */
+#define EFX_TX_BUF_TSOH		4	/* buffer is TSO header */
+#define EFX_TX_BUF_MAP_SINGLE	8	/* buffer was mapped with dma_map_single() */
 
 /**
  * struct efx_tx_queue - An Efx TX queue
diff --git a/drivers/net/ethernet/sfc/nic.c b/drivers/net/ethernet/sfc/nic.c
index 326d799762d6..aa113709831d 100644
--- a/drivers/net/ethernet/sfc/nic.c
+++ b/drivers/net/ethernet/sfc/nic.c
@@ -401,8 +401,10 @@ void efx_nic_push_buffers(struct efx_tx_queue *tx_queue)
 		++tx_queue->write_count;
 
 		/* Create TX descriptor ring entry */
+		BUILD_BUG_ON(EFX_TX_BUF_CONT != 1);
 		EFX_POPULATE_QWORD_4(*txd,
-				     FSF_AZ_TX_KER_CONT, buffer->continuation,
+				     FSF_AZ_TX_KER_CONT,
+				     buffer->flags & EFX_TX_BUF_CONT,
 				     FSF_AZ_TX_KER_BYTE_COUNT, buffer->len,
 				     FSF_AZ_TX_KER_BUF_REGION, 0,
 				     FSF_AZ_TX_KER_BUF_ADDR, buffer->dma_addr);
diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
index 18713436b443..24c82f3ce0f3 100644
--- a/drivers/net/ethernet/sfc/tx.c
+++ b/drivers/net/ethernet/sfc/tx.c
@@ -39,25 +39,25 @@ static void efx_dequeue_buffer(struct efx_tx_queue *tx_queue,
 		struct device *dma_dev = &tx_queue->efx->pci_dev->dev;
 		dma_addr_t unmap_addr = (buffer->dma_addr + buffer->len -
 					 buffer->unmap_len);
-		if (buffer->unmap_single)
+		if (buffer->flags & EFX_TX_BUF_MAP_SINGLE)
 			dma_unmap_single(dma_dev, unmap_addr, buffer->unmap_len,
 					 DMA_TO_DEVICE);
 		else
 			dma_unmap_page(dma_dev, unmap_addr, buffer->unmap_len,
 				       DMA_TO_DEVICE);
 		buffer->unmap_len = 0;
-		buffer->unmap_single = false;
 	}
 
-	if (buffer->skb) {
+	if (buffer->flags & EFX_TX_BUF_SKB) {
 		(*pkts_compl)++;
 		(*bytes_compl) += buffer->skb->len;
 		dev_kfree_skb_any((struct sk_buff *) buffer->skb);
-		buffer->skb = NULL;
 		netif_vdbg(tx_queue->efx, tx_done, tx_queue->efx->net_dev,
 			   "TX queue %d transmission id %x complete\n",
 			   tx_queue->queue, tx_queue->read_count);
 	}
+
+	buffer->flags &= EFX_TX_BUF_TSOH;
 }
 
 /**
@@ -89,14 +89,14 @@ static void efx_tsoh_heap_free(struct efx_tx_queue *tx_queue,
 static void efx_tsoh_free(struct efx_tx_queue *tx_queue,
 			  struct efx_tx_buffer *buffer)
 {
-	if (buffer->tsoh) {
+	if (buffer->flags & EFX_TX_BUF_TSOH) {
 		if (likely(!buffer->tsoh->unmap_len)) {
 			buffer->tsoh->next = tx_queue->tso_headers_free;
 			tx_queue->tso_headers_free = buffer->tsoh;
 		} else {
 			efx_tsoh_heap_free(tx_queue, buffer->tsoh);
 		}
-		buffer->tsoh = NULL;
+		buffer->flags &= ~EFX_TX_BUF_TSOH;
 	}
 }
 
@@ -163,7 +163,7 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 	unsigned int len, unmap_len = 0, fill_level, insert_ptr;
 	dma_addr_t dma_addr, unmap_addr = 0;
 	unsigned int dma_len;
-	bool unmap_single;
+	unsigned short dma_flags;
 	int q_space, i = 0;
 	netdev_tx_t rc = NETDEV_TX_OK;
 
@@ -190,7 +190,7 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 	 * since this is more efficient on machines with sparse
 	 * memory.
 	 */
-	unmap_single = true;
+	dma_flags = EFX_TX_BUF_MAP_SINGLE;
 	dma_addr = dma_map_single(dma_dev, skb->data, len, PCI_DMA_TODEVICE);
 
 	/* Process all fragments */
@@ -234,10 +234,8 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 			insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask;
 			buffer = &tx_queue->buffer[insert_ptr];
 			efx_tsoh_free(tx_queue, buffer);
-			EFX_BUG_ON_PARANOID(buffer->tsoh);
-			EFX_BUG_ON_PARANOID(buffer->skb);
+			EFX_BUG_ON_PARANOID(buffer->flags);
 			EFX_BUG_ON_PARANOID(buffer->len);
-			EFX_BUG_ON_PARANOID(!buffer->continuation);
 			EFX_BUG_ON_PARANOID(buffer->unmap_len);
 
 			dma_len = efx_max_tx_len(efx, dma_addr);
@@ -247,13 +245,14 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 			/* Fill out per descriptor fields */
 			buffer->len = dma_len;
 			buffer->dma_addr = dma_addr;
+			buffer->flags = EFX_TX_BUF_CONT;
 			len -= dma_len;
 			dma_addr += dma_len;
 			++tx_queue->insert_count;
 		} while (len);
 
 		/* Transfer ownership of the unmapping to the final buffer */
-		buffer->unmap_single = unmap_single;
+		buffer->flags = EFX_TX_BUF_CONT | dma_flags;
 		buffer->unmap_len = unmap_len;
 		unmap_len = 0;
 
@@ -264,14 +263,14 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 		len = skb_frag_size(fragment);
 		i++;
 		/* Map for DMA */
-		unmap_single = false;
+		dma_flags = 0;
 		dma_addr = skb_frag_dma_map(dma_dev, fragment, 0, len,
 					    DMA_TO_DEVICE);
 	}
 
 	/* Transfer ownership of the skb to the final buffer */
 	buffer->skb = skb;
-	buffer->continuation = false;
+	buffer->flags = EFX_TX_BUF_SKB | dma_flags;
 
 	netdev_tx_sent_queue(tx_queue->core_txq, skb->len);
 
@@ -302,7 +301,7 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 
 	/* Free the fragment we were mid-way through pushing */
 	if (unmap_len) {
-		if (unmap_single)
+		if (dma_flags & EFX_TX_BUF_MAP_SINGLE)
 			dma_unmap_single(dma_dev, unmap_addr, unmap_len,
 					 DMA_TO_DEVICE);
 		else
@@ -340,7 +339,6 @@ static void efx_dequeue_buffers(struct efx_tx_queue *tx_queue,
 		}
 
 		efx_dequeue_buffer(tx_queue, buffer, pkts_compl, bytes_compl);
-		buffer->continuation = true;
 		buffer->len = 0;
 
 		++tx_queue->read_count;
@@ -484,7 +482,7 @@ int efx_probe_tx_queue(struct efx_tx_queue *tx_queue)
 {
 	struct efx_nic *efx = tx_queue->efx;
 	unsigned int entries;
-	int i, rc;
+	int rc;
 
 	/* Create the smallest power-of-two aligned ring */
 	entries = max(roundup_pow_of_two(efx->txq_entries), EFX_MIN_DMAQ_SIZE);
@@ -500,8 +498,6 @@ int efx_probe_tx_queue(struct efx_tx_queue *tx_queue)
 				   GFP_KERNEL);
 	if (!tx_queue->buffer)
 		return -ENOMEM;
-	for (i = 0; i <= tx_queue->ptr_mask; ++i)
-		tx_queue->buffer[i].continuation = true;
 
 	/* Allocate hardware ring */
 	rc = efx_nic_probe_tx(tx_queue);
@@ -546,7 +542,6 @@ void efx_release_tx_buffers(struct efx_tx_queue *tx_queue)
 		unsigned int pkts_compl = 0, bytes_compl = 0;
 		buffer = &tx_queue->buffer[tx_queue->read_count & tx_queue->ptr_mask];
 		efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl);
-		buffer->continuation = true;
 		buffer->len = 0;
 
 		++tx_queue->read_count;
@@ -631,7 +626,7 @@ void efx_remove_tx_queue(struct efx_tx_queue *tx_queue)
  * @in_len: Remaining length in current SKB fragment
  * @unmap_len: Length of SKB fragment
  * @unmap_addr: DMA address of SKB fragment
- * @unmap_single: DMA single vs page mapping flag
+ * @dma_flags: TX buffer flags for DMA mapping - %EFX_TX_BUF_MAP_SINGLE or 0
  * @protocol: Network protocol (after any VLAN header)
  * @header_len: Number of bytes of header
  * @full_packet_size: Number of bytes to put in each outgoing segment
@@ -651,7 +646,7 @@ struct tso_state {
 	unsigned in_len;
 	unsigned unmap_len;
 	dma_addr_t unmap_addr;
-	bool unmap_single;
+	unsigned short dma_flags;
 
 	__be16 protocol;
 	unsigned header_len;
@@ -833,9 +828,7 @@ static int efx_tx_queue_insert(struct efx_tx_queue *tx_queue,
 		efx_tsoh_free(tx_queue, buffer);
 		EFX_BUG_ON_PARANOID(buffer->len);
 		EFX_BUG_ON_PARANOID(buffer->unmap_len);
-		EFX_BUG_ON_PARANOID(buffer->skb);
-		EFX_BUG_ON_PARANOID(!buffer->continuation);
-		EFX_BUG_ON_PARANOID(buffer->tsoh);
+		EFX_BUG_ON_PARANOID(buffer->flags);
 
 		buffer->dma_addr = dma_addr;
 
@@ -845,7 +838,8 @@ static int efx_tx_queue_insert(struct efx_tx_queue *tx_queue,
 		if (dma_len >= len)
 			break;
 
-		buffer->len = dma_len; /* Don't set the other members */
+		buffer->len = dma_len;
+		buffer->flags = EFX_TX_BUF_CONT;
 		dma_addr += dma_len;
 		len -= dma_len;
 	}
@@ -873,12 +867,11 @@ static void efx_tso_put_header(struct efx_tx_queue *tx_queue,
 	efx_tsoh_free(tx_queue, buffer);
 	EFX_BUG_ON_PARANOID(buffer->len);
 	EFX_BUG_ON_PARANOID(buffer->unmap_len);
-	EFX_BUG_ON_PARANOID(buffer->skb);
-	EFX_BUG_ON_PARANOID(!buffer->continuation);
-	EFX_BUG_ON_PARANOID(buffer->tsoh);
+	EFX_BUG_ON_PARANOID(buffer->flags);
 	buffer->len = len;
 	buffer->dma_addr = tsoh->dma_addr;
 	buffer->tsoh = tsoh;
+	buffer->flags = EFX_TX_BUF_TSOH | EFX_TX_BUF_CONT;
 
 	++tx_queue->insert_count;
 }
@@ -896,11 +889,11 @@ static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue)
 		buffer = &tx_queue->buffer[tx_queue->insert_count &
 					   tx_queue->ptr_mask];
 		efx_tsoh_free(tx_queue, buffer);
-		EFX_BUG_ON_PARANOID(buffer->skb);
+		EFX_BUG_ON_PARANOID(buffer->flags & EFX_TX_BUF_SKB);
 		if (buffer->unmap_len) {
 			unmap_addr = (buffer->dma_addr + buffer->len -
 				      buffer->unmap_len);
-			if (buffer->unmap_single)
+			if (buffer->flags & EFX_TX_BUF_MAP_SINGLE)
 				dma_unmap_single(&tx_queue->efx->pci_dev->dev,
 						 unmap_addr, buffer->unmap_len,
 						 DMA_TO_DEVICE);
@@ -911,7 +904,7 @@ static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue)
 			buffer->unmap_len = 0;
 		}
 		buffer->len = 0;
-		buffer->continuation = true;
+		buffer->flags = 0;
 	}
 }
 
@@ -938,7 +931,7 @@ static void tso_start(struct tso_state *st, const struct sk_buff *skb)
 
 	st->out_len = skb->len - st->header_len;
 	st->unmap_len = 0;
-	st->unmap_single = false;
+	st->dma_flags = 0;
 }
 
 static int tso_get_fragment(struct tso_state *st, struct efx_nic *efx,
@@ -947,7 +940,7 @@ static int tso_get_fragment(struct tso_state *st, struct efx_nic *efx,
 	st->unmap_addr = skb_frag_dma_map(&efx->pci_dev->dev, frag, 0,
 					  skb_frag_size(frag), DMA_TO_DEVICE);
 	if (likely(!dma_mapping_error(&efx->pci_dev->dev, st->unmap_addr))) {
-		st->unmap_single = false;
+		st->dma_flags = 0;
 		st->unmap_len = skb_frag_size(frag);
 		st->in_len = skb_frag_size(frag);
 		st->dma_addr = st->unmap_addr;
@@ -965,7 +958,7 @@ static int tso_get_head_fragment(struct tso_state *st, struct efx_nic *efx,
 	st->unmap_addr = dma_map_single(&efx->pci_dev->dev, skb->data + hl,
 					len, DMA_TO_DEVICE);
 	if (likely(!dma_mapping_error(&efx->pci_dev->dev, st->unmap_addr))) {
-		st->unmap_single = true;
+		st->dma_flags = EFX_TX_BUF_MAP_SINGLE;
 		st->unmap_len = len;
 		st->in_len = len;
 		st->dma_addr = st->unmap_addr;
@@ -990,7 +983,7 @@ static int tso_fill_packet_with_fragment(struct efx_tx_queue *tx_queue,
 					 struct tso_state *st)
 {
 	struct efx_tx_buffer *buffer;
-	int n, end_of_packet, rc;
+	int n, rc;
 
 	if (st->in_len == 0)
 		return 0;
@@ -1008,17 +1001,18 @@ static int tso_fill_packet_with_fragment(struct efx_tx_queue *tx_queue,
 
 	rc = efx_tx_queue_insert(tx_queue, st->dma_addr, n, &buffer);
 	if (likely(rc == 0)) {
-		if (st->out_len == 0)
+		if (st->out_len == 0) {
 			/* Transfer ownership of the skb */
 			buffer->skb = skb;
-
-		end_of_packet = st->out_len == 0 || st->packet_space == 0;
-		buffer->continuation = !end_of_packet;
+			buffer->flags = EFX_TX_BUF_SKB;
+		} else if (st->packet_space != 0) {
+			buffer->flags = EFX_TX_BUF_CONT;
+		}
 
 		if (st->in_len == 0) {
 			/* Transfer ownership of the DMA mapping */
 			buffer->unmap_len = st->unmap_len;
-			buffer->unmap_single = st->unmap_single;
+			buffer->flags |= st->dma_flags;
 			st->unmap_len = 0;
 		}
 	}
@@ -1195,7 +1189,7 @@ static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
  unwind:
 	/* Free the DMA mapping we were in the process of writing out */
 	if (state.unmap_len) {
-		if (state.unmap_single)
+		if (state.dma_flags & EFX_TX_BUF_MAP_SINGLE)
 			dma_unmap_single(&efx->pci_dev->dev, state.unmap_addr,
 					 state.unmap_len, DMA_TO_DEVICE);
 		else

From 14bf718fb97efe9ff649c317e7d87a3617b13e7c Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Tue, 22 May 2012 01:27:58 +0100
Subject: [PATCH 02/16] sfc: Stop TX queues before they fill up

We now have a definite upper bound on the number of descriptors per
skb; use that to stop the queue when the next packet might not fit.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/efx.c        |  10 ++
 drivers/net/ethernet/sfc/net_driver.h |   5 +
 drivers/net/ethernet/sfc/tx.c         | 212 ++++++++++++--------------
 3 files changed, 112 insertions(+), 115 deletions(-)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 65a8d49106a4..3b3f08489a5e 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -630,6 +630,16 @@ static void efx_start_datapath(struct efx_nic *efx)
 	efx->rx_buffer_order = get_order(efx->rx_buffer_len +
 					 sizeof(struct efx_rx_page_state));
 
+	/* We must keep at least one descriptor in a TX ring empty.
+	 * We could avoid this when the queue size does not exactly
+	 * match the hardware ring size, but it's not that important.
+	 * Therefore we stop the queue when one more skb might fill
+	 * the ring completely.  We wake it when half way back to
+	 * empty.
+	 */
+	efx->txq_stop_thresh = efx->txq_entries - efx_tx_max_skb_descs(efx);
+	efx->txq_wake_thresh = efx->txq_stop_thresh / 2;
+
 	/* Initialise the channels */
 	efx_for_each_channel(channel, efx) {
 		efx_for_each_channel_tx_queue(tx_queue, channel)
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index 0ac01fa6e63c..28a6d6258692 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -665,6 +665,8 @@ struct vfdi_status;
  *	should be allocated for this NIC
  * @rxq_entries: Size of receive queues requested by user.
  * @txq_entries: Size of transmit queues requested by user.
+ * @txq_stop_thresh: TX queue fill level at or above which we stop it.
+ * @txq_wake_thresh: TX queue fill level at or below which we wake it.
  * @tx_dc_base: Base qword address in SRAM of TX queue descriptor caches
  * @rx_dc_base: Base qword address in SRAM of RX queue descriptor caches
  * @sram_lim_qw: Qword address limit of SRAM
@@ -775,6 +777,9 @@ struct efx_nic {
 
 	unsigned rxq_entries;
 	unsigned txq_entries;
+	unsigned int txq_stop_thresh;
+	unsigned int txq_wake_thresh;
+
 	unsigned tx_dc_base;
 	unsigned rx_dc_base;
 	unsigned sram_lim_qw;
diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
index 24c82f3ce0f3..330d9111a339 100644
--- a/drivers/net/ethernet/sfc/tx.c
+++ b/drivers/net/ethernet/sfc/tx.c
@@ -22,14 +22,6 @@
 #include "nic.h"
 #include "workarounds.h"
 
-/*
- * TX descriptor ring full threshold
- *
- * The tx_queue descriptor ring fill-level must fall below this value
- * before we restart the netif queue
- */
-#define EFX_TXQ_THRESHOLD(_efx) ((_efx)->txq_entries / 2u)
-
 static void efx_dequeue_buffer(struct efx_tx_queue *tx_queue,
 			       struct efx_tx_buffer *buffer,
 			       unsigned int *pkts_compl,
@@ -138,6 +130,56 @@ unsigned int efx_tx_max_skb_descs(struct efx_nic *efx)
 	return max_descs;
 }
 
+/* Get partner of a TX queue, seen as part of the same net core queue */
+static struct efx_tx_queue *efx_tx_queue_partner(struct efx_tx_queue *tx_queue)
+{
+	if (tx_queue->queue & EFX_TXQ_TYPE_OFFLOAD)
+		return tx_queue - EFX_TXQ_TYPE_OFFLOAD;
+	else
+		return tx_queue + EFX_TXQ_TYPE_OFFLOAD;
+}
+
+static void efx_tx_maybe_stop_queue(struct efx_tx_queue *txq1)
+{
+	/* We need to consider both queues that the net core sees as one */
+	struct efx_tx_queue *txq2 = efx_tx_queue_partner(txq1);
+	struct efx_nic *efx = txq1->efx;
+	unsigned int fill_level;
+
+	fill_level = max(txq1->insert_count - txq1->old_read_count,
+			 txq2->insert_count - txq2->old_read_count);
+	if (likely(fill_level < efx->txq_stop_thresh))
+		return;
+
+	/* We used the stale old_read_count above, which gives us a
+	 * pessimistic estimate of the fill level (which may even
+	 * validly be >= efx->txq_entries).  Now try again using
+	 * read_count (more likely to be a cache miss).
+	 *
+	 * If we read read_count and then conditionally stop the
+	 * queue, it is possible for the completion path to race with
+	 * us and complete all outstanding descriptors in the middle,
+	 * after which there will be no more completions to wake it.
+	 * Therefore we stop the queue first, then read read_count
+	 * (with a memory barrier to ensure the ordering), then
+	 * restart the queue if the fill level turns out to be low
+	 * enough.
+	 */
+	netif_tx_stop_queue(txq1->core_txq);
+	smp_mb();
+	txq1->old_read_count = ACCESS_ONCE(txq1->read_count);
+	txq2->old_read_count = ACCESS_ONCE(txq2->read_count);
+
+	fill_level = max(txq1->insert_count - txq1->old_read_count,
+			 txq2->insert_count - txq2->old_read_count);
+	EFX_BUG_ON_PARANOID(fill_level >= efx->txq_entries);
+	if (likely(fill_level < efx->txq_stop_thresh)) {
+		smp_mb();
+		if (likely(!efx->loopback_selftest))
+			netif_tx_start_queue(txq1->core_txq);
+	}
+}
+
 /*
  * Add a socket buffer to a TX queue
  *
@@ -151,7 +193,7 @@ unsigned int efx_tx_max_skb_descs(struct efx_nic *efx)
  * This function is split out from efx_hard_start_xmit to allow the
  * loopback test to direct packets via specific TX queues.
  *
- * Returns NETDEV_TX_OK or NETDEV_TX_BUSY
+ * Returns NETDEV_TX_OK.
  * You must hold netif_tx_lock() to call this function.
  */
 netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
@@ -160,12 +202,11 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 	struct device *dma_dev = &efx->pci_dev->dev;
 	struct efx_tx_buffer *buffer;
 	skb_frag_t *fragment;
-	unsigned int len, unmap_len = 0, fill_level, insert_ptr;
+	unsigned int len, unmap_len = 0, insert_ptr;
 	dma_addr_t dma_addr, unmap_addr = 0;
 	unsigned int dma_len;
 	unsigned short dma_flags;
-	int q_space, i = 0;
-	netdev_tx_t rc = NETDEV_TX_OK;
+	int i = 0;
 
 	EFX_BUG_ON_PARANOID(tx_queue->write_count != tx_queue->insert_count);
 
@@ -183,9 +224,6 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 			return NETDEV_TX_OK;
 	}
 
-	fill_level = tx_queue->insert_count - tx_queue->old_read_count;
-	q_space = efx->txq_entries - 1 - fill_level;
-
 	/* Map for DMA.  Use dma_map_single rather than dma_map_page
 	 * since this is more efficient on machines with sparse
 	 * memory.
@@ -205,32 +243,6 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 
 		/* Add to TX queue, splitting across DMA boundaries */
 		do {
-			if (unlikely(q_space-- <= 0)) {
-				/* It might be that completions have
-				 * happened since the xmit path last
-				 * checked.  Update the xmit path's
-				 * copy of read_count.
-				 */
-				netif_tx_stop_queue(tx_queue->core_txq);
-				/* This memory barrier protects the
-				 * change of queue state from the access
-				 * of read_count. */
-				smp_mb();
-				tx_queue->old_read_count =
-					ACCESS_ONCE(tx_queue->read_count);
-				fill_level = (tx_queue->insert_count
-					      - tx_queue->old_read_count);
-				q_space = efx->txq_entries - 1 - fill_level;
-				if (unlikely(q_space-- <= 0)) {
-					rc = NETDEV_TX_BUSY;
-					goto unwind;
-				}
-				smp_mb();
-				if (likely(!efx->loopback_selftest))
-					netif_tx_start_queue(
-						tx_queue->core_txq);
-			}
-
 			insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask;
 			buffer = &tx_queue->buffer[insert_ptr];
 			efx_tsoh_free(tx_queue, buffer);
@@ -277,6 +289,8 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 	/* Pass off to hardware */
 	efx_nic_push_buffers(tx_queue);
 
+	efx_tx_maybe_stop_queue(tx_queue);
+
 	return NETDEV_TX_OK;
 
  dma_err:
@@ -288,7 +302,6 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 	/* Mark the packet as transmitted, and free the SKB ourselves */
 	dev_kfree_skb_any(skb);
 
- unwind:
 	/* Work backwards until we hit the original insert pointer value */
 	while (tx_queue->insert_count != tx_queue->write_count) {
 		unsigned int pkts_compl = 0, bytes_compl = 0;
@@ -309,7 +322,7 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 				       DMA_TO_DEVICE);
 	}
 
-	return rc;
+	return NETDEV_TX_OK;
 }
 
 /* Remove packets from the TX queue
@@ -448,6 +461,7 @@ void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index)
 {
 	unsigned fill_level;
 	struct efx_nic *efx = tx_queue->efx;
+	struct efx_tx_queue *txq2;
 	unsigned int pkts_compl = 0, bytes_compl = 0;
 
 	EFX_BUG_ON_PARANOID(index > tx_queue->ptr_mask);
@@ -455,15 +469,18 @@ void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index)
 	efx_dequeue_buffers(tx_queue, index, &pkts_compl, &bytes_compl);
 	netdev_tx_completed_queue(tx_queue->core_txq, pkts_compl, bytes_compl);
 
-	/* See if we need to restart the netif queue.  This barrier
-	 * separates the update of read_count from the test of the
-	 * queue state. */
+	/* See if we need to restart the netif queue.  This memory
+	 * barrier ensures that we write read_count (inside
+	 * efx_dequeue_buffers()) before reading the queue status.
+	 */
 	smp_mb();
 	if (unlikely(netif_tx_queue_stopped(tx_queue->core_txq)) &&
 	    likely(efx->port_enabled) &&
 	    likely(netif_device_present(efx->net_dev))) {
-		fill_level = tx_queue->insert_count - tx_queue->read_count;
-		if (fill_level < EFX_TXQ_THRESHOLD(efx))
+		txq2 = efx_tx_queue_partner(tx_queue);
+		fill_level = max(tx_queue->insert_count - tx_queue->read_count,
+				 txq2->insert_count - txq2->read_count);
+		if (fill_level <= efx->txq_wake_thresh)
 			netif_tx_wake_queue(tx_queue->core_txq);
 	}
 
@@ -776,47 +793,19 @@ efx_tsoh_heap_free(struct efx_tx_queue *tx_queue, struct efx_tso_header *tsoh)
  * @len:		Length of fragment
  * @final_buffer:	The final buffer inserted into the queue
  *
- * Push descriptors onto the TX queue.  Return 0 on success or 1 if
- * @tx_queue full.
+ * Push descriptors onto the TX queue.
  */
-static int efx_tx_queue_insert(struct efx_tx_queue *tx_queue,
-			       dma_addr_t dma_addr, unsigned len,
-			       struct efx_tx_buffer **final_buffer)
+static void efx_tx_queue_insert(struct efx_tx_queue *tx_queue,
+				dma_addr_t dma_addr, unsigned len,
+				struct efx_tx_buffer **final_buffer)
 {
 	struct efx_tx_buffer *buffer;
 	struct efx_nic *efx = tx_queue->efx;
-	unsigned dma_len, fill_level, insert_ptr;
-	int q_space;
+	unsigned dma_len, insert_ptr;
 
 	EFX_BUG_ON_PARANOID(len <= 0);
 
-	fill_level = tx_queue->insert_count - tx_queue->old_read_count;
-	/* -1 as there is no way to represent all descriptors used */
-	q_space = efx->txq_entries - 1 - fill_level;
-
 	while (1) {
-		if (unlikely(q_space-- <= 0)) {
-			/* It might be that completions have happened
-			 * since the xmit path last checked.  Update
-			 * the xmit path's copy of read_count.
-			 */
-			netif_tx_stop_queue(tx_queue->core_txq);
-			/* This memory barrier protects the change of
-			 * queue state from the access of read_count. */
-			smp_mb();
-			tx_queue->old_read_count =
-				ACCESS_ONCE(tx_queue->read_count);
-			fill_level = (tx_queue->insert_count
-				      - tx_queue->old_read_count);
-			q_space = efx->txq_entries - 1 - fill_level;
-			if (unlikely(q_space-- <= 0)) {
-				*final_buffer = NULL;
-				return 1;
-			}
-			smp_mb();
-			netif_tx_start_queue(tx_queue->core_txq);
-		}
-
 		insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask;
 		buffer = &tx_queue->buffer[insert_ptr];
 		++tx_queue->insert_count;
@@ -847,7 +836,6 @@ static int efx_tx_queue_insert(struct efx_tx_queue *tx_queue,
 	EFX_BUG_ON_PARANOID(!len);
 	buffer->len = len;
 	*final_buffer = buffer;
-	return 0;
 }
 
 
@@ -975,20 +963,19 @@ static int tso_get_head_fragment(struct tso_state *st, struct efx_nic *efx,
  * @st:			TSO state
  *
  * Form descriptors for the current fragment, until we reach the end
- * of fragment or end-of-packet.  Return 0 on success, 1 if not enough
- * space in @tx_queue.
+ * of fragment or end-of-packet.
  */
-static int tso_fill_packet_with_fragment(struct efx_tx_queue *tx_queue,
-					 const struct sk_buff *skb,
-					 struct tso_state *st)
+static void tso_fill_packet_with_fragment(struct efx_tx_queue *tx_queue,
+					  const struct sk_buff *skb,
+					  struct tso_state *st)
 {
 	struct efx_tx_buffer *buffer;
-	int n, rc;
+	int n;
 
 	if (st->in_len == 0)
-		return 0;
+		return;
 	if (st->packet_space == 0)
-		return 0;
+		return;
 
 	EFX_BUG_ON_PARANOID(st->in_len <= 0);
 	EFX_BUG_ON_PARANOID(st->packet_space <= 0);
@@ -999,26 +986,24 @@ static int tso_fill_packet_with_fragment(struct efx_tx_queue *tx_queue,
 	st->out_len -= n;
 	st->in_len -= n;
 
-	rc = efx_tx_queue_insert(tx_queue, st->dma_addr, n, &buffer);
-	if (likely(rc == 0)) {
-		if (st->out_len == 0) {
-			/* Transfer ownership of the skb */
-			buffer->skb = skb;
-			buffer->flags = EFX_TX_BUF_SKB;
-		} else if (st->packet_space != 0) {
-			buffer->flags = EFX_TX_BUF_CONT;
-		}
+	efx_tx_queue_insert(tx_queue, st->dma_addr, n, &buffer);
 
-		if (st->in_len == 0) {
-			/* Transfer ownership of the DMA mapping */
-			buffer->unmap_len = st->unmap_len;
-			buffer->flags |= st->dma_flags;
-			st->unmap_len = 0;
-		}
+	if (st->out_len == 0) {
+		/* Transfer ownership of the skb */
+		buffer->skb = skb;
+		buffer->flags = EFX_TX_BUF_SKB;
+	} else if (st->packet_space != 0) {
+		buffer->flags = EFX_TX_BUF_CONT;
+	}
+
+	if (st->in_len == 0) {
+		/* Transfer ownership of the DMA mapping */
+		buffer->unmap_len = st->unmap_len;
+		buffer->flags |= st->dma_flags;
+		st->unmap_len = 0;
 	}
 
 	st->dma_addr += n;
-	return rc;
 }
 
 
@@ -1112,13 +1097,13 @@ static int tso_start_new_packet(struct efx_tx_queue *tx_queue,
  *
  * Add socket buffer @skb to @tx_queue, doing TSO or return != 0 if
  * @skb was not enqueued.  In all cases @skb is consumed.  Return
- * %NETDEV_TX_OK or %NETDEV_TX_BUSY.
+ * %NETDEV_TX_OK.
  */
 static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
 			       struct sk_buff *skb)
 {
 	struct efx_nic *efx = tx_queue->efx;
-	int frag_i, rc, rc2 = NETDEV_TX_OK;
+	int frag_i, rc;
 	struct tso_state state;
 
 	/* Find the packet protocol and sanity-check it */
@@ -1150,11 +1135,7 @@ static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
 		goto mem_err;
 
 	while (1) {
-		rc = tso_fill_packet_with_fragment(tx_queue, skb, &state);
-		if (unlikely(rc)) {
-			rc2 = NETDEV_TX_BUSY;
-			goto unwind;
-		}
+		tso_fill_packet_with_fragment(tx_queue, skb, &state);
 
 		/* Move onto the next fragment? */
 		if (state.in_len == 0) {
@@ -1178,6 +1159,8 @@ static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
 	/* Pass off to hardware */
 	efx_nic_push_buffers(tx_queue);
 
+	efx_tx_maybe_stop_queue(tx_queue);
+
 	tx_queue->tso_bursts++;
 	return NETDEV_TX_OK;
 
@@ -1186,7 +1169,6 @@ static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
 		  "Out of memory for TSO headers, or DMA mapping error\n");
 	dev_kfree_skb_any(skb);
 
- unwind:
 	/* Free the DMA mapping we were in the process of writing out */
 	if (state.unmap_len) {
 		if (state.dma_flags & EFX_TX_BUF_MAP_SINGLE)
@@ -1198,7 +1180,7 @@ static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
 	}
 
 	efx_enqueue_unwind(tx_queue);
-	return rc2;
+	return NETDEV_TX_OK;
 }
 
 

From f7251a9ce936f1006fbfdef63dbe42ae5e0fee7c Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Thu, 17 May 2012 18:40:54 +0100
Subject: [PATCH 03/16] sfc: Simplify TSO header buffer allocation

TSO header buffers contain a control structure immediately followed by
the packet headers, and are kept on a free list when not in use.  This
complicates buffer management and tends to result in cache read misses
when we recycle such buffers (particularly if DMA-coherent memory
requires caches to be disabled).

Replace the free list with a simple mapping by descriptor index.  We
know that there is always a payload descriptor between any two
descriptors with TSO header buffers, so we can allocate only one
such buffer for each two descriptors.

While we're at it, use a standard error code for allocation failure,
not -1.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/net_driver.h |  13 +-
 drivers/net/ethernet/sfc/nic.c        |   2 +-
 drivers/net/ethernet/sfc/tx.c         | 323 +++++++++-----------------
 3 files changed, 116 insertions(+), 222 deletions(-)

diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index 28a6d6258692..a4fe9a786ef8 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -94,7 +94,8 @@ struct efx_special_buffer {
  * struct efx_tx_buffer - buffer state for a TX descriptor
  * @skb: When @flags & %EFX_TX_BUF_SKB, the associated socket buffer to be
  *	freed when descriptor completes
- * @tsoh: When @flags & %EFX_TX_BUF_TSOH, the associated TSO header structure.
+ * @heap_buf: When @flags & %EFX_TX_BUF_HEAP, the associated heap buffer to be
+ *	freed when descriptor completes.
  * @dma_addr: DMA address of the fragment.
  * @flags: Flags for allocation and DMA mapping type
  * @len: Length of this fragment.
@@ -104,7 +105,7 @@ struct efx_special_buffer {
 struct efx_tx_buffer {
 	union {
 		const struct sk_buff *skb;
-		struct efx_tso_header *tsoh;
+		void *heap_buf;
 	};
 	dma_addr_t dma_addr;
 	unsigned short flags;
@@ -113,7 +114,7 @@ struct efx_tx_buffer {
 };
 #define EFX_TX_BUF_CONT		1	/* not last descriptor of packet */
 #define EFX_TX_BUF_SKB		2	/* buffer is last part of skb */
-#define EFX_TX_BUF_TSOH		4	/* buffer is TSO header */
+#define EFX_TX_BUF_HEAP		4	/* buffer was allocated with kmalloc() */
 #define EFX_TX_BUF_MAP_SINGLE	8	/* buffer was mapped with dma_map_single() */
 
 /**
@@ -134,6 +135,7 @@ struct efx_tx_buffer {
  * @channel: The associated channel
  * @core_txq: The networking core TX queue structure
  * @buffer: The software buffer ring
+ * @tsoh_page: Array of pages of TSO header buffers
  * @txd: The hardware descriptor ring
  * @ptr_mask: The size of the ring minus 1.
  * @initialised: Has hardware queue been initialised?
@@ -157,9 +159,6 @@ struct efx_tx_buffer {
  *	variable indicates that the queue is full.  This is to
  *	avoid cache-line ping-pong between the xmit path and the
  *	completion path.
- * @tso_headers_free: A list of TSO headers allocated for this TX queue
- *	that are not in use, and so available for new TSO sends. The list
- *	is protected by the TX queue lock.
  * @tso_bursts: Number of times TSO xmit invoked by kernel
  * @tso_long_headers: Number of packets with headers too long for standard
  *	blocks
@@ -176,6 +175,7 @@ struct efx_tx_queue {
 	struct efx_channel *channel;
 	struct netdev_queue *core_txq;
 	struct efx_tx_buffer *buffer;
+	struct efx_buffer *tsoh_page;
 	struct efx_special_buffer txd;
 	unsigned int ptr_mask;
 	bool initialised;
@@ -188,7 +188,6 @@ struct efx_tx_queue {
 	unsigned int insert_count ____cacheline_aligned_in_smp;
 	unsigned int write_count;
 	unsigned int old_read_count;
-	struct efx_tso_header *tso_headers_free;
 	unsigned int tso_bursts;
 	unsigned int tso_long_headers;
 	unsigned int tso_packets;
diff --git a/drivers/net/ethernet/sfc/nic.c b/drivers/net/ethernet/sfc/nic.c
index aa113709831d..cdff40b65729 100644
--- a/drivers/net/ethernet/sfc/nic.c
+++ b/drivers/net/ethernet/sfc/nic.c
@@ -298,7 +298,7 @@ efx_free_special_buffer(struct efx_nic *efx, struct efx_special_buffer *buffer)
 /**************************************************************************
  *
  * Generic buffer handling
- * These buffers are used for interrupt status and MAC stats
+ * These buffers are used for interrupt status, MAC stats, etc.
  *
  **************************************************************************/
 
diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
index 330d9111a339..61bc0ed718e3 100644
--- a/drivers/net/ethernet/sfc/tx.c
+++ b/drivers/net/ethernet/sfc/tx.c
@@ -47,51 +47,16 @@ static void efx_dequeue_buffer(struct efx_tx_queue *tx_queue,
 		netif_vdbg(tx_queue->efx, tx_done, tx_queue->efx->net_dev,
 			   "TX queue %d transmission id %x complete\n",
 			   tx_queue->queue, tx_queue->read_count);
+	} else if (buffer->flags & EFX_TX_BUF_HEAP) {
+		kfree(buffer->heap_buf);
 	}
 
-	buffer->flags &= EFX_TX_BUF_TSOH;
+	buffer->len = 0;
+	buffer->flags = 0;
 }
 
-/**
- * struct efx_tso_header - a DMA mapped buffer for packet headers
- * @next: Linked list of free ones.
- *	The list is protected by the TX queue lock.
- * @dma_unmap_len: Length to unmap for an oversize buffer, or 0.
- * @dma_addr: The DMA address of the header below.
- *
- * This controls the memory used for a TSO header.  Use TSOH_DATA()
- * to find the packet header data.  Use TSOH_SIZE() to calculate the
- * total size required for a given packet header length.  TSO headers
- * in the free list are exactly %TSOH_STD_SIZE bytes in size.
- */
-struct efx_tso_header {
-	union {
-		struct efx_tso_header *next;
-		size_t unmap_len;
-	};
-	dma_addr_t dma_addr;
-};
-
 static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
 			       struct sk_buff *skb);
-static void efx_fini_tso(struct efx_tx_queue *tx_queue);
-static void efx_tsoh_heap_free(struct efx_tx_queue *tx_queue,
-			       struct efx_tso_header *tsoh);
-
-static void efx_tsoh_free(struct efx_tx_queue *tx_queue,
-			  struct efx_tx_buffer *buffer)
-{
-	if (buffer->flags & EFX_TX_BUF_TSOH) {
-		if (likely(!buffer->tsoh->unmap_len)) {
-			buffer->tsoh->next = tx_queue->tso_headers_free;
-			tx_queue->tso_headers_free = buffer->tsoh;
-		} else {
-			efx_tsoh_heap_free(tx_queue, buffer->tsoh);
-		}
-		buffer->flags &= ~EFX_TX_BUF_TSOH;
-	}
-}
-
 
 static inline unsigned
 efx_max_tx_len(struct efx_nic *efx, dma_addr_t dma_addr)
@@ -245,7 +210,6 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 		do {
 			insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask;
 			buffer = &tx_queue->buffer[insert_ptr];
-			efx_tsoh_free(tx_queue, buffer);
 			EFX_BUG_ON_PARANOID(buffer->flags);
 			EFX_BUG_ON_PARANOID(buffer->len);
 			EFX_BUG_ON_PARANOID(buffer->unmap_len);
@@ -309,7 +273,6 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 		insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask;
 		buffer = &tx_queue->buffer[insert_ptr];
 		efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl);
-		buffer->len = 0;
 	}
 
 	/* Free the fragment we were mid-way through pushing */
@@ -352,7 +315,6 @@ static void efx_dequeue_buffers(struct efx_tx_queue *tx_queue,
 		}
 
 		efx_dequeue_buffer(tx_queue, buffer, pkts_compl, bytes_compl);
-		buffer->len = 0;
 
 		++tx_queue->read_count;
 		read_ptr = tx_queue->read_count & tx_queue->ptr_mask;
@@ -495,6 +457,21 @@ void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index)
 	}
 }
 
+/* Size of page-based TSO header buffers.  Larger blocks must be
+ * allocated from the heap.
+ */
+#define TSOH_STD_SIZE	128
+#define TSOH_PER_PAGE	(PAGE_SIZE / TSOH_STD_SIZE)
+
+/* At most half the descriptors in the queue at any time will refer to
+ * a TSO header buffer, since they must always be followed by a
+ * payload descriptor referring to an skb.
+ */
+static unsigned int efx_tsoh_page_count(struct efx_tx_queue *tx_queue)
+{
+	return DIV_ROUND_UP(tx_queue->ptr_mask + 1, 2 * TSOH_PER_PAGE);
+}
+
 int efx_probe_tx_queue(struct efx_tx_queue *tx_queue)
 {
 	struct efx_nic *efx = tx_queue->efx;
@@ -516,14 +493,27 @@ int efx_probe_tx_queue(struct efx_tx_queue *tx_queue)
 	if (!tx_queue->buffer)
 		return -ENOMEM;
 
+	if (tx_queue->queue & EFX_TXQ_TYPE_OFFLOAD) {
+		tx_queue->tsoh_page =
+			kcalloc(efx_tsoh_page_count(tx_queue),
+				sizeof(tx_queue->tsoh_page[0]), GFP_KERNEL);
+		if (!tx_queue->tsoh_page) {
+			rc = -ENOMEM;
+			goto fail1;
+		}
+	}
+
 	/* Allocate hardware ring */
 	rc = efx_nic_probe_tx(tx_queue);
 	if (rc)
-		goto fail;
+		goto fail2;
 
 	return 0;
 
- fail:
+fail2:
+	kfree(tx_queue->tsoh_page);
+	tx_queue->tsoh_page = NULL;
+fail1:
 	kfree(tx_queue->buffer);
 	tx_queue->buffer = NULL;
 	return rc;
@@ -559,7 +549,6 @@ void efx_release_tx_buffers(struct efx_tx_queue *tx_queue)
 		unsigned int pkts_compl = 0, bytes_compl = 0;
 		buffer = &tx_queue->buffer[tx_queue->read_count & tx_queue->ptr_mask];
 		efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl);
-		buffer->len = 0;
 
 		++tx_queue->read_count;
 	}
@@ -580,13 +569,12 @@ void efx_fini_tx_queue(struct efx_tx_queue *tx_queue)
 	efx_nic_fini_tx(tx_queue);
 
 	efx_release_tx_buffers(tx_queue);
-
-	/* Free up TSO header cache */
-	efx_fini_tso(tx_queue);
 }
 
 void efx_remove_tx_queue(struct efx_tx_queue *tx_queue)
 {
+	int i;
+
 	if (!tx_queue->buffer)
 		return;
 
@@ -594,6 +582,14 @@ void efx_remove_tx_queue(struct efx_tx_queue *tx_queue)
 		  "destroying TX queue %d\n", tx_queue->queue);
 	efx_nic_remove_tx(tx_queue);
 
+	if (tx_queue->tsoh_page) {
+		for (i = 0; i < efx_tsoh_page_count(tx_queue); i++)
+			efx_nic_free_buffer(tx_queue->efx,
+					    &tx_queue->tsoh_page[i]);
+		kfree(tx_queue->tsoh_page);
+		tx_queue->tsoh_page = NULL;
+	}
+
 	kfree(tx_queue->buffer);
 	tx_queue->buffer = NULL;
 }
@@ -616,17 +612,6 @@ void efx_remove_tx_queue(struct efx_tx_queue *tx_queue)
 #define TSOH_OFFSET	NET_IP_ALIGN
 #endif
 
-#define TSOH_BUFFER(tsoh)	((u8 *)(tsoh + 1) + TSOH_OFFSET)
-
-/* Total size of struct efx_tso_header, buffer and padding */
-#define TSOH_SIZE(hdr_len)					\
-	(sizeof(struct efx_tso_header) + TSOH_OFFSET + hdr_len)
-
-/* Size of blocks on free list.  Larger blocks must be allocated from
- * the heap.
- */
-#define TSOH_STD_SIZE		128
-
 #define PTR_DIFF(p1, p2)  ((u8 *)(p1) - (u8 *)(p2))
 #define ETH_HDR_LEN(skb)  (skb_network_header(skb) - (skb)->data)
 #define SKB_TCP_OFF(skb)  PTR_DIFF(tcp_hdr(skb), (skb)->data)
@@ -699,91 +684,43 @@ static __be16 efx_tso_check_protocol(struct sk_buff *skb)
 	return protocol;
 }
 
-
-/*
- * Allocate a page worth of efx_tso_header structures, and string them
- * into the tx_queue->tso_headers_free linked list. Return 0 or -ENOMEM.
- */
-static int efx_tsoh_block_alloc(struct efx_tx_queue *tx_queue)
+static u8 *efx_tsoh_get_buffer(struct efx_tx_queue *tx_queue,
+			       struct efx_tx_buffer *buffer, unsigned int len)
 {
-	struct device *dma_dev = &tx_queue->efx->pci_dev->dev;
-	struct efx_tso_header *tsoh;
-	dma_addr_t dma_addr;
-	u8 *base_kva, *kva;
+	u8 *result;
 
-	base_kva = dma_alloc_coherent(dma_dev, PAGE_SIZE, &dma_addr, GFP_ATOMIC);
-	if (base_kva == NULL) {
-		netif_err(tx_queue->efx, tx_err, tx_queue->efx->net_dev,
-			  "Unable to allocate page for TSO headers\n");
-		return -ENOMEM;
+	EFX_BUG_ON_PARANOID(buffer->len);
+	EFX_BUG_ON_PARANOID(buffer->flags);
+	EFX_BUG_ON_PARANOID(buffer->unmap_len);
+
+	if (likely(len <= TSOH_STD_SIZE - TSOH_OFFSET)) {
+		unsigned index =
+			(tx_queue->insert_count & tx_queue->ptr_mask) / 2;
+		struct efx_buffer *page_buf =
+			&tx_queue->tsoh_page[index / TSOH_PER_PAGE];
+		unsigned offset =
+			TSOH_STD_SIZE * (index % TSOH_PER_PAGE) + TSOH_OFFSET;
+
+		if (unlikely(!page_buf->addr) &&
+		    efx_nic_alloc_buffer(tx_queue->efx, page_buf, PAGE_SIZE))
+			return NULL;
+
+		result = (u8 *)page_buf->addr + offset;
+		buffer->dma_addr = page_buf->dma_addr + offset;
+		buffer->flags = EFX_TX_BUF_CONT;
+	} else {
+		tx_queue->tso_long_headers++;
+
+		buffer->heap_buf = kmalloc(TSOH_OFFSET + len, GFP_ATOMIC);
+		if (unlikely(!buffer->heap_buf))
+			return NULL;
+		result = (u8 *)buffer->heap_buf + TSOH_OFFSET;
+		buffer->flags = EFX_TX_BUF_CONT | EFX_TX_BUF_HEAP;
 	}
 
-	/* dma_alloc_coherent() allocates pages. */
-	EFX_BUG_ON_PARANOID(dma_addr & (PAGE_SIZE - 1u));
+	buffer->len = len;
 
-	for (kva = base_kva; kva < base_kva + PAGE_SIZE; kva += TSOH_STD_SIZE) {
-		tsoh = (struct efx_tso_header *)kva;
-		tsoh->dma_addr = dma_addr + (TSOH_BUFFER(tsoh) - base_kva);
-		tsoh->next = tx_queue->tso_headers_free;
-		tx_queue->tso_headers_free = tsoh;
-	}
-
-	return 0;
-}
-
-
-/* Free up a TSO header, and all others in the same page. */
-static void efx_tsoh_block_free(struct efx_tx_queue *tx_queue,
-				struct efx_tso_header *tsoh,
-				struct device *dma_dev)
-{
-	struct efx_tso_header **p;
-	unsigned long base_kva;
-	dma_addr_t base_dma;
-
-	base_kva = (unsigned long)tsoh & PAGE_MASK;
-	base_dma = tsoh->dma_addr & PAGE_MASK;
-
-	p = &tx_queue->tso_headers_free;
-	while (*p != NULL) {
-		if (((unsigned long)*p & PAGE_MASK) == base_kva)
-			*p = (*p)->next;
-		else
-			p = &(*p)->next;
-	}
-
-	dma_free_coherent(dma_dev, PAGE_SIZE, (void *)base_kva, base_dma);
-}
-
-static struct efx_tso_header *
-efx_tsoh_heap_alloc(struct efx_tx_queue *tx_queue, size_t header_len)
-{
-	struct efx_tso_header *tsoh;
-
-	tsoh = kmalloc(TSOH_SIZE(header_len), GFP_ATOMIC | GFP_DMA);
-	if (unlikely(!tsoh))
-		return NULL;
-
-	tsoh->dma_addr = dma_map_single(&tx_queue->efx->pci_dev->dev,
-					TSOH_BUFFER(tsoh), header_len,
-					DMA_TO_DEVICE);
-	if (unlikely(dma_mapping_error(&tx_queue->efx->pci_dev->dev,
-				       tsoh->dma_addr))) {
-		kfree(tsoh);
-		return NULL;
-	}
-
-	tsoh->unmap_len = header_len;
-	return tsoh;
-}
-
-static void
-efx_tsoh_heap_free(struct efx_tx_queue *tx_queue, struct efx_tso_header *tsoh)
-{
-	dma_unmap_single(&tx_queue->efx->pci_dev->dev,
-			 tsoh->dma_addr, tsoh->unmap_len,
-			 DMA_TO_DEVICE);
-	kfree(tsoh);
+	return result;
 }
 
 /**
@@ -814,7 +751,6 @@ static void efx_tx_queue_insert(struct efx_tx_queue *tx_queue,
 				    tx_queue->read_count >=
 				    efx->txq_entries);
 
-		efx_tsoh_free(tx_queue, buffer);
 		EFX_BUG_ON_PARANOID(buffer->len);
 		EFX_BUG_ON_PARANOID(buffer->unmap_len);
 		EFX_BUG_ON_PARANOID(buffer->flags);
@@ -846,53 +782,42 @@ static void efx_tx_queue_insert(struct efx_tx_queue *tx_queue,
  * a single fragment, and we know it doesn't cross a page boundary.  It
  * also allows us to not worry about end-of-packet etc.
  */
-static void efx_tso_put_header(struct efx_tx_queue *tx_queue,
-			       struct efx_tso_header *tsoh, unsigned len)
+static int efx_tso_put_header(struct efx_tx_queue *tx_queue,
+			      struct efx_tx_buffer *buffer, u8 *header)
 {
-	struct efx_tx_buffer *buffer;
-
-	buffer = &tx_queue->buffer[tx_queue->insert_count & tx_queue->ptr_mask];
-	efx_tsoh_free(tx_queue, buffer);
-	EFX_BUG_ON_PARANOID(buffer->len);
-	EFX_BUG_ON_PARANOID(buffer->unmap_len);
-	EFX_BUG_ON_PARANOID(buffer->flags);
-	buffer->len = len;
-	buffer->dma_addr = tsoh->dma_addr;
-	buffer->tsoh = tsoh;
-	buffer->flags = EFX_TX_BUF_TSOH | EFX_TX_BUF_CONT;
+	if (unlikely(buffer->flags & EFX_TX_BUF_HEAP)) {
+		buffer->dma_addr = dma_map_single(&tx_queue->efx->pci_dev->dev,
+						  header, buffer->len,
+						  DMA_TO_DEVICE);
+		if (unlikely(dma_mapping_error(&tx_queue->efx->pci_dev->dev,
+					       buffer->dma_addr))) {
+			kfree(buffer->heap_buf);
+			buffer->len = 0;
+			buffer->flags = 0;
+			return -ENOMEM;
+		}
+		buffer->unmap_len = buffer->len;
+		buffer->flags |= EFX_TX_BUF_MAP_SINGLE;
+	}
 
 	++tx_queue->insert_count;
+	return 0;
 }
 
 
-/* Remove descriptors put into a tx_queue. */
+/* Remove buffers put into a tx_queue.  None of the buffers must have
+ * an skb attached.
+ */
 static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue)
 {
 	struct efx_tx_buffer *buffer;
-	dma_addr_t unmap_addr;
 
 	/* Work backwards until we hit the original insert pointer value */
 	while (tx_queue->insert_count != tx_queue->write_count) {
 		--tx_queue->insert_count;
 		buffer = &tx_queue->buffer[tx_queue->insert_count &
 					   tx_queue->ptr_mask];
-		efx_tsoh_free(tx_queue, buffer);
-		EFX_BUG_ON_PARANOID(buffer->flags & EFX_TX_BUF_SKB);
-		if (buffer->unmap_len) {
-			unmap_addr = (buffer->dma_addr + buffer->len -
-				      buffer->unmap_len);
-			if (buffer->flags & EFX_TX_BUF_MAP_SINGLE)
-				dma_unmap_single(&tx_queue->efx->pci_dev->dev,
-						 unmap_addr, buffer->unmap_len,
-						 DMA_TO_DEVICE);
-			else
-				dma_unmap_page(&tx_queue->efx->pci_dev->dev,
-					       unmap_addr, buffer->unmap_len,
-					       DMA_TO_DEVICE);
-			buffer->unmap_len = 0;
-		}
-		buffer->len = 0;
-		buffer->flags = 0;
+		efx_dequeue_buffer(tx_queue, buffer, NULL, NULL);
 	}
 }
 
@@ -1014,35 +939,24 @@ static void tso_fill_packet_with_fragment(struct efx_tx_queue *tx_queue,
  * @st:			TSO state
  *
  * Generate a new header and prepare for the new packet.  Return 0 on
- * success, or -1 if failed to alloc header.
+ * success, or -%ENOMEM if failed to alloc header.
  */
 static int tso_start_new_packet(struct efx_tx_queue *tx_queue,
 				const struct sk_buff *skb,
 				struct tso_state *st)
 {
-	struct efx_tso_header *tsoh;
+	struct efx_tx_buffer *buffer =
+		&tx_queue->buffer[tx_queue->insert_count & tx_queue->ptr_mask];
 	struct tcphdr *tsoh_th;
 	unsigned ip_length;
 	u8 *header;
+	int rc;
 
-	/* Allocate a DMA-mapped header buffer. */
-	if (likely(TSOH_SIZE(st->header_len) <= TSOH_STD_SIZE)) {
-		if (tx_queue->tso_headers_free == NULL) {
-			if (efx_tsoh_block_alloc(tx_queue))
-				return -1;
-		}
-		EFX_BUG_ON_PARANOID(!tx_queue->tso_headers_free);
-		tsoh = tx_queue->tso_headers_free;
-		tx_queue->tso_headers_free = tsoh->next;
-		tsoh->unmap_len = 0;
-	} else {
-		tx_queue->tso_long_headers++;
-		tsoh = efx_tsoh_heap_alloc(tx_queue, st->header_len);
-		if (unlikely(!tsoh))
-			return -1;
-	}
+	/* Allocate and insert a DMA-mapped header buffer. */
+	header = efx_tsoh_get_buffer(tx_queue, buffer, st->header_len);
+	if (!header)
+		return -ENOMEM;
 
-	header = TSOH_BUFFER(tsoh);
 	tsoh_th = (struct tcphdr *)(header + SKB_TCP_OFF(skb));
 
 	/* Copy and update the headers. */
@@ -1078,12 +992,13 @@ static int tso_start_new_packet(struct efx_tx_queue *tx_queue,
 		tsoh_iph->payload_len = htons(ip_length - sizeof(*tsoh_iph));
 	}
 
+	rc = efx_tso_put_header(tx_queue, buffer, header);
+	if (unlikely(rc))
+		return rc;
+
 	st->packet_space = skb_shinfo(skb)->gso_size;
 	++tx_queue->tso_packets;
 
-	/* Form a descriptor for this header. */
-	efx_tso_put_header(tx_queue, tsoh, st->header_len);
-
 	return 0;
 }
 
@@ -1182,23 +1097,3 @@ static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
 	efx_enqueue_unwind(tx_queue);
 	return NETDEV_TX_OK;
 }
-
-
-/*
- * Free up all TSO datastructures associated with tx_queue. This
- * routine should be called only once the tx_queue is both empty and
- * will no longer be used.
- */
-static void efx_fini_tso(struct efx_tx_queue *tx_queue)
-{
-	unsigned i;
-
-	if (tx_queue->buffer) {
-		for (i = 0; i <= tx_queue->ptr_mask; ++i)
-			efx_tsoh_free(tx_queue, &tx_queue->buffer[i]);
-	}
-
-	while (tx_queue->tso_headers_free != NULL)
-		efx_tsoh_block_free(tx_queue, tx_queue->tso_headers_free,
-				    &tx_queue->efx->pci_dev->dev);
-}

From 53cb13c680b98c637d95dd731b01bc284b3aa33d Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Tue, 19 Jun 2012 20:03:41 +0100
Subject: [PATCH 04/16] sfc: Replace tso_state::full_packet_space with
 ip_base_len

We only use tso_state::full_packet_space to calculate the IPv4 tot_len
or IPv6 payload_len, not to set tso_state::packet_space.  Replace it
with an ip_base_len field holding the value of tot_len or payload_len
before including the TCP payload, which is much more useful when
constructing the new headers.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/tx.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
index 61bc0ed718e3..d9dbebc9f819 100644
--- a/drivers/net/ethernet/sfc/tx.c
+++ b/drivers/net/ethernet/sfc/tx.c
@@ -631,7 +631,7 @@ void efx_remove_tx_queue(struct efx_tx_queue *tx_queue)
  * @dma_flags: TX buffer flags for DMA mapping - %EFX_TX_BUF_MAP_SINGLE or 0
  * @protocol: Network protocol (after any VLAN header)
  * @header_len: Number of bytes of header
- * @full_packet_size: Number of bytes to put in each outgoing segment
+ * @ip_base_len: IPv4 tot_len or IPv6 payload_len, before TCP payload
  *
  * The state used during segmentation.  It is put into this data structure
  * just to make it easy to pass into inline functions.
@@ -652,7 +652,7 @@ struct tso_state {
 
 	__be16 protocol;
 	unsigned header_len;
-	int full_packet_size;
+	unsigned int ip_base_len;
 };
 
 
@@ -830,12 +830,14 @@ static void tso_start(struct tso_state *st, const struct sk_buff *skb)
 	 */
 	st->header_len = ((tcp_hdr(skb)->doff << 2u)
 			  + PTR_DIFF(tcp_hdr(skb), skb->data));
-	st->full_packet_size = st->header_len + skb_shinfo(skb)->gso_size;
 
-	if (st->protocol == htons(ETH_P_IP))
+	if (st->protocol == htons(ETH_P_IP)) {
+		st->ip_base_len = st->header_len - ETH_HDR_LEN(skb);
 		st->ipv4_id = ntohs(ip_hdr(skb)->id);
-	else
+	} else {
+		st->ip_base_len = tcp_hdr(skb)->doff << 2u;
 		st->ipv4_id = 0;
+	}
 	st->seqnum = ntohl(tcp_hdr(skb)->seq);
 
 	EFX_BUG_ON_PARANOID(tcp_hdr(skb)->urg);
@@ -966,15 +968,16 @@ static int tso_start_new_packet(struct efx_tx_queue *tx_queue,
 	st->seqnum += skb_shinfo(skb)->gso_size;
 	if (st->out_len > skb_shinfo(skb)->gso_size) {
 		/* This packet will not finish the TSO burst. */
-		ip_length = st->full_packet_size - ETH_HDR_LEN(skb);
+		st->packet_space = skb_shinfo(skb)->gso_size;
 		tsoh_th->fin = 0;
 		tsoh_th->psh = 0;
 	} else {
 		/* This packet will be the last in the TSO burst. */
-		ip_length = st->header_len - ETH_HDR_LEN(skb) + st->out_len;
+		st->packet_space = st->out_len;
 		tsoh_th->fin = tcp_hdr(skb)->fin;
 		tsoh_th->psh = tcp_hdr(skb)->psh;
 	}
+	ip_length = st->ip_base_len + st->packet_space;
 
 	if (st->protocol == htons(ETH_P_IP)) {
 		struct iphdr *tsoh_iph =
@@ -989,14 +992,13 @@ static int tso_start_new_packet(struct efx_tx_queue *tx_queue,
 		struct ipv6hdr *tsoh_iph =
 			(struct ipv6hdr *)(header + SKB_IPV6_OFF(skb));
 
-		tsoh_iph->payload_len = htons(ip_length - sizeof(*tsoh_iph));
+		tsoh_iph->payload_len = htons(ip_length);
 	}
 
 	rc = efx_tso_put_header(tx_queue, buffer, header);
 	if (unlikely(rc))
 		return rc;
 
-	st->packet_space = skb_shinfo(skb)->gso_size;
 	++tx_queue->tso_packets;
 
 	return 0;

From 9714284f83387d330496758e5c10a649fd9a677d Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Fri, 22 Jun 2012 02:44:01 +0100
Subject: [PATCH 05/16] sfc: Stash header offsets for TSO in struct tso_state

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/tx.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
index d9dbebc9f819..ebca75ed78dc 100644
--- a/drivers/net/ethernet/sfc/tx.c
+++ b/drivers/net/ethernet/sfc/tx.c
@@ -613,10 +613,6 @@ void efx_remove_tx_queue(struct efx_tx_queue *tx_queue)
 #endif
 
 #define PTR_DIFF(p1, p2)  ((u8 *)(p1) - (u8 *)(p2))
-#define ETH_HDR_LEN(skb)  (skb_network_header(skb) - (skb)->data)
-#define SKB_TCP_OFF(skb)  PTR_DIFF(tcp_hdr(skb), (skb)->data)
-#define SKB_IPV4_OFF(skb) PTR_DIFF(ip_hdr(skb), (skb)->data)
-#define SKB_IPV6_OFF(skb) PTR_DIFF(ipv6_hdr(skb), (skb)->data)
 
 /**
  * struct tso_state - TSO state for an SKB
@@ -630,6 +626,8 @@ void efx_remove_tx_queue(struct efx_tx_queue *tx_queue)
  * @unmap_addr: DMA address of SKB fragment
  * @dma_flags: TX buffer flags for DMA mapping - %EFX_TX_BUF_MAP_SINGLE or 0
  * @protocol: Network protocol (after any VLAN header)
+ * @ip_off: Offset of IP header
+ * @tcp_off: Offset of TCP header
  * @header_len: Number of bytes of header
  * @ip_base_len: IPv4 tot_len or IPv6 payload_len, before TCP payload
  *
@@ -651,6 +649,8 @@ struct tso_state {
 	unsigned short dma_flags;
 
 	__be16 protocol;
+	unsigned int ip_off;
+	unsigned int tcp_off;
 	unsigned header_len;
 	unsigned int ip_base_len;
 };
@@ -825,17 +825,14 @@ static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue)
 /* Parse the SKB header and initialise state. */
 static void tso_start(struct tso_state *st, const struct sk_buff *skb)
 {
-	/* All ethernet/IP/TCP headers combined size is TCP header size
-	 * plus offset of TCP header relative to start of packet.
-	 */
-	st->header_len = ((tcp_hdr(skb)->doff << 2u)
-			  + PTR_DIFF(tcp_hdr(skb), skb->data));
-
+	st->ip_off = skb_network_header(skb) - skb->data;
+	st->tcp_off = skb_transport_header(skb) - skb->data;
+	st->header_len = st->tcp_off + (tcp_hdr(skb)->doff << 2u);
 	if (st->protocol == htons(ETH_P_IP)) {
-		st->ip_base_len = st->header_len - ETH_HDR_LEN(skb);
+		st->ip_base_len = st->header_len - st->ip_off;
 		st->ipv4_id = ntohs(ip_hdr(skb)->id);
 	} else {
-		st->ip_base_len = tcp_hdr(skb)->doff << 2u;
+		st->ip_base_len = st->header_len - st->tcp_off;
 		st->ipv4_id = 0;
 	}
 	st->seqnum = ntohl(tcp_hdr(skb)->seq);
@@ -959,7 +956,7 @@ static int tso_start_new_packet(struct efx_tx_queue *tx_queue,
 	if (!header)
 		return -ENOMEM;
 
-	tsoh_th = (struct tcphdr *)(header + SKB_TCP_OFF(skb));
+	tsoh_th = (struct tcphdr *)(header + st->tcp_off);
 
 	/* Copy and update the headers. */
 	memcpy(header, skb->data, st->header_len);
@@ -980,8 +977,7 @@ static int tso_start_new_packet(struct efx_tx_queue *tx_queue,
 	ip_length = st->ip_base_len + st->packet_space;
 
 	if (st->protocol == htons(ETH_P_IP)) {
-		struct iphdr *tsoh_iph =
-			(struct iphdr *)(header + SKB_IPV4_OFF(skb));
+		struct iphdr *tsoh_iph = (struct iphdr *)(header + st->ip_off);
 
 		tsoh_iph->tot_len = htons(ip_length);
 
@@ -990,7 +986,7 @@ static int tso_start_new_packet(struct efx_tx_queue *tx_queue,
 		st->ipv4_id++;
 	} else {
 		struct ipv6hdr *tsoh_iph =
-			(struct ipv6hdr *)(header + SKB_IPV6_OFF(skb));
+			(struct ipv6hdr *)(header + st->ip_off);
 
 		tsoh_iph->payload_len = htons(ip_length);
 	}

From f16aeea0e679d5fd43fc02e99569c52d77d5e5d3 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Fri, 27 Jul 2012 19:31:16 +0100
Subject: [PATCH 06/16] sfc: Change state names to be clearer, and comment them

STATE_INIT and STATE_FINI are equivalent and represent incompletely
initialised states; combine them as STATE_UNINIT.

Rename STATE_RUNNING to STATE_READY, to avoid confusion with
netif_running() and IFF_RUNNING.

The comments do not quite match current usage, but this will be
corrected in subsequent fixes.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/efx.c           | 24 +++++++++++-------------
 drivers/net/ethernet/sfc/ethtool.c       |  2 +-
 drivers/net/ethernet/sfc/falcon_boards.c |  2 +-
 drivers/net/ethernet/sfc/net_driver.h    | 10 ++++------
 4 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 3b3f08489a5e..9f88ad86f4e3 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -202,7 +202,7 @@ static void efx_stop_all(struct efx_nic *efx);
 
 #define EFX_ASSERT_RESET_SERIALISED(efx)		\
 	do {						\
-		if ((efx->state == STATE_RUNNING) ||	\
+		if ((efx->state == STATE_READY) ||	\
 		    (efx->state == STATE_DISABLED))	\
 			ASSERT_RTNL();			\
 	} while (0)
@@ -1556,7 +1556,7 @@ static void efx_start_all(struct efx_nic *efx)
 	 * of these flags are safe to read under just the rtnl lock */
 	if (efx->port_enabled)
 		return;
-	if ((efx->state != STATE_RUNNING) && (efx->state != STATE_INIT))
+	if ((efx->state != STATE_READY) && (efx->state != STATE_UNINIT))
 		return;
 	if (!netif_running(efx->net_dev))
 		return;
@@ -2286,11 +2286,11 @@ static void efx_reset_work(struct work_struct *data)
 	if (!pending)
 		return;
 
-	/* If we're not RUNNING then don't reset. Leave the reset_pending
+	/* If we're not READY then don't reset. Leave the reset_pending
 	 * flags set so that efx_pci_probe_main will be retried */
-	if (efx->state != STATE_RUNNING) {
+	if (efx->state != STATE_READY) {
 		netif_info(efx, drv, efx->net_dev,
-			   "scheduled reset quenched. NIC not RUNNING\n");
+			   "scheduled reset quenched; NIC not ready\n");
 		return;
 	}
 
@@ -2402,7 +2402,7 @@ static int efx_init_struct(struct efx_nic *efx, const struct efx_nic_type *type,
 	INIT_DELAYED_WORK(&efx->selftest_work, efx_selftest_async_work);
 	efx->pci_dev = pci_dev;
 	efx->msg_enable = debug;
-	efx->state = STATE_INIT;
+	efx->state = STATE_UNINIT;
 	strlcpy(efx->name, pci_name(pci_dev), sizeof(efx->name));
 
 	efx->net_dev = net_dev;
@@ -2490,7 +2490,7 @@ static void efx_pci_remove(struct pci_dev *pci_dev)
 
 	/* Mark the NIC as fini, then stop the interface */
 	rtnl_lock();
-	efx->state = STATE_FINI;
+	efx->state = STATE_UNINIT;
 	dev_close(efx->net_dev);
 
 	/* Allow any queued efx_resets() to complete */
@@ -2684,9 +2684,9 @@ static int __devinit efx_pci_probe(struct pci_dev *pci_dev,
 		goto fail4;
 	}
 
-	/* Switch to the running state before we expose the device to the OS,
+	/* Switch to the READY state before we expose the device to the OS,
 	 * so that dev_open()|efx_start_all() will actually start the device */
-	efx->state = STATE_RUNNING;
+	efx->state = STATE_READY;
 
 	rc = efx_register_netdev(efx);
 	if (rc)
@@ -2727,7 +2727,7 @@ static int efx_pm_freeze(struct device *dev)
 {
 	struct efx_nic *efx = pci_get_drvdata(to_pci_dev(dev));
 
-	efx->state = STATE_FINI;
+	efx->state = STATE_UNINIT;
 
 	netif_device_detach(efx->net_dev);
 
@@ -2741,8 +2741,6 @@ static int efx_pm_thaw(struct device *dev)
 {
 	struct efx_nic *efx = pci_get_drvdata(to_pci_dev(dev));
 
-	efx->state = STATE_INIT;
-
 	efx_start_interrupts(efx, false);
 
 	mutex_lock(&efx->mac_lock);
@@ -2753,7 +2751,7 @@ static int efx_pm_thaw(struct device *dev)
 
 	netif_device_attach(efx->net_dev);
 
-	efx->state = STATE_RUNNING;
+	efx->state = STATE_READY;
 
 	efx->type->resume_wol(efx);
 
diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c
index 8cba2df82b18..5d0e2a3241b1 100644
--- a/drivers/net/ethernet/sfc/ethtool.c
+++ b/drivers/net/ethernet/sfc/ethtool.c
@@ -531,7 +531,7 @@ static void efx_ethtool_self_test(struct net_device *net_dev,
 
 
 	ASSERT_RTNL();
-	if (efx->state != STATE_RUNNING) {
+	if (efx->state != STATE_READY) {
 		rc = -EIO;
 		goto fail1;
 	}
diff --git a/drivers/net/ethernet/sfc/falcon_boards.c b/drivers/net/ethernet/sfc/falcon_boards.c
index 8687a6c3db0d..ec1e99d0dcad 100644
--- a/drivers/net/ethernet/sfc/falcon_boards.c
+++ b/drivers/net/ethernet/sfc/falcon_boards.c
@@ -380,7 +380,7 @@ static ssize_t set_phy_flash_cfg(struct device *dev,
 		new_mode = PHY_MODE_SPECIAL;
 	if (!((old_mode ^ new_mode) & PHY_MODE_SPECIAL)) {
 		err = 0;
-	} else if (efx->state != STATE_RUNNING || netif_running(efx->net_dev)) {
+	} else if (efx->state != STATE_READY || netif_running(efx->net_dev)) {
 		err = -EBUSY;
 	} else {
 		/* Reset the PHY, reconfigure the MAC and enable/disable
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index a4fe9a786ef8..7ab1232494ef 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -430,11 +430,9 @@ enum efx_int_mode {
 #define EFX_INT_MODE_USE_MSI(x) (((x)->interrupt_mode) <= EFX_INT_MODE_MSI)
 
 enum nic_state {
-	STATE_INIT = 0,
-	STATE_RUNNING = 1,
-	STATE_FINI = 2,
-	STATE_DISABLED = 3,
-	STATE_MAX,
+	STATE_UNINIT = 0,	/* device being probed/removed or is frozen */
+	STATE_READY = 1,	/* hardware ready and netdev registered */
+	STATE_DISABLED = 2,	/* device disabled due to hardware errors */
 };
 
 /*
@@ -654,7 +652,7 @@ struct vfdi_status;
  * @irq_rx_adaptive: Adaptive IRQ moderation enabled for RX event queues
  * @irq_rx_moderation: IRQ moderation time for RX event queues
  * @msg_enable: Log message enable flags
- * @state: Device state flag. Serialised by the rtnl_lock.
+ * @state: Device state number (%STATE_*). Serialised by the rtnl_lock.
  * @reset_pending: Bitmask for pending resets
  * @tx_queue: TX DMA queues
  * @rx_queue: RX DMA queues

From 61da026d86517def727eb67d7cb8ee5657a12dd9 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Fri, 27 Jul 2012 19:35:39 +0100
Subject: [PATCH 07/16] sfc: Hold the RTNL lock for more of the suspend/resume
 cycle

I don't think these PM functions can race with userland net device
operations, but it's much easier to reason about locking if state is
consistently guarded by the same lock.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/efx.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 9f88ad86f4e3..e5f3a17349af 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -2727,6 +2727,8 @@ static int efx_pm_freeze(struct device *dev)
 {
 	struct efx_nic *efx = pci_get_drvdata(to_pci_dev(dev));
 
+	rtnl_lock();
+
 	efx->state = STATE_UNINIT;
 
 	netif_device_detach(efx->net_dev);
@@ -2734,6 +2736,8 @@ static int efx_pm_freeze(struct device *dev)
 	efx_stop_all(efx);
 	efx_stop_interrupts(efx, false);
 
+	rtnl_unlock();
+
 	return 0;
 }
 
@@ -2741,6 +2745,8 @@ static int efx_pm_thaw(struct device *dev)
 {
 	struct efx_nic *efx = pci_get_drvdata(to_pci_dev(dev));
 
+	rtnl_lock();
+
 	efx_start_interrupts(efx, false);
 
 	mutex_lock(&efx->mac_lock);
@@ -2755,6 +2761,8 @@ static int efx_pm_thaw(struct device *dev)
 
 	efx->type->resume_wol(efx);
 
+	rtnl_unlock();
+
 	/* Reschedule any quenched resets scheduled during efx_pm_freeze() */
 	queue_work(reset_workqueue, &efx->reset_work);
 

From 6032fb56c546c0a14856dc57a92d84560816b217 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Fri, 27 Jul 2012 19:35:47 +0100
Subject: [PATCH 08/16] sfc: Keep disabled NICs quiescent during suspend/resume

Currently we ignore and clear the disabled state.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/efx.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index e5f3a17349af..f02591bb62f4 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -2729,12 +2729,14 @@ static int efx_pm_freeze(struct device *dev)
 
 	rtnl_lock();
 
-	efx->state = STATE_UNINIT;
+	if (efx->state != STATE_DISABLED) {
+		efx->state = STATE_UNINIT;
 
-	netif_device_detach(efx->net_dev);
+		netif_device_detach(efx->net_dev);
 
-	efx_stop_all(efx);
-	efx_stop_interrupts(efx, false);
+		efx_stop_all(efx);
+		efx_stop_interrupts(efx, false);
+	}
 
 	rtnl_unlock();
 
@@ -2747,19 +2749,21 @@ static int efx_pm_thaw(struct device *dev)
 
 	rtnl_lock();
 
-	efx_start_interrupts(efx, false);
+	if (efx->state != STATE_DISABLED) {
+		efx_start_interrupts(efx, false);
 
-	mutex_lock(&efx->mac_lock);
-	efx->phy_op->reconfigure(efx);
-	mutex_unlock(&efx->mac_lock);
+		mutex_lock(&efx->mac_lock);
+		efx->phy_op->reconfigure(efx);
+		mutex_unlock(&efx->mac_lock);
 
-	efx_start_all(efx);
+		efx_start_all(efx);
 
-	netif_device_attach(efx->net_dev);
+		netif_device_attach(efx->net_dev);
 
-	efx->state = STATE_READY;
+		efx->state = STATE_READY;
 
-	efx->type->resume_wol(efx);
+		efx->type->resume_wol(efx);
+	}
 
 	rtnl_unlock();
 

From 5642ceef466365ebc94eb1332d9e6228efa09518 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Fri, 27 Jul 2012 19:35:52 +0100
Subject: [PATCH 09/16] sfc: Hold RTNL lock (only) when calling
 efx_stop_interrupts()

Interrupt state should be consistently guarded by the RTNL lock once
the net device is registered.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/efx.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index f02591bb62f4..629029e2037d 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -2164,9 +2164,9 @@ void efx_reset_down(struct efx_nic *efx, enum reset_type method)
 	EFX_ASSERT_RESET_SERIALISED(efx);
 
 	efx_stop_all(efx);
-	mutex_lock(&efx->mac_lock);
-
 	efx_stop_interrupts(efx, false);
+
+	mutex_lock(&efx->mac_lock);
 	if (efx->port_initialized && method != RESET_TYPE_INVISIBLE)
 		efx->phy_op->fini(efx);
 	efx->type->fini(efx);
@@ -2492,11 +2492,11 @@ static void efx_pci_remove(struct pci_dev *pci_dev)
 	rtnl_lock();
 	efx->state = STATE_UNINIT;
 	dev_close(efx->net_dev);
+	efx_stop_interrupts(efx, false);
 
 	/* Allow any queued efx_resets() to complete */
 	rtnl_unlock();
 
-	efx_stop_interrupts(efx, false);
 	efx_sriov_fini(efx);
 	efx_unregister_netdev(efx);
 

From 8b7325b4e29256881117aff8a162e829c79b47e9 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Fri, 27 Jul 2012 20:46:41 +0100
Subject: [PATCH 10/16] sfc: Never try to stop and start a NIC that is disabled

efx_change_mtu() and efx_realloc_channels() each stop and start much
of the NIC, even if it has been disabled.  Since efx_start_all() is a
no-op when the NIC is disabled, this is probably harmless in the case
of efx_change_mtu(), but efx_realloc_channels() also reenables
interrupts which could be a bad thing to do.

Change efx_start_all() and efx_start_interrupts() to assert that the
NIC is not disabled, but make efx_stop_interrupts() do nothing if the
NIC is disabled (since it is already stopped), consistent with
efx_stop_all().

Update comments for efx_start_all() and efx_stop_all() to describe
their purpose and preconditions more accurately.

Add a common function to check and log if the NIC is disabled, and use
it in efx_net_open(), efx_change_mtu() and efx_realloc_channels().

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/efx.c | 65 +++++++++++++++++++++++-----------
 1 file changed, 44 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 629029e2037d..977fc3a591e9 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -207,6 +207,16 @@ static void efx_stop_all(struct efx_nic *efx);
 			ASSERT_RTNL();			\
 	} while (0)
 
+static int efx_check_disabled(struct efx_nic *efx)
+{
+	if (efx->state == STATE_DISABLED) {
+		netif_err(efx, drv, efx->net_dev,
+			  "device is disabled due to earlier errors\n");
+		return -EIO;
+	}
+	return 0;
+}
+
 /**************************************************************************
  *
  * Event queue processing
@@ -740,7 +750,11 @@ efx_realloc_channels(struct efx_nic *efx, u32 rxq_entries, u32 txq_entries)
 	struct efx_channel *other_channel[EFX_MAX_CHANNELS], *channel;
 	u32 old_rxq_entries, old_txq_entries;
 	unsigned i, next_buffer_table = 0;
-	int rc = 0;
+	int rc;
+
+	rc = efx_check_disabled(efx);
+	if (rc)
+		return rc;
 
 	/* Not all channels should be reallocated. We must avoid
 	 * reallocating their buffer table entries.
@@ -1375,6 +1389,8 @@ static void efx_start_interrupts(struct efx_nic *efx, bool may_keep_eventq)
 {
 	struct efx_channel *channel;
 
+	BUG_ON(efx->state == STATE_DISABLED);
+
 	if (efx->legacy_irq)
 		efx->legacy_irq_enabled = true;
 	efx_nic_enable_interrupts(efx);
@@ -1392,6 +1408,9 @@ static void efx_stop_interrupts(struct efx_nic *efx, bool may_keep_eventq)
 {
 	struct efx_channel *channel;
 
+	if (efx->state == STATE_DISABLED)
+		return;
+
 	efx_mcdi_mode_poll(efx);
 
 	efx_nic_disable_interrupts(efx);
@@ -1543,22 +1562,21 @@ static int efx_probe_all(struct efx_nic *efx)
 	return rc;
 }
 
-/* Called after previous invocation(s) of efx_stop_all, restarts the port,
- * kernel transmit queues and NAPI processing, and ensures that the port is
- * scheduled to be reconfigured. This function is safe to call multiple
- * times when the NIC is in any state.
+/* If the interface is supposed to be running but is not, start
+ * the hardware and software data path, regular activity for the port
+ * (MAC statistics, link polling, etc.) and schedule the port to be
+ * reconfigured.  Interrupts must already be enabled.  This function
+ * is safe to call multiple times, so long as the NIC is not disabled.
+ * Requires the RTNL lock.
  */
 static void efx_start_all(struct efx_nic *efx)
 {
 	EFX_ASSERT_RESET_SERIALISED(efx);
+	BUG_ON(efx->state == STATE_DISABLED);
 
 	/* Check that it is appropriate to restart the interface. All
 	 * of these flags are safe to read under just the rtnl lock */
-	if (efx->port_enabled)
-		return;
-	if ((efx->state != STATE_READY) && (efx->state != STATE_UNINIT))
-		return;
-	if (!netif_running(efx->net_dev))
+	if (efx->port_enabled || !netif_running(efx->net_dev))
 		return;
 
 	efx_start_port(efx);
@@ -1592,11 +1610,11 @@ static void efx_flush_all(struct efx_nic *efx)
 	cancel_work_sync(&efx->mac_work);
 }
 
-/* Quiesce hardware and software without bringing the link down.
- * Safe to call multiple times, when the nic and interface is in any
- * state. The caller is guaranteed to subsequently be in a position
- * to modify any hardware and software state they see fit without
- * taking locks. */
+/* Quiesce the hardware and software data path, and regular activity
+ * for the port without bringing the link down.  Safe to call multiple
+ * times with the NIC in almost any state, but interrupts should be
+ * enabled.  Requires the RTNL lock.
+ */
 static void efx_stop_all(struct efx_nic *efx)
 {
 	EFX_ASSERT_RESET_SERIALISED(efx);
@@ -1830,13 +1848,16 @@ static void efx_netpoll(struct net_device *net_dev)
 static int efx_net_open(struct net_device *net_dev)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
+	int rc;
+
 	EFX_ASSERT_RESET_SERIALISED(efx);
 
 	netif_dbg(efx, ifup, efx->net_dev, "opening device on CPU %d\n",
 		  raw_smp_processor_id());
 
-	if (efx->state == STATE_DISABLED)
-		return -EIO;
+	rc = efx_check_disabled(efx);
+	if (rc)
+		return rc;
 	if (efx->phy_mode & PHY_MODE_SPECIAL)
 		return -EBUSY;
 	if (efx_mcdi_poll_reboot(efx) && efx_reset(efx, RESET_TYPE_ALL))
@@ -1862,10 +1883,8 @@ static int efx_net_stop(struct net_device *net_dev)
 	netif_dbg(efx, ifdown, efx->net_dev, "closing on CPU %d\n",
 		  raw_smp_processor_id());
 
-	if (efx->state != STATE_DISABLED) {
-		/* Stop the device and flush all the channels */
-		efx_stop_all(efx);
-	}
+	/* Stop the device and flush all the channels */
+	efx_stop_all(efx);
 
 	return 0;
 }
@@ -1925,9 +1944,13 @@ static void efx_watchdog(struct net_device *net_dev)
 static int efx_change_mtu(struct net_device *net_dev, int new_mtu)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
+	int rc;
 
 	EFX_ASSERT_RESET_SERIALISED(efx);
 
+	rc = efx_check_disabled(efx);
+	if (rc)
+		return rc;
 	if (new_mtu > EFX_MAX_MTU)
 		return -EINVAL;
 

From b812f8b7a9b657367eaacab8c34e6711a657a48b Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Fri, 27 Jul 2012 20:48:36 +0100
Subject: [PATCH 11/16] sfc: Improve log messages in case we abort probe due to
 a pending reset

The current informational message doesn't properly explain what
happens, and could also appear if we defer a reset during
suspend/resume.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/efx.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 977fc3a591e9..5555e9f98162 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -2311,11 +2311,8 @@ static void efx_reset_work(struct work_struct *data)
 
 	/* If we're not READY then don't reset. Leave the reset_pending
 	 * flags set so that efx_pci_probe_main will be retried */
-	if (efx->state != STATE_READY) {
-		netif_info(efx, drv, efx->net_dev,
-			   "scheduled reset quenched; NIC not ready\n");
+	if (efx->state != STATE_READY)
 		return;
-	}
 
 	rtnl_lock();
 	(void)efx_reset(efx, fls(pending) - 1);
@@ -2703,6 +2700,8 @@ static int __devinit efx_pci_probe(struct pci_dev *pci_dev,
 	 * probably hosed anyway.
 	 */
 	if (efx->reset_pending) {
+		netif_err(efx, probe, efx->net_dev,
+			  "aborting probe due to scheduled reset\n");
 		rc = -EIO;
 		goto fail4;
 	}

From 7153f623ea47f3feb54209d75b80ed09a497cd4a Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Fri, 27 Jul 2012 20:50:52 +0100
Subject: [PATCH 12/16] sfc: Fix reset vs probe/remove/PM races involving
 efx_nic::state

We try to defer resets while the device is not READY, but we're not
doing this quite correctly.  In particular, changes to efx_nic::state
are documented as serialised by the RTNL lock, but they aren't.

1. We check whether a reset was requested during probe (suggesting
broken hardware) before we allow requested resets to be scheduled.
This leaves a window where a requested reset would be deferred
indefinitely.

2. Although we cancel the reset work item during device removal,
there are still later operations that can cause it to be scheduled
again.  We need to check the state before scheduling it.

3. Since the state can change between scheduling and running of
the work item, we still need to check it there, and we need to
do so *after* acquiring the RTNL lock which serialises state
changes.

4. We must cancel the reset work item during device removal, if the
state could ever have been READY.  This wasn't done in some of the
failure paths from efx_pci_probe().  Move the cancellation to
efx_pci_remove_main().

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/efx.c | 84 +++++++++++++++++-----------------
 1 file changed, 43 insertions(+), 41 deletions(-)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 5555e9f98162..d0653406058b 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -2112,6 +2112,19 @@ static int efx_register_netdev(struct efx_nic *efx)
 
 	rtnl_lock();
 
+	/* Enable resets to be scheduled and check whether any were
+	 * already requested.  If so, the NIC is probably hosed so we
+	 * abort.
+	 */
+	efx->state = STATE_READY;
+	smp_mb(); /* ensure we change state before checking reset_pending */
+	if (efx->reset_pending) {
+		netif_err(efx, probe, efx->net_dev,
+			  "aborting probe due to scheduled reset\n");
+		rc = -EIO;
+		goto fail_locked;
+	}
+
 	rc = dev_alloc_name(net_dev, net_dev->name);
 	if (rc < 0)
 		goto fail_locked;
@@ -2141,14 +2154,14 @@ static int efx_register_netdev(struct efx_nic *efx)
 
 	return 0;
 
+fail_registered:
+	rtnl_lock();
+	unregister_netdevice(net_dev);
 fail_locked:
+	efx->state = STATE_UNINIT;
 	rtnl_unlock();
 	netif_err(efx, drv, efx->net_dev, "could not register net dev\n");
 	return rc;
-
-fail_registered:
-	unregister_netdev(net_dev);
-	return rc;
 }
 
 static void efx_unregister_netdev(struct efx_nic *efx)
@@ -2171,7 +2184,11 @@ static void efx_unregister_netdev(struct efx_nic *efx)
 
 	strlcpy(efx->name, pci_name(efx->pci_dev), sizeof(efx->name));
 	device_remove_file(&efx->pci_dev->dev, &dev_attr_phy_type);
-	unregister_netdev(efx->net_dev);
+
+	rtnl_lock();
+	unregister_netdevice(efx->net_dev);
+	efx->state = STATE_UNINIT;
+	rtnl_unlock();
 }
 
 /**************************************************************************
@@ -2309,13 +2326,15 @@ static void efx_reset_work(struct work_struct *data)
 	if (!pending)
 		return;
 
-	/* If we're not READY then don't reset. Leave the reset_pending
-	 * flags set so that efx_pci_probe_main will be retried */
-	if (efx->state != STATE_READY)
-		return;
-
 	rtnl_lock();
-	(void)efx_reset(efx, fls(pending) - 1);
+
+	/* We checked the state in efx_schedule_reset() but it may
+	 * have changed by now.  Now that we have the RTNL lock,
+	 * it cannot change again.
+	 */
+	if (efx->state == STATE_READY)
+		(void)efx_reset(efx, fls(pending) - 1);
+
 	rtnl_unlock();
 }
 
@@ -2341,6 +2360,13 @@ void efx_schedule_reset(struct efx_nic *efx, enum reset_type type)
 	}
 
 	set_bit(method, &efx->reset_pending);
+	smp_mb(); /* ensure we change reset_pending before checking state */
+
+	/* If we're not READY then just leave the flags set as the cue
+	 * to abort probing or reschedule the reset later.
+	 */
+	if (ACCESS_ONCE(efx->state) != STATE_READY)
+		return;
 
 	/* efx_process_channel() will no longer read events once a
 	 * reset is scheduled. So switch back to poll'd MCDI completions. */
@@ -2485,6 +2511,12 @@ static void efx_fini_struct(struct efx_nic *efx)
  */
 static void efx_pci_remove_main(struct efx_nic *efx)
 {
+	/* Flush reset_work. It can no longer be scheduled since we
+	 * are not READY.
+	 */
+	BUG_ON(efx->state == STATE_READY);
+	cancel_work_sync(&efx->reset_work);
+
 #ifdef CONFIG_RFS_ACCEL
 	free_irq_cpu_rmap(efx->net_dev->rx_cpu_rmap);
 	efx->net_dev->rx_cpu_rmap = NULL;
@@ -2510,11 +2542,8 @@ static void efx_pci_remove(struct pci_dev *pci_dev)
 
 	/* Mark the NIC as fini, then stop the interface */
 	rtnl_lock();
-	efx->state = STATE_UNINIT;
 	dev_close(efx->net_dev);
 	efx_stop_interrupts(efx, false);
-
-	/* Allow any queued efx_resets() to complete */
 	rtnl_unlock();
 
 	efx_sriov_fini(efx);
@@ -2522,12 +2551,6 @@ static void efx_pci_remove(struct pci_dev *pci_dev)
 
 	efx_mtd_remove(efx);
 
-	/* Wait for any scheduled resets to complete. No more will be
-	 * scheduled from this point because efx_stop_all() has been
-	 * called, we are no longer registered with driverlink, and
-	 * the net_device's have been removed. */
-	cancel_work_sync(&efx->reset_work);
-
 	efx_pci_remove_main(efx);
 
 	efx_fini_io(efx);
@@ -2686,30 +2709,9 @@ static int __devinit efx_pci_probe(struct pci_dev *pci_dev,
 		goto fail2;
 
 	rc = efx_pci_probe_main(efx);
-
-	/* Serialise against efx_reset(). No more resets will be
-	 * scheduled since efx_stop_all() has been called, and we have
-	 * not and never have been registered.
-	 */
-	cancel_work_sync(&efx->reset_work);
-
 	if (rc)
 		goto fail3;
 
-	/* If there was a scheduled reset during probe, the NIC is
-	 * probably hosed anyway.
-	 */
-	if (efx->reset_pending) {
-		netif_err(efx, probe, efx->net_dev,
-			  "aborting probe due to scheduled reset\n");
-		rc = -EIO;
-		goto fail4;
-	}
-
-	/* Switch to the READY state before we expose the device to the OS,
-	 * so that dev_open()|efx_start_all() will actually start the device */
-	efx->state = STATE_READY;
-
 	rc = efx_register_netdev(efx);
 	if (rc)
 		goto fail4;

From 7bde852afc88909d78e617b70d3e7a3dc6bc04ff Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Fri, 27 Jul 2012 20:50:54 +0100
Subject: [PATCH 13/16] sfc: Remove overly paranoid locking assertions from
 netdev operations

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/efx.c     | 8 --------
 drivers/net/ethernet/sfc/ethtool.c | 2 --
 2 files changed, 10 deletions(-)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index d0653406058b..b29db12756f9 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -1767,8 +1767,6 @@ static int efx_ioctl(struct net_device *net_dev, struct ifreq *ifr, int cmd)
 	struct efx_nic *efx = netdev_priv(net_dev);
 	struct mii_ioctl_data *data = if_mii(ifr);
 
-	EFX_ASSERT_RESET_SERIALISED(efx);
-
 	/* Convert phy_id from older PRTAD/DEVAD format */
 	if ((cmd == SIOCGMIIREG || cmd == SIOCSMIIREG) &&
 	    (data->phy_id & 0xfc00) == 0x0400)
@@ -1850,8 +1848,6 @@ static int efx_net_open(struct net_device *net_dev)
 	struct efx_nic *efx = netdev_priv(net_dev);
 	int rc;
 
-	EFX_ASSERT_RESET_SERIALISED(efx);
-
 	netif_dbg(efx, ifup, efx->net_dev, "opening device on CPU %d\n",
 		  raw_smp_processor_id());
 
@@ -1946,8 +1942,6 @@ static int efx_change_mtu(struct net_device *net_dev, int new_mtu)
 	struct efx_nic *efx = netdev_priv(net_dev);
 	int rc;
 
-	EFX_ASSERT_RESET_SERIALISED(efx);
-
 	rc = efx_check_disabled(efx);
 	if (rc)
 		return rc;
@@ -1975,8 +1969,6 @@ static int efx_set_mac_address(struct net_device *net_dev, void *data)
 	struct sockaddr *addr = data;
 	char *new_addr = addr->sa_data;
 
-	EFX_ASSERT_RESET_SERIALISED(efx);
-
 	if (!is_valid_ether_addr(new_addr)) {
 		netif_err(efx, drv, efx->net_dev,
 			  "invalid ethernet MAC address requested: %pM\n",
diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c
index 5d0e2a3241b1..2bd5c2d35e5d 100644
--- a/drivers/net/ethernet/sfc/ethtool.c
+++ b/drivers/net/ethernet/sfc/ethtool.c
@@ -529,8 +529,6 @@ static void efx_ethtool_self_test(struct net_device *net_dev,
 	if (!efx_tests)
 		goto fail;
 
-
-	ASSERT_RTNL();
 	if (efx->state != STATE_READY) {
 		rc = -EIO;
 		goto fail1;

From 3f65ea5b2aa0f76a463c85ed39955c26faf4f5a7 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Fri, 27 Jul 2012 20:50:57 +0100
Subject: [PATCH 14/16] sfc: Remove bogus comment about MTU change and RX
 buffer overrun

RX DMA is limited by the length specified in each descriptor and not
by the MAC.  Over-length frames may get into the RX FIFO regardless of
the MAC settings, due to a hardware bug, but they will be truncated by
the packet DMA engine and reported as such in the completion event.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/efx.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index b29db12756f9..4105a6664765 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -1953,8 +1953,6 @@ static int efx_change_mtu(struct net_device *net_dev, int new_mtu)
 	netif_dbg(efx, drv, efx->net_dev, "changing MTU to %d\n", new_mtu);
 
 	mutex_lock(&efx->mac_lock);
-	/* Reconfigure the MAC before enabling the dma queues so that
-	 * the RX buffers don't overflow */
 	net_dev->mtu = new_mtu;
 	efx->type->reconfigure_mac(efx);
 	mutex_unlock(&efx->mac_lock);

From adeb15aa1ce3413d461ec0958a979e9d64cd764c Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Thu, 2 Aug 2012 01:39:38 +0100
Subject: [PATCH 15/16] sfc: Assign efx and efx->type as early as possible in
 efx_pci_probe()

We also stop clearing *efx in efx_init_struct().  This is safe because
alloc_etherdev_mq() already clears it for us.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/efx.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 4105a6664765..92f002029481 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -2422,13 +2422,12 @@ static const struct efx_phy_operations efx_dummy_phy_operations = {
 /* This zeroes out and then fills in the invariants in a struct
  * efx_nic (including all sub-structures).
  */
-static int efx_init_struct(struct efx_nic *efx, const struct efx_nic_type *type,
+static int efx_init_struct(struct efx_nic *efx,
 			   struct pci_dev *pci_dev, struct net_device *net_dev)
 {
 	int i;
 
 	/* Initialise common structures */
-	memset(efx, 0, sizeof(*efx));
 	spin_lock_init(&efx->biu_lock);
 #ifdef CONFIG_SFC_MTD
 	INIT_LIST_HEAD(&efx->mtd_list);
@@ -2455,8 +2454,6 @@ static int efx_init_struct(struct efx_nic *efx, const struct efx_nic_type *type,
 			goto fail;
 	}
 
-	efx->type = type;
-
 	EFX_BUG_ON_PARANOID(efx->type->phys_addr_channels > EFX_MAX_CHANNELS);
 
 	/* Higher numbered interrupt modes are less capable! */
@@ -2660,7 +2657,6 @@ static int efx_pci_probe_main(struct efx_nic *efx)
 static int __devinit efx_pci_probe(struct pci_dev *pci_dev,
 				   const struct pci_device_id *entry)
 {
-	const struct efx_nic_type *type = (const struct efx_nic_type *) entry->driver_data;
 	struct net_device *net_dev;
 	struct efx_nic *efx;
 	int rc;
@@ -2670,10 +2666,12 @@ static int __devinit efx_pci_probe(struct pci_dev *pci_dev,
 				     EFX_MAX_RX_QUEUES);
 	if (!net_dev)
 		return -ENOMEM;
-	net_dev->features |= (type->offload_features | NETIF_F_SG |
+	efx = netdev_priv(net_dev);
+	efx->type = (const struct efx_nic_type *) entry->driver_data;
+	net_dev->features |= (efx->type->offload_features | NETIF_F_SG |
 			      NETIF_F_HIGHDMA | NETIF_F_TSO |
 			      NETIF_F_RXCSUM);
-	if (type->offload_features & NETIF_F_V6_CSUM)
+	if (efx->type->offload_features & NETIF_F_V6_CSUM)
 		net_dev->features |= NETIF_F_TSO6;
 	/* Mask for features that also apply to VLAN devices */
 	net_dev->vlan_features |= (NETIF_F_ALL_CSUM | NETIF_F_SG |
@@ -2681,10 +2679,9 @@ static int __devinit efx_pci_probe(struct pci_dev *pci_dev,
 				   NETIF_F_RXCSUM);
 	/* All offloads can be toggled */
 	net_dev->hw_features = net_dev->features & ~NETIF_F_HIGHDMA;
-	efx = netdev_priv(net_dev);
 	pci_set_drvdata(pci_dev, efx);
 	SET_NETDEV_DEV(net_dev, &pci_dev->dev);
-	rc = efx_init_struct(efx, type, pci_dev, net_dev);
+	rc = efx_init_struct(efx, pci_dev, net_dev);
 	if (rc)
 		goto fail1;
 

From 8f8b3d518999fd1c342310910aa1e49112c86d05 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Fri, 24 Aug 2012 18:04:38 +0100
Subject: [PATCH 16/16] sfc: Fix the initial device operstate

Following commit 8f4cccb ('net: Set device operstate at registration
time') it is now correct and preferable to set the carrier off before
registering a device.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/efx.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 92f002029481..a606db43c5ba 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -2120,6 +2120,9 @@ static int efx_register_netdev(struct efx_nic *efx)
 		goto fail_locked;
 	efx_update_name(efx);
 
+	/* Always start with carrier off; PHY events will detect the link */
+	netif_carrier_off(net_dev);
+
 	rc = register_netdevice(net_dev);
 	if (rc)
 		goto fail_locked;
@@ -2130,9 +2133,6 @@ static int efx_register_netdev(struct efx_nic *efx)
 			efx_init_tx_queue_core_txq(tx_queue);
 	}
 
-	/* Always start with carrier off; PHY events will detect the link */
-	netif_carrier_off(net_dev);
-
 	rtnl_unlock();
 
 	rc = device_create_file(&efx->pci_dev->dev, &dev_attr_phy_type);