From afaa9459de639591ff3318fd215a813c8d794759 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 20 Jul 2012 08:08:12 +0000 Subject: [PATCH 1/9] ixgbe: Remove code that was initializing Rx page offset This change reverts an earlier patch that introduced ixgbe_init_rx_page_offset. The idea behind the function was to provide some variation in the starting offset for the page in order to reduce hot-spots in the cache. However it doesn't appear to provide any significant benefit in the testing I have done. It has however been a source of several bugs, and it blocks us from being able to use 2K fragments on larger page sizes. So the decision I made was to remove it. Signed-off-by: Alexander Duyck Tested-by: Phil Schmitt Signed-off-by: Peter P Waskiewicz Jr --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 27 +------------------ 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 4326f74f7137..f7351c6fa3b5 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1167,7 +1167,7 @@ static bool ixgbe_alloc_mapped_page(struct ixgbe_ring *rx_ring, } bi->dma = dma; - bi->page_offset ^= ixgbe_rx_bufsz(rx_ring); + bi->page_offset = 0; return true; } @@ -4129,27 +4129,6 @@ void ixgbe_reset(struct ixgbe_adapter *adapter) hw->mac.ops.set_vmdq_san_mac(hw, VMDQ_P(0)); } -/** - * ixgbe_init_rx_page_offset - initialize page offset values for Rx buffers - * @rx_ring: ring to setup - * - * On many IA platforms the L1 cache has a critical stride of 4K, this - * results in each receive buffer starting in the same cache set. To help - * reduce the pressure on this cache set we can interleave the offsets so - * that only every other buffer will be in the same cache set. - **/ -static void ixgbe_init_rx_page_offset(struct ixgbe_ring *rx_ring) -{ - struct ixgbe_rx_buffer *rx_buffer = rx_ring->rx_buffer_info; - u16 i; - - for (i = 0; i < rx_ring->count; i += 2) { - rx_buffer[0].page_offset = 0; - rx_buffer[1].page_offset = ixgbe_rx_bufsz(rx_ring); - rx_buffer = &rx_buffer[2]; - } -} - /** * ixgbe_clean_rx_ring - Free Rx Buffers per Queue * @rx_ring: ring to free buffers from @@ -4195,8 +4174,6 @@ static void ixgbe_clean_rx_ring(struct ixgbe_ring *rx_ring) size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count; memset(rx_ring->rx_buffer_info, 0, size); - ixgbe_init_rx_page_offset(rx_ring); - /* Zero out the descriptor ring */ memset(rx_ring->desc, 0, rx_ring->size); @@ -4646,8 +4623,6 @@ int ixgbe_setup_rx_resources(struct ixgbe_ring *rx_ring) rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; - ixgbe_init_rx_page_offset(rx_ring); - return 0; err: vfree(rx_ring->rx_buffer_info); From 0549ae20b77d411aefb5271c2c494b9c3f02d972 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 20 Jul 2012 08:08:18 +0000 Subject: [PATCH 2/9] ixgbe: combine ixgbe_add_rx_frag and ixgbe_can_reuse_page This patch combines ixgbe_add_rx_frag and ixgbe_can_reuse_page into a single function. The main motivation behind this is to make better use of the values so that we don't have to load them from memory and into registers twice. 
Signed-off-by: Alexander Duyck Tested-by: Phil Schmitt Signed-off-by: Peter P Waskiewicz Jr --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 73 +++++++++---------- 1 file changed, 34 insertions(+), 39 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index f7351c6fa3b5..6a8c48443676 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1559,34 +1559,18 @@ static bool ixgbe_cleanup_headers(struct ixgbe_ring *rx_ring, return false; } -/** - * ixgbe_can_reuse_page - determine if we can reuse a page - * @rx_buffer: pointer to rx_buffer containing the page we want to reuse - * - * Returns true if page can be reused in another Rx buffer - **/ -static inline bool ixgbe_can_reuse_page(struct ixgbe_rx_buffer *rx_buffer) -{ - struct page *page = rx_buffer->page; - - /* if we are only owner of page and it is local we can reuse it */ - return likely(page_count(page) == 1) && - likely(page_to_nid(page) == numa_node_id()); -} - /** * ixgbe_reuse_rx_page - page flip buffer and store it back on the ring * @rx_ring: rx descriptor ring to store buffers on * @old_buff: donor buffer to have page reused * - * Syncronizes page for reuse by the adapter + * Synchronizes page for reuse by the adapter **/ static void ixgbe_reuse_rx_page(struct ixgbe_ring *rx_ring, struct ixgbe_rx_buffer *old_buff) { struct ixgbe_rx_buffer *new_buff; u16 nta = rx_ring->next_to_alloc; - u16 bufsz = ixgbe_rx_bufsz(rx_ring); new_buff = &rx_ring->rx_buffer_info[nta]; @@ -1597,17 +1581,13 @@ static void ixgbe_reuse_rx_page(struct ixgbe_ring *rx_ring, /* transfer page from old buffer to new buffer */ new_buff->page = old_buff->page; new_buff->dma = old_buff->dma; - - /* flip page offset to other buffer and store to new_buff */ - new_buff->page_offset = old_buff->page_offset ^ bufsz; + new_buff->page_offset = old_buff->page_offset; /* sync the buffer for use by the device */ dma_sync_single_range_for_device(rx_ring->dev, new_buff->dma, - new_buff->page_offset, bufsz, + new_buff->page_offset, + ixgbe_rx_bufsz(rx_ring), DMA_FROM_DEVICE); - - /* bump ref count on page before it is given to the stack */ - get_page(new_buff->page); } /** @@ -1617,20 +1597,38 @@ static void ixgbe_reuse_rx_page(struct ixgbe_ring *rx_ring, * @rx_desc: descriptor containing length of buffer written by hardware * @skb: sk_buff to place the data into * - * This function is based on skb_add_rx_frag. I would have used that - * function however it doesn't handle the truesize case correctly since we - * are allocating more memory than might be used for a single receive. + * This function will add the data contained in rx_buffer->page to the skb. + * This is done either through a direct copy if the data in the buffer is + * less than the skb header size, otherwise it will just attach the page as + * a frag to the skb. + * + * The function will then update the page offset if necessary and return + * true if the buffer can be reused by the adapter. 
**/ -static void ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring, +static bool ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring, struct ixgbe_rx_buffer *rx_buffer, - struct sk_buff *skb, int size) + union ixgbe_adv_rx_desc *rx_desc, + struct sk_buff *skb) { - skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags, - rx_buffer->page, rx_buffer->page_offset, - size); - skb->len += size; - skb->data_len += size; - skb->truesize += ixgbe_rx_bufsz(rx_ring); + struct page *page = rx_buffer->page; + unsigned int size = le16_to_cpu(rx_desc->wb.upper.length); + unsigned int truesize = ixgbe_rx_bufsz(rx_ring); + + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, + rx_buffer->page_offset, size, truesize); + + /* if we are only owner of page and it is local we can reuse it */ + if (unlikely(page_count(page) != 1) || + unlikely(page_to_nid(page) != numa_node_id())) + return false; + + /* flip page offset to other buffer */ + rx_buffer->page_offset ^= truesize; + + /* bump ref count on page before it is given to the stack */ + get_page(page); + + return true; } /** @@ -1731,10 +1729,7 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, } /* pull page into skb */ - ixgbe_add_rx_frag(rx_ring, rx_buffer, skb, - le16_to_cpu(rx_desc->wb.upper.length)); - - if (ixgbe_can_reuse_page(rx_buffer)) { + if (ixgbe_add_rx_frag(rx_ring, rx_buffer, rx_desc, skb)) { /* hand second half of page back to the ring */ ixgbe_reuse_rx_page(rx_ring, rx_buffer); } else if (IXGBE_CB(skb)->dma == rx_buffer->dma) { From 09816fbea96ae81eac82dee2d52f29ea7241678d Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 20 Jul 2012 08:08:23 +0000 Subject: [PATCH 3/9] ixgbe: Only use double buffering if page size is less than 8K This change makes it so that we do not use double buffering if the page size is larger than 4K. Instead we will simply walk through the page using up to 3K per receive, and if we receive less than that we only move the offset by that amount. We will free the page when there is no longer any space left that we can use instead of checking the page count to see if we can cycle back to the start. The main motivation behind this is to avoid the unnecessary truesize cost for using a half page when most packets are 2K or smaller. With this new approach the largest possible truesize for a page fragment will be 3K when PAGE_SIZE is larger than 4K. Signed-off-by: Alexander Duyck Tested-by: Phil Schmitt Signed-off-by: Peter P Waskiewicz Jr --- drivers/net/ethernet/intel/ixgbe/ixgbe.h | 24 ++++++++--- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 40 +++++++++++++------ 2 files changed, 45 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index b9623e9ea895..fd2bc697a92b 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -78,6 +78,9 @@ /* Supported Rx Buffer Sizes */ #define IXGBE_RXBUFFER_256 256 /* Used for skb receive header */ +#define IXGBE_RXBUFFER_2K 2048 +#define IXGBE_RXBUFFER_3K 3072 +#define IXGBE_RXBUFFER_4K 4096 #define IXGBE_MAX_RXBUFFER 16384 /* largest size for a single descriptor */ /* @@ -293,16 +296,25 @@ struct ixgbe_ring_feature { * this is twice the size of a half page we need to double the page order * for FCoE enabled Rx queues. */ -#if defined(IXGBE_FCOE) && (PAGE_SIZE < 8192) +static inline unsigned int ixgbe_rx_bufsz(struct ixgbe_ring *ring) +{ +#ifdef IXGBE_FCOE + if (test_bit(__IXGBE_RX_FCOE, &ring->state)) + return (PAGE_SIZE < 8192) ?
IXGBE_RXBUFFER_4K : + IXGBE_RXBUFFER_3K; +#endif + return IXGBE_RXBUFFER_2K; +} + static inline unsigned int ixgbe_rx_pg_order(struct ixgbe_ring *ring) { - return test_bit(__IXGBE_RX_FCOE, &ring->state) ? 1 : 0; -} -#else -#define ixgbe_rx_pg_order(_ring) 0 +#ifdef IXGBE_FCOE + if (test_bit(__IXGBE_RX_FCOE, &ring->state)) + return (PAGE_SIZE < 8192) ? 1 : 0; #endif + return 0; +} #define ixgbe_rx_pg_size(_ring) (PAGE_SIZE << ixgbe_rx_pg_order(_ring)) -#define ixgbe_rx_bufsz(_ring) ((PAGE_SIZE / 2) << ixgbe_rx_pg_order(_ring)) struct ixgbe_ring_container { struct ixgbe_ring *ring; /* pointer to linked list of rings */ diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 6a8c48443676..9305e9a5a761 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1612,21 +1612,45 @@ static bool ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring, { struct page *page = rx_buffer->page; unsigned int size = le16_to_cpu(rx_desc->wb.upper.length); +#if (PAGE_SIZE < 8192) unsigned int truesize = ixgbe_rx_bufsz(rx_ring); +#else + unsigned int truesize = ALIGN(size, L1_CACHE_BYTES); + unsigned int last_offset = ixgbe_rx_pg_size(rx_ring) - + ixgbe_rx_bufsz(rx_ring); +#endif skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, rx_buffer->page_offset, size, truesize); - /* if we are only owner of page and it is local we can reuse it */ - if (unlikely(page_count(page) != 1) || - unlikely(page_to_nid(page) != numa_node_id())) + /* avoid re-using remote pages */ + if (unlikely(page_to_nid(page) != numa_node_id())) + return false; + +#if (PAGE_SIZE < 8192) + /* if we are only owner of page we can reuse it */ + if (unlikely(page_count(page) != 1)) return false; /* flip page offset to other buffer */ rx_buffer->page_offset ^= truesize; + /* + * since we are the only owner of the page and we need to + * increment it, just set the value to 2 in order to avoid + * an unecessary locked operation + */ + atomic_set(&page->_count, 2); +#else + /* move offset up to the next cache line */ + rx_buffer->page_offset += truesize; + + if (rx_buffer->page_offset > last_offset) + return false; + /* bump ref count on page before it is given to the stack */ get_page(page); +#endif return true; } @@ -2863,11 +2887,7 @@ static void ixgbe_configure_srrctl(struct ixgbe_adapter *adapter, srrctl = IXGBE_RX_HDR_SIZE << IXGBE_SRRCTL_BSIZEHDRSIZE_SHIFT; /* configure the packet buffer length */ -#if PAGE_SIZE > IXGBE_MAX_RXBUFFER - srrctl |= IXGBE_MAX_RXBUFFER >> IXGBE_SRRCTL_BSIZEPKT_SHIFT; -#else srrctl |= ixgbe_rx_bufsz(rx_ring) >> IXGBE_SRRCTL_BSIZEPKT_SHIFT; -#endif /* configure descriptor type */ srrctl |= IXGBE_SRRCTL_DESCTYPE_ADV_ONEBUF; @@ -2975,13 +2995,7 @@ static void ixgbe_configure_rscctl(struct ixgbe_adapter *adapter, * total size of max desc * buf_len is not greater * than 65536 */ -#if (PAGE_SIZE <= 8192) rscctrl |= IXGBE_RSCCTL_MAXDESC_16; -#elif (PAGE_SIZE <= 16384) - rscctrl |= IXGBE_RSCCTL_MAXDESC_8; -#else - rscctrl |= IXGBE_RSCCTL_MAXDESC_4; -#endif IXGBE_WRITE_REG(hw, IXGBE_RSCCTL(reg_idx), rscctrl); } From 42073d91a214587717c36a697436bad0f60c4384 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 20 Jul 2012 08:08:28 +0000 Subject: [PATCH 4/9] ixgbe: Have the CPU take ownership of the buffers sooner This patch makes it so that we will always have ownership of the buffers by the time we get to ixgbe_add_rx_frag. 
This is necessary as I am planning to add a copy-break to ixgbe_add_rx_frag and in order for that to function correctly we need the CPU to have ownership of the buffer. Signed-off-by: Alexander Duyck Tested-by: Phil Schmitt Signed-off-by: Peter P Waskiewicz Jr --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 52 ++++++++++++++----- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 9305e9a5a761..b0020fcf0505 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1457,6 +1457,36 @@ static bool ixgbe_is_non_eop(struct ixgbe_ring *rx_ring, return true; } +/** + * ixgbe_dma_sync_frag - perform DMA sync for first frag of SKB + * @rx_ring: rx descriptor ring packet is being transacted on + * @skb: pointer to current skb being updated + * + * This function provides a basic DMA sync up for the first fragment of an + * skb. The reason for doing this is that the first fragment cannot be + * unmapped until we have reached the end of packet descriptor for a buffer + * chain. + */ +static void ixgbe_dma_sync_frag(struct ixgbe_ring *rx_ring, + struct sk_buff *skb) +{ + /* if the page was released unmap it, else just sync our portion */ + if (unlikely(IXGBE_CB(skb)->page_released)) { + dma_unmap_page(rx_ring->dev, IXGBE_CB(skb)->dma, + ixgbe_rx_pg_size(rx_ring), DMA_FROM_DEVICE); + IXGBE_CB(skb)->page_released = false; + } else { + struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0]; + + dma_sync_single_range_for_cpu(rx_ring->dev, + IXGBE_CB(skb)->dma, + frag->page_offset, + ixgbe_rx_bufsz(rx_ring), + DMA_FROM_DEVICE); + } + IXGBE_CB(skb)->dma = 0; +} + /** * ixgbe_cleanup_headers - Correct corrupted or empty headers * @rx_ring: rx descriptor ring packet is being transacted on @@ -1484,20 +1514,6 @@ static bool ixgbe_cleanup_headers(struct ixgbe_ring *rx_ring, unsigned char *va; unsigned int pull_len; - /* if the page was released unmap it, else just sync our portion */ - if (unlikely(IXGBE_CB(skb)->page_released)) { - dma_unmap_page(rx_ring->dev, IXGBE_CB(skb)->dma, - ixgbe_rx_pg_size(rx_ring), DMA_FROM_DEVICE); - IXGBE_CB(skb)->page_released = false; - } else { - dma_sync_single_range_for_cpu(rx_ring->dev, - IXGBE_CB(skb)->dma, - frag->page_offset, - ixgbe_rx_bufsz(rx_ring), - DMA_FROM_DEVICE); - } - IXGBE_CB(skb)->dma = 0; - /* verify that the packet does not have any known errors */ if (unlikely(ixgbe_test_staterr(rx_desc, IXGBE_RXDADV_ERR_FRAME_ERR_MASK) && @@ -1742,8 +1758,16 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, * after the writeback. Only unmap it when EOP is * reached */ + if (likely(ixgbe_test_staterr(rx_desc, + IXGBE_RXD_STAT_EOP))) + goto dma_sync; + IXGBE_CB(skb)->dma = rx_buffer->dma; } else { + if (ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP)) + ixgbe_dma_sync_frag(rx_ring, skb); + +dma_sync: /* we are reusing so sync this buffer for CPU use */ dma_sync_single_range_for_cpu(rx_ring->dev, rx_buffer->dma, From 19861ce24f2ea9d9db7b96814de9ca8d522cbe40 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 20 Jul 2012 08:08:33 +0000 Subject: [PATCH 5/9] ixgbe: Make pull tail function separate from rest of cleanup_headers This change creates a separate function for functionality similar to pskb_pull_tail. The main motivation for moving it to a separate function is so that later I can just skip this function in the case where we have already copied the buffer into skb->head. 
Signed-off-by: Alexander Duyck Tested-by: Phil Schmitt Signed-off-by: Peter P Waskiewicz Jr --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 94 +++++++++++-------- 1 file changed, 57 insertions(+), 37 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index b0020fcf0505..d9269737409a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1457,6 +1457,61 @@ static bool ixgbe_is_non_eop(struct ixgbe_ring *rx_ring, return true; } +/** + * ixgbe_pull_tail - ixgbe specific version of skb_pull_tail + * @rx_ring: rx descriptor ring packet is being transacted on + * @skb: pointer to current skb being adjusted + * + * This function is an ixgbe specific version of __pskb_pull_tail. The + * main difference between this version and the original function is that + * this function can make several assumptions about the state of things + * that allow for significant optimizations versus the standard function. + * As a result we can do things like drop a frag and maintain an accurate + * truesize for the skb. + */ +static void ixgbe_pull_tail(struct ixgbe_ring *rx_ring, + struct sk_buff *skb) +{ + struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0]; + unsigned char *va; + unsigned int pull_len; + + /* + * it is valid to use page_address instead of kmap since we are + * working with pages allocated out of the lomem pool per + * alloc_page(GFP_ATOMIC) + */ + va = skb_frag_address(frag); + + /* + * we need the header to contain the greater of either ETH_HLEN or + * 60 bytes if the skb->len is less than 60 for skb_pad. + */ + pull_len = skb_frag_size(frag); + if (pull_len > IXGBE_RX_HDR_SIZE) + pull_len = ixgbe_get_headlen(va, IXGBE_RX_HDR_SIZE); + + /* align pull length to size of long to optimize memcpy performance */ + skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long))); + + /* update all of the pointers */ + skb_frag_size_sub(frag, pull_len); + frag->page_offset += pull_len; + skb->data_len -= pull_len; + skb->tail += pull_len; + + /* + * if we sucked the frag empty then we should free it, + * if there are other frags here something is screwed up in hardware + */ + if (skb_frag_size(frag) == 0) { + BUG_ON(skb_shinfo(skb)->nr_frags != 1); + skb_shinfo(skb)->nr_frags = 0; + __skb_frag_unref(frag); + skb->truesize -= ixgbe_rx_bufsz(rx_ring); + } +} + /** * ixgbe_dma_sync_frag - perform DMA sync for first frag of SKB * @rx_ring: rx descriptor ring packet is being transacted on @@ -1509,10 +1564,7 @@ static bool ixgbe_cleanup_headers(struct ixgbe_ring *rx_ring, union ixgbe_adv_rx_desc *rx_desc, struct sk_buff *skb) { - struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0]; struct net_device *netdev = rx_ring->netdev; - unsigned char *va; - unsigned int pull_len; /* verify that the packet does not have any known errors */ if (unlikely(ixgbe_test_staterr(rx_desc, @@ -1522,40 +1574,8 @@ static bool ixgbe_cleanup_headers(struct ixgbe_ring *rx_ring, return true; } - /* - * it is valid to use page_address instead of kmap since we are - * working with pages allocated out of the lomem pool per - * alloc_page(GFP_ATOMIC) - */ - va = skb_frag_address(frag); - - /* - * we need the header to contain the greater of either ETH_HLEN or - * 60 bytes if the skb->len is less than 60 for skb_pad. 
- */ - pull_len = skb_frag_size(frag); - if (pull_len > IXGBE_RX_HDR_SIZE) - pull_len = ixgbe_get_headlen(va, IXGBE_RX_HDR_SIZE); - - /* align pull length to size of long to optimize memcpy performance */ - skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long))); - - /* update all of the pointers */ - skb_frag_size_sub(frag, pull_len); - frag->page_offset += pull_len; - skb->data_len -= pull_len; - skb->tail += pull_len; - - /* - * if we sucked the frag empty then we should free it, - * if there are other frags here something is screwed up in hardware - */ - if (skb_frag_size(frag) == 0) { - BUG_ON(skb_shinfo(skb)->nr_frags != 1); - skb_shinfo(skb)->nr_frags = 0; - __skb_frag_unref(frag); - skb->truesize -= ixgbe_rx_bufsz(rx_ring); - } + /* place header in linear portion of buffer */ + ixgbe_pull_tail(rx_ring, skb); #ifdef IXGBE_FCOE /* do not attempt to pad FCoE Frames as this will disrupt DDP */ From cf3fe7aca03e14db107d649c5699a48bec054583 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 20 Jul 2012 08:08:39 +0000 Subject: [PATCH 6/9] ixgbe: Copybreak sooner to avoid get_page/put_page and offset change overhead This change makes it so that if only the first 256 bytes of a buffer are used we just copy the data out and leave the offset and page count unchanged. There are multiple advantages to this. First it allows us to reuse the page much more in the case of pages larger than 4K. It also allows us to avoid some expensive atomic operations in the form of get_page/put_page. In perf I have seen CPU utilization for put_page drop from 3.5% to 1.8% as a result of this patch when doing small packet routing, and packet rates increased by about 3%. Signed-off-by: Alexander Duyck Tested-by: Phil Schmitt Signed-off-by: Peter P Waskiewicz Jr --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index d9269737409a..d11fac53a792 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1487,9 +1487,7 @@ static void ixgbe_pull_tail(struct ixgbe_ring *rx_ring, * we need the header to contain the greater of either ETH_HLEN or * 60 bytes if the skb->len is less than 60 for skb_pad. 
*/ - pull_len = skb_frag_size(frag); - if (pull_len > IXGBE_RX_HDR_SIZE) - pull_len = ixgbe_get_headlen(va, IXGBE_RX_HDR_SIZE); + pull_len = ixgbe_get_headlen(va, IXGBE_RX_HDR_SIZE); /* align pull length to size of long to optimize memcpy performance */ skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long))); @@ -1499,17 +1497,6 @@ static void ixgbe_pull_tail(struct ixgbe_ring *rx_ring, frag->page_offset += pull_len; skb->data_len -= pull_len; skb->tail += pull_len; - - /* - * if we sucked the frag empty then we should free it, - * if there are other frags here something is screwed up in hardware - */ - if (skb_frag_size(frag) == 0) { - BUG_ON(skb_shinfo(skb)->nr_frags != 1); - skb_shinfo(skb)->nr_frags = 0; - __skb_frag_unref(frag); - skb->truesize -= ixgbe_rx_bufsz(rx_ring); - } } /** @@ -1575,7 +1562,8 @@ static bool ixgbe_cleanup_headers(struct ixgbe_ring *rx_ring, } /* place header in linear portion of buffer */ - ixgbe_pull_tail(rx_ring, skb); + if (skb_is_nonlinear(skb)) + ixgbe_pull_tail(rx_ring, skb); #ifdef IXGBE_FCOE /* do not attempt to pad FCoE Frames as this will disrupt DDP */ @@ -1656,6 +1644,20 @@ static bool ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring, ixgbe_rx_bufsz(rx_ring); #endif + if ((size <= IXGBE_RX_HDR_SIZE) && !skb_is_nonlinear(skb)) { + unsigned char *va = page_address(page) + rx_buffer->page_offset; + + memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long))); + + /* we can reuse buffer as-is, just make sure it is local */ + if (likely(page_to_nid(page) == numa_node_id())) + return true; + + /* this page cannot be reused so discard it */ + put_page(page); + return false; + } + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, rx_buffer->page_offset, size, truesize); From 18806c9ea28320d5368fd43318506a7d33cc952c Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 20 Jul 2012 08:08:44 +0000 Subject: [PATCH 7/9] ixgbe: Make allocating skb and placing data in it a separate function This patch creates a function named ixgbe_fetch_rx_buffer. The sole purpose of this function is to retrieve a single buffer off of the ring and to place it in an skb. The advantage to doing this is that it helps improve the readability since I can decrease the indentation and size of the code in this section.
Signed-off-by: Alexander Duyck Tested-by: Phil Schmitt Signed-off-by: Peter P Waskiewicz Jr --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 166 ++++++++++-------- 1 file changed, 89 insertions(+), 77 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index d11fac53a792..9e72ae6d8492 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1693,6 +1693,89 @@ static bool ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring, return true; } +static struct sk_buff *ixgbe_fetch_rx_buffer(struct ixgbe_ring *rx_ring, + union ixgbe_adv_rx_desc *rx_desc) +{ + struct ixgbe_rx_buffer *rx_buffer; + struct sk_buff *skb; + struct page *page; + + rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean]; + page = rx_buffer->page; + prefetchw(page); + + skb = rx_buffer->skb; + + if (likely(!skb)) { + void *page_addr = page_address(page) + + rx_buffer->page_offset; + + /* prefetch first cache line of first page */ + prefetch(page_addr); +#if L1_CACHE_BYTES < 128 + prefetch(page_addr + L1_CACHE_BYTES); +#endif + + /* allocate a skb to store the frags */ + skb = netdev_alloc_skb_ip_align(rx_ring->netdev, + IXGBE_RX_HDR_SIZE); + if (unlikely(!skb)) { + rx_ring->rx_stats.alloc_rx_buff_failed++; + return NULL; + } + + /* + * we will be copying header into skb->data in + * pskb_may_pull so it is in our interest to prefetch + * it now to avoid a possible cache miss + */ + prefetchw(skb->data); + + /* + * Delay unmapping of the first packet. It carries the + * header information, HW may still access the header + * after the writeback. Only unmap it when EOP is + * reached + */ + if (likely(ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP))) + goto dma_sync; + + IXGBE_CB(skb)->dma = rx_buffer->dma; + } else { + if (ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP)) + ixgbe_dma_sync_frag(rx_ring, skb); + +dma_sync: + /* we are reusing so sync this buffer for CPU use */ + dma_sync_single_range_for_cpu(rx_ring->dev, + rx_buffer->dma, + rx_buffer->page_offset, + ixgbe_rx_bufsz(rx_ring), + DMA_FROM_DEVICE); + } + + /* pull page into skb */ + if (ixgbe_add_rx_frag(rx_ring, rx_buffer, rx_desc, skb)) { + /* hand second half of page back to the ring */ + ixgbe_reuse_rx_page(rx_ring, rx_buffer); + } else if (IXGBE_CB(skb)->dma == rx_buffer->dma) { + /* the page has been released from the ring */ + IXGBE_CB(skb)->page_released = true; + } else { + /* we are not reusing the buffer so unmap it */ + dma_unmap_page(rx_ring->dev, rx_buffer->dma, + ixgbe_rx_pg_size(rx_ring), + DMA_FROM_DEVICE); + } + + /* clear contents of buffer_info */ + rx_buffer->skb = NULL; + rx_buffer->dma = 0; + rx_buffer->page = NULL; + + return skb; +} + /** * ixgbe_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf * @q_vector: structure containing interrupt and ring information @@ -1718,11 +1801,8 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, u16 cleaned_count = ixgbe_desc_unused(rx_ring); do { - struct ixgbe_rx_buffer *rx_buffer; union ixgbe_adv_rx_desc *rx_desc; struct sk_buff *skb; - struct page *page; - u16 ntc; /* return some buffers to hardware, one at a time is too slow */ if (cleaned_count >= IXGBE_RX_BUFFER_WRITE) { @@ -1730,9 +1810,7 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, cleaned_count = 0; } - ntc = rx_ring->next_to_clean; - rx_desc = IXGBE_RX_DESC(rx_ring, ntc); - rx_buffer = &rx_ring->rx_buffer_info[ntc]; + rx_desc = IXGBE_RX_DESC(rx_ring, 
rx_ring->next_to_clean); if (!ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_DD)) break; @@ -1744,78 +1822,12 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, */ rmb(); - page = rx_buffer->page; - prefetchw(page); + /* retrieve a buffer from the ring */ + skb = ixgbe_fetch_rx_buffer(rx_ring, rx_desc); - skb = rx_buffer->skb; - - if (likely(!skb)) { - void *page_addr = page_address(page) + - rx_buffer->page_offset; - - /* prefetch first cache line of first page */ - prefetch(page_addr); -#if L1_CACHE_BYTES < 128 - prefetch(page_addr + L1_CACHE_BYTES); -#endif - - /* allocate a skb to store the frags */ - skb = netdev_alloc_skb_ip_align(rx_ring->netdev, - IXGBE_RX_HDR_SIZE); - if (unlikely(!skb)) { - rx_ring->rx_stats.alloc_rx_buff_failed++; - break; - } - - /* - * we will be copying header into skb->data in - * pskb_may_pull so it is in our interest to prefetch - * it now to avoid a possible cache miss - */ - prefetchw(skb->data); - - /* - * Delay unmapping of the first packet. It carries the - * header information, HW may still access the header - * after the writeback. Only unmap it when EOP is - * reached - */ - if (likely(ixgbe_test_staterr(rx_desc, - IXGBE_RXD_STAT_EOP))) - goto dma_sync; - - IXGBE_CB(skb)->dma = rx_buffer->dma; - } else { - if (ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP)) - ixgbe_dma_sync_frag(rx_ring, skb); - -dma_sync: - /* we are reusing so sync this buffer for CPU use */ - dma_sync_single_range_for_cpu(rx_ring->dev, - rx_buffer->dma, - rx_buffer->page_offset, - ixgbe_rx_bufsz(rx_ring), - DMA_FROM_DEVICE); - } - - /* pull page into skb */ - if (ixgbe_add_rx_frag(rx_ring, rx_buffer, rx_desc, skb)) { - /* hand second half of page back to the ring */ - ixgbe_reuse_rx_page(rx_ring, rx_buffer); - } else if (IXGBE_CB(skb)->dma == rx_buffer->dma) { - /* the page has been released from the ring */ - IXGBE_CB(skb)->page_released = true; - } else { - /* we are not reusing the buffer so unmap it */ - dma_unmap_page(rx_ring->dev, rx_buffer->dma, - ixgbe_rx_pg_size(rx_ring), - DMA_FROM_DEVICE); - } - - /* clear contents of buffer_info */ - rx_buffer->skb = NULL; - rx_buffer->dma = 0; - rx_buffer->page = NULL; + /* exit if we failed to retrieve a buffer */ + if (!skb) + break; ixgbe_get_rsc_cnt(rx_ring, rx_desc, skb); From 5a02cbd10d37889d8214e82d57ccc70307edf805 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 20 Jul 2012 08:08:51 +0000 Subject: [PATCH 8/9] ixgbe: Roll RSC code into non-EOP code This change moves the RSC code into the non-EOP descriptor handling function. The main motivation behind this change is to help reduce the overhead in the non-RSC case. Previously the non-RSC path code would always be checking for append count even if RSC had been disabled. Now this code is completely skipped in a single conditional check instead of having to make two separate checks. 
Signed-off-by: Alexander Duyck Tested-by: Phil Schmitt Signed-off-by: Peter P Waskiewicz Jr --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 51 +++++++------------ 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 9e72ae6d8492..aa37b84592b3 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1320,29 +1320,6 @@ static unsigned int ixgbe_get_headlen(unsigned char *data, return max_len; } -static void ixgbe_get_rsc_cnt(struct ixgbe_ring *rx_ring, - union ixgbe_adv_rx_desc *rx_desc, - struct sk_buff *skb) -{ - __le32 rsc_enabled; - u32 rsc_cnt; - - if (!ring_is_rsc_enabled(rx_ring)) - return; - - rsc_enabled = rx_desc->wb.lower.lo_dword.data & - cpu_to_le32(IXGBE_RXDADV_RSCCNT_MASK); - - /* If this is an RSC frame rsc_cnt should be non-zero */ - if (!rsc_enabled) - return; - - rsc_cnt = le32_to_cpu(rsc_enabled); - rsc_cnt >>= IXGBE_RXDADV_RSCCNT_SHIFT; - - IXGBE_CB(skb)->append_cnt += rsc_cnt - 1; -} - static void ixgbe_set_rsc_gso_size(struct ixgbe_ring *ring, struct sk_buff *skb) { @@ -1440,16 +1417,28 @@ static bool ixgbe_is_non_eop(struct ixgbe_ring *rx_ring, prefetch(IXGBE_RX_DESC(rx_ring, ntc)); + /* update RSC append count if present */ + if (ring_is_rsc_enabled(rx_ring)) { + __le32 rsc_enabled = rx_desc->wb.lower.lo_dword.data & + cpu_to_le32(IXGBE_RXDADV_RSCCNT_MASK); + + if (unlikely(rsc_enabled)) { + u32 rsc_cnt = le32_to_cpu(rsc_enabled); + + rsc_cnt >>= IXGBE_RXDADV_RSCCNT_SHIFT; + IXGBE_CB(skb)->append_cnt += rsc_cnt - 1; + + /* update ntc based on RSC value */ + ntc = le32_to_cpu(rx_desc->wb.upper.status_error); + ntc &= IXGBE_RXDADV_NEXTP_MASK; + ntc >>= IXGBE_RXDADV_NEXTP_SHIFT; + } + } + + /* if we are the last buffer then there is nothing else to do */ if (likely(ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP))) return false; - /* append_cnt indicates packet is RSC, if so fetch nextp */ - if (IXGBE_CB(skb)->append_cnt) { - ntc = le32_to_cpu(rx_desc->wb.upper.status_error); - ntc &= IXGBE_RXDADV_NEXTP_MASK; - ntc >>= IXGBE_RXDADV_NEXTP_SHIFT; - } - /* place skb in next buffer to be received */ rx_ring->rx_buffer_info[ntc].skb = skb; rx_ring->rx_stats.non_eop_descs++; @@ -1829,8 +1818,6 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, if (!skb) break; - ixgbe_get_rsc_cnt(rx_ring, rx_desc, skb); - cleaned_count++; /* place incomplete frames back on ring for completion */ From 62748b7bde84618f06b5c7a8733ae87776ec36c5 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 20 Jul 2012 08:09:01 +0000 Subject: [PATCH 9/9] ixgbe: Rewrite code related to configuring IFCS bit in Tx descriptor This change updates the code related to configuring the transmit frame checksum. Specifically I have updated the code so that we can only skip inserting the checksum in the case that we are not performing some other offload that will modify the frame data. 
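For context, and not as part of this change: the skb->no_fcs flag that this patch keys off of is normally set by the AF_PACKET transmit path when a socket has asked the stack not to append an Ethernet FCS. A minimal userspace sketch of how a frame reaches the new IXGBE_TX_FLAGS_NO_IFCS path might look like the following, assuming the SO_NOFCS socket option (Linux 3.4+, value 43 on most architectures) is available:

/* Illustrative sketch only -- not part of this patch. */
#include <stdio.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>

#ifndef SO_NOFCS
#define SO_NOFCS 43	/* assumed value; check asm-generic/socket.h */
#endif

static int open_no_fcs_socket(void)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	int one = 1;

	if (fd < 0)
		return -1;

	/*
	 * Frames sent on this socket are built with skb->no_fcs = 1, so for
	 * an untagged frame with no checksum offload ixgbe_tx_csum() sets
	 * IXGBE_TX_FLAGS_NO_IFCS and ixgbe_tx_cmd_type() leaves the IFCS
	 * bit out of the transmit descriptor.
	 */
	if (setsockopt(fd, SOL_SOCKET, SO_NOFCS, &one, sizeof(one)) < 0)
		perror("setsockopt(SO_NOFCS)");

	return fd;
}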
Signed-off-by: Alexander Duyck Tested-by: Phil Schmitt Signed-off-by: Peter P Waskiewicz Jr --- drivers/net/ethernet/intel/ixgbe/ixgbe.h | 1 + drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 16 ++++++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index fd2bc697a92b..bffcf1f2357a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -107,6 +107,7 @@ #define IXGBE_TX_FLAGS_FSO (u32)(1 << 6) #define IXGBE_TX_FLAGS_TXSW (u32)(1 << 7) #define IXGBE_TX_FLAGS_TSTAMP (u32)(1 << 8) +#define IXGBE_TX_FLAGS_NO_IFCS (u32)(1 << 9) #define IXGBE_TX_FLAGS_VLAN_MASK 0xffff0000 #define IXGBE_TX_FLAGS_VLAN_PRIO_MASK 0xe0000000 #define IXGBE_TX_FLAGS_VLAN_PRIO_SHIFT 29 diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index aa37b84592b3..fa0d6e1561c1 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -5903,9 +5903,12 @@ static void ixgbe_tx_csum(struct ixgbe_ring *tx_ring, u32 type_tucmd = 0; if (skb->ip_summed != CHECKSUM_PARTIAL) { - if (!(first->tx_flags & IXGBE_TX_FLAGS_HW_VLAN) && - !(first->tx_flags & IXGBE_TX_FLAGS_TXSW)) - return; + if (!(first->tx_flags & IXGBE_TX_FLAGS_HW_VLAN)) { + if (unlikely(skb->no_fcs)) + first->tx_flags |= IXGBE_TX_FLAGS_NO_IFCS; + if (!(first->tx_flags & IXGBE_TX_FLAGS_TXSW)) + return; + } } else { u8 l4_hdr = 0; switch (first->protocol) { @@ -5967,7 +5970,6 @@ static __le32 ixgbe_tx_cmd_type(u32 tx_flags) { /* set type for advanced descriptor with frame checksum insertion */ __le32 cmd_type = cpu_to_le32(IXGBE_ADVTXD_DTYP_DATA | - IXGBE_ADVTXD_DCMD_IFCS | IXGBE_ADVTXD_DCMD_DEXT); /* set HW vlan bit if vlan is present */ @@ -5987,6 +5989,10 @@ static __le32 ixgbe_tx_cmd_type(u32 tx_flags) #endif cmd_type |= cpu_to_le32(IXGBE_ADVTXD_DCMD_TSE); + /* insert frame checksum */ + if (!(tx_flags & IXGBE_TX_FLAGS_NO_IFCS)) + cmd_type |= cpu_to_le32(IXGBE_ADVTXD_DCMD_IFCS); + return cmd_type; } @@ -6092,8 +6098,6 @@ static void ixgbe_tx_map(struct ixgbe_ring *tx_ring, if (likely(!data_len)) break; - if (unlikely(skb->no_fcs)) - cmd_type &= ~(cpu_to_le32(IXGBE_ADVTXD_DCMD_IFCS)); tx_desc->read.cmd_type_len = cmd_type | cpu_to_le32(size); i++;