net/mlx5e: xsk: Use xsk_buff_alloc_batch on legacy RQ

XSK provides a function to allocate frames in batches for more efficient
processing. This commit starts using this function on legacy RQ, adding
a special case for XSK. The new branch introduced basically replaces the
branch that was removed from the same place a few commits before.

A check is made that DMA sync is not needed, because the batching
allocator falls back to returning one frame when DMA sync is needed, and
this is best handled by the loop in the standard case.

Performance improvement is up to 8% in the aligned mode and up to 9% in
the unaligned mode.

Aligned mode, 2048-byte frames: 12.8 Mpps -> 13.5 Mpps
Aligned mode, 4096-byte frames: 11.5 Mpps -> 12.4 Mpps
Unaligned mode, 2048-byte frames: 12.2 Mpps -> 13.4 Mpps
Unaligned mode, 3072-byte frames: 11.6 Mpps -> 12.5 Mpps
Unaligned mode, 4096-byte frames: 11.2 Mpps -> 12.2 Mpps

CPU: Intel(R) Xeon(R) Gold 6240 CPU @ 2.60GHz

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Maxim Mikityanskiy 2022-09-30 09:28:56 -07:00 committed by Jakub Kicinski
parent a2e5ba242c
commit 259bbc6436
4 changed files with 55 additions and 0 deletions

View File

@ -8,6 +8,46 @@
/* RX data path */ /* RX data path */
/* Refill up to @wqe_bulk RX WQEs of a legacy (cyclic) RQ, starting at WQ
 * index @ix, taking XSK buffers from rq->xsk_pool in batches instead of one
 * at a time.
 *
 * The buffers are written directly into rq->wqe.alloc_units. When the
 * requested range runs past the end of the cyclic WQ, the request is split in
 * two: first the tail of the array, then (only if the tail was filled
 * completely) the remainder from index 0.
 *
 * Returns the number of WQEs whose data address was set, i.e. the total
 * reported by the xsk_buff_alloc_batch() call(s); it may be less than
 * @wqe_bulk.
 */
int mlx5e_xsk_alloc_rx_wqes_batched(struct mlx5e_rq *rq, u16 ix, int wqe_bulk)
{
	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
	struct xdp_buff **xdp_slots;
	u32 until_wrap, filled;
	int n;

	/* mlx5e_init_frags_partition sets up a 1:1 mapping between
	 * rq->wqe.frags and rq->wqe.alloc_units, so the alloc_units array can
	 * be handed to the allocator as a plain array of XDP buffer pointers.
	 * The size check below guards the cast that relies on this.
	 */
	BUILD_BUG_ON(sizeof(rq->wqe.alloc_units[0]) !=
		     sizeof(rq->wqe.alloc_units[0].xsk));
	xdp_slots = (struct xdp_buff **)rq->wqe.alloc_units;

	/* Number of entries from @ix up to the end of the WQ array. */
	until_wrap = mlx5_wq_cyc_get_size(wq) - ix;

	if (wqe_bulk > until_wrap) {
		/* The range wraps around: fill the tail first and continue
		 * from the start only if the tail was fully populated, so the
		 * filled entries always form one contiguous run from @ix.
		 */
		filled = xsk_buff_alloc_batch(rq->xsk_pool, xdp_slots + ix, until_wrap);
		if (likely(filled == until_wrap))
			filled += xsk_buff_alloc_batch(rq->xsk_pool, xdp_slots,
						       wqe_bulk - until_wrap);
	} else {
		filled = xsk_buff_alloc_batch(rq->xsk_pool, xdp_slots + ix, wqe_bulk);
	}

	/* Point every refilled WQE at the DMA address of its new frame. */
	for (n = 0; n < filled; n++) {
		int slot = mlx5_wq_cyc_ctr2ix(wq, ix + n);
		struct mlx5e_rx_wqe_cyc *wqe = mlx5_wq_cyc_get_wqe(wq, slot);
		/* Assumes log_num_frags == 0. */
		struct mlx5e_wqe_frag_info *frag = &rq->wqe.frags[slot];
		dma_addr_t addr = xsk_buff_xdp_get_frame_dma(frag->au->xsk);

		wqe->data[0].addr = cpu_to_be64(addr + rq->buff.headroom);
	}

	return filled;
}
int mlx5e_xsk_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, int wqe_bulk) int mlx5e_xsk_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, int wqe_bulk)
{ {
struct mlx5_wq_cyc *wq = &rq->wqe.wq; struct mlx5_wq_cyc *wq = &rq->wqe.wq;

View File

@ -9,6 +9,7 @@
/* RX data path */ /* RX data path */
/* Batched XSK refill for the legacy RQ: fills up to wqe_bulk WQEs starting at
 * WQ index ix using xsk_buff_alloc_batch() and returns how many were filled.
 */
int mlx5e_xsk_alloc_rx_wqes_batched(struct mlx5e_rq *rq, u16 ix, int wqe_bulk);
int mlx5e_xsk_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, int wqe_bulk); int mlx5e_xsk_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, int wqe_bulk);
struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
struct mlx5e_mpw_info *wi, struct mlx5e_mpw_info *wi,

View File

@ -433,6 +433,13 @@ static void mlx5e_init_frags_partition(struct mlx5e_rq *rq)
struct mlx5e_wqe_frag_info *prev = NULL; struct mlx5e_wqe_frag_info *prev = NULL;
int i; int i;
if (rq->xsk_pool) {
/* Assumptions used by XSK batched allocator. */
WARN_ON(rq->wqe.info.num_frags != 1);
WARN_ON(rq->wqe.info.log_num_frags != 0);
WARN_ON(rq->wqe.info.arr[0].frag_stride != PAGE_SIZE);
}
next_frag.au = &rq->wqe.alloc_units[0]; next_frag.au = &rq->wqe.alloc_units[0];
for (i = 0; i < mlx5_wq_cyc_get_size(&rq->wqe.wq); i++) { for (i = 0; i < mlx5_wq_cyc_get_size(&rq->wqe.wq); i++) {

View File

@ -827,7 +827,14 @@ INDIRECT_CALLABLE_SCOPE bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
if (!rq->xsk_pool) if (!rq->xsk_pool)
count = mlx5e_alloc_rx_wqes(rq, head, wqe_bulk); count = mlx5e_alloc_rx_wqes(rq, head, wqe_bulk);
else if (likely(!rq->xsk_pool->dma_need_sync))
count = mlx5e_xsk_alloc_rx_wqes_batched(rq, head, wqe_bulk);
else else
/* If dma_need_sync is true, it's more efficient to call
* xsk_buff_alloc in a loop, rather than xsk_buff_alloc_batch,
* because the latter does the same check and returns only one
* frame.
*/
count = mlx5e_xsk_alloc_rx_wqes(rq, head, wqe_bulk); count = mlx5e_xsk_alloc_rx_wqes(rq, head, wqe_bulk);
mlx5_wq_cyc_push_n(wq, count); mlx5_wq_cyc_push_n(wq, count);