net/mlx5e: xsk: Use xsk_buff_alloc_batch on striding RQ
XSK provides a function to allocate frames in batches for more efficient
processing. This commit starts using this function on striding RQ and
creates an optimized flow for XSK. A side effect is an opportunity to
optimize the regular RX flow by dropping branching for XSK cases.

Performance improvement is up to 6.4% in the aligned mode and up to 7.5%
in the unaligned mode.

Aligned mode, 2048-byte frames: 12.9 Mpps -> 13.8 Mpps
Aligned mode, 4096-byte frames: 11.8 Mpps -> 12.5 Mpps
Unaligned mode, 2048-byte frames: 11.9 Mpps -> 12.8 Mpps
Unaligned mode, 3072-byte frames: 11.4 Mpps -> 12.1 Mpps
Unaligned mode, 4096-byte frames: 11.0 Mpps -> 11.2 Mpps

CPU: Intel(R) Xeon(R) Gold 6240 CPU @ 2.60GHz

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
commit cf544517c4 (parent 259bbc6436)
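The heart of the change is a batch-then-fallback allocation flow, added as mlx5e_xsk_alloc_rx_mpwqe() in the large hunk below. A minimal standalone sketch of that pattern follows; alloc_xsk_frames(), bufs and nframes are illustrative names, not driver API, and only the xsk_buff_* helpers (from <net/xdp_sock_drv.h>) are real kernel functions.

/* Sketch only: batch-allocate XSK frames, then top up one by one.
 * Mirrors the flow of mlx5e_xsk_alloc_rx_mpwqe() below; names other
 * than the xsk_buff_* helpers are placeholders.
 */
#include <net/xdp_sock_drv.h>

static int alloc_xsk_frames(struct xsk_buff_pool *pool,
			    struct xdp_buff **bufs, int nframes)
{
	int batch;

	/* Cheap pre-check: give up early if the fill ring cannot supply
	 * a full WQE worth of frames.
	 */
	if (unlikely(!xsk_buff_can_alloc(pool, nframes)))
		return -ENOMEM;

	/* Grab as many frames as possible in a single pass. */
	batch = xsk_buff_alloc_batch(pool, bufs, nframes);

	/* The batch may come up short (invalid descriptors, or the batch
	 * API falling back to single allocations); top it up frame by
	 * frame until the pool runs dry.
	 */
	for (; batch < nframes; batch++) {
		bufs[batch] = xsk_buff_alloc(pool);
		if (unlikely(!bufs[batch]))
			goto err_free;
	}

	return 0;

err_free:
	/* Return everything taken so far to the pool. */
	while (--batch >= 0)
		xsk_buff_free(bufs[batch]);
	return -ENOMEM;
}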
@@ -452,4 +452,11 @@ static inline bool mlx5e_icosq_can_post_wqe(struct mlx5e_icosq *sq, u16 wqe_size)
 	return mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc, room);
 }
 
+static inline struct mlx5e_mpw_info *mlx5e_get_mpw_info(struct mlx5e_rq *rq, int i)
+{
+	size_t isz = struct_size(rq->mpwqe.info, alloc_units, rq->mpwqe.pages_per_wqe);
+
+	return (struct mlx5e_mpw_info *)((char *)rq->mpwqe.info + array_size(i, isz));
+}
+
 #endif
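The helper added above indexes into an array whose elements each end in a flexible array of alloc_units, so the element size is only known at runtime. A self-contained sketch of that indexing idiom, with hypothetical demo_* types standing in for the mlx5e structures:

/* Sketch only: indexing an array of structs that each end in a
 * flexible array member.  struct_size() gives the per-element size
 * including 'n' trailing units; array_size() computes the byte offset
 * with overflow saturation.  The demo_* names are illustrative.
 */
#include <linux/overflow.h>
#include <linux/types.h>

struct demo_unit {
	u64 addr;
};

struct demo_info {
	u32 consumed;
	struct demo_unit units[];	/* n entries per element */
};

static inline struct demo_info *demo_get(struct demo_info *base, int i, int n)
{
	size_t isz = struct_size(base, units, n);

	return (struct demo_info *)((char *)base + array_size(i, isz));
}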
@@ -8,6 +8,90 @@
 
 /* RX data path */
 
+int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
+{
+	struct mlx5e_mpw_info *wi = mlx5e_get_mpw_info(rq, ix);
+	struct mlx5e_icosq *icosq = rq->icosq;
+	struct mlx5_wq_cyc *wq = &icosq->wq;
+	struct mlx5e_umr_wqe *umr_wqe;
+	int batch, i;
+	u32 offset; /* 17-bit value with MTT. */
+	u16 pi;
+
+	if (unlikely(!xsk_buff_can_alloc(rq->xsk_pool, rq->mpwqe.pages_per_wqe)))
+		goto err;
+
+	BUILD_BUG_ON(sizeof(wi->alloc_units[0]) != sizeof(wi->alloc_units[0].xsk));
+	batch = xsk_buff_alloc_batch(rq->xsk_pool, (struct xdp_buff **)wi->alloc_units,
+				     rq->mpwqe.pages_per_wqe);
+
+	/* If batch < pages_per_wqe, either:
+	 * 1. Some (or all) descriptors were invalid.
+	 * 2. dma_need_sync is true, and it fell back to allocating one frame.
+	 * In either case, try to continue allocating frames one by one, until
+	 * the first error, which will mean there are no more valid descriptors.
+	 */
+	for (; batch < rq->mpwqe.pages_per_wqe; batch++) {
+		wi->alloc_units[batch].xsk = xsk_buff_alloc(rq->xsk_pool);
+		if (unlikely(!wi->alloc_units[batch].xsk))
+			goto err_reuse_batch;
+	}
+
+	pi = mlx5e_icosq_get_next_pi(icosq, rq->mpwqe.umr_wqebbs);
+	umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi);
+	memcpy(umr_wqe, &rq->mpwqe.umr_wqe, sizeof(struct mlx5e_umr_wqe));
+
+	if (unlikely(rq->mpwqe.unaligned)) {
+		for (i = 0; i < batch; i++) {
+			dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk);
+
+			umr_wqe->inline_ksms[i] = (struct mlx5_ksm) {
+				.key = rq->mkey_be,
+				.va = cpu_to_be64(addr),
+			};
+		}
+	} else {
+		for (i = 0; i < batch; i++) {
+			dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk);
+
+			umr_wqe->inline_mtts[i] = (struct mlx5_mtt) {
+				.ptag = cpu_to_be64(addr | MLX5_EN_WR),
+			};
+		}
+	}
+
+	bitmap_zero(wi->xdp_xmit_bitmap, rq->mpwqe.pages_per_wqe);
+	wi->consumed_strides = 0;
+
+	umr_wqe->ctrl.opmod_idx_opcode =
+		cpu_to_be32((icosq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) | MLX5_OPCODE_UMR);
+
+	offset = ix * rq->mpwqe.mtts_per_wqe;
+	if (likely(!rq->mpwqe.unaligned))
+		offset = MLX5_ALIGNED_MTTS_OCTW(offset);
+	umr_wqe->uctrl.xlt_offset = cpu_to_be16(offset);
+
+	icosq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) {
+		.wqe_type = MLX5E_ICOSQ_WQE_UMR_RX,
+		.num_wqebbs = rq->mpwqe.umr_wqebbs,
+		.umr.rq = rq,
+	};
+
+	icosq->pc += rq->mpwqe.umr_wqebbs;
+
+	icosq->doorbell_cseg = &umr_wqe->ctrl;
+
+	return 0;
+
+err_reuse_batch:
+	while (--batch >= 0)
+		xsk_buff_free(wi->alloc_units[batch].xsk);
+
+err:
+	rq->stats->buff_alloc_err++;
+	return -ENOMEM;
+}
+
 int mlx5e_xsk_alloc_rx_wqes_batched(struct mlx5e_rq *rq, u16 ix, int wqe_bulk)
 {
 	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
@@ -112,7 +196,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
 	 */
 	WARN_ON_ONCE(head_offset);
 
-	xdp->data_end = xdp->data + cqe_bcnt;
+	xsk_buff_set_size(xdp, cqe_bcnt);
 	xdp_set_data_meta_invalid(xdp);
 	xsk_buff_dma_sync_for_cpu(xdp, rq->xsk_pool);
 	net_prefetch(xdp->data);
@@ -159,7 +243,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
 	 */
 	WARN_ON_ONCE(wi->offset);
 
-	xdp->data_end = xdp->data + cqe_bcnt;
+	xsk_buff_set_size(xdp, cqe_bcnt);
 	xdp_set_data_meta_invalid(xdp);
 	xsk_buff_dma_sync_for_cpu(xdp, rq->xsk_pool);
 	net_prefetch(xdp->data);
@@ -9,6 +9,7 @@
 
 /* RX data path */
 
+int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix);
 int mlx5e_xsk_alloc_rx_wqes_batched(struct mlx5e_rq *rq, u16 ix, int wqe_bulk);
 int mlx5e_xsk_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, int wqe_bulk);
 struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
@@ -75,13 +75,6 @@ const struct mlx5e_rx_handlers mlx5e_rx_handlers_nic = {
 	.handle_rx_cqe_mpwqe_shampo = mlx5e_handle_rx_cqe_mpwrq_shampo,
 };
 
-static struct mlx5e_mpw_info *mlx5e_get_mpw_info(struct mlx5e_rq *rq, int i)
-{
-	size_t isz = struct_size(rq->mpwqe.info, alloc_units, rq->mpwqe.pages_per_wqe);
-
-	return (struct mlx5e_mpw_info *)((char *)rq->mpwqe.info + array_size(i, isz));
-}
-
 static inline bool mlx5e_rx_hw_stamp(struct hwtstamp_config *config)
 {
 	return config->rx_filter == HWTSTAMP_FILTER_ALL;
@@ -668,15 +661,6 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 	int err;
 	int i;
 
-	/* Check in advance that we have enough frames, instead of allocating
-	 * one-by-one, failing and moving frames to the Reuse Ring.
-	 */
-	if (rq->xsk_pool &&
-	    unlikely(!xsk_buff_can_alloc(rq->xsk_pool, rq->mpwqe.pages_per_wqe))) {
-		err = -ENOMEM;
-		goto err;
-	}
-
 	if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state)) {
 		err = mlx5e_alloc_rx_hd_mpwqe(rq);
 		if (unlikely(err))
@@ -687,33 +671,16 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 	umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi);
 	memcpy(umr_wqe, &rq->mpwqe.umr_wqe, sizeof(struct mlx5e_umr_wqe));
 
-	if (unlikely(rq->mpwqe.unaligned)) {
-		for (i = 0; i < rq->mpwqe.pages_per_wqe; i++, au++) {
-			dma_addr_t addr;
-
-			err = mlx5e_page_alloc(rq, au);
-			if (unlikely(err))
-				goto err_unmap;
-			/* Unaligned means XSK. */
-			addr = xsk_buff_xdp_get_frame_dma(au->xsk);
-			umr_wqe->inline_ksms[i] = (struct mlx5_ksm) {
-				.key = rq->mkey_be,
-				.va = cpu_to_be64(addr),
-			};
-		}
-	} else {
-		for (i = 0; i < rq->mpwqe.pages_per_wqe; i++, au++) {
-			dma_addr_t addr;
-
-			err = mlx5e_page_alloc(rq, au);
-			if (unlikely(err))
-				goto err_unmap;
-			addr = rq->xsk_pool ? xsk_buff_xdp_get_frame_dma(au->xsk) :
-					      page_pool_get_dma_addr(au->page);
-			umr_wqe->inline_mtts[i] = (struct mlx5_mtt) {
-				.ptag = cpu_to_be64(addr | MLX5_EN_WR),
-			};
-		}
-	}
+	for (i = 0; i < rq->mpwqe.pages_per_wqe; i++, au++) {
+		dma_addr_t addr;
+
+		err = mlx5e_page_alloc_pool(rq, au);
+		if (unlikely(err))
+			goto err_unmap;
+		addr = page_pool_get_dma_addr(au->page);
+		umr_wqe->inline_mtts[i] = (struct mlx5_mtt) {
+			.ptag = cpu_to_be64(addr | MLX5_EN_WR),
+		};
+	}
 
 	bitmap_zero(wi->xdp_xmit_bitmap, rq->mpwqe.pages_per_wqe);
@@ -723,9 +690,7 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 		cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) |
 			    MLX5_OPCODE_UMR);
 
-	offset = ix * rq->mpwqe.mtts_per_wqe;
-	if (!rq->mpwqe.unaligned)
-		offset = MLX5_ALIGNED_MTTS_OCTW(offset);
+	offset = MLX5_ALIGNED_MTTS_OCTW(ix * rq->mpwqe.mtts_per_wqe);
 	umr_wqe->uctrl.xlt_offset = cpu_to_be16(offset);
 
 	sq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) {
@@ -1016,7 +981,8 @@ INDIRECT_CALLABLE_SCOPE bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
 	head = rq->mpwqe.actual_wq_head;
 	i = missing;
 	do {
-		alloc_err = mlx5e_alloc_rx_mpwqe(rq, head);
+		alloc_err = rq->xsk_pool ? mlx5e_xsk_alloc_rx_mpwqe(rq, head) :
+					   mlx5e_alloc_rx_mpwqe(rq, head);
+
 		if (unlikely(alloc_err))
 			break;