diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index f174c72480ab..87fb27ab9c24 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -638,7 +638,7 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring, if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB) kfree(tx_buffer->raw_buf); else if (ring_is_xdp(ring)) - page_frag_free(tx_buffer->raw_buf); + xdp_return_frame(tx_buffer->xdpf); else dev_kfree_skb_any(tx_buffer->skb); if (dma_unmap_len(tx_buffer, len)) @@ -841,7 +841,7 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi, /* free the skb/XDP data */ if (ring_is_xdp(tx_ring)) - page_frag_free(tx_buf->raw_buf); + xdp_return_frame(tx_buf->xdpf); else napi_consume_skb(tx_buf->skb, napi_budget); @@ -2203,9 +2203,20 @@ static bool i40e_is_non_eop(struct i40e_ring *rx_ring, #define I40E_XDP_CONSUMED 1 #define I40E_XDP_TX 2 -static int i40e_xmit_xdp_ring(struct xdp_buff *xdp, +static int i40e_xmit_xdp_ring(struct xdp_frame *xdpf, struct i40e_ring *xdp_ring); +static int i40e_xmit_xdp_tx_ring(struct xdp_buff *xdp, + struct i40e_ring *xdp_ring) +{ + struct xdp_frame *xdpf = convert_to_xdp_frame(xdp); + + if (unlikely(!xdpf)) + return I40E_XDP_CONSUMED; + + return i40e_xmit_xdp_ring(xdpf, xdp_ring); +} + /** * i40e_run_xdp - run an XDP program * @rx_ring: Rx ring being processed @@ -2225,13 +2236,15 @@ static struct sk_buff *i40e_run_xdp(struct i40e_ring *rx_ring, if (!xdp_prog) goto xdp_out; + prefetchw(xdp->data_hard_start); /* xdp_frame write */ + act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: break; case XDP_TX: xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index]; - result = i40e_xmit_xdp_ring(xdp, xdp_ring); + result = i40e_xmit_xdp_tx_ring(xdp, xdp_ring); break; case XDP_REDIRECT: err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); @@ -3478,13 +3491,13 @@ dma_error: * @xdp: data to transmit * @xdp_ring: XDP Tx ring **/ -static int i40e_xmit_xdp_ring(struct xdp_buff *xdp, +static int i40e_xmit_xdp_ring(struct xdp_frame *xdpf, struct i40e_ring *xdp_ring) { - u32 size = xdp->data_end - xdp->data; u16 i = xdp_ring->next_to_use; struct i40e_tx_buffer *tx_bi; struct i40e_tx_desc *tx_desc; + u32 size = xdpf->len; dma_addr_t dma; if (!unlikely(I40E_DESC_UNUSED(xdp_ring))) { @@ -3492,14 +3505,14 @@ static int i40e_xmit_xdp_ring(struct xdp_buff *xdp, return I40E_XDP_CONSUMED; } - dma = dma_map_single(xdp_ring->dev, xdp->data, size, DMA_TO_DEVICE); + dma = dma_map_single(xdp_ring->dev, xdpf->data, size, DMA_TO_DEVICE); if (dma_mapping_error(xdp_ring->dev, dma)) return I40E_XDP_CONSUMED; tx_bi = &xdp_ring->tx_bi[i]; tx_bi->bytecount = size; tx_bi->gso_segs = 1; - tx_bi->raw_buf = xdp->data; + tx_bi->xdpf = xdpf; /* record length, and DMA address */ dma_unmap_len_set(tx_bi, len, size); @@ -3675,7 +3688,7 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev) * * Returns Zero if sent, else an error code **/ -int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) +int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) { struct i40e_netdev_priv *np = netdev_priv(dev); unsigned int queue_index = smp_processor_id(); @@ -3688,7 +3701,7 @@ int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs) return -ENXIO; - err = i40e_xmit_xdp_ring(xdp, vsi->xdp_rings[queue_index]); + err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]); if (err != I40E_XDP_TX) return -ENOSPC; diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h index 3043483ec426..4bf318b8be85 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h @@ -306,6 +306,7 @@ static inline unsigned int i40e_txd_use_count(unsigned int size) struct i40e_tx_buffer { struct i40e_tx_desc *next_to_watch; union { + struct xdp_frame *xdpf; struct sk_buff *skb; void *raw_buf; }; @@ -510,7 +511,7 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw); void i40e_detect_recover_hung(struct i40e_vsi *vsi); int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size); bool __i40e_chk_linearize(struct sk_buff *skb); -int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp); +int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf); void i40e_xdp_flush(struct net_device *dev); /** diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index 4f08c712e58e..7dd5038cfcc4 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -241,8 +241,7 @@ struct ixgbe_tx_buffer { unsigned long time_stamp; union { struct sk_buff *skb; - /* XDP uses address ptr on irq_clean */ - void *data; + struct xdp_frame *xdpf; }; unsigned int bytecount; unsigned short gso_segs; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index afadba99f7b8..51e7d82a5860 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1216,7 +1216,7 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector, /* free the skb */ if (ring_is_xdp(tx_ring)) - page_frag_free(tx_buffer->data); + xdp_return_frame(tx_buffer->xdpf); else napi_consume_skb(tx_buffer->skb, napi_budget); @@ -2262,7 +2262,7 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring, #define IXGBE_XDP_TX 2 static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter, - struct xdp_buff *xdp); + struct xdp_frame *xdpf); static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter, struct ixgbe_ring *rx_ring, @@ -2270,6 +2270,7 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter, { int err, result = IXGBE_XDP_PASS; struct bpf_prog *xdp_prog; + struct xdp_frame *xdpf; u32 act; rcu_read_lock(); @@ -2278,12 +2279,19 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter, if (!xdp_prog) goto xdp_out; + prefetchw(xdp->data_hard_start); /* xdp_frame write */ + act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: break; case XDP_TX: - result = ixgbe_xmit_xdp_ring(adapter, xdp); + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) { + result = IXGBE_XDP_CONSUMED; + break; + } + result = ixgbe_xmit_xdp_ring(adapter, xdpf); break; case XDP_REDIRECT: err = xdp_do_redirect(adapter->netdev, xdp, xdp_prog); @@ -5797,7 +5805,7 @@ static void ixgbe_clean_tx_ring(struct ixgbe_ring *tx_ring) /* Free all the Tx ring sk_buffs */ if (ring_is_xdp(tx_ring)) - page_frag_free(tx_buffer->data); + xdp_return_frame(tx_buffer->xdpf); else dev_kfree_skb_any(tx_buffer->skb); @@ -6370,7 +6378,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter, struct device *dev = rx_ring->dev; int orig_node = dev_to_node(dev); int ring_node = -1; - int size; + int size, err; size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count; @@ -6407,6 +6415,13 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter, rx_ring->queue_index) < 0) goto err; + err = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, + MEM_TYPE_PAGE_SHARED, NULL); + if (err) { + xdp_rxq_info_unreg(&rx_ring->xdp_rxq); + goto err; + } + rx_ring->xdp_prog = adapter->xdp_prog; return 0; @@ -8336,7 +8351,7 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, } static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter, - struct xdp_buff *xdp) + struct xdp_frame *xdpf) { struct ixgbe_ring *ring = adapter->xdp_ring[smp_processor_id()]; struct ixgbe_tx_buffer *tx_buffer; @@ -8345,12 +8360,12 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter, dma_addr_t dma; u16 i; - len = xdp->data_end - xdp->data; + len = xdpf->len; if (unlikely(!ixgbe_desc_unused(ring))) return IXGBE_XDP_CONSUMED; - dma = dma_map_single(ring->dev, xdp->data, len, DMA_TO_DEVICE); + dma = dma_map_single(ring->dev, xdpf->data, len, DMA_TO_DEVICE); if (dma_mapping_error(ring->dev, dma)) return IXGBE_XDP_CONSUMED; @@ -8365,7 +8380,8 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter, dma_unmap_len_set(tx_buffer, len, len); dma_unmap_addr_set(tx_buffer, dma, dma); - tx_buffer->data = xdp->data; + tx_buffer->xdpf = xdpf; + tx_desc->read.buffer_addr = cpu_to_le64(dma); /* put descriptor type bits */ @@ -9996,7 +10012,7 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp) } } -static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) +static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) { struct ixgbe_adapter *adapter = netdev_priv(dev); struct ixgbe_ring *ring; @@ -10012,7 +10028,7 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) if (unlikely(!ring)) return -ENXIO; - err = ixgbe_xmit_xdp_ring(adapter, xdp); + err = ixgbe_xmit_xdp_ring(adapter, xdpf); if (err != IXGBE_XDP_TX) return -ENOSPC; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index c032319f1cb9..12257034131e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -30,6 +30,7 @@ config MLX5_CORE_EN bool "Mellanox Technologies ConnectX-4 Ethernet support" depends on NETDEVICES && ETHERNET && INET && PCI && MLX5_CORE depends on IPV6=y || IPV6=n || MLX5_CORE=m + select PAGE_POOL default n ---help--- Ethernet support in Mellanox Technologies ConnectX-4 NIC. diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 30cad07be2b5..3317a4da87cb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -53,6 +53,8 @@ #include "mlx5_core.h" #include "en_stats.h" +struct page_pool; + #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v) #define MLX5E_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN) @@ -392,6 +394,7 @@ struct mlx5e_xdpsq { struct { struct mlx5e_dma_info *di; bool doorbell; + bool redirect_flush; } db; /* read only */ @@ -533,6 +536,7 @@ struct mlx5e_rq { unsigned int hw_mtu; struct mlx5e_xdpsq xdpsq; DECLARE_BITMAP(flags, 8); + struct page_pool *page_pool; /* control */ struct mlx5_wq_ctrl wq_ctrl; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index b29c1d93f058..f10037419978 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "eswitch.h" #include "en.h" #include "en_tc.h" @@ -389,10 +390,11 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, struct mlx5e_rq_param *rqp, struct mlx5e_rq *rq) { + struct page_pool_params pp_params = { 0 }; struct mlx5_core_dev *mdev = c->mdev; void *rqc = rqp->rqc; void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq); - u32 byte_count; + u32 byte_count, pool_size; int npages; int wq_sz; int err; @@ -432,9 +434,12 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE; rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params); + pool_size = 1 << params->log_rq_mtu_frames; switch (rq->wq_type) { case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: + + pool_size = MLX5_MPWRQ_PAGES_PER_WQE << mlx5e_mpwqe_get_log_rq_size(params); rq->post_wqes = mlx5e_post_rx_mpwqes; rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe; @@ -512,6 +517,32 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, rq->mkey_be = c->mkey_be; } + /* Create a page_pool and register it with rxq */ + pp_params.order = rq->buff.page_order; + pp_params.flags = 0; /* No-internal DMA mapping in page_pool */ + pp_params.pool_size = pool_size; + pp_params.nid = cpu_to_node(c->cpu); + pp_params.dev = c->pdev; + pp_params.dma_dir = rq->buff.map_dir; + + /* page_pool can be used even when there is no rq->xdp_prog, + * given page_pool does not handle DMA mapping there is no + * required state to clear. And page_pool gracefully handle + * elevated refcnt. + */ + rq->page_pool = page_pool_create(&pp_params); + if (IS_ERR(rq->page_pool)) { + if (rq->wq_type != MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) + kfree(rq->wqe.frag_info); + err = PTR_ERR(rq->page_pool); + rq->page_pool = NULL; + goto err_rq_wq_destroy; + } + err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, + MEM_TYPE_PAGE_POOL, rq->page_pool); + if (err) + goto err_rq_wq_destroy; + for (i = 0; i < wq_sz; i++) { struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i); @@ -548,6 +579,8 @@ err_rq_wq_destroy: if (rq->xdp_prog) bpf_prog_put(rq->xdp_prog); xdp_rxq_info_unreg(&rq->xdp_rxq); + if (rq->page_pool) + page_pool_destroy(rq->page_pool); mlx5_wq_destroy(&rq->wq_ctrl); return err; @@ -561,6 +594,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq) bpf_prog_put(rq->xdp_prog); xdp_rxq_info_unreg(&rq->xdp_rxq); + if (rq->page_pool) + page_pool_destroy(rq->page_pool); switch (rq->wq_type) { case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 176645762e49..7bbf0db27a01 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "en.h" #include "en_tc.h" #include "eswitch.h" @@ -221,7 +222,7 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq, if (mlx5e_rx_cache_get(rq, dma_info)) return 0; - dma_info->page = dev_alloc_pages(rq->buff.page_order); + dma_info->page = page_pool_dev_alloc_pages(rq->page_pool); if (unlikely(!dma_info->page)) return -ENOMEM; @@ -236,15 +237,26 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq, return 0; } +static void mlx5e_page_dma_unmap(struct mlx5e_rq *rq, + struct mlx5e_dma_info *dma_info) +{ + dma_unmap_page(rq->pdev, dma_info->addr, RQ_PAGE_SIZE(rq), + rq->buff.map_dir); +} + void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info, bool recycle) { - if (likely(recycle) && mlx5e_rx_cache_put(rq, dma_info)) - return; + if (likely(recycle)) { + if (mlx5e_rx_cache_put(rq, dma_info)) + return; - dma_unmap_page(rq->pdev, dma_info->addr, RQ_PAGE_SIZE(rq), - rq->buff.map_dir); - put_page(dma_info->page); + mlx5e_page_dma_unmap(rq, dma_info); + page_pool_recycle_direct(rq->page_pool, dma_info->page); + } else { + mlx5e_page_dma_unmap(rq, dma_info); + put_page(dma_info->page); + } } static inline bool mlx5e_page_reuse(struct mlx5e_rq *rq, @@ -800,9 +812,10 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di, void *va, u16 *rx_headroom, u32 *len) { - const struct bpf_prog *prog = READ_ONCE(rq->xdp_prog); + struct bpf_prog *prog = READ_ONCE(rq->xdp_prog); struct xdp_buff xdp; u32 act; + int err; if (!prog) return false; @@ -823,6 +836,15 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq, if (unlikely(!mlx5e_xmit_xdp_frame(rq, di, &xdp))) trace_xdp_exception(rq->netdev, prog, act); return true; + case XDP_REDIRECT: + /* When XDP enabled then page-refcnt==1 here */ + err = xdp_do_redirect(rq->netdev, &xdp, prog); + if (!err) { + __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); + rq->xdpsq.db.redirect_flush = true; + mlx5e_page_dma_unmap(rq, di); + } + return true; default: bpf_warn_invalid_xdp_action(act); case XDP_ABORTED: @@ -868,6 +890,7 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, dma_sync_single_range_for_cpu(rq->pdev, di->addr, wi->offset, frag_size, DMA_FROM_DEVICE); + prefetchw(va); /* xdp_frame data area */ prefetch(data); wi->offset += frag_size; @@ -1140,6 +1163,11 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget) xdpsq->db.doorbell = false; } + if (xdpsq->db.redirect_flush) { + xdp_do_flush_map(); + xdpsq->db.redirect_flush = false; + } + mlx5_cqwq_update_db_record(&cq->wq); /* ensure cq space is freed before enabling more cqes */ diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 28583aa0c17d..1e58be152d5c 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -248,11 +248,11 @@ struct veth { __be16 h_vlan_TCI; }; -bool tun_is_xdp_buff(void *ptr) +bool tun_is_xdp_frame(void *ptr) { return (unsigned long)ptr & TUN_XDP_FLAG; } -EXPORT_SYMBOL(tun_is_xdp_buff); +EXPORT_SYMBOL(tun_is_xdp_frame); void *tun_xdp_to_ptr(void *ptr) { @@ -660,10 +660,10 @@ void tun_ptr_free(void *ptr) { if (!ptr) return; - if (tun_is_xdp_buff(ptr)) { - struct xdp_buff *xdp = tun_ptr_to_xdp(ptr); + if (tun_is_xdp_frame(ptr)) { + struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); - put_page(virt_to_head_page(xdp->data)); + xdp_return_frame(xdpf); } else { __skb_array_destroy_skb(ptr); } @@ -854,6 +854,12 @@ static int tun_attach(struct tun_struct *tun, struct file *file, tun->dev, tfile->queue_index); if (err < 0) goto out; + err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq, + MEM_TYPE_PAGE_SHARED, NULL); + if (err < 0) { + xdp_rxq_info_unreg(&tfile->xdp_rxq); + goto out; + } err = 0; } @@ -1295,21 +1301,13 @@ static const struct net_device_ops tun_netdev_ops = { .ndo_get_stats64 = tun_net_get_stats64, }; -static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) +static int tun_xdp_xmit(struct net_device *dev, struct xdp_frame *frame) { struct tun_struct *tun = netdev_priv(dev); - struct xdp_buff *buff = xdp->data_hard_start; - int headroom = xdp->data - xdp->data_hard_start; struct tun_file *tfile; u32 numqueues; int ret = 0; - /* Assure headroom is available and buff is properly aligned */ - if (unlikely(headroom < sizeof(*xdp) || tun_is_xdp_buff(xdp))) - return -ENOSPC; - - *buff = *xdp; - rcu_read_lock(); numqueues = READ_ONCE(tun->numqueues); @@ -1323,7 +1321,7 @@ static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) /* Encode the XDP flag into lowest bit for consumer to differ * XDP buffer from sk_buff. */ - if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(buff))) { + if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(frame))) { this_cpu_inc(tun->pcpu_stats->tx_dropped); ret = -ENOSPC; } @@ -1333,6 +1331,16 @@ out: return ret; } +static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) +{ + struct xdp_frame *frame = convert_to_xdp_frame(xdp); + + if (unlikely(!frame)) + return -EOVERFLOW; + + return tun_xdp_xmit(dev, frame); +} + static void tun_xdp_flush(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); @@ -1680,7 +1688,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, case XDP_TX: get_page(alloc_frag->page); alloc_frag->offset += buflen; - if (tun_xdp_xmit(tun->dev, &xdp)) + if (tun_xdp_tx(tun->dev, &xdp)) goto err_redirect; tun_xdp_flush(tun->dev); rcu_read_unlock(); @@ -2001,11 +2009,11 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) static ssize_t tun_put_user_xdp(struct tun_struct *tun, struct tun_file *tfile, - struct xdp_buff *xdp, + struct xdp_frame *xdp_frame, struct iov_iter *iter) { int vnet_hdr_sz = 0; - size_t size = xdp->data_end - xdp->data; + size_t size = xdp_frame->len; struct tun_pcpu_stats *stats; size_t ret; @@ -2021,7 +2029,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun, iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso)); } - ret = copy_to_iter(xdp->data, size, iter) + vnet_hdr_sz; + ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz; stats = get_cpu_ptr(tun->pcpu_stats); u64_stats_update_begin(&stats->syncp); @@ -2189,11 +2197,11 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, return err; } - if (tun_is_xdp_buff(ptr)) { - struct xdp_buff *xdp = tun_ptr_to_xdp(ptr); + if (tun_is_xdp_frame(ptr)) { + struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); - ret = tun_put_user_xdp(tun, tfile, xdp, to); - put_page(virt_to_head_page(xdp->data)); + ret = tun_put_user_xdp(tun, tfile, xdpf, to); + xdp_return_frame(xdpf); } else { struct sk_buff *skb = ptr; @@ -2432,10 +2440,10 @@ out_free: static int tun_ptr_peek_len(void *ptr) { if (likely(ptr)) { - if (tun_is_xdp_buff(ptr)) { - struct xdp_buff *xdp = tun_ptr_to_xdp(ptr); + if (tun_is_xdp_frame(ptr)) { + struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); - return xdp->data_end - xdp->data; + return xdpf->len; } return __skb_array_len_with_tag(ptr); } else { diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 7b187ec7411e..01694e26f03e 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -415,46 +415,51 @@ static void virtnet_xdp_flush(struct net_device *dev) virtqueue_kick(sq->vq); } -static bool __virtnet_xdp_xmit(struct virtnet_info *vi, - struct xdp_buff *xdp) +static int __virtnet_xdp_xmit(struct virtnet_info *vi, + struct xdp_frame *xdpf) { struct virtio_net_hdr_mrg_rxbuf *hdr; - unsigned int len; + struct xdp_frame *xdpf_sent; struct send_queue *sq; + unsigned int len; unsigned int qp; - void *xdp_sent; int err; qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id(); sq = &vi->sq[qp]; /* Free up any pending old buffers before queueing new ones. */ - while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) { - struct page *sent_page = virt_to_head_page(xdp_sent); + while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) + xdp_return_frame(xdpf_sent); - put_page(sent_page); - } + /* virtqueue want to use data area in-front of packet */ + if (unlikely(xdpf->metasize > 0)) + return -EOPNOTSUPP; - xdp->data -= vi->hdr_len; + if (unlikely(xdpf->headroom < vi->hdr_len)) + return -EOVERFLOW; + + /* Make room for virtqueue hdr (also change xdpf->headroom?) */ + xdpf->data -= vi->hdr_len; /* Zero header and leave csum up to XDP layers */ - hdr = xdp->data; + hdr = xdpf->data; memset(hdr, 0, vi->hdr_len); + xdpf->len += vi->hdr_len; - sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data); + sg_init_one(sq->sg, xdpf->data, xdpf->len); - err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp->data, GFP_ATOMIC); + err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdpf, GFP_ATOMIC); if (unlikely(err)) - return false; /* Caller handle free/refcnt */ + return -ENOSPC; /* Caller handle free/refcnt */ - return true; + return 0; } -static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) +static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) { struct virtnet_info *vi = netdev_priv(dev); struct receive_queue *rq = vi->rq; struct bpf_prog *xdp_prog; - bool sent; /* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this * indicate XDP resources have been successfully allocated. @@ -463,10 +468,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) if (!xdp_prog) return -ENXIO; - sent = __virtnet_xdp_xmit(vi, xdp); - if (!sent) - return -ENOSPC; - return 0; + return __virtnet_xdp_xmit(vi, xdpf); } static unsigned int virtnet_get_headroom(struct virtnet_info *vi) @@ -555,7 +557,6 @@ static struct sk_buff *receive_small(struct net_device *dev, struct page *page = virt_to_head_page(buf); unsigned int delta = 0; struct page *xdp_page; - bool sent; int err; len -= vi->hdr_len; @@ -564,6 +565,7 @@ static struct sk_buff *receive_small(struct net_device *dev, xdp_prog = rcu_dereference(rq->xdp_prog); if (xdp_prog) { struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset; + struct xdp_frame *xdpf; struct xdp_buff xdp; void *orig_data; u32 act; @@ -606,8 +608,11 @@ static struct sk_buff *receive_small(struct net_device *dev, delta = orig_data - xdp.data; break; case XDP_TX: - sent = __virtnet_xdp_xmit(vi, &xdp); - if (unlikely(!sent)) { + xdpf = convert_to_xdp_frame(&xdp); + if (unlikely(!xdpf)) + goto err_xdp; + err = __virtnet_xdp_xmit(vi, xdpf); + if (unlikely(err)) { trace_xdp_exception(vi->dev, xdp_prog, act); goto err_xdp; } @@ -690,7 +695,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, struct bpf_prog *xdp_prog; unsigned int truesize; unsigned int headroom = mergeable_ctx_to_headroom(ctx); - bool sent; int err; head_skb = NULL; @@ -698,6 +702,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); if (xdp_prog) { + struct xdp_frame *xdpf; struct page *xdp_page; struct xdp_buff xdp; void *data; @@ -762,8 +767,11 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, } break; case XDP_TX: - sent = __virtnet_xdp_xmit(vi, &xdp); - if (unlikely(!sent)) { + xdpf = convert_to_xdp_frame(&xdp); + if (unlikely(!xdpf)) + goto err_xdp; + err = __virtnet_xdp_xmit(vi, xdpf); + if (unlikely(err)) { trace_xdp_exception(vi->dev, xdp_prog, act); if (unlikely(xdp_page != page)) put_page(xdp_page); @@ -1301,6 +1309,13 @@ static int virtnet_open(struct net_device *dev) if (err < 0) return err; + err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq, + MEM_TYPE_PAGE_SHARED, NULL); + if (err < 0) { + xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq); + return err; + } + virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi); virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi); } diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 986058a57917..bbf38befefb2 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -32,6 +32,7 @@ #include #include +#include #include "vhost.h" @@ -181,10 +182,10 @@ static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq) static int vhost_net_buf_peek_len(void *ptr) { - if (tun_is_xdp_buff(ptr)) { - struct xdp_buff *xdp = tun_ptr_to_xdp(ptr); + if (tun_is_xdp_frame(ptr)) { + struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); - return xdp->data_end - xdp->data; + return xdpf->len; } return __skb_array_len_with_tag(ptr); diff --git a/include/linux/filter.h b/include/linux/filter.h index fc4e8f91b03d..4da8b2308174 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -30,6 +30,7 @@ struct sock; struct seccomp_data; struct bpf_prog_aux; struct xdp_rxq_info; +struct xdp_buff; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -500,14 +501,6 @@ struct bpf_skb_data_end { void *data_end; }; -struct xdp_buff { - void *data; - void *data_end; - void *data_meta; - void *data_hard_start; - struct xdp_rxq_info *rxq; -}; - struct sk_msg_buff { void *data; void *data_end; @@ -772,21 +765,6 @@ int xdp_do_redirect(struct net_device *dev, struct bpf_prog *prog); void xdp_do_flush_map(void); -/* Drivers not supporting XDP metadata can use this helper, which - * rejects any room expansion for metadata as a result. - */ -static __always_inline void -xdp_set_data_meta_invalid(struct xdp_buff *xdp) -{ - xdp->data_meta = xdp->data + 1; -} - -static __always_inline bool -xdp_data_meta_unsupported(const struct xdp_buff *xdp) -{ - return unlikely(xdp->data_meta > xdp->data); -} - void bpf_warn_invalid_xdp_action(u32 act); struct sock *do_sk_redirect_map(struct sk_buff *skb); diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index fd00170b494f..3d2996dc7d85 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -22,7 +22,7 @@ #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) struct socket *tun_get_socket(struct file *); struct ptr_ring *tun_get_tx_ring(struct file *file); -bool tun_is_xdp_buff(void *ptr); +bool tun_is_xdp_frame(void *ptr); void *tun_xdp_to_ptr(void *ptr); void *tun_ptr_to_xdp(void *ptr); void tun_ptr_free(void *ptr); @@ -39,7 +39,7 @@ static inline struct ptr_ring *tun_get_tx_ring(struct file *f) { return ERR_PTR(-EINVAL); } -static inline bool tun_is_xdp_buff(void *ptr) +static inline bool tun_is_xdp_frame(void *ptr) { return false; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cf44503ea81a..14e0777ffcfb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1165,7 +1165,7 @@ struct dev_ifalias { * This function is used to set or query state related to XDP on the * netdevice and manage BPF offload. See definition of * enum bpf_netdev_command for details. - * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp); + * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp); * This function is used to submit a XDP packet for transmit on a * netdevice. * void (*ndo_xdp_flush)(struct net_device *dev); @@ -1356,7 +1356,7 @@ struct net_device_ops { int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); int (*ndo_xdp_xmit)(struct net_device *dev, - struct xdp_buff *xdp); + struct xdp_frame *xdp); void (*ndo_xdp_flush)(struct net_device *dev); }; diff --git a/include/net/page_pool.h b/include/net/page_pool.h new file mode 100644 index 000000000000..c79087153148 --- /dev/null +++ b/include/net/page_pool.h @@ -0,0 +1,143 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * page_pool.h + * Author: Jesper Dangaard Brouer + * Copyright (C) 2016 Red Hat, Inc. + */ + +/** + * DOC: page_pool allocator + * + * This page_pool allocator is optimized for the XDP mode that + * uses one-frame-per-page, but have fallbacks that act like the + * regular page allocator APIs. + * + * Basic use involve replacing alloc_pages() calls with the + * page_pool_alloc_pages() call. Drivers should likely use + * page_pool_dev_alloc_pages() replacing dev_alloc_pages(). + * + * If page_pool handles DMA mapping (use page->private), then API user + * is responsible for invoking page_pool_put_page() once. In-case of + * elevated refcnt, the DMA state is released, assuming other users of + * the page will eventually call put_page(). + * + * If no DMA mapping is done, then it can act as shim-layer that + * fall-through to alloc_page. As no state is kept on the page, the + * regular put_page() call is sufficient. + */ +#ifndef _NET_PAGE_POOL_H +#define _NET_PAGE_POOL_H + +#include /* Needed by ptr_ring */ +#include +#include + +#define PP_FLAG_DMA_MAP 1 /* Should page_pool do the DMA map/unmap */ +#define PP_FLAG_ALL PP_FLAG_DMA_MAP + +/* + * Fast allocation side cache array/stack + * + * The cache size and refill watermark is related to the network + * use-case. The NAPI budget is 64 packets. After a NAPI poll the RX + * ring is usually refilled and the max consumed elements will be 64, + * thus a natural max size of objects needed in the cache. + * + * Keeping room for more objects, is due to XDP_DROP use-case. As + * XDP_DROP allows the opportunity to recycle objects directly into + * this array, as it shares the same softirq/NAPI protection. If + * cache is already full (or partly full) then the XDP_DROP recycles + * would have to take a slower code path. + */ +#define PP_ALLOC_CACHE_SIZE 128 +#define PP_ALLOC_CACHE_REFILL 64 +struct pp_alloc_cache { + u32 count; + void *cache[PP_ALLOC_CACHE_SIZE]; +}; + +struct page_pool_params { + unsigned int flags; + unsigned int order; + unsigned int pool_size; + int nid; /* Numa node id to allocate from pages from */ + struct device *dev; /* device, for DMA pre-mapping purposes */ + enum dma_data_direction dma_dir; /* DMA mapping direction */ +}; + +struct page_pool { + struct rcu_head rcu; + struct page_pool_params p; + + /* + * Data structure for allocation side + * + * Drivers allocation side usually already perform some kind + * of resource protection. Piggyback on this protection, and + * require driver to protect allocation side. + * + * For NIC drivers this means, allocate a page_pool per + * RX-queue. As the RX-queue is already protected by + * Softirq/BH scheduling and napi_schedule. NAPI schedule + * guarantee that a single napi_struct will only be scheduled + * on a single CPU (see napi_schedule). + */ + struct pp_alloc_cache alloc ____cacheline_aligned_in_smp; + + /* Data structure for storing recycled pages. + * + * Returning/freeing pages is more complicated synchronization + * wise, because free's can happen on remote CPUs, with no + * association with allocation resource. + * + * Use ptr_ring, as it separates consumer and producer + * effeciently, it a way that doesn't bounce cache-lines. + * + * TODO: Implement bulk return pages into this structure. + */ + struct ptr_ring ring; +}; + +struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); + +static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) +{ + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); + + return page_pool_alloc_pages(pool, gfp); +} + +struct page_pool *page_pool_create(const struct page_pool_params *params); + +void page_pool_destroy(struct page_pool *pool); + +/* Never call this directly, use helpers below */ +void __page_pool_put_page(struct page_pool *pool, + struct page *page, bool allow_direct); + +static inline void page_pool_put_page(struct page_pool *pool, struct page *page) +{ + /* When page_pool isn't compiled-in, net/core/xdp.c doesn't + * allow registering MEM_TYPE_PAGE_POOL, but shield linker. + */ +#ifdef CONFIG_PAGE_POOL + __page_pool_put_page(pool, page, false); +#endif +} +/* Very limited use-cases allow recycle direct */ +static inline void page_pool_recycle_direct(struct page_pool *pool, + struct page *page) +{ + __page_pool_put_page(pool, page, true); +} + +static inline bool is_page_pool_compiled_in(void) +{ +#ifdef CONFIG_PAGE_POOL + return true; +#else + return false; +#endif +} + +#endif /* _NET_PAGE_POOL_H */ diff --git a/include/net/xdp.h b/include/net/xdp.h index b2362ddfa694..137ad5f9f40f 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -33,16 +33,99 @@ * also mandatory during RX-ring setup. */ +enum xdp_mem_type { + MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ + MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ + MEM_TYPE_PAGE_POOL, + MEM_TYPE_MAX, +}; + +struct xdp_mem_info { + u32 type; /* enum xdp_mem_type, but known size type */ + u32 id; +}; + +struct page_pool; + struct xdp_rxq_info { struct net_device *dev; u32 queue_index; u32 reg_state; + struct xdp_mem_info mem; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ +struct xdp_buff { + void *data; + void *data_end; + void *data_meta; + void *data_hard_start; + struct xdp_rxq_info *rxq; +}; + +struct xdp_frame { + void *data; + u16 len; + u16 headroom; + u16 metasize; + /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, + * while mem info is valid on remote CPU. + */ + struct xdp_mem_info mem; + struct net_device *dev_rx; /* used by cpumap */ +}; + +/* Convert xdp_buff to xdp_frame */ +static inline +struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) +{ + struct xdp_frame *xdp_frame; + int metasize; + int headroom; + + /* Assure headroom is available for storing info */ + headroom = xdp->data - xdp->data_hard_start; + metasize = xdp->data - xdp->data_meta; + metasize = metasize > 0 ? metasize : 0; + if (unlikely((headroom - metasize) < sizeof(*xdp_frame))) + return NULL; + + /* Store info in top of packet */ + xdp_frame = xdp->data_hard_start; + + xdp_frame->data = xdp->data; + xdp_frame->len = xdp->data_end - xdp->data; + xdp_frame->headroom = headroom - sizeof(*xdp_frame); + xdp_frame->metasize = metasize; + + /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ + xdp_frame->mem = xdp->rxq->mem; + + return xdp_frame; +} + +void xdp_return_frame(struct xdp_frame *xdpf); + int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); +int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, + enum xdp_mem_type type, void *allocator); + +/* Drivers not supporting XDP metadata can use this helper, which + * rejects any room expansion for metadata as a result. + */ +static __always_inline void +xdp_set_data_meta_invalid(struct xdp_buff *xdp) +{ + xdp->data_meta = xdp->data + 1; +} + +static __always_inline bool +xdp_data_meta_unsupported(const struct xdp_buff *xdp) +{ + return unlikely(xdp->data_meta > xdp->data); +} #endif /* __LINUX_NET_XDP_H__ */ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a4bb0b34375a..c95b04ec103e 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -137,27 +138,6 @@ free_cmap: return ERR_PTR(err); } -static void __cpu_map_queue_destructor(void *ptr) -{ - /* The tear-down procedure should have made sure that queue is - * empty. See __cpu_map_entry_replace() and work-queue - * invoked cpu_map_kthread_stop(). Catch any broken behaviour - * gracefully and warn once. - */ - if (WARN_ON_ONCE(ptr)) - page_frag_free(ptr); -} - -static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) -{ - if (atomic_dec_and_test(&rcpu->refcnt)) { - /* The queue should be empty at this point */ - ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor); - kfree(rcpu->queue); - kfree(rcpu); - } -} - static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) { atomic_inc(&rcpu->refcnt); @@ -179,45 +159,8 @@ static void cpu_map_kthread_stop(struct work_struct *work) kthread_stop(rcpu->kthread); } -/* For now, xdp_pkt is a cpumap internal data structure, with info - * carried between enqueue to dequeue. It is mapped into the top - * headroom of the packet, to avoid allocating separate mem. - */ -struct xdp_pkt { - void *data; - u16 len; - u16 headroom; - u16 metasize; - struct net_device *dev_rx; -}; - -/* Convert xdp_buff to xdp_pkt */ -static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) -{ - struct xdp_pkt *xdp_pkt; - int metasize; - int headroom; - - /* Assure headroom is available for storing info */ - headroom = xdp->data - xdp->data_hard_start; - metasize = xdp->data - xdp->data_meta; - metasize = metasize > 0 ? metasize : 0; - if (unlikely((headroom - metasize) < sizeof(*xdp_pkt))) - return NULL; - - /* Store info in top of packet */ - xdp_pkt = xdp->data_hard_start; - - xdp_pkt->data = xdp->data; - xdp_pkt->len = xdp->data_end - xdp->data; - xdp_pkt->headroom = headroom - sizeof(*xdp_pkt); - xdp_pkt->metasize = metasize; - - return xdp_pkt; -} - static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, - struct xdp_pkt *xdp_pkt) + struct xdp_frame *xdpf) { unsigned int frame_size; void *pkt_data_start; @@ -232,7 +175,7 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * would be preferred to set frame_size to 2048 or 4096 * depending on the driver. * frame_size = 2048; - * frame_len = frame_size - sizeof(*xdp_pkt); + * frame_len = frame_size - sizeof(*xdp_frame); * * Instead, with info avail, skb_shared_info in placed after * packet len. This, unfortunately fakes the truesize. @@ -240,21 +183,21 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * is not at a fixed memory location, with mixed length * packets, which is bad for cache-line hotness. */ - frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom + + frame_size = SKB_DATA_ALIGN(xdpf->len) + xdpf->headroom + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - pkt_data_start = xdp_pkt->data - xdp_pkt->headroom; + pkt_data_start = xdpf->data - xdpf->headroom; skb = build_skb(pkt_data_start, frame_size); if (!skb) return NULL; - skb_reserve(skb, xdp_pkt->headroom); - __skb_put(skb, xdp_pkt->len); - if (xdp_pkt->metasize) - skb_metadata_set(skb, xdp_pkt->metasize); + skb_reserve(skb, xdpf->headroom); + __skb_put(skb, xdpf->len); + if (xdpf->metasize) + skb_metadata_set(skb, xdpf->metasize); /* Essential SKB info: protocol and skb->dev */ - skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx); + skb->protocol = eth_type_trans(skb, xdpf->dev_rx); /* Optional SKB info, currently missing: * - HW checksum info (skb->ip_summed) @@ -265,6 +208,31 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, return skb; } +static void __cpu_map_ring_cleanup(struct ptr_ring *ring) +{ + /* The tear-down procedure should have made sure that queue is + * empty. See __cpu_map_entry_replace() and work-queue + * invoked cpu_map_kthread_stop(). Catch any broken behaviour + * gracefully and warn once. + */ + struct xdp_frame *xdpf; + + while ((xdpf = ptr_ring_consume(ring))) + if (WARN_ON_ONCE(xdpf)) + xdp_return_frame(xdpf); +} + +static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) +{ + if (atomic_dec_and_test(&rcpu->refcnt)) { + /* The queue should be empty at this point */ + __cpu_map_ring_cleanup(rcpu->queue); + ptr_ring_cleanup(rcpu->queue, NULL); + kfree(rcpu->queue); + kfree(rcpu); + } +} + static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; @@ -278,7 +246,7 @@ static int cpu_map_kthread_run(void *data) */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { unsigned int processed = 0, drops = 0, sched = 0; - struct xdp_pkt *xdp_pkt; + struct xdp_frame *xdpf; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -301,13 +269,13 @@ static int cpu_map_kthread_run(void *data) * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. */ - while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) { + while ((xdpf = __ptr_ring_consume(rcpu->queue))) { struct sk_buff *skb; int ret; - skb = cpu_map_build_skb(rcpu, xdp_pkt); + skb = cpu_map_build_skb(rcpu, xdpf); if (!skb) { - page_frag_free(xdp_pkt); + xdp_return_frame(xdpf); continue; } @@ -604,13 +572,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, spin_lock(&q->producer_lock); for (i = 0; i < bq->count; i++) { - void *xdp_pkt = bq->q[i]; + struct xdp_frame *xdpf = bq->q[i]; int err; - err = __ptr_ring_produce(q, xdp_pkt); + err = __ptr_ring_produce(q, xdpf); if (err) { drops++; - page_frag_free(xdp_pkt); /* Free xdp_pkt */ + xdp_return_frame(xdpf); } processed++; } @@ -625,7 +593,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ -static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) +static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); @@ -636,28 +604,28 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) * driver to code invoking us to finished, due to driver * (e.g. ixgbe) recycle tricks based on page-refcnt. * - * Thus, incoming xdp_pkt is always queued here (else we race + * Thus, incoming xdp_frame is always queued here (else we race * with another CPU on page-refcnt and remaining driver code). * Queue time is very short, as driver will invoke flush * operation, when completing napi->poll call. */ - bq->q[bq->count++] = xdp_pkt; + bq->q[bq->count++] = xdpf; return 0; } int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx) { - struct xdp_pkt *xdp_pkt; + struct xdp_frame *xdpf; - xdp_pkt = convert_to_xdp_pkt(xdp); - if (unlikely(!xdp_pkt)) + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) return -EOVERFLOW; /* Info needed when constructing SKB on remote CPU */ - xdp_pkt->dev_rx = dev_rx; + xdpf->dev_rx = dev_rx; - bq_enqueue(rcpu, xdp_pkt); + bq_enqueue(rcpu, xdpf); return 0; } diff --git a/net/Kconfig b/net/Kconfig index 0428f12c25c2..6fa1a4493b8c 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -423,6 +423,9 @@ config MAY_USE_DEVLINK on MAY_USE_DEVLINK to ensure they do not cause link errors when devlink is a loadable module and the driver using it is built-in. +config PAGE_POOL + bool + endif # if NET # Used by archs to tell that they support BPF JIT compiler plus which flavour. diff --git a/net/core/Makefile b/net/core/Makefile index 6dbbba8c57ae..7080417f8bc8 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -14,6 +14,7 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ fib_notifier.o xdp.o obj-y += net-sysfs.o +obj-$(CONFIG_PAGE_POOL) += page_pool.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o diff --git a/net/core/filter.c b/net/core/filter.c index d31aff93270d..a374b8560bc4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2692,6 +2692,7 @@ static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) { + void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); unsigned long metalen = xdp_get_metalen(xdp); void *data_start = xdp->data_hard_start + metalen; void *data = xdp->data + offset; @@ -2700,6 +2701,13 @@ BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) data > xdp->data_end - ETH_HLEN)) return -EINVAL; + /* Avoid info leak, when reusing area prev used by xdp_frame */ + if (data < xdp_frame_end) { + unsigned long clearlen = xdp_frame_end - data; + + memset(data, 0, clearlen); + } + if (metalen) memmove(xdp->data_meta + offset, xdp->data_meta, metalen); @@ -2749,13 +2757,18 @@ static int __bpf_tx_xdp(struct net_device *dev, struct xdp_buff *xdp, u32 index) { + struct xdp_frame *xdpf; int err; if (!dev->netdev_ops->ndo_xdp_xmit) { return -EOPNOTSUPP; } - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) + return -EOVERFLOW; + + err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); if (err) return err; dev->netdev_ops->ndo_xdp_flush(dev); @@ -2771,11 +2784,19 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, if (map->map_type == BPF_MAP_TYPE_DEVMAP) { struct net_device *dev = fwd; + struct xdp_frame *xdpf; if (!dev->netdev_ops->ndo_xdp_xmit) return -EOPNOTSUPP; - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) + return -EOVERFLOW; + + /* TODO: move to inside map code instead, for bulk support + * err = dev_map_enqueue(dev, xdp); + */ + err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); if (err) return err; __dev_map_insert_ctx(map, index); diff --git a/net/core/page_pool.c b/net/core/page_pool.c new file mode 100644 index 000000000000..68bf07206744 --- /dev/null +++ b/net/core/page_pool.c @@ -0,0 +1,317 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * page_pool.c + * Author: Jesper Dangaard Brouer + * Copyright (C) 2016 Red Hat, Inc. + */ +#include +#include +#include + +#include +#include +#include +#include +#include /* for __put_page() */ + +static int page_pool_init(struct page_pool *pool, + const struct page_pool_params *params) +{ + unsigned int ring_qsize = 1024; /* Default */ + + memcpy(&pool->p, params, sizeof(pool->p)); + + /* Validate only known flags were used */ + if (pool->p.flags & ~(PP_FLAG_ALL)) + return -EINVAL; + + if (pool->p.pool_size) + ring_qsize = pool->p.pool_size; + + /* Sanity limit mem that can be pinned down */ + if (ring_qsize > 32768) + return -E2BIG; + + /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL. + * DMA_BIDIRECTIONAL is for allowing page used for DMA sending, + * which is the XDP_TX use-case. + */ + if ((pool->p.dma_dir != DMA_FROM_DEVICE) && + (pool->p.dma_dir != DMA_BIDIRECTIONAL)) + return -EINVAL; + + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) + return -ENOMEM; + + return 0; +} + +struct page_pool *page_pool_create(const struct page_pool_params *params) +{ + struct page_pool *pool; + int err = 0; + + pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid); + if (!pool) + return ERR_PTR(-ENOMEM); + + err = page_pool_init(pool, params); + if (err < 0) { + pr_warn("%s() gave up with errno %d\n", __func__, err); + kfree(pool); + return ERR_PTR(err); + } + return pool; +} +EXPORT_SYMBOL(page_pool_create); + +/* fast path */ +static struct page *__page_pool_get_cached(struct page_pool *pool) +{ + struct ptr_ring *r = &pool->ring; + struct page *page; + + /* Quicker fallback, avoid locks when ring is empty */ + if (__ptr_ring_empty(r)) + return NULL; + + /* Test for safe-context, caller should provide this guarantee */ + if (likely(in_serving_softirq())) { + if (likely(pool->alloc.count)) { + /* Fast-path */ + page = pool->alloc.cache[--pool->alloc.count]; + return page; + } + /* Slower-path: Alloc array empty, time to refill + * + * Open-coded bulk ptr_ring consumer. + * + * Discussion: the ring consumer lock is not really + * needed due to the softirq/NAPI protection, but + * later need the ability to reclaim pages on the + * ring. Thus, keeping the locks. + */ + spin_lock(&r->consumer_lock); + while ((page = __ptr_ring_consume(r))) { + if (pool->alloc.count == PP_ALLOC_CACHE_REFILL) + break; + pool->alloc.cache[pool->alloc.count++] = page; + } + spin_unlock(&r->consumer_lock); + return page; + } + + /* Slow-path: Get page from locked ring queue */ + page = ptr_ring_consume(&pool->ring); + return page; +} + +/* slow path */ +noinline +static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, + gfp_t _gfp) +{ + struct page *page; + gfp_t gfp = _gfp; + dma_addr_t dma; + + /* We could always set __GFP_COMP, and avoid this branch, as + * prep_new_page() can handle order-0 with __GFP_COMP. + */ + if (pool->p.order) + gfp |= __GFP_COMP; + + /* FUTURE development: + * + * Current slow-path essentially falls back to single page + * allocations, which doesn't improve performance. This code + * need bulk allocation support from the page allocator code. + */ + + /* Cache was empty, do real allocation */ + page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); + if (!page) + return NULL; + + if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + goto skip_dma_map; + + /* Setup DMA mapping: use page->private for DMA-addr + * This mapping is kept for lifetime of page, until leaving pool. + */ + dma = dma_map_page(pool->p.dev, page, 0, + (PAGE_SIZE << pool->p.order), + pool->p.dma_dir); + if (dma_mapping_error(pool->p.dev, dma)) { + put_page(page); + return NULL; + } + set_page_private(page, dma); /* page->private = dma; */ + +skip_dma_map: + /* When page just alloc'ed is should/must have refcnt 1. */ + return page; +} + +/* For using page_pool replace: alloc_pages() API calls, but provide + * synchronization guarantee for allocation side. + */ +struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) +{ + struct page *page; + + /* Fast-path: Get a page from cache */ + page = __page_pool_get_cached(pool); + if (page) + return page; + + /* Slow-path: cache empty, do real allocation */ + page = __page_pool_alloc_pages_slow(pool, gfp); + return page; +} +EXPORT_SYMBOL(page_pool_alloc_pages); + +/* Cleanup page_pool state from page */ +static void __page_pool_clean_page(struct page_pool *pool, + struct page *page) +{ + if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + return; + + /* DMA unmap */ + dma_unmap_page(pool->p.dev, page_private(page), + PAGE_SIZE << pool->p.order, pool->p.dma_dir); + set_page_private(page, 0); +} + +/* Return a page to the page allocator, cleaning up our state */ +static void __page_pool_return_page(struct page_pool *pool, struct page *page) +{ + __page_pool_clean_page(pool, page); + put_page(page); + /* An optimization would be to call __free_pages(page, pool->p.order) + * knowing page is not part of page-cache (thus avoiding a + * __page_cache_release() call). + */ +} + +static bool __page_pool_recycle_into_ring(struct page_pool *pool, + struct page *page) +{ + int ret; + /* BH protection not needed if current is serving softirq */ + if (in_serving_softirq()) + ret = ptr_ring_produce(&pool->ring, page); + else + ret = ptr_ring_produce_bh(&pool->ring, page); + + return (ret == 0) ? true : false; +} + +/* Only allow direct recycling in special circumstances, into the + * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case. + * + * Caller must provide appropriate safe context. + */ +static bool __page_pool_recycle_direct(struct page *page, + struct page_pool *pool) +{ + if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) + return false; + + /* Caller MUST have verified/know (page_ref_count(page) == 1) */ + pool->alloc.cache[pool->alloc.count++] = page; + return true; +} + +void __page_pool_put_page(struct page_pool *pool, + struct page *page, bool allow_direct) +{ + /* This allocator is optimized for the XDP mode that uses + * one-frame-per-page, but have fallbacks that act like the + * regular page allocator APIs. + * + * refcnt == 1 means page_pool owns page, and can recycle it. + */ + if (likely(page_ref_count(page) == 1)) { + /* Read barrier done in page_ref_count / READ_ONCE */ + + if (allow_direct && in_serving_softirq()) + if (__page_pool_recycle_direct(page, pool)) + return; + + if (!__page_pool_recycle_into_ring(pool, page)) { + /* Cache full, fallback to free pages */ + __page_pool_return_page(pool, page); + } + return; + } + /* Fallback/non-XDP mode: API user have elevated refcnt. + * + * Many drivers split up the page into fragments, and some + * want to keep doing this to save memory and do refcnt based + * recycling. Support this use case too, to ease drivers + * switching between XDP/non-XDP. + * + * In-case page_pool maintains the DMA mapping, API user must + * call page_pool_put_page once. In this elevated refcnt + * case, the DMA is unmapped/released, as driver is likely + * doing refcnt based recycle tricks, meaning another process + * will be invoking put_page. + */ + __page_pool_clean_page(pool, page); + put_page(page); +} +EXPORT_SYMBOL(__page_pool_put_page); + +static void __page_pool_empty_ring(struct page_pool *pool) +{ + struct page *page; + + /* Empty recycle ring */ + while ((page = ptr_ring_consume(&pool->ring))) { + /* Verify the refcnt invariant of cached pages */ + if (!(page_ref_count(page) == 1)) + pr_crit("%s() page_pool refcnt %d violation\n", + __func__, page_ref_count(page)); + + __page_pool_return_page(pool, page); + } +} + +static void __page_pool_destroy_rcu(struct rcu_head *rcu) +{ + struct page_pool *pool; + + pool = container_of(rcu, struct page_pool, rcu); + + WARN(pool->alloc.count, "API usage violation"); + + __page_pool_empty_ring(pool); + ptr_ring_cleanup(&pool->ring, NULL); + kfree(pool); +} + +/* Cleanup and release resources */ +void page_pool_destroy(struct page_pool *pool) +{ + struct page *page; + + /* Empty alloc cache, assume caller made sure this is + * no-longer in use, and page_pool_alloc_pages() cannot be + * call concurrently. + */ + while (pool->alloc.count) { + page = pool->alloc.cache[--pool->alloc.count]; + __page_pool_return_page(pool, page); + } + + /* No more consumers should exist, but producers could still + * be in-flight. + */ + __page_pool_empty_ring(pool); + + /* An xdp_mem_allocator can still ref page_pool pointer */ + call_rcu(&pool->rcu, __page_pool_destroy_rcu); +} +EXPORT_SYMBOL(page_pool_destroy); diff --git a/net/core/xdp.c b/net/core/xdp.c index 097a0f74e004..0c86b53a3a63 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -5,6 +5,10 @@ */ #include #include +#include +#include +#include +#include #include @@ -13,6 +17,104 @@ #define REG_STATE_UNREGISTERED 0x2 #define REG_STATE_UNUSED 0x3 +static DEFINE_IDA(mem_id_pool); +static DEFINE_MUTEX(mem_id_lock); +#define MEM_ID_MAX 0xFFFE +#define MEM_ID_MIN 1 +static int mem_id_next = MEM_ID_MIN; + +static bool mem_id_init; /* false */ +static struct rhashtable *mem_id_ht; + +struct xdp_mem_allocator { + struct xdp_mem_info mem; + union { + void *allocator; + struct page_pool *page_pool; + }; + struct rhash_head node; + struct rcu_head rcu; +}; + +static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed) +{ + const u32 *k = data; + const u32 key = *k; + + BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id) + != sizeof(u32)); + + /* Use cyclic increasing ID as direct hash key, see rht_bucket_index */ + return key << RHT_HASH_RESERVED_SPACE; +} + +static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct xdp_mem_allocator *xa = ptr; + u32 mem_id = *(u32 *)arg->key; + + return xa->mem.id != mem_id; +} + +static const struct rhashtable_params mem_id_rht_params = { + .nelem_hint = 64, + .head_offset = offsetof(struct xdp_mem_allocator, node), + .key_offset = offsetof(struct xdp_mem_allocator, mem.id), + .key_len = FIELD_SIZEOF(struct xdp_mem_allocator, mem.id), + .max_size = MEM_ID_MAX, + .min_size = 8, + .automatic_shrinking = true, + .hashfn = xdp_mem_id_hashfn, + .obj_cmpfn = xdp_mem_id_cmp, +}; + +static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) +{ + struct xdp_mem_allocator *xa; + + xa = container_of(rcu, struct xdp_mem_allocator, rcu); + + /* Allow this ID to be reused */ + ida_simple_remove(&mem_id_pool, xa->mem.id); + + /* Notice, driver is expected to free the *allocator, + * e.g. page_pool, and MUST also use RCU free. + */ + + /* Poison memory */ + xa->mem.id = 0xFFFF; + xa->mem.type = 0xF0F0; + xa->allocator = (void *)0xDEAD9001; + + kfree(xa); +} + +static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) +{ + struct xdp_mem_allocator *xa; + int id = xdp_rxq->mem.id; + int err; + + if (id == 0) + return; + + mutex_lock(&mem_id_lock); + + xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); + if (!xa) { + mutex_unlock(&mem_id_lock); + return; + } + + err = rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params); + WARN_ON(err); + + call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); + + mutex_unlock(&mem_id_lock); +} + void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) { /* Simplify driver cleanup code paths, allow unreg "unused" */ @@ -21,8 +123,14 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG"); + __xdp_rxq_info_unreg_mem_model(xdp_rxq); + xdp_rxq->reg_state = REG_STATE_UNREGISTERED; xdp_rxq->dev = NULL; + + /* Reset mem info to defaults */ + xdp_rxq->mem.id = 0; + xdp_rxq->mem.type = 0; } EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); @@ -71,3 +179,164 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq) return (xdp_rxq->reg_state == REG_STATE_REGISTERED); } EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg); + +static int __mem_id_init_hash_table(void) +{ + struct rhashtable *rht; + int ret; + + if (unlikely(mem_id_init)) + return 0; + + rht = kzalloc(sizeof(*rht), GFP_KERNEL); + if (!rht) + return -ENOMEM; + + ret = rhashtable_init(rht, &mem_id_rht_params); + if (ret < 0) { + kfree(rht); + return ret; + } + mem_id_ht = rht; + smp_mb(); /* mutex lock should provide enough pairing */ + mem_id_init = true; + + return 0; +} + +/* Allocate a cyclic ID that maps to allocator pointer. + * See: https://www.kernel.org/doc/html/latest/core-api/idr.html + * + * Caller must lock mem_id_lock. + */ +static int __mem_id_cyclic_get(gfp_t gfp) +{ + int retries = 1; + int id; + +again: + id = ida_simple_get(&mem_id_pool, mem_id_next, MEM_ID_MAX, gfp); + if (id < 0) { + if (id == -ENOSPC) { + /* Cyclic allocator, reset next id */ + if (retries--) { + mem_id_next = MEM_ID_MIN; + goto again; + } + } + return id; /* errno */ + } + mem_id_next = id + 1; + + return id; +} + +static bool __is_supported_mem_type(enum xdp_mem_type type) +{ + if (type == MEM_TYPE_PAGE_POOL) + return is_page_pool_compiled_in(); + + if (type >= MEM_TYPE_MAX) + return false; + + return true; +} + +int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, + enum xdp_mem_type type, void *allocator) +{ + struct xdp_mem_allocator *xdp_alloc; + gfp_t gfp = GFP_KERNEL; + int id, errno, ret; + void *ptr; + + if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { + WARN(1, "Missing register, driver bug"); + return -EFAULT; + } + + if (!__is_supported_mem_type(type)) + return -EOPNOTSUPP; + + xdp_rxq->mem.type = type; + + if (!allocator) { + if (type == MEM_TYPE_PAGE_POOL) + return -EINVAL; /* Setup time check page_pool req */ + return 0; + } + + /* Delay init of rhashtable to save memory if feature isn't used */ + if (!mem_id_init) { + mutex_lock(&mem_id_lock); + ret = __mem_id_init_hash_table(); + mutex_unlock(&mem_id_lock); + if (ret < 0) { + WARN_ON(1); + return ret; + } + } + + xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp); + if (!xdp_alloc) + return -ENOMEM; + + mutex_lock(&mem_id_lock); + id = __mem_id_cyclic_get(gfp); + if (id < 0) { + errno = id; + goto err; + } + xdp_rxq->mem.id = id; + xdp_alloc->mem = xdp_rxq->mem; + xdp_alloc->allocator = allocator; + + /* Insert allocator into ID lookup table */ + ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node); + if (IS_ERR(ptr)) { + errno = PTR_ERR(ptr); + goto err; + } + + mutex_unlock(&mem_id_lock); + + return 0; +err: + mutex_unlock(&mem_id_lock); + kfree(xdp_alloc); + return errno; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); + +void xdp_return_frame(struct xdp_frame *xdpf) +{ + struct xdp_mem_info *mem = &xdpf->mem; + struct xdp_mem_allocator *xa; + void *data = xdpf->data; + struct page *page; + + switch (mem->type) { + case MEM_TYPE_PAGE_POOL: + rcu_read_lock(); + /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ + xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + page = virt_to_head_page(data); + if (xa) + page_pool_put_page(xa->page_pool, page); + else + put_page(page); + rcu_read_unlock(); + break; + case MEM_TYPE_PAGE_SHARED: + page_frag_free(data); + break; + case MEM_TYPE_PAGE_ORDER0: + page = virt_to_page(data); /* Assumes order0 page*/ + put_page(page); + break; + default: + /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ + break; + } +} +EXPORT_SYMBOL_GPL(xdp_return_frame);