Alexei Starovoitov says:

====================
pull-request: bpf-next 2020-12-03

The main changes are:

1) Support BTF in kernel modules, from Andrii.

2) Introduce preferred busy-polling, from Björn.

3) bpf_ima_inode_hash() and bpf_bprm_opts_set() helpers, from KP Singh.

4) Memcg-based memory accounting for bpf objects, from Roman.

5) Allow bpf_{s,g}etsockopt from cgroup bind{4,6} hooks, from Stanislav.

* https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (118 commits)
  selftests/bpf: Fix invalid use of strncat in test_sockmap
  libbpf: Use memcpy instead of strncpy to please GCC
  selftests/bpf: Add fentry/fexit/fmod_ret selftest for kernel module
  selftests/bpf: Add tp_btf CO-RE reloc test for modules
  libbpf: Support attachment of BPF tracing programs to kernel modules
  libbpf: Factor out low-level BPF program loading helper
  bpf: Allow to specify kernel module BTFs when attaching BPF programs
  bpf: Remove hard-coded btf_vmlinux assumption from BPF verifier
  selftests/bpf: Add CO-RE relocs selftest relying on kernel module BTF
  selftests/bpf: Add support for marking sub-tests as skipped
  selftests/bpf: Add bpf_testmod kernel module for testing
  libbpf: Add kernel module BTF support for CO-RE relocations
  libbpf: Refactor CO-RE relocs to not assume a single BTF object
  libbpf: Add internal helper to load BTF data by FD
  bpf: Keep module's btf_data_size intact after load
  bpf: Fix bpf_put_raw_tracepoint()'s use of __module_address()
  selftests/bpf: Add Userspace tests for TCP_WINDOW_CLAMP
  bpf: Adds support for setting window clamp
  samples/bpf: Fix spelling mistake "recieving" -> "receiving"
  bpf: Fix cold build of test_progs-no_alu32
  ...
====================

Link: https://lore.kernel.org/r/20201204021936.85653-1-alexei.starovoitov@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Jakub Kicinski 2020-12-04 07:48:11 -08:00
commit a1dd1d8697
196 changed files with 4335 additions and 2574 deletions

View File

@ -124,6 +124,9 @@
#define SO_DETACH_REUSEPORT_BPF 68
#define SO_PREFER_BUSY_POLL 69
#define SO_BUSY_POLL_BUDGET 70
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64

View File

@ -135,6 +135,9 @@
#define SO_DETACH_REUSEPORT_BPF 68
#define SO_PREFER_BUSY_POLL 69
#define SO_BUSY_POLL_BUDGET 70
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64

View File

@ -116,6 +116,9 @@
#define SO_DETACH_REUSEPORT_BPF 0x4042
#define SO_PREFER_BUSY_POLL 0x4043
#define SO_BUSY_POLL_BUDGET 0x4044
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64

View File

@ -117,6 +117,9 @@
#define SO_DETACH_REUSEPORT_BPF 0x0047
#define SO_PREFER_BUSY_POLL 0x0048
#define SO_BUSY_POLL_BUDGET 0x0049
#if !defined(__KERNEL__)

View File

@ -416,7 +416,7 @@ static int ena_xdp_register_rxq_info(struct ena_ring *rx_ring)
{
int rc;
rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid);
rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid, 0);
if (rc) {
netif_err(rx_ring->adapter, ifup, rx_ring->netdev,

View File

@ -2884,7 +2884,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp)
if (rc)
return rc;
rc = xdp_rxq_info_reg(&rxr->xdp_rxq, bp->dev, i);
rc = xdp_rxq_info_reg(&rxr->xdp_rxq, bp->dev, i, 0);
if (rc < 0)
return rc;

View File

@ -770,7 +770,7 @@ static void nicvf_rcv_queue_config(struct nicvf *nic, struct queue_set *qs,
rq->caching = 1;
/* Driver have no proper error path for failed XDP RX-queue info reg */
WARN_ON(xdp_rxq_info_reg(&rq->xdp_rxq, nic->netdev, qidx) < 0);
WARN_ON(xdp_rxq_info_reg(&rq->xdp_rxq, nic->netdev, qidx, 0) < 0);
/* Send a mailbox msg to PF to config RQ */
mbx.rq.msg = NIC_MBOX_MSG_RQ_CFG;

View File

@ -3334,7 +3334,7 @@ static int dpaa2_eth_setup_rx_flow(struct dpaa2_eth_priv *priv,
return 0;
err = xdp_rxq_info_reg(&fq->channel->xdp_rxq, priv->net_dev,
fq->flowid);
fq->flowid, 0);
if (err) {
dev_err(dev, "xdp_rxq_info_reg failed\n");
return err;

View File

@ -676,6 +676,8 @@ void i40e_free_tx_resources(struct i40e_ring *tx_ring)
i40e_clean_tx_ring(tx_ring);
kfree(tx_ring->tx_bi);
tx_ring->tx_bi = NULL;
kfree(tx_ring->xsk_descs);
tx_ring->xsk_descs = NULL;
if (tx_ring->desc) {
dma_free_coherent(tx_ring->dev, tx_ring->size,
@ -1277,6 +1279,13 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
if (!tx_ring->tx_bi)
goto err;
if (ring_is_xdp(tx_ring)) {
tx_ring->xsk_descs = kcalloc(I40E_MAX_NUM_DESCRIPTORS, sizeof(*tx_ring->xsk_descs),
GFP_KERNEL);
if (!tx_ring->xsk_descs)
goto err;
}
u64_stats_init(&tx_ring->syncp);
/* round up to nearest 4K */
@ -1300,6 +1309,8 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
return 0;
err:
kfree(tx_ring->xsk_descs);
tx_ring->xsk_descs = NULL;
kfree(tx_ring->tx_bi);
tx_ring->tx_bi = NULL;
return -ENOMEM;
@ -1436,7 +1447,7 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
/* XDP RX-queue info only needed for RX rings exposed to XDP */
if (rx_ring->vsi->type == I40E_VSI_MAIN) {
err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
rx_ring->queue_index);
rx_ring->queue_index, rx_ring->q_vector->napi.napi_id);
if (err < 0)
return err;
}

View File

@ -389,6 +389,7 @@ struct i40e_ring {
struct i40e_channel *ch;
struct xdp_rxq_info xdp_rxq;
struct xsk_buff_pool *xsk_pool;
struct xdp_desc *xsk_descs; /* For storing descriptors in the AF_XDP ZC path */
} ____cacheline_internodealigned_in_smp;
static inline bool ring_uses_build_skb(struct i40e_ring *ring)

View File

@ -2,6 +2,7 @@
/* Copyright(c) 2018 Intel Corporation. */
#include <linux/bpf_trace.h>
#include <linux/stringify.h>
#include <net/xdp_sock_drv.h>
#include <net/xdp.h>
@ -380,6 +381,69 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
return failure ? budget : (int)total_rx_packets;
}
static void i40e_xmit_pkt(struct i40e_ring *xdp_ring, struct xdp_desc *desc,
unsigned int *total_bytes)
{
struct i40e_tx_desc *tx_desc;
dma_addr_t dma;
dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc->addr);
xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc->len);
tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use++);
tx_desc->buffer_addr = cpu_to_le64(dma);
tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC | I40E_TX_DESC_CMD_EOP,
0, desc->len, 0);
*total_bytes += desc->len;
}
static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *desc,
unsigned int *total_bytes)
{
u16 ntu = xdp_ring->next_to_use;
struct i40e_tx_desc *tx_desc;
dma_addr_t dma;
u32 i;
loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) {
dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr);
xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc[i].len);
tx_desc = I40E_TX_DESC(xdp_ring, ntu++);
tx_desc->buffer_addr = cpu_to_le64(dma);
tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC |
I40E_TX_DESC_CMD_EOP,
0, desc[i].len, 0);
*total_bytes += desc[i].len;
}
xdp_ring->next_to_use = ntu;
}
static void i40e_fill_tx_hw_ring(struct i40e_ring *xdp_ring, struct xdp_desc *descs, u32 nb_pkts,
unsigned int *total_bytes)
{
u32 batched, leftover, i;
batched = nb_pkts & ~(PKTS_PER_BATCH - 1);
leftover = nb_pkts & (PKTS_PER_BATCH - 1);
for (i = 0; i < batched; i += PKTS_PER_BATCH)
i40e_xmit_pkt_batch(xdp_ring, &descs[i], total_bytes);
for (i = batched; i < batched + leftover; i++)
i40e_xmit_pkt(xdp_ring, &descs[i], total_bytes);
}
static void i40e_set_rs_bit(struct i40e_ring *xdp_ring)
{
u16 ntu = xdp_ring->next_to_use ? xdp_ring->next_to_use - 1 : xdp_ring->count - 1;
struct i40e_tx_desc *tx_desc;
tx_desc = I40E_TX_DESC(xdp_ring, ntu);
tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS << I40E_TXD_QW1_CMD_SHIFT);
}
/**
* i40e_xmit_zc - Performs zero-copy Tx AF_XDP
* @xdp_ring: XDP Tx ring
@ -389,49 +453,30 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
**/
static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
{
unsigned int sent_frames = 0, total_bytes = 0;
struct i40e_tx_desc *tx_desc = NULL;
struct i40e_tx_buffer *tx_bi;
struct xdp_desc desc;
dma_addr_t dma;
struct xdp_desc *descs = xdp_ring->xsk_descs;
u32 nb_pkts, nb_processed = 0;
unsigned int total_bytes = 0;
while (budget-- > 0) {
if (!xsk_tx_peek_desc(xdp_ring->xsk_pool, &desc))
break;
nb_pkts = xsk_tx_peek_release_desc_batch(xdp_ring->xsk_pool, descs, budget);
if (!nb_pkts)
return false;
dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc.addr);
xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma,
desc.len);
tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use];
tx_bi->bytecount = desc.len;
tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use);
tx_desc->buffer_addr = cpu_to_le64(dma);
tx_desc->cmd_type_offset_bsz =
build_ctob(I40E_TX_DESC_CMD_ICRC
| I40E_TX_DESC_CMD_EOP,
0, desc.len, 0);
sent_frames++;
total_bytes += tx_bi->bytecount;
xdp_ring->next_to_use++;
if (xdp_ring->next_to_use == xdp_ring->count)
if (xdp_ring->next_to_use + nb_pkts >= xdp_ring->count) {
nb_processed = xdp_ring->count - xdp_ring->next_to_use;
i40e_fill_tx_hw_ring(xdp_ring, descs, nb_processed, &total_bytes);
xdp_ring->next_to_use = 0;
}
if (tx_desc) {
i40e_fill_tx_hw_ring(xdp_ring, &descs[nb_processed], nb_pkts - nb_processed,
&total_bytes);
/* Request an interrupt for the last frame and bump tail ptr. */
tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS <<
I40E_TXD_QW1_CMD_SHIFT);
i40e_set_rs_bit(xdp_ring);
i40e_xdp_ring_update_tail(xdp_ring);
xsk_tx_release(xdp_ring->xsk_pool);
i40e_update_tx_stats(xdp_ring, sent_frames, total_bytes);
}
i40e_update_tx_stats(xdp_ring, nb_pkts, total_bytes);
return !!budget;
return true;
}
/**

View File

@ -4,6 +4,22 @@
#ifndef _I40E_XSK_H_
#define _I40E_XSK_H_
/* This value should match the pragma in the loop_unrolled_for
* macro. Why 4? It is strictly empirical. It seems to be a good
* compromise between the advantage of having simultaneous outstanding
* reads to the DMA array that can hide each others latency and the
* disadvantage of having a larger code path.
*/
#define PKTS_PER_BATCH 4
#ifdef __clang__
#define loop_unrolled_for _Pragma("clang loop unroll_count(4)") for
#elif __GNUC__ >= 8
#define loop_unrolled_for _Pragma("GCC unroll 4") for
#else
#define loop_unrolled_for for
#endif
struct i40e_vsi;
struct xsk_buff_pool;
struct zero_copy_allocator;

View File

@ -306,7 +306,7 @@ int ice_setup_rx_ctx(struct ice_ring *ring)
if (!xdp_rxq_info_is_reg(&ring->xdp_rxq))
/* coverity[check_return] */
xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
ring->q_index);
ring->q_index, ring->q_vector->napi.napi_id);
ring->xsk_pool = ice_xsk_pool(ring);
if (ring->xsk_pool) {
@ -333,7 +333,7 @@ int ice_setup_rx_ctx(struct ice_ring *ring)
/* coverity[check_return] */
xdp_rxq_info_reg(&ring->xdp_rxq,
ring->netdev,
ring->q_index);
ring->q_index, ring->q_vector->napi.napi_id);
err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
MEM_TYPE_PAGE_SHARED,

View File

@ -483,7 +483,7 @@ int ice_setup_rx_ring(struct ice_ring *rx_ring)
if (rx_ring->vsi->type == ICE_VSI_PF &&
!xdp_rxq_info_is_reg(&rx_ring->xdp_rxq))
if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
rx_ring->q_index))
rx_ring->q_index, rx_ring->q_vector->napi.napi_id))
goto err;
return 0;

View File

@ -4352,7 +4352,7 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring)
/* XDP RX-queue info */
if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
rx_ring->queue_index) < 0)
rx_ring->queue_index, 0) < 0)
goto err;
return 0;

View File

@ -6577,7 +6577,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter,
/* XDP RX-queue info */
if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, adapter->netdev,
rx_ring->queue_index) < 0)
rx_ring->queue_index, rx_ring->q_vector->napi.napi_id) < 0)
goto err;
rx_ring->xdp_prog = adapter->xdp_prog;

View File

@ -3493,7 +3493,7 @@ int ixgbevf_setup_rx_resources(struct ixgbevf_adapter *adapter,
/* XDP RX-queue info */
if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, adapter->netdev,
rx_ring->queue_index) < 0)
rx_ring->queue_index, 0) < 0)
goto err;
rx_ring->xdp_prog = adapter->xdp_prog;

View File

@ -3243,7 +3243,7 @@ static int mvneta_create_page_pool(struct mvneta_port *pp,
return err;
}
err = xdp_rxq_info_reg(&rxq->xdp_rxq, pp->dev, rxq->id);
err = xdp_rxq_info_reg(&rxq->xdp_rxq, pp->dev, rxq->id, 0);
if (err < 0)
goto err_free_pp;

View File

@ -2614,11 +2614,11 @@ static int mvpp2_rxq_init(struct mvpp2_port *port,
mvpp2_rxq_status_update(port, rxq->id, 0, rxq->size);
if (priv->percpu_pools) {
err = xdp_rxq_info_reg(&rxq->xdp_rxq_short, port->dev, rxq->id);
err = xdp_rxq_info_reg(&rxq->xdp_rxq_short, port->dev, rxq->id, 0);
if (err < 0)
goto err_free_dma;
err = xdp_rxq_info_reg(&rxq->xdp_rxq_long, port->dev, rxq->id);
err = xdp_rxq_info_reg(&rxq->xdp_rxq_long, port->dev, rxq->id, 0);
if (err < 0)
goto err_unregister_rxq_short;

View File

@ -283,7 +283,7 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
ring->log_stride = ffs(ring->stride) - 1;
ring->buf_size = ring->size * ring->stride + TXBB_SIZE;
if (xdp_rxq_info_reg(&ring->xdp_rxq, priv->dev, queue_index) < 0)
if (xdp_rxq_info_reg(&ring->xdp_rxq, priv->dev, queue_index, 0) < 0)
goto err_ring;
tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS *

View File

@ -434,7 +434,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
rq_xdp_ix = rq->ix;
if (xsk)
rq_xdp_ix += params->num_channels * MLX5E_RQ_GROUP_XSK;
err = xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq_xdp_ix);
err = xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq_xdp_ix, 0);
if (err < 0)
goto err_rq_xdp_prog;

View File

@ -2533,7 +2533,7 @@ nfp_net_rx_ring_alloc(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring)
if (dp->netdev) {
err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, dp->netdev,
rx_ring->idx);
rx_ring->idx, rx_ring->r_vec->napi.napi_id);
if (err < 0)
return err;
}

View File

@ -1762,7 +1762,7 @@ static void qede_init_fp(struct qede_dev *edev)
/* Driver have no error path from here */
WARN_ON(xdp_rxq_info_reg(&fp->rxq->xdp_rxq, edev->ndev,
fp->rxq->rxq_id) < 0);
fp->rxq->rxq_id, 0) < 0);
if (xdp_rxq_info_reg_mem_model(&fp->rxq->xdp_rxq,
MEM_TYPE_PAGE_ORDER0,

View File

@ -262,7 +262,7 @@ void efx_init_rx_queue(struct efx_rx_queue *rx_queue)
/* Initialise XDP queue information */
rc = xdp_rxq_info_reg(&rx_queue->xdp_rxq_info, efx->net_dev,
rx_queue->core_index);
rx_queue->core_index, 0);
if (rc) {
netif_err(efx, rx_err, efx->net_dev,

View File

@ -1314,7 +1314,7 @@ static int netsec_setup_rx_dring(struct netsec_priv *priv)
goto err_out;
}
err = xdp_rxq_info_reg(&dring->xdp_rxq, priv->ndev, 0);
err = xdp_rxq_info_reg(&dring->xdp_rxq, priv->ndev, 0, priv->napi.napi_id);
if (err)
goto err_out;

View File

@ -1186,7 +1186,7 @@ static int cpsw_ndev_create_xdp_rxq(struct cpsw_priv *priv, int ch)
pool = cpsw->page_pool[ch];
rxq = &priv->xdp_rxq[ch];
ret = xdp_rxq_info_reg(rxq, priv->ndev, ch);
ret = xdp_rxq_info_reg(rxq, priv->ndev, ch, 0);
if (ret)
return ret;

View File

@ -1499,7 +1499,7 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device,
u64_stats_init(&nvchan->tx_stats.syncp);
u64_stats_init(&nvchan->rx_stats.syncp);
ret = xdp_rxq_info_reg(&nvchan->xdp_rxq, ndev, i);
ret = xdp_rxq_info_reg(&nvchan->xdp_rxq, ndev, i, 0);
if (ret) {
netdev_err(ndev, "xdp_rxq_info_reg fail: %d\n", ret);

View File

@ -780,7 +780,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
} else {
/* Setup XDP RX-queue info, for new tfile getting attached */
err = xdp_rxq_info_reg(&tfile->xdp_rxq,
tun->dev, tfile->queue_index);
tun->dev, tfile->queue_index, 0);
if (err < 0)
goto out;
err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq,

View File

@ -884,7 +884,6 @@ static int veth_napi_add(struct net_device *dev)
for (i = 0; i < dev->real_num_rx_queues; i++) {
struct veth_rq *rq = &priv->rq[i];
netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
napi_enable(&rq->xdp_napi);
}
@ -926,7 +925,8 @@ static int veth_enable_xdp(struct net_device *dev)
for (i = 0; i < dev->real_num_rx_queues; i++) {
struct veth_rq *rq = &priv->rq[i];
err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i);
netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id);
if (err < 0)
goto err_rxq_reg;
@ -952,8 +952,12 @@ static int veth_enable_xdp(struct net_device *dev)
err_reg_mem:
xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
err_rxq_reg:
for (i--; i >= 0; i--)
xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
for (i--; i >= 0; i--) {
struct veth_rq *rq = &priv->rq[i];
xdp_rxq_info_unreg(&rq->xdp_rxq);
netif_napi_del(&rq->xdp_napi);
}
return err;
}

View File

@ -1485,7 +1485,7 @@ static int virtnet_open(struct net_device *dev)
if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
schedule_delayed_work(&vi->refill, 0);
err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i);
err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i, vi->rq[i].napi.napi_id);
if (err < 0)
return err;

View File

@ -2014,7 +2014,7 @@ static int xennet_create_page_pool(struct netfront_queue *queue)
}
err = xdp_rxq_info_reg(&queue->xdp_rxq, queue->info->netdev,
queue->id);
queue->id, 0);
if (err) {
netdev_err(queue->info->netdev, "xdp_rxq_info_reg failed\n");
goto err_free_pp;

View File

@ -657,7 +657,7 @@ int __set_page_dirty_buffers(struct page *page)
} while (bh != head);
}
/*
* Lock out page->mem_cgroup migration to keep PageDirty
* Lock out page's memcg migration to keep PageDirty
* synchronized with per-memcg dirty page counters.
*/
lock_page_memcg(page);

View File

@ -397,7 +397,8 @@ static void ep_busy_loop(struct eventpoll *ep, int nonblock)
unsigned int napi_id = READ_ONCE(ep->napi_id);
if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep);
napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
BUSY_POLL_BUDGET);
}
static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)

View File

@ -650,7 +650,7 @@ iomap_set_page_dirty(struct page *page)
return !TestSetPageDirty(page);
/*
* Lock out page->mem_cgroup migration to keep PageDirty
* Lock out page's memcg migration to keep PageDirty
* synchronized with per-memcg dirty page counters.
*/
lock_page_memcg(page);

View File

@ -246,11 +246,11 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
__ret; \
})
#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \
BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND)
#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) \
BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_BIND, NULL)
#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \
BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND)
#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) \
BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_BIND, NULL)
#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \
sk->sk_prot->pre_connect)
@ -434,8 +434,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; })

View File

@ -20,6 +20,8 @@
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/capability.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
struct bpf_verifier_env;
struct bpf_verifier_log;
@ -37,6 +39,7 @@ struct bpf_iter_aux_info;
struct bpf_local_storage;
struct bpf_local_storage_map;
struct kobject;
struct mem_cgroup;
extern struct idr btf_idr;
extern spinlock_t btf_idr_lock;
@ -135,11 +138,6 @@ struct bpf_map_ops {
const struct bpf_iter_seq_info *iter_seq_info;
};
struct bpf_map_memory {
u32 pages;
struct user_struct *user;
};
struct bpf_map {
/* The first two cachelines with read-mostly members of which some
* are also accessed in fast-path (e.g. ops, max_entries).
@ -160,7 +158,9 @@ struct bpf_map {
u32 btf_key_type_id;
u32 btf_value_type_id;
struct btf *btf;
struct bpf_map_memory memory;
#ifdef CONFIG_MEMCG_KMEM
struct mem_cgroup *memcg;
#endif
char name[BPF_OBJ_NAME_LEN];
u32 btf_vmlinux_value_type_id;
bool bypass_spec_v1;
@ -421,8 +421,11 @@ struct bpf_insn_access_aux {
enum bpf_reg_type reg_type;
union {
int ctx_field_size;
struct {
struct btf *btf;
u32 btf_id;
};
};
struct bpf_verifier_log *log; /* for verbose logs */
};
@ -458,6 +461,7 @@ struct bpf_verifier_ops {
struct bpf_insn *dst,
struct bpf_prog *prog, u32 *target_size);
int (*btf_struct_access)(struct bpf_verifier_log *log,
const struct btf *btf,
const struct btf_type *t, int off, int size,
enum bpf_access_type atype,
u32 *next_btf_id);
@ -771,6 +775,7 @@ struct bpf_prog_aux {
u32 ctx_arg_info_size;
u32 max_rdonly_access;
u32 max_rdwr_access;
struct btf *attach_btf;
const struct bpf_ctx_arg_aux *ctx_arg_info;
struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */
struct bpf_prog *dst_prog;
@ -1005,7 +1010,6 @@ struct bpf_event_entry {
bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
int bpf_prog_calc_tag(struct bpf_prog *fp);
const char *kernel_type_name(u32 btf_type_id);
const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
@ -1202,8 +1206,6 @@ void bpf_prog_sub(struct bpf_prog *prog, int i);
void bpf_prog_inc(struct bpf_prog *prog);
struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog);
void bpf_prog_put(struct bpf_prog *prog);
int __bpf_prog_charge(struct user_struct *user, u32 pages);
void __bpf_prog_uncharge(struct user_struct *user, u32 pages);
void __bpf_free_used_maps(struct bpf_prog_aux *aux,
struct bpf_map **used_maps, u32 len);
@ -1218,12 +1220,6 @@ void bpf_map_inc_with_uref(struct bpf_map *map);
struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map);
void bpf_map_put_with_uref(struct bpf_map *map);
void bpf_map_put(struct bpf_map *map);
int bpf_map_charge_memlock(struct bpf_map *map, u32 pages);
void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages);
int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size);
void bpf_map_charge_finish(struct bpf_map_memory *mem);
void bpf_map_charge_move(struct bpf_map_memory *dst,
struct bpf_map_memory *src);
void *bpf_map_area_alloc(u64 size, int numa_node);
void *bpf_map_area_mmapable_alloc(u64 size, int numa_node);
void bpf_map_area_free(void *base);
@ -1240,6 +1236,34 @@ int generic_map_delete_batch(struct bpf_map *map,
struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
#ifdef CONFIG_MEMCG_KMEM
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
int node);
void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags);
void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
size_t align, gfp_t flags);
#else
static inline void *
bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
int node)
{
return kmalloc_node(size, flags, node);
}
static inline void *
bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
{
return kzalloc(size, flags);
}
static inline void __percpu *
bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align,
gfp_t flags)
{
return __alloc_percpu_gfp(size, align, flags);
}
#endif
extern int sysctl_unprivileged_bpf_disabled;
static inline bool bpf_allow_ptr_leaks(void)
@ -1430,12 +1454,13 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info);
int btf_struct_access(struct bpf_verifier_log *log,
int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, int off, int size,
enum bpf_access_type atype,
u32 *next_btf_id);
bool btf_struct_ids_match(struct bpf_verifier_log *log,
int off, u32 id, u32 need_type_id);
const struct btf *btf, u32 id, int off,
const struct btf *need_btf, u32 need_type_id);
int btf_distill_func_proto(struct bpf_verifier_log *log,
struct btf *btf,
@ -1490,15 +1515,6 @@ bpf_prog_inc_not_zero(struct bpf_prog *prog)
return ERR_PTR(-EOPNOTSUPP);
}
static inline int __bpf_prog_charge(struct user_struct *user, u32 pages)
{
return 0;
}
static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
{
}
static inline void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
const struct bpf_link_ops *ops,
struct bpf_prog *prog)
@ -1842,6 +1858,7 @@ extern const struct bpf_func_proto bpf_copy_from_user_proto;
extern const struct bpf_func_proto bpf_snprintf_btf_proto;
extern const struct bpf_func_proto bpf_per_cpu_ptr_proto;
extern const struct bpf_func_proto bpf_this_cpu_ptr_proto;
extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto;
const struct bpf_func_proto *bpf_tracing_func_proto(
enum bpf_func_id func_id, const struct bpf_prog *prog);

View File

@ -5,6 +5,7 @@
#define _LINUX_BPF_VERIFIER_H 1
#include <linux/bpf.h> /* for enum bpf_reg_type */
#include <linux/btf.h> /* for struct btf and btf_id() */
#include <linux/filter.h> /* for MAX_BPF_STACK */
#include <linux/tnum.h>
@ -43,6 +44,8 @@ enum bpf_reg_liveness {
struct bpf_reg_state {
/* Ordering of fields matters. See states_equal() */
enum bpf_reg_type type;
/* Fixed part of pointer offset, pointer types only */
s32 off;
union {
/* valid when type == PTR_TO_PACKET */
int range;
@ -52,15 +55,20 @@ struct bpf_reg_state {
*/
struct bpf_map *map_ptr;
u32 btf_id; /* for PTR_TO_BTF_ID */
/* for PTR_TO_BTF_ID */
struct {
struct btf *btf;
u32 btf_id;
};
u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */
/* Max size from any of the above. */
unsigned long raw;
struct {
unsigned long raw1;
unsigned long raw2;
} raw;
};
/* Fixed part of pointer offset, pointer types only */
s32 off;
/* For PTR_TO_PACKET, used to find other pointers with the same variable
* offset, so they can share range knowledge.
* For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we
@ -311,7 +319,10 @@ struct bpf_insn_aux_data {
struct {
enum bpf_reg_type reg_type; /* type of pseudo_btf_id */
union {
struct {
struct btf *btf;
u32 btf_id; /* btf_id for struct typed var */
};
u32 mem_size; /* mem_size for non-struct typed var */
};
} btf_var;
@ -459,9 +470,12 @@ int check_ctx_reg(struct bpf_verifier_env *env,
/* this lives here instead of in bpf.h because it needs to dereference tgt_prog */
static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog,
u32 btf_id)
struct btf *btf, u32 btf_id)
{
return tgt_prog ? (((u64)tgt_prog->aux->id) << 32 | btf_id) : btf_id;
if (tgt_prog)
return ((u64)tgt_prog->aux->id << 32) | btf_id;
else
return ((u64)btf_obj_id(btf) << 32) | 0x80000000 | btf_id;
}
int bpf_check_attach_target(struct bpf_verifier_log *log,

View File

@ -18,6 +18,7 @@ struct btf_show;
extern const struct file_operations btf_fops;
void btf_get(struct btf *btf);
void btf_put(struct btf *btf);
int btf_new_fd(const union bpf_attr *attr);
struct btf *btf_get_by_fd(int fd);
@ -88,7 +89,8 @@ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj,
char *buf, int len, u64 flags);
int btf_get_fd_by_id(u32 id);
u32 btf_id(const struct btf *btf);
u32 btf_obj_id(const struct btf *btf);
bool btf_is_kernel(const struct btf *btf);
bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
const struct btf_member *m,
u32 expected_offset, u32 expected_size);
@ -206,6 +208,8 @@ static inline const struct btf_var_secinfo *btf_type_var_secinfo(
}
#ifdef CONFIG_BPF_SYSCALL
struct bpf_prog;
const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
const char *btf_name_by_offset(const struct btf *btf, u32 offset);
struct btf *btf_parse_vmlinux(void);

View File

@ -29,6 +29,7 @@ extern int ima_post_read_file(struct file *file, void *buf, loff_t size,
enum kernel_read_file_id id);
extern void ima_post_path_mknod(struct dentry *dentry);
extern int ima_file_hash(struct file *file, char *buf, size_t buf_size);
extern int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size);
extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size);
#ifdef CONFIG_IMA_KEXEC
@ -115,6 +116,11 @@ static inline int ima_file_hash(struct file *file, char *buf, size_t buf_size)
return -EOPNOTSUPP;
}
static inline int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size)
{
return -EOPNOTSUPP;
}
static inline void ima_kexec_cmdline(int kernel_fd, const void *buf, int size) {}
#endif /* CONFIG_IMA */

View File

@ -343,6 +343,175 @@ struct mem_cgroup {
extern struct mem_cgroup *root_mem_cgroup;
enum page_memcg_data_flags {
/* page->memcg_data is a pointer to an objcgs vector */
MEMCG_DATA_OBJCGS = (1UL << 0),
/* page has been accounted as a non-slab kernel page */
MEMCG_DATA_KMEM = (1UL << 1),
/* the next bit after the last actual flag */
__NR_MEMCG_DATA_FLAGS = (1UL << 2),
};
#define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1)
/*
* page_memcg - get the memory cgroup associated with a page
* @page: a pointer to the page struct
*
* Returns a pointer to the memory cgroup associated with the page,
* or NULL. This function assumes that the page is known to have a
* proper memory cgroup pointer. It's not safe to call this function
* against some type of pages, e.g. slab pages or ex-slab pages.
*
* Any of the following ensures page and memcg binding stability:
* - the page lock
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
*/
static inline struct mem_cgroup *page_memcg(struct page *page)
{
unsigned long memcg_data = page->memcg_data;
VM_BUG_ON_PAGE(PageSlab(page), page);
VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}
/*
* page_memcg_rcu - locklessly get the memory cgroup associated with a page
* @page: a pointer to the page struct
*
* Returns a pointer to the memory cgroup associated with the page,
* or NULL. This function assumes that the page is known to have a
* proper memory cgroup pointer. It's not safe to call this function
* against some type of pages, e.g. slab pages or ex-slab pages.
*/
static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
{
VM_BUG_ON_PAGE(PageSlab(page), page);
WARN_ON_ONCE(!rcu_read_lock_held());
return (struct mem_cgroup *)(READ_ONCE(page->memcg_data) &
~MEMCG_DATA_FLAGS_MASK);
}
/*
* page_memcg_check - get the memory cgroup associated with a page
* @page: a pointer to the page struct
*
* Returns a pointer to the memory cgroup associated with the page,
* or NULL. This function unlike page_memcg() can take any page
* as an argument. It has to be used in cases when it's not known if a page
* has an associated memory cgroup pointer or an object cgroups vector.
*
* Any of the following ensures page and memcg binding stability:
* - the page lock
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
*/
static inline struct mem_cgroup *page_memcg_check(struct page *page)
{
/*
* Because page->memcg_data might be changed asynchronously
* for slab pages, READ_ONCE() should be used here.
*/
unsigned long memcg_data = READ_ONCE(page->memcg_data);
if (memcg_data & MEMCG_DATA_OBJCGS)
return NULL;
return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}
/*
* PageMemcgKmem - check if the page has MemcgKmem flag set
* @page: a pointer to the page struct
*
* Checks if the page has MemcgKmem flag set. The caller must ensure that
* the page has an associated memory cgroup. It's not safe to call this function
* against some types of pages, e.g. slab pages.
*/
static inline bool PageMemcgKmem(struct page *page)
{
VM_BUG_ON_PAGE(page->memcg_data & MEMCG_DATA_OBJCGS, page);
return page->memcg_data & MEMCG_DATA_KMEM;
}
#ifdef CONFIG_MEMCG_KMEM
/*
* page_objcgs - get the object cgroups vector associated with a page
* @page: a pointer to the page struct
*
* Returns a pointer to the object cgroups vector associated with the page,
* or NULL. This function assumes that the page is known to have an
* associated object cgroups vector. It's not safe to call this function
* against pages, which might have an associated memory cgroup: e.g.
* kernel stack pages.
*/
static inline struct obj_cgroup **page_objcgs(struct page *page)
{
unsigned long memcg_data = READ_ONCE(page->memcg_data);
VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), page);
VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page);
return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}
/*
* page_objcgs_check - get the object cgroups vector associated with a page
* @page: a pointer to the page struct
*
* Returns a pointer to the object cgroups vector associated with the page,
* or NULL. This function is safe to use if the page can be directly associated
* with a memory cgroup.
*/
static inline struct obj_cgroup **page_objcgs_check(struct page *page)
{
unsigned long memcg_data = READ_ONCE(page->memcg_data);
if (!memcg_data || !(memcg_data & MEMCG_DATA_OBJCGS))
return NULL;
VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page);
return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}
/*
* set_page_objcgs - associate a page with a object cgroups vector
* @page: a pointer to the page struct
* @objcgs: a pointer to the object cgroups vector
*
* Atomically associates a page with a vector of object cgroups.
*/
static inline bool set_page_objcgs(struct page *page,
struct obj_cgroup **objcgs)
{
return !cmpxchg(&page->memcg_data, 0, (unsigned long)objcgs |
MEMCG_DATA_OBJCGS);
}
#else
static inline struct obj_cgroup **page_objcgs(struct page *page)
{
return NULL;
}
static inline struct obj_cgroup **page_objcgs_check(struct page *page)
{
return NULL;
}
static inline bool set_page_objcgs(struct page *page,
struct obj_cgroup **objcgs)
{
return true;
}
#endif
static __always_inline bool memcg_stat_item_in_bytes(int idx)
{
if (idx == MEMCG_PERCPU_B)
@ -743,15 +912,19 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
static inline void __mod_memcg_page_state(struct page *page,
int idx, int val)
{
if (page->mem_cgroup)
__mod_memcg_state(page->mem_cgroup, idx, val);
struct mem_cgroup *memcg = page_memcg(page);
if (memcg)
__mod_memcg_state(memcg, idx, val);
}
static inline void mod_memcg_page_state(struct page *page,
int idx, int val)
{
if (page->mem_cgroup)
mod_memcg_state(page->mem_cgroup, idx, val);
struct mem_cgroup *memcg = page_memcg(page);
if (memcg)
mod_memcg_state(memcg, idx, val);
}
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
@ -834,16 +1007,17 @@ static inline void __mod_lruvec_page_state(struct page *page,
enum node_stat_item idx, int val)
{
struct page *head = compound_head(page); /* rmap on tail pages */
struct mem_cgroup *memcg = page_memcg(head);
pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;
/* Untracked pages have no memcg, no lruvec. Update only the node */
if (!head->mem_cgroup) {
if (!memcg) {
__mod_node_page_state(pgdat, idx, val);
return;
}
lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
__mod_lruvec_state(lruvec, idx, val);
}
@ -878,8 +1052,10 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
static inline void count_memcg_page_event(struct page *page,
enum vm_event_item idx)
{
if (page->mem_cgroup)
count_memcg_events(page->mem_cgroup, idx, 1);
struct mem_cgroup *memcg = page_memcg(page);
if (memcg)
count_memcg_events(memcg, idx, 1);
}
static inline void count_memcg_event_mm(struct mm_struct *mm,
@ -948,6 +1124,27 @@ void mem_cgroup_split_huge_fixup(struct page *head);
struct mem_cgroup;
static inline struct mem_cgroup *page_memcg(struct page *page)
{
return NULL;
}
static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
{
WARN_ON_ONCE(!rcu_read_lock_held());
return NULL;
}
static inline struct mem_cgroup *page_memcg_check(struct page *page)
{
return NULL;
}
static inline bool PageMemcgKmem(struct page *page)
{
return false;
}
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
return true;
@ -1437,7 +1634,7 @@ static inline void mem_cgroup_track_foreign_dirty(struct page *page,
if (mem_cgroup_disabled())
return;
if (unlikely(&page->mem_cgroup->css != wb->memcg_css))
if (unlikely(&page_memcg(page)->css != wb->memcg_css))
mem_cgroup_track_foreign_dirty_slowpath(page, wb);
}

View File

@ -1484,28 +1484,6 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
#endif
}
#ifdef CONFIG_MEMCG
static inline struct mem_cgroup *page_memcg(struct page *page)
{
return page->mem_cgroup;
}
static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
{
WARN_ON_ONCE(!rcu_read_lock_held());
return READ_ONCE(page->mem_cgroup);
}
#else
static inline struct mem_cgroup *page_memcg(struct page *page)
{
return NULL;
}
static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
{
WARN_ON_ONCE(!rcu_read_lock_held());
return NULL;
}
#endif
/*
* Some inline functions in vmstat.h depend on page_zone()
*/

View File

@ -199,10 +199,7 @@ struct page {
atomic_t _refcount;
#ifdef CONFIG_MEMCG
union {
struct mem_cgroup *mem_cgroup;
struct obj_cgroup **obj_cgroups;
};
unsigned long memcg_data;
#endif
/*

View File

@ -355,8 +355,9 @@ enum {
NAPI_STATE_DISABLE, /* Disable pending */
NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */
NAPI_STATE_LISTED, /* NAPI added to system lists */
NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */
NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */
NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/
};
enum {
@ -367,6 +368,7 @@ enum {
NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED),
NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL),
};
enum gro_result {
@ -437,6 +439,11 @@ static inline bool napi_disable_pending(struct napi_struct *n)
return test_bit(NAPI_STATE_DISABLE, &n->state);
}
static inline bool napi_prefer_busy_poll(struct napi_struct *n)
{
return test_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
}
bool napi_schedule_prep(struct napi_struct *n);
/**

View File

@ -715,9 +715,8 @@ PAGEFLAG_FALSE(DoubleMap)
#define PAGE_MAPCOUNT_RESERVE -128
#define PG_buddy 0x00000080
#define PG_offline 0x00000100
#define PG_kmemcg 0x00000200
#define PG_table 0x00000400
#define PG_guard 0x00000800
#define PG_table 0x00000200
#define PG_guard 0x00000400
#define PageType(page, flag) \
((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
@ -768,12 +767,6 @@ PAGE_TYPE_OPS(Buddy, buddy)
*/
PAGE_TYPE_OPS(Offline, offline)
/*
* If kmemcg is enabled, the buddy allocator will set PageKmemcg() on
* pages allocated with __GFP_ACCOUNT. It gets cleared on page free.
*/
PAGE_TYPE_OPS(Kmemcg, kmemcg)
/*
* Marks pages in use as page tables.
*/

View File

@ -23,6 +23,8 @@
*/
#define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1))
#define BUSY_POLL_BUDGET 8
#ifdef CONFIG_NET_RX_BUSY_POLL
struct napi_struct;
@ -43,7 +45,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time);
void napi_busy_loop(unsigned int napi_id,
bool (*loop_end)(void *, unsigned long),
void *loop_end_arg);
void *loop_end_arg, bool prefer_busy_poll, u16 budget);
#else /* CONFIG_NET_RX_BUSY_POLL */
static inline unsigned long net_busy_loop_on(void)
@ -105,7 +107,9 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock)
unsigned int napi_id = READ_ONCE(sk->sk_napi_id);
if (napi_id >= MIN_NAPI_ID)
napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk);
napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk,
READ_ONCE(sk->sk_prefer_busy_poll),
READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET);
#endif
}
@ -131,13 +135,28 @@ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
sk_rx_queue_set(sk, skb);
}
static inline void __sk_mark_napi_id_once(struct sock *sk, unsigned int napi_id)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
if (!READ_ONCE(sk->sk_napi_id))
WRITE_ONCE(sk->sk_napi_id, napi_id);
#endif
}
/* variant used for unconnected sockets */
static inline void sk_mark_napi_id_once(struct sock *sk,
const struct sk_buff *skb)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
if (!READ_ONCE(sk->sk_napi_id))
WRITE_ONCE(sk->sk_napi_id, skb->napi_id);
__sk_mark_napi_id_once(sk, skb->napi_id);
#endif
}
static inline void sk_mark_napi_id_once_xdp(struct sock *sk,
const struct xdp_buff *xdp)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
__sk_mark_napi_id_once(sk, xdp->rxq->napi_id);
#endif
}

View File

@ -301,6 +301,8 @@ struct bpf_local_storage;
* @sk_ack_backlog: current listen backlog
* @sk_max_ack_backlog: listen backlog set in listen()
* @sk_uid: user id of owner
* @sk_prefer_busy_poll: prefer busypolling over softirq processing
* @sk_busy_poll_budget: napi processing budget when busypolling
* @sk_priority: %SO_PRIORITY setting
* @sk_type: socket type (%SOCK_STREAM, etc)
* @sk_protocol: which protocol this socket belongs in this network family
@ -479,6 +481,10 @@ struct sock {
u32 sk_ack_backlog;
u32 sk_max_ack_backlog;
kuid_t sk_uid;
#ifdef CONFIG_NET_RX_BUSY_POLL
u8 sk_prefer_busy_poll;
u16 sk_busy_poll_budget;
#endif
struct pid *sk_peer_pid;
const struct cred *sk_peer_cred;
long sk_rcvtimeo;

View File

@ -410,6 +410,7 @@ void tcp_syn_ack_timeout(const struct request_sock *req);
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
int flags, int *addr_len);
int tcp_set_rcvlowat(struct sock *sk, int val);
int tcp_set_window_clamp(struct sock *sk, int val);
void tcp_data_ready(struct sock *sk);
#ifdef CONFIG_MMU
int tcp_mmap(struct file *file, struct socket *sock,

View File

@ -59,6 +59,7 @@ struct xdp_rxq_info {
u32 queue_index;
u32 reg_state;
struct xdp_mem_info mem;
unsigned int napi_id;
} ____cacheline_aligned; /* perf critical, avoid false-sharing */
struct xdp_txq_info {
@ -226,7 +227,7 @@ static inline void xdp_release_frame(struct xdp_frame *xdpf)
}
int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
struct net_device *dev, u32 queue_index);
struct net_device *dev, u32 queue_index, unsigned int napi_id);
void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq);
bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq);

View File

@ -13,6 +13,7 @@
void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries);
bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc);
u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc, u32 max);
void xsk_tx_release(struct xsk_buff_pool *pool);
struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
u16 queue_id);
@ -128,6 +129,12 @@ static inline bool xsk_tx_peek_desc(struct xsk_buff_pool *pool,
return false;
}
static inline u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc,
u32 max)
{
return 0;
}
static inline void xsk_tx_release(struct xsk_buff_pool *pool)
{
}

View File

@ -257,7 +257,7 @@ TRACE_EVENT(track_foreign_dirty,
__entry->ino = inode ? inode->i_ino : 0;
__entry->memcg_id = wb->memcg_css->id;
__entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
__entry->page_cgroup_ino = cgroup_ino(page->mem_cgroup->css.cgroup);
__entry->page_cgroup_ino = cgroup_ino(page_memcg(page)->css.cgroup);
),
TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu",

View File

@ -119,6 +119,9 @@
#define SO_DETACH_REUSEPORT_BPF 68
#define SO_PREFER_BUSY_POLL 69
#define SO_BUSY_POLL_BUDGET 70
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))

View File

@ -557,7 +557,12 @@ union bpf_attr {
__aligned_u64 line_info; /* line info */
__u32 line_info_cnt; /* number of bpf_line_info records */
__u32 attach_btf_id; /* in-kernel BTF type id to attach to */
__u32 attach_prog_fd; /* 0 to attach to vmlinux */
union {
/* valid prog_fd to attach to bpf prog */
__u32 attach_prog_fd;
/* or valid module BTF object fd or 0 to attach to vmlinux */
__u32 attach_btf_obj_fd;
};
};
struct { /* anonymous struct used by BPF_OBJ_* commands */
@ -3787,6 +3792,36 @@ union bpf_attr {
* *ARG_PTR_TO_BTF_ID* of type *task_struct*.
* Return
* Pointer to the current task.
*
* long bpf_bprm_opts_set(struct linux_binprm *bprm, u64 flags)
* Description
* Set or clear certain options on *bprm*:
*
* **BPF_F_BPRM_SECUREEXEC** Set the secureexec bit
* which sets the **AT_SECURE** auxv for glibc. The bit
* is cleared if the flag is not specified.
* Return
* **-EINVAL** if invalid *flags* are passed, zero otherwise.
*
* u64 bpf_ktime_get_coarse_ns(void)
* Description
* Return a coarse-grained version of the time elapsed since
* system boot, in nanoseconds. Does not include time the system
* was suspended.
*
* See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**)
* Return
* Current *ktime*.
*
* long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size)
* Description
* Returns the stored IMA hash of the *inode* (if it's avaialable).
* If the hash is larger than *size*, then only *size*
* bytes will be copied to *dst*
* Return
* The **hash_algo** is returned on success,
* **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if
* invalid arguments are passed.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@ -3948,6 +3983,9 @@ union bpf_attr {
FN(task_storage_get), \
FN(task_storage_delete), \
FN(get_current_task_btf), \
FN(bprm_opts_set), \
FN(ktime_get_coarse_ns), \
FN(ima_inode_hash), \
/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
@ -4119,6 +4157,11 @@ enum bpf_lwt_encap_mode {
BPF_LWT_ENCAP_IP,
};
/* Flags for bpf_bprm_opts_set helper */
enum {
BPF_F_BPRM_SECUREEXEC = (1ULL << 0),
};
#define __bpf_md_ptr(type, name) \
union { \
type name; \

View File

@ -34,7 +34,7 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
int i;
for (i = 0; i < array->map.max_entries; i++) {
ptr = __alloc_percpu_gfp(array->elem_size, 8,
ptr = bpf_map_alloc_percpu(&array->map, array->elem_size, 8,
GFP_USER | __GFP_NOWARN);
if (!ptr) {
bpf_array_free_percpu(array);
@ -81,11 +81,10 @@ int array_map_alloc_check(union bpf_attr *attr)
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
int ret, numa_node = bpf_map_attr_numa_node(attr);
int numa_node = bpf_map_attr_numa_node(attr);
u32 elem_size, index_mask, max_entries;
bool bypass_spec_v1 = bpf_bypass_spec_v1();
u64 cost, array_size, mask64;
struct bpf_map_memory mem;
u64 array_size, mask64;
struct bpf_array *array;
elem_size = round_up(attr->value_size, 8);
@ -126,44 +125,29 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
}
}
/* make sure there is no u32 overflow later in round_up() */
cost = array_size;
if (percpu)
cost += (u64)attr->max_entries * elem_size * num_possible_cpus();
ret = bpf_map_charge_init(&mem, cost);
if (ret < 0)
return ERR_PTR(ret);
/* allocate all map elements and zero-initialize them */
if (attr->map_flags & BPF_F_MMAPABLE) {
void *data;
/* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */
data = bpf_map_area_mmapable_alloc(array_size, numa_node);
if (!data) {
bpf_map_charge_finish(&mem);
if (!data)
return ERR_PTR(-ENOMEM);
}
array = data + PAGE_ALIGN(sizeof(struct bpf_array))
- offsetof(struct bpf_array, value);
} else {
array = bpf_map_area_alloc(array_size, numa_node);
}
if (!array) {
bpf_map_charge_finish(&mem);
if (!array)
return ERR_PTR(-ENOMEM);
}
array->index_mask = index_mask;
array->map.bypass_spec_v1 = bypass_spec_v1;
/* copy mandatory map attributes */
bpf_map_init_from_attr(&array->map, attr);
bpf_map_charge_move(&array->map.memory, &mem);
array->elem_size = elem_size;
if (percpu && bpf_array_alloc_percpu(array)) {
bpf_map_charge_finish(&array->map.memory);
bpf_map_area_free(array);
return ERR_PTR(-ENOMEM);
}
@ -1018,7 +1002,7 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
struct bpf_array_aux *aux;
struct bpf_map *map;
aux = kzalloc(sizeof(*aux), GFP_KERNEL);
aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT);
if (!aux)
return ERR_PTR(-ENOMEM);

View File

@ -67,7 +67,8 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
if (charge_mem && mem_charge(smap, owner, smap->elem_size))
return NULL;
selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN);
selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
GFP_ATOMIC | __GFP_NOWARN);
if (selem) {
if (value)
memcpy(SDATA(selem)->data, value, smap->map.value_size);
@ -264,7 +265,8 @@ int bpf_local_storage_alloc(void *owner,
if (err)
return err;
storage = kzalloc(sizeof(*storage), GFP_ATOMIC | __GFP_NOWARN);
storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
GFP_ATOMIC | __GFP_NOWARN);
if (!storage) {
err = -ENOMEM;
goto uncharge;
@ -543,10 +545,8 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
struct bpf_local_storage_map *smap;
unsigned int i;
u32 nbuckets;
u64 cost;
int ret;
smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN);
smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
if (!smap)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&smap->map, attr);
@ -555,18 +555,10 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
nbuckets = max_t(u32, 2, nbuckets);
smap->bucket_log = ilog2(nbuckets);
cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
ret = bpf_map_charge_init(&smap->map.memory, cost);
if (ret < 0) {
kfree(smap);
return ERR_PTR(ret);
}
smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
GFP_USER | __GFP_NOWARN);
GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
if (!smap->buckets) {
bpf_map_charge_finish(&smap->map.memory);
kfree(smap);
return ERR_PTR(-ENOMEM);
}

View File

@ -7,6 +7,7 @@
#include <linux/filter.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/binfmts.h>
#include <linux/lsm_hooks.h>
#include <linux/bpf_lsm.h>
#include <linux/kallsyms.h>
@ -14,6 +15,7 @@
#include <net/bpf_sk_storage.h>
#include <linux/bpf_local_storage.h>
#include <linux/btf_ids.h>
#include <linux/ima.h>
/* For every LSM hook that allows attachment of BPF programs, declare a nop
* function where a BPF program can be attached.
@ -51,6 +53,52 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
return 0;
}
/* Mask for all the currently supported BPRM option flags */
#define BPF_F_BRPM_OPTS_MASK BPF_F_BPRM_SECUREEXEC
BPF_CALL_2(bpf_bprm_opts_set, struct linux_binprm *, bprm, u64, flags)
{
if (flags & ~BPF_F_BRPM_OPTS_MASK)
return -EINVAL;
bprm->secureexec = (flags & BPF_F_BPRM_SECUREEXEC);
return 0;
}
BTF_ID_LIST_SINGLE(bpf_bprm_opts_set_btf_ids, struct, linux_binprm)
const static struct bpf_func_proto bpf_bprm_opts_set_proto = {
.func = bpf_bprm_opts_set,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &bpf_bprm_opts_set_btf_ids[0],
.arg2_type = ARG_ANYTHING,
};
BPF_CALL_3(bpf_ima_inode_hash, struct inode *, inode, void *, dst, u32, size)
{
return ima_inode_hash(inode, dst, size);
}
static bool bpf_ima_inode_hash_allowed(const struct bpf_prog *prog)
{
return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id);
}
BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode)
const static struct bpf_func_proto bpf_ima_inode_hash_proto = {
.func = bpf_ima_inode_hash,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &bpf_ima_inode_hash_btf_ids[0],
.arg2_type = ARG_PTR_TO_UNINIT_MEM,
.arg3_type = ARG_CONST_SIZE,
.allowed = bpf_ima_inode_hash_allowed,
};
static const struct bpf_func_proto *
bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@ -71,6 +119,10 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_task_storage_get_proto;
case BPF_FUNC_task_storage_delete:
return &bpf_task_storage_delete_proto;
case BPF_FUNC_bprm_opts_set:
return &bpf_bprm_opts_set_proto;
case BPF_FUNC_ima_inode_hash:
return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL;
default:
return tracing_prog_func_proto(func_id, prog);
}

View File

@ -548,12 +548,10 @@ static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
{
const struct bpf_struct_ops *st_ops;
size_t map_total_size, st_map_size;
size_t st_map_size;
struct bpf_struct_ops_map *st_map;
const struct btf_type *t, *vt;
struct bpf_map_memory mem;
struct bpf_map *map;
int err;
if (!bpf_capable())
return ERR_PTR(-EPERM);
@ -573,20 +571,11 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
* struct bpf_struct_ops_tcp_congestions_ops
*/
(vt->size - sizeof(struct bpf_struct_ops_value));
map_total_size = st_map_size +
/* uvalue */
sizeof(vt->size) +
/* struct bpf_progs **progs */
btf_type_vlen(t) * sizeof(struct bpf_prog *);
err = bpf_map_charge_init(&mem, map_total_size);
if (err < 0)
return ERR_PTR(err);
st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
if (!st_map) {
bpf_map_charge_finish(&mem);
if (!st_map)
return ERR_PTR(-ENOMEM);
}
st_map->st_ops = st_ops;
map = &st_map->map;
@ -597,14 +586,12 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
if (!st_map->uvalue || !st_map->progs || !st_map->image) {
bpf_struct_ops_map_free(map);
bpf_map_charge_finish(&mem);
return ERR_PTR(-ENOMEM);
}
mutex_init(&st_map->lock);
set_vm_flush_reset_perms(st_map->image);
bpf_map_init_from_attr(map, attr);
bpf_map_charge_move(&map->memory, &mem);
return map;
}

View File

@ -1524,6 +1524,11 @@ static void btf_free_rcu(struct rcu_head *rcu)
btf_free(btf);
}
void btf_get(struct btf *btf)
{
refcount_inc(&btf->refcnt);
}
void btf_put(struct btf *btf)
{
if (btf && refcount_dec_and_test(&btf->refcnt)) {
@ -4555,11 +4560,10 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
{
struct bpf_prog *tgt_prog = prog->aux->dst_prog;
if (tgt_prog) {
if (tgt_prog)
return tgt_prog->aux->btf;
} else {
return btf_vmlinux;
}
else
return prog->aux->attach_btf;
}
static bool is_string_ptr(struct btf *btf, const struct btf_type *t)
@ -4700,6 +4704,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (ctx_arg_info->offset == off) {
info->reg_type = ctx_arg_info->reg_type;
info->btf = btf_vmlinux;
info->btf_id = ctx_arg_info->btf_id;
return true;
}
@ -4716,6 +4721,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
ret = btf_translate_to_vmlinux(log, btf, t, tgt_type, arg);
if (ret > 0) {
info->btf = btf_vmlinux;
info->btf_id = ret;
return true;
} else {
@ -4723,6 +4729,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
}
}
info->btf = btf;
info->btf_id = t->type;
t = btf_type_by_id(btf, t->type);
/* skip modifiers */
@ -4749,7 +4756,7 @@ enum bpf_struct_walk_result {
WALK_STRUCT,
};
static int btf_struct_walk(struct bpf_verifier_log *log,
static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, int off, int size,
u32 *next_btf_id)
{
@ -4760,7 +4767,7 @@ static int btf_struct_walk(struct bpf_verifier_log *log,
u32 vlen, elem_id, mid;
again:
tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
tname = __btf_name_by_offset(btf, t->name_off);
if (!btf_type_is_struct(t)) {
bpf_log(log, "Type '%s' is not a struct\n", tname);
return -EINVAL;
@ -4777,7 +4784,7 @@ again:
goto error;
member = btf_type_member(t) + vlen - 1;
mtype = btf_type_skip_modifiers(btf_vmlinux, member->type,
mtype = btf_type_skip_modifiers(btf, member->type,
NULL);
if (!btf_type_is_array(mtype))
goto error;
@ -4793,7 +4800,7 @@ again:
/* Only allow structure for now, can be relaxed for
* other types later.
*/
t = btf_type_skip_modifiers(btf_vmlinux, array_elem->type,
t = btf_type_skip_modifiers(btf, array_elem->type,
NULL);
if (!btf_type_is_struct(t))
goto error;
@ -4851,10 +4858,10 @@ error:
/* type of the field */
mid = member->type;
mtype = btf_type_by_id(btf_vmlinux, member->type);
mname = __btf_name_by_offset(btf_vmlinux, member->name_off);
mtype = btf_type_by_id(btf, member->type);
mname = __btf_name_by_offset(btf, member->name_off);
mtype = __btf_resolve_size(btf_vmlinux, mtype, &msize,
mtype = __btf_resolve_size(btf, mtype, &msize,
&elem_type, &elem_id, &total_nelems,
&mid);
if (IS_ERR(mtype)) {
@ -4949,7 +4956,7 @@ error:
mname, moff, tname, off, size);
return -EACCES;
}
stype = btf_type_skip_modifiers(btf_vmlinux, mtype->type, &id);
stype = btf_type_skip_modifiers(btf, mtype->type, &id);
if (btf_type_is_struct(stype)) {
*next_btf_id = id;
return WALK_PTR;
@ -4975,7 +4982,7 @@ error:
return -EINVAL;
}
int btf_struct_access(struct bpf_verifier_log *log,
int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, int off, int size,
enum bpf_access_type atype __maybe_unused,
u32 *next_btf_id)
@ -4984,7 +4991,7 @@ int btf_struct_access(struct bpf_verifier_log *log,
u32 id;
do {
err = btf_struct_walk(log, t, off, size, &id);
err = btf_struct_walk(log, btf, t, off, size, &id);
switch (err) {
case WALK_PTR:
@ -5000,7 +5007,7 @@ int btf_struct_access(struct bpf_verifier_log *log,
* by diving in it. At this point the offset is
* aligned with the new type, so set it to 0.
*/
t = btf_type_by_id(btf_vmlinux, id);
t = btf_type_by_id(btf, id);
off = 0;
break;
default:
@ -5016,21 +5023,37 @@ int btf_struct_access(struct bpf_verifier_log *log,
return -EINVAL;
}
/* Check that two BTF types, each specified as an BTF object + id, are exactly
* the same. Trivial ID check is not enough due to module BTFs, because we can
* end up with two different module BTFs, but IDs point to the common type in
* vmlinux BTF.
*/
static bool btf_types_are_same(const struct btf *btf1, u32 id1,
const struct btf *btf2, u32 id2)
{
if (id1 != id2)
return false;
if (btf1 == btf2)
return true;
return btf_type_by_id(btf1, id1) == btf_type_by_id(btf2, id2);
}
bool btf_struct_ids_match(struct bpf_verifier_log *log,
int off, u32 id, u32 need_type_id)
const struct btf *btf, u32 id, int off,
const struct btf *need_btf, u32 need_type_id)
{
const struct btf_type *type;
int err;
/* Are we already done? */
if (need_type_id == id && off == 0)
if (off == 0 && btf_types_are_same(btf, id, need_btf, need_type_id))
return true;
again:
type = btf_type_by_id(btf_vmlinux, id);
type = btf_type_by_id(btf, id);
if (!type)
return false;
err = btf_struct_walk(log, type, off, 1, &id);
err = btf_struct_walk(log, btf, type, off, 1, &id);
if (err != WALK_STRUCT)
return false;
@ -5039,7 +5062,7 @@ again:
* continue the search with offset 0 in the new
* type.
*/
if (need_type_id != id) {
if (!btf_types_are_same(btf, id, need_btf, need_type_id)) {
off = 0;
goto again;
}
@ -5710,11 +5733,16 @@ int btf_get_fd_by_id(u32 id)
return fd;
}
u32 btf_id(const struct btf *btf)
u32 btf_obj_id(const struct btf *btf)
{
return btf->id;
}
bool btf_is_kernel(const struct btf *btf)
{
return btf->kernel_btf;
}
static int btf_id_cmp_func(const void *a, const void *b)
{
const int *pa = a, *pb = b;

View File

@ -77,7 +77,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
struct bpf_prog_aux *aux;
struct bpf_prog *fp;
@ -86,7 +86,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
if (fp == NULL)
return NULL;
aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT | gfp_extra_flags);
if (aux == NULL) {
vfree(fp);
return NULL;
@ -106,7 +106,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
{
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
struct bpf_prog *prog;
int cpu;
@ -138,7 +138,7 @@ int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo,
sizeof(*prog->aux->jited_linfo),
GFP_KERNEL | __GFP_NOWARN);
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (!prog->aux->jited_linfo)
return -ENOMEM;
@ -219,25 +219,17 @@ void bpf_prog_free_linfo(struct bpf_prog *prog)
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
gfp_t gfp_extra_flags)
{
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
struct bpf_prog *fp;
u32 pages, delta;
int ret;
u32 pages;
size = round_up(size, PAGE_SIZE);
pages = size / PAGE_SIZE;
if (pages <= fp_old->pages)
return fp_old;
delta = pages - fp_old->pages;
ret = __bpf_prog_charge(fp_old->aux->user, delta);
if (ret)
return NULL;
fp = __vmalloc(size, gfp_flags);
if (fp == NULL) {
__bpf_prog_uncharge(fp_old->aux->user, delta);
} else {
if (fp) {
memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
fp->pages = pages;
fp->aux->prog = fp;
@ -2211,6 +2203,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak;
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;

View File

@ -84,8 +84,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
u32 value_size = attr->value_size;
struct bpf_cpu_map *cmap;
int err = -ENOMEM;
u64 cost;
int ret;
if (!bpf_capable())
return ERR_PTR(-EPERM);
@ -97,7 +95,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
attr->map_flags & ~BPF_F_NUMA_NODE)
return ERR_PTR(-EINVAL);
cmap = kzalloc(sizeof(*cmap), GFP_USER);
cmap = kzalloc(sizeof(*cmap), GFP_USER | __GFP_ACCOUNT);
if (!cmap)
return ERR_PTR(-ENOMEM);
@ -109,26 +107,14 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
goto free_cmap;
}
/* make sure page count doesn't overflow */
cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
/* Notice returns -EPERM on if map size is larger than memlock limit */
ret = bpf_map_charge_init(&cmap->map.memory, cost);
if (ret) {
err = ret;
goto free_cmap;
}
/* Alloc array for possible remote "destination" CPUs */
cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
sizeof(struct bpf_cpu_map_entry *),
cmap->map.numa_node);
if (!cmap->cpu_map)
goto free_charge;
goto free_cmap;
return &cmap->map;
free_charge:
bpf_map_charge_finish(&cmap->map.memory);
free_cmap:
kfree(cmap);
return ERR_PTR(err);
@ -412,7 +398,8 @@ static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
}
static struct bpf_cpu_map_entry *
__cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id)
__cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
u32 cpu)
{
int numa, err, i, fd = value->bpf_prog.fd;
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
@ -422,12 +409,12 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id)
/* Have map->numa_node, but choose node of redirect target CPU */
numa = cpu_to_node(cpu);
rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa);
rcpu = bpf_map_kmalloc_node(map, sizeof(*rcpu), gfp | __GFP_ZERO, numa);
if (!rcpu)
return NULL;
/* Alloc percpu bulkq */
rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq),
rcpu->bulkq = bpf_map_alloc_percpu(map, sizeof(*rcpu->bulkq),
sizeof(void *), gfp);
if (!rcpu->bulkq)
goto free_rcu;
@ -438,7 +425,8 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id)
}
/* Alloc queue */
rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
rcpu->queue = bpf_map_kmalloc_node(map, sizeof(*rcpu->queue), gfp,
numa);
if (!rcpu->queue)
goto free_bulkq;
@ -447,7 +435,7 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id)
goto free_queue;
rcpu->cpu = cpu;
rcpu->map_id = map_id;
rcpu->map_id = map->id;
rcpu->value.qsize = value->qsize;
if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd))
@ -455,7 +443,8 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id)
/* Setup kthread */
rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
"cpumap/%d/map:%d", cpu, map_id);
"cpumap/%d/map:%d", cpu,
map->id);
if (IS_ERR(rcpu->kthread))
goto free_prog;
@ -571,7 +560,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
rcpu = NULL; /* Same as deleting */
} else {
/* Updating qsize cause re-allocation of bpf_cpu_map_entry */
rcpu = __cpu_map_entry_alloc(&cpumap_value, key_cpu, map->id);
rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu);
if (!rcpu)
return -ENOMEM;
rcpu->cmap = cmap;

View File

@ -109,8 +109,6 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
{
u32 valsize = attr->value_size;
u64 cost = 0;
int err;
/* check sanity of attributes. 2 value sizes supported:
* 4 bytes: ifindex
@ -135,21 +133,13 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
if (!dtab->n_buckets) /* Overflow check */
return -EINVAL;
cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets;
} else {
cost += (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
}
/* if map size is larger than memlock limit, reject it */
err = bpf_map_charge_init(&dtab->map.memory, cost);
if (err)
return -EINVAL;
if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
dtab->map.numa_node);
if (!dtab->dev_index_head)
goto free_charge;
return -ENOMEM;
spin_lock_init(&dtab->index_lock);
} else {
@ -157,14 +147,10 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
sizeof(struct bpf_dtab_netdev *),
dtab->map.numa_node);
if (!dtab->netdev_map)
goto free_charge;
return -ENOMEM;
}
return 0;
free_charge:
bpf_map_charge_finish(&dtab->map.memory);
return -ENOMEM;
}
static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
@ -175,7 +161,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
dtab = kzalloc(sizeof(*dtab), GFP_USER);
dtab = kzalloc(sizeof(*dtab), GFP_USER | __GFP_ACCOUNT);
if (!dtab)
return ERR_PTR(-ENOMEM);
@ -602,7 +588,8 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
struct bpf_prog *prog = NULL;
struct bpf_dtab_netdev *dev;
dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev),
GFP_ATOMIC | __GFP_NOWARN,
dtab->map.numa_node);
if (!dev)
return ERR_PTR(-ENOMEM);

View File

@ -292,7 +292,8 @@ static int prealloc_init(struct bpf_htab *htab)
u32 size = round_up(htab->map.value_size, 8);
void __percpu *pptr;
pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
GFP_USER | __GFP_NOWARN);
if (!pptr)
goto free_elems;
htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
@ -346,7 +347,7 @@ static int alloc_extra_elems(struct bpf_htab *htab)
struct pcpu_freelist_node *l;
int cpu;
pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8,
pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8,
GFP_USER | __GFP_NOWARN);
if (!pptr)
return -ENOMEM;
@ -442,9 +443,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
struct bpf_htab *htab;
int err, i;
u64 cost;
htab = kzalloc(sizeof(*htab), GFP_USER);
htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT);
if (!htab)
return ERR_PTR(-ENOMEM);
@ -480,30 +480,18 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
htab->n_buckets > U32_MAX / sizeof(struct bucket))
goto free_htab;
cost = (u64) htab->n_buckets * sizeof(struct bucket) +
(u64) htab->elem_size * htab->map.max_entries;
if (percpu)
cost += (u64) round_up(htab->map.value_size, 8) *
num_possible_cpus() * htab->map.max_entries;
else
cost += (u64) htab->elem_size * num_possible_cpus();
/* if map size is larger than memlock limit, reject it */
err = bpf_map_charge_init(&htab->map.memory, cost);
if (err)
goto free_htab;
err = -ENOMEM;
htab->buckets = bpf_map_area_alloc(htab->n_buckets *
sizeof(struct bucket),
htab->map.numa_node);
if (!htab->buckets)
goto free_charge;
goto free_htab;
for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
htab->map_locked[i] = __alloc_percpu_gfp(sizeof(int),
sizeof(int), GFP_USER);
htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
sizeof(int),
sizeof(int),
GFP_USER);
if (!htab->map_locked[i])
goto free_map_locked;
}
@ -538,8 +526,6 @@ free_map_locked:
for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
free_percpu(htab->map_locked[i]);
bpf_map_area_free(htab->buckets);
free_charge:
bpf_map_charge_finish(&htab->map.memory);
free_htab:
lockdep_unregister_key(&htab->lockdep_key);
kfree(htab);
@ -925,7 +911,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new = ERR_PTR(-E2BIG);
goto dec_count;
}
l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size,
GFP_ATOMIC | __GFP_NOWARN,
htab->map.numa_node);
if (!l_new) {
l_new = ERR_PTR(-ENOMEM);
@ -942,7 +929,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
/* alloc_percpu zero-fills */
pptr = __alloc_percpu_gfp(size, 8,
pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
GFP_ATOMIC | __GFP_NOWARN);
if (!pptr) {
kfree(l_new);

View File

@ -167,6 +167,17 @@ const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = {
.ret_type = RET_INTEGER,
};
BPF_CALL_0(bpf_ktime_get_coarse_ns)
{
return ktime_get_coarse_ns();
}
const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = {
.func = bpf_ktime_get_coarse_ns,
.gpl_only = false,
.ret_type = RET_INTEGER,
};
BPF_CALL_0(bpf_get_current_pid_tgid)
{
struct task_struct *task = current;
@ -685,6 +696,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ktime_get_ns_proto;
case BPF_FUNC_ktime_get_boot_ns:
return &bpf_ktime_get_boot_ns_proto;
case BPF_FUNC_ktime_get_coarse_ns:
return &bpf_ktime_get_coarse_ns_proto;
case BPF_FUNC_ringbuf_output:
return &bpf_ringbuf_output_proto;
case BPF_FUNC_ringbuf_reserve:

View File

@ -164,7 +164,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
return 0;
}
new = kmalloc_node(sizeof(struct bpf_storage_buffer) +
new = bpf_map_kmalloc_node(map, sizeof(struct bpf_storage_buffer) +
map->value_size,
__GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
map->numa_node);
@ -287,8 +287,6 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_cgroup_storage_map *map;
struct bpf_map_memory mem;
int ret;
if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) &&
attr->key_size != sizeof(__u64))
@ -308,18 +306,10 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
/* max_entries is not used and enforced to be 0 */
return ERR_PTR(-EINVAL);
ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map));
if (ret < 0)
return ERR_PTR(ret);
map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map),
__GFP_ZERO | GFP_USER, numa_node);
if (!map) {
bpf_map_charge_finish(&mem);
__GFP_ZERO | GFP_USER | __GFP_ACCOUNT, numa_node);
if (!map)
return ERR_PTR(-ENOMEM);
}
bpf_map_charge_move(&map->map.memory, &mem);
/* copy mandatory map attributes */
bpf_map_init_from_attr(&map->map, attr);
@ -496,9 +486,9 @@ static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages)
struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
enum bpf_cgroup_storage_type stype)
{
const gfp_t gfp = __GFP_ZERO | GFP_USER;
struct bpf_cgroup_storage *storage;
struct bpf_map *map;
gfp_t flags;
size_t size;
u32 pages;
@ -508,23 +498,19 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
size = bpf_cgroup_storage_calculate_size(map, &pages);
if (bpf_map_charge_memlock(map, pages))
return ERR_PTR(-EPERM);
storage = kmalloc_node(sizeof(struct bpf_cgroup_storage),
__GFP_ZERO | GFP_USER, map->numa_node);
storage = bpf_map_kmalloc_node(map, sizeof(struct bpf_cgroup_storage),
gfp, map->numa_node);
if (!storage)
goto enomem;
flags = __GFP_ZERO | GFP_USER;
if (stype == BPF_CGROUP_STORAGE_SHARED) {
storage->buf = kmalloc_node(size, flags, map->numa_node);
storage->buf = bpf_map_kmalloc_node(map, size, gfp,
map->numa_node);
if (!storage->buf)
goto enomem;
check_and_init_map_lock(map, storage->buf->data);
} else {
storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags);
storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp);
if (!storage->percpu_buf)
goto enomem;
}
@ -534,7 +520,6 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
return storage;
enomem:
bpf_map_uncharge_memlock(map, pages);
kfree(storage);
return ERR_PTR(-ENOMEM);
}
@ -561,16 +546,11 @@ void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage)
{
enum bpf_cgroup_storage_type stype;
struct bpf_map *map;
u32 pages;
if (!storage)
return;
map = &storage->map->map;
bpf_cgroup_storage_calculate_size(map, &pages);
bpf_map_uncharge_memlock(map, pages);
stype = cgroup_storage_type(map);
if (stype == BPF_CGROUP_STORAGE_SHARED)
call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu);

View File

@ -282,7 +282,7 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
if (value)
size += trie->map.value_size;
node = kmalloc_node(size, GFP_ATOMIC | __GFP_NOWARN,
node = bpf_map_kmalloc_node(&trie->map, size, GFP_ATOMIC | __GFP_NOWARN,
trie->map.numa_node);
if (!node)
return NULL;
@ -540,8 +540,6 @@ out:
static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
struct lpm_trie *trie;
u64 cost = sizeof(*trie), cost_per_node;
int ret;
if (!bpf_capable())
return ERR_PTR(-EPERM);
@ -557,7 +555,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
attr->value_size > LPM_VAL_SIZE_MAX)
return ERR_PTR(-EINVAL);
trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN);
trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
if (!trie)
return ERR_PTR(-ENOMEM);
@ -567,20 +565,9 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
offsetof(struct bpf_lpm_trie_key, data);
trie->max_prefixlen = trie->data_size * 8;
cost_per_node = sizeof(struct lpm_trie_node) +
attr->value_size + trie->data_size;
cost += (u64) attr->max_entries * cost_per_node;
ret = bpf_map_charge_init(&trie->map.memory, cost);
if (ret)
goto out_err;
spin_lock_init(&trie->lock);
return &trie->map;
out_err:
kfree(trie);
return ERR_PTR(ret);
}
static void trie_free(struct bpf_map *map)

View File

@ -66,29 +66,21 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr)
static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
{
int ret, numa_node = bpf_map_attr_numa_node(attr);
struct bpf_map_memory mem = {0};
int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_queue_stack *qs;
u64 size, queue_size, cost;
u64 size, queue_size;
size = (u64) attr->max_entries + 1;
cost = queue_size = sizeof(*qs) + size * attr->value_size;
ret = bpf_map_charge_init(&mem, cost);
if (ret < 0)
return ERR_PTR(ret);
queue_size = sizeof(*qs) + size * attr->value_size;
qs = bpf_map_area_alloc(queue_size, numa_node);
if (!qs) {
bpf_map_charge_finish(&mem);
if (!qs)
return ERR_PTR(-ENOMEM);
}
memset(qs, 0, sizeof(*qs));
bpf_map_init_from_attr(&qs->map, attr);
bpf_map_charge_move(&qs->map.memory, &mem);
qs->size = size;
raw_spin_lock_init(&qs->lock);

View File

@ -150,9 +150,8 @@ static void reuseport_array_free(struct bpf_map *map)
static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
{
int err, numa_node = bpf_map_attr_numa_node(attr);
int numa_node = bpf_map_attr_numa_node(attr);
struct reuseport_array *array;
struct bpf_map_memory mem;
u64 array_size;
if (!bpf_capable())
@ -161,20 +160,13 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
array_size = sizeof(*array);
array_size += (u64)attr->max_entries * sizeof(struct sock *);
err = bpf_map_charge_init(&mem, array_size);
if (err)
return ERR_PTR(err);
/* allocate all map elements and zero-initialize them */
array = bpf_map_area_alloc(array_size, numa_node);
if (!array) {
bpf_map_charge_finish(&mem);
if (!array)
return ERR_PTR(-ENOMEM);
}
/* copy mandatory map attributes */
bpf_map_init_from_attr(&array->map, attr);
bpf_map_charge_move(&array->map.memory, &mem);
return &array->map;
}

View File

@ -48,7 +48,6 @@ struct bpf_ringbuf {
struct bpf_ringbuf_map {
struct bpf_map map;
struct bpf_map_memory memory;
struct bpf_ringbuf *rb;
};
@ -60,8 +59,8 @@ struct bpf_ringbuf_hdr {
static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
{
const gfp_t flags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN |
__GFP_ZERO;
const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL |
__GFP_NOWARN | __GFP_ZERO;
int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES;
int nr_data_pages = data_sz >> PAGE_SHIFT;
int nr_pages = nr_meta_pages + nr_data_pages;
@ -88,10 +87,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
* user-space implementations significantly.
*/
array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages);
if (array_size > PAGE_SIZE)
pages = vmalloc_node(array_size, numa_node);
else
pages = kmalloc_node(array_size, flags, numa_node);
pages = bpf_map_area_alloc(array_size, numa_node);
if (!pages)
return NULL;
@ -134,7 +130,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
rb = bpf_ringbuf_area_alloc(data_sz, numa_node);
if (!rb)
return ERR_PTR(-ENOMEM);
return NULL;
spin_lock_init(&rb->spinlock);
init_waitqueue_head(&rb->waitq);
@ -150,8 +146,6 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
{
struct bpf_ringbuf_map *rb_map;
u64 cost;
int err;
if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
@ -167,32 +161,19 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
return ERR_PTR(-E2BIG);
#endif
rb_map = kzalloc(sizeof(*rb_map), GFP_USER);
rb_map = kzalloc(sizeof(*rb_map), GFP_USER | __GFP_ACCOUNT);
if (!rb_map)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&rb_map->map, attr);
cost = sizeof(struct bpf_ringbuf_map) +
sizeof(struct bpf_ringbuf) +
attr->max_entries;
err = bpf_map_charge_init(&rb_map->map.memory, cost);
if (err)
goto err_free_map;
rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
if (IS_ERR(rb_map->rb)) {
err = PTR_ERR(rb_map->rb);
goto err_uncharge;
if (!rb_map->rb) {
kfree(rb_map);
return ERR_PTR(-ENOMEM);
}
return &rb_map->map;
err_uncharge:
bpf_map_charge_finish(&rb_map->map.memory);
err_free_map:
kfree(rb_map);
return ERR_PTR(err);
}
static void bpf_ringbuf_free(struct bpf_ringbuf *rb)

View File

@ -90,7 +90,6 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
{
u32 value_size = attr->value_size;
struct bpf_stack_map *smap;
struct bpf_map_memory mem;
u64 cost, n_buckets;
int err;
@ -119,15 +118,9 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
err = bpf_map_charge_init(&mem, cost);
if (err)
return ERR_PTR(err);
smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
if (!smap) {
bpf_map_charge_finish(&mem);
if (!smap)
return ERR_PTR(-ENOMEM);
}
bpf_map_init_from_attr(&smap->map, attr);
smap->map.value_size = value_size;
@ -135,20 +128,17 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
err = get_callchain_buffers(sysctl_perf_event_max_stack);
if (err)
goto free_charge;
goto free_smap;
err = prealloc_elems_and_freelist(smap);
if (err)
goto put_buffers;
bpf_map_charge_move(&smap->map.memory, &mem);
return &smap->map;
put_buffers:
put_callchain_buffers();
free_charge:
bpf_map_charge_finish(&mem);
free_smap:
bpf_map_area_free(smap);
return ERR_PTR(err);
}

View File

@ -31,6 +31,7 @@
#include <linux/poll.h>
#include <linux/bpf-netns.h>
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@ -127,7 +128,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
return map;
}
static u32 bpf_map_value_size(struct bpf_map *map)
static u32 bpf_map_value_size(const struct bpf_map *map)
{
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
@ -267,6 +268,10 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
return err;
}
/* Please, do not use this function outside from the map creation path
* (e.g. in map update path) without taking care of setting the active
* memory cgroup (see at bpf_map_kmalloc_node() for example).
*/
static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
{
/* We really just want to fail instead of triggering OOM killer
@ -279,7 +284,7 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
* __GFP_RETRY_MAYFAIL to avoid such situations.
*/
const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO;
const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_ACCOUNT;
unsigned int flags = 0;
unsigned long align = 1;
void *area;
@ -341,77 +346,6 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
map->numa_node = bpf_map_attr_numa_node(attr);
}
static int bpf_charge_memlock(struct user_struct *user, u32 pages)
{
unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
if (atomic_long_add_return(pages, &user->locked_vm) > memlock_limit) {
atomic_long_sub(pages, &user->locked_vm);
return -EPERM;
}
return 0;
}
static void bpf_uncharge_memlock(struct user_struct *user, u32 pages)
{
if (user)
atomic_long_sub(pages, &user->locked_vm);
}
int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size)
{
u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT;
struct user_struct *user;
int ret;
if (size >= U32_MAX - PAGE_SIZE)
return -E2BIG;
user = get_current_user();
ret = bpf_charge_memlock(user, pages);
if (ret) {
free_uid(user);
return ret;
}
mem->pages = pages;
mem->user = user;
return 0;
}
void bpf_map_charge_finish(struct bpf_map_memory *mem)
{
bpf_uncharge_memlock(mem->user, mem->pages);
free_uid(mem->user);
}
void bpf_map_charge_move(struct bpf_map_memory *dst,
struct bpf_map_memory *src)
{
*dst = *src;
/* Make sure src will not be used for the redundant uncharging. */
memset(src, 0, sizeof(struct bpf_map_memory));
}
int bpf_map_charge_memlock(struct bpf_map *map, u32 pages)
{
int ret;
ret = bpf_charge_memlock(map->memory.user, pages);
if (ret)
return ret;
map->memory.pages += pages;
return ret;
}
void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages)
{
bpf_uncharge_memlock(map->memory.user, pages);
map->memory.pages -= pages;
}
static int bpf_map_alloc_id(struct bpf_map *map)
{
int id;
@ -456,17 +390,74 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
__release(&map_idr_lock);
}
#ifdef CONFIG_MEMCG_KMEM
static void bpf_map_save_memcg(struct bpf_map *map)
{
map->memcg = get_mem_cgroup_from_mm(current->mm);
}
static void bpf_map_release_memcg(struct bpf_map *map)
{
mem_cgroup_put(map->memcg);
}
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
int node)
{
struct mem_cgroup *old_memcg;
void *ptr;
old_memcg = set_active_memcg(map->memcg);
ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
set_active_memcg(old_memcg);
return ptr;
}
void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
{
struct mem_cgroup *old_memcg;
void *ptr;
old_memcg = set_active_memcg(map->memcg);
ptr = kzalloc(size, flags | __GFP_ACCOUNT);
set_active_memcg(old_memcg);
return ptr;
}
void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
size_t align, gfp_t flags)
{
struct mem_cgroup *old_memcg;
void __percpu *ptr;
old_memcg = set_active_memcg(map->memcg);
ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
set_active_memcg(old_memcg);
return ptr;
}
#else
static void bpf_map_save_memcg(struct bpf_map *map)
{
}
static void bpf_map_release_memcg(struct bpf_map *map)
{
}
#endif
/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
struct bpf_map_memory mem;
bpf_map_charge_move(&mem, &map->memory);
security_bpf_map_free(map);
bpf_map_release_memcg(map);
/* implementation dependent freeing */
map->ops->map_free(map);
bpf_map_charge_finish(&mem);
}
static void bpf_map_put_uref(struct bpf_map *map)
@ -527,6 +518,19 @@ static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
}
#ifdef CONFIG_PROC_FS
/* Provides an approximation of the map's memory footprint.
* Used only to provide a backward compatibility and display
* a reasonable "memlock" info.
*/
static unsigned long bpf_map_memory_footprint(const struct bpf_map *map)
{
unsigned long size;
size = round_up(map->key_size + bpf_map_value_size(map), 8);
return round_up(map->max_entries * size, PAGE_SIZE);
}
static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_map *map = filp->private_data;
@ -545,7 +549,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
"value_size:\t%u\n"
"max_entries:\t%u\n"
"map_flags:\t%#x\n"
"memlock:\t%llu\n"
"memlock:\t%lu\n"
"map_id:\t%u\n"
"frozen:\t%u\n",
map->map_type,
@ -553,7 +557,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
map->value_size,
map->max_entries,
map->map_flags,
map->memory.pages * 1ULL << PAGE_SHIFT,
bpf_map_memory_footprint(map),
map->id,
READ_ONCE(map->frozen));
if (type) {
@ -796,7 +800,6 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
static int map_create(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_map_memory mem;
struct bpf_map *map;
int f_flags;
int err;
@ -875,6 +878,8 @@ static int map_create(union bpf_attr *attr)
if (err)
goto free_map_sec;
bpf_map_save_memcg(map);
err = bpf_map_new_fd(map, f_flags);
if (err < 0) {
/* failed to allocate fd.
@ -893,9 +898,7 @@ free_map_sec:
security_bpf_map_free(map);
free_map:
btf_put(map->btf);
bpf_map_charge_move(&mem, &map->memory);
map->ops->map_free(map);
bpf_map_charge_finish(&mem);
return err;
}
@ -1629,51 +1632,6 @@ static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
audit_log_end(ab);
}
int __bpf_prog_charge(struct user_struct *user, u32 pages)
{
unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
unsigned long user_bufs;
if (user) {
user_bufs = atomic_long_add_return(pages, &user->locked_vm);
if (user_bufs > memlock_limit) {
atomic_long_sub(pages, &user->locked_vm);
return -EPERM;
}
}
return 0;
}
void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
{
if (user)
atomic_long_sub(pages, &user->locked_vm);
}
static int bpf_prog_charge_memlock(struct bpf_prog *prog)
{
struct user_struct *user = get_current_user();
int ret;
ret = __bpf_prog_charge(user, prog->pages);
if (ret) {
free_uid(user);
return ret;
}
prog->aux->user = user;
return 0;
}
static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
{
struct user_struct *user = prog->aux->user;
__bpf_prog_uncharge(user, prog->pages);
free_uid(user);
}
static int bpf_prog_alloc_id(struct bpf_prog *prog)
{
int id;
@ -1723,7 +1681,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
kvfree(aux->func_info);
kfree(aux->func_info_aux);
bpf_prog_uncharge_memlock(aux->prog);
free_uid(aux->user);
security_bpf_prog_free(aux);
bpf_prog_free(aux->prog);
}
@ -1733,6 +1691,8 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
bpf_prog_kallsyms_del_all(prog);
btf_put(prog->aux->btf);
bpf_prog_free_linfo(prog);
if (prog->aux->attach_btf)
btf_put(prog->aux->attach_btf);
if (deferred) {
if (prog->aux->sleepable)
@ -1966,12 +1926,16 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
static int
bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
enum bpf_attach_type expected_attach_type,
u32 btf_id, u32 prog_fd)
struct btf *attach_btf, u32 btf_id,
struct bpf_prog *dst_prog)
{
if (btf_id) {
if (btf_id > BTF_MAX_TYPE)
return -EINVAL;
if (!attach_btf && !dst_prog)
return -EINVAL;
switch (prog_type) {
case BPF_PROG_TYPE_TRACING:
case BPF_PROG_TYPE_LSM:
@ -1983,7 +1947,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
}
}
if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING &&
if (attach_btf && (!btf_id || dst_prog))
return -EINVAL;
if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING &&
prog_type != BPF_PROG_TYPE_EXT)
return -EINVAL;
@ -2100,7 +2067,8 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog;
struct bpf_prog *prog, *dst_prog = NULL;
struct btf *attach_btf = NULL;
int err;
char license[128];
bool is_gpl;
@ -2142,47 +2110,70 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (is_perfmon_prog_type(type) && !perfmon_capable())
return -EPERM;
/* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
* or btf, we need to check which one it is
*/
if (attr->attach_prog_fd) {
dst_prog = bpf_prog_get(attr->attach_prog_fd);
if (IS_ERR(dst_prog)) {
dst_prog = NULL;
attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
if (IS_ERR(attach_btf))
return -EINVAL;
if (!btf_is_kernel(attach_btf)) {
btf_put(attach_btf);
return -EINVAL;
}
}
} else if (attr->attach_btf_id) {
/* fall back to vmlinux BTF, if BTF type ID is specified */
attach_btf = bpf_get_btf_vmlinux();
if (IS_ERR(attach_btf))
return PTR_ERR(attach_btf);
if (!attach_btf)
return -EINVAL;
btf_get(attach_btf);
}
bpf_prog_load_fixup_attach_type(attr);
if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
attr->attach_btf_id,
attr->attach_prog_fd))
attach_btf, attr->attach_btf_id,
dst_prog)) {
if (dst_prog)
bpf_prog_put(dst_prog);
if (attach_btf)
btf_put(attach_btf);
return -EINVAL;
}
/* plain bpf_prog allocation */
prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
if (!prog)
if (!prog) {
if (dst_prog)
bpf_prog_put(dst_prog);
if (attach_btf)
btf_put(attach_btf);
return -ENOMEM;
}
prog->expected_attach_type = attr->expected_attach_type;
prog->aux->attach_btf = attach_btf;
prog->aux->attach_btf_id = attr->attach_btf_id;
if (attr->attach_prog_fd) {
struct bpf_prog *dst_prog;
dst_prog = bpf_prog_get(attr->attach_prog_fd);
if (IS_ERR(dst_prog)) {
err = PTR_ERR(dst_prog);
goto free_prog_nouncharge;
}
prog->aux->dst_prog = dst_prog;
}
prog->aux->offload_requested = !!attr->prog_ifindex;
prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
err = security_bpf_prog_alloc(prog->aux);
if (err)
goto free_prog_nouncharge;
err = bpf_prog_charge_memlock(prog);
if (err)
goto free_prog_sec;
goto free_prog;
prog->aux->user = get_current_user();
prog->len = attr->insn_cnt;
err = -EFAULT;
if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),
bpf_prog_insn_size(prog)) != 0)
goto free_prog;
goto free_prog_sec;
prog->orig_prog = NULL;
prog->jited = 0;
@ -2193,19 +2184,19 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (bpf_prog_is_dev_bound(prog->aux)) {
err = bpf_prog_offload_init(prog, attr);
if (err)
goto free_prog;
goto free_prog_sec;
}
/* find program type: socket_filter vs tracing_filter */
err = find_prog_type(type, prog);
if (err < 0)
goto free_prog;
goto free_prog_sec;
prog->aux->load_time = ktime_get_boottime_ns();
err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
sizeof(attr->prog_name));
if (err < 0)
goto free_prog;
goto free_prog_sec;
/* run eBPF verifier */
err = bpf_check(&prog, attr, uattr);
@ -2250,11 +2241,12 @@ free_used_maps:
*/
__bpf_prog_put_noref(prog, prog->aux->func_cnt);
return err;
free_prog:
bpf_prog_uncharge_memlock(prog);
free_prog_sec:
free_uid(prog->aux->user);
security_bpf_prog_free(prog->aux);
free_prog_nouncharge:
free_prog:
if (prog->aux->attach_btf)
btf_put(prog->aux->attach_btf);
bpf_prog_free(prog);
return err;
}
@ -2612,7 +2604,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
goto out_put_prog;
}
key = bpf_trampoline_compute_key(tgt_prog, btf_id);
key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id);
}
link = kzalloc(sizeof(*link), GFP_USER);
@ -3589,7 +3581,7 @@ static int bpf_prog_get_info_by_fd(struct file *file,
}
if (prog->aux->btf)
info.btf_id = btf_id(prog->aux->btf);
info.btf_id = btf_obj_id(prog->aux->btf);
ulen = info.nr_func_info;
info.nr_func_info = prog->aux->func_info_cnt;
@ -3692,7 +3684,7 @@ static int bpf_map_get_info_by_fd(struct file *file,
memcpy(info.name, map->name, sizeof(map->name));
if (map->btf) {
info.btf_id = btf_id(map->btf);
info.btf_id = btf_obj_id(map->btf);
info.btf_key_type_id = map->btf_key_type_id;
info.btf_value_type_id = map->btf_value_type_id;
}

View File

@ -136,8 +136,7 @@ struct bpf_iter_seq_task_file_info {
};
static struct file *
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info,
struct task_struct **task, struct files_struct **fstruct)
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
{
struct pid_namespace *ns = info->common.ns;
u32 curr_tid = info->tid, max_fds;
@ -150,14 +149,17 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info,
* Otherwise, it does not hold any reference.
*/
again:
if (*task) {
curr_task = *task;
curr_files = *fstruct;
if (info->task) {
curr_task = info->task;
curr_files = info->files;
curr_fd = info->fd;
} else {
curr_task = task_seq_get_next(ns, &curr_tid, true);
if (!curr_task)
if (!curr_task) {
info->task = NULL;
info->files = NULL;
return NULL;
}
curr_files = get_files_struct(curr_task);
if (!curr_files) {
@ -167,9 +169,8 @@ again:
goto again;
}
/* set *fstruct, *task and info->tid */
*fstruct = curr_files;
*task = curr_task;
info->files = curr_files;
info->task = curr_task;
if (curr_tid == info->tid) {
curr_fd = info->fd;
} else {
@ -199,8 +200,8 @@ again:
rcu_read_unlock();
put_files_struct(curr_files);
put_task_struct(curr_task);
*task = NULL;
*fstruct = NULL;
info->task = NULL;
info->files = NULL;
info->fd = 0;
curr_tid = ++(info->tid);
goto again;
@ -209,21 +210,13 @@ again:
static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
{
struct bpf_iter_seq_task_file_info *info = seq->private;
struct files_struct *files = NULL;
struct task_struct *task = NULL;
struct file *file;
file = task_file_seq_get_next(info, &task, &files);
if (!file) {
info->files = NULL;
info->task = NULL;
return NULL;
}
if (*pos == 0)
info->files = NULL;
file = task_file_seq_get_next(info);
if (file && *pos == 0)
++*pos;
info->task = task;
info->files = files;
return file;
}
@ -231,24 +224,11 @@ static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct bpf_iter_seq_task_file_info *info = seq->private;
struct files_struct *files = info->files;
struct task_struct *task = info->task;
struct file *file;
++*pos;
++info->fd;
fput((struct file *)v);
file = task_file_seq_get_next(info, &task, &files);
if (!file) {
info->files = NULL;
info->task = NULL;
return NULL;
}
info->task = task;
info->files = files;
return file;
return task_file_seq_get_next(info);
}
struct bpf_iter__task_file {

View File

@ -238,7 +238,9 @@ struct bpf_call_arg_meta {
u64 msize_max_value;
int ref_obj_id;
int func_id;
struct btf *btf;
u32 btf_id;
struct btf *ret_btf;
u32 ret_btf_id;
};
@ -556,10 +558,9 @@ static struct bpf_func_state *func(struct bpf_verifier_env *env,
return cur->frame[reg->frameno];
}
const char *kernel_type_name(u32 id)
static const char *kernel_type_name(const struct btf* btf, u32 id)
{
return btf_name_by_offset(btf_vmlinux,
btf_type_by_id(btf_vmlinux, id)->name_off);
return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
}
static void print_verifier_state(struct bpf_verifier_env *env,
@ -589,7 +590,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
if (t == PTR_TO_BTF_ID ||
t == PTR_TO_BTF_ID_OR_NULL ||
t == PTR_TO_PERCPU_BTF_ID)
verbose(env, "%s", kernel_type_name(reg->btf_id));
verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id));
verbose(env, "(id=%d", reg->id);
if (reg_type_may_be_refcounted_or_null(t))
verbose(env, ",ref_obj_id=%d", reg->ref_obj_id);
@ -1383,7 +1384,8 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
static void mark_btf_ld_reg(struct bpf_verifier_env *env,
struct bpf_reg_state *regs, u32 regno,
enum bpf_reg_type reg_type, u32 btf_id)
enum bpf_reg_type reg_type,
struct btf *btf, u32 btf_id)
{
if (reg_type == SCALAR_VALUE) {
mark_reg_unknown(env, regs, regno);
@ -1391,6 +1393,7 @@ static void mark_btf_ld_reg(struct bpf_verifier_env *env,
}
mark_reg_known_zero(env, regs, regno);
regs[regno].type = PTR_TO_BTF_ID;
regs[regno].btf = btf;
regs[regno].btf_id = btf_id;
}
@ -2764,7 +2767,7 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
/* check access to 'struct bpf_context' fields. Supports fixed offsets only */
static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
enum bpf_access_type t, enum bpf_reg_type *reg_type,
u32 *btf_id)
struct btf **btf, u32 *btf_id)
{
struct bpf_insn_access_aux info = {
.reg_type = *reg_type,
@ -2782,10 +2785,12 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
*/
*reg_type = info.reg_type;
if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL)
if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) {
*btf = info.btf;
*btf_id = info.btf_id;
else
} else {
env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
}
/* remember the offset of last byte accessed in ctx */
if (env->prog->aux->max_ctx_offset < off + size)
env->prog->aux->max_ctx_offset = off + size;
@ -3297,8 +3302,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
int value_regno)
{
struct bpf_reg_state *reg = regs + regno;
const struct btf_type *t = btf_type_by_id(btf_vmlinux, reg->btf_id);
const char *tname = btf_name_by_offset(btf_vmlinux, t->name_off);
const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
const char *tname = btf_name_by_offset(reg->btf, t->name_off);
u32 btf_id;
int ret;
@ -3319,23 +3324,23 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
}
if (env->ops->btf_struct_access) {
ret = env->ops->btf_struct_access(&env->log, t, off, size,
atype, &btf_id);
ret = env->ops->btf_struct_access(&env->log, reg->btf, t,
off, size, atype, &btf_id);
} else {
if (atype != BPF_READ) {
verbose(env, "only read is supported\n");
return -EACCES;
}
ret = btf_struct_access(&env->log, t, off, size, atype,
&btf_id);
ret = btf_struct_access(&env->log, reg->btf, t, off, size,
atype, &btf_id);
}
if (ret < 0)
return ret;
if (atype == BPF_READ && value_regno >= 0)
mark_btf_ld_reg(env, regs, value_regno, ret, btf_id);
mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id);
return 0;
}
@ -3385,12 +3390,12 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
return -EACCES;
}
ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id);
ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id);
if (ret < 0)
return ret;
if (value_regno >= 0)
mark_btf_ld_reg(env, regs, value_regno, ret, btf_id);
mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id);
return 0;
}
@ -3466,6 +3471,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
enum bpf_reg_type reg_type = SCALAR_VALUE;
struct btf *btf = NULL;
u32 btf_id = 0;
if (t == BPF_WRITE && value_regno >= 0 &&
@ -3478,7 +3484,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (err < 0)
return err;
err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf_id);
err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf, &btf_id);
if (err)
verbose_linfo(env, insn_idx, "; ");
if (!err && t == BPF_READ && value_regno >= 0) {
@ -3500,9 +3506,11 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
*/
regs[value_regno].subreg_def = DEF_NOT_SUBREG;
if (reg_type == PTR_TO_BTF_ID ||
reg_type == PTR_TO_BTF_ID_OR_NULL)
reg_type == PTR_TO_BTF_ID_OR_NULL) {
regs[value_regno].btf = btf;
regs[value_regno].btf_id = btf_id;
}
}
regs[value_regno].type = reg_type;
}
@ -4118,11 +4126,11 @@ found:
arg_btf_id = compatible->btf_id;
}
if (!btf_struct_ids_match(&env->log, reg->off, reg->btf_id,
*arg_btf_id)) {
if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
btf_vmlinux, *arg_btf_id)) {
verbose(env, "R%d is of type %s but %s is expected\n",
regno, kernel_type_name(reg->btf_id),
kernel_type_name(*arg_btf_id));
regno, kernel_type_name(reg->btf, reg->btf_id),
kernel_type_name(btf_vmlinux, *arg_btf_id));
return -EACCES;
}
@ -4244,6 +4252,7 @@ skip_type_check:
verbose(env, "Helper has invalid btf_id in R%d\n", regno);
return -EACCES;
}
meta->ret_btf = reg->btf;
meta->ret_btf_id = reg->btf_id;
} else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
if (meta->func_id == BPF_FUNC_spin_lock) {
@ -5190,16 +5199,16 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
const struct btf_type *t;
mark_reg_known_zero(env, regs, BPF_REG_0);
t = btf_type_skip_modifiers(btf_vmlinux, meta.ret_btf_id, NULL);
t = btf_type_skip_modifiers(meta.ret_btf, meta.ret_btf_id, NULL);
if (!btf_type_is_struct(t)) {
u32 tsize;
const struct btf_type *ret;
const char *tname;
/* resolve the type size of ksym. */
ret = btf_resolve_size(btf_vmlinux, t, &tsize);
ret = btf_resolve_size(meta.ret_btf, t, &tsize);
if (IS_ERR(ret)) {
tname = btf_name_by_offset(btf_vmlinux, t->name_off);
tname = btf_name_by_offset(meta.ret_btf, t->name_off);
verbose(env, "unable to resolve the size of type '%s': %ld\n",
tname, PTR_ERR(ret));
return -EINVAL;
@ -5212,6 +5221,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
regs[BPF_REG_0].type =
fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL;
regs[BPF_REG_0].btf = meta.ret_btf;
regs[BPF_REG_0].btf_id = meta.ret_btf_id;
}
} else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL ||
@ -5228,6 +5238,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
fn->ret_type, func_id_name(func_id), func_id);
return -EINVAL;
}
/* current BPF helper definitions are only coming from
* built-in code with type IDs from vmlinux BTF
*/
regs[BPF_REG_0].btf = btf_vmlinux;
regs[BPF_REG_0].btf_id = ret_btf_id;
} else {
verbose(env, "unknown return type %d of func %s#%d\n",
@ -5627,7 +5641,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
if (reg_is_pkt_pointer(ptr_reg)) {
dst_reg->id = ++env->id_gen;
/* something was added to pkt_ptr, set range to zero */
dst_reg->raw = 0;
memset(&dst_reg->raw, 0, sizeof(dst_reg->raw));
}
break;
case BPF_SUB:
@ -5692,7 +5706,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst_reg->id = ++env->id_gen;
/* something was added to pkt_ptr, set range to zero */
if (smin_val < 0)
dst_reg->raw = 0;
memset(&dst_reg->raw, 0, sizeof(dst_reg->raw));
}
break;
case BPF_AND:
@ -7744,6 +7758,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
break;
case PTR_TO_BTF_ID:
case PTR_TO_PERCPU_BTF_ID:
dst_reg->btf = aux->btf_var.btf;
dst_reg->btf_id = aux->btf_var.btf_id;
break;
default:
@ -8058,6 +8073,11 @@ static void init_explored_state(struct bpf_verifier_env *env, int idx)
env->insn_aux_data[idx].prune_point = true;
}
enum {
DONE_EXPLORING = 0,
KEEP_EXPLORING = 1,
};
/* t, w, e - match pseudo-code above:
* t - index of current instruction
* w - next instruction
@ -8070,10 +8090,10 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
int *insn_state = env->cfg.insn_state;
if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
return 0;
return DONE_EXPLORING;
if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
return 0;
return DONE_EXPLORING;
if (w < 0 || w >= env->prog->len) {
verbose_linfo(env, t, "%d: ", t);
@ -8092,10 +8112,10 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
if (env->cfg.cur_stack >= env->prog->len)
return -E2BIG;
insn_stack[env->cfg.cur_stack++] = w;
return 1;
return KEEP_EXPLORING;
} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
if (loop_ok && env->bpf_capable)
return 0;
return DONE_EXPLORING;
verbose_linfo(env, t, "%d: ", t);
verbose_linfo(env, w, "%d: ", w);
verbose(env, "back-edge from insn %d to %d\n", t, w);
@ -8107,7 +8127,74 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
verbose(env, "insn state internal bug\n");
return -EFAULT;
}
return 0;
return DONE_EXPLORING;
}
/* Visits the instruction at index t and returns one of the following:
* < 0 - an error occurred
* DONE_EXPLORING - the instruction was fully explored
* KEEP_EXPLORING - there is still work to be done before it is fully explored
*/
static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
{
struct bpf_insn *insns = env->prog->insnsi;
int ret;
/* All non-branch instructions have a single fall-through edge. */
if (BPF_CLASS(insns[t].code) != BPF_JMP &&
BPF_CLASS(insns[t].code) != BPF_JMP32)
return push_insn(t, t + 1, FALLTHROUGH, env, false);
switch (BPF_OP(insns[t].code)) {
case BPF_EXIT:
return DONE_EXPLORING;
case BPF_CALL:
ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
if (ret)
return ret;
if (t + 1 < insn_cnt)
init_explored_state(env, t + 1);
if (insns[t].src_reg == BPF_PSEUDO_CALL) {
init_explored_state(env, t);
ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
env, false);
}
return ret;
case BPF_JA:
if (BPF_SRC(insns[t].code) != BPF_K)
return -EINVAL;
/* unconditional jump with single edge */
ret = push_insn(t, t + insns[t].off + 1, FALLTHROUGH, env,
true);
if (ret)
return ret;
/* unconditional jmp is not a good pruning point,
* but it's marked, since backtracking needs
* to record jmp history in is_state_visited().
*/
init_explored_state(env, t + insns[t].off + 1);
/* tell verifier to check for equivalent states
* after every call and jump
*/
if (t + 1 < insn_cnt)
init_explored_state(env, t + 1);
return ret;
default:
/* conditional jump with two edges */
init_explored_state(env, t);
ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
if (ret)
return ret;
return push_insn(t, t + insns[t].off + 1, BRANCH, env, true);
}
}
/* non-recursive depth-first-search to detect loops in BPF program
@ -8115,11 +8202,10 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
*/
static int check_cfg(struct bpf_verifier_env *env)
{
struct bpf_insn *insns = env->prog->insnsi;
int insn_cnt = env->prog->len;
int *insn_stack, *insn_state;
int ret = 0;
int i, t;
int i;
insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
if (!insn_state)
@ -8135,92 +8221,32 @@ static int check_cfg(struct bpf_verifier_env *env)
insn_stack[0] = 0; /* 0 is the first instruction */
env->cfg.cur_stack = 1;
peek_stack:
if (env->cfg.cur_stack == 0)
goto check_state;
t = insn_stack[env->cfg.cur_stack - 1];
while (env->cfg.cur_stack > 0) {
int t = insn_stack[env->cfg.cur_stack - 1];
if (BPF_CLASS(insns[t].code) == BPF_JMP ||
BPF_CLASS(insns[t].code) == BPF_JMP32) {
u8 opcode = BPF_OP(insns[t].code);
if (opcode == BPF_EXIT) {
goto mark_explored;
} else if (opcode == BPF_CALL) {
ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
if (t + 1 < insn_cnt)
init_explored_state(env, t + 1);
if (insns[t].src_reg == BPF_PSEUDO_CALL) {
init_explored_state(env, t);
ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
env, false);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
}
} else if (opcode == BPF_JA) {
if (BPF_SRC(insns[t].code) != BPF_K) {
ret = -EINVAL;
goto err_free;
}
/* unconditional jump with single edge */
ret = push_insn(t, t + insns[t].off + 1,
FALLTHROUGH, env, true);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
/* unconditional jmp is not a good pruning point,
* but it's marked, since backtracking needs
* to record jmp history in is_state_visited().
*/
init_explored_state(env, t + insns[t].off + 1);
/* tell verifier to check for equivalent states
* after every call and jump
*/
if (t + 1 < insn_cnt)
init_explored_state(env, t + 1);
} else {
/* conditional jump with two edges */
init_explored_state(env, t);
ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
}
} else {
/* all other non-branch instructions with single
* fall-through edge
*/
ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
}
mark_explored:
ret = visit_insn(t, insn_cnt, env);
switch (ret) {
case DONE_EXPLORING:
insn_state[t] = EXPLORED;
if (env->cfg.cur_stack-- <= 0) {
env->cfg.cur_stack--;
break;
case KEEP_EXPLORING:
break;
default:
if (ret > 0) {
verbose(env, "visit_insn internal bug\n");
ret = -EFAULT;
}
goto err_free;
}
}
if (env->cfg.cur_stack < 0) {
verbose(env, "pop stack internal bug\n");
ret = -EFAULT;
goto err_free;
}
goto peek_stack;
check_state:
for (i = 0; i < insn_cnt; i++) {
if (insn_state[i] != EXPLORED) {
verbose(env, "unreachable insn %d\n", i);
@ -9740,6 +9766,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
t = btf_type_skip_modifiers(btf_vmlinux, type, NULL);
if (percpu) {
aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID;
aux->btf_var.btf = btf_vmlinux;
aux->btf_var.btf_id = type;
} else if (!btf_type_is_struct(t)) {
const struct btf_type *ret;
@ -9758,6 +9785,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
aux->btf_var.mem_size = tsize;
} else {
aux->btf_var.reg_type = PTR_TO_BTF_ID;
aux->btf_var.btf = btf_vmlinux;
aux->btf_var.btf_id = type;
}
return 0;
@ -11610,7 +11638,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
bpf_log(log, "Tracing programs must provide btf_id\n");
return -EINVAL;
}
btf = tgt_prog ? tgt_prog->aux->btf : btf_vmlinux;
btf = tgt_prog ? tgt_prog->aux->btf : prog->aux->attach_btf;
if (!btf) {
bpf_log(log,
"FENTRY/FEXIT program can only be attached to another program annotated with BTF\n");
@ -11886,7 +11914,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
return ret;
}
key = bpf_trampoline_compute_key(tgt_prog, btf_id);
key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id);
tr = bpf_trampoline_get(key, &tgt_info);
if (!tr)
return -ENOMEM;

View File

@ -404,9 +404,10 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk)
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
/*
* If memcg_kmem_charge_page() fails, page->mem_cgroup
* pointer is NULL, and memcg_kmem_uncharge_page() in
* free_thread_stack() will ignore this page.
* If memcg_kmem_charge_page() fails, page's
* memory cgroup pointer is NULL, and
* memcg_kmem_uncharge_page() in free_thread_stack()
* will ignore this page.
*/
ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
0);

View File

@ -3709,6 +3709,10 @@ static noinline int do_init_module(struct module *mod)
mod->init_layout.ro_size = 0;
mod->init_layout.ro_after_init_size = 0;
mod->init_layout.text_size = 0;
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
/* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */
mod->btf_data = NULL;
#endif
/*
* We want to free module_init, but be aware that kallsyms may be
* walking this with preempt disabled. In all the failure paths, we

View File

@ -1290,6 +1290,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_ktime_get_ns_proto;
case BPF_FUNC_ktime_get_boot_ns:
return &bpf_ktime_get_boot_ns_proto;
case BPF_FUNC_ktime_get_coarse_ns:
return &bpf_ktime_get_coarse_ns_proto;
case BPF_FUNC_tail_call:
return &bpf_tail_call_proto;
case BPF_FUNC_get_current_pid_tgid:
@ -2068,10 +2070,12 @@ struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name)
void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
{
struct module *mod = __module_address((unsigned long)btp);
struct module *mod;
if (mod)
preempt_disable();
mod = __module_address((unsigned long)btp);
module_put(mod);
preempt_enable();
}
static __always_inline

View File

@ -182,8 +182,8 @@ hex_only:
pr_warn("page dumped because: %s\n", reason);
#ifdef CONFIG_MEMCG
if (!page_poisoned && page->mem_cgroup)
pr_warn("page->mem_cgroup:%px\n", page->mem_cgroup);
if (!page_poisoned && page->memcg_data)
pr_warn("pages's memcg:%lx\n", page->memcg_data);
#endif
}

View File

@ -470,7 +470,7 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
#ifdef CONFIG_MEMCG
static inline struct deferred_split *get_deferred_split_queue(struct page *page)
{
struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
struct mem_cgroup *memcg = page_memcg(compound_head(page));
struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
if (memcg)
@ -2764,7 +2764,7 @@ void deferred_split_huge_page(struct page *page)
{
struct deferred_split *ds_queue = get_deferred_split_queue(page);
#ifdef CONFIG_MEMCG
struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
struct mem_cgroup *memcg = page_memcg(compound_head(page));
#endif
unsigned long flags;

View File

@ -533,7 +533,7 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
struct mem_cgroup *memcg;
memcg = page->mem_cgroup;
memcg = page_memcg(page);
if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
memcg = root_mem_cgroup;
@ -560,16 +560,7 @@ ino_t page_cgroup_ino(struct page *page)
unsigned long ino = 0;
rcu_read_lock();
memcg = page->mem_cgroup;
/*
* The lowest bit set means that memcg isn't a valid
* memcg pointer, but a obj_cgroups pointer.
* In this case the page is shared and doesn't belong
* to any specific memory cgroup.
*/
if ((unsigned long) memcg & 0x1UL)
memcg = NULL;
memcg = page_memcg_check(page);
while (memcg && !(memcg->css.flags & CSS_ONLINE))
memcg = parent_mem_cgroup(memcg);
@ -1055,7 +1046,7 @@ EXPORT_SYMBOL(get_mem_cgroup_from_mm);
*/
struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
{
struct mem_cgroup *memcg = page->mem_cgroup;
struct mem_cgroup *memcg = page_memcg(page);
if (mem_cgroup_disabled())
return NULL;
@ -1354,7 +1345,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd
goto out;
}
memcg = page->mem_cgroup;
memcg = page_memcg(page);
/*
* Swapcache readahead pages are added to the LRU - and
* possibly migrated - before they are charged.
@ -2114,7 +2105,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
}
/**
* lock_page_memcg - lock a page->mem_cgroup binding
* lock_page_memcg - lock a page and memcg binding
* @page: the page
*
* This function protects unlocked LRU pages from being moved to
@ -2146,7 +2137,7 @@ struct mem_cgroup *lock_page_memcg(struct page *page)
if (mem_cgroup_disabled())
return NULL;
again:
memcg = head->mem_cgroup;
memcg = page_memcg(head);
if (unlikely(!memcg))
return NULL;
@ -2154,7 +2145,7 @@ again:
return memcg;
spin_lock_irqsave(&memcg->move_lock, flags);
if (memcg != head->mem_cgroup) {
if (memcg != page_memcg(head)) {
spin_unlock_irqrestore(&memcg->move_lock, flags);
goto again;
}
@ -2192,14 +2183,14 @@ void __unlock_page_memcg(struct mem_cgroup *memcg)
}
/**
* unlock_page_memcg - unlock a page->mem_cgroup binding
* unlock_page_memcg - unlock a page and memcg binding
* @page: the page
*/
void unlock_page_memcg(struct page *page)
{
struct page *head = compound_head(page);
__unlock_page_memcg(head->mem_cgroup);
__unlock_page_memcg(page_memcg(head));
}
EXPORT_SYMBOL(unlock_page_memcg);
@ -2889,7 +2880,7 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
static void commit_charge(struct page *page, struct mem_cgroup *memcg)
{
VM_BUG_ON_PAGE(page->mem_cgroup, page);
VM_BUG_ON_PAGE(page_memcg(page), page);
/*
* Any of the following ensures page->mem_cgroup stability:
*
@ -2898,7 +2889,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg)
* - lock_page_memcg()
* - exclusive reference
*/
page->mem_cgroup = memcg;
page->memcg_data = (unsigned long)memcg;
}
#ifdef CONFIG_MEMCG_KMEM
@ -2913,8 +2904,7 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
if (!vec)
return -ENOMEM;
if (cmpxchg(&page->obj_cgroups, NULL,
(struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
if (!set_page_objcgs(page, vec))
kfree(vec);
else
kmemleak_not_leak(vec);
@ -2925,6 +2915,12 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
/*
* Returns a pointer to the memory cgroup to which the kernel object is charged.
*
* A passed kernel object can be a slab object or a generic kernel page, so
* different mechanisms for getting the memory cgroup pointer should be used.
* In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
* can not know for sure how the kernel object is implemented.
* mem_cgroup_from_obj() can be safely used in such cases.
*
* The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
* cgroup_mutex, etc.
*/
@ -2937,36 +2933,31 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
page = virt_to_head_page(p);
/*
* If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
* or a pointer to obj_cgroup vector. In the latter case the lowest
* bit of the pointer is set.
* The page->mem_cgroup pointer can be asynchronously changed
* from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
* from a valid memcg pointer to objcg vector or back.
*/
if (!page->mem_cgroup)
return NULL;
/*
* Slab objects are accounted individually, not per-page.
* Memcg membership data for each individual object is saved in
* the page->obj_cgroups.
*/
if (page_has_obj_cgroups(page)) {
if (page_objcgs_check(page)) {
struct obj_cgroup *objcg;
unsigned int off;
off = obj_to_index(page->slab_cache, page, p);
objcg = page_obj_cgroups(page)[off];
objcg = page_objcgs(page)[off];
if (objcg)
return obj_cgroup_memcg(objcg);
return NULL;
}
/* All other pages use page->mem_cgroup */
return page->mem_cgroup;
/*
* page_memcg_check() is used here, because page_has_obj_cgroups()
* check above could fail because the object cgroups vector wasn't set
* at that moment, but it can be set concurrently.
* page_memcg_check(page) will guarantee that a proper memory
* cgroup pointer or NULL will be returned.
*/
return page_memcg_check(page);
}
__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
@ -3104,8 +3095,8 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
if (memcg && !mem_cgroup_is_root(memcg)) {
ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
if (!ret) {
page->mem_cgroup = memcg;
__SetPageKmemcg(page);
page->memcg_data = (unsigned long)memcg |
MEMCG_DATA_KMEM;
return 0;
}
css_put(&memcg->css);
@ -3120,7 +3111,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
*/
void __memcg_kmem_uncharge_page(struct page *page, int order)
{
struct mem_cgroup *memcg = page->mem_cgroup;
struct mem_cgroup *memcg = page_memcg(page);
unsigned int nr_pages = 1 << order;
if (!memcg)
@ -3128,12 +3119,8 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
__memcg_kmem_uncharge(memcg, nr_pages);
page->mem_cgroup = NULL;
page->memcg_data = 0;
css_put(&memcg->css);
/* slab pages do not have PageKmemcg flag set */
if (PageKmemcg(page))
__ClearPageKmemcg(page);
}
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
@ -3279,7 +3266,7 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
struct mem_cgroup *memcg = head->mem_cgroup;
struct mem_cgroup *memcg = page_memcg(head);
int i;
if (mem_cgroup_disabled())
@ -3287,7 +3274,7 @@ void mem_cgroup_split_huge_fixup(struct page *head)
for (i = 1; i < HPAGE_PMD_NR; i++) {
css_get(&memcg->css);
head[i].mem_cgroup = memcg;
head[i].memcg_data = (unsigned long)memcg;
}
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@ -4669,7 +4656,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
struct bdi_writeback *wb)
{
struct mem_cgroup *memcg = page->mem_cgroup;
struct mem_cgroup *memcg = page_memcg(page);
struct memcg_cgwb_frn *frn;
u64 now = get_jiffies_64();
u64 oldest_at = now;
@ -5646,14 +5633,14 @@ static int mem_cgroup_move_account(struct page *page,
/*
* Prevent mem_cgroup_migrate() from looking at
* page->mem_cgroup of its source page while we change it.
* page's memory cgroup of its source page while we change it.
*/
ret = -EBUSY;
if (!trylock_page(page))
goto out;
ret = -EINVAL;
if (page->mem_cgroup != from)
if (page_memcg(page) != from)
goto out_unlock;
pgdat = page_pgdat(page);
@ -5708,13 +5695,13 @@ static int mem_cgroup_move_account(struct page *page,
/*
* All state has been migrated, let's switch to the new memcg.
*
* It is safe to change page->mem_cgroup here because the page
* It is safe to change page's memcg here because the page
* is referenced, charged, isolated, and locked: we can't race
* with (un)charging, migration, LRU putback, or anything else
* that would rely on a stable page->mem_cgroup.
* that would rely on a stable page's memory cgroup.
*
* Note that lock_page_memcg is a memcg lock, not a page lock,
* to save space. As soon as we switch page->mem_cgroup to a
* to save space. As soon as we switch page's memory cgroup to a
* new memcg that isn't locked, the above state can change
* concurrently again. Make sure we're truly done with it.
*/
@ -5723,7 +5710,7 @@ static int mem_cgroup_move_account(struct page *page,
css_get(&to->css);
css_put(&from->css);
page->mem_cgroup = to;
page->memcg_data = (unsigned long)to;
__unlock_page_memcg(from);
@ -5789,7 +5776,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
* mem_cgroup_move_account() checks the page is valid or
* not under LRU exclusion.
*/
if (page->mem_cgroup == mc.from) {
if (page_memcg(page) == mc.from) {
ret = MC_TARGET_PAGE;
if (is_device_private_page(page))
ret = MC_TARGET_DEVICE;
@ -5833,7 +5820,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
VM_BUG_ON_PAGE(!page || !PageHead(page), page);
if (!(mc.flags & MOVE_ANON))
return ret;
if (page->mem_cgroup == mc.from) {
if (page_memcg(page) == mc.from) {
ret = MC_TARGET_PAGE;
if (target) {
get_page(page);
@ -6779,12 +6766,12 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
/*
* Every swap fault against a single page tries to charge the
* page, bail as early as possible. shmem_unuse() encounters
* already charged pages, too. page->mem_cgroup is protected
* by the page lock, which serializes swap cache removal, which
* in turn serializes uncharging.
* already charged pages, too. page and memcg binding is
* protected by the page lock, which serializes swap cache
* removal, which in turn serializes uncharging.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (compound_head(page)->mem_cgroup)
if (page_memcg(compound_head(page)))
goto out;
id = lookup_swap_cgroup_id(ent);
@ -6868,21 +6855,21 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
VM_BUG_ON_PAGE(PageLRU(page), page);
if (!page->mem_cgroup)
if (!page_memcg(page))
return;
/*
* Nobody should be changing or seriously looking at
* page->mem_cgroup at this point, we have fully
* page_memcg(page) at this point, we have fully
* exclusive access to the page.
*/
if (ug->memcg != page->mem_cgroup) {
if (ug->memcg != page_memcg(page)) {
if (ug->memcg) {
uncharge_batch(ug);
uncharge_gather_clear(ug);
}
ug->memcg = page->mem_cgroup;
ug->memcg = page_memcg(page);
/* pairs with css_put in uncharge_batch */
css_get(&ug->memcg->css);
@ -6891,15 +6878,13 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
nr_pages = compound_nr(page);
ug->nr_pages += nr_pages;
if (!PageKmemcg(page)) {
ug->pgpgout++;
} else {
if (PageMemcgKmem(page))
ug->nr_kmem += nr_pages;
__ClearPageKmemcg(page);
}
else
ug->pgpgout++;
ug->dummy_page = page;
page->mem_cgroup = NULL;
page->memcg_data = 0;
css_put(&ug->memcg->css);
}
@ -6942,7 +6927,7 @@ void mem_cgroup_uncharge(struct page *page)
return;
/* Don't touch page->lru of any random page, pre-check: */
if (!page->mem_cgroup)
if (!page_memcg(page))
return;
uncharge_gather_clear(&ug);
@ -6992,11 +6977,11 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
return;
/* Page cache replacement: new page already charged? */
if (newpage->mem_cgroup)
if (page_memcg(newpage))
return;
/* Swapcache readahead pages can get replaced before being charged */
memcg = oldpage->mem_cgroup;
memcg = page_memcg(oldpage);
if (!memcg)
return;
@ -7191,7 +7176,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
memcg = page->mem_cgroup;
memcg = page_memcg(page);
/* Readahead page, never charged */
if (!memcg)
@ -7212,7 +7197,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
VM_BUG_ON_PAGE(oldid, page);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
page->mem_cgroup = NULL;
page->memcg_data = 0;
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memory, nr_entries);
@ -7255,7 +7240,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return 0;
memcg = page->mem_cgroup;
memcg = page_memcg(page);
/* Readahead page, never charged */
if (!memcg)
@ -7336,7 +7321,7 @@ bool mem_cgroup_swap_full(struct page *page)
if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
return false;
memcg = page->mem_cgroup;
memcg = page_memcg(page);
if (!memcg)
return false;

View File

@ -1092,7 +1092,7 @@ static inline bool page_expected_state(struct page *page,
if (unlikely((unsigned long)page->mapping |
page_ref_count(page) |
#ifdef CONFIG_MEMCG
(unsigned long)page->mem_cgroup |
(unsigned long)page_memcg(page) |
#endif
(page->flags & check_flags)))
return false;
@ -1117,7 +1117,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
}
#ifdef CONFIG_MEMCG
if (unlikely(page->mem_cgroup))
if (unlikely(page_memcg(page)))
bad_reason = "page still charged to cgroup";
#endif
return bad_reason;
@ -1214,7 +1214,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
* Do not let hwpoison pages hit pcplists/buddy
* Untie memcg state and reset page's owner
*/
if (memcg_kmem_enabled() && PageKmemcg(page))
if (memcg_kmem_enabled() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
reset_page_owner(page, order);
return false;
@ -1244,7 +1244,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
}
if (PageMappingFlags(page))
page->mapping = NULL;
if (memcg_kmem_enabled() && PageKmemcg(page))
if (memcg_kmem_enabled() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
if (check_free)
bad += check_free_page(page);

View File

@ -291,12 +291,14 @@ static inline void count_swpout_vm_event(struct page *page)
static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
{
struct cgroup_subsys_state *css;
struct mem_cgroup *memcg;
if (!page->mem_cgroup)
memcg = page_memcg(page);
if (!memcg)
return;
rcu_read_lock();
css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys);
bio_associate_blkg_from_css(bio, css);
rcu_read_unlock();
}

View File

@ -239,30 +239,13 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla
}
#ifdef CONFIG_MEMCG_KMEM
static inline struct obj_cgroup **page_obj_cgroups(struct page *page)
{
/*
* page->mem_cgroup and page->obj_cgroups are sharing the same
* space. To distinguish between them in case we don't know for sure
* that the page is a slab page (e.g. page_cgroup_ino()), let's
* always set the lowest bit of obj_cgroups.
*/
return (struct obj_cgroup **)
((unsigned long)page->obj_cgroups & ~0x1UL);
}
static inline bool page_has_obj_cgroups(struct page *page)
{
return ((unsigned long)page->obj_cgroups & 0x1UL);
}
int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
gfp_t gfp);
static inline void memcg_free_page_obj_cgroups(struct page *page)
{
kfree(page_obj_cgroups(page));
page->obj_cgroups = NULL;
kfree(page_objcgs(page));
page->memcg_data = 0;
}
static inline size_t obj_full_size(struct kmem_cache *s)
@ -323,7 +306,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
if (likely(p[i])) {
page = virt_to_head_page(p[i]);
if (!page_has_obj_cgroups(page) &&
if (!page_objcgs(page) &&
memcg_alloc_page_obj_cgroups(page, s, flags)) {
obj_cgroup_uncharge(objcg, obj_full_size(s));
continue;
@ -331,7 +314,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
off = obj_to_index(s, page, p[i]);
obj_cgroup_get(objcg);
page_obj_cgroups(page)[off] = objcg;
page_objcgs(page)[off] = objcg;
mod_objcg_state(objcg, page_pgdat(page),
cache_vmstat_idx(s), obj_full_size(s));
} else {
@ -345,6 +328,7 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
void **p, int objects)
{
struct kmem_cache *s;
struct obj_cgroup **objcgs;
struct obj_cgroup *objcg;
struct page *page;
unsigned int off;
@ -358,7 +342,8 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
continue;
page = virt_to_head_page(p[i]);
if (!page_has_obj_cgroups(page))
objcgs = page_objcgs(page);
if (!objcgs)
continue;
if (!s_orig)
@ -367,11 +352,11 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
s = s_orig;
off = obj_to_index(s, page, p[i]);
objcg = page_obj_cgroups(page)[off];
objcg = objcgs[off];
if (!objcg)
continue;
page_obj_cgroups(page)[off] = NULL;
objcgs[off] = NULL;
obj_cgroup_uncharge(objcg, obj_full_size(s));
mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s),
-obj_full_size(s));
@ -380,11 +365,6 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
}
#else /* CONFIG_MEMCG_KMEM */
static inline bool page_has_obj_cgroups(struct page *page)
{
return false;
}
static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
{
return NULL;

View File

@ -257,7 +257,7 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
struct lruvec *lruvec;
int memcgid;
/* Page is fully exclusive and pins page->mem_cgroup */
/* Page is fully exclusive and pins page's memory cgroup pointer */
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);

View File

@ -415,7 +415,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
void *, value, u64, flags)
{
if (!in_serving_softirq() && !in_task())
if (in_irq() || in_nmi())
return (unsigned long)NULL;
return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags);
@ -424,7 +424,7 @@ BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map,
struct sock *, sk)
{
if (!in_serving_softirq() && !in_task())
if (in_irq() || in_nmi())
return -EPERM;
return ____bpf_sk_storage_delete(map, sk);

View File

@ -6448,7 +6448,8 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
NAPIF_STATE_PREFER_BUSY_POLL);
/* If STATE_MISSED was set, leave STATE_SCHED set,
* because we will call napi->poll() one more time.
@ -6485,10 +6486,30 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
#if defined(CONFIG_NET_RX_BUSY_POLL)
#define BUSY_POLL_BUDGET 8
static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
{
if (!skip_schedule) {
gro_normal_list(napi);
__napi_schedule(napi);
return;
}
if (napi->gro_bitmask) {
/* flush too old packets
* If HZ < 1000, flush all packets.
*/
napi_gro_flush(napi, HZ >= 1000);
}
gro_normal_list(napi);
clear_bit(NAPI_STATE_SCHED, &napi->state);
}
static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll,
u16 budget)
{
bool skip_schedule = false;
unsigned long timeout;
int rc;
/* Busy polling means there is a high chance device driver hard irq
@ -6505,29 +6526,33 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
local_bh_disable();
if (prefer_busy_poll) {
napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
timeout = READ_ONCE(napi->dev->gro_flush_timeout);
if (napi->defer_hard_irqs_count && timeout) {
hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
skip_schedule = true;
}
}
/* All we really want here is to re-enable device interrupts.
* Ideally, a new ndo_busy_poll_stop() could avoid another round.
*/
rc = napi->poll(napi, BUSY_POLL_BUDGET);
rc = napi->poll(napi, budget);
/* We can't gro_normal_list() here, because napi->poll() might have
* rearmed the napi (napi_complete_done()) in which case it could
* already be running on another CPU.
*/
trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
trace_napi_poll(napi, rc, budget);
netpoll_poll_unlock(have_poll_lock);
if (rc == BUSY_POLL_BUDGET) {
/* As the whole budget was spent, we still own the napi so can
* safely handle the rx_list.
*/
gro_normal_list(napi);
__napi_schedule(napi);
}
if (rc == budget)
__busy_poll_stop(napi, skip_schedule);
local_bh_enable();
}
void napi_busy_loop(unsigned int napi_id,
bool (*loop_end)(void *, unsigned long),
void *loop_end_arg)
void *loop_end_arg, bool prefer_busy_poll, u16 budget)
{
unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
int (*napi_poll)(struct napi_struct *napi, int budget);
@ -6555,17 +6580,23 @@ restart:
* we avoid dirtying napi->state as much as we can.
*/
if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
NAPIF_STATE_IN_BUSY_POLL))
NAPIF_STATE_IN_BUSY_POLL)) {
if (prefer_busy_poll)
set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count;
}
if (cmpxchg(&napi->state, val,
val | NAPIF_STATE_IN_BUSY_POLL |
NAPIF_STATE_SCHED) != val)
NAPIF_STATE_SCHED) != val) {
if (prefer_busy_poll)
set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count;
}
have_poll_lock = netpoll_poll_lock(napi);
napi_poll = napi->poll;
}
work = napi_poll(napi, BUSY_POLL_BUDGET);
trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
work = napi_poll(napi, budget);
trace_napi_poll(napi, work, budget);
gro_normal_list(napi);
count:
if (work > 0)
@ -6578,7 +6609,7 @@ count:
if (unlikely(need_resched())) {
if (napi_poll)
busy_poll_stop(napi, have_poll_lock);
busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
preempt_enable();
rcu_read_unlock();
cond_resched();
@ -6589,7 +6620,7 @@ count:
cpu_relax();
}
if (napi_poll)
busy_poll_stop(napi, have_poll_lock);
busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
preempt_enable();
out:
rcu_read_unlock();
@ -6640,8 +6671,10 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
* NAPI_STATE_MISSED, since we do not react to a device IRQ.
*/
if (!napi_disable_pending(napi) &&
!test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
!test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
__napi_schedule_irqoff(napi);
}
return HRTIMER_NORESTART;
}
@ -6699,6 +6732,7 @@ void napi_disable(struct napi_struct *n)
hrtimer_cancel(&n->timer);
clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);
@ -6771,6 +6805,19 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
goto out_unlock;
}
/* The NAPI context has more processing work, but busy-polling
* is preferred. Exit early.
*/
if (napi_prefer_busy_poll(n)) {
if (napi_complete_done(n, work)) {
/* If timeout is not set, we need to make sure
* that the NAPI is re-scheduled.
*/
napi_schedule(n);
}
goto out_unlock;
}
if (n->gro_bitmask) {
/* flush too old packets
* If HZ < 1000, flush all packets.
@ -9753,7 +9800,7 @@ static int netif_alloc_rx_queues(struct net_device *dev)
rx[i].dev = dev;
/* XDP RX-queue setup */
err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
if (err < 0)
goto err_rxq_info;
}

View File

@ -4910,6 +4910,9 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
tp->notsent_lowat = val;
sk->sk_write_space(sk);
break;
case TCP_WINDOW_CLAMP:
ret = tcp_set_window_clamp(sk, val);
break;
default:
ret = -EINVAL;
}
@ -6995,6 +6998,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sk_storage_delete_proto;
case BPF_FUNC_setsockopt:
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET4_BIND:
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
return &bpf_sock_addr_setsockopt_proto;
@ -7003,6 +7008,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
case BPF_FUNC_getsockopt:
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET4_BIND:
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
return &bpf_sock_addr_getsockopt_proto;

View File

@ -1159,6 +1159,22 @@ set_sndbuf:
sk->sk_ll_usec = val;
}
break;
case SO_PREFER_BUSY_POLL:
if (valbool && !capable(CAP_NET_ADMIN))
ret = -EPERM;
else
WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
break;
case SO_BUSY_POLL_BUDGET:
if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
ret = -EPERM;
} else {
if (val < 0 || val > U16_MAX)
ret = -EINVAL;
else
WRITE_ONCE(sk->sk_busy_poll_budget, val);
}
break;
#endif
case SO_MAX_PACING_RATE:
@ -1523,6 +1539,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
case SO_BUSY_POLL:
v.val = sk->sk_ll_usec;
break;
case SO_PREFER_BUSY_POLL:
v.val = READ_ONCE(sk->sk_prefer_busy_poll);
break;
#endif
case SO_MAX_PACING_RATE:

View File

@ -27,8 +27,6 @@ struct bpf_stab {
static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
{
struct bpf_stab *stab;
u64 cost;
int err;
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
@ -39,29 +37,22 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
stab = kzalloc(sizeof(*stab), GFP_USER);
stab = kzalloc(sizeof(*stab), GFP_USER | __GFP_ACCOUNT);
if (!stab)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&stab->map, attr);
raw_spin_lock_init(&stab->lock);
/* Make sure page count doesn't overflow. */
cost = (u64) stab->map.max_entries * sizeof(struct sock *);
err = bpf_map_charge_init(&stab->map.memory, cost);
if (err)
goto free_stab;
stab->sks = bpf_map_area_alloc(stab->map.max_entries *
sizeof(struct sock *),
stab->map.numa_node);
if (stab->sks)
return &stab->map;
err = -ENOMEM;
bpf_map_charge_finish(&stab->map.memory);
free_stab:
if (!stab->sks) {
kfree(stab);
return ERR_PTR(err);
return ERR_PTR(-ENOMEM);
}
return &stab->map;
}
int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
@ -975,7 +966,8 @@ static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab,
}
}
new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
new = bpf_map_kmalloc_node(&htab->map, htab->elem_size,
GFP_ATOMIC | __GFP_NOWARN,
htab->map.numa_node);
if (!new) {
atomic_dec(&htab->count);
@ -1103,7 +1095,6 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
{
struct bpf_shtab *htab;
int i, err;
u64 cost;
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
@ -1116,7 +1107,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
if (attr->key_size > MAX_BPF_STACK)
return ERR_PTR(-E2BIG);
htab = kzalloc(sizeof(*htab), GFP_USER);
htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT);
if (!htab)
return ERR_PTR(-ENOMEM);
@ -1131,21 +1122,10 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
goto free_htab;
}
cost = (u64) htab->buckets_num * sizeof(struct bpf_shtab_bucket) +
(u64) htab->elem_size * htab->map.max_entries;
if (cost >= U32_MAX - PAGE_SIZE) {
err = -EINVAL;
goto free_htab;
}
err = bpf_map_charge_init(&htab->map.memory, cost);
if (err)
goto free_htab;
htab->buckets = bpf_map_area_alloc(htab->buckets_num *
sizeof(struct bpf_shtab_bucket),
htab->map.numa_node);
if (!htab->buckets) {
bpf_map_charge_finish(&htab->map.memory);
err = -ENOMEM;
goto free_htab;
}

View File

@ -158,7 +158,7 @@ static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq)
/* Returns 0 on success, negative on failure */
int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
struct net_device *dev, u32 queue_index)
struct net_device *dev, u32 queue_index, unsigned int napi_id)
{
if (xdp_rxq->reg_state == REG_STATE_UNUSED) {
WARN(1, "Driver promised not to register this");
@ -179,6 +179,7 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
xdp_rxq_info_init(xdp_rxq);
xdp_rxq->dev = dev;
xdp_rxq->queue_index = queue_index;
xdp_rxq->napi_id = napi_id;
xdp_rxq->reg_state = REG_STATE_REGISTERED;
return 0;

View File

@ -450,7 +450,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
/* BPF prog is run before any checks are done so that if the prog
* changes context in a wrong way it will be caught.
*/
err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr);
err = BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr);
if (err)
return err;

View File

@ -95,6 +95,7 @@ static bool bpf_tcp_ca_is_valid_access(int off, int size,
}
static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
const struct btf *btf,
const struct btf_type *t, int off,
int size, enum bpf_access_type atype,
u32 *next_btf_id)
@ -102,7 +103,7 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
size_t end;
if (atype == BPF_READ)
return btf_struct_access(log, t, off, size, atype, next_btf_id);
return btf_struct_access(log, btf, t, off, size, atype, next_btf_id);
if (t != tcp_sock_type) {
bpf_log(log, "only read is supported\n");

View File

@ -3042,6 +3042,21 @@ int tcp_sock_set_keepcnt(struct sock *sk, int val)
}
EXPORT_SYMBOL(tcp_sock_set_keepcnt);
int tcp_set_window_clamp(struct sock *sk, int val)
{
struct tcp_sock *tp = tcp_sk(sk);
if (!val) {
if (sk->sk_state != TCP_CLOSE)
return -EINVAL;
tp->window_clamp = 0;
} else {
tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
SOCK_MIN_RCVBUF / 2 : val;
}
return 0;
}
/*
* Socket option code for TCP.
*/
@ -3255,15 +3270,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
break;
case TCP_WINDOW_CLAMP:
if (!val) {
if (sk->sk_state != TCP_CLOSE) {
err = -EINVAL;
break;
}
tp->window_clamp = 0;
} else
tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
SOCK_MIN_RCVBUF / 2 : val;
err = tcp_set_window_clamp(sk, val);
break;
case TCP_QUICKACK:

View File

@ -451,7 +451,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
/* BPF prog is run before any checks are done so that if the prog
* changes context in a wrong way it will be caught.
*/
err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr);
err = BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr);
if (err)
return err;

View File

@ -23,6 +23,7 @@
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock_drv.h>
#include <net/busy_poll.h>
#include <net/xdp.h>
#include "xsk_queue.h"
@ -232,6 +233,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp,
if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
return -EINVAL;
sk_mark_napi_id_once_xdp(&xs->sk, xdp);
len = xdp->data_end - xdp->data;
return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ?
@ -332,6 +334,63 @@ out:
}
EXPORT_SYMBOL(xsk_tx_peek_desc);
static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, struct xdp_desc *descs,
u32 max_entries)
{
u32 nb_pkts = 0;
while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
nb_pkts++;
xsk_tx_release(pool);
return nb_pkts;
}
u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *descs,
u32 max_entries)
{
struct xdp_sock *xs;
u32 nb_pkts;
rcu_read_lock();
if (!list_is_singular(&pool->xsk_tx_list)) {
/* Fallback to the non-batched version */
rcu_read_unlock();
return xsk_tx_peek_release_fallback(pool, descs, max_entries);
}
xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
if (!xs) {
nb_pkts = 0;
goto out;
}
nb_pkts = xskq_cons_peek_desc_batch(xs->tx, descs, pool, max_entries);
if (!nb_pkts) {
xs->tx->queue_empty_descs++;
goto out;
}
/* This is the backpressure mechanism for the Tx path. Try to
* reserve space in the completion queue for all packets, but
* if there are fewer slots available, just process that many
* packets. This avoids having to implement any buffering in
* the Tx path.
*/
nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, descs, nb_pkts);
if (!nb_pkts)
goto out;
xskq_cons_release_n(xs->tx, nb_pkts);
__xskq_cons_release(xs->tx);
xs->sk.sk_write_space(&xs->sk);
out:
rcu_read_unlock();
return nb_pkts;
}
EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);
static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
{
struct net_device *dev = xs->dev;
@ -454,18 +513,65 @@ static int __xsk_sendmsg(struct sock *sk)
return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
}
static bool xsk_no_wakeup(struct sock *sk)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
/* Prefer busy-polling, skip the wakeup. */
return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID;
#else
return false;
#endif
}
static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
struct sock *sk = sock->sk;
struct xdp_sock *xs = xdp_sk(sk);
struct xsk_buff_pool *pool;
if (unlikely(!xsk_is_bound(xs)))
return -ENXIO;
if (unlikely(need_wait))
return -EOPNOTSUPP;
if (sk_can_busy_loop(sk))
sk_busy_loop(sk, 1); /* only support non-blocking sockets */
if (xsk_no_wakeup(sk))
return 0;
pool = xs->pool;
if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
return __xsk_sendmsg(sk);
return 0;
}
static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
{
bool need_wait = !(flags & MSG_DONTWAIT);
struct sock *sk = sock->sk;
struct xdp_sock *xs = xdp_sk(sk);
if (unlikely(!(xs->dev->flags & IFF_UP)))
return -ENETDOWN;
if (unlikely(!xs->rx))
return -ENOBUFS;
if (unlikely(!xsk_is_bound(xs)))
return -ENXIO;
if (unlikely(need_wait))
return -EOPNOTSUPP;
if (sk_can_busy_loop(sk))
sk_busy_loop(sk, 1); /* only support non-blocking sockets */
if (xsk_no_wakeup(sk))
return 0;
if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
return xsk_wakeup(xs, XDP_WAKEUP_RX);
return 0;
}
static __poll_t xsk_poll(struct file *file, struct socket *sock,
@ -542,7 +648,7 @@ static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
node);
if (node) {
WARN_ON(xsk_map_inc(node->map));
bpf_map_inc(&node->map->map);
map = node->map;
*map_entry = node->map_entry;
}
@ -572,7 +678,7 @@ static void xsk_delete_from_maps(struct xdp_sock *xs)
while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
xsk_map_try_sock_delete(map, xs, map_entry);
xsk_map_put(map);
bpf_map_put(&map->map);
}
}
@ -1128,7 +1234,7 @@ static const struct proto_ops xsk_proto_ops = {
.setsockopt = xsk_setsockopt,
.getsockopt = xsk_getsockopt,
.sendmsg = xsk_sendmsg,
.recvmsg = sock_no_recvmsg,
.recvmsg = xsk_recvmsg,
.mmap = xsk_mmap,
.sendpage = sock_no_sendpage,
};

View File

@ -41,8 +41,6 @@ static inline struct xdp_sock *xdp_sk(struct sock *sk)
void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
struct xdp_sock **map_entry);
int xsk_map_inc(struct xsk_map *map);
void xsk_map_put(struct xsk_map *map);
void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id);
int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
u16 queue_id);

View File

@ -144,14 +144,13 @@ static int __xp_assign_dev(struct xsk_buff_pool *pool,
if (err)
return err;
if (flags & XDP_USE_NEED_WAKEUP) {
if (flags & XDP_USE_NEED_WAKEUP)
pool->uses_need_wakeup = true;
/* Tx needs to be explicitly woken up the first time.
* Also for supporting drivers that do not implement this
* feature. They will always have to call sendto().
/* Tx needs to be explicitly woken up the first time. Also
* for supporting drivers that do not implement this
* feature. They will always have to call sendto() or poll().
*/
pool->cached_need_wakeup = XDP_WAKEUP_TX;
}
dev_hold(netdev);

View File

@ -18,9 +18,11 @@ struct xdp_ring {
/* Hinder the adjacent cache prefetcher to prefetch the consumer
* pointer if the producer pointer is touched and vice versa.
*/
u32 pad ____cacheline_aligned_in_smp;
u32 pad1 ____cacheline_aligned_in_smp;
u32 consumer ____cacheline_aligned_in_smp;
u32 pad2 ____cacheline_aligned_in_smp;
u32 flags;
u32 pad3 ____cacheline_aligned_in_smp;
};
/* Used for the RX and TX queues for packets */
@ -197,6 +199,30 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q,
return false;
}
static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q,
struct xdp_desc *descs,
struct xsk_buff_pool *pool, u32 max)
{
u32 cached_cons = q->cached_cons, nb_entries = 0;
while (cached_cons != q->cached_prod && nb_entries < max) {
struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
u32 idx = cached_cons & q->ring_mask;
descs[nb_entries] = ring->desc[idx];
if (unlikely(!xskq_cons_is_valid_desc(q, &descs[nb_entries], pool))) {
/* Skip the entry */
cached_cons++;
continue;
}
nb_entries++;
cached_cons++;
}
return nb_entries;
}
/* Functions for consumers */
static inline void __xskq_cons_release(struct xsk_queue *q)
@ -218,17 +244,22 @@ static inline void xskq_cons_get_entries(struct xsk_queue *q)
__xskq_cons_peek(q);
}
static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt)
static inline u32 xskq_cons_nb_entries(struct xsk_queue *q, u32 max)
{
u32 entries = q->cached_prod - q->cached_cons;
if (entries >= cnt)
return true;
if (entries >= max)
return max;
__xskq_cons_peek(q);
entries = q->cached_prod - q->cached_cons;
return entries >= cnt;
return entries >= max ? max : entries;
}
static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt)
{
return xskq_cons_nb_entries(q, cnt) >= cnt ? true : false;
}
static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr)
@ -247,16 +278,28 @@ static inline bool xskq_cons_peek_desc(struct xsk_queue *q,
return xskq_cons_read_desc(q, desc, pool);
}
static inline u32 xskq_cons_peek_desc_batch(struct xsk_queue *q, struct xdp_desc *descs,
struct xsk_buff_pool *pool, u32 max)
{
u32 entries = xskq_cons_nb_entries(q, max);
return xskq_cons_read_desc_batch(q, descs, pool, entries);
}
/* To improve performance in the xskq_cons_release functions, only update local state here.
* Reflect this to global state when we get new entries from the ring in
* xskq_cons_get_entries() and whenever Rx or Tx processing are completed in the NAPI loop.
*/
static inline void xskq_cons_release(struct xsk_queue *q)
{
/* To improve performance, only update local state here.
* Reflect this to global state when we get new entries
* from the ring in xskq_cons_get_entries() and whenever
* Rx or Tx processing are completed in the NAPI loop.
*/
q->cached_cons++;
}
static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt)
{
q->cached_cons += cnt;
}
static inline bool xskq_cons_is_full(struct xsk_queue *q)
{
/* No barriers needed since data is not accessed */
@ -266,18 +309,23 @@ static inline bool xskq_cons_is_full(struct xsk_queue *q)
/* Functions for producers */
static inline bool xskq_prod_is_full(struct xsk_queue *q)
static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
{
u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons);
if (free_entries)
return false;
if (free_entries >= max)
return max;
/* Refresh the local tail pointer */
q->cached_cons = READ_ONCE(q->ring->consumer);
free_entries = q->nentries - (q->cached_prod - q->cached_cons);
return !free_entries;
return free_entries >= max ? max : free_entries;
}
static inline bool xskq_prod_is_full(struct xsk_queue *q)
{
return xskq_prod_nb_free(q, 1) ? false : true;
}
static inline int xskq_prod_reserve(struct xsk_queue *q)
@ -302,6 +350,23 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
return 0;
}
static inline u32 xskq_prod_reserve_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
u32 max)
{
struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
u32 nb_entries, i, cached_prod;
nb_entries = xskq_prod_nb_free(q, max);
/* A, matches D */
cached_prod = q->cached_prod;
for (i = 0; i < nb_entries; i++)
ring->desc[cached_prod++ & q->ring_mask] = descs[i].addr;
q->cached_prod = cached_prod;
return nb_entries;
}
static inline int xskq_prod_reserve_desc(struct xsk_queue *q,
u64 addr, u32 len)
{

View File

@ -11,32 +11,17 @@
#include "xsk.h"
int xsk_map_inc(struct xsk_map *map)
{
bpf_map_inc(&map->map);
return 0;
}
void xsk_map_put(struct xsk_map *map)
{
bpf_map_put(&map->map);
}
static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
struct xdp_sock **map_entry)
{
struct xsk_map_node *node;
int err;
node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN);
node = bpf_map_kzalloc(&map->map, sizeof(*node),
GFP_ATOMIC | __GFP_NOWARN);
if (!node)
return ERR_PTR(-ENOMEM);
err = xsk_map_inc(map);
if (err) {
kfree(node);
return ERR_PTR(err);
}
bpf_map_inc(&map->map);
node->map = map;
node->map_entry = map_entry;
@ -45,7 +30,7 @@ static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
static void xsk_map_node_free(struct xsk_map_node *node)
{
xsk_map_put(node->map);
bpf_map_put(&node->map->map);
kfree(node);
}
@ -73,9 +58,8 @@ static void xsk_map_sock_delete(struct xdp_sock *xs,
static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
{
struct bpf_map_memory mem;
int err, numa_node;
struct xsk_map *m;
int numa_node;
u64 size;
if (!capable(CAP_NET_ADMIN))
@ -89,18 +73,11 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
numa_node = bpf_map_attr_numa_node(attr);
size = struct_size(m, xsk_map, attr->max_entries);
err = bpf_map_charge_init(&mem, size);
if (err < 0)
return ERR_PTR(err);
m = bpf_map_area_alloc(size, numa_node);
if (!m) {
bpf_map_charge_finish(&mem);
if (!m)
return ERR_PTR(-ENOMEM);
}
bpf_map_init_from_attr(&m->map, attr);
bpf_map_charge_move(&m->map.memory, &mem);
spin_lock_init(&m->lock);
return &m->map;

View File

@ -52,3 +52,6 @@ xdp_tx_iptunnel
xdpsock
xsk_fwd
testfile.img
hbm_out.log
iperf.*
*.out

View File

@ -48,6 +48,7 @@ tprogs-y += syscall_tp
tprogs-y += cpustat
tprogs-y += xdp_adjust_tail
tprogs-y += xdpsock
tprogs-y += xdpsock_ctrl_proc
tprogs-y += xsk_fwd
tprogs-y += xdp_fwd
tprogs-y += task_fd_query
@ -73,16 +74,16 @@ tracex5-objs := tracex5_user.o $(TRACE_HELPERS)
tracex6-objs := tracex6_user.o
tracex7-objs := tracex7_user.o
test_probe_write_user-objs := test_probe_write_user_user.o
trace_output-objs := trace_output_user.o $(TRACE_HELPERS)
trace_output-objs := trace_output_user.o
lathist-objs := lathist_user.o
offwaketime-objs := offwaketime_user.o $(TRACE_HELPERS)
spintest-objs := spintest_user.o $(TRACE_HELPERS)
map_perf_test-objs := map_perf_test_user.o
test_overhead-objs := bpf_load.o test_overhead_user.o
test_overhead-objs := test_overhead_user.o
test_cgrp2_array_pin-objs := test_cgrp2_array_pin.o
test_cgrp2_attach-objs := test_cgrp2_attach.o
test_cgrp2_sock-objs := test_cgrp2_sock.o
test_cgrp2_sock2-objs := bpf_load.o test_cgrp2_sock2.o
test_cgrp2_sock2-objs := test_cgrp2_sock2.o
xdp1-objs := xdp1_user.o
# reuse xdp1 source intentionally
xdp2-objs := xdp1_user.o
@ -91,8 +92,8 @@ test_current_task_under_cgroup-objs := $(CGROUP_HELPERS) \
test_current_task_under_cgroup_user.o
trace_event-objs := trace_event_user.o $(TRACE_HELPERS)
sampleip-objs := sampleip_user.o $(TRACE_HELPERS)
tc_l2_redirect-objs := bpf_load.o tc_l2_redirect_user.o
lwt_len_hist-objs := bpf_load.o lwt_len_hist_user.o
tc_l2_redirect-objs := tc_l2_redirect_user.o
lwt_len_hist-objs := lwt_len_hist_user.o
xdp_tx_iptunnel-objs := xdp_tx_iptunnel_user.o
test_map_in_map-objs := test_map_in_map_user.o
per_socket_stats_example-objs := cookie_uid_helper_example.o
@ -105,12 +106,13 @@ syscall_tp-objs := syscall_tp_user.o
cpustat-objs := cpustat_user.o
xdp_adjust_tail-objs := xdp_adjust_tail_user.o
xdpsock-objs := xdpsock_user.o
xdpsock_ctrl_proc-objs := xdpsock_ctrl_proc.o
xsk_fwd-objs := xsk_fwd.o
xdp_fwd-objs := xdp_fwd_user.o
task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS)
ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS)
hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS)
task_fd_query-objs := task_fd_query_user.o $(TRACE_HELPERS)
xdp_sample_pkts-objs := xdp_sample_pkts_user.o
ibumad-objs := ibumad_user.o
hbm-objs := hbm.o $(CGROUP_HELPERS)
# Tell kbuild to always build the programs
always-y := $(tprogs-y)
@ -197,14 +199,12 @@ TPROGS_CFLAGS += --sysroot=$(SYSROOT)
TPROGS_LDFLAGS := -L$(SYSROOT)/usr/lib
endif
TPROGCFLAGS_bpf_load.o += -Wno-unused-variable
TPROGS_LDLIBS += $(LIBBPF) -lelf -lz
TPROGLDLIBS_tracex4 += -lrt
TPROGLDLIBS_trace_output += -lrt
TPROGLDLIBS_map_perf_test += -lrt
TPROGLDLIBS_test_overhead += -lrt
TPROGLDLIBS_xdpsock += -pthread
TPROGLDLIBS_xdpsock += -pthread -lcap
TPROGLDLIBS_xsk_fwd += -pthread
# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:

View File

@ -1,667 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <libelf.h>
#include <gelf.h>
#include <errno.h>
#include <unistd.h>
#include <string.h>
#include <stdbool.h>
#include <stdlib.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/perf_event.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/types.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <poll.h>
#include <ctype.h>
#include <assert.h>
#include <bpf/bpf.h>
#include "bpf_load.h"
#include "perf-sys.h"
#define DEBUGFS "/sys/kernel/debug/tracing/"
static char license[128];
static int kern_version;
static bool processed_sec[128];
char bpf_log_buf[BPF_LOG_BUF_SIZE];
int map_fd[MAX_MAPS];
int prog_fd[MAX_PROGS];
int event_fd[MAX_PROGS];
int prog_cnt;
int prog_array_fd = -1;
struct bpf_map_data map_data[MAX_MAPS];
int map_data_count;
static int populate_prog_array(const char *event, int prog_fd)
{
int ind = atoi(event), err;
err = bpf_map_update_elem(prog_array_fd, &ind, &prog_fd, BPF_ANY);
if (err < 0) {
printf("failed to store prog_fd in prog_array\n");
return -1;
}
return 0;
}
static int write_kprobe_events(const char *val)
{
int fd, ret, flags;
if (val == NULL)
return -1;
else if (val[0] == '\0')
flags = O_WRONLY | O_TRUNC;
else
flags = O_WRONLY | O_APPEND;
fd = open(DEBUGFS "kprobe_events", flags);
ret = write(fd, val, strlen(val));
close(fd);
return ret;
}
static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
{
bool is_socket = strncmp(event, "socket", 6) == 0;
bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
bool is_raw_tracepoint = strncmp(event, "raw_tracepoint/", 15) == 0;
bool is_xdp = strncmp(event, "xdp", 3) == 0;
bool is_perf_event = strncmp(event, "perf_event", 10) == 0;
bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0;
bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0;
bool is_sockops = strncmp(event, "sockops", 7) == 0;
bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0;
bool is_sk_msg = strncmp(event, "sk_msg", 6) == 0;
size_t insns_cnt = size / sizeof(struct bpf_insn);
enum bpf_prog_type prog_type;
char buf[256];
int fd, efd, err, id;
struct perf_event_attr attr = {};
attr.type = PERF_TYPE_TRACEPOINT;
attr.sample_type = PERF_SAMPLE_RAW;
attr.sample_period = 1;
attr.wakeup_events = 1;
if (is_socket) {
prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
} else if (is_kprobe || is_kretprobe) {
prog_type = BPF_PROG_TYPE_KPROBE;
} else if (is_tracepoint) {
prog_type = BPF_PROG_TYPE_TRACEPOINT;
} else if (is_raw_tracepoint) {
prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT;
} else if (is_xdp) {
prog_type = BPF_PROG_TYPE_XDP;
} else if (is_perf_event) {
prog_type = BPF_PROG_TYPE_PERF_EVENT;
} else if (is_cgroup_skb) {
prog_type = BPF_PROG_TYPE_CGROUP_SKB;
} else if (is_cgroup_sk) {
prog_type = BPF_PROG_TYPE_CGROUP_SOCK;
} else if (is_sockops) {
prog_type = BPF_PROG_TYPE_SOCK_OPS;
} else if (is_sk_skb) {
prog_type = BPF_PROG_TYPE_SK_SKB;
} else if (is_sk_msg) {
prog_type = BPF_PROG_TYPE_SK_MSG;
} else {
printf("Unknown event '%s'\n", event);
return -1;
}
if (prog_cnt == MAX_PROGS)
return -1;
fd = bpf_load_program(prog_type, prog, insns_cnt, license, kern_version,
bpf_log_buf, BPF_LOG_BUF_SIZE);
if (fd < 0) {
printf("bpf_load_program() err=%d\n%s", errno, bpf_log_buf);
return -1;
}
prog_fd[prog_cnt++] = fd;
if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk)
return 0;
if (is_socket || is_sockops || is_sk_skb || is_sk_msg) {
if (is_socket)
event += 6;
else
event += 7;
if (*event != '/')
return 0;
event++;
if (!isdigit(*event)) {
printf("invalid prog number\n");
return -1;
}
return populate_prog_array(event, fd);
}
if (is_raw_tracepoint) {
efd = bpf_raw_tracepoint_open(event + 15, fd);
if (efd < 0) {
printf("tracepoint %s %s\n", event + 15, strerror(errno));
return -1;
}
event_fd[prog_cnt - 1] = efd;
return 0;
}
if (is_kprobe || is_kretprobe) {
bool need_normal_check = true;
const char *event_prefix = "";
if (is_kprobe)
event += 7;
else
event += 10;
if (*event == 0) {
printf("event name cannot be empty\n");
return -1;
}
if (isdigit(*event))
return populate_prog_array(event, fd);
#ifdef __x86_64__
if (strncmp(event, "sys_", 4) == 0) {
snprintf(buf, sizeof(buf), "%c:__x64_%s __x64_%s",
is_kprobe ? 'p' : 'r', event, event);
err = write_kprobe_events(buf);
if (err >= 0) {
need_normal_check = false;
event_prefix = "__x64_";
}
}
#endif
if (need_normal_check) {
snprintf(buf, sizeof(buf), "%c:%s %s",
is_kprobe ? 'p' : 'r', event, event);
err = write_kprobe_events(buf);
if (err < 0) {
printf("failed to create kprobe '%s' error '%s'\n",
event, strerror(errno));
return -1;
}
}
strcpy(buf, DEBUGFS);
strcat(buf, "events/kprobes/");
strcat(buf, event_prefix);
strcat(buf, event);
strcat(buf, "/id");
} else if (is_tracepoint) {
event += 11;
if (*event == 0) {
printf("event name cannot be empty\n");
return -1;
}
strcpy(buf, DEBUGFS);
strcat(buf, "events/");
strcat(buf, event);
strcat(buf, "/id");
}
efd = open(buf, O_RDONLY, 0);
if (efd < 0) {
printf("failed to open event %s\n", event);
return -1;
}
err = read(efd, buf, sizeof(buf));
if (err < 0 || err >= sizeof(buf)) {
printf("read from '%s' failed '%s'\n", event, strerror(errno));
return -1;
}
close(efd);
buf[err] = 0;
id = atoi(buf);
attr.config = id;
efd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
if (efd < 0) {
printf("event %d fd %d err %s\n", id, efd, strerror(errno));
return -1;
}
event_fd[prog_cnt - 1] = efd;
err = ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
if (err < 0) {
printf("ioctl PERF_EVENT_IOC_ENABLE failed err %s\n",
strerror(errno));
return -1;
}
err = ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
if (err < 0) {
printf("ioctl PERF_EVENT_IOC_SET_BPF failed err %s\n",
strerror(errno));
return -1;
}
return 0;
}
static int load_maps(struct bpf_map_data *maps, int nr_maps,
fixup_map_cb fixup_map)
{
int i, numa_node;
for (i = 0; i < nr_maps; i++) {
if (fixup_map) {
fixup_map(&maps[i], i);
/* Allow userspace to assign map FD prior to creation */
if (maps[i].fd != -1) {
map_fd[i] = maps[i].fd;
continue;
}
}
numa_node = maps[i].def.map_flags & BPF_F_NUMA_NODE ?
maps[i].def.numa_node : -1;
if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) {
int inner_map_fd = map_fd[maps[i].def.inner_map_idx];
map_fd[i] = bpf_create_map_in_map_node(maps[i].def.type,
maps[i].name,
maps[i].def.key_size,
inner_map_fd,
maps[i].def.max_entries,
maps[i].def.map_flags,
numa_node);
} else {
map_fd[i] = bpf_create_map_node(maps[i].def.type,
maps[i].name,
maps[i].def.key_size,
maps[i].def.value_size,
maps[i].def.max_entries,
maps[i].def.map_flags,
numa_node);
}
if (map_fd[i] < 0) {
printf("failed to create map %d (%s): %d %s\n",
i, maps[i].name, errno, strerror(errno));
return 1;
}
maps[i].fd = map_fd[i];
if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY)
prog_array_fd = map_fd[i];
}
return 0;
}
static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname,
GElf_Shdr *shdr, Elf_Data **data)
{
Elf_Scn *scn;
scn = elf_getscn(elf, i);
if (!scn)
return 1;
if (gelf_getshdr(scn, shdr) != shdr)
return 2;
*shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
if (!*shname || !shdr->sh_size)
return 3;
*data = elf_getdata(scn, 0);
if (!*data || elf_getdata(scn, *data) != NULL)
return 4;
return 0;
}
static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols,
GElf_Shdr *shdr, struct bpf_insn *insn,
struct bpf_map_data *maps, int nr_maps)
{
int i, nrels;
nrels = shdr->sh_size / shdr->sh_entsize;
for (i = 0; i < nrels; i++) {
GElf_Sym sym;
GElf_Rel rel;
unsigned int insn_idx;
bool match = false;
int j, map_idx;
gelf_getrel(data, i, &rel);
insn_idx = rel.r_offset / sizeof(struct bpf_insn);
gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym);
if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) {
printf("invalid relo for insn[%d].code 0x%x\n",
insn_idx, insn[insn_idx].code);
return 1;
}
insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD;
/* Match FD relocation against recorded map_data[] offset */
for (map_idx = 0; map_idx < nr_maps; map_idx++) {
if (maps[map_idx].elf_offset == sym.st_value) {
match = true;
break;
}
}
if (match) {
insn[insn_idx].imm = maps[map_idx].fd;
} else {
printf("invalid relo for insn[%d] no map_data match\n",
insn_idx);
return 1;
}
}
return 0;
}
static int cmp_symbols(const void *l, const void *r)
{
const GElf_Sym *lsym = (const GElf_Sym *)l;
const GElf_Sym *rsym = (const GElf_Sym *)r;
if (lsym->st_value < rsym->st_value)
return -1;
else if (lsym->st_value > rsym->st_value)
return 1;
else
return 0;
}
static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx,
Elf *elf, Elf_Data *symbols, int strtabidx)
{
int map_sz_elf, map_sz_copy;
bool validate_zero = false;
Elf_Data *data_maps;
int i, nr_maps;
GElf_Sym *sym;
Elf_Scn *scn;
int copy_sz;
if (maps_shndx < 0)
return -EINVAL;
if (!symbols)
return -EINVAL;
/* Get data for maps section via elf index */
scn = elf_getscn(elf, maps_shndx);
if (scn)
data_maps = elf_getdata(scn, NULL);
if (!scn || !data_maps) {
printf("Failed to get Elf_Data from maps section %d\n",
maps_shndx);
return -EINVAL;
}
/* For each map get corrosponding symbol table entry */
sym = calloc(MAX_MAPS+1, sizeof(GElf_Sym));
for (i = 0, nr_maps = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) {
assert(nr_maps < MAX_MAPS+1);
if (!gelf_getsym(symbols, i, &sym[nr_maps]))
continue;
if (sym[nr_maps].st_shndx != maps_shndx)
continue;
/* Only increment iif maps section */
nr_maps++;
}
/* Align to map_fd[] order, via sort on offset in sym.st_value */
qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols);
/* Keeping compatible with ELF maps section changes
* ------------------------------------------------
* The program size of struct bpf_load_map_def is known by loader
* code, but struct stored in ELF file can be different.
*
* Unfortunately sym[i].st_size is zero. To calculate the
* struct size stored in the ELF file, assume all struct have
* the same size, and simply divide with number of map
* symbols.
*/
map_sz_elf = data_maps->d_size / nr_maps;
map_sz_copy = sizeof(struct bpf_load_map_def);
if (map_sz_elf < map_sz_copy) {
/*
* Backward compat, loading older ELF file with
* smaller struct, keeping remaining bytes zero.
*/
map_sz_copy = map_sz_elf;
} else if (map_sz_elf > map_sz_copy) {
/*
* Forward compat, loading newer ELF file with larger
* struct with unknown features. Assume zero means
* feature not used. Thus, validate rest of struct
* data is zero.
*/
validate_zero = true;
}
/* Memcpy relevant part of ELF maps data to loader maps */
for (i = 0; i < nr_maps; i++) {
struct bpf_load_map_def *def;
unsigned char *addr, *end;
const char *map_name;
size_t offset;
map_name = elf_strptr(elf, strtabidx, sym[i].st_name);
maps[i].name = strdup(map_name);
if (!maps[i].name) {
printf("strdup(%s): %s(%d)\n", map_name,
strerror(errno), errno);
free(sym);
return -errno;
}
/* Symbol value is offset into ELF maps section data area */
offset = sym[i].st_value;
def = (struct bpf_load_map_def *)(data_maps->d_buf + offset);
maps[i].elf_offset = offset;
memset(&maps[i].def, 0, sizeof(struct bpf_load_map_def));
memcpy(&maps[i].def, def, map_sz_copy);
/* Verify no newer features were requested */
if (validate_zero) {
addr = (unsigned char *) def + map_sz_copy;
end = (unsigned char *) def + map_sz_elf;
for (; addr < end; addr++) {
if (*addr != 0) {
free(sym);
return -EFBIG;
}
}
}
}
free(sym);
return nr_maps;
}
static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
{
int fd, i, ret, maps_shndx = -1, strtabidx = -1;
Elf *elf;
GElf_Ehdr ehdr;
GElf_Shdr shdr, shdr_prog;
Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL;
char *shname, *shname_prog;
int nr_maps = 0;
/* reset global variables */
kern_version = 0;
memset(license, 0, sizeof(license));
memset(processed_sec, 0, sizeof(processed_sec));
if (elf_version(EV_CURRENT) == EV_NONE)
return 1;
fd = open(path, O_RDONLY, 0);
if (fd < 0)
return 1;
elf = elf_begin(fd, ELF_C_READ, NULL);
if (!elf)
return 1;
if (gelf_getehdr(elf, &ehdr) != &ehdr)
return 1;
/* clear all kprobes */
i = write_kprobe_events("");
/* scan over all elf sections to get license and map info */
for (i = 1; i < ehdr.e_shnum; i++) {
if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
continue;
if (0) /* helpful for llvm debugging */
printf("section %d:%s data %p size %zd link %d flags %d\n",
i, shname, data->d_buf, data->d_size,
shdr.sh_link, (int) shdr.sh_flags);
if (strcmp(shname, "license") == 0) {
processed_sec[i] = true;
memcpy(license, data->d_buf, data->d_size);
} else if (strcmp(shname, "version") == 0) {
processed_sec[i] = true;
if (data->d_size != sizeof(int)) {
printf("invalid size of version section %zd\n",
data->d_size);
return 1;
}
memcpy(&kern_version, data->d_buf, sizeof(int));
} else if (strcmp(shname, "maps") == 0) {
int j;
maps_shndx = i;
data_maps = data;
for (j = 0; j < MAX_MAPS; j++)
map_data[j].fd = -1;
} else if (shdr.sh_type == SHT_SYMTAB) {
strtabidx = shdr.sh_link;
symbols = data;
}
}
ret = 1;
if (!symbols) {
printf("missing SHT_SYMTAB section\n");
goto done;
}
if (data_maps) {
nr_maps = load_elf_maps_section(map_data, maps_shndx,
elf, symbols, strtabidx);
if (nr_maps < 0) {
printf("Error: Failed loading ELF maps (errno:%d):%s\n",
nr_maps, strerror(-nr_maps));
goto done;
}
if (load_maps(map_data, nr_maps, fixup_map))
goto done;
map_data_count = nr_maps;
processed_sec[maps_shndx] = true;
}
/* process all relo sections, and rewrite bpf insns for maps */
for (i = 1; i < ehdr.e_shnum; i++) {
if (processed_sec[i])
continue;
if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
continue;
if (shdr.sh_type == SHT_REL) {
struct bpf_insn *insns;
/* locate prog sec that need map fixup (relocations) */
if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog,
&shdr_prog, &data_prog))
continue;
if (shdr_prog.sh_type != SHT_PROGBITS ||
!(shdr_prog.sh_flags & SHF_EXECINSTR))
continue;
insns = (struct bpf_insn *) data_prog->d_buf;
processed_sec[i] = true; /* relo section */
if (parse_relo_and_apply(data, symbols, &shdr, insns,
map_data, nr_maps))
continue;
}
}
/* load programs */
for (i = 1; i < ehdr.e_shnum; i++) {
if (processed_sec[i])
continue;
if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
continue;
if (memcmp(shname, "kprobe/", 7) == 0 ||
memcmp(shname, "kretprobe/", 10) == 0 ||
memcmp(shname, "tracepoint/", 11) == 0 ||
memcmp(shname, "raw_tracepoint/", 15) == 0 ||
memcmp(shname, "xdp", 3) == 0 ||
memcmp(shname, "perf_event", 10) == 0 ||
memcmp(shname, "socket", 6) == 0 ||
memcmp(shname, "cgroup/", 7) == 0 ||
memcmp(shname, "sockops", 7) == 0 ||
memcmp(shname, "sk_skb", 6) == 0 ||
memcmp(shname, "sk_msg", 6) == 0) {
ret = load_and_attach(shname, data->d_buf,
data->d_size);
if (ret != 0)
goto done;
}
}
done:
close(fd);
return ret;
}
int load_bpf_file(char *path)
{
return do_load_bpf_file(path, NULL);
}
int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map)
{
return do_load_bpf_file(path, fixup_map);
}

View File

@ -1,57 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __BPF_LOAD_H
#define __BPF_LOAD_H
#include <bpf/bpf.h>
#define MAX_MAPS 32
#define MAX_PROGS 32
struct bpf_load_map_def {
unsigned int type;
unsigned int key_size;
unsigned int value_size;
unsigned int max_entries;
unsigned int map_flags;
unsigned int inner_map_idx;
unsigned int numa_node;
};
struct bpf_map_data {
int fd;
char *name;
size_t elf_offset;
struct bpf_load_map_def def;
};
typedef void (*fixup_map_cb)(struct bpf_map_data *map, int idx);
extern int prog_fd[MAX_PROGS];
extern int event_fd[MAX_PROGS];
extern char bpf_log_buf[BPF_LOG_BUF_SIZE];
extern int prog_cnt;
/* There is a one-to-one mapping between map_fd[] and map_data[].
* The map_data[] just contains more rich info on the given map.
*/
extern int map_fd[MAX_MAPS];
extern struct bpf_map_data map_data[MAX_MAPS];
extern int map_data_count;
/* parses elf file compiled by llvm .c->.o
* . parses 'maps' section and creates maps via BPF syscall
* . parses 'license' section and passes it to syscall
* . parses elf relocations for BPF maps and adjusts BPF_LD_IMM64 insns by
* storing map_fd into insn->imm and marking such insns as BPF_PSEUDO_MAP_FD
* . loads eBPF programs via BPF syscall
*
* One ELF file can contain multiple BPF programs which will be loaded
* and their FDs stored stored in prog_fd array
*
* returns zero on success
*/
int load_bpf_file(char *path);
int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map);
int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags);
#endif

Some files were not shown because too many files have changed in this diff Show More