Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Daniel Borkmann says:

====================
pull-request: bpf-next 2019-04-28

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) Introduce BPF socket local storage map so that BPF programs can store
   private data they associate with a socket (instead of e.g. separate hash
   table), from Martin.

2) Add support for bpftool to dump BTF types. This is done through a new
   `bpftool btf dump` sub-command, from Andrii.

3) Enable BPF-based flow dissector for skb-less eth_get_headlen() calls which
   was currently not supported since skb was used to lookup netns, from Stanislav.

4) Add an opt-in interface for tracepoints to expose a writable context
   for attached BPF programs, used here for NBD sockets, from Matt.

5) BPF xadd related arm64 JIT fixes and scalability improvements, from Daniel.

6) Change the skb->protocol for bpf_skb_adjust_room() helper in order to
   support tunnels such as sit. Add selftests as well, from Willem.

7) Various smaller misc fixes.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2019-04-28 08:42:41 -04:00
commit 5f0d736e7f
84 changed files with 4168 additions and 361 deletions

View File

@ -10742,6 +10742,7 @@ L: linux-block@vger.kernel.org
L: nbd@other.debian.org L: nbd@other.debian.org
F: Documentation/blockdev/nbd.txt F: Documentation/blockdev/nbd.txt
F: drivers/block/nbd.c F: drivers/block/nbd.c
F: include/trace/events/nbd.h
F: include/uapi/linux/nbd.h F: include/uapi/linux/nbd.h
NETWORK DROP MONITOR NETWORK DROP MONITOR

View File

@ -277,6 +277,7 @@ __AARCH64_INSN_FUNCS(adrp, 0x9F000000, 0x90000000)
__AARCH64_INSN_FUNCS(prfm, 0x3FC00000, 0x39800000) __AARCH64_INSN_FUNCS(prfm, 0x3FC00000, 0x39800000)
__AARCH64_INSN_FUNCS(prfm_lit, 0xFF000000, 0xD8000000) __AARCH64_INSN_FUNCS(prfm_lit, 0xFF000000, 0xD8000000)
__AARCH64_INSN_FUNCS(str_reg, 0x3FE0EC00, 0x38206800) __AARCH64_INSN_FUNCS(str_reg, 0x3FE0EC00, 0x38206800)
__AARCH64_INSN_FUNCS(ldadd, 0x3F20FC00, 0xB8200000)
__AARCH64_INSN_FUNCS(ldr_reg, 0x3FE0EC00, 0x38606800) __AARCH64_INSN_FUNCS(ldr_reg, 0x3FE0EC00, 0x38606800)
__AARCH64_INSN_FUNCS(ldr_lit, 0xBF000000, 0x18000000) __AARCH64_INSN_FUNCS(ldr_lit, 0xBF000000, 0x18000000)
__AARCH64_INSN_FUNCS(ldrsw_lit, 0xFF000000, 0x98000000) __AARCH64_INSN_FUNCS(ldrsw_lit, 0xFF000000, 0x98000000)
@ -394,6 +395,13 @@ u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg,
enum aarch64_insn_register state, enum aarch64_insn_register state,
enum aarch64_insn_size_type size, enum aarch64_insn_size_type size,
enum aarch64_insn_ldst_type type); enum aarch64_insn_ldst_type type);
u32 aarch64_insn_gen_ldadd(enum aarch64_insn_register result,
enum aarch64_insn_register address,
enum aarch64_insn_register value,
enum aarch64_insn_size_type size);
u32 aarch64_insn_gen_stadd(enum aarch64_insn_register address,
enum aarch64_insn_register value,
enum aarch64_insn_size_type size);
u32 aarch64_insn_gen_add_sub_imm(enum aarch64_insn_register dst, u32 aarch64_insn_gen_add_sub_imm(enum aarch64_insn_register dst,
enum aarch64_insn_register src, enum aarch64_insn_register src,
int imm, enum aarch64_insn_variant variant, int imm, enum aarch64_insn_variant variant,

View File

@ -734,6 +734,46 @@ u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg,
state); state);
} }
u32 aarch64_insn_gen_ldadd(enum aarch64_insn_register result,
enum aarch64_insn_register address,
enum aarch64_insn_register value,
enum aarch64_insn_size_type size)
{
u32 insn = aarch64_insn_get_ldadd_value();
switch (size) {
case AARCH64_INSN_SIZE_32:
case AARCH64_INSN_SIZE_64:
break;
default:
pr_err("%s: unimplemented size encoding %d\n", __func__, size);
return AARCH64_BREAK_FAULT;
}
insn = aarch64_insn_encode_ldst_size(size, insn);
insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn,
result);
insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn,
address);
return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn,
value);
}
u32 aarch64_insn_gen_stadd(enum aarch64_insn_register address,
enum aarch64_insn_register value,
enum aarch64_insn_size_type size)
{
/*
* STADD is simply encoded as an alias for LDADD with XZR as
* the destination register.
*/
return aarch64_insn_gen_ldadd(AARCH64_INSN_REG_ZR, address,
value, size);
}
static u32 aarch64_insn_encode_prfm_imm(enum aarch64_insn_prfm_type type, static u32 aarch64_insn_encode_prfm_imm(enum aarch64_insn_prfm_type type,
enum aarch64_insn_prfm_target target, enum aarch64_insn_prfm_target target,
enum aarch64_insn_prfm_policy policy, enum aarch64_insn_prfm_policy policy,

View File

@ -100,11 +100,9 @@
#define A64_STXR(sf, Rt, Rn, Rs) \ #define A64_STXR(sf, Rt, Rn, Rs) \
A64_LSX(sf, Rt, Rn, Rs, STORE_EX) A64_LSX(sf, Rt, Rn, Rs, STORE_EX)
/* Prefetch */ /* LSE atomics */
#define A64_PRFM(Rn, type, target, policy) \ #define A64_STADD(sf, Rn, Rs) \
aarch64_insn_gen_prefetch(Rn, AARCH64_INSN_PRFM_TYPE_##type, \ aarch64_insn_gen_stadd(Rn, Rs, A64_SIZE(sf))
AARCH64_INSN_PRFM_TARGET_##target, \
AARCH64_INSN_PRFM_POLICY_##policy)
/* Add/subtract (immediate) */ /* Add/subtract (immediate) */
#define A64_ADDSUB_IMM(sf, Rd, Rn, imm12, type) \ #define A64_ADDSUB_IMM(sf, Rd, Rn, imm12, type) \

View File

@ -365,7 +365,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
const bool is64 = BPF_CLASS(code) == BPF_ALU64 || const bool is64 = BPF_CLASS(code) == BPF_ALU64 ||
BPF_CLASS(code) == BPF_JMP; BPF_CLASS(code) == BPF_JMP;
const bool isdw = BPF_SIZE(code) == BPF_DW; const bool isdw = BPF_SIZE(code) == BPF_DW;
u8 jmp_cond; u8 jmp_cond, reg;
s32 jmp_offset; s32 jmp_offset;
#define check_imm(bits, imm) do { \ #define check_imm(bits, imm) do { \
@ -756,19 +756,28 @@ emit_cond_jmp:
break; break;
} }
break; break;
/* STX XADD: lock *(u32 *)(dst + off) += src */ /* STX XADD: lock *(u32 *)(dst + off) += src */
case BPF_STX | BPF_XADD | BPF_W: case BPF_STX | BPF_XADD | BPF_W:
/* STX XADD: lock *(u64 *)(dst + off) += src */ /* STX XADD: lock *(u64 *)(dst + off) += src */
case BPF_STX | BPF_XADD | BPF_DW: case BPF_STX | BPF_XADD | BPF_DW:
if (!off) {
reg = dst;
} else {
emit_a64_mov_i(1, tmp, off, ctx); emit_a64_mov_i(1, tmp, off, ctx);
emit(A64_ADD(1, tmp, tmp, dst), ctx); emit(A64_ADD(1, tmp, tmp, dst), ctx);
emit(A64_PRFM(tmp, PST, L1, STRM), ctx); reg = tmp;
emit(A64_LDXR(isdw, tmp2, tmp), ctx); }
if (cpus_have_cap(ARM64_HAS_LSE_ATOMICS)) {
emit(A64_STADD(isdw, reg, src), ctx);
} else {
emit(A64_LDXR(isdw, tmp2, reg), ctx);
emit(A64_ADD(isdw, tmp2, tmp2, src), ctx); emit(A64_ADD(isdw, tmp2, tmp2, src), ctx);
emit(A64_STXR(isdw, tmp2, tmp, tmp3), ctx); emit(A64_STXR(isdw, tmp2, reg, tmp3), ctx);
jmp_offset = -3; jmp_offset = -3;
check_imm19(jmp_offset); check_imm19(jmp_offset);
emit(A64_CBNZ(0, tmp3, jmp_offset), ctx); emit(A64_CBNZ(0, tmp3, jmp_offset), ctx);
}
break; break;
default: default:

View File

@ -44,6 +44,9 @@
#include <linux/nbd-netlink.h> #include <linux/nbd-netlink.h>
#include <net/genetlink.h> #include <net/genetlink.h>
#define CREATE_TRACE_POINTS
#include <trace/events/nbd.h>
static DEFINE_IDR(nbd_index_idr); static DEFINE_IDR(nbd_index_idr);
static DEFINE_MUTEX(nbd_index_mutex); static DEFINE_MUTEX(nbd_index_mutex);
static int nbd_total_devices = 0; static int nbd_total_devices = 0;
@ -510,6 +513,10 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
if (sent) { if (sent) {
if (sent >= sizeof(request)) { if (sent >= sizeof(request)) {
skip = sent - sizeof(request); skip = sent - sizeof(request);
/* initialize handle for tracing purposes */
handle = nbd_cmd_handle(cmd);
goto send_pages; goto send_pages;
} }
iov_iter_advance(&from, sent); iov_iter_advance(&from, sent);
@ -526,11 +533,14 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
handle = nbd_cmd_handle(cmd); handle = nbd_cmd_handle(cmd);
memcpy(request.handle, &handle, sizeof(handle)); memcpy(request.handle, &handle, sizeof(handle));
trace_nbd_send_request(&request, nbd->index, blk_mq_rq_from_pdu(cmd));
dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n", dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
req, nbdcmd_to_ascii(type), req, nbdcmd_to_ascii(type),
(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req)); (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
result = sock_xmit(nbd, index, 1, &from, result = sock_xmit(nbd, index, 1, &from,
(type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent); (type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
trace_nbd_header_sent(req, handle);
if (result <= 0) { if (result <= 0) {
if (was_interrupted(result)) { if (was_interrupted(result)) {
/* If we havne't sent anything we can just return BUSY, /* If we havne't sent anything we can just return BUSY,
@ -603,6 +613,7 @@ send_pages:
bio = next; bio = next;
} }
out: out:
trace_nbd_payload_sent(req, handle);
nsock->pending = NULL; nsock->pending = NULL;
nsock->sent = 0; nsock->sent = 0;
return 0; return 0;
@ -650,6 +661,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
tag, req); tag, req);
return ERR_PTR(-ENOENT); return ERR_PTR(-ENOENT);
} }
trace_nbd_header_received(req, handle);
cmd = blk_mq_rq_to_pdu(req); cmd = blk_mq_rq_to_pdu(req);
mutex_lock(&cmd->lock); mutex_lock(&cmd->lock);
@ -703,6 +715,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
} }
} }
out: out:
trace_nbd_payload_received(req, handle);
mutex_unlock(&cmd->lock); mutex_unlock(&cmd->lock);
return ret ? ERR_PTR(ret) : cmd; return ret ? ERR_PTR(ret) : cmd;
} }

View File

@ -354,7 +354,8 @@ int aq_ring_rx_clean(struct aq_ring_s *self,
hdr_len = buff->len; hdr_len = buff->len;
if (hdr_len > AQ_CFG_RX_HDR_SIZE) if (hdr_len > AQ_CFG_RX_HDR_SIZE)
hdr_len = eth_get_headlen(aq_buf_vaddr(&buff->rxdata), hdr_len = eth_get_headlen(skb->dev,
aq_buf_vaddr(&buff->rxdata),
AQ_CFG_RX_HDR_SIZE); AQ_CFG_RX_HDR_SIZE);
memcpy(__skb_put(skb, hdr_len), aq_buf_vaddr(&buff->rxdata), memcpy(__skb_put(skb, hdr_len), aq_buf_vaddr(&buff->rxdata),

View File

@ -899,7 +899,7 @@ static struct sk_buff *bnxt_rx_page_skb(struct bnxt *bp,
DMA_ATTR_WEAK_ORDERING); DMA_ATTR_WEAK_ORDERING);
if (unlikely(!payload)) if (unlikely(!payload))
payload = eth_get_headlen(data_ptr, len); payload = eth_get_headlen(bp->dev, data_ptr, len);
skb = napi_alloc_skb(&rxr->bnapi->napi, payload); skb = napi_alloc_skb(&rxr->bnapi->napi, payload);
if (!skb) { if (!skb) {

View File

@ -598,7 +598,7 @@ static int hns_nic_poll_rx_skb(struct hns_nic_ring_data *ring_data,
} else { } else {
ring->stats.seg_pkt_cnt++; ring->stats.seg_pkt_cnt++;
pull_len = eth_get_headlen(va, HNS_RX_HEAD_SIZE); pull_len = eth_get_headlen(ndev, va, HNS_RX_HEAD_SIZE);
memcpy(__skb_put(skb, pull_len), va, memcpy(__skb_put(skb, pull_len), va,
ALIGN(pull_len, sizeof(long))); ALIGN(pull_len, sizeof(long)));

View File

@ -2588,7 +2588,7 @@ static int hns3_alloc_skb(struct hns3_enet_ring *ring, int length,
ring->stats.seg_pkt_cnt++; ring->stats.seg_pkt_cnt++;
u64_stats_update_end(&ring->syncp); u64_stats_update_end(&ring->syncp);
ring->pull_len = eth_get_headlen(va, HNS3_RX_HEAD_SIZE); ring->pull_len = eth_get_headlen(netdev, va, HNS3_RX_HEAD_SIZE);
__skb_put(skb, ring->pull_len); __skb_put(skb, ring->pull_len);
hns3_nic_reuse_page(skb, ring->frag_num++, ring, ring->pull_len, hns3_nic_reuse_page(skb, ring->frag_num++, ring, ring->pull_len,
desc_cb); desc_cb);

View File

@ -280,7 +280,7 @@ static bool fm10k_add_rx_frag(struct fm10k_rx_buffer *rx_buffer,
/* we need the header to contain the greater of either ETH_HLEN or /* we need the header to contain the greater of either ETH_HLEN or
* 60 bytes if the skb->len is less than 60 for skb_pad. * 60 bytes if the skb->len is less than 60 for skb_pad.
*/ */
pull_len = eth_get_headlen(va, FM10K_RX_HDR_LEN); pull_len = eth_get_headlen(skb->dev, va, FM10K_RX_HDR_LEN);
/* align pull length to size of long to optimize memcpy performance */ /* align pull length to size of long to optimize memcpy performance */
memcpy(__skb_put(skb, pull_len), va, ALIGN(pull_len, sizeof(long))); memcpy(__skb_put(skb, pull_len), va, ALIGN(pull_len, sizeof(long)));

View File

@ -2035,7 +2035,8 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
/* Determine available headroom for copy */ /* Determine available headroom for copy */
headlen = size; headlen = size;
if (headlen > I40E_RX_HDR_SIZE) if (headlen > I40E_RX_HDR_SIZE)
headlen = eth_get_headlen(xdp->data, I40E_RX_HDR_SIZE); headlen = eth_get_headlen(skb->dev, xdp->data,
I40E_RX_HDR_SIZE);
/* align pull length to size of long to optimize memcpy performance */ /* align pull length to size of long to optimize memcpy performance */
memcpy(__skb_put(skb, headlen), xdp->data, memcpy(__skb_put(skb, headlen), xdp->data,

View File

@ -1315,7 +1315,7 @@ static struct sk_buff *iavf_construct_skb(struct iavf_ring *rx_ring,
/* Determine available headroom for copy */ /* Determine available headroom for copy */
headlen = size; headlen = size;
if (headlen > IAVF_RX_HDR_SIZE) if (headlen > IAVF_RX_HDR_SIZE)
headlen = eth_get_headlen(va, IAVF_RX_HDR_SIZE); headlen = eth_get_headlen(skb->dev, va, IAVF_RX_HDR_SIZE);
/* align pull length to size of long to optimize memcpy performance */ /* align pull length to size of long to optimize memcpy performance */
memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long))); memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));

View File

@ -699,7 +699,7 @@ ice_construct_skb(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf,
/* Determine available headroom for copy */ /* Determine available headroom for copy */
headlen = size; headlen = size;
if (headlen > ICE_RX_HDR_SIZE) if (headlen > ICE_RX_HDR_SIZE)
headlen = eth_get_headlen(va, ICE_RX_HDR_SIZE); headlen = eth_get_headlen(skb->dev, va, ICE_RX_HDR_SIZE);
/* align pull length to size of long to optimize memcpy performance */ /* align pull length to size of long to optimize memcpy performance */
memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long))); memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));

View File

@ -8051,7 +8051,7 @@ static struct sk_buff *igb_construct_skb(struct igb_ring *rx_ring,
/* Determine available headroom for copy */ /* Determine available headroom for copy */
headlen = size; headlen = size;
if (headlen > IGB_RX_HDR_LEN) if (headlen > IGB_RX_HDR_LEN)
headlen = eth_get_headlen(va, IGB_RX_HDR_LEN); headlen = eth_get_headlen(skb->dev, va, IGB_RX_HDR_LEN);
/* align pull length to size of long to optimize memcpy performance */ /* align pull length to size of long to optimize memcpy performance */
memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long))); memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));

View File

@ -1199,7 +1199,7 @@ static struct sk_buff *igc_construct_skb(struct igc_ring *rx_ring,
/* Determine available headroom for copy */ /* Determine available headroom for copy */
headlen = size; headlen = size;
if (headlen > IGC_RX_HDR_LEN) if (headlen > IGC_RX_HDR_LEN)
headlen = eth_get_headlen(va, IGC_RX_HDR_LEN); headlen = eth_get_headlen(skb->dev, va, IGC_RX_HDR_LEN);
/* align pull length to size of long to optimize memcpy performance */ /* align pull length to size of long to optimize memcpy performance */
memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long))); memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));

View File

@ -1800,7 +1800,7 @@ static void ixgbe_pull_tail(struct ixgbe_ring *rx_ring,
* we need the header to contain the greater of either ETH_HLEN or * we need the header to contain the greater of either ETH_HLEN or
* 60 bytes if the skb->len is less than 60 for skb_pad. * 60 bytes if the skb->len is less than 60 for skb_pad.
*/ */
pull_len = eth_get_headlen(va, IXGBE_RX_HDR_SIZE); pull_len = eth_get_headlen(skb->dev, va, IXGBE_RX_HDR_SIZE);
/* align pull length to size of long to optimize memcpy performance */ /* align pull length to size of long to optimize memcpy performance */
skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long))); skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long)));

View File

@ -895,7 +895,8 @@ struct sk_buff *ixgbevf_construct_skb(struct ixgbevf_ring *rx_ring,
/* Determine available headroom for copy */ /* Determine available headroom for copy */
headlen = size; headlen = size;
if (headlen > IXGBEVF_RX_HDR_SIZE) if (headlen > IXGBEVF_RX_HDR_SIZE)
headlen = eth_get_headlen(xdp->data, IXGBEVF_RX_HDR_SIZE); headlen = eth_get_headlen(skb->dev, xdp->data,
IXGBEVF_RX_HDR_SIZE);
/* align pull length to size of long to optimize memcpy performance */ /* align pull length to size of long to optimize memcpy performance */
memcpy(__skb_put(skb, headlen), xdp->data, memcpy(__skb_put(skb, headlen), xdp->data,

View File

@ -163,7 +163,7 @@ static inline u16 mlx5e_calc_min_inline(enum mlx5_inline_modes mode,
case MLX5_INLINE_MODE_NONE: case MLX5_INLINE_MODE_NONE:
return 0; return 0;
case MLX5_INLINE_MODE_TCP_UDP: case MLX5_INLINE_MODE_TCP_UDP:
hlen = eth_get_headlen(skb->data, skb_headlen(skb)); hlen = eth_get_headlen(skb->dev, skb->data, skb_headlen(skb));
if (hlen == ETH_HLEN && !skb_vlan_tag_present(skb)) if (hlen == ETH_HLEN && !skb_vlan_tag_present(skb))
hlen += VLAN_HLEN; hlen += VLAN_HLEN;
break; break;

View File

@ -1965,7 +1965,8 @@ drop:
if (frags) { if (frags) {
/* Exercise flow dissector code path. */ /* Exercise flow dissector code path. */
u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb)); u32 headlen = eth_get_headlen(tun->dev, skb->data,
skb_headlen(skb));
if (unlikely(headlen > skb_headlen(skb))) { if (unlikely(headlen > skb_headlen(skb))) {
this_cpu_inc(tun->pcpu_stats->rx_dropped); this_cpu_inc(tun->pcpu_stats->rx_dropped);

View File

@ -184,6 +184,7 @@ enum bpf_arg_type {
ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */
ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */
ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */ ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */
ARG_PTR_TO_MAP_VALUE_OR_NULL, /* pointer to stack used as map value or NULL */
/* the following constraints used to prototype bpf_memcmp() and other /* the following constraints used to prototype bpf_memcmp() and other
* functions that access data on eBPF program stack * functions that access data on eBPF program stack
@ -204,6 +205,7 @@ enum bpf_arg_type {
ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */
ARG_PTR_TO_INT, /* pointer to int */ ARG_PTR_TO_INT, /* pointer to int */
ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_LONG, /* pointer to long */
ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */
}; };
/* type of values returned from helper functions */ /* type of values returned from helper functions */
@ -272,6 +274,7 @@ enum bpf_reg_type {
PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */
PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */
PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */
PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */
}; };
/* The information passed from prog-specific *_is_valid_access /* The information passed from prog-specific *_is_valid_access
@ -361,6 +364,7 @@ struct bpf_prog_aux {
u32 used_map_cnt; u32 used_map_cnt;
u32 max_ctx_offset; u32 max_ctx_offset;
u32 max_pkt_offset; u32 max_pkt_offset;
u32 max_tp_access;
u32 stack_depth; u32 stack_depth;
u32 id; u32 id;
u32 func_cnt; /* used by non-func prog as the number of func progs */ u32 func_cnt; /* used by non-func prog as the number of func progs */

View File

@ -25,6 +25,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe)
BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint)
BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint)
BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable)
#endif #endif
#ifdef CONFIG_CGROUP_BPF #ifdef CONFIG_CGROUP_BPF
BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
@ -60,6 +61,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
#ifdef CONFIG_NET #ifdef CONFIG_NET
BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
#if defined(CONFIG_BPF_STREAM_PARSER) #if defined(CONFIG_BPF_STREAM_PARSER)
BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)

View File

@ -33,7 +33,7 @@ struct device;
int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr); int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr);
unsigned char *arch_get_platform_mac_address(void); unsigned char *arch_get_platform_mac_address(void);
int nvmem_get_mac_address(struct device *dev, void *addrbuf); int nvmem_get_mac_address(struct device *dev, void *addrbuf);
u32 eth_get_headlen(void *data, unsigned int max_len); u32 eth_get_headlen(const struct net_device *dev, void *data, unsigned int len);
__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
extern const struct header_ops eth_header_ops; extern const struct header_ops eth_header_ops;

View File

@ -1258,11 +1258,19 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
unsigned int key_count); unsigned int key_count);
#ifdef CONFIG_NET #ifdef CONFIG_NET
int skb_flow_dissector_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr);
int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
struct bpf_prog *prog); struct bpf_prog *prog);
int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr); int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr);
#else #else
static inline int skb_flow_dissector_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
return -EOPNOTSUPP;
}
static inline int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, static inline int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
struct bpf_prog *prog) struct bpf_prog *prog)
{ {
@ -1275,12 +1283,12 @@ static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
} }
#endif #endif
struct bpf_flow_keys; struct bpf_flow_dissector;
bool __skb_flow_bpf_dissect(struct bpf_prog *prog, bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
__be16 proto, int nhoff, int hlen);
bool __skb_flow_dissect(const struct net *net,
const struct sk_buff *skb, const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
struct bpf_flow_keys *flow_keys);
bool __skb_flow_dissect(const struct sk_buff *skb,
struct flow_dissector *flow_dissector, struct flow_dissector *flow_dissector,
void *target_container, void *target_container,
void *data, __be16 proto, int nhoff, int hlen, void *data, __be16 proto, int nhoff, int hlen,
@ -1290,8 +1298,8 @@ static inline bool skb_flow_dissect(const struct sk_buff *skb,
struct flow_dissector *flow_dissector, struct flow_dissector *flow_dissector,
void *target_container, unsigned int flags) void *target_container, unsigned int flags)
{ {
return __skb_flow_dissect(skb, flow_dissector, target_container, return __skb_flow_dissect(NULL, skb, flow_dissector,
NULL, 0, 0, 0, flags); target_container, NULL, 0, 0, 0, flags);
} }
static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb, static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
@ -1299,18 +1307,19 @@ static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
unsigned int flags) unsigned int flags)
{ {
memset(flow, 0, sizeof(*flow)); memset(flow, 0, sizeof(*flow));
return __skb_flow_dissect(skb, &flow_keys_dissector, flow, return __skb_flow_dissect(NULL, skb, &flow_keys_dissector,
NULL, 0, 0, 0, flags); flow, NULL, 0, 0, 0, flags);
} }
static inline bool static inline bool
skb_flow_dissect_flow_keys_basic(const struct sk_buff *skb, skb_flow_dissect_flow_keys_basic(const struct net *net,
const struct sk_buff *skb,
struct flow_keys_basic *flow, void *data, struct flow_keys_basic *flow, void *data,
__be16 proto, int nhoff, int hlen, __be16 proto, int nhoff, int hlen,
unsigned int flags) unsigned int flags)
{ {
memset(flow, 0, sizeof(*flow)); memset(flow, 0, sizeof(*flow));
return __skb_flow_dissect(skb, &flow_keys_basic_dissector, flow, return __skb_flow_dissect(net, skb, &flow_keys_basic_dissector, flow,
data, proto, nhoff, hlen, flags); data, proto, nhoff, hlen, flags);
} }
@ -2488,7 +2497,8 @@ static inline void skb_probe_transport_header(struct sk_buff *skb)
if (skb_transport_header_was_set(skb)) if (skb_transport_header_was_set(skb))
return; return;
if (skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0)) if (skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
NULL, 0, 0, 0, 0))
skb_set_transport_header(skb, keys.control.thoff); skb_set_transport_header(skb, keys.control.thoff);
} }

View File

@ -45,6 +45,7 @@ struct bpf_raw_event_map {
struct tracepoint *tp; struct tracepoint *tp;
void *bpf_func; void *bpf_func;
u32 num_args; u32 num_args;
u32 writable_size;
} __aligned(32); } __aligned(32);
#endif #endif

View File

@ -0,0 +1,13 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2019 Facebook */
#ifndef _BPF_SK_STORAGE_H
#define _BPF_SK_STORAGE_H
struct sock;
void bpf_sk_storage_free(struct sock *sk);
extern const struct bpf_func_proto bpf_sk_storage_get_proto;
extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
#endif /* _BPF_SK_STORAGE_H */

View File

@ -305,4 +305,11 @@ static inline void *skb_flow_dissector_target(struct flow_dissector *flow_dissec
return ((char *)target_container) + flow_dissector->offset[key_id]; return ((char *)target_container) + flow_dissector->offset[key_id];
} }
struct bpf_flow_dissector {
struct bpf_flow_keys *flow_keys;
const struct sk_buff *skb;
void *data;
void *data_end;
};
#endif #endif

View File

@ -364,14 +364,11 @@ struct tcf_proto {
}; };
struct qdisc_skb_cb { struct qdisc_skb_cb {
union {
struct { struct {
unsigned int pkt_len; unsigned int pkt_len;
u16 slave_dev_queue_mapping; u16 slave_dev_queue_mapping;
u16 tc_classid; u16 tc_classid;
}; };
struct bpf_flow_keys *flow_keys;
};
#define QDISC_CB_PRIV_LEN 20 #define QDISC_CB_PRIV_LEN 20
unsigned char data[QDISC_CB_PRIV_LEN]; unsigned char data[QDISC_CB_PRIV_LEN];
}; };

View File

@ -236,6 +236,8 @@ struct sock_common {
/* public: */ /* public: */
}; };
struct bpf_sk_storage;
/** /**
* struct sock - network layer representation of sockets * struct sock - network layer representation of sockets
* @__sk_common: shared layout with inet_timewait_sock * @__sk_common: shared layout with inet_timewait_sock
@ -510,6 +512,9 @@ struct sock {
#endif #endif
void (*sk_destruct)(struct sock *sk); void (*sk_destruct)(struct sock *sk);
struct sock_reuseport __rcu *sk_reuseport_cb; struct sock_reuseport __rcu *sk_reuseport_cb;
#ifdef CONFIG_BPF_SYSCALL
struct bpf_sk_storage __rcu *sk_bpf_storage;
#endif
struct rcu_head sk_rcu; struct rcu_head sk_rcu;
}; };

View File

@ -69,8 +69,7 @@ __bpf_trace_##call(void *__data, proto) \
* to make sure that if the tracepoint handling changes, the * to make sure that if the tracepoint handling changes, the
* bpf probe will fail to compile unless it too is updated. * bpf probe will fail to compile unless it too is updated.
*/ */
#undef DEFINE_EVENT #define __DEFINE_EVENT(template, call, proto, args, size) \
#define DEFINE_EVENT(template, call, proto, args) \
static inline void bpf_test_probe_##call(void) \ static inline void bpf_test_probe_##call(void) \
{ \ { \
check_trace_callback_type_##call(__bpf_trace_##template); \ check_trace_callback_type_##call(__bpf_trace_##template); \
@ -81,12 +80,36 @@ __bpf_trace_tp_map_##call = { \
.tp = &__tracepoint_##call, \ .tp = &__tracepoint_##call, \
.bpf_func = (void *)__bpf_trace_##template, \ .bpf_func = (void *)__bpf_trace_##template, \
.num_args = COUNT_ARGS(args), \ .num_args = COUNT_ARGS(args), \
.writable_size = size, \
}; };
#define FIRST(x, ...) x
#undef DEFINE_EVENT_WRITABLE
#define DEFINE_EVENT_WRITABLE(template, call, proto, args, size) \
static inline void bpf_test_buffer_##call(void) \
{ \
/* BUILD_BUG_ON() is ignored if the code is completely eliminated, but \
* BUILD_BUG_ON_ZERO() uses a different mechanism that is not \
* dead-code-eliminated. \
*/ \
FIRST(proto); \
(void)BUILD_BUG_ON_ZERO(size != sizeof(*FIRST(args))); \
} \
__DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), size)
#undef DEFINE_EVENT
#define DEFINE_EVENT(template, call, proto, args) \
__DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), 0)
#undef DEFINE_EVENT_PRINT #undef DEFINE_EVENT_PRINT
#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
#undef DEFINE_EVENT_WRITABLE
#undef __DEFINE_EVENT
#undef FIRST
#endif /* CONFIG_BPF_EVENTS */ #endif /* CONFIG_BPF_EVENTS */

View File

@ -0,0 +1,50 @@
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM bpf_test_run
#if !defined(_TRACE_BPF_TEST_RUN_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_BPF_TEST_RUN_H
#include <linux/tracepoint.h>
DECLARE_EVENT_CLASS(bpf_test_finish,
TP_PROTO(int *err),
TP_ARGS(err),
TP_STRUCT__entry(
__field(int, err)
),
TP_fast_assign(
__entry->err = *err;
),
TP_printk("bpf_test_finish with err=%d", __entry->err)
);
#ifdef DEFINE_EVENT_WRITABLE
#undef BPF_TEST_RUN_DEFINE_EVENT
#define BPF_TEST_RUN_DEFINE_EVENT(template, call, proto, args, size) \
DEFINE_EVENT_WRITABLE(template, call, PARAMS(proto), \
PARAMS(args), size)
#else
#undef BPF_TEST_RUN_DEFINE_EVENT
#define BPF_TEST_RUN_DEFINE_EVENT(template, call, proto, args, size) \
DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args))
#endif
BPF_TEST_RUN_DEFINE_EVENT(bpf_test_finish, bpf_test_finish,
TP_PROTO(int *err),
TP_ARGS(err),
sizeof(int)
);
#endif
/* This part must be outside protection */
#include <trace/define_trace.h>

107
include/trace/events/nbd.h Normal file
View File

@ -0,0 +1,107 @@
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM nbd
#if !defined(_TRACE_NBD_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_NBD_H
#include <linux/tracepoint.h>
DECLARE_EVENT_CLASS(nbd_transport_event,
TP_PROTO(struct request *req, u64 handle),
TP_ARGS(req, handle),
TP_STRUCT__entry(
__field(struct request *, req)
__field(u64, handle)
),
TP_fast_assign(
__entry->req = req;
__entry->handle = handle;
),
TP_printk(
"nbd transport event: request %p, handle 0x%016llx",
__entry->req,
__entry->handle
)
);
DEFINE_EVENT(nbd_transport_event, nbd_header_sent,
TP_PROTO(struct request *req, u64 handle),
TP_ARGS(req, handle)
);
DEFINE_EVENT(nbd_transport_event, nbd_payload_sent,
TP_PROTO(struct request *req, u64 handle),
TP_ARGS(req, handle)
);
DEFINE_EVENT(nbd_transport_event, nbd_header_received,
TP_PROTO(struct request *req, u64 handle),
TP_ARGS(req, handle)
);
DEFINE_EVENT(nbd_transport_event, nbd_payload_received,
TP_PROTO(struct request *req, u64 handle),
TP_ARGS(req, handle)
);
DECLARE_EVENT_CLASS(nbd_send_request,
TP_PROTO(struct nbd_request *nbd_request, int index,
struct request *rq),
TP_ARGS(nbd_request, index, rq),
TP_STRUCT__entry(
__field(struct nbd_request *, nbd_request)
__field(u64, dev_index)
__field(struct request *, request)
),
TP_fast_assign(
__entry->nbd_request = 0;
__entry->dev_index = index;
__entry->request = rq;
),
TP_printk("nbd%lld: request %p", __entry->dev_index, __entry->request)
);
#ifdef DEFINE_EVENT_WRITABLE
#undef NBD_DEFINE_EVENT
#define NBD_DEFINE_EVENT(template, call, proto, args, size) \
DEFINE_EVENT_WRITABLE(template, call, PARAMS(proto), \
PARAMS(args), size)
#else
#undef NBD_DEFINE_EVENT
#define NBD_DEFINE_EVENT(template, call, proto, args, size) \
DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args))
#endif
NBD_DEFINE_EVENT(nbd_send_request, nbd_send_request,
TP_PROTO(struct nbd_request *nbd_request, int index,
struct request *rq),
TP_ARGS(nbd_request, index, rq),
sizeof(struct nbd_request)
);
#endif
/* This part must be outside protection */
#include <trace/define_trace.h>

View File

@ -133,6 +133,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
BPF_MAP_TYPE_QUEUE, BPF_MAP_TYPE_QUEUE,
BPF_MAP_TYPE_STACK, BPF_MAP_TYPE_STACK,
BPF_MAP_TYPE_SK_STORAGE,
}; };
/* Note that tracing related programs such as /* Note that tracing related programs such as
@ -168,6 +169,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SK_REUSEPORT, BPF_PROG_TYPE_SK_REUSEPORT,
BPF_PROG_TYPE_FLOW_DISSECTOR, BPF_PROG_TYPE_FLOW_DISSECTOR,
BPF_PROG_TYPE_CGROUP_SYSCTL, BPF_PROG_TYPE_CGROUP_SYSCTL,
BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
}; };
enum bpf_attach_type { enum bpf_attach_type {
@ -2629,6 +2631,42 @@ union bpf_attr {
* was provided. * was provided.
* *
* **-ERANGE** if resulting value was out of range. * **-ERANGE** if resulting value was out of range.
*
* void *bpf_sk_storage_get(struct bpf_map *map, struct bpf_sock *sk, void *value, u64 flags)
* Description
* Get a bpf-local-storage from a sk.
*
* Logically, it could be thought of getting the value from
* a *map* with *sk* as the **key**. From this
* perspective, the usage is not much different from
* **bpf_map_lookup_elem(map, &sk)** except this
* helper enforces the key must be a **bpf_fullsock()**
* and the map must be a BPF_MAP_TYPE_SK_STORAGE also.
*
* Underneath, the value is stored locally at *sk* instead of
* the map. The *map* is used as the bpf-local-storage **type**.
* The bpf-local-storage **type** (i.e. the *map*) is searched
* against all bpf-local-storages residing at sk.
*
* An optional *flags* (BPF_SK_STORAGE_GET_F_CREATE) can be
* used such that a new bpf-local-storage will be
* created if one does not exist. *value* can be used
* together with BPF_SK_STORAGE_GET_F_CREATE to specify
* the initial value of a bpf-local-storage. If *value* is
* NULL, the new bpf-local-storage will be zero initialized.
* Return
* A bpf-local-storage pointer is returned on success.
*
* **NULL** if not found or there was an error in adding
* a new bpf-local-storage.
*
* int bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk)
* Description
* Delete a bpf-local-storage from a sk.
* Return
* 0 on success.
*
* **-ENOENT** if the bpf-local-storage cannot be found.
*/ */
#define __BPF_FUNC_MAPPER(FN) \ #define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \ FN(unspec), \
@ -2737,7 +2775,9 @@ union bpf_attr {
FN(sysctl_get_new_value), \ FN(sysctl_get_new_value), \
FN(sysctl_set_new_value), \ FN(sysctl_set_new_value), \
FN(strtol), \ FN(strtol), \
FN(strtoul), FN(strtoul), \
FN(sk_storage_get), \
FN(sk_storage_delete),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper /* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call * function eBPF program intends to call
@ -2813,6 +2853,9 @@ enum bpf_func_id {
/* BPF_FUNC_sysctl_get_name flags. */ /* BPF_FUNC_sysctl_get_name flags. */
#define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) #define BPF_F_SYSCTL_BASE_NAME (1ULL << 0)
/* BPF_FUNC_sk_storage_get flags */
#define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0)
/* Mode for BPF_FUNC_skb_adjust_room helper. */ /* Mode for BPF_FUNC_skb_adjust_room helper. */
enum bpf_adj_room_mode { enum bpf_adj_room_mode {
BPF_ADJ_ROOM_NET, BPF_ADJ_ROOM_NET,

View File

@ -526,7 +526,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
return -EACCES; return -EACCES;
if (map->map_type != BPF_MAP_TYPE_HASH && if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_ARRAY &&
map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
map->map_type != BPF_MAP_TYPE_SK_STORAGE)
return -ENOTSUPP; return -ENOTSUPP;
if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
map->value_size) { map->value_size) {
@ -1789,12 +1790,16 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
} }
raw_tp->btp = btp; raw_tp->btp = btp;
prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
BPF_PROG_TYPE_RAW_TRACEPOINT);
if (IS_ERR(prog)) { if (IS_ERR(prog)) {
err = PTR_ERR(prog); err = PTR_ERR(prog);
goto out_free_tp; goto out_free_tp;
} }
if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT &&
prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) {
err = -EINVAL;
goto out_put_prog;
}
err = bpf_probe_register(raw_tp->btp, prog); err = bpf_probe_register(raw_tp->btp, prog);
if (err) if (err)
@ -2009,6 +2014,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
break; break;
case BPF_LIRC_MODE2: case BPF_LIRC_MODE2:
return lirc_prog_query(attr, uattr); return lirc_prog_query(attr, uattr);
case BPF_FLOW_DISSECTOR:
return skb_flow_dissector_prog_query(attr, uattr);
default: default:
return -EINVAL; return -EINVAL;
} }

View File

@ -405,6 +405,7 @@ static const char * const reg_type_str[] = {
[PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null",
[PTR_TO_TCP_SOCK] = "tcp_sock", [PTR_TO_TCP_SOCK] = "tcp_sock",
[PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
[PTR_TO_TP_BUFFER] = "tp_buffer",
}; };
static char slot_type_char[] = { static char slot_type_char[] = {
@ -1993,6 +1994,32 @@ static int check_ctx_reg(struct bpf_verifier_env *env,
return 0; return 0;
} }
static int check_tp_buffer_access(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
int regno, int off, int size)
{
if (off < 0) {
verbose(env,
"R%d invalid tracepoint buffer access: off=%d, size=%d",
regno, off, size);
return -EACCES;
}
if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
verbose(env,
"R%d invalid variable buffer offset: off=%d, var_off=%s",
regno, off, tn_buf);
return -EACCES;
}
if (off + size > env->prog->aux->max_tp_access)
env->prog->aux->max_tp_access = off + size;
return 0;
}
/* truncate register to smaller size (in bytes) /* truncate register to smaller size (in bytes)
* must be called with size < BPF_REG_SIZE * must be called with size < BPF_REG_SIZE
*/ */
@ -2137,6 +2164,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
err = check_sock_access(env, insn_idx, regno, off, size, t); err = check_sock_access(env, insn_idx, regno, off, size, t);
if (!err && value_regno >= 0) if (!err && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno); mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_TP_BUFFER) {
err = check_tp_buffer_access(env, reg, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
} else { } else {
verbose(env, "R%d invalid mem access '%s'\n", regno, verbose(env, "R%d invalid mem access '%s'\n", regno,
reg_type_str[reg->type]); reg_type_str[reg->type]);
@ -2512,9 +2543,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
if (arg_type == ARG_PTR_TO_MAP_KEY || if (arg_type == ARG_PTR_TO_MAP_KEY ||
arg_type == ARG_PTR_TO_MAP_VALUE || arg_type == ARG_PTR_TO_MAP_VALUE ||
arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE ||
arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) {
expected_type = PTR_TO_STACK; expected_type = PTR_TO_STACK;
if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && if (register_is_null(reg) &&
arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL)
/* final test in check_stack_boundary() */;
else if (!type_is_pkt_pointer(type) &&
type != PTR_TO_MAP_VALUE &&
type != expected_type) type != expected_type)
goto err_type; goto err_type;
} else if (arg_type == ARG_CONST_SIZE || } else if (arg_type == ARG_CONST_SIZE ||
@ -2547,6 +2583,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
} }
meta->ref_obj_id = reg->ref_obj_id; meta->ref_obj_id = reg->ref_obj_id;
} }
} else if (arg_type == ARG_PTR_TO_SOCKET) {
expected_type = PTR_TO_SOCKET;
if (type != expected_type)
goto err_type;
} else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
if (meta->func_id == BPF_FUNC_spin_lock) { if (meta->func_id == BPF_FUNC_spin_lock) {
if (process_spin_lock(env, regno, true)) if (process_spin_lock(env, regno, true))
@ -2604,6 +2644,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
meta->map_ptr->key_size, false, meta->map_ptr->key_size, false,
NULL); NULL);
} else if (arg_type == ARG_PTR_TO_MAP_VALUE || } else if (arg_type == ARG_PTR_TO_MAP_VALUE ||
(arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL &&
!register_is_null(reg)) ||
arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
/* bpf_map_xxx(..., map_ptr, ..., value) call: /* bpf_map_xxx(..., map_ptr, ..., value) call:
* check [value, value + map->value_size) validity * check [value, value + map->value_size) validity
@ -2753,6 +2795,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_map_push_elem) func_id != BPF_FUNC_map_push_elem)
goto error; goto error;
break; break;
case BPF_MAP_TYPE_SK_STORAGE:
if (func_id != BPF_FUNC_sk_storage_get &&
func_id != BPF_FUNC_sk_storage_delete)
goto error;
break;
default: default:
break; break;
} }
@ -2816,6 +2863,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
map->map_type != BPF_MAP_TYPE_STACK) map->map_type != BPF_MAP_TYPE_STACK)
goto error; goto error;
break; break;
case BPF_FUNC_sk_storage_get:
case BPF_FUNC_sk_storage_delete:
if (map->map_type != BPF_MAP_TYPE_SK_STORAGE)
goto error;
break;
default: default:
break; break;
} }

View File

@ -915,6 +915,27 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = {
const struct bpf_prog_ops raw_tracepoint_prog_ops = { const struct bpf_prog_ops raw_tracepoint_prog_ops = {
}; };
static bool raw_tp_writable_prog_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
if (off == 0) {
if (size != sizeof(u64) || type != BPF_READ)
return false;
info->reg_type = PTR_TO_TP_BUFFER;
}
return raw_tp_prog_is_valid_access(off, size, type, prog, info);
}
const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = {
.get_func_proto = raw_tp_prog_func_proto,
.is_valid_access = raw_tp_writable_prog_is_valid_access,
};
const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = {
};
static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog, const struct bpf_prog *prog,
struct bpf_insn_access_aux *info) struct bpf_insn_access_aux *info)
@ -1204,6 +1225,9 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *
if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64))
return -EINVAL; return -EINVAL;
if (prog->aux->max_tp_access > btp->writable_size)
return -EINVAL;
return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog); return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog);
} }

View File

@ -10,9 +10,13 @@
#include <linux/etherdevice.h> #include <linux/etherdevice.h>
#include <linux/filter.h> #include <linux/filter.h>
#include <linux/sched/signal.h> #include <linux/sched/signal.h>
#include <net/bpf_sk_storage.h>
#include <net/sock.h> #include <net/sock.h>
#include <net/tcp.h> #include <net/tcp.h>
#define CREATE_TRACE_POINTS
#include <trace/events/bpf_test_run.h>
static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
u32 *retval, u32 *time) u32 *retval, u32 *time)
{ {
@ -100,6 +104,7 @@ static int bpf_test_finish(const union bpf_attr *kattr,
if (err != -ENOSPC) if (err != -ENOSPC)
err = 0; err = 0;
out: out:
trace_bpf_test_finish(&err);
return err; return err;
} }
@ -331,6 +336,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
sizeof(struct __sk_buff)); sizeof(struct __sk_buff));
out: out:
kfree_skb(skb); kfree_skb(skb);
bpf_sk_storage_free(sk);
kfree(sk); kfree(sk);
kfree(ctx); kfree(ctx);
return ret; return ret;
@ -379,13 +385,12 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
union bpf_attr __user *uattr) union bpf_attr __user *uattr)
{ {
u32 size = kattr->test.data_size_in; u32 size = kattr->test.data_size_in;
struct bpf_flow_dissector ctx = {};
u32 repeat = kattr->test.repeat; u32 repeat = kattr->test.repeat;
struct bpf_flow_keys flow_keys; struct bpf_flow_keys flow_keys;
u64 time_start, time_spent = 0; u64 time_start, time_spent = 0;
struct bpf_skb_data_end *cb; const struct ethhdr *eth;
u32 retval, duration; u32 retval, duration;
struct sk_buff *skb;
struct sock *sk;
void *data; void *data;
int ret; int ret;
u32 i; u32 i;
@ -396,46 +401,28 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
if (kattr->test.ctx_in || kattr->test.ctx_out) if (kattr->test.ctx_in || kattr->test.ctx_out)
return -EINVAL; return -EINVAL;
data = bpf_test_init(kattr, size, NET_SKB_PAD + NET_IP_ALIGN, if (size < ETH_HLEN)
SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); return -EINVAL;
data = bpf_test_init(kattr, size, 0, 0);
if (IS_ERR(data)) if (IS_ERR(data))
return PTR_ERR(data); return PTR_ERR(data);
sk = kzalloc(sizeof(*sk), GFP_USER); eth = (struct ethhdr *)data;
if (!sk) {
kfree(data);
return -ENOMEM;
}
sock_net_set(sk, current->nsproxy->net_ns);
sock_init_data(NULL, sk);
skb = build_skb(data, 0);
if (!skb) {
kfree(data);
kfree(sk);
return -ENOMEM;
}
skb->sk = sk;
skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
__skb_put(skb, size);
skb->protocol = eth_type_trans(skb,
current->nsproxy->net_ns->loopback_dev);
skb_reset_network_header(skb);
cb = (struct bpf_skb_data_end *)skb->cb;
cb->qdisc_cb.flow_keys = &flow_keys;
if (!repeat) if (!repeat)
repeat = 1; repeat = 1;
ctx.flow_keys = &flow_keys;
ctx.data = data;
ctx.data_end = (__u8 *)data + size;
rcu_read_lock(); rcu_read_lock();
preempt_disable(); preempt_disable();
time_start = ktime_get_ns(); time_start = ktime_get_ns();
for (i = 0; i < repeat; i++) { for (i = 0; i < repeat; i++) {
retval = __skb_flow_bpf_dissect(prog, skb, retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN,
&flow_keys_dissector, size);
&flow_keys);
if (signal_pending(current)) { if (signal_pending(current)) {
preempt_enable(); preempt_enable();
@ -468,7 +455,6 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
retval, duration); retval, duration);
out: out:
kfree_skb(skb); kfree(data);
kfree(sk);
return ret; return ret;
} }

View File

@ -34,3 +34,4 @@ obj-$(CONFIG_HWBM) += hwbm.o
obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_NET_DEVLINK) += devlink.o
obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o
obj-$(CONFIG_FAILOVER) += failover.o obj-$(CONFIG_FAILOVER) += failover.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o

804
net/core/bpf_sk_storage.c Normal file
View File

@ -0,0 +1,804 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook */
#include <linux/rculist.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/bpf.h>
#include <net/bpf_sk_storage.h>
#include <net/sock.h>
#include <uapi/linux/btf.h>
static atomic_t cache_idx;
struct bucket {
struct hlist_head list;
raw_spinlock_t lock;
};
/* Thp map is not the primary owner of a bpf_sk_storage_elem.
* Instead, the sk->sk_bpf_storage is.
*
* The map (bpf_sk_storage_map) is for two purposes
* 1. Define the size of the "sk local storage". It is
* the map's value_size.
*
* 2. Maintain a list to keep track of all elems such
* that they can be cleaned up during the map destruction.
*
* When a bpf local storage is being looked up for a
* particular sk, the "bpf_map" pointer is actually used
* as the "key" to search in the list of elem in
* sk->sk_bpf_storage.
*
* Hence, consider sk->sk_bpf_storage is the mini-map
* with the "bpf_map" pointer as the searching key.
*/
struct bpf_sk_storage_map {
struct bpf_map map;
/* Lookup elem does not require accessing the map.
*
* Updating/Deleting requires a bucket lock to
* link/unlink the elem from the map. Having
* multiple buckets to improve contention.
*/
struct bucket *buckets;
u32 bucket_log;
u16 elem_size;
u16 cache_idx;
};
struct bpf_sk_storage_data {
/* smap is used as the searching key when looking up
* from sk->sk_bpf_storage.
*
* Put it in the same cacheline as the data to minimize
* the number of cachelines access during the cache hit case.
*/
struct bpf_sk_storage_map __rcu *smap;
u8 data[0] __aligned(8);
};
/* Linked to bpf_sk_storage and bpf_sk_storage_map */
struct bpf_sk_storage_elem {
struct hlist_node map_node; /* Linked to bpf_sk_storage_map */
struct hlist_node snode; /* Linked to bpf_sk_storage */
struct bpf_sk_storage __rcu *sk_storage;
struct rcu_head rcu;
/* 8 bytes hole */
/* The data is stored in aother cacheline to minimize
* the number of cachelines access during a cache hit.
*/
struct bpf_sk_storage_data sdata ____cacheline_aligned;
};
#define SELEM(_SDATA) container_of((_SDATA), struct bpf_sk_storage_elem, sdata)
#define SDATA(_SELEM) (&(_SELEM)->sdata)
#define BPF_SK_STORAGE_CACHE_SIZE 16
struct bpf_sk_storage {
struct bpf_sk_storage_data __rcu *cache[BPF_SK_STORAGE_CACHE_SIZE];
struct hlist_head list; /* List of bpf_sk_storage_elem */
struct sock *sk; /* The sk that owns the the above "list" of
* bpf_sk_storage_elem.
*/
struct rcu_head rcu;
raw_spinlock_t lock; /* Protect adding/removing from the "list" */
};
static struct bucket *select_bucket(struct bpf_sk_storage_map *smap,
struct bpf_sk_storage_elem *selem)
{
return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
}
static int omem_charge(struct sock *sk, unsigned int size)
{
/* same check as in sock_kmalloc() */
if (size <= sysctl_optmem_max &&
atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
atomic_add(size, &sk->sk_omem_alloc);
return 0;
}
return -ENOMEM;
}
static bool selem_linked_to_sk(const struct bpf_sk_storage_elem *selem)
{
return !hlist_unhashed(&selem->snode);
}
static bool selem_linked_to_map(const struct bpf_sk_storage_elem *selem)
{
return !hlist_unhashed(&selem->map_node);
}
static struct bpf_sk_storage_elem *selem_alloc(struct bpf_sk_storage_map *smap,
struct sock *sk, void *value,
bool charge_omem)
{
struct bpf_sk_storage_elem *selem;
if (charge_omem && omem_charge(sk, smap->elem_size))
return NULL;
selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN);
if (selem) {
if (value)
memcpy(SDATA(selem)->data, value, smap->map.value_size);
return selem;
}
if (charge_omem)
atomic_sub(smap->elem_size, &sk->sk_omem_alloc);
return NULL;
}
/* sk_storage->lock must be held and selem->sk_storage == sk_storage.
* The caller must ensure selem->smap is still valid to be
* dereferenced for its smap->elem_size and smap->cache_idx.
*/
static bool __selem_unlink_sk(struct bpf_sk_storage *sk_storage,
struct bpf_sk_storage_elem *selem,
bool uncharge_omem)
{
struct bpf_sk_storage_map *smap;
bool free_sk_storage;
struct sock *sk;
smap = rcu_dereference(SDATA(selem)->smap);
sk = sk_storage->sk;
/* All uncharging on sk->sk_omem_alloc must be done first.
* sk may be freed once the last selem is unlinked from sk_storage.
*/
if (uncharge_omem)
atomic_sub(smap->elem_size, &sk->sk_omem_alloc);
free_sk_storage = hlist_is_singular_node(&selem->snode,
&sk_storage->list);
if (free_sk_storage) {
atomic_sub(sizeof(struct bpf_sk_storage), &sk->sk_omem_alloc);
sk_storage->sk = NULL;
/* After this RCU_INIT, sk may be freed and cannot be used */
RCU_INIT_POINTER(sk->sk_bpf_storage, NULL);
/* sk_storage is not freed now. sk_storage->lock is
* still held and raw_spin_unlock_bh(&sk_storage->lock)
* will be done by the caller.
*
* Although the unlock will be done under
* rcu_read_lock(), it is more intutivie to
* read if kfree_rcu(sk_storage, rcu) is done
* after the raw_spin_unlock_bh(&sk_storage->lock).
*
* Hence, a "bool free_sk_storage" is returned
* to the caller which then calls the kfree_rcu()
* after unlock.
*/
}
hlist_del_init_rcu(&selem->snode);
if (rcu_access_pointer(sk_storage->cache[smap->cache_idx]) ==
SDATA(selem))
RCU_INIT_POINTER(sk_storage->cache[smap->cache_idx], NULL);
kfree_rcu(selem, rcu);
return free_sk_storage;
}
static void selem_unlink_sk(struct bpf_sk_storage_elem *selem)
{
struct bpf_sk_storage *sk_storage;
bool free_sk_storage = false;
if (unlikely(!selem_linked_to_sk(selem)))
/* selem has already been unlinked from sk */
return;
sk_storage = rcu_dereference(selem->sk_storage);
raw_spin_lock_bh(&sk_storage->lock);
if (likely(selem_linked_to_sk(selem)))
free_sk_storage = __selem_unlink_sk(sk_storage, selem, true);
raw_spin_unlock_bh(&sk_storage->lock);
if (free_sk_storage)
kfree_rcu(sk_storage, rcu);
}
/* sk_storage->lock must be held and sk_storage->list cannot be empty */
static void __selem_link_sk(struct bpf_sk_storage *sk_storage,
struct bpf_sk_storage_elem *selem)
{
RCU_INIT_POINTER(selem->sk_storage, sk_storage);
hlist_add_head(&selem->snode, &sk_storage->list);
}
static void selem_unlink_map(struct bpf_sk_storage_elem *selem)
{
struct bpf_sk_storage_map *smap;
struct bucket *b;
if (unlikely(!selem_linked_to_map(selem)))
/* selem has already be unlinked from smap */
return;
smap = rcu_dereference(SDATA(selem)->smap);
b = select_bucket(smap, selem);
raw_spin_lock_bh(&b->lock);
if (likely(selem_linked_to_map(selem)))
hlist_del_init_rcu(&selem->map_node);
raw_spin_unlock_bh(&b->lock);
}
static void selem_link_map(struct bpf_sk_storage_map *smap,
struct bpf_sk_storage_elem *selem)
{
struct bucket *b = select_bucket(smap, selem);
raw_spin_lock_bh(&b->lock);
RCU_INIT_POINTER(SDATA(selem)->smap, smap);
hlist_add_head_rcu(&selem->map_node, &b->list);
raw_spin_unlock_bh(&b->lock);
}
static void selem_unlink(struct bpf_sk_storage_elem *selem)
{
/* Always unlink from map before unlinking from sk_storage
* because selem will be freed after successfully unlinked from
* the sk_storage.
*/
selem_unlink_map(selem);
selem_unlink_sk(selem);
}
static struct bpf_sk_storage_data *
__sk_storage_lookup(struct bpf_sk_storage *sk_storage,
struct bpf_sk_storage_map *smap,
bool cacheit_lockit)
{
struct bpf_sk_storage_data *sdata;
struct bpf_sk_storage_elem *selem;
/* Fast path (cache hit) */
sdata = rcu_dereference(sk_storage->cache[smap->cache_idx]);
if (sdata && rcu_access_pointer(sdata->smap) == smap)
return sdata;
/* Slow path (cache miss) */
hlist_for_each_entry_rcu(selem, &sk_storage->list, snode)
if (rcu_access_pointer(SDATA(selem)->smap) == smap)
break;
if (!selem)
return NULL;
sdata = SDATA(selem);
if (cacheit_lockit) {
/* spinlock is needed to avoid racing with the
* parallel delete. Otherwise, publishing an already
* deleted sdata to the cache will become a use-after-free
* problem in the next __sk_storage_lookup().
*/
raw_spin_lock_bh(&sk_storage->lock);
if (selem_linked_to_sk(selem))
rcu_assign_pointer(sk_storage->cache[smap->cache_idx],
sdata);
raw_spin_unlock_bh(&sk_storage->lock);
}
return sdata;
}
static struct bpf_sk_storage_data *
sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
{
struct bpf_sk_storage *sk_storage;
struct bpf_sk_storage_map *smap;
sk_storage = rcu_dereference(sk->sk_bpf_storage);
if (!sk_storage)
return NULL;
smap = (struct bpf_sk_storage_map *)map;
return __sk_storage_lookup(sk_storage, smap, cacheit_lockit);
}
static int check_flags(const struct bpf_sk_storage_data *old_sdata,
u64 map_flags)
{
if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
/* elem already exists */
return -EEXIST;
if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
/* elem doesn't exist, cannot update it */
return -ENOENT;
return 0;
}
static int sk_storage_alloc(struct sock *sk,
struct bpf_sk_storage_map *smap,
struct bpf_sk_storage_elem *first_selem)
{
struct bpf_sk_storage *prev_sk_storage, *sk_storage;
int err;
err = omem_charge(sk, sizeof(*sk_storage));
if (err)
return err;
sk_storage = kzalloc(sizeof(*sk_storage), GFP_ATOMIC | __GFP_NOWARN);
if (!sk_storage) {
err = -ENOMEM;
goto uncharge;
}
INIT_HLIST_HEAD(&sk_storage->list);
raw_spin_lock_init(&sk_storage->lock);
sk_storage->sk = sk;
__selem_link_sk(sk_storage, first_selem);
selem_link_map(smap, first_selem);
/* Publish sk_storage to sk. sk->sk_lock cannot be acquired.
* Hence, atomic ops is used to set sk->sk_bpf_storage
* from NULL to the newly allocated sk_storage ptr.
*
* From now on, the sk->sk_bpf_storage pointer is protected
* by the sk_storage->lock. Hence, when freeing
* the sk->sk_bpf_storage, the sk_storage->lock must
* be held before setting sk->sk_bpf_storage to NULL.
*/
prev_sk_storage = cmpxchg((struct bpf_sk_storage **)&sk->sk_bpf_storage,
NULL, sk_storage);
if (unlikely(prev_sk_storage)) {
selem_unlink_map(first_selem);
err = -EAGAIN;
goto uncharge;
/* Note that even first_selem was linked to smap's
* bucket->list, first_selem can be freed immediately
* (instead of kfree_rcu) because
* bpf_sk_storage_map_free() does a
* synchronize_rcu() before walking the bucket->list.
* Hence, no one is accessing selem from the
* bucket->list under rcu_read_lock().
*/
}
return 0;
uncharge:
kfree(sk_storage);
atomic_sub(sizeof(*sk_storage), &sk->sk_omem_alloc);
return err;
}
/* sk cannot be going away because it is linking new elem
* to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0).
* Otherwise, it will become a leak (and other memory issues
* during map destruction).
*/
static struct bpf_sk_storage_data *sk_storage_update(struct sock *sk,
struct bpf_map *map,
void *value,
u64 map_flags)
{
struct bpf_sk_storage_data *old_sdata = NULL;
struct bpf_sk_storage_elem *selem;
struct bpf_sk_storage *sk_storage;
struct bpf_sk_storage_map *smap;
int err;
/* BPF_EXIST and BPF_NOEXIST cannot be both set */
if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
/* BPF_F_LOCK can only be used in a value with spin_lock */
unlikely((map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
return ERR_PTR(-EINVAL);
smap = (struct bpf_sk_storage_map *)map;
sk_storage = rcu_dereference(sk->sk_bpf_storage);
if (!sk_storage || hlist_empty(&sk_storage->list)) {
/* Very first elem for this sk */
err = check_flags(NULL, map_flags);
if (err)
return ERR_PTR(err);
selem = selem_alloc(smap, sk, value, true);
if (!selem)
return ERR_PTR(-ENOMEM);
err = sk_storage_alloc(sk, smap, selem);
if (err) {
kfree(selem);
atomic_sub(smap->elem_size, &sk->sk_omem_alloc);
return ERR_PTR(err);
}
return SDATA(selem);
}
if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
/* Hoping to find an old_sdata to do inline update
* such that it can avoid taking the sk_storage->lock
* and changing the lists.
*/
old_sdata = __sk_storage_lookup(sk_storage, smap, false);
err = check_flags(old_sdata, map_flags);
if (err)
return ERR_PTR(err);
if (old_sdata && selem_linked_to_sk(SELEM(old_sdata))) {
copy_map_value_locked(map, old_sdata->data,
value, false);
return old_sdata;
}
}
raw_spin_lock_bh(&sk_storage->lock);
/* Recheck sk_storage->list under sk_storage->lock */
if (unlikely(hlist_empty(&sk_storage->list))) {
/* A parallel del is happening and sk_storage is going
* away. It has just been checked before, so very
* unlikely. Return instead of retry to keep things
* simple.
*/
err = -EAGAIN;
goto unlock_err;
}
old_sdata = __sk_storage_lookup(sk_storage, smap, false);
err = check_flags(old_sdata, map_flags);
if (err)
goto unlock_err;
if (old_sdata && (map_flags & BPF_F_LOCK)) {
copy_map_value_locked(map, old_sdata->data, value, false);
selem = SELEM(old_sdata);
goto unlock;
}
/* sk_storage->lock is held. Hence, we are sure
* we can unlink and uncharge the old_sdata successfully
* later. Hence, instead of charging the new selem now
* and then uncharge the old selem later (which may cause
* a potential but unnecessary charge failure), avoid taking
* a charge at all here (the "!old_sdata" check) and the
* old_sdata will not be uncharged later during __selem_unlink_sk().
*/
selem = selem_alloc(smap, sk, value, !old_sdata);
if (!selem) {
err = -ENOMEM;
goto unlock_err;
}
/* First, link the new selem to the map */
selem_link_map(smap, selem);
/* Second, link (and publish) the new selem to sk_storage */
__selem_link_sk(sk_storage, selem);
/* Third, remove old selem, SELEM(old_sdata) */
if (old_sdata) {
selem_unlink_map(SELEM(old_sdata));
__selem_unlink_sk(sk_storage, SELEM(old_sdata), false);
}
unlock:
raw_spin_unlock_bh(&sk_storage->lock);
return SDATA(selem);
unlock_err:
raw_spin_unlock_bh(&sk_storage->lock);
return ERR_PTR(err);
}
static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
{
struct bpf_sk_storage_data *sdata;
sdata = sk_storage_lookup(sk, map, false);
if (!sdata)
return -ENOENT;
selem_unlink(SELEM(sdata));
return 0;
}
/* Called by __sk_destruct() */
void bpf_sk_storage_free(struct sock *sk)
{
struct bpf_sk_storage_elem *selem;
struct bpf_sk_storage *sk_storage;
bool free_sk_storage = false;
struct hlist_node *n;
rcu_read_lock();
sk_storage = rcu_dereference(sk->sk_bpf_storage);
if (!sk_storage) {
rcu_read_unlock();
return;
}
/* Netiher the bpf_prog nor the bpf-map's syscall
* could be modifying the sk_storage->list now.
* Thus, no elem can be added-to or deleted-from the
* sk_storage->list by the bpf_prog or by the bpf-map's syscall.
*
* It is racing with bpf_sk_storage_map_free() alone
* when unlinking elem from the sk_storage->list and
* the map's bucket->list.
*/
raw_spin_lock_bh(&sk_storage->lock);
hlist_for_each_entry_safe(selem, n, &sk_storage->list, snode) {
/* Always unlink from map before unlinking from
* sk_storage.
*/
selem_unlink_map(selem);
free_sk_storage = __selem_unlink_sk(sk_storage, selem, true);
}
raw_spin_unlock_bh(&sk_storage->lock);
rcu_read_unlock();
if (free_sk_storage)
kfree_rcu(sk_storage, rcu);
}
static void bpf_sk_storage_map_free(struct bpf_map *map)
{
struct bpf_sk_storage_elem *selem;
struct bpf_sk_storage_map *smap;
struct bucket *b;
unsigned int i;
smap = (struct bpf_sk_storage_map *)map;
synchronize_rcu();
/* bpf prog and the userspace can no longer access this map
* now. No new selem (of this map) can be added
* to the sk->sk_bpf_storage or to the map bucket's list.
*
* The elem of this map can be cleaned up here
* or
* by bpf_sk_storage_free() during __sk_destruct().
*/
for (i = 0; i < (1U << smap->bucket_log); i++) {
b = &smap->buckets[i];
rcu_read_lock();
/* No one is adding to b->list now */
while ((selem = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(&b->list)),
struct bpf_sk_storage_elem,
map_node))) {
selem_unlink(selem);
cond_resched_rcu();
}
rcu_read_unlock();
}
/* bpf_sk_storage_free() may still need to access the map.
* e.g. bpf_sk_storage_free() has unlinked selem from the map
* which then made the above while((selem = ...)) loop
* exited immediately.
*
* However, the bpf_sk_storage_free() still needs to access
* the smap->elem_size to do the uncharging in
* __selem_unlink_sk().
*
* Hence, wait another rcu grace period for the
* bpf_sk_storage_free() to finish.
*/
synchronize_rcu();
kvfree(smap->buckets);
kfree(map);
}
static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
{
if (attr->map_flags != BPF_F_NO_PREALLOC || attr->max_entries ||
attr->key_size != sizeof(int) || !attr->value_size ||
/* Enforce BTF for userspace sk dumping */
!attr->btf_key_type_id || !attr->btf_value_type_id)
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (attr->value_size >= KMALLOC_MAX_SIZE -
MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem) ||
/* U16_MAX is much more than enough for sk local storage
* considering a tcp_sock is ~2k.
*/
attr->value_size > U16_MAX - sizeof(struct bpf_sk_storage_elem))
return -E2BIG;
return 0;
}
static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
{
struct bpf_sk_storage_map *smap;
unsigned int i;
u32 nbuckets;
u64 cost;
smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN);
if (!smap)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&smap->map, attr);
smap->bucket_log = ilog2(roundup_pow_of_two(num_possible_cpus()));
nbuckets = 1U << smap->bucket_log;
smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
GFP_USER | __GFP_NOWARN);
if (!smap->buckets) {
kfree(smap);
return ERR_PTR(-ENOMEM);
}
cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
for (i = 0; i < nbuckets; i++) {
INIT_HLIST_HEAD(&smap->buckets[i].list);
raw_spin_lock_init(&smap->buckets[i].lock);
}
smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size;
smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) %
BPF_SK_STORAGE_CACHE_SIZE;
smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
return &smap->map;
}
static int notsupp_get_next_key(struct bpf_map *map, void *key,
void *next_key)
{
return -ENOTSUPP;
}
static int bpf_sk_storage_map_check_btf(const struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
{
u32 int_data;
if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
return -EINVAL;
int_data = *(u32 *)(key_type + 1);
if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
return -EINVAL;
return 0;
}
static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_sk_storage_data *sdata;
struct socket *sock;
int fd, err;
fd = *(int *)key;
sock = sockfd_lookup(fd, &err);
if (sock) {
sdata = sk_storage_lookup(sock->sk, map, true);
sockfd_put(sock);
return sdata ? sdata->data : NULL;
}
return ERR_PTR(err);
}
static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags)
{
struct bpf_sk_storage_data *sdata;
struct socket *sock;
int fd, err;
fd = *(int *)key;
sock = sockfd_lookup(fd, &err);
if (sock) {
sdata = sk_storage_update(sock->sk, map, value, map_flags);
sockfd_put(sock);
return IS_ERR(sdata) ? PTR_ERR(sdata) : 0;
}
return err;
}
static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
{
struct socket *sock;
int fd, err;
fd = *(int *)key;
sock = sockfd_lookup(fd, &err);
if (sock) {
err = sk_storage_delete(sock->sk, map);
sockfd_put(sock);
return err;
}
return err;
}
BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
void *, value, u64, flags)
{
struct bpf_sk_storage_data *sdata;
if (flags > BPF_SK_STORAGE_GET_F_CREATE)
return (unsigned long)NULL;
sdata = sk_storage_lookup(sk, map, true);
if (sdata)
return (unsigned long)sdata->data;
if (flags == BPF_SK_STORAGE_GET_F_CREATE &&
/* Cannot add new elem to a going away sk.
* Otherwise, the new elem may become a leak
* (and also other memory issues during map
* destruction).
*/
refcount_inc_not_zero(&sk->sk_refcnt)) {
sdata = sk_storage_update(sk, map, value, BPF_NOEXIST);
/* sk must be a fullsock (guaranteed by verifier),
* so sock_gen_put() is unnecessary.
*/
sock_put(sk);
return IS_ERR(sdata) ?
(unsigned long)NULL : (unsigned long)sdata->data;
}
return (unsigned long)NULL;
}
BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk)
{
if (refcount_inc_not_zero(&sk->sk_refcnt)) {
int err;
err = sk_storage_delete(sk, map);
sock_put(sk);
return err;
}
return -ENOENT;
}
const struct bpf_map_ops sk_storage_map_ops = {
.map_alloc_check = bpf_sk_storage_map_alloc_check,
.map_alloc = bpf_sk_storage_map_alloc,
.map_free = bpf_sk_storage_map_free,
.map_get_next_key = notsupp_get_next_key,
.map_lookup_elem = bpf_fd_sk_storage_lookup_elem,
.map_update_elem = bpf_fd_sk_storage_update_elem,
.map_delete_elem = bpf_fd_sk_storage_delete_elem,
.map_check_btf = bpf_sk_storage_map_check_btf,
};
const struct bpf_func_proto bpf_sk_storage_get_proto = {
.func = bpf_sk_storage_get,
.gpl_only = false,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_SOCKET,
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
const struct bpf_func_proto bpf_sk_storage_delete_proto = {
.func = bpf_sk_storage_delete,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_SOCKET,
};

View File

@ -75,6 +75,7 @@
#include <net/seg6_local.h> #include <net/seg6_local.h>
#include <net/lwtunnel.h> #include <net/lwtunnel.h>
#include <net/ipv6_stubs.h> #include <net/ipv6_stubs.h>
#include <net/bpf_sk_storage.h>
/** /**
* sk_filter_trim_cap - run a packet through a socket filter * sk_filter_trim_cap - run a packet through a socket filter
@ -1730,6 +1731,40 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
.arg4_type = ARG_CONST_SIZE, .arg4_type = ARG_CONST_SIZE,
}; };
BPF_CALL_4(bpf_flow_dissector_load_bytes,
const struct bpf_flow_dissector *, ctx, u32, offset,
void *, to, u32, len)
{
void *ptr;
if (unlikely(offset > 0xffff))
goto err_clear;
if (unlikely(!ctx->skb))
goto err_clear;
ptr = skb_header_pointer(ctx->skb, offset, len, to);
if (unlikely(!ptr))
goto err_clear;
if (ptr != to)
memcpy(to, ptr, len);
return 0;
err_clear:
memset(to, 0, len);
return -EFAULT;
}
static const struct bpf_func_proto bpf_flow_dissector_load_bytes_proto = {
.func = bpf_flow_dissector_load_bytes,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_PTR_TO_UNINIT_MEM,
.arg4_type = ARG_CONST_SIZE,
};
BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
u32, offset, void *, to, u32, len, u32, start_header) u32, offset, void *, to, u32, len, u32, start_header)
{ {
@ -3047,6 +3082,14 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
skb_set_transport_header(skb, mac_len + nh_len); skb_set_transport_header(skb, mac_len + nh_len);
} }
/* Match skb->protocol to new outer l3 protocol */
if (skb->protocol == htons(ETH_P_IP) &&
flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
skb->protocol = htons(ETH_P_IPV6);
else if (skb->protocol == htons(ETH_P_IPV6) &&
flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
skb->protocol = htons(ETH_P_IP);
} }
if (skb_is_gso(skb)) { if (skb_is_gso(skb)) {
@ -5861,6 +5904,9 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
} }
} }
const struct bpf_func_proto bpf_sk_storage_get_proto __weak;
const struct bpf_func_proto bpf_sk_storage_delete_proto __weak;
static const struct bpf_func_proto * static const struct bpf_func_proto *
cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{ {
@ -5869,6 +5915,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_local_storage_proto; return &bpf_get_local_storage_proto;
case BPF_FUNC_sk_fullsock: case BPF_FUNC_sk_fullsock:
return &bpf_sk_fullsock_proto; return &bpf_sk_fullsock_proto;
case BPF_FUNC_sk_storage_get:
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
#ifdef CONFIG_INET #ifdef CONFIG_INET
case BPF_FUNC_tcp_sock: case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto; return &bpf_tcp_sock_proto;
@ -5950,6 +6000,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skb_fib_lookup_proto; return &bpf_skb_fib_lookup_proto;
case BPF_FUNC_sk_fullsock: case BPF_FUNC_sk_fullsock:
return &bpf_sk_fullsock_proto; return &bpf_sk_fullsock_proto;
case BPF_FUNC_sk_storage_get:
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
#ifdef CONFIG_XFRM #ifdef CONFIG_XFRM
case BPF_FUNC_skb_get_xfrm_state: case BPF_FUNC_skb_get_xfrm_state:
return &bpf_skb_get_xfrm_state_proto; return &bpf_skb_get_xfrm_state_proto;
@ -6121,7 +6175,7 @@ flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{ {
switch (func_id) { switch (func_id) {
case BPF_FUNC_skb_load_bytes: case BPF_FUNC_skb_load_bytes:
return &bpf_skb_load_bytes_proto; return &bpf_flow_dissector_load_bytes_proto;
default: default:
return bpf_base_func_proto(func_id); return bpf_base_func_proto(func_id);
} }
@ -6248,9 +6302,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
return false; return false;
break; break;
case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
if (size != sizeof(__u64))
return false; return false;
break;
case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, tstamp):
if (size != sizeof(__u64)) if (size != sizeof(__u64))
return false; return false;
@ -6285,7 +6337,6 @@ static bool sk_filter_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data):
case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_meta):
case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range(struct __sk_buff, data_end):
case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range_till(struct __sk_buff, family, local_port):
case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, tstamp):
case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, wire_len):
@ -6312,7 +6363,6 @@ static bool cg_skb_is_valid_access(int off, int size,
switch (off) { switch (off) {
case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_meta):
case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, wire_len):
return false; return false;
case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data):
@ -6358,7 +6408,6 @@ static bool lwt_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range_till(struct __sk_buff, family, local_port):
case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_meta):
case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, tstamp):
case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, wire_len):
return false; return false;
@ -6601,7 +6650,6 @@ static bool tc_cls_act_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range(struct __sk_buff, data_end):
info->reg_type = PTR_TO_PACKET_END; info->reg_type = PTR_TO_PACKET_END;
break; break;
case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range_till(struct __sk_buff, family, local_port):
return false; return false;
} }
@ -6803,7 +6851,6 @@ static bool sk_skb_is_valid_access(int off, int size,
switch (off) { switch (off) {
case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_meta):
case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, tstamp):
case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, wire_len):
return false; return false;
@ -6877,24 +6924,65 @@ static bool flow_dissector_is_valid_access(int off, int size,
const struct bpf_prog *prog, const struct bpf_prog *prog,
struct bpf_insn_access_aux *info) struct bpf_insn_access_aux *info)
{ {
const int size_default = sizeof(__u32);
if (off < 0 || off >= sizeof(struct __sk_buff))
return false;
if (type == BPF_WRITE) if (type == BPF_WRITE)
return false; return false;
switch (off) { switch (off) {
case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data):
if (size != size_default)
return false;
info->reg_type = PTR_TO_PACKET; info->reg_type = PTR_TO_PACKET;
break; return true;
case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range(struct __sk_buff, data_end):
if (size != size_default)
return false;
info->reg_type = PTR_TO_PACKET_END; info->reg_type = PTR_TO_PACKET_END;
break; return true;
case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
if (size != sizeof(__u64))
return false;
info->reg_type = PTR_TO_FLOW_KEYS; info->reg_type = PTR_TO_FLOW_KEYS;
break; return true;
default: default:
return false; return false;
} }
}
return bpf_skb_is_valid_access(off, size, type, prog, info); static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
struct bpf_prog *prog,
u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
switch (si->off) {
case offsetof(struct __sk_buff, data):
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data),
si->dst_reg, si->src_reg,
offsetof(struct bpf_flow_dissector, data));
break;
case offsetof(struct __sk_buff, data_end):
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end),
si->dst_reg, si->src_reg,
offsetof(struct bpf_flow_dissector, data_end));
break;
case offsetof(struct __sk_buff, flow_keys):
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, flow_keys),
si->dst_reg, si->src_reg,
offsetof(struct bpf_flow_dissector, flow_keys));
break;
}
return insn - insn_buf;
} }
static u32 bpf_convert_ctx_access(enum bpf_access_type type, static u32 bpf_convert_ctx_access(enum bpf_access_type type,
@ -7201,15 +7289,6 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
skc_num, 2, target_size)); skc_num, 2, target_size));
break; break;
case offsetof(struct __sk_buff, flow_keys):
off = si->off;
off -= offsetof(struct __sk_buff, flow_keys);
off += offsetof(struct sk_buff, cb);
off += offsetof(struct qdisc_skb_cb, flow_keys);
*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
si->src_reg, off);
break;
case offsetof(struct __sk_buff, tstamp): case offsetof(struct __sk_buff, tstamp):
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tstamp) != 8); BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tstamp) != 8);
@ -8214,7 +8293,7 @@ const struct bpf_prog_ops sk_msg_prog_ops = {
const struct bpf_verifier_ops flow_dissector_verifier_ops = { const struct bpf_verifier_ops flow_dissector_verifier_ops = {
.get_func_proto = flow_dissector_func_proto, .get_func_proto = flow_dissector_func_proto,
.is_valid_access = flow_dissector_is_valid_access, .is_valid_access = flow_dissector_is_valid_access,
.convert_ctx_access = bpf_convert_ctx_access, .convert_ctx_access = flow_dissector_convert_ctx_access,
}; };
const struct bpf_prog_ops flow_dissector_prog_ops = { const struct bpf_prog_ops flow_dissector_prog_ops = {

View File

@ -65,6 +65,45 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
} }
EXPORT_SYMBOL(skb_flow_dissector_init); EXPORT_SYMBOL(skb_flow_dissector_init);
int skb_flow_dissector_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
u32 prog_id, prog_cnt = 0, flags = 0;
struct bpf_prog *attached;
struct net *net;
if (attr->query.query_flags)
return -EINVAL;
net = get_net_ns_by_fd(attr->query.target_fd);
if (IS_ERR(net))
return PTR_ERR(net);
rcu_read_lock();
attached = rcu_dereference(net->flow_dissector_prog);
if (attached) {
prog_cnt = 1;
prog_id = attached->aux->id;
}
rcu_read_unlock();
put_net(net);
if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
return -EFAULT;
if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt)))
return -EFAULT;
if (!attr->query.prog_cnt || !prog_ids || !prog_cnt)
return 0;
if (copy_to_user(prog_ids, &prog_id, sizeof(u32)))
return -EFAULT;
return 0;
}
int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
struct bpf_prog *prog) struct bpf_prog *prog)
{ {
@ -683,50 +722,30 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
} }
} }
bool __skb_flow_bpf_dissect(struct bpf_prog *prog, bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
const struct sk_buff *skb, __be16 proto, int nhoff, int hlen)
struct flow_dissector *flow_dissector,
struct bpf_flow_keys *flow_keys)
{ {
struct bpf_skb_data_end cb_saved; struct bpf_flow_keys *flow_keys = ctx->flow_keys;
struct bpf_skb_data_end *cb;
u32 result; u32 result;
/* Note that even though the const qualifier is discarded
* throughout the execution of the BPF program, all changes(the
* control block) are reverted after the BPF program returns.
* Therefore, __skb_flow_dissect does not alter the skb.
*/
cb = (struct bpf_skb_data_end *)skb->cb;
/* Save Control Block */
memcpy(&cb_saved, cb, sizeof(cb_saved));
memset(cb, 0, sizeof(*cb));
/* Pass parameters to the BPF program */ /* Pass parameters to the BPF program */
memset(flow_keys, 0, sizeof(*flow_keys)); memset(flow_keys, 0, sizeof(*flow_keys));
cb->qdisc_cb.flow_keys = flow_keys; flow_keys->n_proto = proto;
flow_keys->n_proto = skb->protocol; flow_keys->nhoff = nhoff;
flow_keys->nhoff = skb_network_offset(skb);
flow_keys->thoff = flow_keys->nhoff; flow_keys->thoff = flow_keys->nhoff;
bpf_compute_data_pointers((struct sk_buff *)skb); result = BPF_PROG_RUN(prog, ctx);
result = BPF_PROG_RUN(prog, skb);
/* Restore state */ flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen);
memcpy(cb, &cb_saved, sizeof(cb_saved));
flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff,
skb_network_offset(skb), skb->len);
flow_keys->thoff = clamp_t(u16, flow_keys->thoff, flow_keys->thoff = clamp_t(u16, flow_keys->thoff,
flow_keys->nhoff, skb->len); flow_keys->nhoff, hlen);
return result == BPF_OK; return result == BPF_OK;
} }
/** /**
* __skb_flow_dissect - extract the flow_keys struct and return it * __skb_flow_dissect - extract the flow_keys struct and return it
* @net: associated network namespace, derived from @skb if NULL
* @skb: sk_buff to extract the flow from, can be NULL if the rest are specified * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
* @flow_dissector: list of keys to dissect * @flow_dissector: list of keys to dissect
* @target_container: target structure to put dissected values into * @target_container: target structure to put dissected values into
@ -743,7 +762,8 @@ bool __skb_flow_bpf_dissect(struct bpf_prog *prog,
* *
* Caller must take care of zeroing target container memory. * Caller must take care of zeroing target container memory.
*/ */
bool __skb_flow_dissect(const struct sk_buff *skb, bool __skb_flow_dissect(const struct net *net,
const struct sk_buff *skb,
struct flow_dissector *flow_dissector, struct flow_dissector *flow_dissector,
void *target_container, void *target_container,
void *data, __be16 proto, int nhoff, int hlen, void *data, __be16 proto, int nhoff, int hlen,
@ -756,6 +776,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
struct flow_dissector_key_icmp *key_icmp; struct flow_dissector_key_icmp *key_icmp;
struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_tags *key_tags;
struct flow_dissector_key_vlan *key_vlan; struct flow_dissector_key_vlan *key_vlan;
struct bpf_prog *attached = NULL;
enum flow_dissect_ret fdret; enum flow_dissect_ret fdret;
enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
int num_hdrs = 0; int num_hdrs = 0;
@ -798,22 +819,39 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
target_container); target_container);
if (skb) { if (skb) {
struct bpf_flow_keys flow_keys; if (!net) {
struct bpf_prog *attached = NULL;
rcu_read_lock();
if (skb->dev) if (skb->dev)
attached = rcu_dereference(dev_net(skb->dev)->flow_dissector_prog); net = dev_net(skb->dev);
else if (skb->sk) else if (skb->sk)
attached = rcu_dereference(sock_net(skb->sk)->flow_dissector_prog); net = sock_net(skb->sk);
else }
WARN_ON_ONCE(1); }
WARN_ON_ONCE(!net);
if (net) {
rcu_read_lock();
attached = rcu_dereference(net->flow_dissector_prog);
if (attached) { if (attached) {
ret = __skb_flow_bpf_dissect(attached, skb, struct bpf_flow_keys flow_keys;
flow_dissector, struct bpf_flow_dissector ctx = {
&flow_keys); .flow_keys = &flow_keys,
.data = data,
.data_end = data + hlen,
};
__be16 n_proto = proto;
if (skb) {
ctx.skb = skb;
/* we can't use 'proto' in the skb case
* because it might be set to skb->vlan_proto
* which has been pulled from the data
*/
n_proto = skb->protocol;
}
ret = bpf_flow_dissect(attached, &ctx, n_proto, nhoff,
hlen);
__skb_flow_bpf_to_target(&flow_keys, flow_dissector, __skb_flow_bpf_to_target(&flow_keys, flow_dissector,
target_container); target_container);
rcu_read_unlock(); rcu_read_unlock();
@ -1410,8 +1448,8 @@ u32 __skb_get_hash_symmetric(const struct sk_buff *skb)
__flow_hash_secret_init(); __flow_hash_secret_init();
memset(&keys, 0, sizeof(keys)); memset(&keys, 0, sizeof(keys));
__skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, __skb_flow_dissect(NULL, skb, &flow_keys_dissector_symmetric,
NULL, 0, 0, 0, &keys, NULL, 0, 0, 0,
FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
return __flow_hash_from_keys(&keys, hashrnd); return __flow_hash_from_keys(&keys, hashrnd);
@ -1512,7 +1550,8 @@ u32 skb_get_poff(const struct sk_buff *skb)
{ {
struct flow_keys_basic keys; struct flow_keys_basic keys;
if (!skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0)) if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
NULL, 0, 0, 0, 0))
return 0; return 0;
return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));

View File

@ -137,6 +137,7 @@
#include <linux/filter.h> #include <linux/filter.h>
#include <net/sock_reuseport.h> #include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>
#include <trace/events/sock.h> #include <trace/events/sock.h>
@ -1709,6 +1710,10 @@ static void __sk_destruct(struct rcu_head *head)
sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
#ifdef CONFIG_BPF_SYSCALL
bpf_sk_storage_free(sk);
#endif
if (atomic_read(&sk->sk_omem_alloc)) if (atomic_read(&sk->sk_omem_alloc))
pr_debug("%s: optmem leakage (%d bytes) detected\n", pr_debug("%s: optmem leakage (%d bytes) detected\n",
__func__, atomic_read(&sk->sk_omem_alloc)); __func__, atomic_read(&sk->sk_omem_alloc));

View File

@ -119,13 +119,14 @@ EXPORT_SYMBOL(eth_header);
/** /**
* eth_get_headlen - determine the length of header for an ethernet frame * eth_get_headlen - determine the length of header for an ethernet frame
* @dev: pointer to network device
* @data: pointer to start of frame * @data: pointer to start of frame
* @len: total length of frame * @len: total length of frame
* *
* Make a best effort attempt to pull the length for all of the headers for * Make a best effort attempt to pull the length for all of the headers for
* a given frame in a linear buffer. * a given frame in a linear buffer.
*/ */
u32 eth_get_headlen(void *data, unsigned int len) u32 eth_get_headlen(const struct net_device *dev, void *data, unsigned int len)
{ {
const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG; const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG;
const struct ethhdr *eth = (const struct ethhdr *)data; const struct ethhdr *eth = (const struct ethhdr *)data;
@ -136,8 +137,9 @@ u32 eth_get_headlen(void *data, unsigned int len)
return len; return len;
/* parse any remaining L2/L3 headers, check for L4 */ /* parse any remaining L2/L3 headers, check for L4 */
if (!skb_flow_dissect_flow_keys_basic(NULL, &keys, data, eth->h_proto, if (!skb_flow_dissect_flow_keys_basic(dev_net(dev), NULL, &keys, data,
sizeof(*eth), len, flags)) eth->h_proto, sizeof(*eth),
len, flags))
return max_t(u32, keys.control.thoff, sizeof(*eth)); return max_t(u32, keys.control.thoff, sizeof(*eth));
/* parse for any L4 headers */ /* parse for any L4 headers */

View File

@ -1,5 +1,6 @@
cpustat cpustat
fds_example fds_example
hbm
lathist lathist
lwt_len_hist lwt_len_hist
map_perf_test map_perf_test

View File

@ -0,0 +1,222 @@
================
bpftool-btf
================
-------------------------------------------------------------------------------
tool for inspection of BTF data
-------------------------------------------------------------------------------
:Manual section: 8
SYNOPSIS
========
**bpftool** [*OPTIONS*] **btf** *COMMAND*
*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] }
*COMMANDS* := { **dump** | **help** }
BTF COMMANDS
=============
| **bpftool** **btf dump** *BTF_SRC*
| **bpftool** **btf help**
|
| *BTF_SRC* := { **id** *BTF_ID* | **prog** *PROG* | **map** *MAP* [{**key** | **value** | **kv** | **all**}] | **file** *FILE* }
| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
DESCRIPTION
===========
**bpftool btf dump** *BTF_SRC*
Dump BTF entries from a given *BTF_SRC*.
When **id** is specified, BTF object with that ID will be
loaded and all its BTF types emitted.
When **map** is provided, it's expected that map has
associated BTF object with BTF types describing key and
value. It's possible to select whether to dump only BTF
type(s) associated with key (**key**), value (**value**),
both key and value (**kv**), or all BTF types present in
associated BTF object (**all**). If not specified, **kv**
is assumed.
When **prog** is provided, it's expected that program has
associated BTF object with BTF types.
When specifying *FILE*, an ELF file is expected, containing
.BTF section with well-defined BTF binary format data,
typically produced by clang or pahole.
**bpftool btf help**
Print short help message.
OPTIONS
=======
-h, --help
Print short generic help message (similar to **bpftool help**).
-V, --version
Print version number (similar to **bpftool version**).
-j, --json
Generate JSON output. For commands that cannot produce JSON, this
option has no effect.
-p, --pretty
Generate human-readable JSON output. Implies **-j**.
EXAMPLES
========
**# bpftool btf dump id 1226**
::
[1] PTR '(anon)' type_id=2
[2] STRUCT 'dummy_tracepoint_args' size=16 vlen=2
'pad' type_id=3 bits_offset=0
'sock' type_id=4 bits_offset=64
[3] INT 'long long unsigned int' size=8 bits_offset=0 nr_bits=64 encoding=(none)
[4] PTR '(anon)' type_id=5
[5] FWD 'sock' fwd_kind=union
This gives an example of default output for all supported BTF kinds.
**$ cat prog.c**
::
struct fwd_struct;
enum my_enum {
VAL1 = 3,
VAL2 = 7,
};
typedef struct my_struct my_struct_t;
struct my_struct {
const unsigned int const_int_field;
int bitfield_field: 4;
char arr_field[16];
const struct fwd_struct *restrict fwd_field;
enum my_enum enum_field;
volatile my_struct_t *typedef_ptr_field;
};
union my_union {
int a;
struct my_struct b;
};
struct my_struct struct_global_var __attribute__((section("data_sec"))) = {
.bitfield_field = 3,
.enum_field = VAL1,
};
int global_var __attribute__((section("data_sec"))) = 7;
__attribute__((noinline))
int my_func(union my_union *arg1, int arg2)
{
static int static_var __attribute__((section("data_sec"))) = 123;
static_var++;
return static_var;
}
**$ bpftool btf dump file prog.o**
::
[1] PTR '(anon)' type_id=2
[2] UNION 'my_union' size=48 vlen=2
'a' type_id=3 bits_offset=0
'b' type_id=4 bits_offset=0
[3] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED
[4] STRUCT 'my_struct' size=48 vlen=6
'const_int_field' type_id=5 bits_offset=0
'bitfield_field' type_id=3 bits_offset=32 bitfield_size=4
'arr_field' type_id=8 bits_offset=40
'fwd_field' type_id=10 bits_offset=192
'enum_field' type_id=14 bits_offset=256
'typedef_ptr_field' type_id=15 bits_offset=320
[5] CONST '(anon)' type_id=6
[6] INT 'unsigned int' size=4 bits_offset=0 nr_bits=32 encoding=(none)
[7] INT 'char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED
[8] ARRAY '(anon)' type_id=7 index_type_id=9 nr_elems=16
[9] INT '__ARRAY_SIZE_TYPE__' size=4 bits_offset=0 nr_bits=32 encoding=(none)
[10] RESTRICT '(anon)' type_id=11
[11] PTR '(anon)' type_id=12
[12] CONST '(anon)' type_id=13
[13] FWD 'fwd_struct' fwd_kind=union
[14] ENUM 'my_enum' size=4 vlen=2
'VAL1' val=3
'VAL2' val=7
[15] PTR '(anon)' type_id=16
[16] VOLATILE '(anon)' type_id=17
[17] TYPEDEF 'my_struct_t' type_id=4
[18] FUNC_PROTO '(anon)' ret_type_id=3 vlen=2
'arg1' type_id=1
'arg2' type_id=3
[19] FUNC 'my_func' type_id=18
[20] VAR 'struct_global_var' type_id=4, linkage=global-alloc
[21] VAR 'global_var' type_id=3, linkage=global-alloc
[22] VAR 'my_func.static_var' type_id=3, linkage=static
[23] DATASEC 'data_sec' size=0 vlen=3
type_id=20 offset=0 size=48
type_id=21 offset=0 size=4
type_id=22 offset=52 size=4
The following commands print BTF types associated with specified map's key,
value, both key and value, and all BTF types, respectively. By default, both
key and value types will be printed.
**# bpftool btf dump map id 123 key**
::
[39] TYPEDEF 'u32' type_id=37
**# bpftool btf dump map id 123 value**
::
[86] PTR '(anon)' type_id=87
**# bpftool btf dump map id 123 kv**
::
[39] TYPEDEF 'u32' type_id=37
[86] PTR '(anon)' type_id=87
**# bpftool btf dump map id 123 all**
::
[1] PTR '(anon)' type_id=0
.
.
.
[2866] ARRAY '(anon)' type_id=52 index_type_id=51 nr_elems=4
All the standard ways to specify map or program are supported:
**# bpftool btf dump map id 123**
**# bpftool btf dump map pinned /sys/fs/bpf/map_name**
**# bpftool btf dump prog id 456**
**# bpftool btf dump prog tag b88e0a09b1d9759d**
**# bpftool btf dump prog pinned /sys/fs/bpf/prog_name**
SEE ALSO
========
**bpf**\ (2),
**bpf-helpers**\ (7),
**bpftool**\ (8),
**bpftool-map**\ (8),
**bpftool-prog**\ (8),
**bpftool-cgroup**\ (8),
**bpftool-feature**\ (8),
**bpftool-net**\ (8),
**bpftool-perf**\ (8)

View File

@ -145,4 +145,5 @@ SEE ALSO
**bpftool-map**\ (8), **bpftool-map**\ (8),
**bpftool-feature**\ (8), **bpftool-feature**\ (8),
**bpftool-net**\ (8), **bpftool-net**\ (8),
**bpftool-perf**\ (8) **bpftool-perf**\ (8),
**bpftool-btf**\ (8)

View File

@ -82,4 +82,5 @@ SEE ALSO
**bpftool-map**\ (8), **bpftool-map**\ (8),
**bpftool-cgroup**\ (8), **bpftool-cgroup**\ (8),
**bpftool-net**\ (8), **bpftool-net**\ (8),
**bpftool-perf**\ (8) **bpftool-perf**\ (8),
**bpftool-btf**\ (8)

View File

@ -258,4 +258,5 @@ SEE ALSO
**bpftool-cgroup**\ (8), **bpftool-cgroup**\ (8),
**bpftool-feature**\ (8), **bpftool-feature**\ (8),
**bpftool-net**\ (8), **bpftool-net**\ (8),
**bpftool-perf**\ (8) **bpftool-perf**\ (8),
**bpftool-btf**\ (8)

View File

@ -143,4 +143,5 @@ SEE ALSO
**bpftool-map**\ (8), **bpftool-map**\ (8),
**bpftool-cgroup**\ (8), **bpftool-cgroup**\ (8),
**bpftool-feature**\ (8), **bpftool-feature**\ (8),
**bpftool-perf**\ (8) **bpftool-perf**\ (8),
**bpftool-btf**\ (8)

View File

@ -85,4 +85,5 @@ SEE ALSO
**bpftool-map**\ (8), **bpftool-map**\ (8),
**bpftool-cgroup**\ (8), **bpftool-cgroup**\ (8),
**bpftool-feature**\ (8), **bpftool-feature**\ (8),
**bpftool-net**\ (8) **bpftool-net**\ (8),
**bpftool-btf**\ (8)

View File

@ -271,4 +271,5 @@ SEE ALSO
**bpftool-cgroup**\ (8), **bpftool-cgroup**\ (8),
**bpftool-feature**\ (8), **bpftool-feature**\ (8),
**bpftool-net**\ (8), **bpftool-net**\ (8),
**bpftool-perf**\ (8) **bpftool-perf**\ (8),
**bpftool-btf**\ (8)

View File

@ -76,4 +76,5 @@ SEE ALSO
**bpftool-cgroup**\ (8), **bpftool-cgroup**\ (8),
**bpftool-feature**\ (8), **bpftool-feature**\ (8),
**bpftool-net**\ (8), **bpftool-net**\ (8),
**bpftool-perf**\ (8) **bpftool-perf**\ (8),
**bpftool-btf**\ (8)

View File

@ -217,6 +217,7 @@ _bpftool()
done done
cur=${words[cword]} cur=${words[cword]}
prev=${words[cword - 1]} prev=${words[cword - 1]}
pprev=${words[cword - 2]}
local object=${words[1]} command=${words[2]} local object=${words[1]} command=${words[2]}
@ -607,6 +608,51 @@ _bpftool()
;; ;;
esac esac
;; ;;
btf)
local PROG_TYPE='id pinned tag'
local MAP_TYPE='id pinned'
case $command in
dump)
case $prev in
$command)
COMPREPLY+=( $( compgen -W "id map prog file" -- \
"$cur" ) )
return 0
;;
prog)
COMPREPLY=( $( compgen -W "$PROG_TYPE" -- "$cur" ) )
return 0
;;
map)
COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) )
return 0
;;
id)
case $pprev in
prog)
_bpftool_get_prog_ids
;;
map)
_bpftool_get_map_ids
;;
esac
return 0
;;
*)
if [[ $cword == 6 ]] && [[ ${words[3]} == "map" ]]; then
COMPREPLY+=( $( compgen -W 'key value kv all' -- \
"$cur" ) )
fi
return 0
;;
esac
;;
*)
[[ $prev == $object ]] && \
COMPREPLY=( $( compgen -W 'dump help' -- "$cur" ) )
;;
esac
;;
cgroup) cgroup)
case $command in case $command in
show|list) show|list)

586
tools/bpf/bpftool/btf.c Normal file
View File

@ -0,0 +1,586 @@
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
/* Copyright (C) 2019 Facebook */
#include <errno.h>
#include <fcntl.h>
#include <linux/err.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <gelf.h>
#include <bpf.h>
#include <linux/btf.h>
#include "btf.h"
#include "json_writer.h"
#include "main.h"
static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_UNKN] = "UNKNOWN",
[BTF_KIND_INT] = "INT",
[BTF_KIND_PTR] = "PTR",
[BTF_KIND_ARRAY] = "ARRAY",
[BTF_KIND_STRUCT] = "STRUCT",
[BTF_KIND_UNION] = "UNION",
[BTF_KIND_ENUM] = "ENUM",
[BTF_KIND_FWD] = "FWD",
[BTF_KIND_TYPEDEF] = "TYPEDEF",
[BTF_KIND_VOLATILE] = "VOLATILE",
[BTF_KIND_CONST] = "CONST",
[BTF_KIND_RESTRICT] = "RESTRICT",
[BTF_KIND_FUNC] = "FUNC",
[BTF_KIND_FUNC_PROTO] = "FUNC_PROTO",
[BTF_KIND_VAR] = "VAR",
[BTF_KIND_DATASEC] = "DATASEC",
};
static const char *btf_int_enc_str(__u8 encoding)
{
switch (encoding) {
case 0:
return "(none)";
case BTF_INT_SIGNED:
return "SIGNED";
case BTF_INT_CHAR:
return "CHAR";
case BTF_INT_BOOL:
return "BOOL";
default:
return "UNKN";
}
}
static const char *btf_var_linkage_str(__u32 linkage)
{
switch (linkage) {
case BTF_VAR_STATIC:
return "static";
case BTF_VAR_GLOBAL_ALLOCATED:
return "global-alloc";
default:
return "(unknown)";
}
}
static const char *btf_str(const struct btf *btf, __u32 off)
{
if (!off)
return "(anon)";
return btf__name_by_offset(btf, off) ? : "(invalid)";
}
static int dump_btf_type(const struct btf *btf, __u32 id,
const struct btf_type *t)
{
json_writer_t *w = json_wtr;
int kind, safe_kind;
kind = BTF_INFO_KIND(t->info);
safe_kind = kind <= BTF_KIND_MAX ? kind : BTF_KIND_UNKN;
if (json_output) {
jsonw_start_object(w);
jsonw_uint_field(w, "id", id);
jsonw_string_field(w, "kind", btf_kind_str[safe_kind]);
jsonw_string_field(w, "name", btf_str(btf, t->name_off));
} else {
printf("[%u] %s '%s'", id, btf_kind_str[safe_kind],
btf_str(btf, t->name_off));
}
switch (BTF_INFO_KIND(t->info)) {
case BTF_KIND_INT: {
__u32 v = *(__u32 *)(t + 1);
const char *enc;
enc = btf_int_enc_str(BTF_INT_ENCODING(v));
if (json_output) {
jsonw_uint_field(w, "size", t->size);
jsonw_uint_field(w, "bits_offset", BTF_INT_OFFSET(v));
jsonw_uint_field(w, "nr_bits", BTF_INT_BITS(v));
jsonw_string_field(w, "encoding", enc);
} else {
printf(" size=%u bits_offset=%u nr_bits=%u encoding=%s",
t->size, BTF_INT_OFFSET(v), BTF_INT_BITS(v),
enc);
}
break;
}
case BTF_KIND_PTR:
case BTF_KIND_CONST:
case BTF_KIND_VOLATILE:
case BTF_KIND_RESTRICT:
case BTF_KIND_TYPEDEF:
if (json_output)
jsonw_uint_field(w, "type_id", t->type);
else
printf(" type_id=%u", t->type);
break;
case BTF_KIND_ARRAY: {
const struct btf_array *arr = (const void *)(t + 1);
if (json_output) {
jsonw_uint_field(w, "type_id", arr->type);
jsonw_uint_field(w, "index_type_id", arr->index_type);
jsonw_uint_field(w, "nr_elems", arr->nelems);
} else {
printf(" type_id=%u index_type_id=%u nr_elems=%u",
arr->type, arr->index_type, arr->nelems);
}
break;
}
case BTF_KIND_STRUCT:
case BTF_KIND_UNION: {
const struct btf_member *m = (const void *)(t + 1);
__u16 vlen = BTF_INFO_VLEN(t->info);
int i;
if (json_output) {
jsonw_uint_field(w, "size", t->size);
jsonw_uint_field(w, "vlen", vlen);
jsonw_name(w, "members");
jsonw_start_array(w);
} else {
printf(" size=%u vlen=%u", t->size, vlen);
}
for (i = 0; i < vlen; i++, m++) {
const char *name = btf_str(btf, m->name_off);
__u32 bit_off, bit_sz;
if (BTF_INFO_KFLAG(t->info)) {
bit_off = BTF_MEMBER_BIT_OFFSET(m->offset);
bit_sz = BTF_MEMBER_BITFIELD_SIZE(m->offset);
} else {
bit_off = m->offset;
bit_sz = 0;
}
if (json_output) {
jsonw_start_object(w);
jsonw_string_field(w, "name", name);
jsonw_uint_field(w, "type_id", m->type);
jsonw_uint_field(w, "bits_offset", bit_off);
if (bit_sz) {
jsonw_uint_field(w, "bitfield_size",
bit_sz);
}
jsonw_end_object(w);
} else {
printf("\n\t'%s' type_id=%u bits_offset=%u",
name, m->type, bit_off);
if (bit_sz)
printf(" bitfield_size=%u", bit_sz);
}
}
if (json_output)
jsonw_end_array(w);
break;
}
case BTF_KIND_ENUM: {
const struct btf_enum *v = (const void *)(t + 1);
__u16 vlen = BTF_INFO_VLEN(t->info);
int i;
if (json_output) {
jsonw_uint_field(w, "size", t->size);
jsonw_uint_field(w, "vlen", vlen);
jsonw_name(w, "values");
jsonw_start_array(w);
} else {
printf(" size=%u vlen=%u", t->size, vlen);
}
for (i = 0; i < vlen; i++, v++) {
const char *name = btf_str(btf, v->name_off);
if (json_output) {
jsonw_start_object(w);
jsonw_string_field(w, "name", name);
jsonw_uint_field(w, "val", v->val);
jsonw_end_object(w);
} else {
printf("\n\t'%s' val=%u", name, v->val);
}
}
if (json_output)
jsonw_end_array(w);
break;
}
case BTF_KIND_FWD: {
const char *fwd_kind = BTF_INFO_KIND(t->info) ? "union"
: "struct";
if (json_output)
jsonw_string_field(w, "fwd_kind", fwd_kind);
else
printf(" fwd_kind=%s", fwd_kind);
break;
}
case BTF_KIND_FUNC:
if (json_output)
jsonw_uint_field(w, "type_id", t->type);
else
printf(" type_id=%u", t->type);
break;
case BTF_KIND_FUNC_PROTO: {
const struct btf_param *p = (const void *)(t + 1);
__u16 vlen = BTF_INFO_VLEN(t->info);
int i;
if (json_output) {
jsonw_uint_field(w, "ret_type_id", t->type);
jsonw_uint_field(w, "vlen", vlen);
jsonw_name(w, "params");
jsonw_start_array(w);
} else {
printf(" ret_type_id=%u vlen=%u", t->type, vlen);
}
for (i = 0; i < vlen; i++, p++) {
const char *name = btf_str(btf, p->name_off);
if (json_output) {
jsonw_start_object(w);
jsonw_string_field(w, "name", name);
jsonw_uint_field(w, "type_id", p->type);
jsonw_end_object(w);
} else {
printf("\n\t'%s' type_id=%u", name, p->type);
}
}
if (json_output)
jsonw_end_array(w);
break;
}
case BTF_KIND_VAR: {
const struct btf_var *v = (const void *)(t + 1);
const char *linkage;
linkage = btf_var_linkage_str(v->linkage);
if (json_output) {
jsonw_uint_field(w, "type_id", t->type);
jsonw_string_field(w, "linkage", linkage);
} else {
printf(" type_id=%u, linkage=%s", t->type, linkage);
}
break;
}
case BTF_KIND_DATASEC: {
const struct btf_var_secinfo *v = (const void *)(t+1);
__u16 vlen = BTF_INFO_VLEN(t->info);
int i;
if (json_output) {
jsonw_uint_field(w, "size", t->size);
jsonw_uint_field(w, "vlen", vlen);
jsonw_name(w, "vars");
jsonw_start_array(w);
} else {
printf(" size=%u vlen=%u", t->size, vlen);
}
for (i = 0; i < vlen; i++, v++) {
if (json_output) {
jsonw_start_object(w);
jsonw_uint_field(w, "type_id", v->type);
jsonw_uint_field(w, "offset", v->offset);
jsonw_uint_field(w, "size", v->size);
jsonw_end_object(w);
} else {
printf("\n\ttype_id=%u offset=%u size=%u",
v->type, v->offset, v->size);
}
}
if (json_output)
jsonw_end_array(w);
break;
}
default:
break;
}
if (json_output)
jsonw_end_object(json_wtr);
else
printf("\n");
return 0;
}
static int dump_btf_raw(const struct btf *btf,
__u32 *root_type_ids, int root_type_cnt)
{
const struct btf_type *t;
int i;
if (json_output) {
jsonw_start_object(json_wtr);
jsonw_name(json_wtr, "types");
jsonw_start_array(json_wtr);
}
if (root_type_cnt) {
for (i = 0; i < root_type_cnt; i++) {
t = btf__type_by_id(btf, root_type_ids[i]);
dump_btf_type(btf, root_type_ids[i], t);
}
} else {
int cnt = btf__get_nr_types(btf);
for (i = 1; i <= cnt; i++) {
t = btf__type_by_id(btf, i);
dump_btf_type(btf, i, t);
}
}
if (json_output) {
jsonw_end_array(json_wtr);
jsonw_end_object(json_wtr);
}
return 0;
}
static bool check_btf_endianness(GElf_Ehdr *ehdr)
{
static unsigned int const endian = 1;
switch (ehdr->e_ident[EI_DATA]) {
case ELFDATA2LSB:
return *(unsigned char const *)&endian == 1;
case ELFDATA2MSB:
return *(unsigned char const *)&endian == 0;
default:
return 0;
}
}
static int btf_load_from_elf(const char *path, struct btf **btf)
{
int err = -1, fd = -1, idx = 0;
Elf_Data *btf_data = NULL;
Elf_Scn *scn = NULL;
Elf *elf = NULL;
GElf_Ehdr ehdr;
if (elf_version(EV_CURRENT) == EV_NONE) {
p_err("failed to init libelf for %s", path);
return -1;
}
fd = open(path, O_RDONLY);
if (fd < 0) {
p_err("failed to open %s: %s", path, strerror(errno));
return -1;
}
elf = elf_begin(fd, ELF_C_READ, NULL);
if (!elf) {
p_err("failed to open %s as ELF file", path);
goto done;
}
if (!gelf_getehdr(elf, &ehdr)) {
p_err("failed to get EHDR from %s", path);
goto done;
}
if (!check_btf_endianness(&ehdr)) {
p_err("non-native ELF endianness is not supported");
goto done;
}
if (!elf_rawdata(elf_getscn(elf, ehdr.e_shstrndx), NULL)) {
p_err("failed to get e_shstrndx from %s\n", path);
goto done;
}
while ((scn = elf_nextscn(elf, scn)) != NULL) {
GElf_Shdr sh;
char *name;
idx++;
if (gelf_getshdr(scn, &sh) != &sh) {
p_err("failed to get section(%d) header from %s",
idx, path);
goto done;
}
name = elf_strptr(elf, ehdr.e_shstrndx, sh.sh_name);
if (!name) {
p_err("failed to get section(%d) name from %s",
idx, path);
goto done;
}
if (strcmp(name, BTF_ELF_SEC) == 0) {
btf_data = elf_getdata(scn, 0);
if (!btf_data) {
p_err("failed to get section(%d, %s) data from %s",
idx, name, path);
goto done;
}
break;
}
}
if (!btf_data) {
p_err("%s ELF section not found in %s", BTF_ELF_SEC, path);
goto done;
}
*btf = btf__new(btf_data->d_buf, btf_data->d_size);
if (IS_ERR(*btf)) {
err = PTR_ERR(*btf);
*btf = NULL;
p_err("failed to load BTF data from %s: %s",
path, strerror(err));
goto done;
}
err = 0;
done:
if (err) {
if (*btf) {
btf__free(*btf);
*btf = NULL;
}
}
if (elf)
elf_end(elf);
close(fd);
return err;
}
static int do_dump(int argc, char **argv)
{
struct btf *btf = NULL;
__u32 root_type_ids[2];
int root_type_cnt = 0;
__u32 btf_id = -1;
const char *src;
int fd = -1;
int err;
if (!REQ_ARGS(2)) {
usage();
return -1;
}
src = GET_ARG();
if (is_prefix(src, "map")) {
struct bpf_map_info info = {};
__u32 len = sizeof(info);
if (!REQ_ARGS(2)) {
usage();
return -1;
}
fd = map_parse_fd_and_info(&argc, &argv, &info, &len);
if (fd < 0)
return -1;
btf_id = info.btf_id;
if (argc && is_prefix(*argv, "key")) {
root_type_ids[root_type_cnt++] = info.btf_key_type_id;
NEXT_ARG();
} else if (argc && is_prefix(*argv, "value")) {
root_type_ids[root_type_cnt++] = info.btf_value_type_id;
NEXT_ARG();
} else if (argc && is_prefix(*argv, "all")) {
NEXT_ARG();
} else if (argc && is_prefix(*argv, "kv")) {
root_type_ids[root_type_cnt++] = info.btf_key_type_id;
root_type_ids[root_type_cnt++] = info.btf_value_type_id;
NEXT_ARG();
} else {
root_type_ids[root_type_cnt++] = info.btf_key_type_id;
root_type_ids[root_type_cnt++] = info.btf_value_type_id;
}
} else if (is_prefix(src, "prog")) {
struct bpf_prog_info info = {};
__u32 len = sizeof(info);
if (!REQ_ARGS(2)) {
usage();
return -1;
}
fd = prog_parse_fd(&argc, &argv);
if (fd < 0)
return -1;
err = bpf_obj_get_info_by_fd(fd, &info, &len);
if (err) {
p_err("can't get prog info: %s", strerror(errno));
goto done;
}
btf_id = info.btf_id;
} else if (is_prefix(src, "id")) {
char *endptr;
btf_id = strtoul(*argv, &endptr, 0);
if (*endptr) {
p_err("can't parse %s as ID", **argv);
return -1;
}
NEXT_ARG();
} else if (is_prefix(src, "file")) {
err = btf_load_from_elf(*argv, &btf);
if (err)
goto done;
NEXT_ARG();
} else {
err = -1;
p_err("unrecognized BTF source specifier: '%s'", src);
goto done;
}
if (!btf) {
err = btf__get_from_id(btf_id, &btf);
if (err) {
p_err("get btf by id (%u): %s", btf_id, strerror(err));
goto done;
}
if (!btf) {
err = ENOENT;
p_err("can't find btf with ID (%u)", btf_id);
goto done;
}
}
dump_btf_raw(btf, root_type_ids, root_type_cnt);
done:
close(fd);
btf__free(btf);
return err;
}
static int do_help(int argc, char **argv)
{
if (json_output) {
jsonw_null(json_wtr);
return 0;
}
fprintf(stderr,
"Usage: %s btf dump BTF_SRC\n"
" %s btf help\n"
"\n"
" BTF_SRC := { id BTF_ID | prog PROG | map MAP [{key | value | kv | all}] | file FILE }\n"
" " HELP_SPEC_MAP "\n"
" " HELP_SPEC_PROGRAM "\n"
" " HELP_SPEC_OPTIONS "\n"
"",
bin_name, bin_name);
return 0;
}
static const struct cmd cmds[] = {
{ "help", do_help },
{ "dump", do_dump },
{ 0 }
};
int do_btf(int argc, char **argv)
{
return cmd_select(cmds, argc, argv, do_help);
}

View File

@ -56,7 +56,7 @@ static int do_help(int argc, char **argv)
" %s batch file FILE\n" " %s batch file FILE\n"
" %s version\n" " %s version\n"
"\n" "\n"
" OBJECT := { prog | map | cgroup | perf | net | feature }\n" " OBJECT := { prog | map | cgroup | perf | net | feature | btf }\n"
" " HELP_SPEC_OPTIONS "\n" " " HELP_SPEC_OPTIONS "\n"
"", "",
bin_name, bin_name, bin_name); bin_name, bin_name, bin_name);
@ -188,6 +188,7 @@ static const struct cmd cmds[] = {
{ "perf", do_perf }, { "perf", do_perf },
{ "net", do_net }, { "net", do_net },
{ "feature", do_feature }, { "feature", do_feature },
{ "btf", do_btf },
{ "version", do_version }, { "version", do_version },
{ 0 } { 0 }
}; };

View File

@ -150,6 +150,7 @@ int do_perf(int argc, char **arg);
int do_net(int argc, char **arg); int do_net(int argc, char **arg);
int do_tracelog(int argc, char **arg); int do_tracelog(int argc, char **arg);
int do_feature(int argc, char **argv); int do_feature(int argc, char **argv);
int do_btf(int argc, char **argv);
int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what); int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what);
int prog_parse_fd(int *argc, char ***argv); int prog_parse_fd(int *argc, char ***argv);

View File

@ -46,6 +46,7 @@ const char * const map_type_name[] = {
[BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE] = "percpu_cgroup_storage", [BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE] = "percpu_cgroup_storage",
[BPF_MAP_TYPE_QUEUE] = "queue", [BPF_MAP_TYPE_QUEUE] = "queue",
[BPF_MAP_TYPE_STACK] = "stack", [BPF_MAP_TYPE_STACK] = "stack",
[BPF_MAP_TYPE_SK_STORAGE] = "sk_storage",
}; };
const size_t map_type_name_size = ARRAY_SIZE(map_type_name); const size_t map_type_name_size = ARRAY_SIZE(map_type_name);
@ -724,7 +725,7 @@ static int dump_map_elem(int fd, void *key, void *value,
} else { } else {
const char *msg = NULL; const char *msg = NULL;
if (errno == ENOENT) if (lookup_errno == ENOENT)
msg = "<no entry>"; msg = "<no entry>";
else if (lookup_errno == ENOSPC && else if (lookup_errno == ENOSPC &&
map_info->type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) map_info->type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY)

View File

@ -3,6 +3,7 @@
#define _GNU_SOURCE #define _GNU_SOURCE
#include <errno.h> #include <errno.h>
#include <fcntl.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <unistd.h> #include <unistd.h>
@ -12,6 +13,8 @@
#include <linux/rtnetlink.h> #include <linux/rtnetlink.h>
#include <linux/tc_act/tc_bpf.h> #include <linux/tc_act/tc_bpf.h>
#include <sys/socket.h> #include <sys/socket.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <bpf.h> #include <bpf.h>
#include <nlattr.h> #include <nlattr.h>
@ -48,6 +51,10 @@ struct bpf_filter_t {
int ifindex; int ifindex;
}; };
struct bpf_attach_info {
__u32 flow_dissector_id;
};
static int dump_link_nlmsg(void *cookie, void *msg, struct nlattr **tb) static int dump_link_nlmsg(void *cookie, void *msg, struct nlattr **tb)
{ {
struct bpf_netdev_t *netinfo = cookie; struct bpf_netdev_t *netinfo = cookie;
@ -180,8 +187,45 @@ out:
return 0; return 0;
} }
static int query_flow_dissector(struct bpf_attach_info *attach_info)
{
__u32 attach_flags;
__u32 prog_ids[1];
__u32 prog_cnt;
int err;
int fd;
fd = open("/proc/self/ns/net", O_RDONLY);
if (fd < 0) {
p_err("can't open /proc/self/ns/net: %d",
strerror(errno));
return -1;
}
prog_cnt = ARRAY_SIZE(prog_ids);
err = bpf_prog_query(fd, BPF_FLOW_DISSECTOR, 0,
&attach_flags, prog_ids, &prog_cnt);
close(fd);
if (err) {
if (errno == EINVAL) {
/* Older kernel's don't support querying
* flow dissector programs.
*/
errno = 0;
return 0;
}
p_err("can't query prog: %s", strerror(errno));
return -1;
}
if (prog_cnt == 1)
attach_info->flow_dissector_id = prog_ids[0];
return 0;
}
static int do_show(int argc, char **argv) static int do_show(int argc, char **argv)
{ {
struct bpf_attach_info attach_info = {};
int i, sock, ret, filter_idx = -1; int i, sock, ret, filter_idx = -1;
struct bpf_netdev_t dev_array; struct bpf_netdev_t dev_array;
unsigned int nl_pid; unsigned int nl_pid;
@ -199,6 +243,10 @@ static int do_show(int argc, char **argv)
usage(); usage();
} }
ret = query_flow_dissector(&attach_info);
if (ret)
return -1;
sock = libbpf_netlink_open(&nl_pid); sock = libbpf_netlink_open(&nl_pid);
if (sock < 0) { if (sock < 0) {
fprintf(stderr, "failed to open netlink sock\n"); fprintf(stderr, "failed to open netlink sock\n");
@ -227,6 +275,12 @@ static int do_show(int argc, char **argv)
} }
NET_END_ARRAY("\n"); NET_END_ARRAY("\n");
} }
NET_START_ARRAY("flow_dissector", "%s:\n");
if (attach_info.flow_dissector_id > 0)
NET_DUMP_UINT("id", "id %u", attach_info.flow_dissector_id);
NET_END_ARRAY("\n");
NET_END_OBJECT; NET_END_OBJECT;
if (json_output) if (json_output)
jsonw_end_array(json_wtr); jsonw_end_array(json_wtr);

View File

@ -133,6 +133,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
BPF_MAP_TYPE_QUEUE, BPF_MAP_TYPE_QUEUE,
BPF_MAP_TYPE_STACK, BPF_MAP_TYPE_STACK,
BPF_MAP_TYPE_SK_STORAGE,
}; };
/* Note that tracing related programs such as /* Note that tracing related programs such as
@ -168,6 +169,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SK_REUSEPORT, BPF_PROG_TYPE_SK_REUSEPORT,
BPF_PROG_TYPE_FLOW_DISSECTOR, BPF_PROG_TYPE_FLOW_DISSECTOR,
BPF_PROG_TYPE_CGROUP_SYSCTL, BPF_PROG_TYPE_CGROUP_SYSCTL,
BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
}; };
enum bpf_attach_type { enum bpf_attach_type {
@ -1737,12 +1739,19 @@ union bpf_attr {
* error if an eBPF program tries to set a callback that is not * error if an eBPF program tries to set a callback that is not
* supported in the current kernel. * supported in the current kernel.
* *
* The supported callback values that *argval* can combine are: * *argval* is a flag array which can combine these flags:
* *
* * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
* * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
* * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
* *
* Therefore, this function can be used to clear a callback flag by
* setting the appropriate bit to zero. e.g. to disable the RTO
* callback:
*
* **bpf_sock_ops_cb_flags_set(bpf_sock,**
* **bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)**
*
* Here are some examples of where one could call such eBPF * Here are some examples of where one could call such eBPF
* program: * program:
* *
@ -2622,6 +2631,42 @@ union bpf_attr {
* was provided. * was provided.
* *
* **-ERANGE** if resulting value was out of range. * **-ERANGE** if resulting value was out of range.
*
* void *bpf_sk_storage_get(struct bpf_map *map, struct bpf_sock *sk, void *value, u64 flags)
* Description
* Get a bpf-local-storage from a sk.
*
* Logically, it could be thought of getting the value from
* a *map* with *sk* as the **key**. From this
* perspective, the usage is not much different from
* **bpf_map_lookup_elem(map, &sk)** except this
* helper enforces the key must be a **bpf_fullsock()**
* and the map must be a BPF_MAP_TYPE_SK_STORAGE also.
*
* Underneath, the value is stored locally at *sk* instead of
* the map. The *map* is used as the bpf-local-storage **type**.
* The bpf-local-storage **type** (i.e. the *map*) is searched
* against all bpf-local-storages residing at sk.
*
* An optional *flags* (BPF_SK_STORAGE_GET_F_CREATE) can be
* used such that a new bpf-local-storage will be
* created if one does not exist. *value* can be used
* together with BPF_SK_STORAGE_GET_F_CREATE to specify
* the initial value of a bpf-local-storage. If *value* is
* NULL, the new bpf-local-storage will be zero initialized.
* Return
* A bpf-local-storage pointer is returned on success.
*
* **NULL** if not found or there was an error in adding
* a new bpf-local-storage.
*
* int bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk)
* Description
* Delete a bpf-local-storage from a sk.
* Return
* 0 on success.
*
* **-ENOENT** if the bpf-local-storage cannot be found.
*/ */
#define __BPF_FUNC_MAPPER(FN) \ #define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \ FN(unspec), \
@ -2730,7 +2775,9 @@ union bpf_attr {
FN(sysctl_get_new_value), \ FN(sysctl_get_new_value), \
FN(sysctl_set_new_value), \ FN(sysctl_set_new_value), \
FN(strtol), \ FN(strtol), \
FN(strtoul), FN(strtoul), \
FN(sk_storage_get), \
FN(sk_storage_delete),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper /* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call * function eBPF program intends to call
@ -2806,6 +2853,9 @@ enum bpf_func_id {
/* BPF_FUNC_sysctl_get_name flags. */ /* BPF_FUNC_sysctl_get_name flags. */
#define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) #define BPF_F_SYSCTL_BASE_NAME (1ULL << 0)
/* BPF_FUNC_sk_storage_get flags */
#define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0)
/* Mode for BPF_FUNC_skb_adjust_room helper. */ /* Mode for BPF_FUNC_skb_adjust_room helper. */
enum bpf_adj_room_mode { enum bpf_adj_room_mode {
BPF_ADJ_ROOM_NET, BPF_ADJ_ROOM_NET,

View File

@ -26,6 +26,7 @@
#include <linux/bpf.h> #include <linux/bpf.h>
#include <stdbool.h> #include <stdbool.h>
#include <stddef.h> #include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {

View File

@ -126,6 +126,8 @@ static inline __u64 ptr_to_u64(const void *ptr)
struct bpf_capabilities { struct bpf_capabilities {
/* v4.14: kernel support for program & map names. */ /* v4.14: kernel support for program & map names. */
__u32 name:1; __u32 name:1;
/* v5.2: kernel support for global data sections. */
__u32 global_data:1;
}; };
/* /*
@ -854,12 +856,15 @@ bpf_object__init_maps(struct bpf_object *obj, int flags)
* *
* TODO: Detect array of map and report error. * TODO: Detect array of map and report error.
*/ */
if (obj->caps.global_data) {
if (obj->efile.data_shndx >= 0) if (obj->efile.data_shndx >= 0)
nr_maps_glob++; nr_maps_glob++;
if (obj->efile.rodata_shndx >= 0) if (obj->efile.rodata_shndx >= 0)
nr_maps_glob++; nr_maps_glob++;
if (obj->efile.bss_shndx >= 0) if (obj->efile.bss_shndx >= 0)
nr_maps_glob++; nr_maps_glob++;
}
for (i = 0; data && i < nr_syms; i++) { for (i = 0; data && i < nr_syms; i++) {
GElf_Sym sym; GElf_Sym sym;
@ -870,14 +875,14 @@ bpf_object__init_maps(struct bpf_object *obj, int flags)
nr_maps++; nr_maps++;
} }
/* Alloc obj->maps and fill nr_maps. */
pr_debug("maps in %s: %d maps in %zd bytes\n", obj->path,
nr_maps, data->d_size);
if (!nr_maps && !nr_maps_glob) if (!nr_maps && !nr_maps_glob)
return 0; return 0;
/* Assume equally sized map definitions */ /* Assume equally sized map definitions */
if (data) { if (data) {
pr_debug("maps in %s: %d maps in %zd bytes\n", obj->path,
nr_maps, data->d_size);
map_def_sz = data->d_size / nr_maps; map_def_sz = data->d_size / nr_maps;
if (!data->d_size || (data->d_size % nr_maps) != 0) { if (!data->d_size || (data->d_size % nr_maps) != 0) {
pr_warning("unable to determine map definition size " pr_warning("unable to determine map definition size "
@ -971,6 +976,9 @@ bpf_object__init_maps(struct bpf_object *obj, int flags)
map_idx++; map_idx++;
} }
if (!obj->caps.global_data)
goto finalize;
/* /*
* Populate rest of obj->maps with libbpf internal maps. * Populate rest of obj->maps with libbpf internal maps.
*/ */
@ -988,6 +996,7 @@ bpf_object__init_maps(struct bpf_object *obj, int flags)
ret = bpf_object__init_internal_map(obj, &obj->maps[map_idx++], ret = bpf_object__init_internal_map(obj, &obj->maps[map_idx++],
LIBBPF_MAP_BSS, LIBBPF_MAP_BSS,
obj->efile.bss, NULL); obj->efile.bss, NULL);
finalize:
if (!ret) if (!ret)
qsort(obj->maps, obj->nr_maps, sizeof(obj->maps[0]), qsort(obj->maps, obj->nr_maps, sizeof(obj->maps[0]),
compare_bpf_map); compare_bpf_map);
@ -1333,12 +1342,18 @@ bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr,
if (bpf_object__shndx_is_maps(obj, shdr_idx) || if (bpf_object__shndx_is_maps(obj, shdr_idx) ||
bpf_object__shndx_is_data(obj, shdr_idx)) { bpf_object__shndx_is_data(obj, shdr_idx)) {
type = bpf_object__section_to_libbpf_map_type(obj, shdr_idx); type = bpf_object__section_to_libbpf_map_type(obj, shdr_idx);
if (type != LIBBPF_MAP_UNSPEC && if (type != LIBBPF_MAP_UNSPEC) {
GELF_ST_BIND(sym.st_info) == STB_GLOBAL) { if (GELF_ST_BIND(sym.st_info) == STB_GLOBAL) {
pr_warning("bpf: relocation: not yet supported relo for non-static global \'%s\' variable found in insns[%d].code 0x%x\n", pr_warning("bpf: relocation: not yet supported relo for non-static global \'%s\' variable found in insns[%d].code 0x%x\n",
name, insn_idx, insns[insn_idx].code); name, insn_idx, insns[insn_idx].code);
return -LIBBPF_ERRNO__RELOC; return -LIBBPF_ERRNO__RELOC;
} }
if (!obj->caps.global_data) {
pr_warning("bpf: relocation: kernel does not support global \'%s\' variable access in insns[%d]\n",
name, insn_idx);
return -LIBBPF_ERRNO__RELOC;
}
}
for (map_idx = 0; map_idx < nr_maps; map_idx++) { for (map_idx = 0; map_idx < nr_maps; map_idx++) {
if (maps[map_idx].libbpf_type != type) if (maps[map_idx].libbpf_type != type)
@ -1495,10 +1510,68 @@ bpf_object__probe_name(struct bpf_object *obj)
return 0; return 0;
} }
static int
bpf_object__probe_global_data(struct bpf_object *obj)
{
struct bpf_load_program_attr prg_attr;
struct bpf_create_map_attr map_attr;
char *cp, errmsg[STRERR_BUFSIZE];
struct bpf_insn insns[] = {
BPF_LD_MAP_VALUE(BPF_REG_1, 0, 16),
BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 42),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
};
int ret, map;
memset(&map_attr, 0, sizeof(map_attr));
map_attr.map_type = BPF_MAP_TYPE_ARRAY;
map_attr.key_size = sizeof(int);
map_attr.value_size = 32;
map_attr.max_entries = 1;
map = bpf_create_map_xattr(&map_attr);
if (map < 0) {
cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg));
pr_warning("Error in %s():%s(%d). Couldn't create simple array map.\n",
__func__, cp, errno);
return -errno;
}
insns[0].imm = map;
memset(&prg_attr, 0, sizeof(prg_attr));
prg_attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
prg_attr.insns = insns;
prg_attr.insns_cnt = ARRAY_SIZE(insns);
prg_attr.license = "GPL";
ret = bpf_load_program_xattr(&prg_attr, NULL, 0);
if (ret >= 0) {
obj->caps.global_data = 1;
close(ret);
}
close(map);
return 0;
}
static int static int
bpf_object__probe_caps(struct bpf_object *obj) bpf_object__probe_caps(struct bpf_object *obj)
{ {
return bpf_object__probe_name(obj); int (*probe_fn[])(struct bpf_object *obj) = {
bpf_object__probe_name,
bpf_object__probe_global_data,
};
int i, ret;
for (i = 0; i < ARRAY_SIZE(probe_fn); i++) {
ret = probe_fn[i](obj);
if (ret < 0)
return ret;
}
return 0;
} }
static int static int
@ -2063,6 +2136,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type)
case BPF_PROG_TYPE_UNSPEC: case BPF_PROG_TYPE_UNSPEC:
case BPF_PROG_TYPE_TRACEPOINT: case BPF_PROG_TYPE_TRACEPOINT:
case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT:
case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_CGROUP_SYSCTL:
return false; return false;
@ -2100,6 +2174,7 @@ __bpf_object__open(const char *path, void *obj_buf, size_t obj_buf_sz,
CHECK_ERR(bpf_object__elf_init(obj), err, out); CHECK_ERR(bpf_object__elf_init(obj), err, out);
CHECK_ERR(bpf_object__check_endianness(obj), err, out); CHECK_ERR(bpf_object__check_endianness(obj), err, out);
CHECK_ERR(bpf_object__probe_caps(obj), err, out);
CHECK_ERR(bpf_object__elf_collect(obj, flags), err, out); CHECK_ERR(bpf_object__elf_collect(obj, flags), err, out);
CHECK_ERR(bpf_object__collect_reloc(obj), err, out); CHECK_ERR(bpf_object__collect_reloc(obj), err, out);
CHECK_ERR(bpf_object__validate(obj, needs_kver), err, out); CHECK_ERR(bpf_object__validate(obj, needs_kver), err, out);
@ -2193,7 +2268,6 @@ int bpf_object__load(struct bpf_object *obj)
obj->loaded = true; obj->loaded = true;
CHECK_ERR(bpf_object__probe_caps(obj), err, out);
CHECK_ERR(bpf_object__create_maps(obj), err, out); CHECK_ERR(bpf_object__create_maps(obj), err, out);
CHECK_ERR(bpf_object__relocate(obj), err, out); CHECK_ERR(bpf_object__relocate(obj), err, out);
CHECK_ERR(bpf_object__load_progs(obj), err, out); CHECK_ERR(bpf_object__load_progs(obj), err, out);

View File

@ -9,6 +9,7 @@
#include <net/if.h> #include <net/if.h>
#include <sys/utsname.h> #include <sys/utsname.h>
#include <linux/btf.h>
#include <linux/filter.h> #include <linux/filter.h>
#include <linux/kernel.h> #include <linux/kernel.h>
@ -93,6 +94,7 @@ probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns,
case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_SK_MSG:
case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT:
case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
case BPF_PROG_TYPE_LWT_SEG6LOCAL: case BPF_PROG_TYPE_LWT_SEG6LOCAL:
case BPF_PROG_TYPE_LIRC_MODE2: case BPF_PROG_TYPE_LIRC_MODE2:
case BPF_PROG_TYPE_SK_REUSEPORT: case BPF_PROG_TYPE_SK_REUSEPORT:
@ -130,11 +132,65 @@ bool bpf_probe_prog_type(enum bpf_prog_type prog_type, __u32 ifindex)
return errno != EINVAL && errno != EOPNOTSUPP; return errno != EINVAL && errno != EOPNOTSUPP;
} }
static int load_btf(void)
{
#define BTF_INFO_ENC(kind, kind_flag, vlen) \
((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN))
#define BTF_TYPE_ENC(name, info, size_or_type) \
(name), (info), (size_or_type)
#define BTF_INT_ENC(encoding, bits_offset, nr_bits) \
((encoding) << 24 | (bits_offset) << 16 | (nr_bits))
#define BTF_TYPE_INT_ENC(name, encoding, bits_offset, bits, sz) \
BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_INT, 0, 0), sz), \
BTF_INT_ENC(encoding, bits_offset, bits)
#define BTF_MEMBER_ENC(name, type, bits_offset) \
(name), (type), (bits_offset)
const char btf_str_sec[] = "\0bpf_spin_lock\0val\0cnt\0l";
/* struct bpf_spin_lock {
* int val;
* };
* struct val {
* int cnt;
* struct bpf_spin_lock l;
* };
*/
__u32 btf_raw_types[] = {
/* int */
BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
/* struct bpf_spin_lock */ /* [2] */
BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 4),
BTF_MEMBER_ENC(15, 1, 0), /* int val; */
/* struct val */ /* [3] */
BTF_TYPE_ENC(15, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 8),
BTF_MEMBER_ENC(19, 1, 0), /* int cnt; */
BTF_MEMBER_ENC(23, 2, 32),/* struct bpf_spin_lock l; */
};
struct btf_header btf_hdr = {
.magic = BTF_MAGIC,
.version = BTF_VERSION,
.hdr_len = sizeof(struct btf_header),
.type_len = sizeof(btf_raw_types),
.str_off = sizeof(btf_raw_types),
.str_len = sizeof(btf_str_sec),
};
__u8 raw_btf[sizeof(struct btf_header) + sizeof(btf_raw_types) +
sizeof(btf_str_sec)];
memcpy(raw_btf, &btf_hdr, sizeof(btf_hdr));
memcpy(raw_btf + sizeof(btf_hdr), btf_raw_types, sizeof(btf_raw_types));
memcpy(raw_btf + sizeof(btf_hdr) + sizeof(btf_raw_types),
btf_str_sec, sizeof(btf_str_sec));
return bpf_load_btf(raw_btf, sizeof(raw_btf), 0, 0, 0);
}
bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex) bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex)
{ {
int key_size, value_size, max_entries, map_flags; int key_size, value_size, max_entries, map_flags;
__u32 btf_key_type_id = 0, btf_value_type_id = 0;
struct bpf_create_map_attr attr = {}; struct bpf_create_map_attr attr = {};
int fd = -1, fd_inner; int fd = -1, btf_fd = -1, fd_inner;
key_size = sizeof(__u32); key_size = sizeof(__u32);
value_size = sizeof(__u32); value_size = sizeof(__u32);
@ -160,6 +216,16 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex)
case BPF_MAP_TYPE_STACK: case BPF_MAP_TYPE_STACK:
key_size = 0; key_size = 0;
break; break;
case BPF_MAP_TYPE_SK_STORAGE:
btf_key_type_id = 1;
btf_value_type_id = 3;
value_size = 8;
max_entries = 0;
map_flags = BPF_F_NO_PREALLOC;
btf_fd = load_btf();
if (btf_fd < 0)
return false;
break;
case BPF_MAP_TYPE_UNSPEC: case BPF_MAP_TYPE_UNSPEC:
case BPF_MAP_TYPE_HASH: case BPF_MAP_TYPE_HASH:
case BPF_MAP_TYPE_ARRAY: case BPF_MAP_TYPE_ARRAY:
@ -205,11 +271,18 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex)
attr.max_entries = max_entries; attr.max_entries = max_entries;
attr.map_flags = map_flags; attr.map_flags = map_flags;
attr.map_ifindex = ifindex; attr.map_ifindex = ifindex;
if (btf_fd >= 0) {
attr.btf_fd = btf_fd;
attr.btf_key_type_id = btf_key_type_id;
attr.btf_value_type_id = btf_value_type_id;
}
fd = bpf_create_map_xattr(&attr); fd = bpf_create_map_xattr(&attr);
} }
if (fd >= 0) if (fd >= 0)
close(fd); close(fd);
if (btf_fd >= 0)
close(btf_fd);
return fd >= 0; return fd >= 0;
} }

View File

@ -74,6 +74,8 @@ all: $(TEST_CUSTOM_PROGS)
$(OUTPUT)/urandom_read: $(OUTPUT)/%: %.c $(OUTPUT)/urandom_read: $(OUTPUT)/%: %.c
$(CC) -o $@ $< -Wl,--build-id $(CC) -o $@ $< -Wl,--build-id
$(OUTPUT)/test_maps: map_tests/*.c
BPFOBJ := $(OUTPUT)/libbpf.a BPFOBJ := $(OUTPUT)/libbpf.a
$(TEST_GEN_PROGS): $(BPFOBJ) $(TEST_GEN_PROGS): $(BPFOBJ)
@ -232,6 +234,27 @@ $(PROG_TESTS_H): $(PROG_TESTS_DIR) $(PROG_TESTS_FILES)
echo '#endif' \ echo '#endif' \
) > $(PROG_TESTS_H)) ) > $(PROG_TESTS_H))
TEST_MAPS_CFLAGS := -I. -I$(OUTPUT)
MAP_TESTS_DIR = $(OUTPUT)/map_tests
$(MAP_TESTS_DIR):
mkdir -p $@
MAP_TESTS_H := $(MAP_TESTS_DIR)/tests.h
test_maps.c: $(MAP_TESTS_H)
$(OUTPUT)/test_maps: CFLAGS += $(TEST_MAPS_CFLAGS)
MAP_TESTS_FILES := $(wildcard map_tests/*.c)
$(MAP_TESTS_H): $(MAP_TESTS_DIR) $(MAP_TESTS_FILES)
$(shell ( cd map_tests/; \
echo '/* Generated header, do not edit */'; \
echo '#ifdef DECLARE'; \
ls *.c 2> /dev/null | \
sed -e 's@\([^\.]*\)\.c@extern void test_\1(void);@'; \
echo '#endif'; \
echo '#ifdef CALL'; \
ls *.c 2> /dev/null | \
sed -e 's@\([^\.]*\)\.c@test_\1();@'; \
echo '#endif' \
) > $(MAP_TESTS_H))
VERIFIER_TESTS_H := $(OUTPUT)/verifier/tests.h VERIFIER_TESTS_H := $(OUTPUT)/verifier/tests.h
test_verifier.c: $(VERIFIER_TESTS_H) test_verifier.c: $(VERIFIER_TESTS_H)
$(OUTPUT)/test_verifier: CFLAGS += $(TEST_VERIFIER_CFLAGS) $(OUTPUT)/test_verifier: CFLAGS += $(TEST_VERIFIER_CFLAGS)
@ -251,4 +274,4 @@ $(OUTPUT)/verifier/tests.h: $(VERIFIER_TESTS_DIR) $(VERIFIER_TEST_FILES)
) > $(VERIFIER_TESTS_H)) ) > $(VERIFIER_TESTS_H))
EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(ALU32_BUILD_DIR) \ EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(ALU32_BUILD_DIR) \
$(VERIFIER_TESTS_H) $(PROG_TESTS_H) $(VERIFIER_TESTS_H) $(PROG_TESTS_H) $(MAP_TESTS_H)

View File

@ -211,6 +211,11 @@ static int (*bpf_strtol)(const char *buf, unsigned long long buf_len,
static int (*bpf_strtoul)(const char *buf, unsigned long long buf_len, static int (*bpf_strtoul)(const char *buf, unsigned long long buf_len,
unsigned long long flags, unsigned long *res) = unsigned long long flags, unsigned long *res) =
(void *) BPF_FUNC_strtoul; (void *) BPF_FUNC_strtoul;
static void *(*bpf_sk_storage_get)(void *map, struct bpf_sock *sk,
void *value, __u64 flags) =
(void *) BPF_FUNC_sk_storage_get;
static int (*bpf_sk_storage_delete)(void *map, struct bpf_sock *sk) =
(void *)BPF_FUNC_sk_storage_delete;
/* llvm builtin functions that eBPF C program may use to /* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions * emit BPF_LD_ABS and BPF_LD_IND instructions

View File

@ -33,3 +33,4 @@ CONFIG_MPLS=y
CONFIG_NET_MPLS_GSO=m CONFIG_NET_MPLS_GSO=m
CONFIG_MPLS_ROUTING=m CONFIG_MPLS_ROUTING=m
CONFIG_MPLS_IPTUNNEL=m CONFIG_MPLS_IPTUNNEL=m
CONFIG_IPV6_SIT=m

View File

@ -26,7 +26,7 @@ static void load_and_attach_program(void)
struct bpf_object *obj; struct bpf_object *obj;
ret = bpf_flow_load(&obj, cfg_path_name, cfg_section_name, ret = bpf_flow_load(&obj, cfg_path_name, cfg_section_name,
cfg_map_name, &prog_fd); cfg_map_name, NULL, &prog_fd, NULL);
if (ret) if (ret)
error(1, 0, "bpf_flow_load %s", cfg_path_name); error(1, 0, "bpf_flow_load %s", cfg_path_name);

View File

@ -9,10 +9,12 @@ static inline int bpf_flow_load(struct bpf_object **obj,
const char *path, const char *path,
const char *section_name, const char *section_name,
const char *map_name, const char *map_name,
int *prog_fd) const char *keys_map_name,
int *prog_fd,
int *keys_fd)
{ {
struct bpf_program *prog, *main_prog; struct bpf_program *prog, *main_prog;
struct bpf_map *prog_array; struct bpf_map *prog_array, *keys;
int prog_array_fd; int prog_array_fd;
int ret, fd, i; int ret, fd, i;
@ -23,19 +25,29 @@ static inline int bpf_flow_load(struct bpf_object **obj,
main_prog = bpf_object__find_program_by_title(*obj, section_name); main_prog = bpf_object__find_program_by_title(*obj, section_name);
if (!main_prog) if (!main_prog)
return ret; return -1;
*prog_fd = bpf_program__fd(main_prog); *prog_fd = bpf_program__fd(main_prog);
if (*prog_fd < 0) if (*prog_fd < 0)
return ret; return -1;
prog_array = bpf_object__find_map_by_name(*obj, map_name); prog_array = bpf_object__find_map_by_name(*obj, map_name);
if (!prog_array) if (!prog_array)
return ret; return -1;
prog_array_fd = bpf_map__fd(prog_array); prog_array_fd = bpf_map__fd(prog_array);
if (prog_array_fd < 0) if (prog_array_fd < 0)
return ret; return -1;
if (keys_map_name && keys_fd) {
keys = bpf_object__find_map_by_name(*obj, keys_map_name);
if (!keys)
return -1;
*keys_fd = bpf_map__fd(keys);
if (*keys_fd < 0)
return -1;
}
i = 0; i = 0;
bpf_object__for_each_program(prog, *obj) { bpf_object__for_each_program(prog, *obj) {

View File

@ -0,0 +1,629 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook */
#include <linux/compiler.h>
#include <linux/err.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <linux/btf.h>
#include <unistd.h>
#include <signal.h>
#include <errno.h>
#include <string.h>
#include <pthread.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include <test_btf.h>
#include <test_maps.h>
static struct bpf_create_map_attr xattr = {
.name = "sk_storage_map",
.map_type = BPF_MAP_TYPE_SK_STORAGE,
.map_flags = BPF_F_NO_PREALLOC,
.max_entries = 0,
.key_size = 4,
.value_size = 8,
.btf_key_type_id = 1,
.btf_value_type_id = 3,
.btf_fd = -1,
};
static unsigned int nr_sk_threads_done;
static unsigned int nr_sk_threads_err;
static unsigned int nr_sk_per_thread = 4096;
static unsigned int nr_sk_threads = 4;
static int sk_storage_map = -1;
static unsigned int stop;
static int runtime_s = 5;
static bool is_stopped(void)
{
return READ_ONCE(stop);
}
static unsigned int threads_err(void)
{
return READ_ONCE(nr_sk_threads_err);
}
static void notify_thread_err(void)
{
__sync_add_and_fetch(&nr_sk_threads_err, 1);
}
static bool wait_for_threads_err(void)
{
while (!is_stopped() && !threads_err())
usleep(500);
return !is_stopped();
}
static unsigned int threads_done(void)
{
return READ_ONCE(nr_sk_threads_done);
}
static void notify_thread_done(void)
{
__sync_add_and_fetch(&nr_sk_threads_done, 1);
}
static void notify_thread_redo(void)
{
__sync_sub_and_fetch(&nr_sk_threads_done, 1);
}
static bool wait_for_threads_done(void)
{
while (threads_done() != nr_sk_threads && !is_stopped() &&
!threads_err())
usleep(50);
return !is_stopped() && !threads_err();
}
static bool wait_for_threads_redo(void)
{
while (threads_done() && !is_stopped() && !threads_err())
usleep(50);
return !is_stopped() && !threads_err();
}
static bool wait_for_map(void)
{
while (READ_ONCE(sk_storage_map) == -1 && !is_stopped())
usleep(50);
return !is_stopped();
}
static bool wait_for_map_close(void)
{
while (READ_ONCE(sk_storage_map) != -1 && !is_stopped())
;
return !is_stopped();
}
static int load_btf(void)
{
const char btf_str_sec[] = "\0bpf_spin_lock\0val\0cnt\0l";
__u32 btf_raw_types[] = {
/* int */
BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
/* struct bpf_spin_lock */ /* [2] */
BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 4),
BTF_MEMBER_ENC(15, 1, 0), /* int val; */
/* struct val */ /* [3] */
BTF_TYPE_ENC(15, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 8),
BTF_MEMBER_ENC(19, 1, 0), /* int cnt; */
BTF_MEMBER_ENC(23, 2, 32),/* struct bpf_spin_lock l; */
};
struct btf_header btf_hdr = {
.magic = BTF_MAGIC,
.version = BTF_VERSION,
.hdr_len = sizeof(struct btf_header),
.type_len = sizeof(btf_raw_types),
.str_off = sizeof(btf_raw_types),
.str_len = sizeof(btf_str_sec),
};
__u8 raw_btf[sizeof(struct btf_header) + sizeof(btf_raw_types) +
sizeof(btf_str_sec)];
memcpy(raw_btf, &btf_hdr, sizeof(btf_hdr));
memcpy(raw_btf + sizeof(btf_hdr), btf_raw_types, sizeof(btf_raw_types));
memcpy(raw_btf + sizeof(btf_hdr) + sizeof(btf_raw_types),
btf_str_sec, sizeof(btf_str_sec));
return bpf_load_btf(raw_btf, sizeof(raw_btf), 0, 0, 0);
}
static int create_sk_storage_map(void)
{
int btf_fd, map_fd;
btf_fd = load_btf();
CHECK(btf_fd == -1, "bpf_load_btf", "btf_fd:%d errno:%d\n",
btf_fd, errno);
xattr.btf_fd = btf_fd;
map_fd = bpf_create_map_xattr(&xattr);
xattr.btf_fd = -1;
close(btf_fd);
CHECK(map_fd == -1,
"bpf_create_map_xattr()", "errno:%d\n", errno);
return map_fd;
}
static void *insert_close_thread(void *arg)
{
struct {
int cnt;
int lock;
} value = { .cnt = 0xeB9F, .lock = 0, };
int i, map_fd, err, *sk_fds;
sk_fds = malloc(sizeof(*sk_fds) * nr_sk_per_thread);
if (!sk_fds) {
notify_thread_err();
return ERR_PTR(-ENOMEM);
}
for (i = 0; i < nr_sk_per_thread; i++)
sk_fds[i] = -1;
while (!is_stopped()) {
if (!wait_for_map())
goto close_all;
map_fd = READ_ONCE(sk_storage_map);
for (i = 0; i < nr_sk_per_thread && !is_stopped(); i++) {
sk_fds[i] = socket(AF_INET6, SOCK_STREAM, 0);
if (sk_fds[i] == -1) {
err = -errno;
fprintf(stderr, "socket(): errno:%d\n", errno);
goto errout;
}
err = bpf_map_update_elem(map_fd, &sk_fds[i], &value,
BPF_NOEXIST);
if (err) {
err = -errno;
fprintf(stderr,
"bpf_map_update_elem(): errno:%d\n",
errno);
goto errout;
}
}
notify_thread_done();
wait_for_map_close();
close_all:
for (i = 0; i < nr_sk_per_thread; i++) {
close(sk_fds[i]);
sk_fds[i] = -1;
}
notify_thread_redo();
}
free(sk_fds);
return NULL;
errout:
for (i = 0; i < nr_sk_per_thread && sk_fds[i] != -1; i++)
close(sk_fds[i]);
free(sk_fds);
notify_thread_err();
return ERR_PTR(err);
}
static int do_sk_storage_map_stress_free(void)
{
int i, map_fd = -1, err = 0, nr_threads_created = 0;
pthread_t *sk_thread_ids;
void *thread_ret;
sk_thread_ids = malloc(sizeof(pthread_t) * nr_sk_threads);
if (!sk_thread_ids) {
fprintf(stderr, "malloc(sk_threads): NULL\n");
return -ENOMEM;
}
for (i = 0; i < nr_sk_threads; i++) {
err = pthread_create(&sk_thread_ids[i], NULL,
insert_close_thread, NULL);
if (err) {
err = -errno;
goto done;
}
nr_threads_created++;
}
while (!is_stopped()) {
map_fd = create_sk_storage_map();
WRITE_ONCE(sk_storage_map, map_fd);
if (!wait_for_threads_done())
break;
WRITE_ONCE(sk_storage_map, -1);
close(map_fd);
map_fd = -1;
if (!wait_for_threads_redo())
break;
}
done:
WRITE_ONCE(stop, 1);
for (i = 0; i < nr_threads_created; i++) {
pthread_join(sk_thread_ids[i], &thread_ret);
if (IS_ERR(thread_ret) && !err) {
err = PTR_ERR(thread_ret);
fprintf(stderr, "threads#%u: err:%d\n", i, err);
}
}
free(sk_thread_ids);
if (map_fd != -1)
close(map_fd);
return err;
}
static void *update_thread(void *arg)
{
struct {
int cnt;
int lock;
} value = { .cnt = 0xeB9F, .lock = 0, };
int map_fd = READ_ONCE(sk_storage_map);
int sk_fd = *(int *)arg;
int err = 0; /* Suppress compiler false alarm */
while (!is_stopped()) {
err = bpf_map_update_elem(map_fd, &sk_fd, &value, 0);
if (err && errno != EAGAIN) {
err = -errno;
fprintf(stderr, "bpf_map_update_elem: %d %d\n",
err, errno);
break;
}
}
if (!is_stopped()) {
notify_thread_err();
return ERR_PTR(err);
}
return NULL;
}
static void *delete_thread(void *arg)
{
int map_fd = READ_ONCE(sk_storage_map);
int sk_fd = *(int *)arg;
int err = 0; /* Suppress compiler false alarm */
while (!is_stopped()) {
err = bpf_map_delete_elem(map_fd, &sk_fd);
if (err && errno != ENOENT) {
err = -errno;
fprintf(stderr, "bpf_map_delete_elem: %d %d\n",
err, errno);
break;
}
}
if (!is_stopped()) {
notify_thread_err();
return ERR_PTR(err);
}
return NULL;
}
static int do_sk_storage_map_stress_change(void)
{
int i, sk_fd, map_fd = -1, err = 0, nr_threads_created = 0;
pthread_t *sk_thread_ids;
void *thread_ret;
sk_thread_ids = malloc(sizeof(pthread_t) * nr_sk_threads);
if (!sk_thread_ids) {
fprintf(stderr, "malloc(sk_threads): NULL\n");
return -ENOMEM;
}
sk_fd = socket(AF_INET6, SOCK_STREAM, 0);
if (sk_fd == -1) {
err = -errno;
goto done;
}
map_fd = create_sk_storage_map();
WRITE_ONCE(sk_storage_map, map_fd);
for (i = 0; i < nr_sk_threads; i++) {
if (i & 0x1)
err = pthread_create(&sk_thread_ids[i], NULL,
update_thread, &sk_fd);
else
err = pthread_create(&sk_thread_ids[i], NULL,
delete_thread, &sk_fd);
if (err) {
err = -errno;
goto done;
}
nr_threads_created++;
}
wait_for_threads_err();
done:
WRITE_ONCE(stop, 1);
for (i = 0; i < nr_threads_created; i++) {
pthread_join(sk_thread_ids[i], &thread_ret);
if (IS_ERR(thread_ret) && !err) {
err = PTR_ERR(thread_ret);
fprintf(stderr, "threads#%u: err:%d\n", i, err);
}
}
free(sk_thread_ids);
if (sk_fd != -1)
close(sk_fd);
close(map_fd);
return err;
}
static void stop_handler(int signum)
{
if (signum != SIGALRM)
printf("stopping...\n");
WRITE_ONCE(stop, 1);
}
#define BPF_SK_STORAGE_MAP_TEST_NR_THREADS "BPF_SK_STORAGE_MAP_TEST_NR_THREADS"
#define BPF_SK_STORAGE_MAP_TEST_SK_PER_THREAD "BPF_SK_STORAGE_MAP_TEST_SK_PER_THREAD"
#define BPF_SK_STORAGE_MAP_TEST_RUNTIME_S "BPF_SK_STORAGE_MAP_TEST_RUNTIME_S"
#define BPF_SK_STORAGE_MAP_TEST_NAME "BPF_SK_STORAGE_MAP_TEST_NAME"
static void test_sk_storage_map_stress_free(void)
{
struct rlimit rlim_old, rlim_new = {};
int err;
getrlimit(RLIMIT_NOFILE, &rlim_old);
signal(SIGTERM, stop_handler);
signal(SIGINT, stop_handler);
if (runtime_s > 0) {
signal(SIGALRM, stop_handler);
alarm(runtime_s);
}
if (rlim_old.rlim_cur < nr_sk_threads * nr_sk_per_thread) {
rlim_new.rlim_cur = nr_sk_threads * nr_sk_per_thread + 128;
rlim_new.rlim_max = rlim_new.rlim_cur + 128;
err = setrlimit(RLIMIT_NOFILE, &rlim_new);
CHECK(err, "setrlimit(RLIMIT_NOFILE)", "rlim_new:%lu errno:%d",
rlim_new.rlim_cur, errno);
}
err = do_sk_storage_map_stress_free();
signal(SIGTERM, SIG_DFL);
signal(SIGINT, SIG_DFL);
if (runtime_s > 0) {
signal(SIGALRM, SIG_DFL);
alarm(0);
}
if (rlim_new.rlim_cur)
setrlimit(RLIMIT_NOFILE, &rlim_old);
CHECK(err, "test_sk_storage_map_stress_free", "err:%d\n", err);
}
static void test_sk_storage_map_stress_change(void)
{
int err;
signal(SIGTERM, stop_handler);
signal(SIGINT, stop_handler);
if (runtime_s > 0) {
signal(SIGALRM, stop_handler);
alarm(runtime_s);
}
err = do_sk_storage_map_stress_change();
signal(SIGTERM, SIG_DFL);
signal(SIGINT, SIG_DFL);
if (runtime_s > 0) {
signal(SIGALRM, SIG_DFL);
alarm(0);
}
CHECK(err, "test_sk_storage_map_stress_change", "err:%d\n", err);
}
static void test_sk_storage_map_basic(void)
{
struct {
int cnt;
int lock;
} value = { .cnt = 0xeB9f, .lock = 0, }, lookup_value;
struct bpf_create_map_attr bad_xattr;
int btf_fd, map_fd, sk_fd, err;
btf_fd = load_btf();
CHECK(btf_fd == -1, "bpf_load_btf", "btf_fd:%d errno:%d\n",
btf_fd, errno);
xattr.btf_fd = btf_fd;
sk_fd = socket(AF_INET6, SOCK_STREAM, 0);
CHECK(sk_fd == -1, "socket()", "sk_fd:%d errno:%d\n",
sk_fd, errno);
map_fd = bpf_create_map_xattr(&xattr);
CHECK(map_fd == -1, "bpf_create_map_xattr(good_xattr)",
"map_fd:%d errno:%d\n", map_fd, errno);
/* Add new elem */
memcpy(&lookup_value, &value, sizeof(value));
err = bpf_map_update_elem(map_fd, &sk_fd, &value,
BPF_NOEXIST | BPF_F_LOCK);
CHECK(err, "bpf_map_update_elem(BPF_NOEXIST|BPF_F_LOCK)",
"err:%d errno:%d\n", err, errno);
err = bpf_map_lookup_elem_flags(map_fd, &sk_fd, &lookup_value,
BPF_F_LOCK);
CHECK(err || lookup_value.cnt != value.cnt,
"bpf_map_lookup_elem_flags(BPF_F_LOCK)",
"err:%d errno:%d cnt:%x(%x)\n",
err, errno, lookup_value.cnt, value.cnt);
/* Bump the cnt and update with BPF_EXIST | BPF_F_LOCK */
value.cnt += 1;
err = bpf_map_update_elem(map_fd, &sk_fd, &value,
BPF_EXIST | BPF_F_LOCK);
CHECK(err, "bpf_map_update_elem(BPF_EXIST|BPF_F_LOCK)",
"err:%d errno:%d\n", err, errno);
err = bpf_map_lookup_elem_flags(map_fd, &sk_fd, &lookup_value,
BPF_F_LOCK);
CHECK(err || lookup_value.cnt != value.cnt,
"bpf_map_lookup_elem_flags(BPF_F_LOCK)",
"err:%d errno:%d cnt:%x(%x)\n",
err, errno, lookup_value.cnt, value.cnt);
/* Bump the cnt and update with BPF_EXIST */
value.cnt += 1;
err = bpf_map_update_elem(map_fd, &sk_fd, &value, BPF_EXIST);
CHECK(err, "bpf_map_update_elem(BPF_EXIST)",
"err:%d errno:%d\n", err, errno);
err = bpf_map_lookup_elem_flags(map_fd, &sk_fd, &lookup_value,
BPF_F_LOCK);
CHECK(err || lookup_value.cnt != value.cnt,
"bpf_map_lookup_elem_flags(BPF_F_LOCK)",
"err:%d errno:%d cnt:%x(%x)\n",
err, errno, lookup_value.cnt, value.cnt);
/* Update with BPF_NOEXIST */
value.cnt += 1;
err = bpf_map_update_elem(map_fd, &sk_fd, &value,
BPF_NOEXIST | BPF_F_LOCK);
CHECK(!err || errno != EEXIST,
"bpf_map_update_elem(BPF_NOEXIST|BPF_F_LOCK)",
"err:%d errno:%d\n", err, errno);
err = bpf_map_update_elem(map_fd, &sk_fd, &value, BPF_NOEXIST);
CHECK(!err || errno != EEXIST, "bpf_map_update_elem(BPF_NOEXIST)",
"err:%d errno:%d\n", err, errno);
value.cnt -= 1;
err = bpf_map_lookup_elem_flags(map_fd, &sk_fd, &lookup_value,
BPF_F_LOCK);
CHECK(err || lookup_value.cnt != value.cnt,
"bpf_map_lookup_elem_flags(BPF_F_LOCK)",
"err:%d errno:%d cnt:%x(%x)\n",
err, errno, lookup_value.cnt, value.cnt);
/* Bump the cnt again and update with map_flags == 0 */
value.cnt += 1;
err = bpf_map_update_elem(map_fd, &sk_fd, &value, 0);
CHECK(err, "bpf_map_update_elem()", "err:%d errno:%d\n",
err, errno);
err = bpf_map_lookup_elem_flags(map_fd, &sk_fd, &lookup_value,
BPF_F_LOCK);
CHECK(err || lookup_value.cnt != value.cnt,
"bpf_map_lookup_elem_flags(BPF_F_LOCK)",
"err:%d errno:%d cnt:%x(%x)\n",
err, errno, lookup_value.cnt, value.cnt);
/* Test delete elem */
err = bpf_map_delete_elem(map_fd, &sk_fd);
CHECK(err, "bpf_map_delete_elem()", "err:%d errno:%d\n",
err, errno);
err = bpf_map_lookup_elem_flags(map_fd, &sk_fd, &lookup_value,
BPF_F_LOCK);
CHECK(!err || errno != ENOENT,
"bpf_map_lookup_elem_flags(BPF_F_LOCK)",
"err:%d errno:%d\n", err, errno);
err = bpf_map_delete_elem(map_fd, &sk_fd);
CHECK(!err || errno != ENOENT, "bpf_map_delete_elem()",
"err:%d errno:%d\n", err, errno);
memcpy(&bad_xattr, &xattr, sizeof(xattr));
bad_xattr.btf_key_type_id = 0;
err = bpf_create_map_xattr(&bad_xattr);
CHECK(!err || errno != EINVAL, "bap_create_map_xattr(bad_xattr)",
"err:%d errno:%d\n", err, errno);
memcpy(&bad_xattr, &xattr, sizeof(xattr));
bad_xattr.btf_key_type_id = 3;
err = bpf_create_map_xattr(&bad_xattr);
CHECK(!err || errno != EINVAL, "bap_create_map_xattr(bad_xattr)",
"err:%d errno:%d\n", err, errno);
memcpy(&bad_xattr, &xattr, sizeof(xattr));
bad_xattr.max_entries = 1;
err = bpf_create_map_xattr(&bad_xattr);
CHECK(!err || errno != EINVAL, "bap_create_map_xattr(bad_xattr)",
"err:%d errno:%d\n", err, errno);
memcpy(&bad_xattr, &xattr, sizeof(xattr));
bad_xattr.map_flags = 0;
err = bpf_create_map_xattr(&bad_xattr);
CHECK(!err || errno != EINVAL, "bap_create_map_xattr(bad_xattr)",
"err:%d errno:%d\n", err, errno);
xattr.btf_fd = -1;
close(btf_fd);
close(map_fd);
close(sk_fd);
}
void test_sk_storage_map(void)
{
const char *test_name, *env_opt;
bool test_ran = false;
test_name = getenv(BPF_SK_STORAGE_MAP_TEST_NAME);
env_opt = getenv(BPF_SK_STORAGE_MAP_TEST_NR_THREADS);
if (env_opt)
nr_sk_threads = atoi(env_opt);
env_opt = getenv(BPF_SK_STORAGE_MAP_TEST_SK_PER_THREAD);
if (env_opt)
nr_sk_per_thread = atoi(env_opt);
env_opt = getenv(BPF_SK_STORAGE_MAP_TEST_RUNTIME_S);
if (env_opt)
runtime_s = atoi(env_opt);
if (!test_name || !strcmp(test_name, "basic")) {
test_sk_storage_map_basic();
test_ran = true;
}
if (!test_name || !strcmp(test_name, "stress_free")) {
test_sk_storage_map_stress_free();
test_ran = true;
}
if (!test_name || !strcmp(test_name, "stress_change")) {
test_sk_storage_map_stress_change();
test_ran = true;
}
if (test_ran)
printf("%s:PASS\n", __func__);
else
CHECK(1, "Invalid test_name", "%s\n", test_name);
}

View File

@ -1,5 +1,8 @@
// SPDX-License-Identifier: GPL-2.0 // SPDX-License-Identifier: GPL-2.0
#include <test_progs.h> #include <test_progs.h>
#include <error.h>
#include <linux/if.h>
#include <linux/if_tun.h>
#define CHECK_FLOW_KEYS(desc, got, expected) \ #define CHECK_FLOW_KEYS(desc, got, expected) \
CHECK_ATTR(memcmp(&got, &expected, sizeof(got)) != 0, \ CHECK_ATTR(memcmp(&got, &expected, sizeof(got)) != 0, \
@ -79,8 +82,8 @@ struct test tests[] = {
.tcp.doff = 5, .tcp.doff = 5,
}, },
.keys = { .keys = {
.nhoff = 0, .nhoff = ETH_HLEN,
.thoff = sizeof(struct iphdr), .thoff = ETH_HLEN + sizeof(struct iphdr),
.addr_proto = ETH_P_IP, .addr_proto = ETH_P_IP,
.ip_proto = IPPROTO_TCP, .ip_proto = IPPROTO_TCP,
.n_proto = __bpf_constant_htons(ETH_P_IP), .n_proto = __bpf_constant_htons(ETH_P_IP),
@ -95,8 +98,8 @@ struct test tests[] = {
.tcp.doff = 5, .tcp.doff = 5,
}, },
.keys = { .keys = {
.nhoff = 0, .nhoff = ETH_HLEN,
.thoff = sizeof(struct ipv6hdr), .thoff = ETH_HLEN + sizeof(struct ipv6hdr),
.addr_proto = ETH_P_IPV6, .addr_proto = ETH_P_IPV6,
.ip_proto = IPPROTO_TCP, .ip_proto = IPPROTO_TCP,
.n_proto = __bpf_constant_htons(ETH_P_IPV6), .n_proto = __bpf_constant_htons(ETH_P_IPV6),
@ -113,8 +116,8 @@ struct test tests[] = {
.tcp.doff = 5, .tcp.doff = 5,
}, },
.keys = { .keys = {
.nhoff = VLAN_HLEN, .nhoff = ETH_HLEN + VLAN_HLEN,
.thoff = VLAN_HLEN + sizeof(struct iphdr), .thoff = ETH_HLEN + VLAN_HLEN + sizeof(struct iphdr),
.addr_proto = ETH_P_IP, .addr_proto = ETH_P_IP,
.ip_proto = IPPROTO_TCP, .ip_proto = IPPROTO_TCP,
.n_proto = __bpf_constant_htons(ETH_P_IP), .n_proto = __bpf_constant_htons(ETH_P_IP),
@ -131,8 +134,9 @@ struct test tests[] = {
.tcp.doff = 5, .tcp.doff = 5,
}, },
.keys = { .keys = {
.nhoff = VLAN_HLEN * 2, .nhoff = ETH_HLEN + VLAN_HLEN * 2,
.thoff = VLAN_HLEN * 2 + sizeof(struct ipv6hdr), .thoff = ETH_HLEN + VLAN_HLEN * 2 +
sizeof(struct ipv6hdr),
.addr_proto = ETH_P_IPV6, .addr_proto = ETH_P_IPV6,
.ip_proto = IPPROTO_TCP, .ip_proto = IPPROTO_TCP,
.n_proto = __bpf_constant_htons(ETH_P_IPV6), .n_proto = __bpf_constant_htons(ETH_P_IPV6),
@ -140,13 +144,73 @@ struct test tests[] = {
}, },
}; };
static int create_tap(const char *ifname)
{
struct ifreq ifr = {
.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_NAPI | IFF_NAPI_FRAGS,
};
int fd, ret;
strncpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
fd = open("/dev/net/tun", O_RDWR);
if (fd < 0)
return -1;
ret = ioctl(fd, TUNSETIFF, &ifr);
if (ret)
return -1;
return fd;
}
static int tx_tap(int fd, void *pkt, size_t len)
{
struct iovec iov[] = {
{
.iov_len = len,
.iov_base = pkt,
},
};
return writev(fd, iov, ARRAY_SIZE(iov));
}
static int ifup(const char *ifname)
{
struct ifreq ifr = {};
int sk, ret;
strncpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
sk = socket(PF_INET, SOCK_DGRAM, 0);
if (sk < 0)
return -1;
ret = ioctl(sk, SIOCGIFFLAGS, &ifr);
if (ret) {
close(sk);
return -1;
}
ifr.ifr_flags |= IFF_UP;
ret = ioctl(sk, SIOCSIFFLAGS, &ifr);
if (ret) {
close(sk);
return -1;
}
close(sk);
return 0;
}
void test_flow_dissector(void) void test_flow_dissector(void)
{ {
int i, err, prog_fd, keys_fd = -1, tap_fd;
struct bpf_object *obj; struct bpf_object *obj;
int i, err, prog_fd; __u32 duration = 0;
err = bpf_flow_load(&obj, "./bpf_flow.o", "flow_dissector", err = bpf_flow_load(&obj, "./bpf_flow.o", "flow_dissector",
"jmp_table", &prog_fd); "jmp_table", "last_dissection", &prog_fd, &keys_fd);
if (err) { if (err) {
error_cnt++; error_cnt++;
return; return;
@ -171,5 +235,34 @@ void test_flow_dissector(void)
CHECK_FLOW_KEYS(tests[i].name, flow_keys, tests[i].keys); CHECK_FLOW_KEYS(tests[i].name, flow_keys, tests[i].keys);
} }
/* Do the same tests but for skb-less flow dissector.
* We use a known path in the net/tun driver that calls
* eth_get_headlen and we manually export bpf_flow_keys
* via BPF map in this case.
*/
err = bpf_prog_attach(prog_fd, 0, BPF_FLOW_DISSECTOR, 0);
CHECK(err, "bpf_prog_attach", "err %d errno %d", err, errno);
tap_fd = create_tap("tap0");
CHECK(tap_fd < 0, "create_tap", "tap_fd %d errno %d", tap_fd, errno);
err = ifup("tap0");
CHECK(err, "ifup", "err %d errno %d", err, errno);
for (i = 0; i < ARRAY_SIZE(tests); i++) {
struct bpf_flow_keys flow_keys = {};
struct bpf_prog_test_run_attr tattr = {};
__u32 key = 0;
err = tx_tap(tap_fd, &tests[i].pkt, sizeof(tests[i].pkt));
CHECK(err < 0, "tx_tap", "err %d errno %d", err, errno);
err = bpf_map_lookup_elem(keys_fd, &key, &flow_keys);
CHECK_ATTR(err, tests[i].name, "bpf_map_lookup_elem %d\n", err);
CHECK_ATTR(err, tests[i].name, "skb-less err %d\n", err);
CHECK_FLOW_KEYS(tests[i].name, flow_keys, tests[i].keys);
}
bpf_object__close(obj); bpf_object__close(obj);
} }

View File

@ -0,0 +1,48 @@
// SPDX-License-Identifier: GPL-2.0
#include <test_progs.h>
void test_flow_dissector_load_bytes(void)
{
struct bpf_flow_keys flow_keys;
__u32 duration = 0, retval, size;
struct bpf_insn prog[] = {
// BPF_REG_1 - 1st argument: context
// BPF_REG_2 - 2nd argument: offset, start at first byte
BPF_MOV64_IMM(BPF_REG_2, 0),
// BPF_REG_3 - 3rd argument: destination, reserve byte on stack
BPF_ALU64_REG(BPF_MOV, BPF_REG_3, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -1),
// BPF_REG_4 - 4th argument: copy one byte
BPF_MOV64_IMM(BPF_REG_4, 1),
// bpf_skb_load_bytes(ctx, sizeof(pkt_v4), ptr, 1)
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
BPF_FUNC_skb_load_bytes),
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
// if (ret == 0) return BPF_DROP (2)
BPF_MOV64_IMM(BPF_REG_0, BPF_DROP),
BPF_EXIT_INSN(),
// if (ret != 0) return BPF_OK (0)
BPF_MOV64_IMM(BPF_REG_0, BPF_OK),
BPF_EXIT_INSN(),
};
int fd, err;
/* make sure bpf_skb_load_bytes is not allowed from skb-less context
*/
fd = bpf_load_program(BPF_PROG_TYPE_FLOW_DISSECTOR, prog,
ARRAY_SIZE(prog), "GPL", 0, NULL, 0);
CHECK(fd < 0,
"flow_dissector-bpf_skb_load_bytes-load",
"fd %d errno %d\n",
fd, errno);
err = bpf_prog_test_run(fd, 1, &pkt_v4, sizeof(pkt_v4),
&flow_keys, &size, &retval, &duration);
CHECK(size != sizeof(flow_keys) || err || retval != 1,
"flow_dissector-bpf_skb_load_bytes",
"err %d errno %d retval %d duration %d size %u/%zu\n",
err, errno, retval, duration, size, sizeof(flow_keys));
if (fd >= -1)
close(fd);
}

View File

@ -0,0 +1,42 @@
// SPDX-License-Identifier: GPL-2.0
#include <test_progs.h>
#include <linux/nbd.h>
void test_raw_tp_writable_reject_nbd_invalid(void)
{
__u32 duration = 0;
char error[4096];
int bpf_fd = -1, tp_fd = -1;
const struct bpf_insn program[] = {
/* r6 is our tp buffer */
BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0),
/* one byte beyond the end of the nbd_request struct */
BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_6,
sizeof(struct nbd_request)),
BPF_EXIT_INSN(),
};
struct bpf_load_program_attr load_attr = {
.prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
.license = "GPL v2",
.insns = program,
.insns_cnt = sizeof(program) / sizeof(struct bpf_insn),
.log_level = 2,
};
bpf_fd = bpf_load_program_xattr(&load_attr, error, sizeof(error));
if (CHECK(bpf_fd < 0, "bpf_raw_tracepoint_writable load",
"failed: %d errno %d\n", bpf_fd, errno))
return;
tp_fd = bpf_raw_tracepoint_open("nbd_send_request", bpf_fd);
if (CHECK(tp_fd >= 0, "bpf_raw_tracepoint_writable open",
"erroneously succeeded\n"))
goto out_bpffd;
close(tp_fd);
out_bpffd:
close(bpf_fd);
}

View File

@ -0,0 +1,80 @@
// SPDX-License-Identifier: GPL-2.0
#include <test_progs.h>
#include <linux/nbd.h>
void test_raw_tp_writable_test_run(void)
{
__u32 duration = 0;
char error[4096];
const struct bpf_insn trace_program[] = {
BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0),
BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, 0),
BPF_MOV64_IMM(BPF_REG_0, 42),
BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_0, 0),
BPF_EXIT_INSN(),
};
struct bpf_load_program_attr load_attr = {
.prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
.license = "GPL v2",
.insns = trace_program,
.insns_cnt = sizeof(trace_program) / sizeof(struct bpf_insn),
.log_level = 2,
};
int bpf_fd = bpf_load_program_xattr(&load_attr, error, sizeof(error));
if (CHECK(bpf_fd < 0, "bpf_raw_tracepoint_writable loaded",
"failed: %d errno %d\n", bpf_fd, errno))
return;
const struct bpf_insn skb_program[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
};
struct bpf_load_program_attr skb_load_attr = {
.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
.license = "GPL v2",
.insns = skb_program,
.insns_cnt = sizeof(skb_program) / sizeof(struct bpf_insn),
};
int filter_fd =
bpf_load_program_xattr(&skb_load_attr, error, sizeof(error));
if (CHECK(filter_fd < 0, "test_program_loaded", "failed: %d errno %d\n",
filter_fd, errno))
goto out_bpffd;
int tp_fd = bpf_raw_tracepoint_open("bpf_test_finish", bpf_fd);
if (CHECK(tp_fd < 0, "bpf_raw_tracepoint_writable opened",
"failed: %d errno %d\n", tp_fd, errno))
goto out_filterfd;
char test_skb[128] = {
0,
};
__u32 prog_ret;
int err = bpf_prog_test_run(filter_fd, 1, test_skb, sizeof(test_skb), 0,
0, &prog_ret, 0);
CHECK(err != 42, "test_run",
"tracepoint did not modify return value\n");
CHECK(prog_ret != 0, "test_run_ret",
"socket_filter did not return 0\n");
close(tp_fd);
err = bpf_prog_test_run(filter_fd, 1, test_skb, sizeof(test_skb), 0, 0,
&prog_ret, 0);
CHECK(err != 0, "test_run_notrace",
"test_run failed with %d errno %d\n", err, errno);
CHECK(prog_ret != 0, "test_run_ret_notrace",
"socket_filter did not return 0\n");
out_filterfd:
close(filter_fd);
out_bpffd:
close(bpf_fd);
}

View File

@ -64,6 +64,25 @@ struct bpf_map_def SEC("maps") jmp_table = {
.max_entries = 8 .max_entries = 8
}; };
struct bpf_map_def SEC("maps") last_dissection = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(struct bpf_flow_keys),
.max_entries = 1,
};
static __always_inline int export_flow_keys(struct bpf_flow_keys *keys,
int ret)
{
struct bpf_flow_keys *val;
__u32 key = 0;
val = bpf_map_lookup_elem(&last_dissection, &key);
if (val)
memcpy(val, keys, sizeof(*val));
return ret;
}
static __always_inline void *bpf_flow_dissect_get_header(struct __sk_buff *skb, static __always_inline void *bpf_flow_dissect_get_header(struct __sk_buff *skb,
__u16 hdr_size, __u16 hdr_size,
void *buffer) void *buffer)
@ -109,10 +128,10 @@ static __always_inline int parse_eth_proto(struct __sk_buff *skb, __be16 proto)
break; break;
default: default:
/* Protocol not supported */ /* Protocol not supported */
return BPF_DROP; return export_flow_keys(keys, BPF_DROP);
} }
return BPF_DROP; return export_flow_keys(keys, BPF_DROP);
} }
SEC("flow_dissector") SEC("flow_dissector")
@ -139,8 +158,8 @@ static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
case IPPROTO_ICMP: case IPPROTO_ICMP:
icmp = bpf_flow_dissect_get_header(skb, sizeof(*icmp), &_icmp); icmp = bpf_flow_dissect_get_header(skb, sizeof(*icmp), &_icmp);
if (!icmp) if (!icmp)
return BPF_DROP; return export_flow_keys(keys, BPF_DROP);
return BPF_OK; return export_flow_keys(keys, BPF_OK);
case IPPROTO_IPIP: case IPPROTO_IPIP:
keys->is_encap = true; keys->is_encap = true;
return parse_eth_proto(skb, bpf_htons(ETH_P_IP)); return parse_eth_proto(skb, bpf_htons(ETH_P_IP));
@ -150,11 +169,11 @@ static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
case IPPROTO_GRE: case IPPROTO_GRE:
gre = bpf_flow_dissect_get_header(skb, sizeof(*gre), &_gre); gre = bpf_flow_dissect_get_header(skb, sizeof(*gre), &_gre);
if (!gre) if (!gre)
return BPF_DROP; return export_flow_keys(keys, BPF_DROP);
if (bpf_htons(gre->flags & GRE_VERSION)) if (bpf_htons(gre->flags & GRE_VERSION))
/* Only inspect standard GRE packets with version 0 */ /* Only inspect standard GRE packets with version 0 */
return BPF_OK; return export_flow_keys(keys, BPF_OK);
keys->thoff += sizeof(*gre); /* Step over GRE Flags and Proto */ keys->thoff += sizeof(*gre); /* Step over GRE Flags and Proto */
if (GRE_IS_CSUM(gre->flags)) if (GRE_IS_CSUM(gre->flags))
@ -170,7 +189,7 @@ static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
eth = bpf_flow_dissect_get_header(skb, sizeof(*eth), eth = bpf_flow_dissect_get_header(skb, sizeof(*eth),
&_eth); &_eth);
if (!eth) if (!eth)
return BPF_DROP; return export_flow_keys(keys, BPF_DROP);
keys->thoff += sizeof(*eth); keys->thoff += sizeof(*eth);
@ -181,31 +200,31 @@ static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
case IPPROTO_TCP: case IPPROTO_TCP:
tcp = bpf_flow_dissect_get_header(skb, sizeof(*tcp), &_tcp); tcp = bpf_flow_dissect_get_header(skb, sizeof(*tcp), &_tcp);
if (!tcp) if (!tcp)
return BPF_DROP; return export_flow_keys(keys, BPF_DROP);
if (tcp->doff < 5) if (tcp->doff < 5)
return BPF_DROP; return export_flow_keys(keys, BPF_DROP);
if ((__u8 *)tcp + (tcp->doff << 2) > data_end) if ((__u8 *)tcp + (tcp->doff << 2) > data_end)
return BPF_DROP; return export_flow_keys(keys, BPF_DROP);
keys->sport = tcp->source; keys->sport = tcp->source;
keys->dport = tcp->dest; keys->dport = tcp->dest;
return BPF_OK; return export_flow_keys(keys, BPF_OK);
case IPPROTO_UDP: case IPPROTO_UDP:
case IPPROTO_UDPLITE: case IPPROTO_UDPLITE:
udp = bpf_flow_dissect_get_header(skb, sizeof(*udp), &_udp); udp = bpf_flow_dissect_get_header(skb, sizeof(*udp), &_udp);
if (!udp) if (!udp)
return BPF_DROP; return export_flow_keys(keys, BPF_DROP);
keys->sport = udp->source; keys->sport = udp->source;
keys->dport = udp->dest; keys->dport = udp->dest;
return BPF_OK; return export_flow_keys(keys, BPF_OK);
default: default:
return BPF_DROP; return export_flow_keys(keys, BPF_DROP);
} }
return BPF_DROP; return export_flow_keys(keys, BPF_DROP);
} }
static __always_inline int parse_ipv6_proto(struct __sk_buff *skb, __u8 nexthdr) static __always_inline int parse_ipv6_proto(struct __sk_buff *skb, __u8 nexthdr)
@ -225,7 +244,7 @@ static __always_inline int parse_ipv6_proto(struct __sk_buff *skb, __u8 nexthdr)
return parse_ip_proto(skb, nexthdr); return parse_ip_proto(skb, nexthdr);
} }
return BPF_DROP; return export_flow_keys(keys, BPF_DROP);
} }
PROG(IP)(struct __sk_buff *skb) PROG(IP)(struct __sk_buff *skb)
@ -238,11 +257,11 @@ PROG(IP)(struct __sk_buff *skb)
iph = bpf_flow_dissect_get_header(skb, sizeof(*iph), &_iph); iph = bpf_flow_dissect_get_header(skb, sizeof(*iph), &_iph);
if (!iph) if (!iph)
return BPF_DROP; return export_flow_keys(keys, BPF_DROP);
/* IP header cannot be smaller than 20 bytes */ /* IP header cannot be smaller than 20 bytes */
if (iph->ihl < 5) if (iph->ihl < 5)
return BPF_DROP; return export_flow_keys(keys, BPF_DROP);
keys->addr_proto = ETH_P_IP; keys->addr_proto = ETH_P_IP;
keys->ipv4_src = iph->saddr; keys->ipv4_src = iph->saddr;
@ -250,7 +269,7 @@ PROG(IP)(struct __sk_buff *skb)
keys->thoff += iph->ihl << 2; keys->thoff += iph->ihl << 2;
if (data + keys->thoff > data_end) if (data + keys->thoff > data_end)
return BPF_DROP; return export_flow_keys(keys, BPF_DROP);
if (iph->frag_off & bpf_htons(IP_MF | IP_OFFSET)) { if (iph->frag_off & bpf_htons(IP_MF | IP_OFFSET)) {
keys->is_frag = true; keys->is_frag = true;
@ -264,7 +283,7 @@ PROG(IP)(struct __sk_buff *skb)
} }
if (done) if (done)
return BPF_OK; return export_flow_keys(keys, BPF_OK);
return parse_ip_proto(skb, iph->protocol); return parse_ip_proto(skb, iph->protocol);
} }
@ -276,7 +295,7 @@ PROG(IPV6)(struct __sk_buff *skb)
ip6h = bpf_flow_dissect_get_header(skb, sizeof(*ip6h), &_ip6h); ip6h = bpf_flow_dissect_get_header(skb, sizeof(*ip6h), &_ip6h);
if (!ip6h) if (!ip6h)
return BPF_DROP; return export_flow_keys(keys, BPF_DROP);
keys->addr_proto = ETH_P_IPV6; keys->addr_proto = ETH_P_IPV6;
memcpy(&keys->ipv6_src, &ip6h->saddr, 2*sizeof(ip6h->saddr)); memcpy(&keys->ipv6_src, &ip6h->saddr, 2*sizeof(ip6h->saddr));
@ -288,11 +307,12 @@ PROG(IPV6)(struct __sk_buff *skb)
PROG(IPV6OP)(struct __sk_buff *skb) PROG(IPV6OP)(struct __sk_buff *skb)
{ {
struct bpf_flow_keys *keys = skb->flow_keys;
struct ipv6_opt_hdr *ip6h, _ip6h; struct ipv6_opt_hdr *ip6h, _ip6h;
ip6h = bpf_flow_dissect_get_header(skb, sizeof(*ip6h), &_ip6h); ip6h = bpf_flow_dissect_get_header(skb, sizeof(*ip6h), &_ip6h);
if (!ip6h) if (!ip6h)
return BPF_DROP; return export_flow_keys(keys, BPF_DROP);
/* hlen is in 8-octets and does not include the first 8 bytes /* hlen is in 8-octets and does not include the first 8 bytes
* of the header * of the header
@ -309,7 +329,7 @@ PROG(IPV6FR)(struct __sk_buff *skb)
fragh = bpf_flow_dissect_get_header(skb, sizeof(*fragh), &_fragh); fragh = bpf_flow_dissect_get_header(skb, sizeof(*fragh), &_fragh);
if (!fragh) if (!fragh)
return BPF_DROP; return export_flow_keys(keys, BPF_DROP);
keys->thoff += sizeof(*fragh); keys->thoff += sizeof(*fragh);
keys->is_frag = true; keys->is_frag = true;
@ -321,13 +341,14 @@ PROG(IPV6FR)(struct __sk_buff *skb)
PROG(MPLS)(struct __sk_buff *skb) PROG(MPLS)(struct __sk_buff *skb)
{ {
struct bpf_flow_keys *keys = skb->flow_keys;
struct mpls_label *mpls, _mpls; struct mpls_label *mpls, _mpls;
mpls = bpf_flow_dissect_get_header(skb, sizeof(*mpls), &_mpls); mpls = bpf_flow_dissect_get_header(skb, sizeof(*mpls), &_mpls);
if (!mpls) if (!mpls)
return BPF_DROP; return export_flow_keys(keys, BPF_DROP);
return BPF_OK; return export_flow_keys(keys, BPF_OK);
} }
PROG(VLAN)(struct __sk_buff *skb) PROG(VLAN)(struct __sk_buff *skb)
@ -339,10 +360,10 @@ PROG(VLAN)(struct __sk_buff *skb)
if (keys->n_proto == bpf_htons(ETH_P_8021AD)) { if (keys->n_proto == bpf_htons(ETH_P_8021AD)) {
vlan = bpf_flow_dissect_get_header(skb, sizeof(*vlan), &_vlan); vlan = bpf_flow_dissect_get_header(skb, sizeof(*vlan), &_vlan);
if (!vlan) if (!vlan)
return BPF_DROP; return export_flow_keys(keys, BPF_DROP);
if (vlan->h_vlan_encapsulated_proto != bpf_htons(ETH_P_8021Q)) if (vlan->h_vlan_encapsulated_proto != bpf_htons(ETH_P_8021Q))
return BPF_DROP; return export_flow_keys(keys, BPF_DROP);
keys->nhoff += sizeof(*vlan); keys->nhoff += sizeof(*vlan);
keys->thoff += sizeof(*vlan); keys->thoff += sizeof(*vlan);
@ -350,14 +371,14 @@ PROG(VLAN)(struct __sk_buff *skb)
vlan = bpf_flow_dissect_get_header(skb, sizeof(*vlan), &_vlan); vlan = bpf_flow_dissect_get_header(skb, sizeof(*vlan), &_vlan);
if (!vlan) if (!vlan)
return BPF_DROP; return export_flow_keys(keys, BPF_DROP);
keys->nhoff += sizeof(*vlan); keys->nhoff += sizeof(*vlan);
keys->thoff += sizeof(*vlan); keys->thoff += sizeof(*vlan);
/* Only allow 8021AD + 8021Q double tagging and no triple tagging.*/ /* Only allow 8021AD + 8021Q double tagging and no triple tagging.*/
if (vlan->h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021AD) || if (vlan->h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021AD) ||
vlan->h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021Q)) vlan->h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021Q))
return BPF_DROP; return export_flow_keys(keys, BPF_DROP);
keys->n_proto = vlan->h_vlan_encapsulated_proto; keys->n_proto = vlan->h_vlan_encapsulated_proto;
return parse_eth_proto(skb, vlan->h_vlan_encapsulated_proto); return parse_eth_proto(skb, vlan->h_vlan_encapsulated_proto);

View File

@ -55,6 +55,31 @@ struct bpf_map_def SEC("maps") linum_map = {
.max_entries = __NR_BPF_LINUM_ARRAY_IDX, .max_entries = __NR_BPF_LINUM_ARRAY_IDX,
}; };
struct bpf_spinlock_cnt {
struct bpf_spin_lock lock;
__u32 cnt;
};
struct bpf_map_def SEC("maps") sk_pkt_out_cnt = {
.type = BPF_MAP_TYPE_SK_STORAGE,
.key_size = sizeof(int),
.value_size = sizeof(struct bpf_spinlock_cnt),
.max_entries = 0,
.map_flags = BPF_F_NO_PREALLOC,
};
BPF_ANNOTATE_KV_PAIR(sk_pkt_out_cnt, int, struct bpf_spinlock_cnt);
struct bpf_map_def SEC("maps") sk_pkt_out_cnt10 = {
.type = BPF_MAP_TYPE_SK_STORAGE,
.key_size = sizeof(int),
.value_size = sizeof(struct bpf_spinlock_cnt),
.max_entries = 0,
.map_flags = BPF_F_NO_PREALLOC,
};
BPF_ANNOTATE_KV_PAIR(sk_pkt_out_cnt10, int, struct bpf_spinlock_cnt);
static bool is_loopback6(__u32 *a6) static bool is_loopback6(__u32 *a6)
{ {
return !a6[0] && !a6[1] && !a6[2] && a6[3] == bpf_htonl(1); return !a6[0] && !a6[1] && !a6[2] && a6[3] == bpf_htonl(1);
@ -120,7 +145,9 @@ static void tpcpy(struct bpf_tcp_sock *dst,
SEC("cgroup_skb/egress") SEC("cgroup_skb/egress")
int egress_read_sock_fields(struct __sk_buff *skb) int egress_read_sock_fields(struct __sk_buff *skb)
{ {
struct bpf_spinlock_cnt cli_cnt_init = { .lock = 0, .cnt = 0xeB9F };
__u32 srv_idx = ADDR_SRV_IDX, cli_idx = ADDR_CLI_IDX, result_idx; __u32 srv_idx = ADDR_SRV_IDX, cli_idx = ADDR_CLI_IDX, result_idx;
struct bpf_spinlock_cnt *pkt_out_cnt, *pkt_out_cnt10;
struct sockaddr_in6 *srv_sa6, *cli_sa6; struct sockaddr_in6 *srv_sa6, *cli_sa6;
struct bpf_tcp_sock *tp, *tp_ret; struct bpf_tcp_sock *tp, *tp_ret;
struct bpf_sock *sk, *sk_ret; struct bpf_sock *sk, *sk_ret;
@ -161,6 +188,32 @@ int egress_read_sock_fields(struct __sk_buff *skb)
skcpy(sk_ret, sk); skcpy(sk_ret, sk);
tpcpy(tp_ret, tp); tpcpy(tp_ret, tp);
if (result_idx == EGRESS_SRV_IDX) {
/* The userspace has created it for srv sk */
pkt_out_cnt = bpf_sk_storage_get(&sk_pkt_out_cnt, sk, 0, 0);
pkt_out_cnt10 = bpf_sk_storage_get(&sk_pkt_out_cnt10, sk,
0, 0);
} else {
pkt_out_cnt = bpf_sk_storage_get(&sk_pkt_out_cnt, sk,
&cli_cnt_init,
BPF_SK_STORAGE_GET_F_CREATE);
pkt_out_cnt10 = bpf_sk_storage_get(&sk_pkt_out_cnt10,
sk, &cli_cnt_init,
BPF_SK_STORAGE_GET_F_CREATE);
}
if (!pkt_out_cnt || !pkt_out_cnt10)
RETURN;
/* Even both cnt and cnt10 have lock defined in their BTF,
* intentionally one cnt takes lock while one does not
* as a test for the spinlock support in BPF_MAP_TYPE_SK_STORAGE.
*/
pkt_out_cnt->cnt += 1;
bpf_spin_lock(&pkt_out_cnt10->lock);
pkt_out_cnt10->cnt += 10;
bpf_spin_unlock(&pkt_out_cnt10->lock);
RETURN; RETURN;
} }

View File

@ -77,17 +77,52 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
struct v4hdr h_outer; struct v4hdr h_outer;
struct tcphdr tcph; struct tcphdr tcph;
int olen, l2_len; int olen, l2_len;
int tcp_off;
__u64 flags; __u64 flags;
/* Most tests encapsulate a packet into a tunnel with the same
* network protocol, and derive the outer header fields from
* the inner header.
*
* The 6in4 case tests different inner and outer protocols. As
* the inner is ipv6, but the outer expects an ipv4 header as
* input, manually build a struct iphdr based on the ipv6hdr.
*/
if (encap_proto == IPPROTO_IPV6) {
const __u32 saddr = (192 << 24) | (168 << 16) | (1 << 8) | 1;
const __u32 daddr = (192 << 24) | (168 << 16) | (1 << 8) | 2;
struct ipv6hdr iph6_inner;
/* Read the IPv6 header */
if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph6_inner,
sizeof(iph6_inner)) < 0)
return TC_ACT_OK;
/* Derive the IPv4 header fields from the IPv6 header */
memset(&iph_inner, 0, sizeof(iph_inner));
iph_inner.version = 4;
iph_inner.ihl = 5;
iph_inner.tot_len = bpf_htons(sizeof(iph6_inner) +
bpf_ntohs(iph6_inner.payload_len));
iph_inner.ttl = iph6_inner.hop_limit - 1;
iph_inner.protocol = iph6_inner.nexthdr;
iph_inner.saddr = __bpf_constant_htonl(saddr);
iph_inner.daddr = __bpf_constant_htonl(daddr);
tcp_off = sizeof(iph6_inner);
} else {
if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner, if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner,
sizeof(iph_inner)) < 0) sizeof(iph_inner)) < 0)
return TC_ACT_OK; return TC_ACT_OK;
tcp_off = sizeof(iph_inner);
}
/* filter only packets we want */ /* filter only packets we want */
if (iph_inner.ihl != 5 || iph_inner.protocol != IPPROTO_TCP) if (iph_inner.ihl != 5 || iph_inner.protocol != IPPROTO_TCP)
return TC_ACT_OK; return TC_ACT_OK;
if (bpf_skb_load_bytes(skb, ETH_HLEN + sizeof(iph_inner), if (bpf_skb_load_bytes(skb, ETH_HLEN + tcp_off,
&tcph, sizeof(tcph)) < 0) &tcph, sizeof(tcph)) < 0)
return TC_ACT_OK; return TC_ACT_OK;
@ -129,6 +164,7 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
l2_len); l2_len);
break; break;
case IPPROTO_IPIP: case IPPROTO_IPIP:
case IPPROTO_IPV6:
break; break;
default: default:
return TC_ACT_OK; return TC_ACT_OK;
@ -164,6 +200,17 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
BPF_F_INVALIDATE_HASH) < 0) BPF_F_INVALIDATE_HASH) < 0)
return TC_ACT_SHOT; return TC_ACT_SHOT;
/* if changing outer proto type, update eth->h_proto */
if (encap_proto == IPPROTO_IPV6) {
struct ethhdr eth;
if (bpf_skb_load_bytes(skb, 0, &eth, sizeof(eth)) < 0)
return TC_ACT_SHOT;
eth.h_proto = bpf_htons(ETH_P_IP);
if (bpf_skb_store_bytes(skb, 0, &eth, sizeof(eth), 0) < 0)
return TC_ACT_SHOT;
}
return TC_ACT_OK; return TC_ACT_OK;
} }
@ -325,6 +372,15 @@ int __encap_udp_eth(struct __sk_buff *skb)
return TC_ACT_OK; return TC_ACT_OK;
} }
SEC("encap_sit_none")
int __encap_sit_none(struct __sk_buff *skb)
{
if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
return encap_ipv4(skb, IPPROTO_IPV6, ETH_P_IP);
else
return TC_ACT_OK;
}
SEC("encap_ip6tnl_none") SEC("encap_ip6tnl_none")
int __encap_ip6tnl_none(struct __sk_buff *skb) int __encap_ip6tnl_none(struct __sk_buff *skb)
{ {

View File

@ -24,6 +24,7 @@
#include "bpf_rlimit.h" #include "bpf_rlimit.h"
#include "bpf_util.h" #include "bpf_util.h"
#include "test_btf.h"
#define MAX_INSNS 512 #define MAX_INSNS 512
#define MAX_SUBPROGS 16 #define MAX_SUBPROGS 16
@ -58,68 +59,6 @@ static int __base_pr(enum libbpf_print_level level __attribute__((unused)),
return vfprintf(stderr, format, args); return vfprintf(stderr, format, args);
} }
#define BTF_INFO_ENC(kind, kind_flag, vlen) \
((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN))
#define BTF_TYPE_ENC(name, info, size_or_type) \
(name), (info), (size_or_type)
#define BTF_INT_ENC(encoding, bits_offset, nr_bits) \
((encoding) << 24 | (bits_offset) << 16 | (nr_bits))
#define BTF_TYPE_INT_ENC(name, encoding, bits_offset, bits, sz) \
BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_INT, 0, 0), sz), \
BTF_INT_ENC(encoding, bits_offset, bits)
#define BTF_FWD_ENC(name, kind_flag) \
BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FWD, kind_flag, 0), 0)
#define BTF_ARRAY_ENC(type, index_type, nr_elems) \
(type), (index_type), (nr_elems)
#define BTF_TYPE_ARRAY_ENC(type, index_type, nr_elems) \
BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_ARRAY, 0, 0), 0), \
BTF_ARRAY_ENC(type, index_type, nr_elems)
#define BTF_STRUCT_ENC(name, nr_elems, sz) \
BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, nr_elems), sz)
#define BTF_UNION_ENC(name, nr_elems, sz) \
BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_UNION, 0, nr_elems), sz)
#define BTF_VAR_ENC(name, type, linkage) \
BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), type), (linkage)
#define BTF_VAR_SECINFO_ENC(type, offset, size) \
(type), (offset), (size)
#define BTF_MEMBER_ENC(name, type, bits_offset) \
(name), (type), (bits_offset)
#define BTF_ENUM_ENC(name, val) (name), (val)
#define BTF_MEMBER_OFFSET(bitfield_size, bits_offset) \
((bitfield_size) << 24 | (bits_offset))
#define BTF_TYPEDEF_ENC(name, type) \
BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_TYPEDEF, 0, 0), type)
#define BTF_PTR_ENC(type) \
BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), type)
#define BTF_CONST_ENC(type) \
BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), type)
#define BTF_VOLATILE_ENC(type) \
BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_VOLATILE, 0, 0), type)
#define BTF_RESTRICT_ENC(type) \
BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_RESTRICT, 0, 0), type)
#define BTF_FUNC_PROTO_ENC(ret_type, nargs) \
BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, nargs), ret_type)
#define BTF_FUNC_PROTO_ARG_ENC(name, type) \
(name), (type)
#define BTF_FUNC_ENC(name, func_proto) \
BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0), func_proto)
#define BTF_END_RAW 0xdeadbeef #define BTF_END_RAW 0xdeadbeef
#define NAME_TBD 0xdeadb33f #define NAME_TBD 0xdeadb33f

View File

@ -0,0 +1,69 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2019 Facebook */
#ifndef _TEST_BTF_H
#define _TEST_BTF_H
#define BTF_INFO_ENC(kind, kind_flag, vlen) \
((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN))
#define BTF_TYPE_ENC(name, info, size_or_type) \
(name), (info), (size_or_type)
#define BTF_INT_ENC(encoding, bits_offset, nr_bits) \
((encoding) << 24 | (bits_offset) << 16 | (nr_bits))
#define BTF_TYPE_INT_ENC(name, encoding, bits_offset, bits, sz) \
BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_INT, 0, 0), sz), \
BTF_INT_ENC(encoding, bits_offset, bits)
#define BTF_FWD_ENC(name, kind_flag) \
BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FWD, kind_flag, 0), 0)
#define BTF_ARRAY_ENC(type, index_type, nr_elems) \
(type), (index_type), (nr_elems)
#define BTF_TYPE_ARRAY_ENC(type, index_type, nr_elems) \
BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_ARRAY, 0, 0), 0), \
BTF_ARRAY_ENC(type, index_type, nr_elems)
#define BTF_STRUCT_ENC(name, nr_elems, sz) \
BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, nr_elems), sz)
#define BTF_UNION_ENC(name, nr_elems, sz) \
BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_UNION, 0, nr_elems), sz)
#define BTF_VAR_ENC(name, type, linkage) \
BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), type), (linkage)
#define BTF_VAR_SECINFO_ENC(type, offset, size) \
(type), (offset), (size)
#define BTF_MEMBER_ENC(name, type, bits_offset) \
(name), (type), (bits_offset)
#define BTF_ENUM_ENC(name, val) (name), (val)
#define BTF_MEMBER_OFFSET(bitfield_size, bits_offset) \
((bitfield_size) << 24 | (bits_offset))
#define BTF_TYPEDEF_ENC(name, type) \
BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_TYPEDEF, 0, 0), type)
#define BTF_PTR_ENC(type) \
BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), type)
#define BTF_CONST_ENC(type) \
BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), type)
#define BTF_VOLATILE_ENC(type) \
BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_VOLATILE, 0, 0), type)
#define BTF_RESTRICT_ENC(type) \
BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_RESTRICT, 0, 0), type)
#define BTF_FUNC_PROTO_ENC(ret_type, nargs) \
BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, nargs), ret_type)
#define BTF_FUNC_PROTO_ARG_ENC(name, type) \
(name), (type)
#define BTF_FUNC_ENC(name, func_proto) \
BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0), func_proto)
#endif /* _TEST_BTF_H */

View File

@ -27,6 +27,7 @@
#include "bpf_util.h" #include "bpf_util.h"
#include "bpf_rlimit.h" #include "bpf_rlimit.h"
#include "test_maps.h"
#ifndef ENOTSUPP #ifndef ENOTSUPP
#define ENOTSUPP 524 #define ENOTSUPP 524
@ -36,15 +37,6 @@ static int skips;
static int map_flags; static int map_flags;
#define CHECK(condition, tag, format...) ({ \
int __ret = !!(condition); \
if (__ret) { \
printf("%s(%d):FAIL:%s ", __func__, __LINE__, tag); \
printf(format); \
exit(-1); \
} \
})
static void test_hashmap(unsigned int task, void *data) static void test_hashmap(unsigned int task, void *data)
{ {
long long key, next_key, first_key, value; long long key, next_key, first_key, value;
@ -1703,6 +1695,10 @@ static void run_all_tests(void)
test_map_in_map(); test_map_in_map();
} }
#define DECLARE
#include <map_tests/tests.h>
#undef DECLARE
int main(void) int main(void)
{ {
srand(time(NULL)); srand(time(NULL));
@ -1713,6 +1709,10 @@ int main(void)
map_flags = BPF_F_NO_PREALLOC; map_flags = BPF_F_NO_PREALLOC;
run_all_tests(); run_all_tests();
#define CALL
#include <map_tests/tests.h>
#undef CALL
printf("test_maps: OK, %d SKIPPED\n", skips); printf("test_maps: OK, %d SKIPPED\n", skips);
return 0; return 0;
} }

View File

@ -0,0 +1,17 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _TEST_MAPS_H
#define _TEST_MAPS_H
#include <stdio.h>
#include <stdlib.h>
#define CHECK(condition, tag, format...) ({ \
int __ret = !!(condition); \
if (__ret) { \
printf("%s(%d):FAIL:%s ", __func__, __LINE__, tag); \
printf(format); \
exit(-1); \
} \
})
#endif

View File

@ -35,6 +35,11 @@ enum bpf_linum_array_idx {
__NR_BPF_LINUM_ARRAY_IDX, __NR_BPF_LINUM_ARRAY_IDX,
}; };
struct bpf_spinlock_cnt {
struct bpf_spin_lock lock;
__u32 cnt;
};
#define CHECK(condition, tag, format...) ({ \ #define CHECK(condition, tag, format...) ({ \
int __ret = !!(condition); \ int __ret = !!(condition); \
if (__ret) { \ if (__ret) { \
@ -50,6 +55,8 @@ enum bpf_linum_array_idx {
#define DATA_LEN sizeof(DATA) #define DATA_LEN sizeof(DATA)
static struct sockaddr_in6 srv_sa6, cli_sa6; static struct sockaddr_in6 srv_sa6, cli_sa6;
static int sk_pkt_out_cnt10_fd;
static int sk_pkt_out_cnt_fd;
static int linum_map_fd; static int linum_map_fd;
static int addr_map_fd; static int addr_map_fd;
static int tp_map_fd; static int tp_map_fd;
@ -220,28 +227,90 @@ static void check_result(void)
"Unexpected listen_tp", "Check listen_tp output. ingress_linum:%u", "Unexpected listen_tp", "Check listen_tp output. ingress_linum:%u",
ingress_linum); ingress_linum);
CHECK(srv_tp.data_segs_out != 1 || CHECK(srv_tp.data_segs_out != 2 ||
srv_tp.data_segs_in || srv_tp.data_segs_in ||
srv_tp.snd_cwnd != 10 || srv_tp.snd_cwnd != 10 ||
srv_tp.total_retrans || srv_tp.total_retrans ||
srv_tp.bytes_acked != DATA_LEN, srv_tp.bytes_acked != 2 * DATA_LEN,
"Unexpected srv_tp", "Check srv_tp output. egress_linum:%u", "Unexpected srv_tp", "Check srv_tp output. egress_linum:%u",
egress_linum); egress_linum);
CHECK(cli_tp.data_segs_out || CHECK(cli_tp.data_segs_out ||
cli_tp.data_segs_in != 1 || cli_tp.data_segs_in != 2 ||
cli_tp.snd_cwnd != 10 || cli_tp.snd_cwnd != 10 ||
cli_tp.total_retrans || cli_tp.total_retrans ||
cli_tp.bytes_received != DATA_LEN, cli_tp.bytes_received != 2 * DATA_LEN,
"Unexpected cli_tp", "Check cli_tp output. egress_linum:%u", "Unexpected cli_tp", "Check cli_tp output. egress_linum:%u",
egress_linum); egress_linum);
} }
static void check_sk_pkt_out_cnt(int accept_fd, int cli_fd)
{
struct bpf_spinlock_cnt pkt_out_cnt = {}, pkt_out_cnt10 = {};
int err;
pkt_out_cnt.cnt = ~0;
pkt_out_cnt10.cnt = ~0;
err = bpf_map_lookup_elem(sk_pkt_out_cnt_fd, &accept_fd, &pkt_out_cnt);
if (!err)
err = bpf_map_lookup_elem(sk_pkt_out_cnt10_fd, &accept_fd,
&pkt_out_cnt10);
/* The bpf prog only counts for fullsock and
* passive conneciton did not become fullsock until 3WHS
* had been finished.
* The bpf prog only counted two data packet out but we
* specially init accept_fd's pkt_out_cnt by 2 in
* init_sk_storage(). Hence, 4 here.
*/
CHECK(err || pkt_out_cnt.cnt != 4 || pkt_out_cnt10.cnt != 40,
"bpf_map_lookup_elem(sk_pkt_out_cnt, &accept_fd)",
"err:%d errno:%d pkt_out_cnt:%u pkt_out_cnt10:%u",
err, errno, pkt_out_cnt.cnt, pkt_out_cnt10.cnt);
pkt_out_cnt.cnt = ~0;
pkt_out_cnt10.cnt = ~0;
err = bpf_map_lookup_elem(sk_pkt_out_cnt_fd, &cli_fd, &pkt_out_cnt);
if (!err)
err = bpf_map_lookup_elem(sk_pkt_out_cnt10_fd, &cli_fd,
&pkt_out_cnt10);
/* Active connection is fullsock from the beginning.
* 1 SYN and 1 ACK during 3WHS
* 2 Acks on data packet.
*
* The bpf_prog initialized it to 0xeB9F.
*/
CHECK(err || pkt_out_cnt.cnt != 0xeB9F + 4 ||
pkt_out_cnt10.cnt != 0xeB9F + 40,
"bpf_map_lookup_elem(sk_pkt_out_cnt, &cli_fd)",
"err:%d errno:%d pkt_out_cnt:%u pkt_out_cnt10:%u",
err, errno, pkt_out_cnt.cnt, pkt_out_cnt10.cnt);
}
static void init_sk_storage(int sk_fd, __u32 pkt_out_cnt)
{
struct bpf_spinlock_cnt scnt = {};
int err;
scnt.cnt = pkt_out_cnt;
err = bpf_map_update_elem(sk_pkt_out_cnt_fd, &sk_fd, &scnt,
BPF_NOEXIST);
CHECK(err, "bpf_map_update_elem(sk_pkt_out_cnt_fd)",
"err:%d errno:%d", err, errno);
scnt.cnt *= 10;
err = bpf_map_update_elem(sk_pkt_out_cnt10_fd, &sk_fd, &scnt,
BPF_NOEXIST);
CHECK(err, "bpf_map_update_elem(sk_pkt_out_cnt10_fd)",
"err:%d errno:%d", err, errno);
}
static void test(void) static void test(void)
{ {
int listen_fd, cli_fd, accept_fd, epfd, err; int listen_fd, cli_fd, accept_fd, epfd, err;
struct epoll_event ev; struct epoll_event ev;
socklen_t addrlen; socklen_t addrlen;
int i;
addrlen = sizeof(struct sockaddr_in6); addrlen = sizeof(struct sockaddr_in6);
ev.events = EPOLLIN; ev.events = EPOLLIN;
@ -308,17 +377,20 @@ static void test(void)
accept_fd, errno); accept_fd, errno);
close(listen_fd); close(listen_fd);
ev.data.fd = cli_fd;
err = epoll_ctl(epfd, EPOLL_CTL_ADD, cli_fd, &ev);
CHECK(err, "epoll_ctl(EPOLL_CTL_ADD, cli_fd)", "err:%d errno:%d",
err, errno);
init_sk_storage(accept_fd, 2);
for (i = 0; i < 2; i++) {
/* Send some data from accept_fd to cli_fd */ /* Send some data from accept_fd to cli_fd */
err = send(accept_fd, DATA, DATA_LEN, 0); err = send(accept_fd, DATA, DATA_LEN, 0);
CHECK(err != DATA_LEN, "send(accept_fd)", "err:%d errno:%d", CHECK(err != DATA_LEN, "send(accept_fd)", "err:%d errno:%d",
err, errno); err, errno);
/* Have some timeout in recv(cli_fd). Just in case. */ /* Have some timeout in recv(cli_fd). Just in case. */
ev.data.fd = cli_fd;
err = epoll_ctl(epfd, EPOLL_CTL_ADD, cli_fd, &ev);
CHECK(err, "epoll_ctl(EPOLL_CTL_ADD, cli_fd)", "err:%d errno:%d",
err, errno);
err = epoll_wait(epfd, &ev, 1, 1000); err = epoll_wait(epfd, &ev, 1, 1000);
CHECK(err != 1 || ev.data.fd != cli_fd, CHECK(err != 1 || ev.data.fd != cli_fd,
"epoll_wait(cli_fd)", "err:%d errno:%d ev.data.fd:%d cli_fd:%d", "epoll_wait(cli_fd)", "err:%d errno:%d ev.data.fd:%d cli_fd:%d",
@ -326,6 +398,9 @@ static void test(void)
err = recv(cli_fd, NULL, 0, MSG_TRUNC); err = recv(cli_fd, NULL, 0, MSG_TRUNC);
CHECK(err, "recv(cli_fd)", "err:%d errno:%d", err, errno); CHECK(err, "recv(cli_fd)", "err:%d errno:%d", err, errno);
}
check_sk_pkt_out_cnt(accept_fd, cli_fd);
close(epfd); close(epfd);
close(accept_fd); close(accept_fd);
@ -395,6 +470,14 @@ int main(int argc, char **argv)
CHECK(!map, "cannot find linum_map", "(null)"); CHECK(!map, "cannot find linum_map", "(null)");
linum_map_fd = bpf_map__fd(map); linum_map_fd = bpf_map__fd(map);
map = bpf_object__find_map_by_name(obj, "sk_pkt_out_cnt");
CHECK(!map, "cannot find sk_pkt_out_cnt", "(null)");
sk_pkt_out_cnt_fd = bpf_map__fd(map);
map = bpf_object__find_map_by_name(obj, "sk_pkt_out_cnt10");
CHECK(!map, "cannot find sk_pkt_out_cnt10", "(null)");
sk_pkt_out_cnt10_fd = bpf_map__fd(map);
test(); test();
bpf_object__close(obj); bpf_object__close(obj);

View File

@ -97,6 +97,9 @@ if [[ "$#" -eq "0" ]]; then
echo "ip6ip6" echo "ip6ip6"
$0 ipv6 ip6tnl none 100 $0 ipv6 ip6tnl none 100
echo "sit"
$0 ipv6 sit none 100
for mac in none mpls eth ; do for mac in none mpls eth ; do
echo "ip gre $mac" echo "ip gre $mac"
$0 ipv4 gre $mac 100 $0 ipv4 gre $mac 100
@ -211,11 +214,20 @@ else
targs="" targs=""
fi fi
# tunnel address family differs from inner for SIT
if [[ "${tuntype}" == "sit" ]]; then
link_addr1="${ns1_v4}"
link_addr2="${ns2_v4}"
else
link_addr1="${addr1}"
link_addr2="${addr2}"
fi
# serverside, insert decap module # serverside, insert decap module
# server is still running # server is still running
# client can connect again # client can connect again
ip netns exec "${ns2}" ip link add name testtun0 type "${ttype}" \ ip netns exec "${ns2}" ip link add name testtun0 type "${ttype}" \
${tmode} remote "${addr1}" local "${addr2}" $targs ${tmode} remote "${link_addr1}" local "${link_addr2}" $targs
expect_tun_fail=0 expect_tun_fail=0
@ -260,6 +272,12 @@ else
server_listen server_listen
fi fi
# bpf_skb_net_shrink does not take tunnel flags yet, cannot update L3.
if [[ "${tuntype}" == "sit" ]]; then
echo OK
exit 0
fi
# serverside, use BPF for decap # serverside, use BPF for decap
ip netns exec "${ns2}" ip link del dev testtun0 ip netns exec "${ns2}" ip link del dev testtun0
ip netns exec "${ns2}" tc qdisc add dev veth2 clsact ip netns exec "${ns2}" tc qdisc add dev veth2 clsact

View File

@ -47,12 +47,13 @@
#include "bpf_rlimit.h" #include "bpf_rlimit.h"
#include "bpf_rand.h" #include "bpf_rand.h"
#include "bpf_util.h" #include "bpf_util.h"
#include "test_btf.h"
#include "../../../include/linux/filter.h" #include "../../../include/linux/filter.h"
#define MAX_INSNS BPF_MAXINSNS #define MAX_INSNS BPF_MAXINSNS
#define MAX_TEST_INSNS 1000000 #define MAX_TEST_INSNS 1000000
#define MAX_FIXUPS 8 #define MAX_FIXUPS 8
#define MAX_NR_MAPS 17 #define MAX_NR_MAPS 18
#define MAX_TEST_RUNS 8 #define MAX_TEST_RUNS 8
#define POINTER_VALUE 0xcafe4all #define POINTER_VALUE 0xcafe4all
#define TEST_DATA_LEN 64 #define TEST_DATA_LEN 64
@ -85,6 +86,7 @@ struct bpf_test {
int fixup_map_array_ro[MAX_FIXUPS]; int fixup_map_array_ro[MAX_FIXUPS];
int fixup_map_array_wo[MAX_FIXUPS]; int fixup_map_array_wo[MAX_FIXUPS];
int fixup_map_array_small[MAX_FIXUPS]; int fixup_map_array_small[MAX_FIXUPS];
int fixup_sk_storage_map[MAX_FIXUPS];
const char *errstr; const char *errstr;
const char *errstr_unpriv; const char *errstr_unpriv;
uint32_t retval, retval_unpriv, insn_processed; uint32_t retval, retval_unpriv, insn_processed;
@ -497,24 +499,6 @@ static int create_cgroup_storage(bool percpu)
return fd; return fd;
} }
#define BTF_INFO_ENC(kind, kind_flag, vlen) \
((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN))
#define BTF_TYPE_ENC(name, info, size_or_type) \
(name), (info), (size_or_type)
#define BTF_INT_ENC(encoding, bits_offset, nr_bits) \
((encoding) << 24 | (bits_offset) << 16 | (nr_bits))
#define BTF_TYPE_INT_ENC(name, encoding, bits_offset, bits, sz) \
BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_INT, 0, 0), sz), \
BTF_INT_ENC(encoding, bits_offset, bits)
#define BTF_MEMBER_ENC(name, type, bits_offset) \
(name), (type), (bits_offset)
struct btf_raw_data {
__u32 raw_types[64];
const char *str_sec;
__u32 str_sec_size;
};
/* struct bpf_spin_lock { /* struct bpf_spin_lock {
* int val; * int val;
* }; * };
@ -589,6 +573,31 @@ static int create_map_spin_lock(void)
return fd; return fd;
} }
static int create_sk_storage_map(void)
{
struct bpf_create_map_attr attr = {
.name = "test_map",
.map_type = BPF_MAP_TYPE_SK_STORAGE,
.key_size = 4,
.value_size = 8,
.max_entries = 0,
.map_flags = BPF_F_NO_PREALLOC,
.btf_key_type_id = 1,
.btf_value_type_id = 3,
};
int fd, btf_fd;
btf_fd = load_btf();
if (btf_fd < 0)
return -1;
attr.btf_fd = btf_fd;
fd = bpf_create_map_xattr(&attr);
close(attr.btf_fd);
if (fd < 0)
printf("Failed to create sk_storage_map\n");
return fd;
}
static char bpf_vlog[UINT_MAX >> 8]; static char bpf_vlog[UINT_MAX >> 8];
static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type,
@ -611,6 +620,7 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type,
int *fixup_map_array_ro = test->fixup_map_array_ro; int *fixup_map_array_ro = test->fixup_map_array_ro;
int *fixup_map_array_wo = test->fixup_map_array_wo; int *fixup_map_array_wo = test->fixup_map_array_wo;
int *fixup_map_array_small = test->fixup_map_array_small; int *fixup_map_array_small = test->fixup_map_array_small;
int *fixup_sk_storage_map = test->fixup_sk_storage_map;
if (test->fill_helper) { if (test->fill_helper) {
test->fill_insns = calloc(MAX_TEST_INSNS, sizeof(struct bpf_insn)); test->fill_insns = calloc(MAX_TEST_INSNS, sizeof(struct bpf_insn));
@ -765,6 +775,13 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type,
fixup_map_array_small++; fixup_map_array_small++;
} while (*fixup_map_array_small); } while (*fixup_map_array_small);
} }
if (*fixup_sk_storage_map) {
map_fds[17] = create_sk_storage_map();
do {
prog[*fixup_sk_storage_map].imm = map_fds[17];
fixup_sk_storage_map++;
} while (*fixup_sk_storage_map);
}
} }
static int set_admin(bool admin) static int set_admin(bool admin)

View File

@ -0,0 +1,34 @@
{
"raw_tracepoint_writable: reject variable offset",
.insns = {
/* r6 is our tp buffer */
BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0),
BPF_LD_MAP_FD(BPF_REG_1, 0),
/* move the key (== 0) to r10-8 */
BPF_MOV32_IMM(BPF_REG_0, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0),
/* lookup in the map */
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
BPF_FUNC_map_lookup_elem),
/* exit clean if null */
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
BPF_EXIT_INSN(),
/* shift the buffer pointer to a variable location */
BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0),
BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_0),
/* clobber whatever's there */
BPF_MOV64_IMM(BPF_REG_7, 4242),
BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_7, 0),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.fixup_map_hash_8b = { 1, },
.prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
.errstr = "R6 invalid variable buffer offset: off=0, var_off=(0x0; 0xffffffff)",
},

View File

@ -382,3 +382,119 @@
.result = REJECT, .result = REJECT,
.errstr = "reference has not been acquired before", .errstr = "reference has not been acquired before",
}, },
{
"sk_storage_get(map, skb->sk, NULL, 0): value == NULL",
.insns = {
BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
BPF_MOV64_IMM(BPF_REG_4, 0),
BPF_MOV64_IMM(BPF_REG_3, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_EMIT_CALL(BPF_FUNC_sk_storage_get),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.fixup_sk_storage_map = { 11 },
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
.result = ACCEPT,
},
{
"sk_storage_get(map, skb->sk, 1, 1): value == 1",
.insns = {
BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
BPF_MOV64_IMM(BPF_REG_4, 1),
BPF_MOV64_IMM(BPF_REG_3, 1),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_EMIT_CALL(BPF_FUNC_sk_storage_get),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.fixup_sk_storage_map = { 11 },
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
.result = REJECT,
.errstr = "R3 type=inv expected=fp",
},
{
"sk_storage_get(map, skb->sk, &stack_value, 1): stack_value",
.insns = {
BPF_MOV64_IMM(BPF_REG_2, 0),
BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8),
BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
BPF_MOV64_IMM(BPF_REG_4, 1),
BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -8),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_EMIT_CALL(BPF_FUNC_sk_storage_get),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.fixup_sk_storage_map = { 14 },
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
.result = ACCEPT,
},
{
"sk_storage_get(map, skb->sk, &stack_value, 1): partially init stack_value",
.insns = {
BPF_MOV64_IMM(BPF_REG_2, 0),
BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8),
BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
BPF_MOV64_IMM(BPF_REG_4, 1),
BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -8),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_EMIT_CALL(BPF_FUNC_sk_storage_get),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.fixup_sk_storage_map = { 14 },
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
.result = REJECT,
.errstr = "invalid indirect read from stack",
},
{
"bpf_map_lookup_elem(smap, &key)",
.insns = {
BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.fixup_sk_storage_map = { 3 },
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
.result = REJECT,
.errstr = "cannot pass map_type 24 into func bpf_map_lookup_elem",
},