diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index c970a25d2a49..360c082e885c 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -7,6 +7,7 @@ struct sock; struct cgroup; struct sk_buff; +struct bpf_sock_ops_kern; #ifdef CONFIG_CGROUP_BPF @@ -42,6 +43,10 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, int __cgroup_bpf_run_filter_sk(struct sock *sk, enum bpf_attach_type type); +int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, + struct bpf_sock_ops_kern *sock_ops, + enum bpf_attach_type type); + /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ @@ -75,6 +80,18 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, __ret; \ }) +#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled && (sock_ops)->sk) { \ + typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk); \ + if (sk_fullsock(__sk)) \ + __ret = __cgroup_bpf_run_filter_sock_ops(__sk, \ + sock_ops, \ + BPF_CGROUP_SOCK_OPS); \ + } \ + __ret; \ +}) #else struct cgroup_bpf {}; @@ -85,6 +102,7 @@ static inline void cgroup_bpf_inherit(struct cgroup *cgrp, #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #endif /* CONFIG_CGROUP_BPF */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 03bf223f18be..3d137c33d664 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -10,6 +10,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock_prog_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout_prog_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout_prog_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops_prog_ops) #endif #ifdef CONFIG_BPF_EVENTS BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe_prog_ops) diff --git a/include/linux/filter.h b/include/linux/filter.h index 1fa26dc562ce..738f8b14f025 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -898,4 +898,13 @@ static inline int bpf_tell_extensions(void) return SKF_AD_MAX; } +struct bpf_sock_ops_kern { + struct sock *sk; + u32 op; + union { + u32 reply; + u32 replylong[4]; + }; +}; + #endif /* __LINUX_FILTER_H__ */ diff --git a/include/net/tcp.h b/include/net/tcp.h index d0751b79d99c..70483296157f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -46,6 +46,10 @@ #include #include +#include +#include +#include + extern struct inet_hashinfo tcp_hashinfo; extern struct percpu_counter tcp_orphan_count; @@ -1000,7 +1004,9 @@ void tcp_get_default_congestion_control(char *name); void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); -int tcp_set_congestion_control(struct sock *sk, const char *name); +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load); +void tcp_reinit_congestion_control(struct sock *sk, + const struct tcp_congestion_ops *ca); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); @@ -2021,4 +2027,62 @@ int tcp_set_ulp(struct sock *sk, const char *name); void tcp_get_available_ulp(char *buf, size_t len); void tcp_cleanup_ulp(struct sock *sk); +/* Call BPF_SOCK_OPS program that returns an int. If the return value + * is < 0, then the BPF op failed (for example if the loaded BPF + * program does not support the chosen operation or there is no BPF + * program loaded). + */ +#ifdef CONFIG_BPF +static inline int tcp_call_bpf(struct sock *sk, int op) +{ + struct bpf_sock_ops_kern sock_ops; + int ret; + + if (sk_fullsock(sk)) + sock_owned_by_me(sk); + + memset(&sock_ops, 0, sizeof(sock_ops)); + sock_ops.sk = sk; + sock_ops.op = op; + + ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); + if (ret == 0) + ret = sock_ops.reply; + else + ret = -1; + return ret; +} +#else +static inline int tcp_call_bpf(struct sock *sk, int op) +{ + return -EPERM; +} +#endif + +static inline u32 tcp_timeout_init(struct sock *sk) +{ + int timeout; + + timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT); + + if (timeout <= 0) + timeout = TCP_TIMEOUT_INIT; + return timeout; +} + +static inline u32 tcp_rwnd_init_bpf(struct sock *sk) +{ + int rwnd; + + rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT); + + if (rwnd < 0) + rwnd = 0; + return rwnd; +} + +static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) +{ + return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1); +} #endif /* _TCP_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f94b48b168dc..a6a91e5e96fc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -120,12 +120,14 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_IN, BPF_PROG_TYPE_LWT_OUT, BPF_PROG_TYPE_LWT_XMIT, + BPF_PROG_TYPE_SOCK_OPS, }; enum bpf_attach_type { BPF_CGROUP_INET_INGRESS, BPF_CGROUP_INET_EGRESS, BPF_CGROUP_INET_SOCK_CREATE, + BPF_CGROUP_SOCK_OPS, __MAX_BPF_ATTACH_TYPE }; @@ -518,6 +520,17 @@ union bpf_attr { * Set full skb->hash. * @skb: pointer to skb * @hash: hash to set + * + * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen) + * Calls setsockopt. Not all opts are available, only those with + * integer optvals plus TCP_CONGESTION. + * Supported levels: SOL_SOCKET and IPROTO_TCP + * @bpf_socket: pointer to bpf_socket + * @level: SOL_SOCKET or IPROTO_TCP + * @optname: option name + * @optval: pointer to option value + * @optlen: length of optval in byes + * Return: 0 or negative error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -568,7 +581,8 @@ union bpf_attr { FN(probe_read_str), \ FN(get_socket_cookie), \ FN(get_socket_uid), \ - FN(set_hash), + FN(set_hash), \ + FN(setsockopt), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -720,4 +734,56 @@ struct bpf_map_info { __u32 map_flags; } __attribute__((aligned(8))); +/* User bpf_sock_ops struct to access socket values and specify request ops + * and their replies. + * Some of this fields are in network (bigendian) byte order and may need + * to be converted before use (bpf_ntohl() defined in samples/bpf/bpf_endian.h). + * New fields can only be added at the end of this structure + */ +struct bpf_sock_ops { + __u32 op; + union { + __u32 reply; + __u32 replylong[4]; + }; + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ +}; + +/* List of known BPF sock_ops operators. + * New entries can only be added at the end + */ +enum { + BPF_SOCK_OPS_VOID, + BPF_SOCK_OPS_TIMEOUT_INIT, /* Should return SYN-RTO value to use or + * -1 if default value should be used + */ + BPF_SOCK_OPS_RWND_INIT, /* Should return initial advertized + * window (in packets) or -1 if default + * value should be used + */ + BPF_SOCK_OPS_TCP_CONNECT_CB, /* Calls BPF program right before an + * active connection is initialized + */ + BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, /* Calls BPF program when an + * active connection is + * established + */ + BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, /* Calls BPF program when a + * passive connection is + * established + */ + BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control + * needs ECN + */ +}; + +#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ +#define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */ + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ea6033cba947..546113430049 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -236,3 +236,40 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); + +/** + * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock + * @sk: socket to get cgroup from + * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains + * sk with connection information (IP addresses, etc.) May not contain + * cgroup info if it is a req sock. + * @type: The type of program to be exectuted + * + * socket passed is expected to be of type INET or INET6. + * + * The program type passed in via @type must be suitable for sock_ops + * filtering. No further check is performed to assert that. + * + * This function will return %-EPERM if any if an attached program was found + * and if it returned != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, + struct bpf_sock_ops_kern *sock_ops, + enum bpf_attach_type type) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_prog *prog; + int ret = 0; + + + rcu_read_lock(); + + prog = rcu_dereference(cgrp->bpf.effective[type]); + if (prog) + ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM; + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4409ccca8831..d4d47de75bba 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1079,6 +1079,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_INET_SOCK_CREATE: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; + case BPF_CGROUP_SOCK_OPS: + ptype = BPF_PROG_TYPE_SOCK_OPS; + break; default: return -EINVAL; } @@ -1119,6 +1122,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_SOCK_OPS: cgrp = cgroup_get_from_fd(attr->target_fd); if (IS_ERR(cgrp)) return PTR_ERR(cgrp); @@ -1133,6 +1137,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) return ret; } + #endif /* CONFIG_CGROUP_BPF */ #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration diff --git a/net/core/filter.c b/net/core/filter.c index b39c869d22e3..523b91d25025 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -54,6 +54,7 @@ #include #include #include +#include /** * sk_filter_trim_cap - run a packet through a socket filter @@ -2672,6 +2673,110 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + struct sock *sk = bpf_sock->sk; + int ret = 0; + int val; + + if (!sk_fullsock(sk)) + return -EINVAL; + + if (level == SOL_SOCKET) { + if (optlen != sizeof(int)) + return -EINVAL; + val = *((int *)optval); + + /* Only some socketops are supported */ + switch (optname) { + case SO_RCVBUF: + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); + break; + case SO_SNDBUF: + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); + break; + case SO_MAX_PACING_RATE: + sk->sk_max_pacing_rate = val; + sk->sk_pacing_rate = min(sk->sk_pacing_rate, + sk->sk_max_pacing_rate); + break; + case SO_PRIORITY: + sk->sk_priority = val; + break; + case SO_RCVLOWAT: + if (val < 0) + val = INT_MAX; + sk->sk_rcvlowat = val ? : 1; + break; + case SO_MARK: + sk->sk_mark = val; + break; + default: + ret = -EINVAL; + } + } else if (level == SOL_TCP && + sk->sk_prot->setsockopt == tcp_setsockopt) { +#ifdef CONFIG_INET + if (optname == TCP_CONGESTION) { + char name[TCP_CA_NAME_MAX]; + + strncpy(name, optval, min_t(long, optlen, + TCP_CA_NAME_MAX-1)); + name[TCP_CA_NAME_MAX-1] = 0; + ret = tcp_set_congestion_control(sk, name, false); + if (!ret && bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) + /* replacing an existing ca */ + tcp_reinit_congestion_control(sk, + inet_csk(sk)->icsk_ca_ops); + } else { + struct tcp_sock *tp = tcp_sk(sk); + + if (optlen != sizeof(int)) + return -EINVAL; + + val = *((int *)optval); + /* Only some options are supported */ + switch (optname) { + case TCP_BPF_IW: + if (val <= 0 || tp->data_segs_out > 0) + ret = -EINVAL; + else + tp->snd_cwnd = val; + break; + case TCP_BPF_SNDCWND_CLAMP: + if (val <= 0) { + ret = -EINVAL; + } else { + tp->snd_cwnd_clamp = val; + tp->snd_ssthresh = val; + } + default: + ret = -EINVAL; + } + } +#else + ret = -EINVAL; +#endif + } else { + ret = -EINVAL; + } + return ret; +} + +static const struct bpf_func_proto bpf_setsockopt_proto = { + .func = bpf_setsockopt, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -2822,6 +2927,17 @@ lwt_inout_func_proto(enum bpf_func_id func_id) } } +static const struct bpf_func_proto * + sock_ops_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_setsockopt: + return &bpf_setsockopt_proto; + default: + return bpf_base_func_proto(func_id); + } +} + static const struct bpf_func_proto * lwt_xmit_func_proto(enum bpf_func_id func_id) { @@ -3110,6 +3226,36 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); +static bool __is_valid_sock_ops_access(int off, int size) +{ + if (off < 0 || off >= sizeof(struct bpf_sock_ops)) + return false; + /* The verifier guarantees that size > 0. */ + if (off % size != 0) + return false; + if (size != sizeof(__u32)) + return false; + + return true; +} + +static bool sock_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct bpf_sock_ops, op) ... + offsetof(struct bpf_sock_ops, replylong[3]): + break; + default: + return false; + } + } + + return __is_valid_sock_ops_access(off, size); +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -3379,6 +3525,138 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + int off; + + switch (si->off) { + case offsetof(struct bpf_sock_ops, op) ... + offsetof(struct bpf_sock_ops, replylong[3]): + BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, op) != + FIELD_SIZEOF(struct bpf_sock_ops_kern, op)); + BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, reply) != + FIELD_SIZEOF(struct bpf_sock_ops_kern, reply)); + BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, replylong) != + FIELD_SIZEOF(struct bpf_sock_ops_kern, replylong)); + off = si->off; + off -= offsetof(struct bpf_sock_ops, op); + off += offsetof(struct bpf_sock_ops_kern, op); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, + off); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + off); + break; + + case offsetof(struct bpf_sock_ops, family): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_family)); + break; + + case offsetof(struct bpf_sock_ops, remote_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_daddr)); + break; + + case offsetof(struct bpf_sock_ops, local_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_rcv_saddr)); + break; + + case offsetof(struct bpf_sock_ops, remote_ip6[0]) ... + offsetof(struct bpf_sock_ops, remote_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_daddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct bpf_sock_ops, remote_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_daddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct bpf_sock_ops, local_ip6[0]) ... + offsetof(struct bpf_sock_ops, local_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct bpf_sock_ops, local_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct bpf_sock_ops, remote_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_dport)); +#ifndef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); +#endif + break; + + case offsetof(struct bpf_sock_ops, local_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_num)); + break; + } + return insn - insn_buf; +} + const struct bpf_verifier_ops sk_filter_prog_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, @@ -3428,6 +3706,12 @@ const struct bpf_verifier_ops cg_sock_prog_ops = { .convert_ctx_access = sock_filter_convert_ctx_access, }; +const struct bpf_verifier_ops sock_ops_prog_ops = { + .get_func_proto = sock_ops_func_proto, + .is_valid_access = sock_ops_is_valid_access, + .convert_ctx_access = sock_ops_convert_ctx_access, +}; + int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index fae45e402742..71ce33decd97 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2481,7 +2481,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, name[val] = 0; lock_sock(sk); - err = tcp_set_congestion_control(sk, name); + err = tcp_set_congestion_control(sk, name, true); release_sock(sk); return err; } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 324c9bcc5456..fde983f6376b 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -189,8 +189,8 @@ void tcp_init_congestion_control(struct sock *sk) INET_ECN_dontxmit(sk); } -static void tcp_reinit_congestion_control(struct sock *sk, - const struct tcp_congestion_ops *ca) +void tcp_reinit_congestion_control(struct sock *sk, + const struct tcp_congestion_ops *ca) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -333,8 +333,12 @@ out: return ret; } -/* Change congestion control for socket */ -int tcp_set_congestion_control(struct sock *sk, const char *name) +/* Change congestion control for socket. If load is false, then it is the + * responsibility of the caller to call tcp_init_congestion_control or + * tcp_reinit_congestion_control (if the current congestion control was + * already initialized. + */ +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; @@ -344,21 +348,29 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) return -EPERM; rcu_read_lock(); - ca = __tcp_ca_find_autoload(name); + if (!load) + ca = tcp_ca_find(name); + else + ca = __tcp_ca_find_autoload(name); /* No change asking for existing value */ if (ca == icsk->icsk_ca_ops) { icsk->icsk_ca_setsockopt = 1; goto out; } - if (!ca) + if (!ca) { err = -ENOENT; - else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || - ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) + } else if (!load) { + icsk->icsk_ca_ops = ca; + if (!try_module_get(ca->owner)) + err = -EBUSY; + } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || + ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) { err = -EPERM; - else if (!try_module_get(ca->owner)) + } else if (!try_module_get(ca->owner)) { err = -EBUSY; - else + } else { tcp_reinit_congestion_control(sk, ca); + } out: rcu_read_unlock(); return err; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 8b1539efaf38..ce9c7fef200f 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -221,6 +221,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, tcp_init_congestion_control(child); tcp_mtup_init(child); tcp_init_metrics(child); + tcp_call_bpf(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tcp_init_buffer_space(child); tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2ab7e2fa9bb9..2920e0cb09f8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5571,7 +5571,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); - + tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); tcp_init_congestion_control(sk); /* Prevent spurious tcp_cwnd_restart() on first data @@ -5977,6 +5977,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } else { /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); + tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tcp_init_congestion_control(sk); tcp_mtup_init(sk); @@ -6190,7 +6191,8 @@ static void tcp_ecn_create_request(struct request_sock *req, ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst; if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || - (ecn_ok_dst & DST_FEATURE_ECN_CA)) + (ecn_ok_dst & DST_FEATURE_ECN_CA) || + tcp_bpf_ca_needs_ecn((struct sock *)req)) inet_rsk(req)->ecn_ok = 1; } @@ -6406,7 +6408,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } else { tcp_rsk(req)->tfo_listener = false; if (!want_cookie) - inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + inet_csk_reqsk_queue_hash_add(sk, req, + tcp_timeout_init((struct sock *)req)); af_ops->send_synack(sk, dst, &fl, req, &foc, !want_cookie ? TCP_SYNACK_NORMAL : TCP_SYNACK_COOKIE); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index d30ee31e94eb..0ff83c1637d8 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -351,6 +351,7 @@ void tcp_openreq_init_rwin(struct request_sock *req, int full_space = tcp_full_space(sk_listener); u32 window_clamp; __u8 rcv_wscale; + u32 rcv_wnd; int mss; mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); @@ -363,6 +364,12 @@ void tcp_openreq_init_rwin(struct request_sock *req, (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) req->rsk_window_clamp = full_space; + rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req); + if (rcv_wnd == 0) + rcv_wnd = dst_metric(dst, RTAX_INITRWND); + else if (full_space < rcv_wnd * mss) + full_space = rcv_wnd * mss; + /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(full_space, mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), @@ -370,7 +377,7 @@ void tcp_openreq_init_rwin(struct request_sock *req, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, - dst_metric(dst, RTAX_INITRWND)); + rcv_wnd); ireq->rcv_wscale = rcv_wscale; } EXPORT_SYMBOL(tcp_openreq_init_rwin); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1d79137f3795..4d36f0b093e6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -316,7 +316,8 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; if (!(tp->ecn_flags & TCP_ECN_OK)) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; - else if (tcp_ca_needs_ecn(sk)) + else if (tcp_ca_needs_ecn(sk) || + tcp_bpf_ca_needs_ecn(sk)) INET_ECN_xmit(sk); } @@ -324,8 +325,9 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 || - tcp_ca_needs_ecn(sk); + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; if (!use_ecn) { const struct dst_entry *dst = __sk_dst_get(sk); @@ -339,7 +341,7 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) if (use_ecn) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; tp->ecn_flags = TCP_ECN_OK; - if (tcp_ca_needs_ecn(sk)) + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) INET_ECN_xmit(sk); } } @@ -3266,6 +3268,7 @@ static void tcp_connect_init(struct sock *sk) const struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); __u8 rcv_wscale; + u32 rcv_wnd; /* We'll fix this up when we get a response from the other end. * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. @@ -3299,13 +3302,17 @@ static void tcp_connect_init(struct sock *sk) (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) tp->window_clamp = tcp_full_space(sk); + rcv_wnd = tcp_rwnd_init_bpf(sk); + if (rcv_wnd == 0) + rcv_wnd = dst_metric(dst, RTAX_INITRWND); + tcp_select_initial_window(tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, &tp->window_clamp, sock_net(sk)->ipv4.sysctl_tcp_window_scaling, &rcv_wscale, - dst_metric(dst, RTAX_INITRWND)); + rcv_wnd); tp->rx_opt.rcv_wscale = rcv_wscale; tp->rcv_ssthresh = tp->rcv_wnd; @@ -3326,7 +3333,7 @@ static void tcp_connect_init(struct sock *sk) tp->rcv_wup = tp->rcv_nxt; tp->copied_seq = tp->rcv_nxt; - inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; + inet_csk(sk)->icsk_rto = tcp_timeout_init(sk); inet_csk(sk)->icsk_retransmits = 0; tcp_clear_retrans(tp); } @@ -3439,6 +3446,7 @@ int tcp_connect(struct sock *sk) struct sk_buff *buff; int err; + tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB); tcp_connect_init(sk); if (unlikely(tp->repair)) { diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index e7ec9b8539a5..9c650589e80f 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -36,6 +36,7 @@ hostprogs-y += lwt_len_hist hostprogs-y += xdp_tx_iptunnel hostprogs-y += test_map_in_map hostprogs-y += per_socket_stats_example +hostprogs-y += load_sock_ops # Libbpf dependencies LIBBPF := ../../tools/lib/bpf/bpf.o @@ -52,6 +53,7 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o +load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o lathist-objs := bpf_load.o $(LIBBPF) lathist_user.o @@ -111,6 +113,12 @@ always += lwt_len_hist_kern.o always += xdp_tx_iptunnel_kern.o always += test_map_in_map_kern.o always += cookie_uid_helper_example.o +always += tcp_synrto_kern.o +always += tcp_rwnd_kern.o +always += tcp_bufs_kern.o +always += tcp_cong_kern.o +always += tcp_iw_kern.o +always += tcp_clamp_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ @@ -130,6 +138,7 @@ HOSTLOADLIBES_tracex4 += -lelf -lrt HOSTLOADLIBES_tracex5 += -lelf HOSTLOADLIBES_tracex6 += -lelf HOSTLOADLIBES_test_cgrp2_sock2 += -lelf +HOSTLOADLIBES_load_sock_ops += -lelf HOSTLOADLIBES_test_probe_write_user += -lelf HOSTLOADLIBES_trace_output += -lelf -lrt HOSTLOADLIBES_lathist += -lelf diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h index f4840b8bb8f9..d50ac342dc92 100644 --- a/samples/bpf/bpf_helpers.h +++ b/samples/bpf/bpf_helpers.h @@ -60,6 +60,9 @@ static unsigned long long (*bpf_get_prandom_u32)(void) = (void *) BPF_FUNC_get_prandom_u32; static int (*bpf_xdp_adjust_head)(void *ctx, int offset) = (void *) BPF_FUNC_xdp_adjust_head; +static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval, + int optlen) = + (void *) BPF_FUNC_setsockopt; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index a91c57dd8571..a4be7cfa6519 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -64,6 +64,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) bool is_perf_event = strncmp(event, "perf_event", 10) == 0; bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0; bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0; + bool is_sockops = strncmp(event, "sockops", 7) == 0; size_t insns_cnt = size / sizeof(struct bpf_insn); enum bpf_prog_type prog_type; char buf[256]; @@ -89,6 +90,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_type = BPF_PROG_TYPE_CGROUP_SKB; } else if (is_cgroup_sk) { prog_type = BPF_PROG_TYPE_CGROUP_SOCK; + } else if (is_sockops) { + prog_type = BPF_PROG_TYPE_SOCK_OPS; } else { printf("Unknown event '%s'\n", event); return -1; @@ -106,8 +109,11 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk) return 0; - if (is_socket) { - event += 6; + if (is_socket || is_sockops) { + if (is_socket) + event += 6; + else + event += 7; if (*event != '/') return 0; event++; @@ -560,7 +566,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) memcmp(shname, "xdp", 3) == 0 || memcmp(shname, "perf_event", 10) == 0 || memcmp(shname, "socket", 6) == 0 || - memcmp(shname, "cgroup/", 7) == 0) + memcmp(shname, "cgroup/", 7) == 0 || + memcmp(shname, "sockops", 7) == 0) load_and_attach(shname, data->d_buf, data->d_size); } diff --git a/samples/bpf/load_sock_ops.c b/samples/bpf/load_sock_ops.c new file mode 100644 index 000000000000..e5da6cf71a3e --- /dev/null +++ b/samples/bpf/load_sock_ops.c @@ -0,0 +1,97 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include "libbpf.h" +#include "bpf_load.h" +#include +#include +#include +#include + +static void usage(char *pname) +{ + printf("USAGE:\n %s [-l] \n", pname); + printf("\tLoad and attach a sock_ops program to the specified " + "cgroup\n"); + printf("\tIf \"-l\" is used, the program will continue to run\n"); + printf("\tprinting the BPF log buffer\n"); + printf("\tIf the specified filename does not end in \".o\", it\n"); + printf("\tappends \"_kern.o\" to the name\n"); + printf("\n"); + printf(" %s -r \n", pname); + printf("\tDetaches the currently attached sock_ops program\n"); + printf("\tfrom the specified cgroup\n"); + printf("\n"); + exit(1); +} + +int main(int argc, char **argv) +{ + int logFlag = 0; + int error = 0; + char *cg_path; + char fn[500]; + char *prog; + int cg_fd; + + if (argc < 3) + usage(argv[0]); + + if (!strcmp(argv[1], "-r")) { + cg_path = argv[2]; + cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY); + error = bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS); + if (error) { + printf("ERROR: bpf_prog_detach: %d (%s)\n", + error, strerror(errno)); + return 2; + } + return 0; + } else if (!strcmp(argv[1], "-h")) { + usage(argv[0]); + } else if (!strcmp(argv[1], "-l")) { + logFlag = 1; + if (argc < 4) + usage(argv[0]); + } + + prog = argv[argc - 1]; + cg_path = argv[argc - 2]; + if (strlen(prog) > 480) { + fprintf(stderr, "ERROR: program name too long (> 480 chars)\n"); + return 3; + } + cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY); + + if (!strcmp(prog + strlen(prog)-2, ".o")) + strcpy(fn, prog); + else + sprintf(fn, "%s_kern.o", prog); + if (logFlag) + printf("loading bpf file:%s\n", fn); + if (load_bpf_file(fn)) { + printf("ERROR: load_bpf_file failed for: %s\n", fn); + printf("%s", bpf_log_buf); + return 4; + } + if (logFlag) + printf("TCP BPF Loaded %s\n", fn); + + error = bpf_prog_attach(prog_fd[0], cg_fd, BPF_CGROUP_SOCK_OPS, 0); + if (error) { + printf("ERROR: bpf_prog_attach: %d (%s)\n", + error, strerror(errno)); + return 5; + } else if (logFlag) { + read_trace_pipe(); + } + + return error; +} diff --git a/samples/bpf/tcp_bufs_kern.c b/samples/bpf/tcp_bufs_kern.c new file mode 100644 index 000000000000..ee83bbabd17c --- /dev/null +++ b/samples/bpf/tcp_bufs_kern.c @@ -0,0 +1,86 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * BPF program to set initial receive window to 40 packets and send + * and receive buffers to 1.5MB. This would usually be done after + * doing appropriate checks that indicate the hosts are far enough + * away (i.e. large RTT). + * + * Use load_sock_ops to load this BPF program. + */ + +#include +#include +#include +#include +#include +#include "bpf_helpers.h" +#include "bpf_endian.h" + +#define DEBUG 1 + +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) + +SEC("sockops") +int bpf_bufs(struct bpf_sock_ops *skops) +{ + int bufsize = 1500000; + int rwnd_init = 40; + int rv = 0; + int op; + + /* For testing purposes, only execute rest of BPF program + * if neither port numberis 55601 + */ + if (bpf_ntohl(skops->remote_port) != 55601 && + skops->local_port != 55601) + return -1; + + op = (int) skops->op; + +#ifdef DEBUG + bpf_printk("Returning %d\n", rv); +#endif + + /* Usually there would be a check to insure the hosts are far + * from each other so it makes sense to increase buffer sizes + */ + switch (op) { + case BPF_SOCK_OPS_RWND_INIT: + rv = rwnd_init; + break; + case BPF_SOCK_OPS_TCP_CONNECT_CB: + /* Set sndbuf and rcvbuf of active connections */ + rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, + sizeof(bufsize)); + rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, + &bufsize, sizeof(bufsize)); + break; + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + /* Nothing to do */ + break; + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + /* Set sndbuf and rcvbuf of passive connections */ + rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, + sizeof(bufsize)); + rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, + &bufsize, sizeof(bufsize)); + break; + default: + rv = -1; + } +#ifdef DEBUG + bpf_printk("Returning %d\n", rv); +#endif + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/tcp_clamp_kern.c b/samples/bpf/tcp_clamp_kern.c new file mode 100644 index 000000000000..d68eadd9ca2d --- /dev/null +++ b/samples/bpf/tcp_clamp_kern.c @@ -0,0 +1,102 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Sample BPF program to set send and receive buffers to 150KB, sndcwnd clamp + * to 100 packets and SYN and SYN_ACK RTOs to 10ms when both hosts are within + * the same datacenter. For his example, we assume they are within the same + * datacenter when the first 5.5 bytes of their IPv6 addresses are the same. + * + * Use load_sock_ops to load this BPF program. + */ + +#include +#include +#include +#include +#include +#include "bpf_helpers.h" +#include "bpf_endian.h" + +#define DEBUG 1 + +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) + +SEC("sockops") +int bpf_clamp(struct bpf_sock_ops *skops) +{ + int bufsize = 150000; + int to_init = 10; + int clamp = 100; + int rv = 0; + int op; + + /* For testing purposes, only execute rest of BPF program + * if neither port numberis 55601 + */ + if (bpf_ntohl(skops->remote_port) != 55601 && skops->local_port != 55601) + return -1; + + op = (int) skops->op; + +#ifdef DEBUG + bpf_printk("BPF command: %d\n", op); +#endif + + /* Check that both hosts are within same datacenter. For this example + * it is the case when the first 5.5 bytes of their IPv6 addresses are + * the same. + */ + if (skops->family == AF_INET6 && + skops->local_ip6[0] == skops->remote_ip6[0] && + (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) == + (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) { + switch (op) { + case BPF_SOCK_OPS_TIMEOUT_INIT: + rv = to_init; + break; + case BPF_SOCK_OPS_TCP_CONNECT_CB: + /* Set sndbuf and rcvbuf of active connections */ + rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, + &bufsize, sizeof(bufsize)); + rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, + SO_RCVBUF, &bufsize, + sizeof(bufsize)); + break; + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + rv = bpf_setsockopt(skops, SOL_TCP, + TCP_BPF_SNDCWND_CLAMP, + &clamp, sizeof(clamp)); + break; + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + /* Set sndbuf and rcvbuf of passive connections */ + rv = bpf_setsockopt(skops, SOL_TCP, + TCP_BPF_SNDCWND_CLAMP, + &clamp, sizeof(clamp)); + rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, + SO_SNDBUF, &bufsize, + sizeof(bufsize)); + rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, + SO_RCVBUF, &bufsize, + sizeof(bufsize)); + break; + default: + rv = -1; + } + } else { + rv = -1; + } +#ifdef DEBUG + bpf_printk("Returning %d\n", rv); +#endif + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/tcp_cong_kern.c b/samples/bpf/tcp_cong_kern.c new file mode 100644 index 000000000000..dac15bce1fa9 --- /dev/null +++ b/samples/bpf/tcp_cong_kern.c @@ -0,0 +1,83 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * BPF program to set congestion control to dctcp when both hosts are + * in the same datacenter (as deteremined by IPv6 prefix). + * + * Use load_sock_ops to load this BPF program. + */ + +#include +#include +#include +#include +#include +#include +#include "bpf_helpers.h" +#include "bpf_endian.h" + +#define DEBUG 1 + +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) + +SEC("sockops") +int bpf_cong(struct bpf_sock_ops *skops) +{ + char cong[] = "dctcp"; + int rv = 0; + int op; + + /* For testing purposes, only execute rest of BPF program + * if neither port numberis 55601 + */ + if (bpf_ntohl(skops->remote_port) != 55601 && + skops->local_port != 55601) + return -1; + + op = (int) skops->op; + +#ifdef DEBUG + bpf_printk("BPF command: %d\n", op); +#endif + + /* Check if both hosts are in the same datacenter. For this + * example they are if the 1st 5.5 bytes in the IPv6 address + * are the same. + */ + if (skops->family == AF_INET6 && + skops->local_ip6[0] == skops->remote_ip6[0] && + (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) == + (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) { + switch (op) { + case BPF_SOCK_OPS_NEEDS_ECN: + rv = 1; + break; + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + rv = bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION, + cong, sizeof(cong)); + break; + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + rv = bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION, + cong, sizeof(cong)); + break; + default: + rv = -1; + } + } else { + rv = -1; + } +#ifdef DEBUG + bpf_printk("Returning %d\n", rv); +#endif + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/tcp_iw_kern.c b/samples/bpf/tcp_iw_kern.c new file mode 100644 index 000000000000..23c5122ef819 --- /dev/null +++ b/samples/bpf/tcp_iw_kern.c @@ -0,0 +1,88 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * BPF program to set initial congestion window and initial receive + * window to 40 packets and send and receive buffers to 1.5MB. This + * would usually be done after doing appropriate checks that indicate + * the hosts are far enough away (i.e. large RTT). + * + * Use load_sock_ops to load this BPF program. + */ + +#include +#include +#include +#include +#include +#include "bpf_helpers.h" +#include "bpf_endian.h" + +#define DEBUG 1 + +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) + +SEC("sockops") +int bpf_iw(struct bpf_sock_ops *skops) +{ + int bufsize = 1500000; + int rwnd_init = 40; + int iw = 40; + int rv = 0; + int op; + + /* For testing purposes, only execute rest of BPF program + * if neither port numberis 55601 + */ + if (bpf_ntohl(skops->remote_port) != 55601 && + skops->local_port != 55601) + return -1; + + op = (int) skops->op; + +#ifdef DEBUG + bpf_printk("BPF command: %d\n", op); +#endif + + /* Usually there would be a check to insure the hosts are far + * from each other so it makes sense to increase buffer sizes + */ + switch (op) { + case BPF_SOCK_OPS_RWND_INIT: + rv = rwnd_init; + break; + case BPF_SOCK_OPS_TCP_CONNECT_CB: + /* Set sndbuf and rcvbuf of active connections */ + rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, + sizeof(bufsize)); + rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, + &bufsize, sizeof(bufsize)); + break; + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + rv = bpf_setsockopt(skops, SOL_TCP, TCP_BPF_IW, &iw, + sizeof(iw)); + break; + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + /* Set sndbuf and rcvbuf of passive connections */ + rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, + sizeof(bufsize)); + rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, + &bufsize, sizeof(bufsize)); + break; + default: + rv = -1; + } +#ifdef DEBUG + bpf_printk("Returning %d\n", rv); +#endif + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/tcp_rwnd_kern.c b/samples/bpf/tcp_rwnd_kern.c new file mode 100644 index 000000000000..3f2a228f81ce --- /dev/null +++ b/samples/bpf/tcp_rwnd_kern.c @@ -0,0 +1,69 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * BPF program to set initial receive window to 40 packets when using IPv6 + * and the first 5.5 bytes of the IPv6 addresses are not the same (in this + * example that means both hosts are not the same datacenter). + * + * Use load_sock_ops to load this BPF program. + */ + +#include +#include +#include +#include +#include +#include "bpf_helpers.h" +#include "bpf_endian.h" + +#define DEBUG 1 + +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) + +SEC("sockops") +int bpf_rwnd(struct bpf_sock_ops *skops) +{ + int rv = -1; + int op; + + /* For testing purposes, only execute rest of BPF program + * if neither port numberis 55601 + */ + if (bpf_ntohl(skops->remote_port) != + 55601 && skops->local_port != 55601) + return -1; + + op = (int) skops->op; + +#ifdef DEBUG + bpf_printk("BPF command: %d\n", op); +#endif + + /* Check for RWND_INIT operation and IPv6 addresses */ + if (op == BPF_SOCK_OPS_RWND_INIT && + skops->family == AF_INET6) { + + /* If the first 5.5 bytes of the IPv6 address are not the same + * then both hosts are not in the same datacenter + * so use a larger initial advertized window (40 packets) + */ + if (skops->local_ip6[0] != skops->remote_ip6[0] || + (bpf_ntohl(skops->local_ip6[1]) & 0xfffff000) != + (bpf_ntohl(skops->remote_ip6[1]) & 0xfffff000)) + rv = 40; + } +#ifdef DEBUG + bpf_printk("Returning %d\n", rv); +#endif + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/tcp_synrto_kern.c b/samples/bpf/tcp_synrto_kern.c new file mode 100644 index 000000000000..3c3fc83d81cb --- /dev/null +++ b/samples/bpf/tcp_synrto_kern.c @@ -0,0 +1,69 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * BPF program to set SYN and SYN-ACK RTOs to 10ms when using IPv6 addresses + * and the first 5.5 bytes of the IPv6 addresses are the same (in this example + * that means both hosts are in the same datacenter). + * + * Use load_sock_ops to load this BPF program. + */ + +#include +#include +#include +#include +#include +#include "bpf_helpers.h" +#include "bpf_endian.h" + +#define DEBUG 1 + +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) + +SEC("sockops") +int bpf_synrto(struct bpf_sock_ops *skops) +{ + int rv = -1; + int op; + + /* For testing purposes, only execute rest of BPF program + * if neither port numberis 55601 + */ + if (bpf_ntohl(skops->remote_port) != 55601 && + skops->local_port != 55601) + return -1; + + op = (int) skops->op; + +#ifdef DEBUG + bpf_printk("BPF command: %d\n", op); +#endif + + /* Check for TIMEOUT_INIT operation and IPv6 addresses */ + if (op == BPF_SOCK_OPS_TIMEOUT_INIT && + skops->family == AF_INET6) { + + /* If the first 5.5 bytes of the IPv6 address are the same + * then both hosts are in the same datacenter + * so use an RTO of 10ms + */ + if (skops->local_ip6[0] == skops->remote_ip6[0] && + (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) == + (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) + rv = 10; + } +#ifdef DEBUG + bpf_printk("Returning %d\n", rv); +#endif + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f94b48b168dc..284b3661f1df 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -120,12 +120,14 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_IN, BPF_PROG_TYPE_LWT_OUT, BPF_PROG_TYPE_LWT_XMIT, + BPF_PROG_TYPE_SOCK_OPS, }; enum bpf_attach_type { BPF_CGROUP_INET_INGRESS, BPF_CGROUP_INET_EGRESS, BPF_CGROUP_INET_SOCK_CREATE, + BPF_CGROUP_SOCK_OPS, __MAX_BPF_ATTACH_TYPE }; @@ -518,6 +520,17 @@ union bpf_attr { * Set full skb->hash. * @skb: pointer to skb * @hash: hash to set + * + * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen) + * Calls setsockopt. Not all opts are available, only those with + * integer optvals plus TCP_CONGESTION. + * Supported levels: SOL_SOCKET and IPROTO_TCP + * @bpf_socket: pointer to bpf_socket + * @level: SOL_SOCKET or IPROTO_TCP + * @optname: option name + * @optval: pointer to option value + * @optlen: length of optval in byes + * Return: 0 or negative error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -568,7 +581,8 @@ union bpf_attr { FN(probe_read_str), \ FN(get_socket_cookie), \ FN(get_socket_uid), \ - FN(set_hash), + FN(set_hash), \ + FN(setsockopt), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -720,4 +734,54 @@ struct bpf_map_info { __u32 map_flags; } __attribute__((aligned(8))); +/* User bpf_sock_ops struct to access socket values and specify request ops + * and their replies. + * New fields can only be added at the end of this structure + */ +struct bpf_sock_ops { + __u32 op; + union { + __u32 reply; + __u32 replylong[4]; + }; + __u32 family; + __u32 remote_ip4; + __u32 local_ip4; + __u32 remote_ip6[4]; + __u32 local_ip6[4]; + __u32 remote_port; + __u32 local_port; +}; + +/* List of known BPF sock_ops operators. + * New entries can only be added at the end + */ +enum { + BPF_SOCK_OPS_VOID, + BPF_SOCK_OPS_TIMEOUT_INIT, /* Should return SYN-RTO value to use or + * -1 if default value should be used + */ + BPF_SOCK_OPS_RWND_INIT, /* Should return initial advertized + * window (in packets) or -1 if default + * value should be used + */ + BPF_SOCK_OPS_TCP_CONNECT_CB, /* Calls BPF program right before an + * active connection is initialized + */ + BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, /* Calls BPF program when an + * active connection is + * established + */ + BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, /* Calls BPF program when a + * passive connection is + * established + */ + BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control + * needs ECN + */ +}; + +#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ +#define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */ + #endif /* _UAPI__LINUX_BPF_H__ */