Merge branch 'tcp_skb_cb'
Eric Dumazet says: ==================== tcp: better TCP_SKB_CB layout TCP had the assumption that IPCB and IP6CB are first members of skb->cb[]. This is fine, except that IPCB/IP6CB are used in TCP for a very short time in input path. What really matters for TCP stack is to get skb->next, TCP_SKB_CB(skb)->seq, and TCP_SKB_CB(skb)->end_seq in the same cache line. skb that are immediately consumed do not care because whole skb->cb[] is hot in cpu cache, while skb that sit in socket write queue or receive queues do not need TCP_SKB_CB(skb)->header at all. This patch set implements the prereq for IPv4, IPv6, and TCP to make this possible. This makes TCP more efficient. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
commit
dc83d4d8f6
|
@ -180,8 +180,10 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg)
|
|||
return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0;
|
||||
}
|
||||
|
||||
void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
|
||||
__be32 saddr, const struct ip_reply_arg *arg,
|
||||
void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
|
||||
const struct ip_options *sopt,
|
||||
__be32 daddr, __be32 saddr,
|
||||
const struct ip_reply_arg *arg,
|
||||
unsigned int len);
|
||||
|
||||
#define IP_INC_STATS(net, field) SNMP_INC_STATS64((net)->mib.ip_statistics, field)
|
||||
|
@ -511,7 +513,14 @@ int ip_forward(struct sk_buff *skb);
|
|||
|
||||
void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
|
||||
__be32 daddr, struct rtable *rt, int is_frag);
|
||||
int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb);
|
||||
|
||||
int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb,
|
||||
const struct ip_options *sopt);
|
||||
static inline int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
|
||||
{
|
||||
return __ip_options_echo(dopt, skb, &IPCB(skb)->opt);
|
||||
}
|
||||
|
||||
void ip_options_fragment(struct sk_buff *skb);
|
||||
int ip_options_compile(struct net *net, struct ip_options *opt,
|
||||
struct sk_buff *skb);
|
||||
|
|
|
@ -288,7 +288,8 @@ struct ipv6_txoptions *ipv6_renew_options(struct sock *sk,
|
|||
struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
|
||||
struct ipv6_txoptions *opt);
|
||||
|
||||
bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb);
|
||||
bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
|
||||
const struct inet6_skb_parm *opt);
|
||||
|
||||
static inline bool ipv6_accept_ra(struct inet6_dev *idev)
|
||||
{
|
||||
|
|
|
@ -696,12 +696,6 @@ static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
|
|||
* If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately.
|
||||
*/
|
||||
struct tcp_skb_cb {
|
||||
union {
|
||||
struct inet_skb_parm h4;
|
||||
#if IS_ENABLED(CONFIG_IPV6)
|
||||
struct inet6_skb_parm h6;
|
||||
#endif
|
||||
} header; /* For incoming frames */
|
||||
__u32 seq; /* Starting sequence number */
|
||||
__u32 end_seq; /* SEQ + FIN + SYN + datalen */
|
||||
__u32 tcp_tw_isn; /* isn chosen by tcp_timewait_state_process() */
|
||||
|
@ -720,6 +714,12 @@ struct tcp_skb_cb {
|
|||
__u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */
|
||||
/* 1 byte hole */
|
||||
__u32 ack_seq; /* Sequence number ACK'd */
|
||||
union {
|
||||
struct inet_skb_parm h4;
|
||||
#if IS_ENABLED(CONFIG_IPV6)
|
||||
struct inet6_skb_parm h6;
|
||||
#endif
|
||||
} header; /* For incoming frames */
|
||||
};
|
||||
|
||||
#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
|
||||
|
|
|
@ -404,7 +404,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
|
|||
ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
|
||||
ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
|
||||
|
||||
if (ipv6_opt_accepted(sk, skb) ||
|
||||
if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) ||
|
||||
np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
|
||||
np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
|
||||
atomic_inc(&skb->users);
|
||||
|
|
|
@ -87,17 +87,15 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
|
|||
* NOTE: dopt cannot point to skb.
|
||||
*/
|
||||
|
||||
int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
|
||||
int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb,
|
||||
const struct ip_options *sopt)
|
||||
{
|
||||
const struct ip_options *sopt;
|
||||
unsigned char *sptr, *dptr;
|
||||
int soffset, doffset;
|
||||
int optlen;
|
||||
|
||||
memset(dopt, 0, sizeof(struct ip_options));
|
||||
|
||||
sopt = &(IPCB(skb)->opt);
|
||||
|
||||
if (sopt->optlen == 0)
|
||||
return 0;
|
||||
|
||||
|
|
|
@ -1522,8 +1522,10 @@ static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
|
|||
.uc_ttl = -1,
|
||||
};
|
||||
|
||||
void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
|
||||
__be32 saddr, const struct ip_reply_arg *arg,
|
||||
void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
|
||||
const struct ip_options *sopt,
|
||||
__be32 daddr, __be32 saddr,
|
||||
const struct ip_reply_arg *arg,
|
||||
unsigned int len)
|
||||
{
|
||||
struct ip_options_data replyopts;
|
||||
|
@ -1534,7 +1536,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
|
|||
struct sock *sk;
|
||||
struct inet_sock *inet;
|
||||
|
||||
if (ip_options_echo(&replyopts.opt.opt, skb))
|
||||
if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
|
||||
return;
|
||||
|
||||
ipc.addr = daddr;
|
||||
|
|
|
@ -681,8 +681,9 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
|
|||
|
||||
net = dev_net(skb_dst(skb)->dev);
|
||||
arg.tos = ip_hdr(skb)->tos;
|
||||
ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
|
||||
ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
|
||||
ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
|
||||
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
|
||||
&arg, arg.iov[0].iov_len);
|
||||
|
||||
TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
|
||||
TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
|
||||
|
@ -764,8 +765,9 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
|
|||
if (oif)
|
||||
arg.bound_dev_if = oif;
|
||||
arg.tos = tos;
|
||||
ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
|
||||
ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
|
||||
ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
|
||||
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
|
||||
&arg, arg.iov[0].iov_len);
|
||||
|
||||
TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
|
||||
}
|
||||
|
@ -884,18 +886,16 @@ EXPORT_SYMBOL(tcp_syn_flood_action);
|
|||
*/
|
||||
static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
|
||||
{
|
||||
const struct ip_options *opt = &(IPCB(skb)->opt);
|
||||
const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
|
||||
struct ip_options_rcu *dopt = NULL;
|
||||
|
||||
if (opt && opt->optlen) {
|
||||
int opt_size = sizeof(*dopt) + opt->optlen;
|
||||
|
||||
dopt = kmalloc(opt_size, GFP_ATOMIC);
|
||||
if (dopt) {
|
||||
if (ip_options_echo(&dopt->opt, skb)) {
|
||||
kfree(dopt);
|
||||
dopt = NULL;
|
||||
}
|
||||
if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) {
|
||||
kfree(dopt);
|
||||
dopt = NULL;
|
||||
}
|
||||
}
|
||||
return dopt;
|
||||
|
@ -1429,7 +1429,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
|
|||
|
||||
#ifdef CONFIG_SYN_COOKIES
|
||||
if (!th->syn)
|
||||
sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
|
||||
sk = cookie_v4_check(sk, skb, &TCP_SKB_CB(skb)->header.h4.opt);
|
||||
#endif
|
||||
return sk;
|
||||
}
|
||||
|
@ -1634,6 +1634,13 @@ int tcp_v4_rcv(struct sk_buff *skb)
|
|||
|
||||
th = tcp_hdr(skb);
|
||||
iph = ip_hdr(skb);
|
||||
/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
|
||||
* barrier() makes sure compiler won't play fool^Waliasing games.
|
||||
*/
|
||||
memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
|
||||
sizeof(struct inet_skb_parm));
|
||||
barrier();
|
||||
|
||||
TCP_SKB_CB(skb)->seq = ntohl(th->seq);
|
||||
TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
|
||||
skb->len - th->doff * 4);
|
||||
|
|
|
@ -974,6 +974,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
|
|||
|
||||
/* Our usage of tstamp should remain private */
|
||||
skb->tstamp.tv64 = 0;
|
||||
|
||||
/* Cleanup our debris for IP stacks */
|
||||
memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
|
||||
sizeof(struct inet6_skb_parm)));
|
||||
|
||||
err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
|
||||
|
||||
if (likely(err <= 0))
|
||||
|
|
|
@ -672,10 +672,10 @@ int inet6_sk_rebuild_header(struct sock *sk)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header);
|
||||
|
||||
bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb)
|
||||
bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
|
||||
const struct inet6_skb_parm *opt)
|
||||
{
|
||||
const struct ipv6_pinfo *np = inet6_sk(sk);
|
||||
const struct inet6_skb_parm *opt = IP6CB(skb);
|
||||
|
||||
if (np->rxopt.all) {
|
||||
if ((opt->hop && (np->rxopt.bits.hopopts ||
|
||||
|
|
|
@ -203,7 +203,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
|
|||
ireq->ir_num = ntohs(th->dest);
|
||||
ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
|
||||
ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
|
||||
if (ipv6_opt_accepted(sk, skb) ||
|
||||
if (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) ||
|
||||
np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
|
||||
np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
|
||||
atomic_inc(&skb->users);
|
||||
|
|
|
@ -742,7 +742,8 @@ static void tcp_v6_init_req(struct request_sock *req, struct sock *sk,
|
|||
ireq->ir_iif = inet6_iif(skb);
|
||||
|
||||
if (!TCP_SKB_CB(skb)->tcp_tw_isn &&
|
||||
(ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo ||
|
||||
(ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) ||
|
||||
np->rxopt.bits.rxinfo ||
|
||||
np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim ||
|
||||
np->rxopt.bits.rxohlim || np->repflow)) {
|
||||
atomic_inc(&skb->users);
|
||||
|
@ -1367,7 +1368,7 @@ ipv6_pktoptions:
|
|||
np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb));
|
||||
if (np->repflow)
|
||||
np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb));
|
||||
if (ipv6_opt_accepted(sk, opt_skb)) {
|
||||
if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) {
|
||||
skb_set_owner_r(opt_skb, sk);
|
||||
opt_skb = xchg(&np->pktoptions, opt_skb);
|
||||
} else {
|
||||
|
@ -1411,6 +1412,13 @@ static int tcp_v6_rcv(struct sk_buff *skb)
|
|||
|
||||
th = tcp_hdr(skb);
|
||||
hdr = ipv6_hdr(skb);
|
||||
/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
|
||||
* barrier() makes sure compiler won't play fool^Waliasing games.
|
||||
*/
|
||||
memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb),
|
||||
sizeof(struct inet6_skb_parm));
|
||||
barrier();
|
||||
|
||||
TCP_SKB_CB(skb)->seq = ntohl(th->seq);
|
||||
TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
|
||||
skb->len - th->doff*4);
|
||||
|
|
Loading…
Reference in New Issue