From 1365e547c6bb6b6b137bc40a189675503baa037b Mon Sep 17 00:00:00 2001 From: Alexander Alemayhu Date: Tue, 3 Jan 2017 17:13:20 +0100 Subject: [PATCH 01/15] xfrm: trivial typos o s/descentant/descendant o s/workarbound/workaround Signed-off-by: Alexander Alemayhu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 2 +- net/xfrm/xfrm_state.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 177e208e8ff5..99ad1af2927f 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -330,7 +330,7 @@ void xfrm_policy_destroy(struct xfrm_policy *policy) } EXPORT_SYMBOL(xfrm_policy_destroy); -/* Rule must be locked. Release descentant resources, announce +/* Rule must be locked. Release descendant resources, announce * entry dead. The rule must be unlinked from lists to the moment. */ diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 64e3c82eedf6..c5cf4d611aab 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -409,7 +409,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) if (x->xflags & XFRM_SOFT_EXPIRE) { /* enter hard expire without soft expire first?! * setting a new date could trigger this. - * workarbound: fix x->curflt.add_time by below: + * workaround: fix x->curflt.add_time by below: */ x->curlft.add_time = now - x->saved_tmo - 1; tmo = x->lft.hard_add_expires_seconds - x->saved_tmo; From b3b73b8e6df685ba61476b256f98eff1d650c199 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 5 Jan 2017 13:23:58 +0100 Subject: [PATCH 02/15] xfrm: state: do not acquire lock in get_mtu helpers Once flow cache gets removed the mtu initialisation happens for every skb that gets an xfrm attached, so this lock starts to show up in perf. It is not obvious why this lock is required -- the caller holds reference on the state struct, type->destructor is only called from the state gc worker (all state structs on gc list must have refcount 0). xfrm_init_state already has been called (else private data accessed by type->get_mtu() would not be set up). So just remove the lock -- the race on the state (DEAD?) doesn't matter (could change right after dropping the lock too). Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index c5cf4d611aab..6b3366fef019 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2000,16 +2000,13 @@ EXPORT_SYMBOL(xfrm_state_delete_tunnel); int xfrm_state_mtu(struct xfrm_state *x, int mtu) { - int res; + const struct xfrm_type *type = READ_ONCE(x->type); - spin_lock_bh(&x->lock); if (x->km.state == XFRM_STATE_VALID && - x->type && x->type->get_mtu) - res = x->type->get_mtu(x, mtu); - else - res = mtu - x->props.header_len; - spin_unlock_bh(&x->lock); - return res; + type && type->get_mtu) + return type->get_mtu(x, mtu); + + return mtu - x->props.header_len; } int __xfrm_init_state(struct xfrm_state *x, bool init_replay) From 961a9c0a4a9783e47f8cb76134b2303c4872380b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 9 Jan 2017 14:20:45 +0100 Subject: [PATCH 03/15] xfrm: remove unused function Has been ifdef'd out for more than 10 years, remove it. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_state.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 542074c00c78..d6660a8c0ea5 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -90,11 +90,3 @@ void __init xfrm4_state_init(void) { xfrm_state_register_afinfo(&xfrm4_state_afinfo); } - -#if 0 -void __exit xfrm4_state_fini(void) -{ - xfrm_state_unregister_afinfo(&xfrm4_state_afinfo); -} -#endif /* 0 */ - From 423826a7b104724a16676a75d597a35d3bdb18b8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 9 Jan 2017 14:20:46 +0100 Subject: [PATCH 04/15] xfrm: avoid rcu sparse warning xfrm/xfrm_state.c:1973:21: error: incompatible types in comparison expression (different address spaces) Harmless, but lets fix it to reduce the noise. While at it, get rid of unneeded NULL check, its never hit: net/ipv4/xfrm4_state.c: xfrm_state_register_afinfo(&xfrm4_state_afinfo); net/ipv6/xfrm6_state.c: return xfrm_state_register_afinfo(&xfrm6_state_afinfo); net/ipv6/xfrm6_state.c: xfrm_state_unregister_afinfo(&xfrm6_state_afinfo); ... are the only callsites. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 6b3366fef019..57e9578c35e2 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1932,10 +1932,10 @@ EXPORT_SYMBOL(xfrm_unregister_km); int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo) { int err = 0; - if (unlikely(afinfo == NULL)) - return -EINVAL; - if (unlikely(afinfo->family >= NPROTO)) + + if (WARN_ON(afinfo->family >= NPROTO)) return -EAFNOSUPPORT; + spin_lock_bh(&xfrm_state_afinfo_lock); if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL)) err = -EEXIST; @@ -1948,14 +1948,14 @@ EXPORT_SYMBOL(xfrm_state_register_afinfo); int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo) { - int err = 0; - if (unlikely(afinfo == NULL)) - return -EINVAL; - if (unlikely(afinfo->family >= NPROTO)) + int err = 0, family = afinfo->family; + + if (WARN_ON(family >= NPROTO)) return -EAFNOSUPPORT; + spin_lock_bh(&xfrm_state_afinfo_lock); if (likely(xfrm_state_afinfo[afinfo->family] != NULL)) { - if (unlikely(xfrm_state_afinfo[afinfo->family] != afinfo)) + if (rcu_access_pointer(xfrm_state_afinfo[family]) != afinfo) err = -EINVAL; else RCU_INIT_POINTER(xfrm_state_afinfo[afinfo->family], NULL); From af5d27c4e12b804c065c0e7c87507fea5683dab4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 9 Jan 2017 14:20:47 +0100 Subject: [PATCH 05/15] xfrm: remove xfrm_state_put_afinfo commit 44abdc3047aecafc141dfbaf1ed ("xfrm: replace rwlock on xfrm_state_afinfo with rcu") made xfrm_state_put_afinfo equivalent to rcu_read_unlock. Use spatch to replace it with direct calls to rcu_read_unlock: @@ struct xfrm_state_afinfo *a; @@ - xfrm_state_put_afinfo(a); + rcu_read_unlock(); old: text data bss dec hex filename 22570 72 424 23066 5a1a xfrm_state.o 1612 0 0 1612 64c xfrm_output.o new: 22554 72 424 23050 5a0a xfrm_state.o 1596 0 0 1596 63c xfrm_output.o Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - net/xfrm/xfrm_output.c | 8 +++----- net/xfrm/xfrm_state.c | 31 +++++++++++++------------------ 3 files changed, 16 insertions(+), 24 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 31947b9c21d6..957d0cc30691 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -343,7 +343,6 @@ struct xfrm_state_afinfo { int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo); int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo); struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family); -void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo); struct xfrm_input_afinfo { unsigned int family; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 637387bbaaea..8ba29fe58352 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -246,10 +246,8 @@ void xfrm_local_error(struct sk_buff *skb, int mtu) return; afinfo = xfrm_state_get_afinfo(proto); - if (!afinfo) - return; - - afinfo->local_error(skb, mtu); - xfrm_state_put_afinfo(afinfo); + if (afinfo) + afinfo->local_error(skb, mtu); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(xfrm_local_error); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 57e9578c35e2..783084484582 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -192,7 +192,7 @@ int xfrm_register_type(const struct xfrm_type *type, unsigned short family) else err = -EEXIST; spin_unlock_bh(&xfrm_type_lock); - xfrm_state_put_afinfo(afinfo); + rcu_read_unlock(); return err; } EXPORT_SYMBOL(xfrm_register_type); @@ -213,7 +213,7 @@ int xfrm_unregister_type(const struct xfrm_type *type, unsigned short family) else typemap[type->proto] = NULL; spin_unlock_bh(&xfrm_type_lock); - xfrm_state_put_afinfo(afinfo); + rcu_read_unlock(); return err; } EXPORT_SYMBOL(xfrm_unregister_type); @@ -235,13 +235,13 @@ retry: if (unlikely(type && !try_module_get(type->owner))) type = NULL; if (!type && !modload_attempted) { - xfrm_state_put_afinfo(afinfo); + rcu_read_unlock(); request_module("xfrm-type-%d-%d", family, proto); modload_attempted = 1; goto retry; } - xfrm_state_put_afinfo(afinfo); + rcu_read_unlock(); return type; } @@ -280,7 +280,7 @@ int xfrm_register_mode(struct xfrm_mode *mode, int family) out: spin_unlock_bh(&xfrm_mode_lock); - xfrm_state_put_afinfo(afinfo); + rcu_read_unlock(); return err; } EXPORT_SYMBOL(xfrm_register_mode); @@ -308,7 +308,7 @@ int xfrm_unregister_mode(struct xfrm_mode *mode, int family) } spin_unlock_bh(&xfrm_mode_lock); - xfrm_state_put_afinfo(afinfo); + rcu_read_unlock(); return err; } EXPORT_SYMBOL(xfrm_unregister_mode); @@ -331,13 +331,13 @@ retry: if (unlikely(mode && !try_module_get(mode->owner))) mode = NULL; if (!mode && !modload_attempted) { - xfrm_state_put_afinfo(afinfo); + rcu_read_unlock(); request_module("xfrm-mode-%d-%d", family, encap); modload_attempted = 1; goto retry; } - xfrm_state_put_afinfo(afinfo); + rcu_read_unlock(); return mode; } @@ -651,13 +651,13 @@ xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl, afinfo->init_tempsel(&x->sel, fl); if (family != tmpl->encap_family) { - xfrm_state_put_afinfo(afinfo); + rcu_read_unlock(); afinfo = xfrm_state_get_afinfo(tmpl->encap_family); if (!afinfo) return -1; } afinfo->init_temprop(x, tmpl, daddr, saddr); - xfrm_state_put_afinfo(afinfo); + rcu_read_unlock(); return 0; } @@ -1474,7 +1474,7 @@ xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n, if (afinfo->tmpl_sort) err = afinfo->tmpl_sort(dst, src, n); spin_unlock_bh(&net->xfrm.xfrm_state_lock); - xfrm_state_put_afinfo(afinfo); + rcu_read_unlock(); return err; } EXPORT_SYMBOL(xfrm_tmpl_sort); @@ -1494,7 +1494,7 @@ xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n, if (afinfo->state_sort) err = afinfo->state_sort(dst, src, n); spin_unlock_bh(&net->xfrm.xfrm_state_lock); - xfrm_state_put_afinfo(afinfo); + rcu_read_unlock(); return err; } EXPORT_SYMBOL(xfrm_state_sort); @@ -1978,11 +1978,6 @@ struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family) return afinfo; } -void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo) -{ - rcu_read_unlock(); -} - /* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */ void xfrm_state_delete_tunnel(struct xfrm_state *x) { @@ -2025,7 +2020,7 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay) if (afinfo->init_flags) err = afinfo->init_flags(x); - xfrm_state_put_afinfo(afinfo); + rcu_read_unlock(); if (err) goto error; From 711059b9752ad09ae6bcd4be8e48d30e5db483d8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 9 Jan 2017 14:20:48 +0100 Subject: [PATCH 06/15] xfrm: add and use xfrm_state_afinfo_get_rcu xfrm_init_tempstate is always called from within rcu read side section. We can thus use a simpler function that doesn't call rcu_read_lock again. While at it, also make xfrm_init_tempstate return value void, the return value was never tested. A followup patch will replace remaining callers of xfrm_state_get_afinfo with xfrm_state_afinfo_get_rcu variant and then remove the 'old' get_afinfo interface. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 + net/xfrm/xfrm_state.c | 25 +++++++++++++++---------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 957d0cc30691..c52197cf51dc 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -343,6 +343,7 @@ struct xfrm_state_afinfo { int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo); int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo); struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family); +struct xfrm_state_afinfo *xfrm_state_afinfo_get_rcu(unsigned int family); struct xfrm_input_afinfo { unsigned int family; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 783084484582..b5dad899fb0e 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -639,26 +639,23 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si) } EXPORT_SYMBOL(xfrm_sad_getinfo); -static int +static void xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl, const struct xfrm_tmpl *tmpl, const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family) { - struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); - if (!afinfo) - return -1; - afinfo->init_tempsel(&x->sel, fl); + struct xfrm_state_afinfo *afinfo = xfrm_state_afinfo_get_rcu(family); + + if (afinfo) + afinfo->init_tempsel(&x->sel, fl); if (family != tmpl->encap_family) { - rcu_read_unlock(); - afinfo = xfrm_state_get_afinfo(tmpl->encap_family); + afinfo = xfrm_state_afinfo_get_rcu(tmpl->encap_family); if (!afinfo) - return -1; + return; } afinfo->init_temprop(x, tmpl, daddr, saddr); - rcu_read_unlock(); - return 0; } static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, @@ -1966,6 +1963,14 @@ int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo) } EXPORT_SYMBOL(xfrm_state_unregister_afinfo); +struct xfrm_state_afinfo *xfrm_state_afinfo_get_rcu(unsigned int family) +{ + if (unlikely(family >= NPROTO)) + return NULL; + + return rcu_dereference(xfrm_state_afinfo[family]); +} + struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family) { struct xfrm_state_afinfo *afinfo; From 75cda62d9ca2cd3fab0412d438fcd1bfa0580b3d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 9 Jan 2017 14:20:49 +0100 Subject: [PATCH 07/15] xfrm: state: simplify rcu_read_unlock handling in two spots Instead of: if (foo) { unlock(); return bar(); } unlock(); do: unlock(); if (foo) return bar(); This is ok because rcu protected structure is only dereferenced before the conditional. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index b5dad899fb0e..a62097e640b5 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -231,17 +231,18 @@ retry: return NULL; typemap = afinfo->type_map; - type = typemap[proto]; + type = READ_ONCE(typemap[proto]); if (unlikely(type && !try_module_get(type->owner))) type = NULL; + + rcu_read_unlock(); + if (!type && !modload_attempted) { - rcu_read_unlock(); request_module("xfrm-type-%d-%d", family, proto); modload_attempted = 1; goto retry; } - rcu_read_unlock(); return type; } @@ -327,17 +328,17 @@ retry: if (unlikely(afinfo == NULL)) return NULL; - mode = afinfo->mode_map[encap]; + mode = READ_ONCE(afinfo->mode_map[encap]); if (unlikely(mode && !try_module_get(mode->owner))) mode = NULL; + + rcu_read_unlock(); if (!mode && !modload_attempted) { - rcu_read_unlock(); request_module("xfrm-mode-%d-%d", family, encap); modload_attempted = 1; goto retry; } - rcu_read_unlock(); return mode; } From 3819a35fdb8f82428c2b5190f195c4e0b8c427bf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 13 Jan 2017 14:55:14 +0100 Subject: [PATCH 08/15] xfrm: fix possible null deref in xfrm_init_tempstate Dan reports following smatch warning: net/xfrm/xfrm_state.c:659 error: we previously assumed 'afinfo' could be null (see line 651) 649 struct xfrm_state_afinfo *afinfo = xfrm_state_afinfo_get_rcu(family); 651 if (afinfo) ... 658 } 659 afinfo->init_temprop(x, tmpl, daddr, saddr); I am resonably sure afinfo cannot be NULL here. xfrm_state4.c and state6.c are both part of ipv4/ipv6 (depends on CONFIG_XFRM, a boolean) but even if ipv6 is a module state6.c can't be removed (ipv6 lacks module_exit so it cannot be removed). The only callers for xfrm6_fini that leads to state backend unregister are error unwinding paths that can be called during ipv6 init function. So after ipv6 module is loaded successfully the state backend cannot go away anymore. The family value from policy lookup path is taken from dst_entry, so that should always be AF_INET(6). However, since this silences the warning and avoids readers of this code wondering about possible null deref it seems preferrable to be defensive and just add the old check back. Fixes: 711059b9752ad0 ("xfrm: add and use xfrm_state_afinfo_get_rcu") Reported-by: Dan Carpenter Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index a62097e640b5..5a597dbbe564 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -648,8 +648,10 @@ xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl, { struct xfrm_state_afinfo *afinfo = xfrm_state_afinfo_get_rcu(family); - if (afinfo) - afinfo->init_tempsel(&x->sel, fl); + if (!afinfo) + return; + + afinfo->init_tempsel(&x->sel, fl); if (family != tmpl->encap_family) { afinfo = xfrm_state_afinfo_get_rcu(tmpl->encap_family); From ebd89a2d0675f1325c2be5b7576fd8cb7e8defd0 Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Mon, 16 Jan 2017 13:17:55 +0200 Subject: [PATCH 09/15] IPsec: do not ignore crypto err in ah4 input ah4 input processing uses the asynchronous hash crypto API which supplies an error code as part of the operation completion but the error code was being ignored. Treat a crypto API error indication as a verification failure. While a crypto API reported error would almost certainly result in a memcpy of the digest failing anyway and thus the security risk seems minor, performing a memory compare on what might be uninitialized memory is wrong. Signed-off-by: Gilad Ben-Yossef Signed-off-by: Steffen Klassert --- net/ipv4/ah4.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index f2a71025a770..22377c8ff14b 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -270,6 +270,9 @@ static void ah_input_done(struct crypto_async_request *base, int err) int ihl = ip_hdrlen(skb); int ah_hlen = (ah->hdrlen + 2) << 2; + if (err) + goto out; + work_iph = AH_SKB_CB(skb)->tmp; auth_data = ah_tmp_auth(work_iph, ihl); icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len); From 726282aa6bbe627b3876afc27f88172e37b1d01d Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Mon, 16 Jan 2017 13:17:56 +0200 Subject: [PATCH 10/15] IPsec: do not ignore crypto err in ah6 input ah6 input processing uses the asynchronous hash crypto API which supplies an error code as part of the operation completion but the error code was being ignored. Treat a crypto API error indication as a verification failure. While a crypto API reported error would almost certainly result in a memcpy of the digest failing anyway and thus the security risk seems minor, performing a memory compare on what might be uninitialized memory is wrong. Signed-off-by: Gilad Ben-Yossef Signed-off-by: Steffen Klassert --- net/ipv6/ah6.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 189eb10b742d..dda6035e3b84 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -474,6 +474,9 @@ static void ah6_input_done(struct crypto_async_request *base, int err) int hdr_len = skb_network_header_len(skb); int ah_hlen = (ah->hdrlen + 2) << 2; + if (err) + goto out; + work_iph = AH_SKB_CB(skb)->tmp; auth_data = ah_tmp_auth(work_iph, hdr_len); icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len); From cac2661c53f35cbe651bef9b07026a5a05ab8ce0 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 17 Jan 2017 10:22:57 +0100 Subject: [PATCH 11/15] esp4: Avoid skb_cow_data whenever possible This patch tries to avoid skb_cow_data on esp4. On the encrypt side we add the IPsec tailbits to the linear part of the buffer if there is space on it. If there is no space on the linear part, we add a page fragment with the tailbits to the buffer and use separate src and dst scatterlists. On the decrypt side, we leave the buffer as it is if it is not cloned. With this, we can avoid a linearization of the buffer in most of the cases. Joint work with: Sowmini Varadhan Ilan Tayari Signed-off-by: Sowmini Varadhan Signed-off-by: Ilan Tayari Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 + net/ipv4/esp4.c | 340 +++++++++++++++++++++++++++++++++++---------- 2 files changed, 267 insertions(+), 75 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index c52197cf51dc..d9a81dcef53e 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -213,6 +213,8 @@ struct xfrm_state { /* Last used time */ unsigned long lastused; + struct page_frag xfrag; + /* Reference to data common to all the instances of this * transformer. */ const struct xfrm_type *type; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 20fb25e3027b..9e8d97133513 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -18,6 +18,8 @@ #include #include +#include + struct esp_skb_cb { struct xfrm_skb_cb xfrm; void *tmp; @@ -92,11 +94,40 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, __alignof__(struct scatterlist)); } +static void esp_ssg_unref(struct xfrm_state *x, void *tmp) +{ + struct esp_output_extra *extra = esp_tmp_extra(tmp); + struct crypto_aead *aead = x->data; + int extralen = 0; + u8 *iv; + struct aead_request *req; + struct scatterlist *sg; + + if (x->props.flags & XFRM_STATE_ESN) + extralen += sizeof(*extra); + + extra = esp_tmp_extra(tmp); + iv = esp_tmp_iv(aead, tmp, extralen); + req = esp_tmp_req(aead, iv); + + /* Unref skb_frag_pages in the src scatterlist if necessary. + * Skip the first sg which comes from skb->data. + */ + if (req->src != req->dst) + for (sg = sg_next(req->src); sg; sg = sg_next(sg)) + put_page(sg_page(sg)); +} + static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; + void *tmp; + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; - kfree(ESP_SKB_CB(skb)->tmp); + tmp = ESP_SKB_CB(skb)->tmp; + esp_ssg_unref(x, tmp); + kfree(tmp); xfrm_output_resume(skb, err); } @@ -120,6 +151,29 @@ static void esp_output_restore_header(struct sk_buff *skb) sizeof(__be32)); } +static struct ip_esp_hdr *esp_output_set_extra(struct sk_buff *skb, + struct ip_esp_hdr *esph, + struct esp_output_extra *extra) +{ + struct xfrm_state *x = skb_dst(skb)->xfrm; + + /* For ESN we move the header forward by 4 bytes to + * accomodate the high bits. We will move it back after + * encryption. + */ + if ((x->props.flags & XFRM_STATE_ESN)) { + extra->esphoff = (unsigned char *)esph - + skb_transport_header(skb); + esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4); + extra->seqhi = esph->spi; + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + } + + esph->spi = x->id.spi; + + return esph; +} + static void esp_output_done_esn(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; @@ -130,16 +184,18 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err) static int esp_output(struct xfrm_state *x, struct sk_buff *skb) { - int err; struct esp_output_extra *extra; + int err = -ENOMEM; struct ip_esp_hdr *esph; struct crypto_aead *aead; struct aead_request *req; - struct scatterlist *sg; + struct scatterlist *sg, *dsg; struct sk_buff *trailer; + struct page *page; void *tmp; u8 *iv; u8 *tail; + u8 *vaddr; int blksize; int clen; int alen; @@ -149,7 +205,9 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) int nfrags; int assoclen; int extralen; + int tailen; __be64 seqno; + __u8 proto = *skb_mac_header(skb); /* skb is pure payload to encrypt */ @@ -169,12 +227,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) blksize = ALIGN(crypto_aead_blocksize(aead), 4); clen = ALIGN(skb->len + 2 + tfclen, blksize); plen = clen - skb->len - tfclen; - - err = skb_cow_data(skb, tfclen + plen + alen, &trailer); - if (err < 0) - goto error; - nfrags = err; - + tailen = tfclen + plen + alen; assoclen = sizeof(*esph); extralen = 0; @@ -183,35 +236,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) assoclen += sizeof(__be32); } - tmp = esp_alloc_tmp(aead, nfrags, extralen); - if (!tmp) { - err = -ENOMEM; - goto error; - } - - extra = esp_tmp_extra(tmp); - iv = esp_tmp_iv(aead, tmp, extralen); - req = esp_tmp_req(aead, iv); - sg = esp_req_sg(aead, req); - - /* Fill padding... */ - tail = skb_tail_pointer(trailer); - if (tfclen) { - memset(tail, 0, tfclen); - tail += tfclen; - } - do { - int i; - for (i = 0; i < plen - 2; i++) - tail[i] = i + 1; - } while (0); - tail[plen - 2] = plen - 2; - tail[plen - 1] = *skb_mac_header(skb); - pskb_put(skb, trailer, clen - skb->len + alen); - - skb_push(skb, -skb_network_offset(skb)); - esph = ip_esp_hdr(skb); *skb_mac_header(skb) = IPPROTO_ESP; + esph = ip_esp_hdr(skb); /* this is non-NULL only with UDP Encapsulation */ if (x->encap) { @@ -230,7 +256,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) uh = (struct udphdr *)esph; uh->source = sport; uh->dest = dport; - uh->len = htons(skb->len - skb_transport_offset(skb)); + uh->len = htons(skb->len + tailen + - skb_transport_offset(skb)); uh->check = 0; switch (encap_type) { @@ -248,31 +275,170 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) *skb_mac_header(skb) = IPPROTO_UDP; } - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + if (!skb_cloned(skb)) { + if (tailen <= skb_availroom(skb)) { + nfrags = 1; + trailer = skb; + tail = skb_tail_pointer(trailer); - aead_request_set_callback(req, 0, esp_output_done, skb); + goto skip_cow; + } else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS) + && !skb_has_frag_list(skb)) { + int allocsize; + struct sock *sk = skb->sk; + struct page_frag *pfrag = &x->xfrag; - /* For ESN we move the header forward by 4 bytes to - * accomodate the high bits. We will move it back after - * encryption. - */ - if ((x->props.flags & XFRM_STATE_ESN)) { - extra->esphoff = (unsigned char *)esph - - skb_transport_header(skb); - esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4); - extra->seqhi = esph->spi; - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi); - aead_request_set_callback(req, 0, esp_output_done_esn, skb); + allocsize = ALIGN(tailen, L1_CACHE_BYTES); + + spin_lock_bh(&x->lock); + + if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) { + spin_unlock_bh(&x->lock); + goto cow; + } + + page = pfrag->page; + get_page(page); + + vaddr = kmap_atomic(page); + + tail = vaddr + pfrag->offset; + + /* Fill padding... */ + if (tfclen) { + memset(tail, 0, tfclen); + tail += tfclen; + } + do { + int i; + for (i = 0; i < plen - 2; i++) + tail[i] = i + 1; + } while (0); + tail[plen - 2] = plen - 2; + tail[plen - 1] = proto; + + kunmap_atomic(vaddr); + + nfrags = skb_shinfo(skb)->nr_frags; + + __skb_fill_page_desc(skb, nfrags, page, pfrag->offset, + tailen); + skb_shinfo(skb)->nr_frags = ++nfrags; + + pfrag->offset = pfrag->offset + allocsize; + nfrags++; + + skb->len += tailen; + skb->data_len += tailen; + skb->truesize += tailen; + if (sk) + atomic_add(tailen, &sk->sk_wmem_alloc); + + skb_push(skb, -skb_network_offset(skb)); + + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + esph->spi = x->id.spi; + + tmp = esp_alloc_tmp(aead, nfrags + 2, extralen); + if (!tmp) { + spin_unlock_bh(&x->lock); + err = -ENOMEM; + goto error; + } + + extra = esp_tmp_extra(tmp); + iv = esp_tmp_iv(aead, tmp, extralen); + req = esp_tmp_req(aead, iv); + sg = esp_req_sg(aead, req); + dsg = &sg[nfrags]; + + esph = esp_output_set_extra(skb, esph, extra); + + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, + (unsigned char *)esph - skb->data, + assoclen + ivlen + clen + alen); + + allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES); + + if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) { + spin_unlock_bh(&x->lock); + err = -ENOMEM; + goto error; + } + + skb_shinfo(skb)->nr_frags = 1; + + page = pfrag->page; + get_page(page); + /* replace page frags in skb with new page */ + __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len); + pfrag->offset = pfrag->offset + allocsize; + + sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1); + skb_to_sgvec(skb, dsg, + (unsigned char *)esph - skb->data, + assoclen + ivlen + clen + alen); + + spin_unlock_bh(&x->lock); + + goto skip_cow2; + } } +cow: + err = skb_cow_data(skb, tailen, &trailer); + if (err < 0) + goto error; + nfrags = err; + tail = skb_tail_pointer(trailer); + esph = ip_esp_hdr(skb); + +skip_cow: + /* Fill padding... */ + if (tfclen) { + memset(tail, 0, tfclen); + tail += tfclen; + } + do { + int i; + for (i = 0; i < plen - 2; i++) + tail[i] = i + 1; + } while (0); + tail[plen - 2] = plen - 2; + tail[plen - 1] = proto; + pskb_put(skb, trailer, clen - skb->len + alen); + + skb_push(skb, -skb_network_offset(skb)); + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); esph->spi = x->id.spi; + tmp = esp_alloc_tmp(aead, nfrags, extralen); + if (!tmp) { + err = -ENOMEM; + goto error; + } + + extra = esp_tmp_extra(tmp); + iv = esp_tmp_iv(aead, tmp, extralen); + req = esp_tmp_req(aead, iv); + sg = esp_req_sg(aead, req); + dsg = sg; + + esph = esp_output_set_extra(skb, esph, extra); + sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, (unsigned char *)esph - skb->data, assoclen + ivlen + clen + alen); - aead_request_set_crypt(req, sg, sg, ivlen + clen, iv); +skip_cow2: + if ((x->props.flags & XFRM_STATE_ESN)) + aead_request_set_callback(req, 0, esp_output_done_esn, skb); + else + aead_request_set_callback(req, 0, esp_output_done, skb); + + aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv); aead_request_set_ad(req, assoclen); seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low + @@ -298,6 +464,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) esp_output_restore_header(skb); } + if (sg != dsg) + esp_ssg_unref(x, tmp); kfree(tmp); error: @@ -401,6 +569,23 @@ static void esp_input_restore_header(struct sk_buff *skb) __skb_pull(skb, 4); } +static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi) +{ + struct xfrm_state *x = xfrm_input_state(skb); + struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data; + + /* For ESN we move the header forward by 4 bytes to + * accomodate the high bits. We will move it back after + * decryption. + */ + if ((x->props.flags & XFRM_STATE_ESN)) { + esph = (void *)skb_push(skb, 4); + *seqhi = esph->spi; + esph->spi = esph->seq_no; + esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi; + } +} + static void esp_input_done_esn(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; @@ -437,12 +622,6 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) if (elen <= 0) goto out; - err = skb_cow_data(skb, 0, &trailer); - if (err < 0) - goto out; - - nfrags = err; - assoclen = sizeof(*esph); seqhilen = 0; @@ -451,6 +630,26 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) assoclen += seqhilen; } + if (!skb_cloned(skb)) { + if (!skb_is_nonlinear(skb)) { + nfrags = 1; + + goto skip_cow; + } else if (!skb_has_frag_list(skb)) { + nfrags = skb_shinfo(skb)->nr_frags; + nfrags++; + + goto skip_cow; + } + } + + err = skb_cow_data(skb, 0, &trailer); + if (err < 0) + goto out; + + nfrags = err; + +skip_cow: err = -ENOMEM; tmp = esp_alloc_tmp(aead, nfrags, seqhilen); if (!tmp) @@ -462,27 +661,18 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) req = esp_tmp_req(aead, iv); sg = esp_req_sg(aead, req); - skb->ip_summed = CHECKSUM_NONE; - - esph = (struct ip_esp_hdr *)skb->data; - - aead_request_set_callback(req, 0, esp_input_done, skb); - - /* For ESN we move the header forward by 4 bytes to - * accomodate the high bits. We will move it back after - * decryption. - */ - if ((x->props.flags & XFRM_STATE_ESN)) { - esph = (void *)skb_push(skb, 4); - *seqhi = esph->spi; - esph->spi = esph->seq_no; - esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi; - aead_request_set_callback(req, 0, esp_input_done_esn, skb); - } + esp_input_set_header(skb, seqhi); sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, 0, skb->len); + skb->ip_summed = CHECKSUM_NONE; + + if ((x->props.flags & XFRM_STATE_ESN)) + aead_request_set_callback(req, 0, esp_input_done_esn, skb); + else + aead_request_set_callback(req, 0, esp_input_done, skb); + aead_request_set_crypt(req, sg, sg, elen + ivlen, iv); aead_request_set_ad(req, assoclen); From 03e2a30f6a27e2f3e5283b777f6ddd146b38c738 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 17 Jan 2017 10:23:03 +0100 Subject: [PATCH 12/15] esp6: Avoid skb_cow_data whenever possible This patch tries to avoid skb_cow_data on esp6. On the encrypt side we add the IPsec tailbits to the linear part of the buffer if there is space on it. If there is no space on the linear part, we add a page fragment with the tailbits to the buffer and use separate src and dst scatterlists. On the decrypt side, we leave the buffer as it is if it is not cloned. With this, we can avoid a linearization of the buffer in most of the cases. Joint work with: Sowmini Varadhan Ilan Tayari Signed-off-by: Sowmini Varadhan Signed-off-by: Ilan Tayari Signed-off-by: Steffen Klassert --- net/ipv6/esp6.c | 326 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 258 insertions(+), 68 deletions(-) diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index cbcdd5db31f4..a428ac6b7c74 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -44,6 +44,8 @@ #include #include +#include + struct esp_skb_cb { struct xfrm_skb_cb xfrm; void *tmp; @@ -114,11 +116,40 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, __alignof__(struct scatterlist)); } +static void esp_ssg_unref(struct xfrm_state *x, void *tmp) +{ + __be32 *seqhi; + struct crypto_aead *aead = x->data; + int seqhilen = 0; + u8 *iv; + struct aead_request *req; + struct scatterlist *sg; + + if (x->props.flags & XFRM_STATE_ESN) + seqhilen += sizeof(__be32); + + seqhi = esp_tmp_seqhi(tmp); + iv = esp_tmp_iv(aead, tmp, seqhilen); + req = esp_tmp_req(aead, iv); + + /* Unref skb_frag_pages in the src scatterlist if necessary. + * Skip the first sg which comes from skb->data. + */ + if (req->src != req->dst) + for (sg = sg_next(req->src); sg; sg = sg_next(sg)) + put_page(sg_page(sg)); +} + static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; + void *tmp; + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; - kfree(ESP_SKB_CB(skb)->tmp); + tmp = ESP_SKB_CB(skb)->tmp; + esp_ssg_unref(x, tmp); + kfree(tmp); xfrm_output_resume(skb, err); } @@ -138,6 +169,27 @@ static void esp_output_restore_header(struct sk_buff *skb) esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32)); } +static struct ip_esp_hdr *esp_output_set_esn(struct sk_buff *skb, + struct ip_esp_hdr *esph, + __be32 *seqhi) +{ + struct xfrm_state *x = skb_dst(skb)->xfrm; + + /* For ESN we move the header forward by 4 bytes to + * accomodate the high bits. We will move it back after + * encryption. + */ + if ((x->props.flags & XFRM_STATE_ESN)) { + esph = (void *)(skb_transport_header(skb) - sizeof(__be32)); + *seqhi = esph->spi; + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + } + + esph->spi = x->id.spi; + + return esph; +} + static void esp_output_done_esn(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; @@ -152,8 +204,9 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) struct ip_esp_hdr *esph; struct crypto_aead *aead; struct aead_request *req; - struct scatterlist *sg; + struct scatterlist *sg, *dsg; struct sk_buff *trailer; + struct page *page; void *tmp; int blksize; int clen; @@ -164,10 +217,13 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) int nfrags; int assoclen; int seqhilen; + int tailen; u8 *iv; u8 *tail; + u8 *vaddr; __be32 *seqhi; __be64 seqno; + __u8 proto = *skb_mac_header(skb); /* skb is pure payload to encrypt */ aead = x->data; @@ -186,11 +242,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) blksize = ALIGN(crypto_aead_blocksize(aead), 4); clen = ALIGN(skb->len + 2 + tfclen, blksize); plen = clen - skb->len - tfclen; - - err = skb_cow_data(skb, tfclen + plen + alen, &trailer); - if (err < 0) - goto error; - nfrags = err; + tailen = tfclen + plen + alen; assoclen = sizeof(*esph); seqhilen = 0; @@ -200,6 +252,148 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) assoclen += seqhilen; } + *skb_mac_header(skb) = IPPROTO_ESP; + esph = ip_esp_hdr(skb); + + if (!skb_cloned(skb)) { + if (tailen <= skb_availroom(skb)) { + nfrags = 1; + trailer = skb; + tail = skb_tail_pointer(trailer); + + goto skip_cow; + } else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS) + && !skb_has_frag_list(skb)) { + int allocsize; + struct sock *sk = skb->sk; + struct page_frag *pfrag = &x->xfrag; + + allocsize = ALIGN(tailen, L1_CACHE_BYTES); + + spin_lock_bh(&x->lock); + + if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) { + spin_unlock_bh(&x->lock); + goto cow; + } + + page = pfrag->page; + get_page(page); + + vaddr = kmap_atomic(page); + + tail = vaddr + pfrag->offset; + + /* Fill padding... */ + if (tfclen) { + memset(tail, 0, tfclen); + tail += tfclen; + } + do { + int i; + for (i = 0; i < plen - 2; i++) + tail[i] = i + 1; + } while (0); + tail[plen - 2] = plen - 2; + tail[plen - 1] = proto; + + kunmap_atomic(vaddr); + + nfrags = skb_shinfo(skb)->nr_frags; + + __skb_fill_page_desc(skb, nfrags, page, pfrag->offset, + tailen); + skb_shinfo(skb)->nr_frags = ++nfrags; + + pfrag->offset = pfrag->offset + allocsize; + nfrags++; + + skb->len += tailen; + skb->data_len += tailen; + skb->truesize += tailen; + if (sk) + atomic_add(tailen, &sk->sk_wmem_alloc); + + skb_push(skb, -skb_network_offset(skb)); + + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + esph->spi = x->id.spi; + + tmp = esp_alloc_tmp(aead, nfrags + 2, seqhilen); + if (!tmp) { + spin_unlock_bh(&x->lock); + err = -ENOMEM; + goto error; + } + seqhi = esp_tmp_seqhi(tmp); + iv = esp_tmp_iv(aead, tmp, seqhilen); + req = esp_tmp_req(aead, iv); + sg = esp_req_sg(aead, req); + dsg = &sg[nfrags]; + + esph = esp_output_set_esn(skb, esph, seqhi); + + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, + (unsigned char *)esph - skb->data, + assoclen + ivlen + clen + alen); + + allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES); + + if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) { + spin_unlock_bh(&x->lock); + err = -ENOMEM; + goto error; + } + + skb_shinfo(skb)->nr_frags = 1; + + page = pfrag->page; + get_page(page); + /* replace page frags in skb with new page */ + __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len); + pfrag->offset = pfrag->offset + allocsize; + + sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1); + skb_to_sgvec(skb, dsg, + (unsigned char *)esph - skb->data, + assoclen + ivlen + clen + alen); + + spin_unlock_bh(&x->lock); + + goto skip_cow2; + } + } + +cow: + err = skb_cow_data(skb, tailen, &trailer); + if (err < 0) + goto error; + nfrags = err; + + tail = skb_tail_pointer(trailer); + esph = ip_esp_hdr(skb); + +skip_cow: + /* Fill padding... */ + if (tfclen) { + memset(tail, 0, tfclen); + tail += tfclen; + } + do { + int i; + for (i = 0; i < plen - 2; i++) + tail[i] = i + 1; + } while (0); + tail[plen - 2] = plen - 2; + tail[plen - 1] = proto; + pskb_put(skb, trailer, clen - skb->len + alen); + + skb_push(skb, -skb_network_offset(skb)); + + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + esph->spi = x->id.spi; + tmp = esp_alloc_tmp(aead, nfrags, seqhilen); if (!tmp) { err = -ENOMEM; @@ -210,49 +404,22 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) iv = esp_tmp_iv(aead, tmp, seqhilen); req = esp_tmp_req(aead, iv); sg = esp_req_sg(aead, req); + dsg = sg; - /* Fill padding... */ - tail = skb_tail_pointer(trailer); - if (tfclen) { - memset(tail, 0, tfclen); - tail += tfclen; - } - do { - int i; - for (i = 0; i < plen - 2; i++) - tail[i] = i + 1; - } while (0); - tail[plen - 2] = plen - 2; - tail[plen - 1] = *skb_mac_header(skb); - pskb_put(skb, trailer, clen - skb->len + alen); - - skb_push(skb, -skb_network_offset(skb)); - esph = ip_esp_hdr(skb); - *skb_mac_header(skb) = IPPROTO_ESP; - - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); - - aead_request_set_callback(req, 0, esp_output_done, skb); - - /* For ESN we move the header forward by 4 bytes to - * accomodate the high bits. We will move it back after - * encryption. - */ - if ((x->props.flags & XFRM_STATE_ESN)) { - esph = (void *)(skb_transport_header(skb) - sizeof(__be32)); - *seqhi = esph->spi; - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi); - aead_request_set_callback(req, 0, esp_output_done_esn, skb); - } - - esph->spi = x->id.spi; + esph = esp_output_set_esn(skb, esph, seqhi); sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, (unsigned char *)esph - skb->data, assoclen + ivlen + clen + alen); - aead_request_set_crypt(req, sg, sg, ivlen + clen, iv); +skip_cow2: + if ((x->props.flags & XFRM_STATE_ESN)) + aead_request_set_callback(req, 0, esp_output_done_esn, skb); + else + aead_request_set_callback(req, 0, esp_output_done, skb); + + aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv); aead_request_set_ad(req, assoclen); seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low + @@ -278,6 +445,8 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) esp_output_restore_header(skb); } + if (sg != dsg) + esp_ssg_unref(x, tmp); kfree(tmp); error: @@ -343,6 +512,23 @@ static void esp_input_restore_header(struct sk_buff *skb) __skb_pull(skb, 4); } +static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi) +{ + struct xfrm_state *x = xfrm_input_state(skb); + struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data; + + /* For ESN we move the header forward by 4 bytes to + * accomodate the high bits. We will move it back after + * decryption. + */ + if ((x->props.flags & XFRM_STATE_ESN)) { + esph = (void *)skb_push(skb, 4); + *seqhi = esph->spi; + esph->spi = esph->seq_no; + esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi; + } +} + static void esp_input_done_esn(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; @@ -378,14 +564,6 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) goto out; } - nfrags = skb_cow_data(skb, 0, &trailer); - if (nfrags < 0) { - ret = -EINVAL; - goto out; - } - - ret = -ENOMEM; - assoclen = sizeof(*esph); seqhilen = 0; @@ -394,6 +572,27 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) assoclen += seqhilen; } + if (!skb_cloned(skb)) { + if (!skb_is_nonlinear(skb)) { + nfrags = 1; + + goto skip_cow; + } else if (!skb_has_frag_list(skb)) { + nfrags = skb_shinfo(skb)->nr_frags; + nfrags++; + + goto skip_cow; + } + } + + nfrags = skb_cow_data(skb, 0, &trailer); + if (nfrags < 0) { + ret = -EINVAL; + goto out; + } + +skip_cow: + ret = -ENOMEM; tmp = esp_alloc_tmp(aead, nfrags, seqhilen); if (!tmp) goto out; @@ -404,27 +603,18 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) req = esp_tmp_req(aead, iv); sg = esp_req_sg(aead, req); - skb->ip_summed = CHECKSUM_NONE; - - esph = (struct ip_esp_hdr *)skb->data; - - aead_request_set_callback(req, 0, esp_input_done, skb); - - /* For ESN we move the header forward by 4 bytes to - * accomodate the high bits. We will move it back after - * decryption. - */ - if ((x->props.flags & XFRM_STATE_ESN)) { - esph = (void *)skb_push(skb, 4); - *seqhi = esph->spi; - esph->spi = esph->seq_no; - esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi; - aead_request_set_callback(req, 0, esp_input_done_esn, skb); - } + esp_input_set_header(skb, seqhi); sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, 0, skb->len); + skb->ip_summed = CHECKSUM_NONE; + + if ((x->props.flags & XFRM_STATE_ESN)) + aead_request_set_callback(req, 0, esp_input_done_esn, skb); + else + aead_request_set_callback(req, 0, esp_input_done, skb); + aead_request_set_crypt(req, sg, sg, elen + ivlen, iv); aead_request_set_ad(req, assoclen); From eb758c8864d49f5786432ce38fd8a72bdbbd10cf Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 17 Jan 2017 10:23:08 +0100 Subject: [PATCH 13/15] esp: Introduce a helper to setup the trailer We need to setup the trailer in two different cases, so add a helper to avoid code duplication. Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 44 +++++++++++++++++++------------------------- net/ipv6/esp6.c | 44 +++++++++++++++++++------------------------- 2 files changed, 38 insertions(+), 50 deletions(-) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 9e8d97133513..b1e24446e297 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -182,6 +182,22 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err) esp_output_done(base, err); } +static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto) +{ + /* Fill padding... */ + if (tfclen) { + memset(tail, 0, tfclen); + tail += tfclen; + } + do { + int i; + for (i = 0; i < plen - 2; i++) + tail[i] = i + 1; + } while (0); + tail[plen - 2] = plen - 2; + tail[plen - 1] = proto; +} + static int esp_output(struct xfrm_state *x, struct sk_buff *skb) { struct esp_output_extra *extra; @@ -304,18 +320,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) tail = vaddr + pfrag->offset; - /* Fill padding... */ - if (tfclen) { - memset(tail, 0, tfclen); - tail += tfclen; - } - do { - int i; - for (i = 0; i < plen - 2; i++) - tail[i] = i + 1; - } while (0); - tail[plen - 2] = plen - 2; - tail[plen - 1] = proto; + esp_output_fill_trailer(tail, tfclen, plen, proto); kunmap_atomic(vaddr); @@ -395,20 +400,9 @@ cow: esph = ip_esp_hdr(skb); skip_cow: - /* Fill padding... */ - if (tfclen) { - memset(tail, 0, tfclen); - tail += tfclen; - } - do { - int i; - for (i = 0; i < plen - 2; i++) - tail[i] = i + 1; - } while (0); - tail[plen - 2] = plen - 2; - tail[plen - 1] = proto; - pskb_put(skb, trailer, clen - skb->len + alen); + esp_output_fill_trailer(tail, tfclen, plen, proto); + pskb_put(skb, trailer, clen - skb->len + alen); skb_push(skb, -skb_network_offset(skb)); esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); esph->spi = x->id.spi; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index a428ac6b7c74..ff54faa75631 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -198,6 +198,22 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err) esp_output_done(base, err); } +static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto) +{ + /* Fill padding... */ + if (tfclen) { + memset(tail, 0, tfclen); + tail += tfclen; + } + do { + int i; + for (i = 0; i < plen - 2; i++) + tail[i] = i + 1; + } while (0); + tail[plen - 2] = plen - 2; + tail[plen - 1] = proto; +} + static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) { int err; @@ -284,18 +300,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) tail = vaddr + pfrag->offset; - /* Fill padding... */ - if (tfclen) { - memset(tail, 0, tfclen); - tail += tfclen; - } - do { - int i; - for (i = 0; i < plen - 2; i++) - tail[i] = i + 1; - } while (0); - tail[plen - 2] = plen - 2; - tail[plen - 1] = proto; + esp_output_fill_trailer(tail, tfclen, plen, proto); kunmap_atomic(vaddr); @@ -375,20 +380,9 @@ cow: esph = ip_esp_hdr(skb); skip_cow: - /* Fill padding... */ - if (tfclen) { - memset(tail, 0, tfclen); - tail += tfclen; - } - do { - int i; - for (i = 0; i < plen - 2; i++) - tail[i] = i + 1; - } while (0); - tail[plen - 2] = plen - 2; - tail[plen - 1] = proto; - pskb_put(skb, trailer, clen - skb->len + alen); + esp_output_fill_trailer(tail, tfclen, plen, proto); + pskb_put(skb, trailer, clen - skb->len + alen); skb_push(skb, -skb_network_offset(skb)); esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); From f991bb9da142ba79b54ed0757f22e756f45e2c5a Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 30 Jan 2017 06:45:38 +0100 Subject: [PATCH 14/15] net: Drop secpath on free after gro merge. With a followup patch, a gro merged skb can have a secpath. So drop it before freeing or reusing the skb. Signed-off-by: Steffen Klassert --- net/core/dev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index 56818f7eab2b..ef3a969477bf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4623,6 +4623,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) case GRO_MERGED_FREE: if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) { skb_dst_drop(skb); + secpath_reset(skb); kmem_cache_free(skbuff_head_cache, skb); } else { __kfree_skb(skb); @@ -4663,6 +4664,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); + secpath_reset(skb); napi->skb = skb; } From 1995876a06bcf6f9f7d7b699bdbf387831679771 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 30 Jan 2017 06:45:43 +0100 Subject: [PATCH 15/15] xfrm: Add a dummy network device for napi. This patch adds a dummy network device so that we can use gro_cells for IPsec GRO. With this, we handle IPsec GRO with no impact on the generic networking code. Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 6e3f0254d8a1..3213fe8027be 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -21,6 +21,9 @@ static struct kmem_cache *secpath_cachep __read_mostly; static DEFINE_SPINLOCK(xfrm_input_afinfo_lock); static struct xfrm_input_afinfo __rcu *xfrm_input_afinfo[NPROTO]; +static struct gro_cells gro_cells; +static struct net_device xfrm_napi_dev; + int xfrm_input_register_afinfo(struct xfrm_input_afinfo *afinfo) { int err = 0; @@ -371,7 +374,7 @@ resume: if (decaps) { skb_dst_drop(skb); - netif_rx(skb); + gro_cells_receive(&gro_cells, skb); return 0; } else { return x->inner_mode->afinfo->transport_finish(skb, async); @@ -394,6 +397,13 @@ EXPORT_SYMBOL(xfrm_input_resume); void __init xfrm_input_init(void) { + int err; + + init_dummy_netdev(&xfrm_napi_dev); + err = gro_cells_init(&gro_cells, &xfrm_napi_dev); + if (err) + gro_cells.cells = NULL; + secpath_cachep = kmem_cache_create("secpath_cache", sizeof(struct sec_path), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,