Merge branch 'ipv6-Move-exceptions-to-fib6_nh-and-make-it-optional-in-a-fib6_info'

David Ahern says:

====================
ipv6: Move exceptions to fib6_nh and make it optional in a fib6_info

Patches 1 and 4 move pcpu and exception caches from fib6_info to fib6_nh.
With respect to the current FIB entries, this is only a move from one
struct to another that is contained within the first.
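
As a quick sketch of where the caches end up after patches 1 and 4
(only the moved members are shown; everything else in the structs is
elided), the per-cpu cache and the exception bucket now hang off the
nexthop rather than the route entry:

    struct fib6_nh {
            /* device, gateway, weight, flags, ... */
            struct rt6_info * __percpu *rt6i_pcpu;
            struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
    };

    struct fib6_info {
            /* table, prefix, metrics, flags, ... */
            struct fib6_nh fib6_nh;   /* becomes fib6_nh[0] in patch 5 */
    };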

Patch 2 refactors the core logic of fib6_drop_pcpu_from into a helper
that is invoked per fib6_nh.

Patch 3 refactors exception handling in a similar way - creating a bunch
of helpers that can be invoked per fib6_nh with the goal of making patch
4 easier to review as well as creating the code needed for nexthop
objects.
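
Both refactors follow the same shape: the existing entry point keeps
its fib6_info argument and simply hands off to a new helper that takes
a fib6_nh, so the same helper can later be driven from a standalone
nexthop object. Roughly (a sketch of the pattern only, not the full
kernel code; see the diff below for the real implementation):

    static void fib6_nh_flush_exceptions(struct fib6_nh *nh,
                                         struct fib6_info *from)
    {
            /* walk nh->rt6i_exception_bucket and remove the entries
             * owned by 'from', or all of them when 'from' is NULL
             * (as done from fib6_nh_release)
             */
    }

    void rt6_flush_exceptions(struct fib6_info *f6i)
    {
            fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
    }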

Patch 5 makes the fib6_nh at the end of a fib6_info a zero-length
array, similar to IPv4 and its fib_info. For the current fib entry
model, every fib6_info will have a fib6_nh allocated for it.
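
Concretely, fib6_info_alloc() now takes a "with_fib6_nh" argument and
does a single allocation sized for the fib6_info plus, optionally, one
trailing fib6_nh, mirroring how IPv4's fib_info carries its fib_nh
array. A minimal stand-alone sketch of that trailing-array pattern
(the types and names here are made-up stand-ins, not the kernel
structs):

    #include <stdio.h>
    #include <stdlib.h>

    struct nh {                     /* stand-in for struct fib6_nh */
            int ifindex;
    };

    struct entry {                  /* stand-in for struct fib6_info */
            unsigned int metric;
            struct nh nh[0];        /* zero-length array at the end */
    };

    static struct entry *entry_alloc(int with_nh)
    {
            size_t sz = sizeof(struct entry);

            if (with_nh)
                    sz += sizeof(struct nh);
            /* one allocation covers the entry and its nexthop */
            return calloc(1, sz);
    }

    int main(void)
    {
            struct entry *e = entry_alloc(1);

            if (!e)
                    return 1;
            e->nh[0].ifindex = 2;   /* trailing nexthop addressed as nh[0] */
            printf("ifindex %d\n", e->nh[0].ifindex);
            free(e);
            return 0;
    }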

Patch 6 refactors ip6_route_del moving the code for deleting an
exception entry into a new function.

Patch 7 adds tests for redirect route exceptions. The new test was
written against 5.1 (before any of the nexthop refactoring). It and the
pmtu.sh selftest exercise the exception code paths, from creating
exceptions to cleaning them up on device delete. All tests pass without
any RCU locking or memleak warnings.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
commit a80886e4e0
Committer: David S. Miller <davem@davemloft.net> 2019-05-24 13:26:44 -07:00
8 changed files with 820 additions and 244 deletions


@ -2886,7 +2886,7 @@ mlxsw_sp_nexthop6_group_cmp(const struct mlxsw_sp_nexthop_group *nh_grp,
return false;
list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) {
struct fib6_nh *fib6_nh = &mlxsw_sp_rt6->rt->fib6_nh;
struct fib6_nh *fib6_nh = mlxsw_sp_rt6->rt->fib6_nh;
struct in6_addr *gw;
int ifindex, weight;
@ -2958,7 +2958,7 @@ mlxsw_sp_nexthop6_group_hash(struct mlxsw_sp_fib6_entry *fib6_entry, u32 seed)
struct net_device *dev;
list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) {
dev = mlxsw_sp_rt6->rt->fib6_nh.fib_nh_dev;
dev = mlxsw_sp_rt6->rt->fib6_nh->fib_nh_dev;
val ^= dev->ifindex;
}
@ -3960,9 +3960,9 @@ mlxsw_sp_rt6_nexthop(struct mlxsw_sp_nexthop_group *nh_grp,
struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i];
struct fib6_info *rt = mlxsw_sp_rt6->rt;
if (nh->rif && nh->rif->dev == rt->fib6_nh.fib_nh_dev &&
if (nh->rif && nh->rif->dev == rt->fib6_nh->fib_nh_dev &&
ipv6_addr_equal((const struct in6_addr *) &nh->gw_addr,
&rt->fib6_nh.fib_nh_gw6))
&rt->fib6_nh->fib_nh_gw6))
return nh;
continue;
}
@ -4022,13 +4022,13 @@ mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry)
if (fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_LOCAL ||
fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_BLACKHOLE) {
list_first_entry(&fib6_entry->rt6_list, struct mlxsw_sp_rt6,
list)->rt->fib6_nh.fib_nh_flags |= RTNH_F_OFFLOAD;
list)->rt->fib6_nh->fib_nh_flags |= RTNH_F_OFFLOAD;
return;
}
list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) {
struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group;
struct fib6_nh *fib6_nh = &mlxsw_sp_rt6->rt->fib6_nh;
struct fib6_nh *fib6_nh = mlxsw_sp_rt6->rt->fib6_nh;
struct mlxsw_sp_nexthop *nh;
nh = mlxsw_sp_rt6_nexthop(nh_grp, mlxsw_sp_rt6);
@ -4050,7 +4050,7 @@ mlxsw_sp_fib6_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry)
list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) {
struct fib6_info *rt = mlxsw_sp_rt6->rt;
rt->fib6_nh.fib_nh_flags &= ~RTNH_F_OFFLOAD;
rt->fib6_nh->fib_nh_flags &= ~RTNH_F_OFFLOAD;
}
}
@ -4928,7 +4928,8 @@ static void mlxsw_sp_rt6_destroy(struct mlxsw_sp_rt6 *mlxsw_sp_rt6)
static bool mlxsw_sp_fib6_rt_can_mp(const struct fib6_info *rt)
{
/* RTF_CACHE routes are ignored */
return !(rt->fib6_flags & RTF_ADDRCONF) && rt->fib6_nh.fib_nh_gw_family;
return !(rt->fib6_flags & RTF_ADDRCONF) &&
rt->fib6_nh->fib_nh_gw_family;
}
static struct fib6_info *
@ -4987,8 +4988,8 @@ static bool mlxsw_sp_nexthop6_ipip_type(const struct mlxsw_sp *mlxsw_sp,
const struct fib6_info *rt,
enum mlxsw_sp_ipip_type *ret)
{
return rt->fib6_nh.fib_nh_dev &&
mlxsw_sp_netdev_ipip_type(mlxsw_sp, rt->fib6_nh.fib_nh_dev, ret);
return rt->fib6_nh->fib_nh_dev &&
mlxsw_sp_netdev_ipip_type(mlxsw_sp, rt->fib6_nh->fib_nh_dev, ret);
}
static int mlxsw_sp_nexthop6_type_init(struct mlxsw_sp *mlxsw_sp,
@ -4998,7 +4999,7 @@ static int mlxsw_sp_nexthop6_type_init(struct mlxsw_sp *mlxsw_sp,
{
const struct mlxsw_sp_ipip_ops *ipip_ops;
struct mlxsw_sp_ipip_entry *ipip_entry;
struct net_device *dev = rt->fib6_nh.fib_nh_dev;
struct net_device *dev = rt->fib6_nh->fib_nh_dev;
struct mlxsw_sp_rif *rif;
int err;
@ -5041,11 +5042,11 @@ static int mlxsw_sp_nexthop6_init(struct mlxsw_sp *mlxsw_sp,
struct mlxsw_sp_nexthop *nh,
const struct fib6_info *rt)
{
struct net_device *dev = rt->fib6_nh.fib_nh_dev;
struct net_device *dev = rt->fib6_nh->fib_nh_dev;
nh->nh_grp = nh_grp;
nh->nh_weight = rt->fib6_nh.fib_nh_weight;
memcpy(&nh->gw_addr, &rt->fib6_nh.fib_nh_gw6, sizeof(nh->gw_addr));
nh->nh_weight = rt->fib6_nh->fib_nh_weight;
memcpy(&nh->gw_addr, &rt->fib6_nh->fib_nh_gw6, sizeof(nh->gw_addr));
mlxsw_sp_nexthop_counter_alloc(mlxsw_sp, nh);
list_add_tail(&nh->router_list_node, &mlxsw_sp->router->nexthop_list);
@ -5068,7 +5069,7 @@ static void mlxsw_sp_nexthop6_fini(struct mlxsw_sp *mlxsw_sp,
static bool mlxsw_sp_rt6_is_gateway(const struct mlxsw_sp *mlxsw_sp,
const struct fib6_info *rt)
{
return rt->fib6_nh.fib_nh_gw_family ||
return rt->fib6_nh->fib_nh_gw_family ||
mlxsw_sp_nexthop6_ipip_type(mlxsw_sp, rt, NULL);
}


@ -131,6 +131,9 @@ struct fib6_nh {
#ifdef CONFIG_IPV6_ROUTER_PREF
unsigned long last_probe;
#endif
struct rt6_info * __percpu *rt6i_pcpu;
struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
};
struct fib6_info {
@ -156,22 +159,18 @@ struct fib6_info {
struct rt6key fib6_src;
struct rt6key fib6_prefsrc;
struct rt6_info * __percpu *rt6i_pcpu;
struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
u32 fib6_metric;
u8 fib6_protocol;
u8 fib6_type;
u8 exception_bucket_flushed:1,
should_flush:1,
u8 should_flush:1,
dst_nocount:1,
dst_nopolicy:1,
dst_host:1,
fib6_destroying:1,
unused:2;
unused:3;
struct fib6_nh fib6_nh;
struct rcu_head rcu;
struct fib6_nh fib6_nh[0];
};
struct rt6_info {
@ -281,7 +280,7 @@ static inline void ip6_rt_put(struct rt6_info *rt)
dst_release(&rt->dst);
}
struct fib6_info *fib6_info_alloc(gfp_t gfp_flags);
struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh);
void fib6_info_destroy_rcu(struct rcu_head *head);
static inline void fib6_info_hold(struct fib6_info *f6i)
@ -444,7 +443,7 @@ void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr)
static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i)
{
return f6i->fib6_nh.fib_nh_dev;
return f6i->fib6_nh->fib_nh_dev;
}
int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,


@ -70,7 +70,7 @@ static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
{
/* the RTF_ADDRCONF flag filters out RA's */
return !(f6i->fib6_flags & RTF_ADDRCONF) &&
f6i->fib6_nh.fib_nh_gw_family;
f6i->fib6_nh->fib_nh_gw_family;
}
void ip6_route_input(struct sk_buff *skb);
@ -275,7 +275,7 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt,
static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b)
{
struct fib6_nh *nha = &a->fib6_nh, *nhb = &b->fib6_nh;
struct fib6_nh *nha = a->fib6_nh, *nhb = b->fib6_nh;
return nha->fib_nh_dev == nhb->fib_nh_dev &&
ipv6_addr_equal(&nha->fib_nh_gw6, &nhb->fib_nh_gw6) &&


@ -2421,9 +2421,9 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
goto out;
for_each_fib6_node_rt_rcu(fn) {
if (rt->fib6_nh.fib_nh_dev->ifindex != dev->ifindex)
if (rt->fib6_nh->fib_nh_dev->ifindex != dev->ifindex)
continue;
if (no_gw && rt->fib6_nh.fib_nh_gw_family)
if (no_gw && rt->fib6_nh->fib_nh_gw_family)
continue;
if ((rt->fib6_flags & flags) != flags)
continue;
@ -6341,16 +6341,16 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
list_for_each_entry(ifa, &idev->addr_list, if_list) {
spin_lock(&ifa->lock);
if (ifa->rt) {
struct fib6_info *rt = ifa->rt;
struct fib6_nh *nh = ifa->rt->fib6_nh;
int cpu;
rcu_read_lock();
ifa->rt->dst_nopolicy = val ? true : false;
if (rt->rt6i_pcpu) {
if (nh->rt6i_pcpu) {
for_each_possible_cpu(cpu) {
struct rt6_info **rtp;
rtp = per_cpu_ptr(rt->rt6i_pcpu, cpu);
rtp = per_cpu_ptr(nh->rt6i_pcpu, cpu);
addrconf_set_nopolicy(*rtp, val);
}
}


@ -147,20 +147,18 @@ static __be32 addr_bit_set(const void *token, int fn_bit)
addr[fn_bit >> 5];
}
struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
{
struct fib6_info *f6i;
size_t sz = sizeof(*f6i);
f6i = kzalloc(sizeof(*f6i), gfp_flags);
if (with_fib6_nh)
sz += sizeof(struct fib6_nh);
f6i = kzalloc(sz, gfp_flags);
if (!f6i)
return NULL;
f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
if (!f6i->rt6i_pcpu) {
kfree(f6i);
return NULL;
}
INIT_LIST_HEAD(&f6i->fib6_siblings);
refcount_set(&f6i->fib6_ref, 1);
@ -170,36 +168,11 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
void fib6_info_destroy_rcu(struct rcu_head *head)
{
struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);
struct rt6_exception_bucket *bucket;
WARN_ON(f6i->fib6_node);
bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1);
kfree(bucket);
if (f6i->rt6i_pcpu) {
int cpu;
for_each_possible_cpu(cpu) {
struct rt6_info **ppcpu_rt;
struct rt6_info *pcpu_rt;
ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
pcpu_rt = *ppcpu_rt;
if (pcpu_rt) {
dst_dev_put(&pcpu_rt->dst);
dst_release(&pcpu_rt->dst);
*ppcpu_rt = NULL;
}
}
free_percpu(f6i->rt6i_pcpu);
}
fib6_nh_release(&f6i->fib6_nh);
fib6_nh_release(f6i->fib6_nh);
ip_fib_metrics_put(f6i->fib6_metrics);
kfree(f6i);
}
EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu);
@ -899,16 +872,14 @@ insert_above:
return ln;
}
static void fib6_drop_pcpu_from(struct fib6_info *f6i,
const struct fib6_table *table)
static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh,
const struct fib6_info *match,
const struct fib6_table *table)
{
int cpu;
/* Make sure rt6_make_pcpu_route() wont add other percpu routes
* while we are cleaning them here.
*/
f6i->fib6_destroying = 1;
mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */
if (!fib6_nh->rt6i_pcpu)
return;
/* release the reference to this fib entry from
* all of its cached pcpu routes
@ -917,9 +888,15 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i,
struct rt6_info **ppcpu_rt;
struct rt6_info *pcpu_rt;
ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
pcpu_rt = *ppcpu_rt;
if (pcpu_rt) {
/* only dropping the 'from' reference if the cached route
* is using 'match'. The cached pcpu_rt->from only changes
* from a fib6_info to NULL (ip6_dst_destroy); it can never
* change from one fib6_info reference to another
*/
if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) {
struct fib6_info *from;
from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
@ -928,13 +905,27 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i,
}
}
static void fib6_drop_pcpu_from(struct fib6_info *f6i,
const struct fib6_table *table)
{
struct fib6_nh *fib6_nh;
/* Make sure rt6_make_pcpu_route() wont add other percpu routes
* while we are cleaning them here.
*/
f6i->fib6_destroying = 1;
mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */
fib6_nh = f6i->fib6_nh;
__fib6_drop_pcpu_from(fib6_nh, f6i, table);
}
static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
struct net *net)
{
struct fib6_table *table = rt->fib6_table;
if (rt->rt6i_pcpu)
fib6_drop_pcpu_from(rt, table);
fib6_drop_pcpu_from(rt, table);
if (refcount_read(&rt->fib6_ref) != 1) {
/* This route is used as dummy address holder in some split
@ -2314,14 +2305,14 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
#else
seq_puts(seq, "00000000000000000000000000000000 00 ");
#endif
if (rt->fib6_nh.fib_nh_gw_family) {
if (rt->fib6_nh->fib_nh_gw_family) {
flags |= RTF_GATEWAY;
seq_printf(seq, "%pi6", &rt->fib6_nh.fib_nh_gw6);
seq_printf(seq, "%pi6", &rt->fib6_nh->fib_nh_gw6);
} else {
seq_puts(seq, "00000000000000000000000000000000");
}
dev = rt->fib6_nh.fib_nh_dev;
dev = rt->fib6_nh->fib_nh_dev;
seq_printf(seq, " %08x %08x %08x %08x %8s\n",
rt->fib6_metric, refcount_read(&rt->fib6_ref), 0,
flags, dev ? dev->name : "");


@ -1293,8 +1293,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev);
if (rt) {
neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6,
rt->fib6_nh.fib_nh_dev, NULL,
neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
rt->fib6_nh->fib_nh_dev, NULL,
&ipv6_hdr(skb)->saddr);
if (!neigh) {
ND_PRINTK(0, err,
@ -1323,8 +1323,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
return;
}
neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6,
rt->fib6_nh.fib_nh_dev, NULL,
neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
rt->fib6_nh->fib_nh_dev, NULL,
&ipv6_hdr(skb)->saddr);
if (!neigh) {
ND_PRINTK(0, err,


@ -441,12 +441,12 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
if (!fl6->mp_hash)
fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
goto out;
list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
fib6_siblings) {
const struct fib6_nh *nh = &sibling->fib6_nh;
const struct fib6_nh *nh = sibling->fib6_nh;
int nh_upper_bound;
nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
@ -460,7 +460,7 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
out:
res->f6i = match;
res->nh = &match->fib6_nh;
res->nh = match->fib6_nh;
}
/*
@ -496,13 +496,13 @@ static void rt6_device_match(struct net *net, struct fib6_result *res,
struct fib6_nh *nh;
if (!oif && ipv6_addr_any(saddr)) {
nh = &f6i->fib6_nh;
nh = f6i->fib6_nh;
if (!(nh->fib_nh_flags & RTNH_F_DEAD))
goto out;
}
for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
nh = &spf6i->fib6_nh;
nh = spf6i->fib6_nh;
if (__rt6_device_match(net, nh, saddr, oif, flags)) {
res->f6i = spf6i;
goto out;
@ -511,14 +511,14 @@ static void rt6_device_match(struct net *net, struct fib6_result *res,
if (oif && flags & RT6_LOOKUP_F_IFACE) {
res->f6i = net->ipv6.fib6_null_entry;
nh = &res->f6i->fib6_nh;
nh = res->f6i->fib6_nh;
goto out;
}
nh = &f6i->fib6_nh;
nh = f6i->fib6_nh;
if (nh->fib_nh_flags & RTNH_F_DEAD) {
res->f6i = net->ipv6.fib6_null_entry;
nh = &res->f6i->fib6_nh;
nh = res->f6i->fib6_nh;
}
out:
res->nh = nh;
@ -714,7 +714,7 @@ static void __find_rr_leaf(struct fib6_info *f6i_start,
if (fib6_check_expired(f6i))
continue;
nh = &f6i->fib6_nh;
nh = f6i->fib6_nh;
if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
res->f6i = f6i;
res->nh = nh;
@ -796,7 +796,7 @@ static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
out:
if (!res->f6i) {
res->f6i = net->ipv6.fib6_null_entry;
res->nh = &res->f6i->fib6_nh;
res->nh = res->f6i->fib6_nh;
res->fib6_flags = res->f6i->fib6_flags;
res->fib6_type = res->f6i->fib6_type;
}
@ -1270,7 +1270,7 @@ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
struct rt6_info *pcpu_rt, **p;
p = this_cpu_ptr(res->f6i->rt6i_pcpu);
p = this_cpu_ptr(res->nh->rt6i_pcpu);
pcpu_rt = *p;
if (pcpu_rt)
@ -1291,7 +1291,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net,
}
dst_hold(&pcpu_rt->dst);
p = this_cpu_ptr(res->f6i->rt6i_pcpu);
p = this_cpu_ptr(res->nh->rt6i_pcpu);
prev = cmpxchg(p, NULL, pcpu_rt);
BUG_ON(prev);
@ -1461,25 +1461,74 @@ static unsigned int fib6_mtu(const struct fib6_result *res)
return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
#define FIB6_EXCEPTION_BUCKET_FLUSHED 0x1UL
/* used when the flushed bit is not relevant, only access to the bucket
* (ie., all bucket users except rt6_insert_exception);
*
* called under rcu lock; sometimes called with rt6_exception_lock held
*/
static
struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
spinlock_t *lock)
{
struct rt6_exception_bucket *bucket;
if (lock)
bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
lockdep_is_held(lock));
else
bucket = rcu_dereference(nh->rt6i_exception_bucket);
/* remove bucket flushed bit if set */
if (bucket) {
unsigned long p = (unsigned long)bucket;
p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
bucket = (struct rt6_exception_bucket *)p;
}
return bucket;
}
static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
{
unsigned long p = (unsigned long)bucket;
return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
}
/* called with rt6_exception_lock held */
static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
spinlock_t *lock)
{
struct rt6_exception_bucket *bucket;
unsigned long p;
bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
lockdep_is_held(lock));
p = (unsigned long)bucket;
p |= FIB6_EXCEPTION_BUCKET_FLUSHED;
bucket = (struct rt6_exception_bucket *)p;
rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
}
static int rt6_insert_exception(struct rt6_info *nrt,
const struct fib6_result *res)
{
struct net *net = dev_net(nrt->dst.dev);
struct rt6_exception_bucket *bucket;
struct fib6_info *f6i = res->f6i;
struct in6_addr *src_key = NULL;
struct rt6_exception *rt6_ex;
struct fib6_info *f6i = res->f6i;
struct fib6_nh *nh = res->nh;
int err = 0;
spin_lock_bh(&rt6_exception_lock);
if (f6i->exception_bucket_flushed) {
err = -EINVAL;
goto out;
}
bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
lockdep_is_held(&rt6_exception_lock));
bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
lockdep_is_held(&rt6_exception_lock));
if (!bucket) {
bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
GFP_ATOMIC);
@ -1487,7 +1536,10 @@ static int rt6_insert_exception(struct rt6_info *nrt,
err = -ENOMEM;
goto out;
}
rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
} else if (fib6_nh_excptn_bucket_flushed(bucket)) {
err = -EINVAL;
goto out;
}
#ifdef CONFIG_IPV6_SUBTREES
@ -1542,7 +1594,7 @@ out:
return err;
}
void rt6_flush_exceptions(struct fib6_info *rt)
static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
@ -1550,25 +1602,33 @@ void rt6_flush_exceptions(struct fib6_info *rt)
int i;
spin_lock_bh(&rt6_exception_lock);
/* Prevent rt6_insert_exception() to recreate the bucket list */
rt->exception_bucket_flushed = 1;
bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
lockdep_is_held(&rt6_exception_lock));
bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
if (!bucket)
goto out;
/* Prevent rt6_insert_exception() to recreate the bucket list */
if (!from)
fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);
for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
rt6_remove_exception(bucket, rt6_ex);
WARN_ON_ONCE(bucket->depth);
hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
if (!from ||
rcu_access_pointer(rt6_ex->rt6i->from) == from)
rt6_remove_exception(bucket, rt6_ex);
}
WARN_ON_ONCE(!from && bucket->depth);
bucket++;
}
out:
spin_unlock_bh(&rt6_exception_lock);
}
void rt6_flush_exceptions(struct fib6_info *f6i)
{
fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
}
/* Find cached rt in the hash table inside passed in rt
* Caller has to hold rcu_read_lock()
*/
@ -1597,7 +1657,7 @@ static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
src_key = saddr;
find_ex:
#endif
bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);
bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
@ -1615,25 +1675,20 @@ find_ex:
}
/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
const struct rt6_info *rt)
{
const struct in6_addr *src_key = NULL;
struct rt6_exception_bucket *bucket;
struct in6_addr *src_key = NULL;
struct rt6_exception *rt6_ex;
struct fib6_info *from;
int err;
from = rcu_dereference(rt->from);
if (!from ||
!(rt->rt6i_flags & RTF_CACHE))
return -EINVAL;
if (!rcu_access_pointer(from->rt6i_exception_bucket))
if (!rcu_access_pointer(nh->rt6i_exception_bucket))
return -ENOENT;
spin_lock_bh(&rt6_exception_lock);
bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
lockdep_is_held(&rt6_exception_lock));
bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
#ifdef CONFIG_IPV6_SUBTREES
/* rt6i_src.plen != 0 indicates 'from' is in subtree
* and exception table is indexed by a hash of
@ -1641,7 +1696,7 @@ static int rt6_remove_exception_rt(struct rt6_info *rt)
* Otherwise, the exception table is indexed by
* a hash of only rt6i_dst.
*/
if (from->fib6_src.plen)
if (plen)
src_key = &rt->rt6i_src.addr;
#endif
rt6_ex = __rt6_find_exception_spinlock(&bucket,
@ -1658,23 +1713,29 @@ static int rt6_remove_exception_rt(struct rt6_info *rt)
return err;
}
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
struct fib6_info *from;
from = rcu_dereference(rt->from);
if (!from || !(rt->rt6i_flags & RTF_CACHE))
return -EINVAL;
return fib6_nh_remove_exception(from->fib6_nh,
from->fib6_src.plen, rt);
}
/* Find rt6_ex which contains the passed in rt cache and
* refresh its stamp
*/
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
const struct rt6_info *rt)
{
const struct in6_addr *src_key = NULL;
struct rt6_exception_bucket *bucket;
struct in6_addr *src_key = NULL;
struct rt6_exception *rt6_ex;
struct fib6_info *from;
rcu_read_lock();
from = rcu_dereference(rt->from);
if (!from || !(rt->rt6i_flags & RTF_CACHE))
goto unlock;
bucket = rcu_dereference(from->rt6i_exception_bucket);
bucket = fib6_nh_get_excptn_bucket(nh, NULL);
#ifdef CONFIG_IPV6_SUBTREES
/* rt6i_src.plen != 0 indicates 'from' is in subtree
* and exception table is indexed by a hash of
@ -1682,15 +1743,25 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
* Otherwise, the exception table is indexed by
* a hash of only rt6i_dst.
*/
if (from->fib6_src.plen)
if (plen)
src_key = &rt->rt6i_src.addr;
#endif
rt6_ex = __rt6_find_exception_rcu(&bucket,
&rt->rt6i_dst.addr,
src_key);
rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
if (rt6_ex)
rt6_ex->stamp = jiffies;
}
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
struct fib6_info *from;
rcu_read_lock();
from = rcu_dereference(rt->from);
if (!from || !(rt->rt6i_flags & RTF_CACHE))
goto unlock;
fib6_nh_update_exception(from->fib6_nh, from->fib6_src.plen, rt);
unlock:
rcu_read_unlock();
}
@ -1718,15 +1789,13 @@ static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
}
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
struct fib6_info *rt, int mtu)
const struct fib6_nh *nh, int mtu)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
int i;
bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
lockdep_is_held(&rt6_exception_lock));
bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
if (!bucket)
return;
@ -1748,21 +1817,19 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
struct in6_addr *gateway)
static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
const struct in6_addr *gateway)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
struct hlist_node *tmp;
int i;
if (!rcu_access_pointer(rt->rt6i_exception_bucket))
if (!rcu_access_pointer(nh->rt6i_exception_bucket))
return;
spin_lock_bh(&rt6_exception_lock);
bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
lockdep_is_held(&rt6_exception_lock));
bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
if (bucket) {
for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
hlist_for_each_entry_safe(rt6_ex, tmp,
@ -1827,23 +1894,21 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
gc_args->more++;
}
void rt6_age_exceptions(struct fib6_info *rt,
struct fib6_gc_args *gc_args,
unsigned long now)
static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
struct fib6_gc_args *gc_args,
unsigned long now)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
struct hlist_node *tmp;
int i;
if (!rcu_access_pointer(rt->rt6i_exception_bucket))
if (!rcu_access_pointer(nh->rt6i_exception_bucket))
return;
rcu_read_lock_bh();
spin_lock(&rt6_exception_lock);
bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
lockdep_is_held(&rt6_exception_lock));
bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
if (bucket) {
for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
hlist_for_each_entry_safe(rt6_ex, tmp,
@ -1858,6 +1923,13 @@ void rt6_age_exceptions(struct fib6_info *rt,
rcu_read_unlock_bh();
}
void rt6_age_exceptions(struct fib6_info *f6i,
struct fib6_gc_args *gc_args,
unsigned long now)
{
fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now);
}
/* must be called with rcu lock held */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
struct flowi6 *fl6, struct fib6_result *res, int strict)
@ -2384,7 +2456,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
rcu_read_unlock();
return;
}
res.nh = &res.f6i->fib6_nh;
res.nh = res.f6i->fib6_nh;
res.fib6_flags = res.f6i->fib6_flags;
res.fib6_type = res.f6i->fib6_type;
@ -2527,7 +2599,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
restart:
for_each_fib6_node_rt_rcu(fn) {
res.f6i = rt;
res.nh = &rt->fib6_nh;
res.nh = rt->fib6_nh;
if (fib6_check_expired(rt))
continue;
@ -2551,7 +2623,7 @@ restart:
}
res.f6i = rt;
res.nh = &rt->fib6_nh;
res.nh = rt->fib6_nh;
out:
if (ret) {
ip6_hold_safe(net, &ret);
@ -3068,6 +3140,12 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
!netif_carrier_ok(dev))
fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
if (!fib6_nh->rt6i_pcpu) {
err = -ENOMEM;
goto out;
}
err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
cfg->fc_encap_type, cfg, gfp_flags, extack);
if (err)
@ -3092,6 +3170,38 @@ out:
void fib6_nh_release(struct fib6_nh *fib6_nh)
{
struct rt6_exception_bucket *bucket;
rcu_read_lock();
fib6_nh_flush_exceptions(fib6_nh, NULL);
bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL);
if (bucket) {
rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL);
kfree(bucket);
}
rcu_read_unlock();
if (fib6_nh->rt6i_pcpu) {
int cpu;
for_each_possible_cpu(cpu) {
struct rt6_info **ppcpu_rt;
struct rt6_info *pcpu_rt;
ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
pcpu_rt = *ppcpu_rt;
if (pcpu_rt) {
dst_dev_put(&pcpu_rt->dst);
dst_release(&pcpu_rt->dst);
*ppcpu_rt = NULL;
}
}
free_percpu(fib6_nh->rt6i_pcpu);
}
fib_nh_common_release(&fib6_nh->nh_common);
}
@ -3154,7 +3264,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
goto out;
err = -ENOMEM;
rt = fib6_info_alloc(gfp_flags);
rt = fib6_info_alloc(gfp_flags, true);
if (!rt)
goto out;
@ -3194,7 +3304,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
rt->fib6_src.plen = cfg->fc_src_len;
#endif
err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
if (err)
goto out;
@ -3202,7 +3312,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
* they would result in kernel looping; promote them to reject routes
*/
addr_type = ipv6_addr_type(&cfg->fc_dst);
if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, addr_type))
rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
@ -3320,7 +3430,7 @@ out_put:
return err;
}
static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
{
int rc = -ESRCH;
@ -3336,10 +3446,25 @@ out:
return rc;
}
static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt,
struct fib6_nh *nh)
{
struct fib6_result res = {
.f6i = rt,
.nh = nh,
};
struct rt6_info *rt_cache;
rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src);
if (rt_cache)
return __ip6_del_cached_rt(rt_cache, cfg);
return 0;
}
static int ip6_route_del(struct fib6_config *cfg,
struct netlink_ext_ack *extack)
{
struct rt6_info *rt_cache;
struct fib6_table *table;
struct fib6_info *rt;
struct fib6_node *fn;
@ -3362,26 +3487,18 @@ static int ip6_route_del(struct fib6_config *cfg,
for_each_fib6_node_rt_rcu(fn) {
struct fib6_nh *nh;
nh = rt->fib6_nh;
if (cfg->fc_flags & RTF_CACHE) {
struct fib6_result res = {
.f6i = rt,
};
int rc;
rt_cache = rt6_find_cached_rt(&res,
&cfg->fc_dst,
&cfg->fc_src);
if (rt_cache) {
rc = ip6_del_cached_rt(rt_cache, cfg);
if (rc != -ESRCH) {
rcu_read_unlock();
return rc;
}
rc = ip6_del_cached_rt(cfg, rt, nh);
if (rc != -ESRCH) {
rcu_read_unlock();
return rc;
}
continue;
}
nh = &rt->fib6_nh;
if (cfg->fc_ifindex &&
(!nh->fib_nh_dev ||
nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
@ -3503,7 +3620,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
if (!res.f6i)
goto out;
res.nh = &res.f6i->fib6_nh;
res.nh = res.f6i->fib6_nh;
res.fib6_flags = res.f6i->fib6_flags;
res.fib6_type = res.f6i->fib6_type;
nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
@ -3555,12 +3672,12 @@ static struct fib6_info *rt6_get_route_info(struct net *net,
goto out;
for_each_fib6_node_rt_rcu(fn) {
if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
continue;
if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
!rt->fib6_nh.fib_nh_gw_family)
!rt->fib6_nh->fib_nh_gw_family)
continue;
if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr))
continue;
if (!fib6_info_hold_safe(rt))
continue;
@ -3618,7 +3735,7 @@ struct fib6_info *rt6_get_dflt_router(struct net *net,
rcu_read_lock();
for_each_fib6_node_rt_rcu(&table->tb6_root) {
struct fib6_nh *nh = &rt->fib6_nh;
struct fib6_nh *nh = rt->fib6_nh;
if (dev == nh->fib_nh_dev &&
((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
@ -3870,7 +3987,7 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
struct net *net = ((struct arg_dev_net_ip *)arg)->net;
struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
if (((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) &&
rt != net->ipv6.fib6_null_entry &&
ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
spin_lock_bh(&rt6_exception_lock);
@ -3898,18 +4015,17 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
struct in6_addr *gateway = (struct in6_addr *)arg;
struct fib6_nh *nh = rt->fib6_nh;
if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
rt->fib6_nh.fib_nh_gw_family &&
ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6))
return -1;
}
/* Further clean up cached routes in exception table.
* This is needed because cached route may have a different
* gateway than its 'parent' in the case of an ip redirect.
*/
rt6_exceptions_clean_tohost(rt, gateway);
fib6_nh_exceptions_clean_tohost(nh, gateway);
return 0;
}
@ -3949,9 +4065,9 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
static bool rt6_is_dead(const struct fib6_info *rt)
{
if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
(rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD ||
(rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN &&
ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev)))
return true;
return false;
@ -3963,11 +4079,11 @@ static int rt6_multipath_total_weight(const struct fib6_info *rt)
int total = 0;
if (!rt6_is_dead(rt))
total += rt->fib6_nh.fib_nh_weight;
total += rt->fib6_nh->fib_nh_weight;
list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
if (!rt6_is_dead(iter))
total += iter->fib6_nh.fib_nh_weight;
total += iter->fib6_nh->fib_nh_weight;
}
return total;
@ -3978,11 +4094,11 @@ static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
int upper_bound = -1;
if (!rt6_is_dead(rt)) {
*weight += rt->fib6_nh.fib_nh_weight;
*weight += rt->fib6_nh->fib_nh_weight;
upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
total) - 1;
}
atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound);
}
static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
@ -4026,8 +4142,8 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg)
struct net *net = dev_net(arg->dev);
if (rt != net->ipv6.fib6_null_entry &&
rt->fib6_nh.fib_nh_dev == arg->dev) {
rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
rt->fib6_nh->fib_nh_dev == arg->dev) {
rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags;
fib6_update_sernum_upto_root(net, rt);
rt6_multipath_rebalance(rt);
}
@ -4055,10 +4171,10 @@ static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
{
struct fib6_info *iter;
if (rt->fib6_nh.fib_nh_dev == dev)
if (rt->fib6_nh->fib_nh_dev == dev)
return true;
list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
if (iter->fib6_nh.fib_nh_dev == dev)
if (iter->fib6_nh->fib_nh_dev == dev)
return true;
return false;
@ -4079,12 +4195,12 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
struct fib6_info *iter;
unsigned int dead = 0;
if (rt->fib6_nh.fib_nh_dev == down_dev ||
rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
if (rt->fib6_nh->fib_nh_dev == down_dev ||
rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
dead++;
list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
if (iter->fib6_nh.fib_nh_dev == down_dev ||
iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
if (iter->fib6_nh->fib_nh_dev == down_dev ||
iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
dead++;
return dead;
@ -4096,11 +4212,11 @@ static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
{
struct fib6_info *iter;
if (rt->fib6_nh.fib_nh_dev == dev)
rt->fib6_nh.fib_nh_flags |= nh_flags;
if (rt->fib6_nh->fib_nh_dev == dev)
rt->fib6_nh->fib_nh_flags |= nh_flags;
list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
if (iter->fib6_nh.fib_nh_dev == dev)
iter->fib6_nh.fib_nh_flags |= nh_flags;
if (iter->fib6_nh->fib_nh_dev == dev)
iter->fib6_nh->fib_nh_flags |= nh_flags;
}
/* called with write lock held for table with rt */
@ -4115,12 +4231,12 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
switch (arg->event) {
case NETDEV_UNREGISTER:
return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
case NETDEV_DOWN:
if (rt->should_flush)
return -1;
if (!rt->fib6_nsiblings)
return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
if (rt6_multipath_uses_dev(rt, dev)) {
unsigned int count;
@ -4136,10 +4252,10 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
}
return -2;
case NETDEV_CHANGE:
if (rt->fib6_nh.fib_nh_dev != dev ||
if (rt->fib6_nh->fib_nh_dev != dev ||
rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
break;
rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
rt6_multipath_rebalance(rt);
break;
}
@ -4173,9 +4289,36 @@ void rt6_disable_ip(struct net_device *dev, unsigned long event)
struct rt6_mtu_change_arg {
struct net_device *dev;
unsigned int mtu;
struct fib6_info *f6i;
};
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg)
{
struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg;
struct fib6_info *f6i = arg->f6i;
/* For administrative MTU increase, there is no way to discover
* IPv6 PMTU increase, so PMTU increase should be updated here.
* Since RFC 1981 doesn't include administrative MTU increase
* update PMTU increase is a MUST. (i.e. jumbo frame)
*/
if (nh->fib_nh_dev == arg->dev) {
struct inet6_dev *idev = __in6_dev_get(arg->dev);
u32 mtu = f6i->fib6_pmtu;
if (mtu >= arg->mtu ||
(mtu < arg->mtu && mtu == idev->cnf.mtu6))
fib6_metric_set(f6i, RTAX_MTU, arg->mtu);
spin_lock_bh(&rt6_exception_lock);
rt6_exceptions_update_pmtu(idev, nh, arg->mtu);
spin_unlock_bh(&rt6_exception_lock);
}
return 0;
}
static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg)
{
struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
struct inet6_dev *idev;
@ -4190,24 +4333,11 @@ static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
if (!idev)
return 0;
/* For administrative MTU increase, there is no way to discover
IPv6 PMTU increase, so PMTU increase should be updated here.
Since RFC 1981 doesn't include administrative MTU increase
update PMTU increase is a MUST. (i.e. jumbo frame)
*/
if (rt->fib6_nh.fib_nh_dev == arg->dev &&
!fib6_metric_locked(rt, RTAX_MTU)) {
u32 mtu = rt->fib6_pmtu;
if (fib6_metric_locked(f6i, RTAX_MTU))
return 0;
if (mtu >= arg->mtu ||
(mtu < arg->mtu && mtu == idev->cnf.mtu6))
fib6_metric_set(rt, RTAX_MTU, arg->mtu);
spin_lock_bh(&rt6_exception_lock);
rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
spin_unlock_bh(&rt6_exception_lock);
}
return 0;
arg->f6i = f6i;
return fib6_nh_mtu_change(f6i->fib6_nh, arg);
}
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
@ -4487,7 +4617,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
goto cleanup;
}
rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;
rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;
err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
rt, &r_cfg);
@ -4654,7 +4784,7 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt)
nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
+ NLA_ALIGN(sizeof(struct rtnexthop))
+ nla_total_size(16) /* RTA_GATEWAY */
+ lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
+ lwtunnel_get_encap_size(rt->fib6_nh->fib_nh_lws);
nexthop_len *= rt->fib6_nsiblings;
}
@ -4672,7 +4802,7 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt)
+ nla_total_size(sizeof(struct rta_cacheinfo))
+ nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
+ nla_total_size(1) /* RTA_PREF */
+ lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
+ lwtunnel_get_encap_size(rt->fib6_nh->fib_nh_lws)
+ nexthop_len;
}
@ -4792,14 +4922,14 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
if (!mp)
goto nla_put_failure;
if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
rt->fib6_nh.fib_nh_weight) < 0)
if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
rt->fib6_nh->fib_nh_weight) < 0)
goto nla_put_failure;
list_for_each_entry_safe(sibling, next_sibling,
&rt->fib6_siblings, fib6_siblings) {
if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
sibling->fib6_nh.fib_nh_weight) < 0)
if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
sibling->fib6_nh->fib_nh_weight) < 0)
goto nla_put_failure;
}
@ -4807,7 +4937,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
} else {
unsigned char nh_flags = 0;
if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common,
&nh_flags, false) < 0)
goto nla_put_failure;
@ -4837,7 +4967,7 @@ nla_put_failure:
static bool fib6_info_uses_dev(const struct fib6_info *f6i,
const struct net_device *dev)
{
if (f6i->fib6_nh.fib_nh_dev == dev)
if (f6i->fib6_nh->fib_nh_dev == dev)
return true;
if (f6i->fib6_nsiblings) {
@ -4845,7 +4975,7 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i,
list_for_each_entry_safe(sibling, next_sibling,
&f6i->fib6_siblings, fib6_siblings) {
if (sibling->fib6_nh.fib_nh_dev == dev)
if (sibling->fib6_nh->fib_nh_dev == dev)
return true;
}
}
@ -5166,7 +5296,7 @@ static int ip6_route_dev_notify(struct notifier_block *this,
return NOTIFY_OK;
if (event == NETDEV_REGISTER) {
net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev;
net->ipv6.ip6_null_entry->dst.dev = dev;
net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
@ -5360,11 +5490,11 @@ static int __net_init ip6_route_net_init(struct net *net)
if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
goto out_ip6_dst_ops;
net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
sizeof(*net->ipv6.fib6_null_entry),
GFP_KERNEL);
net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true);
if (!net->ipv6.fib6_null_entry)
goto out_ip6_dst_entries;
memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template,
sizeof(*net->ipv6.fib6_null_entry));
net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
sizeof(*net->ipv6.ip6_null_entry),
@ -5501,7 +5631,7 @@ void __init ip6_route_init_special_entries(void)
/* Registering of the loopback is done before this portion of code,
* the loopback reference in rt6_info will not be taken, do it
* manually for init_net */
init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev;
init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES


@ -0,0 +1,455 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# redirect test
#
# .253 +----+
# +----| r1 |
# | +----+
# +----+ | |.1
# | h1 |--------------+ | 10.1.1.0/30 2001:db8:1::0/126
# +----+ .1 | |.2
# 172.16.1/24 | +----+ +----+
# 2001:db8:16:1/64 +----| r2 |-------------------| h2 |
# .254 +----+ .254 .2 +----+
# 172.16.2/24
# 2001:db8:16:2/64
#
# Route from h1 to h2 goes through r1, eth1 - connection between r1 and r2.
# Route on r1 changed to go to r2 via eth0. This causes a redirect to be sent
# from r1 to h1 telling h1 to use r2 when talking to h2.
VERBOSE=0
PAUSE_ON_FAIL=no
H1_N1_IP=172.16.1.1
R1_N1_IP=172.16.1.253
R2_N1_IP=172.16.1.254
H1_N1_IP6=2001:db8:16:1::1
R1_N1_IP6=2001:db8:16:1::253
R2_N1_IP6=2001:db8:16:1::254
R1_R2_N1_IP=10.1.1.1
R2_R1_N1_IP=10.1.1.2
R1_R2_N1_IP6=2001:db8:1::1
R2_R1_N1_IP6=2001:db8:1::2
H2_N2=172.16.2.0/24
H2_N2_6=2001:db8:16:2::/64
H2_N2_IP=172.16.2.2
R2_N2_IP=172.16.2.254
H2_N2_IP6=2001:db8:16:2::2
R2_N2_IP6=2001:db8:16:2::254
VRF=red
VRF_TABLE=1111
################################################################################
# helpers
log_section()
{
echo
echo "###########################################################################"
echo "$*"
echo "###########################################################################"
echo
}
log_test()
{
local rc=$1
local expected=$2
local msg="$3"
if [ ${rc} -eq ${expected} ]; then
printf "TEST: %-60s [ OK ]\n" "${msg}"
nsuccess=$((nsuccess+1))
else
ret=1
nfail=$((nfail+1))
printf "TEST: %-60s [FAIL]\n" "${msg}"
if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
echo
echo "hit enter to continue, 'q' to quit"
read a
[ "$a" = "q" ] && exit 1
fi
fi
}
run_cmd()
{
local cmd="$*"
local out
local rc
if [ "$VERBOSE" = "1" ]; then
echo "COMMAND: $cmd"
fi
out=$(eval $cmd 2>&1)
rc=$?
if [ "$VERBOSE" = "1" -a -n "$out" ]; then
echo "$out"
fi
[ "$VERBOSE" = "1" ] && echo
return $rc
}
get_linklocal()
{
local ns=$1
local dev=$2
local addr
addr=$(ip -netns $ns -6 -br addr show dev ${dev} | \
awk '{
for (i = 3; i <= NF; ++i) {
if ($i ~ /^fe80/)
print $i
}
}'
)
addr=${addr/\/*}
[ -z "$addr" ] && return 1
echo $addr
return 0
}
################################################################################
# setup and teardown
cleanup()
{
local ns
for ns in h1 h2 r1 r2; do
ip netns del $ns 2>/dev/null
done
}
create_vrf()
{
local ns=$1
ip -netns ${ns} link add ${VRF} type vrf table ${VRF_TABLE}
ip -netns ${ns} link set ${VRF} up
ip -netns ${ns} route add vrf ${VRF} unreachable default metric 8192
ip -netns ${ns} -6 route add vrf ${VRF} unreachable default metric 8192
ip -netns ${ns} addr add 127.0.0.1/8 dev ${VRF}
ip -netns ${ns} -6 addr add ::1 dev ${VRF} nodad
ip -netns ${ns} ru del pref 0
ip -netns ${ns} ru add pref 32765 from all lookup local
ip -netns ${ns} -6 ru del pref 0
ip -netns ${ns} -6 ru add pref 32765 from all lookup local
}
setup()
{
local ns
#
# create nodes as namespaces
#
for ns in h1 h2 r1 r2; do
ip netns add $ns
ip -netns $ns li set lo up
case "${ns}" in
h[12]) ip netns exec $ns sysctl -q -w net.ipv4.conf.all.accept_redirects=1
ip netns exec $ns sysctl -q -w net.ipv6.conf.all.accept_redirects=1
ip netns exec $ns sysctl -q -w net.ipv6.conf.all.keep_addr_on_down=1
;;
r[12]) ip netns exec $ns sysctl -q -w net.ipv4.ip_forward=1
ip netns exec $ns sysctl -q -w net.ipv4.conf.all.send_redirects=1
ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=1
ip netns exec $ns sysctl -q -w net.ipv6.route.mtu_expires=10
esac
done
#
# create interconnects
#
ip -netns h1 li add eth0 type veth peer name r1h1
ip -netns h1 li set r1h1 netns r1 name eth0 up
ip -netns h1 li add eth1 type veth peer name r2h1
ip -netns h1 li set r2h1 netns r2 name eth0 up
ip -netns h2 li add eth0 type veth peer name r2h2
ip -netns h2 li set eth0 up
ip -netns h2 li set r2h2 netns r2 name eth2 up
ip -netns r1 li add eth1 type veth peer name r2r1
ip -netns r1 li set eth1 up
ip -netns r1 li set r2r1 netns r2 name eth1 up
#
# h1
#
if [ "${WITH_VRF}" = "yes" ]; then
create_vrf "h1"
H1_VRF_ARG="vrf ${VRF}"
H1_PING_ARG="-I ${VRF}"
else
H1_VRF_ARG=
H1_PING_ARG=
fi
ip -netns h1 li add br0 type bridge
if [ "${WITH_VRF}" = "yes" ]; then
ip -netns h1 li set br0 vrf ${VRF} up
else
ip -netns h1 li set br0 up
fi
ip -netns h1 addr add dev br0 ${H1_N1_IP}/24
ip -netns h1 -6 addr add dev br0 ${H1_N1_IP6}/64 nodad
ip -netns h1 li set eth0 master br0 up
ip -netns h1 li set eth1 master br0 up
#
# h2
#
ip -netns h2 addr add dev eth0 ${H2_N2_IP}/24
ip -netns h2 ro add default via ${R2_N2_IP} dev eth0
ip -netns h2 -6 addr add dev eth0 ${H2_N2_IP6}/64 nodad
ip -netns h2 -6 ro add default via ${R2_N2_IP6} dev eth0
#
# r1
#
ip -netns r1 addr add dev eth0 ${R1_N1_IP}/24
ip -netns r1 -6 addr add dev eth0 ${R1_N1_IP6}/64 nodad
ip -netns r1 addr add dev eth1 ${R1_R2_N1_IP}/30
ip -netns r1 -6 addr add dev eth1 ${R1_R2_N1_IP6}/126 nodad
#
# r2
#
ip -netns r2 addr add dev eth0 ${R2_N1_IP}/24
ip -netns r2 -6 addr add dev eth0 ${R2_N1_IP6}/64 nodad
ip -netns r2 addr add dev eth1 ${R2_R1_N1_IP}/30
ip -netns r2 -6 addr add dev eth1 ${R2_R1_N1_IP6}/126 nodad
ip -netns r2 addr add dev eth2 ${R2_N2_IP}/24
ip -netns r2 -6 addr add dev eth2 ${R2_N2_IP6}/64 nodad
sleep 2
R1_LLADDR=$(get_linklocal r1 eth0)
if [ $? -ne 0 ]; then
echo "Error: Failed to get link-local address of r1's eth0"
exit 1
fi
R2_LLADDR=$(get_linklocal r2 eth0)
if [ $? -ne 0 ]; then
echo "Error: Failed to get link-local address of r2's eth0"
exit 1
fi
}
change_h2_mtu()
{
local mtu=$1
run_cmd ip -netns h2 li set eth0 mtu ${mtu}
run_cmd ip -netns r2 li set eth2 mtu ${mtu}
}
check_exception()
{
local mtu="$1"
local with_redirect="$2"
local desc="$3"
# From 172.16.1.101: icmp_seq=1 Redirect Host(New nexthop: 172.16.1.102)
if [ "$VERBOSE" = "1" ]; then
echo "Commands to check for exception:"
run_cmd ip -netns h1 ro get ${H1_VRF_ARG} ${H2_N2_IP}
run_cmd ip -netns h1 -6 ro get ${H1_VRF_ARG} ${H2_N2_IP6}
fi
if [ -n "${mtu}" ]; then
mtu=" mtu ${mtu}"
fi
if [ "$with_redirect" = "yes" ]; then
ip -netns h1 ro get ${H1_VRF_ARG} ${H2_N2_IP} | \
grep -q "cache <redirected> expires [0-9]*sec${mtu}"
elif [ -n "${mtu}" ]; then
ip -netns h1 ro get ${H1_VRF_ARG} ${H2_N2_IP} | \
grep -q "cache expires [0-9]*sec${mtu}"
else
ip -netns h1 ro get ${H1_VRF_ARG} ${H2_N2_IP} | \
grep -q "cache"
fi
log_test $? 0 "IPv4: ${desc}"
if [ "$with_redirect" = "yes" ]; then
ip -netns h1 -6 ro get ${H1_VRF_ARG} ${H2_N2_IP6} | grep -q "${H2_N2_IP6} from :: via ${R2_LLADDR} dev br0.*${mtu}"
else
ip -netns h1 -6 ro get ${H1_VRF_ARG} ${H2_N2_IP6} | grep -q "${mtu}"
fi
log_test $? 0 "IPv6: ${desc}"
}
run_ping()
{
local sz=$1
run_cmd ip netns exec h1 ping -q -M want -i 0.2 -c 10 -w 2 -s ${sz} ${H1_PING_ARG} ${H2_N2_IP}
run_cmd ip netns exec h1 ${ping6} -q -M want -i 0.2 -c 10 -w 2 -s ${sz} ${H1_PING_ARG} ${H2_N2_IP6}
}
replace_route_legacy()
{
# r1 to h2 via r2 and eth0
run_cmd ip -netns r1 ro replace ${H2_N2} via ${R2_N1_IP} dev eth0
run_cmd ip -netns r1 -6 ro replace ${H2_N2_6} via ${R2_LLADDR} dev eth0
}
initial_route_legacy()
{
# r1 to h2 via r2 and eth1
run_cmd ip -netns r1 ro add ${H2_N2} via ${R2_R1_N1_IP} dev eth1
run_cmd ip -netns r1 -6 ro add ${H2_N2_6} via ${R2_R1_N1_IP6} dev eth1
# h1 to h2 via r1
# - IPv6 redirect only works if gateway is the LLA
run_cmd ip -netns h1 ro add ${H1_VRF_ARG} ${H2_N2} via ${R1_N1_IP} dev br0
run_cmd ip -netns h1 -6 ro add ${H1_VRF_ARG} ${H2_N2_6} via ${R1_LLADDR} dev br0
}
check_connectivity()
{
local rc
run_cmd ip netns exec h1 ping -c1 -w1 ${H1_PING_ARG} ${H2_N2_IP}
rc=$?
run_cmd ip netns exec h1 ${ping6} -c1 -w1 ${H1_PING_ARG} ${H2_N2_IP6}
[ $? -ne 0 ] && rc=$?
return $rc
}
do_test()
{
local ttype="$1"
eval initial_route_${ttype}
# verify connectivity
check_connectivity
if [ $? -ne 0 ]; then
echo "Error: Basic connectivity is broken"
ret=1
return
fi
# redirect exception followed by mtu
eval replace_route_${ttype}
run_ping 64
check_exception "" "yes" "redirect exception"
check_connectivity
if [ $? -ne 0 ]; then
echo "Error: Basic connectivity is broken after redirect"
ret=1
return
fi
change_h2_mtu 1300
run_ping 1350
check_exception "1300" "yes" "redirect exception plus mtu"
# remove exceptions and restore routing
change_h2_mtu 1500
ip -netns h1 li set br0 down
ip -netns h1 li set br0 up
eval initial_route_${ttype}
check_connectivity
if [ $? -ne 0 ]; then
echo "Error: Basic connectivity is broken after reset"
ret=1
return
fi
check_exception "" "no" "routing reset"
# MTU exception followed by redirect
change_h2_mtu 1300
run_ping 1350
check_exception "1300" "no" "mtu exception"
eval replace_route_${ttype}
run_ping 64
check_exception "1300" "yes" "mtu exception plus redirect"
check_connectivity
if [ $? -ne 0 ]; then
echo "Error: Basic connectivity is broken after redirect"
ret=1
return
fi
}
################################################################################
# usage
usage()
{
cat <<EOF
usage: ${0##*/} OPTS
-p Pause on fail
-v verbose mode (show commands and output)
EOF
}
################################################################################
# main
# Some systems don't have a ping6 binary anymore
which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping)
ret=0
nsuccess=0
nfail=0
while getopts :pv o
do
case $o in
p) PAUSE_ON_FAIL=yes;;
v) VERBOSE=$(($VERBOSE + 1));;
*) usage; exit 1;;
esac
done
trap cleanup EXIT
cleanup
WITH_VRF=no
setup
log_section "Legacy routing"
do_test "legacy"
cleanup
log_section "Legacy routing with VRF"
WITH_VRF=yes
setup
do_test "legacy"
printf "\nTests passed: %3d\n" ${nsuccess}
printf "Tests failed: %3d\n" ${nfail}
exit $ret