Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf

Pablo Neira Ayuso says:

====================
Netfilter/IPVS fixes for net

The following patchset contains Netfilter/IPVS fixes for your net tree:

1) Infinite loop in IPVS when net namespace is released, from
   Tan Hu.

2) Do not show negative timeouts in ip_vs_conn by using the new
   jiffies_delta_to_msecs(), patches from Matteo Croce.

3) Set F_IFACE flag for linklocal addresses in ip6t_rpfilter,
   from Florian Westphal.

4) Fix overflow in set size allocation, from Taehee Yoo.

5) Use netlink_dump_start() from ctnetlink to fix a memleak in
   the error path, again from Florian.

6) Register nfnetlink_subsys in last place, otherwise the netns
   init path may lose the race and see uninitialized net->nft data.
   This also reverts the previous attempt to fix this by increasing
   the netns refcount, patches from Florian.

7) Remove conntrack entries on layer 4 protocol tracker module
   removal, from Florian.

8) Use GFP_KERNEL_ACCOUNT for xtables blob allocation, from
   Michal Hocko.

9) Get tproxy documentation in sync with existing codebase,
   from Mate Eckl.

10) Honor preset layer 3 protocol via ctx->family in the new nft_ct
    timeout infrastructure, from Harsha Sharma.

11) Let uapi nfnetlink_osf.h compile standalone with no errors,
    from Dmitry V. Levin.

12) Missing braces compilation warning in nft_tproxy, patch from
    Mate Eckl.

13) Remove bogus check that bails out on non-anonymous sets in
    the dynamic set update extension.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2018-08-18 09:59:19 -07:00
commit 3fe49d699a
20 changed files with 164 additions and 96 deletions

View File

@ -5,19 +5,28 @@ This feature adds Linux 2.2-like transparent proxy support to current kernels.
To use it, enable the socket match and the TPROXY target in your kernel config.
You will need policy routing too, so be sure to enable that as well.
From Linux 4.18 transparent proxy support is also available in nf_tables.
1. Making non-local sockets work
================================
The idea is that you identify packets with destination address matching a local
socket on your box, set the packet mark to a certain value, and then match on that
value using policy routing to have those packets delivered locally:
socket on your box, set the packet mark to a certain value:
# iptables -t mangle -N DIVERT
# iptables -t mangle -A PREROUTING -p tcp -m socket -j DIVERT
# iptables -t mangle -A DIVERT -j MARK --set-mark 1
# iptables -t mangle -A DIVERT -j ACCEPT
Alternatively you can do this in nft with the following commands:
# nft add table filter
# nft add chain filter divert "{ type filter hook prerouting priority -150; }"
# nft add rule filter divert meta l4proto tcp socket transparent 1 meta mark set 1 accept
And then match on that value using policy routing to have those packets
delivered locally:
# ip rule add fwmark 1 lookup 100
# ip route add local 0.0.0.0/0 dev lo table 100
@ -57,17 +66,28 @@ add rules like this to the iptables ruleset above:
# iptables -t mangle -A PREROUTING -p tcp --dport 80 -j TPROXY \
--tproxy-mark 0x1/0x1 --on-port 50080
Or the following rule to nft:
# nft add rule filter divert tcp dport 80 tproxy to :50080 meta mark set 1 accept
Note that for this to work you'll have to modify the proxy to enable (SOL_IP,
IP_TRANSPARENT) for the listening socket.
As an example implementation, tcprdr is available here:
https://git.breakpoint.cc/cgit/fw/tcprdr.git/
This tool is written by Florian Westphal and it was used for testing during the
nf_tables implementation.
3. Iptables extensions
======================
3. Iptables and nf_tables extensions
====================================
To use tproxy you'll need to have the 'socket' and 'TPROXY' modules
compiled for iptables. A patched version of iptables is available
here: http://git.balabit.hu/?p=bazsi/iptables-tproxy.git
To use tproxy you'll need to have the following modules compiled for iptables:
- NETFILTER_XT_MATCH_SOCKET
- NETFILTER_XT_TARGET_TPROXY
Or the following modules for nf_tables:
- NFT_SOCKET
- NFT_TPROXY
4. Application support
======================

View File

@ -447,6 +447,11 @@ static inline clock_t jiffies_delta_to_clock_t(long delta)
return jiffies_to_clock_t(max(0L, delta));
}
/*
 * Convert a signed jiffies delta to milliseconds.
 *
 * A negative @delta is clamped to 0 before the conversion, so a delta that
 * has already gone negative yields the conversion of 0 rather than a huge
 * unsigned value.
 */
static inline unsigned int jiffies_delta_to_msecs(long delta)
{
return jiffies_to_msecs(max(0L, delta));
}
extern unsigned long clock_t_to_jiffies(unsigned long x);
extern u64 jiffies_64_to_clock_t(u64 x);
extern u64 nsec_to_clock_t(u64 x);

View File

@ -274,7 +274,7 @@ enum nft_set_class {
* @space: memory class
*/
struct nft_set_estimate {
unsigned int size;
u64 size;
enum nft_set_class lookup;
enum nft_set_class space;
};
@ -336,7 +336,7 @@ struct nft_set_ops {
const struct nft_set_elem *elem,
unsigned int flags);
unsigned int (*privsize)(const struct nlattr * const nla[],
u64 (*privsize)(const struct nlattr * const nla[],
const struct nft_set_desc *desc);
bool (*estimate)(const struct nft_set_desc *desc,
u32 features,
@ -1374,6 +1374,6 @@ struct nft_trans_flowtable {
(((struct nft_trans_flowtable *)trans->data)->flowtable)
int __init nft_chain_filter_init(void);
void __exit nft_chain_filter_fini(void);
void nft_chain_filter_fini(void);
#endif /* _NET_NF_TABLES_H */

View File

@ -2,6 +2,8 @@
#define _NF_OSF_H
#include <linux/types.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#define MAXGENRELEN 32

View File

@ -21,8 +21,6 @@
#define _XT_OSF_H
#include <linux/types.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/netfilter/nfnetlink_osf.h>
#define XT_OSF_GENRE NF_OSF_GENRE

View File

@ -26,6 +26,12 @@ static bool rpfilter_addr_unicast(const struct in6_addr *addr)
return addr_type & IPV6_ADDR_UNICAST;
}
/*
 * Report whether @addr is a link-local IPv6 address.
 *
 * Returns true when ipv6_addr_type() sets the IPV6_ADDR_LINKLOCAL bit
 * for @addr, false otherwise.
 */
static bool rpfilter_addr_linklocal(const struct in6_addr *addr)
{
int addr_type = ipv6_addr_type(addr);
return addr_type & IPV6_ADDR_LINKLOCAL;
}
static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
const struct net_device *dev, u8 flags)
{
@ -48,7 +54,11 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
}
fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
if ((flags & XT_RPFILTER_LOOSE) == 0)
if (rpfilter_addr_linklocal(&iph->saddr)) {
lookup_flags |= RT6_LOOKUP_F_IFACE;
fl6.flowi6_oif = dev->ifindex;
} else if ((flags & XT_RPFILTER_LOOSE) == 0)
fl6.flowi6_oif = dev->ifindex;
rt = (void *)ip6_route_lookup(net, &fl6, skb, lookup_flags);

View File

@ -1117,24 +1117,28 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
"%s %04X %-11s %7lu%s\n",
"%s %04X %-11s %7u%s\n",
ip_vs_proto_name(cp->protocol),
&cp->caddr.in6, ntohs(cp->cport),
&cp->vaddr.in6, ntohs(cp->vport),
dbuf, ntohs(cp->dport),
ip_vs_state_name(cp),
(cp->timer.expires-jiffies)/HZ, pe_data);
jiffies_delta_to_msecs(cp->timer.expires -
jiffies) / 1000,
pe_data);
else
#endif
seq_printf(seq,
"%-3s %08X %04X %08X %04X"
" %s %04X %-11s %7lu%s\n",
" %s %04X %-11s %7u%s\n",
ip_vs_proto_name(cp->protocol),
ntohl(cp->caddr.ip), ntohs(cp->cport),
ntohl(cp->vaddr.ip), ntohs(cp->vport),
dbuf, ntohs(cp->dport),
ip_vs_state_name(cp),
(cp->timer.expires-jiffies)/HZ, pe_data);
jiffies_delta_to_msecs(cp->timer.expires -
jiffies) / 1000,
pe_data);
}
return 0;
}
@ -1179,26 +1183,28 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
"%s %04X %-11s %-6s %7lu\n",
"%s %04X %-11s %-6s %7u\n",
ip_vs_proto_name(cp->protocol),
&cp->caddr.in6, ntohs(cp->cport),
&cp->vaddr.in6, ntohs(cp->vport),
dbuf, ntohs(cp->dport),
ip_vs_state_name(cp),
ip_vs_origin_name(cp->flags),
(cp->timer.expires-jiffies)/HZ);
jiffies_delta_to_msecs(cp->timer.expires -
jiffies) / 1000);
else
#endif
seq_printf(seq,
"%-3s %08X %04X %08X %04X "
"%s %04X %-11s %-6s %7lu\n",
"%s %04X %-11s %-6s %7u\n",
ip_vs_proto_name(cp->protocol),
ntohl(cp->caddr.ip), ntohs(cp->cport),
ntohl(cp->vaddr.ip), ntohs(cp->vport),
dbuf, ntohs(cp->dport),
ip_vs_state_name(cp),
ip_vs_origin_name(cp->flags),
(cp->timer.expires-jiffies)/HZ);
jiffies_delta_to_msecs(cp->timer.expires -
jiffies) / 1000);
}
return 0;
}

View File

@ -1972,13 +1972,20 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
/* the destination server is not available */
if (sysctl_expire_nodest_conn(ipvs)) {
__u32 flags = cp->flags;
/* when timer already started, silently drop the packet.*/
if (timer_pending(&cp->timer))
__ip_vs_conn_put(cp);
else
ip_vs_conn_put(cp);
if (sysctl_expire_nodest_conn(ipvs) &&
!(flags & IP_VS_CONN_F_ONE_PACKET)) {
/* try to expire the connection immediately */
ip_vs_conn_expire_now(cp);
}
/* don't restart its timer, and silently
drop the packet. */
__ip_vs_conn_put(cp);
return NF_DROP;
}

View File

@ -846,6 +846,21 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[])
#endif
}
/*
 * Set up the optional mark filter for a conntrack dump.
 *
 * On entry cb->data holds the netlink attribute array (cda). When both
 * CTA_MARK and CTA_MARK_MASK are present, a filter is allocated with
 * ctnetlink_alloc_filter(). On success cb->data is replaced by the filter
 * pointer, which is NULL when no mark filter was requested.
 *
 * Returns 0 on success, or the error code carried by the pointer that
 * ctnetlink_alloc_filter() returned.
 */
static int ctnetlink_start(struct netlink_callback *cb)
{
const struct nlattr * const *cda = cb->data;
struct ctnetlink_filter *filter = NULL;
/* A filter is only built when both the mark and its mask are supplied. */
if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) {
filter = ctnetlink_alloc_filter(cda);
if (IS_ERR(filter))
return PTR_ERR(filter);
}
/* From here on cb->data carries the filter, not the attribute array. */
cb->data = filter;
return 0;
}
static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
{
struct ctnetlink_filter *filter = data;
@ -1290,19 +1305,12 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl,
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.start = ctnetlink_start,
.dump = ctnetlink_dump_table,
.done = ctnetlink_done,
.data = (void *)cda,
};
if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) {
struct ctnetlink_filter *filter;
filter = ctnetlink_alloc_filter(cda);
if (IS_ERR(filter))
return PTR_ERR(filter);
c.data = filter;
}
return netlink_dump_start(ctnl, skb, nlh, &c);
}

View File

@ -312,7 +312,9 @@ void nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *l4proto)
__nf_ct_l4proto_unregister_one(l4proto);
mutex_unlock(&nf_ct_proto_mutex);
synchronize_rcu();
synchronize_net();
/* Remove all conntrack entries for this protocol */
nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto);
}
EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister_one);
@ -333,14 +335,17 @@ static void
nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const l4proto[],
unsigned int num_proto)
{
int i;
mutex_lock(&nf_ct_proto_mutex);
while (num_proto-- != 0)
__nf_ct_l4proto_unregister_one(l4proto[num_proto]);
for (i = 0; i < num_proto; i++)
__nf_ct_l4proto_unregister_one(l4proto[i]);
mutex_unlock(&nf_ct_proto_mutex);
synchronize_net();
/* Remove all conntrack entries for this protocol */
nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto);
for (i = 0; i < num_proto; i++)
nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto[i]);
}
static int

View File

@ -3354,7 +3354,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
struct nft_set *set;
struct nft_ctx ctx;
char *name;
unsigned int size;
u64 size;
u64 timeout;
u32 ktype, dtype, flags, policy, gc_int, objtype;
struct nft_set_desc desc;
@ -5925,10 +5925,7 @@ static int nf_tables_flowtable_event(struct notifier_block *this,
if (event != NETDEV_UNREGISTER)
return 0;
net = maybe_get_net(dev_net(dev));
if (!net)
return 0;
net = dev_net(dev);
mutex_lock(&net->nft.commit_mutex);
list_for_each_entry(table, &net->nft.tables, list) {
list_for_each_entry(flowtable, &table->flowtables, list) {
@ -5936,7 +5933,7 @@ static int nf_tables_flowtable_event(struct notifier_block *this,
}
}
mutex_unlock(&net->nft.commit_mutex);
put_net(net);
return NOTIFY_DONE;
}
@ -7273,21 +7270,36 @@ static int __init nf_tables_module_init(void)
{
int err;
nft_chain_filter_init();
err = nf_tables_core_module_init();
err = register_pernet_subsys(&nf_tables_net_ops);
if (err < 0)
return err;
err = nft_chain_filter_init();
if (err < 0)
goto err1;
err = nf_tables_core_module_init();
if (err < 0)
goto err2;
err = register_netdevice_notifier(&nf_tables_flowtable_notifier);
if (err < 0)
goto err3;
/* must be last */
err = nfnetlink_subsys_register(&nf_tables_subsys);
if (err < 0)
goto err;
goto err4;
register_netdevice_notifier(&nf_tables_flowtable_notifier);
return register_pernet_subsys(&nf_tables_net_ops);
err:
return err;
err4:
unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
err3:
nf_tables_core_module_exit();
err2:
nft_chain_filter_fini();
err1:
unregister_pernet_subsys(&nf_tables_net_ops);
return err;
}

View File

@ -238,29 +238,33 @@ static const struct nla_policy filter_policy[NFACCT_FILTER_MAX + 1] = {
[NFACCT_FILTER_VALUE] = { .type = NLA_U32 },
};
static struct nfacct_filter *
nfacct_filter_alloc(const struct nlattr * const attr)
static int nfnl_acct_start(struct netlink_callback *cb)
{
struct nfacct_filter *filter;
const struct nlattr *const attr = cb->data;
struct nlattr *tb[NFACCT_FILTER_MAX + 1];
struct nfacct_filter *filter;
int err;
if (!attr)
return 0;
err = nla_parse_nested(tb, NFACCT_FILTER_MAX, attr, filter_policy,
NULL);
if (err < 0)
return ERR_PTR(err);
return err;
if (!tb[NFACCT_FILTER_MASK] || !tb[NFACCT_FILTER_VALUE])
return ERR_PTR(-EINVAL);
return -EINVAL;
filter = kzalloc(sizeof(struct nfacct_filter), GFP_KERNEL);
if (!filter)
return ERR_PTR(-ENOMEM);
return -ENOMEM;
filter->mask = ntohl(nla_get_be32(tb[NFACCT_FILTER_MASK]));
filter->value = ntohl(nla_get_be32(tb[NFACCT_FILTER_VALUE]));
cb->data = filter;
return filter;
return 0;
}
static int nfnl_acct_get(struct net *net, struct sock *nfnl,
@ -275,18 +279,11 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl,
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = nfnl_acct_dump,
.start = nfnl_acct_start,
.done = nfnl_acct_done,
.data = (void *)tb[NFACCT_FILTER],
};
if (tb[NFACCT_FILTER]) {
struct nfacct_filter *filter;
filter = nfacct_filter_alloc(tb[NFACCT_FILTER]);
if (IS_ERR(filter))
return PTR_ERR(filter);
c.data = filter;
}
return netlink_dump_start(nfnl, skb, nlh, &c);
}

View File

@ -293,6 +293,13 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev,
if (strcmp(basechain->dev_name, dev->name) != 0)
return;
/* UNREGISTER events are also happening on netns exit.
*
* Although nf_tables core releases all tables/chains, only
* this event handler provides guarantee that
* basechain.ops->dev is still accessible, so we cannot
* skip exiting net namespaces.
*/
__nft_release_basechain(ctx);
break;
case NETDEV_CHANGENAME:
@ -318,10 +325,6 @@ static int nf_tables_netdev_event(struct notifier_block *this,
event != NETDEV_CHANGENAME)
return NOTIFY_DONE;
ctx.net = maybe_get_net(ctx.net);
if (!ctx.net)
return NOTIFY_DONE;
mutex_lock(&ctx.net->nft.commit_mutex);
list_for_each_entry(table, &ctx.net->nft.tables, list) {
if (table->family != NFPROTO_NETDEV)
@ -338,7 +341,6 @@ static int nf_tables_netdev_event(struct notifier_block *this,
}
}
mutex_unlock(&ctx.net->nft.commit_mutex);
put_net(ctx.net);
return NOTIFY_DONE;
}
@ -392,7 +394,7 @@ int __init nft_chain_filter_init(void)
return 0;
}
void __exit nft_chain_filter_fini(void)
void nft_chain_filter_fini(void)
{
nft_chain_filter_bridge_fini();
nft_chain_filter_inet_fini();

View File

@ -832,12 +832,13 @@ static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx,
__u8 l4num;
int ret;
if (!tb[NFTA_CT_TIMEOUT_L3PROTO] ||
!tb[NFTA_CT_TIMEOUT_L4PROTO] ||
if (!tb[NFTA_CT_TIMEOUT_L4PROTO] ||
!tb[NFTA_CT_TIMEOUT_DATA])
return -EINVAL;
l3num = ntohs(nla_get_be16(tb[NFTA_CT_TIMEOUT_L3PROTO]));
if (tb[NFTA_CT_TIMEOUT_L3PROTO])
l3num = ntohs(nla_get_be16(tb[NFTA_CT_TIMEOUT_L3PROTO]));
l4num = nla_get_u8(tb[NFTA_CT_TIMEOUT_L4PROTO]);
priv->l4proto = l4num;

View File

@ -187,8 +187,6 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
if (tb[NFTA_DYNSET_EXPR] != NULL) {
if (!(set->flags & NFT_SET_EVAL))
return -EINVAL;
if (!nft_set_is_anonymous(set))
return -EOPNOTSUPP;
priv->expr = nft_expr_init(ctx, tb[NFTA_DYNSET_EXPR]);
if (IS_ERR(priv->expr))

View File

@ -248,13 +248,13 @@ static inline u32 nft_bitmap_size(u32 klen)
return ((2 << ((klen * BITS_PER_BYTE) - 1)) / BITS_PER_BYTE) << 1;
}
static inline u32 nft_bitmap_total_size(u32 klen)
static inline u64 nft_bitmap_total_size(u32 klen)
{
return sizeof(struct nft_bitmap) + nft_bitmap_size(klen);
}
static unsigned int nft_bitmap_privsize(const struct nlattr * const nla[],
const struct nft_set_desc *desc)
static u64 nft_bitmap_privsize(const struct nlattr * const nla[],
const struct nft_set_desc *desc)
{
u32 klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN]));

View File

@ -341,8 +341,8 @@ schedule:
nft_set_gc_interval(set));
}
static unsigned int nft_rhash_privsize(const struct nlattr * const nla[],
const struct nft_set_desc *desc)
static u64 nft_rhash_privsize(const struct nlattr * const nla[],
const struct nft_set_desc *desc)
{
return sizeof(struct nft_rhash);
}
@ -585,8 +585,8 @@ cont:
}
}
static unsigned int nft_hash_privsize(const struct nlattr * const nla[],
const struct nft_set_desc *desc)
static u64 nft_hash_privsize(const struct nlattr * const nla[],
const struct nft_set_desc *desc)
{
return sizeof(struct nft_hash) +
nft_hash_buckets(desc->size) * sizeof(struct hlist_head);

View File

@ -411,8 +411,8 @@ static void nft_rbtree_gc(struct work_struct *work)
nft_set_gc_interval(set));
}
static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[],
const struct nft_set_desc *desc)
static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
const struct nft_set_desc *desc)
{
return sizeof(struct nft_rbtree);
}

View File

@ -82,13 +82,15 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr,
const struct nft_tproxy *priv = nft_expr_priv(expr);
struct sk_buff *skb = pkt->skb;
const struct ipv6hdr *iph = ipv6_hdr(skb);
struct in6_addr taddr = {0};
struct in6_addr taddr;
int thoff = pkt->xt.thoff;
struct udphdr _hdr, *hp;
__be16 tport = 0;
struct sock *sk;
int l4proto;
memset(&taddr, 0, sizeof(taddr));
if (!pkt->tprot_set) {
regs->verdict.code = NFT_BREAK;
return;

View File

@ -1178,12 +1178,7 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
if (sz < sizeof(*info) || sz >= XT_MAX_TABLE_SIZE)
return NULL;
/* __GFP_NORETRY is not fully supported by kvmalloc but it should
* work reasonably well if sz is too large and bail out rather
* than shoot all processes down before realizing there is nothing
* more to reclaim.
*/
info = kvmalloc(sz, GFP_KERNEL | __GFP_NORETRY);
info = kvmalloc(sz, GFP_KERNEL_ACCOUNT);
if (!info)
return NULL;