From 07dc8bc9a6b15f54d3ad962af74a096c7d7b42b4 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 7 Nov 2017 10:08:01 +0000
Subject: [PATCH 01/76] netfilter: remove redundant assignment to e

The assignment to variable e is redundant since the same assignment
occurs just a few lines later, hence it can be removed.  Cleans up
clang warning for arp_tables, ip_tables and ip6_tables:

warning: Value stored to 'e' is never read

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/arp_tables.c | 1 -
 net/ipv4/netfilter/ip_tables.c  | 1 -
 net/ipv6/netfilter/ip6_tables.c | 1 -
 3 files changed, 3 deletions(-)

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index f88221aebc9d..0c3c944a7b72 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -373,7 +373,6 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
 					if (!xt_find_jump_offset(offsets, newpos,
 								 newinfo->number))
 						return 0;
-					e = entry0 + newpos;
 				} else {
 					/* ... this is a fallthru */
 					newpos = pos + e->next_offset;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 4cbe5e80f3bf..2e0d339028bb 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -439,7 +439,6 @@ mark_source_chains(const struct xt_table_info *newinfo,
 					if (!xt_find_jump_offset(offsets, newpos,
 								 newinfo->number))
 						return 0;
-					e = entry0 + newpos;
 				} else {
 					/* ... this is a fallthru */
 					newpos = pos + e->next_offset;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index f06e25065a34..1d7ae9366335 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -458,7 +458,6 @@ mark_source_chains(const struct xt_table_info *newinfo,
 					if (!xt_find_jump_offset(offsets, newpos,
 								 newinfo->number))
 						return 0;
-					e = entry0 + newpos;
 				} else {
 					/* ... this is a fallthru */
 					newpos = pos + e->next_offset;

From 613d0776d3fe7eb28c695a63a5533a1ec8258c86 Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@virtuozzo.com>
Date: Sun, 12 Nov 2017 14:32:37 +0300
Subject: [PATCH 02/76] netfilter: exit_net cleanup check added

Be sure that lists initialized in net_init hook was return to initial
state.

Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 1 +
 net/netfilter/nf_tables_api.c      | 7 +++++++
 net/netfilter/nfnetlink_log.c      | 5 +++++
 net/netfilter/nfnetlink_queue.c    | 5 +++++
 net/netfilter/x_tables.c           | 9 +++++++++
 5 files changed, 27 insertions(+)

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 17b4ca562944..e35b8d074f06 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -819,6 +819,7 @@ static void clusterip_net_exit(struct net *net)
 	cn->procdir = NULL;
 #endif
 	nf_unregister_net_hook(net, &cip_arp_ops);
+	WARN_ON_ONCE(!list_empty(&cn->configs));
 }
 
 static struct pernet_operations clusterip_net_ops = {
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index d8327b43e4dc..10798b357481 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5847,6 +5847,12 @@ static int __net_init nf_tables_init_net(struct net *net)
 	return 0;
 }
 
+static void __net_exit nf_tables_exit_net(struct net *net)
+{
+	WARN_ON_ONCE(!list_empty(&net->nft.af_info));
+	WARN_ON_ONCE(!list_empty(&net->nft.commit_list));
+}
+
 int __nft_release_basechain(struct nft_ctx *ctx)
 {
 	struct nft_rule *rule, *nr;
@@ -5917,6 +5923,7 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
 
 static struct pernet_operations nf_tables_net_ops = {
 	.init	= nf_tables_init_net,
+	.exit	= nf_tables_exit_net,
 };
 
 static int __init nf_tables_module_init(void)
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index cad6498f10b0..1f511ed0fea3 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -1093,10 +1093,15 @@ static int __net_init nfnl_log_net_init(struct net *net)
 
 static void __net_exit nfnl_log_net_exit(struct net *net)
 {
+	struct nfnl_log_net *log = nfnl_log_pernet(net);
+	unsigned int i;
+
 #ifdef CONFIG_PROC_FS
 	remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter);
 #endif
 	nf_log_unset(net, &nfulnl_logger);
+	for (i = 0; i < INSTANCE_BUCKETS; i++)
+		WARN_ON_ONCE(!hlist_empty(&log->instance_table[i]));
 }
 
 static struct pernet_operations nfnl_log_net_ops = {
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index a16356cacec3..c09b36755ed7 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -1512,10 +1512,15 @@ static int __net_init nfnl_queue_net_init(struct net *net)
 
 static void __net_exit nfnl_queue_net_exit(struct net *net)
 {
+	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+	unsigned int i;
+
 	nf_unregister_queue_handler(net);
 #ifdef CONFIG_PROC_FS
 	remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter);
 #endif
+	for (i = 0; i < INSTANCE_BUCKETS; i++)
+		WARN_ON_ONCE(!hlist_empty(&q->instance_table[i]));
 }
 
 static void nfnl_queue_net_exit_batch(struct list_head *net_exit_list)
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index a77dd514297c..55802e97f906 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1729,8 +1729,17 @@ static int __net_init xt_net_init(struct net *net)
 	return 0;
 }
 
+static void __net_exit xt_net_exit(struct net *net)
+{
+	int i;
+
+	for (i = 0; i < NFPROTO_NUMPROTO; i++)
+		WARN_ON_ONCE(!list_empty(&net->xt.tables[i]));
+}
+
 static struct pernet_operations xt_net_ops = {
 	.init = xt_net_init,
+	.exit = xt_net_exit,
 };
 
 static int __init xt_init(void)

From bc7d811ace4ad39a3941089ca871633366878719 Mon Sep 17 00:00:00 2001
From: Eric Sesterhenn <eric.sesterhenn@x41-dsec.de>
Date: Mon, 13 Nov 2017 09:09:40 +0100
Subject: [PATCH 03/76] netfilter: nf_ct_h323: Convert CHECK_BOUND macro to
 function

It is bad practive to return in a macro, this patch
moves the check into a function.

Signed-off-by: Eric Sesterhenn <eric.sesterhenn@x41-dsec.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_h323_asn1.c | 94 ++++++++++++++++++--------
 1 file changed, 65 insertions(+), 29 deletions(-)

diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c
index cf1bf2605c10..3d9a009ac147 100644
--- a/net/netfilter/nf_conntrack_h323_asn1.c
+++ b/net/netfilter/nf_conntrack_h323_asn1.c
@@ -103,7 +103,6 @@ struct bitstr {
 #define INC_BIT(bs) if((++(bs)->bit)>7){(bs)->cur++;(bs)->bit=0;}
 #define INC_BITS(bs,b) if(((bs)->bit+=(b))>7){(bs)->cur+=(bs)->bit>>3;(bs)->bit&=7;}
 #define BYTE_ALIGN(bs) if((bs)->bit){(bs)->cur++;(bs)->bit=0;}
-#define CHECK_BOUND(bs,n) if((bs)->cur+(n)>(bs)->end)return(H323_ERROR_BOUND)
 static unsigned int get_len(struct bitstr *bs);
 static unsigned int get_bit(struct bitstr *bs);
 static unsigned int get_bits(struct bitstr *bs, unsigned int b);
@@ -165,6 +164,14 @@ static unsigned int get_len(struct bitstr *bs)
 	return v;
 }
 
+static int nf_h323_error_boundary(struct bitstr *bs, size_t bytes)
+{
+	if (*bs->cur + bytes > *bs->end)
+		return 1;
+
+	return 0;
+}
+
 /****************************************************************************/
 static unsigned int get_bit(struct bitstr *bs)
 {
@@ -280,7 +287,8 @@ static int decode_bool(struct bitstr *bs, const struct field_t *f,
 
 	INC_BIT(bs);
 
-	CHECK_BOUND(bs, 0);
+	if (nf_h323_error_boundary(bs, 0))
+		return H323_ERROR_BOUND;
 	return H323_ERROR_NONE;
 }
 
@@ -293,11 +301,14 @@ static int decode_oid(struct bitstr *bs, const struct field_t *f,
 	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
 
 	BYTE_ALIGN(bs);
-	CHECK_BOUND(bs, 1);
+	if (nf_h323_error_boundary(bs, 1))
+		return H323_ERROR_BOUND;
+
 	len = *bs->cur++;
 	bs->cur += len;
+	if (nf_h323_error_boundary(bs, 0))
+		return H323_ERROR_BOUND;
 
-	CHECK_BOUND(bs, 0);
 	return H323_ERROR_NONE;
 }
 
@@ -330,7 +341,8 @@ static int decode_int(struct bitstr *bs, const struct field_t *f,
 		break;
 	case UNCO:
 		BYTE_ALIGN(bs);
-		CHECK_BOUND(bs, 2);
+		if (nf_h323_error_boundary(bs, 2))
+			return H323_ERROR_BOUND;
 		len = get_len(bs);
 		bs->cur += len;
 		break;
@@ -341,7 +353,8 @@ static int decode_int(struct bitstr *bs, const struct field_t *f,
 
 	PRINT("\n");
 
-	CHECK_BOUND(bs, 0);
+	if (nf_h323_error_boundary(bs, 0))
+		return H323_ERROR_BOUND;
 	return H323_ERROR_NONE;
 }
 
@@ -357,7 +370,8 @@ static int decode_enum(struct bitstr *bs, const struct field_t *f,
 		INC_BITS(bs, f->sz);
 	}
 
-	CHECK_BOUND(bs, 0);
+	if (nf_h323_error_boundary(bs, 0))
+		return H323_ERROR_BOUND;
 	return H323_ERROR_NONE;
 }
 
@@ -375,12 +389,14 @@ static int decode_bitstr(struct bitstr *bs, const struct field_t *f,
 		len = f->lb;
 		break;
 	case WORD:		/* 2-byte length */
-		CHECK_BOUND(bs, 2);
+		if (nf_h323_error_boundary(bs, 2))
+			return H323_ERROR_BOUND;
 		len = (*bs->cur++) << 8;
 		len += (*bs->cur++) + f->lb;
 		break;
 	case SEMI:
-		CHECK_BOUND(bs, 2);
+		if (nf_h323_error_boundary(bs, 2))
+			return H323_ERROR_BOUND;
 		len = get_len(bs);
 		break;
 	default:
@@ -391,7 +407,8 @@ static int decode_bitstr(struct bitstr *bs, const struct field_t *f,
 	bs->cur += len >> 3;
 	bs->bit = len & 7;
 
-	CHECK_BOUND(bs, 0);
+	if (nf_h323_error_boundary(bs, 0))
+		return H323_ERROR_BOUND;
 	return H323_ERROR_NONE;
 }
 
@@ -409,7 +426,8 @@ static int decode_numstr(struct bitstr *bs, const struct field_t *f,
 	BYTE_ALIGN(bs);
 	INC_BITS(bs, (len << 2));
 
-	CHECK_BOUND(bs, 0);
+	if (nf_h323_error_boundary(bs, 0))
+		return H323_ERROR_BOUND;
 	return H323_ERROR_NONE;
 }
 
@@ -440,12 +458,14 @@ static int decode_octstr(struct bitstr *bs, const struct field_t *f,
 		break;
 	case BYTE:		/* Range == 256 */
 		BYTE_ALIGN(bs);
-		CHECK_BOUND(bs, 1);
+		if (nf_h323_error_boundary(bs, 1))
+			return H323_ERROR_BOUND;
 		len = (*bs->cur++) + f->lb;
 		break;
 	case SEMI:
 		BYTE_ALIGN(bs);
-		CHECK_BOUND(bs, 2);
+		if (nf_h323_error_boundary(bs, 2))
+			return H323_ERROR_BOUND;
 		len = get_len(bs) + f->lb;
 		break;
 	default:		/* 2 <= Range <= 255 */
@@ -458,7 +478,8 @@ static int decode_octstr(struct bitstr *bs, const struct field_t *f,
 
 	PRINT("\n");
 
-	CHECK_BOUND(bs, 0);
+	if (nf_h323_error_boundary(bs, 0))
+		return H323_ERROR_BOUND;
 	return H323_ERROR_NONE;
 }
 
@@ -473,7 +494,8 @@ static int decode_bmpstr(struct bitstr *bs, const struct field_t *f,
 	switch (f->sz) {
 	case BYTE:		/* Range == 256 */
 		BYTE_ALIGN(bs);
-		CHECK_BOUND(bs, 1);
+		if (nf_h323_error_boundary(bs, 1))
+			return H323_ERROR_BOUND;
 		len = (*bs->cur++) + f->lb;
 		break;
 	default:		/* 2 <= Range <= 255 */
@@ -484,7 +506,8 @@ static int decode_bmpstr(struct bitstr *bs, const struct field_t *f,
 
 	bs->cur += len << 1;
 
-	CHECK_BOUND(bs, 0);
+	if (nf_h323_error_boundary(bs, 0))
+		return H323_ERROR_BOUND;
 	return H323_ERROR_NONE;
 }
 
@@ -525,9 +548,11 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f,
 
 		/* Decode */
 		if (son->attr & OPEN) {	/* Open field */
-			CHECK_BOUND(bs, 2);
+			if (nf_h323_error_boundary(bs, 2))
+				return H323_ERROR_BOUND;
 			len = get_len(bs);
-			CHECK_BOUND(bs, len);
+			if (nf_h323_error_boundary(bs, len))
+				return H323_ERROR_BOUND;
 			if (!base || !(son->attr & DECODE)) {
 				PRINT("%*.s%s\n", (level + 1) * TAB_SIZE,
 				      " ", son->name);
@@ -556,7 +581,8 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f,
 
 	/* Get the extension bitmap */
 	bmp2_len = get_bits(bs, 7) + 1;
-	CHECK_BOUND(bs, (bmp2_len + 7) >> 3);
+	if (nf_h323_error_boundary(bs, (bmp2_len + 7) >> 3))
+		return H323_ERROR_BOUND;
 	bmp2 = get_bitmap(bs, bmp2_len);
 	bmp |= bmp2 >> f->sz;
 	if (base)
@@ -567,9 +593,11 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f,
 	for (opt = 0; opt < bmp2_len; opt++, i++, son++) {
 		/* Check Range */
 		if (i >= f->ub) {	/* Newer Version? */
-			CHECK_BOUND(bs, 2);
+			if (nf_h323_error_boundary(bs, 2))
+				return H323_ERROR_BOUND;
 			len = get_len(bs);
-			CHECK_BOUND(bs, len);
+			if (nf_h323_error_boundary(bs, len))
+				return H323_ERROR_BOUND;
 			bs->cur += len;
 			continue;
 		}
@@ -583,9 +611,11 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f,
 		if (!((0x80000000 >> opt) & bmp2))	/* Not present */
 			continue;
 
-		CHECK_BOUND(bs, 2);
+		if (nf_h323_error_boundary(bs, 2))
+			return H323_ERROR_BOUND;
 		len = get_len(bs);
-		CHECK_BOUND(bs, len);
+		if (nf_h323_error_boundary(bs, len))
+			return H323_ERROR_BOUND;
 		if (!base || !(son->attr & DECODE)) {
 			PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ",
 			      son->name);
@@ -623,19 +653,22 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f,
 	switch (f->sz) {
 	case BYTE:
 		BYTE_ALIGN(bs);
-		CHECK_BOUND(bs, 1);
+		if (nf_h323_error_boundary(bs, 1))
+			return H323_ERROR_BOUND;
 		count = *bs->cur++;
 		break;
 	case WORD:
 		BYTE_ALIGN(bs);
-		CHECK_BOUND(bs, 2);
+		if (nf_h323_error_boundary(bs, 2))
+			return H323_ERROR_BOUND;
 		count = *bs->cur++;
 		count <<= 8;
 		count += *bs->cur++;
 		break;
 	case SEMI:
 		BYTE_ALIGN(bs);
-		CHECK_BOUND(bs, 2);
+		if (nf_h323_error_boundary(bs, 2))
+			return H323_ERROR_BOUND;
 		count = get_len(bs);
 		break;
 	default:
@@ -659,7 +692,8 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f,
 		if (son->attr & OPEN) {
 			BYTE_ALIGN(bs);
 			len = get_len(bs);
-			CHECK_BOUND(bs, len);
+			if (nf_h323_error_boundary(bs, len))
+				return H323_ERROR_BOUND;
 			if (!base || !(son->attr & DECODE)) {
 				PRINT("%*.s%s\n", (level + 1) * TAB_SIZE,
 				      " ", son->name);
@@ -728,7 +762,8 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f,
 	if (type >= f->ub) {	/* Newer version? */
 		BYTE_ALIGN(bs);
 		len = get_len(bs);
-		CHECK_BOUND(bs, len);
+		if (nf_h323_error_boundary(bs, len))
+			return H323_ERROR_BOUND;
 		bs->cur += len;
 		return H323_ERROR_NONE;
 	}
@@ -743,7 +778,8 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f,
 	if (ext || (son->attr & OPEN)) {
 		BYTE_ALIGN(bs);
 		len = get_len(bs);
-		CHECK_BOUND(bs, len);
+		if (nf_h323_error_boundary(bs, len))
+			return H323_ERROR_BOUND;
 		if (!base || !(son->attr & DECODE)) {
 			PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ",
 			      son->name);

From ec8a8f3c31ddef0a7d9626c4b8a4baa30f3b80aa Mon Sep 17 00:00:00 2001
From: Eric Sesterhenn <eric.sesterhenn@x41-dsec.de>
Date: Mon, 13 Nov 2017 09:09:41 +0100
Subject: [PATCH 04/76] netfilter: nf_ct_h323: Extend nf_h323_error_boundary to
 work on bits as well

This patch fixes several out of bounds memory reads by extending
the nf_h323_error_boundary() function to work on bits as well
an check the affected parts.

Signed-off-by: Eric Sesterhenn <eric.sesterhenn@x41-dsec.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_h323_asn1.c | 92 +++++++++++++++++---------
 1 file changed, 62 insertions(+), 30 deletions(-)

diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c
index 3d9a009ac147..dc6347342e34 100644
--- a/net/netfilter/nf_conntrack_h323_asn1.c
+++ b/net/netfilter/nf_conntrack_h323_asn1.c
@@ -164,8 +164,13 @@ static unsigned int get_len(struct bitstr *bs)
 	return v;
 }
 
-static int nf_h323_error_boundary(struct bitstr *bs, size_t bytes)
+static int nf_h323_error_boundary(struct bitstr *bs, size_t bytes, size_t bits)
 {
+	bits += bs->bit;
+	bytes += bits / BITS_PER_BYTE;
+	if (bits % BITS_PER_BYTE > 0)
+		bytes++;
+
 	if (*bs->cur + bytes > *bs->end)
 		return 1;
 
@@ -286,8 +291,7 @@ static int decode_bool(struct bitstr *bs, const struct field_t *f,
 	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
 
 	INC_BIT(bs);
-
-	if (nf_h323_error_boundary(bs, 0))
+	if (nf_h323_error_boundary(bs, 0, 0))
 		return H323_ERROR_BOUND;
 	return H323_ERROR_NONE;
 }
@@ -301,12 +305,12 @@ static int decode_oid(struct bitstr *bs, const struct field_t *f,
 	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
 
 	BYTE_ALIGN(bs);
-	if (nf_h323_error_boundary(bs, 1))
+	if (nf_h323_error_boundary(bs, 1, 0))
 		return H323_ERROR_BOUND;
 
 	len = *bs->cur++;
 	bs->cur += len;
-	if (nf_h323_error_boundary(bs, 0))
+	if (nf_h323_error_boundary(bs, 0, 0))
 		return H323_ERROR_BOUND;
 
 	return H323_ERROR_NONE;
@@ -330,6 +334,8 @@ static int decode_int(struct bitstr *bs, const struct field_t *f,
 		bs->cur += 2;
 		break;
 	case CONS:		/* 64K < Range < 4G */
+		if (nf_h323_error_boundary(bs, 0, 2))
+			return H323_ERROR_BOUND;
 		len = get_bits(bs, 2) + 1;
 		BYTE_ALIGN(bs);
 		if (base && (f->attr & DECODE)) {	/* timeToLive */
@@ -341,7 +347,7 @@ static int decode_int(struct bitstr *bs, const struct field_t *f,
 		break;
 	case UNCO:
 		BYTE_ALIGN(bs);
-		if (nf_h323_error_boundary(bs, 2))
+		if (nf_h323_error_boundary(bs, 2, 0))
 			return H323_ERROR_BOUND;
 		len = get_len(bs);
 		bs->cur += len;
@@ -353,7 +359,7 @@ static int decode_int(struct bitstr *bs, const struct field_t *f,
 
 	PRINT("\n");
 
-	if (nf_h323_error_boundary(bs, 0))
+	if (nf_h323_error_boundary(bs, 0, 0))
 		return H323_ERROR_BOUND;
 	return H323_ERROR_NONE;
 }
@@ -370,7 +376,7 @@ static int decode_enum(struct bitstr *bs, const struct field_t *f,
 		INC_BITS(bs, f->sz);
 	}
 
-	if (nf_h323_error_boundary(bs, 0))
+	if (nf_h323_error_boundary(bs, 0, 0))
 		return H323_ERROR_BOUND;
 	return H323_ERROR_NONE;
 }
@@ -389,13 +395,13 @@ static int decode_bitstr(struct bitstr *bs, const struct field_t *f,
 		len = f->lb;
 		break;
 	case WORD:		/* 2-byte length */
-		if (nf_h323_error_boundary(bs, 2))
+		if (nf_h323_error_boundary(bs, 2, 0))
 			return H323_ERROR_BOUND;
 		len = (*bs->cur++) << 8;
 		len += (*bs->cur++) + f->lb;
 		break;
 	case SEMI:
-		if (nf_h323_error_boundary(bs, 2))
+		if (nf_h323_error_boundary(bs, 2, 0))
 			return H323_ERROR_BOUND;
 		len = get_len(bs);
 		break;
@@ -407,7 +413,7 @@ static int decode_bitstr(struct bitstr *bs, const struct field_t *f,
 	bs->cur += len >> 3;
 	bs->bit = len & 7;
 
-	if (nf_h323_error_boundary(bs, 0))
+	if (nf_h323_error_boundary(bs, 0, 0))
 		return H323_ERROR_BOUND;
 	return H323_ERROR_NONE;
 }
@@ -421,12 +427,14 @@ static int decode_numstr(struct bitstr *bs, const struct field_t *f,
 	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
 
 	/* 2 <= Range <= 255 */
+	if (nf_h323_error_boundary(bs, 0, f->sz))
+		return H323_ERROR_BOUND;
 	len = get_bits(bs, f->sz) + f->lb;
 
 	BYTE_ALIGN(bs);
 	INC_BITS(bs, (len << 2));
 
-	if (nf_h323_error_boundary(bs, 0))
+	if (nf_h323_error_boundary(bs, 0, 0))
 		return H323_ERROR_BOUND;
 	return H323_ERROR_NONE;
 }
@@ -458,17 +466,19 @@ static int decode_octstr(struct bitstr *bs, const struct field_t *f,
 		break;
 	case BYTE:		/* Range == 256 */
 		BYTE_ALIGN(bs);
-		if (nf_h323_error_boundary(bs, 1))
+		if (nf_h323_error_boundary(bs, 1, 0))
 			return H323_ERROR_BOUND;
 		len = (*bs->cur++) + f->lb;
 		break;
 	case SEMI:
 		BYTE_ALIGN(bs);
-		if (nf_h323_error_boundary(bs, 2))
+		if (nf_h323_error_boundary(bs, 2, 0))
 			return H323_ERROR_BOUND;
 		len = get_len(bs) + f->lb;
 		break;
 	default:		/* 2 <= Range <= 255 */
+		if (nf_h323_error_boundary(bs, 0, f->sz))
+			return H323_ERROR_BOUND;
 		len = get_bits(bs, f->sz) + f->lb;
 		BYTE_ALIGN(bs);
 		break;
@@ -478,7 +488,7 @@ static int decode_octstr(struct bitstr *bs, const struct field_t *f,
 
 	PRINT("\n");
 
-	if (nf_h323_error_boundary(bs, 0))
+	if (nf_h323_error_boundary(bs, 0, 0))
 		return H323_ERROR_BOUND;
 	return H323_ERROR_NONE;
 }
@@ -494,11 +504,13 @@ static int decode_bmpstr(struct bitstr *bs, const struct field_t *f,
 	switch (f->sz) {
 	case BYTE:		/* Range == 256 */
 		BYTE_ALIGN(bs);
-		if (nf_h323_error_boundary(bs, 1))
+		if (nf_h323_error_boundary(bs, 1, 0))
 			return H323_ERROR_BOUND;
 		len = (*bs->cur++) + f->lb;
 		break;
 	default:		/* 2 <= Range <= 255 */
+		if (nf_h323_error_boundary(bs, 0, f->sz))
+			return H323_ERROR_BOUND;
 		len = get_bits(bs, f->sz) + f->lb;
 		BYTE_ALIGN(bs);
 		break;
@@ -506,7 +518,7 @@ static int decode_bmpstr(struct bitstr *bs, const struct field_t *f,
 
 	bs->cur += len << 1;
 
-	if (nf_h323_error_boundary(bs, 0))
+	if (nf_h323_error_boundary(bs, 0, 0))
 		return H323_ERROR_BOUND;
 	return H323_ERROR_NONE;
 }
@@ -526,9 +538,13 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f,
 	base = (base && (f->attr & DECODE)) ? base + f->offset : NULL;
 
 	/* Extensible? */
+	if (nf_h323_error_boundary(bs, 0, 1))
+		return H323_ERROR_BOUND;
 	ext = (f->attr & EXT) ? get_bit(bs) : 0;
 
 	/* Get fields bitmap */
+	if (nf_h323_error_boundary(bs, 0, f->sz))
+		return H323_ERROR_BOUND;
 	bmp = get_bitmap(bs, f->sz);
 	if (base)
 		*(unsigned int *)base = bmp;
@@ -548,10 +564,10 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f,
 
 		/* Decode */
 		if (son->attr & OPEN) {	/* Open field */
-			if (nf_h323_error_boundary(bs, 2))
+			if (nf_h323_error_boundary(bs, 2, 0))
 				return H323_ERROR_BOUND;
 			len = get_len(bs);
-			if (nf_h323_error_boundary(bs, len))
+			if (nf_h323_error_boundary(bs, len, 0))
 				return H323_ERROR_BOUND;
 			if (!base || !(son->attr & DECODE)) {
 				PRINT("%*.s%s\n", (level + 1) * TAB_SIZE,
@@ -580,8 +596,10 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f,
 		return H323_ERROR_NONE;
 
 	/* Get the extension bitmap */
+	if (nf_h323_error_boundary(bs, 0, 7))
+		return H323_ERROR_BOUND;
 	bmp2_len = get_bits(bs, 7) + 1;
-	if (nf_h323_error_boundary(bs, (bmp2_len + 7) >> 3))
+	if (nf_h323_error_boundary(bs, 0, bmp2_len))
 		return H323_ERROR_BOUND;
 	bmp2 = get_bitmap(bs, bmp2_len);
 	bmp |= bmp2 >> f->sz;
@@ -593,10 +611,10 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f,
 	for (opt = 0; opt < bmp2_len; opt++, i++, son++) {
 		/* Check Range */
 		if (i >= f->ub) {	/* Newer Version? */
-			if (nf_h323_error_boundary(bs, 2))
+			if (nf_h323_error_boundary(bs, 2, 0))
 				return H323_ERROR_BOUND;
 			len = get_len(bs);
-			if (nf_h323_error_boundary(bs, len))
+			if (nf_h323_error_boundary(bs, len, 0))
 				return H323_ERROR_BOUND;
 			bs->cur += len;
 			continue;
@@ -611,10 +629,10 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f,
 		if (!((0x80000000 >> opt) & bmp2))	/* Not present */
 			continue;
 
-		if (nf_h323_error_boundary(bs, 2))
+		if (nf_h323_error_boundary(bs, 2, 0))
 			return H323_ERROR_BOUND;
 		len = get_len(bs);
-		if (nf_h323_error_boundary(bs, len))
+		if (nf_h323_error_boundary(bs, len, 0))
 			return H323_ERROR_BOUND;
 		if (!base || !(son->attr & DECODE)) {
 			PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ",
@@ -653,13 +671,13 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f,
 	switch (f->sz) {
 	case BYTE:
 		BYTE_ALIGN(bs);
-		if (nf_h323_error_boundary(bs, 1))
+		if (nf_h323_error_boundary(bs, 1, 0))
 			return H323_ERROR_BOUND;
 		count = *bs->cur++;
 		break;
 	case WORD:
 		BYTE_ALIGN(bs);
-		if (nf_h323_error_boundary(bs, 2))
+		if (nf_h323_error_boundary(bs, 2, 0))
 			return H323_ERROR_BOUND;
 		count = *bs->cur++;
 		count <<= 8;
@@ -667,11 +685,13 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f,
 		break;
 	case SEMI:
 		BYTE_ALIGN(bs);
-		if (nf_h323_error_boundary(bs, 2))
+		if (nf_h323_error_boundary(bs, 2, 0))
 			return H323_ERROR_BOUND;
 		count = get_len(bs);
 		break;
 	default:
+		if (nf_h323_error_boundary(bs, 0, f->sz))
+			return H323_ERROR_BOUND;
 		count = get_bits(bs, f->sz);
 		break;
 	}
@@ -691,8 +711,10 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f,
 	for (i = 0; i < count; i++) {
 		if (son->attr & OPEN) {
 			BYTE_ALIGN(bs);
+			if (nf_h323_error_boundary(bs, 2, 0))
+				return H323_ERROR_BOUND;
 			len = get_len(bs);
-			if (nf_h323_error_boundary(bs, len))
+			if (nf_h323_error_boundary(bs, len, 0))
 				return H323_ERROR_BOUND;
 			if (!base || !(son->attr & DECODE)) {
 				PRINT("%*.s%s\n", (level + 1) * TAB_SIZE,
@@ -744,11 +766,17 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f,
 	base = (base && (f->attr & DECODE)) ? base + f->offset : NULL;
 
 	/* Decode the choice index number */
+	if (nf_h323_error_boundary(bs, 0, 1))
+		return H323_ERROR_BOUND;
 	if ((f->attr & EXT) && get_bit(bs)) {
 		ext = 1;
+		if (nf_h323_error_boundary(bs, 0, 7))
+			return H323_ERROR_BOUND;
 		type = get_bits(bs, 7) + f->lb;
 	} else {
 		ext = 0;
+		if (nf_h323_error_boundary(bs, 0, f->sz))
+			return H323_ERROR_BOUND;
 		type = get_bits(bs, f->sz);
 		if (type >= f->lb)
 			return H323_ERROR_RANGE;
@@ -761,8 +789,10 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f,
 	/* Check Range */
 	if (type >= f->ub) {	/* Newer version? */
 		BYTE_ALIGN(bs);
+		if (nf_h323_error_boundary(bs, 2, 0))
+			return H323_ERROR_BOUND;
 		len = get_len(bs);
-		if (nf_h323_error_boundary(bs, len))
+		if (nf_h323_error_boundary(bs, len, 0))
 			return H323_ERROR_BOUND;
 		bs->cur += len;
 		return H323_ERROR_NONE;
@@ -777,8 +807,10 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f,
 
 	if (ext || (son->attr & OPEN)) {
 		BYTE_ALIGN(bs);
+		if (nf_h323_error_boundary(bs, len, 0))
+			return H323_ERROR_BOUND;
 		len = get_len(bs);
-		if (nf_h323_error_boundary(bs, len))
+		if (nf_h323_error_boundary(bs, len, 0))
 			return H323_ERROR_BOUND;
 		if (!base || !(son->attr & DECODE)) {
 			PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ",

From fbcd253d2448b8f168241e38f629a36c4c8c1e94 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sun, 19 Nov 2017 21:27:28 +0100
Subject: [PATCH 05/76] netfilter: conntrack: lower timeout to RETRANS seconds
 if window is 0

When zero window is announced we can get into a situation where
connection stays around forever:

1. One side announces zero window.
2. Other side closes.

In this case, no FIN is sent (stuck in send queue).

Unless other side opens the window up again conntrack
stays in ESTABLISHED state for a very long time.

Lets alleviate this by lowering the timeout to RETRANS (5 minutes),
the other end should be sending zero window probes to keep the
connection established as long as a socket still exists.

Cc: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_proto_tcp.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index b12fc07111d0..37ef35b861f2 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1039,6 +1039,9 @@ static int tcp_packet(struct nf_conn *ct,
 		 IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED &&
 		 timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK])
 		timeout = timeouts[TCP_CONNTRACK_UNACK];
+	else if (ct->proto.tcp.last_win == 0 &&
+		 timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
+		timeout = timeouts[TCP_CONNTRACK_RETRANS];
 	else
 		timeout = timeouts[new_state];
 	spin_unlock_bh(&ct->lock);

From 8b1836c4b64386e9bc580438cae386ed31a43ab9 Mon Sep 17 00:00:00 2001
From: Jay Elliott <jelliott@arista.com>
Date: Wed, 15 Nov 2017 15:01:13 -0800
Subject: [PATCH 06/76] netfilter: conntrack: clamp timeouts to INT_MAX

When the conntracking code multiplies a timeout by HZ, it can overflow
from positive to negative; this causes it to instantly expire.  To
protect against this the multiplication is done in 64-bit so we can
prevent it from exceeding INT_MAX.

Signed-off-by: Jay Elliott <jelliott@arista.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_netlink.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 59c08997bfdf..66d72a8fa87f 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1566,9 +1566,11 @@ static int ctnetlink_change_helper(struct nf_conn *ct,
 static int ctnetlink_change_timeout(struct nf_conn *ct,
 				    const struct nlattr * const cda[])
 {
-	u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT]));
+	u64 timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
 
-	ct->timeout = nfct_time_stamp + timeout * HZ;
+	if (timeout > INT_MAX)
+		timeout = INT_MAX;
+	ct->timeout = nfct_time_stamp + (u32)timeout;
 
 	if (test_bit(IPS_DYING_BIT, &ct->status))
 		return -ETIME;
@@ -1768,6 +1770,7 @@ ctnetlink_create_conntrack(struct net *net,
 	int err = -EINVAL;
 	struct nf_conntrack_helper *helper;
 	struct nf_conn_tstamp *tstamp;
+	u64 timeout;
 
 	ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC);
 	if (IS_ERR(ct))
@@ -1776,7 +1779,10 @@ ctnetlink_create_conntrack(struct net *net,
 	if (!cda[CTA_TIMEOUT])
 		goto err1;
 
-	ct->timeout = nfct_time_stamp + ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
+	timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
+	if (timeout > INT_MAX)
+		timeout = INT_MAX;
+	ct->timeout = (u32)timeout + nfct_time_stamp;
 
 	rcu_read_lock();
  	if (cda[CTA_HELP]) {

From fe77d8257c4d838c5976557ddb87bd789f312412 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@open-mesh.com>
Date: Wed, 29 Nov 2017 10:25:02 +0100
Subject: [PATCH 07/76] batman-adv: Always initialize fragment header priority

The batman-adv unuicast fragment header contains 3 bits for the priority of
the packet. These bits will be initialized when the skb->priority contains
a value between 256 and 263. But otherwise, the uninitialized bits from the
stack will be used.

Fixes: c0f25c802b33 ("batman-adv: Include frame priority in fragment header")
Signed-off-by: Sven Eckelmann <sven.eckelmann@open-mesh.com>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/fragmentation.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index a98cf1104a30..ebe6e38934e4 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -499,6 +499,8 @@ int batadv_frag_send_packet(struct sk_buff *skb,
 	 */
 	if (skb->priority >= 256 && skb->priority <= 263)
 		frag_header.priority = skb->priority - 256;
+	else
+		frag_header.priority = 0;
 
 	ether_addr_copy(frag_header.orig, primary_if->net_dev->dev_addr);
 	ether_addr_copy(frag_header.dest, orig_node->orig);

From 198a62ddffa4a4ffaeb741f642b7b52f2d91ae9b Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 29 Nov 2017 10:50:42 +0100
Subject: [PATCH 08/76] batman-adv: Fix check of retrieved orig_gw in
 batadv_v_gw_is_eligible

The batadv_v_gw_is_eligible function already assumes that orig_node is not
NULL. But batadv_gw_node_get may have failed to find the originator. It
must therefore be checked whether the batadv_gw_node_get failed and not
whether orig_node is NULL to detect this error.

Fixes: 50164d8f500f ("batman-adv: B.A.T.M.A.N. V - implement GW selection logic")
Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Acked-by: Antonio Quartulli <a@unstable.cc>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/bat_v.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 341ceab8338d..e0e2bfcd6b3e 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -814,7 +814,7 @@ static bool batadv_v_gw_is_eligible(struct batadv_priv *bat_priv,
 	}
 
 	orig_gw = batadv_gw_node_get(bat_priv, orig_node);
-	if (!orig_node)
+	if (!orig_gw)
 		goto out;
 
 	if (batadv_v_gw_throughput_get(orig_gw, &orig_throughput) < 0)

From 974a6b20518602310637bd8ac9ad348bf8a864d6 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 1 Dec 2017 11:47:56 +0100
Subject: [PATCH 09/76] batman-adv: Fix kernel-doc for timer functions

The commit e99e88a9d2b0 ("treewide: setup_timer() -> timer_setup()")
changed the argument name and type of the timer function but didn't adjust
the kernel-doc of these functions.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/tp_meter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index 15cd2139381e..ebc4e2241c77 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -482,7 +482,7 @@ static void batadv_tp_reset_sender_timer(struct batadv_tp_vars *tp_vars)
 
 /**
  * batadv_tp_sender_timeout - timer that fires in case of packet loss
- * @arg: address of the related tp_vars
+ * @t: address to timer_list inside tp_vars
  *
  * If fired it means that there was packet loss.
  * Switch to Slow Start, set the ss_threshold to half of the current cwnd and
@@ -1106,7 +1106,7 @@ static void batadv_tp_reset_receiver_timer(struct batadv_tp_vars *tp_vars)
 /**
  * batadv_tp_receiver_shutdown - stop a tp meter receiver when timeout is
  *  reached without received ack
- * @arg: address of the related tp_vars
+ * @t: address to timer_list inside tp_vars
  */
 static void batadv_tp_receiver_shutdown(struct timer_list *t)
 {

From 4b380c42f7d00a395feede754f0bc2292eebe6e5 Mon Sep 17 00:00:00 2001
From: Kevin Cernekee <cernekee@chromium.org>
Date: Sun, 3 Dec 2017 12:12:45 -0800
Subject: [PATCH 10/76] netfilter: nfnetlink_cthelper: Add missing permission
 checks

The capability check in nfnetlink_rcv() verifies that the caller
has CAP_NET_ADMIN in the namespace that "owns" the netlink socket.
However, nfnl_cthelper_list is shared by all net namespaces on the
system.  An unprivileged user can create user and net namespaces
in which he holds CAP_NET_ADMIN to bypass the netlink_net_capable()
check:

    $ nfct helper list
    nfct v1.4.4: netlink error: Operation not permitted
    $ vpnns -- nfct helper list
    {
            .name = ftp,
            .queuenum = 0,
            .l3protonum = 2,
            .l4protonum = 6,
            .priv_data_len = 24,
            .status = enabled,
    };

Add capable() checks in nfnetlink_cthelper, as this is cleaner than
trying to generalize the solution.

Signed-off-by: Kevin Cernekee <cernekee@chromium.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink_cthelper.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index 41628b393673..d33ce6d5ebce 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -17,6 +17,7 @@
 #include <linux/types.h>
 #include <linux/list.h>
 #include <linux/errno.h>
+#include <linux/capability.h>
 #include <net/netlink.h>
 #include <net/sock.h>
 
@@ -407,6 +408,9 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl,
 	struct nfnl_cthelper *nlcth;
 	int ret = 0;
 
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE])
 		return -EINVAL;
 
@@ -611,6 +615,9 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl,
 	struct nfnl_cthelper *nlcth;
 	bool tuple_set = false;
 
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	if (nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.dump = nfnl_cthelper_dump_table,
@@ -678,6 +685,9 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl,
 	struct nfnl_cthelper *nlcth, *n;
 	int j = 0, ret;
 
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	if (tb[NFCTH_NAME])
 		helper_name = nla_data(tb[NFCTH_NAME]);
 

From 6ab405114b0b229151ef06f4e31c7834dd09d0c0 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Fri, 1 Dec 2017 01:46:07 +0100
Subject: [PATCH 11/76] netfilter: xt_bpf: add overflow checks

Check whether inputs from userspace are too long (explicit length field too
big or string not null-terminated) to avoid out-of-bounds reads.

As far as I can tell, this can at worst lead to very limited kernel heap
memory disclosure or oopses.

This bug can be triggered by an unprivileged user even if the xt_bpf module
is not loaded: iptables is available in network namespaces, and the xt_bpf
module can be autoloaded.

Triggering the bug with a classic BPF filter with fake length 0x1000 causes
the following KASAN report:

==================================================================
BUG: KASAN: slab-out-of-bounds in bpf_prog_create+0x84/0xf0
Read of size 32768 at addr ffff8801eff2c494 by task test/4627

CPU: 0 PID: 4627 Comm: test Not tainted 4.15.0-rc1+ #1
[...]
Call Trace:
 dump_stack+0x5c/0x85
 print_address_description+0x6a/0x260
 kasan_report+0x254/0x370
 ? bpf_prog_create+0x84/0xf0
 memcpy+0x1f/0x50
 bpf_prog_create+0x84/0xf0
 bpf_mt_check+0x90/0xd6 [xt_bpf]
[...]
Allocated by task 4627:
 kasan_kmalloc+0xa0/0xd0
 __kmalloc_node+0x47/0x60
 xt_alloc_table_info+0x41/0x70 [x_tables]
[...]
The buggy address belongs to the object at ffff8801eff2c3c0
                which belongs to the cache kmalloc-2048 of size 2048
The buggy address is located 212 bytes inside of
                2048-byte region [ffff8801eff2c3c0, ffff8801eff2cbc0)
[...]
==================================================================

Fixes: e6f30c731718 ("netfilter: x_tables: add xt_bpf match")
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_bpf.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index 041da0d9c06f..1f7fbd3c7e5a 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -27,6 +27,9 @@ static int __bpf_mt_check_bytecode(struct sock_filter *insns, __u16 len,
 {
 	struct sock_fprog_kern program;
 
+	if (len > XT_BPF_MAX_NUM_INSTR)
+		return -EINVAL;
+
 	program.len = len;
 	program.filter = insns;
 
@@ -55,6 +58,9 @@ static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret)
 	mm_segment_t oldfs = get_fs();
 	int retval, fd;
 
+	if (strnlen(path, XT_BPF_PATH_MAX) == XT_BPF_PATH_MAX)
+		return -EINVAL;
+
 	set_fs(KERNEL_DS);
 	fd = bpf_obj_get_user(path, 0);
 	set_fs(oldfs);

From 5ba7dcfe77037b67016263ea597a8b431692ecab Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 3 Dec 2017 11:26:45 +0100
Subject: [PATCH 12/76] batman-adv: Fix lock for ogm cnt access in
 batadv_iv_ogm_calc_tq

The originator node object orig_neigh_node is used to when accessing the
bcast_own(_sum) and real_packet_count information. The access to them has
to be protected with the spinlock in orig_neigh_node.

But the function uses the lock in orig_node instead. This is incorrect
because they could be two different originator node objects.

Fixes: 0ede9f41b217 ("batman-adv: protect bit operations to count OGMs with spinlock")
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/bat_iv_ogm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 1b659ab652fb..bbe8414b6ee7 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -1214,7 +1214,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
 	orig_node->last_seen = jiffies;
 
 	/* find packet count of corresponding one hop neighbor */
-	spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+	spin_lock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock);
 	if_num = if_incoming->if_num;
 	orig_eq_count = orig_neigh_node->bat_iv.bcast_own_sum[if_num];
 	neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing);
@@ -1224,7 +1224,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
 	} else {
 		neigh_rq_count = 0;
 	}
-	spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+	spin_unlock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock);
 
 	/* pay attention to not get a value bigger than 100 % */
 	if (orig_eq_count > neigh_rq_count)

From 71334963d01ed7ec61a958a5a6585172793f5a24 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 5 Dec 2017 11:27:59 +0100
Subject: [PATCH 13/76] wireless: replace usage of hexdump with od/sed

Since od/sed are in posix, hopefully there's a better chance
people will have them, over hexdump.

Fixes: 90a53e4432b1 ("cfg80211: implement regdb signature checking")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/wireless/Makefile b/net/wireless/Makefile
index 278d979c211a..63cbb6432b2d 100644
--- a/net/wireless/Makefile
+++ b/net/wireless/Makefile
@@ -27,7 +27,7 @@ $(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.x509)
 	@$(kecho) "  GEN     $@"
 	@echo '#include "reg.h"' > $@
 	@echo 'const u8 shipped_regdb_certs[] = {' >> $@
-	@for f in $^ ; do hexdump -v -e '1/1 "0x%.2x," "\n"' < $$f >> $@ ; done
+	@for f in $^ ; do od -An -v -tx1 < $$f | sed -e 's/ /\n/g' | sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | sed -e 's/^/0x/;s/$$/,/' >> $@ ; done
 	@echo '};' >> $@
 	@echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);' >> $@
 
@@ -36,6 +36,6 @@ $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \
 	@$(kecho) "  GEN     $@"
 	@echo '#include "reg.h"' > $@
 	@echo 'const u8 extra_regdb_certs[] = {' >> $@
-	@for f in $^ ; do test -f $$f && hexdump -v -e '1/1 "0x%.2x," "\n"' < $$f >> $@ || true ; done
+	@for f in $^ ; do test -f $$f && od -An -v -tx1 < $$f | sed -e 's/ /\n/g' | sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | sed -e 's/^/0x/;s/$$/,/' >> $@ ; done
 	@echo '};' >> $@
 	@echo 'unsigned int extra_regdb_certs_len = sizeof(extra_regdb_certs);' >> $@

From 715a12334764657bafb3ab964fb25f4e6115c770 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 5 Dec 2017 11:59:33 +0100
Subject: [PATCH 14/76] wireless: don't write C files on failures

Change the scripting inside the shipped/extra certs C code
generation to not write the file when there are any failures.
That way, if the build aborts due to failures, we don't get
into a situation where a dummy file has been created and the
next build succeeds, but not with the desired output.

Fixes: 90a53e4432b1 ("cfg80211: implement regdb signature checking")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/Makefile | 48 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 38 insertions(+), 10 deletions(-)

diff --git a/net/wireless/Makefile b/net/wireless/Makefile
index 63cbb6432b2d..d7d6cb00c47b 100644
--- a/net/wireless/Makefile
+++ b/net/wireless/Makefile
@@ -25,17 +25,45 @@ endif
 
 $(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.x509)
 	@$(kecho) "  GEN     $@"
-	@echo '#include "reg.h"' > $@
-	@echo 'const u8 shipped_regdb_certs[] = {' >> $@
-	@for f in $^ ; do od -An -v -tx1 < $$f | sed -e 's/ /\n/g' | sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | sed -e 's/^/0x/;s/$$/,/' >> $@ ; done
-	@echo '};' >> $@
-	@echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);' >> $@
+	@(set -e; \
+	  allf=""; \
+	  for f in $^ ; do \
+	      # similar to hexdump -v -e '1/1 "0x%.2x," "\n"' \
+	      thisf=$$(od -An -v -tx1 < $$f | \
+	                   sed -e 's/ /\n/g' | \
+	                   sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | \
+	                   sed -e 's/^/0x/;s/$$/,/'); \
+	      # file should not be empty - maybe command substitution failed? \
+	      test ! -z "$$thisf";\
+	      allf=$$allf$$thisf;\
+	  done; \
+	  ( \
+	      echo '#include "reg.h"'; \
+	      echo 'const u8 shipped_regdb_certs[] = {'; \
+	      echo "$$allf"; \
+	      echo '};'; \
+	      echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);'; \
+	  ) >> $@)
 
 $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \
 		      $(wildcard $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%)/*.x509)
 	@$(kecho) "  GEN     $@"
-	@echo '#include "reg.h"' > $@
-	@echo 'const u8 extra_regdb_certs[] = {' >> $@
-	@for f in $^ ; do test -f $$f && od -An -v -tx1 < $$f | sed -e 's/ /\n/g' | sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | sed -e 's/^/0x/;s/$$/,/' >> $@ ; done
-	@echo '};' >> $@
-	@echo 'unsigned int extra_regdb_certs_len = sizeof(extra_regdb_certs);' >> $@
+	@(set -e; \
+	  allf=""; \
+	  for f in $^ ; do \
+	      # similar to hexdump -v -e '1/1 "0x%.2x," "\n"' \
+	      thisf=$$(od -An -v -tx1 < $$f | \
+	                   sed -e 's/ /\n/g' | \
+	                   sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | \
+	                   sed -e 's/^/0x/;s/$$/,/'); \
+	      # file should not be empty - maybe command substitution failed? \
+	      test ! -z "$$thisf";\
+	      allf=$$allf$$thisf;\
+	  done; \
+	  ( \
+	      echo '#include "reg.h"'; \
+	      echo 'const u8 extra_regdb_certs[] = {'; \
+	      echo "$$allf"; \
+	      echo '};'; \
+	      echo 'unsigned int extra_regdb_certs_len = sizeof(extra_regdb_certs);'; \
+	  ) >> $@)

From 916a27901de01446bcf57ecca4783f6cff493309 Mon Sep 17 00:00:00 2001
From: Kevin Cernekee <cernekee@chromium.org>
Date: Tue, 5 Dec 2017 15:42:41 -0800
Subject: [PATCH 15/76] netfilter: xt_osf: Add missing permission checks

The capability check in nfnetlink_rcv() verifies that the caller
has CAP_NET_ADMIN in the namespace that "owns" the netlink socket.
However, xt_osf_fingers is shared by all net namespaces on the
system.  An unprivileged user can create user and net namespaces
in which he holds CAP_NET_ADMIN to bypass the netlink_net_capable()
check:

    vpnns -- nfnl_osf -f /tmp/pf.os

    vpnns -- nfnl_osf -f /tmp/pf.os -d

These non-root operations successfully modify the systemwide OS
fingerprint list.  Add new capable() checks so that they can't.

Signed-off-by: Kevin Cernekee <cernekee@chromium.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_osf.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index 36e14b1f061d..a34f314a8c23 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -19,6 +19,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 
+#include <linux/capability.h>
 #include <linux/if.h>
 #include <linux/inetdevice.h>
 #include <linux/ip.h>
@@ -70,6 +71,9 @@ static int xt_osf_add_callback(struct net *net, struct sock *ctnl,
 	struct xt_osf_finger *kf = NULL, *sf;
 	int err = 0;
 
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	if (!osf_attrs[OSF_ATTR_FINGER])
 		return -EINVAL;
 
@@ -115,6 +119,9 @@ static int xt_osf_remove_callback(struct net *net, struct sock *ctnl,
 	struct xt_osf_finger *sf;
 	int err = -ENOENT;
 
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	if (!osf_attrs[OSF_ATTR_FINGER])
 		return -EINVAL;
 

From 96307a0a75d8f1847debefd6a402339aac43e224 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 7 Dec 2017 14:26:09 +0100
Subject: [PATCH 16/76] netfilter: ipt_CLUSTERIP: fix clusterip_net_exit build
 regression

The added check produces a build error when CONFIG_PROC_FS is
disabled:

net/ipv4/netfilter/ipt_CLUSTERIP.c: In function 'clusterip_net_exit':
net/ipv4/netfilter/ipt_CLUSTERIP.c:822:28: error: 'cn' undeclared (first use in this function)

This moves the variable declaration out of the #ifdef to make it
available to the WARN_ON_ONCE().

Fixes: 613d0776d3fe ("netfilter: exit_net cleanup check added")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Vasily Averin <vvs@virtuozzo.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index e35b8d074f06..69060e3abe85 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -813,8 +813,8 @@ static int clusterip_net_init(struct net *net)
 
 static void clusterip_net_exit(struct net *net)
 {
-#ifdef CONFIG_PROC_FS
 	struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+#ifdef CONFIG_PROC_FS
 	proc_remove(cn->procdir);
 	cn->procdir = NULL;
 #endif

From 0afe9d4ab9d40c281bdcdd118661fe8e4bdcef18 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Sat, 9 Dec 2017 21:10:10 +0100
Subject: [PATCH 17/76] mac80211: fix locking in
 ieee80211_sta_tear_down_BA_sessions

Due to overlap between
commit 1281103770e9 ("mac80211: Simplify locking in ieee80211_sta_tear_down_BA_sessions()")
and the way that Luca modified
commit 72e2c3438ba3 ("mac80211: tear down RX aggregations first")
when sending it upstream from Intel's internal tree, we get
the following warning:

WARNING: CPU: 0 PID: 5472 at net/mac80211/agg-tx.c:315 ___ieee80211_stop_tx_ba_session+0x158/0x1f0

since there's no appropriate locking around the call to
___ieee80211_stop_tx_ba_session; Sara's original just had
a call to the locked __ieee80211_stop_tx_ba_session (one
less underscore) but it looks like Luca modified both of
the calls when fixing it up for upstream, leading to the
problem at hand.

Move the locking appropriately to fix this problem.

Reported-by: Kalle Valo <kvalo@codeaurora.org>
Reported-by: Pavel Machek <pavel@ucw.cz>
Tested-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/ht.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 167f83b853e6..1621b6ab17ba 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -291,16 +291,15 @@ void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta,
 	int i;
 
 	mutex_lock(&sta->ampdu_mlme.mtx);
-	for (i = 0; i <  IEEE80211_NUM_TIDS; i++) {
+	for (i = 0; i <  IEEE80211_NUM_TIDS; i++)
 		___ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT,
 						WLAN_REASON_QSTA_LEAVE_QBSS,
 						reason != AGG_STOP_DESTROY_STA &&
 						reason != AGG_STOP_PEER_REQUEST);
-	}
-	mutex_unlock(&sta->ampdu_mlme.mtx);
 
 	for (i = 0; i <  IEEE80211_NUM_TIDS; i++)
 		___ieee80211_stop_tx_ba_session(sta, i, reason);
+	mutex_unlock(&sta->ampdu_mlme.mtx);
 
 	/* stopping might queue the work again - so cancel only afterwards */
 	cancel_work_sync(&sta->ampdu_mlme.work);

From f5b5702ac55b11113a94d6228d191c7f827b7a3b Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 11 Dec 2017 10:14:27 +0100
Subject: [PATCH 18/76] netfilter: exthdr: add missign attributes to policy

Add missing netlink attribute policy.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_exthdr.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index a0a93d987a3b..47ec1046ad11 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -214,6 +214,8 @@ static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = {
 	[NFTA_EXTHDR_OFFSET]		= { .type = NLA_U32 },
 	[NFTA_EXTHDR_LEN]		= { .type = NLA_U32 },
 	[NFTA_EXTHDR_FLAGS]		= { .type = NLA_U32 },
+	[NFTA_EXTHDR_OP]		= { .type = NLA_U32 },
+	[NFTA_EXTHDR_SREG]		= { .type = NLA_U32 },
 };
 
 static int nft_exthdr_init(const struct nft_ctx *ctx,

From a8ceb5dbfde1092b466936bca0ff3be127ecf38e Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Tue, 5 Dec 2017 21:29:37 +0200
Subject: [PATCH 19/76] ptr_ring: add barriers

Users of ptr_ring expect that it's safe to give the
data structure a pointer and have it be available
to consumers, but that actually requires an smb_wmb
or a stronger barrier.

In absence of such barriers and on architectures that reorder writes,
consumer might read an un=initialized value from an skb pointer stored
in the skb array.  This was observed causing crashes.

To fix, add memory barriers.  The barrier we use is a wmb, the
assumption being that producers do not need to read the value so we do
not need to order these reads.

Reported-by: George Cherian <george.cherian@cavium.com>
Suggested-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ptr_ring.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 37b4bb2545b3..6866df4f31b5 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -101,12 +101,18 @@ static inline bool ptr_ring_full_bh(struct ptr_ring *r)
 
 /* Note: callers invoking this in a loop must use a compiler barrier,
  * for example cpu_relax(). Callers must hold producer_lock.
+ * Callers are responsible for making sure pointer that is being queued
+ * points to a valid data.
  */
 static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr)
 {
 	if (unlikely(!r->size) || r->queue[r->producer])
 		return -ENOSPC;
 
+	/* Make sure the pointer we are storing points to a valid data. */
+	/* Pairs with smp_read_barrier_depends in __ptr_ring_consume. */
+	smp_wmb();
+
 	r->queue[r->producer++] = ptr;
 	if (unlikely(r->producer >= r->size))
 		r->producer = 0;
@@ -275,6 +281,9 @@ static inline void *__ptr_ring_consume(struct ptr_ring *r)
 	if (ptr)
 		__ptr_ring_discard_one(r);
 
+	/* Make sure anyone accessing data through the pointer is up to date. */
+	/* Pairs with smp_wmb in __ptr_ring_produce. */
+	smp_read_barrier_depends();
 	return ptr;
 }
 

From 23715275e4fb6f64358a499d20928a9e93819f2f Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Mon, 11 Dec 2017 18:19:33 +0300
Subject: [PATCH 20/76] netfilter: ip6t_MASQUERADE: add dependency on conntrack
 module

After commit 4d3a57f23dec ("netfilter: conntrack: do not enable connection
tracking unless needed") conntrack is disabled by default unless some
module explicitly declares dependency in particular network namespace.

Fixes: a357b3f80bc8 ("netfilter: nat: add dependencies on conntrack module")
Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/ip6t_MASQUERADE.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c
index 2b1a15846f9a..92c0047e7e33 100644
--- a/net/ipv6/netfilter/ip6t_MASQUERADE.c
+++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c
@@ -33,13 +33,19 @@ static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par)
 
 	if (range->flags & NF_NAT_RANGE_MAP_IPS)
 		return -EINVAL;
-	return 0;
+	return nf_ct_netns_get(par->net, par->family);
+}
+
+static void masquerade_tg6_destroy(const struct xt_tgdtor_param *par)
+{
+	nf_ct_netns_put(par->net, par->family);
 }
 
 static struct xt_target masquerade_tg6_reg __read_mostly = {
 	.name		= "MASQUERADE",
 	.family		= NFPROTO_IPV6,
 	.checkentry	= masquerade_tg6_checkentry,
+	.destroy	= masquerade_tg6_destroy,
 	.target		= masquerade_tg6,
 	.targetsize	= sizeof(struct nf_nat_range),
 	.table		= "nat",

From f1e2400a80ff55eb7c5f4fd9d7eb163fd0de9a2c Mon Sep 17 00:00:00 2001
From: Jerome Brunet <jbrunet@baylibre.com>
Date: Fri, 8 Dec 2017 12:08:11 +0100
Subject: [PATCH 21/76] net: phy: meson-gxl: detect LPA corruption

The purpose of this change is to fix the incorrect detection of the link
partner (LP) advertised capabilities which sometimes happens with this PHY
(roughly 1 time in a dozen)

This issue may cause the link to be negotiated at 10Mbps/Full or
10Mbps/Half when 100MBps/Full is actually possible. In some case, the link
is even completely broken and no communication is possible.

To detect the corruption, we must look for a magic undocumented bit in the
WOL bank (hint given by the SoC vendor kernel) but this is not enough to
cover all cases. We also have to look at the LPA ack. If the LP supports
Aneg but did not ack our base code when aneg is completed, we assume
something went wrong.

The detection of a corrupted LPA triggers a restart of the aneg process.
This solves the problem but may take up to 6 retries to complete.

Fixes: 7334b3e47aee ("net: phy: Add Meson GXL Internal PHY driver")
Signed-off-by: Jerome Brunet <jbrunet@baylibre.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/meson-gxl.c | 74 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 73 insertions(+), 1 deletion(-)

diff --git a/drivers/net/phy/meson-gxl.c b/drivers/net/phy/meson-gxl.c
index 1ea69b7585d9..700007dd4be5 100644
--- a/drivers/net/phy/meson-gxl.c
+++ b/drivers/net/phy/meson-gxl.c
@@ -22,6 +22,7 @@
 #include <linux/ethtool.h>
 #include <linux/phy.h>
 #include <linux/netdevice.h>
+#include <linux/bitfield.h>
 
 static int meson_gxl_config_init(struct phy_device *phydev)
 {
@@ -50,6 +51,77 @@ static int meson_gxl_config_init(struct phy_device *phydev)
 	return 0;
 }
 
+/* This function is provided to cope with the possible failures of this phy
+ * during aneg process. When aneg fails, the PHY reports that aneg is done
+ * but the value found in MII_LPA is wrong:
+ *  - Early failures: MII_LPA is just 0x0001. if MII_EXPANSION reports that
+ *    the link partner (LP) supports aneg but the LP never acked our base
+ *    code word, it is likely that we never sent it to begin with.
+ *  - Late failures: MII_LPA is filled with a value which seems to make sense
+ *    but it actually is not what the LP is advertising. It seems that we
+ *    can detect this using a magic bit in the WOL bank (reg 12 - bit 12).
+ *    If this particular bit is not set when aneg is reported being done,
+ *    it means MII_LPA is likely to be wrong.
+ *
+ * In both case, forcing a restart of the aneg process solve the problem.
+ * When this failure happens, the first retry is usually successful but,
+ * in some cases, it may take up to 6 retries to get a decent result
+ */
+int meson_gxl_read_status(struct phy_device *phydev)
+{
+	int ret, wol, lpa, exp;
+
+	if (phydev->autoneg == AUTONEG_ENABLE) {
+		ret = genphy_aneg_done(phydev);
+		if (ret < 0)
+			return ret;
+		else if (!ret)
+			goto read_status_continue;
+
+		/* Need to access WOL bank, make sure the access is open */
+		ret = phy_write(phydev, 0x14, 0x0000);
+		if (ret)
+			return ret;
+		ret = phy_write(phydev, 0x14, 0x0400);
+		if (ret)
+			return ret;
+		ret = phy_write(phydev, 0x14, 0x0000);
+		if (ret)
+			return ret;
+		ret = phy_write(phydev, 0x14, 0x0400);
+		if (ret)
+			return ret;
+
+		/* Request LPI_STATUS WOL register */
+		ret = phy_write(phydev, 0x14, 0x8D80);
+		if (ret)
+			return ret;
+
+		/* Read LPI_STATUS value */
+		wol = phy_read(phydev, 0x15);
+		if (wol < 0)
+			return wol;
+
+		lpa = phy_read(phydev, MII_LPA);
+		if (lpa < 0)
+			return lpa;
+
+		exp = phy_read(phydev, MII_EXPANSION);
+		if (exp < 0)
+			return exp;
+
+		if (!(wol & BIT(12)) ||
+		    ((exp & EXPANSION_NWAY) && !(lpa & LPA_LPACK))) {
+			/* Looks like aneg failed after all */
+			phydev_dbg(phydev, "LPA corruption - aneg restart\n");
+			return genphy_restart_aneg(phydev);
+		}
+	}
+
+read_status_continue:
+	return genphy_read_status(phydev);
+}
+
 static struct phy_driver meson_gxl_phy[] = {
 	{
 		.phy_id		= 0x01814400,
@@ -60,7 +132,7 @@ static struct phy_driver meson_gxl_phy[] = {
 		.config_init	= meson_gxl_config_init,
 		.config_aneg	= genphy_config_aneg,
 		.aneg_done      = genphy_aneg_done,
-		.read_status	= genphy_read_status,
+		.read_status	= meson_gxl_read_status,
 		.suspend        = genphy_suspend,
 		.resume         = genphy_resume,
 	},

From 2aab6b40b03154a263463a5d992ddd7d122a016a Mon Sep 17 00:00:00 2001
From: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Date: Fri, 8 Dec 2017 16:35:40 +0100
Subject: [PATCH 22/76] net: sh_eth: do not advertise Gigabit capabilities when
 not available

Not all variants of the sh_eth hardware have Gigabit
support. Unfortunately, the current driver doesn't tell the PHY about
the limited MAC capabilities. Due to this, if you have a Gigabit
capable PHY, the PHY will advertise its Gigabit capability and
establish a link at 1Gbit/s, even though the MAC doesn't support it.

In order to avoid this, we use the recently introduced
phy_set_max_speed() to tell the PHY to not advertise speed higher than
100 MBit/s.

Tested on a SH7786 platform, with a Gigabit PHY.

Signed-off-by: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/renesas/sh_eth.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index db72d13cebb9..75323000c364 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -1892,6 +1892,16 @@ static int sh_eth_phy_init(struct net_device *ndev)
 		return PTR_ERR(phydev);
 	}
 
+	/* mask with MAC supported features */
+	if (mdp->cd->register_type != SH_ETH_REG_GIGABIT) {
+		int err = phy_set_max_speed(phydev, SPEED_100);
+		if (err) {
+			netdev_err(ndev, "failed to limit PHY to 100 Mbit/s\n");
+			phy_disconnect(phydev);
+			return err;
+		}
+	}
+
 	phy_attached_info(phydev);
 
 	return 0;

From 93c647643b48f0131f02e45da3bd367d80443291 Mon Sep 17 00:00:00 2001
From: Kevin Cernekee <cernekee@chromium.org>
Date: Wed, 6 Dec 2017 12:12:27 -0800
Subject: [PATCH 23/76] netlink: Add netns check on taps

Currently, a nlmon link inside a child namespace can observe systemwide
netlink activity.  Filter the traffic so that nlmon can only sniff
netlink messages from its own netns.

Test case:

    vpnns -- bash -c "ip link add nlmon0 type nlmon; \
                      ip link set nlmon0 up; \
                      tcpdump -i nlmon0 -q -w /tmp/nlmon.pcap -U" &
    sudo ip xfrm state add src 10.1.1.1 dst 10.1.1.2 proto esp \
        spi 0x1 mode transport \
        auth sha1 0x6162633132330000000000000000000000000000 \
        enc aes 0x00000000000000000000000000000000
    grep --binary abc123 /tmp/nlmon.pcap

Signed-off-by: Kevin Cernekee <cernekee@chromium.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/af_netlink.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index b9e0ee4e22f5..79cc1bf36e4a 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -253,6 +253,9 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
 	struct sock *sk = skb->sk;
 	int ret = -ENOMEM;
 
+	if (!net_eq(dev_net(dev), sock_net(sk)))
+		return 0;
+
 	dev_hold(dev);
 
 	if (is_vmalloc_addr(skb->head))

From 8f659a03a0ba9289b9aeb9b4470e6fb263d6f483 Mon Sep 17 00:00:00 2001
From: Mohamed Ghannam <simo.ghannam@gmail.com>
Date: Sun, 10 Dec 2017 03:50:58 +0000
Subject: [PATCH 24/76] net: ipv4: fix for a race condition in raw_sendmsg

inet->hdrincl is racy, and could lead to uninitialized stack pointer
usage, so its value should be read only once.

Fixes: c008ba5bdc9f ("ipv4: Avoid reading user iov twice after raw_probe_proto_opt")
Signed-off-by: Mohamed Ghannam <simo.ghannam@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/raw.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 33b70bfd1122..125c1eab3eaa 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -513,11 +513,16 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	int err;
 	struct ip_options_data opt_copy;
 	struct raw_frag_vec rfv;
+	int hdrincl;
 
 	err = -EMSGSIZE;
 	if (len > 0xFFFF)
 		goto out;
 
+	/* hdrincl should be READ_ONCE(inet->hdrincl)
+	 * but READ_ONCE() doesn't work with bit fields
+	 */
+	hdrincl = inet->hdrincl;
 	/*
 	 *	Check the flags.
 	 */
@@ -593,7 +598,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		/* Linux does not mangle headers on raw sockets,
 		 * so that IP options + IP_HDRINCL is non-sense.
 		 */
-		if (inet->hdrincl)
+		if (hdrincl)
 			goto done;
 		if (ipc.opt->opt.srr) {
 			if (!daddr)
@@ -615,12 +620,12 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 	flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
 			   RT_SCOPE_UNIVERSE,
-			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
+			   hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 			   inet_sk_flowi_flags(sk) |
-			    (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
+			    (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
 			   daddr, saddr, 0, 0, sk->sk_uid);
 
-	if (!inet->hdrincl) {
+	if (!hdrincl) {
 		rfv.msg = msg;
 		rfv.hlen = 0;
 
@@ -645,7 +650,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		goto do_confirm;
 back_from_confirm:
 
-	if (inet->hdrincl)
+	if (hdrincl)
 		err = raw_send_hdrinc(sk, &fl4, msg, len,
 				      &rt, msg->msg_flags, &ipc.sockc);
 

From 2342b8d95bcae5946e1b9b8d58645f37500ef2e7 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 10 Dec 2017 15:40:51 +0800
Subject: [PATCH 25/76] sctp: make sure stream nums can match optlen in
 sctp_setsockopt_reset_streams

Now in sctp_setsockopt_reset_streams, it only does the check
optlen < sizeof(*params) for optlen. But it's not enough, as
params->srs_number_streams should also match optlen.

If the streams in params->srs_stream_list are less than stream
nums in params->srs_number_streams, later when dereferencing
the stream list, it could cause a slab-out-of-bounds crash, as
reported by syzbot.

This patch is to fix it by also checking the stream numbers in
sctp_setsockopt_reset_streams to make sure at least it's not
greater than the streams in the list.

Fixes: 7f9d68ac944e ("sctp: implement sender-side procedures for SSN Reset Request Parameter")
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index eb17a911aa29..3253f724a995 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3891,13 +3891,17 @@ static int sctp_setsockopt_reset_streams(struct sock *sk,
 	struct sctp_association *asoc;
 	int retval = -EINVAL;
 
-	if (optlen < sizeof(struct sctp_reset_streams))
+	if (optlen < sizeof(*params))
 		return -EINVAL;
 
 	params = memdup_user(optval, optlen);
 	if (IS_ERR(params))
 		return PTR_ERR(params);
 
+	if (params->srs_number_streams * sizeof(__u16) >
+	    optlen - sizeof(*params))
+		goto out;
+
 	asoc = sctp_id2assoc(sk, params->srs_assoc_id);
 	if (!asoc)
 		goto out;

From 200809716aed1cac586fcac4c0551a688439be1f Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 10 Dec 2017 16:56:00 +0800
Subject: [PATCH 26/76] fou: fix some member types in guehdr

guehdr struct is used to build or parse gue packets, which
are always in big endian. It's better to define all guehdr
members as __beXX types.

Also, in validate_gue_flags it's not good to use a __be32
variable for both Standard flags(__be16) and Private flags
(__be32), and pass it to other funcions.

This patch could fix a bunch of sparse warnings from fou.

Fixes: 5024c33ac354 ("gue: Add infrastructure for flags and options")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/gue.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/net/gue.h b/include/net/gue.h
index 2fdb29ca74c2..fdad41469b65 100644
--- a/include/net/gue.h
+++ b/include/net/gue.h
@@ -44,10 +44,10 @@ struct guehdr {
 #else
 #error  "Please fix <asm/byteorder.h>"
 #endif
-			__u8    proto_ctype;
-			__u16   flags;
+			__u8	proto_ctype;
+			__be16	flags;
 		};
-		__u32 word;
+		__be32	word;
 	};
 };
 
@@ -84,11 +84,10 @@ static inline size_t guehdr_priv_flags_len(__be32 flags)
  * if there is an unknown standard or private flags, or the options length for
  * the flags exceeds the options length specific in hlen of the GUE header.
  */
-static inline int validate_gue_flags(struct guehdr *guehdr,
-				     size_t optlen)
+static inline int validate_gue_flags(struct guehdr *guehdr, size_t optlen)
 {
+	__be16 flags = guehdr->flags;
 	size_t len;
-	__be32 flags = guehdr->flags;
 
 	if (flags & ~GUE_FLAGS_ALL)
 		return 1;
@@ -101,12 +100,13 @@ static inline int validate_gue_flags(struct guehdr *guehdr,
 		/* Private flags are last four bytes accounted in
 		 * guehdr_flags_len
 		 */
-		flags = *(__be32 *)((void *)&guehdr[1] + len - GUE_LEN_PRIV);
+		__be32 pflags = *(__be32 *)((void *)&guehdr[1] +
+					    len - GUE_LEN_PRIV);
 
-		if (flags & ~GUE_PFLAGS_ALL)
+		if (pflags & ~GUE_PFLAGS_ALL)
 			return 1;
 
-		len += guehdr_priv_flags_len(flags);
+		len += guehdr_priv_flags_len(pflags);
 		if (len > optlen)
 			return 1;
 	}

From 30791ac41927ebd3e75486f9504b6d2280463bf0 Mon Sep 17 00:00:00 2001
From: Christoph Paasch <cpaasch@apple.com>
Date: Mon, 11 Dec 2017 00:05:46 -0800
Subject: [PATCH 27/76] tcp md5sig: Use skb's saddr when replying to an
 incoming segment

The MD5-key that belongs to a connection is identified by the peer's
IP-address. When we are in tcp_v4(6)_reqsk_send_ack(), we are replying
to an incoming segment from tcp_check_req() that failed the seq-number
checks.

Thus, to find the correct key, we need to use the skb's saddr and not
the daddr.

This bug seems to have been there since quite a while, but probably got
unnoticed because the consequences are not catastrophic. We will call
tcp_v4_reqsk_send_ack only to send a challenge-ACK back to the peer,
thus the connection doesn't really fail.

Fixes: 9501f9722922 ("tcp md5sig: Let the caller pass appropriate key for tcp_v{4,6}_do_calc_md5_hash().")
Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 2 +-
 net/ipv6/tcp_ipv6.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 77ea45da0fe9..94e28350f420 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -848,7 +848,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 			req->ts_recent,
 			0,
-			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
+			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
 					  AF_INET),
 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 			ip_hdr(skb)->tos);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 1f04ec0e4a7a..7178476b3d2f 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -994,7 +994,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 			req->ts_recent, sk->sk_bound_dev_if,
-			tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
+			tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr),
 			0, 0);
 }
 

From 283ca526a9bd75aed7350220d7b1f8027d99c3fd Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 12 Dec 2017 02:25:30 +0100
Subject: [PATCH 28/76] bpf: fix corruption on concurrent perf_event_output
 calls

When tracing and networking programs are both attached in the
system and both use event-output helpers that eventually call
into perf_event_output(), then we could end up in a situation
where the tracing attached program runs in user context while
a cls_bpf program is triggered on that same CPU out of softirq
context.

Since both rely on the same per-cpu perf_sample_data, we could
potentially corrupt it. This can only ever happen in a combination
of the two types; all tracing programs use a bpf_prog_active
counter to bail out in case a program is already running on
that CPU out of a different context. XDP and cls_bpf programs
by themselves don't have this issue as they run in the same
context only. Therefore, split both perf_sample_data so they
cannot be accessed from each other.

Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
Reported-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Song Liu <songliubraving@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/trace/bpf_trace.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0ce99c379c30..40207c2a4113 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -343,14 +343,13 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
 	.arg4_type	= ARG_CONST_SIZE,
 };
 
-static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
+static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
 
 static __always_inline u64
 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
-			u64 flags, struct perf_raw_record *raw)
+			u64 flags, struct perf_sample_data *sd)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
-	struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd);
 	unsigned int cpu = smp_processor_id();
 	u64 index = flags & BPF_F_INDEX_MASK;
 	struct bpf_event_entry *ee;
@@ -373,8 +372,6 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
 	if (unlikely(event->oncpu != cpu))
 		return -EOPNOTSUPP;
 
-	perf_sample_data_init(sd, 0, 0);
-	sd->raw = raw;
 	perf_event_output(event, sd, regs);
 	return 0;
 }
@@ -382,6 +379,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
 BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
 	   u64, flags, void *, data, u64, size)
 {
+	struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
 	struct perf_raw_record raw = {
 		.frag = {
 			.size = size,
@@ -392,7 +390,10 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
 	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
 		return -EINVAL;
 
-	return __bpf_perf_event_output(regs, map, flags, &raw);
+	perf_sample_data_init(sd, 0, 0);
+	sd->raw = &raw;
+
+	return __bpf_perf_event_output(regs, map, flags, sd);
 }
 
 static const struct bpf_func_proto bpf_perf_event_output_proto = {
@@ -407,10 +408,12 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
 };
 
 static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
+static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd);
 
 u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 		     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
 {
+	struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd);
 	struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
 	struct perf_raw_frag frag = {
 		.copy		= ctx_copy,
@@ -428,8 +431,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 	};
 
 	perf_fetch_caller_regs(regs);
+	perf_sample_data_init(sd, 0, 0);
+	sd->raw = &raw;
 
-	return __bpf_perf_event_output(regs, map, flags, &raw);
+	return __bpf_perf_event_output(regs, map, flags, sd);
 }
 
 BPF_CALL_0(bpf_get_current_task)

From a23f06f06dbe54696e8d4f156b317e8c9961c345 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 12 Dec 2017 02:25:31 +0100
Subject: [PATCH 29/76] bpf: fix build issues on um due to mising
 bpf_perf_event.h

Since c895f6f703ad ("bpf: correct broken uapi for
BPF_PROG_TYPE_PERF_EVENT program type") um (uml) won't build
on i386 or x86_64:

  [...]
    CC      init/main.o
  In file included from ../include/linux/perf_event.h:18:0,
                   from ../include/linux/trace_events.h:10,
                   from ../include/trace/syscall.h:7,
                   from ../include/linux/syscalls.h:82,
                   from ../init/main.c:20:
  ../include/uapi/linux/bpf_perf_event.h:11:32: fatal error:
  asm/bpf_perf_event.h: No such file or directory #include
  <asm/bpf_perf_event.h>
  [...]

Lets add missing bpf_perf_event.h also to um arch. This seems
to be the only one still missing.

Fixes: c895f6f703ad ("bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program type")
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Suggested-by: Richard Weinberger <richard@sigma-star.at>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Cc: Richard Weinberger <richard@sigma-star.at>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Richard Weinberger <richard@nod.at>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/um/include/asm/Kbuild | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index 50a32c33d729..73c57f614c9e 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -1,4 +1,5 @@
 generic-y += barrier.h
+generic-y += bpf_perf_event.h
 generic-y += bug.h
 generic-y += clkdev.h
 generic-y += current.h

From 720f228e8d3128b7ab1d39f51fdd8da07a7640c9 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 12 Dec 2017 02:25:32 +0100
Subject: [PATCH 30/76] bpf: fix broken BPF selftest build

At least on x86_64, the kernel's BPF selftests seemed to have stopped
to build due to 618e165b2a8e ("selftests/bpf: sync kernel headers and
introduce arch support in Makefile"):

  [...]
  In file included from test_verifier.c:29:0:
  ../../../include/uapi/linux/bpf_perf_event.h:11:32:
     fatal error: asm/bpf_perf_event.h: No such file or directory
   #include <asm/bpf_perf_event.h>
                                ^
  compilation terminated.
  [...]

While pulling in tools/arch/*/include/uapi/asm/bpf_perf_event.h seems
to work fine, there's no automated fall-back logic right now that would
do the same out of tools/include/uapi/asm-generic/bpf_perf_event.h. The
usual convention today is to add a include/[uapi/]asm/ equivalent that
would pull in the correct arch header or generic one as fall-back, all
ifdef'ed based on compiler target definition. It's similarly done also
in other cases such as tools/include/asm/barrier.h, thus adapt the same
here.

Fixes: 618e165b2a8e ("selftests/bpf: sync kernel headers and introduce arch support in Makefile")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/include/uapi/asm/bpf_perf_event.h |  7 +++++++
 tools/testing/selftests/bpf/Makefile    | 13 +------------
 2 files changed, 8 insertions(+), 12 deletions(-)
 create mode 100644 tools/include/uapi/asm/bpf_perf_event.h

diff --git a/tools/include/uapi/asm/bpf_perf_event.h b/tools/include/uapi/asm/bpf_perf_event.h
new file mode 100644
index 000000000000..13a58531e6fa
--- /dev/null
+++ b/tools/include/uapi/asm/bpf_perf_event.h
@@ -0,0 +1,7 @@
+#if defined(__aarch64__)
+#include "../../arch/arm64/include/uapi/asm/bpf_perf_event.h"
+#elif defined(__s390__)
+#include "../../arch/s390/include/uapi/asm/bpf_perf_event.h"
+#else
+#include <uapi/asm-generic/bpf_perf_event.h>
+#endif
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 21a2d76b67dc..792af7c3b74f 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -1,19 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0
 
-ifeq ($(srctree),)
-srctree := $(patsubst %/,%,$(dir $(CURDIR)))
-srctree := $(patsubst %/,%,$(dir $(srctree)))
-srctree := $(patsubst %/,%,$(dir $(srctree)))
-srctree := $(patsubst %/,%,$(dir $(srctree)))
-endif
-include $(srctree)/tools/scripts/Makefile.arch
-
-$(call detected_var,SRCARCH)
-
 LIBDIR := ../../../lib
 BPFDIR := $(LIBDIR)/bpf
 APIDIR := ../../../include/uapi
-ASMDIR:= ../../../arch/$(ARCH)/include/uapi
 GENDIR := ../../../../include/generated
 GENHDR := $(GENDIR)/autoconf.h
 
@@ -21,7 +10,7 @@ ifneq ($(wildcard $(GENHDR)),)
   GENFLAGS := -DHAVE_GENHDR
 endif
 
-CFLAGS += -Wall -O2 -I$(APIDIR) -I$(ASMDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include
+CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include
 LDLIBS += -lcap -lelf
 
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \

From 9147efcbe0b7cc96b18eb64b1a3f0d4bba81443c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Dec 2017 14:22:39 -0800
Subject: [PATCH 31/76] bpf: add schedule points to map alloc/free

While using large percpu maps, htab_map_alloc() can hold
cpu for hundreds of ms.

This patch adds cond_resched() calls to percpu alloc/free
call sites, all running in process context.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/hashtab.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index e469e05c8e83..3905d4bc5b80 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -114,6 +114,7 @@ static void htab_free_elems(struct bpf_htab *htab)
 		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
 					 htab->map.key_size);
 		free_percpu(pptr);
+		cond_resched();
 	}
 free_elems:
 	bpf_map_area_free(htab->elems);
@@ -159,6 +160,7 @@ static int prealloc_init(struct bpf_htab *htab)
 			goto free_elems;
 		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
 				  pptr);
+		cond_resched();
 	}
 
 skip_percpu_elems:

From 6b782f43d34974c7909306fd9af06241d658a1f7 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Mon, 11 Dec 2017 09:54:09 +0100
Subject: [PATCH 32/76] Revert "ravb: add workaround for clock when resuming
 with WoL enabled"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit fbf3d034f2ff6264183cfa6845770e8cc2a986c8.

As of commit 560869100b99a3da ("clk: renesas: cpg-mssr: Restore module
clocks during resume"), the workaround is no longer needed.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Acked-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/renesas/ravb_main.c | 27 ++----------------------
 1 file changed, 2 insertions(+), 25 deletions(-)

diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
index 2b962d349f5f..009780df664b 100644
--- a/drivers/net/ethernet/renesas/ravb_main.c
+++ b/drivers/net/ethernet/renesas/ravb_main.c
@@ -2308,32 +2308,9 @@ static int __maybe_unused ravb_resume(struct device *dev)
 	struct ravb_private *priv = netdev_priv(ndev);
 	int ret = 0;
 
-	if (priv->wol_enabled) {
-		/* Reduce the usecount of the clock to zero and then
-		 * restore it to its original value. This is done to force
-		 * the clock to be re-enabled which is a workaround
-		 * for renesas-cpg-mssr driver which do not enable clocks
-		 * when resuming from PSCI suspend/resume.
-		 *
-		 * Without this workaround the driver fails to communicate
-		 * with the hardware if WoL was enabled when the system
-		 * entered PSCI suspend. This is due to that if WoL is enabled
-		 * we explicitly keep the clock from being turned off when
-		 * suspending, but in PSCI sleep power is cut so the clock
-		 * is disabled anyhow, the clock driver is not aware of this
-		 * so the clock is not turned back on when resuming.
-		 *
-		 * TODO: once the renesas-cpg-mssr suspend/resume is working
-		 *       this clock dance should be removed.
-		 */
-		clk_disable(priv->clk);
-		clk_disable(priv->clk);
-		clk_enable(priv->clk);
-		clk_enable(priv->clk);
-
-		/* Set reset mode to rearm the WoL logic */
+	/* If WoL is enabled set reset mode to rearm the WoL logic */
+	if (priv->wol_enabled)
 		ravb_write(ndev, CCC_OPC_RESET, CCC);
-	}
 
 	/* All register have been reset to default values.
 	 * Restore all registers which where setup at probe time and

From b9b312a7a451e9c098921856e7cfbc201120e1a7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 11 Dec 2017 07:03:38 -0800
Subject: [PATCH 33/76] ipv6: mcast: better catch silly mtu values

syzkaller reported crashes in IPv6 stack [1]

Xin Long found that lo MTU was set to silly values.

IPv6 stack reacts to changes to small MTU, by disabling itself under
RTNL.

But there is a window where threads not using RTNL can see a wrong
device mtu. This can lead to surprises, in mld code where it is assumed
the mtu is suitable.

Fix this by reading device mtu once and checking IPv6 minimal MTU.

[1]
 skbuff: skb_over_panic: text:0000000010b86b8d len:196 put:20
 head:000000003b477e60 data:000000000e85441e tail:0xd4 end:0xc0 dev:lo
 ------------[ cut here ]------------
 kernel BUG at net/core/skbuff.c:104!
 invalid opcode: 0000 [#1] SMP KASAN
 Dumping ftrace buffer:
    (ftrace buffer empty)
 Modules linked in:
 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.15.0-rc2-mm1+ #39
 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
 Google 01/01/2011
 RIP: 0010:skb_panic+0x15c/0x1f0 net/core/skbuff.c:100
 RSP: 0018:ffff8801db307508 EFLAGS: 00010286
 RAX: 0000000000000082 RBX: ffff8801c517e840 RCX: 0000000000000000
 RDX: 0000000000000082 RSI: 1ffff1003b660e61 RDI: ffffed003b660e95
 RBP: ffff8801db307570 R08: 1ffff1003b660e23 R09: 0000000000000000
 R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff85bd4020
 R13: ffffffff84754ed2 R14: 0000000000000014 R15: ffff8801c4e26540
 FS:  0000000000000000(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000000463610 CR3: 00000001c6698000 CR4: 00000000001406e0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 Call Trace:
  <IRQ>
  skb_over_panic net/core/skbuff.c:109 [inline]
  skb_put+0x181/0x1c0 net/core/skbuff.c:1694
  add_grhead.isra.24+0x42/0x3b0 net/ipv6/mcast.c:1695
  add_grec+0xa55/0x1060 net/ipv6/mcast.c:1817
  mld_send_cr net/ipv6/mcast.c:1903 [inline]
  mld_ifc_timer_expire+0x4d2/0x770 net/ipv6/mcast.c:2448
  call_timer_fn+0x23b/0x840 kernel/time/timer.c:1320
  expire_timers kernel/time/timer.c:1357 [inline]
  __run_timers+0x7e1/0xb60 kernel/time/timer.c:1660
  run_timer_softirq+0x4c/0xb0 kernel/time/timer.c:1686
  __do_softirq+0x29d/0xbb2 kernel/softirq.c:285
  invoke_softirq kernel/softirq.c:365 [inline]
  irq_exit+0x1d3/0x210 kernel/softirq.c:405
  exiting_irq arch/x86/include/asm/apic.h:540 [inline]
  smp_apic_timer_interrupt+0x16b/0x700 arch/x86/kernel/apic/apic.c:1052
  apic_timer_interrupt+0xa9/0xb0 arch/x86/entry/entry_64.S:920

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Tested-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/mcast.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index fc6d7d143f2c..844642682b83 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1682,16 +1682,16 @@ static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel)
 }
 
 static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc,
-	int type, struct mld2_grec **ppgr)
+	int type, struct mld2_grec **ppgr, unsigned int mtu)
 {
-	struct net_device *dev = pmc->idev->dev;
 	struct mld2_report *pmr;
 	struct mld2_grec *pgr;
 
-	if (!skb)
-		skb = mld_newpack(pmc->idev, dev->mtu);
-	if (!skb)
-		return NULL;
+	if (!skb) {
+		skb = mld_newpack(pmc->idev, mtu);
+		if (!skb)
+			return NULL;
+	}
 	pgr = skb_put(skb, sizeof(struct mld2_grec));
 	pgr->grec_type = type;
 	pgr->grec_auxwords = 0;
@@ -1714,10 +1714,15 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
 	struct mld2_grec *pgr = NULL;
 	struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list;
 	int scount, stotal, first, isquery, truncate;
+	unsigned int mtu;
 
 	if (pmc->mca_flags & MAF_NOREPORT)
 		return skb;
 
+	mtu = READ_ONCE(dev->mtu);
+	if (mtu < IPV6_MIN_MTU)
+		return skb;
+
 	isquery = type == MLD2_MODE_IS_INCLUDE ||
 		  type == MLD2_MODE_IS_EXCLUDE;
 	truncate = type == MLD2_MODE_IS_EXCLUDE ||
@@ -1738,7 +1743,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
 		    AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
 			if (skb)
 				mld_sendpack(skb);
-			skb = mld_newpack(idev, dev->mtu);
+			skb = mld_newpack(idev, mtu);
 		}
 	}
 	first = 1;
@@ -1774,12 +1779,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
 				pgr->grec_nsrcs = htons(scount);
 			if (skb)
 				mld_sendpack(skb);
-			skb = mld_newpack(idev, dev->mtu);
+			skb = mld_newpack(idev, mtu);
 			first = 1;
 			scount = 0;
 		}
 		if (first) {
-			skb = add_grhead(skb, pmc, type, &pgr);
+			skb = add_grhead(skb, pmc, type, &pgr, mtu);
 			first = 0;
 		}
 		if (!skb)
@@ -1814,7 +1819,7 @@ empty_source:
 				mld_sendpack(skb);
 				skb = NULL; /* add_grhead will get a new one */
 			}
-			skb = add_grhead(skb, pmc, type, &pgr);
+			skb = add_grhead(skb, pmc, type, &pgr, mtu);
 		}
 	}
 	if (pgr)

From b5476022bbada3764609368f03329ca287528dc8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 11 Dec 2017 07:17:39 -0800
Subject: [PATCH 34/76] ipv4: igmp: guard against silly MTU values
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

IPv4 stack reacts to changes to small MTU, by disabling itself under
RTNL.

But there is a window where threads not using RTNL can see a wrong
device mtu. This can lead to surprises, in igmp code where it is
assumed the mtu is suitable.

Fix this by reading device mtu once and checking IPv4 minimal MTU.

This patch adds missing IPV4_MIN_MTU define, to not abuse
ETH_MIN_MTU anymore.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h     |  1 +
 net/ipv4/devinet.c   |  2 +-
 net/ipv4/igmp.c      | 24 +++++++++++++++---------
 net/ipv4/ip_tunnel.c |  4 ++--
 4 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index 9896f46cbbf1..af8addbaa3c1 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -34,6 +34,7 @@
 #include <net/flow_dissector.h>
 
 #define IPV4_MAX_PMTU		65535U		/* RFC 2675, Section 5.1 */
+#define IPV4_MIN_MTU		68			/* RFC 791 */
 
 struct sock;
 
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index a4573bccd6da..7a93359fbc72 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1428,7 +1428,7 @@ skip:
 
 static bool inetdev_valid_mtu(unsigned int mtu)
 {
-	return mtu >= 68;
+	return mtu >= IPV4_MIN_MTU;
 }
 
 static void inetdev_send_gratuitous_arp(struct net_device *dev,
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index d1f8f302dbf3..50448a220a1f 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -404,16 +404,17 @@ static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
 }
 
 static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
-	int type, struct igmpv3_grec **ppgr)
+	int type, struct igmpv3_grec **ppgr, unsigned int mtu)
 {
 	struct net_device *dev = pmc->interface->dev;
 	struct igmpv3_report *pih;
 	struct igmpv3_grec *pgr;
 
-	if (!skb)
-		skb = igmpv3_newpack(dev, dev->mtu);
-	if (!skb)
-		return NULL;
+	if (!skb) {
+		skb = igmpv3_newpack(dev, mtu);
+		if (!skb)
+			return NULL;
+	}
 	pgr = skb_put(skb, sizeof(struct igmpv3_grec));
 	pgr->grec_type = type;
 	pgr->grec_auxwords = 0;
@@ -436,12 +437,17 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
 	struct igmpv3_grec *pgr = NULL;
 	struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
 	int scount, stotal, first, isquery, truncate;
+	unsigned int mtu;
 
 	if (pmc->multiaddr == IGMP_ALL_HOSTS)
 		return skb;
 	if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)
 		return skb;
 
+	mtu = READ_ONCE(dev->mtu);
+	if (mtu < IPV4_MIN_MTU)
+		return skb;
+
 	isquery = type == IGMPV3_MODE_IS_INCLUDE ||
 		  type == IGMPV3_MODE_IS_EXCLUDE;
 	truncate = type == IGMPV3_MODE_IS_EXCLUDE ||
@@ -462,7 +468,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
 		    AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
 			if (skb)
 				igmpv3_sendpack(skb);
-			skb = igmpv3_newpack(dev, dev->mtu);
+			skb = igmpv3_newpack(dev, mtu);
 		}
 	}
 	first = 1;
@@ -498,12 +504,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
 				pgr->grec_nsrcs = htons(scount);
 			if (skb)
 				igmpv3_sendpack(skb);
-			skb = igmpv3_newpack(dev, dev->mtu);
+			skb = igmpv3_newpack(dev, mtu);
 			first = 1;
 			scount = 0;
 		}
 		if (first) {
-			skb = add_grhead(skb, pmc, type, &pgr);
+			skb = add_grhead(skb, pmc, type, &pgr, mtu);
 			first = 0;
 		}
 		if (!skb)
@@ -538,7 +544,7 @@ empty_source:
 				igmpv3_sendpack(skb);
 				skb = NULL; /* add_grhead will get a new one */
 			}
-			skb = add_grhead(skb, pmc, type, &pgr);
+			skb = add_grhead(skb, pmc, type, &pgr, mtu);
 		}
 	}
 	if (pgr)
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index fe6fee728ce4..5ddb1cb52bd4 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -349,8 +349,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
 	dev->needed_headroom = t_hlen + hlen;
 	mtu -= (dev->hard_header_len + t_hlen);
 
-	if (mtu < 68)
-		mtu = 68;
+	if (mtu < IPV4_MIN_MTU)
+		mtu = IPV4_MIN_MTU;
 
 	return mtu;
 }

From 83593010d3b87601e775f240ce46c53ddf25828d Mon Sep 17 00:00:00 2001
From: Pravin Shedge <pravin.shedge4linux@gmail.com>
Date: Mon, 11 Dec 2017 22:09:46 +0530
Subject: [PATCH 35/76] net: remove duplicate includes

These duplicate includes have been found with scripts/checkincludes.pl but
they have been removed manually to avoid removing false positives.

Signed-off-by: Pravin Shedge <pravin.shedge4linux@gmail.com>
Acked-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/netprio_cgroup.c            | 1 -
 net/dsa/slave.c                      | 1 -
 net/netfilter/nf_conntrack_netlink.c | 1 -
 net/sched/act_meta_mark.c            | 1 -
 net/sched/act_meta_skbtcindex.c      | 1 -
 net/sched/cls_api.c                  | 1 -
 net/sched/cls_u32.c                  | 1 -
 7 files changed, 7 deletions(-)

diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 1c4810919a0a..b9057478d69c 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -14,7 +14,6 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/types.h>
-#include <linux/module.h>
 #include <linux/string.h>
 #include <linux/errno.h>
 #include <linux/skbuff.h>
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index d6e7a642493b..a95a55f79137 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -16,7 +16,6 @@
 #include <linux/of_net.h>
 #include <linux/of_mdio.h>
 #include <linux/mdio.h>
-#include <linux/list.h>
 #include <net/rtnetlink.h>
 #include <net/pkt_cls.h>
 #include <net/tc_act/tc_mirred.h>
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 59c08997bfdf..332b51870ed7 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -45,7 +45,6 @@
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <net/netfilter/nf_conntrack_timestamp.h>
 #include <net/netfilter/nf_conntrack_labels.h>
-#include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/nf_conntrack_synproxy.h>
 #ifdef CONFIG_NF_NAT_NEEDED
 #include <net/netfilter/nf_nat_core.h>
diff --git a/net/sched/act_meta_mark.c b/net/sched/act_meta_mark.c
index 1e3f10e5da99..6445184b2759 100644
--- a/net/sched/act_meta_mark.c
+++ b/net/sched/act_meta_mark.c
@@ -22,7 +22,6 @@
 #include <net/pkt_sched.h>
 #include <uapi/linux/tc_act/tc_ife.h>
 #include <net/tc_act/tc_ife.h>
-#include <linux/rtnetlink.h>
 
 static int skbmark_encode(struct sk_buff *skb, void *skbdata,
 			  struct tcf_meta_info *e)
diff --git a/net/sched/act_meta_skbtcindex.c b/net/sched/act_meta_skbtcindex.c
index 2ea1f26c9e96..7221437ca3a6 100644
--- a/net/sched/act_meta_skbtcindex.c
+++ b/net/sched/act_meta_skbtcindex.c
@@ -22,7 +22,6 @@
 #include <net/pkt_sched.h>
 #include <uapi/linux/tc_act/tc_ife.h>
 #include <net/tc_act/tc_ife.h>
-#include <linux/rtnetlink.h>
 
 static int skbtcindex_encode(struct sk_buff *skb, void *skbdata,
 			     struct tcf_meta_info *e)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index ddcf04b4ab43..f40256a3e7f0 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -23,7 +23,6 @@
 #include <linux/skbuff.h>
 #include <linux/init.h>
 #include <linux/kmod.h>
-#include <linux/err.h>
 #include <linux/slab.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index ac152b4f4247..507859cdd1cb 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -45,7 +45,6 @@
 #include <net/netlink.h>
 #include <net/act_api.h>
 #include <net/pkt_cls.h>
-#include <linux/netdevice.h>
 #include <linux/idr.h>
 
 struct tc_u_knode {

From c545a945d0d9ea2ea2c7d23d43cf0d86e32cd7cf Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Mon, 11 Dec 2017 19:11:55 +0100
Subject: [PATCH 36/76] tipc: eliminate potential memory leak

In the function tipc_sk_mcast_rcv() we call refcount_dec(&skb->users)
on received sk_buffers. Since the reference counter might hit zero at
this point, we have a potential memory leak.

We fix this by replacing refcount_dec() with kfree_skb().

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/socket.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 5d18c0caa92b..41127d0b925e 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1140,7 +1140,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
 				__skb_dequeue(arrvq);
 				__skb_queue_tail(inputq, skb);
 			}
-			refcount_dec(&skb->users);
+			kfree_skb(skb);
 			spin_unlock_bh(&inputq->lock);
 			continue;
 		}

From a46182b00290839fa3fa159d54fd3237bd8669f0 Mon Sep 17 00:00:00 2001
From: Kevin Cernekee <cernekee@chromium.org>
Date: Mon, 11 Dec 2017 11:13:45 -0800
Subject: [PATCH 37/76] net: igmp: Use correct source address on IGMPv3 reports

Closing a multicast socket after the final IPv4 address is deleted
from an interface can generate a membership report that uses the
source IP from a different interface.  The following test script, run
from an isolated netns, reproduces the issue:

    #!/bin/bash

    ip link add dummy0 type dummy
    ip link add dummy1 type dummy
    ip link set dummy0 up
    ip link set dummy1 up
    ip addr add 10.1.1.1/24 dev dummy0
    ip addr add 192.168.99.99/24 dev dummy1

    tcpdump -U -i dummy0 &
    socat EXEC:"sleep 2" \
        UDP4-DATAGRAM:239.101.1.68:8889,ip-add-membership=239.0.1.68:10.1.1.1 &

    sleep 1
    ip addr del 10.1.1.1/24 dev dummy0
    sleep 5
    kill %tcpdump

RFC 3376 specifies that the report must be sent with a valid IP source
address from the destination subnet, or from address 0.0.0.0.  Add an
extra check to make sure this is the case.

Signed-off-by: Kevin Cernekee <cernekee@chromium.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 50448a220a1f..726f6b608274 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -89,6 +89,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/times.h>
 #include <linux/pkt_sched.h>
+#include <linux/byteorder/generic.h>
 
 #include <net/net_namespace.h>
 #include <net/arp.h>
@@ -321,6 +322,23 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
 	return scount;
 }
 
+/* source address selection per RFC 3376 section 4.2.13 */
+static __be32 igmpv3_get_srcaddr(struct net_device *dev,
+				 const struct flowi4 *fl4)
+{
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+	if (!in_dev)
+		return htonl(INADDR_ANY);
+
+	for_ifa(in_dev) {
+		if (inet_ifa_match(fl4->saddr, ifa))
+			return fl4->saddr;
+	} endfor_ifa(in_dev);
+
+	return htonl(INADDR_ANY);
+}
+
 static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
 {
 	struct sk_buff *skb;
@@ -368,7 +386,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
 	pip->frag_off = htons(IP_DF);
 	pip->ttl      = 1;
 	pip->daddr    = fl4.daddr;
-	pip->saddr    = fl4.saddr;
+	pip->saddr    = igmpv3_get_srcaddr(dev, &fl4);
 	pip->protocol = IPPROTO_IGMP;
 	pip->tot_len  = 0;	/* filled in later */
 	ip_select_ident(net, skb, NULL);

From aceef61ee56898cfa7b6960fb60b9326c3860441 Mon Sep 17 00:00:00 2001
From: Sebastian Sjoholm <ssjoholm@mac.com>
Date: Mon, 11 Dec 2017 21:51:14 +0100
Subject: [PATCH 38/76] net: qmi_wwan: add Sierra EM7565 1199:9091
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sierra Wireless EM7565 is an Qualcomm MDM9x50 based M.2 modem.
The USB id is added to qmi_wwan.c to allow QMI communication
with the EM7565.

Signed-off-by: Sebastian Sjoholm <ssjoholm@mac.com>
Acked-by: Bjørn Mork <bjorn@mork.no>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/qmi_wwan.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index 304ec6555cd8..d2ca5a202e8d 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -1204,6 +1204,7 @@ static const struct usb_device_id products[] = {
 	{QMI_FIXED_INTF(0x1199, 0x9079, 10)},	/* Sierra Wireless EM74xx */
 	{QMI_FIXED_INTF(0x1199, 0x907b, 8)},	/* Sierra Wireless EM74xx */
 	{QMI_FIXED_INTF(0x1199, 0x907b, 10)},	/* Sierra Wireless EM74xx */
+	{QMI_FIXED_INTF(0x1199, 0x9091, 8)},	/* Sierra Wireless EM7565 */
 	{QMI_FIXED_INTF(0x1bbb, 0x011e, 4)},	/* Telekom Speedstick LTE II (Alcatel One Touch L100V LTE) */
 	{QMI_FIXED_INTF(0x1bbb, 0x0203, 2)},	/* Alcatel L800MA */
 	{QMI_FIXED_INTF(0x2357, 0x0201, 4)},	/* TP-LINK HSUPA Modem MA180 */

From 2a9ee696c72a24d63529c76483fcd92d04b1d2b7 Mon Sep 17 00:00:00 2001
From: Branislav Radocaj <branislav@radocaj.org>
Date: Tue, 12 Dec 2017 00:13:38 +0100
Subject: [PATCH 39/76] net: ethernet: arc: fix error handling in
 emac_rockchip_probe

If clk_set_rate() fails, we should disable clk before return.
Found by Linux Driver Verification project (linuxtesting.org).

Changes since v2 [1]:
* Merged with latest code changes

Changes since v1:
Update made thanks to David's review, much appreciated David.
* Improved inconsistent failure handling of clock rate setting
* For completeness of usecase, added arc_emac_probe error handling

Signed-off-by: Branislav Radocaj <branislav@radocaj.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/arc/emac_rockchip.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/arc/emac_rockchip.c b/drivers/net/ethernet/arc/emac_rockchip.c
index c6163874e4e7..16f9bee992fe 100644
--- a/drivers/net/ethernet/arc/emac_rockchip.c
+++ b/drivers/net/ethernet/arc/emac_rockchip.c
@@ -199,9 +199,11 @@ static int emac_rockchip_probe(struct platform_device *pdev)
 
 	/* RMII interface needs always a rate of 50MHz */
 	err = clk_set_rate(priv->refclk, 50000000);
-	if (err)
+	if (err) {
 		dev_err(dev,
 			"failed to change reference clock rate (%d)\n", err);
+		goto out_regulator_disable;
+	}
 
 	if (priv->soc_data->need_div_macclk) {
 		priv->macclk = devm_clk_get(dev, "macclk");
@@ -230,12 +232,14 @@ static int emac_rockchip_probe(struct platform_device *pdev)
 	err = arc_emac_probe(ndev, interface);
 	if (err) {
 		dev_err(dev, "failed to probe arc emac (%d)\n", err);
-		goto out_regulator_disable;
+		goto out_clk_disable_macclk;
 	}
 
 	return 0;
+
 out_clk_disable_macclk:
-	clk_disable_unprepare(priv->macclk);
+	if (priv->soc_data->need_div_macclk)
+		clk_disable_unprepare(priv->macclk);
 out_regulator_disable:
 	if (priv->regulator)
 		regulator_disable(priv->regulator);

From 6e266610eb6553cfb7e7eb5d11914bd01509c406 Mon Sep 17 00:00:00 2001
From: Jia-Ju Bai <baijiaju1990@163.com>
Date: Tue, 12 Dec 2017 16:49:52 +0800
Subject: [PATCH 40/76] hippi: Fix a Fix a possible sleep-in-atomic bug in
 rr_close

The driver may sleep under a spinlock.
The function call path is:
rr_close (acquire the spinlock)
  free_irq --> may sleep

To fix it, free_irq is moved to the place without holding the spinlock.

This bug is found by my static analysis tool(DSAC) and checked by my code review.

Signed-off-by: Jia-Ju Bai <baijiaju1990@163.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hippi/rrunner.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/hippi/rrunner.c b/drivers/net/hippi/rrunner.c
index 8483f03d5a41..1ab97d99b9ba 100644
--- a/drivers/net/hippi/rrunner.c
+++ b/drivers/net/hippi/rrunner.c
@@ -1379,8 +1379,8 @@ static int rr_close(struct net_device *dev)
 			    rrpriv->info_dma);
 	rrpriv->info = NULL;
 
-	free_irq(pdev->irq, dev);
 	spin_unlock_irqrestore(&rrpriv->lock, flags);
+	free_irq(pdev->irq, dev);
 
 	return 0;
 }

From 2e51a8dc7fdc9d06c52a0a0e442cc813357ea44d Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Tue, 12 Dec 2017 09:29:46 +0000
Subject: [PATCH 41/76] net: dsa: allow XAUI phy interface mode

XGMII is a 32-bit bus plus two clock signals per direction.  XAUI is
four serial lanes per direction.  The 88e6190 supports XAUI but not
XGMII as it doesn't have enough pins.  The same is true of 88e6176.

Match on PHY_INTERFACE_MODE_XAUI for the XAUI port type, but keep
accepting XGMII for backwards compatibility.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/port.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c
index a7801f6668a5..6315774d72b3 100644
--- a/drivers/net/dsa/mv88e6xxx/port.c
+++ b/drivers/net/dsa/mv88e6xxx/port.c
@@ -338,6 +338,7 @@ int mv88e6390x_port_set_cmode(struct mv88e6xxx_chip *chip, int port,
 		cmode = MV88E6XXX_PORT_STS_CMODE_2500BASEX;
 		break;
 	case PHY_INTERFACE_MODE_XGMII:
+	case PHY_INTERFACE_MODE_XAUI:
 		cmode = MV88E6XXX_PORT_STS_CMODE_XAUI;
 		break;
 	case PHY_INTERFACE_MODE_RXAUI:

From cd8165c3d5fb07667328434835f2968a87caee67 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Tue, 12 Dec 2017 09:29:51 +0000
Subject: [PATCH 42/76] ARM: dts: vf610-zii-dev: use XAUI for DSA link ports

Use XAUI rather than XGMII for DSA link ports, as this is the interface
mode that the switches actually use. XAUI is the 4 lane bus with clock
per direction, whereas XGMII is a 32 bit bus with clock.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/arm/boot/dts/vf610-zii-dev-rev-c.dts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/boot/dts/vf610-zii-dev-rev-c.dts b/arch/arm/boot/dts/vf610-zii-dev-rev-c.dts
index 02a6227c717c..15a685dc2aa2 100644
--- a/arch/arm/boot/dts/vf610-zii-dev-rev-c.dts
+++ b/arch/arm/boot/dts/vf610-zii-dev-rev-c.dts
@@ -121,7 +121,7 @@
 					switch0port10: port@10 {
 						reg = <10>;
 						label = "dsa";
-						phy-mode = "xgmii";
+						phy-mode = "xaui";
 						link = <&switch1port10>;
 					};
 				};
@@ -208,7 +208,7 @@
 					switch1port10: port@10 {
 						reg = <10>;
 						label = "dsa";
-						phy-mode = "xgmii";
+						phy-mode = "xaui";
 						link = <&switch0port10>;
 					};
 				};

From f5e64032a799d4f54decc7eb6aafcdffb67f9ad9 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Tue, 12 Dec 2017 10:45:36 +0000
Subject: [PATCH 43/76] net: phy: fix resume handling

When a PHY has the BMCR_PDOWN bit set, it may decide to ignore writes
to other registers, or reset the registers to power-on defaults.
Micrel PHYs do this for their interrupt registers.

The current structure of phylib tries to enable interrupts before
resuming (and releasing) the BMCR_PDOWN bit.  This fails, causing
Micrel PHYs to stop working after a suspend/resume sequence if they
are using interrupts.

Fix this by ensuring that the PHY driver resume methods do not take
the phydev->lock mutex themselves, but the callers of phy_resume()
take that lock.  This then allows us to move the call to phy_resume()
before we enable interrupts in phy_start().

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/at803x.c     |  4 ----
 drivers/net/phy/phy.c        |  9 +++------
 drivers/net/phy/phy_device.c | 10 ++++++----
 3 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c
index 5f93e6add563..e911e4990b20 100644
--- a/drivers/net/phy/at803x.c
+++ b/drivers/net/phy/at803x.c
@@ -239,14 +239,10 @@ static int at803x_resume(struct phy_device *phydev)
 {
 	int value;
 
-	mutex_lock(&phydev->lock);
-
 	value = phy_read(phydev, MII_BMCR);
 	value &= ~(BMCR_PDOWN | BMCR_ISOLATE);
 	phy_write(phydev, MII_BMCR, value);
 
-	mutex_unlock(&phydev->lock);
-
 	return 0;
 }
 
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 2b1e67bc1e73..ed10d1fc8f59 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -828,7 +828,6 @@ EXPORT_SYMBOL(phy_stop);
  */
 void phy_start(struct phy_device *phydev)
 {
-	bool do_resume = false;
 	int err = 0;
 
 	mutex_lock(&phydev->lock);
@@ -841,6 +840,9 @@ void phy_start(struct phy_device *phydev)
 		phydev->state = PHY_UP;
 		break;
 	case PHY_HALTED:
+		/* if phy was suspended, bring the physical link up again */
+		phy_resume(phydev);
+
 		/* make sure interrupts are re-enabled for the PHY */
 		if (phydev->irq != PHY_POLL) {
 			err = phy_enable_interrupts(phydev);
@@ -849,17 +851,12 @@ void phy_start(struct phy_device *phydev)
 		}
 
 		phydev->state = PHY_RESUMING;
-		do_resume = true;
 		break;
 	default:
 		break;
 	}
 	mutex_unlock(&phydev->lock);
 
-	/* if phy was suspended, bring the physical link up again */
-	if (do_resume)
-		phy_resume(phydev);
-
 	phy_trigger_machine(phydev, true);
 }
 EXPORT_SYMBOL(phy_start);
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 67f25ac29025..b15b31ca2618 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -135,7 +135,9 @@ static int mdio_bus_phy_resume(struct device *dev)
 	if (!mdio_bus_phy_may_suspend(phydev))
 		goto no_resume;
 
+	mutex_lock(&phydev->lock);
 	ret = phy_resume(phydev);
+	mutex_unlock(&phydev->lock);
 	if (ret < 0)
 		return ret;
 
@@ -1026,7 +1028,9 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
 	if (err)
 		goto error;
 
+	mutex_lock(&phydev->lock);
 	phy_resume(phydev);
+	mutex_unlock(&phydev->lock);
 	phy_led_triggers_register(phydev);
 
 	return err;
@@ -1157,6 +1161,8 @@ int phy_resume(struct phy_device *phydev)
 	struct phy_driver *phydrv = to_phy_driver(phydev->mdio.dev.driver);
 	int ret = 0;
 
+	WARN_ON(!mutex_is_locked(&phydev->lock));
+
 	if (phydev->drv && phydrv->resume)
 		ret = phydrv->resume(phydev);
 
@@ -1639,13 +1645,9 @@ int genphy_resume(struct phy_device *phydev)
 {
 	int value;
 
-	mutex_lock(&phydev->lock);
-
 	value = phy_read(phydev, MII_BMCR);
 	phy_write(phydev, MII_BMCR, value & ~BMCR_PDOWN);
 
-	mutex_unlock(&phydev->lock);
-
 	return 0;
 }
 EXPORT_SYMBOL(genphy_resume);

From 94a5ef1b77da4674a6bc1d3de3051b758859d106 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Tue, 12 Dec 2017 10:49:15 +0000
Subject: [PATCH 44/76] of_mdio / mdiobus: ensure mdio devices have fwnode
 correctly populated

Ensure that all mdio devices populate the struct device fwnode pointer
as well as the of_node pointer to allow drivers that wish to use
fwnode APIs to work.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Rob Herring <robh@kernel.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mdio_bus.c | 1 +
 drivers/of/of_mdio.c       | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c
index 2df7b62c1a36..54d00a1d2bef 100644
--- a/drivers/net/phy/mdio_bus.c
+++ b/drivers/net/phy/mdio_bus.c
@@ -270,6 +270,7 @@ static void of_mdiobus_link_mdiodev(struct mii_bus *bus,
 
 		if (addr == mdiodev->addr) {
 			dev->of_node = child;
+			dev->fwnode = of_fwnode_handle(child);
 			return;
 		}
 	}
diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c
index 98258583abb0..3481e69738b5 100644
--- a/drivers/of/of_mdio.c
+++ b/drivers/of/of_mdio.c
@@ -81,6 +81,7 @@ static int of_mdiobus_register_phy(struct mii_bus *mdio,
 	 * can be looked up later */
 	of_node_get(child);
 	phy->mdio.dev.of_node = child;
+	phy->mdio.dev.fwnode = of_fwnode_handle(child);
 
 	/* All data is now stored in the phy struct;
 	 * register it */
@@ -111,6 +112,7 @@ static int of_mdiobus_register_device(struct mii_bus *mdio,
 	 */
 	of_node_get(child);
 	mdiodev->dev.of_node = child;
+	mdiodev->dev.fwnode = of_fwnode_handle(child);
 
 	/* All data is now stored in the mdiodev struct; register it. */
 	rc = mdio_device_register(mdiodev);
@@ -206,6 +208,7 @@ int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np)
 	mdio->phy_mask = ~0;
 
 	mdio->dev.of_node = np;
+	mdio->dev.fwnode = of_fwnode_handle(np);
 
 	/* Get bus level PHY reset GPIO details */
 	mdio->reset_delay_us = DEFAULT_GPIO_RESET_DELAY;

From 3b3397e2031564db07022e99f04d4b9f3df6fced Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 12 Dec 2017 13:03:11 +0000
Subject: [PATCH 45/76] net: phy: meson-gxl: make function
 meson_gxl_read_status static

The function meson_gxl_read_status is local to the source and does
not need to be in global scope, so make it static.

Cleans up sparse warning:
symbol 'meson_gxl_read_status' was not declared. Should it be static?

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Reviewed-by: Jerome Brunet <jbrunet@baylibre.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/meson-gxl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/phy/meson-gxl.c b/drivers/net/phy/meson-gxl.c
index 700007dd4be5..842eb871a6e3 100644
--- a/drivers/net/phy/meson-gxl.c
+++ b/drivers/net/phy/meson-gxl.c
@@ -67,7 +67,7 @@ static int meson_gxl_config_init(struct phy_device *phydev)
  * When this failure happens, the first retry is usually successful but,
  * in some cases, it may take up to 6 retries to get a decent result
  */
-int meson_gxl_read_status(struct phy_device *phydev)
+static int meson_gxl_read_status(struct phy_device *phydev)
 {
 	int ret, wol, lpa, exp;
 

From c009cb842fcc0f84536a9d2692e6f063af5ac5c6 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Tue, 12 Dec 2017 10:30:29 -0800
Subject: [PATCH 46/76] skge: remove redundunt free_irq under spinlock

The code to handle multi-port SKGE boards was freeing IRQ
twice. The first one was under lock and might sleep.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/skge.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/marvell/skge.c b/drivers/net/ethernet/marvell/skge.c
index 6e423f098a60..31efc47c847e 100644
--- a/drivers/net/ethernet/marvell/skge.c
+++ b/drivers/net/ethernet/marvell/skge.c
@@ -4081,7 +4081,6 @@ static void skge_remove(struct pci_dev *pdev)
 	if (hw->ports > 1) {
 		skge_write32(hw, B0_IMSK, 0);
 		skge_read32(hw, B0_IMSK);
-		free_irq(pdev->irq, hw);
 	}
 	spin_unlock_irq(&hw->hw_lock);
 

From 9ee11bd03cb1a5c3ca33c2bb70e7ed325f68890f Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Tue, 12 Dec 2017 16:28:58 -0800
Subject: [PATCH 47/76] tcp: fix potential underestimation on rcv_rtt

When ms timestamp is used, current logic uses 1us in
tcp_rcv_rtt_update() when the real rcv_rtt is within 1 - 999us.
This could cause rcv_rtt underestimation.
Fix it by always using a min value of 1ms if ms timestamp is used.

Fixes: 645f4c6f2ebd ("tcp: switch rcv_rtt_est and rcvq_space to high resolution timestamps")
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9550cc42de2d..45f750e85714 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -508,9 +508,6 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
 	u32 new_sample = tp->rcv_rtt_est.rtt_us;
 	long m = sample;
 
-	if (m == 0)
-		m = 1;
-
 	if (new_sample != 0) {
 		/* If we sample in larger samples in the non-timestamp
 		 * case, we could grossly overestimate the RTT especially
@@ -547,6 +544,8 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
 	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
 		return;
 	delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time);
+	if (!delta_us)
+		delta_us = 1;
 	tcp_rcv_rtt_update(tp, delta_us, 1);
 
 new_measure:
@@ -563,8 +562,11 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
 	    (TCP_SKB_CB(skb)->end_seq -
 	     TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) {
 		u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
-		u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
+		u32 delta_us;
 
+		if (!delta)
+			delta = 1;
+		delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
 		tcp_rcv_rtt_update(tp, delta_us, 0);
 	}
 }

From 4688eb7cf3ae2c2721d1dacff5c1384cba47d176 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Dec 2017 18:22:52 -0800
Subject: [PATCH 48/76] tcp: refresh tcp_mstamp from timers callbacks

Only the retransmit timer currently refreshes tcp_mstamp

We should do the same for delayed acks and keepalives.

Even if RFC 7323 does not request it, this is consistent to what linux
did in the past, when TS values were based on jiffies.

Fixes: 385e20706fac ("tcp: use tp->tcp_mstamp in output path")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Mike Maloney <maloney@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by:  Mike Maloney <maloney@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_timer.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 16df6dd44b98..968fda198376 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -264,6 +264,7 @@ void tcp_delack_timer_handler(struct sock *sk)
 			icsk->icsk_ack.pingpong = 0;
 			icsk->icsk_ack.ato      = TCP_ATO_MIN;
 		}
+		tcp_mstamp_refresh(tcp_sk(sk));
 		tcp_send_ack(sk);
 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS);
 	}
@@ -632,6 +633,7 @@ static void tcp_keepalive_timer (struct timer_list *t)
 		goto out;
 	}
 
+	tcp_mstamp_refresh(tp);
 	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
 		if (tp->linger2 >= 0) {
 			const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;

From 53c64870d03edfa5c554ac2f750c5d6b38e3680a Mon Sep 17 00:00:00 2001
From: Jie Deng <Jie.Deng1@synopsys.com>
Date: Wed, 13 Dec 2017 12:04:12 +0800
Subject: [PATCH 49/76] dwc-xlgmac: Add co-maintainer

Jose Abreu will join to maintain dwc-xlgmac.
He will help with new feature development for
this driver. Thanks Jose and welcome on board!

Signed-off-by: Jie Deng <jiedeng@synopsys.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 9e0045e3ee0c..51497dc05333 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13117,6 +13117,7 @@ F:	drivers/dma/dw/
 
 SYNOPSYS DESIGNWARE ENTERPRISE ETHERNET DRIVER
 M:	Jie Deng <jiedeng@synopsys.com>
+M:	Jose Abreu <Jose.Abreu@synopsys.com>
 L:	netdev@vger.kernel.org
 S:	Supported
 F:	drivers/net/ethernet/synopsys/

From de9c4e06bbe872d725f306e34f3eea21155488e2 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Wed, 13 Dec 2017 09:22:03 +0000
Subject: [PATCH 50/76] net: phy: marvell: avoid configuring fiber page for
 SGMII-to-Copper

When in SGMII-to-Copper mode, the fiber page is used for the MAC facing
link, and does not require configuration of the fiber auto-negotiation
settings.  Avoid trying.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/marvell.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index 4d02b27df044..b5a8f750e433 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -637,6 +637,10 @@ static int m88e1510_config_aneg(struct phy_device *phydev)
 	if (err < 0)
 		goto error;
 
+	/* Do not touch the fiber page if we're in copper->sgmii mode */
+	if (phydev->interface == PHY_INTERFACE_MODE_SGMII)
+		return 0;
+
 	/* Then the fiber link */
 	err = marvell_set_page(phydev, MII_MARVELL_FIBER_PAGE);
 	if (err < 0)

From 78034f5fdd622520eb843301cf35ce6c626543a7 Mon Sep 17 00:00:00 2001
From: Eugenia Emantayev <eugenia@mellanox.com>
Date: Wed, 13 Dec 2017 18:12:09 +0200
Subject: [PATCH 51/76] net/mlx4_en: Fix selftest for small MTUs

Set the minimal MTU threshold for running loopback selftest.
MTU should be big enough to include packet payload, NET_IP_ALIGN,
Ethernet headers and preamble length.

Fixes: e7c1c2c46201 ("mlx4_en: Added self diagnostics test implementation")
Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/en_selftest.c | 2 +-
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h     | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_selftest.c b/drivers/net/ethernet/mellanox/mlx4/en_selftest.c
index 88699b181946..946d9db7c8c2 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_selftest.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_selftest.c
@@ -185,7 +185,7 @@ void mlx4_en_ex_selftest(struct net_device *dev, u32 *flags, u64 *buf)
 		if (priv->mdev->dev->caps.flags &
 					MLX4_DEV_CAP_FLAG_UC_LOOPBACK) {
 			buf[3] = mlx4_en_test_registers(priv);
-			if (priv->port_up)
+			if (priv->port_up && dev->mtu >= MLX4_SELFTEST_LB_MIN_MTU)
 				buf[4] = mlx4_en_test_loopback(priv);
 		}
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 1856e279a7e0..2b72677eccd4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -153,6 +153,9 @@
 #define SMALL_PACKET_SIZE      (256 - NET_IP_ALIGN)
 #define HEADER_COPY_SIZE       (128 - NET_IP_ALIGN)
 #define MLX4_LOOPBACK_TEST_PAYLOAD (HEADER_COPY_SIZE - ETH_HLEN)
+#define PREAMBLE_LEN           8
+#define MLX4_SELFTEST_LB_MIN_MTU (MLX4_LOOPBACK_TEST_PAYLOAD + NET_IP_ALIGN + \
+				  ETH_HLEN + PREAMBLE_LEN)
 
 #define MLX4_EN_MIN_MTU		46
 /* VLAN_HLEN is added twice,to support skb vlan tagged with multiple

From 0bb9fc4f5429ac970181c073aa32e521e20f7b73 Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Wed, 13 Dec 2017 18:12:10 +0200
Subject: [PATCH 52/76] net/mlx4_core: Fix wrong calculation of free counters

The field res_free indicates the total number of counters which are
available for allocation (reserved and unreserved). Fixed a bug where
the reserved counters were subtracted from res_free before any
allocation was performed.

Before this fix, free counters which were not reserved could not be
allocated.

Fixes: 9de92c60beaa ("net/mlx4_core: Adjust counter grant policy in the resource tracker")
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Reviewed-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/resource_tracker.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index 04304dd894c6..606a0e0beeae 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -611,7 +611,6 @@ int mlx4_init_resource_tracker(struct mlx4_dev *dev)
 						MLX4_MAX_PORTS;
 				else
 					res_alloc->guaranteed[t] = 0;
-				res_alloc->res_free -= res_alloc->guaranteed[t];
 				break;
 			default:
 				break;

From 5a1647c391ba543a77a400dddf89053ec5c2b7a4 Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Wed, 13 Dec 2017 18:12:11 +0200
Subject: [PATCH 53/76] net/mlx4_en: Fill all counters under one call of stats
 lock

Before this patch, the stats_lock was acquired twice. In between the
locks Driver sent command to gather some more statistics (per priority
and counter statistics). If the stats lock was acquired by get
statistics NDO in between we would have report out of sync counters.

Fix this by collecting all stats from Firmware in advance and then
fill the Software structs under one lock.

Fixes: 0b131561a7d6 ("net/mlx4_en: Add Flow control statistics display via ethtool")
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/en_port.c | 57 +++++++++++---------
 1 file changed, 31 insertions(+), 26 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_port.c b/drivers/net/ethernet/mellanox/mlx4/en_port.c
index e0eb695318e6..1fa4849a6f56 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_port.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_port.c
@@ -188,7 +188,7 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
 	struct net_device *dev = mdev->pndev[port];
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	struct net_device_stats *stats = &dev->stats;
-	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_cmd_mailbox *mailbox, *mailbox_priority;
 	u64 in_mod = reset << 8 | port;
 	int err;
 	int i, counter_index;
@@ -198,6 +198,13 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
 	mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
 	if (IS_ERR(mailbox))
 		return PTR_ERR(mailbox);
+
+	mailbox_priority = mlx4_alloc_cmd_mailbox(mdev->dev);
+	if (IS_ERR(mailbox_priority)) {
+		mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+		return PTR_ERR(mailbox_priority);
+	}
+
 	err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, in_mod, 0,
 			   MLX4_CMD_DUMP_ETH_STATS, MLX4_CMD_TIME_CLASS_B,
 			   MLX4_CMD_NATIVE);
@@ -206,6 +213,28 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
 
 	mlx4_en_stats = mailbox->buf;
 
+	memset(&tmp_counter_stats, 0, sizeof(tmp_counter_stats));
+	counter_index = mlx4_get_default_counter_index(mdev->dev, port);
+	err = mlx4_get_counter_stats(mdev->dev, counter_index,
+				     &tmp_counter_stats, reset);
+
+	/* 0xffs indicates invalid value */
+	memset(mailbox_priority->buf, 0xff,
+	       sizeof(*flowstats) * MLX4_NUM_PRIORITIES);
+
+	if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN) {
+		memset(mailbox_priority->buf, 0,
+		       sizeof(*flowstats) * MLX4_NUM_PRIORITIES);
+		err = mlx4_cmd_box(mdev->dev, 0, mailbox_priority->dma,
+				   in_mod | MLX4_DUMP_ETH_STATS_FLOW_CONTROL,
+				   0, MLX4_CMD_DUMP_ETH_STATS,
+				   MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+		if (err)
+			goto out;
+	}
+
+	flowstats = mailbox_priority->buf;
+
 	spin_lock_bh(&priv->stats_lock);
 
 	mlx4_en_fold_software_stats(dev);
@@ -345,31 +374,6 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
 	priv->pkstats.tx_prio[8][0] = be64_to_cpu(mlx4_en_stats->TTOT_novlan);
 	priv->pkstats.tx_prio[8][1] = be64_to_cpu(mlx4_en_stats->TOCT_novlan);
 
-	spin_unlock_bh(&priv->stats_lock);
-
-	memset(&tmp_counter_stats, 0, sizeof(tmp_counter_stats));
-	counter_index = mlx4_get_default_counter_index(mdev->dev, port);
-	err = mlx4_get_counter_stats(mdev->dev, counter_index,
-				     &tmp_counter_stats, reset);
-
-	/* 0xffs indicates invalid value */
-	memset(mailbox->buf, 0xff, sizeof(*flowstats) * MLX4_NUM_PRIORITIES);
-
-	if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN) {
-		memset(mailbox->buf, 0,
-		       sizeof(*flowstats) * MLX4_NUM_PRIORITIES);
-		err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma,
-				   in_mod | MLX4_DUMP_ETH_STATS_FLOW_CONTROL,
-				   0, MLX4_CMD_DUMP_ETH_STATS,
-				   MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
-		if (err)
-			goto out;
-	}
-
-	flowstats = mailbox->buf;
-
-	spin_lock_bh(&priv->stats_lock);
-
 	if (tmp_counter_stats.counter_mode == 0) {
 		priv->pf_stats.rx_bytes   = be64_to_cpu(tmp_counter_stats.rx_bytes);
 		priv->pf_stats.tx_bytes   = be64_to_cpu(tmp_counter_stats.tx_bytes);
@@ -410,6 +414,7 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
 
 out:
 	mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+	mlx4_free_cmd_mailbox(mdev->dev, mailbox_priority);
 	return err;
 }
 

From fccff0862838908d21eaf956d57e09c6c189f7c5 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Fri, 15 Dec 2017 08:44:21 +0100
Subject: [PATCH 54/76] mlxsw: spectrum: Disable MAC learning for ovs port

Learning is currently enabled for ports which are OVS slaves -
even though OVS doesn't need this indication.
Since we're not associating a fid with the port, HW would continuously
notify driver of learned [& aged] MACs which would be logged as errors.

Fixes: 2b94e58df58c ("mlxsw: spectrum: Allow ports to work under OVS master")
Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 2d0897b7d860..9bd8d28de152 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -4300,6 +4300,7 @@ static int mlxsw_sp_port_stp_set(struct mlxsw_sp_port *mlxsw_sp_port,
 
 static int mlxsw_sp_port_ovs_join(struct mlxsw_sp_port *mlxsw_sp_port)
 {
+	u16 vid = 1;
 	int err;
 
 	err = mlxsw_sp_port_vp_mode_set(mlxsw_sp_port, true);
@@ -4312,8 +4313,19 @@ static int mlxsw_sp_port_ovs_join(struct mlxsw_sp_port *mlxsw_sp_port)
 				     true, false);
 	if (err)
 		goto err_port_vlan_set;
+
+	for (; vid <= VLAN_N_VID - 1; vid++) {
+		err = mlxsw_sp_port_vid_learning_set(mlxsw_sp_port,
+						     vid, false);
+		if (err)
+			goto err_vid_learning_set;
+	}
+
 	return 0;
 
+err_vid_learning_set:
+	for (vid--; vid >= 1; vid--)
+		mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, true);
 err_port_vlan_set:
 	mlxsw_sp_port_stp_set(mlxsw_sp_port, false);
 err_port_stp_set:
@@ -4323,6 +4335,12 @@ err_port_stp_set:
 
 static void mlxsw_sp_port_ovs_leave(struct mlxsw_sp_port *mlxsw_sp_port)
 {
+	u16 vid;
+
+	for (vid = VLAN_N_VID - 1; vid >= 1; vid--)
+		mlxsw_sp_port_vid_learning_set(mlxsw_sp_port,
+					       vid, true);
+
 	mlxsw_sp_port_vlan_set(mlxsw_sp_port, 2, VLAN_N_VID - 1,
 			       false, false);
 	mlxsw_sp_port_stp_set(mlxsw_sp_port, false);

From 7fbd9493f0eeae8cef58300505a9ef5c8fce6313 Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Date: Wed, 13 Dec 2017 18:56:29 +0100
Subject: [PATCH 55/76] s390/qeth: apply takeover changes when mode is toggled

Just as for an explicit enable/disable, toggling the takeover mode also
requires that the IP addresses get updated. Otherwise all IPs that were
added to the table before the mode-toggle, get registered with the old
settings.

Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/qeth_core.h      |  2 +-
 drivers/s390/net/qeth_core_main.c |  2 +-
 drivers/s390/net/qeth_l3_sys.c    | 35 +++++++++++++++----------------
 3 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index 15015a24f8ad..51c618d9fefe 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -565,7 +565,7 @@ enum qeth_cq {
 };
 
 struct qeth_ipato {
-	int enabled;
+	bool enabled;
 	int invert4;
 	int invert6;
 	struct list_head entries;
diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index 430e3214f7e2..8d18675e60e2 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -1480,7 +1480,7 @@ static int qeth_setup_card(struct qeth_card *card)
 	qeth_set_intial_options(card);
 	/* IP address takeover */
 	INIT_LIST_HEAD(&card->ipato.entries);
-	card->ipato.enabled = 0;
+	card->ipato.enabled = false;
 	card->ipato.invert4 = 0;
 	card->ipato.invert6 = 0;
 	/* init QDIO stuff */
diff --git a/drivers/s390/net/qeth_l3_sys.c b/drivers/s390/net/qeth_l3_sys.c
index bd12fdf678be..198717f71b3d 100644
--- a/drivers/s390/net/qeth_l3_sys.c
+++ b/drivers/s390/net/qeth_l3_sys.c
@@ -372,6 +372,7 @@ static ssize_t qeth_l3_dev_ipato_enable_store(struct device *dev,
 	struct qeth_card *card = dev_get_drvdata(dev);
 	struct qeth_ipaddr *addr;
 	int i, rc = 0;
+	bool enable;
 
 	if (!card)
 		return -EINVAL;
@@ -384,25 +385,23 @@ static ssize_t qeth_l3_dev_ipato_enable_store(struct device *dev,
 	}
 
 	if (sysfs_streq(buf, "toggle")) {
-		card->ipato.enabled = (card->ipato.enabled)? 0 : 1;
-	} else if (sysfs_streq(buf, "1")) {
-		card->ipato.enabled = 1;
-		hash_for_each(card->ip_htable, i, addr, hnode) {
-				if ((addr->type == QETH_IP_TYPE_NORMAL) &&
-				qeth_l3_is_addr_covered_by_ipato(card, addr))
-					addr->set_flags |=
-					QETH_IPA_SETIP_TAKEOVER_FLAG;
-			}
-	} else if (sysfs_streq(buf, "0")) {
-		card->ipato.enabled = 0;
-		hash_for_each(card->ip_htable, i, addr, hnode) {
-			if (addr->set_flags &
-			QETH_IPA_SETIP_TAKEOVER_FLAG)
-				addr->set_flags &=
-				~QETH_IPA_SETIP_TAKEOVER_FLAG;
-			}
-	} else
+		enable = !card->ipato.enabled;
+	} else if (kstrtobool(buf, &enable)) {
 		rc = -EINVAL;
+		goto out;
+	}
+
+	if (card->ipato.enabled == enable)
+		goto out;
+	card->ipato.enabled = enable;
+
+	hash_for_each(card->ip_htable, i, addr, hnode) {
+		if (!enable)
+			addr->set_flags &= ~QETH_IPA_SETIP_TAKEOVER_FLAG;
+		else if (addr->type == QETH_IP_TYPE_NORMAL &&
+			 qeth_l3_is_addr_covered_by_ipato(card, addr))
+			addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG;
+	}
 out:
 	mutex_unlock(&card->conf_mutex);
 	return rc ? rc : count;

From b22d73d6689fd902a66c08ebe71ab2f3b351e22f Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Date: Wed, 13 Dec 2017 18:56:30 +0100
Subject: [PATCH 56/76] s390/qeth: don't apply takeover changes to RXIP

When takeover is switched off, current code clears the 'TAKEOVER' flag on
all IPs. But the flag is also used for RXIP addresses, and those should
not be affected by the takeover mode.
Fix the behaviour by consistenly applying takover logic to NORMAL
addresses only.

Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/qeth_l3_main.c | 5 +++--
 drivers/s390/net/qeth_l3_sys.c  | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index 6a73894b0cb5..4a4be81800eb 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -174,6 +174,8 @@ int qeth_l3_is_addr_covered_by_ipato(struct qeth_card *card,
 
 	if (!card->ipato.enabled)
 		return 0;
+	if (addr->type != QETH_IP_TYPE_NORMAL)
+		return 0;
 
 	qeth_l3_convert_addr_to_bits((u8 *) &addr->u, addr_bits,
 				  (addr->proto == QETH_PROT_IPV4)? 4:16);
@@ -290,8 +292,7 @@ int qeth_l3_add_ip(struct qeth_card *card, struct qeth_ipaddr *tmp_addr)
 		memcpy(addr, tmp_addr, sizeof(struct qeth_ipaddr));
 		addr->ref_counter = 1;
 
-		if (addr->type == QETH_IP_TYPE_NORMAL  &&
-				qeth_l3_is_addr_covered_by_ipato(card, addr)) {
+		if (qeth_l3_is_addr_covered_by_ipato(card, addr)) {
 			QETH_CARD_TEXT(card, 2, "tkovaddr");
 			addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG;
 		}
diff --git a/drivers/s390/net/qeth_l3_sys.c b/drivers/s390/net/qeth_l3_sys.c
index 198717f71b3d..e256928092e5 100644
--- a/drivers/s390/net/qeth_l3_sys.c
+++ b/drivers/s390/net/qeth_l3_sys.c
@@ -396,10 +396,11 @@ static ssize_t qeth_l3_dev_ipato_enable_store(struct device *dev,
 	card->ipato.enabled = enable;
 
 	hash_for_each(card->ip_htable, i, addr, hnode) {
+		if (addr->type != QETH_IP_TYPE_NORMAL)
+			continue;
 		if (!enable)
 			addr->set_flags &= ~QETH_IPA_SETIP_TAKEOVER_FLAG;
-		else if (addr->type == QETH_IP_TYPE_NORMAL &&
-			 qeth_l3_is_addr_covered_by_ipato(card, addr))
+		else if (qeth_l3_is_addr_covered_by_ipato(card, addr))
 			addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG;
 	}
 out:

From 8a03a3692b100d84785ee7a834e9215e304c9e00 Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Date: Wed, 13 Dec 2017 18:56:31 +0100
Subject: [PATCH 57/76] s390/qeth: lock IP table while applying takeover
 changes

Modifying the flags of an IP addr object needs to be protected against
eg. concurrent removal of the same object from the IP table.

Fixes: 5f78e29ceebf ("qeth: optimize IP handling in rx_mode callback")
Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/qeth_l3_sys.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/s390/net/qeth_l3_sys.c b/drivers/s390/net/qeth_l3_sys.c
index e256928092e5..aa676b4090da 100644
--- a/drivers/s390/net/qeth_l3_sys.c
+++ b/drivers/s390/net/qeth_l3_sys.c
@@ -395,6 +395,7 @@ static ssize_t qeth_l3_dev_ipato_enable_store(struct device *dev,
 		goto out;
 	card->ipato.enabled = enable;
 
+	spin_lock_bh(&card->ip_lock);
 	hash_for_each(card->ip_htable, i, addr, hnode) {
 		if (addr->type != QETH_IP_TYPE_NORMAL)
 			continue;
@@ -403,6 +404,7 @@ static ssize_t qeth_l3_dev_ipato_enable_store(struct device *dev,
 		else if (qeth_l3_is_addr_covered_by_ipato(card, addr))
 			addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG;
 	}
+	spin_unlock_bh(&card->ip_lock);
 out:
 	mutex_unlock(&card->conf_mutex);
 	return rc ? rc : count;

From 02f510f326501470348a5df341e8232c3497bbbb Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Date: Wed, 13 Dec 2017 18:56:32 +0100
Subject: [PATCH 58/76] s390/qeth: update takeover IPs after configuration
 change

Any modification to the takeover IP-ranges requires that we re-evaluate
which IP addresses are takeover-eligible. Otherwise we might do takeover
for some addresses when we no longer should, or vice-versa.

Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/qeth_core.h      |  4 +-
 drivers/s390/net/qeth_core_main.c |  4 +-
 drivers/s390/net/qeth_l3.h        |  2 +-
 drivers/s390/net/qeth_l3_main.c   | 31 +++++++++++++--
 drivers/s390/net/qeth_l3_sys.c    | 63 +++++++++++++++++--------------
 5 files changed, 67 insertions(+), 37 deletions(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index 51c618d9fefe..badf42acbf95 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -566,8 +566,8 @@ enum qeth_cq {
 
 struct qeth_ipato {
 	bool enabled;
-	int invert4;
-	int invert6;
+	bool invert4;
+	bool invert6;
 	struct list_head entries;
 };
 
diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index 8d18675e60e2..6c815207f4f5 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -1481,8 +1481,8 @@ static int qeth_setup_card(struct qeth_card *card)
 	/* IP address takeover */
 	INIT_LIST_HEAD(&card->ipato.entries);
 	card->ipato.enabled = false;
-	card->ipato.invert4 = 0;
-	card->ipato.invert6 = 0;
+	card->ipato.invert4 = false;
+	card->ipato.invert6 = false;
 	/* init QDIO stuff */
 	qeth_init_qdio_info(card);
 	INIT_DELAYED_WORK(&card->buffer_reclaim_work, qeth_buffer_reclaim_work);
diff --git a/drivers/s390/net/qeth_l3.h b/drivers/s390/net/qeth_l3.h
index 194ae9b577cc..e5833837b799 100644
--- a/drivers/s390/net/qeth_l3.h
+++ b/drivers/s390/net/qeth_l3.h
@@ -82,7 +82,7 @@ void qeth_l3_del_vipa(struct qeth_card *, enum qeth_prot_versions, const u8 *);
 int qeth_l3_add_rxip(struct qeth_card *, enum qeth_prot_versions, const u8 *);
 void qeth_l3_del_rxip(struct qeth_card *card, enum qeth_prot_versions,
 			const u8 *);
-int qeth_l3_is_addr_covered_by_ipato(struct qeth_card *, struct qeth_ipaddr *);
+void qeth_l3_update_ipato(struct qeth_card *card);
 struct qeth_ipaddr *qeth_l3_get_addr_buffer(enum qeth_prot_versions);
 int qeth_l3_add_ip(struct qeth_card *, struct qeth_ipaddr *);
 int qeth_l3_delete_ip(struct qeth_card *, struct qeth_ipaddr *);
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index 4a4be81800eb..ef0961e18686 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -164,8 +164,8 @@ static void qeth_l3_convert_addr_to_bits(u8 *addr, u8 *bits, int len)
 	}
 }
 
-int qeth_l3_is_addr_covered_by_ipato(struct qeth_card *card,
-						struct qeth_ipaddr *addr)
+static bool qeth_l3_is_addr_covered_by_ipato(struct qeth_card *card,
+					     struct qeth_ipaddr *addr)
 {
 	struct qeth_ipato_entry *ipatoe;
 	u8 addr_bits[128] = {0, };
@@ -606,6 +606,27 @@ int qeth_l3_setrouting_v6(struct qeth_card *card)
 /*
  * IP address takeover related functions
  */
+
+/**
+ * qeth_l3_update_ipato() - Update 'takeover' property, for all NORMAL IPs.
+ *
+ * Caller must hold ip_lock.
+ */
+void qeth_l3_update_ipato(struct qeth_card *card)
+{
+	struct qeth_ipaddr *addr;
+	unsigned int i;
+
+	hash_for_each(card->ip_htable, i, addr, hnode) {
+		if (addr->type != QETH_IP_TYPE_NORMAL)
+			continue;
+		if (qeth_l3_is_addr_covered_by_ipato(card, addr))
+			addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG;
+		else
+			addr->set_flags &= ~QETH_IPA_SETIP_TAKEOVER_FLAG;
+	}
+}
+
 static void qeth_l3_clear_ipato_list(struct qeth_card *card)
 {
 	struct qeth_ipato_entry *ipatoe, *tmp;
@@ -617,6 +638,7 @@ static void qeth_l3_clear_ipato_list(struct qeth_card *card)
 		kfree(ipatoe);
 	}
 
+	qeth_l3_update_ipato(card);
 	spin_unlock_bh(&card->ip_lock);
 }
 
@@ -641,8 +663,10 @@ int qeth_l3_add_ipato_entry(struct qeth_card *card,
 		}
 	}
 
-	if (!rc)
+	if (!rc) {
 		list_add_tail(&new->entry, &card->ipato.entries);
+		qeth_l3_update_ipato(card);
+	}
 
 	spin_unlock_bh(&card->ip_lock);
 
@@ -665,6 +689,7 @@ void qeth_l3_del_ipato_entry(struct qeth_card *card,
 			    (proto == QETH_PROT_IPV4)? 4:16) &&
 		    (ipatoe->mask_bits == mask_bits)) {
 			list_del(&ipatoe->entry);
+			qeth_l3_update_ipato(card);
 			kfree(ipatoe);
 		}
 	}
diff --git a/drivers/s390/net/qeth_l3_sys.c b/drivers/s390/net/qeth_l3_sys.c
index aa676b4090da..6ea2b528a64e 100644
--- a/drivers/s390/net/qeth_l3_sys.c
+++ b/drivers/s390/net/qeth_l3_sys.c
@@ -370,9 +370,8 @@ static ssize_t qeth_l3_dev_ipato_enable_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t count)
 {
 	struct qeth_card *card = dev_get_drvdata(dev);
-	struct qeth_ipaddr *addr;
-	int i, rc = 0;
 	bool enable;
+	int rc = 0;
 
 	if (!card)
 		return -EINVAL;
@@ -391,20 +390,12 @@ static ssize_t qeth_l3_dev_ipato_enable_store(struct device *dev,
 		goto out;
 	}
 
-	if (card->ipato.enabled == enable)
-		goto out;
-	card->ipato.enabled = enable;
-
-	spin_lock_bh(&card->ip_lock);
-	hash_for_each(card->ip_htable, i, addr, hnode) {
-		if (addr->type != QETH_IP_TYPE_NORMAL)
-			continue;
-		if (!enable)
-			addr->set_flags &= ~QETH_IPA_SETIP_TAKEOVER_FLAG;
-		else if (qeth_l3_is_addr_covered_by_ipato(card, addr))
-			addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG;
+	if (card->ipato.enabled != enable) {
+		card->ipato.enabled = enable;
+		spin_lock_bh(&card->ip_lock);
+		qeth_l3_update_ipato(card);
+		spin_unlock_bh(&card->ip_lock);
 	}
-	spin_unlock_bh(&card->ip_lock);
 out:
 	mutex_unlock(&card->conf_mutex);
 	return rc ? rc : count;
@@ -430,20 +421,27 @@ static ssize_t qeth_l3_dev_ipato_invert4_store(struct device *dev,
 				const char *buf, size_t count)
 {
 	struct qeth_card *card = dev_get_drvdata(dev);
+	bool invert;
 	int rc = 0;
 
 	if (!card)
 		return -EINVAL;
 
 	mutex_lock(&card->conf_mutex);
-	if (sysfs_streq(buf, "toggle"))
-		card->ipato.invert4 = (card->ipato.invert4)? 0 : 1;
-	else if (sysfs_streq(buf, "1"))
-		card->ipato.invert4 = 1;
-	else if (sysfs_streq(buf, "0"))
-		card->ipato.invert4 = 0;
-	else
+	if (sysfs_streq(buf, "toggle")) {
+		invert = !card->ipato.invert4;
+	} else if (kstrtobool(buf, &invert)) {
 		rc = -EINVAL;
+		goto out;
+	}
+
+	if (card->ipato.invert4 != invert) {
+		card->ipato.invert4 = invert;
+		spin_lock_bh(&card->ip_lock);
+		qeth_l3_update_ipato(card);
+		spin_unlock_bh(&card->ip_lock);
+	}
+out:
 	mutex_unlock(&card->conf_mutex);
 	return rc ? rc : count;
 }
@@ -609,20 +607,27 @@ static ssize_t qeth_l3_dev_ipato_invert6_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t count)
 {
 	struct qeth_card *card = dev_get_drvdata(dev);
+	bool invert;
 	int rc = 0;
 
 	if (!card)
 		return -EINVAL;
 
 	mutex_lock(&card->conf_mutex);
-	if (sysfs_streq(buf, "toggle"))
-		card->ipato.invert6 = (card->ipato.invert6)? 0 : 1;
-	else if (sysfs_streq(buf, "1"))
-		card->ipato.invert6 = 1;
-	else if (sysfs_streq(buf, "0"))
-		card->ipato.invert6 = 0;
-	else
+	if (sysfs_streq(buf, "toggle")) {
+		invert = !card->ipato.invert6;
+	} else if (kstrtobool(buf, &invert)) {
 		rc = -EINVAL;
+		goto out;
+	}
+
+	if (card->ipato.invert6 != invert) {
+		card->ipato.invert6 = invert;
+		spin_lock_bh(&card->ip_lock);
+		qeth_l3_update_ipato(card);
+		spin_unlock_bh(&card->ip_lock);
+	}
+out:
 	mutex_unlock(&card->conf_mutex);
 	return rc ? rc : count;
 }

From 35b99dffc3f710cafceee6c8c6ac6a98eb2cb4bf Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 13 Dec 2017 14:41:06 -0500
Subject: [PATCH 59/76] sock: free skb in skb_complete_tx_timestamp on error

skb_complete_tx_timestamp must ingest the skb it is passed. Call
kfree_skb if the skb cannot be enqueued.

Fixes: b245be1f4db1 ("net-timestamp: no-payload only sysctl")
Fixes: 9ac25fc06375 ("net: fix socket refcounting in skb_complete_tx_timestamp()")
Reported-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6b0ff396fa9d..a592ca025fc4 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4293,7 +4293,7 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
 	struct sock *sk = skb->sk;
 
 	if (!skb_may_tx_timestamp(sk, false))
-		return;
+		goto err;
 
 	/* Take a reference to prevent skb_orphan() from freeing the socket,
 	 * but only if the socket refcount is not zero.
@@ -4302,7 +4302,11 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
 		*skb_hwtstamps(skb) = *hwtstamps;
 		__skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
 		sock_put(sk);
+		return;
 	}
+
+err:
+	kfree_skb(skb);
 }
 EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
 

From e4d02ca04c6d48ab2226342a1c4ed54f1dbb72bd Mon Sep 17 00:00:00 2001
From: Igor Russkikh <igor.russkikh@aquantia.com>
Date: Thu, 14 Dec 2017 12:34:40 +0300
Subject: [PATCH 60/76] net: aquantia: Fix actual speed capabilities reporting

Different hardware device Ids correspond to different maximum speed
available. Extra checks were added for devices D108 and D109 to
remove unsupported speeds from these device capabilities list.

Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/aquantia/atlantic/aq_hw.h      |  4 +++-
 drivers/net/ethernet/aquantia/atlantic/aq_nic.c     |  7 ++++---
 drivers/net/ethernet/aquantia/atlantic/aq_nic.h     |  2 +-
 .../net/ethernet/aquantia/atlantic/aq_pci_func.c    |  5 +++--
 .../ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c   | 13 ++++++++++++-
 .../ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c   | 13 ++++++++++++-
 6 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_hw.h b/drivers/net/ethernet/aquantia/atlantic/aq_hw.h
index 0207927dc8a6..4ebd53b3c7da 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_hw.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_hw.h
@@ -85,7 +85,9 @@ struct aq_hw_ops {
 	void (*destroy)(struct aq_hw_s *self);
 
 	int (*get_hw_caps)(struct aq_hw_s *self,
-			   struct aq_hw_caps_s *aq_hw_caps);
+			   struct aq_hw_caps_s *aq_hw_caps,
+			   unsigned short device,
+			   unsigned short subsystem_device);
 
 	int (*hw_ring_tx_xmit)(struct aq_hw_s *self, struct aq_ring_s *aq_ring,
 			       unsigned int frags);
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
index 78dfb2ab78ce..a360ccc298b9 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
@@ -222,7 +222,7 @@ static struct net_device *aq_nic_ndev_alloc(void)
 
 struct aq_nic_s *aq_nic_alloc_cold(const struct net_device_ops *ndev_ops,
 				   const struct ethtool_ops *et_ops,
-				   struct device *dev,
+				   struct pci_dev *pdev,
 				   struct aq_pci_func_s *aq_pci_func,
 				   unsigned int port,
 				   const struct aq_hw_ops *aq_hw_ops)
@@ -242,7 +242,7 @@ struct aq_nic_s *aq_nic_alloc_cold(const struct net_device_ops *ndev_ops,
 	ndev->netdev_ops = ndev_ops;
 	ndev->ethtool_ops = et_ops;
 
-	SET_NETDEV_DEV(ndev, dev);
+	SET_NETDEV_DEV(ndev, &pdev->dev);
 
 	ndev->if_port = port;
 	self->ndev = ndev;
@@ -254,7 +254,8 @@ struct aq_nic_s *aq_nic_alloc_cold(const struct net_device_ops *ndev_ops,
 
 	self->aq_hw = self->aq_hw_ops.create(aq_pci_func, self->port,
 						&self->aq_hw_ops);
-	err = self->aq_hw_ops.get_hw_caps(self->aq_hw, &self->aq_hw_caps);
+	err = self->aq_hw_ops.get_hw_caps(self->aq_hw, &self->aq_hw_caps,
+					  pdev->device, pdev->subsystem_device);
 	if (err < 0)
 		goto err_exit;
 
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
index 4309983acdd6..3c9f8db03d5f 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
@@ -71,7 +71,7 @@ struct aq_nic_cfg_s {
 
 struct aq_nic_s *aq_nic_alloc_cold(const struct net_device_ops *ndev_ops,
 				   const struct ethtool_ops *et_ops,
-				   struct device *dev,
+				   struct pci_dev *pdev,
 				   struct aq_pci_func_s *aq_pci_func,
 				   unsigned int port,
 				   const struct aq_hw_ops *aq_hw_ops);
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c
index cadaa646c89f..58c29d04b186 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c
@@ -51,7 +51,8 @@ struct aq_pci_func_s *aq_pci_func_alloc(struct aq_hw_ops *aq_hw_ops,
 	pci_set_drvdata(pdev, self);
 	self->pdev = pdev;
 
-	err = aq_hw_ops->get_hw_caps(NULL, &self->aq_hw_caps);
+	err = aq_hw_ops->get_hw_caps(NULL, &self->aq_hw_caps, pdev->device,
+				     pdev->subsystem_device);
 	if (err < 0)
 		goto err_exit;
 
@@ -59,7 +60,7 @@ struct aq_pci_func_s *aq_pci_func_alloc(struct aq_hw_ops *aq_hw_ops,
 
 	for (port = 0; port < self->ports; ++port) {
 		struct aq_nic_s *aq_nic = aq_nic_alloc_cold(ndev_ops, eth_ops,
-							    &pdev->dev, self,
+							    pdev, self,
 							    port, aq_hw_ops);
 
 		if (!aq_nic) {
diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c
index 07b3c49a16a4..b0abd187cead 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c
@@ -18,9 +18,20 @@
 #include "hw_atl_a0_internal.h"
 
 static int hw_atl_a0_get_hw_caps(struct aq_hw_s *self,
-				 struct aq_hw_caps_s *aq_hw_caps)
+				 struct aq_hw_caps_s *aq_hw_caps,
+				 unsigned short device,
+				 unsigned short subsystem_device)
 {
 	memcpy(aq_hw_caps, &hw_atl_a0_hw_caps_, sizeof(*aq_hw_caps));
+
+	if (device == HW_ATL_DEVICE_ID_D108 && subsystem_device == 0x0001)
+		aq_hw_caps->link_speed_msk &= ~HW_ATL_A0_RATE_10G;
+
+	if (device == HW_ATL_DEVICE_ID_D109 && subsystem_device == 0x0001) {
+		aq_hw_caps->link_speed_msk &= ~HW_ATL_A0_RATE_10G;
+		aq_hw_caps->link_speed_msk &= ~HW_ATL_A0_RATE_5G;
+	}
+
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c
index ec68c20efcbd..e4e3b8e2d67e 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c
@@ -18,9 +18,20 @@
 #include "hw_atl_b0_internal.h"
 
 static int hw_atl_b0_get_hw_caps(struct aq_hw_s *self,
-				 struct aq_hw_caps_s *aq_hw_caps)
+				 struct aq_hw_caps_s *aq_hw_caps,
+				 unsigned short device,
+				 unsigned short subsystem_device)
 {
 	memcpy(aq_hw_caps, &hw_atl_b0_hw_caps_, sizeof(*aq_hw_caps));
+
+	if (device == HW_ATL_DEVICE_ID_D108 && subsystem_device == 0x0001)
+		aq_hw_caps->link_speed_msk &= ~HW_ATL_B0_RATE_10G;
+
+	if (device == HW_ATL_DEVICE_ID_D109 && subsystem_device == 0x0001) {
+		aq_hw_caps->link_speed_msk &= ~HW_ATL_B0_RATE_10G;
+		aq_hw_caps->link_speed_msk &= ~HW_ATL_B0_RATE_5G;
+	}
+
 	return 0;
 }
 

From 1e366161510f266516107a69db91f1f2edaea11c Mon Sep 17 00:00:00 2001
From: Igor Russkikh <igor.russkikh@aquantia.com>
Date: Thu, 14 Dec 2017 12:34:41 +0300
Subject: [PATCH 61/76] net: aquantia: Fix hardware DMA stream overload on
 large MRRS

Systems with large MRRS on device (2K, 4K) with high data rates and/or
large MTU, atlantic observes DMA packet buffer overflow. On some systems
that causes PCIe transaction errors, hardware NMIs or datapath freeze.
This patch
1) Limits MRRS from device side to 2K (thats maximum our hardware supports)
2) Limit maximum size of outstanding TX DMA data read requests. This makes
hardware buffers running fine.

Signed-off-by: Pavel Belous <pavel.belous@aquantia.com>
Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c    | 12 ++++++++++++
 .../aquantia/atlantic/hw_atl/hw_atl_llh_internal.h   |  6 ++++++
 2 files changed, 18 insertions(+)

diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c
index e4e3b8e2d67e..36fddb199160 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c
@@ -16,6 +16,7 @@
 #include "hw_atl_utils.h"
 #include "hw_atl_llh.h"
 #include "hw_atl_b0_internal.h"
+#include "hw_atl_llh_internal.h"
 
 static int hw_atl_b0_get_hw_caps(struct aq_hw_s *self,
 				 struct aq_hw_caps_s *aq_hw_caps,
@@ -368,6 +369,7 @@ static int hw_atl_b0_hw_init(struct aq_hw_s *self,
 	};
 
 	int err = 0;
+	u32 val;
 
 	self->aq_nic_cfg = aq_nic_cfg;
 
@@ -385,6 +387,16 @@ static int hw_atl_b0_hw_init(struct aq_hw_s *self,
 	hw_atl_b0_hw_rss_set(self, &aq_nic_cfg->aq_rss);
 	hw_atl_b0_hw_rss_hash_set(self, &aq_nic_cfg->aq_rss);
 
+	/* Force limit MRRS on RDM/TDM to 2K */
+	val = aq_hw_read_reg(self, pci_reg_control6_adr);
+	aq_hw_write_reg(self, pci_reg_control6_adr, (val & ~0x707) | 0x404);
+
+	/* TX DMA total request limit. B0 hardware is not capable to
+	 * handle more than (8K-MRRS) incoming DMA data.
+	 * Value 24 in 256byte units
+	 */
+	aq_hw_write_reg(self, tx_dma_total_req_limit_adr, 24);
+
 	err = aq_hw_err_from_flags(self);
 	if (err < 0)
 		goto err_exit;
diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h
index 5527fc0e5942..93450ec930e8 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h
@@ -2343,6 +2343,9 @@
 #define tx_dma_desc_base_addrmsw_adr(descriptor) \
 			(0x00007c04u + (descriptor) * 0x40)
 
+/* tx dma total request limit */
+#define tx_dma_total_req_limit_adr 0x00007b20u
+
 /* tx interrupt moderation control register definitions
  * Preprocessor definitions for TX Interrupt Moderation Control Register
  * Base Address: 0x00008980
@@ -2369,6 +2372,9 @@
 /* default value of bitfield reg_res_dsbl */
 #define pci_reg_res_dsbl_default 0x1
 
+/* PCI core control register */
+#define pci_reg_control6_adr 0x1014u
+
 /* global microprocessor scratch pad definitions */
 #define glb_cpu_scratch_scp_adr(scratch_scp) (0x00000300u + (scratch_scp) * 0x4)
 

From be08d839d9ef1c9b0e4ed809ec852ff100f9970d Mon Sep 17 00:00:00 2001
From: Igor Russkikh <igor.russkikh@aquantia.com>
Date: Thu, 14 Dec 2017 12:34:42 +0300
Subject: [PATCH 62/76] net: aquantia: Extend stat counters to 64bit values

Device hardware provides only 32bit counters. Using these directly
causes byte counters to overflow soon. A separate nic level structure
with 64 bit counters is now used to collect incrementally all the stats
and report these counters to ethtool stats and ndev stats.

Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/aquantia/atlantic/aq_hw.h    | 25 +++++-
 .../net/ethernet/aquantia/atlantic/aq_nic.c   | 35 +++++++--
 .../aquantia/atlantic/hw_atl/hw_atl_utils.c   | 76 ++++++-------------
 .../aquantia/atlantic/hw_atl/hw_atl_utils.h   |  6 +-
 4 files changed, 77 insertions(+), 65 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_hw.h b/drivers/net/ethernet/aquantia/atlantic/aq_hw.h
index 4ebd53b3c7da..b3825de6cdfb 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_hw.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_hw.h
@@ -46,6 +46,28 @@ struct aq_hw_link_status_s {
 	unsigned int mbps;
 };
 
+struct aq_stats_s {
+	u64 uprc;
+	u64 mprc;
+	u64 bprc;
+	u64 erpt;
+	u64 uptc;
+	u64 mptc;
+	u64 bptc;
+	u64 erpr;
+	u64 mbtc;
+	u64 bbtc;
+	u64 mbrc;
+	u64 bbrc;
+	u64 ubrc;
+	u64 ubtc;
+	u64 dpc;
+	u64 dma_pkt_rc;
+	u64 dma_pkt_tc;
+	u64 dma_oct_rc;
+	u64 dma_oct_tc;
+};
+
 #define AQ_HW_IRQ_INVALID 0U
 #define AQ_HW_IRQ_LEGACY  1U
 #define AQ_HW_IRQ_MSI     2U
@@ -166,8 +188,7 @@ struct aq_hw_ops {
 
 	int (*hw_update_stats)(struct aq_hw_s *self);
 
-	int (*hw_get_hw_stats)(struct aq_hw_s *self, u64 *data,
-			       unsigned int *p_count);
+	struct aq_stats_s *(*hw_get_hw_stats)(struct aq_hw_s *self);
 
 	int (*hw_get_fw_version)(struct aq_hw_s *self, u32 *fw_version);
 
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
index a360ccc298b9..28cbe9d43df6 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
@@ -750,16 +750,40 @@ int aq_nic_get_regs_count(struct aq_nic_s *self)
 
 void aq_nic_get_stats(struct aq_nic_s *self, u64 *data)
 {
-	struct aq_vec_s *aq_vec = NULL;
 	unsigned int i = 0U;
 	unsigned int count = 0U;
-	int err = 0;
+	struct aq_vec_s *aq_vec = NULL;
+	struct aq_stats_s *stats = self->aq_hw_ops.hw_get_hw_stats(self->aq_hw);
 
-	err = self->aq_hw_ops.hw_get_hw_stats(self->aq_hw, data, &count);
-	if (err < 0)
+	if (!stats)
 		goto err_exit;
 
-	data += count;
+	data[i] = stats->uprc + stats->mprc + stats->bprc;
+	data[++i] = stats->uprc;
+	data[++i] = stats->mprc;
+	data[++i] = stats->bprc;
+	data[++i] = stats->erpt;
+	data[++i] = stats->uptc + stats->mptc + stats->bptc;
+	data[++i] = stats->uptc;
+	data[++i] = stats->mptc;
+	data[++i] = stats->bptc;
+	data[++i] = stats->ubrc;
+	data[++i] = stats->ubtc;
+	data[++i] = stats->mbrc;
+	data[++i] = stats->mbtc;
+	data[++i] = stats->bbrc;
+	data[++i] = stats->bbtc;
+	data[++i] = stats->ubrc + stats->mbrc + stats->bbrc;
+	data[++i] = stats->ubtc + stats->mbtc + stats->bbtc;
+	data[++i] = stats->dma_pkt_rc;
+	data[++i] = stats->dma_pkt_tc;
+	data[++i] = stats->dma_oct_rc;
+	data[++i] = stats->dma_oct_tc;
+	data[++i] = stats->dpc;
+
+	i++;
+
+	data += i;
 	count = 0U;
 
 	for (i = 0U, aq_vec = self->aq_vec[0];
@@ -769,7 +793,6 @@ void aq_nic_get_stats(struct aq_nic_s *self, u64 *data)
 	}
 
 err_exit:;
-	(void)err;
 }
 
 void aq_nic_get_link_ksettings(struct aq_nic_s *self,
diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c
index 1fe016fc4bc7..f2ce12ed4218 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c
@@ -503,73 +503,43 @@ int hw_atl_utils_update_stats(struct aq_hw_s *self)
 	struct hw_atl_s *hw_self = PHAL_ATLANTIC;
 	struct hw_aq_atl_utils_mbox mbox;
 
-	if (!self->aq_link_status.mbps)
-		return 0;
-
 	hw_atl_utils_mpi_read_stats(self, &mbox);
 
 #define AQ_SDELTA(_N_) (hw_self->curr_stats._N_ += \
 			mbox.stats._N_ - hw_self->last_stats._N_)
+	if (self->aq_link_status.mbps) {
+		AQ_SDELTA(uprc);
+		AQ_SDELTA(mprc);
+		AQ_SDELTA(bprc);
+		AQ_SDELTA(erpt);
 
-	AQ_SDELTA(uprc);
-	AQ_SDELTA(mprc);
-	AQ_SDELTA(bprc);
-	AQ_SDELTA(erpt);
-
-	AQ_SDELTA(uptc);
-	AQ_SDELTA(mptc);
-	AQ_SDELTA(bptc);
-	AQ_SDELTA(erpr);
-
-	AQ_SDELTA(ubrc);
-	AQ_SDELTA(ubtc);
-	AQ_SDELTA(mbrc);
-	AQ_SDELTA(mbtc);
-	AQ_SDELTA(bbrc);
-	AQ_SDELTA(bbtc);
-	AQ_SDELTA(dpc);
+		AQ_SDELTA(uptc);
+		AQ_SDELTA(mptc);
+		AQ_SDELTA(bptc);
+		AQ_SDELTA(erpr);
 
+		AQ_SDELTA(ubrc);
+		AQ_SDELTA(ubtc);
+		AQ_SDELTA(mbrc);
+		AQ_SDELTA(mbtc);
+		AQ_SDELTA(bbrc);
+		AQ_SDELTA(bbtc);
+		AQ_SDELTA(dpc);
+	}
 #undef AQ_SDELTA
+	hw_self->curr_stats.dma_pkt_rc = stats_rx_dma_good_pkt_counterlsw_get(self);
+	hw_self->curr_stats.dma_pkt_tc = stats_tx_dma_good_pkt_counterlsw_get(self);
+	hw_self->curr_stats.dma_oct_rc = stats_rx_dma_good_octet_counterlsw_get(self);
+	hw_self->curr_stats.dma_oct_tc = stats_tx_dma_good_octet_counterlsw_get(self);
 
 	memcpy(&hw_self->last_stats, &mbox.stats, sizeof(mbox.stats));
 
 	return 0;
 }
 
-int hw_atl_utils_get_hw_stats(struct aq_hw_s *self,
-			      u64 *data, unsigned int *p_count)
+struct aq_stats_s *hw_atl_utils_get_hw_stats(struct aq_hw_s *self)
 {
-	struct hw_atl_s *hw_self = PHAL_ATLANTIC;
-	struct hw_atl_stats_s *stats = &hw_self->curr_stats;
-	int i = 0;
-
-	data[i] = stats->uprc + stats->mprc + stats->bprc;
-	data[++i] = stats->uprc;
-	data[++i] = stats->mprc;
-	data[++i] = stats->bprc;
-	data[++i] = stats->erpt;
-	data[++i] = stats->uptc + stats->mptc + stats->bptc;
-	data[++i] = stats->uptc;
-	data[++i] = stats->mptc;
-	data[++i] = stats->bptc;
-	data[++i] = stats->ubrc;
-	data[++i] = stats->ubtc;
-	data[++i] = stats->mbrc;
-	data[++i] = stats->mbtc;
-	data[++i] = stats->bbrc;
-	data[++i] = stats->bbtc;
-	data[++i] = stats->ubrc + stats->mbrc + stats->bbrc;
-	data[++i] = stats->ubtc + stats->mbtc + stats->bbtc;
-	data[++i] = stats_rx_dma_good_pkt_counterlsw_get(self);
-	data[++i] = stats_tx_dma_good_pkt_counterlsw_get(self);
-	data[++i] = stats_rx_dma_good_octet_counterlsw_get(self);
-	data[++i] = stats_tx_dma_good_octet_counterlsw_get(self);
-	data[++i] = stats->dpc;
-
-	if (p_count)
-		*p_count = ++i;
-
-	return 0;
+	return &PHAL_ATLANTIC->curr_stats;
 }
 
 static const u32 hw_atl_utils_hw_mac_regs[] = {
diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h
index c99cc690e425..21aeca6908d3 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h
@@ -129,7 +129,7 @@ struct __packed hw_aq_atl_utils_mbox {
 struct __packed hw_atl_s {
 	struct aq_hw_s base;
 	struct hw_atl_stats_s last_stats;
-	struct hw_atl_stats_s curr_stats;
+	struct aq_stats_s curr_stats;
 	u64 speed;
 	unsigned int chip_features;
 	u32 fw_ver_actual;
@@ -207,8 +207,6 @@ int hw_atl_utils_get_fw_version(struct aq_hw_s *self, u32 *fw_version);
 
 int hw_atl_utils_update_stats(struct aq_hw_s *self);
 
-int hw_atl_utils_get_hw_stats(struct aq_hw_s *self,
-			      u64 *data,
-			      unsigned int *p_count);
+struct aq_stats_s *hw_atl_utils_get_hw_stats(struct aq_hw_s *self);
 
 #endif /* HW_ATL_UTILS_H */

From 9f8a2203a542f5f3cdeb17f40250c49bb87aa7e3 Mon Sep 17 00:00:00 2001
From: Igor Russkikh <igor.russkikh@aquantia.com>
Date: Thu, 14 Dec 2017 12:34:43 +0300
Subject: [PATCH 63/76] net: aquantia: Fill ndev stat couters from hardware

Originally they were filled from ring sw counters.
These sometimes incorrectly calculate byte and packet amounts
when using LRO/LSO and jumboframes. Filling ndev counters from
hardware makes them precise.

Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/aquantia/atlantic/aq_nic.c   | 32 +++++++++----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
index 28cbe9d43df6..307caac68731 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
@@ -37,6 +37,8 @@ static unsigned int aq_itr_rx;
 module_param_named(aq_itr_rx, aq_itr_rx, uint, 0644);
 MODULE_PARM_DESC(aq_itr_rx, "RX interrupt throttle rate");
 
+static void aq_nic_update_ndev_stats(struct aq_nic_s *self);
+
 static void aq_nic_rss_init(struct aq_nic_s *self, unsigned int num_rss_queues)
 {
 	struct aq_nic_cfg_s *cfg = &self->aq_nic_cfg;
@@ -166,11 +168,7 @@ static int aq_nic_update_link_status(struct aq_nic_s *self)
 static void aq_nic_service_timer_cb(struct timer_list *t)
 {
 	struct aq_nic_s *self = from_timer(self, t, service_timer);
-	struct net_device *ndev = aq_nic_get_ndev(self);
 	int err = 0;
-	unsigned int i = 0U;
-	struct aq_ring_stats_rx_s stats_rx;
-	struct aq_ring_stats_tx_s stats_tx;
 
 	if (aq_utils_obj_test(&self->header.flags, AQ_NIC_FLAGS_IS_NOT_READY))
 		goto err_exit;
@@ -182,19 +180,8 @@ static void aq_nic_service_timer_cb(struct timer_list *t)
 	if (self->aq_hw_ops.hw_update_stats)
 		self->aq_hw_ops.hw_update_stats(self->aq_hw);
 
-	memset(&stats_rx, 0U, sizeof(struct aq_ring_stats_rx_s));
-	memset(&stats_tx, 0U, sizeof(struct aq_ring_stats_tx_s));
-	for (i = AQ_DIMOF(self->aq_vec); i--;) {
-		if (self->aq_vec[i])
-			aq_vec_add_stats(self->aq_vec[i], &stats_rx, &stats_tx);
-	}
+	aq_nic_update_ndev_stats(self);
 
-	ndev->stats.rx_packets = stats_rx.packets;
-	ndev->stats.rx_bytes = stats_rx.bytes;
-	ndev->stats.rx_errors = stats_rx.errors;
-	ndev->stats.tx_packets = stats_tx.packets;
-	ndev->stats.tx_bytes = stats_tx.bytes;
-	ndev->stats.tx_errors = stats_tx.errors;
 
 err_exit:
 	mod_timer(&self->service_timer,
@@ -795,6 +782,19 @@ void aq_nic_get_stats(struct aq_nic_s *self, u64 *data)
 err_exit:;
 }
 
+static void aq_nic_update_ndev_stats(struct aq_nic_s *self)
+{
+	struct net_device *ndev = self->ndev;
+	struct aq_stats_s *stats = self->aq_hw_ops.hw_get_hw_stats(self->aq_hw);
+
+	ndev->stats.rx_packets = stats->uprc + stats->mprc + stats->bprc;
+	ndev->stats.rx_bytes = stats->ubrc + stats->mbrc + stats->bbrc;
+	ndev->stats.rx_errors = stats->erpr;
+	ndev->stats.tx_packets = stats->uptc + stats->mptc + stats->bptc;
+	ndev->stats.tx_bytes = stats->ubtc + stats->mbtc + stats->bbtc;
+	ndev->stats.tx_errors = stats->erpt;
+}
+
 void aq_nic_get_link_ksettings(struct aq_nic_s *self,
 			       struct ethtool_link_ksettings *cmd)
 {

From 45cc1c7ad47c4d166d15c7bce449d2de4daef0c5 Mon Sep 17 00:00:00 2001
From: Igor Russkikh <igor.russkikh@aquantia.com>
Date: Thu, 14 Dec 2017 12:34:44 +0300
Subject: [PATCH 64/76] net: aquantia: Fill in multicast counter in ndev stats
 from hardware

This metric comes from HW and is also diff-calculated, like other counters

Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/aquantia/atlantic/aq_nic.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
index 307caac68731..b3a5d1fbc713 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
@@ -793,6 +793,7 @@ static void aq_nic_update_ndev_stats(struct aq_nic_s *self)
 	ndev->stats.tx_packets = stats->uptc + stats->mptc + stats->bptc;
 	ndev->stats.tx_bytes = stats->ubtc + stats->mbtc + stats->bbtc;
 	ndev->stats.tx_errors = stats->erpt;
+	ndev->stats.multicast = stats->mprc;
 }
 
 void aq_nic_get_link_ksettings(struct aq_nic_s *self,

From fdb4a0830e74acfbe84d4d4e6772ea09c96786ad Mon Sep 17 00:00:00 2001
From: Igor Russkikh <igor.russkikh@aquantia.com>
Date: Thu, 14 Dec 2017 12:34:45 +0300
Subject: [PATCH 65/76] net: aquantia: Improve link state and statistics check
 interval callback

Reduce timeout from 2 secs to 1 sec. If link is down,
reduce it to 500msec. This speeds up link detection.

Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/aquantia/atlantic/aq_cfg.h | 2 +-
 drivers/net/ethernet/aquantia/atlantic/aq_nic.c | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
index 57e796870595..73b93a7b4800 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
@@ -50,7 +50,7 @@
 #define AQ_CFG_PCI_FUNC_MSIX_IRQS   9U
 #define AQ_CFG_PCI_FUNC_PORTS       2U
 
-#define AQ_CFG_SERVICE_TIMER_INTERVAL    (2 * HZ)
+#define AQ_CFG_SERVICE_TIMER_INTERVAL    (1 * HZ)
 #define AQ_CFG_POLLING_TIMER_INTERVAL   ((unsigned int)(2 * HZ))
 
 #define AQ_CFG_SKB_FRAGS_MAX   32U
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
index b3a5d1fbc713..75a894a9251c 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
@@ -168,6 +168,7 @@ static int aq_nic_update_link_status(struct aq_nic_s *self)
 static void aq_nic_service_timer_cb(struct timer_list *t)
 {
 	struct aq_nic_s *self = from_timer(self, t, service_timer);
+	int ctimer = AQ_CFG_SERVICE_TIMER_INTERVAL;
 	int err = 0;
 
 	if (aq_utils_obj_test(&self->header.flags, AQ_NIC_FLAGS_IS_NOT_READY))
@@ -182,10 +183,12 @@ static void aq_nic_service_timer_cb(struct timer_list *t)
 
 	aq_nic_update_ndev_stats(self);
 
+	/* If no link - use faster timer rate to detect link up asap */
+	if (!netif_carrier_ok(self->ndev))
+		ctimer = max(ctimer / 2, 1);
 
 err_exit:
-	mod_timer(&self->service_timer,
-		  jiffies + AQ_CFG_SERVICE_TIMER_INTERVAL);
+	mod_timer(&self->service_timer, jiffies + ctimer);
 }
 
 static void aq_nic_polling_timer_cb(struct timer_list *t)

From f3e2778429c2ad8555e888858e0f0e98c86c4b0f Mon Sep 17 00:00:00 2001
From: Igor Russkikh <igor.russkikh@aquantia.com>
Date: Thu, 14 Dec 2017 12:34:46 +0300
Subject: [PATCH 66/76] net: aquantia: Update hw counters on hw init

On very first start we should read out current HW counter values
to make diff based calculations later.
This also should be done each time NIC gets down/up or wakes up
after sleep state. We reset link state explicitly to prevent diffs
from being summed this first time.

Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c | 4 ++++
 drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c
index b0abd187cead..f18dce14c93c 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c
@@ -344,6 +344,10 @@ static int hw_atl_a0_hw_init(struct aq_hw_s *self,
 	hw_atl_a0_hw_rss_set(self, &aq_nic_cfg->aq_rss);
 	hw_atl_a0_hw_rss_hash_set(self, &aq_nic_cfg->aq_rss);
 
+	/* Reset link status and read out initial hardware counters */
+	self->aq_link_status.mbps = 0;
+	hw_atl_utils_update_stats(self);
+
 	err = aq_hw_err_from_flags(self);
 	if (err < 0)
 		goto err_exit;
diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c
index 36fddb199160..e4a22ce7bf09 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c
@@ -397,6 +397,10 @@ static int hw_atl_b0_hw_init(struct aq_hw_s *self,
 	 */
 	aq_hw_write_reg(self, tx_dma_total_req_limit_adr, 24);
 
+	/* Reset link status and read out initial hardware counters */
+	self->aq_link_status.mbps = 0;
+	hw_atl_utils_update_stats(self);
+
 	err = aq_hw_err_from_flags(self);
 	if (err < 0)
 		goto err_exit;

From 98bc036de40489416d61ab175bb417c094e7783c Mon Sep 17 00:00:00 2001
From: Igor Russkikh <igor.russkikh@aquantia.com>
Date: Thu, 14 Dec 2017 12:34:47 +0300
Subject: [PATCH 67/76] net: aquantia: Fix typo in ethtool statistics names

Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/aquantia/atlantic/aq_ethtool.c  | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
index 70efb7467bf3..f2d8063a2cef 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
@@ -66,14 +66,14 @@ static const char aq_ethtool_stat_names[][ETH_GSTRING_LEN] = {
 	"OutUCast",
 	"OutMCast",
 	"OutBCast",
-	"InUCastOctects",
-	"OutUCastOctects",
-	"InMCastOctects",
-	"OutMCastOctects",
-	"InBCastOctects",
-	"OutBCastOctects",
-	"InOctects",
-	"OutOctects",
+	"InUCastOctets",
+	"OutUCastOctets",
+	"InMCastOctets",
+	"OutMCastOctets",
+	"InBCastOctets",
+	"OutBCastOctets",
+	"InOctets",
+	"OutOctets",
 	"InPacketsDma",
 	"OutPacketsDma",
 	"InOctetsDma",

From d4c242d4ba5730b62579969804cd8fcf58b9c84f Mon Sep 17 00:00:00 2001
From: Igor Russkikh <igor.russkikh@aquantia.com>
Date: Thu, 14 Dec 2017 12:34:48 +0300
Subject: [PATCH 68/76] net: aquantia: Increment driver version

Add a suffix to distinguish kernel mainline version and aquantia releases

Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/aquantia/atlantic/aq_cfg.h | 3 ++-
 drivers/net/ethernet/aquantia/atlantic/ver.h    | 6 ++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
index 73b93a7b4800..105fdb958cef 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
@@ -80,6 +80,7 @@
 #define AQ_CFG_DRV_VERSION	__stringify(NIC_MAJOR_DRIVER_VERSION)"."\
 				__stringify(NIC_MINOR_DRIVER_VERSION)"."\
 				__stringify(NIC_BUILD_DRIVER_VERSION)"."\
-				__stringify(NIC_REVISION_DRIVER_VERSION)
+				__stringify(NIC_REVISION_DRIVER_VERSION) \
+				AQ_CFG_DRV_VERSION_SUFFIX
 
 #endif /* AQ_CFG_H */
diff --git a/drivers/net/ethernet/aquantia/atlantic/ver.h b/drivers/net/ethernet/aquantia/atlantic/ver.h
index 0de858d215c2..9009f2651e70 100644
--- a/drivers/net/ethernet/aquantia/atlantic/ver.h
+++ b/drivers/net/ethernet/aquantia/atlantic/ver.h
@@ -11,8 +11,10 @@
 #define VER_H
 
 #define NIC_MAJOR_DRIVER_VERSION           1
-#define NIC_MINOR_DRIVER_VERSION           5
-#define NIC_BUILD_DRIVER_VERSION           345
+#define NIC_MINOR_DRIVER_VERSION           6
+#define NIC_BUILD_DRIVER_VERSION           13
 #define NIC_REVISION_DRIVER_VERSION        0
 
+#define AQ_CFG_DRV_VERSION_SUFFIX "-kern"
+
 #endif /* VER_H */

From 7a4fa29106d9a38ef005f5ab15d493c259f269c0 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Thu, 14 Dec 2017 15:54:29 +0200
Subject: [PATCH 69/76] net: sched: Add TCA_HW_OFFLOAD

Qdiscs can be offloaded to HW, but current implementation isn't uniform.
Instead, qdiscs either pass information about offload status via their
TCA_OPTIONS or omit it altogether.

Introduce a new attribute - TCA_HW_OFFLOAD that would form a uniform
uAPI for the offloading status of qdiscs.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h      | 1 +
 include/uapi/linux/rtnetlink.h | 1 +
 net/sched/sch_api.c            | 2 ++
 3 files changed, 4 insertions(+)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 65d0d25f2648..83a3e47d5845 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -71,6 +71,7 @@ struct Qdisc {
 				      * qdisc_tree_decrease_qlen() should stop.
 				      */
 #define TCQ_F_INVISIBLE		0x80 /* invisible by default in dump */
+#define TCQ_F_OFFLOADED		0x200 /* qdisc is offloaded to HW */
 	u32			limit;
 	const struct Qdisc_ops	*ops;
 	struct qdisc_size_table	__rcu *stab;
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index d8b5f80c2ea6..843e29aa3cac 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -557,6 +557,7 @@ enum {
 	TCA_PAD,
 	TCA_DUMP_INVISIBLE,
 	TCA_CHAIN,
+	TCA_HW_OFFLOAD,
 	__TCA_MAX
 };
 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b6c4f536876b..0f1eab99ff4e 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -795,6 +795,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 	tcm->tcm_info = refcount_read(&q->refcnt);
 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
 		goto nla_put_failure;
+	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
+		goto nla_put_failure;
 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
 		goto nla_put_failure;
 	qlen = q->q.qlen;

From 428a68af3a7c3a3380ff1f750a24d213f370f89f Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Thu, 14 Dec 2017 15:54:30 +0200
Subject: [PATCH 70/76] net: sched: Move to new offload indication in RED

Let RED utilize the new internal flag, TCQ_F_OFFLOADED,
to mark a given qdisc as offloaded instead of using a dedicated
indication.

Also, change internal logic into looking at said flag when possible.

Fixes: 602f3baf2218 ("net_sch: red: Add offload ability to RED qdisc")
Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_red.c | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 9d874e60e032..f0747eb87dc4 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -157,6 +157,7 @@ static int red_offload(struct Qdisc *sch, bool enable)
 		.handle = sch->handle,
 		.parent = sch->parent,
 	};
+	int err;
 
 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
 		return -EOPNOTSUPP;
@@ -171,7 +172,14 @@ static int red_offload(struct Qdisc *sch, bool enable)
 		opt.command = TC_RED_DESTROY;
 	}
 
-	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, &opt);
+	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, &opt);
+
+	if (!err && enable)
+		sch->flags |= TCQ_F_OFFLOADED;
+	else
+		sch->flags &= ~TCQ_F_OFFLOADED;
+
+	return err;
 }
 
 static void red_destroy(struct Qdisc *sch)
@@ -274,7 +282,7 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt)
 	return red_change(sch, opt);
 }
 
-static int red_dump_offload(struct Qdisc *sch, struct tc_red_qopt *opt)
+static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt)
 {
 	struct net_device *dev = qdisc_dev(sch);
 	struct tc_red_qopt_offload hw_stats = {
@@ -286,21 +294,12 @@ static int red_dump_offload(struct Qdisc *sch, struct tc_red_qopt *opt)
 			.stats.qstats = &sch->qstats,
 		},
 	};
-	int err;
 
-	opt->flags &= ~TC_RED_OFFLOADED;
-	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+	if (!(sch->flags & TCQ_F_OFFLOADED))
 		return 0;
 
-	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED,
-					    &hw_stats);
-	if (err == -EOPNOTSUPP)
-		return 0;
-
-	if (!err)
-		opt->flags |= TC_RED_OFFLOADED;
-
-	return err;
+	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED,
+					     &hw_stats);
 }
 
 static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -319,7 +318,7 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
 	int err;
 
 	sch->qstats.backlog = q->qdisc->qstats.backlog;
-	err = red_dump_offload(sch, &opt);
+	err = red_dump_offload_stats(sch, &opt);
 	if (err)
 		goto nla_put_failure;
 
@@ -347,7 +346,7 @@ static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 		.marked	= q->stats.prob_mark + q->stats.forced_mark,
 	};
 
-	if (tc_can_offload(dev) &&  dev->netdev_ops->ndo_setup_tc) {
+	if (sch->flags & TCQ_F_OFFLOADED) {
 		struct red_stats hw_stats = {0};
 		struct tc_red_qopt_offload hw_stats_request = {
 			.command = TC_RED_XSTATS,

From 4a98795bc8ea148b1ebbbf001283e06430cffe36 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Thu, 14 Dec 2017 15:54:31 +0200
Subject: [PATCH 71/76] pkt_sched: Remove TC_RED_OFFLOADED from uapi

Following the previous patch, RED is now using the new uniform uapi
for indicating it's offloaded. As a result, TC_RED_OFFLOADED is no
longer utilized by kernel and can be removed [as it's still not
part of any stable release].

Fixes: 602f3baf2218 ("net_sch: red: Add offload ability to RED qdisc")
Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index af3cc2f4e1ad..37b5096ae97b 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -256,7 +256,6 @@ struct tc_red_qopt {
 #define TC_RED_ECN		1
 #define TC_RED_HARDDROP		2
 #define TC_RED_ADAPTATIVE	4
-#define TC_RED_OFFLOADED	8
 };
 
 struct tc_red_xstats {

From c647c0d62c82eb3ddf78a0d8b3d58819d9f552aa Mon Sep 17 00:00:00 2001
From: Daniele Palmas <dnlplm@gmail.com>
Date: Thu, 14 Dec 2017 16:56:14 +0100
Subject: [PATCH 72/76] net: usb: qmi_wwan: add Telit ME910 PID 0x1101 support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds support for Telit ME910 PID 0x1101.

Signed-off-by: Daniele Palmas <dnlplm@gmail.com>
Acked-by: Bjørn Mork <bjorn@mork.no>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/qmi_wwan.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index d2ca5a202e8d..3000ddd1c7e2 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -1211,6 +1211,7 @@ static const struct usb_device_id products[] = {
 	{QMI_FIXED_INTF(0x2357, 0x9000, 4)},	/* TP-LINK MA260 */
 	{QMI_QUIRK_SET_DTR(0x1bc7, 0x1040, 2)},	/* Telit LE922A */
 	{QMI_FIXED_INTF(0x1bc7, 0x1100, 3)},	/* Telit ME910 */
+	{QMI_FIXED_INTF(0x1bc7, 0x1101, 3)},	/* Telit ME910 dual modem */
 	{QMI_FIXED_INTF(0x1bc7, 0x1200, 5)},	/* Telit LE920 */
 	{QMI_QUIRK_SET_DTR(0x1bc7, 0x1201, 2)},	/* Telit LE920, LE920A4 */
 	{QMI_FIXED_INTF(0x1c9e, 0x9801, 3)},	/* Telewell TW-3G HSPA+ */

From c05fad5713b81b049ec6ac4eb2d304030b1efdce Mon Sep 17 00:00:00 2001
From: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Date: Fri, 15 Dec 2017 10:46:16 +0800
Subject: [PATCH 73/76] ip_gre: fix wrong return value of erspan_rcv

If pskb_may_pull return failed, return PACKET_REJECT instead of -ENOMEM.

Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN")
Cc: William Tu <u9012063@gmail.com>
Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Acked-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index bb6239169b1a..9c1735632c8c 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -266,7 +266,7 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 	len = gre_hdr_len + sizeof(*ershdr);
 
 	if (unlikely(!pskb_may_pull(skb, len)))
-		return -ENOMEM;
+		return PACKET_REJECT;
 
 	iph = ip_hdr(skb);
 	ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len);

From 343723dd51ef1025a860e54df9472b5ba21ee3d9 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Fri, 15 Dec 2017 12:40:12 +0100
Subject: [PATCH 74/76] net: sched: fix clsact init error path

Since in qdisc_create, the destroy op is called when init fails, we
don't do cleanup in init and leave it up to destroy.
This fixes use-after-free when trying to put already freed block.

Fixes: 6e40cf2d4dee ("net: sched: use extended variants of block_get/put in ingress and clsact qdiscs")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c     | 4 ++--
 net/sched/sch_ingress.c | 6 +-----
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index f40256a3e7f0..b91ea03e3afa 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -351,6 +351,8 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
 {
 	struct tcf_chain *chain;
 
+	if (!block)
+		return;
 	/* Hold a refcnt for all chains, except 0, so that they don't disappear
 	 * while we are iterating.
 	 */
@@ -377,8 +379,6 @@ void tcf_block_put(struct tcf_block *block)
 {
 	struct tcf_block_ext_info ei = {0, };
 
-	if (!block)
-		return;
 	tcf_block_put_ext(block, block->q, &ei);
 }
 
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index 5ecc38f35d47..5e1cd2e5df87 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -190,7 +190,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt)
 
 	err = tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info);
 	if (err)
-		goto err_egress_block_get;
+		return err;
 
 	net_inc_ingress_queue();
 	net_inc_egress_queue();
@@ -198,10 +198,6 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt)
 	sch->flags |= TCQ_F_CPUSTATS;
 
 	return 0;
-
-err_egress_block_get:
-	tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info);
-	return err;
 }
 
 static void clsact_destroy(struct Qdisc *sch)

From b59e6979a86384e68b0ab6ffeab11f0034fba82d Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Fri, 15 Dec 2017 12:40:13 +0100
Subject: [PATCH 75/76] net: sched: fix static key imbalance in case of
 ingress/clsact_init error

Move static key increments to the beginning of the init function
so they pair 1:1 with decrements in ingress/clsact_destroy,
which is called in case ingress/clsact_init fails.

Fixes: 6529eaba33f0 ("net: sched: introduce tcf block infractructure")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_ingress.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index 5e1cd2e5df87..fc1286f499c1 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -68,6 +68,8 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt)
 	struct net_device *dev = qdisc_dev(sch);
 	int err;
 
+	net_inc_ingress_queue();
+
 	mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress);
 
 	q->block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
@@ -78,7 +80,6 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt)
 	if (err)
 		return err;
 
-	net_inc_ingress_queue();
 	sch->flags |= TCQ_F_CPUSTATS;
 
 	return 0;
@@ -172,6 +173,9 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt)
 	struct net_device *dev = qdisc_dev(sch);
 	int err;
 
+	net_inc_ingress_queue();
+	net_inc_egress_queue();
+
 	mini_qdisc_pair_init(&q->miniqp_ingress, sch, &dev->miniq_ingress);
 
 	q->ingress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
@@ -192,9 +196,6 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt)
 	if (err)
 		return err;
 
-	net_inc_ingress_queue();
-	net_inc_egress_queue();
-
 	sch->flags |= TCQ_F_CPUSTATS;
 
 	return 0;

From 043ee1debd0b29c16c4c4b11a348ca667bfe9144 Mon Sep 17 00:00:00 2001
From: Hemanth Puranik <hpuranik@codeaurora.org>
Date: Fri, 15 Dec 2017 20:05:58 +0530
Subject: [PATCH 76/76] net: qcom/emac: Reduce timeout for mdio read/write

Currently mdio read/write takes around ~115us as the timeout
between status check is set to 100us.
By reducing the timeout to 1us mdio read/write takes ~15us to
complete. This improves the link up event response.

Signed-off-by: Hemanth Puranik <hpuranik@codeaurora.org>
Acked-by: Timur Tabi <timur@codeaurora.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/emac/emac-phy.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/emac/emac-phy.c b/drivers/net/ethernet/qualcomm/emac/emac-phy.c
index 18461fcb9815..53dbf1e163a8 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac-phy.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac-phy.c
@@ -47,6 +47,7 @@
 #define MDIO_CLK_25_28                                               7
 
 #define MDIO_WAIT_TIMES                                           1000
+#define MDIO_STATUS_DELAY_TIME                                       1
 
 static int emac_mdio_read(struct mii_bus *bus, int addr, int regnum)
 {
@@ -65,7 +66,7 @@ static int emac_mdio_read(struct mii_bus *bus, int addr, int regnum)
 
 	if (readl_poll_timeout(adpt->base + EMAC_MDIO_CTRL, reg,
 			       !(reg & (MDIO_START | MDIO_BUSY)),
-			       100, MDIO_WAIT_TIMES * 100))
+			       MDIO_STATUS_DELAY_TIME, MDIO_WAIT_TIMES * 100))
 		return -EIO;
 
 	return (reg >> MDIO_DATA_SHFT) & MDIO_DATA_BMSK;
@@ -88,8 +89,8 @@ static int emac_mdio_write(struct mii_bus *bus, int addr, int regnum, u16 val)
 	writel(reg, adpt->base + EMAC_MDIO_CTRL);
 
 	if (readl_poll_timeout(adpt->base + EMAC_MDIO_CTRL, reg,
-			       !(reg & (MDIO_START | MDIO_BUSY)), 100,
-			       MDIO_WAIT_TIMES * 100))
+			       !(reg & (MDIO_START | MDIO_BUSY)),
+			       MDIO_STATUS_DELAY_TIME, MDIO_WAIT_TIMES * 100))
 		return -EIO;
 
 	return 0;