From 07dc8bc9a6b15f54d3ad962af74a096c7d7b42b4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 7 Nov 2017 10:08:01 +0000 Subject: [PATCH 001/269] netfilter: remove redundant assignment to e The assignment to variable e is redundant since the same assignment occurs just a few lines later, hence it can be removed. Cleans up clang warning for arp_tables, ip_tables and ip6_tables: warning: Value stored to 'e' is never read Signed-off-by: Colin Ian King Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 1 - net/ipv4/netfilter/ip_tables.c | 1 - net/ipv6/netfilter/ip6_tables.c | 1 - 3 files changed, 3 deletions(-) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index f88221aebc9d..0c3c944a7b72 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -373,7 +373,6 @@ static int mark_source_chains(const struct xt_table_info *newinfo, if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; - e = entry0 + newpos; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 4cbe5e80f3bf..2e0d339028bb 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -439,7 +439,6 @@ mark_source_chains(const struct xt_table_info *newinfo, if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; - e = entry0 + newpos; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index f06e25065a34..1d7ae9366335 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -458,7 +458,6 @@ mark_source_chains(const struct xt_table_info *newinfo, if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; - e = entry0 + newpos; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; From 613d0776d3fe7eb28c695a63a5533a1ec8258c86 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sun, 12 Nov 2017 14:32:37 +0300 Subject: [PATCH 002/269] netfilter: exit_net cleanup check added Be sure that lists initialized in net_init hook was return to initial state. Signed-off-by: Vasily Averin Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 1 + net/netfilter/nf_tables_api.c | 7 +++++++ net/netfilter/nfnetlink_log.c | 5 +++++ net/netfilter/nfnetlink_queue.c | 5 +++++ net/netfilter/x_tables.c | 9 +++++++++ 5 files changed, 27 insertions(+) diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 17b4ca562944..e35b8d074f06 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -819,6 +819,7 @@ static void clusterip_net_exit(struct net *net) cn->procdir = NULL; #endif nf_unregister_net_hook(net, &cip_arp_ops); + WARN_ON_ONCE(!list_empty(&cn->configs)); } static struct pernet_operations clusterip_net_ops = { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d8327b43e4dc..10798b357481 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5847,6 +5847,12 @@ static int __net_init nf_tables_init_net(struct net *net) return 0; } +static void __net_exit nf_tables_exit_net(struct net *net) +{ + WARN_ON_ONCE(!list_empty(&net->nft.af_info)); + WARN_ON_ONCE(!list_empty(&net->nft.commit_list)); +} + int __nft_release_basechain(struct nft_ctx *ctx) { struct nft_rule *rule, *nr; @@ -5917,6 +5923,7 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi) static struct pernet_operations nf_tables_net_ops = { .init = nf_tables_init_net, + .exit = nf_tables_exit_net, }; static int __init nf_tables_module_init(void) diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index cad6498f10b0..1f511ed0fea3 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -1093,10 +1093,15 @@ static int __net_init nfnl_log_net_init(struct net *net) static void __net_exit nfnl_log_net_exit(struct net *net) { + struct nfnl_log_net *log = nfnl_log_pernet(net); + unsigned int i; + #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter); #endif nf_log_unset(net, &nfulnl_logger); + for (i = 0; i < INSTANCE_BUCKETS; i++) + WARN_ON_ONCE(!hlist_empty(&log->instance_table[i])); } static struct pernet_operations nfnl_log_net_ops = { diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index a16356cacec3..c09b36755ed7 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1512,10 +1512,15 @@ static int __net_init nfnl_queue_net_init(struct net *net) static void __net_exit nfnl_queue_net_exit(struct net *net) { + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + unsigned int i; + nf_unregister_queue_handler(net); #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); #endif + for (i = 0; i < INSTANCE_BUCKETS; i++) + WARN_ON_ONCE(!hlist_empty(&q->instance_table[i])); } static void nfnl_queue_net_exit_batch(struct list_head *net_exit_list) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index a77dd514297c..55802e97f906 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1729,8 +1729,17 @@ static int __net_init xt_net_init(struct net *net) return 0; } +static void __net_exit xt_net_exit(struct net *net) +{ + int i; + + for (i = 0; i < NFPROTO_NUMPROTO; i++) + WARN_ON_ONCE(!list_empty(&net->xt.tables[i])); +} + static struct pernet_operations xt_net_ops = { .init = xt_net_init, + .exit = xt_net_exit, }; static int __init xt_init(void) From bc7d811ace4ad39a3941089ca871633366878719 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Mon, 13 Nov 2017 09:09:40 +0100 Subject: [PATCH 003/269] netfilter: nf_ct_h323: Convert CHECK_BOUND macro to function It is bad practive to return in a macro, this patch moves the check into a function. Signed-off-by: Eric Sesterhenn Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_h323_asn1.c | 94 ++++++++++++++++++-------- 1 file changed, 65 insertions(+), 29 deletions(-) diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index cf1bf2605c10..3d9a009ac147 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -103,7 +103,6 @@ struct bitstr { #define INC_BIT(bs) if((++(bs)->bit)>7){(bs)->cur++;(bs)->bit=0;} #define INC_BITS(bs,b) if(((bs)->bit+=(b))>7){(bs)->cur+=(bs)->bit>>3;(bs)->bit&=7;} #define BYTE_ALIGN(bs) if((bs)->bit){(bs)->cur++;(bs)->bit=0;} -#define CHECK_BOUND(bs,n) if((bs)->cur+(n)>(bs)->end)return(H323_ERROR_BOUND) static unsigned int get_len(struct bitstr *bs); static unsigned int get_bit(struct bitstr *bs); static unsigned int get_bits(struct bitstr *bs, unsigned int b); @@ -165,6 +164,14 @@ static unsigned int get_len(struct bitstr *bs) return v; } +static int nf_h323_error_boundary(struct bitstr *bs, size_t bytes) +{ + if (*bs->cur + bytes > *bs->end) + return 1; + + return 0; +} + /****************************************************************************/ static unsigned int get_bit(struct bitstr *bs) { @@ -280,7 +287,8 @@ static int decode_bool(struct bitstr *bs, const struct field_t *f, INC_BIT(bs); - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -293,11 +301,14 @@ static int decode_oid(struct bitstr *bs, const struct field_t *f, PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); BYTE_ALIGN(bs); - CHECK_BOUND(bs, 1); + if (nf_h323_error_boundary(bs, 1)) + return H323_ERROR_BOUND; + len = *bs->cur++; bs->cur += len; + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; - CHECK_BOUND(bs, 0); return H323_ERROR_NONE; } @@ -330,7 +341,8 @@ static int decode_int(struct bitstr *bs, const struct field_t *f, break; case UNCO: BYTE_ALIGN(bs); - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; len = get_len(bs); bs->cur += len; break; @@ -341,7 +353,8 @@ static int decode_int(struct bitstr *bs, const struct field_t *f, PRINT("\n"); - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -357,7 +370,8 @@ static int decode_enum(struct bitstr *bs, const struct field_t *f, INC_BITS(bs, f->sz); } - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -375,12 +389,14 @@ static int decode_bitstr(struct bitstr *bs, const struct field_t *f, len = f->lb; break; case WORD: /* 2-byte length */ - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; len = (*bs->cur++) << 8; len += (*bs->cur++) + f->lb; break; case SEMI: - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; len = get_len(bs); break; default: @@ -391,7 +407,8 @@ static int decode_bitstr(struct bitstr *bs, const struct field_t *f, bs->cur += len >> 3; bs->bit = len & 7; - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -409,7 +426,8 @@ static int decode_numstr(struct bitstr *bs, const struct field_t *f, BYTE_ALIGN(bs); INC_BITS(bs, (len << 2)); - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -440,12 +458,14 @@ static int decode_octstr(struct bitstr *bs, const struct field_t *f, break; case BYTE: /* Range == 256 */ BYTE_ALIGN(bs); - CHECK_BOUND(bs, 1); + if (nf_h323_error_boundary(bs, 1)) + return H323_ERROR_BOUND; len = (*bs->cur++) + f->lb; break; case SEMI: BYTE_ALIGN(bs); - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; len = get_len(bs) + f->lb; break; default: /* 2 <= Range <= 255 */ @@ -458,7 +478,8 @@ static int decode_octstr(struct bitstr *bs, const struct field_t *f, PRINT("\n"); - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -473,7 +494,8 @@ static int decode_bmpstr(struct bitstr *bs, const struct field_t *f, switch (f->sz) { case BYTE: /* Range == 256 */ BYTE_ALIGN(bs); - CHECK_BOUND(bs, 1); + if (nf_h323_error_boundary(bs, 1)) + return H323_ERROR_BOUND; len = (*bs->cur++) + f->lb; break; default: /* 2 <= Range <= 255 */ @@ -484,7 +506,8 @@ static int decode_bmpstr(struct bitstr *bs, const struct field_t *f, bs->cur += len << 1; - CHECK_BOUND(bs, 0); + if (nf_h323_error_boundary(bs, 0)) + return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -525,9 +548,11 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, /* Decode */ if (son->attr & OPEN) { /* Open field */ - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len)) + return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", son->name); @@ -556,7 +581,8 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, /* Get the extension bitmap */ bmp2_len = get_bits(bs, 7) + 1; - CHECK_BOUND(bs, (bmp2_len + 7) >> 3); + if (nf_h323_error_boundary(bs, (bmp2_len + 7) >> 3)) + return H323_ERROR_BOUND; bmp2 = get_bitmap(bs, bmp2_len); bmp |= bmp2 >> f->sz; if (base) @@ -567,9 +593,11 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, for (opt = 0; opt < bmp2_len; opt++, i++, son++) { /* Check Range */ if (i >= f->ub) { /* Newer Version? */ - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len)) + return H323_ERROR_BOUND; bs->cur += len; continue; } @@ -583,9 +611,11 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, if (!((0x80000000 >> opt) & bmp2)) /* Not present */ continue; - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len)) + return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", son->name); @@ -623,19 +653,22 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f, switch (f->sz) { case BYTE: BYTE_ALIGN(bs); - CHECK_BOUND(bs, 1); + if (nf_h323_error_boundary(bs, 1)) + return H323_ERROR_BOUND; count = *bs->cur++; break; case WORD: BYTE_ALIGN(bs); - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; count = *bs->cur++; count <<= 8; count += *bs->cur++; break; case SEMI: BYTE_ALIGN(bs); - CHECK_BOUND(bs, 2); + if (nf_h323_error_boundary(bs, 2)) + return H323_ERROR_BOUND; count = get_len(bs); break; default: @@ -659,7 +692,8 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f, if (son->attr & OPEN) { BYTE_ALIGN(bs); len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len)) + return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", son->name); @@ -728,7 +762,8 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f, if (type >= f->ub) { /* Newer version? */ BYTE_ALIGN(bs); len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len)) + return H323_ERROR_BOUND; bs->cur += len; return H323_ERROR_NONE; } @@ -743,7 +778,8 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f, if (ext || (son->attr & OPEN)) { BYTE_ALIGN(bs); len = get_len(bs); - CHECK_BOUND(bs, len); + if (nf_h323_error_boundary(bs, len)) + return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", son->name); From ec8a8f3c31ddef0a7d9626c4b8a4baa30f3b80aa Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Mon, 13 Nov 2017 09:09:41 +0100 Subject: [PATCH 004/269] netfilter: nf_ct_h323: Extend nf_h323_error_boundary to work on bits as well This patch fixes several out of bounds memory reads by extending the nf_h323_error_boundary() function to work on bits as well an check the affected parts. Signed-off-by: Eric Sesterhenn Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_h323_asn1.c | 92 +++++++++++++++++--------- 1 file changed, 62 insertions(+), 30 deletions(-) diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index 3d9a009ac147..dc6347342e34 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -164,8 +164,13 @@ static unsigned int get_len(struct bitstr *bs) return v; } -static int nf_h323_error_boundary(struct bitstr *bs, size_t bytes) +static int nf_h323_error_boundary(struct bitstr *bs, size_t bytes, size_t bits) { + bits += bs->bit; + bytes += bits / BITS_PER_BYTE; + if (bits % BITS_PER_BYTE > 0) + bytes++; + if (*bs->cur + bytes > *bs->end) return 1; @@ -286,8 +291,7 @@ static int decode_bool(struct bitstr *bs, const struct field_t *f, PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); INC_BIT(bs); - - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -301,12 +305,12 @@ static int decode_oid(struct bitstr *bs, const struct field_t *f, PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 1)) + if (nf_h323_error_boundary(bs, 1, 0)) return H323_ERROR_BOUND; len = *bs->cur++; bs->cur += len; - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; @@ -330,6 +334,8 @@ static int decode_int(struct bitstr *bs, const struct field_t *f, bs->cur += 2; break; case CONS: /* 64K < Range < 4G */ + if (nf_h323_error_boundary(bs, 0, 2)) + return H323_ERROR_BOUND; len = get_bits(bs, 2) + 1; BYTE_ALIGN(bs); if (base && (f->attr & DECODE)) { /* timeToLive */ @@ -341,7 +347,7 @@ static int decode_int(struct bitstr *bs, const struct field_t *f, break; case UNCO: BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; len = get_len(bs); bs->cur += len; @@ -353,7 +359,7 @@ static int decode_int(struct bitstr *bs, const struct field_t *f, PRINT("\n"); - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -370,7 +376,7 @@ static int decode_enum(struct bitstr *bs, const struct field_t *f, INC_BITS(bs, f->sz); } - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -389,13 +395,13 @@ static int decode_bitstr(struct bitstr *bs, const struct field_t *f, len = f->lb; break; case WORD: /* 2-byte length */ - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; len = (*bs->cur++) << 8; len += (*bs->cur++) + f->lb; break; case SEMI: - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; len = get_len(bs); break; @@ -407,7 +413,7 @@ static int decode_bitstr(struct bitstr *bs, const struct field_t *f, bs->cur += len >> 3; bs->bit = len & 7; - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -421,12 +427,14 @@ static int decode_numstr(struct bitstr *bs, const struct field_t *f, PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); /* 2 <= Range <= 255 */ + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; len = get_bits(bs, f->sz) + f->lb; BYTE_ALIGN(bs); INC_BITS(bs, (len << 2)); - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -458,17 +466,19 @@ static int decode_octstr(struct bitstr *bs, const struct field_t *f, break; case BYTE: /* Range == 256 */ BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 1)) + if (nf_h323_error_boundary(bs, 1, 0)) return H323_ERROR_BOUND; len = (*bs->cur++) + f->lb; break; case SEMI: BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; len = get_len(bs) + f->lb; break; default: /* 2 <= Range <= 255 */ + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; len = get_bits(bs, f->sz) + f->lb; BYTE_ALIGN(bs); break; @@ -478,7 +488,7 @@ static int decode_octstr(struct bitstr *bs, const struct field_t *f, PRINT("\n"); - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -494,11 +504,13 @@ static int decode_bmpstr(struct bitstr *bs, const struct field_t *f, switch (f->sz) { case BYTE: /* Range == 256 */ BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 1)) + if (nf_h323_error_boundary(bs, 1, 0)) return H323_ERROR_BOUND; len = (*bs->cur++) + f->lb; break; default: /* 2 <= Range <= 255 */ + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; len = get_bits(bs, f->sz) + f->lb; BYTE_ALIGN(bs); break; @@ -506,7 +518,7 @@ static int decode_bmpstr(struct bitstr *bs, const struct field_t *f, bs->cur += len << 1; - if (nf_h323_error_boundary(bs, 0)) + if (nf_h323_error_boundary(bs, 0, 0)) return H323_ERROR_BOUND; return H323_ERROR_NONE; } @@ -526,9 +538,13 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, base = (base && (f->attr & DECODE)) ? base + f->offset : NULL; /* Extensible? */ + if (nf_h323_error_boundary(bs, 0, 1)) + return H323_ERROR_BOUND; ext = (f->attr & EXT) ? get_bit(bs) : 0; /* Get fields bitmap */ + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; bmp = get_bitmap(bs, f->sz); if (base) *(unsigned int *)base = bmp; @@ -548,10 +564,10 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, /* Decode */ if (son->attr & OPEN) { /* Open field */ - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; len = get_len(bs); - if (nf_h323_error_boundary(bs, len)) + if (nf_h323_error_boundary(bs, len, 0)) return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, @@ -580,8 +596,10 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, return H323_ERROR_NONE; /* Get the extension bitmap */ + if (nf_h323_error_boundary(bs, 0, 7)) + return H323_ERROR_BOUND; bmp2_len = get_bits(bs, 7) + 1; - if (nf_h323_error_boundary(bs, (bmp2_len + 7) >> 3)) + if (nf_h323_error_boundary(bs, 0, bmp2_len)) return H323_ERROR_BOUND; bmp2 = get_bitmap(bs, bmp2_len); bmp |= bmp2 >> f->sz; @@ -593,10 +611,10 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, for (opt = 0; opt < bmp2_len; opt++, i++, son++) { /* Check Range */ if (i >= f->ub) { /* Newer Version? */ - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; len = get_len(bs); - if (nf_h323_error_boundary(bs, len)) + if (nf_h323_error_boundary(bs, len, 0)) return H323_ERROR_BOUND; bs->cur += len; continue; @@ -611,10 +629,10 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f, if (!((0x80000000 >> opt) & bmp2)) /* Not present */ continue; - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; len = get_len(bs); - if (nf_h323_error_boundary(bs, len)) + if (nf_h323_error_boundary(bs, len, 0)) return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", @@ -653,13 +671,13 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f, switch (f->sz) { case BYTE: BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 1)) + if (nf_h323_error_boundary(bs, 1, 0)) return H323_ERROR_BOUND; count = *bs->cur++; break; case WORD: BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; count = *bs->cur++; count <<= 8; @@ -667,11 +685,13 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f, break; case SEMI: BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, 2)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; count = get_len(bs); break; default: + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; count = get_bits(bs, f->sz); break; } @@ -691,8 +711,10 @@ static int decode_seqof(struct bitstr *bs, const struct field_t *f, for (i = 0; i < count; i++) { if (son->attr & OPEN) { BYTE_ALIGN(bs); + if (nf_h323_error_boundary(bs, 2, 0)) + return H323_ERROR_BOUND; len = get_len(bs); - if (nf_h323_error_boundary(bs, len)) + if (nf_h323_error_boundary(bs, len, 0)) return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, @@ -744,11 +766,17 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f, base = (base && (f->attr & DECODE)) ? base + f->offset : NULL; /* Decode the choice index number */ + if (nf_h323_error_boundary(bs, 0, 1)) + return H323_ERROR_BOUND; if ((f->attr & EXT) && get_bit(bs)) { ext = 1; + if (nf_h323_error_boundary(bs, 0, 7)) + return H323_ERROR_BOUND; type = get_bits(bs, 7) + f->lb; } else { ext = 0; + if (nf_h323_error_boundary(bs, 0, f->sz)) + return H323_ERROR_BOUND; type = get_bits(bs, f->sz); if (type >= f->lb) return H323_ERROR_RANGE; @@ -761,8 +789,10 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f, /* Check Range */ if (type >= f->ub) { /* Newer version? */ BYTE_ALIGN(bs); + if (nf_h323_error_boundary(bs, 2, 0)) + return H323_ERROR_BOUND; len = get_len(bs); - if (nf_h323_error_boundary(bs, len)) + if (nf_h323_error_boundary(bs, len, 0)) return H323_ERROR_BOUND; bs->cur += len; return H323_ERROR_NONE; @@ -777,8 +807,10 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f, if (ext || (son->attr & OPEN)) { BYTE_ALIGN(bs); + if (nf_h323_error_boundary(bs, len, 0)) + return H323_ERROR_BOUND; len = get_len(bs); - if (nf_h323_error_boundary(bs, len)) + if (nf_h323_error_boundary(bs, len, 0)) return H323_ERROR_BOUND; if (!base || !(son->attr & DECODE)) { PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", From fbcd253d2448b8f168241e38f629a36c4c8c1e94 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 19 Nov 2017 21:27:28 +0100 Subject: [PATCH 005/269] netfilter: conntrack: lower timeout to RETRANS seconds if window is 0 When zero window is announced we can get into a situation where connection stays around forever: 1. One side announces zero window. 2. Other side closes. In this case, no FIN is sent (stuck in send queue). Unless other side opens the window up again conntrack stays in ESTABLISHED state for a very long time. Lets alleviate this by lowering the timeout to RETRANS (5 minutes), the other end should be sending zero window probes to keep the connection established as long as a socket still exists. Cc: Jozsef Kadlecsik Signed-off-by: Florian Westphal Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_tcp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index b12fc07111d0..37ef35b861f2 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1039,6 +1039,9 @@ static int tcp_packet(struct nf_conn *ct, IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED && timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK]) timeout = timeouts[TCP_CONNTRACK_UNACK]; + else if (ct->proto.tcp.last_win == 0 && + timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS]) + timeout = timeouts[TCP_CONNTRACK_RETRANS]; else timeout = timeouts[new_state]; spin_unlock_bh(&ct->lock); From aa24163b2ee5c92120e32e99b5a93143a0f4258e Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Wed, 15 Nov 2017 19:50:14 +0530 Subject: [PATCH 006/269] cgroup/cpuset: remove circular dependency deadlock Remove circular dependency deadlock in a scenario where hotplug of CPU is being done while there is updation in cgroup and cpuset triggered from userspace. Process A => kthreadd => Process B => Process C => Process A Process A cpu_subsys_offline(); cpu_down(); _cpu_down(); percpu_down_write(&cpu_hotplug_lock); //held cpuhp_invoke_callback(); workqueue_offline_cpu(); queue_work_on(); // unbind_work on system_highpri_wq __queue_work(); insert_work(); wake_up_worker(); flush_work(); wait_for_completion(); worker_thread(); manage_workers(); create_worker(); kthread_create_on_node(); wake_up_process(kthreadd_task); kthreadd kthreadd(); kernel_thread(); do_fork(); copy_process(); percpu_down_read(&cgroup_threadgroup_rwsem); __rwsem_down_read_failed_common(); //waiting Process B kernfs_fop_write(); cgroup_file_write(); cgroup_procs_write(); percpu_down_write(&cgroup_threadgroup_rwsem); //held cgroup_attach_task(); cgroup_migrate(); cgroup_migrate_execute(); cpuset_can_attach(); mutex_lock(&cpuset_mutex); //waiting Process C kernfs_fop_write(); cgroup_file_write(); cpuset_write_resmask(); mutex_lock(&cpuset_mutex); //held update_cpumask(); update_cpumasks_hier(); rebuild_sched_domains_locked(); get_online_cpus(); percpu_down_read(&cpu_hotplug_lock); //waiting Eliminating deadlock by reversing the locking order for cpuset_mutex and cpu_hotplug_lock. Signed-off-by: Prateek Sood Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 53 ++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index f7efa7b4d825..cab5fd1ee767 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -812,6 +812,18 @@ done: return ndoms; } +static void cpuset_sched_change_begin(void) +{ + cpus_read_lock(); + mutex_lock(&cpuset_mutex); +} + +static void cpuset_sched_change_end(void) +{ + mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); +} + /* * Rebuild scheduler domains. * @@ -821,16 +833,14 @@ done: * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * - * Call with cpuset_mutex held. Takes get_online_cpus(). */ -static void rebuild_sched_domains_locked(void) +static void rebuild_sched_domains_cpuslocked(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; lockdep_assert_held(&cpuset_mutex); - get_online_cpus(); /* * We have raced with CPU hotplug. Don't do anything to avoid @@ -838,27 +848,25 @@ static void rebuild_sched_domains_locked(void) * Anyways, hotplug work item will rebuild sched domains. */ if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - goto out; + return; /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); -out: - put_online_cpus(); } #else /* !CONFIG_SMP */ -static void rebuild_sched_domains_locked(void) +static void rebuild_sched_domains_cpuslocked(void) { } #endif /* CONFIG_SMP */ void rebuild_sched_domains(void) { - mutex_lock(&cpuset_mutex); - rebuild_sched_domains_locked(); - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_begin(); + rebuild_sched_domains_cpuslocked(); + cpuset_sched_change_end(); } /** @@ -944,7 +952,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) rcu_read_unlock(); if (need_rebuild_sched_domains) - rebuild_sched_domains_locked(); + rebuild_sched_domains_cpuslocked(); } /** @@ -1276,7 +1284,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) cs->relax_domain_level = val; if (!cpumask_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) - rebuild_sched_domains_locked(); + rebuild_sched_domains_cpuslocked(); } return 0; @@ -1309,7 +1317,6 @@ static void update_tasks_flags(struct cpuset *cs) * * Call with cpuset_mutex held. */ - static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { @@ -1342,7 +1349,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, spin_unlock_irq(&callback_lock); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) - rebuild_sched_domains_locked(); + rebuild_sched_domains_cpuslocked(); if (spread_flag_changed) update_tasks_flags(cs); @@ -1610,7 +1617,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; - mutex_lock(&cpuset_mutex); + cpuset_sched_change_begin(); if (!is_cpuset_online(cs)) { retval = -ENODEV; goto out_unlock; @@ -1646,7 +1653,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_end(); return retval; } @@ -1657,7 +1664,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; - mutex_lock(&cpuset_mutex); + cpuset_sched_change_begin(); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1670,7 +1677,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_end(); return retval; } @@ -1709,7 +1716,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); - mutex_lock(&cpuset_mutex); + cpuset_sched_change_begin(); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1733,7 +1740,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_trial_cpuset(trialcs); out_unlock: - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_end(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); @@ -2034,14 +2041,14 @@ out_unlock: /* * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains_locked(). + * will call rebuild_sched_domains_cpuslocked(). */ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - mutex_lock(&cpuset_mutex); + cpuset_sched_change_begin(); if (is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); @@ -2049,7 +2056,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_end(); } static void cpuset_css_free(struct cgroup_subsys_state *css) From 1599a185f0e6113be185b9fb809c621c73865829 Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Wed, 15 Nov 2017 19:50:15 +0530 Subject: [PATCH 007/269] cpuset: Make cpuset hotplug synchronous Convert cpuset_hotplug_workfn() into synchronous call for cpu hotplug path. For memory hotplug path it still gets queued as a work item. Since cpuset_hotplug_workfn() can be made synchronous for cpu hotplug path, it is not required to wait for cpuset hotplug while thawing processes. Signed-off-by: Prateek Sood Signed-off-by: Tejun Heo --- include/linux/cpuset.h | 6 ------ kernel/cgroup/cpuset.c | 45 +++++++++++++++++++++--------------------- kernel/power/process.c | 2 -- kernel/sched/core.c | 1 - 4 files changed, 22 insertions(+), 32 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 1b8e41597ef5..2ab910f85154 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -52,9 +52,7 @@ static inline void cpuset_dec(void) extern int cpuset_init(void); extern void cpuset_init_smp(void); -extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); -extern void cpuset_wait_for_hotplug(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern void cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); @@ -167,15 +165,11 @@ static inline bool cpusets_enabled(void) { return false; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} -static inline void cpuset_force_rebuild(void) { } - static inline void cpuset_update_active_cpus(void) { partition_sched_domains(1, NULL, NULL); } -static inline void cpuset_wait_for_hotplug(void) { } - static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index cab5fd1ee767..227bc25d951d 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2277,15 +2277,8 @@ retry: mutex_unlock(&cpuset_mutex); } -static bool force_rebuild; - -void cpuset_force_rebuild(void) -{ - force_rebuild = true; -} - /** - * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset + * cpuset_hotplug - handle CPU/memory hotunplug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -2300,7 +2293,7 @@ void cpuset_force_rebuild(void) * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. */ -static void cpuset_hotplug_workfn(struct work_struct *work) +static void cpuset_hotplug(bool use_cpu_hp_lock) { static cpumask_t new_cpus; static nodemask_t new_mems; @@ -2358,25 +2351,31 @@ static void cpuset_hotplug_workfn(struct work_struct *work) } /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated || force_rebuild) { - force_rebuild = false; - rebuild_sched_domains(); + if (cpus_updated) { + if (use_cpu_hp_lock) + rebuild_sched_domains(); + else { + /* Acquiring cpu_hotplug_lock is not required. + * When cpuset_hotplug() is called in hotplug path, + * cpu_hotplug_lock is held by the hotplug context + * which is waiting for cpuhp_thread_fun to indicate + * completion of callback. + */ + mutex_lock(&cpuset_mutex); + rebuild_sched_domains_cpuslocked(); + mutex_unlock(&cpuset_mutex); + } } } +static void cpuset_hotplug_workfn(struct work_struct *work) +{ + cpuset_hotplug(true); +} + void cpuset_update_active_cpus(void) { - /* - * We're inside cpu hotplug critical region which usually nests - * inside cgroup synchronization. Bounce actual hotplug processing - * to a work item to avoid reverse locking order. - */ - schedule_work(&cpuset_hotplug_work); -} - -void cpuset_wait_for_hotplug(void) -{ - flush_work(&cpuset_hotplug_work); + cpuset_hotplug(false); } /* diff --git a/kernel/power/process.c b/kernel/power/process.c index 7381d49a44db..c326d7235c5f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -204,8 +204,6 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); - cpuset_wait_for_hotplug(); - read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 75554f366fd3..88b3450b29ab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5624,7 +5624,6 @@ static void cpuset_cpu_active(void) * restore the original sched domains by considering the * cpuset configurations. */ - cpuset_force_rebuild(); } cpuset_update_active_cpus(); } From 7d229c668a114e80d6be62b00e21a73bdd9ba7b3 Mon Sep 17 00:00:00 2001 From: Tal Shorer Date: Fri, 3 Nov 2017 17:27:49 +0200 Subject: [PATCH 008/269] main: kernel_start: move housekeeping_init() before workqueue_init_early() This is needed in order to allow the unbound workqueue to take housekeeping cpus into accounty Signed-off-by: Tal Shorer Signed-off-by: Tejun Heo --- init/main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/init/main.c b/init/main.c index dfec3809e740..e96e3a14533c 100644 --- a/init/main.c +++ b/init/main.c @@ -588,6 +588,12 @@ asmlinkage __visible void __init start_kernel(void) local_irq_disable(); radix_tree_init(); + /* + * Set up housekeeping before setting up workqueues to allow the unbound + * workqueue to take non-housekeeping into account. + */ + housekeeping_init(); + /* * Allow workqueue creation and work item queueing/cancelling * early. Work item execution depends on kthreads and starts after @@ -605,7 +611,6 @@ asmlinkage __visible void __init start_kernel(void) early_irq_init(); init_IRQ(); tick_init(); - housekeeping_init(); rcu_init_nohz(); init_timers(); hrtimers_init(); From c98a9805096460567404799a7bd3149826affde7 Mon Sep 17 00:00:00 2001 From: Tal Shorer Date: Fri, 3 Nov 2017 17:27:50 +0200 Subject: [PATCH 009/269] workqueue: respect isolated cpus when queueing an unbound work Initialize wq_unbound_cpumask to exclude cpus that were isolated by the cmdline's isolcpus parameter. Signed-off-by: Tal Shorer Signed-off-by: Tejun Heo --- kernel/workqueue.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8fdb710bfdd7..6a5658cb46da 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "workqueue_internal.h" @@ -4957,6 +4958,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) return -ENOMEM; + /* + * Not excluding isolated cpus on purpose. + * If the user wishes to include them, we allow that. + */ cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { apply_wqattrs_lock(); @@ -5555,7 +5560,7 @@ int __init workqueue_init_early(void) WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); + cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); From 16a27dfd218566f9604d5542c6285395cfc6831c Mon Sep 17 00:00:00 2001 From: Albert Pool Date: Mon, 20 Nov 2017 14:20:09 +0100 Subject: [PATCH 010/269] ata: mediatek: Fix typo in module description Signed-off-by: Albert Pool Signed-off-by: Tejun Heo --- drivers/ata/ahci_mtk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/ata/ahci_mtk.c b/drivers/ata/ahci_mtk.c index 80854f71559a..489452a64303 100644 --- a/drivers/ata/ahci_mtk.c +++ b/drivers/ata/ahci_mtk.c @@ -1,5 +1,5 @@ /* - * MeidaTek AHCI SATA driver + * MediaTek AHCI SATA driver * * Copyright (c) 2017 MediaTek Inc. * Author: Ryder Lee @@ -192,5 +192,5 @@ static struct platform_driver mtk_ahci_driver = { }; module_platform_driver(mtk_ahci_driver); -MODULE_DESCRIPTION("MeidaTek SATA AHCI Driver"); +MODULE_DESCRIPTION("MediaTek SATA AHCI Driver"); MODULE_LICENSE("GPL v2"); From ddf7005f32212f28669032651e09bd8d2245c35d Mon Sep 17 00:00:00 2001 From: Wang Long Date: Sun, 19 Nov 2017 16:08:37 -0500 Subject: [PATCH 011/269] debug cgroup: use task_css_set instead of rcu_dereference This macro `task_css_set` verifies that the caller is inside proper critical section if the kernel set CONFIG_PROVE_RCU=y. Signed-off-by: Wang Long Signed-off-by: Tejun Heo --- kernel/cgroup/debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 5f780d8f6a9d..9caeda610249 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -50,7 +50,7 @@ static int current_css_set_read(struct seq_file *seq, void *v) spin_lock_irq(&css_set_lock); rcu_read_lock(); - cset = rcu_dereference(current->cgroups); + cset = task_css_set(current); refcnt = refcount_read(&cset->refcount); seq_printf(seq, "css_set %pK %d", cset, refcnt); if (refcnt > cset->nr_tasks) @@ -96,7 +96,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) spin_lock_irq(&css_set_lock); rcu_read_lock(); - cset = rcu_dereference(current->cgroups); + cset = task_css_set(current); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; From c1da86c19ad6bfb77ceef3414c82269e8466f410 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Sat, 25 Nov 2017 15:49:49 +0530 Subject: [PATCH 012/269] pata_pdc2027x: Remove unnecessary error check Here, The function pdc_hardware_init always return zero. So it is not necessary to check its return value. Signed-off-by: Arvind Yadav Signed-off-by: Tejun Heo --- drivers/ata/pata_pdc2027x.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/ata/pata_pdc2027x.c b/drivers/ata/pata_pdc2027x.c index ffd8d33c6e0f..4a9d532bdbb8 100644 --- a/drivers/ata/pata_pdc2027x.c +++ b/drivers/ata/pata_pdc2027x.c @@ -649,7 +649,7 @@ static long pdc_detect_pll_input_clock(struct ata_host *host) * @host: target ATA host * @board_idx: board identifier */ -static int pdc_hardware_init(struct ata_host *host, unsigned int board_idx) +static void pdc_hardware_init(struct ata_host *host, unsigned int board_idx) { long pll_clock; @@ -665,8 +665,6 @@ static int pdc_hardware_init(struct ata_host *host, unsigned int board_idx) /* Adjust PLL control register */ pdc_adjust_pll(host, pll_clock, board_idx); - - return 0; } /** @@ -753,8 +751,7 @@ static int pdc2027x_init_one(struct pci_dev *pdev, //pci_enable_intx(pdev); /* initialize adapter */ - if (pdc_hardware_init(host, board_idx) != 0) - return -EIO; + pdc_hardware_init(host, board_idx); pci_set_master(pdev); return ata_host_activate(host, pdev->irq, ata_bmdma_interrupt, @@ -778,8 +775,7 @@ static int pdc2027x_reinit_one(struct pci_dev *pdev) else board_idx = PDC_UDMA_133; - if (pdc_hardware_init(host, board_idx)) - return -EIO; + pdc_hardware_init(host, board_idx); ata_host_resume(host); return 0; From 20f9ceed72f127e4cc44c0358160e6e0118f823d Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Sat, 25 Nov 2017 16:47:35 +0530 Subject: [PATCH 013/269] pata_pdc2027x : make pdc2027x_*_timing structures const Make these pdc2027x_*_timing structures const as it is never modified. Signed-off-by: Arvind Yadav Signed-off-by: Tejun Heo --- drivers/ata/pata_pdc2027x.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/ata/pata_pdc2027x.c b/drivers/ata/pata_pdc2027x.c index 4a9d532bdbb8..6db2e34bd52f 100644 --- a/drivers/ata/pata_pdc2027x.c +++ b/drivers/ata/pata_pdc2027x.c @@ -82,7 +82,7 @@ static int pdc2027x_set_mode(struct ata_link *link, struct ata_device **r_failed * is issued to the device. However, if the controller clock is 133MHz, * the following tables must be used. */ -static struct pdc2027x_pio_timing { +static const struct pdc2027x_pio_timing { u8 value0, value1, value2; } pdc2027x_pio_timing_tbl[] = { { 0xfb, 0x2b, 0xac }, /* PIO mode 0 */ @@ -92,7 +92,7 @@ static struct pdc2027x_pio_timing { { 0x23, 0x09, 0x25 }, /* PIO mode 4, IORDY on, Prefetch off */ }; -static struct pdc2027x_mdma_timing { +static const struct pdc2027x_mdma_timing { u8 value0, value1; } pdc2027x_mdma_timing_tbl[] = { { 0xdf, 0x5f }, /* MDMA mode 0 */ @@ -100,7 +100,7 @@ static struct pdc2027x_mdma_timing { { 0x69, 0x25 }, /* MDMA mode 2 */ }; -static struct pdc2027x_udma_timing { +static const struct pdc2027x_udma_timing { u8 value0, value1, value2; } pdc2027x_udma_timing_tbl[] = { { 0x4a, 0x0f, 0xd5 }, /* UDMA mode 0 */ From abee210500ed15a22787009d9210b9a34911afcc Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 27 Nov 2017 15:51:04 -0500 Subject: [PATCH 014/269] percpu: hack to let the CRIS architecture to boot until they clean up Commit 438a506180 ("percpu: don't forget to free the temporary struct pcpu_alloc_info") uncovered a problem on the CRIS architecture where the bootmem allocator is initialized with virtual addresses. Given it has: #define __va(x) ((void *)((unsigned long)(x) | 0x80000000)) then things just work out because the end result is the same whether you give this a physical or a virtual address. Untill you call memblock_free_early(__pa(address)) that is, because values from __pa() don't match with the virtual addresses stuffed in the bootmem allocator anymore. Avoid freeing the temporary pcpu_alloc_info memory on that architecture until they fix things up to let the kernel boot like it did before. Signed-off-by: Nicolas Pitre Signed-off-by: Tejun Heo Fixes: 438a506180 ("percpu: don't forget to free the temporary struct pcpu_alloc_info") --- mm/percpu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/percpu.c b/mm/percpu.c index 79e3549cab0f..50e7fdf84055 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2719,7 +2719,11 @@ void __init setup_per_cpu_areas(void) if (pcpu_setup_first_chunk(ai, fc) < 0) panic("Failed to initialize percpu areas."); +#ifdef CONFIG_CRIS +#warning "the CRIS architecture has physical and virtual addresses confused" +#else pcpu_free_alloc_info(ai); +#endif } #endif /* CONFIG_SMP */ From 8b1836c4b64386e9bc580438cae386ed31a43ab9 Mon Sep 17 00:00:00 2001 From: Jay Elliott Date: Wed, 15 Nov 2017 15:01:13 -0800 Subject: [PATCH 015/269] netfilter: conntrack: clamp timeouts to INT_MAX When the conntracking code multiplies a timeout by HZ, it can overflow from positive to negative; this causes it to instantly expire. To protect against this the multiplication is done in 64-bit so we can prevent it from exceeding INT_MAX. Signed-off-by: Jay Elliott Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 59c08997bfdf..66d72a8fa87f 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1566,9 +1566,11 @@ static int ctnetlink_change_helper(struct nf_conn *ct, static int ctnetlink_change_timeout(struct nf_conn *ct, const struct nlattr * const cda[]) { - u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT])); + u64 timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; - ct->timeout = nfct_time_stamp + timeout * HZ; + if (timeout > INT_MAX) + timeout = INT_MAX; + ct->timeout = nfct_time_stamp + (u32)timeout; if (test_bit(IPS_DYING_BIT, &ct->status)) return -ETIME; @@ -1768,6 +1770,7 @@ ctnetlink_create_conntrack(struct net *net, int err = -EINVAL; struct nf_conntrack_helper *helper; struct nf_conn_tstamp *tstamp; + u64 timeout; ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC); if (IS_ERR(ct)) @@ -1776,7 +1779,10 @@ ctnetlink_create_conntrack(struct net *net, if (!cda[CTA_TIMEOUT]) goto err1; - ct->timeout = nfct_time_stamp + ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; + timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; + if (timeout > INT_MAX) + timeout = INT_MAX; + ct->timeout = (u32)timeout + nfct_time_stamp; rcu_read_lock(); if (cda[CTA_HELP]) { From 52cf373c37a684f8fc279d541307fad39d206376 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Tue, 28 Nov 2017 13:59:25 +0100 Subject: [PATCH 016/269] cgroup: properly init u64_stats Lockdep complains that the stats update is trying to register a non-static key. This is because u64_stats are using a seqlock on 32bit arches, which needs to be initialized before usage. Fixes: 041cd640b2f3 (cgroup: Implement cgroup2 basic CPU usage accounting) Signed-off-by: Lucas Stach Signed-off-by: Tejun Heo --- kernel/cgroup/stat.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c index 133b465691d6..1e111dd455c4 100644 --- a/kernel/cgroup/stat.c +++ b/kernel/cgroup/stat.c @@ -296,8 +296,12 @@ int cgroup_stat_init(struct cgroup *cgrp) } /* ->updated_children list is self terminated */ - for_each_possible_cpu(cpu) - cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp; + for_each_possible_cpu(cpu) { + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + + cstat->updated_children = cgrp; + u64_stats_init(&cstat->sync); + } prev_cputime_init(&cgrp->stat.prev_cputime); From d2890c3778b164fde587bc16583f3a1c87233ec5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 26 Nov 2017 23:16:49 -0800 Subject: [PATCH 017/269] crypto: rsa - fix buffer overread when stripping leading zeroes In rsa_get_n(), if the buffer contained all 0's and "FIPS mode" is enabled, we would read one byte past the end of the buffer while scanning the leading zeroes. Fix it by checking 'n_sz' before '!*ptr'. This bug was reachable by adding a specially crafted key of type "asymmetric" (requires CONFIG_RSA and CONFIG_X509_CERTIFICATE_PARSER). KASAN report: BUG: KASAN: slab-out-of-bounds in rsa_get_n+0x19e/0x1d0 crypto/rsa_helper.c:33 Read of size 1 at addr ffff88003501a708 by task keyctl/196 CPU: 1 PID: 196 Comm: keyctl Not tainted 4.14.0-09238-g1d3b78bbc6e9 #26 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-20171110_100015-anatol 04/01/2014 Call Trace: rsa_get_n+0x19e/0x1d0 crypto/rsa_helper.c:33 asn1_ber_decoder+0x82a/0x1fd0 lib/asn1_decoder.c:328 rsa_set_pub_key+0xd3/0x320 crypto/rsa.c:278 crypto_akcipher_set_pub_key ./include/crypto/akcipher.h:364 [inline] pkcs1pad_set_pub_key+0xae/0x200 crypto/rsa-pkcs1pad.c:117 crypto_akcipher_set_pub_key ./include/crypto/akcipher.h:364 [inline] public_key_verify_signature+0x270/0x9d0 crypto/asymmetric_keys/public_key.c:106 x509_check_for_self_signed+0x2ea/0x480 crypto/asymmetric_keys/x509_public_key.c:141 x509_cert_parse+0x46a/0x620 crypto/asymmetric_keys/x509_cert_parser.c:129 x509_key_preparse+0x61/0x750 crypto/asymmetric_keys/x509_public_key.c:174 asymmetric_key_preparse+0xa4/0x150 crypto/asymmetric_keys/asymmetric_type.c:388 key_create_or_update+0x4d4/0x10a0 security/keys/key.c:850 SYSC_add_key security/keys/keyctl.c:122 [inline] SyS_add_key+0xe8/0x290 security/keys/keyctl.c:62 entry_SYSCALL_64_fastpath+0x1f/0x96 Allocated by task 196: __do_kmalloc mm/slab.c:3711 [inline] __kmalloc_track_caller+0x118/0x2e0 mm/slab.c:3726 kmemdup+0x17/0x40 mm/util.c:118 kmemdup ./include/linux/string.h:414 [inline] x509_cert_parse+0x2cb/0x620 crypto/asymmetric_keys/x509_cert_parser.c:106 x509_key_preparse+0x61/0x750 crypto/asymmetric_keys/x509_public_key.c:174 asymmetric_key_preparse+0xa4/0x150 crypto/asymmetric_keys/asymmetric_type.c:388 key_create_or_update+0x4d4/0x10a0 security/keys/key.c:850 SYSC_add_key security/keys/keyctl.c:122 [inline] SyS_add_key+0xe8/0x290 security/keys/keyctl.c:62 entry_SYSCALL_64_fastpath+0x1f/0x96 Fixes: 5a7de97309f5 ("crypto: rsa - return raw integers for the ASN.1 parser") Cc: # v4.8+ Cc: Tudor Ambarus Signed-off-by: Eric Biggers Reviewed-by: James Morris Reviewed-by: David Howells Signed-off-by: Herbert Xu --- crypto/rsa_helper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/rsa_helper.c b/crypto/rsa_helper.c index 0b66dc824606..cad395d70d78 100644 --- a/crypto/rsa_helper.c +++ b/crypto/rsa_helper.c @@ -30,7 +30,7 @@ int rsa_get_n(void *context, size_t hdrlen, unsigned char tag, return -EINVAL; if (fips_enabled) { - while (!*ptr && n_sz) { + while (n_sz && !*ptr) { ptr++; n_sz--; } From b32a7dc8aef1882fbf983eb354837488cc9d54dc Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 27 Nov 2017 23:23:05 -0800 Subject: [PATCH 018/269] crypto: algif_aead - fix reference counting of null skcipher In the AEAD interface for AF_ALG, the reference to the "null skcipher" held by each tfm was being dropped in the wrong place -- when each af_alg_ctx was freed instead of when the aead_tfm was freed. As discovered by syzkaller, a specially crafted program could use this to cause the null skcipher to be freed while it is still in use. Fix it by dropping the reference in the right place. Fixes: 72548b093ee3 ("crypto: algif_aead - copy AAD from src to dst") Reported-by: syzbot Cc: # v4.14+ Signed-off-by: Eric Biggers Reviewed-by: Stephan Mueller Signed-off-by: Herbert Xu --- crypto/algif_aead.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c index aacae0837aff..9d73be28cf01 100644 --- a/crypto/algif_aead.c +++ b/crypto/algif_aead.c @@ -487,6 +487,7 @@ static void aead_release(void *private) struct aead_tfm *tfm = private; crypto_free_aead(tfm->aead); + crypto_put_default_null_skcipher2(); kfree(tfm); } @@ -519,7 +520,6 @@ static void aead_sock_destruct(struct sock *sk) unsigned int ivlen = crypto_aead_ivsize(tfm); af_alg_pull_tsgl(sk, ctx->used, NULL, 0); - crypto_put_default_null_skcipher2(); sock_kzfree_s(sk, ctx->iv, ivlen); sock_kfree_s(sk, ctx, ctx->len); af_alg_release_parent(sk); From 887207ed9e5812ed9239b6d07185a2d35dda91db Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 28 Nov 2017 00:46:24 -0800 Subject: [PATCH 019/269] crypto: af_alg - fix NULL pointer dereference in af_alg_free_areq_sgls() If allocating the ->tsgl member of 'struct af_alg_async_req' failed, during cleanup we dereferenced the NULL ->tsgl pointer in af_alg_free_areq_sgls(), because ->tsgl_entries was nonzero. Fix it by only freeing the ->tsgl list if it is non-NULL. This affected both algif_skcipher and algif_aead. Fixes: e870456d8e7c ("crypto: algif_skcipher - overhaul memory management") Fixes: d887c52d6ae4 ("crypto: algif_aead - overhaul memory management") Reported-by: syzbot Cc: # v4.14+ Signed-off-by: Eric Biggers Reviewed-by: Stephan Mueller Signed-off-by: Herbert Xu --- crypto/af_alg.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 85cea9de324a..1e5353f62067 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -672,14 +672,15 @@ void af_alg_free_areq_sgls(struct af_alg_async_req *areq) } tsgl = areq->tsgl; - for_each_sg(tsgl, sg, areq->tsgl_entries, i) { - if (!sg_page(sg)) - continue; - put_page(sg_page(sg)); - } + if (tsgl) { + for_each_sg(tsgl, sg, areq->tsgl_entries, i) { + if (!sg_page(sg)) + continue; + put_page(sg_page(sg)); + } - if (areq->tsgl && areq->tsgl_entries) sock_kfree_s(sk, tsgl, areq->tsgl_entries * sizeof(*tsgl)); + } } EXPORT_SYMBOL_GPL(af_alg_free_areq_sgls); From af3ff8045bbf3e32f1a448542e73abb4c8ceb6f1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 28 Nov 2017 18:01:38 -0800 Subject: [PATCH 020/269] crypto: hmac - require that the underlying hash algorithm is unkeyed Because the HMAC template didn't check that its underlying hash algorithm is unkeyed, trying to use "hmac(hmac(sha3-512-generic))" through AF_ALG or through KEYCTL_DH_COMPUTE resulted in the inner HMAC being used without having been keyed, resulting in sha3_update() being called without sha3_init(), causing a stack buffer overflow. This is a very old bug, but it seems to have only started causing real problems when SHA-3 support was added (requires CONFIG_CRYPTO_SHA3) because the innermost hash's state is ->import()ed from a zeroed buffer, and it just so happens that other hash algorithms are fine with that, but SHA-3 is not. However, there could be arch or hardware-dependent hash algorithms also affected; I couldn't test everything. Fix the bug by introducing a function crypto_shash_alg_has_setkey() which tests whether a shash algorithm is keyed. Then update the HMAC template to require that its underlying hash algorithm is unkeyed. Here is a reproducer: #include #include int main() { int algfd; struct sockaddr_alg addr = { .salg_type = "hash", .salg_name = "hmac(hmac(sha3-512-generic))", }; char key[4096] = { 0 }; algfd = socket(AF_ALG, SOCK_SEQPACKET, 0); bind(algfd, (const struct sockaddr *)&addr, sizeof(addr)); setsockopt(algfd, SOL_ALG, ALG_SET_KEY, key, sizeof(key)); } Here was the KASAN report from syzbot: BUG: KASAN: stack-out-of-bounds in memcpy include/linux/string.h:341 [inline] BUG: KASAN: stack-out-of-bounds in sha3_update+0xdf/0x2e0 crypto/sha3_generic.c:161 Write of size 4096 at addr ffff8801cca07c40 by task syzkaller076574/3044 CPU: 1 PID: 3044 Comm: syzkaller076574 Not tainted 4.14.0-mm1+ #25 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 print_address_description+0x73/0x250 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 [inline] kasan_report+0x25b/0x340 mm/kasan/report.c:409 check_memory_region_inline mm/kasan/kasan.c:260 [inline] check_memory_region+0x137/0x190 mm/kasan/kasan.c:267 memcpy+0x37/0x50 mm/kasan/kasan.c:303 memcpy include/linux/string.h:341 [inline] sha3_update+0xdf/0x2e0 crypto/sha3_generic.c:161 crypto_shash_update+0xcb/0x220 crypto/shash.c:109 shash_finup_unaligned+0x2a/0x60 crypto/shash.c:151 crypto_shash_finup+0xc4/0x120 crypto/shash.c:165 hmac_finup+0x182/0x330 crypto/hmac.c:152 crypto_shash_finup+0xc4/0x120 crypto/shash.c:165 shash_digest_unaligned+0x9e/0xd0 crypto/shash.c:172 crypto_shash_digest+0xc4/0x120 crypto/shash.c:186 hmac_setkey+0x36a/0x690 crypto/hmac.c:66 crypto_shash_setkey+0xad/0x190 crypto/shash.c:64 shash_async_setkey+0x47/0x60 crypto/shash.c:207 crypto_ahash_setkey+0xaf/0x180 crypto/ahash.c:200 hash_setkey+0x40/0x90 crypto/algif_hash.c:446 alg_setkey crypto/af_alg.c:221 [inline] alg_setsockopt+0x2a1/0x350 crypto/af_alg.c:254 SYSC_setsockopt net/socket.c:1851 [inline] SyS_setsockopt+0x189/0x360 net/socket.c:1830 entry_SYSCALL_64_fastpath+0x1f/0x96 Reported-by: syzbot Cc: Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/hmac.c | 6 +++++- crypto/shash.c | 5 +++-- include/crypto/internal/hash.h | 8 ++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/crypto/hmac.c b/crypto/hmac.c index 92871dc2a63e..e74730224f0a 100644 --- a/crypto/hmac.c +++ b/crypto/hmac.c @@ -195,11 +195,15 @@ static int hmac_create(struct crypto_template *tmpl, struct rtattr **tb) salg = shash_attr_alg(tb[1], 0, 0); if (IS_ERR(salg)) return PTR_ERR(salg); + alg = &salg->base; + /* The underlying hash algorithm must be unkeyed */ err = -EINVAL; + if (crypto_shash_alg_has_setkey(salg)) + goto out_put_alg; + ds = salg->digestsize; ss = salg->statesize; - alg = &salg->base; if (ds > alg->cra_blocksize || ss < alg->cra_blocksize) goto out_put_alg; diff --git a/crypto/shash.c b/crypto/shash.c index 325a14da5827..e849d3ee2e27 100644 --- a/crypto/shash.c +++ b/crypto/shash.c @@ -25,11 +25,12 @@ static const struct crypto_type crypto_shash_type; -static int shash_no_setkey(struct crypto_shash *tfm, const u8 *key, - unsigned int keylen) +int shash_no_setkey(struct crypto_shash *tfm, const u8 *key, + unsigned int keylen) { return -ENOSYS; } +EXPORT_SYMBOL_GPL(shash_no_setkey); static int shash_setkey_unaligned(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h index f0b44c16e88f..c2bae8da642c 100644 --- a/include/crypto/internal/hash.h +++ b/include/crypto/internal/hash.h @@ -82,6 +82,14 @@ int ahash_register_instance(struct crypto_template *tmpl, struct ahash_instance *inst); void ahash_free_instance(struct crypto_instance *inst); +int shash_no_setkey(struct crypto_shash *tfm, const u8 *key, + unsigned int keylen); + +static inline bool crypto_shash_alg_has_setkey(struct shash_alg *alg) +{ + return alg->setkey != shash_no_setkey; +} + int crypto_init_ahash_spawn(struct crypto_ahash_spawn *spawn, struct hash_alg_common *alg, struct crypto_instance *inst); From 9816ef6ecbc102b9bcbb1d83e12c7fb19924f38c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 22 Nov 2017 11:58:03 +0300 Subject: [PATCH 021/269] scsi: lpfc: Use after free in lpfc_rq_buf_free() The error message dereferences "rqb_entry" so we need to print it first and then free the buffer. Fixes: 6c621a2229b0 ("scsi: lpfc: Separate NVMET RQ buffer posting from IO resources SGL/iocbq/context") Signed-off-by: Dan Carpenter Acked-by: Dick Kennedy Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_mem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_mem.c b/drivers/scsi/lpfc/lpfc_mem.c index 56faeb049b4a..87c08ff37ddd 100644 --- a/drivers/scsi/lpfc/lpfc_mem.c +++ b/drivers/scsi/lpfc/lpfc_mem.c @@ -753,12 +753,12 @@ lpfc_rq_buf_free(struct lpfc_hba *phba, struct lpfc_dmabuf *mp) drqe.address_hi = putPaddrHigh(rqb_entry->dbuf.phys); rc = lpfc_sli4_rq_put(rqb_entry->hrq, rqb_entry->drq, &hrqe, &drqe); if (rc < 0) { - (rqbp->rqb_free_buffer)(phba, rqb_entry); lpfc_printf_log(phba, KERN_ERR, LOG_INIT, "6409 Cannot post to RQ %d: %x %x\n", rqb_entry->hrq->queue_id, rqb_entry->hrq->host_index, rqb_entry->hrq->hba_index); + (rqbp->rqb_free_buffer)(phba, rqb_entry); } else { list_add_tail(&rqb_entry->hbuf.list, &rqbp->rqb_buffer_list); rqbp->buffer_count++; From fe55e79536a37348dcb0b7177ee5fda6deccb99a Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Sat, 25 Nov 2017 19:38:10 +0100 Subject: [PATCH 022/269] scsi: libfc: fix ELS request handling The modification of fc_lport_recv_els_req() in commit fcabb09e59a7 ("scsi: libfc: directly call ELS request handlers") caused certain requests not to be handled at all. Fix that. Fixes: fcabb09e59a7 ("scsi: libfc: directly call ELS request handlers") Signed-off-by: Martin Wilck Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. Petersen --- drivers/scsi/libfc/fc_lport.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/scsi/libfc/fc_lport.c b/drivers/scsi/libfc/fc_lport.c index 5da46052e179..21be672679fb 100644 --- a/drivers/scsi/libfc/fc_lport.c +++ b/drivers/scsi/libfc/fc_lport.c @@ -904,10 +904,14 @@ static void fc_lport_recv_els_req(struct fc_lport *lport, case ELS_FLOGI: if (!lport->point_to_multipoint) fc_lport_recv_flogi_req(lport, fp); + else + fc_rport_recv_req(lport, fp); break; case ELS_LOGO: if (fc_frame_sid(fp) == FC_FID_FLOGI) fc_lport_recv_logo_req(lport, fp); + else + fc_rport_recv_req(lport, fp); break; case ELS_RSCN: lport->tt.disc_recv_req(lport, fp); From d18539754d97876503275efc7d00a1901bb0cfad Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 28 Nov 2017 14:25:25 +0100 Subject: [PATCH 023/269] scsi: aacraid: address UBSAN warning regression As reported by Meelis Roos, my previous patch causes an incorrect calculation of the timeout, through an undefined signed integer overflow: [ 12.228155] UBSAN: Undefined behaviour in drivers/scsi/aacraid/commsup.c:2514:49 [ 12.228229] signed integer overflow: [ 12.228283] 964297611 * 250 cannot be represented in type 'long int' The problem is that doing a multiplication with HZ first and then dividing by USEC_PER_SEC worked correctly for 32-bit microseconds, but not for 32-bit nanoseconds, which would require up to 41 bits. This reworks the calculation to first convert the nanoseconds into jiffies, which should give us the same result as before and not overflow. Unfortunately I did not understand the exact intention of the algorithm, in particular the part where we add half a second, so it's possible that there is still a preexisting problem in this function. I added a comment that this would be handled more nicely using usleep_range(), which generally works better for waking up at a particular time than the current schedule_timeout() based implementation. I did not feel comfortable trying to implement that without being sure what the intent is here though. Fixes: 820f18865912 ("scsi: aacraid: use timespec64 instead of timeval") Tested-by: Meelis Roos Signed-off-by: Arnd Bergmann Signed-off-by: Martin K. Petersen --- drivers/scsi/aacraid/commsup.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/aacraid/commsup.c b/drivers/scsi/aacraid/commsup.c index bec9f3193f60..80a8cb26cdea 100644 --- a/drivers/scsi/aacraid/commsup.c +++ b/drivers/scsi/aacraid/commsup.c @@ -2482,8 +2482,8 @@ int aac_command_thread(void *data) /* Synchronize our watches */ if (((NSEC_PER_SEC - (NSEC_PER_SEC / HZ)) > now.tv_nsec) && (now.tv_nsec > (NSEC_PER_SEC / HZ))) - difference = (((NSEC_PER_SEC - now.tv_nsec) * HZ) - + NSEC_PER_SEC / 2) / NSEC_PER_SEC; + difference = HZ + HZ / 2 - + now.tv_nsec / (NSEC_PER_SEC / HZ); else { if (now.tv_nsec > NSEC_PER_SEC / 2) ++now.tv_sec; @@ -2507,6 +2507,10 @@ int aac_command_thread(void *data) if (kthread_should_stop()) break; + /* + * we probably want usleep_range() here instead of the + * jiffies computation + */ schedule_timeout(difference); if (kthread_should_stop()) From 45349821ab3a8d378b8f37e52c6fe1aa1b870c47 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 28 Nov 2017 16:26:57 +0100 Subject: [PATCH 024/269] scsi: bfa: fix access to bfad_im_port_s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 'cd21c605b2cf ("scsi: fc: provide fc_bsg_to_shost() helper")' changed access to bfa's 'struct bfad_im_port_s' by using shost_priv() instead of shost->hostdata[0]. This lead to crashes like in the following back-trace: task: ffff880046375300 ti: ffff8800a2ef8000 task.ti: ffff8800a2ef8000 RIP: e030:[] [] bfa_fcport_get_attr+0x82/0x260 [bfa] RSP: e02b:ffff8800a2efba10 EFLAGS: 00010046 RAX: 575f415441536432 RBX: ffff8800a2efba28 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff8800a2efba28 RDI: ffff880004dc31d8 RBP: ffff880004dc31d8 R08: 0000000000000000 R09: 0000000000000001 R10: ffff88011fadc468 R11: 0000000000000001 R12: ffff880004dc31f0 R13: 0000000000000200 R14: ffff880004dc61d0 R15: ffff880004947a10 FS: 00007feb1e489700(0000) GS:ffff88011fac0000(0000) knlGS:0000000000000000 CS: e033 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007ffe14e46c10 CR3: 00000000957b8000 CR4: 0000000000000660 Stack: ffff88001d4da000 ffff880004dc31c0 ffffffffa048a9df ffffffff81e56380 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [] bfad_iocmd_ioc_get_info+0x4f/0x220 [bfa] [] bfad_iocmd_handler+0xa00/0xd40 [bfa] [] bfad_im_bsg_request+0xee/0x1b0 [bfa] [] fc_bsg_dispatch+0x10b/0x1b0 [scsi_transport_fc] [] bsg_request_fn+0x11d/0x1c0 [] __blk_run_queue+0x2f/0x40 [] blk_execute_rq_nowait+0xa8/0x160 [] blk_execute_rq+0x77/0x120 [] bsg_ioctl+0x1b6/0x200 [] do_vfs_ioctl+0x2cd/0x4a0 [] SyS_ioctl+0x74/0x80 [] entry_SYSCALL_64_fastpath+0x12/0x6d Fixes: cd21c605b2cf ("scsi: fc: provide fc_bsg_to_shost() helper") Signed-off-by: Johannes Thumshirn Cc: Michal Koutný Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/bfa/bfad_bsg.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/bfa/bfad_bsg.c b/drivers/scsi/bfa/bfad_bsg.c index 72ca2a2e08e2..09ef68c8225f 100644 --- a/drivers/scsi/bfa/bfad_bsg.c +++ b/drivers/scsi/bfa/bfad_bsg.c @@ -3135,7 +3135,8 @@ bfad_im_bsg_vendor_request(struct bsg_job *job) struct fc_bsg_request *bsg_request = job->request; struct fc_bsg_reply *bsg_reply = job->reply; uint32_t vendor_cmd = bsg_request->rqst_data.h_vendor.vendor_cmd[0]; - struct bfad_im_port_s *im_port = shost_priv(fc_bsg_to_shost(job)); + struct Scsi_Host *shost = fc_bsg_to_shost(job); + struct bfad_im_port_s *im_port = shost->hostdata[0]; struct bfad_s *bfad = im_port->bfad; void *payload_kbuf; int rc = -EINVAL; @@ -3350,7 +3351,8 @@ int bfad_im_bsg_els_ct_request(struct bsg_job *job) { struct bfa_bsg_data *bsg_data; - struct bfad_im_port_s *im_port = shost_priv(fc_bsg_to_shost(job)); + struct Scsi_Host *shost = fc_bsg_to_shost(job); + struct bfad_im_port_s *im_port = shost->hostdata[0]; struct bfad_s *bfad = im_port->bfad; bfa_bsg_fcpt_t *bsg_fcpt; struct bfad_fcxp *drv_fcxp; From ecaaab5649781c5a0effdaf298a925063020500e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 28 Nov 2017 20:56:59 -0800 Subject: [PATCH 025/269] crypto: salsa20 - fix blkcipher_walk API usage When asked to encrypt or decrypt 0 bytes, both the generic and x86 implementations of Salsa20 crash in blkcipher_walk_done(), either when doing 'kfree(walk->buffer)' or 'free_page((unsigned long)walk->page)', because walk->buffer and walk->page have not been initialized. The bug is that Salsa20 is calling blkcipher_walk_done() even when nothing is in 'walk.nbytes'. But blkcipher_walk_done() is only meant to be called when a nonzero number of bytes have been provided. The broken code is part of an optimization that tries to make only one call to salsa20_encrypt_bytes() to process inputs that are not evenly divisible by 64 bytes. To fix the bug, just remove this "optimization" and use the blkcipher_walk API the same way all the other users do. Reproducer: #include #include #include int main() { int algfd, reqfd; struct sockaddr_alg addr = { .salg_type = "skcipher", .salg_name = "salsa20", }; char key[16] = { 0 }; algfd = socket(AF_ALG, SOCK_SEQPACKET, 0); bind(algfd, (void *)&addr, sizeof(addr)); reqfd = accept(algfd, 0, 0); setsockopt(algfd, SOL_ALG, ALG_SET_KEY, key, sizeof(key)); read(reqfd, key, sizeof(key)); } Reported-by: syzbot Fixes: eb6f13eb9f81 ("[CRYPTO] salsa20_generic: Fix multi-page processing") Cc: # v2.6.25+ Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/salsa20_glue.c | 7 ------- crypto/salsa20_generic.c | 7 ------- 2 files changed, 14 deletions(-) diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c index 399a29d067d6..cb91a64a99e7 100644 --- a/arch/x86/crypto/salsa20_glue.c +++ b/arch/x86/crypto/salsa20_glue.c @@ -59,13 +59,6 @@ static int encrypt(struct blkcipher_desc *desc, salsa20_ivsetup(ctx, walk.iv); - if (likely(walk.nbytes == nbytes)) - { - salsa20_encrypt_bytes(ctx, walk.src.virt.addr, - walk.dst.virt.addr, nbytes); - return blkcipher_walk_done(desc, &walk, 0); - } - while (walk.nbytes >= 64) { salsa20_encrypt_bytes(ctx, walk.src.virt.addr, walk.dst.virt.addr, diff --git a/crypto/salsa20_generic.c b/crypto/salsa20_generic.c index f550b5d94630..d7da0eea5622 100644 --- a/crypto/salsa20_generic.c +++ b/crypto/salsa20_generic.c @@ -188,13 +188,6 @@ static int encrypt(struct blkcipher_desc *desc, salsa20_ivsetup(ctx, walk.iv); - if (likely(walk.nbytes == nbytes)) - { - salsa20_encrypt_bytes(ctx, walk.dst.virt.addr, - walk.src.virt.addr, nbytes); - return blkcipher_walk_done(desc, &walk, 0); - } - while (walk.nbytes >= 64) { salsa20_encrypt_bytes(ctx, walk.dst.virt.addr, walk.src.virt.addr, From 5c9afbda911ce20b3f2181d1e440a0222e1027dd Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 17 Nov 2017 22:37:53 +0100 Subject: [PATCH 026/269] dmaengine: ioat: Fix error handling path If the last test in 'ioat_dma_self_test()' fails, we must release all the allocated resources and not just part of them. Signed-off-by: Christophe JAILLET Acked-by: Dave Jiang Signed-off-by: Vinod Koul --- drivers/dma/ioat/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/ioat/init.c b/drivers/dma/ioat/init.c index 2f31d3d0caa6..7792a9186f9c 100644 --- a/drivers/dma/ioat/init.c +++ b/drivers/dma/ioat/init.c @@ -390,7 +390,7 @@ static int ioat_dma_self_test(struct ioatdma_device *ioat_dma) if (memcmp(src, dest, IOAT_TEST_SIZE)) { dev_err(dev, "Self-test copy failed compare, disabling\n"); err = -ENODEV; - goto free_resources; + goto unmap_dma; } unmap_dma: From 62a277d43d47e74972de44d33bd3763e31992414 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 20 Nov 2017 08:28:14 -0600 Subject: [PATCH 027/269] dmaengine: at_hdmac: fix potential NULL pointer dereference in atc_prep_dma_interleaved _xt_ is being dereferenced before it is null checked, hence there is a potential null pointer dereference. Fix this by moving the pointer dereference after _xt_ has been null checked. This issue was detected with the help of Coccinelle. Fixes: 4483320e241c ("dmaengine: Use Pointer xt after NULL check.") Signed-off-by: Gustavo A. R. Silva Acked-by: Ludovic Desroches Signed-off-by: Vinod Koul --- drivers/dma/at_hdmac.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c index fbab271b3bf9..a861b5b4d443 100644 --- a/drivers/dma/at_hdmac.c +++ b/drivers/dma/at_hdmac.c @@ -708,7 +708,7 @@ atc_prep_dma_interleaved(struct dma_chan *chan, unsigned long flags) { struct at_dma_chan *atchan = to_at_dma_chan(chan); - struct data_chunk *first = xt->sgl; + struct data_chunk *first; struct at_desc *desc = NULL; size_t xfer_count; unsigned int dwidth; @@ -720,6 +720,8 @@ atc_prep_dma_interleaved(struct dma_chan *chan, if (unlikely(!xt || xt->numf != 1 || !xt->frame_size)) return NULL; + first = xt->sgl; + dev_info(chan2dev(chan), "%s: src=%pad, dest=%pad, numf=%d, frame_size=%d, flags=0x%lx\n", __func__, &xt->src_start, &xt->dst_start, xt->numf, From fe77d8257c4d838c5976557ddb87bd789f312412 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 29 Nov 2017 10:25:02 +0100 Subject: [PATCH 028/269] batman-adv: Always initialize fragment header priority The batman-adv unuicast fragment header contains 3 bits for the priority of the packet. These bits will be initialized when the skb->priority contains a value between 256 and 263. But otherwise, the uninitialized bits from the stack will be used. Fixes: c0f25c802b33 ("batman-adv: Include frame priority in fragment header") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/fragmentation.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index a98cf1104a30..ebe6e38934e4 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -499,6 +499,8 @@ int batadv_frag_send_packet(struct sk_buff *skb, */ if (skb->priority >= 256 && skb->priority <= 263) frag_header.priority = skb->priority - 256; + else + frag_header.priority = 0; ether_addr_copy(frag_header.orig, primary_if->net_dev->dev_addr); ether_addr_copy(frag_header.dest, orig_node->orig); From 198a62ddffa4a4ffaeb741f642b7b52f2d91ae9b Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 29 Nov 2017 10:50:42 +0100 Subject: [PATCH 029/269] batman-adv: Fix check of retrieved orig_gw in batadv_v_gw_is_eligible The batadv_v_gw_is_eligible function already assumes that orig_node is not NULL. But batadv_gw_node_get may have failed to find the originator. It must therefore be checked whether the batadv_gw_node_get failed and not whether orig_node is NULL to detect this error. Fixes: 50164d8f500f ("batman-adv: B.A.T.M.A.N. V - implement GW selection logic") Signed-off-by: Sven Eckelmann Acked-by: Antonio Quartulli Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 341ceab8338d..e0e2bfcd6b3e 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -814,7 +814,7 @@ static bool batadv_v_gw_is_eligible(struct batadv_priv *bat_priv, } orig_gw = batadv_gw_node_get(bat_priv, orig_node); - if (!orig_node) + if (!orig_gw) goto out; if (batadv_v_gw_throughput_get(orig_gw, &orig_throughput) < 0) From 974a6b20518602310637bd8ac9ad348bf8a864d6 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 1 Dec 2017 11:47:56 +0100 Subject: [PATCH 030/269] batman-adv: Fix kernel-doc for timer functions The commit e99e88a9d2b0 ("treewide: setup_timer() -> timer_setup()") changed the argument name and type of the timer function but didn't adjust the kernel-doc of these functions. Signed-off-by: Sven Eckelmann Acked-by: Kees Cook Signed-off-by: Simon Wunderlich --- net/batman-adv/tp_meter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 15cd2139381e..ebc4e2241c77 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -482,7 +482,7 @@ static void batadv_tp_reset_sender_timer(struct batadv_tp_vars *tp_vars) /** * batadv_tp_sender_timeout - timer that fires in case of packet loss - * @arg: address of the related tp_vars + * @t: address to timer_list inside tp_vars * * If fired it means that there was packet loss. * Switch to Slow Start, set the ss_threshold to half of the current cwnd and @@ -1106,7 +1106,7 @@ static void batadv_tp_reset_receiver_timer(struct batadv_tp_vars *tp_vars) /** * batadv_tp_receiver_shutdown - stop a tp meter receiver when timeout is * reached without received ack - * @arg: address of the related tp_vars + * @t: address to timer_list inside tp_vars */ static void batadv_tp_receiver_shutdown(struct timer_list *t) { From f9ecc83f8d723372976df8eda3193726d7a24fcb Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 1 Dec 2017 13:37:12 -0500 Subject: [PATCH 031/269] eeprom: at24: fix I2C device selection for runtime PM The at24 driver creates dummy I2C devices to access offsets in the chip that are outside the area supported using a single I2C address. It is not meaningful to use runtime PM to such devices; the system firmware (ACPI) does not know about these devices nor runtime PM was enabled for them. Always use the real device instead of the dummy ones. Fixes: 98e8201039af ("eeprom: at24: enable runtime pm support") Signed-off-by: Sakari Ailus Tested-by: Sven Van Asbroeck on a 24AA16/24LC16B [Bartosz: rebased on top of previous fixes for 4.15, tweaked the commit message] [Sven: fixed Bartosz's rebase] Signed-off-by: Sven Van Asbroeck Signed-off-by: Bartosz Golaszewski --- drivers/misc/eeprom/at24.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/drivers/misc/eeprom/at24.c b/drivers/misc/eeprom/at24.c index 305a7a464d09..20b4f26d30d7 100644 --- a/drivers/misc/eeprom/at24.c +++ b/drivers/misc/eeprom/at24.c @@ -562,7 +562,7 @@ static ssize_t at24_eeprom_write_i2c(struct at24_data *at24, const char *buf, static int at24_read(void *priv, unsigned int off, void *val, size_t count) { struct at24_data *at24 = priv; - struct i2c_client *client; + struct device *dev = &at24->client[0]->dev; char *buf = val; int ret; @@ -572,11 +572,9 @@ static int at24_read(void *priv, unsigned int off, void *val, size_t count) if (off + count > at24->chip.byte_len) return -EINVAL; - client = at24_translate_offset(at24, &off); - - ret = pm_runtime_get_sync(&client->dev); + ret = pm_runtime_get_sync(dev); if (ret < 0) { - pm_runtime_put_noidle(&client->dev); + pm_runtime_put_noidle(dev); return ret; } @@ -592,7 +590,7 @@ static int at24_read(void *priv, unsigned int off, void *val, size_t count) status = at24->read_func(at24, buf, off, count); if (status < 0) { mutex_unlock(&at24->lock); - pm_runtime_put(&client->dev); + pm_runtime_put(dev); return status; } buf += status; @@ -602,7 +600,7 @@ static int at24_read(void *priv, unsigned int off, void *val, size_t count) mutex_unlock(&at24->lock); - pm_runtime_put(&client->dev); + pm_runtime_put(dev); return 0; } @@ -610,7 +608,7 @@ static int at24_read(void *priv, unsigned int off, void *val, size_t count) static int at24_write(void *priv, unsigned int off, void *val, size_t count) { struct at24_data *at24 = priv; - struct i2c_client *client; + struct device *dev = &at24->client[0]->dev; char *buf = val; int ret; @@ -620,11 +618,9 @@ static int at24_write(void *priv, unsigned int off, void *val, size_t count) if (off + count > at24->chip.byte_len) return -EINVAL; - client = at24_translate_offset(at24, &off); - - ret = pm_runtime_get_sync(&client->dev); + ret = pm_runtime_get_sync(dev); if (ret < 0) { - pm_runtime_put_noidle(&client->dev); + pm_runtime_put_noidle(dev); return ret; } @@ -640,7 +636,7 @@ static int at24_write(void *priv, unsigned int off, void *val, size_t count) status = at24->write_func(at24, buf, off, count); if (status < 0) { mutex_unlock(&at24->lock); - pm_runtime_put(&client->dev); + pm_runtime_put(dev); return status; } buf += status; @@ -650,7 +646,7 @@ static int at24_write(void *priv, unsigned int off, void *val, size_t count) mutex_unlock(&at24->lock); - pm_runtime_put(&client->dev); + pm_runtime_put(dev); return 0; } From fc82228a5e3860502dbf3bfa4a9570cb7093cf7f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 3 Dec 2017 20:38:01 -0500 Subject: [PATCH 032/269] ext4: support fast symlinks from ext3 file systems 407cd7fb83c0 (ext4: change fast symlink test to not rely on i_blocks) broke ~10 years old ext3 file systems created by 2.6.17. Any ELF executable fails because the /lib/ld-linux.so.2 fast symlink cannot be read anymore. The patch assumed fast symlinks were created in a specific way, but that's not true on these really old file systems. The new behavior is apparently needed only with the large EA inode feature. Revert to the old behavior if the large EA inode feature is not set. This makes my old VM boot again. Fixes: 407cd7fb83c0 (ext4: change fast symlink test to not rely on i_blocks) Signed-off-by: Andi Kleen Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger Cc: stable@vger.kernel.org --- fs/ext4/inode.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7df2c5644e59..534a9130f625 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -149,6 +149,15 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, */ int ext4_inode_is_fast_symlink(struct inode *inode) { + if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + int ea_blocks = EXT4_I(inode)->i_file_acl ? + EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; + + if (ext4_has_inline_data(inode)) + return 0; + + return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); + } return S_ISLNK(inode->i_mode) && inode->i_size && (inode->i_size < EXT4_N_BLOCKS * 4); } From c894aa97577e47d3066b27b32499ecf899bfa8b0 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Sun, 3 Dec 2017 22:52:51 -0500 Subject: [PATCH 033/269] ext4: fix fdatasync(2) after fallocate(2) operation Currently, fallocate(2) with KEEP_SIZE followed by a fdatasync(2) then crash, we'll see wrong allocated block number (stat -c %b), the blocks allocated beyond EOF are all lost. fstests generic/468 exposes this bug. Commit 67a7d5f561f4 ("ext4: fix fdatasync(2) after extent manipulation operations") fixed all the other extent manipulation operation paths such as hole punch, zero range, collapse range etc., but forgot the fallocate case. So similarly, fix it by recording the correct journal tid in ext4 inode in fallocate(2) path, so that ext4_sync_file() will wait for the right tid to be committed on fdatasync(2). This addresses the test failure in xfstests test generic/468. Signed-off-by: Eryu Guan Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/extents.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 07bca11749d4..c941251ac0c0 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4722,6 +4722,7 @@ retry: EXT4_INODE_EOFBLOCKS); } ext4_mark_inode_dirty(handle, inode); + ext4_update_inode_fsync_trans(handle, inode, 1); ret2 = ext4_journal_stop(handle); if (ret2) break; From 4b380c42f7d00a395feede754f0bc2292eebe6e5 Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Sun, 3 Dec 2017 12:12:45 -0800 Subject: [PATCH 034/269] netfilter: nfnetlink_cthelper: Add missing permission checks The capability check in nfnetlink_rcv() verifies that the caller has CAP_NET_ADMIN in the namespace that "owns" the netlink socket. However, nfnl_cthelper_list is shared by all net namespaces on the system. An unprivileged user can create user and net namespaces in which he holds CAP_NET_ADMIN to bypass the netlink_net_capable() check: $ nfct helper list nfct v1.4.4: netlink error: Operation not permitted $ vpnns -- nfct helper list { .name = ftp, .queuenum = 0, .l3protonum = 2, .l4protonum = 6, .priv_data_len = 24, .status = enabled, }; Add capable() checks in nfnetlink_cthelper, as this is cleaner than trying to generalize the solution. Signed-off-by: Kevin Cernekee Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_cthelper.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 41628b393673..d33ce6d5ebce 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -407,6 +408,9 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl, struct nfnl_cthelper *nlcth; int ret = 0; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE]) return -EINVAL; @@ -611,6 +615,9 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl, struct nfnl_cthelper *nlcth; bool tuple_set = false; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nfnl_cthelper_dump_table, @@ -678,6 +685,9 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl, struct nfnl_cthelper *nlcth, *n; int j = 0, ret; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (tb[NFCTH_NAME]) helper_name = nla_data(tb[NFCTH_NAME]); From 6ab405114b0b229151ef06f4e31c7834dd09d0c0 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 1 Dec 2017 01:46:07 +0100 Subject: [PATCH 035/269] netfilter: xt_bpf: add overflow checks Check whether inputs from userspace are too long (explicit length field too big or string not null-terminated) to avoid out-of-bounds reads. As far as I can tell, this can at worst lead to very limited kernel heap memory disclosure or oopses. This bug can be triggered by an unprivileged user even if the xt_bpf module is not loaded: iptables is available in network namespaces, and the xt_bpf module can be autoloaded. Triggering the bug with a classic BPF filter with fake length 0x1000 causes the following KASAN report: ================================================================== BUG: KASAN: slab-out-of-bounds in bpf_prog_create+0x84/0xf0 Read of size 32768 at addr ffff8801eff2c494 by task test/4627 CPU: 0 PID: 4627 Comm: test Not tainted 4.15.0-rc1+ #1 [...] Call Trace: dump_stack+0x5c/0x85 print_address_description+0x6a/0x260 kasan_report+0x254/0x370 ? bpf_prog_create+0x84/0xf0 memcpy+0x1f/0x50 bpf_prog_create+0x84/0xf0 bpf_mt_check+0x90/0xd6 [xt_bpf] [...] Allocated by task 4627: kasan_kmalloc+0xa0/0xd0 __kmalloc_node+0x47/0x60 xt_alloc_table_info+0x41/0x70 [x_tables] [...] The buggy address belongs to the object at ffff8801eff2c3c0 which belongs to the cache kmalloc-2048 of size 2048 The buggy address is located 212 bytes inside of 2048-byte region [ffff8801eff2c3c0, ffff8801eff2cbc0) [...] ================================================================== Fixes: e6f30c731718 ("netfilter: x_tables: add xt_bpf match") Signed-off-by: Jann Horn Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_bpf.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 041da0d9c06f..1f7fbd3c7e5a 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -27,6 +27,9 @@ static int __bpf_mt_check_bytecode(struct sock_filter *insns, __u16 len, { struct sock_fprog_kern program; + if (len > XT_BPF_MAX_NUM_INSTR) + return -EINVAL; + program.len = len; program.filter = insns; @@ -55,6 +58,9 @@ static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret) mm_segment_t oldfs = get_fs(); int retval, fd; + if (strnlen(path, XT_BPF_PATH_MAX) == XT_BPF_PATH_MAX) + return -EINVAL; + set_fs(KERNEL_DS); fd = bpf_obj_get_user(path, 0); set_fs(oldfs); From 5ba7dcfe77037b67016263ea597a8b431692ecab Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 3 Dec 2017 11:26:45 +0100 Subject: [PATCH 036/269] batman-adv: Fix lock for ogm cnt access in batadv_iv_ogm_calc_tq The originator node object orig_neigh_node is used to when accessing the bcast_own(_sum) and real_packet_count information. The access to them has to be protected with the spinlock in orig_neigh_node. But the function uses the lock in orig_node instead. This is incorrect because they could be two different originator node objects. Fixes: 0ede9f41b217 ("batman-adv: protect bit operations to count OGMs with spinlock") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_iv_ogm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 1b659ab652fb..bbe8414b6ee7 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1214,7 +1214,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, orig_node->last_seen = jiffies; /* find packet count of corresponding one hop neighbor */ - spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); + spin_lock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock); if_num = if_incoming->if_num; orig_eq_count = orig_neigh_node->bat_iv.bcast_own_sum[if_num]; neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); @@ -1224,7 +1224,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, } else { neigh_rq_count = 0; } - spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); + spin_unlock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock); /* pay attention to not get a value bigger than 100 % */ if (orig_eq_count > neigh_rq_count) From 5a93bae2c382c588f437ce0395e8032ae287dc36 Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Thu, 19 Oct 2017 14:32:33 +0800 Subject: [PATCH 037/269] tracing: Fix code comments in trace.c Naming in code comments for tracing_snapshot, tracing_snapshot_alloc and trace_pid_filter_add_remove_task don't match the real function names. And latency_trace has been removed from tracing directory. Fix them. Link: http://lkml.kernel.org/r/1508394753-20887-1-git-send-email-chuhu@redhat.com Fixes: cab5037 ("tracing/ftrace: Enable snapshot function trigger") Fixes: 886b5b7 ("tracing: remove /debug/tracing/latency_trace") Signed-off-by: Chunyu Hu [ Replaced /sys/kernel/debug/tracing with /sys/kerne/tracing ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 73e67b68c53b..5815ec16edd4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -362,7 +362,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct } /** - * trace_pid_filter_add_remove - Add or remove a task from a pid_list + * trace_pid_filter_add_remove_task - Add or remove a task from a pid_list * @pid_list: The list to modify * @self: The current task for fork or NULL for exit * @task: The task to add or remove @@ -925,7 +925,7 @@ static void tracing_snapshot_instance(struct trace_array *tr) } /** - * trace_snapshot - take a snapshot of the current buffer. + * tracing_snapshot - take a snapshot of the current buffer. * * This causes a swap between the snapshot buffer and the current live * tracing buffer. You can use this to take snapshots of the live @@ -1004,9 +1004,9 @@ int tracing_alloc_snapshot(void) EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); /** - * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. + * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer. * - * This is similar to trace_snapshot(), but it will allocate the + * This is similar to tracing_snapshot(), but it will allocate the * snapshot buffer if it isn't already allocated. Use this only * where it is safe to sleep, as the allocation may sleep. * @@ -1303,7 +1303,7 @@ unsigned long __read_mostly tracing_thresh; /* * Copy the new maximum trace into the separate maximum-trace * structure. (this way the maximum trace is permanently saved, - * for later retrieval via /sys/kernel/debug/tracing/latency_trace) + * for later retrieval via /sys/kernel/tracing/tracing_max_latency) */ static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) From 250d0c7754aa37c6443f07f1f5f591e2806295d8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 10:32:13 +0200 Subject: [PATCH 038/269] tracing: always define trace_{irq,preempt}_{enable_disable} We get a build error in the irqsoff tracer in some configurations: kernel/trace/trace_irqsoff.c: In function 'trace_preempt_on': kernel/trace/trace_irqsoff.c:855:2: error: implicit declaration of function 'trace_preempt_enable_rcuidle'; did you mean 'trace_irq_enable_rcuidle'? [-Werror=implicit-function-declaration] trace_preempt_enable_rcuidle(a0, a1); The problem is that trace_preempt_enable_rcuidle() has different definition based on multiple Kconfig symbols, but not all combinations have a valid definition. This changes the conditions so that we always get exactly one definition of each of the four tracing macros. I have not tried to verify that these definitions are sensible, but now we can build all randconfig combinations again. Link: http://lkml.kernel.org/r/20171019083230.2450779-1-arnd@arndb.de Fixes: d59158162e03 ("tracing: Add support for preempt and irq enable/disable events") Acked-by: Joel Fernandes Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- include/trace/events/preemptirq.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/include/trace/events/preemptirq.h b/include/trace/events/preemptirq.h index f5024c560d8f..9c4eb33c5a1d 100644 --- a/include/trace/events/preemptirq.h +++ b/include/trace/events/preemptirq.h @@ -56,15 +56,18 @@ DEFINE_EVENT(preemptirq_template, preempt_enable, #include -#else /* !CONFIG_PREEMPTIRQ_EVENTS */ +#endif /* !CONFIG_PREEMPTIRQ_EVENTS */ +#if !defined(CONFIG_PREEMPTIRQ_EVENTS) || defined(CONFIG_PROVE_LOCKING) #define trace_irq_enable(...) #define trace_irq_disable(...) -#define trace_preempt_enable(...) -#define trace_preempt_disable(...) #define trace_irq_enable_rcuidle(...) #define trace_irq_disable_rcuidle(...) +#endif + +#if !defined(CONFIG_PREEMPTIRQ_EVENTS) || !defined(CONFIG_DEBUG_PREEMPT) +#define trace_preempt_enable(...) +#define trace_preempt_disable(...) #define trace_preempt_enable_rcuidle(...) #define trace_preempt_disable_rcuidle(...) - #endif From 90e406f96f630c07d631a021fd4af10aac913e77 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Thu, 30 Nov 2017 11:39:43 +0800 Subject: [PATCH 039/269] tracing: Allocate mask_str buffer dynamically The default NR_CPUS can be very large, but actual possible nr_cpu_ids usually is very small. For my x86 distribution, the NR_CPUS is 8192 and nr_cpu_ids is 4. About 2 pages are wasted. Most machines don't have so many CPUs, so define a array with NR_CPUS just wastes memory. So let's allocate the buffer dynamically when need. With this change, the mutext tracing_cpumask_update_lock also can be removed now, which was used to protect mask_str. Link: http://lkml.kernel.org/r/1512013183-19107-1-git-send-email-changbin.du@intel.com Fixes: 36dfe9252bd4c ("ftrace: make use of tracing_cpumask") Cc: stable@vger.kernel.org Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5815ec16edd4..9f3f043ba3b7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4178,37 +4178,30 @@ static const struct file_operations show_traces_fops = { .llseek = seq_lseek, }; -/* - * The tracer itself will not take this lock, but still we want - * to provide a consistent cpumask to user-space: - */ -static DEFINE_MUTEX(tracing_cpumask_update_lock); - -/* - * Temporary storage for the character representation of the - * CPU bitmask (and one more byte for the newline): - */ -static char mask_str[NR_CPUS + 1]; - static ssize_t tracing_cpumask_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct trace_array *tr = file_inode(filp)->i_private; + char *mask_str; int len; - mutex_lock(&tracing_cpumask_update_lock); + len = snprintf(NULL, 0, "%*pb\n", + cpumask_pr_args(tr->tracing_cpumask)) + 1; + mask_str = kmalloc(len, GFP_KERNEL); + if (!mask_str) + return -ENOMEM; - len = snprintf(mask_str, count, "%*pb\n", + len = snprintf(mask_str, len, "%*pb\n", cpumask_pr_args(tr->tracing_cpumask)); if (len >= count) { count = -EINVAL; goto out_err; } - count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); + count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); out_err: - mutex_unlock(&tracing_cpumask_update_lock); + kfree(mask_str); return count; } @@ -4228,8 +4221,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, if (err) goto err_unlock; - mutex_lock(&tracing_cpumask_update_lock); - local_irq_disable(); arch_spin_lock(&tr->max_lock); for_each_tracing_cpu(cpu) { @@ -4252,8 +4243,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, local_irq_enable(); cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); - - mutex_unlock(&tracing_cpumask_update_lock); free_cpumask_var(tracing_cpumask_new); return count; From 2dde6b0034dbc050957cdb6539ce28eca57e8cdf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 3 Nov 2017 11:39:57 +0100 Subject: [PATCH 040/269] tracing: make PREEMPTIRQ_EVENTS depend on TRACING When CONFIG_TRACING is disabled, the new preemptirq events tracer produces a build failure: In file included from kernel/trace/trace_irqsoff.c:17:0: kernel/trace/trace.h: In function 'trace_test_and_set_recursion': kernel/trace/trace.h:542:28: error: 'struct task_struct' has no member named 'trace_recursion' Adding an explicit dependency avoids the broken configuration. Link: http://lkml.kernel.org/r/20171103104031.270375-1-arnd@arndb.de Fixes: d59158162e03 ("tracing: Add support for preempt and irq enable/disable events") Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index af7dad126c13..904c952ac383 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -164,6 +164,7 @@ config PREEMPTIRQ_EVENTS bool "Enable trace events for preempt and irq disable/enable" select TRACE_IRQFLAGS depends on DEBUG_PREEMPT || !PROVE_LOCKING + depends on TRACING default n help Enable tracing of disable and enable events for preemption and irqs. From c4bfd39d7fa5203d4b387c283d360e9a108e85b3 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 17 May 2017 17:14:15 -0700 Subject: [PATCH 041/269] ring-buffer: Remove unused function __rb_data_page_index() This fixes the following warning when building with clang: kernel/trace/ring_buffer.c:1842:1: error: unused function '__rb_data_page_index' [-Werror,-Wunused-function] Link: http://lkml.kernel.org/r/20170518001415.5223-1-mka@chromium.org Reviewed-by: Douglas Anderson Signed-off-by: Matthias Kaehlcke Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 91874a95060d..c87766c1c204 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1799,12 +1799,6 @@ void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) } EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); -static __always_inline void * -__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) -{ - return bpage->data + index; -} - static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) { return bpage->page->data + index; From a773d419275bf54854ca6cfda8f2594ed2790faa Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Fri, 2 Jun 2017 13:20:25 +0300 Subject: [PATCH 042/269] tracing: Pass export pointer as argument to ->write() By passing an export descriptor to the write function, users don't need to keep a global static pointer and can rely on container_of() to fetch their own structure. Link: http://lkml.kernel.org/r/20170602102025.5140-1-felipe.balbi@linux.intel.com Acked-by: Steven Rostedt (VMware) Reviewed-by: Chunyan Zhang Signed-off-by: Felipe Balbi Signed-off-by: Steven Rostedt (VMware) --- drivers/hwtracing/stm/ftrace.c | 6 ++++-- include/linux/trace.h | 2 +- kernel/trace/trace.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/hwtracing/stm/ftrace.c b/drivers/hwtracing/stm/ftrace.c index bd126a7c6da2..7da75644c750 100644 --- a/drivers/hwtracing/stm/ftrace.c +++ b/drivers/hwtracing/stm/ftrace.c @@ -42,9 +42,11 @@ static struct stm_ftrace { * @len: length of the data packet */ static void notrace -stm_ftrace_write(const void *buf, unsigned int len) +stm_ftrace_write(struct trace_export *export, const void *buf, unsigned int len) { - stm_source_write(&stm_ftrace.data, STM_FTRACE_CHAN, buf, len); + struct stm_ftrace *stm = container_of(export, struct stm_ftrace, ftrace); + + stm_source_write(&stm->data, STM_FTRACE_CHAN, buf, len); } static int stm_ftrace_link(struct stm_source_data *data) diff --git a/include/linux/trace.h b/include/linux/trace.h index d24991c1fef3..b95ffb2188ab 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -18,7 +18,7 @@ */ struct trace_export { struct trace_export __rcu *next; - void (*write)(const void *, unsigned int); + void (*write)(struct trace_export *, const void *, unsigned int); }; int register_ftrace_export(struct trace_export *export); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9f3f043ba3b7..59518b8126d0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2415,7 +2415,7 @@ trace_process_export(struct trace_export *export, entry = ring_buffer_event_data(event); size = ring_buffer_event_length(event); - export->write(entry, size); + export->write(export, entry, size); } static DEFINE_MUTEX(ftrace_export_lock); From afc567a4977b2d798e05153dd131a3c8d4758c0c Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Sat, 25 Nov 2017 00:27:26 -0500 Subject: [PATCH 043/269] dm table: fix regression from improper dm_dev_internal.count refcount_t conversion Multiple refcounts are needed if the device was already added. The micro-optimization of setting the refcount to 1 on first added (rather than fall thru to a common refcount_inc) lost sight of the fact that the refcount_inc is also needed for the case when the device already exists and the mode need not be upgraded. Fixes: 2a0b4682e0 ("dm: convert dm_dev_internal.count from atomic_t to refcount_t") Reported-by: Zdenek Kabelac Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 88130b5d95f9..aaffd0c0ee9a 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -453,14 +453,15 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, refcount_set(&dd->count, 1); list_add(&dd->list, &t->devices); + goto out; } else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) { r = upgrade_mode(dd, mode, t->md); if (r) return r; - refcount_inc(&dd->count); } - + refcount_inc(&dd->count); +out: *result = dd->dm_dev; return 0; } From 7e6358d244e4706fe612a77b9c36519a33600ac0 Mon Sep 17 00:00:00 2001 From: "monty_pavel@sina.com" Date: Sat, 25 Nov 2017 01:43:50 +0800 Subject: [PATCH 044/269] dm: fix various targets to dm_register_target after module __init resources created A NULL pointer is seen if two concurrent "vgchange -ay -K " processes race to load the dm-thin-pool module: PID: 25992 TASK: ffff883cd7d23500 CPU: 4 COMMAND: "vgchange" #0 [ffff883cd743d600] machine_kexec at ffffffff81038fa9 0000001 [ffff883cd743d660] crash_kexec at ffffffff810c5992 0000002 [ffff883cd743d730] oops_end at ffffffff81515c90 0000003 [ffff883cd743d760] no_context at ffffffff81049f1b 0000004 [ffff883cd743d7b0] __bad_area_nosemaphore at ffffffff8104a1a5 0000005 [ffff883cd743d800] bad_area at ffffffff8104a2ce 0000006 [ffff883cd743d830] __do_page_fault at ffffffff8104aa6f 0000007 [ffff883cd743d950] do_page_fault at ffffffff81517bae 0000008 [ffff883cd743d980] page_fault at ffffffff81514f95 [exception RIP: kmem_cache_alloc+108] RIP: ffffffff8116ef3c RSP: ffff883cd743da38 RFLAGS: 00010046 RAX: 0000000000000004 RBX: ffffffff81121b90 RCX: ffff881bf1e78cc0 RDX: 0000000000000000 RSI: 00000000000000d0 RDI: 0000000000000000 RBP: ffff883cd743da68 R8: ffff881bf1a4eb00 R9: 0000000080042000 R10: 0000000000002000 R11: 0000000000000000 R12: 00000000000000d0 R13: 0000000000000000 R14: 00000000000000d0 R15: 0000000000000246 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 0000009 [ffff883cd743da70] mempool_alloc_slab at ffffffff81121ba5 0000010 [ffff883cd743da80] mempool_create_node at ffffffff81122083 0000011 [ffff883cd743dad0] mempool_create at ffffffff811220f4 0000012 [ffff883cd743dae0] pool_ctr at ffffffffa08de049 [dm_thin_pool] 0000013 [ffff883cd743dbd0] dm_table_add_target at ffffffffa0005f2f [dm_mod] 0000014 [ffff883cd743dc30] table_load at ffffffffa0008ba9 [dm_mod] 0000015 [ffff883cd743dc90] ctl_ioctl at ffffffffa0009dc4 [dm_mod] The race results in a NULL pointer because: Process A (vgchange -ay -K): a. send DM_LIST_VERSIONS_CMD ioctl; b. pool_target not registered; c. modprobe dm_thin_pool and wait until end. Process B (vgchange -ay -K): a. send DM_LIST_VERSIONS_CMD ioctl; b. pool_target registered; c. table_load->dm_table_add_target->pool_ctr; d. _new_mapping_cache is NULL and panic. Note: 1. process A and process B are two concurrent processes. 2. pool_target can be detected by process B but _new_mapping_cache initialization has not ended. To fix dm-thin-pool, and other targets (cache, multipath, and snapshot) with the same problem, simply dm_register_target() after all resources created during module init (as labelled with __init) are finished. Cc: stable@vger.kernel.org Signed-off-by: monty Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 12 ++++----- drivers/md/dm-mpath.c | 18 +++++++------- drivers/md/dm-snap.c | 48 ++++++++++++++++++------------------ drivers/md/dm-thin.c | 22 ++++++++--------- 4 files changed, 49 insertions(+), 51 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index cf23a14f9c6a..47407e43b96a 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -3472,18 +3472,18 @@ static int __init dm_cache_init(void) { int r; - r = dm_register_target(&cache_target); - if (r) { - DMERR("cache target registration failed: %d", r); - return r; - } - migration_cache = KMEM_CACHE(dm_cache_migration, 0); if (!migration_cache) { dm_unregister_target(&cache_target); return -ENOMEM; } + r = dm_register_target(&cache_target); + if (r) { + DMERR("cache target registration failed: %d", r); + return r; + } + return 0; } diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index c8faa2b85842..35a2a2fa477f 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1957,13 +1957,6 @@ static int __init dm_multipath_init(void) { int r; - r = dm_register_target(&multipath_target); - if (r < 0) { - DMERR("request-based register failed %d", r); - r = -EINVAL; - goto bad_register_target; - } - kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0); if (!kmultipathd) { DMERR("failed to create workqueue kmpathd"); @@ -1985,13 +1978,20 @@ static int __init dm_multipath_init(void) goto bad_alloc_kmpath_handlerd; } + r = dm_register_target(&multipath_target); + if (r < 0) { + DMERR("request-based register failed %d", r); + r = -EINVAL; + goto bad_register_target; + } + return 0; +bad_register_target: + destroy_workqueue(kmpath_handlerd); bad_alloc_kmpath_handlerd: destroy_workqueue(kmultipathd); bad_alloc_kmultipathd: - dm_unregister_target(&multipath_target); -bad_register_target: return r; } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 1113b42e1eda..a0613bd8ed00 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -2411,24 +2411,6 @@ static int __init dm_snapshot_init(void) return r; } - r = dm_register_target(&snapshot_target); - if (r < 0) { - DMERR("snapshot target register failed %d", r); - goto bad_register_snapshot_target; - } - - r = dm_register_target(&origin_target); - if (r < 0) { - DMERR("Origin target register failed %d", r); - goto bad_register_origin_target; - } - - r = dm_register_target(&merge_target); - if (r < 0) { - DMERR("Merge target register failed %d", r); - goto bad_register_merge_target; - } - r = init_origin_hash(); if (r) { DMERR("init_origin_hash failed."); @@ -2449,19 +2431,37 @@ static int __init dm_snapshot_init(void) goto bad_pending_cache; } + r = dm_register_target(&snapshot_target); + if (r < 0) { + DMERR("snapshot target register failed %d", r); + goto bad_register_snapshot_target; + } + + r = dm_register_target(&origin_target); + if (r < 0) { + DMERR("Origin target register failed %d", r); + goto bad_register_origin_target; + } + + r = dm_register_target(&merge_target); + if (r < 0) { + DMERR("Merge target register failed %d", r); + goto bad_register_merge_target; + } + return 0; -bad_pending_cache: - kmem_cache_destroy(exception_cache); -bad_exception_cache: - exit_origin_hash(); -bad_origin_hash: - dm_unregister_target(&merge_target); bad_register_merge_target: dm_unregister_target(&origin_target); bad_register_origin_target: dm_unregister_target(&snapshot_target); bad_register_snapshot_target: + kmem_cache_destroy(pending_cache); +bad_pending_cache: + kmem_cache_destroy(exception_cache); +bad_exception_cache: + exit_origin_hash(); +bad_origin_hash: dm_exception_store_exit(); return r; diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 89e5dff9b4cf..f91d771fff4b 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -4355,30 +4355,28 @@ static struct target_type thin_target = { static int __init dm_thin_init(void) { - int r; + int r = -ENOMEM; pool_table_init(); + _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0); + if (!_new_mapping_cache) + return r; + r = dm_register_target(&thin_target); if (r) - return r; + goto bad_new_mapping_cache; r = dm_register_target(&pool_target); if (r) - goto bad_pool_target; - - r = -ENOMEM; - - _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0); - if (!_new_mapping_cache) - goto bad_new_mapping_cache; + goto bad_thin_target; return 0; -bad_new_mapping_cache: - dm_unregister_target(&pool_target); -bad_pool_target: +bad_thin_target: dm_unregister_target(&thin_target); +bad_new_mapping_cache: + kmem_cache_destroy(_new_mapping_cache); return r; } From ce179cbdedf2f54306177e591664be7b18cf386a Mon Sep 17 00:00:00 2001 From: Yuantian Tang Date: Mon, 4 Dec 2017 17:01:20 +0800 Subject: [PATCH 045/269] ahci: qoriq: refine port register configuration These PP2C and PP3C registers control the configuration of the PHY control OOB timing for the COMINIT/COMWAKE parameters respectively for sata port. Overwrite default values with calculated ones to get better OOB timing. Signed-off-by: Tang Yuantian Signed-off-by: Tejun Heo --- drivers/ata/ahci_qoriq.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/ata/ahci_qoriq.c b/drivers/ata/ahci_qoriq.c index b6b0bf76dfc7..2685f28160f7 100644 --- a/drivers/ata/ahci_qoriq.c +++ b/drivers/ata/ahci_qoriq.c @@ -35,6 +35,8 @@ /* port register default value */ #define AHCI_PORT_PHY_1_CFG 0xa003fffe +#define AHCI_PORT_PHY2_CFG 0x28184d1f +#define AHCI_PORT_PHY3_CFG 0x0e081509 #define AHCI_PORT_TRANS_CFG 0x08000029 #define AHCI_PORT_AXICC_CFG 0x3fffffff @@ -183,6 +185,8 @@ static int ahci_qoriq_phy_init(struct ahci_host_priv *hpriv) writel(readl(qpriv->ecc_addr) | ECC_DIS_ARMV8_CH2, qpriv->ecc_addr); writel(AHCI_PORT_PHY_1_CFG, reg_base + PORT_PHY1); + writel(AHCI_PORT_PHY2_CFG, reg_base + PORT_PHY2); + writel(AHCI_PORT_PHY3_CFG, reg_base + PORT_PHY3); writel(AHCI_PORT_TRANS_CFG, reg_base + PORT_TRANS); if (qpriv->is_dmacoherent) writel(AHCI_PORT_AXICC_CFG, reg_base + PORT_AXICC); @@ -190,6 +194,8 @@ static int ahci_qoriq_phy_init(struct ahci_host_priv *hpriv) case AHCI_LS2080A: writel(AHCI_PORT_PHY_1_CFG, reg_base + PORT_PHY1); + writel(AHCI_PORT_PHY2_CFG, reg_base + PORT_PHY2); + writel(AHCI_PORT_PHY3_CFG, reg_base + PORT_PHY3); writel(AHCI_PORT_TRANS_CFG, reg_base + PORT_TRANS); if (qpriv->is_dmacoherent) writel(AHCI_PORT_AXICC_CFG, reg_base + PORT_AXICC); @@ -201,6 +207,8 @@ static int ahci_qoriq_phy_init(struct ahci_host_priv *hpriv) writel(readl(qpriv->ecc_addr) | ECC_DIS_ARMV8_CH2, qpriv->ecc_addr); writel(AHCI_PORT_PHY_1_CFG, reg_base + PORT_PHY1); + writel(AHCI_PORT_PHY2_CFG, reg_base + PORT_PHY2); + writel(AHCI_PORT_PHY3_CFG, reg_base + PORT_PHY3); writel(AHCI_PORT_TRANS_CFG, reg_base + PORT_TRANS); if (qpriv->is_dmacoherent) writel(AHCI_PORT_AXICC_CFG, reg_base + PORT_AXICC); @@ -212,6 +220,8 @@ static int ahci_qoriq_phy_init(struct ahci_host_priv *hpriv) writel(readl(qpriv->ecc_addr) | ECC_DIS_LS1088A, qpriv->ecc_addr); writel(AHCI_PORT_PHY_1_CFG, reg_base + PORT_PHY1); + writel(AHCI_PORT_PHY2_CFG, reg_base + PORT_PHY2); + writel(AHCI_PORT_PHY3_CFG, reg_base + PORT_PHY3); writel(AHCI_PORT_TRANS_CFG, reg_base + PORT_TRANS); if (qpriv->is_dmacoherent) writel(AHCI_PORT_AXICC_CFG, reg_base + PORT_AXICC); @@ -219,6 +229,8 @@ static int ahci_qoriq_phy_init(struct ahci_host_priv *hpriv) case AHCI_LS2088A: writel(AHCI_PORT_PHY_1_CFG, reg_base + PORT_PHY1); + writel(AHCI_PORT_PHY2_CFG, reg_base + PORT_PHY2); + writel(AHCI_PORT_PHY3_CFG, reg_base + PORT_PHY3); writel(AHCI_PORT_TRANS_CFG, reg_base + PORT_TRANS); if (qpriv->is_dmacoherent) writel(AHCI_PORT_AXICC_CFG, reg_base + PORT_AXICC); From 2467c0451ce5574738e223b93e3253c9a7015be1 Mon Sep 17 00:00:00 2001 From: Matthias Brugger Date: Fri, 1 Dec 2017 11:47:22 +0100 Subject: [PATCH 046/269] ahci: mtk: Change driver name to ahci-mtk The driver name "ahci" is already used by the ahci platform driver. This leads to the following error: Error: Driver 'ahci' is already registered, aborting... Change the name to ahci-mtk to fix this. Signed-off-by: Matthias Brugger Signed-off-by: Tejun Heo --- drivers/ata/ahci_mtk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/ahci_mtk.c b/drivers/ata/ahci_mtk.c index 489452a64303..0ae6971c2a4c 100644 --- a/drivers/ata/ahci_mtk.c +++ b/drivers/ata/ahci_mtk.c @@ -25,7 +25,7 @@ #include #include "ahci.h" -#define DRV_NAME "ahci" +#define DRV_NAME "ahci-mtk" #define SYS_CFG 0x14 #define SYS_CFG_SATA_MSK GENMASK(31, 30) From 2dc0b46b5ea30f169b0b272253ea846a5a281731 Mon Sep 17 00:00:00 2001 From: David Milburn Date: Tue, 14 Nov 2017 16:17:25 -0600 Subject: [PATCH 047/269] libata: sata_down_spd_limit should return if driver has not recorded sstatus speed During hotplug, it is possible for 6Gbps link speed to be limited all the way down to 1.5 Gbps which may lead to a slower link speed when drive is re-connected. This behavior has been seen on a Intel Lewisburg SATA controller (8086:a1d2) with HGST HUH728080ALE600 drive where SATA link speed was limited to 1.5 Gbps and when re-connected the link came up 3.0 Gbps. This patch was retested on above configuration and showed the hotplugged link to come back online at max speed (6Gbps). I did not see the downgrade when testing on Intel C600/X79, but retested patched linux-4.14-rc5 kernel and didn't see any side effects from this change. Also, successfully retested hotplug on port multiplier 3Gbps link. tj: Minor comment updates. Signed-off-by: David Milburn Signed-off-by: Tejun Heo --- drivers/ata/libata-core.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 2a882929de4a..8193b38a1cae 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -3082,13 +3082,19 @@ int sata_down_spd_limit(struct ata_link *link, u32 spd_limit) bit = fls(mask) - 1; mask &= ~(1 << bit); - /* Mask off all speeds higher than or equal to the current - * one. Force 1.5Gbps if current SPD is not available. + /* + * Mask off all speeds higher than or equal to the current one. At + * this point, if current SPD is not available and we previously + * recorded the link speed from SStatus, the driver has already + * masked off the highest bit so mask should already be 1 or 0. + * Otherwise, we should not force 1.5Gbps on a link where we have + * not previously recorded speed from SStatus. Just return in this + * case. */ if (spd > 1) mask &= (1 << (spd - 1)) - 1; else - mask &= 1; + return -EINVAL; /* were we already at the bottom? */ if (!mask) From 11db855c3d06e82f432cb1bafd73296586d5ceec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Dec 2017 14:41:11 -0800 Subject: [PATCH 048/269] Revert "cpuset: Make cpuset hotplug synchronous" This reverts commit 1599a185f0e6113be185b9fb809c621c73865829. This and the previous commit led to another circular locking scenario and the scenario which is fixed by this commit no longer exists after e8b3f8db7aad ("workqueue/hotplug: simplify workqueue_offline_cpu()") which removes work item flushing from hotplug path. Revert it for now. Signed-off-by: Tejun Heo --- include/linux/cpuset.h | 6 ++++++ kernel/cgroup/cpuset.c | 45 +++++++++++++++++++++--------------------- kernel/power/process.c | 2 ++ kernel/sched/core.c | 1 + 4 files changed, 32 insertions(+), 22 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 2ab910f85154..1b8e41597ef5 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -52,7 +52,9 @@ static inline void cpuset_dec(void) extern int cpuset_init(void); extern void cpuset_init_smp(void); +extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); +extern void cpuset_wait_for_hotplug(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern void cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); @@ -165,11 +167,15 @@ static inline bool cpusets_enabled(void) { return false; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} +static inline void cpuset_force_rebuild(void) { } + static inline void cpuset_update_active_cpus(void) { partition_sched_domains(1, NULL, NULL); } +static inline void cpuset_wait_for_hotplug(void) { } + static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 227bc25d951d..cab5fd1ee767 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2277,8 +2277,15 @@ retry: mutex_unlock(&cpuset_mutex); } +static bool force_rebuild; + +void cpuset_force_rebuild(void) +{ + force_rebuild = true; +} + /** - * cpuset_hotplug - handle CPU/memory hotunplug for a cpuset + * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -2293,7 +2300,7 @@ retry: * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. */ -static void cpuset_hotplug(bool use_cpu_hp_lock) +static void cpuset_hotplug_workfn(struct work_struct *work) { static cpumask_t new_cpus; static nodemask_t new_mems; @@ -2351,31 +2358,25 @@ static void cpuset_hotplug(bool use_cpu_hp_lock) } /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated) { - if (use_cpu_hp_lock) - rebuild_sched_domains(); - else { - /* Acquiring cpu_hotplug_lock is not required. - * When cpuset_hotplug() is called in hotplug path, - * cpu_hotplug_lock is held by the hotplug context - * which is waiting for cpuhp_thread_fun to indicate - * completion of callback. - */ - mutex_lock(&cpuset_mutex); - rebuild_sched_domains_cpuslocked(); - mutex_unlock(&cpuset_mutex); - } + if (cpus_updated || force_rebuild) { + force_rebuild = false; + rebuild_sched_domains(); } } -static void cpuset_hotplug_workfn(struct work_struct *work) -{ - cpuset_hotplug(true); -} - void cpuset_update_active_cpus(void) { - cpuset_hotplug(false); + /* + * We're inside cpu hotplug critical region which usually nests + * inside cgroup synchronization. Bounce actual hotplug processing + * to a work item to avoid reverse locking order. + */ + schedule_work(&cpuset_hotplug_work); +} + +void cpuset_wait_for_hotplug(void) +{ + flush_work(&cpuset_hotplug_work); } /* diff --git a/kernel/power/process.c b/kernel/power/process.c index c326d7235c5f..7381d49a44db 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -204,6 +204,8 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); + cpuset_wait_for_hotplug(); + read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 88b3450b29ab..75554f366fd3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5624,6 +5624,7 @@ static void cpuset_cpu_active(void) * restore the original sched domains by considering the * cpuset configurations. */ + cpuset_force_rebuild(); } cpuset_update_active_cpus(); } From e8b3f8db7aad99fcc5234fc5b89984ff6620de3d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 1 Dec 2017 22:20:36 +0800 Subject: [PATCH 049/269] workqueue/hotplug: simplify workqueue_offline_cpu() Since the recent cpu/hotplug refactoring, workqueue_offline_cpu() is guaranteed to run on the local cpu which is going offline. This also fixes the following deadlock by removing work item scheduling and flushing from CPU hotplug path. http://lkml.kernel.org/r/1504764252-29091-1-git-send-email-prsood@codeaurora.org tj: Description update. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6a5658cb46da..48a4d00f55dc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1635,7 +1635,7 @@ static void worker_enter_idle(struct worker *worker) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); /* - * Sanity check nr_running. Because wq_unbind_fn() releases + * Sanity check nr_running. Because unbind_workers() releases * pool->lock between setting %WORKER_UNBOUND and zapping * nr_running, the warning may trigger spuriously. Check iff * unbind is not in progress. @@ -4511,9 +4511,8 @@ void show_workqueue_state(void) * cpu comes back online. */ -static void wq_unbind_fn(struct work_struct *work) +static void unbind_workers(int cpu) { - int cpu = smp_processor_id(); struct worker_pool *pool; struct worker *worker; @@ -4710,12 +4709,13 @@ int workqueue_online_cpu(unsigned int cpu) int workqueue_offline_cpu(unsigned int cpu) { - struct work_struct unbind_work; struct workqueue_struct *wq; /* unbinding per-cpu workers should happen on the local CPU */ - INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); - queue_work_on(cpu, system_highpri_wq, &unbind_work); + if (WARN_ON(cpu != smp_processor_id())) + return -1; + + unbind_workers(cpu); /* update NUMA affinity of unbound workqueues */ mutex_lock(&wq_pool_mutex); @@ -4723,9 +4723,6 @@ int workqueue_offline_cpu(unsigned int cpu) wq_update_unbound_numa(wq, cpu, false); mutex_unlock(&wq_pool_mutex); - /* wait for per-cpu unbinding to finish */ - flush_work(&unbind_work); - destroy_work_on_stack(&unbind_work); return 0; } From 62408c1ef00784e8bcfc4848ade76480fb8aed21 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 1 Dec 2017 22:23:07 +0800 Subject: [PATCH 050/269] workqueue/hotplug: remove the workaround in rebind_workers() Since the cpu/hotplug refactoring, DOWN_FAILED is never called without preceding DOWN_PREPARE making the workaround unnecessary. Remove it. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 48a4d00f55dc..45ce93f3dd1f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4589,16 +4589,6 @@ static void rebind_workers(struct worker_pool *pool) spin_lock_irq(&pool->lock); - /* - * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED - * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is - * being reworked and this can go away in time. - */ - if (!(pool->flags & POOL_DISASSOCIATED)) { - spin_unlock_irq(&pool->lock); - return; - } - pool->flags &= ~POOL_DISASSOCIATED; for_each_pool_worker(worker, pool) { From bdfbbda90aeb75ce0951413fd7f495d4d377bd5e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Dec 2017 14:55:59 -0800 Subject: [PATCH 051/269] Revert "cgroup/cpuset: remove circular dependency deadlock" This reverts commit aa24163b2ee5c92120e32e99b5a93143a0f4258e. This and the following commit led to another circular locking scenario and the scenario which is fixed by this commit no longer exists after e8b3f8db7aad ("workqueue/hotplug: simplify workqueue_offline_cpu()") which removes work item flushing from hotplug path. Revert it for now. Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 53 ++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index cab5fd1ee767..f7efa7b4d825 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -812,18 +812,6 @@ done: return ndoms; } -static void cpuset_sched_change_begin(void) -{ - cpus_read_lock(); - mutex_lock(&cpuset_mutex); -} - -static void cpuset_sched_change_end(void) -{ - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); -} - /* * Rebuild scheduler domains. * @@ -833,14 +821,16 @@ static void cpuset_sched_change_end(void) * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * + * Call with cpuset_mutex held. Takes get_online_cpus(). */ -static void rebuild_sched_domains_cpuslocked(void) +static void rebuild_sched_domains_locked(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; lockdep_assert_held(&cpuset_mutex); + get_online_cpus(); /* * We have raced with CPU hotplug. Don't do anything to avoid @@ -848,25 +838,27 @@ static void rebuild_sched_domains_cpuslocked(void) * Anyways, hotplug work item will rebuild sched domains. */ if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - return; + goto out; /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); +out: + put_online_cpus(); } #else /* !CONFIG_SMP */ -static void rebuild_sched_domains_cpuslocked(void) +static void rebuild_sched_domains_locked(void) { } #endif /* CONFIG_SMP */ void rebuild_sched_domains(void) { - cpuset_sched_change_begin(); - rebuild_sched_domains_cpuslocked(); - cpuset_sched_change_end(); + mutex_lock(&cpuset_mutex); + rebuild_sched_domains_locked(); + mutex_unlock(&cpuset_mutex); } /** @@ -952,7 +944,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) rcu_read_unlock(); if (need_rebuild_sched_domains) - rebuild_sched_domains_cpuslocked(); + rebuild_sched_domains_locked(); } /** @@ -1284,7 +1276,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) cs->relax_domain_level = val; if (!cpumask_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) - rebuild_sched_domains_cpuslocked(); + rebuild_sched_domains_locked(); } return 0; @@ -1317,6 +1309,7 @@ static void update_tasks_flags(struct cpuset *cs) * * Call with cpuset_mutex held. */ + static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { @@ -1349,7 +1342,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, spin_unlock_irq(&callback_lock); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) - rebuild_sched_domains_cpuslocked(); + rebuild_sched_domains_locked(); if (spread_flag_changed) update_tasks_flags(cs); @@ -1617,7 +1610,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; - cpuset_sched_change_begin(); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) { retval = -ENODEV; goto out_unlock; @@ -1653,7 +1646,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - cpuset_sched_change_end(); + mutex_unlock(&cpuset_mutex); return retval; } @@ -1664,7 +1657,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; - cpuset_sched_change_begin(); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1677,7 +1670,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - cpuset_sched_change_end(); + mutex_unlock(&cpuset_mutex); return retval; } @@ -1716,7 +1709,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); - cpuset_sched_change_begin(); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1740,7 +1733,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_trial_cpuset(trialcs); out_unlock: - cpuset_sched_change_end(); + mutex_unlock(&cpuset_mutex); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); @@ -2041,14 +2034,14 @@ out_unlock: /* * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains_cpuslocked(). + * will call rebuild_sched_domains_locked(). */ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - cpuset_sched_change_begin(); + mutex_lock(&cpuset_mutex); if (is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); @@ -2056,7 +2049,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); - cpuset_sched_change_end(); + mutex_unlock(&cpuset_mutex); } static void cpuset_css_free(struct cgroup_subsys_state *css) From ba69ead9e9e9bb3cec5faf03526c36764ac8942a Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Mon, 27 Nov 2017 23:47:34 +0100 Subject: [PATCH 052/269] scsi: scsi_devinfo: handle non-terminated strings devinfo->vendor and devinfo->model aren't necessarily zero-terminated. Fixes: b8018b973c7c "scsi_devinfo: fixup string compare" Signed-off-by: Martin Wilck Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_devinfo.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/scsi_devinfo.c b/drivers/scsi/scsi_devinfo.c index 78d4aa8df675..b256d4cbd3ad 100644 --- a/drivers/scsi/scsi_devinfo.c +++ b/drivers/scsi/scsi_devinfo.c @@ -458,7 +458,8 @@ static struct scsi_dev_info_list *scsi_dev_info_list_find(const char *vendor, /* * vendor strings must be an exact match */ - if (vmax != strlen(devinfo->vendor) || + if (vmax != strnlen(devinfo->vendor, + sizeof(devinfo->vendor)) || memcmp(devinfo->vendor, vskip, vmax)) continue; @@ -466,7 +467,7 @@ static struct scsi_dev_info_list *scsi_dev_info_list_find(const char *vendor, * @model specifies the full string, and * must be larger or equal to devinfo->model */ - mlen = strlen(devinfo->model); + mlen = strnlen(devinfo->model, sizeof(devinfo->model)); if (mmax < mlen || memcmp(devinfo->model, mskip, mlen)) continue; return devinfo; From 81df022b688d43d2a3667518b2f755d384397910 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Mon, 27 Nov 2017 23:47:35 +0100 Subject: [PATCH 053/269] scsi: scsi_devinfo: cleanly zero-pad devinfo strings Cleanly fill memory for "vendor" and "model" with 0-bytes for the "compatible" case rather than adding only a single 0 byte. This simplifies the devinfo code a a bit, and avoids mistakes in other places of the code (not in current upstream, but we had one such mistake in the SUSE kernel). [mkp: applied by hand and added braces] Signed-off-by: Martin Wilck Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_devinfo.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/drivers/scsi/scsi_devinfo.c b/drivers/scsi/scsi_devinfo.c index b256d4cbd3ad..449ef5adbb2b 100644 --- a/drivers/scsi/scsi_devinfo.c +++ b/drivers/scsi/scsi_devinfo.c @@ -34,7 +34,6 @@ struct scsi_dev_info_list_table { }; -static const char spaces[] = " "; /* 16 of them */ static blist_flags_t scsi_default_dev_flags; static LIST_HEAD(scsi_dev_info_list); static char scsi_dev_flags[256]; @@ -298,20 +297,13 @@ static void scsi_strcpy_devinfo(char *name, char *to, size_t to_length, size_t from_length; from_length = strlen(from); - strncpy(to, from, min(to_length, from_length)); - if (from_length < to_length) { - if (compatible) { - /* - * NUL terminate the string if it is short. - */ - to[from_length] = '\0'; - } else { - /* - * space pad the string if it is short. - */ - strncpy(&to[from_length], spaces, - to_length - from_length); - } + /* This zero-pads the destination */ + strncpy(to, from, to_length); + if (from_length < to_length && !compatible) { + /* + * space pad the string if it is short. + */ + memset(&to[from_length], ' ', to_length - from_length); } if (from_length > to_length) printk(KERN_WARNING "%s: %s string '%s' is too long\n", From ca0168e8a77cf833f8c9ac1d26a3a4012bab4f72 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 5 Dec 2017 09:32:25 -0500 Subject: [PATCH 054/269] alloc_super(): do ->s_umount initialization earlier ... so that failure exits could count on it having been done. Signed-off-by: Al Viro --- fs/super.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/fs/super.c b/fs/super.c index d4e33e8f1e6f..7ff1349609e4 100644 --- a/fs/super.c +++ b/fs/super.c @@ -191,6 +191,24 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, INIT_LIST_HEAD(&s->s_mounts); s->s_user_ns = get_user_ns(user_ns); + init_rwsem(&s->s_umount); + lockdep_set_class(&s->s_umount, &type->s_umount_key); + /* + * sget() can have s_umount recursion. + * + * When it cannot find a suitable sb, it allocates a new + * one (this one), and tries again to find a suitable old + * one. + * + * In case that succeeds, it will acquire the s_umount + * lock of the old one. Since these are clearly distrinct + * locks, and this object isn't exposed yet, there's no + * risk of deadlocks. + * + * Annotate this by putting this lock in a different + * subclass. + */ + down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); if (security_sb_alloc(s)) goto fail; @@ -218,25 +236,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, goto fail; if (list_lru_init_memcg(&s->s_inode_lru)) goto fail; - - init_rwsem(&s->s_umount); - lockdep_set_class(&s->s_umount, &type->s_umount_key); - /* - * sget() can have s_umount recursion. - * - * When it cannot find a suitable sb, it allocates a new - * one (this one), and tries again to find a suitable old - * one. - * - * In case that succeeds, it will acquire the s_umount - * lock of the old one. Since these are clearly distrinct - * locks, and this object isn't exposed yet, there's no - * risk of deadlocks. - * - * Annotate this by putting this lock in a different - * subclass. - */ - down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); s->s_count = 1; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); From be17f1ce8572d6e15559897421fb7041360bb64a Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 30 Nov 2017 15:49:10 +0100 Subject: [PATCH 055/269] mmc: core: properly init drv_type When the latest version of parsing the new eMMC bindings was moved from core.c to mmc.c, it was overlooked that drv_type could be used uninitialized. Fix it! Fixes: 6186d06c519e21 ("mmc: parse new binding for eMMC fixed driver type") Reported-by: Colin Ian King Reported-by: Dan Carpenter Signed-off-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/core/mmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index d209fb466979..208a762b87ef 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1290,7 +1290,7 @@ out_err: static void mmc_select_driver_type(struct mmc_card *card) { - int card_drv_type, drive_strength, drv_type; + int card_drv_type, drive_strength, drv_type = 0; int fixed_drv_type = card->host->fixed_drv_type; card_drv_type = card->ext_csd.raw_driver_strength | From c2f31b79d510ec1a27138bdcf2d0ece1080be85e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Dec 2017 09:10:17 -0800 Subject: [PATCH 056/269] cgroup: add warning about RT not being supported on cgroup2 We haven't yet figured out what to do with RT threads on cgroup2. Document the limitation. v2: Included the warning about system management software behavior as suggested by Michael. Signed-off-by: Tejun Heo Reported-by: "Michael Kerrisk (man-pages)" --- Documentation/cgroup-v2.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 779211fbb69f..2cddab7efb20 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -898,6 +898,13 @@ controller implements weight and absolute bandwidth limit models for normal scheduling policy and absolute bandwidth allocation model for realtime scheduling policy. +WARNING: cgroup2 doesn't yet support control of realtime processes and +the cpu controller can only be enabled when all RT processes are in +the root cgroup. Be aware that system management software may already +have placed RT processes into nonroot cgroups during the system boot +process, and these processes may need to be moved to the root cgroup +before the cpu controller can be enabled. + CPU Interface Files ~~~~~~~~~~~~~~~~~~~ From 71334963d01ed7ec61a958a5a6585172793f5a24 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 5 Dec 2017 11:27:59 +0100 Subject: [PATCH 057/269] wireless: replace usage of hexdump with od/sed Since od/sed are in posix, hopefully there's a better chance people will have them, over hexdump. Fixes: 90a53e4432b1 ("cfg80211: implement regdb signature checking") Signed-off-by: Johannes Berg --- net/wireless/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 278d979c211a..63cbb6432b2d 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -27,7 +27,7 @@ $(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.x509) @$(kecho) " GEN $@" @echo '#include "reg.h"' > $@ @echo 'const u8 shipped_regdb_certs[] = {' >> $@ - @for f in $^ ; do hexdump -v -e '1/1 "0x%.2x," "\n"' < $$f >> $@ ; done + @for f in $^ ; do od -An -v -tx1 < $$f | sed -e 's/ /\n/g' | sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | sed -e 's/^/0x/;s/$$/,/' >> $@ ; done @echo '};' >> $@ @echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);' >> $@ @@ -36,6 +36,6 @@ $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \ @$(kecho) " GEN $@" @echo '#include "reg.h"' > $@ @echo 'const u8 extra_regdb_certs[] = {' >> $@ - @for f in $^ ; do test -f $$f && hexdump -v -e '1/1 "0x%.2x," "\n"' < $$f >> $@ || true ; done + @for f in $^ ; do test -f $$f && od -An -v -tx1 < $$f | sed -e 's/ /\n/g' | sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | sed -e 's/^/0x/;s/$$/,/' >> $@ ; done @echo '};' >> $@ @echo 'unsigned int extra_regdb_certs_len = sizeof(extra_regdb_certs);' >> $@ From 715a12334764657bafb3ab964fb25f4e6115c770 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 5 Dec 2017 11:59:33 +0100 Subject: [PATCH 058/269] wireless: don't write C files on failures Change the scripting inside the shipped/extra certs C code generation to not write the file when there are any failures. That way, if the build aborts due to failures, we don't get into a situation where a dummy file has been created and the next build succeeds, but not with the desired output. Fixes: 90a53e4432b1 ("cfg80211: implement regdb signature checking") Signed-off-by: Johannes Berg --- net/wireless/Makefile | 48 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 63cbb6432b2d..d7d6cb00c47b 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -25,17 +25,45 @@ endif $(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.x509) @$(kecho) " GEN $@" - @echo '#include "reg.h"' > $@ - @echo 'const u8 shipped_regdb_certs[] = {' >> $@ - @for f in $^ ; do od -An -v -tx1 < $$f | sed -e 's/ /\n/g' | sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | sed -e 's/^/0x/;s/$$/,/' >> $@ ; done - @echo '};' >> $@ - @echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);' >> $@ + @(set -e; \ + allf=""; \ + for f in $^ ; do \ + # similar to hexdump -v -e '1/1 "0x%.2x," "\n"' \ + thisf=$$(od -An -v -tx1 < $$f | \ + sed -e 's/ /\n/g' | \ + sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | \ + sed -e 's/^/0x/;s/$$/,/'); \ + # file should not be empty - maybe command substitution failed? \ + test ! -z "$$thisf";\ + allf=$$allf$$thisf;\ + done; \ + ( \ + echo '#include "reg.h"'; \ + echo 'const u8 shipped_regdb_certs[] = {'; \ + echo "$$allf"; \ + echo '};'; \ + echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);'; \ + ) >> $@) $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \ $(wildcard $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%)/*.x509) @$(kecho) " GEN $@" - @echo '#include "reg.h"' > $@ - @echo 'const u8 extra_regdb_certs[] = {' >> $@ - @for f in $^ ; do test -f $$f && od -An -v -tx1 < $$f | sed -e 's/ /\n/g' | sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | sed -e 's/^/0x/;s/$$/,/' >> $@ ; done - @echo '};' >> $@ - @echo 'unsigned int extra_regdb_certs_len = sizeof(extra_regdb_certs);' >> $@ + @(set -e; \ + allf=""; \ + for f in $^ ; do \ + # similar to hexdump -v -e '1/1 "0x%.2x," "\n"' \ + thisf=$$(od -An -v -tx1 < $$f | \ + sed -e 's/ /\n/g' | \ + sed -e 's/^[0-9a-f]\+$$/\0/;t;d' | \ + sed -e 's/^/0x/;s/$$/,/'); \ + # file should not be empty - maybe command substitution failed? \ + test ! -z "$$thisf";\ + allf=$$allf$$thisf;\ + done; \ + ( \ + echo '#include "reg.h"'; \ + echo 'const u8 extra_regdb_certs[] = {'; \ + echo "$$allf"; \ + echo '};'; \ + echo 'unsigned int extra_regdb_certs_len = sizeof(extra_regdb_certs);'; \ + ) >> $@) From 916a27901de01446bcf57ecca4783f6cff493309 Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Tue, 5 Dec 2017 15:42:41 -0800 Subject: [PATCH 059/269] netfilter: xt_osf: Add missing permission checks The capability check in nfnetlink_rcv() verifies that the caller has CAP_NET_ADMIN in the namespace that "owns" the netlink socket. However, xt_osf_fingers is shared by all net namespaces on the system. An unprivileged user can create user and net namespaces in which he holds CAP_NET_ADMIN to bypass the netlink_net_capable() check: vpnns -- nfnl_osf -f /tmp/pf.os vpnns -- nfnl_osf -f /tmp/pf.os -d These non-root operations successfully modify the systemwide OS fingerprint list. Add new capable() checks so that they can't. Signed-off-by: Kevin Cernekee Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_osf.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index 36e14b1f061d..a34f314a8c23 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -70,6 +71,9 @@ static int xt_osf_add_callback(struct net *net, struct sock *ctnl, struct xt_osf_finger *kf = NULL, *sf; int err = 0; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; @@ -115,6 +119,9 @@ static int xt_osf_remove_callback(struct net *net, struct sock *ctnl, struct xt_osf_finger *sf; int err = -ENOENT; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; From 7f6d2ecd3d7acaf205ea7b3e96f9ffc55b92298b Mon Sep 17 00:00:00 2001 From: David Lechner Date: Sun, 3 Dec 2017 19:54:41 -0600 Subject: [PATCH 060/269] eeprom: at24: change nvmem stride to 1 Trying to read the MAC address from an eeprom that has an offset that is not a multiple of 4 causes an error currently. Fix it by changing the nvmem stride to 1. Cc: stable@vger.kernel.org Signed-off-by: David Lechner [Bartosz: tweaked the commit message] Signed-off-by: Bartosz Golaszewski --- drivers/misc/eeprom/at24.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/eeprom/at24.c b/drivers/misc/eeprom/at24.c index 20b4f26d30d7..4d63ac8a82e0 100644 --- a/drivers/misc/eeprom/at24.c +++ b/drivers/misc/eeprom/at24.c @@ -876,7 +876,7 @@ static int at24_probe(struct i2c_client *client, const struct i2c_device_id *id) at24->nvmem_config.reg_read = at24_read; at24->nvmem_config.reg_write = at24_write; at24->nvmem_config.priv = at24; - at24->nvmem_config.stride = 4; + at24->nvmem_config.stride = 1; at24->nvmem_config.word_size = 1; at24->nvmem_config.size = chip.byte_len; From 4f7f5551a760eb0124267be65763008169db7087 Mon Sep 17 00:00:00 2001 From: Masamitsu Yamazaki Date: Wed, 15 Nov 2017 07:33:14 +0000 Subject: [PATCH 061/269] ipmi: Stop timers before cleaning up the module System may crash after unloading ipmi_si.ko module because a timer may remain and fire after the module cleaned up resources. cleanup_one_si() contains the following processing. /* * Make sure that interrupts, the timer and the thread are * stopped and will not run again. */ if (to_clean->irq_cleanup) to_clean->irq_cleanup(to_clean); wait_for_timer_and_thread(to_clean); /* * Timeouts are stopped, now make sure the interrupts are off * in the BMC. Note that timers and CPU interrupts are off, * so no need for locks. */ while (to_clean->curr_msg || (to_clean->si_state != SI_NORMAL)) { poll(to_clean); schedule_timeout_uninterruptible(1); } si_state changes as following in the while loop calling poll(to_clean). SI_GETTING_MESSAGES => SI_CHECKING_ENABLES => SI_SETTING_ENABLES => SI_GETTING_EVENTS => SI_NORMAL As written in the code comments above, timers are expected to stop before the polling loop and not to run again. But the timer is set again in the following process when si_state becomes SI_SETTING_ENABLES. => poll => smi_event_handler => handle_transaction_done // smi_info->si_state == SI_SETTING_ENABLES => start_getting_events => start_new_msg => smi_mod_timer => mod_timer As a result, before the timer set in start_new_msg() expires, the polling loop may see si_state becoming SI_NORMAL and the module clean-up finishes. For example, hard LOCKUP and panic occurred as following. smi_timeout was called after smi_event_handler, kcs_event and hangs at port_inb() trying to access I/O port after release. [exception RIP: port_inb+19] RIP: ffffffffc0473053 RSP: ffff88069fdc3d80 RFLAGS: 00000006 RAX: ffff8806800f8e00 RBX: ffff880682bd9400 RCX: 0000000000000000 RDX: 0000000000000ca3 RSI: 0000000000000ca3 RDI: ffff8806800f8e40 RBP: ffff88069fdc3d80 R8: ffffffff81d86dfc R9: ffffffff81e36426 R10: 00000000000509f0 R11: 0000000000100000 R12: 0000000000]:000000 R13: 0000000000000000 R14: 0000000000000246 R15: ffff8806800f8e00 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0000 --- --- To fix the problem I defined a flag, timer_can_start, as member of struct smi_info. The flag is enabled immediately after initializing the timer and disabled immediately before waiting for timer deletion. Fixes: 0cfec916e86d ("ipmi: Start the timer and thread on internal msgs") Signed-off-by: Yamazaki Masamitsu [Adjusted for recent changes in the driver.] Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_si_intf.c | 44 +++++++++++++++++--------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 71d33a1807e4..99b0513bb55b 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -199,6 +199,9 @@ struct smi_info { /* The timer for this si. */ struct timer_list si_timer; + /* This flag is set, if the timer can be set */ + bool timer_can_start; + /* This flag is set, if the timer is running (timer_pending() isn't enough) */ bool timer_running; @@ -355,6 +358,8 @@ out: static void smi_mod_timer(struct smi_info *smi_info, unsigned long new_val) { + if (!smi_info->timer_can_start) + return; smi_info->last_timeout_jiffies = jiffies; mod_timer(&smi_info->si_timer, new_val); smi_info->timer_running = true; @@ -374,21 +379,18 @@ static void start_new_msg(struct smi_info *smi_info, unsigned char *msg, smi_info->handlers->start_transaction(smi_info->si_sm, msg, size); } -static void start_check_enables(struct smi_info *smi_info, bool start_timer) +static void start_check_enables(struct smi_info *smi_info) { unsigned char msg[2]; msg[0] = (IPMI_NETFN_APP_REQUEST << 2); msg[1] = IPMI_GET_BMC_GLOBAL_ENABLES_CMD; - if (start_timer) - start_new_msg(smi_info, msg, 2); - else - smi_info->handlers->start_transaction(smi_info->si_sm, msg, 2); + start_new_msg(smi_info, msg, 2); smi_info->si_state = SI_CHECKING_ENABLES; } -static void start_clear_flags(struct smi_info *smi_info, bool start_timer) +static void start_clear_flags(struct smi_info *smi_info) { unsigned char msg[3]; @@ -397,10 +399,7 @@ static void start_clear_flags(struct smi_info *smi_info, bool start_timer) msg[1] = IPMI_CLEAR_MSG_FLAGS_CMD; msg[2] = WDT_PRE_TIMEOUT_INT; - if (start_timer) - start_new_msg(smi_info, msg, 3); - else - smi_info->handlers->start_transaction(smi_info->si_sm, msg, 3); + start_new_msg(smi_info, msg, 3); smi_info->si_state = SI_CLEARING_FLAGS; } @@ -435,11 +434,11 @@ static void start_getting_events(struct smi_info *smi_info) * Note that we cannot just use disable_irq(), since the interrupt may * be shared. */ -static inline bool disable_si_irq(struct smi_info *smi_info, bool start_timer) +static inline bool disable_si_irq(struct smi_info *smi_info) { if ((smi_info->io.irq) && (!smi_info->interrupt_disabled)) { smi_info->interrupt_disabled = true; - start_check_enables(smi_info, start_timer); + start_check_enables(smi_info); return true; } return false; @@ -449,7 +448,7 @@ static inline bool enable_si_irq(struct smi_info *smi_info) { if ((smi_info->io.irq) && (smi_info->interrupt_disabled)) { smi_info->interrupt_disabled = false; - start_check_enables(smi_info, true); + start_check_enables(smi_info); return true; } return false; @@ -467,7 +466,7 @@ static struct ipmi_smi_msg *alloc_msg_handle_irq(struct smi_info *smi_info) msg = ipmi_alloc_smi_msg(); if (!msg) { - if (!disable_si_irq(smi_info, true)) + if (!disable_si_irq(smi_info)) smi_info->si_state = SI_NORMAL; } else if (enable_si_irq(smi_info)) { ipmi_free_smi_msg(msg); @@ -483,7 +482,7 @@ retry: /* Watchdog pre-timeout */ smi_inc_stat(smi_info, watchdog_pretimeouts); - start_clear_flags(smi_info, true); + start_clear_flags(smi_info); smi_info->msg_flags &= ~WDT_PRE_TIMEOUT_INT; if (smi_info->intf) ipmi_smi_watchdog_pretimeout(smi_info->intf); @@ -866,7 +865,7 @@ restart: * disable and messages disabled. */ if (smi_info->supports_event_msg_buff || smi_info->io.irq) { - start_check_enables(smi_info, true); + start_check_enables(smi_info); } else { smi_info->curr_msg = alloc_msg_handle_irq(smi_info); if (!smi_info->curr_msg) @@ -1167,6 +1166,7 @@ static int smi_start_processing(void *send_info, /* Set up the timer that drives the interface. */ setup_timer(&new_smi->si_timer, smi_timeout, (long)new_smi); + new_smi->timer_can_start = true; smi_mod_timer(new_smi, jiffies + SI_TIMEOUT_JIFFIES); /* Try to claim any interrupts. */ @@ -1936,10 +1936,12 @@ static void check_for_broken_irqs(struct smi_info *smi_info) check_set_rcv_irq(smi_info); } -static inline void wait_for_timer_and_thread(struct smi_info *smi_info) +static inline void stop_timer_and_thread(struct smi_info *smi_info) { if (smi_info->thread != NULL) kthread_stop(smi_info->thread); + + smi_info->timer_can_start = false; if (smi_info->timer_running) del_timer_sync(&smi_info->si_timer); } @@ -2152,7 +2154,7 @@ static int try_smi_init(struct smi_info *new_smi) * Start clearing the flags before we enable interrupts or the * timer to avoid racing with the timer. */ - start_clear_flags(new_smi, false); + start_clear_flags(new_smi); /* * IRQ is defined to be set when non-zero. req_events will @@ -2238,7 +2240,7 @@ out_err_remove_attrs: dev_set_drvdata(new_smi->io.dev, NULL); out_err_stop_timer: - wait_for_timer_and_thread(new_smi); + stop_timer_and_thread(new_smi); out_err: new_smi->interrupt_disabled = true; @@ -2388,7 +2390,7 @@ static void cleanup_one_si(struct smi_info *to_clean) */ if (to_clean->io.irq_cleanup) to_clean->io.irq_cleanup(&to_clean->io); - wait_for_timer_and_thread(to_clean); + stop_timer_and_thread(to_clean); /* * Timeouts are stopped, now make sure the interrupts are off @@ -2400,7 +2402,7 @@ static void cleanup_one_si(struct smi_info *to_clean) schedule_timeout_uninterruptible(1); } if (to_clean->handlers) - disable_si_irq(to_clean, false); + disable_si_irq(to_clean); while (to_clean->curr_msg || (to_clean->si_state != SI_NORMAL)) { poll(to_clean); schedule_timeout_uninterruptible(1); From aece09024414b54158e03aa45f4a4436e7cb996c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 5 Dec 2017 17:37:17 +0300 Subject: [PATCH 062/269] staging: ccree: Uninitialized return in ssi_ahash_import() The return value isn't initialized on some success paths. Fixes: c5f39d07860c ("staging: ccree: fix leak of import() after init()") Signed-off-by: Dan Carpenter Signed-off-by: Greg Kroah-Hartman --- drivers/staging/ccree/ssi_hash.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/ccree/ssi_hash.c b/drivers/staging/ccree/ssi_hash.c index 1799d3f26a9e..2035835b62dc 100644 --- a/drivers/staging/ccree/ssi_hash.c +++ b/drivers/staging/ccree/ssi_hash.c @@ -1769,7 +1769,7 @@ static int ssi_ahash_import(struct ahash_request *req, const void *in) struct device *dev = drvdata_to_dev(ctx->drvdata); struct ahash_req_ctx *state = ahash_request_ctx(req); u32 tmp; - int rc; + int rc = 0; memcpy(&tmp, in, sizeof(u32)); if (tmp != CC_EXPORT_MAGIC) { From 202fc673c626e4ffe6b888c469b248ecc6d50265 Mon Sep 17 00:00:00 2001 From: Marcus Wolf Date: Wed, 8 Nov 2017 19:13:56 +0200 Subject: [PATCH 063/269] staging: pi433: Fixes issue with bit shift in rf69_get_modulation Fixes issue with bit shift in rf69_get_modulation Signed-off-by: Marcus Wolf Signed-off-by: Greg Kroah-Hartman --- drivers/staging/pi433/rf69.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/pi433/rf69.c b/drivers/staging/pi433/rf69.c index e69a2153c999..12c9df9cddde 100644 --- a/drivers/staging/pi433/rf69.c +++ b/drivers/staging/pi433/rf69.c @@ -102,7 +102,7 @@ enum modulation rf69_get_modulation(struct spi_device *spi) currentValue = READ_REG(REG_DATAMODUL); - switch (currentValue & MASK_DATAMODUL_MODULATION_TYPE >> 3) { // TODO improvement: change 3 to define + switch (currentValue & MASK_DATAMODUL_MODULATION_TYPE) { case DATAMODUL_MODULATION_TYPE_OOK: return OOK; case DATAMODUL_MODULATION_TYPE_FSK: return FSK; default: return undefined; From a821df3f1af72aa6a0d573eea94a7dd2613e9f4e Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 21 Nov 2017 09:36:33 +1100 Subject: [PATCH 064/269] cifs: fix NULL deref in SMB2_read Signed-off-by: Ronnie Sahlberg Reviewed-by: Pavel Shilovsky CC: Stable Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 5331631386a2..01346b8b6edb 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2678,27 +2678,27 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, cifs_small_buf_release(req); rsp = (struct smb2_read_rsp *)rsp_iov.iov_base; - shdr = get_sync_hdr(rsp); - - if (shdr->Status == STATUS_END_OF_FILE) { - free_rsp_buf(resp_buftype, rsp_iov.iov_base); - return 0; - } if (rc) { - cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); - cifs_dbg(VFS, "Send error in read = %d\n", rc); - } else { - *nbytes = le32_to_cpu(rsp->DataLength); - if ((*nbytes > CIFS_MAX_MSGSIZE) || - (*nbytes > io_parms->length)) { - cifs_dbg(FYI, "bad length %d for count %d\n", - *nbytes, io_parms->length); - rc = -EIO; - *nbytes = 0; + if (rc != -ENODATA) { + cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); + cifs_dbg(VFS, "Send error in read = %d\n", rc); } + free_rsp_buf(resp_buftype, rsp_iov.iov_base); + return rc == -ENODATA ? 0 : rc; } + *nbytes = le32_to_cpu(rsp->DataLength); + if ((*nbytes > CIFS_MAX_MSGSIZE) || + (*nbytes > io_parms->length)) { + cifs_dbg(FYI, "bad length %d for count %d\n", + *nbytes, io_parms->length); + rc = -EIO; + *nbytes = 0; + } + + shdr = get_sync_hdr(rsp); + if (*buf) { memcpy(*buf, (char *)shdr + rsp->DataOffset, *nbytes); free_rsp_buf(resp_buftype, rsp_iov.iov_base); From 5702591fc6a3f409f460def104ee149330dac82d Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Tue, 21 Nov 2017 14:47:56 +0100 Subject: [PATCH 065/269] CIFS: don't log STATUS_NOT_FOUND errors for DFS cifs.ko makes DFS queries regardless of the type of the server and non-DFS servers are common. This often results in superfluous logging of non-critical errors. Signed-off-by: Aurelien Aptel Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg --- fs/cifs/smb2ops.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index e06740436b92..ed88ab8a4774 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1406,7 +1406,8 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, } while (rc == -EAGAIN); if (rc) { - cifs_dbg(VFS, "ioctl error in smb2_get_dfs_refer rc=%d\n", rc); + if (rc != -ENOENT) + cifs_dbg(VFS, "ioctl error in smb2_get_dfs_refer rc=%d\n", rc); goto out; } From 7912af5c835bd86f2b0347a480e0f40e2fab30d0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 6 Dec 2017 14:55:05 -0600 Subject: [PATCH 066/269] PCI: Add pci_get_domain_bus_and_slot() stub The coretemp driver build fails when CONFIG_PCI is not enabled because it uses a function that does not have a stub for that config case, so add the function stub. ../drivers/hwmon/coretemp.c: In function 'adjust_tjmax': ../drivers/hwmon/coretemp.c:250:9: error: implicit declaration of function 'pci_get_domain_bus_and_slot' [-Werror=implicit-function-declaration] struct pci_dev *host_bridge = pci_get_domain_bus_and_slot(0, 0, devfn); ../drivers/hwmon/coretemp.c:250:32: warning: initialization makes pointer from integer without a cast [enabled by default] struct pci_dev *host_bridge = pci_get_domain_bus_and_slot(0, 0, devfn); Signed-off-by: Randy Dunlap [bhelgaas: identical patch also by Arnd Bergmann ] Signed-off-by: Bjorn Helgaas Acked-by: Guenter Roeck --- include/linux/pci.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/pci.h b/include/linux/pci.h index 0403894147a3..c170c9250c8b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1674,6 +1674,9 @@ static inline struct pci_dev *pci_get_slot(struct pci_bus *bus, static inline struct pci_dev *pci_get_bus_and_slot(unsigned int bus, unsigned int devfn) { return NULL; } +static inline struct pci_dev *pci_get_domain_bus_and_slot(int domain, + unsigned int bus, unsigned int devfn) +{ return NULL; } static inline int pci_domain_nr(struct pci_bus *bus) { return 0; } static inline struct pci_dev *pci_dev_get(struct pci_dev *dev) { return NULL; } From 470195f82e4ea550b7c37736a12bf3fa565295ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Wed, 29 Nov 2017 15:12:27 +0100 Subject: [PATCH 067/269] x86/PCI: Fix infinite loop in search for 64bit BAR placement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Break the loop if we can't find some address space for a 64bit BAR. Signed-off-by: Christian König Signed-off-by: Bjorn Helgaas --- arch/x86/pci/fixup.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 1e996df687a3..5328e86f73eb 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -696,8 +696,13 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) res->end = 0xfd00000000ull - 1; /* Just grab the free area behind system memory for this */ - while ((conflict = request_resource_conflict(&iomem_resource, res))) + while ((conflict = request_resource_conflict(&iomem_resource, res))) { + if (conflict->end >= res->end) { + kfree(res); + return; + } res->start = conflict->end + 1; + } dev_info(&dev->dev, "adding root bus resource %pR\n", res); From a19e2696135efb471981c1ae1ec3cb2b70c41a2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Wed, 29 Nov 2017 15:12:28 +0100 Subject: [PATCH 068/269] x86/PCI: Only enable a 64bit BAR on single-socket AMD Family 15h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we have a multi-socket system, each CPU core needs the same setup. Since this is tricky to do in the fixup code, don't enable a 64bit BAR on multi-socket systems for now. Signed-off-by: Christian König Signed-off-by: Bjorn Helgaas --- arch/x86/pci/fixup.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 5328e86f73eb..e663d6bf1328 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -665,6 +665,16 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) unsigned i; u32 base, limit, high; struct resource *res, *conflict; + struct pci_dev *other; + + /* Check that we are the only device of that type */ + other = pci_get_device(dev->vendor, dev->device, NULL); + if (other != dev || + (other = pci_get_device(dev->vendor, dev->device, other))) { + /* This is a multi-socket system, don't touch it for now */ + pci_dev_put(other); + return; + } for (i = 0; i < 8; i++) { pci_read_config_dword(dev, AMD_141b_MMIO_BASE(i), &base); @@ -719,10 +729,10 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) pci_bus_add_resource(dev->bus, res, 0); } -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x1401, pci_amd_enable_64bit_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x141b, pci_amd_enable_64bit_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x1571, pci_amd_enable_64bit_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1401, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x141b, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1571, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar); #endif From 947134d9b00f342415af7eddd42a5fce7262a1b9 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Mon, 4 Dec 2017 11:45:21 -0500 Subject: [PATCH 069/269] x86/smpboot: Do not use smp_num_siblings in __max_logical_packages calculation Documentation/x86/topology.txt defines smp_num_siblings as "The number of threads in a core". Since commit bbb65d2d365e ("x86: use cpuid vector 0xb when available for detecting cpu topology") smp_num_siblings is the maximum number of threads in a core. If Simultaneous MultiThreading (SMT) is disabled on a system, smp_num_siblings is 2 and not 1 as expected. Use topology_max_smt_threads(), which contains the active numer of threads, in the __max_logical_packages calculation. On a single socket, single core, single thread system __max_smt_threads has not been updated when the __max_logical_packages calculation happens, so its zero which makes the package estimate fail. Initialize it to one, which is the minimum number of threads on a core. [ tglx: Folded the __max_smt_threads fix in ] Fixes: b4c0a7326f5d ("x86/smpboot: Fix __max_logical_packages estimate") Reported-by: Jakub Kicinski Signed-off-by: Prarit Bhargava Tested-by: Jakub Kicinski Cc: netdev@vger.kernel.org Cc: "netdev@vger.kernel.org" Cc: Clark Williams Link: https://lkml.kernel.org/r/20171204164521.17870-1-prarit@redhat.com --- arch/x86/kernel/smpboot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 05a97d5fe298..35cb20994e32 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -106,7 +106,7 @@ EXPORT_SYMBOL(__max_logical_packages); static unsigned int logical_packages __read_mostly; /* Maximum number of SMT threads on any online core */ -int __max_smt_threads __read_mostly; +int __read_mostly __max_smt_threads = 1; /* Flag to indicate if a complete sched domain rebuild is required */ bool x86_topology_update; @@ -1304,7 +1304,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) * Today neither Intel nor AMD support heterogenous systems so * extrapolate the boot cpu's data to all packages. */ - ncpus = cpu_data(0).booted_cores * smp_num_siblings; + ncpus = cpu_data(0).booted_cores * topology_max_smt_threads(); __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus); pr_info("Max logical packages: %u\n", __max_logical_packages); From 08529078d8d9adf689bf39cc38d53979a0869970 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 4 Dec 2017 15:40:55 +0300 Subject: [PATCH 070/269] x86/boot/compressed/64: Detect and handle 5-level paging at boot-time Prerequisite for fixing the current problem of instantaneous reboots when a 5-level paging kernel is booted on 4-level paging hardware. At the same time this change prepares the decompression code to boot-time switching between 4- and 5-level paging. [ tglx: Folded the GCC < 5 fix. ] Fixes: 77ef56e4f0fb ("x86: Enable 5-level paging support via CONFIG_X86_5LEVEL=y") Signed-off-by: Kirill A. Shutemov Signed-off-by: Thomas Gleixner Cc: Andi Kleen Cc: stable@vger.kernel.org Cc: Andy Lutomirski Cc: linux-mm@kvack.org Cc: Cyrill Gorcunov Cc: Borislav Petkov Cc: Linus Torvalds Link: https://lkml.kernel.org/r/20171204124059.63515-2-kirill.shutemov@linux.intel.com --- arch/x86/boot/compressed/Makefile | 1 + arch/x86/boot/compressed/head_64.S | 16 +++++++++++---- arch/x86/boot/compressed/pgtable_64.c | 28 +++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 4 deletions(-) create mode 100644 arch/x86/boot/compressed/pgtable_64.c diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 1e9c322e973a..f25e1530e064 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -80,6 +80,7 @@ vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o ifdef CONFIG_X86_64 vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o vmlinux-objs-y += $(obj)/mem_encrypt.o + vmlinux-objs-y += $(obj)/pgtable_64.o endif $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 20919b4f3133..fc313e29fe2c 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -305,10 +305,18 @@ ENTRY(startup_64) leaq boot_stack_end(%rbx), %rsp #ifdef CONFIG_X86_5LEVEL - /* Check if 5-level paging has already enabled */ - movq %cr4, %rax - testl $X86_CR4_LA57, %eax - jnz lvl5 + /* + * Check if we need to enable 5-level paging. + * RSI holds real mode data and need to be preserved across + * a function call. + */ + pushq %rsi + call l5_paging_required + popq %rsi + + /* If l5_paging_required() returned zero, we're done here. */ + cmpq $0, %rax + je lvl5 /* * At this point we are in long mode with 4-level paging enabled, diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c new file mode 100644 index 000000000000..b4469a37e9a1 --- /dev/null +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -0,0 +1,28 @@ +#include + +/* + * __force_order is used by special_insns.h asm code to force instruction + * serialization. + * + * It is not referenced from the code, but GCC < 5 with -fPIE would fail + * due to an undefined symbol. Define it to make these ancient GCCs work. + */ +unsigned long __force_order; + +int l5_paging_required(void) +{ + /* Check if leaf 7 is supported. */ + + if (native_cpuid_eax(0) < 7) + return 0; + + /* Check if la57 is supported. */ + if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) + return 0; + + /* Check if 5-level paging has already been enabled. */ + if (native_read_cr4() & X86_CR4_LA57) + return 0; + + return 1; +} From 6d7e0ba2d2be9e50cccba213baf07e0e183c1b24 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 4 Dec 2017 15:40:56 +0300 Subject: [PATCH 071/269] x86/boot/compressed/64: Print error if 5-level paging is not supported If the machine does not support the paging mode for which the kernel was compiled, the boot process cannot continue. It's not possible to let the kernel detect the mismatch as it does not even reach the point where cpu features can be evaluted due to a triple fault in the KASLR setup. Instead of instantaneous silent reboot, emit an error message which gives the user the information why the boot fails. Fixes: 77ef56e4f0fb ("x86: Enable 5-level paging support via CONFIG_X86_5LEVEL=y") Reported-by: Borislav Petkov Signed-off-by: Kirill A. Shutemov Signed-off-by: Thomas Gleixner Tested-by: Borislav Petkov Cc: Andi Kleen Cc: stable@vger.kernel.org Cc: Andy Lutomirski Cc: linux-mm@kvack.org Cc: Cyrill Gorcunov Cc: Linus Torvalds Link: https://lkml.kernel.org/r/20171204124059.63515-3-kirill.shutemov@linux.intel.com --- arch/x86/boot/compressed/misc.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index b50c42455e25..98761a1576ce 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -169,6 +169,16 @@ void __puthex(unsigned long value) } } +static bool l5_supported(void) +{ + /* Check if leaf 7 is supported. */ + if (native_cpuid_eax(0) < 7) + return 0; + + /* Check if la57 is supported. */ + return native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)); +} + #if CONFIG_X86_NEED_RELOCS static void handle_relocations(void *output, unsigned long output_len, unsigned long virt_addr) @@ -362,6 +372,12 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, console_init(); debug_putstr("early console in extract_kernel\n"); + if (IS_ENABLED(CONFIG_X86_5LEVEL) && !l5_supported()) { + error("This linux kernel as configured requires 5-level paging\n" + "This CPU does not support the required 'cr4.la57' feature\n" + "Unable to boot - please use a kernel appropriate for your CPU\n"); + } + free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; From 04271ce9601f1686db480ea11ea1848394d9e6a2 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Tue, 28 Nov 2017 15:55:07 +0530 Subject: [PATCH 072/269] i2c-cht-wc: constify platform_device_id platform_device_id are not supposed to change at runtime. All functions working with platform_device_id provided by work with const platform_device_id. So mark the non-const structs as const. Signed-off-by: Arvind Yadav Reviewed-by: Hans de Goede Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-cht-wc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-cht-wc.c b/drivers/i2c/busses/i2c-cht-wc.c index 0d05dadb2dc5..44cffad43701 100644 --- a/drivers/i2c/busses/i2c-cht-wc.c +++ b/drivers/i2c/busses/i2c-cht-wc.c @@ -379,7 +379,7 @@ static int cht_wc_i2c_adap_i2c_remove(struct platform_device *pdev) return 0; } -static struct platform_device_id cht_wc_i2c_adap_id_table[] = { +static const struct platform_device_id cht_wc_i2c_adap_id_table[] = { { .name = "cht_wcove_ext_chgr" }, {}, }; From 4cae8ff136782d77b108cb3a5ba53e60597ba3a6 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Tue, 5 Dec 2017 22:30:01 +0200 Subject: [PATCH 073/269] IB/core: Bound check alternate path port number The alternate port number is used as an array index in the IB security implementation, invalid values can result in a kernel panic. Cc: # v4.12 Fixes: d291f1a65232 ("IB/core: Enforce PKey security on QPs") Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 16d55710b116..d0202bb176a4 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1971,6 +1971,12 @@ static int modify_qp(struct ib_uverbs_file *file, goto release_qp; } + if ((cmd->base.attr_mask & IB_QP_ALT_PATH) && + !rdma_is_port_valid(qp->device, cmd->base.alt_port_num)) { + ret = -EINVAL; + goto release_qp; + } + attr->qp_state = cmd->base.qp_state; attr->cur_qp_state = cmd->base.cur_qp_state; attr->path_mtu = cmd->base.path_mtu; From 0fbe8f575b15585eec3326e43708fbbc024e8486 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Tue, 5 Dec 2017 22:30:02 +0200 Subject: [PATCH 074/269] IB/core: Don't enforce PKey security on SMI MADs Per the infiniband spec an SMI MAD can have any PKey. Checking the pkey on SMI MADs is not necessary, and it seems that some older adapters using the mthca driver don't follow the convention of using the default PKey, resulting in false denials, or errors querying the PKey cache. SMI MAD security is still enforced, only agents allowed to manage the subnet are able to receive or send SMI MADs. Reported-by: Chris Blake Cc: # v4.12 Fixes: 47a2b338fe63 ("IB/core: Enforce security on management datagrams") Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/security.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index a337386652b0..feafdb961c48 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -739,8 +739,11 @@ int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index) if (!rdma_protocol_ib(map->agent.device, map->agent.port_num)) return 0; - if (map->agent.qp->qp_type == IB_QPT_SMI && !map->agent.smp_allowed) - return -EACCES; + if (map->agent.qp->qp_type == IB_QPT_SMI) { + if (!map->agent.smp_allowed) + return -EACCES; + return 0; + } return ib_security_pkey_access(map->agent.device, map->agent.port_num, From 4d02ebd9bbbdde1d524e62b540b0402cee7bbcdf Mon Sep 17 00:00:00 2001 From: Guy Levi Date: Tue, 5 Dec 2017 22:30:03 +0200 Subject: [PATCH 075/269] IB/mlx4: Fix RSS hash fields restrictions Mistakenly the driver didn't allow RSS hash fields combinations which involve both IPv4 and IPv6 protocols. This bug caused to failures for user's use cases for RSS. Consequently, this patch fixes this bug and allows any combination that the HW can support. Additionally, the patch fixes the driver to return an error in case the user provides an unsupported mask for RSS hash fields. Fixes: 3078f5f1bd8b ("IB/mlx4: Add support for RSS QP") Signed-off-by: Guy Levi Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/qp.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 013049bcdb53..caf490ab24c8 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -666,6 +666,19 @@ static int set_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_rss *rss_ctx, return (-EOPNOTSUPP); } + if (ucmd->rx_hash_fields_mask & ~(MLX4_IB_RX_HASH_SRC_IPV4 | + MLX4_IB_RX_HASH_DST_IPV4 | + MLX4_IB_RX_HASH_SRC_IPV6 | + MLX4_IB_RX_HASH_DST_IPV6 | + MLX4_IB_RX_HASH_SRC_PORT_TCP | + MLX4_IB_RX_HASH_DST_PORT_TCP | + MLX4_IB_RX_HASH_SRC_PORT_UDP | + MLX4_IB_RX_HASH_DST_PORT_UDP)) { + pr_debug("RX Hash fields_mask has unsupported mask (0x%llx)\n", + ucmd->rx_hash_fields_mask); + return (-EOPNOTSUPP); + } + if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV4) && (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV4)) { rss_ctx->flags = MLX4_RSS_IPV4; @@ -691,11 +704,11 @@ static int set_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_rss *rss_ctx, return (-EOPNOTSUPP); } - if (rss_ctx->flags & MLX4_RSS_IPV4) { + if (rss_ctx->flags & MLX4_RSS_IPV4) rss_ctx->flags |= MLX4_RSS_UDP_IPV4; - } else if (rss_ctx->flags & MLX4_RSS_IPV6) { + if (rss_ctx->flags & MLX4_RSS_IPV6) rss_ctx->flags |= MLX4_RSS_UDP_IPV6; - } else { + if (!(rss_ctx->flags & (MLX4_RSS_IPV6 | MLX4_RSS_IPV4))) { pr_debug("RX Hash fields_mask is not supported - UDP must be set with IPv4 or IPv6\n"); return (-EOPNOTSUPP); } @@ -707,15 +720,14 @@ static int set_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_rss *rss_ctx, if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_TCP) && (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_TCP)) { - if (rss_ctx->flags & MLX4_RSS_IPV4) { + if (rss_ctx->flags & MLX4_RSS_IPV4) rss_ctx->flags |= MLX4_RSS_TCP_IPV4; - } else if (rss_ctx->flags & MLX4_RSS_IPV6) { + if (rss_ctx->flags & MLX4_RSS_IPV6) rss_ctx->flags |= MLX4_RSS_TCP_IPV6; - } else { + if (!(rss_ctx->flags & (MLX4_RSS_IPV6 | MLX4_RSS_IPV4))) { pr_debug("RX Hash fields_mask is not supported - TCP must be set with IPv4 or IPv6\n"); return (-EOPNOTSUPP); } - } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_TCP) || (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_TCP)) { pr_debug("RX Hash fields_mask is not supported - both TCP SRC and DST must be set\n"); From d0e312fe3d34c1bc014a7f8ec6540d05e8077483 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 5 Dec 2017 22:30:04 +0200 Subject: [PATCH 076/269] RDMA/netlink: Fix general protection fault The RDMA netlink core code checks validity of messages by ensuring that type and operand are in range. It works well for almost all clients except NLDEV, which has cb_table less than number of operands. Request to access such operand will trigger the following kernel panic. This patch updates all places where cb_table is declared for the consistency, but only NLDEV is actually need it. general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN Modules linked in: CPU: 0 PID: 522 Comm: syz-executor6 Not tainted 4.13.0+ #4 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 task: ffff8800657799c0 task.stack: ffff8800695d000 RIP: 0010:rdma_nl_rcv_msg+0x13a/0x4c0 RSP: 0018:ffff8800695d7838 EFLAGS: 00010207 RAX: dffffc0000000000 RBX: 1ffff1000d2baf0b RCX: 00000000704ff4d7 RDX: 0000000000000000 RSI: ffffffff81ddb03c RDI: 00000003827fa6bc RBP: ffff8800695d7900 R08: ffffffff82ec0578 R09: 0000000000000000 R10: ffff8800695d7900 R11: 0000000000000001 R12: 000000000000001c R13: ffff880069d31e00 R14: 00000000ffffffff R15: ffff880069d357c0 FS: 00007fee6acb8700(0000) GS:ffff88006ca00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000201a9000 CR3: 0000000059766000 CR4: 00000000000006b0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? rdma_nl_multicast+0x80/0x80 rdma_nl_rcv+0x36b/0x4d0 ? ibnl_put_attr+0xc0/0xc0 netlink_unicast+0x4bd/0x6d0 ? netlink_sendskb+0x50/0x50 ? drop_futex_key_refs.isra.4+0x68/0xb0 netlink_sendmsg+0x9ab/0xbd0 ? nlmsg_notify+0x140/0x140 ? wake_up_q+0xa1/0xf0 ? drop_futex_key_refs.isra.4+0x68/0xb0 sock_sendmsg+0x88/0xd0 sock_write_iter+0x228/0x3c0 ? sock_sendmsg+0xd0/0xd0 ? do_futex+0x3e5/0xb20 ? iov_iter_init+0xaf/0x1d0 __vfs_write+0x46e/0x640 ? sched_clock_cpu+0x1b/0x190 ? __vfs_read+0x620/0x620 ? __fget+0x23a/0x390 ? rw_verify_area+0xca/0x290 vfs_write+0x192/0x490 SyS_write+0xde/0x1c0 ? SyS_read+0x1c0/0x1c0 ? trace_hardirqs_on_thunk+0x1a/0x1c entry_SYSCALL_64_fastpath+0x18/0xad RIP: 0033:0x7fee6a74a219 RSP: 002b:00007fee6acb7d58 EFLAGS: 00000212 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000638000 RCX: 00007fee6a74a219 RDX: 0000000000000078 RSI: 0000000020141000 RDI: 0000000000000006 RBP: 0000000000000046 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000212 R12: ffff8800695d7f98 R13: 0000000020141000 R14: 0000000000000006 R15: 00000000ffffffff Code: d6 48 b8 00 00 00 00 00 fc ff df 66 41 81 e4 ff 03 44 8d 72 ff 4a 8d 3c b5 c0 a6 7f 82 44 89 b5 4c ff ff ff 48 89 f9 48 c1 e9 03 <0f> b6 0c 01 48 89 f8 83 e0 07 83 c0 03 38 c8 7c 08 84 c9 0f 85 RIP: rdma_nl_rcv_msg+0x13a/0x4c0 RSP: ffff8800695d7838 ---[ end trace ba085d123959c8ec ]--- Kernel panic - not syncing: Fatal exception Cc: syzkaller Fixes: b4c598a67ea1 ("RDMA/netlink: Implement nldev device dumpit calback") Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/cma.c | 2 +- drivers/infiniband/core/device.c | 2 +- drivers/infiniband/core/iwcm.c | 2 +- drivers/infiniband/core/nldev.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index f6983357145d..6294a7001d33 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4458,7 +4458,7 @@ out: return skb->len; } -static const struct rdma_nl_cbs cma_cb_table[] = { +static const struct rdma_nl_cbs cma_cb_table[RDMA_NL_RDMA_CM_NUM_OPS] = { [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats}, }; diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 5e1be4949d5f..30914f3baa5f 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1146,7 +1146,7 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, } EXPORT_SYMBOL(ib_get_net_dev_by_params); -static const struct rdma_nl_cbs ibnl_ls_cb_table[] = { +static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { [RDMA_NL_LS_OP_RESOLVE] = { .doit = ib_nl_handle_resolve_resp, .flags = RDMA_NL_ADMIN_PERM, diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index e9e189ec7502..5d676cff41f4 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -80,7 +80,7 @@ const char *__attribute_const__ iwcm_reject_msg(int reason) } EXPORT_SYMBOL(iwcm_reject_msg); -static struct rdma_nl_cbs iwcm_nl_cb_table[] = { +static struct rdma_nl_cbs iwcm_nl_cb_table[RDMA_NL_IWPM_NUM_OPS] = { [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 2fae850a3eff..9a05245a1acf 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -303,7 +303,7 @@ out: cb->args[0] = idx; return skb->len; } -static const struct rdma_nl_cbs nldev_cb_table[] = { +static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, .dump = nldev_get_dumpit, From 335ebf6fa35ca1c59b73f76fad19b249d3550e86 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 30 Nov 2017 09:41:56 -0800 Subject: [PATCH 077/269] iw_cxgb4: only clear the ARMED bit if a notification is needed In __flush_qp(), the CQ ARMED bit was being cleared regardless of whether any notification is actually needed. This resulted in the iser termination logic getting stuck in ib_drain_sq() because the CQ was not marked ARMED and thus the drain CQE notification wasn't triggered. This new bug was exposed when this commit was merged: commit cbb40fadd31c ("iw_cxgb4: only call the cq comp_handler when the cq is armed") Signed-off-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/qp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 5ee7fe433136..355e288ec969 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -1285,21 +1285,21 @@ static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp, spin_unlock_irqrestore(&rchp->lock, flag); if (schp == rchp) { - if (t4_clear_cq_armed(&rchp->cq) && - (rq_flushed || sq_flushed)) { + if ((rq_flushed || sq_flushed) && + t4_clear_cq_armed(&rchp->cq)) { spin_lock_irqsave(&rchp->comp_handler_lock, flag); (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); } } else { - if (t4_clear_cq_armed(&rchp->cq) && rq_flushed) { + if (rq_flushed && t4_clear_cq_armed(&rchp->cq)) { spin_lock_irqsave(&rchp->comp_handler_lock, flag); (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); } - if (t4_clear_cq_armed(&schp->cq) && sq_flushed) { + if (sq_flushed && t4_clear_cq_armed(&schp->cq)) { spin_lock_irqsave(&schp->comp_handler_lock, flag); (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); From 96307a0a75d8f1847debefd6a402339aac43e224 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 7 Dec 2017 14:26:09 +0100 Subject: [PATCH 078/269] netfilter: ipt_CLUSTERIP: fix clusterip_net_exit build regression The added check produces a build error when CONFIG_PROC_FS is disabled: net/ipv4/netfilter/ipt_CLUSTERIP.c: In function 'clusterip_net_exit': net/ipv4/netfilter/ipt_CLUSTERIP.c:822:28: error: 'cn' undeclared (first use in this function) This moves the variable declaration out of the #ifdef to make it available to the WARN_ON_ONCE(). Fixes: 613d0776d3fe ("netfilter: exit_net cleanup check added") Signed-off-by: Arnd Bergmann Reviewed-by: Vasily Averin Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index e35b8d074f06..69060e3abe85 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -813,8 +813,8 @@ static int clusterip_net_init(struct net *net) static void clusterip_net_exit(struct net *net) { -#ifdef CONFIG_PROC_FS struct clusterip_net *cn = net_generic(net, clusterip_net_id); +#ifdef CONFIG_PROC_FS proc_remove(cn->procdir); cn->procdir = NULL; #endif From 7e70aa789d4a0c89dbfbd2c8a974a4df717475ec Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 5 Dec 2017 15:52:56 +0800 Subject: [PATCH 079/269] scsi: core: run queue if SCSI device queue isn't ready and queue is idle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before commit 0df21c86bdbf ("scsi: implement .get_budget and .put_budget for blk-mq"), we run queue after 3ms if queue is idle and SCSI device queue isn't ready, which is done in handling BLK_STS_RESOURCE. After commit 0df21c86bdbf is introduced, queue won't be run any more under this situation. IO hang is observed when timeout happened, and this patch fixes the IO hang issue by running queue after delay in scsi_dev_queue_ready, just like non-mq. This issue can be triggered by the following script[1]. There is another issue which can be covered by running idle queue: when .get_budget() is called on request coming from hctx->dispatch_list, if one request just completes during .get_budget(), we can't depend on SCSI's restart to make progress any more. This patch fixes the race too. With this patch, we basically recover to previous behaviour (before commit 0df21c86bdbf) of handling idle queue when running out of resource. [1] script for test/verify SCSI timeout rmmod scsi_debug modprobe scsi_debug max_queue=1 DEVICE=`ls -d /sys/bus/pseudo/drivers/scsi_debug/adapter*/host*/target*/*/block/* | head -1 | xargs basename` DISK_DIR=`ls -d /sys/block/$DEVICE/device/scsi_disk/*` echo "using scsi device $DEVICE" echo "-1" >/sys/bus/pseudo/drivers/scsi_debug/every_nth echo "temporary write through" >$DISK_DIR/cache_type echo "128" >/sys/bus/pseudo/drivers/scsi_debug/opts echo none > /sys/block/$DEVICE/queue/scheduler dd if=/dev/$DEVICE of=/dev/null bs=1M iflag=direct count=1 & sleep 5 echo "0" >/sys/bus/pseudo/drivers/scsi_debug/opts wait echo "SUCCESS" Fixes: 0df21c86bdbf ("scsi: implement .get_budget and .put_budget for blk-mq") Signed-off-by: Ming Lei Tested-by: Holger Hoffstätte Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 00742c50cd44..d9ca1dfab154 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1967,6 +1967,8 @@ static bool scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) out_put_device: put_device(&sdev->sdev_gendev); out: + if (atomic_read(&sdev->device_busy) == 0 && !scsi_device_blocked(sdev)) + blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY); return false; } From 48d83282db077f93b2cf40de120f4d6f29eb293b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 6 Dec 2017 15:14:18 +0100 Subject: [PATCH 080/269] scsi: bfa: fix type conversion warning A regression fix introduced a harmless type mismatch warning: drivers/scsi/bfa/bfad_bsg.c: In function 'bfad_im_bsg_vendor_request': drivers/scsi/bfa/bfad_bsg.c:3137:35: error: initialization of 'struct bfad_im_port_s *' from 'long unsigned int' makes pointer from integer without a cast [-Werror=int-conversion] struct bfad_im_port_s *im_port = shost->hostdata[0]; ^~~~~ drivers/scsi/bfa/bfad_bsg.c: In function 'bfad_im_bsg_els_ct_request': drivers/scsi/bfa/bfad_bsg.c:3353:35: error: initialization of 'struct bfad_im_port_s *' from 'long unsigned int' makes pointer from integer without a cast [-Werror=int-conversion] struct bfad_im_port_s *im_port = shost->hostdata[0]; This changes the code back to shost_priv() once more, but encapsulates it in an inline function to document the rather unusual way of using the private data only as a pointer to the previously allocated structure. I did not try to get rid of the extra indirection level entirely, which would have been rather invasive and required reworking the entire initialization sequence. Fixes: 45349821ab3a ("scsi: bfa: fix access to bfad_im_port_s") Signed-off-by: Arnd Bergmann Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/bfa/bfad_bsg.c | 4 ++-- drivers/scsi/bfa/bfad_im.c | 6 ++++-- drivers/scsi/bfa/bfad_im.h | 10 ++++++++++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/bfa/bfad_bsg.c b/drivers/scsi/bfa/bfad_bsg.c index 09ef68c8225f..b2fa195adc7a 100644 --- a/drivers/scsi/bfa/bfad_bsg.c +++ b/drivers/scsi/bfa/bfad_bsg.c @@ -3136,7 +3136,7 @@ bfad_im_bsg_vendor_request(struct bsg_job *job) struct fc_bsg_reply *bsg_reply = job->reply; uint32_t vendor_cmd = bsg_request->rqst_data.h_vendor.vendor_cmd[0]; struct Scsi_Host *shost = fc_bsg_to_shost(job); - struct bfad_im_port_s *im_port = shost->hostdata[0]; + struct bfad_im_port_s *im_port = bfad_get_im_port(shost); struct bfad_s *bfad = im_port->bfad; void *payload_kbuf; int rc = -EINVAL; @@ -3352,7 +3352,7 @@ bfad_im_bsg_els_ct_request(struct bsg_job *job) { struct bfa_bsg_data *bsg_data; struct Scsi_Host *shost = fc_bsg_to_shost(job); - struct bfad_im_port_s *im_port = shost->hostdata[0]; + struct bfad_im_port_s *im_port = bfad_get_im_port(shost); struct bfad_s *bfad = im_port->bfad; bfa_bsg_fcpt_t *bsg_fcpt; struct bfad_fcxp *drv_fcxp; diff --git a/drivers/scsi/bfa/bfad_im.c b/drivers/scsi/bfa/bfad_im.c index 24e657a4ec80..c05d6e91e4bd 100644 --- a/drivers/scsi/bfa/bfad_im.c +++ b/drivers/scsi/bfa/bfad_im.c @@ -546,6 +546,7 @@ int bfad_im_scsi_host_alloc(struct bfad_s *bfad, struct bfad_im_port_s *im_port, struct device *dev) { + struct bfad_im_port_pointer *im_portp; int error = 1; mutex_lock(&bfad_mutex); @@ -564,7 +565,8 @@ bfad_im_scsi_host_alloc(struct bfad_s *bfad, struct bfad_im_port_s *im_port, goto out_free_idr; } - im_port->shost->hostdata[0] = (unsigned long)im_port; + im_portp = shost_priv(im_port->shost); + im_portp->p = im_port; im_port->shost->unique_id = im_port->idr_id; im_port->shost->this_id = -1; im_port->shost->max_id = MAX_FCP_TARGET; @@ -748,7 +750,7 @@ bfad_scsi_host_alloc(struct bfad_im_port_s *im_port, struct bfad_s *bfad) sht->sg_tablesize = bfad->cfg_data.io_max_sge; - return scsi_host_alloc(sht, sizeof(unsigned long)); + return scsi_host_alloc(sht, sizeof(struct bfad_im_port_pointer)); } void diff --git a/drivers/scsi/bfa/bfad_im.h b/drivers/scsi/bfa/bfad_im.h index c81ec2a77ef5..06ce4ba2b7bc 100644 --- a/drivers/scsi/bfa/bfad_im.h +++ b/drivers/scsi/bfa/bfad_im.h @@ -69,6 +69,16 @@ struct bfad_im_port_s { struct fc_vport *fc_vport; }; +struct bfad_im_port_pointer { + struct bfad_im_port_s *p; +}; + +static inline struct bfad_im_port_s *bfad_get_im_port(struct Scsi_Host *host) +{ + struct bfad_im_port_pointer *im_portp = shost_priv(host); + return im_portp->p; +} + enum bfad_itnim_state { ITNIM_STATE_NONE, ITNIM_STATE_ONLINE, From 040d786032bf59002d374b86d75b04d97624005c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 30 Nov 2017 11:59:22 +0800 Subject: [PATCH 081/269] ceph: drop negative child dentries before try pruning inode's alias Negative child dentry holds reference on inode's alias, it makes d_prune_aliases() do nothing. Cc: stable@vger.kernel.org Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index ab69dcb70e8a..1b468250e947 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1440,6 +1440,29 @@ static int __close_session(struct ceph_mds_client *mdsc, return request_close_session(mdsc, session); } +static bool drop_negative_children(struct dentry *dentry) +{ + struct dentry *child; + bool all_negative = true; + + if (!d_is_dir(dentry)) + goto out; + + spin_lock(&dentry->d_lock); + list_for_each_entry(child, &dentry->d_subdirs, d_child) { + if (d_really_is_positive(child)) { + all_negative = false; + break; + } + } + spin_unlock(&dentry->d_lock); + + if (all_negative) + shrink_dcache_parent(dentry); +out: + return all_negative; +} + /* * Trim old(er) caps. * @@ -1490,16 +1513,27 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) if ((used | wanted) & ~oissued & mine) goto out; /* we need these caps */ - session->s_trim_caps--; if (oissued) { /* we aren't the only cap.. just remove us */ __ceph_remove_cap(cap, true); + session->s_trim_caps--; } else { + struct dentry *dentry; /* try dropping referring dentries */ spin_unlock(&ci->i_ceph_lock); - d_prune_aliases(inode); - dout("trim_caps_cb %p cap %p pruned, count now %d\n", - inode, cap, atomic_read(&inode->i_count)); + dentry = d_find_any_alias(inode); + if (dentry && drop_negative_children(dentry)) { + int count; + dput(dentry); + d_prune_aliases(inode); + count = atomic_read(&inode->i_count); + if (count == 1) + session->s_trim_caps--; + dout("trim_caps_cb %p cap %p pruned, count now %d\n", + inode, cap, count); + } else { + dput(dentry); + } return 0; } From 5790eabc6e7c3ce2d6ca2e3bbf4de467ce2b64b3 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 8 Dec 2017 17:31:37 +0200 Subject: [PATCH 082/269] ptr_ring: fix up after recent ptr_ring changes Add more stubs to make it build. Fixes: 81fbfe8a ("ptr_ring: use kmalloc_array()") Signed-off-by: Michael S. Tsirkin --- tools/virtio/ringtest/ptr_ring.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/tools/virtio/ringtest/ptr_ring.c b/tools/virtio/ringtest/ptr_ring.c index 38bb171aceba..e6e81305ef46 100644 --- a/tools/virtio/ringtest/ptr_ring.c +++ b/tools/virtio/ringtest/ptr_ring.c @@ -16,24 +16,41 @@ #define unlikely(x) (__builtin_expect(!!(x), 0)) #define likely(x) (__builtin_expect(!!(x), 1)) #define ALIGN(x, a) (((x) + (a) - 1) / (a) * (a)) +#define SIZE_MAX (~(size_t)0) + typedef pthread_spinlock_t spinlock_t; typedef int gfp_t; -static void *kmalloc(unsigned size, gfp_t gfp) -{ - return memalign(64, size); -} +#define __GFP_ZERO 0x1 -static void *kzalloc(unsigned size, gfp_t gfp) +static void *kmalloc(unsigned size, gfp_t gfp) { void *p = memalign(64, size); if (!p) return p; - memset(p, 0, size); + if (gfp & __GFP_ZERO) + memset(p, 0, size); return p; } +static inline void *kzalloc(unsigned size, gfp_t flags) +{ + return kmalloc(size, flags | __GFP_ZERO); +} + +static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags) +{ + if (size != 0 && n > SIZE_MAX / size) + return NULL; + return kmalloc(n * size, flags); +} + +static inline void *kcalloc(size_t n, size_t size, gfp_t flags) +{ + return kmalloc_array(n, size, flags | __GFP_ZERO); +} + static void kfree(void *p) { if (p) From c1fd0abee0d52eb7e2871194b6c79d54792f515f Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 7 Dec 2017 22:42:27 -0500 Subject: [PATCH 083/269] dm mpath: fix bio-based multipath queue_if_no_path handling Commit ca5beb76 ("dm mpath: micro-optimize the hot path relative to MPATHF_QUEUE_IF_NO_PATH") caused bio-based DM-multipath to fail mptest's "test_02_sdev_delete". Restoring the logic that existed prior to commit ca5beb76 fixes this bio-based DM-multipath regression. Also verified all mptest tests pass with request-based DM-multipath. This commit effectively reverts commit ca5beb76 -- but it does so without reintroducing the need to take the m->lock spinlock in must_push_back_{rq,bio}. Fixes: ca5beb76 ("dm mpath: micro-optimize the hot path relative to MPATHF_QUEUE_IF_NO_PATH") Cc: stable@vger.kernel.org # 4.12+ Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 49 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 35a2a2fa477f..f7810cc869ac 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -457,6 +457,38 @@ do { \ dm_noflush_suspending((m)->ti)); \ } while (0) +/* + * Check whether bios must be queued in the device-mapper core rather + * than here in the target. + * + * If MPATHF_QUEUE_IF_NO_PATH and MPATHF_SAVED_QUEUE_IF_NO_PATH hold + * the same value then we are not between multipath_presuspend() + * and multipath_resume() calls and we have no need to check + * for the DMF_NOFLUSH_SUSPENDING flag. + */ +static bool __must_push_back(struct multipath *m, unsigned long flags) +{ + return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &flags) != + test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &flags)) && + dm_noflush_suspending(m->ti)); +} + +/* + * Following functions use READ_ONCE to get atomic access to + * all m->flags to avoid taking spinlock + */ +static bool must_push_back_rq(struct multipath *m) +{ + unsigned long flags = READ_ONCE(m->flags); + return test_bit(MPATHF_QUEUE_IF_NO_PATH, &flags) || __must_push_back(m, flags); +} + +static bool must_push_back_bio(struct multipath *m) +{ + unsigned long flags = READ_ONCE(m->flags); + return __must_push_back(m, flags); +} + /* * Map cloned requests (request-based multipath) */ @@ -478,7 +510,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, pgpath = choose_pgpath(m, nr_bytes); if (!pgpath) { - if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) + if (must_push_back_rq(m)) return DM_MAPIO_DELAY_REQUEUE; dm_report_EIO(m); /* Failed */ return DM_MAPIO_KILL; @@ -553,7 +585,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m } if (!pgpath) { - if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) + if (must_push_back_bio(m)) return DM_MAPIO_REQUEUE; dm_report_EIO(m); return DM_MAPIO_KILL; @@ -651,8 +683,7 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, assign_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags, (save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) || (!save_old_value && queue_if_no_path)); - assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, - queue_if_no_path || dm_noflush_suspending(m->ti)); + assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path); spin_unlock_irqrestore(&m->lock, flags); if (!queue_if_no_path) { @@ -1486,7 +1517,7 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, fail_path(pgpath); if (atomic_read(&m->nr_valid_paths) == 0 && - !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { + !must_push_back_rq(m)) { if (error == BLK_STS_IOERR) dm_report_EIO(m); /* complete with the original error */ @@ -1521,8 +1552,12 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, if (atomic_read(&m->nr_valid_paths) == 0 && !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { - dm_report_EIO(m); - *error = BLK_STS_IOERR; + if (must_push_back_bio(m)) { + r = DM_ENDIO_REQUEUE; + } else { + dm_report_EIO(m); + *error = BLK_STS_IOERR; + } goto done; } From fbc7c07ec23c040179384a1f16b62b6030eb6bdd Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 6 Dec 2017 09:27:30 -0800 Subject: [PATCH 084/269] dm bufio: fix shrinker scans when (nr_to_scan < retain_target) When system is under memory pressure it is observed that dm bufio shrinker often reclaims only one buffer per scan. This change fixes the following two issues in dm bufio shrinker that cause this behavior: 1. ((nr_to_scan - freed) <= retain_target) condition is used to terminate slab scan process. This assumes that nr_to_scan is equal to the LRU size, which might not be correct because do_shrink_slab() in vmscan.c calculates nr_to_scan using multiple inputs. As a result when nr_to_scan is less than retain_target (64) the scan will terminate after the first iteration, effectively reclaiming one buffer per scan and making scans very inefficient. This hurts vmscan performance especially because mutex is acquired/released every time dm_bufio_shrink_scan() is called. New implementation uses ((LRU size - freed) <= retain_target) condition for scan termination. LRU size can be safely determined inside __scan() because this function is called after dm_bufio_lock(). 2. do_shrink_slab() uses value returned by dm_bufio_shrink_count() to determine number of freeable objects in the slab. However dm_bufio always retains retain_target buffers in its LRU and will terminate a scan when this mark is reached. Therefore returning the entire LRU size from dm_bufio_shrink_count() is misleading because that does not represent the number of freeable objects that slab will reclaim during a scan. Returning (LRU size - retain_target) better represents the number of freeable objects in the slab. This way do_shrink_slab() returns 0 when (LRU size < retain_target) and vmscan will not try to scan this shrinker avoiding scans that will not reclaim any memory. Test: tested using Android device running /system/extras/alloc-stress that generates memory pressure and causes intensive shrinker scans Signed-off-by: Suren Baghdasaryan Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index b8ac591aaaa7..c546b567f3b5 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1611,7 +1611,8 @@ static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, int l; struct dm_buffer *b, *tmp; unsigned long freed = 0; - unsigned long count = nr_to_scan; + unsigned long count = c->n_buffers[LIST_CLEAN] + + c->n_buffers[LIST_DIRTY]; unsigned long retain_target = get_retain_buffers(c); for (l = 0; l < LIST_SIZE; l++) { @@ -1647,8 +1648,11 @@ static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker); + unsigned long count = READ_ONCE(c->n_buffers[LIST_CLEAN]) + + READ_ONCE(c->n_buffers[LIST_DIRTY]); + unsigned long retain_target = get_retain_buffers(c); - return READ_ONCE(c->n_buffers[LIST_CLEAN]) + READ_ONCE(c->n_buffers[LIST_DIRTY]); + return (count < retain_target) ? 0 : (count - retain_target); } /* From bd3486ded7a0c313a6575343e6c2b21d14476645 Mon Sep 17 00:00:00 2001 From: Bin Liu Date: Tue, 5 Dec 2017 08:45:30 -0600 Subject: [PATCH 085/269] usb: musb: da8xx: fix babble condition handling When babble condition happens, the musb controller might automatically turns off VBUS. On DA8xx platform, the controller generates drvvbus interrupt for turning off VBUS along with the babble interrupt. In this case, we should handle the babble interrupt first and recover from the babble condition. This change ignores the drvvbus interrupt if babble interrupt is also generated at the same time, so the babble recovery routine works properly. Cc: stable@vger.kernel.org # v3.16+ Signed-off-by: Bin Liu Signed-off-by: Greg Kroah-Hartman --- drivers/usb/musb/da8xx.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/usb/musb/da8xx.c b/drivers/usb/musb/da8xx.c index 0397606a211b..6c036de63272 100644 --- a/drivers/usb/musb/da8xx.c +++ b/drivers/usb/musb/da8xx.c @@ -284,7 +284,15 @@ static irqreturn_t da8xx_musb_interrupt(int irq, void *hci) musb->xceiv->otg->state = OTG_STATE_A_WAIT_VRISE; portstate(musb->port1_status |= USB_PORT_STAT_POWER); del_timer(&musb->dev_timer); - } else { + } else if (!(musb->int_usb & MUSB_INTR_BABBLE)) { + /* + * When babble condition happens, drvvbus interrupt + * is also generated. Ignore this drvvbus interrupt + * and let babble interrupt handler recovers the + * controller; otherwise, the host-mode flag is lost + * due to the MUSB_DEV_MODE() call below and babble + * recovery logic will not be called. + */ musb->is_active = 0; MUSB_DEV_MODE(musb); otg->default_a = 0; From 62354454625741f0569c2cbe45b2d192f8fd258e Mon Sep 17 00:00:00 2001 From: David Kozub Date: Tue, 5 Dec 2017 22:40:04 +0100 Subject: [PATCH 086/269] USB: uas and storage: Add US_FL_BROKEN_FUA for another JMicron JMS567 ID There is another JMS567-based USB3 UAS enclosure (152d:0578) that fails with the following error: [sda] tag#0 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE [sda] tag#0 Sense Key : Illegal Request [current] [sda] tag#0 Add. Sense: Invalid field in cdb The issue occurs both with UAS (occasionally) and mass storage (immediately after mounting a FS on a disk in the enclosure). Enabling US_FL_BROKEN_FUA quirk solves this issue. This patch adds an UNUSUAL_DEV with US_FL_BROKEN_FUA for the enclosure for both UAS and mass storage. Signed-off-by: David Kozub Acked-by: Alan Stern Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/unusual_devs.h | 7 +++++++ drivers/usb/storage/unusual_uas.h | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h index 2968046e7c05..f72d045ee9ef 100644 --- a/drivers/usb/storage/unusual_devs.h +++ b/drivers/usb/storage/unusual_devs.h @@ -2100,6 +2100,13 @@ UNUSUAL_DEV( 0x152d, 0x0567, 0x0114, 0x0116, USB_SC_DEVICE, USB_PR_DEVICE, NULL, US_FL_BROKEN_FUA ), +/* Reported by David Kozub */ +UNUSUAL_DEV(0x152d, 0x0578, 0x0000, 0x9999, + "JMicron", + "JMS567", + USB_SC_DEVICE, USB_PR_DEVICE, NULL, + US_FL_BROKEN_FUA), + /* * Reported by Alexandre Oliva * JMicron responds to USN and several other SCSI ioctls with a diff --git a/drivers/usb/storage/unusual_uas.h b/drivers/usb/storage/unusual_uas.h index d520374a824e..e6127fb21c12 100644 --- a/drivers/usb/storage/unusual_uas.h +++ b/drivers/usb/storage/unusual_uas.h @@ -129,6 +129,13 @@ UNUSUAL_DEV(0x152d, 0x0567, 0x0000, 0x9999, USB_SC_DEVICE, USB_PR_DEVICE, NULL, US_FL_BROKEN_FUA | US_FL_NO_REPORT_OPCODES), +/* Reported-by: David Kozub */ +UNUSUAL_DEV(0x152d, 0x0578, 0x0000, 0x9999, + "JMicron", + "JMS567", + USB_SC_DEVICE, USB_PR_DEVICE, NULL, + US_FL_BROKEN_FUA), + /* Reported-by: Hans de Goede */ UNUSUAL_DEV(0x2109, 0x0711, 0x0000, 0x9999, "VIA", From 82a2b827c96883d8b39a58bba23d222d6b0de7ff Mon Sep 17 00:00:00 2001 From: Julien BOIBESSOT Date: Tue, 5 Dec 2017 16:09:04 +0100 Subject: [PATCH 087/269] tools/usbip: fixes potential (minor) "buffer overflow" (detected on recent gcc with -Werror) Fixes following build error: vhci_driver.c: In function 'refresh_imported_device_list': vhci_driver.c:118:37: error: 'snprintf' output may be truncated before the last format character [-Werror=format-truncation=] snprintf(status, sizeof(status), "status.%d", i); ^~~~~~~~~~~ vhci_driver.c:118:4: note: 'snprintf' output between 9 and 18 bytes into a destination of size 17 snprintf(status, sizeof(status), "status.%d", i); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cc1: all warnings being treated as errors Signed-off-by: Julien BOIBESSOT Acked-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- tools/usb/usbip/libsrc/vhci_driver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/usb/usbip/libsrc/vhci_driver.c b/tools/usb/usbip/libsrc/vhci_driver.c index 8a1cd1616de4..627d1dfc332b 100644 --- a/tools/usb/usbip/libsrc/vhci_driver.c +++ b/tools/usb/usbip/libsrc/vhci_driver.c @@ -106,7 +106,7 @@ static int parse_status(const char *value) return 0; } -#define MAX_STATUS_NAME 16 +#define MAX_STATUS_NAME 18 static int refresh_imported_device_list(void) { From 635f545a7e8be7596b9b2b6a43cab6bbd5a88e43 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 7 Dec 2017 14:16:47 -0700 Subject: [PATCH 088/269] usbip: fix stub_rx: get_pipe() to validate endpoint number get_pipe() routine doesn't validate the input endpoint number and uses to reference ep_in and ep_out arrays. Invalid endpoint number can trigger BUG(). Range check the epnum and returning error instead of calling BUG(). Change caller stub_recv_cmd_submit() to handle the get_pipe() error return. Reported-by: Secunia Research Cc: stable Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/stub_rx.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/usb/usbip/stub_rx.c b/drivers/usb/usbip/stub_rx.c index 536e037f541f..4d61063c259d 100644 --- a/drivers/usb/usbip/stub_rx.c +++ b/drivers/usb/usbip/stub_rx.c @@ -328,15 +328,15 @@ static int get_pipe(struct stub_device *sdev, int epnum, int dir) struct usb_host_endpoint *ep; struct usb_endpoint_descriptor *epd = NULL; + if (epnum < 0 || epnum > 15) + goto err_ret; + if (dir == USBIP_DIR_IN) ep = udev->ep_in[epnum & 0x7f]; else ep = udev->ep_out[epnum & 0x7f]; - if (!ep) { - dev_err(&sdev->udev->dev, "no such endpoint?, %d\n", - epnum); - BUG(); - } + if (!ep) + goto err_ret; epd = &ep->desc; if (usb_endpoint_xfer_control(epd)) { @@ -367,9 +367,10 @@ static int get_pipe(struct stub_device *sdev, int epnum, int dir) return usb_rcvisocpipe(udev, epnum); } +err_ret: /* NOT REACHED */ - dev_err(&sdev->udev->dev, "get pipe, epnum %d\n", epnum); - return 0; + dev_err(&sdev->udev->dev, "get pipe() invalid epnum %d\n", epnum); + return -1; } static void masking_bogus_flags(struct urb *urb) @@ -435,6 +436,9 @@ static void stub_recv_cmd_submit(struct stub_device *sdev, struct usb_device *udev = sdev->udev; int pipe = get_pipe(sdev, pdu->base.ep, pdu->base.direction); + if (pipe == -1) + return; + priv = stub_priv_alloc(sdev, pdu); if (!priv) return; From c6688ef9f29762e65bce325ef4acd6c675806366 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 7 Dec 2017 14:16:48 -0700 Subject: [PATCH 089/269] usbip: fix stub_rx: harden CMD_SUBMIT path to handle malicious input Harden CMD_SUBMIT path to handle malicious input that could trigger large memory allocations. Add checks to validate transfer_buffer_length and number_of_packets to protect against bad input requesting for unbounded memory allocations. Validate early in get_pipe() and return failure. Reported-by: Secunia Research Cc: stable Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/stub_rx.c | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/drivers/usb/usbip/stub_rx.c b/drivers/usb/usbip/stub_rx.c index 4d61063c259d..493ac2928391 100644 --- a/drivers/usb/usbip/stub_rx.c +++ b/drivers/usb/usbip/stub_rx.c @@ -322,11 +322,13 @@ static struct stub_priv *stub_priv_alloc(struct stub_device *sdev, return priv; } -static int get_pipe(struct stub_device *sdev, int epnum, int dir) +static int get_pipe(struct stub_device *sdev, struct usbip_header *pdu) { struct usb_device *udev = sdev->udev; struct usb_host_endpoint *ep; struct usb_endpoint_descriptor *epd = NULL; + int epnum = pdu->base.ep; + int dir = pdu->base.direction; if (epnum < 0 || epnum > 15) goto err_ret; @@ -339,6 +341,15 @@ static int get_pipe(struct stub_device *sdev, int epnum, int dir) goto err_ret; epd = &ep->desc; + + /* validate transfer_buffer_length */ + if (pdu->u.cmd_submit.transfer_buffer_length > INT_MAX) { + dev_err(&sdev->udev->dev, + "CMD_SUBMIT: -EMSGSIZE transfer_buffer_length %d\n", + pdu->u.cmd_submit.transfer_buffer_length); + return -1; + } + if (usb_endpoint_xfer_control(epd)) { if (dir == USBIP_DIR_OUT) return usb_sndctrlpipe(udev, epnum); @@ -361,6 +372,21 @@ static int get_pipe(struct stub_device *sdev, int epnum, int dir) } if (usb_endpoint_xfer_isoc(epd)) { + /* validate packet size and number of packets */ + unsigned int maxp, packets, bytes; + + maxp = usb_endpoint_maxp(epd); + maxp *= usb_endpoint_maxp_mult(epd); + bytes = pdu->u.cmd_submit.transfer_buffer_length; + packets = DIV_ROUND_UP(bytes, maxp); + + if (pdu->u.cmd_submit.number_of_packets < 0 || + pdu->u.cmd_submit.number_of_packets > packets) { + dev_err(&sdev->udev->dev, + "CMD_SUBMIT: isoc invalid num packets %d\n", + pdu->u.cmd_submit.number_of_packets); + return -1; + } if (dir == USBIP_DIR_OUT) return usb_sndisocpipe(udev, epnum); else @@ -369,7 +395,7 @@ static int get_pipe(struct stub_device *sdev, int epnum, int dir) err_ret: /* NOT REACHED */ - dev_err(&sdev->udev->dev, "get pipe() invalid epnum %d\n", epnum); + dev_err(&sdev->udev->dev, "CMD_SUBMIT: invalid epnum %d\n", epnum); return -1; } @@ -434,7 +460,7 @@ static void stub_recv_cmd_submit(struct stub_device *sdev, struct stub_priv *priv; struct usbip_device *ud = &sdev->ud; struct usb_device *udev = sdev->udev; - int pipe = get_pipe(sdev, pdu->base.ep, pdu->base.direction); + int pipe = get_pipe(sdev, pdu); if (pipe == -1) return; @@ -456,7 +482,8 @@ static void stub_recv_cmd_submit(struct stub_device *sdev, } /* allocate urb transfer buffer, if needed */ - if (pdu->u.cmd_submit.transfer_buffer_length > 0) { + if (pdu->u.cmd_submit.transfer_buffer_length > 0 && + pdu->u.cmd_submit.transfer_buffer_length <= INT_MAX) { priv->urb->transfer_buffer = kzalloc(pdu->u.cmd_submit.transfer_buffer_length, GFP_KERNEL); From 2f2d0088eb93db5c649d2a5e34a3800a8a935fc5 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 7 Dec 2017 14:16:49 -0700 Subject: [PATCH 090/269] usbip: prevent vhci_hcd driver from leaking a socket pointer address When a client has a USB device attached over IP, the vhci_hcd driver is locally leaking a socket pointer address via the /sys/devices/platform/vhci_hcd/status file (world-readable) and in debug output when "usbip --debug port" is run. Fix it to not leak. The socket pointer address is not used at the moment and it was made visible as a convenient way to find IP address from socket pointer address by looking up /proc/net/{tcp,tcp6}. As this opens a security hole, the fix replaces socket pointer address with sockfd. Reported-by: Secunia Research Cc: stable Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/usbip_common.h | 1 + drivers/usb/usbip/vhci_sysfs.c | 25 ++++++++++++++++--------- tools/usb/usbip/libsrc/vhci_driver.c | 8 ++++---- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/drivers/usb/usbip/usbip_common.h b/drivers/usb/usbip/usbip_common.h index e5de35c8c505..473fb8a87289 100644 --- a/drivers/usb/usbip/usbip_common.h +++ b/drivers/usb/usbip/usbip_common.h @@ -256,6 +256,7 @@ struct usbip_device { /* lock for status */ spinlock_t lock; + int sockfd; struct socket *tcp_socket; struct task_struct *tcp_rx; diff --git a/drivers/usb/usbip/vhci_sysfs.c b/drivers/usb/usbip/vhci_sysfs.c index e78f7472cac4..091f76b7196d 100644 --- a/drivers/usb/usbip/vhci_sysfs.c +++ b/drivers/usb/usbip/vhci_sysfs.c @@ -17,15 +17,20 @@ /* * output example: - * hub port sta spd dev socket local_busid - * hs 0000 004 000 00000000 c5a7bb80 1-2.3 + * hub port sta spd dev sockfd local_busid + * hs 0000 004 000 00000000 3 1-2.3 * ................................................ - * ss 0008 004 000 00000000 d8cee980 2-3.4 + * ss 0008 004 000 00000000 4 2-3.4 * ................................................ * - * IP address can be retrieved from a socket pointer address by looking - * up /proc/net/{tcp,tcp6}. Also, a userland program may remember a - * port number and its peer IP address. + * Output includes socket fd instead of socket pointer address to avoid + * leaking kernel memory address in: + * /sys/devices/platform/vhci_hcd.0/status and in debug output. + * The socket pointer address is not used at the moment and it was made + * visible as a convenient way to find IP address from socket pointer + * address by looking up /proc/net/{tcp,tcp6}. As this opens a security + * hole, the change is made to use sockfd instead. + * */ static void port_show_vhci(char **out, int hub, int port, struct vhci_device *vdev) { @@ -39,8 +44,8 @@ static void port_show_vhci(char **out, int hub, int port, struct vhci_device *vd if (vdev->ud.status == VDEV_ST_USED) { *out += sprintf(*out, "%03u %08x ", vdev->speed, vdev->devid); - *out += sprintf(*out, "%16p %s", - vdev->ud.tcp_socket, + *out += sprintf(*out, "%u %s", + vdev->ud.sockfd, dev_name(&vdev->udev->dev)); } else { @@ -160,7 +165,8 @@ static ssize_t nports_show(struct device *dev, struct device_attribute *attr, char *s = out; /* - * Half the ports are for SPEED_HIGH and half for SPEED_SUPER, thus the * 2. + * Half the ports are for SPEED_HIGH and half for SPEED_SUPER, + * thus the * 2. */ out += sprintf(out, "%d\n", VHCI_PORTS * vhci_num_controllers); return out - s; @@ -366,6 +372,7 @@ static ssize_t store_attach(struct device *dev, struct device_attribute *attr, vdev->devid = devid; vdev->speed = speed; + vdev->ud.sockfd = sockfd; vdev->ud.tcp_socket = socket; vdev->ud.status = VDEV_ST_NOTASSIGNED; diff --git a/tools/usb/usbip/libsrc/vhci_driver.c b/tools/usb/usbip/libsrc/vhci_driver.c index 627d1dfc332b..c9c81614a66a 100644 --- a/tools/usb/usbip/libsrc/vhci_driver.c +++ b/tools/usb/usbip/libsrc/vhci_driver.c @@ -50,14 +50,14 @@ static int parse_status(const char *value) while (*c != '\0') { int port, status, speed, devid; - unsigned long socket; + int sockfd; char lbusid[SYSFS_BUS_ID_SIZE]; struct usbip_imported_device *idev; char hub[3]; - ret = sscanf(c, "%2s %d %d %d %x %lx %31s\n", + ret = sscanf(c, "%2s %d %d %d %x %u %31s\n", hub, &port, &status, &speed, - &devid, &socket, lbusid); + &devid, &sockfd, lbusid); if (ret < 5) { dbg("sscanf failed: %d", ret); @@ -66,7 +66,7 @@ static int parse_status(const char *value) dbg("hub %s port %d status %d speed %d devid %x", hub, port, status, speed, devid); - dbg("socket %lx lbusid %s", socket, lbusid); + dbg("sockfd %u lbusid %s", sockfd, lbusid); /* if a device is connected, look at it */ idev = &vhci_driver->idev[port]; From be6123df1ea8f01ee2f896a16c2b7be3e4557a5a Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 7 Dec 2017 14:16:50 -0700 Subject: [PATCH 091/269] usbip: fix stub_send_ret_submit() vulnerability to null transfer_buffer stub_send_ret_submit() handles urb with a potential null transfer_buffer, when it replays a packet with potential malicious data that could contain a null buffer. Add a check for the condition when actual_length > 0 and transfer_buffer is null. Reported-by: Secunia Research Cc: stable Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/stub_tx.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/usb/usbip/stub_tx.c b/drivers/usb/usbip/stub_tx.c index b18bce96c212..53172b1f6257 100644 --- a/drivers/usb/usbip/stub_tx.c +++ b/drivers/usb/usbip/stub_tx.c @@ -167,6 +167,13 @@ static int stub_send_ret_submit(struct stub_device *sdev) memset(&pdu_header, 0, sizeof(pdu_header)); memset(&msg, 0, sizeof(msg)); + if (urb->actual_length > 0 && !urb->transfer_buffer) { + dev_err(&sdev->udev->dev, + "urb: actual_length %d transfer_buffer null\n", + urb->actual_length); + return -1; + } + if (usb_pipetype(urb->pipe) == PIPE_ISOCHRONOUS) iovnum = 2 + urb->number_of_packets; else From 5d9b70f7d52eb14bb37861c663bae44de9521c35 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Fri, 8 Dec 2017 18:10:05 +0200 Subject: [PATCH 092/269] xhci: Don't add a virt_dev to the devs array before it's fully allocated Avoid null pointer dereference if some function is walking through the devs array accessing members of a new virt_dev that is mid allocation. Add the virt_dev to xhci->devs[i] _after_ the virt_device and all its members are properly allocated. issue found by KASAN: null-ptr-deref in xhci_find_slot_id_by_port "Quick analysis suggests that xhci_alloc_virt_device() is not mutex protected. If so, there is a time frame where xhci->devs[slot_id] is set but not fully initialized. Specifically, xhci->devs[i]->udev can be NULL." Cc: stable Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-mem.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c index 15f7d422885f..3a29b32a3bd0 100644 --- a/drivers/usb/host/xhci-mem.c +++ b/drivers/usb/host/xhci-mem.c @@ -971,10 +971,9 @@ int xhci_alloc_virt_device(struct xhci_hcd *xhci, int slot_id, return 0; } - xhci->devs[slot_id] = kzalloc(sizeof(*xhci->devs[slot_id]), flags); - if (!xhci->devs[slot_id]) + dev = kzalloc(sizeof(*dev), flags); + if (!dev) return 0; - dev = xhci->devs[slot_id]; /* Allocate the (output) device context that will be used in the HC. */ dev->out_ctx = xhci_alloc_container_ctx(xhci, XHCI_CTX_TYPE_DEVICE, flags); @@ -1015,9 +1014,17 @@ int xhci_alloc_virt_device(struct xhci_hcd *xhci, int slot_id, trace_xhci_alloc_virt_device(dev); + xhci->devs[slot_id] = dev; + return 1; fail: - xhci_free_virt_device(xhci, slot_id); + + if (dev->in_ctx) + xhci_free_container_ctx(xhci, dev->in_ctx); + if (dev->out_ctx) + xhci_free_container_ctx(xhci, dev->out_ctx); + kfree(dev); + return 0; } From 72b663a99c074a8d073e7ecdae446cfb024ef551 Mon Sep 17 00:00:00 2001 From: Chunfeng Yun Date: Fri, 8 Dec 2017 18:10:06 +0200 Subject: [PATCH 093/269] usb: xhci: fix TDS for MTK xHCI1.1 For MTK's xHCI 1.0 or latter, TD size is the number of max packet sized packets remaining in the TD, not including this TRB (following spec). For MTK's xHCI 0.96 and older, TD size is the number of max packet sized packets remaining in the TD, including this TRB (not following spec). Cc: stable Signed-off-by: Chunfeng Yun Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-ring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 6eb87c6e4d24..c5cbc685c691 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -3112,7 +3112,7 @@ static u32 xhci_td_remainder(struct xhci_hcd *xhci, int transferred, { u32 maxp, total_packet_count; - /* MTK xHCI is mostly 0.97 but contains some features from 1.0 */ + /* MTK xHCI 0.96 contains some features from 1.0 */ if (xhci->hci_version < 0x100 && !(xhci->quirks & XHCI_MTK_HOST)) return ((td_total_len - transferred) >> 10); @@ -3121,8 +3121,8 @@ static u32 xhci_td_remainder(struct xhci_hcd *xhci, int transferred, trb_buff_len == td_total_len) return 0; - /* for MTK xHCI, TD size doesn't include this TRB */ - if (xhci->quirks & XHCI_MTK_HOST) + /* for MTK xHCI 0.96, TD size include this TRB, but not in 1.x */ + if ((xhci->quirks & XHCI_MTK_HOST) && (xhci->hci_version < 0x100)) trb_buff_len = 0; maxp = usb_endpoint_maxp(&urb->ep->desc); From babc8110057cb9ca542c3c1666cbda4e8ccf9250 Mon Sep 17 00:00:00 2001 From: Stefan Schake Date: Sat, 2 Dec 2017 18:40:39 +0100 Subject: [PATCH 094/269] drm/vc4: Release fence after signalling We were never releasing the initial fence reference that is obtained through dma_fence_init. Link: https://github.com/anholt/linux/issues/122 Fixes: cdec4d361323 ("drm/vc4: Expose dma-buf fences for V3D rendering.") Signed-off-by: Stefan Schake Signed-off-by: Eric Anholt Reviewed-by: Eric Anholt Link: https://patchwork.freedesktop.org/patch/msgid/1512236444-301-1-git-send-email-stschake@gmail.com --- drivers/gpu/drm/vc4/vc4_gem.c | 4 +++- drivers/gpu/drm/vc4/vc4_irq.c | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c index 6c32c89a83a9..638540943c61 100644 --- a/drivers/gpu/drm/vc4/vc4_gem.c +++ b/drivers/gpu/drm/vc4/vc4_gem.c @@ -888,8 +888,10 @@ vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec) /* If we got force-completed because of GPU reset rather than * through our IRQ handler, signal the fence now. */ - if (exec->fence) + if (exec->fence) { dma_fence_signal(exec->fence); + dma_fence_put(exec->fence); + } if (exec->bo) { for (i = 0; i < exec->bo_count; i++) { diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c index 61b2e5377993..26eddbb62893 100644 --- a/drivers/gpu/drm/vc4/vc4_irq.c +++ b/drivers/gpu/drm/vc4/vc4_irq.c @@ -139,6 +139,7 @@ vc4_irq_finish_render_job(struct drm_device *dev) list_move_tail(&exec->head, &vc4->job_done_list); if (exec->fence) { dma_fence_signal_locked(exec->fence); + dma_fence_put(exec->fence); exec->fence = NULL; } vc4_submit_next_render_job(dev); From eaf0ec303bd73f6b2c18f48542974a710fadfeb9 Mon Sep 17 00:00:00 2001 From: Pravin Shedge Date: Wed, 6 Dec 2017 10:16:15 -0800 Subject: [PATCH 095/269] fs: xfs: remove duplicate includes These duplicate includes have been found with scripts/checkincludes.pl but they have been removed manually to avoid removing false positives. Signed-off-by: Pravin Shedge Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/scrub/scrub.c | 1 - fs/xfs/scrub/trace.c | 1 - fs/xfs/xfs_reflink.c | 2 -- fs/xfs/xfs_trace.c | 1 - 4 files changed, 5 deletions(-) diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 9c42c4efd01e..ab3aef2ae823 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -46,7 +46,6 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" -#include "scrub/scrub.h" #include "scrub/btree.h" /* diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 472080e75788..86daed0e3a45 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -26,7 +26,6 @@ #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_da_format.h" -#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_trans.h" diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index cc041a29eb70..cf7c8f81bebb 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -49,8 +49,6 @@ #include "xfs_alloc.h" #include "xfs_quota_defs.h" #include "xfs_quota.h" -#include "xfs_btree.h" -#include "xfs_bmap_btree.h" #include "xfs_reflink.h" #include "xfs_iomap.h" #include "xfs_rmap_btree.h" diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 5d95fe348294..35f3546b6af5 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -24,7 +24,6 @@ #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_da_format.h" -#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_da_btree.h" From f59cf5c29919d17b61913c3360a7bd29b72975c1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Dec 2017 17:32:55 -0800 Subject: [PATCH 096/269] xfs: remove "no-allocation" reservations for file creations If we create a new file we will need an inode, and usually some metadata in the parent direction. Aiming for everything to go well despite the lack of a reservation leads to dirty transactions cancelled under a heavy create/delete load. This patch removes those nospace transactions, which will lead to slightly earlier ENOSPC on some workloads, but instead prevent file system shutdowns due to cancelling dirty transactions for others. A customer could observe assertations failures and shutdowns due to cancelation of dirty transactions during heavy NFS workloads as shown below: 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728125] XFS: Assertion failed: error != -ENOSPC, file: fs/xfs/xfs_inode.c, line: 1262 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728222] Call Trace: 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728246] [] dump_stack+0x63/0x81 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728262] [] warn_slowpath_common+0x8a/0xc0 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728264] [] warn_slowpath_null+0x1a/0x20 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728285] [] asswarn+0x33/0x40 [xfs] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728308] [] xfs_create+0x7be/0x7d0 [xfs] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728329] [] xfs_generic_create+0x1fb/0x2e0 [xfs] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728348] [] xfs_vn_mknod+0x14/0x20 [xfs] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728366] [] xfs_vn_create+0x13/0x20 [xfs] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728380] [] vfs_create+0xd5/0x140 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728390] [] do_nfsd_create+0x499/0x610 [nfsd] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728396] [] nfsd3_proc_create+0x135/0x210 [nfsd] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728401] [] nfsd_dispatch+0xc3/0x210 [nfsd] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728416] [] svc_process_common+0x453/0x6f0 [sunrpc] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728423] [] svc_process+0x113/0x1f0 [sunrpc] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728427] [] nfsd+0x10f/0x180 [nfsd] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728432] [] ? nfsd_destroy+0x80/0x80 [nfsd] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728438] [] kthread+0xd8/0xf0 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728441] [] ? kthread_create_on_node+0x1b0/0x1b0 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728451] [] ret_from_fork+0x42/0x70 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728453] [] ? kthread_create_on_node+0x1b0/0x1b0 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728454] ---[ end trace f9822c842fec81d4 ]--- 2017-05-30 21:17:06 kernel: ALERT: [ 2670.728477] XFS (sdb): Internal error xfs_trans_cancel at line 983 of file fs/xfs/xfs_trans.c. Caller xfs_create+0x4ee/0x7d0 [xfs] 2017-05-30 21:17:06 kernel: ALERT: [ 2670.728684] XFS (sdb): Corruption of in-memory data detected. Shutting down filesystem 2017-05-30 21:17:06 kernel: ALERT: [ 2670.728685] XFS (sdb): Please umount the filesystem and rectify the problem(s) Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ialloc.c | 10 +++------- fs/xfs/libxfs/xfs_ialloc.h | 1 - fs/xfs/xfs_inode.c | 33 +++++++-------------------------- fs/xfs/xfs_inode.h | 2 +- fs/xfs/xfs_qm.c | 4 ++-- fs/xfs/xfs_symlink.c | 15 +-------------- 6 files changed, 14 insertions(+), 51 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index de3f04a98656..3b57ef0f2f76 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -920,8 +920,7 @@ STATIC xfs_agnumber_t xfs_ialloc_ag_select( xfs_trans_t *tp, /* transaction pointer */ xfs_ino_t parent, /* parent directory inode number */ - umode_t mode, /* bits set to indicate file type */ - int okalloc) /* ok to allocate more space */ + umode_t mode) /* bits set to indicate file type */ { xfs_agnumber_t agcount; /* number of ag's in the filesystem */ xfs_agnumber_t agno; /* current ag number */ @@ -978,9 +977,6 @@ xfs_ialloc_ag_select( return agno; } - if (!okalloc) - goto nextag; - if (!pag->pagf_init) { error = xfs_alloc_pagf_init(mp, tp, agno, flags); if (error) @@ -1680,7 +1676,6 @@ xfs_dialloc( struct xfs_trans *tp, xfs_ino_t parent, umode_t mode, - int okalloc, struct xfs_buf **IO_agbp, xfs_ino_t *inop) { @@ -1692,6 +1687,7 @@ xfs_dialloc( int noroom = 0; xfs_agnumber_t start_agno; struct xfs_perag *pag; + int okalloc = 1; if (*IO_agbp) { /* @@ -1707,7 +1703,7 @@ xfs_dialloc( * We do not have an agbp, so select an initial allocation * group for inode allocation. */ - start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc); + start_agno = xfs_ialloc_ag_select(tp, parent, mode); if (start_agno == NULLAGNUMBER) { *inop = NULLFSINO; return 0; diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index d2bdcd5e7312..66a8de0b1caa 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -81,7 +81,6 @@ xfs_dialloc( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t parent, /* parent inode (directory) */ umode_t mode, /* mode bits for new inode */ - int okalloc, /* ok to allocate more space */ struct xfs_buf **agbp, /* buf for a.g. inode header */ xfs_ino_t *inop); /* inode number allocated */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 801274126648..b41952a4ddd8 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -749,7 +749,6 @@ xfs_ialloc( xfs_nlink_t nlink, dev_t rdev, prid_t prid, - int okalloc, xfs_buf_t **ialloc_context, xfs_inode_t **ipp) { @@ -765,7 +764,7 @@ xfs_ialloc( * Call the space management code to pick * the on-disk inode to be allocated. */ - error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, + error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, ialloc_context, &ino); if (error) return error; @@ -957,7 +956,6 @@ xfs_dir_ialloc( xfs_nlink_t nlink, dev_t rdev, prid_t prid, /* project id */ - int okalloc, /* ok to allocate new space */ xfs_inode_t **ipp, /* pointer to inode; it will be locked. */ int *committed) @@ -988,8 +986,8 @@ xfs_dir_ialloc( * transaction commit so that no other process can steal * the inode(s) that we've just allocated. */ - code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc, - &ialloc_context, &ip); + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, &ialloc_context, + &ip); /* * Return an error if we were unable to allocate a new inode. @@ -1061,7 +1059,7 @@ xfs_dir_ialloc( * this call should always succeed. */ code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, - okalloc, &ialloc_context, &ip); + &ialloc_context, &ip); /* * If we get an error at this point, return to the caller @@ -1182,11 +1180,6 @@ xfs_create( xfs_flush_inodes(mp); error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); } - if (error == -ENOSPC) { - /* No space at all so try a "no-allocation" reservation */ - resblks = 0; - error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp); - } if (error) goto out_release_inode; @@ -1203,19 +1196,13 @@ xfs_create( if (error) goto out_trans_cancel; - if (!resblks) { - error = xfs_dir_canenter(tp, dp, name); - if (error) - goto out_trans_cancel; - } - /* * A newly created regular or special file just has one directory * entry pointing to them, but a directory also the "." entry * pointing to itself. */ - error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, - prid, resblks > 0, &ip, NULL); + error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip, + NULL); if (error) goto out_trans_cancel; @@ -1340,11 +1327,6 @@ xfs_create_tmpfile( tres = &M_RES(mp)->tr_create_tmpfile; error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); - if (error == -ENOSPC) { - /* No space at all so try a "no-allocation" reservation */ - resblks = 0; - error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp); - } if (error) goto out_release_inode; @@ -1353,8 +1335,7 @@ xfs_create_tmpfile( if (error) goto out_trans_cancel; - error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, - prid, resblks > 0, &ip, NULL); + error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip, NULL); if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index cc13c3763721..b2136af9289f 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -428,7 +428,7 @@ xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, - xfs_nlink_t, dev_t, prid_t, int, + xfs_nlink_t, dev_t, prid_t, struct xfs_inode **, int *); /* from xfs_file.c */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 010a13a201aa..ec952dfad359 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -793,8 +793,8 @@ xfs_qm_qino_alloc( return error; if (need_alloc) { - error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, - &committed); + error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ip, + &committed); if (error) { xfs_trans_cancel(tp); return error; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 68d3ca2c4968..2e9e793a8f9d 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -232,11 +232,6 @@ xfs_symlink( resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp); - if (error == -ENOSPC && fs_blocks == 0) { - resblks = 0; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, 0, 0, 0, - &tp); - } if (error) goto out_release_inode; @@ -259,14 +254,6 @@ xfs_symlink( if (error) goto out_trans_cancel; - /* - * Check for ability to enter directory entry, if no space reserved. - */ - if (!resblks) { - error = xfs_dir_canenter(tp, dp, link_name); - if (error) - goto out_trans_cancel; - } /* * Initialize the bmap freelist prior to calling either * bmapi or the directory create code. @@ -277,7 +264,7 @@ xfs_symlink( * Allocate an inode for the symlink. */ error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, - prid, resblks > 0, &ip, NULL); + prid, &ip, NULL); if (error) goto out_trans_cancel; From b7e0b6ff54dd92febbb1914ab93cd6a21622e169 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Dec 2017 16:13:35 -0800 Subject: [PATCH 097/269] xfs: make iomap_begin functions trim iomaps consistently Historically, the XFS iomap_begin function only returned mappings for exactly the range queried, i.e. it doesn't do XFS_BMAPI_ENTIRE lookups. The current vfs iomap consumers are only set up to deal with trimmed mappings. xfs_xattr_iomap_begin does BMAPI_ENTIRE lookups, which is inconsistent with the current iomap usage. Remove the flag so that both iomap_begin functions behave the same way. FWIW this also fixes a behavioral regression in xattr FIEMAP that was introduced in 4.8 wherein attr fork extents are no longer trimmed like they used to be. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_iomap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 33eb4fb2e3fd..7ab52a8bc0a9 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1213,7 +1213,7 @@ xfs_xattr_iomap_begin( ASSERT(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, - &nimaps, XFS_BMAPI_ENTIRE | XFS_BMAPI_ATTRFORK); + &nimaps, XFS_BMAPI_ATTRFORK); out_unlock: xfs_iunlock(ip, lockmode); From d7ee946942bdd12394809305e3df05aa4c8b7b8f Mon Sep 17 00:00:00 2001 From: Markus Trippelsdorf Date: Wed, 11 Oct 2017 07:01:31 +0200 Subject: [PATCH 098/269] VFS: Handle lazytime in do_mount() Since commit e462ec50cb5fa ("VFS: Differentiate mount flags (MS_*) from internal superblock flags") the lazytime mount option doesn't get passed on anymore. Fix the issue by handling the option in do_mount(). Reviewed-by: Lukas Czerner Signed-off-by: Markus Trippelsdorf Signed-off-by: Al Viro --- fs/namespace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/namespace.c b/fs/namespace.c index e158ec6b527b..9d1374ab6e06 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2826,6 +2826,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, SB_DIRSYNC | SB_SILENT | SB_POSIXACL | + SB_LAZYTIME | SB_I_VERSION); if (flags & MS_REMOUNT) From 6f6a23a213be51728502b88741ba6a10cda2441d Mon Sep 17 00:00:00 2001 From: Adam Wallis Date: Mon, 27 Nov 2017 10:45:01 -0500 Subject: [PATCH 099/269] dmaengine: dmatest: move callback wait queue to thread context Commit adfa543e7314 ("dmatest: don't use set_freezable_with_signal()") introduced a bug (that is in fact documented by the patch commit text) that leaves behind a dangling pointer. Since the done_wait structure is allocated on the stack, future invocations to the DMATEST can produce undesirable results (e.g., corrupted spinlocks). Commit a9df21e34b42 ("dmaengine: dmatest: warn user when dma test times out") attempted to WARN the user that the stack was likely corrupted but did not fix the actual issue. This patch fixes the issue by pushing the wait queue and callback structs into the the thread structure. If a failure occurs due to time, dmaengine_terminate_all will force the callback to safely call wake_up_all() without possibility of using a freed pointer. Cc: stable@vger.kernel.org Bug: https://bugzilla.kernel.org/show_bug.cgi?id=197605 Fixes: adfa543e7314 ("dmatest: don't use set_freezable_with_signal()") Reviewed-by: Sinan Kaya Suggested-by: Shunyong Yang Signed-off-by: Adam Wallis Signed-off-by: Vinod Koul --- drivers/dma/dmatest.c | 55 ++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index 47edc7fbf91f..ec5f9d2bc820 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -155,6 +155,12 @@ MODULE_PARM_DESC(run, "Run the test (default: false)"); #define PATTERN_COUNT_MASK 0x1f #define PATTERN_MEMSET_IDX 0x01 +/* poor man's completion - we want to use wait_event_freezable() on it */ +struct dmatest_done { + bool done; + wait_queue_head_t *wait; +}; + struct dmatest_thread { struct list_head node; struct dmatest_info *info; @@ -165,6 +171,8 @@ struct dmatest_thread { u8 **dsts; u8 **udsts; enum dma_transaction_type type; + wait_queue_head_t done_wait; + struct dmatest_done test_done; bool done; }; @@ -342,18 +350,25 @@ static unsigned int dmatest_verify(u8 **bufs, unsigned int start, return error_count; } -/* poor man's completion - we want to use wait_event_freezable() on it */ -struct dmatest_done { - bool done; - wait_queue_head_t *wait; -}; static void dmatest_callback(void *arg) { struct dmatest_done *done = arg; - - done->done = true; - wake_up_all(done->wait); + struct dmatest_thread *thread = + container_of(arg, struct dmatest_thread, done_wait); + if (!thread->done) { + done->done = true; + wake_up_all(done->wait); + } else { + /* + * If thread->done, it means that this callback occurred + * after the parent thread has cleaned up. This can + * happen in the case that driver doesn't implement + * the terminate_all() functionality and a dma operation + * did not occur within the timeout period + */ + WARN(1, "dmatest: Kernel memory may be corrupted!!\n"); + } } static unsigned int min_odd(unsigned int x, unsigned int y) @@ -424,9 +439,8 @@ static unsigned long long dmatest_KBs(s64 runtime, unsigned long long len) */ static int dmatest_func(void *data) { - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(done_wait); struct dmatest_thread *thread = data; - struct dmatest_done done = { .wait = &done_wait }; + struct dmatest_done *done = &thread->test_done; struct dmatest_info *info; struct dmatest_params *params; struct dma_chan *chan; @@ -673,9 +687,9 @@ static int dmatest_func(void *data) continue; } - done.done = false; + done->done = false; tx->callback = dmatest_callback; - tx->callback_param = &done; + tx->callback_param = done; cookie = tx->tx_submit(tx); if (dma_submit_error(cookie)) { @@ -688,21 +702,12 @@ static int dmatest_func(void *data) } dma_async_issue_pending(chan); - wait_event_freezable_timeout(done_wait, done.done, + wait_event_freezable_timeout(thread->done_wait, done->done, msecs_to_jiffies(params->timeout)); status = dma_async_is_tx_complete(chan, cookie, NULL, NULL); - if (!done.done) { - /* - * We're leaving the timed out dma operation with - * dangling pointer to done_wait. To make this - * correct, we'll need to allocate wait_done for - * each test iteration and perform "who's gonna - * free it this time?" dancing. For now, just - * leave it dangling. - */ - WARN(1, "dmatest: Kernel stack may be corrupted!!\n"); + if (!done->done) { dmaengine_unmap_put(um); result("test timed out", total_tests, src_off, dst_off, len, 0); @@ -789,7 +794,7 @@ err_thread_type: dmatest_KBs(runtime, total_len), ret); /* terminate all transfers on specified channels */ - if (ret) + if (ret || failed_tests) dmaengine_terminate_all(chan); thread->done = true; @@ -849,6 +854,8 @@ static int dmatest_add_threads(struct dmatest_info *info, thread->info = info; thread->chan = dtc->chan; thread->type = type; + thread->test_done.wait = &thread->done_wait; + init_waitqueue_head(&thread->done_wait); smp_wmb(); thread->task = kthread_create(dmatest_func, thread, "%s-%s%u", dma_chan_name(chan), op, i); From eb9436966fdc84cebdf222952a99898ab46d9bb0 Mon Sep 17 00:00:00 2001 From: Tobias Jordan Date: Wed, 6 Dec 2017 14:28:27 +0100 Subject: [PATCH 100/269] dmaengine: jz4740: disable/unprepare clk if probe fails in error path of jz4740_dma_probe(), call clk_disable_unprepare() to clean up. Found by Linux Driver Verification project (linuxtesting.org). Fixes: 25ce6c35fea0 MIPS: jz4740: Remove custom DMA API Signed-off-by: Tobias Jordan Signed-off-by: Vinod Koul --- drivers/dma/dma-jz4740.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/dma/dma-jz4740.c b/drivers/dma/dma-jz4740.c index d50273fed715..afd5e10f8927 100644 --- a/drivers/dma/dma-jz4740.c +++ b/drivers/dma/dma-jz4740.c @@ -555,7 +555,7 @@ static int jz4740_dma_probe(struct platform_device *pdev) ret = dma_async_device_register(dd); if (ret) - return ret; + goto err_clk; irq = platform_get_irq(pdev, 0); ret = request_irq(irq, jz4740_dma_irq, 0, dev_name(&pdev->dev), dmadev); @@ -568,6 +568,8 @@ static int jz4740_dma_probe(struct platform_device *pdev) err_unregister: dma_async_device_unregister(dd); +err_clk: + clk_disable_unprepare(dmadev->clk); return ret; } From 996fc4477a0ea28226b30d175f053fb6f9a4fa36 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 10 Dec 2017 23:44:11 -0500 Subject: [PATCH 101/269] ext4: add missing error check in __ext4_new_inode() It's possible for ext4_get_acl() to return an ERR_PTR. So we need to add a check for this case in __ext4_new_inode(). Otherwise on an error we can end up oops the kernel. This was getting triggered by xfstests generic/388, which is a test which exercises the shutdown code path. Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/ialloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b4267d72f249..b32cf263750d 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -816,6 +816,8 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, #ifdef CONFIG_EXT4_FS_POSIX_ACL struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(p)) + return ERR_CAST(p); if (p) { int acl_size = p->a_count * sizeof(ext4_acl_entry); From 0afe9d4ab9d40c281bdcdd118661fe8e4bdcef18 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 9 Dec 2017 21:10:10 +0100 Subject: [PATCH 102/269] mac80211: fix locking in ieee80211_sta_tear_down_BA_sessions Due to overlap between commit 1281103770e9 ("mac80211: Simplify locking in ieee80211_sta_tear_down_BA_sessions()") and the way that Luca modified commit 72e2c3438ba3 ("mac80211: tear down RX aggregations first") when sending it upstream from Intel's internal tree, we get the following warning: WARNING: CPU: 0 PID: 5472 at net/mac80211/agg-tx.c:315 ___ieee80211_stop_tx_ba_session+0x158/0x1f0 since there's no appropriate locking around the call to ___ieee80211_stop_tx_ba_session; Sara's original just had a call to the locked __ieee80211_stop_tx_ba_session (one less underscore) but it looks like Luca modified both of the calls when fixing it up for upstream, leading to the problem at hand. Move the locking appropriately to fix this problem. Reported-by: Kalle Valo Reported-by: Pavel Machek Tested-by: Pavel Machek Signed-off-by: Johannes Berg --- net/mac80211/ht.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 167f83b853e6..1621b6ab17ba 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -291,16 +291,15 @@ void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, int i; mutex_lock(&sta->ampdu_mlme.mtx); - for (i = 0; i < IEEE80211_NUM_TIDS; i++) { + for (i = 0; i < IEEE80211_NUM_TIDS; i++) ___ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT, WLAN_REASON_QSTA_LEAVE_QBSS, reason != AGG_STOP_DESTROY_STA && reason != AGG_STOP_PEER_REQUEST); - } - mutex_unlock(&sta->ampdu_mlme.mtx); for (i = 0; i < IEEE80211_NUM_TIDS; i++) ___ieee80211_stop_tx_ba_session(sta, i, reason); + mutex_unlock(&sta->ampdu_mlme.mtx); /* stopping might queue the work again - so cancel only afterwards */ cancel_work_sync(&sta->ampdu_mlme.work); From 438c84c2f0c794f75ab55ce65c505b01bfce4480 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 11 Dec 2017 11:28:10 +0100 Subject: [PATCH 103/269] ovl: don't follow redirects if redirect_dir=off Overlayfs is following redirects even when redirects are disabled. If this is unintentional (probably the majority of cases) then this can be a problem. E.g. upper layer comes from untrusted USB drive, and attacker crafts a redirect to enable read access to otherwise unreadable directories. If "redirect_dir=off", then turn off following as well as creation of redirects. If "redirect_dir=follow", then turn on following, but turn off creation of redirects (which is what "redirect_dir=off" does now). This is a backward incompatible change, so make it dependent on a config option. Reported-by: David Howells Signed-off-by: Miklos Szeredi --- Documentation/filesystems/overlayfs.txt | 34 +++++++++++++ fs/overlayfs/Kconfig | 10 ++++ fs/overlayfs/namei.c | 16 ++++++ fs/overlayfs/ovl_entry.h | 2 + fs/overlayfs/super.c | 68 ++++++++++++++++++------- 5 files changed, 113 insertions(+), 17 deletions(-) diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt index 8caa60734647..e6a5f4912b6d 100644 --- a/Documentation/filesystems/overlayfs.txt +++ b/Documentation/filesystems/overlayfs.txt @@ -156,6 +156,40 @@ handle it in two different ways: root of the overlay. Finally the directory is moved to the new location. +There are several ways to tune the "redirect_dir" feature. + +Kernel config options: + +- OVERLAY_FS_REDIRECT_DIR: + If this is enabled, then redirect_dir is turned on by default. +- OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW: + If this is enabled, then redirects are always followed by default. Enabling + this results in a less secure configuration. Enable this option only when + worried about backward compatibility with kernels that have the redirect_dir + feature and follow redirects even if turned off. + +Module options (can also be changed through /sys/module/overlay/parameters/*): + +- "redirect_dir=BOOL": + See OVERLAY_FS_REDIRECT_DIR kernel config option above. +- "redirect_always_follow=BOOL": + See OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW kernel config option above. +- "redirect_max=NUM": + The maximum number of bytes in an absolute redirect (default is 256). + +Mount options: + +- "redirect_dir=on": + Redirects are enabled. +- "redirect_dir=follow": + Redirects are not created, but followed. +- "redirect_dir=off": + Redirects are not created and only followed if "redirect_always_follow" + feature is enabled in the kernel/module config. +- "redirect_dir=nofollow": + Redirects are not created and not followed (equivalent to "redirect_dir=off" + if "redirect_always_follow" feature is not enabled). + Non-directories --------------- diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index cbfc196e5dc5..5ac415466861 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -24,6 +24,16 @@ config OVERLAY_FS_REDIRECT_DIR an overlay which has redirects on a kernel that doesn't support this feature will have unexpected results. +config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW + bool "Overlayfs: follow redirects even if redirects are turned off" + default y + depends on OVERLAY_FS + help + Disable this to get a possibly more secure configuration, but that + might not be backward compatible with previous kernels. + + For more information, see Documentation/filesystems/overlayfs.txt + config OVERLAY_FS_INDEX bool "Overlayfs: turn on inodes index feature by default" depends on OVERLAY_FS diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 625ed8066570..2a12dc2e9840 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -681,6 +681,22 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (d.stop) break; + /* + * Following redirects can have security consequences: it's like + * a symlink into the lower layer without the permission checks. + * This is only a problem if the upper layer is untrusted (e.g + * comes from an USB drive). This can allow a non-readable file + * or directory to become readable. + * + * Only following redirects when redirects are enabled disables + * this attack vector when not necessary. + */ + err = -EPERM; + if (d.redirect && !ofs->config.redirect_follow) { + pr_warn_ratelimited("overlay: refusing to follow redirect for (%pd2)\n", dentry); + goto out_put; + } + if (d.redirect && d.redirect[0] == '/' && poe != roe) { poe = roe; diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 752bab645879..9d0bc03bf6e4 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -14,6 +14,8 @@ struct ovl_config { char *workdir; bool default_permissions; bool redirect_dir; + bool redirect_follow; + const char *redirect_mode; bool index; }; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 288d20f9a55a..13a8a8617e44 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -33,6 +33,13 @@ module_param_named(redirect_dir, ovl_redirect_dir_def, bool, 0644); MODULE_PARM_DESC(ovl_redirect_dir_def, "Default to on or off for the redirect_dir feature"); +static bool ovl_redirect_always_follow = + IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW); +module_param_named(redirect_always_follow, ovl_redirect_always_follow, + bool, 0644); +MODULE_PARM_DESC(ovl_redirect_always_follow, + "Follow redirects even if redirect_dir feature is turned off"); + static bool ovl_index_def = IS_ENABLED(CONFIG_OVERLAY_FS_INDEX); module_param_named(index, ovl_index_def, bool, 0644); MODULE_PARM_DESC(ovl_index_def, @@ -232,6 +239,7 @@ static void ovl_free_fs(struct ovl_fs *ofs) kfree(ofs->config.lowerdir); kfree(ofs->config.upperdir); kfree(ofs->config.workdir); + kfree(ofs->config.redirect_mode); if (ofs->creator_cred) put_cred(ofs->creator_cred); kfree(ofs); @@ -295,6 +303,11 @@ static bool ovl_force_readonly(struct ovl_fs *ofs) return (!ofs->upper_mnt || !ofs->workdir); } +static const char *ovl_redirect_mode_def(void) +{ + return ovl_redirect_dir_def ? "on" : "off"; +} + /** * ovl_show_options * @@ -313,12 +326,10 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) } if (ofs->config.default_permissions) seq_puts(m, ",default_permissions"); - if (ofs->config.redirect_dir != ovl_redirect_dir_def) - seq_printf(m, ",redirect_dir=%s", - ofs->config.redirect_dir ? "on" : "off"); + if (strcmp(ofs->config.redirect_mode, ovl_redirect_mode_def()) != 0) + seq_printf(m, ",redirect_dir=%s", ofs->config.redirect_mode); if (ofs->config.index != ovl_index_def) - seq_printf(m, ",index=%s", - ofs->config.index ? "on" : "off"); + seq_printf(m, ",index=%s", ofs->config.index ? "on" : "off"); return 0; } @@ -348,8 +359,7 @@ enum { OPT_UPPERDIR, OPT_WORKDIR, OPT_DEFAULT_PERMISSIONS, - OPT_REDIRECT_DIR_ON, - OPT_REDIRECT_DIR_OFF, + OPT_REDIRECT_DIR, OPT_INDEX_ON, OPT_INDEX_OFF, OPT_ERR, @@ -360,8 +370,7 @@ static const match_table_t ovl_tokens = { {OPT_UPPERDIR, "upperdir=%s"}, {OPT_WORKDIR, "workdir=%s"}, {OPT_DEFAULT_PERMISSIONS, "default_permissions"}, - {OPT_REDIRECT_DIR_ON, "redirect_dir=on"}, - {OPT_REDIRECT_DIR_OFF, "redirect_dir=off"}, + {OPT_REDIRECT_DIR, "redirect_dir=%s"}, {OPT_INDEX_ON, "index=on"}, {OPT_INDEX_OFF, "index=off"}, {OPT_ERR, NULL} @@ -390,10 +399,37 @@ static char *ovl_next_opt(char **s) return sbegin; } +static int ovl_parse_redirect_mode(struct ovl_config *config, const char *mode) +{ + if (strcmp(mode, "on") == 0) { + config->redirect_dir = true; + /* + * Does not make sense to have redirect creation without + * redirect following. + */ + config->redirect_follow = true; + } else if (strcmp(mode, "follow") == 0) { + config->redirect_follow = true; + } else if (strcmp(mode, "off") == 0) { + if (ovl_redirect_always_follow) + config->redirect_follow = true; + } else if (strcmp(mode, "nofollow") != 0) { + pr_err("overlayfs: bad mount option \"redirect_dir=%s\"\n", + mode); + return -EINVAL; + } + + return 0; +} + static int ovl_parse_opt(char *opt, struct ovl_config *config) { char *p; + config->redirect_mode = kstrdup(ovl_redirect_mode_def(), GFP_KERNEL); + if (!config->redirect_mode) + return -ENOMEM; + while ((p = ovl_next_opt(&opt)) != NULL) { int token; substring_t args[MAX_OPT_ARGS]; @@ -428,12 +464,11 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->default_permissions = true; break; - case OPT_REDIRECT_DIR_ON: - config->redirect_dir = true; - break; - - case OPT_REDIRECT_DIR_OFF: - config->redirect_dir = false; + case OPT_REDIRECT_DIR: + kfree(config->redirect_mode); + config->redirect_mode = match_strdup(&args[0]); + if (!config->redirect_mode) + return -ENOMEM; break; case OPT_INDEX_ON: @@ -458,7 +493,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->workdir = NULL; } - return 0; + return ovl_parse_redirect_mode(config, config->redirect_mode); } #define OVL_WORKDIR_NAME "work" @@ -1160,7 +1195,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!cred) goto out_err; - ofs->config.redirect_dir = ovl_redirect_dir_def; ofs->config.index = ovl_index_def; err = ovl_parse_opt((char *) data, &ofs->config); if (err) From 08d8f8a5b094b66b29936e8751b4a818b8db1207 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 27 Nov 2017 10:12:44 -0500 Subject: [PATCH 104/269] ovl: Pass ovl_get_nlink() parameters in right order Right now we seem to be passing index as "lowerdentry" and origin.dentry as "upperdentry". IIUC, we should pass these parameters in reversed order and this looks like a bug. Signed-off-by: Vivek Goyal Acked-by: Amir Goldstein Fixes: caf70cb2ba5d ("ovl: cleanup orphan index entries") Cc: #v4.13 Signed-off-by: Miklos Szeredi --- fs/overlayfs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 2a12dc2e9840..beb945e1963c 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -435,7 +435,7 @@ int ovl_verify_index(struct dentry *index, struct ovl_path *lower, /* Check if index is orphan and don't warn before cleaning it */ if (d_inode(index)->i_nlink == 1 && - ovl_get_nlink(index, origin.dentry, 0) == 0) + ovl_get_nlink(origin.dentry, index, 0) == 0) err = -ENOENT; dput(origin.dentry); From b02a16e6413a2f782e542ef60bad9ff6bf212f8a Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 29 Nov 2017 07:35:21 +0200 Subject: [PATCH 105/269] ovl: update ctx->pos on impure dir iteration This fixes a regression with readdir of impure dir in overlayfs that is shared to VM via 9p fs. Reported-by: Miguel Bernal Marin Fixes: 4edb83bb1041 ("ovl: constant d_ino for non-merge dirs") Cc: #4.14 Signed-off-by: Amir Goldstein Tested-by: Miguel Bernal Marin Signed-off-by: Miklos Szeredi --- fs/overlayfs/readdir.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 0daa4354fec4..51088849ce97 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -663,7 +663,10 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx) return PTR_ERR(rdt.cache); } - return iterate_dir(od->realfile, &rdt.ctx); + err = iterate_dir(od->realfile, &rdt.ctx); + ctx->pos = rdt.ctx.pos; + + return err; } From e8d4bfe3a71537284a90561f77c85dea6c154369 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Wed, 29 Nov 2017 10:01:32 +0800 Subject: [PATCH 106/269] ovl: Sync upper dirty data when syncing overlayfs When executing filesystem sync or umount on overlayfs, dirty data does not get synced as expected on upper filesystem. This patch fixes sync filesystem method to keep data consistency for overlayfs. Signed-off-by: Chengguang Xu Fixes: e593b2bf513d ("ovl: properly implement sync_filesystem()") Cc: #4.11 Signed-off-by: Miklos Szeredi --- fs/overlayfs/super.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 13a8a8617e44..76440feb79f6 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -252,6 +252,7 @@ static void ovl_put_super(struct super_block *sb) ovl_free_fs(ofs); } +/* Sync real dirty inodes in upper filesystem (if it exists) */ static int ovl_sync_fs(struct super_block *sb, int wait) { struct ovl_fs *ofs = sb->s_fs_info; @@ -260,14 +261,24 @@ static int ovl_sync_fs(struct super_block *sb, int wait) if (!ofs->upper_mnt) return 0; - upper_sb = ofs->upper_mnt->mnt_sb; - if (!upper_sb->s_op->sync_fs) + + /* + * If this is a sync(2) call or an emergency sync, all the super blocks + * will be iterated, including upper_sb, so no need to do anything. + * + * If this is a syncfs(2) call, then we do need to call + * sync_filesystem() on upper_sb, but enough if we do it when being + * called with wait == 1. + */ + if (!wait) return 0; - /* real inodes have already been synced by sync_filesystem(ovl_sb) */ + upper_sb = ofs->upper_mnt->mnt_sb; + down_read(&upper_sb->s_umount); - ret = upper_sb->s_op->sync_fs(upper_sb, wait); + ret = sync_filesystem(upper_sb); up_read(&upper_sb->s_umount); + return ret; } From 7879cb43f9a75710af439c6bd81c94de1aa3d740 Mon Sep 17 00:00:00 2001 From: Vasyl Gomonovych Date: Tue, 28 Nov 2017 00:09:23 +0100 Subject: [PATCH 107/269] ovl: Use PTR_ERR_OR_ZERO() Fix ptr_ret.cocci warnings: fs/overlayfs/overlayfs.h:179:11-17: WARNING: PTR_ERR_OR_ZERO can be used Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR Generated by: scripts/coccinelle/api/ptr_ret.cocci Signed-off-by: Vasyl Gomonovych Signed-off-by: Miklos Szeredi --- fs/overlayfs/overlayfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 13eab09a6b6f..b489099ccd49 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -180,7 +180,7 @@ static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry) static inline struct dentry *ovl_do_tmpfile(struct dentry *dentry, umode_t mode) { struct dentry *ret = vfs_tmpfile(dentry, mode, 0); - int err = IS_ERR(ret) ? PTR_ERR(ret) : 0; + int err = PTR_ERR_OR_ZERO(ret); pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err); return ret; From 8722e095f5a44d0e409e45c5ddc2ee9cf589c777 Mon Sep 17 00:00:00 2001 From: Vincent Pelletier Date: Thu, 30 Nov 2017 15:31:06 +0000 Subject: [PATCH 108/269] usb: dwc3: gadget: Wait longer for controller to end command processing DWC3_DEPCMD_ENDTRANSFER has been witnessed to require around 600 iterations before controller would become idle again after unplugging the USB cable with AIO reads submitted. Bump timeout from 500 iterations to 1000 so dwc3_stop_active_transfer does not receive -ETIMEDOUT and does not WARN: [ 81.326273] ------------[ cut here ]------------ [ 81.335341] WARNING: CPU: 0 PID: 1874 at drivers/usb/dwc3/gadget.c:2627 dwc3_stop_active_transfer.constprop.23+0x69/0xc0 [dwc3] [ 81.347094] Modules linked in: usb_f_fs libcomposite configfs bnep btsdio bluetooth ecdh_generic brcmfmac brcmutil dwc3 intel_powerclamp coretemp ulpi kvm_intel udc_core kvm irqbypass crc32_pclmul crc32c_intel pcbc dwc3_pci aesni_intel aes_i586 crypto_simd cryptd ehci_pci ehci_hcd basincove_gpadc industrialio gpio_keys usbcore usb_common [ 81.378142] CPU: 0 PID: 1874 Comm: irq/34-dwc3 Not tainted 4.14.0-edison+ #119 [ 81.385545] Hardware name: Intel Corporation Merrifield/BODEGA BAY, BIOS 542 2015.01.21:18.19.48 [ 81.394548] task: f5b1be00 task.stack: f420a000 [ 81.399219] EIP: dwc3_stop_active_transfer.constprop.23+0x69/0xc0 [dwc3] [ 81.406086] EFLAGS: 00010086 CPU: 0 [ 81.409672] EAX: 0000001f EBX: f5729800 ECX: c132a2a2 EDX: 00000000 [ 81.416096] ESI: f4054014 EDI: f41cf400 EBP: f420be10 ESP: f420bdf4 [ 81.422521] DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 [ 81.428061] CR0: 80050033 CR2: b7a3f000 CR3: 01d94000 CR4: 001006d0 [ 81.434483] Call Trace: [ 81.437063] __dwc3_gadget_ep_disable+0xa3/0x2b0 [dwc3] [ 81.442438] ? _raw_spin_lock_irqsave+0x32/0x40 [ 81.447135] dwc3_gadget_ep_disable+0xbf/0xe0 [dwc3] [ 81.452269] usb_ep_disable+0x1c/0xd0 [udc_core] [ 81.457048] ffs_func_eps_disable.isra.15+0x3b/0x90 [usb_f_fs] [ 81.463070] ffs_func_set_alt+0x7d/0x310 [usb_f_fs] [ 81.468132] ffs_func_disable+0x14/0x20 [usb_f_fs] [ 81.473075] reset_config+0x5b/0x90 [libcomposite] [ 81.478023] composite_disconnect+0x2b/0x50 [libcomposite] [ 81.483685] dwc3_disconnect_gadget+0x39/0x50 [dwc3] [ 81.488808] dwc3_gadget_disconnect_interrupt+0x21b/0x250 [dwc3] [ 81.495014] dwc3_thread_interrupt+0x2a8/0xf70 [dwc3] [ 81.500219] ? __schedule+0x78c/0x7e0 [ 81.504027] irq_thread_fn+0x18/0x30 [ 81.507715] ? irq_thread+0xb7/0x180 [ 81.511400] irq_thread+0x111/0x180 [ 81.515000] ? irq_finalize_oneshot+0xe0/0xe0 [ 81.519490] ? wake_threads_waitq+0x30/0x30 [ 81.523806] kthread+0x107/0x110 [ 81.527131] ? disable_percpu_irq+0x50/0x50 [ 81.531439] ? kthread_stop+0x150/0x150 [ 81.535397] ret_from_fork+0x19/0x24 [ 81.539136] Code: 89 d8 c7 45 ec 00 00 00 00 c7 45 f0 00 00 00 00 c7 45 f4 00 00 00 00 e8 56 ef ff ff 85 c0 74 12 50 68 b9 1c 14 f8 e8 64 0f f7 c8 <0f> ff 58 5a 8d 76 00 8b 83 98 00 00 00 c6 83 a0 00 00 00 00 83 [ 81.559295] ---[ end trace f3133eec81a473b8 ]--- Number of iterations measured on 4 consecutive unplugs: [ 1088.799777] dwc3_send_gadget_ep_cmd(cmd=331016, params={0, 0, 0}) iterated 605 times [ 1222.024986] dwc3_send_gadget_ep_cmd(cmd=331016, params={0, 0, 0}) iterated 580 times [ 1317.590452] dwc3_send_gadget_ep_cmd(cmd=331016, params={0, 0, 0}) iterated 598 times [ 1453.218314] dwc3_send_gadget_ep_cmd(cmd=331016, params={0, 0, 0}) iterated 594 times Signed-off-by: Vincent Pelletier Signed-off-by: Felipe Balbi --- drivers/usb/dwc3/gadget.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 981fd986cf82..01e595bb1ff1 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -259,7 +259,7 @@ int dwc3_send_gadget_ep_cmd(struct dwc3_ep *dep, unsigned cmd, { const struct usb_endpoint_descriptor *desc = dep->endpoint.desc; struct dwc3 *dwc = dep->dwc; - u32 timeout = 500; + u32 timeout = 1000; u32 reg; int cmd_status = 0; From ded600ea9fb51a495d2fcd21e90351df876488e8 Mon Sep 17 00:00:00 2001 From: Andreas Platschek Date: Thu, 7 Dec 2017 11:32:20 +0100 Subject: [PATCH 109/269] usb: dwc3: of-simple: fix missing clk_disable_unprepare If of_clk_get() fails, the clean-up of already initialized clocks should be the same as when clk_prepare_enable() fails. Thus a clk_disable_unprepare() for each clock should be called before the clk_put(). Found by Linux Driver Verification project (linuxtesting.org). Fixes: 16adc674d0d6 ("usb: dwc3: ep0: fix setup_packet_pending initialization") Signed-off-by: Andreas Platschek Signed-off-by: Felipe Balbi --- drivers/usb/dwc3/dwc3-of-simple.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/dwc3-of-simple.c b/drivers/usb/dwc3/dwc3-of-simple.c index c4a4d7bd2766..762370dd7c75 100644 --- a/drivers/usb/dwc3/dwc3-of-simple.c +++ b/drivers/usb/dwc3/dwc3-of-simple.c @@ -51,8 +51,10 @@ static int dwc3_of_simple_clk_init(struct dwc3_of_simple *simple, int count) clk = of_clk_get(np, i); if (IS_ERR(clk)) { - while (--i >= 0) + while (--i >= 0) { + clk_disable_unprepare(simple->clks[i]); clk_put(simple->clks[i]); + } return PTR_ERR(clk); } From a0d8c4cfdf31a9576f683628e50b76714c785ef1 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 7 Dec 2017 13:40:24 +0900 Subject: [PATCH 110/269] usb: dwc3: of-simple: set dev_pm_ops dwc3_of_simple_dev_pm_ops has never been used since the initial support by commit 16adc674d0d6 ("usb: dwc3: add generic OF glue layer"). I guess it just missed to set .pm struct member. Signed-off-by: Masahiro Yamada Signed-off-by: Felipe Balbi --- drivers/usb/dwc3/dwc3-of-simple.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/dwc3/dwc3-of-simple.c b/drivers/usb/dwc3/dwc3-of-simple.c index 762370dd7c75..7ae0eefc7cc7 100644 --- a/drivers/usb/dwc3/dwc3-of-simple.c +++ b/drivers/usb/dwc3/dwc3-of-simple.c @@ -205,6 +205,7 @@ static struct platform_driver dwc3_of_simple_driver = { .driver = { .name = "dwc3-of-simple", .of_match_table = of_dwc3_simple_match, + .pm = &dwc3_of_simple_dev_pm_ops, }, }; From ec5bb87e4e2a1d3a35563a7bcfac9febf67aba9d Mon Sep 17 00:00:00 2001 From: Manu Gautam Date: Wed, 6 Dec 2017 12:49:04 +0530 Subject: [PATCH 111/269] usb: dwc3: gadget: Fix PCM1 for ISOC EP with ep->mult less than 3 For isochronous endpoints with ep->mult less than 3, PCM1 value of trb->size in set incorrectly. For ep->mult = 2, this is set to 0/-1 and for ep->mult = 1, this is set to -2. This is because the initial mult is set to ep->mult - 1 instead of 2. Signed-off-by: Manu Gautam Signed-off-by: Felipe Balbi --- drivers/usb/dwc3/gadget.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 01e595bb1ff1..639dd1b163a0 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -912,7 +912,7 @@ static void __dwc3_prepare_one_trb(struct dwc3_ep *dep, struct dwc3_trb *trb, */ if (speed == USB_SPEED_HIGH) { struct usb_ep *ep = &dep->endpoint; - unsigned int mult = ep->mult - 1; + unsigned int mult = 2; unsigned int maxp = usb_endpoint_maxp(ep->desc); if (length <= (2 * maxp)) From 9273083a1530891360e9fe4fad26ae96810db499 Mon Sep 17 00:00:00 2001 From: Minas Harutyunyan Date: Thu, 30 Nov 2017 12:16:37 +0400 Subject: [PATCH 112/269] usb: dwc2: Fix TxFIFOn sizes and total TxFIFO size issues In host mode reading from DPTXSIZn returning invalid value in dwc2_check_param_tx_fifo_sizes function. In total TxFIFO size calculations unnecessarily reducing by ep_info. hw->total_fifo_size can be fully allocated for FIFO's. Added num_dev_in_eps member in dwc2_hw_params structure to save number of IN EPs. Added g_tx_fifo_size array in dwc2_hw_params structure to store power on reset values of DPTXSIZn registers in forced device mode. Updated dwc2_hsotg_tx_fifo_count() function to get TxFIFO count from num_dev_in_eps. Updated dwc2_get_dev_hwparams() function to store DPTXFSIZn in g_tx_fifo_size array. dwc2_get_host/dev_hwparams() functions call moved after num_dev_in_eps set from hwcfg4. Modified dwc2_check_param_tx_fifo_sizes() function to check TxFIFOn sizes based on g_tx_fifo_size array. Removed ep_info subtraction during calculation of tx_addr_max in dwc2_hsotg_tx_fifo_total_depth() function. Also removed dwc2_hsotg_ep_info_size() function as no more need. Acked-by: John Youn Signed-off-by: Gevorg Sahakyan Signed-off-by: Minas Harutyunyan Signed-off-by: Felipe Balbi --- drivers/usb/dwc2/core.h | 4 ++++ drivers/usb/dwc2/gadget.c | 42 ++------------------------------------- drivers/usb/dwc2/params.c | 29 +++++++++++++++++---------- 3 files changed, 25 insertions(+), 50 deletions(-) diff --git a/drivers/usb/dwc2/core.h b/drivers/usb/dwc2/core.h index f66c94130cac..31749c79045f 100644 --- a/drivers/usb/dwc2/core.h +++ b/drivers/usb/dwc2/core.h @@ -537,6 +537,7 @@ struct dwc2_core_params { * 2 - Internal DMA * @power_optimized Are power optimizations enabled? * @num_dev_ep Number of device endpoints available + * @num_dev_in_eps Number of device IN endpoints available * @num_dev_perio_in_ep Number of device periodic IN endpoints * available * @dev_token_q_depth Device Mode IN Token Sequence Learning Queue @@ -565,6 +566,7 @@ struct dwc2_core_params { * 2 - 8 or 16 bits * @snpsid: Value from SNPSID register * @dev_ep_dirs: Direction of device endpoints (GHWCFG1) + * @g_tx_fifo_size[] Power-on values of TxFIFO sizes */ struct dwc2_hw_params { unsigned op_mode:3; @@ -586,12 +588,14 @@ struct dwc2_hw_params { unsigned fs_phy_type:2; unsigned i2c_enable:1; unsigned num_dev_ep:4; + unsigned num_dev_in_eps : 4; unsigned num_dev_perio_in_ep:4; unsigned total_fifo_size:16; unsigned power_optimized:1; unsigned utmi_phy_data_width:2; u32 snpsid; u32 dev_ep_dirs; + u32 g_tx_fifo_size[MAX_EPS_CHANNELS]; }; /* Size of control and EP0 buffers */ diff --git a/drivers/usb/dwc2/gadget.c b/drivers/usb/dwc2/gadget.c index 88529d092503..e4c3ce0de5de 100644 --- a/drivers/usb/dwc2/gadget.c +++ b/drivers/usb/dwc2/gadget.c @@ -195,55 +195,18 @@ int dwc2_hsotg_tx_fifo_count(struct dwc2_hsotg *hsotg) { if (hsotg->hw_params.en_multiple_tx_fifo) /* In dedicated FIFO mode we need count of IN EPs */ - return (dwc2_readl(hsotg->regs + GHWCFG4) & - GHWCFG4_NUM_IN_EPS_MASK) >> GHWCFG4_NUM_IN_EPS_SHIFT; + return hsotg->hw_params.num_dev_in_eps; else /* In shared FIFO mode we need count of Periodic IN EPs */ return hsotg->hw_params.num_dev_perio_in_ep; } -/** - * dwc2_hsotg_ep_info_size - return Endpoint Info Control block size in DWORDs - */ -static int dwc2_hsotg_ep_info_size(struct dwc2_hsotg *hsotg) -{ - int val = 0; - int i; - u32 ep_dirs; - - /* - * Don't need additional space for ep info control registers in - * slave mode. - */ - if (!using_dma(hsotg)) { - dev_dbg(hsotg->dev, "Buffer DMA ep info size 0\n"); - return 0; - } - - /* - * Buffer DMA mode - 1 location per endpoit - * Descriptor DMA mode - 4 locations per endpoint - */ - ep_dirs = hsotg->hw_params.dev_ep_dirs; - - for (i = 0; i <= hsotg->hw_params.num_dev_ep; i++) { - val += ep_dirs & 3 ? 1 : 2; - ep_dirs >>= 2; - } - - if (using_desc_dma(hsotg)) - val = val * 4; - - return val; -} - /** * dwc2_hsotg_tx_fifo_total_depth - return total FIFO depth available for * device mode TX FIFOs */ int dwc2_hsotg_tx_fifo_total_depth(struct dwc2_hsotg *hsotg) { - int ep_info_size; int addr; int tx_addr_max; u32 np_tx_fifo_size; @@ -252,8 +215,7 @@ int dwc2_hsotg_tx_fifo_total_depth(struct dwc2_hsotg *hsotg) hsotg->params.g_np_tx_fifo_size); /* Get Endpoint Info Control block size in DWORDs. */ - ep_info_size = dwc2_hsotg_ep_info_size(hsotg); - tx_addr_max = hsotg->hw_params.total_fifo_size - ep_info_size; + tx_addr_max = hsotg->hw_params.total_fifo_size; addr = hsotg->params.g_rx_fifo_size + np_tx_fifo_size; if (tx_addr_max <= addr) diff --git a/drivers/usb/dwc2/params.c b/drivers/usb/dwc2/params.c index ef73af6e03a9..03fd20f0b496 100644 --- a/drivers/usb/dwc2/params.c +++ b/drivers/usb/dwc2/params.c @@ -484,8 +484,7 @@ static void dwc2_check_param_tx_fifo_sizes(struct dwc2_hsotg *hsotg) } for (fifo = 1; fifo <= fifo_count; fifo++) { - dptxfszn = (dwc2_readl(hsotg->regs + DPTXFSIZN(fifo)) & - FIFOSIZE_DEPTH_MASK) >> FIFOSIZE_DEPTH_SHIFT; + dptxfszn = hsotg->hw_params.g_tx_fifo_size[fifo]; if (hsotg->params.g_tx_fifo_size[fifo] < min || hsotg->params.g_tx_fifo_size[fifo] > dptxfszn) { @@ -609,6 +608,7 @@ static void dwc2_get_dev_hwparams(struct dwc2_hsotg *hsotg) struct dwc2_hw_params *hw = &hsotg->hw_params; bool forced; u32 gnptxfsiz; + int fifo, fifo_count; if (hsotg->dr_mode == USB_DR_MODE_HOST) return; @@ -617,6 +617,14 @@ static void dwc2_get_dev_hwparams(struct dwc2_hsotg *hsotg) gnptxfsiz = dwc2_readl(hsotg->regs + GNPTXFSIZ); + fifo_count = dwc2_hsotg_tx_fifo_count(hsotg); + + for (fifo = 1; fifo <= fifo_count; fifo++) { + hw->g_tx_fifo_size[fifo] = + (dwc2_readl(hsotg->regs + DPTXFSIZN(fifo)) & + FIFOSIZE_DEPTH_MASK) >> FIFOSIZE_DEPTH_SHIFT; + } + if (forced) dwc2_clear_force_mode(hsotg); @@ -661,14 +669,6 @@ int dwc2_get_hwparams(struct dwc2_hsotg *hsotg) hwcfg4 = dwc2_readl(hsotg->regs + GHWCFG4); grxfsiz = dwc2_readl(hsotg->regs + GRXFSIZ); - /* - * Host specific hardware parameters. Reading these parameters - * requires the controller to be in host mode. The mode will - * be forced, if necessary, to read these values. - */ - dwc2_get_host_hwparams(hsotg); - dwc2_get_dev_hwparams(hsotg); - /* hwcfg1 */ hw->dev_ep_dirs = hwcfg1; @@ -711,6 +711,8 @@ int dwc2_get_hwparams(struct dwc2_hsotg *hsotg) hw->en_multiple_tx_fifo = !!(hwcfg4 & GHWCFG4_DED_FIFO_EN); hw->num_dev_perio_in_ep = (hwcfg4 & GHWCFG4_NUM_DEV_PERIO_IN_EP_MASK) >> GHWCFG4_NUM_DEV_PERIO_IN_EP_SHIFT; + hw->num_dev_in_eps = (hwcfg4 & GHWCFG4_NUM_IN_EPS_MASK) >> + GHWCFG4_NUM_IN_EPS_SHIFT; hw->dma_desc_enable = !!(hwcfg4 & GHWCFG4_DESC_DMA); hw->power_optimized = !!(hwcfg4 & GHWCFG4_POWER_OPTIMIZ); hw->utmi_phy_data_width = (hwcfg4 & GHWCFG4_UTMI_PHY_DATA_WIDTH_MASK) >> @@ -719,6 +721,13 @@ int dwc2_get_hwparams(struct dwc2_hsotg *hsotg) /* fifo sizes */ hw->rx_fifo_size = (grxfsiz & GRXFSIZ_DEPTH_MASK) >> GRXFSIZ_DEPTH_SHIFT; + /* + * Host specific hardware parameters. Reading these parameters + * requires the controller to be in host mode. The mode will + * be forced, if necessary, to read these values. + */ + dwc2_get_host_hwparams(hsotg); + dwc2_get_dev_hwparams(hsotg); return 0; } From 91516a2a4734614d62ee3ed921f8f88acc67c000 Mon Sep 17 00:00:00 2001 From: Christoph Fritz Date: Sat, 9 Dec 2017 23:47:55 +0100 Subject: [PATCH 113/269] mmc: core: apply NO_CMD23 quirk to some specific cards To get an usdhc Apacer and some ATP SD cards work reliable, CMD23 needs to be disabled. This has been tested on i.MX6 (sdhci-esdhc) and rk3288 (dw_mmc-rockchip). Without this patch on i.MX6 (sdhci-esdhc): $ dd if=/dev/urandom of=/mnt/test bs=1M count=10 conv=fsync | | mmc0: starting CMD25 arg 00a71f00 flags 000000b5 | mmc0: blksz 512 blocks 1024 flags 00000100 tsac 3000 ms nsac 0 | mmc0: CMD12 arg 00000000 flags 0000049d | sdhci [sdhci_irq()]: *** mmc0 got interrupt: 0x00000001 | mmc0: Timeout waiting for hardware interrupt. Without this patch on rk3288 (dw_mmc-rockchip): | mmc1: Card stuck in programming state! mmcblk1 card_busy_detect | dwmmc_rockchip ff0c0000.dwmmc: Busy; trying anyway | mmc_host mmc1: Bus speed (slot 0) = 400000Hz (slot req 400000Hz, | actual 400000HZ div = 0) | mmc1: card never left busy state | mmc1: tried to reset card, got error -110 | blk_update_request: I/O error, dev mmcblk1, sector 139778 | Buffer I/O error on dev mmcblk1p1, logical block 131586, lost async | page write Signed-off-by: Christoph Fritz Cc: # v4.14+ Signed-off-by: Ulf Hansson --- drivers/mmc/core/card.h | 2 ++ drivers/mmc/core/quirks.h | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/drivers/mmc/core/card.h b/drivers/mmc/core/card.h index f06cd91964ce..79a5b985ccf5 100644 --- a/drivers/mmc/core/card.h +++ b/drivers/mmc/core/card.h @@ -75,9 +75,11 @@ struct mmc_fixup { #define EXT_CSD_REV_ANY (-1u) #define CID_MANFID_SANDISK 0x2 +#define CID_MANFID_ATP 0x9 #define CID_MANFID_TOSHIBA 0x11 #define CID_MANFID_MICRON 0x13 #define CID_MANFID_SAMSUNG 0x15 +#define CID_MANFID_APACER 0x27 #define CID_MANFID_KINGSTON 0x70 #define CID_MANFID_HYNIX 0x90 diff --git a/drivers/mmc/core/quirks.h b/drivers/mmc/core/quirks.h index f664e9cbc9f8..75d317623852 100644 --- a/drivers/mmc/core/quirks.h +++ b/drivers/mmc/core/quirks.h @@ -52,6 +52,14 @@ static const struct mmc_fixup mmc_blk_fixups[] = { MMC_FIXUP("MMC32G", CID_MANFID_TOSHIBA, CID_OEMID_ANY, add_quirk_mmc, MMC_QUIRK_BLK_NO_CMD23), + /* + * Some SD cards lockup while using CMD23 multiblock transfers. + */ + MMC_FIXUP("AF SD", CID_MANFID_ATP, CID_OEMID_ANY, add_quirk_sd, + MMC_QUIRK_BLK_NO_CMD23), + MMC_FIXUP("APUSD", CID_MANFID_APACER, 0x5048, add_quirk_sd, + MMC_QUIRK_BLK_NO_CMD23), + /* * Some MMC cards need longer data read timeout than indicated in CSD. */ From f5b5702ac55b11113a94d6228d191c7f827b7a3b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Dec 2017 10:14:27 +0100 Subject: [PATCH 114/269] netfilter: exthdr: add missign attributes to policy Add missing netlink attribute policy. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_exthdr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index a0a93d987a3b..47ec1046ad11 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -214,6 +214,8 @@ static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 }, [NFTA_EXTHDR_LEN] = { .type = NLA_U32 }, [NFTA_EXTHDR_FLAGS] = { .type = NLA_U32 }, + [NFTA_EXTHDR_OP] = { .type = NLA_U32 }, + [NFTA_EXTHDR_SREG] = { .type = NLA_U32 }, }; static int nft_exthdr_init(const struct nft_ctx *ctx, From 3487972d7fa6c5143951436ada5933dcf0ec659d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 7 Dec 2017 02:41:18 +0100 Subject: [PATCH 115/269] PM / sleep: Avoid excess pm_runtime_enable() calls in device_resume() Middle-layer code doing suspend-time optimizations for devices with the DPM_FLAG_SMART_SUSPEND flag set (currently, the PCI bus type and the ACPI PM domain) needs to make the core skip ->thaw_early and ->thaw callbacks for those devices in some cases and it sets the power.direct_complete flag for them for this purpose. However, it turns out that setting power.direct_complete outside of the PM core is a bad idea as it triggers an excess invocation of pm_runtime_enable() in device_resume(). For this reason, provide a helper to clear power.is_late_suspended and power.is_suspended to be invoked by the middle-layer code in question instead of setting power.direct_complete and make that code call the new helper. Fixes: c4b65157aeef (PCI / PM: Take SMART_SUSPEND driver flag into account) Fixes: 05087360fd7a (ACPI / PM: Take SMART_SUSPEND driver flag into account) Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Acked-by: Bjorn Helgaas --- drivers/acpi/device_pm.c | 2 +- drivers/base/power/main.c | 15 +++++++++++++++ drivers/pci/pci-driver.c | 2 +- include/linux/pm.h | 1 + 4 files changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index e4ffaeec9ec2..a4c8ad98560d 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -1138,7 +1138,7 @@ int acpi_subsys_thaw_noirq(struct device *dev) * skip all of the subsequent "thaw" callbacks for the device. */ if (dev_pm_smart_suspend_and_suspended(dev)) { - dev->power.direct_complete = true; + dev_pm_skip_next_resume_phases(dev); return 0; } diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index db2f04415927..08744b572af6 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -525,6 +525,21 @@ static void dpm_watchdog_clear(struct dpm_watchdog *wd) /*------------------------- Resume routines -------------------------*/ +/** + * dev_pm_skip_next_resume_phases - Skip next system resume phases for device. + * @dev: Target device. + * + * Make the core skip the "early resume" and "resume" phases for @dev. + * + * This function can be called by middle-layer code during the "noirq" phase of + * system resume if necessary, but not by device drivers. + */ +void dev_pm_skip_next_resume_phases(struct device *dev) +{ + dev->power.is_late_suspended = false; + dev->power.is_suspended = false; +} + /** * device_resume_noirq - Execute a "noirq resume" callback for given device. * @dev: Device to handle. diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 7f47bb72bf30..945099d49f8f 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -999,7 +999,7 @@ static int pci_pm_thaw_noirq(struct device *dev) * the subsequent "thaw" callbacks for the device. */ if (dev_pm_smart_suspend_and_suspended(dev)) { - dev->power.direct_complete = true; + dev_pm_skip_next_resume_phases(dev); return 0; } diff --git a/include/linux/pm.h b/include/linux/pm.h index 65d39115f06d..492ed473ba7e 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -765,6 +765,7 @@ extern int pm_generic_poweroff_late(struct device *dev); extern int pm_generic_poweroff(struct device *dev); extern void pm_generic_complete(struct device *dev); +extern void dev_pm_skip_next_resume_phases(struct device *dev); extern bool dev_pm_smart_suspend_and_suspended(struct device *dev); #else /* !CONFIG_PM_SLEEP */ From 1ac8aa8d0568606485451ea860a6c6c3fad0d42d Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Thu, 30 Nov 2017 11:06:15 -0600 Subject: [PATCH 116/269] ipmi_si: Fix oops with PCI devices When the IPMI PCI code was split out, some code was consolidated for setting the io_setup field in the io structure. The PCI code needed this set before registration to probe register spacing, though, so restore the old code for that function. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=197999 Signed-off-by: Corey Minyard Tested-by: Meelis Roos --- drivers/char/ipmi/ipmi_si_pci.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/char/ipmi/ipmi_si_pci.c b/drivers/char/ipmi/ipmi_si_pci.c index 99771f5cad07..27dd11c49d21 100644 --- a/drivers/char/ipmi/ipmi_si_pci.c +++ b/drivers/char/ipmi/ipmi_si_pci.c @@ -103,10 +103,13 @@ static int ipmi_pci_probe(struct pci_dev *pdev, io.addr_source_cleanup = ipmi_pci_cleanup; io.addr_source_data = pdev; - if (pci_resource_flags(pdev, 0) & IORESOURCE_IO) + if (pci_resource_flags(pdev, 0) & IORESOURCE_IO) { io.addr_type = IPMI_IO_ADDR_SPACE; - else + io.io_setup = ipmi_si_port_setup; + } else { io.addr_type = IPMI_MEM_ADDR_SPACE; + io.io_setup = ipmi_si_mem_setup; + } io.addr_data = pci_resource_start(pdev, 0); io.regspacing = ipmi_pci_probe_regspacing(&io); From 51614b26a029515dd3bc43a8c0e16a9ee51bbf52 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 6 Dec 2017 04:25:44 -0500 Subject: [PATCH 117/269] ipmi_si: fix crash on parisc This patch fixes ipmi crash on parisc introduced in the kernel 4.15-rc. The pointer io.io_setup is not initialized and thus it causes crash in try_smi_init when attempting to call new_smi->io.io_setup. Signed-off-by: Mikulas Patocka Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_si_parisc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/char/ipmi/ipmi_si_parisc.c b/drivers/char/ipmi/ipmi_si_parisc.c index 090b073ab441..6b10f0e18a95 100644 --- a/drivers/char/ipmi/ipmi_si_parisc.c +++ b/drivers/char/ipmi/ipmi_si_parisc.c @@ -10,6 +10,8 @@ static int __init ipmi_parisc_probe(struct parisc_device *dev) { struct si_sm_io io; + memset(&io, 0, sizeof(io)); + io.si_type = SI_KCS; io.addr_source = SI_DEVICETREE; io.addr_type = IPMI_MEM_ADDR_SPACE; From 7f6f60a1ba52538c16f26930bfbcfe193d9d746a Mon Sep 17 00:00:00 2001 From: Dave Young Date: Sat, 9 Dec 2017 12:16:10 +0800 Subject: [PATCH 118/269] mm/early_ioremap: Fix boot hang with earlyprintk=efi,keep earlyprintk=efi,keep does not work any more with a warning in mm/early_ioremap.c: WARN_ON(system_state != SYSTEM_BOOTING): Boot just hangs because of the earlyprintk within the earlyprintk implementation code itself. This is caused by a new introduced middle state in: 69a78ff226fe ("init: Introduce SYSTEM_SCHEDULING state") early_ioremap() is fine in both SYSTEM_BOOTING and SYSTEM_SCHEDULING states, original condition should be updated accordingly. Signed-off-by: Dave Young Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: bp@suse.de Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20171209041610.GA3249@dhcp-128-65.nay.redhat.com Signed-off-by: Ingo Molnar --- mm/early_ioremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index d04ac1ec0559..1826f191e72c 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -111,7 +111,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) enum fixed_addresses idx; int i, slot; - WARN_ON(system_state != SYSTEM_BOOTING); + WARN_ON(system_state >= SYSTEM_RUNNING); slot = -1; for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { From 6d60ce384d1d5ca32b595244db4077a419acc687 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Mon, 27 Nov 2017 08:51:39 +0100 Subject: [PATCH 119/269] x86/mm/kmmio: Fix mmiotrace for page unaligned addresses If something calls ioremap() with an address not aligned to PAGE_SIZE, the returned address might be not aligned as well. This led to a probe registered on exactly the returned address, but the entire page was armed for mmiotracing. On calling iounmap() the address passed to unregister_kmmio_probe() was PAGE_SIZE aligned by the caller leading to a complete freeze of the machine. We should always page align addresses while (un)registerung mappings, because the mmiotracer works on top of pages, not mappings. We still keep track of the probes based on their real addresses and lengths though, because the mmiotrace still needs to know what are mapped memory regions. Also move the call to mmiotrace_iounmap() prior page aligning the address, so that all probes are unregistered properly, otherwise the kernel ends up failing memory allocations randomly after disabling the mmiotracer. Tested-by: Lyude Signed-off-by: Karol Herbst Acked-by: Pekka Paalanen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: nouveau@lists.freedesktop.org Link: http://lkml.kernel.org/r/20171127075139.4928-1-kherbst@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 4 ++-- arch/x86/mm/kmmio.c | 12 +++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 6e4573b1da34..c45b6ec5357b 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -404,11 +404,11 @@ void iounmap(volatile void __iomem *addr) return; } + mmiotrace_iounmap(addr); + addr = (volatile void __iomem *) (PAGE_MASK & (unsigned long __force)addr); - mmiotrace_iounmap(addr); - /* Use the vm area unlocked, assuming the caller ensures there isn't another iounmap for the same address in parallel. Reuse of the virtual address is prevented by diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index c21c2ed04612..58477ec3d66d 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -435,17 +435,18 @@ int register_kmmio_probe(struct kmmio_probe *p) unsigned long flags; int ret = 0; unsigned long size = 0; + unsigned long addr = p->addr & PAGE_MASK; const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); unsigned int l; pte_t *pte; spin_lock_irqsave(&kmmio_lock, flags); - if (get_kmmio_probe(p->addr)) { + if (get_kmmio_probe(addr)) { ret = -EEXIST; goto out; } - pte = lookup_address(p->addr, &l); + pte = lookup_address(addr, &l); if (!pte) { ret = -EINVAL; goto out; @@ -454,7 +455,7 @@ int register_kmmio_probe(struct kmmio_probe *p) kmmio_count++; list_add_rcu(&p->list, &kmmio_probes); while (size < size_lim) { - if (add_kmmio_fault_page(p->addr + size)) + if (add_kmmio_fault_page(addr + size)) pr_err("Unable to set page fault.\n"); size += page_level_size(l); } @@ -528,19 +529,20 @@ void unregister_kmmio_probe(struct kmmio_probe *p) { unsigned long flags; unsigned long size = 0; + unsigned long addr = p->addr & PAGE_MASK; const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); struct kmmio_fault_page *release_list = NULL; struct kmmio_delayed_release *drelease; unsigned int l; pte_t *pte; - pte = lookup_address(p->addr, &l); + pte = lookup_address(addr, &l); if (!pte) return; spin_lock_irqsave(&kmmio_lock, flags); while (size < size_lim) { - release_kmmio_fault_page(p->addr + size, &release_list); + release_kmmio_fault_page(addr + size, &release_list); size += page_level_size(l); } list_del_rcu(&p->list); From 2064a5ab04707c55003e099e5abbf19a0826bbac Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 3 Dec 2017 13:19:00 -0800 Subject: [PATCH 120/269] sched/core: Fix kernel-doc warnings after code movement Fix the following kernel-doc warnings after code restructuring: ../kernel/sched/core.c:5113: warning: No description found for parameter 't' ../kernel/sched/core.c:5113: warning: Excess function parameter 'interval' description in 'sched_rr_get_interval' get rid of set_fs()") Signed-off-by: Randy Dunlap Cc: Al Viro Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: abca5fc535a3e ("sched_rr_get_interval(): move compat to native, Link: http://lkml.kernel.org/r/995c6ded-b32e-bbe4-d9f5-4d42d121aff1@infradead.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 75554f366fd3..644fa2e3d993 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5097,17 +5097,6 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) return ret; } -/** - * sys_sched_rr_get_interval - return the default timeslice of a process. - * @pid: pid of the process. - * @interval: userspace pointer to the timeslice value. - * - * this syscall writes the default timeslice value of a given process - * into the user-space timespec buffer. A value of '0' means infinity. - * - * Return: On success, 0 and the timeslice is in @interval. Otherwise, - * an error code. - */ static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) { struct task_struct *p; @@ -5144,6 +5133,17 @@ out_unlock: return retval; } +/** + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. + * + * this syscall writes the default timeslice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + * + * Return: On success, 0 and the timeslice is in @interval. Otherwise, + * an error code. + */ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, struct timespec __user *, interval) { From 01dfee9582d9b4403c4902df096ed8b43d55181c Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 8 Dec 2017 11:56:14 +0900 Subject: [PATCH 121/269] workqueue: remove unneeded kallsyms include The filw was converted from print_symbol() to %pf some time ago (044c782ce3a901fb "workqueue: fix checkpatch issues"). kallsyms does not seem to be needed anymore. Signed-off-by: Sergey Senozhatsky Cc: Tejun Heo Cc: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 45ce93f3dd1f..43d18cb46308 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include From 86ad5c97ce5ccdda1459d35370fd5e105721bb8d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 9 Dec 2017 14:49:14 +0300 Subject: [PATCH 122/269] RISC-V: Logical vs Bitwise typo In the current code, there is a ! logical NOT where a bitwise ~ NOT was intended. It means that we never return -EINVAL. Signed-off-by: Dan Carpenter Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/sys_riscv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c index a2ae936a093e..79c78668258e 100644 --- a/arch/riscv/kernel/sys_riscv.c +++ b/arch/riscv/kernel/sys_riscv.c @@ -70,7 +70,7 @@ SYSCALL_DEFINE3(riscv_flush_icache, uintptr_t, start, uintptr_t, end, bool local = (flags & SYS_RISCV_FLUSH_ICACHE_LOCAL) != 0; /* Check the reserved flags. */ - if (unlikely(flags & !SYS_RISCV_FLUSH_ICACHE_ALL)) + if (unlikely(flags & ~SYS_RISCV_FLUSH_ICACHE_ALL)) return -EINVAL; flush_icache_mm(mm, local); From 3cfa5008081db845c6c53d531ec34e9c84a9fd99 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Tue, 5 Dec 2017 17:48:11 -0800 Subject: [PATCH 123/269] RISC-V: Resurrect smp_mb__after_spinlock() I removed this last week because of an incorrect comment: smp_mb__after_spinlock() is actually still used, and is necessary on RISC-V. It's been resurrected, with a comment that describes what it actually does this time. Thanks to Andrea for finding the bug! Fixes: 3343eb6806f3 ("RISC-V: Remove smb_mb__{before,after}_spinlock()") CC: Andrea Parri Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/barrier.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/riscv/include/asm/barrier.h b/arch/riscv/include/asm/barrier.h index 773c4e039cd7..c0319cbf1eec 100644 --- a/arch/riscv/include/asm/barrier.h +++ b/arch/riscv/include/asm/barrier.h @@ -38,6 +38,25 @@ #define smp_rmb() RISCV_FENCE(r,r) #define smp_wmb() RISCV_FENCE(w,w) +/* + * This is a very specific barrier: it's currently only used in two places in + * the kernel, both in the scheduler. See include/linux/spinlock.h for the two + * orderings it guarantees, but the "critical section is RCsc" guarantee + * mandates a barrier on RISC-V. The sequence looks like: + * + * lr.aq lock + * sc lock <= LOCKED + * smp_mb__after_spinlock() + * // critical section + * lr lock + * sc.rl lock <= UNLOCKED + * + * The AQ/RL pair provides a RCpc critical section, but there's not really any + * way we can take advantage of that here because the ordering is only enforced + * on that one lock. Thus, we're just doing a full fence. + */ +#define smp_mb__after_spinlock() RISCV_FENCE(rw,rw) + #include #endif /* __ASSEMBLY__ */ From 27b0174525325bf18919597016483a709f3372f8 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Fri, 8 Dec 2017 11:23:23 -0800 Subject: [PATCH 124/269] RISC-V: Remove unused CONFIG_HVC_RISCV_SBI code This is code that probably should never have made it into the kernel in the first place: it depends on a driver that hadn't been reviewed yet. During the HVC_SBI_RISCV review process a better way of doing this was suggested, but that means this code is defunct. It's compile-time disabled in 4.15 because the driver isn't in, so I think it's safe to just remove this for now. CC: Greg KH Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/setup.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 8fbb6749910d..cb7b0c63014e 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -38,10 +38,6 @@ #include #include -#ifdef CONFIG_HVC_RISCV_SBI -#include -#endif - #ifdef CONFIG_DUMMY_CONSOLE struct screen_info screen_info = { .orig_video_lines = 30, @@ -212,13 +208,6 @@ static void __init setup_bootmem(void) void __init setup_arch(char **cmdline_p) { -#if defined(CONFIG_HVC_RISCV_SBI) - if (likely(early_console == NULL)) { - early_console = &riscv_sbi_early_console_dev; - register_console(early_console); - } -#endif - #ifdef CONFIG_CMDLINE_BOOL #ifdef CONFIG_CMDLINE_OVERRIDE strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); From a8ceb5dbfde1092b466936bca0ff3be127ecf38e Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 5 Dec 2017 21:29:37 +0200 Subject: [PATCH 125/269] ptr_ring: add barriers Users of ptr_ring expect that it's safe to give the data structure a pointer and have it be available to consumers, but that actually requires an smb_wmb or a stronger barrier. In absence of such barriers and on architectures that reorder writes, consumer might read an un=initialized value from an skb pointer stored in the skb array. This was observed causing crashes. To fix, add memory barriers. The barrier we use is a wmb, the assumption being that producers do not need to read the value so we do not need to order these reads. Reported-by: George Cherian Suggested-by: Jason Wang Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Signed-off-by: David S. Miller --- include/linux/ptr_ring.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 37b4bb2545b3..6866df4f31b5 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -101,12 +101,18 @@ static inline bool ptr_ring_full_bh(struct ptr_ring *r) /* Note: callers invoking this in a loop must use a compiler barrier, * for example cpu_relax(). Callers must hold producer_lock. + * Callers are responsible for making sure pointer that is being queued + * points to a valid data. */ static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr) { if (unlikely(!r->size) || r->queue[r->producer]) return -ENOSPC; + /* Make sure the pointer we are storing points to a valid data. */ + /* Pairs with smp_read_barrier_depends in __ptr_ring_consume. */ + smp_wmb(); + r->queue[r->producer++] = ptr; if (unlikely(r->producer >= r->size)) r->producer = 0; @@ -275,6 +281,9 @@ static inline void *__ptr_ring_consume(struct ptr_ring *r) if (ptr) __ptr_ring_discard_one(r); + /* Make sure anyone accessing data through the pointer is up to date. */ + /* Pairs with smp_wmb in __ptr_ring_produce. */ + smp_read_barrier_depends(); return ptr; } From 23715275e4fb6f64358a499d20928a9e93819f2f Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 11 Dec 2017 18:19:33 +0300 Subject: [PATCH 126/269] netfilter: ip6t_MASQUERADE: add dependency on conntrack module After commit 4d3a57f23dec ("netfilter: conntrack: do not enable connection tracking unless needed") conntrack is disabled by default unless some module explicitly declares dependency in particular network namespace. Fixes: a357b3f80bc8 ("netfilter: nat: add dependencies on conntrack module") Signed-off-by: Konstantin Khlebnikov Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/ip6t_MASQUERADE.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c index 2b1a15846f9a..92c0047e7e33 100644 --- a/net/ipv6/netfilter/ip6t_MASQUERADE.c +++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c @@ -33,13 +33,19 @@ static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) if (range->flags & NF_NAT_RANGE_MAP_IPS) return -EINVAL; - return 0; + return nf_ct_netns_get(par->net, par->family); +} + +static void masquerade_tg6_destroy(const struct xt_tgdtor_param *par) +{ + nf_ct_netns_put(par->net, par->family); } static struct xt_target masquerade_tg6_reg __read_mostly = { .name = "MASQUERADE", .family = NFPROTO_IPV6, .checkentry = masquerade_tg6_checkentry, + .destroy = masquerade_tg6_destroy, .target = masquerade_tg6, .targetsize = sizeof(struct nf_nat_range), .table = "nat", From f24e5834a2c3f6c5f814a417f858226f0a010ade Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Mon, 4 Dec 2017 14:13:05 +0000 Subject: [PATCH 127/269] arm64: Initialise high_memory global variable earlier The high_memory global variable is used by cma_declare_contiguous(.) before it is defined. We don't notice this as we compute __pa(high_memory - 1), and it looks like we're processing a VA from the direct linear map. This problem becomes apparent when we flip the kernel virtual address space and the linear map is moved to the bottom of the kernel VA space. This patch moves the initialisation of high_memory before it used. Cc: Fixes: f7426b983a6a ("mm: cma: adjust address limit to avoid hitting low/high memory boundary") Signed-off-by: Steve Capper Signed-off-by: Will Deacon --- arch/arm64/mm/init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 5960bef0170d..00e7b900ca41 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -476,6 +476,8 @@ void __init arm64_memblock_init(void) reserve_elfcorehdr(); + high_memory = __va(memblock_end_of_DRAM() - 1) + 1; + dma_contiguous_reserve(arm64_dma_phys_limit); memblock_allow_resize(); @@ -502,7 +504,6 @@ void __init bootmem_init(void) sparse_init(); zone_sizes_init(min, max); - high_memory = __va((max << PAGE_SHIFT) - 1) + 1; memblock_dump_all(); } From 8781bcbc5e69d7da69e84c7044ca0284848d5d01 Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Fri, 1 Dec 2017 17:22:14 +0000 Subject: [PATCH 128/269] arm64: mm: Fix pte_mkclean, pte_mkdirty semantics On systems with hardware dirty bit management, the ltp madvise09 unit test fails due to dirty bit information being lost and pages being incorrectly freed. This was bisected to: arm64: Ignore hardware dirty bit updates in ptep_set_wrprotect() Reverting this commit leads to a separate problem, that the unit test retains pages that should have been dropped due to the function madvise_free_pte_range(.) not cleaning pte's properly. Currently pte_mkclean only clears the software dirty bit, thus the following code sequence can appear: pte = pte_mkclean(pte); if (pte_dirty(pte)) // this condition can return true with HW DBM! This patch also adjusts pte_mkclean to set PTE_RDONLY thus effectively clearing both the SW and HW dirty information. In order for this to function on systems without HW DBM, we need to also adjust pte_mkdirty to remove the read only bit from writable pte's to avoid infinite fault loops. Cc: Fixes: 64c26841b349 ("arm64: Ignore hardware dirty bit updates in ptep_set_wrprotect()") Reported-by: Bhupinder Thakur Tested-by: Bhupinder Thakur Reviewed-by: Catalin Marinas Signed-off-by: Steve Capper Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable.h | 33 +++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 149d05fb9421..3ff03a755c32 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -149,12 +149,20 @@ static inline pte_t pte_mkwrite(pte_t pte) static inline pte_t pte_mkclean(pte_t pte) { - return clear_pte_bit(pte, __pgprot(PTE_DIRTY)); + pte = clear_pte_bit(pte, __pgprot(PTE_DIRTY)); + pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); + + return pte; } static inline pte_t pte_mkdirty(pte_t pte) { - return set_pte_bit(pte, __pgprot(PTE_DIRTY)); + pte = set_pte_bit(pte, __pgprot(PTE_DIRTY)); + + if (pte_write(pte)) + pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY)); + + return pte; } static inline pte_t pte_mkold(pte_t pte) @@ -641,28 +649,23 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* - * ptep_set_wrprotect - mark read-only while preserving the hardware update of - * the Access Flag. + * ptep_set_wrprotect - mark read-only while trasferring potential hardware + * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. */ #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t old_pte, pte; - /* - * ptep_set_wrprotect() is only called on CoW mappings which are - * private (!VM_SHARED) with the pte either read-only (!PTE_WRITE && - * PTE_RDONLY) or writable and software-dirty (PTE_WRITE && - * !PTE_RDONLY && PTE_DIRTY); see is_cow_mapping() and - * protection_map[]. There is no race with the hardware update of the - * dirty state: clearing of PTE_RDONLY when PTE_WRITE (a.k.a. PTE_DBM) - * is set. - */ - VM_WARN_ONCE(pte_write(*ptep) && !pte_dirty(*ptep), - "%s: potential race with hardware DBM", __func__); pte = READ_ONCE(*ptep); do { old_pte = pte; + /* + * If hardware-dirty (PTE_WRITE/DBM bit set and PTE_RDONLY + * clear), set the PTE_DIRTY bit. + */ + if (pte_hw_dirty(pte)) + pte = pte_mkdirty(pte); pte = pte_wrprotect(pte); pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep), pte_val(old_pte), pte_val(pte)); From f1e2400a80ff55eb7c5f4fd9d7eb163fd0de9a2c Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Fri, 8 Dec 2017 12:08:11 +0100 Subject: [PATCH 129/269] net: phy: meson-gxl: detect LPA corruption The purpose of this change is to fix the incorrect detection of the link partner (LP) advertised capabilities which sometimes happens with this PHY (roughly 1 time in a dozen) This issue may cause the link to be negotiated at 10Mbps/Full or 10Mbps/Half when 100MBps/Full is actually possible. In some case, the link is even completely broken and no communication is possible. To detect the corruption, we must look for a magic undocumented bit in the WOL bank (hint given by the SoC vendor kernel) but this is not enough to cover all cases. We also have to look at the LPA ack. If the LP supports Aneg but did not ack our base code when aneg is completed, we assume something went wrong. The detection of a corrupted LPA triggers a restart of the aneg process. This solves the problem but may take up to 6 retries to complete. Fixes: 7334b3e47aee ("net: phy: Add Meson GXL Internal PHY driver") Signed-off-by: Jerome Brunet Signed-off-by: David S. Miller --- drivers/net/phy/meson-gxl.c | 74 ++++++++++++++++++++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/meson-gxl.c b/drivers/net/phy/meson-gxl.c index 1ea69b7585d9..700007dd4be5 100644 --- a/drivers/net/phy/meson-gxl.c +++ b/drivers/net/phy/meson-gxl.c @@ -22,6 +22,7 @@ #include #include #include +#include static int meson_gxl_config_init(struct phy_device *phydev) { @@ -50,6 +51,77 @@ static int meson_gxl_config_init(struct phy_device *phydev) return 0; } +/* This function is provided to cope with the possible failures of this phy + * during aneg process. When aneg fails, the PHY reports that aneg is done + * but the value found in MII_LPA is wrong: + * - Early failures: MII_LPA is just 0x0001. if MII_EXPANSION reports that + * the link partner (LP) supports aneg but the LP never acked our base + * code word, it is likely that we never sent it to begin with. + * - Late failures: MII_LPA is filled with a value which seems to make sense + * but it actually is not what the LP is advertising. It seems that we + * can detect this using a magic bit in the WOL bank (reg 12 - bit 12). + * If this particular bit is not set when aneg is reported being done, + * it means MII_LPA is likely to be wrong. + * + * In both case, forcing a restart of the aneg process solve the problem. + * When this failure happens, the first retry is usually successful but, + * in some cases, it may take up to 6 retries to get a decent result + */ +int meson_gxl_read_status(struct phy_device *phydev) +{ + int ret, wol, lpa, exp; + + if (phydev->autoneg == AUTONEG_ENABLE) { + ret = genphy_aneg_done(phydev); + if (ret < 0) + return ret; + else if (!ret) + goto read_status_continue; + + /* Need to access WOL bank, make sure the access is open */ + ret = phy_write(phydev, 0x14, 0x0000); + if (ret) + return ret; + ret = phy_write(phydev, 0x14, 0x0400); + if (ret) + return ret; + ret = phy_write(phydev, 0x14, 0x0000); + if (ret) + return ret; + ret = phy_write(phydev, 0x14, 0x0400); + if (ret) + return ret; + + /* Request LPI_STATUS WOL register */ + ret = phy_write(phydev, 0x14, 0x8D80); + if (ret) + return ret; + + /* Read LPI_STATUS value */ + wol = phy_read(phydev, 0x15); + if (wol < 0) + return wol; + + lpa = phy_read(phydev, MII_LPA); + if (lpa < 0) + return lpa; + + exp = phy_read(phydev, MII_EXPANSION); + if (exp < 0) + return exp; + + if (!(wol & BIT(12)) || + ((exp & EXPANSION_NWAY) && !(lpa & LPA_LPACK))) { + /* Looks like aneg failed after all */ + phydev_dbg(phydev, "LPA corruption - aneg restart\n"); + return genphy_restart_aneg(phydev); + } + } + +read_status_continue: + return genphy_read_status(phydev); +} + static struct phy_driver meson_gxl_phy[] = { { .phy_id = 0x01814400, @@ -60,7 +132,7 @@ static struct phy_driver meson_gxl_phy[] = { .config_init = meson_gxl_config_init, .config_aneg = genphy_config_aneg, .aneg_done = genphy_aneg_done, - .read_status = genphy_read_status, + .read_status = meson_gxl_read_status, .suspend = genphy_suspend, .resume = genphy_resume, }, From 2aab6b40b03154a263463a5d992ddd7d122a016a Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Fri, 8 Dec 2017 16:35:40 +0100 Subject: [PATCH 130/269] net: sh_eth: do not advertise Gigabit capabilities when not available Not all variants of the sh_eth hardware have Gigabit support. Unfortunately, the current driver doesn't tell the PHY about the limited MAC capabilities. Due to this, if you have a Gigabit capable PHY, the PHY will advertise its Gigabit capability and establish a link at 1Gbit/s, even though the MAC doesn't support it. In order to avoid this, we use the recently introduced phy_set_max_speed() to tell the PHY to not advertise speed higher than 100 MBit/s. Tested on a SH7786 platform, with a Gigabit PHY. Signed-off-by: Thomas Petazzoni Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/sh_eth.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index db72d13cebb9..75323000c364 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -1892,6 +1892,16 @@ static int sh_eth_phy_init(struct net_device *ndev) return PTR_ERR(phydev); } + /* mask with MAC supported features */ + if (mdp->cd->register_type != SH_ETH_REG_GIGABIT) { + int err = phy_set_max_speed(phydev, SPEED_100); + if (err) { + netdev_err(ndev, "failed to limit PHY to 100 Mbit/s\n"); + phy_disconnect(phydev); + return err; + } + } + phy_attached_info(phydev); return 0; From 93c647643b48f0131f02e45da3bd367d80443291 Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Wed, 6 Dec 2017 12:12:27 -0800 Subject: [PATCH 131/269] netlink: Add netns check on taps Currently, a nlmon link inside a child namespace can observe systemwide netlink activity. Filter the traffic so that nlmon can only sniff netlink messages from its own netns. Test case: vpnns -- bash -c "ip link add nlmon0 type nlmon; \ ip link set nlmon0 up; \ tcpdump -i nlmon0 -q -w /tmp/nlmon.pcap -U" & sudo ip xfrm state add src 10.1.1.1 dst 10.1.1.2 proto esp \ spi 0x1 mode transport \ auth sha1 0x6162633132330000000000000000000000000000 \ enc aes 0x00000000000000000000000000000000 grep --binary abc123 /tmp/nlmon.pcap Signed-off-by: Kevin Cernekee Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index b9e0ee4e22f5..79cc1bf36e4a 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -253,6 +253,9 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb, struct sock *sk = skb->sk; int ret = -ENOMEM; + if (!net_eq(dev_net(dev), sock_net(sk))) + return 0; + dev_hold(dev); if (is_vmalloc_addr(skb->head)) From f79ce87fa49da778a1ad54c7d3c6755e13cf8489 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Thu, 30 Nov 2017 22:51:20 +0800 Subject: [PATCH 132/269] x86/build: Don't verify mtools configuration file for isoimage If mtools.conf is not generated before, 'make isoimage' could complain: Kernel: arch/x86/boot/bzImage is ready (#597) GENIMAGE arch/x86/boot/image.iso *** Missing file: arch/x86/boot/mtools.conf arch/x86/boot/Makefile:144: recipe for target 'isoimage' failed mtools.conf is not used for isoimage generation, so do not check it. Signed-off-by: Changbin Du Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 4366d57af1 ("x86/build: Factor out fdimage/isoimage generation commands to standalone script") Link: http://lkml.kernel.org/r/1512053480-8083-1-git-send-email-changbin.du@intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/genimage.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh index 49f4970f693b..c9e8499fbfe7 100644 --- a/arch/x86/boot/genimage.sh +++ b/arch/x86/boot/genimage.sh @@ -44,9 +44,9 @@ FDINITRD=$6 # Make sure the files actually exist verify "$FBZIMAGE" -verify "$MTOOLSRC" genbzdisk() { + verify "$MTOOLSRC" mformat a: syslinux $FIMAGE echo "$KCMDLINE" | mcopy - a:syslinux.cfg @@ -57,6 +57,7 @@ genbzdisk() { } genfdimage144() { + verify "$MTOOLSRC" dd if=/dev/zero of=$FIMAGE bs=1024 count=1440 2> /dev/null mformat v: syslinux $FIMAGE @@ -68,6 +69,7 @@ genfdimage144() { } genfdimage288() { + verify "$MTOOLSRC" dd if=/dev/zero of=$FIMAGE bs=1024 count=2880 2> /dev/null mformat w: syslinux $FIMAGE From 0a373d4fc248cb707821d7dad54ce6d5bcb0cdfe Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 30 Nov 2017 15:35:54 +0300 Subject: [PATCH 133/269] x86/unwinder/guess: Prevent using CONFIG_UNWINDER_GUESS=y with CONFIG_STACKDEPOT=y Stackdepot doesn't work well with CONFIG_UNWINDER_GUESS=y. The 'guess' unwinder generate awfully large and inaccurate stacktraces, thus stackdepot can't deduplicate stacktraces because they all look like unique. Eventually stackdepot reaches its capacity limit: WARNING: CPU: 0 PID: 545 at lib/stackdepot.c:119 depot_save_stack+0x28e/0x550 Call Trace: ? kasan_kmalloc+0x144/0x160 ? depot_save_stack+0x1f5/0x550 ? do_raw_spin_unlock+0xda/0xf0 ? preempt_count_sub+0x13/0xc0 <...90 lines...> ? do_raw_spin_unlock+0xda/0xf0 Add a STACKDEPOT=n dependency to UNWINDER_GUESS to avoid the problem. Reported-by: kernel test robot Reported-by: Fengguang Wu Signed-off-by: Andrey Ryabinin Acked-by: Dmitry Vyukov Acked-by: Josh Poimboeuf Cc: Alexander Potapenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171130123554.4330-1-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 6293a8768a91..672441c008c7 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -400,6 +400,7 @@ config UNWINDER_FRAME_POINTER config UNWINDER_GUESS bool "Guess unwinder" depends on EXPERT + depends on !STACKDEPOT ---help--- This option enables the "guess" unwinder for unwinding kernel stack traces. It scans the stack and reports every kernel text address it From 8f659a03a0ba9289b9aeb9b4470e6fb263d6f483 Mon Sep 17 00:00:00 2001 From: Mohamed Ghannam Date: Sun, 10 Dec 2017 03:50:58 +0000 Subject: [PATCH 134/269] net: ipv4: fix for a race condition in raw_sendmsg inet->hdrincl is racy, and could lead to uninitialized stack pointer usage, so its value should be read only once. Fixes: c008ba5bdc9f ("ipv4: Avoid reading user iov twice after raw_probe_proto_opt") Signed-off-by: Mohamed Ghannam Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/raw.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 33b70bfd1122..125c1eab3eaa 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -513,11 +513,16 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int err; struct ip_options_data opt_copy; struct raw_frag_vec rfv; + int hdrincl; err = -EMSGSIZE; if (len > 0xFFFF) goto out; + /* hdrincl should be READ_ONCE(inet->hdrincl) + * but READ_ONCE() doesn't work with bit fields + */ + hdrincl = inet->hdrincl; /* * Check the flags. */ @@ -593,7 +598,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* Linux does not mangle headers on raw sockets, * so that IP options + IP_HDRINCL is non-sense. */ - if (inet->hdrincl) + if (hdrincl) goto done; if (ipc.opt->opt.srr) { if (!daddr) @@ -615,12 +620,12 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, - inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, + hdrincl ? IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk) | - (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), + (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0, sk->sk_uid); - if (!inet->hdrincl) { + if (!hdrincl) { rfv.msg = msg; rfv.hlen = 0; @@ -645,7 +650,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto do_confirm; back_from_confirm: - if (inet->hdrincl) + if (hdrincl) err = raw_send_hdrinc(sk, &fl4, msg, len, &rt, msg->msg_flags, &ipc.sockc); From 2342b8d95bcae5946e1b9b8d58645f37500ef2e7 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 10 Dec 2017 15:40:51 +0800 Subject: [PATCH 135/269] sctp: make sure stream nums can match optlen in sctp_setsockopt_reset_streams Now in sctp_setsockopt_reset_streams, it only does the check optlen < sizeof(*params) for optlen. But it's not enough, as params->srs_number_streams should also match optlen. If the streams in params->srs_stream_list are less than stream nums in params->srs_number_streams, later when dereferencing the stream list, it could cause a slab-out-of-bounds crash, as reported by syzbot. This patch is to fix it by also checking the stream numbers in sctp_setsockopt_reset_streams to make sure at least it's not greater than the streams in the list. Fixes: 7f9d68ac944e ("sctp: implement sender-side procedures for SSN Reset Request Parameter") Reported-by: Dmitry Vyukov Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/socket.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index eb17a911aa29..3253f724a995 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3891,13 +3891,17 @@ static int sctp_setsockopt_reset_streams(struct sock *sk, struct sctp_association *asoc; int retval = -EINVAL; - if (optlen < sizeof(struct sctp_reset_streams)) + if (optlen < sizeof(*params)) return -EINVAL; params = memdup_user(optval, optlen); if (IS_ERR(params)) return PTR_ERR(params); + if (params->srs_number_streams * sizeof(__u16) > + optlen - sizeof(*params)) + goto out; + asoc = sctp_id2assoc(sk, params->srs_assoc_id); if (!asoc) goto out; From 200809716aed1cac586fcac4c0551a688439be1f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 10 Dec 2017 16:56:00 +0800 Subject: [PATCH 136/269] fou: fix some member types in guehdr guehdr struct is used to build or parse gue packets, which are always in big endian. It's better to define all guehdr members as __beXX types. Also, in validate_gue_flags it's not good to use a __be32 variable for both Standard flags(__be16) and Private flags (__be32), and pass it to other funcions. This patch could fix a bunch of sparse warnings from fou. Fixes: 5024c33ac354 ("gue: Add infrastructure for flags and options") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/gue.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/include/net/gue.h b/include/net/gue.h index 2fdb29ca74c2..fdad41469b65 100644 --- a/include/net/gue.h +++ b/include/net/gue.h @@ -44,10 +44,10 @@ struct guehdr { #else #error "Please fix " #endif - __u8 proto_ctype; - __u16 flags; + __u8 proto_ctype; + __be16 flags; }; - __u32 word; + __be32 word; }; }; @@ -84,11 +84,10 @@ static inline size_t guehdr_priv_flags_len(__be32 flags) * if there is an unknown standard or private flags, or the options length for * the flags exceeds the options length specific in hlen of the GUE header. */ -static inline int validate_gue_flags(struct guehdr *guehdr, - size_t optlen) +static inline int validate_gue_flags(struct guehdr *guehdr, size_t optlen) { + __be16 flags = guehdr->flags; size_t len; - __be32 flags = guehdr->flags; if (flags & ~GUE_FLAGS_ALL) return 1; @@ -101,12 +100,13 @@ static inline int validate_gue_flags(struct guehdr *guehdr, /* Private flags are last four bytes accounted in * guehdr_flags_len */ - flags = *(__be32 *)((void *)&guehdr[1] + len - GUE_LEN_PRIV); + __be32 pflags = *(__be32 *)((void *)&guehdr[1] + + len - GUE_LEN_PRIV); - if (flags & ~GUE_PFLAGS_ALL) + if (pflags & ~GUE_PFLAGS_ALL) return 1; - len += guehdr_priv_flags_len(flags); + len += guehdr_priv_flags_len(pflags); if (len > optlen) return 1; } From 9d5afec6b8bd46d6ed821aa1579634437f58ef1f Mon Sep 17 00:00:00 2001 From: Chandan Rajendra Date: Mon, 11 Dec 2017 15:00:57 -0500 Subject: [PATCH 137/269] ext4: fix crash when a directory's i_size is too small On a ppc64 machine, when mounting a fuzzed ext2 image (generated by fsfuzzer) the following call trace is seen, VFS: brelse: Trying to free free buffer WARNING: CPU: 1 PID: 6913 at /root/repos/linux/fs/buffer.c:1165 .__brelse.part.6+0x24/0x40 .__brelse.part.6+0x20/0x40 (unreliable) .ext4_find_entry+0x384/0x4f0 .ext4_lookup+0x84/0x250 .lookup_slow+0xdc/0x230 .walk_component+0x268/0x400 .path_lookupat+0xec/0x2d0 .filename_lookup+0x9c/0x1d0 .vfs_statx+0x98/0x140 .SyS_newfstatat+0x48/0x80 system_call+0x58/0x6c This happens because the directory that ext4_find_entry() looks up has inode->i_size that is less than the block size of the filesystem. This causes 'nblocks' to have a value of zero. ext4_bread_batch() ends up not reading any of the directory file's blocks. This renders the entries in bh_use[] array to continue to have garbage data. buffer_uptodate() on bh_use[0] can then return a zero value upon which brelse() function is invoked. This commit fixes the bug by returning -ENOENT when the directory file has no associated blocks. Reported-by: Abdul Haleem Signed-off-by: Chandan Rajendra Cc: stable@vger.kernel.org --- fs/ext4/namei.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 798b3ac680db..e750d68fbcb5 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1399,6 +1399,10 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, "falling back\n")); } nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); + if (!nblocks) { + ret = NULL; + goto cleanup_and_exit; + } start = EXT4_I(dir)->i_dir_start_lookup; if (start >= nblocks) start = 0; From c058ecf6e455fac7346d46197a02398ead90851f Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Mon, 27 Nov 2017 13:16:32 -0800 Subject: [PATCH 138/269] iw_cxgb4: only insert drain cqes if wq is flushed Only insert our special drain CQEs to support ib_drain_sq/rq() after the wq is flushed. Otherwise, existing but not yet polled CQEs can be returned out of order to the user application. This can happen when the QP has exited RTS but not yet flushed the QP, which can happen during a normal close (vs abortive close). In addition never count the drain CQEs when determining how many CQEs need to be synthesized during the flush operation. This latter issue should never happen if the QP is properly flushed before inserting the drain CQE, but I wanted to avoid corrupting the CQ state. So we handle it and log a warning once. Fixes: 4fe7c2962e11 ("iw_cxgb4: refactor sq/rq drain logic") Signed-off-by: Steve Wise Cc: stable@vger.kernel.org Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/cq.c | 5 +++++ drivers/infiniband/hw/cxgb4/qp.c | 14 ++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index ea55e95cd2c5..b7bfc536e00f 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -395,6 +395,11 @@ next_cqe: static int cqe_completes_wr(struct t4_cqe *cqe, struct t4_wq *wq) { + if (CQE_OPCODE(cqe) == C4IW_DRAIN_OPCODE) { + WARN_ONCE(1, "Unexpected DRAIN CQE qp id %u!\n", wq->sq.qid); + return 0; + } + if (CQE_OPCODE(cqe) == FW_RI_TERMINATE) return 0; diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 355e288ec969..38bddd02a943 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -868,7 +868,12 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, qhp = to_c4iw_qp(ibqp); spin_lock_irqsave(&qhp->lock, flag); - if (t4_wq_in_error(&qhp->wq)) { + + /* + * If the qp has been flushed, then just insert a special + * drain cqe. + */ + if (qhp->wq.flushed) { spin_unlock_irqrestore(&qhp->lock, flag); complete_sq_drain_wr(qhp, wr); return err; @@ -1011,7 +1016,12 @@ int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, qhp = to_c4iw_qp(ibqp); spin_lock_irqsave(&qhp->lock, flag); - if (t4_wq_in_error(&qhp->wq)) { + + /* + * If the qp has been flushed, then just insert a special + * drain cqe. + */ + if (qhp->wq.flushed) { spin_unlock_irqrestore(&qhp->lock, flag); complete_rq_drain_wr(qhp, wr); return err; From 68a213d325c23d39f109f4c7c824b906a7d209de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 2 Nov 2017 21:25:24 +0100 Subject: [PATCH 139/269] platform/x86: dell-laptop: Fix keyboard max lighting for Dell Latitude E6410 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This machine reports number of keyboard backlight led levels, instead of value of the last led level index. Therefore max_brightness properly needs to be subtracted by 1 to match led max_brightness API. Signed-off-by: Pali Rohár Reported-by: Gabriel M. Elder Link: https://bugzilla.kernel.org/show_bug.cgi?id=196913 Signed-off-by: Darren Hart (VMware) --- drivers/platform/x86/dell-laptop.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/platform/x86/dell-laptop.c b/drivers/platform/x86/dell-laptop.c index bf897b1832b1..cd4725e7e0b5 100644 --- a/drivers/platform/x86/dell-laptop.c +++ b/drivers/platform/x86/dell-laptop.c @@ -37,6 +37,7 @@ struct quirk_entry { u8 touchpad_led; + u8 kbd_led_levels_off_1; int needs_kbd_timeouts; /* @@ -67,6 +68,10 @@ static struct quirk_entry quirk_dell_xps13_9333 = { .kbd_timeouts = { 0, 5, 15, 60, 5 * 60, 15 * 60, -1 }, }; +static struct quirk_entry quirk_dell_latitude_e6410 = { + .kbd_led_levels_off_1 = 1, +}; + static struct platform_driver platform_driver = { .driver = { .name = "dell-laptop", @@ -269,6 +274,15 @@ static const struct dmi_system_id dell_quirks[] __initconst = { }, .driver_data = &quirk_dell_xps13_9333, }, + { + .callback = dmi_matched, + .ident = "Dell Latitude E6410", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6410"), + }, + .driver_data = &quirk_dell_latitude_e6410, + }, { } }; @@ -1149,6 +1163,9 @@ static int kbd_get_info(struct kbd_info *info) units = (buffer->output[2] >> 8) & 0xFF; info->levels = (buffer->output[2] >> 16) & 0xFF; + if (quirks && quirks->kbd_led_levels_off_1 && info->levels) + info->levels--; + if (units & BIT(0)) info->seconds = (buffer->output[3] >> 0) & 0xFF; if (units & BIT(1)) From bff5bf9db1c9453ffd0a78abed3e2d040c092fd9 Mon Sep 17 00:00:00 2001 From: Peter Hutterer Date: Mon, 4 Dec 2017 10:26:17 +1000 Subject: [PATCH 140/269] platform/x86: asus-wireless: send an EV_SYN/SYN_REPORT between state changes Sending the switch state change twice within the same frame is invalid evdev protocol and only works if the client handles keys immediately as well. Processing events immediately is incorrect, it forces a fake order of events that does not exist on the device. Recent versions of libinput changed to only process the device state and SYN_REPORT time, so now the key event is lost. https://bugs.freedesktop.org/show_bug.cgi?id=104041 Signed-off-by: Peter Hutterer Signed-off-by: Darren Hart (VMware) --- drivers/platform/x86/asus-wireless.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/asus-wireless.c b/drivers/platform/x86/asus-wireless.c index f3796164329e..d4aeac3477f5 100644 --- a/drivers/platform/x86/asus-wireless.c +++ b/drivers/platform/x86/asus-wireless.c @@ -118,6 +118,7 @@ static void asus_wireless_notify(struct acpi_device *adev, u32 event) return; } input_report_key(data->idev, KEY_RFKILL, 1); + input_sync(data->idev); input_report_key(data->idev, KEY_RFKILL, 0); input_sync(data->idev); } From 532298b95075144bcccf56d792f3fb3fbef2d5d0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 11 Dec 2017 13:54:27 +0300 Subject: [PATCH 141/269] platform/x86: dell-wmi: check for kmalloc() errors This allocation won't fail in the current kernel because it's small but not checking for kmalloc() failures introduces static checker warnings so let's fix it. Signed-off-by: Dan Carpenter Reviewed-by: Mario Limonciello Signed-off-by: Darren Hart (VMware) --- drivers/platform/x86/dell-wmi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/x86/dell-wmi.c b/drivers/platform/x86/dell-wmi.c index 39d2f4518483..fb25b20df316 100644 --- a/drivers/platform/x86/dell-wmi.c +++ b/drivers/platform/x86/dell-wmi.c @@ -639,6 +639,8 @@ static int dell_wmi_events_set_enabled(bool enable) int ret; buffer = kzalloc(sizeof(struct calling_interface_buffer), GFP_KERNEL); + if (!buffer) + return -ENOMEM; buffer->cmd_class = CLASS_INFO; buffer->cmd_select = SELECT_APP_REGISTRATION; buffer->input[0] = 0x10000; From 621f6401fdeefe96dfe9eab4b167c7c39f552bb0 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Mon, 11 Dec 2017 15:03:33 +0800 Subject: [PATCH 142/269] scsi: libsas: fix length error in sas_smp_handler() The return value of smp_execute_task_sg() is the untransferred residual, but bsg_job_done() requires the length of payload received. This makes SMP passthrough commands from userland by sg ioctl to libsas get a wrong response. The userland tools such as smp_utils failed because of these wrong responses: ~#smp_discover /dev/bsg/expander-2\:13 response too short, len=0 ~#smp_discover /dev/bsg/expander-2\:134 response too short, len=0 Fix this by passing the actual received length to bsg_job_done(). And if smp_execute_task_sg() returns 0, this means received length is exactly the buffer length. [mkp: typo] Fixes: 651a01364994 ("scsi: scsi_transport_sas: switch to bsg-lib for SMP passthrough") Cc: # v4.14+ Signed-off-by: Jason Yan Reported-by: chenqilin Tested-by: chenqilin CC: Christoph Hellwig Signed-off-by: Martin K. Petersen --- drivers/scsi/libsas/sas_expander.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c index 174e5eff6155..c7f21661b3cd 100644 --- a/drivers/scsi/libsas/sas_expander.c +++ b/drivers/scsi/libsas/sas_expander.c @@ -2145,7 +2145,7 @@ void sas_smp_handler(struct bsg_job *job, struct Scsi_Host *shost, struct sas_rphy *rphy) { struct domain_device *dev; - unsigned int reslen = 0; + unsigned int rcvlen = 0; int ret = -EINVAL; /* no rphy means no smp target support (ie aic94xx host) */ @@ -2179,12 +2179,12 @@ void sas_smp_handler(struct bsg_job *job, struct Scsi_Host *shost, ret = smp_execute_task_sg(dev, job->request_payload.sg_list, job->reply_payload.sg_list); - if (ret > 0) { - /* positive number is the untransferred residual */ - reslen = ret; + if (ret >= 0) { + /* bsg_job_done() requires the length received */ + rcvlen = job->reply_payload.payload_len - ret; ret = 0; } out: - bsg_job_done(job, ret, reslen); + bsg_job_done(job, ret, rcvlen); } From 3e5c63565aca5fbd1cc150cb2ca77154fc50fa0c Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 11 Dec 2017 10:09:30 +0100 Subject: [PATCH 143/269] scsi: MAINTAINERS: change FCoE list to linux-scsi fcoe-devel@open-fcoe.org is defunct and all patches are routed via the SCSI tree anyways. So update MAINTAINERS accordingly. Signed-off-by: Johannes Thumshirn Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index cd7e12dc6af4..37841b52a5b6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5352,7 +5352,7 @@ F: drivers/media/tuners/fc2580* FCOE SUBSYSTEM (libfc, libfcoe, fcoe) M: Johannes Thumshirn -L: fcoe-devel@open-fcoe.org +L: linux-scsi@vger.kernel.org W: www.Open-FCoE.org S: Supported F: drivers/scsi/libfc/ From 14e3062fb18532175af4d1c4073597999f7a2248 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 5 Dec 2017 16:57:51 -0800 Subject: [PATCH 144/269] scsi: core: Fix a scsi_show_rq() NULL pointer dereference Avoid that scsi_show_rq() triggers a NULL pointer dereference if called after sd_uninit_command(). Swap the NULL pointer assignment and the mempool_free() call in sd_uninit_command() to make it less likely that scsi_show_rq() triggers a use-after-free. Note: even with these changes scsi_show_rq() can trigger a use-after-free but that's a lesser evil than e.g. suppressing debug information for T10 PI Type 2 commands completely. This patch fixes the following oops: BUG: unable to handle kernel NULL pointer dereference at (null) IP: scsi_format_opcode_name+0x1a/0x1c0 CPU: 1 PID: 1881 Comm: cat Not tainted 4.14.0-rc2.blk_mq_io_hang+ #516 Call Trace: __scsi_format_command+0x27/0xc0 scsi_show_rq+0x5c/0xc0 __blk_mq_debugfs_rq_show+0x116/0x130 blk_mq_debugfs_rq_show+0xe/0x10 seq_read+0xfe/0x3b0 full_proxy_read+0x54/0x90 __vfs_read+0x37/0x160 vfs_read+0x96/0x130 SyS_read+0x55/0xc0 entry_SYSCALL_64_fastpath+0x1a/0xa5 [mkp: added Type 2] Fixes: 0eebd005dd07 ("scsi: Implement blk_mq_ops.show_rq()") Reported-by: Ming Lei Signed-off-by: Bart Van Assche Cc: James E.J. Bottomley Cc: Martin K. Petersen Cc: Ming Lei Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Cc: stable@vger.kernel.org Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_debugfs.c | 6 ++++-- drivers/scsi/sd.c | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/scsi_debugfs.c b/drivers/scsi/scsi_debugfs.c index 01f08c03f2c1..c3765d29fd3f 100644 --- a/drivers/scsi/scsi_debugfs.c +++ b/drivers/scsi/scsi_debugfs.c @@ -8,9 +8,11 @@ void scsi_show_rq(struct seq_file *m, struct request *rq) { struct scsi_cmnd *cmd = container_of(scsi_req(rq), typeof(*cmd), req); int msecs = jiffies_to_msecs(jiffies - cmd->jiffies_at_alloc); - char buf[80]; + const u8 *const cdb = READ_ONCE(cmd->cmnd); + char buf[80] = "(?)"; - __scsi_format_command(buf, sizeof(buf), cmd->cmnd, cmd->cmd_len); + if (cdb) + __scsi_format_command(buf, sizeof(buf), cdb, cmd->cmd_len); seq_printf(m, ", .cmd=%s, .retries=%d, allocated %d.%03d s ago", buf, cmd->retries, msecs / 1000, msecs % 1000); } diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 24fe68522716..a028ab3322a9 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1312,6 +1312,7 @@ static int sd_init_command(struct scsi_cmnd *cmd) static void sd_uninit_command(struct scsi_cmnd *SCpnt) { struct request *rq = SCpnt->request; + u8 *cmnd; if (SCpnt->flags & SCMD_ZONE_WRITE_LOCK) sd_zbc_write_unlock_zone(SCpnt); @@ -1320,9 +1321,10 @@ static void sd_uninit_command(struct scsi_cmnd *SCpnt) __free_page(rq->special_vec.bv_page); if (SCpnt->cmnd != scsi_req(rq)->cmd) { - mempool_free(SCpnt->cmnd, sd_cdb_pool); + cmnd = SCpnt->cmnd; SCpnt->cmnd = NULL; SCpnt->cmd_len = 0; + mempool_free(cmnd, sd_cdb_pool); } } From f87f3a328dbbb3e79dd53e7e889ced9222512649 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 28 Nov 2017 18:42:18 +0000 Subject: [PATCH 145/269] locking/core: Fix deadlock during boot on systems with GENERIC_LOCKBREAK Commit: a8a217c22116 ("locking/core: Remove {read,spin,write}_can_lock()") removed the definition of raw_spin_can_lock(), causing the GENERIC_LOCKBREAK spin_lock() routines to poll the ->break_lock field when waiting on a lock. This has been reported to cause a deadlock during boot on s390, because the ->break_lock field is also set by the waiters, and can potentially remain set indefinitely if no other CPUs come in to take the lock after it has been released. This patch removes the explicit spinning on ->break_lock from the waiters, instead relying on the outer trylock() operation to determine when the lock is available. Reported-by: Sebastian Ott Tested-by: Sebastian Ott Signed-off-by: Will Deacon Acked-by: Peter Zijlstra Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Thomas Gleixner Fixes: a8a217c22116 ("locking/core: Remove {read,spin,write}_can_lock()") Link: http://lkml.kernel.org/r/1511894539-7988-2-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/locking/spinlock.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 1fd1a7543cdd..0ebb253e2199 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -68,8 +68,8 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while ((lock)->break_lock) \ - arch_##op##_relax(&lock->raw_lock); \ + \ + arch_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ } \ @@ -88,8 +88,8 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while ((lock)->break_lock) \ - arch_##op##_relax(&lock->raw_lock); \ + \ + arch_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ return flags; \ From d89c70356acf11b7cf47ca5cfcafae5062a85451 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 28 Nov 2017 18:42:19 +0000 Subject: [PATCH 146/269] locking/core: Remove break_lock field when CONFIG_GENERIC_LOCKBREAK=y When CONFIG_GENERIC_LOCKBEAK=y, locking structures grow an extra int ->break_lock field which is used to implement raw_spin_is_contended() by setting the field to 1 when waiting on a lock and clearing it to zero when holding a lock. However, there are a few problems with this approach: - There is a write-write race between a CPU successfully taking the lock (and subsequently writing break_lock = 0) and a waiter waiting on the lock (and subsequently writing break_lock = 1). This could result in a contended lock being reported as uncontended and vice-versa. - On machines with store buffers, nothing guarantees that the writes to break_lock are visible to other CPUs at any particular time. - READ_ONCE/WRITE_ONCE are not used, so the field is potentially susceptible to harmful compiler optimisations, Consequently, the usefulness of this field is unclear and we'd be better off removing it and allowing architectures to implement raw_spin_is_contended() by providing a definition of arch_spin_is_contended(), as they can when CONFIG_GENERIC_LOCKBREAK=n. Signed-off-by: Will Deacon Acked-by: Peter Zijlstra Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Sebastian Ott Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1511894539-7988-3-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- include/linux/rwlock_types.h | 3 --- include/linux/spinlock.h | 5 ----- include/linux/spinlock_types.h | 3 --- kernel/locking/spinlock.c | 9 +-------- 4 files changed, 1 insertion(+), 19 deletions(-) diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h index cc0072e93e36..857a72ceb794 100644 --- a/include/linux/rwlock_types.h +++ b/include/linux/rwlock_types.h @@ -10,9 +10,6 @@ */ typedef struct { arch_rwlock_t raw_lock; -#ifdef CONFIG_GENERIC_LOCKBREAK - unsigned int break_lock; -#endif #ifdef CONFIG_DEBUG_SPINLOCK unsigned int magic, owner_cpu; void *owner; diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index a39186194cd6..3bf273538840 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -107,16 +107,11 @@ do { \ #define raw_spin_is_locked(lock) arch_spin_is_locked(&(lock)->raw_lock) -#ifdef CONFIG_GENERIC_LOCKBREAK -#define raw_spin_is_contended(lock) ((lock)->break_lock) -#else - #ifdef arch_spin_is_contended #define raw_spin_is_contended(lock) arch_spin_is_contended(&(lock)->raw_lock) #else #define raw_spin_is_contended(lock) (((void)(lock), 0)) #endif /*arch_spin_is_contended*/ -#endif /* * This barrier must provide two things: diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index 73548eb13a5d..24b4e6f2c1a2 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -19,9 +19,6 @@ typedef struct raw_spinlock { arch_spinlock_t raw_lock; -#ifdef CONFIG_GENERIC_LOCKBREAK - unsigned int break_lock; -#endif #ifdef CONFIG_DEBUG_SPINLOCK unsigned int magic, owner_cpu; void *owner; diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 0ebb253e2199..936f3d14dd6b 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -66,12 +66,8 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ break; \ preempt_enable(); \ \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - \ arch_##op##_relax(&lock->raw_lock); \ } \ - (lock)->break_lock = 0; \ } \ \ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ @@ -86,12 +82,9 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ local_irq_restore(flags); \ preempt_enable(); \ \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - \ arch_##op##_relax(&lock->raw_lock); \ } \ - (lock)->break_lock = 0; \ + \ return flags; \ } \ \ From 54eed78c5c831ba696259f7fa69966d699a173b1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 11 Dec 2017 12:30:14 +0100 Subject: [PATCH 147/269] usb: gadget: webcam: fix V4L2 Kconfig dependency Configuring the USB_G_WEBCAM driver as built-in leads to a link error when CONFIG_VIDEO_V4L2 is a loadable module: drivers/usb/gadget/function/f_uvc.o: In function `uvc_function_setup': f_uvc.c:(.text+0xfe): undefined reference to `v4l2_event_queue' drivers/usb/gadget/function/f_uvc.o: In function `uvc_function_ep0_complete': f_uvc.c:(.text+0x188): undefined reference to `v4l2_event_queue' This changes the Kconfig dependency to disallow that configuration, and force it to be a module in that case as well. This is apparently a rather old bug, but very hard to trigger even in thousands of randconfig builds. Signed-off-by: Arnd Bergmann Signed-off-by: Felipe Balbi --- drivers/usb/gadget/legacy/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/legacy/Kconfig b/drivers/usb/gadget/legacy/Kconfig index 9570bbeced4f..487568f2c729 100644 --- a/drivers/usb/gadget/legacy/Kconfig +++ b/drivers/usb/gadget/legacy/Kconfig @@ -487,7 +487,7 @@ endif # or video class gadget drivers), or specific hardware, here. config USB_G_WEBCAM tristate "USB Webcam Gadget" - depends on VIDEO_DEV + depends on VIDEO_V4L2 select USB_LIBCOMPOSITE select VIDEOBUF2_VMALLOC select USB_F_UVC From 9dbe416b656bb015fc49fc17961000ffa418838a Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Tue, 12 Dec 2017 12:44:40 +0200 Subject: [PATCH 148/269] Revert "usb: gadget: allow to enable legacy drivers without USB_ETH" This reverts commit 7a9618a22aadffb55027d665491adf466bced61a. Romain Izard recently reported that commit 7a9618a22aad ended up allowing every legacy gadget driver to statically linked to the kernel, however that doesn't work, since only one legacy gadget can be bound to a controller. Because of that, let's revert the original commit and fix the problem. Reported-by: Romain Izard Signed-off-by: Felipe Balbi --- drivers/usb/gadget/Kconfig | 4 ++-- drivers/usb/gadget/legacy/Kconfig | 10 ---------- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/usb/gadget/Kconfig b/drivers/usb/gadget/Kconfig index 0a19a76645ad..31cce7805eb2 100644 --- a/drivers/usb/gadget/Kconfig +++ b/drivers/usb/gadget/Kconfig @@ -508,8 +508,8 @@ choice controller, and the relevant drivers for each function declared by the device. -endchoice - source "drivers/usb/gadget/legacy/Kconfig" +endchoice + endif # USB_GADGET diff --git a/drivers/usb/gadget/legacy/Kconfig b/drivers/usb/gadget/legacy/Kconfig index 487568f2c729..784bf86dad4f 100644 --- a/drivers/usb/gadget/legacy/Kconfig +++ b/drivers/usb/gadget/legacy/Kconfig @@ -13,14 +13,6 @@ # both kinds of controller can also support "USB On-the-Go" (CONFIG_USB_OTG). # -menuconfig USB_GADGET_LEGACY - bool "Legacy USB Gadget Support" - help - Legacy USB gadgets are USB gadgets that do not use the USB gadget - configfs interface. - -if USB_GADGET_LEGACY - config USB_ZERO tristate "Gadget Zero (DEVELOPMENT)" select USB_LIBCOMPOSITE @@ -498,5 +490,3 @@ config USB_G_WEBCAM Say "y" to link the driver statically, or "m" to build a dynamically linked module called "g_webcam". - -endif From e966eaeeb623f09975ef362c2866fae6f86844f9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 12 Dec 2017 12:31:16 +0100 Subject: [PATCH 149/269] locking/lockdep: Remove the cross-release locking checks This code (CONFIG_LOCKDEP_CROSSRELEASE=y and CONFIG_LOCKDEP_COMPLETIONS=y), while it found a number of old bugs initially, was also causing too many false positives that caused people to disable lockdep - which is arguably a worse overall outcome. If we disable cross-release by default but keep the code upstream then in practice the most likely outcome is that we'll allow the situation to degrade gradually, by allowing entropy to introduce more and more false positives, until it overwhelms maintenance capacity. Another bad side effect was that people were trying to work around the false positives by uglifying/complicating unrelated code. There's a marked difference between annotating locking operations and uglifying good code just due to bad lock debugging code ... This gradual decrease in quality happened to a number of debugging facilities in the kernel, and lockdep is pretty complex already, so we cannot risk this outcome. Either cross-release checking can be done right with no false positives, or it should not be included in the upstream kernel. ( Note that it might make sense to maintain it out of tree and go through the false positives every now and then and see whether new bugs were introduced. ) Cc: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- Documentation/locking/crossrelease.txt | 874 ------------------------- include/linux/completion.h | 45 -- include/linux/lockdep.h | 125 ---- include/linux/sched.h | 11 - kernel/locking/lockdep.c | 658 ++----------------- lib/Kconfig.debug | 33 - 6 files changed, 38 insertions(+), 1708 deletions(-) delete mode 100644 Documentation/locking/crossrelease.txt diff --git a/Documentation/locking/crossrelease.txt b/Documentation/locking/crossrelease.txt deleted file mode 100644 index bdf1423d5f99..000000000000 --- a/Documentation/locking/crossrelease.txt +++ /dev/null @@ -1,874 +0,0 @@ -Crossrelease -============ - -Started by Byungchul Park - -Contents: - - (*) Background - - - What causes deadlock - - How lockdep works - - (*) Limitation - - - Limit lockdep - - Pros from the limitation - - Cons from the limitation - - Relax the limitation - - (*) Crossrelease - - - Introduce crossrelease - - Introduce commit - - (*) Implementation - - - Data structures - - How crossrelease works - - (*) Optimizations - - - Avoid duplication - - Lockless for hot paths - - (*) APPENDIX A: What lockdep does to work aggresively - - (*) APPENDIX B: How to avoid adding false dependencies - - -========== -Background -========== - -What causes deadlock --------------------- - -A deadlock occurs when a context is waiting for an event to happen, -which is impossible because another (or the) context who can trigger the -event is also waiting for another (or the) event to happen, which is -also impossible due to the same reason. - -For example: - - A context going to trigger event C is waiting for event A to happen. - A context going to trigger event A is waiting for event B to happen. - A context going to trigger event B is waiting for event C to happen. - -A deadlock occurs when these three wait operations run at the same time, -because event C cannot be triggered if event A does not happen, which in -turn cannot be triggered if event B does not happen, which in turn -cannot be triggered if event C does not happen. After all, no event can -be triggered since any of them never meets its condition to wake up. - -A dependency might exist between two waiters and a deadlock might happen -due to an incorrect releationship between dependencies. Thus, we must -define what a dependency is first. A dependency exists between them if: - - 1. There are two waiters waiting for each event at a given time. - 2. The only way to wake up each waiter is to trigger its event. - 3. Whether one can be woken up depends on whether the other can. - -Each wait in the example creates its dependency like: - - Event C depends on event A. - Event A depends on event B. - Event B depends on event C. - - NOTE: Precisely speaking, a dependency is one between whether a - waiter for an event can be woken up and whether another waiter for - another event can be woken up. However from now on, we will describe - a dependency as if it's one between an event and another event for - simplicity. - -And they form circular dependencies like: - - -> C -> A -> B - - / \ - \ / - ---------------- - - where 'A -> B' means that event A depends on event B. - -Such circular dependencies lead to a deadlock since no waiter can meet -its condition to wake up as described. - -CONCLUSION - -Circular dependencies cause a deadlock. - - -How lockdep works ------------------ - -Lockdep tries to detect a deadlock by checking dependencies created by -lock operations, acquire and release. Waiting for a lock corresponds to -waiting for an event, and releasing a lock corresponds to triggering an -event in the previous section. - -In short, lockdep does: - - 1. Detect a new dependency. - 2. Add the dependency into a global graph. - 3. Check if that makes dependencies circular. - 4. Report a deadlock or its possibility if so. - -For example, consider a graph built by lockdep that looks like: - - A -> B - - \ - -> E - / - C -> D - - - where A, B,..., E are different lock classes. - -Lockdep will add a dependency into the graph on detection of a new -dependency. For example, it will add a dependency 'E -> C' when a new -dependency between lock E and lock C is detected. Then the graph will be: - - A -> B - - \ - -> E - - / \ - -> C -> D - \ - / / - \ / - ------------------ - - where A, B,..., E are different lock classes. - -This graph contains a subgraph which demonstrates circular dependencies: - - -> E - - / \ - -> C -> D - \ - / / - \ / - ------------------ - - where C, D and E are different lock classes. - -This is the condition under which a deadlock might occur. Lockdep -reports it on detection after adding a new dependency. This is the way -how lockdep works. - -CONCLUSION - -Lockdep detects a deadlock or its possibility by checking if circular -dependencies were created after adding each new dependency. - - -========== -Limitation -========== - -Limit lockdep -------------- - -Limiting lockdep to work on only typical locks e.g. spin locks and -mutexes, which are released within the acquire context, the -implementation becomes simple but its capacity for detection becomes -limited. Let's check pros and cons in next section. - - -Pros from the limitation ------------------------- - -Given the limitation, when acquiring a lock, locks in a held_locks -cannot be released if the context cannot acquire it so has to wait to -acquire it, which means all waiters for the locks in the held_locks are -stuck. It's an exact case to create dependencies between each lock in -the held_locks and the lock to acquire. - -For example: - - CONTEXT X - --------- - acquire A - acquire B /* Add a dependency 'A -> B' */ - release B - release A - - where A and B are different lock classes. - -When acquiring lock A, the held_locks of CONTEXT X is empty thus no -dependency is added. But when acquiring lock B, lockdep detects and adds -a new dependency 'A -> B' between lock A in the held_locks and lock B. -They can be simply added whenever acquiring each lock. - -And data required by lockdep exists in a local structure, held_locks -embedded in task_struct. Forcing to access the data within the context, -lockdep can avoid racy problems without explicit locks while handling -the local data. - -Lastly, lockdep only needs to keep locks currently being held, to build -a dependency graph. However, relaxing the limitation, it needs to keep -even locks already released, because a decision whether they created -dependencies might be long-deferred. - -To sum up, we can expect several advantages from the limitation: - - 1. Lockdep can easily identify a dependency when acquiring a lock. - 2. Races are avoidable while accessing local locks in a held_locks. - 3. Lockdep only needs to keep locks currently being held. - -CONCLUSION - -Given the limitation, the implementation becomes simple and efficient. - - -Cons from the limitation ------------------------- - -Given the limitation, lockdep is applicable only to typical locks. For -example, page locks for page access or completions for synchronization -cannot work with lockdep. - -Can we detect deadlocks below, under the limitation? - -Example 1: - - CONTEXT X CONTEXT Y CONTEXT Z - --------- --------- ---------- - mutex_lock A - lock_page B - lock_page B - mutex_lock A /* DEADLOCK */ - unlock_page B held by X - unlock_page B - mutex_unlock A - mutex_unlock A - - where A and B are different lock classes. - -No, we cannot. - -Example 2: - - CONTEXT X CONTEXT Y - --------- --------- - mutex_lock A - mutex_lock A - wait_for_complete B /* DEADLOCK */ - complete B - mutex_unlock A - mutex_unlock A - - where A is a lock class and B is a completion variable. - -No, we cannot. - -CONCLUSION - -Given the limitation, lockdep cannot detect a deadlock or its -possibility caused by page locks or completions. - - -Relax the limitation --------------------- - -Under the limitation, things to create dependencies are limited to -typical locks. However, synchronization primitives like page locks and -completions, which are allowed to be released in any context, also -create dependencies and can cause a deadlock. So lockdep should track -these locks to do a better job. We have to relax the limitation for -these locks to work with lockdep. - -Detecting dependencies is very important for lockdep to work because -adding a dependency means adding an opportunity to check whether it -causes a deadlock. The more lockdep adds dependencies, the more it -thoroughly works. Thus Lockdep has to do its best to detect and add as -many true dependencies into a graph as possible. - -For example, considering only typical locks, lockdep builds a graph like: - - A -> B - - \ - -> E - / - C -> D - - - where A, B,..., E are different lock classes. - -On the other hand, under the relaxation, additional dependencies might -be created and added. Assuming additional 'FX -> C' and 'E -> GX' are -added thanks to the relaxation, the graph will be: - - A -> B - - \ - -> E -> GX - / - FX -> C -> D - - - where A, B,..., E, FX and GX are different lock classes, and a suffix - 'X' is added on non-typical locks. - -The latter graph gives us more chances to check circular dependencies -than the former. However, it might suffer performance degradation since -relaxing the limitation, with which design and implementation of lockdep -can be efficient, might introduce inefficiency inevitably. So lockdep -should provide two options, strong detection and efficient detection. - -Choosing efficient detection: - - Lockdep works with only locks restricted to be released within the - acquire context. However, lockdep works efficiently. - -Choosing strong detection: - - Lockdep works with all synchronization primitives. However, lockdep - suffers performance degradation. - -CONCLUSION - -Relaxing the limitation, lockdep can add additional dependencies giving -additional opportunities to check circular dependencies. - - -============ -Crossrelease -============ - -Introduce crossrelease ----------------------- - -In order to allow lockdep to handle additional dependencies by what -might be released in any context, namely 'crosslock', we have to be able -to identify those created by crosslocks. The proposed 'crossrelease' -feature provoides a way to do that. - -Crossrelease feature has to do: - - 1. Identify dependencies created by crosslocks. - 2. Add the dependencies into a dependency graph. - -That's all. Once a meaningful dependency is added into graph, then -lockdep would work with the graph as it did. The most important thing -crossrelease feature has to do is to correctly identify and add true -dependencies into the global graph. - -A dependency e.g. 'A -> B' can be identified only in the A's release -context because a decision required to identify the dependency can be -made only in the release context. That is to decide whether A can be -released so that a waiter for A can be woken up. It cannot be made in -other than the A's release context. - -It's no matter for typical locks because each acquire context is same as -its release context, thus lockdep can decide whether a lock can be -released in the acquire context. However for crosslocks, lockdep cannot -make the decision in the acquire context but has to wait until the -release context is identified. - -Therefore, deadlocks by crosslocks cannot be detected just when it -happens, because those cannot be identified until the crosslocks are -released. However, deadlock possibilities can be detected and it's very -worth. See 'APPENDIX A' section to check why. - -CONCLUSION - -Using crossrelease feature, lockdep can work with what might be released -in any context, namely crosslock. - - -Introduce commit ----------------- - -Since crossrelease defers the work adding true dependencies of -crosslocks until they are actually released, crossrelease has to queue -all acquisitions which might create dependencies with the crosslocks. -Then it identifies dependencies using the queued data in batches at a -proper time. We call it 'commit'. - -There are four types of dependencies: - -1. TT type: 'typical lock A -> typical lock B' - - Just when acquiring B, lockdep can see it's in the A's release - context. So the dependency between A and B can be identified - immediately. Commit is unnecessary. - -2. TC type: 'typical lock A -> crosslock BX' - - Just when acquiring BX, lockdep can see it's in the A's release - context. So the dependency between A and BX can be identified - immediately. Commit is unnecessary, too. - -3. CT type: 'crosslock AX -> typical lock B' - - When acquiring B, lockdep cannot identify the dependency because - there's no way to know if it's in the AX's release context. It has - to wait until the decision can be made. Commit is necessary. - -4. CC type: 'crosslock AX -> crosslock BX' - - When acquiring BX, lockdep cannot identify the dependency because - there's no way to know if it's in the AX's release context. It has - to wait until the decision can be made. Commit is necessary. - But, handling CC type is not implemented yet. It's a future work. - -Lockdep can work without commit for typical locks, but commit step is -necessary once crosslocks are involved. Introducing commit, lockdep -performs three steps. What lockdep does in each step is: - -1. Acquisition: For typical locks, lockdep does what it originally did - and queues the lock so that CT type dependencies can be checked using - it at the commit step. For crosslocks, it saves data which will be - used at the commit step and increases a reference count for it. - -2. Commit: No action is reauired for typical locks. For crosslocks, - lockdep adds CT type dependencies using the data saved at the - acquisition step. - -3. Release: No changes are required for typical locks. When a crosslock - is released, it decreases a reference count for it. - -CONCLUSION - -Crossrelease introduces commit step to handle dependencies of crosslocks -in batches at a proper time. - - -============== -Implementation -============== - -Data structures ---------------- - -Crossrelease introduces two main data structures. - -1. hist_lock - - This is an array embedded in task_struct, for keeping lock history so - that dependencies can be added using them at the commit step. Since - it's local data, it can be accessed locklessly in the owner context. - The array is filled at the acquisition step and consumed at the - commit step. And it's managed in circular manner. - -2. cross_lock - - One per lockdep_map exists. This is for keeping data of crosslocks - and used at the commit step. - - -How crossrelease works ----------------------- - -It's the key of how crossrelease works, to defer necessary works to an -appropriate point in time and perform in at once at the commit step. -Let's take a look with examples step by step, starting from how lockdep -works without crossrelease for typical locks. - - acquire A /* Push A onto held_locks */ - acquire B /* Push B onto held_locks and add 'A -> B' */ - acquire C /* Push C onto held_locks and add 'B -> C' */ - release C /* Pop C from held_locks */ - release B /* Pop B from held_locks */ - release A /* Pop A from held_locks */ - - where A, B and C are different lock classes. - - NOTE: This document assumes that readers already understand how - lockdep works without crossrelease thus omits details. But there's - one thing to note. Lockdep pretends to pop a lock from held_locks - when releasing it. But it's subtly different from the original pop - operation because lockdep allows other than the top to be poped. - -In this case, lockdep adds 'the top of held_locks -> the lock to acquire' -dependency every time acquiring a lock. - -After adding 'A -> B', a dependency graph will be: - - A -> B - - where A and B are different lock classes. - -And after adding 'B -> C', the graph will be: - - A -> B -> C - - where A, B and C are different lock classes. - -Let's performs commit step even for typical locks to add dependencies. -Of course, commit step is not necessary for them, however, it would work -well because this is a more general way. - - acquire A - /* - * Queue A into hist_locks - * - * In hist_locks: A - * In graph: Empty - */ - - acquire B - /* - * Queue B into hist_locks - * - * In hist_locks: A, B - * In graph: Empty - */ - - acquire C - /* - * Queue C into hist_locks - * - * In hist_locks: A, B, C - * In graph: Empty - */ - - commit C - /* - * Add 'C -> ?' - * Answer the following to decide '?' - * What has been queued since acquire C: Nothing - * - * In hist_locks: A, B, C - * In graph: Empty - */ - - release C - - commit B - /* - * Add 'B -> ?' - * Answer the following to decide '?' - * What has been queued since acquire B: C - * - * In hist_locks: A, B, C - * In graph: 'B -> C' - */ - - release B - - commit A - /* - * Add 'A -> ?' - * Answer the following to decide '?' - * What has been queued since acquire A: B, C - * - * In hist_locks: A, B, C - * In graph: 'B -> C', 'A -> B', 'A -> C' - */ - - release A - - where A, B and C are different lock classes. - -In this case, dependencies are added at the commit step as described. - -After commits for A, B and C, the graph will be: - - A -> B -> C - - where A, B and C are different lock classes. - - NOTE: A dependency 'A -> C' is optimized out. - -We can see the former graph built without commit step is same as the -latter graph built using commit steps. Of course the former way leads to -earlier finish for building the graph, which means we can detect a -deadlock or its possibility sooner. So the former way would be prefered -when possible. But we cannot avoid using the latter way for crosslocks. - -Let's look at how commit steps work for crosslocks. In this case, the -commit step is performed only on crosslock AX as real. And it assumes -that the AX release context is different from the AX acquire context. - - BX RELEASE CONTEXT BX ACQUIRE CONTEXT - ------------------ ------------------ - acquire A - /* - * Push A onto held_locks - * Queue A into hist_locks - * - * In held_locks: A - * In hist_locks: A - * In graph: Empty - */ - - acquire BX - /* - * Add 'the top of held_locks -> BX' - * - * In held_locks: A - * In hist_locks: A - * In graph: 'A -> BX' - */ - - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - It must be guaranteed that the following operations are seen after - acquiring BX globally. It can be done by things like barrier. - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - acquire C - /* - * Push C onto held_locks - * Queue C into hist_locks - * - * In held_locks: C - * In hist_locks: C - * In graph: 'A -> BX' - */ - - release C - /* - * Pop C from held_locks - * - * In held_locks: Empty - * In hist_locks: C - * In graph: 'A -> BX' - */ - acquire D - /* - * Push D onto held_locks - * Queue D into hist_locks - * Add 'the top of held_locks -> D' - * - * In held_locks: A, D - * In hist_locks: A, D - * In graph: 'A -> BX', 'A -> D' - */ - acquire E - /* - * Push E onto held_locks - * Queue E into hist_locks - * - * In held_locks: E - * In hist_locks: C, E - * In graph: 'A -> BX', 'A -> D' - */ - - release E - /* - * Pop E from held_locks - * - * In held_locks: Empty - * In hist_locks: D, E - * In graph: 'A -> BX', 'A -> D' - */ - release D - /* - * Pop D from held_locks - * - * In held_locks: A - * In hist_locks: A, D - * In graph: 'A -> BX', 'A -> D' - */ - commit BX - /* - * Add 'BX -> ?' - * What has been queued since acquire BX: C, E - * - * In held_locks: Empty - * In hist_locks: D, E - * In graph: 'A -> BX', 'A -> D', - * 'BX -> C', 'BX -> E' - */ - - release BX - /* - * In held_locks: Empty - * In hist_locks: D, E - * In graph: 'A -> BX', 'A -> D', - * 'BX -> C', 'BX -> E' - */ - release A - /* - * Pop A from held_locks - * - * In held_locks: Empty - * In hist_locks: A, D - * In graph: 'A -> BX', 'A -> D', - * 'BX -> C', 'BX -> E' - */ - - where A, BX, C,..., E are different lock classes, and a suffix 'X' is - added on crosslocks. - -Crossrelease considers all acquisitions after acqiuring BX are -candidates which might create dependencies with BX. True dependencies -will be determined when identifying the release context of BX. Meanwhile, -all typical locks are queued so that they can be used at the commit step. -And then two dependencies 'BX -> C' and 'BX -> E' are added at the -commit step when identifying the release context. - -The final graph will be, with crossrelease: - - -> C - / - -> BX - - / \ - A - -> E - \ - -> D - - where A, BX, C,..., E are different lock classes, and a suffix 'X' is - added on crosslocks. - -However, the final graph will be, without crossrelease: - - A -> D - - where A and D are different lock classes. - -The former graph has three more dependencies, 'A -> BX', 'BX -> C' and -'BX -> E' giving additional opportunities to check if they cause -deadlocks. This way lockdep can detect a deadlock or its possibility -caused by crosslocks. - -CONCLUSION - -We checked how crossrelease works with several examples. - - -============= -Optimizations -============= - -Avoid duplication ------------------ - -Crossrelease feature uses a cache like what lockdep already uses for -dependency chains, but this time it's for caching CT type dependencies. -Once that dependency is cached, the same will never be added again. - - -Lockless for hot paths ----------------------- - -To keep all locks for later use at the commit step, crossrelease adopts -a local array embedded in task_struct, which makes access to the data -lockless by forcing it to happen only within the owner context. It's -like how lockdep handles held_locks. Lockless implmentation is important -since typical locks are very frequently acquired and released. - - -================================================= -APPENDIX A: What lockdep does to work aggresively -================================================= - -A deadlock actually occurs when all wait operations creating circular -dependencies run at the same time. Even though they don't, a potential -deadlock exists if the problematic dependencies exist. Thus it's -meaningful to detect not only an actual deadlock but also its potential -possibility. The latter is rather valuable. When a deadlock occurs -actually, we can identify what happens in the system by some means or -other even without lockdep. However, there's no way to detect possiblity -without lockdep unless the whole code is parsed in head. It's terrible. -Lockdep does the both, and crossrelease only focuses on the latter. - -Whether or not a deadlock actually occurs depends on several factors. -For example, what order contexts are switched in is a factor. Assuming -circular dependencies exist, a deadlock would occur when contexts are -switched so that all wait operations creating the dependencies run -simultaneously. Thus to detect a deadlock possibility even in the case -that it has not occured yet, lockdep should consider all possible -combinations of dependencies, trying to: - -1. Use a global dependency graph. - - Lockdep combines all dependencies into one global graph and uses them, - regardless of which context generates them or what order contexts are - switched in. Aggregated dependencies are only considered so they are - prone to be circular if a problem exists. - -2. Check dependencies between classes instead of instances. - - What actually causes a deadlock are instances of lock. However, - lockdep checks dependencies between classes instead of instances. - This way lockdep can detect a deadlock which has not happened but - might happen in future by others but the same class. - -3. Assume all acquisitions lead to waiting. - - Although locks might be acquired without waiting which is essential - to create dependencies, lockdep assumes all acquisitions lead to - waiting since it might be true some time or another. - -CONCLUSION - -Lockdep detects not only an actual deadlock but also its possibility, -and the latter is more valuable. - - -================================================== -APPENDIX B: How to avoid adding false dependencies -================================================== - -Remind what a dependency is. A dependency exists if: - - 1. There are two waiters waiting for each event at a given time. - 2. The only way to wake up each waiter is to trigger its event. - 3. Whether one can be woken up depends on whether the other can. - -For example: - - acquire A - acquire B /* A dependency 'A -> B' exists */ - release B - release A - - where A and B are different lock classes. - -A depedency 'A -> B' exists since: - - 1. A waiter for A and a waiter for B might exist when acquiring B. - 2. Only way to wake up each is to release what it waits for. - 3. Whether the waiter for A can be woken up depends on whether the - other can. IOW, TASK X cannot release A if it fails to acquire B. - -For another example: - - TASK X TASK Y - ------ ------ - acquire AX - acquire B /* A dependency 'AX -> B' exists */ - release B - release AX held by Y - - where AX and B are different lock classes, and a suffix 'X' is added - on crosslocks. - -Even in this case involving crosslocks, the same rule can be applied. A -depedency 'AX -> B' exists since: - - 1. A waiter for AX and a waiter for B might exist when acquiring B. - 2. Only way to wake up each is to release what it waits for. - 3. Whether the waiter for AX can be woken up depends on whether the - other can. IOW, TASK X cannot release AX if it fails to acquire B. - -Let's take a look at more complicated example: - - TASK X TASK Y - ------ ------ - acquire B - release B - fork Y - acquire AX - acquire C /* A dependency 'AX -> C' exists */ - release C - release AX held by Y - - where AX, B and C are different lock classes, and a suffix 'X' is - added on crosslocks. - -Does a dependency 'AX -> B' exist? Nope. - -Two waiters are essential to create a dependency. However, waiters for -AX and B to create 'AX -> B' cannot exist at the same time in this -example. Thus the dependency 'AX -> B' cannot be created. - -It would be ideal if the full set of true ones can be considered. But -we can ensure nothing but what actually happened. Relying on what -actually happens at runtime, we can anyway add only true ones, though -they might be a subset of true ones. It's similar to how lockdep works -for typical locks. There might be more true dependencies than what -lockdep has detected in runtime. Lockdep has no choice but to rely on -what actually happens. Crossrelease also relies on it. - -CONCLUSION - -Relying on what actually happens, lockdep can avoid adding false -dependencies. diff --git a/include/linux/completion.h b/include/linux/completion.h index 0662a417febe..94a59ba7d422 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -10,9 +10,6 @@ */ #include -#ifdef CONFIG_LOCKDEP_COMPLETIONS -#include -#endif /* * struct completion - structure used to maintain state for a "completion" @@ -29,58 +26,16 @@ struct completion { unsigned int done; wait_queue_head_t wait; -#ifdef CONFIG_LOCKDEP_COMPLETIONS - struct lockdep_map_cross map; -#endif }; -#ifdef CONFIG_LOCKDEP_COMPLETIONS -static inline void complete_acquire(struct completion *x) -{ - lock_acquire_exclusive((struct lockdep_map *)&x->map, 0, 0, NULL, _RET_IP_); -} - -static inline void complete_release(struct completion *x) -{ - lock_release((struct lockdep_map *)&x->map, 0, _RET_IP_); -} - -static inline void complete_release_commit(struct completion *x) -{ - lock_commit_crosslock((struct lockdep_map *)&x->map); -} - -#define init_completion_map(x, m) \ -do { \ - lockdep_init_map_crosslock((struct lockdep_map *)&(x)->map, \ - (m)->name, (m)->key, 0); \ - __init_completion(x); \ -} while (0) - -#define init_completion(x) \ -do { \ - static struct lock_class_key __key; \ - lockdep_init_map_crosslock((struct lockdep_map *)&(x)->map, \ - "(completion)" #x, \ - &__key, 0); \ - __init_completion(x); \ -} while (0) -#else #define init_completion_map(x, m) __init_completion(x) #define init_completion(x) __init_completion(x) static inline void complete_acquire(struct completion *x) {} static inline void complete_release(struct completion *x) {} static inline void complete_release_commit(struct completion *x) {} -#endif -#ifdef CONFIG_LOCKDEP_COMPLETIONS -#define COMPLETION_INITIALIZER(work) \ - { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \ - STATIC_CROSS_LOCKDEP_MAP_INIT("(completion)" #work, &(work)) } -#else #define COMPLETION_INITIALIZER(work) \ { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) } -#endif #define COMPLETION_INITIALIZER_ONSTACK_MAP(work, map) \ (*({ init_completion_map(&(work), &(map)); &(work); })) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index a842551fe044..2e75dc34bff5 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -158,12 +158,6 @@ struct lockdep_map { int cpu; unsigned long ip; #endif -#ifdef CONFIG_LOCKDEP_CROSSRELEASE - /* - * Whether it's a crosslock. - */ - int cross; -#endif }; static inline void lockdep_copy_map(struct lockdep_map *to, @@ -267,95 +261,8 @@ struct held_lock { unsigned int hardirqs_off:1; unsigned int references:12; /* 32 bits */ unsigned int pin_count; -#ifdef CONFIG_LOCKDEP_CROSSRELEASE - /* - * Generation id. - * - * A value of cross_gen_id will be stored when holding this, - * which is globally increased whenever each crosslock is held. - */ - unsigned int gen_id; -#endif }; -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -#define MAX_XHLOCK_TRACE_ENTRIES 5 - -/* - * This is for keeping locks waiting for commit so that true dependencies - * can be added at commit step. - */ -struct hist_lock { - /* - * Id for each entry in the ring buffer. This is used to - * decide whether the ring buffer was overwritten or not. - * - * For example, - * - * |<----------- hist_lock ring buffer size ------->| - * pppppppppppppppppppppiiiiiiiiiiiiiiiiiiiiiiiiiiiii - * wrapped > iiiiiiiiiiiiiiiiiiiiiiiiiii....................... - * - * where 'p' represents an acquisition in process - * context, 'i' represents an acquisition in irq - * context. - * - * In this example, the ring buffer was overwritten by - * acquisitions in irq context, that should be detected on - * rollback or commit. - */ - unsigned int hist_id; - - /* - * Seperate stack_trace data. This will be used at commit step. - */ - struct stack_trace trace; - unsigned long trace_entries[MAX_XHLOCK_TRACE_ENTRIES]; - - /* - * Seperate hlock instance. This will be used at commit step. - * - * TODO: Use a smaller data structure containing only necessary - * data. However, we should make lockdep code able to handle the - * smaller one first. - */ - struct held_lock hlock; -}; - -/* - * To initialize a lock as crosslock, lockdep_init_map_crosslock() should - * be called instead of lockdep_init_map(). - */ -struct cross_lock { - /* - * When more than one acquisition of crosslocks are overlapped, - * we have to perform commit for them based on cross_gen_id of - * the first acquisition, which allows us to add more true - * dependencies. - * - * Moreover, when no acquisition of a crosslock is in progress, - * we should not perform commit because the lock might not exist - * any more, which might cause incorrect memory access. So we - * have to track the number of acquisitions of a crosslock. - */ - int nr_acquire; - - /* - * Seperate hlock instance. This will be used at commit step. - * - * TODO: Use a smaller data structure containing only necessary - * data. However, we should make lockdep code able to handle the - * smaller one first. - */ - struct held_lock hlock; -}; - -struct lockdep_map_cross { - struct lockdep_map map; - struct cross_lock xlock; -}; -#endif - /* * Initialization, self-test and debugging-output methods: */ @@ -560,37 +467,6 @@ enum xhlock_context_t { XHLOCK_CTX_NR, }; -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -extern void lockdep_init_map_crosslock(struct lockdep_map *lock, - const char *name, - struct lock_class_key *key, - int subclass); -extern void lock_commit_crosslock(struct lockdep_map *lock); - -/* - * What we essencially have to initialize is 'nr_acquire'. Other members - * will be initialized in add_xlock(). - */ -#define STATIC_CROSS_LOCK_INIT() \ - { .nr_acquire = 0,} - -#define STATIC_CROSS_LOCKDEP_MAP_INIT(_name, _key) \ - { .map.name = (_name), .map.key = (void *)(_key), \ - .map.cross = 1, .xlock = STATIC_CROSS_LOCK_INIT(), } - -/* - * To initialize a lockdep_map statically use this macro. - * Note that _name must not be NULL. - */ -#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ - { .name = (_name), .key = (void *)(_key), .cross = 0, } - -extern void crossrelease_hist_start(enum xhlock_context_t c); -extern void crossrelease_hist_end(enum xhlock_context_t c); -extern void lockdep_invariant_state(bool force); -extern void lockdep_init_task(struct task_struct *task); -extern void lockdep_free_task(struct task_struct *task); -#else /* !CROSSRELEASE */ #define lockdep_init_map_crosslock(m, n, k, s) do {} while (0) /* * To initialize a lockdep_map statically use this macro. @@ -604,7 +480,6 @@ static inline void crossrelease_hist_end(enum xhlock_context_t c) {} static inline void lockdep_invariant_state(bool force) {} static inline void lockdep_init_task(struct task_struct *task) {} static inline void lockdep_free_task(struct task_struct *task) {} -#endif /* CROSSRELEASE */ #ifdef CONFIG_LOCK_STAT diff --git a/include/linux/sched.h b/include/linux/sched.h index 21991d668d35..9ce6c3001e9f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -849,17 +849,6 @@ struct task_struct { struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -#define MAX_XHLOCKS_NR 64UL - struct hist_lock *xhlocks; /* Crossrelease history locks */ - unsigned int xhlock_idx; - /* For restoring at history boundaries */ - unsigned int xhlock_idx_hist[XHLOCK_CTX_NR]; - unsigned int hist_id; - /* For overwrite check at each context exit */ - unsigned int hist_id_save[XHLOCK_CTX_NR]; -#endif - #ifdef CONFIG_UBSAN unsigned int in_ubsan; #endif diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 670d8d7d8087..5fa1324a4f29 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -57,10 +57,6 @@ #define CREATE_TRACE_POINTS #include -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -#include -#endif - #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; module_param(prove_locking, int, 0644); @@ -75,19 +71,6 @@ module_param(lock_stat, int, 0644); #define lock_stat 0 #endif -#ifdef CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK -static int crossrelease_fullstack = 1; -#else -static int crossrelease_fullstack; -#endif -static int __init allow_crossrelease_fullstack(char *str) -{ - crossrelease_fullstack = 1; - return 0; -} - -early_param("crossrelease_fullstack", allow_crossrelease_fullstack); - /* * lockdep_lock: protects the lockdep graph, the hashes and the * class/list/hash allocators. @@ -740,18 +723,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL); } -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -static void cross_init(struct lockdep_map *lock, int cross); -static int cross_lock(struct lockdep_map *lock); -static int lock_acquire_crosslock(struct held_lock *hlock); -static int lock_release_crosslock(struct lockdep_map *lock); -#else -static inline void cross_init(struct lockdep_map *lock, int cross) {} -static inline int cross_lock(struct lockdep_map *lock) { return 0; } -static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; } -static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; } -#endif - /* * Register a lock's class in the hash-table, if the class is not present * yet. Otherwise we look it up. We cache the result in the lock object @@ -1151,41 +1122,22 @@ print_circular_lock_scenario(struct held_lock *src, printk(KERN_CONT "\n\n"); } - if (cross_lock(tgt->instance)) { - printk(" Possible unsafe locking scenario by crosslock:\n\n"); - printk(" CPU0 CPU1\n"); - printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(parent); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(source); - printk(KERN_CONT ");\n"); - printk(" unlock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk("\n *** DEADLOCK ***\n\n"); - } else { - printk(" Possible unsafe locking scenario:\n\n"); - printk(" CPU0 CPU1\n"); - printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(parent); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(source); - printk(KERN_CONT ");\n"); - printk("\n *** DEADLOCK ***\n\n"); - } + printk(" Possible unsafe locking scenario:\n\n"); + printk(" CPU0 CPU1\n"); + printk(" ---- ----\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(parent); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(source); + printk(KERN_CONT ");\n"); + printk("\n *** DEADLOCK ***\n\n"); } /* @@ -1211,10 +1163,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, curr->comm, task_pid_nr(curr)); print_lock(check_src); - if (cross_lock(check_tgt->instance)) - pr_warn("\nbut now in release context of a crosslock acquired at the following:\n"); - else - pr_warn("\nbut task is already holding lock:\n"); + pr_warn("\nbut task is already holding lock:\n"); print_lock(check_tgt); pr_warn("\nwhich lock already depends on the new lock.\n\n"); @@ -1244,9 +1193,7 @@ static noinline int print_circular_bug(struct lock_list *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - if (cross_lock(check_tgt->instance)) - this->trace = *trace; - else if (!save_trace(&this->trace)) + if (!save_trace(&this->trace)) return 0; depth = get_lock_depth(target); @@ -1850,9 +1797,6 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, if (nest) return 2; - if (cross_lock(prev->instance)) - continue; - return print_deadlock_bug(curr, prev, next); } return 1; @@ -2018,31 +1962,26 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) for (;;) { int distance = curr->lockdep_depth - depth + 1; hlock = curr->held_locks + depth - 1; - /* - * Only non-crosslock entries get new dependencies added. - * Crosslock entries will be added by commit later: - */ - if (!cross_lock(hlock->instance)) { - /* - * Only non-recursive-read entries get new dependencies - * added: - */ - if (hlock->read != 2 && hlock->check) { - int ret = check_prev_add(curr, hlock, next, - distance, &trace, save_trace); - if (!ret) - return 0; - /* - * Stop after the first non-trylock entry, - * as non-trylock entries have added their - * own direct dependencies already, so this - * lock is connected to them indirectly: - */ - if (!hlock->trylock) - break; - } + /* + * Only non-recursive-read entries get new dependencies + * added: + */ + if (hlock->read != 2 && hlock->check) { + int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace); + if (!ret) + return 0; + + /* + * Stop after the first non-trylock entry, + * as non-trylock entries have added their + * own direct dependencies already, so this + * lock is connected to them indirectly: + */ + if (!hlock->trylock) + break; } + depth--; /* * End of lock-stack? @@ -3292,21 +3231,10 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name, void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { - cross_init(lock, 0); __lockdep_init_map(lock, name, key, subclass); } EXPORT_SYMBOL_GPL(lockdep_init_map); -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name, - struct lock_class_key *key, int subclass) -{ - cross_init(lock, 1); - __lockdep_init_map(lock, name, key, subclass); -} -EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock); -#endif - struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); @@ -3362,7 +3290,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int chain_head = 0; int class_idx; u64 chain_key; - int ret; if (unlikely(!debug_locks)) return 0; @@ -3411,8 +3338,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, class_idx = class - lock_classes + 1; - /* TODO: nest_lock is not implemented for crosslock yet. */ - if (depth && !cross_lock(lock)) { + if (depth) { hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { if (hlock->references) { @@ -3500,14 +3426,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) return 0; - ret = lock_acquire_crosslock(hlock); - /* - * 2 means normal acquire operations are needed. Otherwise, it's - * ok just to return with '0:fail, 1:success'. - */ - if (ret != 2) - return ret; - curr->curr_chain_key = chain_key; curr->lockdep_depth++; check_chain_key(curr); @@ -3745,19 +3663,11 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) struct task_struct *curr = current; struct held_lock *hlock; unsigned int depth; - int ret, i; + int i; if (unlikely(!debug_locks)) return 0; - ret = lock_release_crosslock(lock); - /* - * 2 means normal release operations are needed. Otherwise, it's - * ok just to return with '0:fail, 1:success'. - */ - if (ret != 2) - return ret; - depth = curr->lockdep_depth; /* * So we're all set to release this lock.. wait what lock? We don't @@ -4675,495 +4585,3 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) dump_stack(); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); - -#ifdef CONFIG_LOCKDEP_CROSSRELEASE - -/* - * Crossrelease works by recording a lock history for each thread and - * connecting those historic locks that were taken after the - * wait_for_completion() in the complete() context. - * - * Task-A Task-B - * - * mutex_lock(&A); - * mutex_unlock(&A); - * - * wait_for_completion(&C); - * lock_acquire_crosslock(); - * atomic_inc_return(&cross_gen_id); - * | - * | mutex_lock(&B); - * | mutex_unlock(&B); - * | - * | complete(&C); - * `-- lock_commit_crosslock(); - * - * Which will then add a dependency between B and C. - */ - -#define xhlock(i) (current->xhlocks[(i) % MAX_XHLOCKS_NR]) - -/* - * Whenever a crosslock is held, cross_gen_id will be increased. - */ -static atomic_t cross_gen_id; /* Can be wrapped */ - -/* - * Make an entry of the ring buffer invalid. - */ -static inline void invalidate_xhlock(struct hist_lock *xhlock) -{ - /* - * Normally, xhlock->hlock.instance must be !NULL. - */ - xhlock->hlock.instance = NULL; -} - -/* - * Lock history stacks; we have 2 nested lock history stacks: - * - * HARD(IRQ) - * SOFT(IRQ) - * - * The thing is that once we complete a HARD/SOFT IRQ the future task locks - * should not depend on any of the locks observed while running the IRQ. So - * what we do is rewind the history buffer and erase all our knowledge of that - * temporal event. - */ - -void crossrelease_hist_start(enum xhlock_context_t c) -{ - struct task_struct *cur = current; - - if (!cur->xhlocks) - return; - - cur->xhlock_idx_hist[c] = cur->xhlock_idx; - cur->hist_id_save[c] = cur->hist_id; -} - -void crossrelease_hist_end(enum xhlock_context_t c) -{ - struct task_struct *cur = current; - - if (cur->xhlocks) { - unsigned int idx = cur->xhlock_idx_hist[c]; - struct hist_lock *h = &xhlock(idx); - - cur->xhlock_idx = idx; - - /* Check if the ring was overwritten. */ - if (h->hist_id != cur->hist_id_save[c]) - invalidate_xhlock(h); - } -} - -/* - * lockdep_invariant_state() is used to annotate independence inside a task, to - * make one task look like multiple independent 'tasks'. - * - * Take for instance workqueues; each work is independent of the last. The - * completion of a future work does not depend on the completion of a past work - * (in general). Therefore we must not carry that (lock) dependency across - * works. - * - * This is true for many things; pretty much all kthreads fall into this - * pattern, where they have an invariant state and future completions do not - * depend on past completions. Its just that since they all have the 'same' - * form -- the kthread does the same over and over -- it doesn't typically - * matter. - * - * The same is true for system-calls, once a system call is completed (we've - * returned to userspace) the next system call does not depend on the lock - * history of the previous system call. - * - * They key property for independence, this invariant state, is that it must be - * a point where we hold no locks and have no history. Because if we were to - * hold locks, the restore at _end() would not necessarily recover it's history - * entry. Similarly, independence per-definition means it does not depend on - * prior state. - */ -void lockdep_invariant_state(bool force) -{ - /* - * We call this at an invariant point, no current state, no history. - * Verify the former, enforce the latter. - */ - WARN_ON_ONCE(!force && current->lockdep_depth); - if (current->xhlocks) - invalidate_xhlock(&xhlock(current->xhlock_idx)); -} - -static int cross_lock(struct lockdep_map *lock) -{ - return lock ? lock->cross : 0; -} - -/* - * This is needed to decide the relationship between wrapable variables. - */ -static inline int before(unsigned int a, unsigned int b) -{ - return (int)(a - b) < 0; -} - -static inline struct lock_class *xhlock_class(struct hist_lock *xhlock) -{ - return hlock_class(&xhlock->hlock); -} - -static inline struct lock_class *xlock_class(struct cross_lock *xlock) -{ - return hlock_class(&xlock->hlock); -} - -/* - * Should we check a dependency with previous one? - */ -static inline int depend_before(struct held_lock *hlock) -{ - return hlock->read != 2 && hlock->check && !hlock->trylock; -} - -/* - * Should we check a dependency with next one? - */ -static inline int depend_after(struct held_lock *hlock) -{ - return hlock->read != 2 && hlock->check; -} - -/* - * Check if the xhlock is valid, which would be false if, - * - * 1. Has not used after initializaion yet. - * 2. Got invalidated. - * - * Remind hist_lock is implemented as a ring buffer. - */ -static inline int xhlock_valid(struct hist_lock *xhlock) -{ - /* - * xhlock->hlock.instance must be !NULL. - */ - return !!xhlock->hlock.instance; -} - -/* - * Record a hist_lock entry. - * - * Irq disable is only required. - */ -static void add_xhlock(struct held_lock *hlock) -{ - unsigned int idx = ++current->xhlock_idx; - struct hist_lock *xhlock = &xhlock(idx); - -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * This can be done locklessly because they are all task-local - * state, we must however ensure IRQs are disabled. - */ - WARN_ON_ONCE(!irqs_disabled()); -#endif - - /* Initialize hist_lock's members */ - xhlock->hlock = *hlock; - xhlock->hist_id = ++current->hist_id; - - xhlock->trace.nr_entries = 0; - xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES; - xhlock->trace.entries = xhlock->trace_entries; - - if (crossrelease_fullstack) { - xhlock->trace.skip = 3; - save_stack_trace(&xhlock->trace); - } else { - xhlock->trace.nr_entries = 1; - xhlock->trace.entries[0] = hlock->acquire_ip; - } -} - -static inline int same_context_xhlock(struct hist_lock *xhlock) -{ - return xhlock->hlock.irq_context == task_irq_context(current); -} - -/* - * This should be lockless as far as possible because this would be - * called very frequently. - */ -static void check_add_xhlock(struct held_lock *hlock) -{ - /* - * Record a hist_lock, only in case that acquisitions ahead - * could depend on the held_lock. For example, if the held_lock - * is trylock then acquisitions ahead never depends on that. - * In that case, we don't need to record it. Just return. - */ - if (!current->xhlocks || !depend_before(hlock)) - return; - - add_xhlock(hlock); -} - -/* - * For crosslock. - */ -static int add_xlock(struct held_lock *hlock) -{ - struct cross_lock *xlock; - unsigned int gen_id; - - if (!graph_lock()) - return 0; - - xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock; - - /* - * When acquisitions for a crosslock are overlapped, we use - * nr_acquire to perform commit for them, based on cross_gen_id - * of the first acquisition, which allows to add additional - * dependencies. - * - * Moreover, when no acquisition of a crosslock is in progress, - * we should not perform commit because the lock might not exist - * any more, which might cause incorrect memory access. So we - * have to track the number of acquisitions of a crosslock. - * - * depend_after() is necessary to initialize only the first - * valid xlock so that the xlock can be used on its commit. - */ - if (xlock->nr_acquire++ && depend_after(&xlock->hlock)) - goto unlock; - - gen_id = (unsigned int)atomic_inc_return(&cross_gen_id); - xlock->hlock = *hlock; - xlock->hlock.gen_id = gen_id; -unlock: - graph_unlock(); - return 1; -} - -/* - * Called for both normal and crosslock acquires. Normal locks will be - * pushed on the hist_lock queue. Cross locks will record state and - * stop regular lock_acquire() to avoid being placed on the held_lock - * stack. - * - * Return: 0 - failure; - * 1 - crosslock, done; - * 2 - normal lock, continue to held_lock[] ops. - */ -static int lock_acquire_crosslock(struct held_lock *hlock) -{ - /* - * CONTEXT 1 CONTEXT 2 - * --------- --------- - * lock A (cross) - * X = atomic_inc_return(&cross_gen_id) - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * Y = atomic_read_acquire(&cross_gen_id) - * lock B - * - * atomic_read_acquire() is for ordering between A and B, - * IOW, A happens before B, when CONTEXT 2 see Y >= X. - * - * Pairs with atomic_inc_return() in add_xlock(). - */ - hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id); - - if (cross_lock(hlock->instance)) - return add_xlock(hlock); - - check_add_xhlock(hlock); - return 2; -} - -static int copy_trace(struct stack_trace *trace) -{ - unsigned long *buf = stack_trace + nr_stack_trace_entries; - unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; - unsigned int nr = min(max_nr, trace->nr_entries); - - trace->nr_entries = nr; - memcpy(buf, trace->entries, nr * sizeof(trace->entries[0])); - trace->entries = buf; - nr_stack_trace_entries += nr; - - if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { - if (!debug_locks_off_graph_unlock()) - return 0; - - print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); - dump_stack(); - - return 0; - } - - return 1; -} - -static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock) -{ - unsigned int xid, pid; - u64 chain_key; - - xid = xlock_class(xlock) - lock_classes; - chain_key = iterate_chain_key((u64)0, xid); - pid = xhlock_class(xhlock) - lock_classes; - chain_key = iterate_chain_key(chain_key, pid); - - if (lookup_chain_cache(chain_key)) - return 1; - - if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context, - chain_key)) - return 0; - - if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1, - &xhlock->trace, copy_trace)) - return 0; - - return 1; -} - -static void commit_xhlocks(struct cross_lock *xlock) -{ - unsigned int cur = current->xhlock_idx; - unsigned int prev_hist_id = xhlock(cur).hist_id; - unsigned int i; - - if (!graph_lock()) - return; - - if (xlock->nr_acquire) { - for (i = 0; i < MAX_XHLOCKS_NR; i++) { - struct hist_lock *xhlock = &xhlock(cur - i); - - if (!xhlock_valid(xhlock)) - break; - - if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id)) - break; - - if (!same_context_xhlock(xhlock)) - break; - - /* - * Filter out the cases where the ring buffer was - * overwritten and the current entry has a bigger - * hist_id than the previous one, which is impossible - * otherwise: - */ - if (unlikely(before(prev_hist_id, xhlock->hist_id))) - break; - - prev_hist_id = xhlock->hist_id; - - /* - * commit_xhlock() returns 0 with graph_lock already - * released if fail. - */ - if (!commit_xhlock(xlock, xhlock)) - return; - } - } - - graph_unlock(); -} - -void lock_commit_crosslock(struct lockdep_map *lock) -{ - struct cross_lock *xlock; - unsigned long flags; - - if (unlikely(!debug_locks || current->lockdep_recursion)) - return; - - if (!current->xhlocks) - return; - - /* - * Do commit hist_locks with the cross_lock, only in case that - * the cross_lock could depend on acquisitions after that. - * - * For example, if the cross_lock does not have the 'check' flag - * then we don't need to check dependencies and commit for that. - * Just skip it. In that case, of course, the cross_lock does - * not depend on acquisitions ahead, either. - * - * WARNING: Don't do that in add_xlock() in advance. When an - * acquisition context is different from the commit context, - * invalid(skipped) cross_lock might be accessed. - */ - if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - xlock = &((struct lockdep_map_cross *)lock)->xlock; - commit_xhlocks(xlock); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(lock_commit_crosslock); - -/* - * Return: 0 - failure; - * 1 - crosslock, done; - * 2 - normal lock, continue to held_lock[] ops. - */ -static int lock_release_crosslock(struct lockdep_map *lock) -{ - if (cross_lock(lock)) { - if (!graph_lock()) - return 0; - ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--; - graph_unlock(); - return 1; - } - return 2; -} - -static void cross_init(struct lockdep_map *lock, int cross) -{ - if (cross) - ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0; - - lock->cross = cross; - - /* - * Crossrelease assumes that the ring buffer size of xhlocks - * is aligned with power of 2. So force it on build. - */ - BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1)); -} - -void lockdep_init_task(struct task_struct *task) -{ - int i; - - task->xhlock_idx = UINT_MAX; - task->hist_id = 0; - - for (i = 0; i < XHLOCK_CTX_NR; i++) { - task->xhlock_idx_hist[i] = UINT_MAX; - task->hist_id_save[i] = 0; - } - - task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR, - GFP_KERNEL); -} - -void lockdep_free_task(struct task_struct *task) -{ - if (task->xhlocks) { - void *tmp = task->xhlocks; - /* Diable crossrelease for current */ - task->xhlocks = NULL; - kfree(tmp); - } -} -#endif diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 947d3e2ed5c2..9d5b78aad4c5 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1099,8 +1099,6 @@ config PROVE_LOCKING select DEBUG_MUTEXES select DEBUG_RT_MUTEXES if RT_MUTEXES select DEBUG_LOCK_ALLOC - select LOCKDEP_CROSSRELEASE - select LOCKDEP_COMPLETIONS select TRACE_IRQFLAGS default n help @@ -1170,37 +1168,6 @@ config LOCK_STAT CONFIG_LOCK_STAT defines "contended" and "acquired" lock events. (CONFIG_LOCKDEP defines "acquire" and "release" events.) -config LOCKDEP_CROSSRELEASE - bool - help - This makes lockdep work for crosslock which is a lock allowed to - be released in a different context from the acquisition context. - Normally a lock must be released in the context acquiring the lock. - However, relexing this constraint helps synchronization primitives - such as page locks or completions can use the lock correctness - detector, lockdep. - -config LOCKDEP_COMPLETIONS - bool - help - A deadlock caused by wait_for_completion() and complete() can be - detected by lockdep using crossrelease feature. - -config BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK - bool "Enable the boot parameter, crossrelease_fullstack" - depends on LOCKDEP_CROSSRELEASE - default n - help - The lockdep "cross-release" feature needs to record stack traces - (of calling functions) for all acquisitions, for eventual later - use during analysis. By default only a single caller is recorded, - because the unwind operation can be very expensive with deeper - stack chains. - - However a boot parameter, crossrelease_fullstack, was - introduced since sometimes deeper traces are required for full - analysis. This option turns on the boot parameter. - config DEBUG_LOCKDEP bool "Lock dependency engine debugging" depends on DEBUG_KERNEL && LOCKDEP From 86c9e8126e9fbcbf06c36e285168b880369a537c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 12 Dec 2017 10:48:54 +0000 Subject: [PATCH 150/269] arm64: mm: Fix false positives in set_pte_at access/dirty race detection Jiankang reports that our race detection in set_pte_at is firing when copying the page tables in dup_mmap as a result of a fork(). In this situation, the page table isn't actually live and so there is no way that we can race with a concurrent update from the hardware page table walker. This patch reworks the race detection so that we require either the mm to match the current active_mm (i.e. currently installed in our TTBR0) or the mm_users count to be greater than 1, implying that the page table could be live in another CPU. The mm_users check might still be racy, but we'll avoid false positives and it's not realistic to validate that all the necessary locks are held as part of this assertion. Cc: Yisheng Xie Reported-by: Jiankang Chen Tested-by: Jiankang Chen Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 3ff03a755c32..bdcc7f1c9d06 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -42,6 +42,8 @@ #include #include #include +#include +#include extern void __pte_error(const char *file, int line, unsigned long val); extern void __pmd_error(const char *file, int line, unsigned long val); @@ -215,9 +217,6 @@ static inline void set_pte(pte_t *ptep, pte_t pte) } } -struct mm_struct; -struct vm_area_struct; - extern void __sync_icache_dcache(pte_t pteval, unsigned long addr); /* @@ -246,7 +245,8 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, * hardware updates of the pte (ptep_set_access_flags safely changes * valid ptes without going through an invalid entry). */ - if (pte_valid(*ptep) && pte_valid(pte)) { + if (IS_ENABLED(CONFIG_DEBUG_VM) && pte_valid(*ptep) && pte_valid(pte) && + (mm == current->active_mm || atomic_read(&mm->mm_users) > 1)) { VM_WARN_ONCE(!pte_young(pte), "%s: racy access flag clearing: 0x%016llx -> 0x%016llx", __func__, pte_val(*ptep), pte_val(pte)); From c622cc013cece073722592cff1ac6643a33b1622 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Mon, 11 Dec 2017 16:42:31 -0600 Subject: [PATCH 151/269] arm64: Define cputype macros for Falkor CPU Add cputype definition macros for Qualcomm Datacenter Technologies Falkor CPU in cputype.h. It's unfortunate that the first revision of the Falkor CPU used the wrong part number 0x800, got fixed in v2 chip with part number 0xC00, and would be used the same value for future revisions. Signed-off-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/include/asm/cputype.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 235e77d98261..cbf08d7cbf30 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -91,6 +91,7 @@ #define BRCM_CPU_PART_VULCAN 0x516 #define QCOM_CPU_PART_FALKOR_V1 0x800 +#define QCOM_CPU_PART_FALKOR 0xC00 #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) @@ -99,6 +100,7 @@ #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX) #define MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR_V1) +#define MIDR_QCOM_FALKOR MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR) #ifndef __ASSEMBLY__ From 932b50c7c1c65e6f23002e075b97ee083c4a9e71 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Mon, 11 Dec 2017 16:42:32 -0600 Subject: [PATCH 152/269] arm64: Add software workaround for Falkor erratum 1041 The ARM architecture defines the memory locations that are permitted to be accessed as the result of a speculative instruction fetch from an exception level for which all stages of translation are disabled. Specifically, the core is permitted to speculatively fetch from the 4KB region containing the current program counter 4K and next 4K. When translation is changed from enabled to disabled for the running exception level (SCTLR_ELn[M] changed from a value of 1 to 0), the Falkor core may errantly speculatively access memory locations outside of the 4KB region permitted by the architecture. The errant memory access may lead to one of the following unexpected behaviors. 1) A System Error Interrupt (SEI) being raised by the Falkor core due to the errant memory access attempting to access a region of memory that is protected by a slave-side memory protection unit. 2) Unpredictable device behavior due to a speculative read from device memory. This behavior may only occur if the instruction cache is disabled prior to or coincident with translation being changed from enabled to disabled. The conditions leading to this erratum will not occur when either of the following occur: 1) A higher exception level disables translation of a lower exception level (e.g. EL2 changing SCTLR_EL1[M] from a value of 1 to 0). 2) An exception level disabling its stage-1 translation if its stage-2 translation is enabled (e.g. EL1 changing SCTLR_EL1[M] from a value of 1 to 0 when HCR_EL2[VM] has a value of 1). To avoid the errant behavior, software must execute an ISB immediately prior to executing the MSR that will change SCTLR_ELn[M] from 1 to 0. Signed-off-by: Shanker Donthineni Signed-off-by: Will Deacon --- Documentation/arm64/silicon-errata.txt | 1 + arch/arm64/Kconfig | 12 +++++++++++- arch/arm64/include/asm/assembler.h | 10 ++++++++++ arch/arm64/kernel/cpu-reset.S | 1 + arch/arm64/kernel/efi-entry.S | 2 ++ arch/arm64/kernel/head.S | 1 + arch/arm64/kernel/relocate_kernel.S | 1 + arch/arm64/kvm/hyp-init.S | 1 + 8 files changed, 28 insertions(+), 1 deletion(-) diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt index 304bf22bb83c..fc1c884fea10 100644 --- a/Documentation/arm64/silicon-errata.txt +++ b/Documentation/arm64/silicon-errata.txt @@ -75,3 +75,4 @@ stable kernels. | Qualcomm Tech. | Falkor v1 | E1003 | QCOM_FALKOR_ERRATUM_1003 | | Qualcomm Tech. | Falkor v1 | E1009 | QCOM_FALKOR_ERRATUM_1009 | | Qualcomm Tech. | QDF2400 ITS | E0065 | QCOM_QDF2400_ERRATUM_0065 | +| Qualcomm Tech. | Falkor v{1,2} | E1041 | QCOM_FALKOR_ERRATUM_1041 | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a93339f5178f..c9a7e9e1414f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -557,7 +557,6 @@ config QCOM_QDF2400_ERRATUM_0065 If unsure, say Y. - config SOCIONEXT_SYNQUACER_PREITS bool "Socionext Synquacer: Workaround for GICv3 pre-ITS" default y @@ -576,6 +575,17 @@ config HISILICON_ERRATUM_161600802 a 128kB offset to be applied to the target address in this commands. If unsure, say Y. + +config QCOM_FALKOR_ERRATUM_E1041 + bool "Falkor E1041: Speculative instruction fetches might cause errant memory access" + default y + help + Falkor CPU may speculatively fetch instructions from an improper + memory location when MMU translation is changed from SCTLR_ELn[M]=1 + to SCTLR_ELn[M]=0. Prefix an ISB instruction to fix the problem. + + If unsure, say Y. + endmenu diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index aef72d886677..8b168280976f 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -512,4 +512,14 @@ alternative_else_nop_endif #endif .endm +/** + * Errata workaround prior to disable MMU. Insert an ISB immediately prior + * to executing the MSR that will change SCTLR_ELn[M] from a value of 1 to 0. + */ + .macro pre_disable_mmu_workaround +#ifdef CONFIG_QCOM_FALKOR_ERRATUM_E1041 + isb +#endif + .endm + #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/kernel/cpu-reset.S b/arch/arm64/kernel/cpu-reset.S index 65f42d257414..2a752cb2a0f3 100644 --- a/arch/arm64/kernel/cpu-reset.S +++ b/arch/arm64/kernel/cpu-reset.S @@ -37,6 +37,7 @@ ENTRY(__cpu_soft_restart) mrs x12, sctlr_el1 ldr x13, =SCTLR_ELx_FLAGS bic x12, x12, x13 + pre_disable_mmu_workaround msr sctlr_el1, x12 isb diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S index 4e6ad355bd05..6b9736c3fb56 100644 --- a/arch/arm64/kernel/efi-entry.S +++ b/arch/arm64/kernel/efi-entry.S @@ -96,6 +96,7 @@ ENTRY(entry) mrs x0, sctlr_el2 bic x0, x0, #1 << 0 // clear SCTLR.M bic x0, x0, #1 << 2 // clear SCTLR.C + pre_disable_mmu_workaround msr sctlr_el2, x0 isb b 2f @@ -103,6 +104,7 @@ ENTRY(entry) mrs x0, sctlr_el1 bic x0, x0, #1 << 0 // clear SCTLR.M bic x0, x0, #1 << 2 // clear SCTLR.C + pre_disable_mmu_workaround msr sctlr_el1, x0 isb 2: diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 67e86a0f57ac..e3cb9fbf96b6 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -750,6 +750,7 @@ __primary_switch: * to take into account by discarding the current kernel mapping and * creating a new one. */ + pre_disable_mmu_workaround msr sctlr_el1, x20 // disable the MMU isb bl __create_page_tables // recreate kernel mapping diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S index ce704a4aeadd..f407e422a720 100644 --- a/arch/arm64/kernel/relocate_kernel.S +++ b/arch/arm64/kernel/relocate_kernel.S @@ -45,6 +45,7 @@ ENTRY(arm64_relocate_new_kernel) mrs x0, sctlr_el2 ldr x1, =SCTLR_ELx_FLAGS bic x0, x0, x1 + pre_disable_mmu_workaround msr sctlr_el2, x0 isb 1: diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index 3f9615582377..870828c364c5 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -151,6 +151,7 @@ reset: mrs x5, sctlr_el2 ldr x6, =SCTLR_ELx_FLAGS bic x5, x5, x6 // Clear SCTL_M and etc + pre_disable_mmu_workaround msr sctlr_el2, x5 isb From 0e17cada2a5b4dc847082e1db0e3f84599ffd436 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 12 Dec 2017 11:53:26 +0000 Subject: [PATCH 153/269] arm64: hw_breakpoint: Use linux/uaccess.h instead of asm/uaccess.h The only inclusion of asm/uaccess.h should be by linux/uaccess.h. All other headers should use the latter. Reported-by: Al Viro Signed-off-by: Will Deacon --- arch/arm64/kernel/hw_breakpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 749f81779420..74bb56f656ef 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -36,7 +37,6 @@ #include #include #include -#include /* Breakpoint currently in use for each BRP. */ static DEFINE_PER_CPU(struct perf_event *, bp_on_reg[ARM_MAX_BRP]); From 32fd87b3bbf5f7a045546401dfe2894dbbf4d8c3 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 11 Dec 2017 22:48:41 +0100 Subject: [PATCH 154/269] USB: core: only clean up what we allocated When cleaning up the configurations, make sure we only free the number of configurations and interfaces that we could have allocated. Reported-by: Andrey Konovalov Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index 55b198ba629b..93b38471754e 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -764,18 +764,21 @@ void usb_destroy_configuration(struct usb_device *dev) return; if (dev->rawdescriptors) { - for (i = 0; i < dev->descriptor.bNumConfigurations; i++) + for (i = 0; i < dev->descriptor.bNumConfigurations && + i < USB_MAXCONFIG; i++) kfree(dev->rawdescriptors[i]); kfree(dev->rawdescriptors); dev->rawdescriptors = NULL; } - for (c = 0; c < dev->descriptor.bNumConfigurations; c++) { + for (c = 0; c < dev->descriptor.bNumConfigurations && + c < USB_MAXCONFIG; c++) { struct usb_host_config *cf = &dev->config[c]; kfree(cf->string); - for (i = 0; i < cf->desc.bNumInterfaces; i++) { + for (i = 0; i < cf->desc.bNumInterfaces && + i < USB_MAXINTERFACES; i++) { if (cf->intf_cache[i]) kref_put(&cf->intf_cache[i]->ref, usb_release_interface_cache); From f971e511cb7d6f1b3730248cf2967d3ccdd8874c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 27 Nov 2017 10:38:21 +0000 Subject: [PATCH 155/269] tools/perf: Convert ACCESS_ONCE() to READ_ONCE() Recently there was a treewide conversion of ACCESS_ONCE() to {READ,WRITE}_ONCE(), but a new use was introduced concurrently by commit: 1695849735752d2a ("perf mmap: Move perf_mmap and methods to separate mmap.[ch] files") Let's convert this over to READ_ONCE() so that we can remove the ACCESS_ONCE() definitions in subsequent patches. Tested-by: Paul E. McKenney Signed-off-by: Mark Rutland Reviewed-by: Paul E. McKenney Cc: Arnaldo Carvalho de Melo Cc: Joe Perches Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: apw@canonical.com Link: http://lkml.kernel.org/r/20171127103824.36526-2-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- tools/perf/util/mmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index efd78b827b05..3a5cb5a6e94a 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -70,7 +70,7 @@ void perf_mmap__read_catchup(struct perf_mmap *md); static inline u64 perf_mmap__read_head(struct perf_mmap *mm) { struct perf_event_mmap_page *pc = mm->base; - u64 head = ACCESS_ONCE(pc->data_head); + u64 head = READ_ONCE(pc->data_head); rmb(); return head; } From 2a22f692bbe0a7933acbd50045479ffc0fdf11f7 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 27 Nov 2017 10:38:22 +0000 Subject: [PATCH 156/269] tools/include: Remove ACCESS_ONCE() There are no longer any usersapce uses of ACCESS_ONCE(), so we can remove the definition from our userspace , which is only used by tools in the kernel directory (i.e. it isn't a uapi header). This patch removes the ACCESS_ONCE() definition, and updates comments which referred to it. At the same time, some inconsistent and redundant whitespace is removed from comments. Tested-by: Paul E. McKenney Signed-off-by: Mark Rutland Cc: Arnaldo Carvalho de Melo Cc: Joe Perches Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: apw@canonical.com Link: http://lkml.kernel.org/r/20171127103824.36526-3-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- tools/include/linux/compiler.h | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h index 07fd03c74a77..04e32f965ad7 100644 --- a/tools/include/linux/compiler.h +++ b/tools/include/linux/compiler.h @@ -84,8 +84,6 @@ #define uninitialized_var(x) x = *(&(x)) -#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) - #include /* @@ -135,20 +133,19 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s /* * Prevent the compiler from merging or refetching reads or writes. The * compiler is also forbidden from reordering successive instances of - * READ_ONCE, WRITE_ONCE and ACCESS_ONCE (see below), but only when the - * compiler is aware of some particular ordering. One way to make the - * compiler aware of ordering is to put the two invocations of READ_ONCE, - * WRITE_ONCE or ACCESS_ONCE() in different C statements. + * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some + * particular ordering. One way to make the compiler aware of ordering is to + * put the two invocations of READ_ONCE or WRITE_ONCE in different C + * statements. * - * In contrast to ACCESS_ONCE these two macros will also work on aggregate - * data types like structs or unions. If the size of the accessed data - * type exceeds the word size of the machine (e.g., 32 bits or 64 bits) - * READ_ONCE() and WRITE_ONCE() will fall back to memcpy and print a - * compile-time warning. + * These two macros will also work on aggregate data types like structs or + * unions. If the size of the accessed data type exceeds the word size of + * the machine (e.g., 32 bits or 64 bits) READ_ONCE() and WRITE_ONCE() will + * fall back to memcpy and print a compile-time warning. * * Their two major use cases are: (1) Mediating communication between * process-level code and irq/NMI handlers, all running on the same CPU, - * and (2) Ensuring that the compiler does not fold, spindle, or otherwise + * and (2) Ensuring that the compiler does not fold, spindle, or otherwise * mutilate accesses that either do not require ordering or that interact * with an explicit memory barrier or atomic instruction that provides the * required ordering. From b899a850431e2dd0943205a63a68573f3e312d0d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 27 Nov 2017 10:38:23 +0000 Subject: [PATCH 157/269] compiler.h: Remove ACCESS_ONCE() There are no longer any kernelspace uses of ACCESS_ONCE(), so we can remove the definition from . This patch removes the ACCESS_ONCE() definition, and updates comments which referred to it. At the same time, some inconsistent and redundant whitespace is removed from comments. Tested-by: Paul E. McKenney Signed-off-by: Mark Rutland Cc: Arnaldo Carvalho de Melo Cc: Joe Perches Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: apw@canonical.com Link: http://lkml.kernel.org/r/20171127103824.36526-4-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 47 ++++++++++------------------------------ 1 file changed, 11 insertions(+), 36 deletions(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 188ed9f65517..52e611ab9a6c 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -220,21 +220,21 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s /* * Prevent the compiler from merging or refetching reads or writes. The * compiler is also forbidden from reordering successive instances of - * READ_ONCE, WRITE_ONCE and ACCESS_ONCE (see below), but only when the - * compiler is aware of some particular ordering. One way to make the - * compiler aware of ordering is to put the two invocations of READ_ONCE, - * WRITE_ONCE or ACCESS_ONCE() in different C statements. + * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some + * particular ordering. One way to make the compiler aware of ordering is to + * put the two invocations of READ_ONCE or WRITE_ONCE in different C + * statements. * - * In contrast to ACCESS_ONCE these two macros will also work on aggregate - * data types like structs or unions. If the size of the accessed data - * type exceeds the word size of the machine (e.g., 32 bits or 64 bits) - * READ_ONCE() and WRITE_ONCE() will fall back to memcpy(). There's at - * least two memcpy()s: one for the __builtin_memcpy() and then one for - * the macro doing the copy of variable - '__u' allocated on the stack. + * These two macros will also work on aggregate data types like structs or + * unions. If the size of the accessed data type exceeds the word size of + * the machine (e.g., 32 bits or 64 bits) READ_ONCE() and WRITE_ONCE() will + * fall back to memcpy(). There's at least two memcpy()s: one for the + * __builtin_memcpy() and then one for the macro doing the copy of variable + * - '__u' allocated on the stack. * * Their two major use cases are: (1) Mediating communication between * process-level code and irq/NMI handlers, all running on the same CPU, - * and (2) Ensuring that the compiler does not fold, spindle, or otherwise + * and (2) Ensuring that the compiler does not fold, spindle, or otherwise * mutilate accesses that either do not require ordering or that interact * with an explicit memory barrier or atomic instruction that provides the * required ordering. @@ -327,29 +327,4 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s compiletime_assert(__native_word(t), \ "Need native word sized stores/loads for atomicity.") -/* - * Prevent the compiler from merging or refetching accesses. The compiler - * is also forbidden from reordering successive instances of ACCESS_ONCE(), - * but only when the compiler is aware of some particular ordering. One way - * to make the compiler aware of ordering is to put the two invocations of - * ACCESS_ONCE() in different C statements. - * - * ACCESS_ONCE will only work on scalar types. For union types, ACCESS_ONCE - * on a union member will work as long as the size of the member matches the - * size of the union and the size is smaller than word size. - * - * The major use cases of ACCESS_ONCE used to be (1) Mediating communication - * between process-level code and irq/NMI handlers, all running on the same CPU, - * and (2) Ensuring that the compiler does not fold, spindle, or otherwise - * mutilate accesses that either do not require ordering or that interact - * with an explicit memory barrier or atomic instruction that provides the - * required ordering. - * - * If possible use READ_ONCE()/WRITE_ONCE() instead. - */ -#define __ACCESS_ONCE(x) ({ \ - __maybe_unused typeof(x) __var = (__force typeof(x)) 0; \ - (volatile typeof(x) *)&(x); }) -#define ACCESS_ONCE(x) (*__ACCESS_ONCE(x)) - #endif /* __LINUX_COMPILER_H */ From 8cb562b1d56fad42cbee44bdc9bc64cea41a0a8c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 27 Nov 2017 10:38:24 +0000 Subject: [PATCH 158/269] checkpatch: Remove ACCESS_ONCE() warning Now that ACCESS_ONCE() has been excised from the kernel, any uses will result in a build error, and we no longer need to whine about it in checkpatch. This patch removes the newly redundant warning. Tested-by: Paul E. McKenney Signed-off-by: Mark Rutland Acked-by: Joe Perches Cc: Andy Whitcroft Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@redhat.com Link: http://lkml.kernel.org/r/20171127103824.36526-5-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- scripts/checkpatch.pl | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 040aa79e1d9d..31031f10fe56 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6233,28 +6233,6 @@ sub process { } } -# whine about ACCESS_ONCE - if ($^V && $^V ge 5.10.0 && - $line =~ /\bACCESS_ONCE\s*$balanced_parens\s*(=(?!=))?\s*($FuncArg)?/) { - my $par = $1; - my $eq = $2; - my $fun = $3; - $par =~ s/^\(\s*(.*)\s*\)$/$1/; - if (defined($eq)) { - if (WARN("PREFER_WRITE_ONCE", - "Prefer WRITE_ONCE(, ) over ACCESS_ONCE() = \n" . $herecurr) && - $fix) { - $fixed[$fixlinenr] =~ s/\bACCESS_ONCE\s*\(\s*\Q$par\E\s*\)\s*$eq\s*\Q$fun\E/WRITE_ONCE($par, $fun)/; - } - } else { - if (WARN("PREFER_READ_ONCE", - "Prefer READ_ONCE() over ACCESS_ONCE()\n" . $herecurr) && - $fix) { - $fixed[$fixlinenr] =~ s/\bACCESS_ONCE\s*\(\s*\Q$par\E\s*\)/READ_ONCE($par)/; - } - } - } - # check for mutex_trylock_recursive usage if ($line =~ /mutex_trylock_recursive/) { ERROR("LOCKING", From 0f3922a9b99eca76c6578cd84191573378f2c988 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 8 Dec 2017 04:17:28 -0700 Subject: [PATCH 159/269] x86/Xen: don't report ancient LAPIC version Unconditionally reporting a value seen on the P4 or older invokes functionality like io_apic_get_unique_id() on 32-bit builds, resulting in a panic() with sufficiently many CPUs and/or IO-APICs. Doing what that function does would be the hypervisor's responsibility anyway, so makes no sense to be used when running on Xen. Uniformly report a more modern version; this shouldn't matter much as both LAPIC and IO-APIC are being managed entirely / mostly by the hypervisor. Signed-off-by: Jan Beulich Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- arch/x86/xen/apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index b5e48da7fbff..c14048553c18 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -56,7 +56,7 @@ static u32 xen_apic_read(u32 reg) return 0; if (reg == APIC_LVR) - return 0x10; + return 0x14; #ifdef CONFIG_X86_32 if (reg == APIC_LDR) return SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); From c4f9d9cb2c29ff04c6b4bb09b72802d8aedfc7cb Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 12 Dec 2017 03:18:11 -0700 Subject: [PATCH 160/269] xen: XEN_ACPI_PROCESSOR is Dom0-only Add a respective dependency. Signed-off-by: Jan Beulich Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- drivers/xen/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index d8dd54678ab7..e5d0c28372ea 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -269,7 +269,7 @@ config XEN_ACPI_HOTPLUG_CPU config XEN_ACPI_PROCESSOR tristate "Xen ACPI processor" - depends on XEN && X86 && ACPI_PROCESSOR && CPU_FREQ + depends on XEN && XEN_DOM0 && X86 && ACPI_PROCESSOR && CPU_FREQ default m help This ACPI processor uploads Power Management information to the Xen From 30791ac41927ebd3e75486f9504b6d2280463bf0 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Mon, 11 Dec 2017 00:05:46 -0800 Subject: [PATCH 161/269] tcp md5sig: Use skb's saddr when replying to an incoming segment The MD5-key that belongs to a connection is identified by the peer's IP-address. When we are in tcp_v4(6)_reqsk_send_ack(), we are replying to an incoming segment from tcp_check_req() that failed the seq-number checks. Thus, to find the correct key, we need to use the skb's saddr and not the daddr. This bug seems to have been there since quite a while, but probably got unnoticed because the consequences are not catastrophic. We will call tcp_v4_reqsk_send_ack only to send a challenge-ACK back to the peer, thus the connection doesn't really fail. Fixes: 9501f9722922 ("tcp md5sig: Let the caller pass appropriate key for tcp_v{4,6}_do_calc_md5_hash().") Signed-off-by: Christoph Paasch Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 77ea45da0fe9..94e28350f420 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -848,7 +848,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, 0, - tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, + tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr, AF_INET), inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, ip_hdr(skb)->tos); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 1f04ec0e4a7a..7178476b3d2f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -994,7 +994,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, sk->sk_bound_dev_if, - tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), + tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr), 0, 0); } From 0c31f1d7be1b5c4858b1d714dcefa25f41428cab Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 7 Dec 2017 11:15:19 +0100 Subject: [PATCH 162/269] PCI: rcar: Fix use-after-free in probe error path If CONFIG_DEBUG_SLAB=y, and no PCIe card is inserted, the kernel crashes during probe on r8a7791/koelsch: rcar-pcie fe000000.pcie: PCIe link down Unable to handle kernel paging request at virtual address 6b6b6b6b (seeing this message requires earlycon and keep_bootcon). Indeed, pci_free_host_bridge() frees the PCI host bridge, including the embedded rcar_pcie object, so pci_free_resource_list() must not be called afterwards. To fix this, move the call to pci_free_resource_list() up, and update the label name accordingly. Fixes: ddd535f1ea3eb27e ("PCI: rcar: Fix memory leak when no PCIe card is inserted") Signed-off-by: Geert Uytterhoeven Signed-off-by: Bjorn Helgaas Acked-by: Simon Horman Acked-by: Lorenzo Pieralisi --- drivers/pci/host/pcie-rcar.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pci/host/pcie-rcar.c b/drivers/pci/host/pcie-rcar.c index 12796eccb2be..52ab3cb0a0bf 100644 --- a/drivers/pci/host/pcie-rcar.c +++ b/drivers/pci/host/pcie-rcar.c @@ -1128,12 +1128,12 @@ static int rcar_pcie_probe(struct platform_device *pdev) err = rcar_pcie_get_resources(pcie); if (err < 0) { dev_err(dev, "failed to request resources: %d\n", err); - goto err_free_bridge; + goto err_free_resource_list; } err = rcar_pcie_parse_map_dma_ranges(pcie, dev->of_node); if (err) - goto err_free_bridge; + goto err_free_resource_list; pm_runtime_enable(dev); err = pm_runtime_get_sync(dev); @@ -1176,9 +1176,9 @@ err_pm_put: err_pm_disable: pm_runtime_disable(dev); -err_free_bridge: - pci_free_host_bridge(bridge); +err_free_resource_list: pci_free_resource_list(&pcie->resources); + pci_free_host_bridge(bridge); return err; } From 283ca526a9bd75aed7350220d7b1f8027d99c3fd Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 12 Dec 2017 02:25:30 +0100 Subject: [PATCH 163/269] bpf: fix corruption on concurrent perf_event_output calls When tracing and networking programs are both attached in the system and both use event-output helpers that eventually call into perf_event_output(), then we could end up in a situation where the tracing attached program runs in user context while a cls_bpf program is triggered on that same CPU out of softirq context. Since both rely on the same per-cpu perf_sample_data, we could potentially corrupt it. This can only ever happen in a combination of the two types; all tracing programs use a bpf_prog_active counter to bail out in case a program is already running on that CPU out of a different context. XDP and cls_bpf programs by themselves don't have this issue as they run in the same context only. Therefore, split both perf_sample_data so they cannot be accessed from each other. Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data") Reported-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Tested-by: Song Liu Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0ce99c379c30..40207c2a4113 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -343,14 +343,13 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { .arg4_type = ARG_CONST_SIZE, }; -static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd); +static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd); static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, - u64 flags, struct perf_raw_record *raw) + u64 flags, struct perf_sample_data *sd) { struct bpf_array *array = container_of(map, struct bpf_array, map); - struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; @@ -373,8 +372,6 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; - perf_sample_data_init(sd, 0, 0); - sd->raw = raw; perf_event_output(event, sd, regs); return 0; } @@ -382,6 +379,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, u64, flags, void *, data, u64, size) { + struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd); struct perf_raw_record raw = { .frag = { .size = size, @@ -392,7 +390,10 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; - return __bpf_perf_event_output(regs, map, flags, &raw); + perf_sample_data_init(sd, 0, 0); + sd->raw = &raw; + + return __bpf_perf_event_output(regs, map, flags, sd); } static const struct bpf_func_proto bpf_perf_event_output_proto = { @@ -407,10 +408,12 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { }; static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); +static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { + struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd); struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); struct perf_raw_frag frag = { .copy = ctx_copy, @@ -428,8 +431,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, }; perf_fetch_caller_regs(regs); + perf_sample_data_init(sd, 0, 0); + sd->raw = &raw; - return __bpf_perf_event_output(regs, map, flags, &raw); + return __bpf_perf_event_output(regs, map, flags, sd); } BPF_CALL_0(bpf_get_current_task) From a23f06f06dbe54696e8d4f156b317e8c9961c345 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 12 Dec 2017 02:25:31 +0100 Subject: [PATCH 164/269] bpf: fix build issues on um due to mising bpf_perf_event.h Since c895f6f703ad ("bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program type") um (uml) won't build on i386 or x86_64: [...] CC init/main.o In file included from ../include/linux/perf_event.h:18:0, from ../include/linux/trace_events.h:10, from ../include/trace/syscall.h:7, from ../include/linux/syscalls.h:82, from ../init/main.c:20: ../include/uapi/linux/bpf_perf_event.h:11:32: fatal error: asm/bpf_perf_event.h: No such file or directory #include [...] Lets add missing bpf_perf_event.h also to um arch. This seems to be the only one still missing. Fixes: c895f6f703ad ("bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program type") Reported-by: Randy Dunlap Suggested-by: Richard Weinberger Signed-off-by: Daniel Borkmann Tested-by: Randy Dunlap Cc: Hendrik Brueckner Cc: Richard Weinberger Acked-by: Alexei Starovoitov Acked-by: Richard Weinberger Signed-off-by: Alexei Starovoitov --- arch/um/include/asm/Kbuild | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 50a32c33d729..73c57f614c9e 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -1,4 +1,5 @@ generic-y += barrier.h +generic-y += bpf_perf_event.h generic-y += bug.h generic-y += clkdev.h generic-y += current.h From 720f228e8d3128b7ab1d39f51fdd8da07a7640c9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 12 Dec 2017 02:25:32 +0100 Subject: [PATCH 165/269] bpf: fix broken BPF selftest build At least on x86_64, the kernel's BPF selftests seemed to have stopped to build due to 618e165b2a8e ("selftests/bpf: sync kernel headers and introduce arch support in Makefile"): [...] In file included from test_verifier.c:29:0: ../../../include/uapi/linux/bpf_perf_event.h:11:32: fatal error: asm/bpf_perf_event.h: No such file or directory #include ^ compilation terminated. [...] While pulling in tools/arch/*/include/uapi/asm/bpf_perf_event.h seems to work fine, there's no automated fall-back logic right now that would do the same out of tools/include/uapi/asm-generic/bpf_perf_event.h. The usual convention today is to add a include/[uapi/]asm/ equivalent that would pull in the correct arch header or generic one as fall-back, all ifdef'ed based on compiler target definition. It's similarly done also in other cases such as tools/include/asm/barrier.h, thus adapt the same here. Fixes: 618e165b2a8e ("selftests/bpf: sync kernel headers and introduce arch support in Makefile") Signed-off-by: Daniel Borkmann Cc: Hendrik Brueckner Cc: Arnaldo Carvalho de Melo Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- tools/include/uapi/asm/bpf_perf_event.h | 7 +++++++ tools/testing/selftests/bpf/Makefile | 13 +------------ 2 files changed, 8 insertions(+), 12 deletions(-) create mode 100644 tools/include/uapi/asm/bpf_perf_event.h diff --git a/tools/include/uapi/asm/bpf_perf_event.h b/tools/include/uapi/asm/bpf_perf_event.h new file mode 100644 index 000000000000..13a58531e6fa --- /dev/null +++ b/tools/include/uapi/asm/bpf_perf_event.h @@ -0,0 +1,7 @@ +#if defined(__aarch64__) +#include "../../arch/arm64/include/uapi/asm/bpf_perf_event.h" +#elif defined(__s390__) +#include "../../arch/s390/include/uapi/asm/bpf_perf_event.h" +#else +#include +#endif diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 21a2d76b67dc..792af7c3b74f 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -1,19 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 -ifeq ($(srctree),) -srctree := $(patsubst %/,%,$(dir $(CURDIR))) -srctree := $(patsubst %/,%,$(dir $(srctree))) -srctree := $(patsubst %/,%,$(dir $(srctree))) -srctree := $(patsubst %/,%,$(dir $(srctree))) -endif -include $(srctree)/tools/scripts/Makefile.arch - -$(call detected_var,SRCARCH) - LIBDIR := ../../../lib BPFDIR := $(LIBDIR)/bpf APIDIR := ../../../include/uapi -ASMDIR:= ../../../arch/$(ARCH)/include/uapi GENDIR := ../../../../include/generated GENHDR := $(GENDIR)/autoconf.h @@ -21,7 +10,7 @@ ifneq ($(wildcard $(GENHDR)),) GENFLAGS := -DHAVE_GENHDR endif -CFLAGS += -Wall -O2 -I$(APIDIR) -I$(ASMDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include +CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include LDLIBS += -lcap -lelf TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ From 9c41e452188339989c2c9ca5fc54f10935207968 Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Thu, 30 Nov 2017 09:43:57 +0100 Subject: [PATCH 166/269] i2c: stm32: Fix copyrights Uniformize STMicroelectronics copyrights headers and add SPDX identifier. Signed-off-by: Benjamin Gaignard Acked-by: Alexandre TORGUE Acked-by: Pierre-Yves MORDRET Acked-by: M'boumba Cedric Madianga Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-stm32.h | 3 ++- drivers/i2c/busses/i2c-stm32f4.c | 3 ++- drivers/i2c/busses/i2c-stm32f7.c | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/i2c/busses/i2c-stm32.h b/drivers/i2c/busses/i2c-stm32.h index dab51761f8c5..d4f9cef251ac 100644 --- a/drivers/i2c/busses/i2c-stm32.h +++ b/drivers/i2c/busses/i2c-stm32.h @@ -1,10 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 /* * i2c-stm32.h * * Copyright (C) M'boumba Cedric Madianga 2017 + * Copyright (C) STMicroelectronics 2017 * Author: M'boumba Cedric Madianga * - * License terms: GNU General Public License (GPL), version 2 */ #ifndef _I2C_STM32_H diff --git a/drivers/i2c/busses/i2c-stm32f4.c b/drivers/i2c/busses/i2c-stm32f4.c index 4ec108496f15..47c8d00de53f 100644 --- a/drivers/i2c/busses/i2c-stm32f4.c +++ b/drivers/i2c/busses/i2c-stm32f4.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Driver for STMicroelectronics STM32 I2C controller * @@ -6,11 +7,11 @@ * http://www.st.com/resource/en/reference_manual/DM00031020.pdf * * Copyright (C) M'boumba Cedric Madianga 2016 + * Copyright (C) STMicroelectronics 2017 * Author: M'boumba Cedric Madianga * * This driver is based on i2c-st.c * - * License terms: GNU General Public License (GPL), version 2 */ #include diff --git a/drivers/i2c/busses/i2c-stm32f7.c b/drivers/i2c/busses/i2c-stm32f7.c index d4a6e9c2e9aa..b445b3bb0bb1 100644 --- a/drivers/i2c/busses/i2c-stm32f7.c +++ b/drivers/i2c/busses/i2c-stm32f7.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Driver for STMicroelectronics STM32F7 I2C controller * @@ -7,11 +8,11 @@ * http://www.st.com/resource/en/reference_manual/dm00124865.pdf * * Copyright (C) M'boumba Cedric Madianga 2017 + * Copyright (C) STMicroelectronics 2017 * Author: M'boumba Cedric Madianga * * This driver is based on i2c-stm32f4.c * - * License terms: GNU General Public License (GPL), version 2 */ #include #include From 45fd4470ba86e9ca2837b666a52cc65dc69f0fa3 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Thu, 7 Dec 2017 12:25:45 +0100 Subject: [PATCH 167/269] i2c: piix4: Fix port number check on release The port number shift is still hard-coded to 1 while it now depends on the hardware. Thankfully 0 is always 0 no matter how you shift it, so this was a bug without consequences. Signed-off-by: Jean Delvare Fixes: 0fe16195f891 ("i2c: piix4: Fix SMBus port selection for AMD Family 17h chips") Reviewed-by: Guenter Roeck Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-piix4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c index 174579d32e5f..462948e2c535 100644 --- a/drivers/i2c/busses/i2c-piix4.c +++ b/drivers/i2c/busses/i2c-piix4.c @@ -983,7 +983,7 @@ static void piix4_adap_remove(struct i2c_adapter *adap) if (adapdata->smba) { i2c_del_adapter(adap); - if (adapdata->port == (0 << 1)) { + if (adapdata->port == (0 << piix4_port_shift_sb800)) { release_region(adapdata->smba, SMBIOSIZE); if (adapdata->sb800_main) release_region(SB800_PIIX4_SMB_IDX, 2); From 9147efcbe0b7cc96b18eb64b1a3f0d4bba81443c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Dec 2017 14:22:39 -0800 Subject: [PATCH 168/269] bpf: add schedule points to map alloc/free While using large percpu maps, htab_map_alloc() can hold cpu for hundreds of ms. This patch adds cond_resched() calls to percpu alloc/free call sites, all running in process context. Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index e469e05c8e83..3905d4bc5b80 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -114,6 +114,7 @@ static void htab_free_elems(struct bpf_htab *htab) pptr = htab_elem_get_ptr(get_htab_elem(htab, i), htab->map.key_size); free_percpu(pptr); + cond_resched(); } free_elems: bpf_map_area_free(htab->elems); @@ -159,6 +160,7 @@ static int prealloc_init(struct bpf_htab *htab) goto free_elems; htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, pptr); + cond_resched(); } skip_percpu_elems: From faa75e147b583417273902552c61cf3250a44308 Mon Sep 17 00:00:00 2001 From: Dongjiu Geng Date: Wed, 13 Dec 2017 18:36:47 +0800 Subject: [PATCH 169/269] arm64: fault: avoid send SIGBUS two times do_sea() calls arm64_notify_die() which will always signal user-space. It also returns whether APEI claimed the external abort as a RAS notification. If it returns failure do_mem_abort() will signal user-space too. do_mem_abort() wants to know if we handled the error, we always call arm64_notify_die() so can always return success. Signed-off-by: Dongjiu Geng Reviewed-by: James Morse Reviewed-by: Xie XiuQi Signed-off-by: Will Deacon --- arch/arm64/mm/fault.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 22168cd0dde7..9b7f89df49db 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -574,7 +574,6 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) { struct siginfo info; const struct fault_info *inf; - int ret = 0; inf = esr_to_fault_info(esr); pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n", @@ -589,7 +588,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) if (interrupts_enabled(regs)) nmi_enter(); - ret = ghes_notify_sea(); + ghes_notify_sea(); if (interrupts_enabled(regs)) nmi_exit(); @@ -604,7 +603,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) info.si_addr = (void __user *)addr; arm64_notify_die("", regs, &info, esr); - return ret; + return 0; } static const struct fault_info fault_info[] = { From 92ccc262e485781ff4c0fb3b7c77a619282df49a Mon Sep 17 00:00:00 2001 From: Mengting Zhang Date: Tue, 12 Dec 2017 18:16:57 +0000 Subject: [PATCH 170/269] tools/lib/lockdep: Add missing declaration of 'pr_cont()' Commit: 681fbec881de ("lockdep: Use consistent printing primitives") has moved lockdep away from using printk() for printing. The commit added usage of pr_cont() which wasn't wrapped in the userspace headers, causing the following warning for the liblockdep build: ../../../kernel/locking/lockdep.c:3544:2: warning: implicit declaration of function 'pr_cont' [-Wimplicit-function-declaration] Adding an empty declaration of 'pr_cont' fixes the problem. Signed-off-by: Mengting Zhang Signed-off-by: Sasha Levin Reviewed-by: Alexander Sverdlin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: a.p.zijlstra@chello.nl Link: http://lkml.kernel.org/r/20171212181644.11913-2-alexander.levin@verizon.com Signed-off-by: Ingo Molnar --- tools/include/linux/lockdep.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/include/linux/lockdep.h b/tools/include/linux/lockdep.h index 940c1b075659..6b0c36a58fcb 100644 --- a/tools/include/linux/lockdep.h +++ b/tools/include/linux/lockdep.h @@ -48,6 +48,7 @@ static inline int debug_locks_off(void) #define printk(...) dprintf(STDOUT_FILENO, __VA_ARGS__) #define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__) #define pr_warn pr_err +#define pr_cont pr_err #define list_del_rcu list_del From cf4df407e0d7cde60a45369c2a3414d18e2d4fdd Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 13 Dec 2017 11:59:39 +0100 Subject: [PATCH 171/269] Revert "USB: core: only clean up what we allocated" This reverts commit 32fd87b3bbf5f7a045546401dfe2894dbbf4d8c3. Alan wrote a better fix for this... Cc: Andrey Konovalov Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index 93b38471754e..55b198ba629b 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -764,21 +764,18 @@ void usb_destroy_configuration(struct usb_device *dev) return; if (dev->rawdescriptors) { - for (i = 0; i < dev->descriptor.bNumConfigurations && - i < USB_MAXCONFIG; i++) + for (i = 0; i < dev->descriptor.bNumConfigurations; i++) kfree(dev->rawdescriptors[i]); kfree(dev->rawdescriptors); dev->rawdescriptors = NULL; } - for (c = 0; c < dev->descriptor.bNumConfigurations && - c < USB_MAXCONFIG; c++) { + for (c = 0; c < dev->descriptor.bNumConfigurations; c++) { struct usb_host_config *cf = &dev->config[c]; kfree(cf->string); - for (i = 0; i < cf->desc.bNumInterfaces && - i < USB_MAXINTERFACES; i++) { + for (i = 0; i < cf->desc.bNumInterfaces; i++) { if (cf->intf_cache[i]) kref_put(&cf->intf_cache[i]->ref, usb_release_interface_cache); From 48a4ff1c7bb5a32d2e396b03132d20d552c0eca7 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 12 Dec 2017 14:25:13 -0500 Subject: [PATCH 172/269] USB: core: prevent malicious bNumInterfaces overflow A malicious USB device with crafted descriptors can cause the kernel to access unallocated memory by setting the bNumInterfaces value too high in a configuration descriptor. Although the value is adjusted during parsing, this adjustment is skipped in one of the error return paths. This patch prevents the problem by setting bNumInterfaces to 0 initially. The existing code already sets it to the proper value after parsing is complete. Signed-off-by: Alan Stern Reported-by: Andrey Konovalov CC: Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index 55b198ba629b..78e92d29f8d9 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -555,6 +555,9 @@ static int usb_parse_configuration(struct usb_device *dev, int cfgidx, unsigned iad_num = 0; memcpy(&config->desc, buffer, USB_DT_CONFIG_SIZE); + nintf = nintf_orig = config->desc.bNumInterfaces; + config->desc.bNumInterfaces = 0; // Adjusted later + if (config->desc.bDescriptorType != USB_DT_CONFIG || config->desc.bLength < USB_DT_CONFIG_SIZE || config->desc.bLength > size) { @@ -568,7 +571,6 @@ static int usb_parse_configuration(struct usb_device *dev, int cfgidx, buffer += config->desc.bLength; size -= config->desc.bLength; - nintf = nintf_orig = config->desc.bNumInterfaces; if (nintf > USB_MAXINTERFACES) { dev_warn(ddev, "config %d has too many interfaces: %d, " "using maximum allowed: %d\n", From 4b4df570b41dbb421f52605357d5d56c872df6d9 Mon Sep 17 00:00:00 2001 From: Keith Packard Date: Wed, 13 Dec 2017 00:44:26 -0800 Subject: [PATCH 173/269] drm: Update edid-derived drm_display_info fields at edid property set [v2] There are a set of values in the drm_display_info structure for each connector which hold information derived from EDID. These are computed in drm_add_display_info. Before this patch, that was only called in drm_add_edid_modes. This meant that they were only set when EDID was present and never reset when EDID was not, as happened when the display was disconnected. One of these fields, non_desktop, is used from drm_mode_connector_update_edid_property, the function responsible for assigning the new edid value to the application-visible property. Various drivers call these two functions (drm_add_edid_modes and drm_mode_connector_update_edid_property) in different orders. This means that even when EDID is present, the drm_display_info fields may not have been computed at the time that drm_mode_connector_update_edid_property used the non_desktop value to set the non_desktop property. I've added a public function (drm_reset_display_info) that resets the drm_display_info field values to default values and then made the drm_add_display_info function public. These two functions are now called directly from drm_mode_connector_update_edid_property so that the drm_display_info fields are always computed from the current EDID information before being used in that function. This means that the drm_display_info values are often computed twice, once when the EDID property it set and a second time when EDID is used to compute modes for the device. The alternative would be to uniformly ensure that the values were computed once before being used, which would require that all drivers reliably invoke the two paths in the same order. The computation is inexpensive enough that it seems more maintainable in the long term to simply compute them in both paths. The API to drm_add_display_info has been changed so that it no longer takes the set of edid-based quirks as a parameter. Rather, it now computes those quirks itself and returns them for further use by drm_add_edid_modes. This patch also includes a number of 'const' additions caused by drm_mode_connector_update_edid_property taking a 'const struct edid *' parameter and wanting to pass that along to drm_add_display_info. v2: after review by Daniel Vetter Removed EXPORT_SYMBOL_GPL for drm_reset_display_info and drm_add_display_info. Added FIXME in drm_mode_connector_update_edid_property about potentially merging that with drm_add_edid_modes to avoid the need for two driver calls. Signed-off-by: Keith Packard Reviewed-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20171213084427.31199-1-keithp@keithp.com (danvet: cherry picked from commit 12a889bf4bca ("drm: rework delayed connector cleanup in connector_iter") from drm-misc-next since functional conflict with changes in -next and we need to make sure both have the right version and nothing gets lost.) Signed-off-by: Daniel Vetter --- drivers/gpu/drm/drm_connector.c | 13 +++++++++ drivers/gpu/drm/drm_edid.c | 52 ++++++++++++++++++++++++--------- include/drm/drm_edid.h | 2 ++ 3 files changed, 53 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c index 482014137953..c4dfcbc861a1 100644 --- a/drivers/gpu/drm/drm_connector.c +++ b/drivers/gpu/drm/drm_connector.c @@ -1231,6 +1231,19 @@ int drm_mode_connector_update_edid_property(struct drm_connector *connector, if (edid) size = EDID_LENGTH * (1 + edid->extensions); + /* Set the display info, using edid if available, otherwise + * reseting the values to defaults. This duplicates the work + * done in drm_add_edid_modes, but that function is not + * consistently called before this one in all drivers and the + * computation is cheap enough that it seems better to + * duplicate it rather than attempt to ensure some arbitrary + * ordering of calls. + */ + if (edid) + drm_add_display_info(connector, edid); + else + drm_reset_display_info(connector); + drm_object_property_set_value(&connector->base, dev->mode_config.non_desktop_property, connector->display_info.non_desktop); diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index 5dfe14763871..cb487148359a 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -1731,7 +1731,7 @@ EXPORT_SYMBOL(drm_edid_duplicate); * * Returns true if @vendor is in @edid, false otherwise */ -static bool edid_vendor(struct edid *edid, const char *vendor) +static bool edid_vendor(const struct edid *edid, const char *vendor) { char edid_vendor[3]; @@ -1749,7 +1749,7 @@ static bool edid_vendor(struct edid *edid, const char *vendor) * * This tells subsequent routines what fixes they need to apply. */ -static u32 edid_get_quirks(struct edid *edid) +static u32 edid_get_quirks(const struct edid *edid) { const struct edid_quirk *quirk; int i; @@ -2813,7 +2813,7 @@ add_detailed_modes(struct drm_connector *connector, struct edid *edid, /* * Search EDID for CEA extension block. */ -static u8 *drm_find_edid_extension(struct edid *edid, int ext_id) +static u8 *drm_find_edid_extension(const struct edid *edid, int ext_id) { u8 *edid_ext = NULL; int i; @@ -2835,12 +2835,12 @@ static u8 *drm_find_edid_extension(struct edid *edid, int ext_id) return edid_ext; } -static u8 *drm_find_cea_extension(struct edid *edid) +static u8 *drm_find_cea_extension(const struct edid *edid) { return drm_find_edid_extension(edid, CEA_EXT); } -static u8 *drm_find_displayid_extension(struct edid *edid) +static u8 *drm_find_displayid_extension(const struct edid *edid) { return drm_find_edid_extension(edid, DISPLAYID_EXT); } @@ -4363,7 +4363,7 @@ drm_parse_hdmi_vsdb_video(struct drm_connector *connector, const u8 *db) } static void drm_parse_cea_ext(struct drm_connector *connector, - struct edid *edid) + const struct edid *edid) { struct drm_display_info *info = &connector->display_info; const u8 *edid_ext; @@ -4397,11 +4397,33 @@ static void drm_parse_cea_ext(struct drm_connector *connector, } } -static void drm_add_display_info(struct drm_connector *connector, - struct edid *edid, u32 quirks) +/* A connector has no EDID information, so we've got no EDID to compute quirks from. Reset + * all of the values which would have been set from EDID + */ +void +drm_reset_display_info(struct drm_connector *connector) { struct drm_display_info *info = &connector->display_info; + info->width_mm = 0; + info->height_mm = 0; + + info->bpc = 0; + info->color_formats = 0; + info->cea_rev = 0; + info->max_tmds_clock = 0; + info->dvi_dual = false; + + info->non_desktop = 0; +} +EXPORT_SYMBOL_GPL(drm_reset_display_info); + +u32 drm_add_display_info(struct drm_connector *connector, const struct edid *edid) +{ + struct drm_display_info *info = &connector->display_info; + + u32 quirks = edid_get_quirks(edid); + info->width_mm = edid->width_cm * 10; info->height_mm = edid->height_cm * 10; @@ -4414,11 +4436,13 @@ static void drm_add_display_info(struct drm_connector *connector, info->non_desktop = !!(quirks & EDID_QUIRK_NON_DESKTOP); + DRM_DEBUG_KMS("non_desktop set to %d\n", info->non_desktop); + if (edid->revision < 3) - return; + return quirks; if (!(edid->input & DRM_EDID_INPUT_DIGITAL)) - return; + return quirks; drm_parse_cea_ext(connector, edid); @@ -4438,7 +4462,7 @@ static void drm_add_display_info(struct drm_connector *connector, /* Only defined for 1.4 with digital displays */ if (edid->revision < 4) - return; + return quirks; switch (edid->input & DRM_EDID_DIGITAL_DEPTH_MASK) { case DRM_EDID_DIGITAL_DEPTH_6: @@ -4473,7 +4497,9 @@ static void drm_add_display_info(struct drm_connector *connector, info->color_formats |= DRM_COLOR_FORMAT_YCRCB444; if (edid->features & DRM_EDID_FEATURE_RGB_YCRCB422) info->color_formats |= DRM_COLOR_FORMAT_YCRCB422; + return quirks; } +EXPORT_SYMBOL_GPL(drm_add_display_info); static int validate_displayid(u8 *displayid, int length, int idx) { @@ -4627,14 +4653,12 @@ int drm_add_edid_modes(struct drm_connector *connector, struct edid *edid) return 0; } - quirks = edid_get_quirks(edid); - /* * CEA-861-F adds ycbcr capability map block, for HDMI 2.0 sinks. * To avoid multiple parsing of same block, lets parse that map * from sink info, before parsing CEA modes. */ - drm_add_display_info(connector, edid, quirks); + quirks = drm_add_display_info(connector, edid); /* * EDID spec says modes should be preferred in this order: diff --git a/include/drm/drm_edid.h b/include/drm/drm_edid.h index 2ec41d032e56..efe6d5a8e834 100644 --- a/include/drm/drm_edid.h +++ b/include/drm/drm_edid.h @@ -465,6 +465,8 @@ struct edid *drm_get_edid(struct drm_connector *connector, struct edid *drm_get_edid_switcheroo(struct drm_connector *connector, struct i2c_adapter *adapter); struct edid *drm_edid_duplicate(const struct edid *edid); +void drm_reset_display_info(struct drm_connector *connector); +u32 drm_add_display_info(struct drm_connector *connector, const struct edid *edid); int drm_add_edid_modes(struct drm_connector *connector, struct edid *edid); u8 drm_match_cea_mode(const struct drm_display_mode *to_match); From 6b782f43d34974c7909306fd9af06241d658a1f7 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 11 Dec 2017 09:54:09 +0100 Subject: [PATCH 174/269] Revert "ravb: add workaround for clock when resuming with WoL enabled" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit fbf3d034f2ff6264183cfa6845770e8cc2a986c8. As of commit 560869100b99a3da ("clk: renesas: cpg-mssr: Restore module clocks during resume"), the workaround is no longer needed. Signed-off-by: Geert Uytterhoeven Reviewed-by: Niklas Söderlund Acked-by: Sergei Shtylyov Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/ravb_main.c | 27 ++---------------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 2b962d349f5f..009780df664b 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -2308,32 +2308,9 @@ static int __maybe_unused ravb_resume(struct device *dev) struct ravb_private *priv = netdev_priv(ndev); int ret = 0; - if (priv->wol_enabled) { - /* Reduce the usecount of the clock to zero and then - * restore it to its original value. This is done to force - * the clock to be re-enabled which is a workaround - * for renesas-cpg-mssr driver which do not enable clocks - * when resuming from PSCI suspend/resume. - * - * Without this workaround the driver fails to communicate - * with the hardware if WoL was enabled when the system - * entered PSCI suspend. This is due to that if WoL is enabled - * we explicitly keep the clock from being turned off when - * suspending, but in PSCI sleep power is cut so the clock - * is disabled anyhow, the clock driver is not aware of this - * so the clock is not turned back on when resuming. - * - * TODO: once the renesas-cpg-mssr suspend/resume is working - * this clock dance should be removed. - */ - clk_disable(priv->clk); - clk_disable(priv->clk); - clk_enable(priv->clk); - clk_enable(priv->clk); - - /* Set reset mode to rearm the WoL logic */ + /* If WoL is enabled set reset mode to rearm the WoL logic */ + if (priv->wol_enabled) ravb_write(ndev, CCC_OPC_RESET, CCC); - } /* All register have been reset to default values. * Restore all registers which where setup at probe time and From 9d98e19ba08f6aa33a4a1414f3dfe8440e67530c Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Wed, 13 Dec 2017 12:25:19 +0200 Subject: [PATCH 175/269] IB/ipoib: Restore MM behavior in case of tx_ring allocation failure memalloc_noio_save modifies the behavior of MM, we must restore it after we are done. Fixes: d83187dda9b9 ("IB/IPoIB: Convert IPoIB to memalloc_noio_* calls") Signed-off-by: Yuval Shaia Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 87f4bd99cdf7..2c13123bfd69 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -1145,6 +1145,7 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, noio_flag = memalloc_noio_save(); p->tx_ring = vzalloc(ipoib_sendq_size * sizeof(*p->tx_ring)); if (!p->tx_ring) { + memalloc_noio_restore(noio_flag); ret = -ENOMEM; goto err_tx; } From b9b312a7a451e9c098921856e7cfbc201120e1a7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Dec 2017 07:03:38 -0800 Subject: [PATCH 176/269] ipv6: mcast: better catch silly mtu values syzkaller reported crashes in IPv6 stack [1] Xin Long found that lo MTU was set to silly values. IPv6 stack reacts to changes to small MTU, by disabling itself under RTNL. But there is a window where threads not using RTNL can see a wrong device mtu. This can lead to surprises, in mld code where it is assumed the mtu is suitable. Fix this by reading device mtu once and checking IPv6 minimal MTU. [1] skbuff: skb_over_panic: text:0000000010b86b8d len:196 put:20 head:000000003b477e60 data:000000000e85441e tail:0xd4 end:0xc0 dev:lo ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:104! invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.15.0-rc2-mm1+ #39 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:skb_panic+0x15c/0x1f0 net/core/skbuff.c:100 RSP: 0018:ffff8801db307508 EFLAGS: 00010286 RAX: 0000000000000082 RBX: ffff8801c517e840 RCX: 0000000000000000 RDX: 0000000000000082 RSI: 1ffff1003b660e61 RDI: ffffed003b660e95 RBP: ffff8801db307570 R08: 1ffff1003b660e23 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff85bd4020 R13: ffffffff84754ed2 R14: 0000000000000014 R15: ffff8801c4e26540 FS: 0000000000000000(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000463610 CR3: 00000001c6698000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_over_panic net/core/skbuff.c:109 [inline] skb_put+0x181/0x1c0 net/core/skbuff.c:1694 add_grhead.isra.24+0x42/0x3b0 net/ipv6/mcast.c:1695 add_grec+0xa55/0x1060 net/ipv6/mcast.c:1817 mld_send_cr net/ipv6/mcast.c:1903 [inline] mld_ifc_timer_expire+0x4d2/0x770 net/ipv6/mcast.c:2448 call_timer_fn+0x23b/0x840 kernel/time/timer.c:1320 expire_timers kernel/time/timer.c:1357 [inline] __run_timers+0x7e1/0xb60 kernel/time/timer.c:1660 run_timer_softirq+0x4c/0xb0 kernel/time/timer.c:1686 __do_softirq+0x29d/0xbb2 kernel/softirq.c:285 invoke_softirq kernel/softirq.c:365 [inline] irq_exit+0x1d3/0x210 kernel/softirq.c:405 exiting_irq arch/x86/include/asm/apic.h:540 [inline] smp_apic_timer_interrupt+0x16b/0x700 arch/x86/kernel/apic/apic.c:1052 apic_timer_interrupt+0xa9/0xb0 arch/x86/entry/entry_64.S:920 Signed-off-by: Eric Dumazet Reported-by: syzbot Tested-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/mcast.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index fc6d7d143f2c..844642682b83 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1682,16 +1682,16 @@ static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel) } static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, - int type, struct mld2_grec **ppgr) + int type, struct mld2_grec **ppgr, unsigned int mtu) { - struct net_device *dev = pmc->idev->dev; struct mld2_report *pmr; struct mld2_grec *pgr; - if (!skb) - skb = mld_newpack(pmc->idev, dev->mtu); - if (!skb) - return NULL; + if (!skb) { + skb = mld_newpack(pmc->idev, mtu); + if (!skb) + return NULL; + } pgr = skb_put(skb, sizeof(struct mld2_grec)); pgr->grec_type = type; pgr->grec_auxwords = 0; @@ -1714,10 +1714,15 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, struct mld2_grec *pgr = NULL; struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list; int scount, stotal, first, isquery, truncate; + unsigned int mtu; if (pmc->mca_flags & MAF_NOREPORT) return skb; + mtu = READ_ONCE(dev->mtu); + if (mtu < IPV6_MIN_MTU) + return skb; + isquery = type == MLD2_MODE_IS_INCLUDE || type == MLD2_MODE_IS_EXCLUDE; truncate = type == MLD2_MODE_IS_EXCLUDE || @@ -1738,7 +1743,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { if (skb) mld_sendpack(skb); - skb = mld_newpack(idev, dev->mtu); + skb = mld_newpack(idev, mtu); } } first = 1; @@ -1774,12 +1779,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, pgr->grec_nsrcs = htons(scount); if (skb) mld_sendpack(skb); - skb = mld_newpack(idev, dev->mtu); + skb = mld_newpack(idev, mtu); first = 1; scount = 0; } if (first) { - skb = add_grhead(skb, pmc, type, &pgr); + skb = add_grhead(skb, pmc, type, &pgr, mtu); first = 0; } if (!skb) @@ -1814,7 +1819,7 @@ empty_source: mld_sendpack(skb); skb = NULL; /* add_grhead will get a new one */ } - skb = add_grhead(skb, pmc, type, &pgr); + skb = add_grhead(skb, pmc, type, &pgr, mtu); } } if (pgr) From b5476022bbada3764609368f03329ca287528dc8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Dec 2017 07:17:39 -0800 Subject: [PATCH 177/269] ipv4: igmp: guard against silly MTU values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IPv4 stack reacts to changes to small MTU, by disabling itself under RTNL. But there is a window where threads not using RTNL can see a wrong device mtu. This can lead to surprises, in igmp code where it is assumed the mtu is suitable. Fix this by reading device mtu once and checking IPv4 minimal MTU. This patch adds missing IPV4_MIN_MTU define, to not abuse ETH_MIN_MTU anymore. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip.h | 1 + net/ipv4/devinet.c | 2 +- net/ipv4/igmp.c | 24 +++++++++++++++--------- net/ipv4/ip_tunnel.c | 4 ++-- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index 9896f46cbbf1..af8addbaa3c1 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -34,6 +34,7 @@ #include #define IPV4_MAX_PMTU 65535U /* RFC 2675, Section 5.1 */ +#define IPV4_MIN_MTU 68 /* RFC 791 */ struct sock; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index a4573bccd6da..7a93359fbc72 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1428,7 +1428,7 @@ skip: static bool inetdev_valid_mtu(unsigned int mtu) { - return mtu >= 68; + return mtu >= IPV4_MIN_MTU; } static void inetdev_send_gratuitous_arp(struct net_device *dev, diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index d1f8f302dbf3..50448a220a1f 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -404,16 +404,17 @@ static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) } static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, - int type, struct igmpv3_grec **ppgr) + int type, struct igmpv3_grec **ppgr, unsigned int mtu) { struct net_device *dev = pmc->interface->dev; struct igmpv3_report *pih; struct igmpv3_grec *pgr; - if (!skb) - skb = igmpv3_newpack(dev, dev->mtu); - if (!skb) - return NULL; + if (!skb) { + skb = igmpv3_newpack(dev, mtu); + if (!skb) + return NULL; + } pgr = skb_put(skb, sizeof(struct igmpv3_grec)); pgr->grec_type = type; pgr->grec_auxwords = 0; @@ -436,12 +437,17 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, struct igmpv3_grec *pgr = NULL; struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list; int scount, stotal, first, isquery, truncate; + unsigned int mtu; if (pmc->multiaddr == IGMP_ALL_HOSTS) return skb; if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) return skb; + mtu = READ_ONCE(dev->mtu); + if (mtu < IPV4_MIN_MTU) + return skb; + isquery = type == IGMPV3_MODE_IS_INCLUDE || type == IGMPV3_MODE_IS_EXCLUDE; truncate = type == IGMPV3_MODE_IS_EXCLUDE || @@ -462,7 +468,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { if (skb) igmpv3_sendpack(skb); - skb = igmpv3_newpack(dev, dev->mtu); + skb = igmpv3_newpack(dev, mtu); } } first = 1; @@ -498,12 +504,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, pgr->grec_nsrcs = htons(scount); if (skb) igmpv3_sendpack(skb); - skb = igmpv3_newpack(dev, dev->mtu); + skb = igmpv3_newpack(dev, mtu); first = 1; scount = 0; } if (first) { - skb = add_grhead(skb, pmc, type, &pgr); + skb = add_grhead(skb, pmc, type, &pgr, mtu); first = 0; } if (!skb) @@ -538,7 +544,7 @@ empty_source: igmpv3_sendpack(skb); skb = NULL; /* add_grhead will get a new one */ } - skb = add_grhead(skb, pmc, type, &pgr); + skb = add_grhead(skb, pmc, type, &pgr, mtu); } } if (pgr) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index fe6fee728ce4..5ddb1cb52bd4 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -349,8 +349,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev) dev->needed_headroom = t_hlen + hlen; mtu -= (dev->hard_header_len + t_hlen); - if (mtu < 68) - mtu = 68; + if (mtu < IPV4_MIN_MTU) + mtu = IPV4_MIN_MTU; return mtu; } From 83593010d3b87601e775f240ce46c53ddf25828d Mon Sep 17 00:00:00 2001 From: Pravin Shedge Date: Mon, 11 Dec 2017 22:09:46 +0530 Subject: [PATCH 178/269] net: remove duplicate includes These duplicate includes have been found with scripts/checkincludes.pl but they have been removed manually to avoid removing false positives. Signed-off-by: Pravin Shedge Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/core/netprio_cgroup.c | 1 - net/dsa/slave.c | 1 - net/netfilter/nf_conntrack_netlink.c | 1 - net/sched/act_meta_mark.c | 1 - net/sched/act_meta_skbtcindex.c | 1 - net/sched/cls_api.c | 1 - net/sched/cls_u32.c | 1 - 7 files changed, 7 deletions(-) diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 1c4810919a0a..b9057478d69c 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/net/dsa/slave.c b/net/dsa/slave.c index d6e7a642493b..a95a55f79137 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 59c08997bfdf..332b51870ed7 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -45,7 +45,6 @@ #include #include #include -#include #include #ifdef CONFIG_NF_NAT_NEEDED #include diff --git a/net/sched/act_meta_mark.c b/net/sched/act_meta_mark.c index 1e3f10e5da99..6445184b2759 100644 --- a/net/sched/act_meta_mark.c +++ b/net/sched/act_meta_mark.c @@ -22,7 +22,6 @@ #include #include #include -#include static int skbmark_encode(struct sk_buff *skb, void *skbdata, struct tcf_meta_info *e) diff --git a/net/sched/act_meta_skbtcindex.c b/net/sched/act_meta_skbtcindex.c index 2ea1f26c9e96..7221437ca3a6 100644 --- a/net/sched/act_meta_skbtcindex.c +++ b/net/sched/act_meta_skbtcindex.c @@ -22,7 +22,6 @@ #include #include #include -#include static int skbtcindex_encode(struct sk_buff *skb, void *skbdata, struct tcf_meta_info *e) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ddcf04b4ab43..f40256a3e7f0 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index ac152b4f4247..507859cdd1cb 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -45,7 +45,6 @@ #include #include #include -#include #include struct tc_u_knode { From c545a945d0d9ea2ea2c7d23d43cf0d86e32cd7cf Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Mon, 11 Dec 2017 19:11:55 +0100 Subject: [PATCH 179/269] tipc: eliminate potential memory leak In the function tipc_sk_mcast_rcv() we call refcount_dec(&skb->users) on received sk_buffers. Since the reference counter might hit zero at this point, we have a potential memory leak. We fix this by replacing refcount_dec() with kfree_skb(). Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 5d18c0caa92b..41127d0b925e 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1140,7 +1140,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, __skb_dequeue(arrvq); __skb_queue_tail(inputq, skb); } - refcount_dec(&skb->users); + kfree_skb(skb); spin_unlock_bh(&inputq->lock); continue; } From a46182b00290839fa3fa159d54fd3237bd8669f0 Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Mon, 11 Dec 2017 11:13:45 -0800 Subject: [PATCH 180/269] net: igmp: Use correct source address on IGMPv3 reports Closing a multicast socket after the final IPv4 address is deleted from an interface can generate a membership report that uses the source IP from a different interface. The following test script, run from an isolated netns, reproduces the issue: #!/bin/bash ip link add dummy0 type dummy ip link add dummy1 type dummy ip link set dummy0 up ip link set dummy1 up ip addr add 10.1.1.1/24 dev dummy0 ip addr add 192.168.99.99/24 dev dummy1 tcpdump -U -i dummy0 & socat EXEC:"sleep 2" \ UDP4-DATAGRAM:239.101.1.68:8889,ip-add-membership=239.0.1.68:10.1.1.1 & sleep 1 ip addr del 10.1.1.1/24 dev dummy0 sleep 5 kill %tcpdump RFC 3376 specifies that the report must be sent with a valid IP source address from the destination subnet, or from address 0.0.0.0. Add an extra check to make sure this is the case. Signed-off-by: Kevin Cernekee Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 50448a220a1f..726f6b608274 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -89,6 +89,7 @@ #include #include #include +#include #include #include @@ -321,6 +322,23 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) return scount; } +/* source address selection per RFC 3376 section 4.2.13 */ +static __be32 igmpv3_get_srcaddr(struct net_device *dev, + const struct flowi4 *fl4) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + + if (!in_dev) + return htonl(INADDR_ANY); + + for_ifa(in_dev) { + if (inet_ifa_match(fl4->saddr, ifa)) + return fl4->saddr; + } endfor_ifa(in_dev); + + return htonl(INADDR_ANY); +} + static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) { struct sk_buff *skb; @@ -368,7 +386,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) pip->frag_off = htons(IP_DF); pip->ttl = 1; pip->daddr = fl4.daddr; - pip->saddr = fl4.saddr; + pip->saddr = igmpv3_get_srcaddr(dev, &fl4); pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ ip_select_ident(net, skb, NULL); From aceef61ee56898cfa7b6960fb60b9326c3860441 Mon Sep 17 00:00:00 2001 From: Sebastian Sjoholm Date: Mon, 11 Dec 2017 21:51:14 +0100 Subject: [PATCH 181/269] net: qmi_wwan: add Sierra EM7565 1199:9091 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sierra Wireless EM7565 is an Qualcomm MDM9x50 based M.2 modem. The USB id is added to qmi_wwan.c to allow QMI communication with the EM7565. Signed-off-by: Sebastian Sjoholm Acked-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 304ec6555cd8..d2ca5a202e8d 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1204,6 +1204,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x1199, 0x9079, 10)}, /* Sierra Wireless EM74xx */ {QMI_FIXED_INTF(0x1199, 0x907b, 8)}, /* Sierra Wireless EM74xx */ {QMI_FIXED_INTF(0x1199, 0x907b, 10)}, /* Sierra Wireless EM74xx */ + {QMI_FIXED_INTF(0x1199, 0x9091, 8)}, /* Sierra Wireless EM7565 */ {QMI_FIXED_INTF(0x1bbb, 0x011e, 4)}, /* Telekom Speedstick LTE II (Alcatel One Touch L100V LTE) */ {QMI_FIXED_INTF(0x1bbb, 0x0203, 2)}, /* Alcatel L800MA */ {QMI_FIXED_INTF(0x2357, 0x0201, 4)}, /* TP-LINK HSUPA Modem MA180 */ From 2a9ee696c72a24d63529c76483fcd92d04b1d2b7 Mon Sep 17 00:00:00 2001 From: Branislav Radocaj Date: Tue, 12 Dec 2017 00:13:38 +0100 Subject: [PATCH 182/269] net: ethernet: arc: fix error handling in emac_rockchip_probe If clk_set_rate() fails, we should disable clk before return. Found by Linux Driver Verification project (linuxtesting.org). Changes since v2 [1]: * Merged with latest code changes Changes since v1: Update made thanks to David's review, much appreciated David. * Improved inconsistent failure handling of clock rate setting * For completeness of usecase, added arc_emac_probe error handling Signed-off-by: Branislav Radocaj Signed-off-by: David S. Miller --- drivers/net/ethernet/arc/emac_rockchip.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/arc/emac_rockchip.c b/drivers/net/ethernet/arc/emac_rockchip.c index c6163874e4e7..16f9bee992fe 100644 --- a/drivers/net/ethernet/arc/emac_rockchip.c +++ b/drivers/net/ethernet/arc/emac_rockchip.c @@ -199,9 +199,11 @@ static int emac_rockchip_probe(struct platform_device *pdev) /* RMII interface needs always a rate of 50MHz */ err = clk_set_rate(priv->refclk, 50000000); - if (err) + if (err) { dev_err(dev, "failed to change reference clock rate (%d)\n", err); + goto out_regulator_disable; + } if (priv->soc_data->need_div_macclk) { priv->macclk = devm_clk_get(dev, "macclk"); @@ -230,12 +232,14 @@ static int emac_rockchip_probe(struct platform_device *pdev) err = arc_emac_probe(ndev, interface); if (err) { dev_err(dev, "failed to probe arc emac (%d)\n", err); - goto out_regulator_disable; + goto out_clk_disable_macclk; } return 0; + out_clk_disable_macclk: - clk_disable_unprepare(priv->macclk); + if (priv->soc_data->need_div_macclk) + clk_disable_unprepare(priv->macclk); out_regulator_disable: if (priv->regulator) regulator_disable(priv->regulator); From 6e266610eb6553cfb7e7eb5d11914bd01509c406 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Tue, 12 Dec 2017 16:49:52 +0800 Subject: [PATCH 183/269] hippi: Fix a Fix a possible sleep-in-atomic bug in rr_close The driver may sleep under a spinlock. The function call path is: rr_close (acquire the spinlock) free_irq --> may sleep To fix it, free_irq is moved to the place without holding the spinlock. This bug is found by my static analysis tool(DSAC) and checked by my code review. Signed-off-by: Jia-Ju Bai Signed-off-by: David S. Miller --- drivers/net/hippi/rrunner.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/hippi/rrunner.c b/drivers/net/hippi/rrunner.c index 8483f03d5a41..1ab97d99b9ba 100644 --- a/drivers/net/hippi/rrunner.c +++ b/drivers/net/hippi/rrunner.c @@ -1379,8 +1379,8 @@ static int rr_close(struct net_device *dev) rrpriv->info_dma); rrpriv->info = NULL; - free_irq(pdev->irq, dev); spin_unlock_irqrestore(&rrpriv->lock, flags); + free_irq(pdev->irq, dev); return 0; } From 2e51a8dc7fdc9d06c52a0a0e442cc813357ea44d Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 12 Dec 2017 09:29:46 +0000 Subject: [PATCH 184/269] net: dsa: allow XAUI phy interface mode XGMII is a 32-bit bus plus two clock signals per direction. XAUI is four serial lanes per direction. The 88e6190 supports XAUI but not XGMII as it doesn't have enough pins. The same is true of 88e6176. Match on PHY_INTERFACE_MODE_XAUI for the XAUI port type, but keep accepting XGMII for backwards compatibility. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/port.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c index a7801f6668a5..6315774d72b3 100644 --- a/drivers/net/dsa/mv88e6xxx/port.c +++ b/drivers/net/dsa/mv88e6xxx/port.c @@ -338,6 +338,7 @@ int mv88e6390x_port_set_cmode(struct mv88e6xxx_chip *chip, int port, cmode = MV88E6XXX_PORT_STS_CMODE_2500BASEX; break; case PHY_INTERFACE_MODE_XGMII: + case PHY_INTERFACE_MODE_XAUI: cmode = MV88E6XXX_PORT_STS_CMODE_XAUI; break; case PHY_INTERFACE_MODE_RXAUI: From cd8165c3d5fb07667328434835f2968a87caee67 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 12 Dec 2017 09:29:51 +0000 Subject: [PATCH 185/269] ARM: dts: vf610-zii-dev: use XAUI for DSA link ports Use XAUI rather than XGMII for DSA link ports, as this is the interface mode that the switches actually use. XAUI is the 4 lane bus with clock per direction, whereas XGMII is a 32 bit bus with clock. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- arch/arm/boot/dts/vf610-zii-dev-rev-c.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/vf610-zii-dev-rev-c.dts b/arch/arm/boot/dts/vf610-zii-dev-rev-c.dts index 02a6227c717c..15a685dc2aa2 100644 --- a/arch/arm/boot/dts/vf610-zii-dev-rev-c.dts +++ b/arch/arm/boot/dts/vf610-zii-dev-rev-c.dts @@ -121,7 +121,7 @@ switch0port10: port@10 { reg = <10>; label = "dsa"; - phy-mode = "xgmii"; + phy-mode = "xaui"; link = <&switch1port10>; }; }; @@ -208,7 +208,7 @@ switch1port10: port@10 { reg = <10>; label = "dsa"; - phy-mode = "xgmii"; + phy-mode = "xaui"; link = <&switch0port10>; }; }; From f5e64032a799d4f54decc7eb6aafcdffb67f9ad9 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 12 Dec 2017 10:45:36 +0000 Subject: [PATCH 186/269] net: phy: fix resume handling When a PHY has the BMCR_PDOWN bit set, it may decide to ignore writes to other registers, or reset the registers to power-on defaults. Micrel PHYs do this for their interrupt registers. The current structure of phylib tries to enable interrupts before resuming (and releasing) the BMCR_PDOWN bit. This fails, causing Micrel PHYs to stop working after a suspend/resume sequence if they are using interrupts. Fix this by ensuring that the PHY driver resume methods do not take the phydev->lock mutex themselves, but the callers of phy_resume() take that lock. This then allows us to move the call to phy_resume() before we enable interrupts in phy_start(). Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/at803x.c | 4 ---- drivers/net/phy/phy.c | 9 +++------ drivers/net/phy/phy_device.c | 10 ++++++---- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c index 5f93e6add563..e911e4990b20 100644 --- a/drivers/net/phy/at803x.c +++ b/drivers/net/phy/at803x.c @@ -239,14 +239,10 @@ static int at803x_resume(struct phy_device *phydev) { int value; - mutex_lock(&phydev->lock); - value = phy_read(phydev, MII_BMCR); value &= ~(BMCR_PDOWN | BMCR_ISOLATE); phy_write(phydev, MII_BMCR, value); - mutex_unlock(&phydev->lock); - return 0; } diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 2b1e67bc1e73..ed10d1fc8f59 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -828,7 +828,6 @@ EXPORT_SYMBOL(phy_stop); */ void phy_start(struct phy_device *phydev) { - bool do_resume = false; int err = 0; mutex_lock(&phydev->lock); @@ -841,6 +840,9 @@ void phy_start(struct phy_device *phydev) phydev->state = PHY_UP; break; case PHY_HALTED: + /* if phy was suspended, bring the physical link up again */ + phy_resume(phydev); + /* make sure interrupts are re-enabled for the PHY */ if (phydev->irq != PHY_POLL) { err = phy_enable_interrupts(phydev); @@ -849,17 +851,12 @@ void phy_start(struct phy_device *phydev) } phydev->state = PHY_RESUMING; - do_resume = true; break; default: break; } mutex_unlock(&phydev->lock); - /* if phy was suspended, bring the physical link up again */ - if (do_resume) - phy_resume(phydev); - phy_trigger_machine(phydev, true); } EXPORT_SYMBOL(phy_start); diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 67f25ac29025..b15b31ca2618 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -135,7 +135,9 @@ static int mdio_bus_phy_resume(struct device *dev) if (!mdio_bus_phy_may_suspend(phydev)) goto no_resume; + mutex_lock(&phydev->lock); ret = phy_resume(phydev); + mutex_unlock(&phydev->lock); if (ret < 0) return ret; @@ -1026,7 +1028,9 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, if (err) goto error; + mutex_lock(&phydev->lock); phy_resume(phydev); + mutex_unlock(&phydev->lock); phy_led_triggers_register(phydev); return err; @@ -1157,6 +1161,8 @@ int phy_resume(struct phy_device *phydev) struct phy_driver *phydrv = to_phy_driver(phydev->mdio.dev.driver); int ret = 0; + WARN_ON(!mutex_is_locked(&phydev->lock)); + if (phydev->drv && phydrv->resume) ret = phydrv->resume(phydev); @@ -1639,13 +1645,9 @@ int genphy_resume(struct phy_device *phydev) { int value; - mutex_lock(&phydev->lock); - value = phy_read(phydev, MII_BMCR); phy_write(phydev, MII_BMCR, value & ~BMCR_PDOWN); - mutex_unlock(&phydev->lock); - return 0; } EXPORT_SYMBOL(genphy_resume); From 94a5ef1b77da4674a6bc1d3de3051b758859d106 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 12 Dec 2017 10:49:15 +0000 Subject: [PATCH 187/269] of_mdio / mdiobus: ensure mdio devices have fwnode correctly populated Ensure that all mdio devices populate the struct device fwnode pointer as well as the of_node pointer to allow drivers that wish to use fwnode APIs to work. Signed-off-by: Russell King Reviewed-by: Rob Herring Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/mdio_bus.c | 1 + drivers/of/of_mdio.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 2df7b62c1a36..54d00a1d2bef 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -270,6 +270,7 @@ static void of_mdiobus_link_mdiodev(struct mii_bus *bus, if (addr == mdiodev->addr) { dev->of_node = child; + dev->fwnode = of_fwnode_handle(child); return; } } diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index 98258583abb0..3481e69738b5 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -81,6 +81,7 @@ static int of_mdiobus_register_phy(struct mii_bus *mdio, * can be looked up later */ of_node_get(child); phy->mdio.dev.of_node = child; + phy->mdio.dev.fwnode = of_fwnode_handle(child); /* All data is now stored in the phy struct; * register it */ @@ -111,6 +112,7 @@ static int of_mdiobus_register_device(struct mii_bus *mdio, */ of_node_get(child); mdiodev->dev.of_node = child; + mdiodev->dev.fwnode = of_fwnode_handle(child); /* All data is now stored in the mdiodev struct; register it. */ rc = mdio_device_register(mdiodev); @@ -206,6 +208,7 @@ int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np) mdio->phy_mask = ~0; mdio->dev.of_node = np; + mdio->dev.fwnode = of_fwnode_handle(np); /* Get bus level PHY reset GPIO details */ mdio->reset_delay_us = DEFAULT_GPIO_RESET_DELAY; From 3b3397e2031564db07022e99f04d4b9f3df6fced Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 12 Dec 2017 13:03:11 +0000 Subject: [PATCH 188/269] net: phy: meson-gxl: make function meson_gxl_read_status static The function meson_gxl_read_status is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: symbol 'meson_gxl_read_status' was not declared. Should it be static? Signed-off-by: Colin Ian King Reviewed-by: Jerome Brunet Signed-off-by: David S. Miller --- drivers/net/phy/meson-gxl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/meson-gxl.c b/drivers/net/phy/meson-gxl.c index 700007dd4be5..842eb871a6e3 100644 --- a/drivers/net/phy/meson-gxl.c +++ b/drivers/net/phy/meson-gxl.c @@ -67,7 +67,7 @@ static int meson_gxl_config_init(struct phy_device *phydev) * When this failure happens, the first retry is usually successful but, * in some cases, it may take up to 6 retries to get a decent result */ -int meson_gxl_read_status(struct phy_device *phydev) +static int meson_gxl_read_status(struct phy_device *phydev) { int ret, wol, lpa, exp; From c009cb842fcc0f84536a9d2692e6f063af5ac5c6 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 12 Dec 2017 10:30:29 -0800 Subject: [PATCH 189/269] skge: remove redundunt free_irq under spinlock The code to handle multi-port SKGE boards was freeing IRQ twice. The first one was under lock and might sleep. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/skge.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/skge.c b/drivers/net/ethernet/marvell/skge.c index 6e423f098a60..31efc47c847e 100644 --- a/drivers/net/ethernet/marvell/skge.c +++ b/drivers/net/ethernet/marvell/skge.c @@ -4081,7 +4081,6 @@ static void skge_remove(struct pci_dev *pdev) if (hw->ports > 1) { skge_write32(hw, B0_IMSK, 0); skge_read32(hw, B0_IMSK); - free_irq(pdev->irq, hw); } spin_unlock_irq(&hw->hw_lock); From 9ee11bd03cb1a5c3ca33c2bb70e7ed325f68890f Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 12 Dec 2017 16:28:58 -0800 Subject: [PATCH 190/269] tcp: fix potential underestimation on rcv_rtt When ms timestamp is used, current logic uses 1us in tcp_rcv_rtt_update() when the real rcv_rtt is within 1 - 999us. This could cause rcv_rtt underestimation. Fix it by always using a min value of 1ms if ms timestamp is used. Fixes: 645f4c6f2ebd ("tcp: switch rcv_rtt_est and rcvq_space to high resolution timestamps") Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9550cc42de2d..45f750e85714 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -508,9 +508,6 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) u32 new_sample = tp->rcv_rtt_est.rtt_us; long m = sample; - if (m == 0) - m = 1; - if (new_sample != 0) { /* If we sample in larger samples in the non-timestamp * case, we could grossly overestimate the RTT especially @@ -547,6 +544,8 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) return; delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time); + if (!delta_us) + delta_us = 1; tcp_rcv_rtt_update(tp, delta_us, 1); new_measure: @@ -563,8 +562,11 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) { u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; - u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); + u32 delta_us; + if (!delta) + delta = 1; + delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); tcp_rcv_rtt_update(tp, delta_us, 0); } } From 4688eb7cf3ae2c2721d1dacff5c1384cba47d176 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Dec 2017 18:22:52 -0800 Subject: [PATCH 191/269] tcp: refresh tcp_mstamp from timers callbacks Only the retransmit timer currently refreshes tcp_mstamp We should do the same for delayed acks and keepalives. Even if RFC 7323 does not request it, this is consistent to what linux did in the past, when TS values were based on jiffies. Fixes: 385e20706fac ("tcp: use tp->tcp_mstamp in output path") Signed-off-by: Eric Dumazet Cc: Soheil Hassas Yeganeh Cc: Mike Maloney Cc: Neal Cardwell Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Mike Maloney Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 16df6dd44b98..968fda198376 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -264,6 +264,7 @@ void tcp_delack_timer_handler(struct sock *sk) icsk->icsk_ack.pingpong = 0; icsk->icsk_ack.ato = TCP_ATO_MIN; } + tcp_mstamp_refresh(tcp_sk(sk)); tcp_send_ack(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); } @@ -632,6 +633,7 @@ static void tcp_keepalive_timer (struct timer_list *t) goto out; } + tcp_mstamp_refresh(tp); if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { if (tp->linger2 >= 0) { const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; From 53c64870d03edfa5c554ac2f750c5d6b38e3680a Mon Sep 17 00:00:00 2001 From: Jie Deng Date: Wed, 13 Dec 2017 12:04:12 +0800 Subject: [PATCH 192/269] dwc-xlgmac: Add co-maintainer Jose Abreu will join to maintain dwc-xlgmac. He will help with new feature development for this driver. Thanks Jose and welcome on board! Signed-off-by: Jie Deng Signed-off-by: David S. Miller --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 9e0045e3ee0c..51497dc05333 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13117,6 +13117,7 @@ F: drivers/dma/dw/ SYNOPSYS DESIGNWARE ENTERPRISE ETHERNET DRIVER M: Jie Deng +M: Jose Abreu L: netdev@vger.kernel.org S: Supported F: drivers/net/ethernet/synopsys/ From de9c4e06bbe872d725f306e34f3eea21155488e2 Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 13 Dec 2017 09:22:03 +0000 Subject: [PATCH 193/269] net: phy: marvell: avoid configuring fiber page for SGMII-to-Copper When in SGMII-to-Copper mode, the fiber page is used for the MAC facing link, and does not require configuration of the fiber auto-negotiation settings. Avoid trying. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/marvell.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index 4d02b27df044..b5a8f750e433 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -637,6 +637,10 @@ static int m88e1510_config_aneg(struct phy_device *phydev) if (err < 0) goto error; + /* Do not touch the fiber page if we're in copper->sgmii mode */ + if (phydev->interface == PHY_INTERFACE_MODE_SGMII) + return 0; + /* Then the fiber link */ err = marvell_set_page(phydev, MII_MARVELL_FIBER_PAGE); if (err < 0) From 78034f5fdd622520eb843301cf35ce6c626543a7 Mon Sep 17 00:00:00 2001 From: Eugenia Emantayev Date: Wed, 13 Dec 2017 18:12:09 +0200 Subject: [PATCH 194/269] net/mlx4_en: Fix selftest for small MTUs Set the minimal MTU threshold for running loopback selftest. MTU should be big enough to include packet payload, NET_IP_ALIGN, Ethernet headers and preamble length. Fixes: e7c1c2c46201 ("mlx4_en: Added self diagnostics test implementation") Signed-off-by: Eugenia Emantayev Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_selftest.c | 2 +- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_selftest.c b/drivers/net/ethernet/mellanox/mlx4/en_selftest.c index 88699b181946..946d9db7c8c2 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_selftest.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_selftest.c @@ -185,7 +185,7 @@ void mlx4_en_ex_selftest(struct net_device *dev, u32 *flags, u64 *buf) if (priv->mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UC_LOOPBACK) { buf[3] = mlx4_en_test_registers(priv); - if (priv->port_up) + if (priv->port_up && dev->mtu >= MLX4_SELFTEST_LB_MIN_MTU) buf[4] = mlx4_en_test_loopback(priv); } diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index 1856e279a7e0..2b72677eccd4 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -153,6 +153,9 @@ #define SMALL_PACKET_SIZE (256 - NET_IP_ALIGN) #define HEADER_COPY_SIZE (128 - NET_IP_ALIGN) #define MLX4_LOOPBACK_TEST_PAYLOAD (HEADER_COPY_SIZE - ETH_HLEN) +#define PREAMBLE_LEN 8 +#define MLX4_SELFTEST_LB_MIN_MTU (MLX4_LOOPBACK_TEST_PAYLOAD + NET_IP_ALIGN + \ + ETH_HLEN + PREAMBLE_LEN) #define MLX4_EN_MIN_MTU 46 /* VLAN_HLEN is added twice,to support skb vlan tagged with multiple From 0bb9fc4f5429ac970181c073aa32e521e20f7b73 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Wed, 13 Dec 2017 18:12:10 +0200 Subject: [PATCH 195/269] net/mlx4_core: Fix wrong calculation of free counters The field res_free indicates the total number of counters which are available for allocation (reserved and unreserved). Fixed a bug where the reserved counters were subtracted from res_free before any allocation was performed. Before this fix, free counters which were not reserved could not be allocated. Fixes: 9de92c60beaa ("net/mlx4_core: Adjust counter grant policy in the resource tracker") Signed-off-by: Eran Ben Elisha Reviewed-by: Jack Morgenstein Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/resource_tracker.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 04304dd894c6..606a0e0beeae 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -611,7 +611,6 @@ int mlx4_init_resource_tracker(struct mlx4_dev *dev) MLX4_MAX_PORTS; else res_alloc->guaranteed[t] = 0; - res_alloc->res_free -= res_alloc->guaranteed[t]; break; default: break; From 5a1647c391ba543a77a400dddf89053ec5c2b7a4 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Wed, 13 Dec 2017 18:12:11 +0200 Subject: [PATCH 196/269] net/mlx4_en: Fill all counters under one call of stats lock Before this patch, the stats_lock was acquired twice. In between the locks Driver sent command to gather some more statistics (per priority and counter statistics). If the stats lock was acquired by get statistics NDO in between we would have report out of sync counters. Fix this by collecting all stats from Firmware in advance and then fill the Software structs under one lock. Fixes: 0b131561a7d6 ("net/mlx4_en: Add Flow control statistics display via ethtool") Signed-off-by: Eran Ben Elisha Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_port.c | 57 +++++++++++--------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_port.c b/drivers/net/ethernet/mellanox/mlx4/en_port.c index e0eb695318e6..1fa4849a6f56 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_port.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_port.c @@ -188,7 +188,7 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset) struct net_device *dev = mdev->pndev[port]; struct mlx4_en_priv *priv = netdev_priv(dev); struct net_device_stats *stats = &dev->stats; - struct mlx4_cmd_mailbox *mailbox; + struct mlx4_cmd_mailbox *mailbox, *mailbox_priority; u64 in_mod = reset << 8 | port; int err; int i, counter_index; @@ -198,6 +198,13 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset) mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); + + mailbox_priority = mlx4_alloc_cmd_mailbox(mdev->dev); + if (IS_ERR(mailbox_priority)) { + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return PTR_ERR(mailbox_priority); + } + err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, in_mod, 0, MLX4_CMD_DUMP_ETH_STATS, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); @@ -206,6 +213,28 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset) mlx4_en_stats = mailbox->buf; + memset(&tmp_counter_stats, 0, sizeof(tmp_counter_stats)); + counter_index = mlx4_get_default_counter_index(mdev->dev, port); + err = mlx4_get_counter_stats(mdev->dev, counter_index, + &tmp_counter_stats, reset); + + /* 0xffs indicates invalid value */ + memset(mailbox_priority->buf, 0xff, + sizeof(*flowstats) * MLX4_NUM_PRIORITIES); + + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN) { + memset(mailbox_priority->buf, 0, + sizeof(*flowstats) * MLX4_NUM_PRIORITIES); + err = mlx4_cmd_box(mdev->dev, 0, mailbox_priority->dma, + in_mod | MLX4_DUMP_ETH_STATS_FLOW_CONTROL, + 0, MLX4_CMD_DUMP_ETH_STATS, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + if (err) + goto out; + } + + flowstats = mailbox_priority->buf; + spin_lock_bh(&priv->stats_lock); mlx4_en_fold_software_stats(dev); @@ -345,31 +374,6 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset) priv->pkstats.tx_prio[8][0] = be64_to_cpu(mlx4_en_stats->TTOT_novlan); priv->pkstats.tx_prio[8][1] = be64_to_cpu(mlx4_en_stats->TOCT_novlan); - spin_unlock_bh(&priv->stats_lock); - - memset(&tmp_counter_stats, 0, sizeof(tmp_counter_stats)); - counter_index = mlx4_get_default_counter_index(mdev->dev, port); - err = mlx4_get_counter_stats(mdev->dev, counter_index, - &tmp_counter_stats, reset); - - /* 0xffs indicates invalid value */ - memset(mailbox->buf, 0xff, sizeof(*flowstats) * MLX4_NUM_PRIORITIES); - - if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN) { - memset(mailbox->buf, 0, - sizeof(*flowstats) * MLX4_NUM_PRIORITIES); - err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, - in_mod | MLX4_DUMP_ETH_STATS_FLOW_CONTROL, - 0, MLX4_CMD_DUMP_ETH_STATS, - MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); - if (err) - goto out; - } - - flowstats = mailbox->buf; - - spin_lock_bh(&priv->stats_lock); - if (tmp_counter_stats.counter_mode == 0) { priv->pf_stats.rx_bytes = be64_to_cpu(tmp_counter_stats.rx_bytes); priv->pf_stats.tx_bytes = be64_to_cpu(tmp_counter_stats.tx_bytes); @@ -410,6 +414,7 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset) out: mlx4_free_cmd_mailbox(mdev->dev, mailbox); + mlx4_free_cmd_mailbox(mdev->dev, mailbox_priority); return err; } From ea497bb92064875497554ee7cdf10df7fb7393fc Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Wed, 13 Dec 2017 13:49:36 +0100 Subject: [PATCH 197/269] drm: rework delayed connector cleanup in connector_iter PROBE_DEFER also uses system_wq to reprobe drivers, which means when that again fails, and we try to flush the overall system_wq (to get all the delayed connectore cleanup work_struct completed), we deadlock. Fix this by using just a single cleanup work, so that we can only flush that one and don't block on anything else. That means a free list plus locking, a standard pattern. v2: - Correctly free connectors only on last ref. Oops (Chris). - use llist_head/node (Chris). v3 - Add init_llist_head (Chris). Fixes: a703c55004e1 ("drm: safely free connectors from connector_iter") Fixes: 613051dac40d ("drm: locking&new iterators for connector_list") Cc: Ben Widawsky Cc: Dave Airlie Cc: Chris Wilson Cc: Sean Paul Cc: # v4.11+: 613051dac40d ("drm: locking&new iterators for connector_list" Cc: # v4.11+ Cc: Daniel Vetter Cc: Jani Nikula Cc: Gustavo Padovan Cc: David Airlie Cc: Javier Martinez Canillas Cc: Shuah Khan Cc: Guillaume Tucker Cc: Mark Brown Cc: Kevin Hilman Cc: Matt Hart Cc: Thierry Escande Cc: Tomeu Vizoso Cc: Enric Balletbo i Serra Tested-by: Marek Szyprowski Reviewed-by: Chris Wilson Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20171213124936.17914-1-daniel.vetter@ffwll.ch --- drivers/gpu/drm/drm_connector.c | 50 ++++++++++++++++++++--------- drivers/gpu/drm/drm_crtc_internal.h | 1 + drivers/gpu/drm/drm_mode_config.c | 5 ++- include/drm/drm_connector.h | 10 +++--- include/drm/drm_mode_config.h | 18 ++++++++++- 5 files changed, 63 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c index c4dfcbc861a1..9ae236036e32 100644 --- a/drivers/gpu/drm/drm_connector.c +++ b/drivers/gpu/drm/drm_connector.c @@ -152,14 +152,23 @@ static void drm_connector_free(struct kref *kref) connector->funcs->destroy(connector); } -static void drm_connector_free_work_fn(struct work_struct *work) +void drm_connector_free_work_fn(struct work_struct *work) { - struct drm_connector *connector = - container_of(work, struct drm_connector, free_work); - struct drm_device *dev = connector->dev; + struct drm_connector *connector, *n; + struct drm_device *dev = + container_of(work, struct drm_device, mode_config.connector_free_work); + struct drm_mode_config *config = &dev->mode_config; + unsigned long flags; + struct llist_node *freed; - drm_mode_object_unregister(dev, &connector->base); - connector->funcs->destroy(connector); + spin_lock_irqsave(&config->connector_list_lock, flags); + freed = llist_del_all(&config->connector_free_list); + spin_unlock_irqrestore(&config->connector_list_lock, flags); + + llist_for_each_entry_safe(connector, n, freed, free_node) { + drm_mode_object_unregister(dev, &connector->base); + connector->funcs->destroy(connector); + } } /** @@ -191,8 +200,6 @@ int drm_connector_init(struct drm_device *dev, if (ret) return ret; - INIT_WORK(&connector->free_work, drm_connector_free_work_fn); - connector->base.properties = &connector->properties; connector->dev = dev; connector->funcs = funcs; @@ -547,10 +554,17 @@ EXPORT_SYMBOL(drm_connector_list_iter_begin); * actually release the connector when dropping our final reference. */ static void -drm_connector_put_safe(struct drm_connector *conn) +__drm_connector_put_safe(struct drm_connector *conn) { - if (refcount_dec_and_test(&conn->base.refcount.refcount)) - schedule_work(&conn->free_work); + struct drm_mode_config *config = &conn->dev->mode_config; + + lockdep_assert_held(&config->connector_list_lock); + + if (!refcount_dec_and_test(&conn->base.refcount.refcount)) + return; + + llist_add(&conn->free_node, &config->connector_free_list); + schedule_work(&config->connector_free_work); } /** @@ -582,10 +596,10 @@ drm_connector_list_iter_next(struct drm_connector_list_iter *iter) /* loop until it's not a zombie connector */ } while (!kref_get_unless_zero(&iter->conn->base.refcount)); - spin_unlock_irqrestore(&config->connector_list_lock, flags); if (old_conn) - drm_connector_put_safe(old_conn); + __drm_connector_put_safe(old_conn); + spin_unlock_irqrestore(&config->connector_list_lock, flags); return iter->conn; } @@ -602,9 +616,15 @@ EXPORT_SYMBOL(drm_connector_list_iter_next); */ void drm_connector_list_iter_end(struct drm_connector_list_iter *iter) { + struct drm_mode_config *config = &iter->dev->mode_config; + unsigned long flags; + iter->dev = NULL; - if (iter->conn) - drm_connector_put_safe(iter->conn); + if (iter->conn) { + spin_lock_irqsave(&config->connector_list_lock, flags); + __drm_connector_put_safe(iter->conn); + spin_unlock_irqrestore(&config->connector_list_lock, flags); + } lock_release(&connector_list_iter_dep_map, 0, _RET_IP_); } EXPORT_SYMBOL(drm_connector_list_iter_end); diff --git a/drivers/gpu/drm/drm_crtc_internal.h b/drivers/gpu/drm/drm_crtc_internal.h index 9ebb8841778c..af00f42ba269 100644 --- a/drivers/gpu/drm/drm_crtc_internal.h +++ b/drivers/gpu/drm/drm_crtc_internal.h @@ -142,6 +142,7 @@ int drm_mode_connector_set_obj_prop(struct drm_mode_object *obj, uint64_t value); int drm_connector_create_standard_properties(struct drm_device *dev); const char *drm_get_connector_force_name(enum drm_connector_force force); +void drm_connector_free_work_fn(struct work_struct *work); /* IOCTL */ int drm_mode_connector_property_set_ioctl(struct drm_device *dev, diff --git a/drivers/gpu/drm/drm_mode_config.c b/drivers/gpu/drm/drm_mode_config.c index cc78b3d9e5e4..256de7313612 100644 --- a/drivers/gpu/drm/drm_mode_config.c +++ b/drivers/gpu/drm/drm_mode_config.c @@ -382,6 +382,9 @@ void drm_mode_config_init(struct drm_device *dev) ida_init(&dev->mode_config.connector_ida); spin_lock_init(&dev->mode_config.connector_list_lock); + init_llist_head(&dev->mode_config.connector_free_list); + INIT_WORK(&dev->mode_config.connector_free_work, drm_connector_free_work_fn); + drm_mode_create_standard_properties(dev); /* Just to be sure */ @@ -432,7 +435,7 @@ void drm_mode_config_cleanup(struct drm_device *dev) } drm_connector_list_iter_end(&conn_iter); /* connector_iter drops references in a work item. */ - flush_scheduled_work(); + flush_work(&dev->mode_config.connector_free_work); if (WARN_ON(!list_empty(&dev->mode_config.connector_list))) { drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h index a4649c56ca2f..5971577016a2 100644 --- a/include/drm/drm_connector.h +++ b/include/drm/drm_connector.h @@ -24,6 +24,7 @@ #define __DRM_CONNECTOR_H__ #include +#include #include #include #include @@ -918,12 +919,13 @@ struct drm_connector { uint16_t tile_h_size, tile_v_size; /** - * @free_work: + * @free_node: * - * Work used only by &drm_connector_iter to be able to clean up a - * connector from any context. + * List used only by &drm_connector_iter to be able to clean up a + * connector from any context, in conjunction with + * &drm_mode_config.connector_free_work. */ - struct work_struct free_work; + struct llist_node free_node; }; #define obj_to_connector(x) container_of(x, struct drm_connector, base) diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h index b21e827c5c78..b0ce26d71296 100644 --- a/include/drm/drm_mode_config.h +++ b/include/drm/drm_mode_config.h @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -393,7 +394,7 @@ struct drm_mode_config { /** * @connector_list_lock: Protects @num_connector and - * @connector_list. + * @connector_list and @connector_free_list. */ spinlock_t connector_list_lock; /** @@ -413,6 +414,21 @@ struct drm_mode_config { * &struct drm_connector_list_iter to walk this list. */ struct list_head connector_list; + /** + * @connector_free_list: + * + * List of connector objects linked with &drm_connector.free_head. + * Protected by @connector_list_lock. Used by + * drm_for_each_connector_iter() and + * &struct drm_connector_list_iter to savely free connectors using + * @connector_free_work. + */ + struct llist_head connector_free_list; + /** + * @connector_free_work: Work to clean up @connector_free_list. + */ + struct work_struct connector_free_work; + /** * @num_encoder: * From bd36d3bab2e3d08f80766c86487090dbceed4651 Mon Sep 17 00:00:00 2001 From: Marius Vlad Date: Wed, 13 Dec 2017 20:10:48 +0200 Subject: [PATCH 198/269] drm/drm_lease: Prevent deadlock in case drm_lease_create() fails This case can been seen when creating the lease with the same objects passed. [ 605.515097] 2 locks held by testapp/3337: [ 605.519027] #0: (&dev->mode_config.idr_mutex){......}, at: [] drm_mode_create_lease_ioctl+0x384/0x858 [ 605.530045] #1: (&dev->mode_config.idr_mutex){......}, at: [] drm_lease_destroy+0x2c/0x110 Which was causing the process to hang: [ 605.398827] [] __switch_to+0x94/0xa8 [ 605.404030] [] __schedule+0x1b0/0x698 [ 605.409322] [] schedule+0x3c/0xa8 [ 605.414260] [] schedule_preempt_disabled+0x20/0x38 [ 605.420677] [] mutex_lock_nested+0x158/0x340 [ 605.426572] [] drm_lease_destroy+0x2c/0x110 [ 605.432389] [] drm_master_put+0xc0/0xc8 [ 605.437845] [] drm_mode_create_lease_ioctl+0x47c/0x858 [ 605.444612] [] drm_ioctl+0x198/0x448 [ 605.449811] [] do_vfs_ioctl+0xa4/0x748 [ 605.455192] [] SyS_ioctl+0x8c/0xa0 [ 605.460216] [] __sys_trace_return+0x0/0x4 drm_mode_create_lease_ioctl() calls drm_lease_create() which acquires a lock on dev->mode_config.idr_mutex. In case of failure, drm_lease_create() calls drm_master_put() which in turn tries to acquire the same lock when calling drm_lease_destroy(). v2: - Reverse the order at exit in case of fail, so that unlocking takes place before dropping the reference. - Include detail information about deadlock (Daniel Vetter) Signed-off-by: Marius Vlad Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20171213181048.32719-1-marius-cristian.vlad@nxp.com --- drivers/gpu/drm/drm_lease.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/drm_lease.c b/drivers/gpu/drm/drm_lease.c index d1eb56a1eff4..59849f02e2ad 100644 --- a/drivers/gpu/drm/drm_lease.c +++ b/drivers/gpu/drm/drm_lease.c @@ -254,10 +254,10 @@ static struct drm_master *drm_lease_create(struct drm_master *lessor, struct idr return lessee; out_lessee: - drm_master_put(&lessee); - mutex_unlock(&dev->mode_config.idr_mutex); + drm_master_put(&lessee); + return ERR_PTR(error); } From da2e6b7eeda8919f677c790ef51161dd02e513a6 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Nov 2017 20:27:34 +0200 Subject: [PATCH 199/269] ovl: fix overlay: warning prefix Conform two stray warning messages to the standard overlayfs: prefix. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/dir.c | 3 ++- fs/overlayfs/readdir.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index e13921824c70..f9788bc116a8 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -887,7 +887,8 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir) spin_unlock(&dentry->d_lock); } else { kfree(redirect); - pr_warn_ratelimited("overlay: failed to set redirect (%i)\n", err); + pr_warn_ratelimited("overlayfs: failed to set redirect (%i)\n", + err); /* Fall back to userspace copy-up */ err = -EXDEV; } diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 51088849ce97..8c98578d27a1 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -499,7 +499,7 @@ out: return err; fail: - pr_warn_ratelimited("overlay: failed to look up (%s) for ino (%i)\n", + pr_warn_ratelimited("overlayfs: failed to look up (%s) for ino (%i)\n", p->name, err); goto out; } From 1d08a044cf12aee37dfd54837558e3295287b343 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 13 Dec 2017 11:45:42 +0000 Subject: [PATCH 200/269] arm64: fix CONFIG_DEBUG_WX address reporting In ptdump_check_wx(), we pass walk_pgd() a start address of 0 (rather than VA_START) for the init_mm. This means that any reported W&X addresses are offset by VA_START, which is clearly wrong and can make them appear like userspace addresses. Fix this by telling the ptdump code that we're walking init_mm starting at VA_START. We don't need to update the addr_markers, since these are still valid bounds regardless. Cc: Fixes: 1404d6f13e47 ("arm64: dump: Add checking for writable and exectuable pages") Signed-off-by: Mark Rutland Cc: Kees Cook Cc: Laura Abbott Reported-by: Timur Tabi Signed-off-by: Will Deacon --- arch/arm64/mm/dump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index ca74a2aace42..7b60d62ac593 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -389,7 +389,7 @@ void ptdump_check_wx(void) .check_wx = true, }; - walk_pgd(&st, &init_mm, 0); + walk_pgd(&st, &init_mm, VA_START); note_page(&st, 0, 0, 0); if (st.wx_pages || st.uxn_pages) pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n", From 3fab39997a98b97138c886978af660c4f6c7e9e6 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Thu, 14 Dec 2017 14:03:44 +0000 Subject: [PATCH 201/269] arm64/sve: Report SVE to userspace via CPUID only if supported Currently, the SVE field in ID_AA64PFR0_EL1 is visible unconditionally to userspace via the CPU ID register emulation, irrespective of the kernel config. This means that if a kernel configured with CONFIG_ARM64_SVE=n is run on SVE-capable hardware, userspace will see SVE reported as present in the ID regs even though the kernel forbids execution of SVE instructions. This patch makes the exposure of the SVE field in ID_AA64PFR0_EL1 conditional on CONFIG_ARM64_SVE=y. Since future architecture features are likely to encounter a similar requirement, this patch adds a suitable helper macros for use when declaring config-conditional ID register fields. Fixes: 43994d824e84 ("arm64/sve: Detect SVE and activate runtime support") Reviewed-by: Suzuki K Poulose Reported-by: Mark Rutland Signed-off-by: Dave Martin Cc: Suzuki Poulose Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpufeature.h | 3 +++ arch/arm64/kernel/cpufeature.c | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index ac67cfc2585a..060e3a4008ab 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -60,6 +60,9 @@ enum ftr_type { #define FTR_VISIBLE true /* Feature visible to the user space */ #define FTR_HIDDEN false /* Feature is hidden from the user */ +#define FTR_VISIBLE_IF_IS_ENABLED(config) \ + (IS_ENABLED(config) ? FTR_VISIBLE : FTR_HIDDEN) + struct arm64_ftr_bits { bool sign; /* Value is signed ? */ bool visible; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index c5ba0097887f..a73a5928f09b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -145,7 +145,8 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { }; static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SVE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SVE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_GIC_SHIFT, 4, 0), S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI), S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI), From c2e90800aef22e7ea14ea7560ba99993f11d3616 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 12 Dec 2017 13:45:50 +0000 Subject: [PATCH 202/269] virtio_mmio: fix devm cleanup Recent rework of the virtio_mmio probe/remove paths balanced a devm_ioremap() with an iounmap() rather than its devm variant. This ends up corrupting the devm datastructures, and results in the following boot-time splat on arm64 under QEMU 2.9.0: [ 3.450397] ------------[ cut here ]------------ [ 3.453822] Trying to vfree() nonexistent vm area (00000000c05b4844) [ 3.460534] WARNING: CPU: 1 PID: 1 at mm/vmalloc.c:1525 __vunmap+0x1b8/0x220 [ 3.475898] Kernel panic - not syncing: panic_on_warn set ... [ 3.475898] [ 3.493933] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 4.15.0-rc3 #1 [ 3.513109] Hardware name: linux,dummy-virt (DT) [ 3.525382] Call trace: [ 3.531683] dump_backtrace+0x0/0x368 [ 3.543921] show_stack+0x20/0x30 [ 3.547767] dump_stack+0x108/0x164 [ 3.559584] panic+0x25c/0x51c [ 3.569184] __warn+0x29c/0x31c [ 3.576023] report_bug+0x1d4/0x290 [ 3.586069] bug_handler.part.2+0x40/0x100 [ 3.597820] bug_handler+0x4c/0x88 [ 3.608400] brk_handler+0x11c/0x218 [ 3.613430] do_debug_exception+0xe8/0x318 [ 3.627370] el1_dbg+0x18/0x78 [ 3.634037] __vunmap+0x1b8/0x220 [ 3.648747] vunmap+0x6c/0xc0 [ 3.653864] __iounmap+0x44/0x58 [ 3.659771] devm_ioremap_release+0x34/0x68 [ 3.672983] release_nodes+0x404/0x880 [ 3.683543] devres_release_all+0x6c/0xe8 [ 3.695692] driver_probe_device+0x250/0x828 [ 3.706187] __driver_attach+0x190/0x210 [ 3.717645] bus_for_each_dev+0x14c/0x1f0 [ 3.728633] driver_attach+0x48/0x78 [ 3.740249] bus_add_driver+0x26c/0x5b8 [ 3.752248] driver_register+0x16c/0x398 [ 3.757211] __platform_driver_register+0xd8/0x128 [ 3.770860] virtio_mmio_init+0x1c/0x24 [ 3.782671] do_one_initcall+0xe0/0x398 [ 3.791890] kernel_init_freeable+0x594/0x660 [ 3.798514] kernel_init+0x18/0x190 [ 3.810220] ret_from_fork+0x10/0x18 To fix this, we can simply rip out the explicit cleanup that the devm infrastructure will do for us when our probe function returns an error code, or when our remove function returns. We only need to ensure that we call put_device() if a call to register_virtio_device() fails in the probe path. Signed-off-by: Mark Rutland Fixes: 7eb781b1bbb7136f ("virtio_mmio: add cleanup for virtio_mmio_probe") Fixes: 25f32223bce5c580 ("virtio_mmio: add cleanup for virtio_mmio_remove") Cc: Cornelia Huck Cc: Michael S. Tsirkin Cc: weiping zhang Cc: virtualization@lists.linux-foundation.org Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck --- drivers/virtio/virtio_mmio.c | 43 ++++++++---------------------------- 1 file changed, 9 insertions(+), 34 deletions(-) diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c index a9192fe4f345..c92131edfaba 100644 --- a/drivers/virtio/virtio_mmio.c +++ b/drivers/virtio/virtio_mmio.c @@ -522,10 +522,8 @@ static int virtio_mmio_probe(struct platform_device *pdev) return -EBUSY; vm_dev = devm_kzalloc(&pdev->dev, sizeof(*vm_dev), GFP_KERNEL); - if (!vm_dev) { - rc = -ENOMEM; - goto free_mem; - } + if (!vm_dev) + return -ENOMEM; vm_dev->vdev.dev.parent = &pdev->dev; vm_dev->vdev.dev.release = virtio_mmio_release_dev; @@ -535,17 +533,14 @@ static int virtio_mmio_probe(struct platform_device *pdev) spin_lock_init(&vm_dev->lock); vm_dev->base = devm_ioremap(&pdev->dev, mem->start, resource_size(mem)); - if (vm_dev->base == NULL) { - rc = -EFAULT; - goto free_vmdev; - } + if (vm_dev->base == NULL) + return -EFAULT; /* Check magic value */ magic = readl(vm_dev->base + VIRTIO_MMIO_MAGIC_VALUE); if (magic != ('v' | 'i' << 8 | 'r' << 16 | 't' << 24)) { dev_warn(&pdev->dev, "Wrong magic value 0x%08lx!\n", magic); - rc = -ENODEV; - goto unmap; + return -ENODEV; } /* Check device version */ @@ -553,8 +548,7 @@ static int virtio_mmio_probe(struct platform_device *pdev) if (vm_dev->version < 1 || vm_dev->version > 2) { dev_err(&pdev->dev, "Version %ld not supported!\n", vm_dev->version); - rc = -ENXIO; - goto unmap; + return -ENXIO; } vm_dev->vdev.id.device = readl(vm_dev->base + VIRTIO_MMIO_DEVICE_ID); @@ -563,8 +557,7 @@ static int virtio_mmio_probe(struct platform_device *pdev) * virtio-mmio device with an ID 0 is a (dummy) placeholder * with no function. End probing now with no error reported. */ - rc = -ENODEV; - goto unmap; + return -ENODEV; } vm_dev->vdev.id.vendor = readl(vm_dev->base + VIRTIO_MMIO_VENDOR_ID); @@ -590,33 +583,15 @@ static int virtio_mmio_probe(struct platform_device *pdev) platform_set_drvdata(pdev, vm_dev); rc = register_virtio_device(&vm_dev->vdev); - if (rc) { - iounmap(vm_dev->base); - devm_release_mem_region(&pdev->dev, mem->start, - resource_size(mem)); + if (rc) put_device(&vm_dev->vdev.dev); - } - return rc; -unmap: - iounmap(vm_dev->base); -free_mem: - devm_release_mem_region(&pdev->dev, mem->start, - resource_size(mem)); -free_vmdev: - devm_kfree(&pdev->dev, vm_dev); + return rc; } static int virtio_mmio_remove(struct platform_device *pdev) { struct virtio_mmio_device *vm_dev = platform_get_drvdata(pdev); - struct resource *mem; - - iounmap(vm_dev->base); - mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (mem) - devm_release_mem_region(&pdev->dev, mem->start, - resource_size(mem)); unregister_virtio_device(&vm_dev->vdev); return 0; From c47d7f56e914900410f65835933f9fc4374d0a2b Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Thu, 14 Dec 2017 15:32:24 -0800 Subject: [PATCH 203/269] include/linux/idr.h: add #include The was removed from radix-tree.h by commit f5bba9d11a25 ("include/linux/radix-tree.h: remove unneeded #include "). Since that commit, tools/testing/radix-tree/ couldn't pass compilation due to tools/testing/radix-tree/idr.c:17: undefined reference to WARN_ON_ONCE. This patch adds the bug.h header to idr.h to solve the issue. Link: http://lkml.kernel.org/r/1511963726-34070-2-git-send-email-wei.w.wang@intel.com Fixes: f5bba9d11a2 ("include/linux/radix-tree.h: remove unneeded #include ") Signed-off-by: Wei Wang Cc: Matthew Wilcox Cc: Jan Kara Cc: Eric Biggers Cc: Tejun Heo Cc: Masahiro Yamada Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/idr.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/idr.h b/include/linux/idr.h index 7c3a365f7e12..fa14f834e4ed 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -15,6 +15,7 @@ #include #include #include +#include struct idr { struct radix_tree_root idr_rt; From 338f1d9d1b829fec494d053f62820a2ee625b1ec Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 14 Dec 2017 15:32:28 -0800 Subject: [PATCH 204/269] lib/rbtree,drm/mm: add rbtree_replace_node_cached() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a variant of rbtree_replace_node() that maintains the leftmost cache of struct rbtree_root_cached when replacing nodes within the rbtree. As drm_mm is the only rb_replace_node() being used on an interval tree, the mistake looks fairly self-contained. Furthermore the only user of drm_mm_replace_node() is its testsuite... Testcase: igt/drm_mm/replace Link: http://lkml.kernel.org/r/20171122100729.3742-1-chris@chris-wilson.co.uk Link: https://patchwork.freedesktop.org/patch/msgid/20171109212435.9265-1-chris@chris-wilson.co.uk Fixes: f808c13fd373 ("lib/interval_tree: fast overlap detection") Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Acked-by: Davidlohr Bueso Cc: Jérôme Glisse Cc: Joonas Lahtinen Cc: Daniel Vetter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/drm_mm.c | 8 +++++--- include/linux/rbtree.h | 2 ++ lib/rbtree.c | 10 ++++++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c index 61a1c8ea74bc..c3c79ee6119e 100644 --- a/drivers/gpu/drm/drm_mm.c +++ b/drivers/gpu/drm/drm_mm.c @@ -575,21 +575,23 @@ EXPORT_SYMBOL(drm_mm_remove_node); */ void drm_mm_replace_node(struct drm_mm_node *old, struct drm_mm_node *new) { + struct drm_mm *mm = old->mm; + DRM_MM_BUG_ON(!old->allocated); *new = *old; list_replace(&old->node_list, &new->node_list); - rb_replace_node(&old->rb, &new->rb, &old->mm->interval_tree.rb_root); + rb_replace_node_cached(&old->rb, &new->rb, &mm->interval_tree); if (drm_mm_hole_follows(old)) { list_replace(&old->hole_stack, &new->hole_stack); rb_replace_node(&old->rb_hole_size, &new->rb_hole_size, - &old->mm->holes_size); + &mm->holes_size); rb_replace_node(&old->rb_hole_addr, &new->rb_hole_addr, - &old->mm->holes_addr); + &mm->holes_addr); } old->allocated = false; diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index d574361943ea..fcbeed4053ef 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -99,6 +99,8 @@ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, struct rb_root *root); +extern void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, + struct rb_root_cached *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) diff --git a/lib/rbtree.c b/lib/rbtree.c index ba4a9d165f1b..d3ff682fd4b8 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -603,6 +603,16 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new, } EXPORT_SYMBOL(rb_replace_node); +void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, + struct rb_root_cached *root) +{ + rb_replace_node(victim, new, &root->rb_root); + + if (root->rb_leftmost == victim) + root->rb_leftmost = new; +} +EXPORT_SYMBOL(rb_replace_node_cached); + void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, struct rb_root *root) { From 13ab183d138f607d885e995d625e58d47678bf97 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 14 Dec 2017 15:32:31 -0800 Subject: [PATCH 205/269] mm/kmemleak.c: make cond_resched() rate-limiting more efficient Commit bde5f6bc68db ("kmemleak: add scheduling point to kmemleak_scan()") tries to rate-limit the frequency of cond_resched() calls, but does it in a way which might incur an expensive division operation in the inner loop. Simplify this. Fixes: bde5f6bc68db5 ("kmemleak: add scheduling point to kmemleak_scan()") Suggested-by: Linus Torvalds Cc: Yisheng Xie Cc: Catalin Marinas Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 3d4781756d50..d73c14294f3a 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1523,7 +1523,7 @@ static void kmemleak_scan(void) if (page_count(page) == 0) continue; scan_block(page, page + 1, NULL); - if (!(pfn % (MAX_SCAN_SIZE / sizeof(*page)))) + if (!(pfn & 63)) cond_resched(); } } From 146734b091430c80d80bb96b1139a96fb4bc830e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 14 Dec 2017 15:32:34 -0800 Subject: [PATCH 206/269] string.h: workaround for increased stack usage The hardened strlen() function causes rather large stack usage in at least one file in the kernel, in particular when CONFIG_KASAN is enabled: drivers/media/usb/em28xx/em28xx-dvb.c: In function 'em28xx_dvb_init': drivers/media/usb/em28xx/em28xx-dvb.c:2062:1: error: the frame size of 3256 bytes is larger than 204 bytes [-Werror=frame-larger-than=] Analyzing this problem led to the discovery that gcc fails to merge the stack slots for the i2c_board_info[] structures after we strlcpy() into them, due to the 'noreturn' attribute on the source string length check. I reported this as a gcc bug, but it is unlikely to get fixed for gcc-8, since it is relatively easy to work around, and it gets triggered rarely. An earlier workaround I did added an empty inline assembly statement before the call to fortify_panic(), which works surprisingly well, but is really ugly and unintuitive. This is a new approach to the same problem, this time addressing it by not calling the 'extern __real_strnlen()' function for string constants where __builtin_strlen() is a compile-time constant and therefore known to be safe. We do this by checking if the last character in the string is a compile-time constant '\0'. If it is, we can assume that strlen() of the string is also constant. As a side-effect, this should also improve the object code output for any other call of strlen() on a string constant. [akpm@linux-foundation.org: add comment] Link: http://lkml.kernel.org/r/20171205215143.3085755-1-arnd@arndb.de Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82365 Link: https://patchwork.kernel.org/patch/9980413/ Link: https://patchwork.kernel.org/patch/9974047/ Fixes: 6974f0c4555 ("include/linux/string.h: add the option of fortified string.h functions") Signed-off-by: Arnd Bergmann Cc: Kees Cook Cc: Mauro Carvalho Chehab Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Daniel Micay Cc: Greg Kroah-Hartman Cc: Martin Wilck Cc: Dan Williams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/string.h b/include/linux/string.h index 410ecf17de3c..cfd83eb2f926 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -259,7 +259,10 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p) { __kernel_size_t ret; size_t p_size = __builtin_object_size(p, 0); - if (p_size == (size_t)-1) + + /* Work around gcc excess stack consumption issue */ + if (p_size == (size_t)-1 || + (__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0')) return __builtin_strlen(p); ret = strnlen(p, p_size); if (p_size <= ret) From 302ec300ef8a545a7fc7f667e5fd743b091c2eeb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 14 Dec 2017 15:32:38 -0800 Subject: [PATCH 207/269] autofs: fix careless error in recent commit Commit ecc0c469f277 ("autofs: don't fail mount for transient error") was meant to replace an 'if' with a 'switch', but instead added the 'switch' leaving the case in place. Link: http://lkml.kernel.org/r/87zi6wstmw.fsf@notabene.neil.brown.name Fixes: ecc0c469f277 ("autofs: don't fail mount for transient error") Reported-by: Ben Hutchings Signed-off-by: NeilBrown Cc: Ian Kent Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/waitq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 8fc41705c7cd..961a12dc6dc8 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -170,7 +170,6 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, mutex_unlock(&sbi->wq_mutex); - if (autofs4_write(sbi, pipe, &pkt, pktsz)) switch (ret = autofs4_write(sbi, pipe, &pkt, pktsz)) { case 0: break; From 3756f6401c302617c5e091081ca4d26ab604bec5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 14 Dec 2017 15:32:41 -0800 Subject: [PATCH 208/269] exec: avoid gcc-8 warning for get_task_comm gcc-8 warns about using strncpy() with the source size as the limit: fs/exec.c:1223:32: error: argument to 'sizeof' in 'strncpy' call is the same expression as the source; did you mean to use the size of the destination? [-Werror=sizeof-pointer-memaccess] This is indeed slightly suspicious, as it protects us from source arguments without NUL-termination, but does not guarantee that the destination is terminated. This keeps the strncpy() to ensure we have properly padded target buffer, but ensures that we use the correct length, by passing the actual length of the destination buffer as well as adding a build-time check to ensure it is exactly TASK_COMM_LEN. There are only 23 callsites which I all reviewed to ensure this is currently the case. We could get away with doing only the check or passing the right length, but it doesn't hurt to do both. Link: http://lkml.kernel.org/r/20171205151724.1764896-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Suggested-by: Kees Cook Acked-by: Kees Cook Acked-by: Ingo Molnar Cc: Alexander Viro Cc: Peter Zijlstra Cc: Serge Hallyn Cc: James Morris Cc: Aleksa Sarai Cc: "Eric W. Biederman" Cc: Frederic Weisbecker Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 7 +++---- include/linux/sched.h | 6 +++++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 6be2aa0ab26f..156f56acfe8e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1216,15 +1216,14 @@ killed: return -EAGAIN; } -char *get_task_comm(char *buf, struct task_struct *tsk) +char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) { - /* buf must be at least sizeof(tsk->comm) in size */ task_lock(tsk); - strncpy(buf, tsk->comm, sizeof(tsk->comm)); + strncpy(buf, tsk->comm, buf_size); task_unlock(tsk); return buf; } -EXPORT_SYMBOL_GPL(get_task_comm); +EXPORT_SYMBOL_GPL(__get_task_comm); /* * These functions flushes out all traces of the currently running executable diff --git a/include/linux/sched.h b/include/linux/sched.h index 21991d668d35..5124ba709830 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1503,7 +1503,11 @@ static inline void set_task_comm(struct task_struct *tsk, const char *from) __set_task_comm(tsk, from, false); } -extern char *get_task_comm(char *to, struct task_struct *tsk); +extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); +#define get_task_comm(buf, tsk) ({ \ + BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN); \ + __get_task_comm(buf, sizeof(buf), tsk); \ +}) #ifdef CONFIG_SMP void scheduler_ipi(void); From 51f73fffbf30b335d036ab356b67b05e16e26585 Mon Sep 17 00:00:00 2001 From: Srividya Desireddy Date: Thu, 14 Dec 2017 15:32:45 -0800 Subject: [PATCH 209/269] Documentation/vm/zswap.txt: update with same-value filled page feature Update zswap document with details on same-value filled pages identification feature. The usage of zswap.same_filled_pages_enabled module parameter is explained. Link: http://lkml.kernel.org/r/20171206114852epcms5p6973b02a9f455d5d3c765eafda0fe2631@epcms5p6 Signed-off-by: Srividya Desireddy Acked-by: Dan Streetman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/zswap.txt | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/Documentation/vm/zswap.txt b/Documentation/vm/zswap.txt index 89fff7d611cc..0b3a1148f9f0 100644 --- a/Documentation/vm/zswap.txt +++ b/Documentation/vm/zswap.txt @@ -98,5 +98,25 @@ request is made for a page in an old zpool, it is uncompressed using its original compressor. Once all pages are removed from an old zpool, the zpool and its compressor are freed. +Some of the pages in zswap are same-value filled pages (i.e. contents of the +page have same value or repetitive pattern). These pages include zero-filled +pages and they are handled differently. During store operation, a page is +checked if it is a same-value filled page before compressing it. If true, the +compressed length of the page is set to zero and the pattern or same-filled +value is stored. + +Same-value filled pages identification feature is enabled by default and can be +disabled at boot time by setting the "same_filled_pages_enabled" attribute to 0, +e.g. zswap.same_filled_pages_enabled=0. It can also be enabled and disabled at +runtime using the sysfs "same_filled_pages_enabled" attribute, e.g. + +echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled + +When zswap same-filled page identification is disabled at runtime, it will stop +checking for the same-value filled pages during store operation. However, the +existing pages which are marked as same-value filled pages remain stored +unchanged in zswap until they are either loaded or invalidated. + A debugfs interface is provided for various statistic about pool size, number -of pages stored, and various counters for the reasons pages are rejected. +of pages stored, same-value filled pages and various counters for the reasons +pages are rejected. From 4cc90b4cc3d4955f79eae4f7f9d64e67e17b468e Mon Sep 17 00:00:00 2001 From: "Liu, Changcheng" Date: Thu, 14 Dec 2017 15:32:48 -0800 Subject: [PATCH 210/269] scripts/faddr2line: fix CROSS_COMPILE unset error faddr2line hit var unbound error when CROSS_COMPILE isn't set since nounset option is set in bash script. Link: http://lkml.kernel.org/r/20171206013022.GA83929@sofia Fixes: 95a879825419 ("scripts/faddr2line: extend usage on generic arch") Signed-off-by: Liu Changcheng Reported-by: Richard Weinberger Reviewed-by: Richard Weinberger Cc: Thomas Gleixner Cc: Greg Kroah-Hartman Cc: Philippe Ombredanne Cc: NeilBrown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/faddr2line | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/faddr2line b/scripts/faddr2line index 39e07d8574dd..7721d5b2b0c0 100755 --- a/scripts/faddr2line +++ b/scripts/faddr2line @@ -44,10 +44,10 @@ set -o errexit set -o nounset -READELF="${CROSS_COMPILE}readelf" -ADDR2LINE="${CROSS_COMPILE}addr2line" -SIZE="${CROSS_COMPILE}size" -NM="${CROSS_COMPILE}nm" +READELF="${CROSS_COMPILE:-}readelf" +ADDR2LINE="${CROSS_COMPILE:-}addr2line" +SIZE="${CROSS_COMPILE:-}size" +NM="${CROSS_COMPILE:-}nm" command -v awk >/dev/null 2>&1 || die "awk isn't installed" command -v ${READELF} >/dev/null 2>&1 || die "readelf isn't installed" From 183f24aa5b76e37da690b2def41cc70f0792ce09 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 14 Dec 2017 15:32:52 -0800 Subject: [PATCH 211/269] mm/memory.c: mark wp_huge_pmd() inline to prevent build failure With gcc 4.1.2: mm/memory.o: In function `wp_huge_pmd': memory.c:(.text+0x9b4): undefined reference to `do_huge_pmd_wp_page' Interestingly, wp_huge_pmd() is emitted in the assembler output, but never called. Apparently replacing the call to pmd_write() in __handle_mm_fault() by a call to the more complex pmd_access_permitted() reduced the ability of the compiler to remove unused code. Fix this by marking wp_huge_pmd() inline, like was done in commit 91a90140f998 ("mm/memory.c: mark create_huge_pmd() inline to prevent build failure") for a similar problem. [akpm@linux-foundation.org: add comment] Link: http://lkml.kernel.org/r/1512335500-10889-1-git-send-email-geert@linux-m68k.org Fixes: c7da82b894e9eef6 ("mm: replace pmd_write with pmd_access_permitted in fault + gup paths") Signed-off-by: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 5eb3d2524bdc..cfaba6287702 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3831,7 +3831,8 @@ static inline int create_huge_pmd(struct vm_fault *vmf) return VM_FAULT_FALLBACK; } -static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) +/* `inline' is required to avoid gcc 4.1.2 build error */ +static inline int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) { if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_wp_page(vmf, orig_pmd); From c24ad77d962c31af92f2b731dad2104cbf3fbb03 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Thu, 14 Dec 2017 15:32:55 -0800 Subject: [PATCH 212/269] mm/page_alloc.c: avoid excessive IRQ disabled times in free_unref_page_list() Since commit 9cca35d42eb6 ("mm, page_alloc: enable/disable IRQs once when freeing a list of pages") we see excessive IRQ disabled times of up to 25ms on an embedded ARM system (tracing overhead included). This is due to graphics buffers being freed back to the system via release_pages(). Graphics buffers can be huge, so it's not hard to hit cases where the list of pages to free has 2048 entries. Disabling IRQs while freeing all those pages is clearly not a good idea. Introduce a batch limit, which allows IRQ servicing once every few pages. The batch count is the same as used in other parts of the MM subsystem when dealing with IRQ disabled regions. Link: http://lkml.kernel.org/r/20171207170314.4419-1-l.stach@pengutronix.de Fixes: 9cca35d42eb6 ("mm, page_alloc: enable/disable IRQs once when freeing a list of pages") Signed-off-by: Lucas Stach Acked-by: Mel Gorman Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 73f5d4556b3d..7e5e775e97f4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2684,6 +2684,7 @@ void free_unref_page_list(struct list_head *list) { struct page *page, *next; unsigned long flags, pfn; + int batch_count = 0; /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { @@ -2700,6 +2701,16 @@ void free_unref_page_list(struct list_head *list) set_page_private(page, 0); trace_mm_page_free_batched(page); free_unref_page_commit(page, pfn); + + /* + * Guard against excessive IRQ disabled times when we get + * a large list of pages to free. + */ + if (++batch_count == SWAP_CLUSTER_MAX) { + local_irq_restore(flags); + batch_count = 0; + local_irq_save(flags); + } } local_irq_restore(flags); } From 85c3e4a5a185f22649c6bf33bdce7bb1ac890921 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 14 Dec 2017 15:32:58 -0800 Subject: [PATCH 213/269] mm/slab.c: do not hash pointers when debugging slab If CONFIG_DEBUG_SLAB/CONFIG_DEBUG_SLAB_LEAK are enabled, the slab code prints extra debug information when e.g. corruption is detected. This includes pointers, which are not very useful when hashed. Fix this by using %px to print unhashed pointers instead where it makes sense, and by removing the printing of a last user pointer referring to code. [geert+renesas@glider.be: v2] Link: http://lkml.kernel.org/r/1513179267-2509-1-git-send-email-geert+renesas@glider.be Link: http://lkml.kernel.org/r/1512641861-5113-1-git-send-email-geert+renesas@glider.be Fixes: ad67b74d2469d9b8 ("printk: hash addresses printed with %p") Signed-off-by: Geert Uytterhoeven Acked-by: Christoph Lameter Acked-by: Linus Torvalds Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: "Tobin C . Harding" Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 183e996dde5f..4e51ef954026 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1584,11 +1584,8 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) *dbg_redzone2(cachep, objp)); } - if (cachep->flags & SLAB_STORE_USER) { - pr_err("Last user: [<%p>](%pSR)\n", - *dbg_userword(cachep, objp), - *dbg_userword(cachep, objp)); - } + if (cachep->flags & SLAB_STORE_USER) + pr_err("Last user: (%pSR)\n", *dbg_userword(cachep, objp)); realobj = (char *)objp + obj_offset(cachep); size = cachep->object_size; for (i = 0; i < size && lines; i += 16, lines--) { @@ -1621,7 +1618,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Mismatch ! */ /* Print header */ if (lines == 0) { - pr_err("Slab corruption (%s): %s start=%p, len=%d\n", + pr_err("Slab corruption (%s): %s start=%px, len=%d\n", print_tainted(), cachep->name, realobj, size); print_objinfo(cachep, objp, 0); @@ -1650,13 +1647,13 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) if (objnr) { objp = index_to_obj(cachep, page, objnr - 1); realobj = (char *)objp + obj_offset(cachep); - pr_err("Prev obj: start=%p, len=%d\n", realobj, size); + pr_err("Prev obj: start=%px, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } if (objnr + 1 < cachep->num) { objp = index_to_obj(cachep, page, objnr + 1); realobj = (char *)objp + obj_offset(cachep); - pr_err("Next obj: start=%p, len=%d\n", realobj, size); + pr_err("Next obj: start=%px, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } } @@ -2608,7 +2605,7 @@ static void slab_put_obj(struct kmem_cache *cachep, /* Verify double free bug */ for (i = page->active; i < cachep->num; i++) { if (get_free_obj(page, i) == objnr) { - pr_err("slab: double free detected in cache '%s', objp %p\n", + pr_err("slab: double free detected in cache '%s', objp %px\n", cachep->name, objp); BUG(); } @@ -2772,7 +2769,7 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) else slab_error(cache, "memory outside object was overwritten"); - pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n", + pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n", obj, redzone1, redzone2); } @@ -3078,7 +3075,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { slab_error(cachep, "double free, or memory outside object was overwritten"); - pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n", + pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n", objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); } @@ -3091,7 +3088,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, cachep->ctor(objp); if (ARCH_SLAB_MINALIGN && ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { - pr_err("0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", + pr_err("0x%px: not aligned to ARCH_SLAB_MINALIGN=%d\n", objp, (int)ARCH_SLAB_MINALIGN); } return objp; @@ -4283,7 +4280,7 @@ static void show_symbol(struct seq_file *m, unsigned long address) return; } #endif - seq_printf(m, "%p", (void *)address); + seq_printf(m, "%px", (void *)address); } static int leaks_show(struct seq_file *m, void *p) From 689d77f001cd22da31cc943170e1f6f2e8197035 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 14 Dec 2017 15:33:02 -0800 Subject: [PATCH 214/269] kcov: fix comparison callback signature Fix a silly copy-paste bug. We truncated u32 args to u16. Link: http://lkml.kernel.org/r/20171207101134.107168-1-dvyukov@google.com Fixes: ded97d2c2b2c ("kcov: support comparison operands collection") Signed-off-by: Dmitry Vyukov Cc: syzkaller@googlegroups.com Cc: Alexander Potapenko Cc: Vegard Nossum Cc: Quentin Casasnovas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/kcov.c b/kernel/kcov.c index 15f33faf4013..7594c033d98a 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -157,7 +157,7 @@ void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2) } EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2); -void notrace __sanitizer_cov_trace_cmp4(u16 arg1, u16 arg2) +void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2) { write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_); } @@ -183,7 +183,7 @@ void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2) } EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2); -void notrace __sanitizer_cov_trace_const_cmp4(u16 arg1, u16 arg2) +void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2) { write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2, _RET_IP_); From 0b265c3b3b721dca03e82719ac0e15bc2c89aa3a Mon Sep 17 00:00:00 2001 From: "Liu, Changcheng" Date: Thu, 14 Dec 2017 15:33:05 -0800 Subject: [PATCH 215/269] tools/slabinfo-gnuplot: force to use bash shell On some linux distributions, the default link of sh is dash which deoesn't support split array like "${var//,/ }" It's better to force to use bash shell directly. Link: http://lkml.kernel.org/r/20171208093751.GA175471@sofia Signed-off-by: Liu Changcheng Reviewed-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/slabinfo-gnuplot.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/vm/slabinfo-gnuplot.sh b/tools/vm/slabinfo-gnuplot.sh index 35b039864b77..0cf28aa6f21c 100644 --- a/tools/vm/slabinfo-gnuplot.sh +++ b/tools/vm/slabinfo-gnuplot.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # Sergey Senozhatsky, 2015 # sergey.senozhatsky.work@gmail.com From 1f704fd0d14043e76e80f6b8b2251b9b2cedcca6 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 14 Dec 2017 15:33:08 -0800 Subject: [PATCH 216/269] mm/frame_vector.c: release a semaphore in 'get_vaddr_frames()' A semaphore is acquired before this check, so we must release it before leaving. Link: http://lkml.kernel.org/r/20171211211009.4971-1-christophe.jaillet@wanadoo.fr Fixes: b7f0554a56f2 ("mm: fail get_vaddr_frames() for filesystem-dax mappings") Signed-off-by: Christophe JAILLET Acked-by: Michal Hocko Cc: Dan Williams Cc: Christian Borntraeger Cc: David Sterba Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/frame_vector.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/frame_vector.c b/mm/frame_vector.c index 297c7238f7d4..c64dca6e27c2 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -62,8 +62,10 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, * get_user_pages_longterm() and disallow it for filesystem-dax * mappings. */ - if (vma_is_fsdax(vma)) - return -EOPNOTSUPP; + if (vma_is_fsdax(vma)) { + ret = -EOPNOTSUPP; + goto out; + } if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) { vec->got_ref = true; From bdcf0a423ea1c40bbb40e7ee483b50fc8aa3d758 Mon Sep 17 00:00:00 2001 From: Thiago Rafael Becker Date: Thu, 14 Dec 2017 15:33:12 -0800 Subject: [PATCH 217/269] kernel: make groups_sort calling a responsibility group_info allocators In testing, we found that nfsd threads may call set_groups in parallel for the same entry cached in auth.unix.gid, racing in the call of groups_sort, corrupting the groups for that entry and leading to permission denials for the client. This patch: - Make groups_sort globally visible. - Move the call to groups_sort to the modifiers of group_info - Remove the call to groups_sort from set_groups Link: http://lkml.kernel.org/r/20171211151420.18655-1-thiago.becker@gmail.com Signed-off-by: Thiago Rafael Becker Reviewed-by: Matthew Wilcox Reviewed-by: NeilBrown Acked-by: "J. Bruce Fields" Cc: Al Viro Cc: Martin Schwidefsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/kernel/compat_linux.c | 1 + fs/nfsd/auth.c | 3 +++ include/linux/cred.h | 1 + kernel/groups.c | 5 +++-- kernel/uid16.c | 1 + net/sunrpc/auth_gss/gss_rpc_xdr.c | 1 + net/sunrpc/auth_gss/svcauth_gss.c | 1 + net/sunrpc/svcauth_unix.c | 2 ++ 8 files changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index f04db3779b34..59eea9c65d3e 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -263,6 +263,7 @@ COMPAT_SYSCALL_DEFINE2(s390_setgroups16, int, gidsetsize, u16 __user *, grouplis return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 697f8ae7792d..f650e475d8f0 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -60,6 +60,9 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) gi->gid[i] = exp->ex_anon_gid; else gi->gid[i] = rqgi->gid[i]; + + /* Each thread allocates its own gi, no race */ + groups_sort(gi); } } else { gi = get_group_info(rqgi); diff --git a/include/linux/cred.h b/include/linux/cred.h index 099058e1178b..631286535d0f 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -83,6 +83,7 @@ extern int set_current_groups(struct group_info *); extern void set_groups(struct cred *, struct group_info *); extern int groups_search(const struct group_info *, kgid_t); extern bool may_setgroups(void); +extern void groups_sort(struct group_info *); /* * The security context of a task diff --git a/kernel/groups.c b/kernel/groups.c index e357bc800111..daae2f2dc6d4 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -86,11 +86,12 @@ static int gid_cmp(const void *_a, const void *_b) return gid_gt(a, b) - gid_lt(a, b); } -static void groups_sort(struct group_info *group_info) +void groups_sort(struct group_info *group_info) { sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), gid_cmp, NULL); } +EXPORT_SYMBOL(groups_sort); /* a simple bsearch */ int groups_search(const struct group_info *group_info, kgid_t grp) @@ -122,7 +123,6 @@ int groups_search(const struct group_info *group_info, kgid_t grp) void set_groups(struct cred *new, struct group_info *group_info) { put_group_info(new->group_info); - groups_sort(group_info); get_group_info(group_info); new->group_info = group_info; } @@ -206,6 +206,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/kernel/uid16.c b/kernel/uid16.c index ce74a4901d2b..ef1da2a5f9bd 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -192,6 +192,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index c4778cae58ef..444380f968f1 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -231,6 +231,7 @@ static int gssx_dec_linux_creds(struct xdr_stream *xdr, goto out_free_groups; creds->cr_group_info->gid[i] = kgid; } + groups_sort(creds->cr_group_info); return 0; out_free_groups: diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 5dd4e6c9fef2..26531193fce4 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -481,6 +481,7 @@ static int rsc_parse(struct cache_detail *cd, goto out; rsci.cred.cr_group_info->gid[i] = kgid; } + groups_sort(rsci.cred.cr_group_info); /* mech name */ len = qword_get(&mesg, buf, mlen); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 740b67d5a733..af7f28fb8102 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -520,6 +520,7 @@ static int unix_gid_parse(struct cache_detail *cd, ug.gi->gid[i] = kgid; } + groups_sort(ug.gi); ugp = unix_gid_lookup(cd, uid); if (ugp) { struct cache_head *ch; @@ -819,6 +820,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) kgid_t kgid = make_kgid(&init_user_ns, svc_getnl(argv)); cred->cr_group_info->gid[i] = kgid; } + groups_sort(cred->cr_group_info); if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { *authp = rpc_autherr_badverf; return SVC_DENIED; From 4837fe37adff1d159904f0c013471b1ecbcb455e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 14 Dec 2017 15:33:15 -0800 Subject: [PATCH 218/269] mm, oom_reaper: fix memory corruption David Rientjes has reported the following memory corruption while the oom reaper tries to unmap the victims address space BUG: Bad page map in process oom_reaper pte:6353826300000000 pmd:00000000 addr:00007f50cab1d000 vm_flags:08100073 anon_vma:ffff9eea335603f0 mapping: (null) index:7f50cab1d file: (null) fault: (null) mmap: (null) readpage: (null) CPU: 2 PID: 1001 Comm: oom_reaper Call Trace: unmap_page_range+0x1068/0x1130 __oom_reap_task_mm+0xd5/0x16b oom_reaper+0xff/0x14c kthread+0xc1/0xe0 Tetsuo Handa has noticed that the synchronization inside exit_mmap is insufficient. We only synchronize with the oom reaper if tsk_is_oom_victim which is not true if the final __mmput is called from a different context than the oom victim exit path. This can trivially happen from context of any task which has grabbed mm reference (e.g. to read /proc// file which requires mm etc.). The race would look like this oom_reaper oom_victim task mmget_not_zero do_exit mmput __oom_reap_task_mm mmput __mmput exit_mmap remove_vma unmap_page_range Fix this issue by providing a new mm_is_oom_victim() helper which operates on the mm struct rather than a task. Any context which operates on a remote mm struct should use this helper in place of tsk_is_oom_victim. The flag is set in mark_oom_victim and never cleared so it is stable in the exit_mmap path. Debugged by Tetsuo Handa. Link: http://lkml.kernel.org/r/20171210095130.17110-1-mhocko@kernel.org Fixes: 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently") Signed-off-by: Michal Hocko Reported-by: David Rientjes Acked-by: David Rientjes Cc: Tetsuo Handa Cc: Andrea Argangeli Cc: [4.14] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 9 +++++++++ include/linux/sched/coredump.h | 1 + mm/mmap.c | 10 +++++----- mm/oom_kill.c | 4 +++- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/include/linux/oom.h b/include/linux/oom.h index 01c91d874a57..5bad038ac012 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -66,6 +66,15 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk) return tsk->signal->oom_mm; } +/* + * Use this helper if tsk->mm != mm and the victim mm needs a special + * handling. This is guaranteed to stay true after once set. + */ +static inline bool mm_is_oom_victim(struct mm_struct *mm) +{ + return test_bit(MMF_OOM_VICTIM, &mm->flags); +} + /* * Checks whether a page fault on the given mm is still reliable. * This is no longer true if the oom reaper started to reap the diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 9c8847395b5e..ec912d01126f 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -70,6 +70,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ +#define MMF_OOM_VICTIM 25 /* mm is the oom victim */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ diff --git a/mm/mmap.c b/mm/mmap.c index a4d546821214..9efdc021ad22 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3019,20 +3019,20 @@ void exit_mmap(struct mm_struct *mm) /* Use -1 here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, vma, 0, -1); - set_bit(MMF_OOM_SKIP, &mm->flags); - if (unlikely(tsk_is_oom_victim(current))) { + if (unlikely(mm_is_oom_victim(mm))) { /* * Wait for oom_reap_task() to stop working on this * mm. Because MMF_OOM_SKIP is already set before * calling down_read(), oom_reap_task() will not run * on this "mm" post up_write(). * - * tsk_is_oom_victim() cannot be set from under us - * either because current->mm is already set to NULL + * mm_is_oom_victim() cannot be set from under us + * either because victim->mm is already set to NULL * under task_lock before calling mmput and oom_mm is - * set not NULL by the OOM killer only if current->mm + * set not NULL by the OOM killer only if victim->mm * is found not NULL while holding the task_lock. */ + set_bit(MMF_OOM_SKIP, &mm->flags); down_write(&mm->mmap_sem); up_write(&mm->mmap_sem); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c957be32b27a..29f855551efe 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -683,8 +683,10 @@ static void mark_oom_victim(struct task_struct *tsk) return; /* oom_mm is bound to the signal struct life time. */ - if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) + if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) { mmgrab(tsk->signal->oom_mm); + set_bit(MMF_OOM_VICTIM, &mm->flags); + } /* * Make sure that the task is woken up from uninterruptible sleep From 7c2c11b208be09c156573fc0076b7b3646e05219 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Thu, 14 Dec 2017 15:33:19 -0800 Subject: [PATCH 219/269] arch: define weak abort() gcc toggle -fisolate-erroneous-paths-dereference (default at -O2 onwards) isolates faulty code paths such as null pointer access, divide by zero etc. If gcc port doesnt implement __builtin_trap, an abort() is generated which causes kernel link error. In this case, gcc is generating abort due to 'divide by zero' in lib/mpi/mpih-div.c. Currently 'frv' and 'arc' are failing. Previously other arch was also broken like m32r was fixed by commit d22e3d69ee1a ("m32r: fix build failure"). Let's define this weak function which is common for all arch and fix the problem permanently. We can even remove the arch specific 'abort' after this is done. Link: http://lkml.kernel.org/r/1513118956-8718-1-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Sudip Mukherjee Cc: Alexey Brodkin Cc: Vineet Gupta Cc: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/exit.c b/kernel/exit.c index 6b4298a41167..df0c91d5606c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1755,3 +1755,11 @@ Efault: return -EFAULT; } #endif + +__weak void abort(void) +{ + BUG(); + + /* if that doesn't kill us, halt */ + panic("Oops failed to kill thread"); +} From b00d607bb188e187c7b60074d2fa91a6f1985029 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 5 Dec 2017 04:41:51 -0500 Subject: [PATCH 220/269] tracing: Have stack trace not record if RCU is not watching The stack tracer records a stack dump whenever it sees a stack usage that is more than what it ever saw before. This can happen at any function that is being traced. If it happens when the CPU is going idle (or other strange locations), RCU may not be watching, and in this case, the recording of the stack trace will trigger a warning. There's been lots of efforts to make hacks to allow stack tracing to proceed even if RCU is not watching, but this only causes more issues to appear. Simply do not trace a stack if RCU is not watching. It probably isn't a bad stack anyway. Acked-by: "Paul E. McKenney" Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_stack.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 734accc02418..3c7bfc4bf5e9 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -209,6 +209,10 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, if (__this_cpu_read(disable_stack_tracer) != 1) goto out; + /* If rcu is not watching, then save stack trace can fail */ + if (!rcu_is_watching()) + goto out; + ip += MCOUNT_INSN_SIZE; check_stack(ip, &stack); From 2610acf46b9ed528ec2cacd717bc9d354e452b73 Mon Sep 17 00:00:00 2001 From: Andreas Platschek Date: Thu, 14 Dec 2017 12:50:51 +0100 Subject: [PATCH 221/269] dmaengine: fsl-edma: disable clks on all error paths Previously enabled clks are only disabled if clk_prepare_enable() fails. However, there are other error paths were the previously enabled clocks are not disabled. To fix the problem, fsl_disable_clocks() now takes the number of clocks that shall be disabled + unprepared. For existing calls were all clocks were already successfully prepared + enabled, DMAMUX_NR is passed to disable + unprepare all clocks. In error paths were only some clocks were successfully prepared + enabled the loop counter is passed, in order to disable + unprepare all successfully prepared + enabled clocks. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Andreas Platschek Signed-off-by: Vinod Koul --- drivers/dma/fsl-edma.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/dma/fsl-edma.c b/drivers/dma/fsl-edma.c index 6775f2c74e25..c7568869284e 100644 --- a/drivers/dma/fsl-edma.c +++ b/drivers/dma/fsl-edma.c @@ -863,11 +863,11 @@ static void fsl_edma_irq_exit( } } -static void fsl_disable_clocks(struct fsl_edma_engine *fsl_edma) +static void fsl_disable_clocks(struct fsl_edma_engine *fsl_edma, int nr_clocks) { int i; - for (i = 0; i < DMAMUX_NR; i++) + for (i = 0; i < nr_clocks; i++) clk_disable_unprepare(fsl_edma->muxclk[i]); } @@ -904,25 +904,25 @@ static int fsl_edma_probe(struct platform_device *pdev) res = platform_get_resource(pdev, IORESOURCE_MEM, 1 + i); fsl_edma->muxbase[i] = devm_ioremap_resource(&pdev->dev, res); - if (IS_ERR(fsl_edma->muxbase[i])) + if (IS_ERR(fsl_edma->muxbase[i])) { + /* on error: disable all previously enabled clks */ + fsl_disable_clocks(fsl_edma, i); return PTR_ERR(fsl_edma->muxbase[i]); + } sprintf(clkname, "dmamux%d", i); fsl_edma->muxclk[i] = devm_clk_get(&pdev->dev, clkname); if (IS_ERR(fsl_edma->muxclk[i])) { dev_err(&pdev->dev, "Missing DMAMUX block clock.\n"); + /* on error: disable all previously enabled clks */ + fsl_disable_clocks(fsl_edma, i); return PTR_ERR(fsl_edma->muxclk[i]); } ret = clk_prepare_enable(fsl_edma->muxclk[i]); - if (ret) { - /* disable only clks which were enabled on error */ - for (; i >= 0; i--) - clk_disable_unprepare(fsl_edma->muxclk[i]); - - dev_err(&pdev->dev, "DMAMUX clk block failed.\n"); - return ret; - } + if (ret) + /* on error: disable all previously enabled clks */ + fsl_disable_clocks(fsl_edma, i); } @@ -976,7 +976,7 @@ static int fsl_edma_probe(struct platform_device *pdev) if (ret) { dev_err(&pdev->dev, "Can't register Freescale eDMA engine. (%d)\n", ret); - fsl_disable_clocks(fsl_edma); + fsl_disable_clocks(fsl_edma, DMAMUX_NR); return ret; } @@ -985,7 +985,7 @@ static int fsl_edma_probe(struct platform_device *pdev) dev_err(&pdev->dev, "Can't register Freescale eDMA of_dma. (%d)\n", ret); dma_async_device_unregister(&fsl_edma->dma_dev); - fsl_disable_clocks(fsl_edma); + fsl_disable_clocks(fsl_edma, DMAMUX_NR); return ret; } @@ -1015,7 +1015,7 @@ static int fsl_edma_remove(struct platform_device *pdev) fsl_edma_cleanup_vchan(&fsl_edma->dma_dev); of_dma_controller_free(np); dma_async_device_unregister(&fsl_edma->dma_dev); - fsl_disable_clocks(fsl_edma); + fsl_disable_clocks(fsl_edma, DMAMUX_NR); return 0; } From cef31d9af908243421258f1df35a4a644604efbe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Dec 2017 10:32:03 +0100 Subject: [PATCH 222/269] posix-timer: Properly check sigevent->sigev_notify timer_create() specifies via sigevent->sigev_notify the signal delivery for the new timer. The valid modes are SIGEV_NONE, SIGEV_SIGNAL, SIGEV_THREAD and (SIGEV_SIGNAL | SIGEV_THREAD_ID). The sanity check in good_sigevent() is only checking the valid combination for the SIGEV_THREAD_ID bit, i.e. SIGEV_SIGNAL, but if SIGEV_THREAD_ID is not set it accepts any random value. This has no real effects on the posix timer and signal delivery code, but it affects show_timer() which handles the output of /proc/$PID/timers. That function uses a string array to pretty print sigev_notify. The access to that array has no bound checks, so random sigev_notify cause access beyond the array bounds. Add proper checks for the valid notify modes and remove the SIGEV_THREAD_ID masking from various code pathes as SIGEV_NONE can never be set in combination with SIGEV_THREAD_ID. Reported-by: Eric Biggers Reported-by: Dmitry Vyukov Reported-by: Alexey Dobriyan Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: stable@vger.kernel.org --- kernel/time/posix-timers.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 13d6881f908b..ec999f32c840 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -434,17 +434,22 @@ static struct pid *good_sigevent(sigevent_t * event) { struct task_struct *rtn = current->group_leader; - if ((event->sigev_notify & SIGEV_THREAD_ID ) && - (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || - !same_thread_group(rtn, current) || - (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) + switch (event->sigev_notify) { + case SIGEV_SIGNAL | SIGEV_THREAD_ID: + rtn = find_task_by_vpid(event->sigev_notify_thread_id); + if (!rtn || !same_thread_group(rtn, current)) + return NULL; + /* FALLTHRU */ + case SIGEV_SIGNAL: + case SIGEV_THREAD: + if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX) + return NULL; + /* FALLTHRU */ + case SIGEV_NONE: + return task_pid(rtn); + default: return NULL; - - if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && - ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) - return NULL; - - return task_pid(rtn); + } } static struct k_itimer * alloc_posix_timer(void) @@ -669,7 +674,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) struct timespec64 ts64; bool sig_none; - sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; + sig_none = timr->it_sigev_notify == SIGEV_NONE; iv = timr->it_interval; /* interval timer ? */ @@ -856,7 +861,7 @@ int common_timer_set(struct k_itimer *timr, int flags, timr->it_interval = timespec64_to_ktime(new_setting->it_interval); expires = timespec64_to_ktime(new_setting->it_value); - sigev_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; + sigev_none = timr->it_sigev_notify == SIGEV_NONE; kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); timr->it_active = !sigev_none; From 090edbe23ff57940fca7f57d9165ce57a826bd7a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Dec 2017 13:19:05 -0800 Subject: [PATCH 223/269] x86/power/64: Use struct desc_ptr for the IDT in struct saved_context x86_64's saved_context nonsensically used separate idt_limit and idt_base fields and then cast &idt_limit to struct desc_ptr *. This was correct (with -fno-strict-aliasing), but it's confusing, served no purpose, and required #ifdeffery. Simplify this by using struct desc_ptr directly. No change in functionality. Tested-by: Jarkko Nikula Signed-off-by: Andy Lutomirski Acked-by: Rafael J. Wysocki Acked-by: Thomas Gleixner Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Pavel Machek Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Zhang Rui Link: http://lkml.kernel.org/r/967909ce38d341b01d45eff53e278e2728a3a93a.1513286253.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/suspend_64.h | 3 +-- arch/x86/power/cpu.c | 11 +---------- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index 7306e911faee..600e9e0aea51 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -30,8 +30,7 @@ struct saved_context { u16 gdt_pad; /* Unused */ struct desc_ptr gdt_desc; u16 idt_pad; - u16 idt_limit; - unsigned long idt_base; + struct desc_ptr idt; u16 ldt; u16 tss; unsigned long tr; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 5191de14f4df..472bc8c8212b 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -82,12 +82,8 @@ static void __save_processor_state(struct saved_context *ctxt) /* * descriptor tables */ -#ifdef CONFIG_X86_32 store_idt(&ctxt->idt); -#else -/* CONFIG_X86_64 */ - store_idt((struct desc_ptr *)&ctxt->idt_limit); -#endif + /* * We save it here, but restore it only in the hibernate case. * For ACPI S3 resume, this is loaded via 'early_gdt_desc' in 64-bit @@ -219,12 +215,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) * now restore the descriptor tables to their proper values * ltr is done i fix_processor_context(). */ -#ifdef CONFIG_X86_32 load_idt(&ctxt->idt); -#else -/* CONFIG_X86_64 */ - load_idt((const struct desc_ptr *)&ctxt->idt_limit); -#endif #ifdef CONFIG_X86_64 /* From 896c80bef4d3b357814a476663158aaf669d0fb3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Dec 2017 13:19:06 -0800 Subject: [PATCH 224/269] x86/power/32: Move SYSENTER MSR restoration to fix_processor_context() x86_64 restores system call MSRs in fix_processor_context(), and x86_32 restored them along with segment registers. The 64-bit variant makes more sense, so move the 32-bit code to match the 64-bit code. No side effects are expected to runtime behavior. Tested-by: Jarkko Nikula Signed-off-by: Andy Lutomirski Acked-by: Rafael J. Wysocki Acked-by: Thomas Gleixner Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Pavel Machek Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Zhang Rui Link: http://lkml.kernel.org/r/65158f8d7ee64dd6bbc6c1c83b3b34aaa854e3ae.1513286253.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/power/cpu.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 472bc8c8212b..033c61e6891b 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -174,6 +174,9 @@ static void fix_processor_context(void) write_gdt_entry(desc, GDT_ENTRY_TSS, &tss, DESC_TSS); syscall_init(); /* This sets MSR_*STAR and related */ +#else + if (boot_cpu_has(X86_FEATURE_SEP)) + enable_sep_cpu(); #endif load_TR_desc(); /* This does ltr */ load_mm_ldt(current->active_mm); /* This does lldt */ @@ -237,12 +240,6 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) loadsegment(fs, ctxt->fs); loadsegment(gs, ctxt->gs); loadsegment(ss, ctxt->ss); - - /* - * sysenter MSRs - */ - if (boot_cpu_has(X86_FEATURE_SEP)) - enable_sep_cpu(); #else /* CONFIG_X86_64 */ asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); From 7ee18d677989e99635027cee04c878950e0752b9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Dec 2017 13:19:07 -0800 Subject: [PATCH 225/269] x86/power: Make restore_processor_context() sane My previous attempt to fix a couple of bugs in __restore_processor_context(): 5b06bbcfc2c6 ("x86/power: Fix some ordering bugs in __restore_processor_context()") ... introduced yet another bug, breaking suspend-resume. Rather than trying to come up with a minimal fix, let's try to clean it up for real. This patch fixes quite a few things: - The old code saved a nonsensical subset of segment registers. The only registers that need to be saved are those that contain userspace state or those that can't be trivially restored without percpu access working. (On x86_32, we can restore percpu access by writing __KERNEL_PERCPU to %fs. On x86_64, it's easier to save and restore the kernel's GSBASE.) With this patch, we restore hardcoded values to the kernel state where applicable and explicitly restore the user state after fixing all the descriptor tables. - We used to use an unholy mix of inline asm and C helpers for segment register access. Let's get rid of the inline asm. This fixes the reported s2ram hangs and make the code all around more logical. Analyzed-by: Linus Torvalds Reported-by: Jarkko Nikula Reported-by: Pavel Machek Tested-by: Jarkko Nikula Tested-by: Pavel Machek Signed-off-by: Andy Lutomirski Acked-by: Rafael J. Wysocki Acked-by: Thomas Gleixner Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Zhang Rui Fixes: 5b06bbcfc2c6 ("x86/power: Fix some ordering bugs in __restore_processor_context()") Link: http://lkml.kernel.org/r/398ee68e5c0f766425a7b746becfc810840770ff.1513286253.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/suspend_32.h | 8 ++- arch/x86/include/asm/suspend_64.h | 16 +++++- arch/x86/power/cpu.c | 81 ++++++++++++++++--------------- 3 files changed, 63 insertions(+), 42 deletions(-) diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index 982c325dad33..8be6afb58471 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -12,7 +12,13 @@ /* image of the saved processor state */ struct saved_context { - u16 es, fs, gs, ss; + /* + * On x86_32, all segment registers, with the possible exception of + * gs, are saved at kernel entry in pt_regs. + */ +#ifdef CONFIG_X86_32_LAZY_GS + u16 gs; +#endif unsigned long cr0, cr2, cr3, cr4; u64 misc_enable; bool misc_enable_saved; diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index 600e9e0aea51..a7af9f53c0cb 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -20,8 +20,20 @@ */ struct saved_context { struct pt_regs regs; - u16 ds, es, fs, gs, ss; - unsigned long gs_base, gs_kernel_base, fs_base; + + /* + * User CS and SS are saved in current_pt_regs(). The rest of the + * segment selectors need to be saved and restored here. + */ + u16 ds, es, fs, gs; + + /* + * Usermode FSBASE and GSBASE may not match the fs and gs selectors, + * so we save them separately. We save the kernelmode GSBASE to + * restore percpu access after resume. + */ + unsigned long kernelmode_gs_base, usermode_gs_base, fs_base; + unsigned long cr0, cr2, cr3, cr4, cr8; u64 misc_enable; bool misc_enable_saved; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 033c61e6891b..36a28eddb435 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -99,22 +99,18 @@ static void __save_processor_state(struct saved_context *ctxt) /* * segment registers */ -#ifdef CONFIG_X86_32 - savesegment(es, ctxt->es); - savesegment(fs, ctxt->fs); +#ifdef CONFIG_X86_32_LAZY_GS savesegment(gs, ctxt->gs); - savesegment(ss, ctxt->ss); -#else -/* CONFIG_X86_64 */ - asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); - asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); - asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); - asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); - asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); +#endif +#ifdef CONFIG_X86_64 + savesegment(gs, ctxt->gs); + savesegment(fs, ctxt->fs); + savesegment(ds, ctxt->ds); + savesegment(es, ctxt->es); rdmsrl(MSR_FS_BASE, ctxt->fs_base); - rdmsrl(MSR_GS_BASE, ctxt->gs_base); - rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); + rdmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base); + rdmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base); mtrr_save_fixed_ranges(NULL); rdmsrl(MSR_EFER, ctxt->efer); @@ -189,9 +185,12 @@ static void fix_processor_context(void) } /** - * __restore_processor_state - restore the contents of CPU registers saved - * by __save_processor_state() - * @ctxt - structure to load the registers contents from + * __restore_processor_state - restore the contents of CPU registers saved + * by __save_processor_state() + * @ctxt - structure to load the registers contents from + * + * The asm code that gets us here will have restored a usable GDT, although + * it will be pointing to the wrong alias. */ static void notrace __restore_processor_state(struct saved_context *ctxt) { @@ -214,46 +213,50 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) write_cr2(ctxt->cr2); write_cr0(ctxt->cr0); - /* - * now restore the descriptor tables to their proper values - * ltr is done i fix_processor_context(). - */ + /* Restore the IDT. */ load_idt(&ctxt->idt); -#ifdef CONFIG_X86_64 /* - * We need GSBASE restored before percpu access can work. - * percpu access can happen in exception handlers or in complicated - * helpers like load_gs_index(). + * Just in case the asm code got us here with the SS, DS, or ES + * out of sync with the GDT, update them. */ - wrmsrl(MSR_GS_BASE, ctxt->gs_base); + loadsegment(ss, __KERNEL_DS); + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); + + /* + * Restore percpu access. Percpu access can happen in exception + * handlers or in complicated helpers like load_gs_index(). + */ +#ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base); +#else + loadsegment(fs, __KERNEL_PERCPU); + loadsegment(gs, __KERNEL_STACK_CANARY); #endif + /* Restore the TSS, RO GDT, LDT, and usermode-relevant MSRs. */ fix_processor_context(); /* - * Restore segment registers. This happens after restoring the GDT - * and LDT, which happen in fix_processor_context(). + * Now that we have descriptor tables fully restored and working + * exception handling, restore the usermode segments. */ -#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_64 + loadsegment(ds, ctxt->es); loadsegment(es, ctxt->es); loadsegment(fs, ctxt->fs); - loadsegment(gs, ctxt->gs); - loadsegment(ss, ctxt->ss); -#else -/* CONFIG_X86_64 */ - asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); - asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); - asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); load_gs_index(ctxt->gs); - asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); /* - * Restore FSBASE and user GSBASE after reloading the respective - * segment selectors. + * Restore FSBASE and GSBASE after restoring the selectors, since + * restoring the selectors clobbers the bases. Keep in mind + * that MSR_KERNEL_GS_BASE is horribly misnamed. */ wrmsrl(MSR_FS_BASE, ctxt->fs_base); - wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); + wrmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base); +#elif defined(CONFIG_X86_32_LAZY_GS) + loadsegment(gs, ctxt->gs); #endif do_fpu_end(); From f5b5fab1780c98b74526dbac527574bd02dc16f8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 11 Dec 2017 10:38:36 -0800 Subject: [PATCH 226/269] x86/decoder: Fix and update the opcodes map Update x86-opcode-map.txt based on the October 2017 Intel SDM publication. Fix INVPID to INVVPID. Add UD0 and UD1 instruction opcodes. Also sync the objtool and perf tooling copies of this file. Signed-off-by: Randy Dunlap Acked-by: Masami Hiramatsu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/aac062d7-c0f6-96e3-5c92-ed299e2bd3da@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/lib/x86-opcode-map.txt | 13 +++++++++++-- tools/objtool/arch/x86/insn/x86-opcode-map.txt | 15 ++++++++++++--- .../perf/util/intel-pt-decoder/x86-opcode-map.txt | 15 ++++++++++++--- 3 files changed, 35 insertions(+), 8 deletions(-) diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index c4d55919fac1..e0b85930dd77 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -607,7 +607,7 @@ fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1) fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1) fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1) fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1) -ff: +ff: UD0 EndTable Table: 3-byte opcode 1 (0x0f 0x38) @@ -717,7 +717,7 @@ AVXcode: 2 7e: vpermt2d/q Vx,Hx,Wx (66),(ev) 7f: vpermt2ps/d Vx,Hx,Wx (66),(ev) 80: INVEPT Gy,Mdq (66) -81: INVPID Gy,Mdq (66) +81: INVVPID Gy,Mdq (66) 82: INVPCID Gy,Mdq (66) 83: vpmultishiftqb Vx,Hx,Wx (66),(ev) 88: vexpandps/d Vpd,Wpd (66),(ev) @@ -970,6 +970,15 @@ GrpTable: Grp9 EndTable GrpTable: Grp10 +# all are UD1 +0: UD1 +1: UD1 +2: UD1 +3: UD1 +4: UD1 +5: UD1 +6: UD1 +7: UD1 EndTable # Grp11A and Grp11B are expressed as Grp11 in Intel SDM diff --git a/tools/objtool/arch/x86/insn/x86-opcode-map.txt b/tools/objtool/arch/x86/insn/x86-opcode-map.txt index 12e377184ee4..e0b85930dd77 100644 --- a/tools/objtool/arch/x86/insn/x86-opcode-map.txt +++ b/tools/objtool/arch/x86/insn/x86-opcode-map.txt @@ -607,7 +607,7 @@ fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1) fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1) fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1) fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1) -ff: +ff: UD0 EndTable Table: 3-byte opcode 1 (0x0f 0x38) @@ -717,7 +717,7 @@ AVXcode: 2 7e: vpermt2d/q Vx,Hx,Wx (66),(ev) 7f: vpermt2ps/d Vx,Hx,Wx (66),(ev) 80: INVEPT Gy,Mdq (66) -81: INVPID Gy,Mdq (66) +81: INVVPID Gy,Mdq (66) 82: INVPCID Gy,Mdq (66) 83: vpmultishiftqb Vx,Hx,Wx (66),(ev) 88: vexpandps/d Vpd,Wpd (66),(ev) @@ -896,7 +896,7 @@ EndTable GrpTable: Grp3_1 0: TEST Eb,Ib -1: +1: TEST Eb,Ib 2: NOT Eb 3: NEG Eb 4: MUL AL,Eb @@ -970,6 +970,15 @@ GrpTable: Grp9 EndTable GrpTable: Grp10 +# all are UD1 +0: UD1 +1: UD1 +2: UD1 +3: UD1 +4: UD1 +5: UD1 +6: UD1 +7: UD1 EndTable # Grp11A and Grp11B are expressed as Grp11 in Intel SDM diff --git a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt index 12e377184ee4..e0b85930dd77 100644 --- a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt +++ b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt @@ -607,7 +607,7 @@ fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1) fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1) fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1) fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1) -ff: +ff: UD0 EndTable Table: 3-byte opcode 1 (0x0f 0x38) @@ -717,7 +717,7 @@ AVXcode: 2 7e: vpermt2d/q Vx,Hx,Wx (66),(ev) 7f: vpermt2ps/d Vx,Hx,Wx (66),(ev) 80: INVEPT Gy,Mdq (66) -81: INVPID Gy,Mdq (66) +81: INVVPID Gy,Mdq (66) 82: INVPCID Gy,Mdq (66) 83: vpmultishiftqb Vx,Hx,Wx (66),(ev) 88: vexpandps/d Vpd,Wpd (66),(ev) @@ -896,7 +896,7 @@ EndTable GrpTable: Grp3_1 0: TEST Eb,Ib -1: +1: TEST Eb,Ib 2: NOT Eb 3: NEG Eb 4: MUL AL,Eb @@ -970,6 +970,15 @@ GrpTable: Grp9 EndTable GrpTable: Grp10 +# all are UD1 +0: UD1 +1: UD1 +2: UD1 +3: UD1 +4: UD1 +5: UD1 +6: UD1 +7: UD1 EndTable # Grp11A and Grp11B are expressed as Grp11 in Intel SDM From 215eada73e77ede7e15531d99f712481ddd429be Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 15 Dec 2017 13:36:56 +0100 Subject: [PATCH 227/269] objtool: Resync objtool's instruction decoder source code copy with the kernel's latest version This fixes the following warning: warning: objtool: x86 instruction decoder differs from kernel Note that there are cleanups queued up for v4.16 that will make this warning more informative and will make the syncing easier as well. Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- tools/objtool/arch/x86/insn/inat.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/objtool/arch/x86/insn/inat.h b/tools/objtool/arch/x86/insn/inat.h index 125ecd2a300d..52dc8d911173 100644 --- a/tools/objtool/arch/x86/insn/inat.h +++ b/tools/objtool/arch/x86/insn/inat.h @@ -97,6 +97,16 @@ #define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM) #define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS) +/* Identifiers for segment registers */ +#define INAT_SEG_REG_IGNORE 0 +#define INAT_SEG_REG_DEFAULT 1 +#define INAT_SEG_REG_CS 2 +#define INAT_SEG_REG_SS 3 +#define INAT_SEG_REG_DS 4 +#define INAT_SEG_REG_ES 5 +#define INAT_SEG_REG_FS 6 +#define INAT_SEG_REG_GS 7 + /* Attribute search APIs */ extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode); extern int inat_get_last_prefix_id(insn_byte_t last_pfx); From 643e345c95f0b4a4082c60755e06e3e635658da6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 15 Dec 2017 13:47:51 +0100 Subject: [PATCH 228/269] tools/headers: Synchronize kernel <-> tooling headers Two kernel headers got modified recently, which are used by tooling as well: tools/include/uapi/linux/kvm.h arch/x86/include/asm/cpufeatures.h None of those changes have an effect on tooling, so do a plain copy. Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Namhyung Kim Cc: Jiri Olsa Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- tools/arch/x86/include/asm/cpufeatures.h | 1 + tools/include/uapi/linux/kvm.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index c0b0e9e8aa66..800104c8a3ed 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -266,6 +266,7 @@ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ +#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 282d7613fce8..496e59a2738b 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -630,9 +630,9 @@ struct kvm_s390_irq { struct kvm_s390_irq_state { __u64 buf; - __u32 flags; + __u32 flags; /* will stay unused for compatibility reasons */ __u32 len; - __u32 reserved[4]; + __u32 reserved[4]; /* will stay unused for compatibility reasons */ }; /* for KVM_SET_GUEST_DEBUG */ From f73c52a5bcd1710994e53fbccc378c42b97a06b6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 2 Dec 2017 13:04:54 -0500 Subject: [PATCH 229/269] sched/rt: Do not pull from current CPU if only one CPU to pull Daniel Wagner reported a crash on the BeagleBone Black SoC. This is a single CPU architecture, and does not have a functional arch_send_call_function_single_ipi() implementation which can crash the kernel if that is called. As it only has one CPU, it shouldn't be called, but if the kernel is compiled for SMP, the push/pull RT scheduling logic now calls it for irq_work if the one CPU is overloaded, it can use that function to call itself and crash the kernel. Ideally, we should disable the SCHED_FEAT(RT_PUSH_IPI) if the system only has a single CPU. But SCHED_FEAT is a constant if sched debugging is turned off. Another fix can also be used, and this should also help with normal SMP machines. That is, do not initiate the pull code if there's only one RT overloaded CPU, and that CPU happens to be the current CPU that is scheduling in a lower priority task. Even on a system with many CPUs, if there's many RT tasks waiting to run on a single CPU, and that CPU schedules in another RT task of lower priority, it will initiate the PULL logic in case there's a higher priority RT task on another CPU that is waiting to run. But if there is no other CPU with waiting RT tasks, it will initiate the RT pull logic on itself (as it still has RT tasks waiting to run). This is a wasted effort. Not only does this help with SMP code where the current CPU is the only one with RT overloaded tasks, it should also solve the issue that Daniel encountered, because it will prevent the PULL logic from executing, as there's only one CPU on the system, and the check added here will cause it to exit the RT pull code. Reported-by: Daniel Wagner Signed-off-by: Steven Rostedt (VMware) Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Cc: linux-rt-users Cc: stable@vger.kernel.org Fixes: 4bdced5c9 ("sched/rt: Simplify the IPI based RT balancing logic") Link: http://lkml.kernel.org/r/20171202130454.4cbbfe8d@vmware.local.home Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4056c19ca3f0..665ace2fc558 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2034,8 +2034,9 @@ static void pull_rt_task(struct rq *this_rq) bool resched = false; struct task_struct *p; struct rq *src_rq; + int rt_overload_count = rt_overloaded(this_rq); - if (likely(!rt_overloaded(this_rq))) + if (likely(!rt_overload_count)) return; /* @@ -2044,6 +2045,11 @@ static void pull_rt_task(struct rq *this_rq) */ smp_rmb(); + /* If we are the only overloaded CPU do nothing */ + if (rt_overload_count == 1 && + cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask)) + return; + #ifdef HAVE_RT_PUSH_IPI if (sched_feat(RT_PUSH_IPI)) { tell_cpu_to_push(this_rq); From fccff0862838908d21eaf956d57e09c6c189f7c5 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Fri, 15 Dec 2017 08:44:21 +0100 Subject: [PATCH 230/269] mlxsw: spectrum: Disable MAC learning for ovs port Learning is currently enabled for ports which are OVS slaves - even though OVS doesn't need this indication. Since we're not associating a fid with the port, HW would continuously notify driver of learned [& aged] MACs which would be logged as errors. Fixes: 2b94e58df58c ("mlxsw: spectrum: Allow ports to work under OVS master") Signed-off-by: Yuval Mintz Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 2d0897b7d860..9bd8d28de152 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -4300,6 +4300,7 @@ static int mlxsw_sp_port_stp_set(struct mlxsw_sp_port *mlxsw_sp_port, static int mlxsw_sp_port_ovs_join(struct mlxsw_sp_port *mlxsw_sp_port) { + u16 vid = 1; int err; err = mlxsw_sp_port_vp_mode_set(mlxsw_sp_port, true); @@ -4312,8 +4313,19 @@ static int mlxsw_sp_port_ovs_join(struct mlxsw_sp_port *mlxsw_sp_port) true, false); if (err) goto err_port_vlan_set; + + for (; vid <= VLAN_N_VID - 1; vid++) { + err = mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, + vid, false); + if (err) + goto err_vid_learning_set; + } + return 0; +err_vid_learning_set: + for (vid--; vid >= 1; vid--) + mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, true); err_port_vlan_set: mlxsw_sp_port_stp_set(mlxsw_sp_port, false); err_port_stp_set: @@ -4323,6 +4335,12 @@ err_port_stp_set: static void mlxsw_sp_port_ovs_leave(struct mlxsw_sp_port *mlxsw_sp_port) { + u16 vid; + + for (vid = VLAN_N_VID - 1; vid >= 1; vid--) + mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, + vid, true); + mlxsw_sp_port_vlan_set(mlxsw_sp_port, 2, VLAN_N_VID - 1, false, false); mlxsw_sp_port_stp_set(mlxsw_sp_port, false); From a4544831370618cb3627e27ffcc27d1cc857868f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 15 Dec 2017 16:07:22 +0000 Subject: [PATCH 231/269] arm64: fpsimd: Fix copying of FP state from signal frame into task struct Commit 9de52a755cfb6da5 ("arm64: fpsimd: Fix failure to restore FPSIMD state after signals") fixed an issue reported in our FPSIMD signal restore code but inadvertently introduced another issue which tends to manifest as random SEGVs in userspace. The problem is that when we copy the struct fpsimd_state from the kernel stack (populated from the signal frame) into the struct held in the current thread_struct, we blindly copy uninitialised stack into the "cpu" field, which means that context-switching of the FP registers is no longer reliable. This patch fixes the problem by copying only the user_fpsimd member of struct fpsimd_state. We should really rework the function prototypes to take struct user_fpsimd_state * instead, but let's just get this fixed for now. Cc: Dave Martin Fixes: 9de52a755cfb6da5 ("arm64: fpsimd: Fix failure to restore FPSIMD state after signals") Reported-by: Geert Uytterhoeven Signed-off-by: Will Deacon --- arch/arm64/kernel/fpsimd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 540a1e010eb5..fae81f7964b4 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1043,7 +1043,7 @@ void fpsimd_update_current_state(struct fpsimd_state *state) local_bh_disable(); - current->thread.fpsimd_state = *state; + current->thread.fpsimd_state.user_fpsimd = state->user_fpsimd; if (system_supports_sve() && test_thread_flag(TIF_SVE)) fpsimd_to_sve(current); From 7fbd9493f0eeae8cef58300505a9ef5c8fce6313 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Wed, 13 Dec 2017 18:56:29 +0100 Subject: [PATCH 232/269] s390/qeth: apply takeover changes when mode is toggled Just as for an explicit enable/disable, toggling the takeover mode also requires that the IP addresses get updated. Otherwise all IPs that were added to the table before the mode-toggle, get registered with the old settings. Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core.h | 2 +- drivers/s390/net/qeth_core_main.c | 2 +- drivers/s390/net/qeth_l3_sys.c | 35 +++++++++++++++---------------- 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h index 15015a24f8ad..51c618d9fefe 100644 --- a/drivers/s390/net/qeth_core.h +++ b/drivers/s390/net/qeth_core.h @@ -565,7 +565,7 @@ enum qeth_cq { }; struct qeth_ipato { - int enabled; + bool enabled; int invert4; int invert6; struct list_head entries; diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 430e3214f7e2..8d18675e60e2 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -1480,7 +1480,7 @@ static int qeth_setup_card(struct qeth_card *card) qeth_set_intial_options(card); /* IP address takeover */ INIT_LIST_HEAD(&card->ipato.entries); - card->ipato.enabled = 0; + card->ipato.enabled = false; card->ipato.invert4 = 0; card->ipato.invert6 = 0; /* init QDIO stuff */ diff --git a/drivers/s390/net/qeth_l3_sys.c b/drivers/s390/net/qeth_l3_sys.c index bd12fdf678be..198717f71b3d 100644 --- a/drivers/s390/net/qeth_l3_sys.c +++ b/drivers/s390/net/qeth_l3_sys.c @@ -372,6 +372,7 @@ static ssize_t qeth_l3_dev_ipato_enable_store(struct device *dev, struct qeth_card *card = dev_get_drvdata(dev); struct qeth_ipaddr *addr; int i, rc = 0; + bool enable; if (!card) return -EINVAL; @@ -384,25 +385,23 @@ static ssize_t qeth_l3_dev_ipato_enable_store(struct device *dev, } if (sysfs_streq(buf, "toggle")) { - card->ipato.enabled = (card->ipato.enabled)? 0 : 1; - } else if (sysfs_streq(buf, "1")) { - card->ipato.enabled = 1; - hash_for_each(card->ip_htable, i, addr, hnode) { - if ((addr->type == QETH_IP_TYPE_NORMAL) && - qeth_l3_is_addr_covered_by_ipato(card, addr)) - addr->set_flags |= - QETH_IPA_SETIP_TAKEOVER_FLAG; - } - } else if (sysfs_streq(buf, "0")) { - card->ipato.enabled = 0; - hash_for_each(card->ip_htable, i, addr, hnode) { - if (addr->set_flags & - QETH_IPA_SETIP_TAKEOVER_FLAG) - addr->set_flags &= - ~QETH_IPA_SETIP_TAKEOVER_FLAG; - } - } else + enable = !card->ipato.enabled; + } else if (kstrtobool(buf, &enable)) { rc = -EINVAL; + goto out; + } + + if (card->ipato.enabled == enable) + goto out; + card->ipato.enabled = enable; + + hash_for_each(card->ip_htable, i, addr, hnode) { + if (!enable) + addr->set_flags &= ~QETH_IPA_SETIP_TAKEOVER_FLAG; + else if (addr->type == QETH_IP_TYPE_NORMAL && + qeth_l3_is_addr_covered_by_ipato(card, addr)) + addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG; + } out: mutex_unlock(&card->conf_mutex); return rc ? rc : count; From b22d73d6689fd902a66c08ebe71ab2f3b351e22f Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Wed, 13 Dec 2017 18:56:30 +0100 Subject: [PATCH 233/269] s390/qeth: don't apply takeover changes to RXIP When takeover is switched off, current code clears the 'TAKEOVER' flag on all IPs. But the flag is also used for RXIP addresses, and those should not be affected by the takeover mode. Fix the behaviour by consistenly applying takover logic to NORMAL addresses only. Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_l3_main.c | 5 +++-- drivers/s390/net/qeth_l3_sys.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 6a73894b0cb5..4a4be81800eb 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -174,6 +174,8 @@ int qeth_l3_is_addr_covered_by_ipato(struct qeth_card *card, if (!card->ipato.enabled) return 0; + if (addr->type != QETH_IP_TYPE_NORMAL) + return 0; qeth_l3_convert_addr_to_bits((u8 *) &addr->u, addr_bits, (addr->proto == QETH_PROT_IPV4)? 4:16); @@ -290,8 +292,7 @@ int qeth_l3_add_ip(struct qeth_card *card, struct qeth_ipaddr *tmp_addr) memcpy(addr, tmp_addr, sizeof(struct qeth_ipaddr)); addr->ref_counter = 1; - if (addr->type == QETH_IP_TYPE_NORMAL && - qeth_l3_is_addr_covered_by_ipato(card, addr)) { + if (qeth_l3_is_addr_covered_by_ipato(card, addr)) { QETH_CARD_TEXT(card, 2, "tkovaddr"); addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG; } diff --git a/drivers/s390/net/qeth_l3_sys.c b/drivers/s390/net/qeth_l3_sys.c index 198717f71b3d..e256928092e5 100644 --- a/drivers/s390/net/qeth_l3_sys.c +++ b/drivers/s390/net/qeth_l3_sys.c @@ -396,10 +396,11 @@ static ssize_t qeth_l3_dev_ipato_enable_store(struct device *dev, card->ipato.enabled = enable; hash_for_each(card->ip_htable, i, addr, hnode) { + if (addr->type != QETH_IP_TYPE_NORMAL) + continue; if (!enable) addr->set_flags &= ~QETH_IPA_SETIP_TAKEOVER_FLAG; - else if (addr->type == QETH_IP_TYPE_NORMAL && - qeth_l3_is_addr_covered_by_ipato(card, addr)) + else if (qeth_l3_is_addr_covered_by_ipato(card, addr)) addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG; } out: From 8a03a3692b100d84785ee7a834e9215e304c9e00 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Wed, 13 Dec 2017 18:56:31 +0100 Subject: [PATCH 234/269] s390/qeth: lock IP table while applying takeover changes Modifying the flags of an IP addr object needs to be protected against eg. concurrent removal of the same object from the IP table. Fixes: 5f78e29ceebf ("qeth: optimize IP handling in rx_mode callback") Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_l3_sys.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/s390/net/qeth_l3_sys.c b/drivers/s390/net/qeth_l3_sys.c index e256928092e5..aa676b4090da 100644 --- a/drivers/s390/net/qeth_l3_sys.c +++ b/drivers/s390/net/qeth_l3_sys.c @@ -395,6 +395,7 @@ static ssize_t qeth_l3_dev_ipato_enable_store(struct device *dev, goto out; card->ipato.enabled = enable; + spin_lock_bh(&card->ip_lock); hash_for_each(card->ip_htable, i, addr, hnode) { if (addr->type != QETH_IP_TYPE_NORMAL) continue; @@ -403,6 +404,7 @@ static ssize_t qeth_l3_dev_ipato_enable_store(struct device *dev, else if (qeth_l3_is_addr_covered_by_ipato(card, addr)) addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG; } + spin_unlock_bh(&card->ip_lock); out: mutex_unlock(&card->conf_mutex); return rc ? rc : count; From 02f510f326501470348a5df341e8232c3497bbbb Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Wed, 13 Dec 2017 18:56:32 +0100 Subject: [PATCH 235/269] s390/qeth: update takeover IPs after configuration change Any modification to the takeover IP-ranges requires that we re-evaluate which IP addresses are takeover-eligible. Otherwise we might do takeover for some addresses when we no longer should, or vice-versa. Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core.h | 4 +- drivers/s390/net/qeth_core_main.c | 4 +- drivers/s390/net/qeth_l3.h | 2 +- drivers/s390/net/qeth_l3_main.c | 31 +++++++++++++-- drivers/s390/net/qeth_l3_sys.c | 63 +++++++++++++++++-------------- 5 files changed, 67 insertions(+), 37 deletions(-) diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h index 51c618d9fefe..badf42acbf95 100644 --- a/drivers/s390/net/qeth_core.h +++ b/drivers/s390/net/qeth_core.h @@ -566,8 +566,8 @@ enum qeth_cq { struct qeth_ipato { bool enabled; - int invert4; - int invert6; + bool invert4; + bool invert6; struct list_head entries; }; diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 8d18675e60e2..6c815207f4f5 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -1481,8 +1481,8 @@ static int qeth_setup_card(struct qeth_card *card) /* IP address takeover */ INIT_LIST_HEAD(&card->ipato.entries); card->ipato.enabled = false; - card->ipato.invert4 = 0; - card->ipato.invert6 = 0; + card->ipato.invert4 = false; + card->ipato.invert6 = false; /* init QDIO stuff */ qeth_init_qdio_info(card); INIT_DELAYED_WORK(&card->buffer_reclaim_work, qeth_buffer_reclaim_work); diff --git a/drivers/s390/net/qeth_l3.h b/drivers/s390/net/qeth_l3.h index 194ae9b577cc..e5833837b799 100644 --- a/drivers/s390/net/qeth_l3.h +++ b/drivers/s390/net/qeth_l3.h @@ -82,7 +82,7 @@ void qeth_l3_del_vipa(struct qeth_card *, enum qeth_prot_versions, const u8 *); int qeth_l3_add_rxip(struct qeth_card *, enum qeth_prot_versions, const u8 *); void qeth_l3_del_rxip(struct qeth_card *card, enum qeth_prot_versions, const u8 *); -int qeth_l3_is_addr_covered_by_ipato(struct qeth_card *, struct qeth_ipaddr *); +void qeth_l3_update_ipato(struct qeth_card *card); struct qeth_ipaddr *qeth_l3_get_addr_buffer(enum qeth_prot_versions); int qeth_l3_add_ip(struct qeth_card *, struct qeth_ipaddr *); int qeth_l3_delete_ip(struct qeth_card *, struct qeth_ipaddr *); diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 4a4be81800eb..ef0961e18686 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -164,8 +164,8 @@ static void qeth_l3_convert_addr_to_bits(u8 *addr, u8 *bits, int len) } } -int qeth_l3_is_addr_covered_by_ipato(struct qeth_card *card, - struct qeth_ipaddr *addr) +static bool qeth_l3_is_addr_covered_by_ipato(struct qeth_card *card, + struct qeth_ipaddr *addr) { struct qeth_ipato_entry *ipatoe; u8 addr_bits[128] = {0, }; @@ -606,6 +606,27 @@ int qeth_l3_setrouting_v6(struct qeth_card *card) /* * IP address takeover related functions */ + +/** + * qeth_l3_update_ipato() - Update 'takeover' property, for all NORMAL IPs. + * + * Caller must hold ip_lock. + */ +void qeth_l3_update_ipato(struct qeth_card *card) +{ + struct qeth_ipaddr *addr; + unsigned int i; + + hash_for_each(card->ip_htable, i, addr, hnode) { + if (addr->type != QETH_IP_TYPE_NORMAL) + continue; + if (qeth_l3_is_addr_covered_by_ipato(card, addr)) + addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG; + else + addr->set_flags &= ~QETH_IPA_SETIP_TAKEOVER_FLAG; + } +} + static void qeth_l3_clear_ipato_list(struct qeth_card *card) { struct qeth_ipato_entry *ipatoe, *tmp; @@ -617,6 +638,7 @@ static void qeth_l3_clear_ipato_list(struct qeth_card *card) kfree(ipatoe); } + qeth_l3_update_ipato(card); spin_unlock_bh(&card->ip_lock); } @@ -641,8 +663,10 @@ int qeth_l3_add_ipato_entry(struct qeth_card *card, } } - if (!rc) + if (!rc) { list_add_tail(&new->entry, &card->ipato.entries); + qeth_l3_update_ipato(card); + } spin_unlock_bh(&card->ip_lock); @@ -665,6 +689,7 @@ void qeth_l3_del_ipato_entry(struct qeth_card *card, (proto == QETH_PROT_IPV4)? 4:16) && (ipatoe->mask_bits == mask_bits)) { list_del(&ipatoe->entry); + qeth_l3_update_ipato(card); kfree(ipatoe); } } diff --git a/drivers/s390/net/qeth_l3_sys.c b/drivers/s390/net/qeth_l3_sys.c index aa676b4090da..6ea2b528a64e 100644 --- a/drivers/s390/net/qeth_l3_sys.c +++ b/drivers/s390/net/qeth_l3_sys.c @@ -370,9 +370,8 @@ static ssize_t qeth_l3_dev_ipato_enable_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct qeth_card *card = dev_get_drvdata(dev); - struct qeth_ipaddr *addr; - int i, rc = 0; bool enable; + int rc = 0; if (!card) return -EINVAL; @@ -391,20 +390,12 @@ static ssize_t qeth_l3_dev_ipato_enable_store(struct device *dev, goto out; } - if (card->ipato.enabled == enable) - goto out; - card->ipato.enabled = enable; - - spin_lock_bh(&card->ip_lock); - hash_for_each(card->ip_htable, i, addr, hnode) { - if (addr->type != QETH_IP_TYPE_NORMAL) - continue; - if (!enable) - addr->set_flags &= ~QETH_IPA_SETIP_TAKEOVER_FLAG; - else if (qeth_l3_is_addr_covered_by_ipato(card, addr)) - addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG; + if (card->ipato.enabled != enable) { + card->ipato.enabled = enable; + spin_lock_bh(&card->ip_lock); + qeth_l3_update_ipato(card); + spin_unlock_bh(&card->ip_lock); } - spin_unlock_bh(&card->ip_lock); out: mutex_unlock(&card->conf_mutex); return rc ? rc : count; @@ -430,20 +421,27 @@ static ssize_t qeth_l3_dev_ipato_invert4_store(struct device *dev, const char *buf, size_t count) { struct qeth_card *card = dev_get_drvdata(dev); + bool invert; int rc = 0; if (!card) return -EINVAL; mutex_lock(&card->conf_mutex); - if (sysfs_streq(buf, "toggle")) - card->ipato.invert4 = (card->ipato.invert4)? 0 : 1; - else if (sysfs_streq(buf, "1")) - card->ipato.invert4 = 1; - else if (sysfs_streq(buf, "0")) - card->ipato.invert4 = 0; - else + if (sysfs_streq(buf, "toggle")) { + invert = !card->ipato.invert4; + } else if (kstrtobool(buf, &invert)) { rc = -EINVAL; + goto out; + } + + if (card->ipato.invert4 != invert) { + card->ipato.invert4 = invert; + spin_lock_bh(&card->ip_lock); + qeth_l3_update_ipato(card); + spin_unlock_bh(&card->ip_lock); + } +out: mutex_unlock(&card->conf_mutex); return rc ? rc : count; } @@ -609,20 +607,27 @@ static ssize_t qeth_l3_dev_ipato_invert6_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct qeth_card *card = dev_get_drvdata(dev); + bool invert; int rc = 0; if (!card) return -EINVAL; mutex_lock(&card->conf_mutex); - if (sysfs_streq(buf, "toggle")) - card->ipato.invert6 = (card->ipato.invert6)? 0 : 1; - else if (sysfs_streq(buf, "1")) - card->ipato.invert6 = 1; - else if (sysfs_streq(buf, "0")) - card->ipato.invert6 = 0; - else + if (sysfs_streq(buf, "toggle")) { + invert = !card->ipato.invert6; + } else if (kstrtobool(buf, &invert)) { rc = -EINVAL; + goto out; + } + + if (card->ipato.invert6 != invert) { + card->ipato.invert6 = invert; + spin_lock_bh(&card->ip_lock); + qeth_l3_update_ipato(card); + spin_unlock_bh(&card->ip_lock); + } +out: mutex_unlock(&card->conf_mutex); return rc ? rc : count; } From 35b99dffc3f710cafceee6c8c6ac6a98eb2cb4bf Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 13 Dec 2017 14:41:06 -0500 Subject: [PATCH 236/269] sock: free skb in skb_complete_tx_timestamp on error skb_complete_tx_timestamp must ingest the skb it is passed. Call kfree_skb if the skb cannot be enqueued. Fixes: b245be1f4db1 ("net-timestamp: no-payload only sysctl") Fixes: 9ac25fc06375 ("net: fix socket refcounting in skb_complete_tx_timestamp()") Reported-by: Richard Cochran Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6b0ff396fa9d..a592ca025fc4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4293,7 +4293,7 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, struct sock *sk = skb->sk; if (!skb_may_tx_timestamp(sk, false)) - return; + goto err; /* Take a reference to prevent skb_orphan() from freeing the socket, * but only if the socket refcount is not zero. @@ -4302,7 +4302,11 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, *skb_hwtstamps(skb) = *hwtstamps; __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); sock_put(sk); + return; } + +err: + kfree_skb(skb); } EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); From e4d02ca04c6d48ab2226342a1c4ed54f1dbb72bd Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Thu, 14 Dec 2017 12:34:40 +0300 Subject: [PATCH 237/269] net: aquantia: Fix actual speed capabilities reporting Different hardware device Ids correspond to different maximum speed available. Extra checks were added for devices D108 and D109 to remove unsupported speeds from these device capabilities list. Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_hw.h | 4 +++- drivers/net/ethernet/aquantia/atlantic/aq_nic.c | 7 ++++--- drivers/net/ethernet/aquantia/atlantic/aq_nic.h | 2 +- .../net/ethernet/aquantia/atlantic/aq_pci_func.c | 5 +++-- .../ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c | 13 ++++++++++++- .../ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c | 13 ++++++++++++- 6 files changed, 35 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_hw.h b/drivers/net/ethernet/aquantia/atlantic/aq_hw.h index 0207927dc8a6..4ebd53b3c7da 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_hw.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_hw.h @@ -85,7 +85,9 @@ struct aq_hw_ops { void (*destroy)(struct aq_hw_s *self); int (*get_hw_caps)(struct aq_hw_s *self, - struct aq_hw_caps_s *aq_hw_caps); + struct aq_hw_caps_s *aq_hw_caps, + unsigned short device, + unsigned short subsystem_device); int (*hw_ring_tx_xmit)(struct aq_hw_s *self, struct aq_ring_s *aq_ring, unsigned int frags); diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index 78dfb2ab78ce..a360ccc298b9 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -222,7 +222,7 @@ static struct net_device *aq_nic_ndev_alloc(void) struct aq_nic_s *aq_nic_alloc_cold(const struct net_device_ops *ndev_ops, const struct ethtool_ops *et_ops, - struct device *dev, + struct pci_dev *pdev, struct aq_pci_func_s *aq_pci_func, unsigned int port, const struct aq_hw_ops *aq_hw_ops) @@ -242,7 +242,7 @@ struct aq_nic_s *aq_nic_alloc_cold(const struct net_device_ops *ndev_ops, ndev->netdev_ops = ndev_ops; ndev->ethtool_ops = et_ops; - SET_NETDEV_DEV(ndev, dev); + SET_NETDEV_DEV(ndev, &pdev->dev); ndev->if_port = port; self->ndev = ndev; @@ -254,7 +254,8 @@ struct aq_nic_s *aq_nic_alloc_cold(const struct net_device_ops *ndev_ops, self->aq_hw = self->aq_hw_ops.create(aq_pci_func, self->port, &self->aq_hw_ops); - err = self->aq_hw_ops.get_hw_caps(self->aq_hw, &self->aq_hw_caps); + err = self->aq_hw_ops.get_hw_caps(self->aq_hw, &self->aq_hw_caps, + pdev->device, pdev->subsystem_device); if (err < 0) goto err_exit; diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h index 4309983acdd6..3c9f8db03d5f 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h @@ -71,7 +71,7 @@ struct aq_nic_cfg_s { struct aq_nic_s *aq_nic_alloc_cold(const struct net_device_ops *ndev_ops, const struct ethtool_ops *et_ops, - struct device *dev, + struct pci_dev *pdev, struct aq_pci_func_s *aq_pci_func, unsigned int port, const struct aq_hw_ops *aq_hw_ops); diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c index cadaa646c89f..58c29d04b186 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c @@ -51,7 +51,8 @@ struct aq_pci_func_s *aq_pci_func_alloc(struct aq_hw_ops *aq_hw_ops, pci_set_drvdata(pdev, self); self->pdev = pdev; - err = aq_hw_ops->get_hw_caps(NULL, &self->aq_hw_caps); + err = aq_hw_ops->get_hw_caps(NULL, &self->aq_hw_caps, pdev->device, + pdev->subsystem_device); if (err < 0) goto err_exit; @@ -59,7 +60,7 @@ struct aq_pci_func_s *aq_pci_func_alloc(struct aq_hw_ops *aq_hw_ops, for (port = 0; port < self->ports; ++port) { struct aq_nic_s *aq_nic = aq_nic_alloc_cold(ndev_ops, eth_ops, - &pdev->dev, self, + pdev, self, port, aq_hw_ops); if (!aq_nic) { diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c index 07b3c49a16a4..b0abd187cead 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c @@ -18,9 +18,20 @@ #include "hw_atl_a0_internal.h" static int hw_atl_a0_get_hw_caps(struct aq_hw_s *self, - struct aq_hw_caps_s *aq_hw_caps) + struct aq_hw_caps_s *aq_hw_caps, + unsigned short device, + unsigned short subsystem_device) { memcpy(aq_hw_caps, &hw_atl_a0_hw_caps_, sizeof(*aq_hw_caps)); + + if (device == HW_ATL_DEVICE_ID_D108 && subsystem_device == 0x0001) + aq_hw_caps->link_speed_msk &= ~HW_ATL_A0_RATE_10G; + + if (device == HW_ATL_DEVICE_ID_D109 && subsystem_device == 0x0001) { + aq_hw_caps->link_speed_msk &= ~HW_ATL_A0_RATE_10G; + aq_hw_caps->link_speed_msk &= ~HW_ATL_A0_RATE_5G; + } + return 0; } diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c index ec68c20efcbd..e4e3b8e2d67e 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c @@ -18,9 +18,20 @@ #include "hw_atl_b0_internal.h" static int hw_atl_b0_get_hw_caps(struct aq_hw_s *self, - struct aq_hw_caps_s *aq_hw_caps) + struct aq_hw_caps_s *aq_hw_caps, + unsigned short device, + unsigned short subsystem_device) { memcpy(aq_hw_caps, &hw_atl_b0_hw_caps_, sizeof(*aq_hw_caps)); + + if (device == HW_ATL_DEVICE_ID_D108 && subsystem_device == 0x0001) + aq_hw_caps->link_speed_msk &= ~HW_ATL_B0_RATE_10G; + + if (device == HW_ATL_DEVICE_ID_D109 && subsystem_device == 0x0001) { + aq_hw_caps->link_speed_msk &= ~HW_ATL_B0_RATE_10G; + aq_hw_caps->link_speed_msk &= ~HW_ATL_B0_RATE_5G; + } + return 0; } From 1e366161510f266516107a69db91f1f2edaea11c Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Thu, 14 Dec 2017 12:34:41 +0300 Subject: [PATCH 238/269] net: aquantia: Fix hardware DMA stream overload on large MRRS Systems with large MRRS on device (2K, 4K) with high data rates and/or large MTU, atlantic observes DMA packet buffer overflow. On some systems that causes PCIe transaction errors, hardware NMIs or datapath freeze. This patch 1) Limits MRRS from device side to 2K (thats maximum our hardware supports) 2) Limit maximum size of outstanding TX DMA data read requests. This makes hardware buffers running fine. Signed-off-by: Pavel Belous Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- .../ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c | 12 ++++++++++++ .../aquantia/atlantic/hw_atl/hw_atl_llh_internal.h | 6 ++++++ 2 files changed, 18 insertions(+) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c index e4e3b8e2d67e..36fddb199160 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c @@ -16,6 +16,7 @@ #include "hw_atl_utils.h" #include "hw_atl_llh.h" #include "hw_atl_b0_internal.h" +#include "hw_atl_llh_internal.h" static int hw_atl_b0_get_hw_caps(struct aq_hw_s *self, struct aq_hw_caps_s *aq_hw_caps, @@ -368,6 +369,7 @@ static int hw_atl_b0_hw_init(struct aq_hw_s *self, }; int err = 0; + u32 val; self->aq_nic_cfg = aq_nic_cfg; @@ -385,6 +387,16 @@ static int hw_atl_b0_hw_init(struct aq_hw_s *self, hw_atl_b0_hw_rss_set(self, &aq_nic_cfg->aq_rss); hw_atl_b0_hw_rss_hash_set(self, &aq_nic_cfg->aq_rss); + /* Force limit MRRS on RDM/TDM to 2K */ + val = aq_hw_read_reg(self, pci_reg_control6_adr); + aq_hw_write_reg(self, pci_reg_control6_adr, (val & ~0x707) | 0x404); + + /* TX DMA total request limit. B0 hardware is not capable to + * handle more than (8K-MRRS) incoming DMA data. + * Value 24 in 256byte units + */ + aq_hw_write_reg(self, tx_dma_total_req_limit_adr, 24); + err = aq_hw_err_from_flags(self); if (err < 0) goto err_exit; diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h index 5527fc0e5942..93450ec930e8 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h @@ -2343,6 +2343,9 @@ #define tx_dma_desc_base_addrmsw_adr(descriptor) \ (0x00007c04u + (descriptor) * 0x40) +/* tx dma total request limit */ +#define tx_dma_total_req_limit_adr 0x00007b20u + /* tx interrupt moderation control register definitions * Preprocessor definitions for TX Interrupt Moderation Control Register * Base Address: 0x00008980 @@ -2369,6 +2372,9 @@ /* default value of bitfield reg_res_dsbl */ #define pci_reg_res_dsbl_default 0x1 +/* PCI core control register */ +#define pci_reg_control6_adr 0x1014u + /* global microprocessor scratch pad definitions */ #define glb_cpu_scratch_scp_adr(scratch_scp) (0x00000300u + (scratch_scp) * 0x4) From be08d839d9ef1c9b0e4ed809ec852ff100f9970d Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Thu, 14 Dec 2017 12:34:42 +0300 Subject: [PATCH 239/269] net: aquantia: Extend stat counters to 64bit values Device hardware provides only 32bit counters. Using these directly causes byte counters to overflow soon. A separate nic level structure with 64 bit counters is now used to collect incrementally all the stats and report these counters to ethtool stats and ndev stats. Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- .../net/ethernet/aquantia/atlantic/aq_hw.h | 25 +++++- .../net/ethernet/aquantia/atlantic/aq_nic.c | 35 +++++++-- .../aquantia/atlantic/hw_atl/hw_atl_utils.c | 76 ++++++------------- .../aquantia/atlantic/hw_atl/hw_atl_utils.h | 6 +- 4 files changed, 77 insertions(+), 65 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_hw.h b/drivers/net/ethernet/aquantia/atlantic/aq_hw.h index 4ebd53b3c7da..b3825de6cdfb 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_hw.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_hw.h @@ -46,6 +46,28 @@ struct aq_hw_link_status_s { unsigned int mbps; }; +struct aq_stats_s { + u64 uprc; + u64 mprc; + u64 bprc; + u64 erpt; + u64 uptc; + u64 mptc; + u64 bptc; + u64 erpr; + u64 mbtc; + u64 bbtc; + u64 mbrc; + u64 bbrc; + u64 ubrc; + u64 ubtc; + u64 dpc; + u64 dma_pkt_rc; + u64 dma_pkt_tc; + u64 dma_oct_rc; + u64 dma_oct_tc; +}; + #define AQ_HW_IRQ_INVALID 0U #define AQ_HW_IRQ_LEGACY 1U #define AQ_HW_IRQ_MSI 2U @@ -166,8 +188,7 @@ struct aq_hw_ops { int (*hw_update_stats)(struct aq_hw_s *self); - int (*hw_get_hw_stats)(struct aq_hw_s *self, u64 *data, - unsigned int *p_count); + struct aq_stats_s *(*hw_get_hw_stats)(struct aq_hw_s *self); int (*hw_get_fw_version)(struct aq_hw_s *self, u32 *fw_version); diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index a360ccc298b9..28cbe9d43df6 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -750,16 +750,40 @@ int aq_nic_get_regs_count(struct aq_nic_s *self) void aq_nic_get_stats(struct aq_nic_s *self, u64 *data) { - struct aq_vec_s *aq_vec = NULL; unsigned int i = 0U; unsigned int count = 0U; - int err = 0; + struct aq_vec_s *aq_vec = NULL; + struct aq_stats_s *stats = self->aq_hw_ops.hw_get_hw_stats(self->aq_hw); - err = self->aq_hw_ops.hw_get_hw_stats(self->aq_hw, data, &count); - if (err < 0) + if (!stats) goto err_exit; - data += count; + data[i] = stats->uprc + stats->mprc + stats->bprc; + data[++i] = stats->uprc; + data[++i] = stats->mprc; + data[++i] = stats->bprc; + data[++i] = stats->erpt; + data[++i] = stats->uptc + stats->mptc + stats->bptc; + data[++i] = stats->uptc; + data[++i] = stats->mptc; + data[++i] = stats->bptc; + data[++i] = stats->ubrc; + data[++i] = stats->ubtc; + data[++i] = stats->mbrc; + data[++i] = stats->mbtc; + data[++i] = stats->bbrc; + data[++i] = stats->bbtc; + data[++i] = stats->ubrc + stats->mbrc + stats->bbrc; + data[++i] = stats->ubtc + stats->mbtc + stats->bbtc; + data[++i] = stats->dma_pkt_rc; + data[++i] = stats->dma_pkt_tc; + data[++i] = stats->dma_oct_rc; + data[++i] = stats->dma_oct_tc; + data[++i] = stats->dpc; + + i++; + + data += i; count = 0U; for (i = 0U, aq_vec = self->aq_vec[0]; @@ -769,7 +793,6 @@ void aq_nic_get_stats(struct aq_nic_s *self, u64 *data) } err_exit:; - (void)err; } void aq_nic_get_link_ksettings(struct aq_nic_s *self, diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c index 1fe016fc4bc7..f2ce12ed4218 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c @@ -503,73 +503,43 @@ int hw_atl_utils_update_stats(struct aq_hw_s *self) struct hw_atl_s *hw_self = PHAL_ATLANTIC; struct hw_aq_atl_utils_mbox mbox; - if (!self->aq_link_status.mbps) - return 0; - hw_atl_utils_mpi_read_stats(self, &mbox); #define AQ_SDELTA(_N_) (hw_self->curr_stats._N_ += \ mbox.stats._N_ - hw_self->last_stats._N_) + if (self->aq_link_status.mbps) { + AQ_SDELTA(uprc); + AQ_SDELTA(mprc); + AQ_SDELTA(bprc); + AQ_SDELTA(erpt); - AQ_SDELTA(uprc); - AQ_SDELTA(mprc); - AQ_SDELTA(bprc); - AQ_SDELTA(erpt); - - AQ_SDELTA(uptc); - AQ_SDELTA(mptc); - AQ_SDELTA(bptc); - AQ_SDELTA(erpr); - - AQ_SDELTA(ubrc); - AQ_SDELTA(ubtc); - AQ_SDELTA(mbrc); - AQ_SDELTA(mbtc); - AQ_SDELTA(bbrc); - AQ_SDELTA(bbtc); - AQ_SDELTA(dpc); + AQ_SDELTA(uptc); + AQ_SDELTA(mptc); + AQ_SDELTA(bptc); + AQ_SDELTA(erpr); + AQ_SDELTA(ubrc); + AQ_SDELTA(ubtc); + AQ_SDELTA(mbrc); + AQ_SDELTA(mbtc); + AQ_SDELTA(bbrc); + AQ_SDELTA(bbtc); + AQ_SDELTA(dpc); + } #undef AQ_SDELTA + hw_self->curr_stats.dma_pkt_rc = stats_rx_dma_good_pkt_counterlsw_get(self); + hw_self->curr_stats.dma_pkt_tc = stats_tx_dma_good_pkt_counterlsw_get(self); + hw_self->curr_stats.dma_oct_rc = stats_rx_dma_good_octet_counterlsw_get(self); + hw_self->curr_stats.dma_oct_tc = stats_tx_dma_good_octet_counterlsw_get(self); memcpy(&hw_self->last_stats, &mbox.stats, sizeof(mbox.stats)); return 0; } -int hw_atl_utils_get_hw_stats(struct aq_hw_s *self, - u64 *data, unsigned int *p_count) +struct aq_stats_s *hw_atl_utils_get_hw_stats(struct aq_hw_s *self) { - struct hw_atl_s *hw_self = PHAL_ATLANTIC; - struct hw_atl_stats_s *stats = &hw_self->curr_stats; - int i = 0; - - data[i] = stats->uprc + stats->mprc + stats->bprc; - data[++i] = stats->uprc; - data[++i] = stats->mprc; - data[++i] = stats->bprc; - data[++i] = stats->erpt; - data[++i] = stats->uptc + stats->mptc + stats->bptc; - data[++i] = stats->uptc; - data[++i] = stats->mptc; - data[++i] = stats->bptc; - data[++i] = stats->ubrc; - data[++i] = stats->ubtc; - data[++i] = stats->mbrc; - data[++i] = stats->mbtc; - data[++i] = stats->bbrc; - data[++i] = stats->bbtc; - data[++i] = stats->ubrc + stats->mbrc + stats->bbrc; - data[++i] = stats->ubtc + stats->mbtc + stats->bbtc; - data[++i] = stats_rx_dma_good_pkt_counterlsw_get(self); - data[++i] = stats_tx_dma_good_pkt_counterlsw_get(self); - data[++i] = stats_rx_dma_good_octet_counterlsw_get(self); - data[++i] = stats_tx_dma_good_octet_counterlsw_get(self); - data[++i] = stats->dpc; - - if (p_count) - *p_count = ++i; - - return 0; + return &PHAL_ATLANTIC->curr_stats; } static const u32 hw_atl_utils_hw_mac_regs[] = { diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h index c99cc690e425..21aeca6908d3 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h @@ -129,7 +129,7 @@ struct __packed hw_aq_atl_utils_mbox { struct __packed hw_atl_s { struct aq_hw_s base; struct hw_atl_stats_s last_stats; - struct hw_atl_stats_s curr_stats; + struct aq_stats_s curr_stats; u64 speed; unsigned int chip_features; u32 fw_ver_actual; @@ -207,8 +207,6 @@ int hw_atl_utils_get_fw_version(struct aq_hw_s *self, u32 *fw_version); int hw_atl_utils_update_stats(struct aq_hw_s *self); -int hw_atl_utils_get_hw_stats(struct aq_hw_s *self, - u64 *data, - unsigned int *p_count); +struct aq_stats_s *hw_atl_utils_get_hw_stats(struct aq_hw_s *self); #endif /* HW_ATL_UTILS_H */ From 9f8a2203a542f5f3cdeb17f40250c49bb87aa7e3 Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Thu, 14 Dec 2017 12:34:43 +0300 Subject: [PATCH 240/269] net: aquantia: Fill ndev stat couters from hardware Originally they were filled from ring sw counters. These sometimes incorrectly calculate byte and packet amounts when using LRO/LSO and jumboframes. Filling ndev counters from hardware makes them precise. Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- .../net/ethernet/aquantia/atlantic/aq_nic.c | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index 28cbe9d43df6..307caac68731 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -37,6 +37,8 @@ static unsigned int aq_itr_rx; module_param_named(aq_itr_rx, aq_itr_rx, uint, 0644); MODULE_PARM_DESC(aq_itr_rx, "RX interrupt throttle rate"); +static void aq_nic_update_ndev_stats(struct aq_nic_s *self); + static void aq_nic_rss_init(struct aq_nic_s *self, unsigned int num_rss_queues) { struct aq_nic_cfg_s *cfg = &self->aq_nic_cfg; @@ -166,11 +168,7 @@ static int aq_nic_update_link_status(struct aq_nic_s *self) static void aq_nic_service_timer_cb(struct timer_list *t) { struct aq_nic_s *self = from_timer(self, t, service_timer); - struct net_device *ndev = aq_nic_get_ndev(self); int err = 0; - unsigned int i = 0U; - struct aq_ring_stats_rx_s stats_rx; - struct aq_ring_stats_tx_s stats_tx; if (aq_utils_obj_test(&self->header.flags, AQ_NIC_FLAGS_IS_NOT_READY)) goto err_exit; @@ -182,19 +180,8 @@ static void aq_nic_service_timer_cb(struct timer_list *t) if (self->aq_hw_ops.hw_update_stats) self->aq_hw_ops.hw_update_stats(self->aq_hw); - memset(&stats_rx, 0U, sizeof(struct aq_ring_stats_rx_s)); - memset(&stats_tx, 0U, sizeof(struct aq_ring_stats_tx_s)); - for (i = AQ_DIMOF(self->aq_vec); i--;) { - if (self->aq_vec[i]) - aq_vec_add_stats(self->aq_vec[i], &stats_rx, &stats_tx); - } + aq_nic_update_ndev_stats(self); - ndev->stats.rx_packets = stats_rx.packets; - ndev->stats.rx_bytes = stats_rx.bytes; - ndev->stats.rx_errors = stats_rx.errors; - ndev->stats.tx_packets = stats_tx.packets; - ndev->stats.tx_bytes = stats_tx.bytes; - ndev->stats.tx_errors = stats_tx.errors; err_exit: mod_timer(&self->service_timer, @@ -795,6 +782,19 @@ void aq_nic_get_stats(struct aq_nic_s *self, u64 *data) err_exit:; } +static void aq_nic_update_ndev_stats(struct aq_nic_s *self) +{ + struct net_device *ndev = self->ndev; + struct aq_stats_s *stats = self->aq_hw_ops.hw_get_hw_stats(self->aq_hw); + + ndev->stats.rx_packets = stats->uprc + stats->mprc + stats->bprc; + ndev->stats.rx_bytes = stats->ubrc + stats->mbrc + stats->bbrc; + ndev->stats.rx_errors = stats->erpr; + ndev->stats.tx_packets = stats->uptc + stats->mptc + stats->bptc; + ndev->stats.tx_bytes = stats->ubtc + stats->mbtc + stats->bbtc; + ndev->stats.tx_errors = stats->erpt; +} + void aq_nic_get_link_ksettings(struct aq_nic_s *self, struct ethtool_link_ksettings *cmd) { From 45cc1c7ad47c4d166d15c7bce449d2de4daef0c5 Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Thu, 14 Dec 2017 12:34:44 +0300 Subject: [PATCH 241/269] net: aquantia: Fill in multicast counter in ndev stats from hardware This metric comes from HW and is also diff-calculated, like other counters Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_nic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index 307caac68731..b3a5d1fbc713 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -793,6 +793,7 @@ static void aq_nic_update_ndev_stats(struct aq_nic_s *self) ndev->stats.tx_packets = stats->uptc + stats->mptc + stats->bptc; ndev->stats.tx_bytes = stats->ubtc + stats->mbtc + stats->bbtc; ndev->stats.tx_errors = stats->erpt; + ndev->stats.multicast = stats->mprc; } void aq_nic_get_link_ksettings(struct aq_nic_s *self, From fdb4a0830e74acfbe84d4d4e6772ea09c96786ad Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Thu, 14 Dec 2017 12:34:45 +0300 Subject: [PATCH 242/269] net: aquantia: Improve link state and statistics check interval callback Reduce timeout from 2 secs to 1 sec. If link is down, reduce it to 500msec. This speeds up link detection. Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_cfg.h | 2 +- drivers/net/ethernet/aquantia/atlantic/aq_nic.c | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h index 57e796870595..73b93a7b4800 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h @@ -50,7 +50,7 @@ #define AQ_CFG_PCI_FUNC_MSIX_IRQS 9U #define AQ_CFG_PCI_FUNC_PORTS 2U -#define AQ_CFG_SERVICE_TIMER_INTERVAL (2 * HZ) +#define AQ_CFG_SERVICE_TIMER_INTERVAL (1 * HZ) #define AQ_CFG_POLLING_TIMER_INTERVAL ((unsigned int)(2 * HZ)) #define AQ_CFG_SKB_FRAGS_MAX 32U diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index b3a5d1fbc713..75a894a9251c 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -168,6 +168,7 @@ static int aq_nic_update_link_status(struct aq_nic_s *self) static void aq_nic_service_timer_cb(struct timer_list *t) { struct aq_nic_s *self = from_timer(self, t, service_timer); + int ctimer = AQ_CFG_SERVICE_TIMER_INTERVAL; int err = 0; if (aq_utils_obj_test(&self->header.flags, AQ_NIC_FLAGS_IS_NOT_READY)) @@ -182,10 +183,12 @@ static void aq_nic_service_timer_cb(struct timer_list *t) aq_nic_update_ndev_stats(self); + /* If no link - use faster timer rate to detect link up asap */ + if (!netif_carrier_ok(self->ndev)) + ctimer = max(ctimer / 2, 1); err_exit: - mod_timer(&self->service_timer, - jiffies + AQ_CFG_SERVICE_TIMER_INTERVAL); + mod_timer(&self->service_timer, jiffies + ctimer); } static void aq_nic_polling_timer_cb(struct timer_list *t) From f3e2778429c2ad8555e888858e0f0e98c86c4b0f Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Thu, 14 Dec 2017 12:34:46 +0300 Subject: [PATCH 243/269] net: aquantia: Update hw counters on hw init On very first start we should read out current HW counter values to make diff based calculations later. This also should be done each time NIC gets down/up or wakes up after sleep state. We reset link state explicitly to prevent diffs from being summed this first time. Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c | 4 ++++ drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c index b0abd187cead..f18dce14c93c 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c @@ -344,6 +344,10 @@ static int hw_atl_a0_hw_init(struct aq_hw_s *self, hw_atl_a0_hw_rss_set(self, &aq_nic_cfg->aq_rss); hw_atl_a0_hw_rss_hash_set(self, &aq_nic_cfg->aq_rss); + /* Reset link status and read out initial hardware counters */ + self->aq_link_status.mbps = 0; + hw_atl_utils_update_stats(self); + err = aq_hw_err_from_flags(self); if (err < 0) goto err_exit; diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c index 36fddb199160..e4a22ce7bf09 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c @@ -397,6 +397,10 @@ static int hw_atl_b0_hw_init(struct aq_hw_s *self, */ aq_hw_write_reg(self, tx_dma_total_req_limit_adr, 24); + /* Reset link status and read out initial hardware counters */ + self->aq_link_status.mbps = 0; + hw_atl_utils_update_stats(self); + err = aq_hw_err_from_flags(self); if (err < 0) goto err_exit; From 98bc036de40489416d61ab175bb417c094e7783c Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Thu, 14 Dec 2017 12:34:47 +0300 Subject: [PATCH 244/269] net: aquantia: Fix typo in ethtool statistics names Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- .../net/ethernet/aquantia/atlantic/aq_ethtool.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c index 70efb7467bf3..f2d8063a2cef 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c @@ -66,14 +66,14 @@ static const char aq_ethtool_stat_names[][ETH_GSTRING_LEN] = { "OutUCast", "OutMCast", "OutBCast", - "InUCastOctects", - "OutUCastOctects", - "InMCastOctects", - "OutMCastOctects", - "InBCastOctects", - "OutBCastOctects", - "InOctects", - "OutOctects", + "InUCastOctets", + "OutUCastOctets", + "InMCastOctets", + "OutMCastOctets", + "InBCastOctets", + "OutBCastOctets", + "InOctets", + "OutOctets", "InPacketsDma", "OutPacketsDma", "InOctetsDma", From d4c242d4ba5730b62579969804cd8fcf58b9c84f Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Thu, 14 Dec 2017 12:34:48 +0300 Subject: [PATCH 245/269] net: aquantia: Increment driver version Add a suffix to distinguish kernel mainline version and aquantia releases Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_cfg.h | 3 ++- drivers/net/ethernet/aquantia/atlantic/ver.h | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h index 73b93a7b4800..105fdb958cef 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h @@ -80,6 +80,7 @@ #define AQ_CFG_DRV_VERSION __stringify(NIC_MAJOR_DRIVER_VERSION)"."\ __stringify(NIC_MINOR_DRIVER_VERSION)"."\ __stringify(NIC_BUILD_DRIVER_VERSION)"."\ - __stringify(NIC_REVISION_DRIVER_VERSION) + __stringify(NIC_REVISION_DRIVER_VERSION) \ + AQ_CFG_DRV_VERSION_SUFFIX #endif /* AQ_CFG_H */ diff --git a/drivers/net/ethernet/aquantia/atlantic/ver.h b/drivers/net/ethernet/aquantia/atlantic/ver.h index 0de858d215c2..9009f2651e70 100644 --- a/drivers/net/ethernet/aquantia/atlantic/ver.h +++ b/drivers/net/ethernet/aquantia/atlantic/ver.h @@ -11,8 +11,10 @@ #define VER_H #define NIC_MAJOR_DRIVER_VERSION 1 -#define NIC_MINOR_DRIVER_VERSION 5 -#define NIC_BUILD_DRIVER_VERSION 345 +#define NIC_MINOR_DRIVER_VERSION 6 +#define NIC_BUILD_DRIVER_VERSION 13 #define NIC_REVISION_DRIVER_VERSION 0 +#define AQ_CFG_DRV_VERSION_SUFFIX "-kern" + #endif /* VER_H */ From 7a4fa29106d9a38ef005f5ab15d493c259f269c0 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Thu, 14 Dec 2017 15:54:29 +0200 Subject: [PATCH 246/269] net: sched: Add TCA_HW_OFFLOAD Qdiscs can be offloaded to HW, but current implementation isn't uniform. Instead, qdiscs either pass information about offload status via their TCA_OPTIONS or omit it altogether. Introduce a new attribute - TCA_HW_OFFLOAD that would form a uniform uAPI for the offloading status of qdiscs. Signed-off-by: Yuval Mintz Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 + include/uapi/linux/rtnetlink.h | 1 + net/sched/sch_api.c | 2 ++ 3 files changed, 4 insertions(+) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 65d0d25f2648..83a3e47d5845 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -71,6 +71,7 @@ struct Qdisc { * qdisc_tree_decrease_qlen() should stop. */ #define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */ +#define TCQ_F_OFFLOADED 0x200 /* qdisc is offloaded to HW */ u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index d8b5f80c2ea6..843e29aa3cac 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -557,6 +557,7 @@ enum { TCA_PAD, TCA_DUMP_INVISIBLE, TCA_CHAIN, + TCA_HW_OFFLOAD, __TCA_MAX }; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index b6c4f536876b..0f1eab99ff4e 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -795,6 +795,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, tcm->tcm_info = refcount_read(&q->refcnt); if (nla_put_string(skb, TCA_KIND, q->ops->id)) goto nla_put_failure; + if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED))) + goto nla_put_failure; if (q->ops->dump && q->ops->dump(q, skb) < 0) goto nla_put_failure; qlen = q->q.qlen; From 428a68af3a7c3a3380ff1f750a24d213f370f89f Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Thu, 14 Dec 2017 15:54:30 +0200 Subject: [PATCH 247/269] net: sched: Move to new offload indication in RED Let RED utilize the new internal flag, TCQ_F_OFFLOADED, to mark a given qdisc as offloaded instead of using a dedicated indication. Also, change internal logic into looking at said flag when possible. Fixes: 602f3baf2218 ("net_sch: red: Add offload ability to RED qdisc") Signed-off-by: Yuval Mintz Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/sch_red.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 9d874e60e032..f0747eb87dc4 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -157,6 +157,7 @@ static int red_offload(struct Qdisc *sch, bool enable) .handle = sch->handle, .parent = sch->parent, }; + int err; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return -EOPNOTSUPP; @@ -171,7 +172,14 @@ static int red_offload(struct Qdisc *sch, bool enable) opt.command = TC_RED_DESTROY; } - return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, &opt); + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, &opt); + + if (!err && enable) + sch->flags |= TCQ_F_OFFLOADED; + else + sch->flags &= ~TCQ_F_OFFLOADED; + + return err; } static void red_destroy(struct Qdisc *sch) @@ -274,7 +282,7 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt) return red_change(sch, opt); } -static int red_dump_offload(struct Qdisc *sch, struct tc_red_qopt *opt) +static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt) { struct net_device *dev = qdisc_dev(sch); struct tc_red_qopt_offload hw_stats = { @@ -286,21 +294,12 @@ static int red_dump_offload(struct Qdisc *sch, struct tc_red_qopt *opt) .stats.qstats = &sch->qstats, }, }; - int err; - opt->flags &= ~TC_RED_OFFLOADED; - if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + if (!(sch->flags & TCQ_F_OFFLOADED)) return 0; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, - &hw_stats); - if (err == -EOPNOTSUPP) - return 0; - - if (!err) - opt->flags |= TC_RED_OFFLOADED; - - return err; + return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, + &hw_stats); } static int red_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -319,7 +318,7 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) int err; sch->qstats.backlog = q->qdisc->qstats.backlog; - err = red_dump_offload(sch, &opt); + err = red_dump_offload_stats(sch, &opt); if (err) goto nla_put_failure; @@ -347,7 +346,7 @@ static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) .marked = q->stats.prob_mark + q->stats.forced_mark, }; - if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc) { + if (sch->flags & TCQ_F_OFFLOADED) { struct red_stats hw_stats = {0}; struct tc_red_qopt_offload hw_stats_request = { .command = TC_RED_XSTATS, From 4a98795bc8ea148b1ebbbf001283e06430cffe36 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Thu, 14 Dec 2017 15:54:31 +0200 Subject: [PATCH 248/269] pkt_sched: Remove TC_RED_OFFLOADED from uapi Following the previous patch, RED is now using the new uniform uapi for indicating it's offloaded. As a result, TC_RED_OFFLOADED is no longer utilized by kernel and can be removed [as it's still not part of any stable release]. Fixes: 602f3baf2218 ("net_sch: red: Add offload ability to RED qdisc") Signed-off-by: Yuval Mintz Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index af3cc2f4e1ad..37b5096ae97b 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -256,7 +256,6 @@ struct tc_red_qopt { #define TC_RED_ECN 1 #define TC_RED_HARDDROP 2 #define TC_RED_ADAPTATIVE 4 -#define TC_RED_OFFLOADED 8 }; struct tc_red_xstats { From c647c0d62c82eb3ddf78a0d8b3d58819d9f552aa Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Thu, 14 Dec 2017 16:56:14 +0100 Subject: [PATCH 249/269] net: usb: qmi_wwan: add Telit ME910 PID 0x1101 support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds support for Telit ME910 PID 0x1101. Signed-off-by: Daniele Palmas Acked-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index d2ca5a202e8d..3000ddd1c7e2 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1211,6 +1211,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x2357, 0x9000, 4)}, /* TP-LINK MA260 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1040, 2)}, /* Telit LE922A */ {QMI_FIXED_INTF(0x1bc7, 0x1100, 3)}, /* Telit ME910 */ + {QMI_FIXED_INTF(0x1bc7, 0x1101, 3)}, /* Telit ME910 dual modem */ {QMI_FIXED_INTF(0x1bc7, 0x1200, 5)}, /* Telit LE920 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1201, 2)}, /* Telit LE920, LE920A4 */ {QMI_FIXED_INTF(0x1c9e, 0x9801, 3)}, /* Telewell TW-3G HSPA+ */ From c05fad5713b81b049ec6ac4eb2d304030b1efdce Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Fri, 15 Dec 2017 10:46:16 +0800 Subject: [PATCH 250/269] ip_gre: fix wrong return value of erspan_rcv If pskb_may_pull return failed, return PACKET_REJECT instead of -ENOMEM. Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Cc: William Tu Signed-off-by: Haishuang Yan Acked-by: William Tu Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index bb6239169b1a..9c1735632c8c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -266,7 +266,7 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, len = gre_hdr_len + sizeof(*ershdr); if (unlikely(!pskb_may_pull(skb, len))) - return -ENOMEM; + return PACKET_REJECT; iph = ip_hdr(skb); ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len); From c156618e15101a9cc8c815108fec0300a0ec6637 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Tue, 5 Dec 2017 13:55:44 -0500 Subject: [PATCH 251/269] nfs: fix a deadlock in nfs client initialization The following deadlock can occur between a process waiting for a client to initialize in while walking the client list during nfsv4 server trunking detection and another process waiting for the nfs_clid_init_mutex so it can initialize that client: Process 1 Process 2 --------- --------- spin_lock(&nn->nfs_client_lock); list_add_tail(&CLIENTA->cl_share_link, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); spin_lock(&nn->nfs_client_lock); list_add_tail(&CLIENTB->cl_share_link, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); mutex_lock(&nfs_clid_init_mutex); nfs41_walk_client_list(clp, result, cred); nfs_wait_client_init_complete(CLIENTA); (waiting for nfs_clid_init_mutex) Make sure nfs_match_client() only evaluates clients that have completed initialization in order to prevent that deadlock. This patch also fixes v4.0 trunking behavior by not marking the client NFS_CS_READY until the clientid has been confirmed. Signed-off-by: Scott Mayhew Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 11 +++++++++++ fs/nfs/nfs4client.c | 17 +++++++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 0ac2fb1c6b63..b9129e2befea 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -291,12 +291,23 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat const struct sockaddr *sap = data->addr; struct nfs_net *nn = net_generic(data->net, nfs_net_id); +again: list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; /* Don't match clients that failed to initialise properly */ if (clp->cl_cons_state < 0) continue; + /* If a client is still initializing then we need to wait */ + if (clp->cl_cons_state > NFS_CS_READY) { + refcount_inc(&clp->cl_count); + spin_unlock(&nn->nfs_client_lock); + nfs_wait_client_init_complete(clp); + nfs_put_client(clp); + spin_lock(&nn->nfs_client_lock); + goto again; + } + /* Different NFS versions cannot share the same nfs_client */ if (clp->rpc_ops != data->nfs_mod->rpc_ops) continue; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 12bbab0becb4..65a7e5da508c 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -404,15 +404,19 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, if (error < 0) goto error; - if (!nfs4_has_session(clp)) - nfs_mark_client_ready(clp, NFS_CS_READY); - error = nfs4_discover_server_trunking(clp, &old); if (error < 0) goto error; - if (clp != old) + if (clp != old) { clp->cl_preserve_clid = true; + /* + * Mark the client as having failed initialization so other + * processes walking the nfs_client_list in nfs_match_client() + * won't try to use it. + */ + nfs_mark_client_ready(clp, -EPERM); + } nfs_put_client(clp); clear_bit(NFS_CS_TSM_POSSIBLE, &clp->cl_flags); return old; @@ -539,6 +543,9 @@ int nfs40_walk_client_list(struct nfs_client *new, spin_lock(&nn->nfs_client_lock); list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { + if (pos == new) + goto found; + status = nfs4_match_client(pos, new, &prev, nn); if (status < 0) goto out_unlock; @@ -559,6 +566,7 @@ int nfs40_walk_client_list(struct nfs_client *new, * way that a SETCLIENTID_CONFIRM to pos can succeed is * if new and pos point to the same server: */ +found: refcount_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); @@ -572,6 +580,7 @@ int nfs40_walk_client_list(struct nfs_client *new, case 0: nfs4_swap_callback_idents(pos, new); pos->cl_confirm = new->cl_confirm; + nfs_mark_client_ready(pos, NFS_CS_READY); prev = NULL; *result = pos; From ccede7598588ae344143f82fb763912535648d58 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2017 14:04:04 -0500 Subject: [PATCH 252/269] xprtrdma: Spread reply processing over more CPUs Commit d8f532d20ee4 ("xprtrdma: Invoke rpcrdma_reply_handler directly from RECV completion") introduced a performance regression for NFS I/O small enough to not need memory registration. In multi- threaded benchmarks that generate primarily small I/O requests, IOPS throughput is reduced by nearly a third. This patch restores the previous level of throughput. Because workqueues are typically BOUND (in particular ib_comp_wq, nfsiod_workqueue, and rpciod_workqueue), NFS/RDMA workloads tend to aggregate on the CPU that is handling Receive completions. The usual approach to addressing this problem is to create a QP and CQ for each CPU, and then schedule transactions on the QP for the CPU where you want the transaction to complete. The transaction then does not require an extra context switch during completion to end up on the same CPU where the transaction was started. This approach doesn't work for the Linux NFS/RDMA client because currently the Linux NFS client does not support multiple connections per client-server pair, and the RDMA core API does not make it straightforward for ULPs to determine which CPU is responsible for handling Receive completions for a CQ. So for the moment, record the CPU number in the rpcrdma_req before the transport sends each RPC Call. Then during Receive completion, queue the RPC completion on that same CPU. Additionally, move all RPC completion processing to the deferred handler so that even RPCs with simple small replies complete on the CPU that sent the corresponding RPC Call. Fixes: d8f532d20ee4 ("xprtrdma: Invoke rpcrdma_reply_handler ...") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 6 +----- net/sunrpc/xprtrdma/transport.c | 2 ++ net/sunrpc/xprtrdma/verbs.c | 2 +- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index ed34dc0f144c..a3f2ab283aeb 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1408,11 +1408,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", __func__, rep, req, be32_to_cpu(rep->rr_xid)); - if (list_empty(&req->rl_registered) && - !test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) - rpcrdma_complete_rqst(rep); - else - queue_work(rpcrdma_receive_wq, &rep->rr_work); + queue_work_on(req->rl_cpu, rpcrdma_receive_wq, &rep->rr_work); return; out_badstatus: diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 646c24494ea7..6ee1ad8978f3 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -52,6 +52,7 @@ #include #include #include +#include #include "xprt_rdma.h" @@ -656,6 +657,7 @@ xprt_rdma_allocate(struct rpc_task *task) task->tk_pid, __func__, rqst->rq_callsize, rqst->rq_rcvsize, req); + req->rl_cpu = smp_processor_id(); req->rl_connect_cookie = 0; /* our reserved value */ rpcrdma_set_xprtdata(rqst, req); rqst->rq_buffer = req->rl_sendbuf->rg_base; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 710b3f77db82..8607c029c0dd 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -83,7 +83,7 @@ rpcrdma_alloc_wq(void) struct workqueue_struct *recv_wq; recv_wq = alloc_workqueue("xprtrdma_receive", - WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_HIGHPRI, + WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); if (!recv_wq) return -ENOMEM; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 51686d9eac5f..1342f743f1c4 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -342,6 +342,7 @@ enum { struct rpcrdma_buffer; struct rpcrdma_req { struct list_head rl_list; + int rl_cpu; unsigned int rl_connect_cookie; struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply; From dc4fd9ab01ab379ae5af522b3efd4187a7c30a31 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Fri, 8 Dec 2017 16:00:12 -0500 Subject: [PATCH 253/269] nfs: don't wait on commit in nfs_commit_inode() if there were no commit requests If there were no commit requests, then nfs_commit_inode() should not wait on the commit or mark the inode dirty, otherwise the following BUG_ON can be triggered: [ 1917.130762] kernel BUG at fs/inode.c:578! [ 1917.130766] Oops: Exception in kernel mode, sig: 5 [#1] [ 1917.130768] SMP NR_CPUS=2048 NUMA pSeries [ 1917.130772] Modules linked in: iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi blocklayoutdriver rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache sunrpc sg nx_crypto pseries_rng ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic crct10dif_common ibmvscsi scsi_transport_srp ibmveth scsi_tgt dm_mirror dm_region_hash dm_log dm_mod [ 1917.130805] CPU: 2 PID: 14923 Comm: umount.nfs4 Tainted: G ------------ T 3.10.0-768.el7.ppc64 #1 [ 1917.130810] task: c0000005ecd88040 ti: c00000004cea0000 task.ti: c00000004cea0000 [ 1917.130813] NIP: c000000000354178 LR: c000000000354160 CTR: c00000000012db80 [ 1917.130816] REGS: c00000004cea3720 TRAP: 0700 Tainted: G ------------ T (3.10.0-768.el7.ppc64) [ 1917.130820] MSR: 8000000100029032 CR: 22002822 XER: 20000000 [ 1917.130828] CFAR: c00000000011f594 SOFTE: 1 GPR00: c000000000354160 c00000004cea39a0 c0000000014c4700 c0000000018cc750 GPR04: 000000000000c750 80c0000000000000 0600000000000000 04eeb76bea749a03 GPR08: 0000000000000034 c0000000018cc758 0000000000000001 d000000005e619e8 GPR12: c00000000012db80 c000000007b31200 0000000000000000 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR24: 0000000000000000 c000000000dfc3ec 0000000000000000 c0000005eefc02c0 GPR28: d0000000079dbd50 c0000005b94a02c0 c0000005b94a0250 c0000005b94a01c8 [ 1917.130867] NIP [c000000000354178] .evict+0x1c8/0x350 [ 1917.130871] LR [c000000000354160] .evict+0x1b0/0x350 [ 1917.130873] Call Trace: [ 1917.130876] [c00000004cea39a0] [c000000000354160] .evict+0x1b0/0x350 (unreliable) [ 1917.130880] [c00000004cea3a30] [c0000000003558cc] .evict_inodes+0x13c/0x270 [ 1917.130884] [c00000004cea3af0] [c000000000327d20] .kill_anon_super+0x70/0x1e0 [ 1917.130896] [c00000004cea3b80] [d000000005e43e30] .nfs_kill_super+0x20/0x60 [nfs] [ 1917.130900] [c00000004cea3c00] [c000000000328a20] .deactivate_locked_super+0xa0/0x1b0 [ 1917.130903] [c00000004cea3c80] [c00000000035ba54] .cleanup_mnt+0xd4/0x180 [ 1917.130907] [c00000004cea3d10] [c000000000119034] .task_work_run+0x114/0x150 [ 1917.130912] [c00000004cea3db0] [c00000000001ba6c] .do_notify_resume+0xcc/0x100 [ 1917.130916] [c00000004cea3e30] [c00000000000a7b0] .ret_from_except_lite+0x5c/0x60 [ 1917.130919] Instruction dump: [ 1917.130921] 7fc3f378 486734b5 60000000 387f00a0 38800003 4bdcb365 60000000 e95f00a0 [ 1917.130927] 694a0060 7d4a0074 794ad182 694a0001 <0b0a0000> 892d02a4 2f890000 40de0134 Signed-off-by: Scott Mayhew Cc: stable@vger.kernel.org # 4.5+ Signed-off-by: Anna Schumaker --- fs/nfs/write.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 5b5f464f6f2a..4a379d7918f2 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1890,6 +1890,8 @@ int nfs_commit_inode(struct inode *inode, int how) if (res) error = nfs_generic_commit_list(inode, &head, how, &cinfo); nfs_commit_end(cinfo.mds); + if (res == 0) + return res; if (error < 0) goto out_error; if (!may_wait) From 90d91b0cd371193d9dbfa9beacab8ab9a4cb75e0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 14 Dec 2017 21:24:08 -0500 Subject: [PATCH 254/269] SUNRPC: Fix a race in the receive code path We must ensure that the call to rpc_sleep_on() in xprt_transmit() cannot race with the call to xprt_complete_rqst(). Reported-by: Chuck Lever Link: https://bugzilla.linux-nfs.org/show_bug.cgi?id=317 Fixes: ce7c252a8c74 ("SUNRPC: Add a separate spinlock to protect..") Cc: stable@vger.kernel.org # 4.14+ Reviewed-by: Chuck Lever Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/xprt.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 02a9bacb239b..5b06f6906a27 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1001,6 +1001,7 @@ void xprt_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; + unsigned int connect_cookie; int status, numreqs; dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen); @@ -1024,6 +1025,7 @@ void xprt_transmit(struct rpc_task *task) } else if (!req->rq_bytes_sent) return; + connect_cookie = xprt->connect_cookie; req->rq_xtime = ktime_get(); status = xprt->ops->send_request(task); trace_xprt_transmit(xprt, req->rq_xid, status); @@ -1047,20 +1049,28 @@ void xprt_transmit(struct rpc_task *task) xprt->stat.bklog_u += xprt->backlog.qlen; xprt->stat.sending_u += xprt->sending.qlen; xprt->stat.pending_u += xprt->pending.qlen; - - /* Don't race with disconnect */ - if (!xprt_connected(xprt)) - task->tk_status = -ENOTCONN; - else { - /* - * Sleep on the pending queue since - * we're expecting a reply. - */ - if (!req->rq_reply_bytes_recvd && rpc_reply_expected(task)) - rpc_sleep_on(&xprt->pending, task, xprt_timer); - req->rq_connect_cookie = xprt->connect_cookie; - } spin_unlock_bh(&xprt->transport_lock); + + req->rq_connect_cookie = connect_cookie; + if (rpc_reply_expected(task) && !READ_ONCE(req->rq_reply_bytes_recvd)) { + /* + * Sleep on the pending queue if we're expecting a reply. + * The spinlock ensures atomicity between the test of + * req->rq_reply_bytes_recvd, and the call to rpc_sleep_on(). + */ + spin_lock(&xprt->recv_lock); + if (!req->rq_reply_bytes_recvd) { + rpc_sleep_on(&xprt->pending, task, xprt_timer); + /* + * Send an extra queue wakeup call if the + * connection was dropped in case the call to + * rpc_sleep_on() raced. + */ + if (!xprt_connected(xprt)) + xprt_wake_pending_tasks(xprt, -ENOTCONN); + } + spin_unlock(&xprt->recv_lock); + } } static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task) From 343723dd51ef1025a860e54df9472b5ba21ee3d9 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 15 Dec 2017 12:40:12 +0100 Subject: [PATCH 255/269] net: sched: fix clsact init error path Since in qdisc_create, the destroy op is called when init fails, we don't do cleanup in init and leave it up to destroy. This fixes use-after-free when trying to put already freed block. Fixes: 6e40cf2d4dee ("net: sched: use extended variants of block_get/put in ingress and clsact qdiscs") Signed-off-by: Jiri Pirko Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_api.c | 4 ++-- net/sched/sch_ingress.c | 6 +----- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f40256a3e7f0..b91ea03e3afa 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -351,6 +351,8 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, { struct tcf_chain *chain; + if (!block) + return; /* Hold a refcnt for all chains, except 0, so that they don't disappear * while we are iterating. */ @@ -377,8 +379,6 @@ void tcf_block_put(struct tcf_block *block) { struct tcf_block_ext_info ei = {0, }; - if (!block) - return; tcf_block_put_ext(block, block->q, &ei); } diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 5ecc38f35d47..5e1cd2e5df87 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -190,7 +190,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) err = tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info); if (err) - goto err_egress_block_get; + return err; net_inc_ingress_queue(); net_inc_egress_queue(); @@ -198,10 +198,6 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) sch->flags |= TCQ_F_CPUSTATS; return 0; - -err_egress_block_get: - tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info); - return err; } static void clsact_destroy(struct Qdisc *sch) From b59e6979a86384e68b0ab6ffeab11f0034fba82d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 15 Dec 2017 12:40:13 +0100 Subject: [PATCH 256/269] net: sched: fix static key imbalance in case of ingress/clsact_init error Move static key increments to the beginning of the init function so they pair 1:1 with decrements in ingress/clsact_destroy, which is called in case ingress/clsact_init fails. Fixes: 6529eaba33f0 ("net: sched: introduce tcf block infractructure") Signed-off-by: Jiri Pirko Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_ingress.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 5e1cd2e5df87..fc1286f499c1 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -68,6 +68,8 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; + net_inc_ingress_queue(); + mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress); q->block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; @@ -78,7 +80,6 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) if (err) return err; - net_inc_ingress_queue(); sch->flags |= TCQ_F_CPUSTATS; return 0; @@ -172,6 +173,9 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; + net_inc_ingress_queue(); + net_inc_egress_queue(); + mini_qdisc_pair_init(&q->miniqp_ingress, sch, &dev->miniq_ingress); q->ingress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; @@ -192,9 +196,6 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) if (err) return err; - net_inc_ingress_queue(); - net_inc_egress_queue(); - sch->flags |= TCQ_F_CPUSTATS; return 0; From 043ee1debd0b29c16c4c4b11a348ca667bfe9144 Mon Sep 17 00:00:00 2001 From: Hemanth Puranik Date: Fri, 15 Dec 2017 20:05:58 +0530 Subject: [PATCH 257/269] net: qcom/emac: Reduce timeout for mdio read/write Currently mdio read/write takes around ~115us as the timeout between status check is set to 100us. By reducing the timeout to 1us mdio read/write takes ~15us to complete. This improves the link up event response. Signed-off-by: Hemanth Puranik Acked-by: Timur Tabi Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/qualcomm/emac/emac-phy.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/qualcomm/emac/emac-phy.c b/drivers/net/ethernet/qualcomm/emac/emac-phy.c index 18461fcb9815..53dbf1e163a8 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac-phy.c +++ b/drivers/net/ethernet/qualcomm/emac/emac-phy.c @@ -47,6 +47,7 @@ #define MDIO_CLK_25_28 7 #define MDIO_WAIT_TIMES 1000 +#define MDIO_STATUS_DELAY_TIME 1 static int emac_mdio_read(struct mii_bus *bus, int addr, int regnum) { @@ -65,7 +66,7 @@ static int emac_mdio_read(struct mii_bus *bus, int addr, int regnum) if (readl_poll_timeout(adpt->base + EMAC_MDIO_CTRL, reg, !(reg & (MDIO_START | MDIO_BUSY)), - 100, MDIO_WAIT_TIMES * 100)) + MDIO_STATUS_DELAY_TIME, MDIO_WAIT_TIMES * 100)) return -EIO; return (reg >> MDIO_DATA_SHFT) & MDIO_DATA_BMSK; @@ -88,8 +89,8 @@ static int emac_mdio_write(struct mii_bus *bus, int addr, int regnum, u16 val) writel(reg, adpt->base + EMAC_MDIO_CTRL); if (readl_poll_timeout(adpt->base + EMAC_MDIO_CTRL, reg, - !(reg & (MDIO_START | MDIO_BUSY)), 100, - MDIO_WAIT_TIMES * 100)) + !(reg & (MDIO_START | MDIO_BUSY)), + MDIO_STATUS_DELAY_TIME, MDIO_WAIT_TIMES * 100)) return -EIO; return 0; From f6f3732162b5ae3c771b9285a5a32d72b8586920 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 15 Dec 2017 18:53:22 -0800 Subject: [PATCH 258/269] Revert "mm: replace p??_write with pte_access_permitted in fault + gup paths" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commits 5c9d2d5c269c, c7da82b894e9, and e7fe7b5cae90. We'll probably need to revisit this, but basically we should not complicate the get_user_pages_fast() case, and checking the actual page table protection key bits will require more care anyway, since the protection keys depend on the exact state of the VM in question. Particularly when doing a "remote" page lookup (ie in somebody elses VM, not your own), you need to be much more careful than this was. Dave Hansen says: "So, the underlying bug here is that we now a get_user_pages_remote() and then go ahead and do the p*_access_permitted() checks against the current PKRU. This was introduced recently with the addition of the new p??_access_permitted() calls. We have checks in the VMA path for the "remote" gups and we avoid consulting PKRU for them. This got missed in the pkeys selftests because I did a ptrace read, but not a *write*. I also didn't explicitly test it against something where a COW needed to be done" It's also not entirely clear that it makes sense to check the protection key bits at this level at all. But one possible eventual solution is to make the get_user_pages_fast() case just abort if it sees protection key bits set, which makes us fall back to the regular get_user_pages() case, which then has a vma and can do the check there if we want to. We'll see. Somewhat related to this all: what we _do_ want to do some day is to check the PAGE_USER bit - it should obviously always be set for user pages, but it would be a good check to have back. Because we have no generic way to test for it, we lost it as part of moving over from the architecture-specific x86 GUP implementation to the generic one in commit e585513b76f7 ("x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation"). Cc: Peter Zijlstra Cc: Dan Williams Cc: Dave Hansen Cc: Kirill A. Shutemov Cc: "Jérôme Glisse" Cc: Andrew Morton Cc: Al Viro Signed-off-by: Linus Torvalds --- arch/s390/include/asm/pgtable.h | 6 ------ arch/sparc/mm/gup.c | 4 ++-- fs/dax.c | 3 +-- mm/gup.c | 2 +- mm/hmm.c | 8 ++++---- mm/huge_memory.c | 6 +++--- mm/memory.c | 8 ++++---- 7 files changed, 15 insertions(+), 22 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 57d7bc92e0b8..0a6b0286c32e 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1264,12 +1264,6 @@ static inline pud_t pud_mkwrite(pud_t pud) return pud; } -#define pud_write pud_write -static inline int pud_write(pud_t pud) -{ - return (pud_val(pud) & _REGION3_ENTRY_WRITE) != 0; -} - static inline pud_t pud_mkclean(pud_t pud) { if (pud_large(pud)) { diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index 33c0f8bb0f33..5335ba3c850e 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -75,7 +75,7 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, if (!(pmd_val(pmd) & _PAGE_VALID)) return 0; - if (!pmd_access_permitted(pmd, write)) + if (write && !pmd_write(pmd)) return 0; refs = 0; @@ -114,7 +114,7 @@ static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr, if (!(pud_val(pud) & _PAGE_VALID)) return 0; - if (!pud_access_permitted(pud, write)) + if (write && !pud_write(pud)) return 0; refs = 0; diff --git a/fs/dax.c b/fs/dax.c index 78b72c48374e..95981591977a 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -627,8 +627,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, if (pfn != pmd_pfn(*pmdp)) goto unlock_pmd; - if (!pmd_dirty(*pmdp) - && !pmd_access_permitted(*pmdp, WRITE)) + if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) goto unlock_pmd; flush_cache_page(vma, address, pfn); diff --git a/mm/gup.c b/mm/gup.c index d3fb60e5bfac..e0d82b6706d7 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -66,7 +66,7 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, */ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) { - return pte_access_permitted(pte, WRITE) || + return pte_write(pte) || ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)); } diff --git a/mm/hmm.c b/mm/hmm.c index 3a5c172af560..ea19742a5d60 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -391,11 +391,11 @@ again: if (pmd_protnone(pmd)) return hmm_vma_walk_clear(start, end, walk); - if (!pmd_access_permitted(pmd, write_fault)) + if (write_fault && !pmd_write(pmd)) return hmm_vma_walk_clear(start, end, walk); pfn = pmd_pfn(pmd) + pte_index(addr); - flag |= pmd_access_permitted(pmd, WRITE) ? HMM_PFN_WRITE : 0; + flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0; for (; addr < end; addr += PAGE_SIZE, i++, pfn++) pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag; return 0; @@ -456,11 +456,11 @@ again: continue; } - if (!pte_access_permitted(pte, write_fault)) + if (write_fault && !pte_write(pte)) goto fault; pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag; - pfns[i] |= pte_access_permitted(pte, WRITE) ? HMM_PFN_WRITE : 0; + pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0; continue; fault: diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2f2f5e774902..0e7ded98d114 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -870,7 +870,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, */ WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set"); - if (!pmd_access_permitted(*pmd, flags & FOLL_WRITE)) + if (flags & FOLL_WRITE && !pmd_write(*pmd)) return NULL; if (pmd_present(*pmd) && pmd_devmap(*pmd)) @@ -1012,7 +1012,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, assert_spin_locked(pud_lockptr(mm, pud)); - if (!pud_access_permitted(*pud, flags & FOLL_WRITE)) + if (flags & FOLL_WRITE && !pud_write(*pud)) return NULL; if (pud_present(*pud) && pud_devmap(*pud)) @@ -1386,7 +1386,7 @@ out_unlock: */ static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags) { - return pmd_access_permitted(pmd, WRITE) || + return pmd_write(pmd) || ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd)); } diff --git a/mm/memory.c b/mm/memory.c index cfaba6287702..ca5674cbaff2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3949,7 +3949,7 @@ static int handle_pte_fault(struct vm_fault *vmf) if (unlikely(!pte_same(*vmf->pte, entry))) goto unlock; if (vmf->flags & FAULT_FLAG_WRITE) { - if (!pte_access_permitted(entry, WRITE)) + if (!pte_write(entry)) return do_wp_page(vmf); entry = pte_mkdirty(entry); } @@ -4014,7 +4014,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, /* NUMA case for anonymous PUDs would go here */ - if (dirty && !pud_access_permitted(orig_pud, WRITE)) { + if (dirty && !pud_write(orig_pud)) { ret = wp_huge_pud(&vmf, orig_pud); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -4047,7 +4047,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf, orig_pmd); - if (dirty && !pmd_access_permitted(orig_pmd, WRITE)) { + if (dirty && !pmd_write(orig_pmd)) { ret = wp_huge_pmd(&vmf, orig_pmd); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -4337,7 +4337,7 @@ int follow_phys(struct vm_area_struct *vma, goto out; pte = *ptep; - if (!pte_access_permitted(pte, flags & FOLL_WRITE)) + if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; *prot = pgprot_val(pte_pgprot(pte)); From 1784f9144b143a1e8b19fe94083b040aa559182b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 5 Dec 2017 14:14:47 +0100 Subject: [PATCH 259/269] drivers/misc/intel/pti: Rename the header file to free up the namespace We'd like to use the 'PTI' acronym for 'Page Table Isolation' - free up the namespace by renaming the driver header to . (Also standardize the header guard name while at it.) Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: J Freyensee Cc: Greg Kroah-Hartman Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- drivers/misc/pti.c | 2 +- include/linux/{pti.h => intel-pti.h} | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) rename include/linux/{pti.h => intel-pti.h} (94%) diff --git a/drivers/misc/pti.c b/drivers/misc/pti.c index eda38cbe8530..41f2a9f6851d 100644 --- a/drivers/misc/pti.c +++ b/drivers/misc/pti.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/pti.h b/include/linux/intel-pti.h similarity index 94% rename from include/linux/pti.h rename to include/linux/intel-pti.h index b3ea01a3197e..2710d72de3c9 100644 --- a/include/linux/pti.h +++ b/include/linux/intel-pti.h @@ -22,8 +22,8 @@ * interface to write out it's contents for debugging a mobile system. */ -#ifndef PTI_H_ -#define PTI_H_ +#ifndef LINUX_INTEL_PTI_H_ +#define LINUX_INTEL_PTI_H_ /* offset for last dword of any PTI message. Part of MIPI P1149.7 */ #define PTI_LASTDWORD_DTS 0x30 @@ -40,4 +40,4 @@ struct pti_masterchannel *pti_request_masterchannel(u8 type, const char *thread_name); void pti_release_masterchannel(struct pti_masterchannel *mc); -#endif /*PTI_H_*/ +#endif /* LINUX_INTEL_PTI_H_ */ From a8b4db562e7283a1520f9e9730297ecaab7622ea Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Sun, 5 Nov 2017 18:27:51 -0800 Subject: [PATCH 260/269] x86/cpufeature: Add User-Mode Instruction Prevention definitions [ Note, this is a Git cherry-pick of the following commit: (limited to the cpufeatures.h file) 3522c2a6a4f3 ("x86/cpufeature: Add User-Mode Instruction Prevention definitions") ... for easier x86 PTI code testing and back-porting. ] User-Mode Instruction Prevention is a security feature present in new Intel processors that, when set, prevents the execution of a subset of instructions if such instructions are executed in user mode (CPL > 0). Attempting to execute such instructions causes a general protection exception. The subset of instructions comprises: * SGDT - Store Global Descriptor Table * SIDT - Store Interrupt Descriptor Table * SLDT - Store Local Descriptor Table * SMSW - Store Machine Status Word * STR - Store Task Register This feature is also added to the list of disabled-features to allow a cleaner handling of build-time configuration. Signed-off-by: Ricardo Neri Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Chen Yucong Cc: Chris Metcalf Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Huang Rui Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Ravi V. Shankar Cc: Shuah Khan Cc: Tony Luck Cc: Vlastimil Babka Cc: ricardo.neri@intel.com Link: http://lkml.kernel.org/r/1509935277-22138-7-git-send-email-ricardo.neri-calderon@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index cdf5be866863..c0b0e9e8aa66 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -296,6 +296,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ +#define X86_FEATURE_UMIP (16*32+ 2) /* User Mode Instruction Protection */ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ #define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ From f2dbad36c55e5d3a91dccbde6e8cae345fe5632f Mon Sep 17 00:00:00 2001 From: Rudolf Marek Date: Tue, 28 Nov 2017 22:01:06 +0100 Subject: [PATCH 261/269] x86: Make X86_BUG_FXSAVE_LEAK detectable in CPUID on AMD [ Note, this is a Git cherry-pick of the following commit: 2b67799bdf25 ("x86: Make X86_BUG_FXSAVE_LEAK detectable in CPUID on AMD") ... for easier x86 PTI code testing and back-porting. ] The latest AMD AMD64 Architecture Programmer's Manual adds a CPUID feature XSaveErPtr (CPUID_Fn80000008_EBX[2]). If this feature is set, the FXSAVE, XSAVE, FXSAVEOPT, XSAVEC, XSAVES / FXRSTOR, XRSTOR, XRSTORS always save/restore error pointers, thus making the X86_BUG_FXSAVE_LEAK workaround obsolete on such CPUs. Signed-Off-By: Rudolf Marek Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Tested-by: Borislav Petkov Cc: Andy Lutomirski Link: https://lkml.kernel.org/r/bdcebe90-62c5-1f05-083c-eba7f08b2540@assembler.cz Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/amd.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index c0b0e9e8aa66..800104c8a3ed 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -266,6 +266,7 @@ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ +#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index d58184b7cd44..bcb75dc97d44 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -804,8 +804,11 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x17: init_amd_zn(c); break; } - /* Enable workaround for FXSAVE leak */ - if (c->x86 >= 6) + /* + * Enable workaround for FXSAVE leak on CPUs + * without a XSaveErPtr feature + */ + if ((c->x86 >= 6) && (!cpu_has(c, X86_FEATURE_XSAVEERPTR))) set_cpu_bug(c, X86_BUG_FXSAVE_LEAK); cpu_detect_cache_sizes(c); From 2fe1bc1f501d55e5925b4035bcd85781adc76c63 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 31 Aug 2017 14:46:30 -0700 Subject: [PATCH 262/269] perf/x86: Enable free running PEBS for REGS_USER/INTR [ Note, this is a Git cherry-pick of the following commit: a47ba4d77e12 ("perf/x86: Enable free running PEBS for REGS_USER/INTR") ... for easier x86 PTI code testing and back-porting. ] Currently free running PEBS is disabled when user or interrupt registers are requested. Most of the registers are actually available in the PEBS record and can be supported. So we just need to check for the supported registers and then allow it: it is all except for the segment register. For user registers this only works when the counter is limited to ring 3 only, so this also needs to be checked. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170831214630.21892-1-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 4 ++++ arch/x86/events/perf_event.h | 24 +++++++++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 9fb9a1f1e47b..43445da30cea 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2958,6 +2958,10 @@ static unsigned long intel_pmu_free_running_flags(struct perf_event *event) if (event->attr.use_clockid) flags &= ~PERF_SAMPLE_TIME; + if (!event->attr.exclude_kernel) + flags &= ~PERF_SAMPLE_REGS_USER; + if (event->attr.sample_regs_user & ~PEBS_REGS) + flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); return flags; } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 4196f81ec0e1..f7aaadf9331f 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -85,13 +85,15 @@ struct amd_nb { * Flags PEBS can handle without an PMI. * * TID can only be handled by flushing at context switch. + * REGS_USER can be handled for events limited to ring 3. * */ #define PEBS_FREERUNNING_FLAGS \ (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ - PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR) + PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \ + PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER) /* * A debug store configuration. @@ -110,6 +112,26 @@ struct debug_store { u64 pebs_event_reset[MAX_PEBS_EVENTS]; }; +#define PEBS_REGS \ + (PERF_REG_X86_AX | \ + PERF_REG_X86_BX | \ + PERF_REG_X86_CX | \ + PERF_REG_X86_DX | \ + PERF_REG_X86_DI | \ + PERF_REG_X86_SI | \ + PERF_REG_X86_SP | \ + PERF_REG_X86_BP | \ + PERF_REG_X86_IP | \ + PERF_REG_X86_FLAGS | \ + PERF_REG_X86_R8 | \ + PERF_REG_X86_R9 | \ + PERF_REG_X86_R10 | \ + PERF_REG_X86_R11 | \ + PERF_REG_X86_R12 | \ + PERF_REG_X86_R13 | \ + PERF_REG_X86_R14 | \ + PERF_REG_X86_R15) + /* * Per register state. */ From ab95477e7cb35557ecfc837687007b646bab9a9f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 12 Dec 2017 02:25:31 +0100 Subject: [PATCH 263/269] bpf: fix build issues on um due to mising bpf_perf_event.h [ Note, this is a Git cherry-pick of the following commit: a23f06f06dbe ("bpf: fix build issues on um due to mising bpf_perf_event.h") ... for easier x86 PTI code testing and back-porting. ] Since c895f6f703ad ("bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program type") um (uml) won't build on i386 or x86_64: [...] CC init/main.o In file included from ../include/linux/perf_event.h:18:0, from ../include/linux/trace_events.h:10, from ../include/trace/syscall.h:7, from ../include/linux/syscalls.h:82, from ../init/main.c:20: ../include/uapi/linux/bpf_perf_event.h:11:32: fatal error: asm/bpf_perf_event.h: No such file or directory #include [...] Lets add missing bpf_perf_event.h also to um arch. This seems to be the only one still missing. Fixes: c895f6f703ad ("bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program type") Reported-by: Randy Dunlap Suggested-by: Richard Weinberger Signed-off-by: Daniel Borkmann Tested-by: Randy Dunlap Cc: Hendrik Brueckner Cc: Richard Weinberger Acked-by: Alexei Starovoitov Acked-by: Richard Weinberger Signed-off-by: Alexei Starovoitov Signed-off-by: Ingo Molnar --- arch/um/include/asm/Kbuild | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 50a32c33d729..73c57f614c9e 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -1,4 +1,5 @@ generic-y += barrier.h +generic-y += bpf_perf_event.h generic-y += bug.h generic-y += clkdev.h generic-y += current.h From c2bc66082e1048c7573d72e62f597bdc5ce13fea Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Oct 2017 11:22:47 +0100 Subject: [PATCH 264/269] locking/barriers: Add implicit smp_read_barrier_depends() to READ_ONCE() [ Note, this is a Git cherry-pick of the following commit: 76ebbe78f739 ("locking/barriers: Add implicit smp_read_barrier_depends() to READ_ONCE()") ... for easier x86 PTI code testing and back-porting. ] In preparation for the removal of lockless_dereference(), which is the same as READ_ONCE() on all architectures other than Alpha, add an implicit smp_read_barrier_depends() to READ_ONCE() so that it can be used to head dependency chains on all architectures. Signed-off-by: Will Deacon Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1508840570-22169-3-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 202710420d6d..712cd8bb00b4 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -341,6 +341,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s __read_once_size(&(x), __u.__c, sizeof(x)); \ else \ __read_once_size_nocheck(&(x), __u.__c, sizeof(x)); \ + smp_read_barrier_depends(); /* Enforce dependency ordering from x */ \ __u.__val; \ }) #define READ_ONCE(x) __READ_ONCE(x, 1) From 3382290ed2d5e275429cef510ab21889d3ccd164 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Oct 2017 11:22:48 +0100 Subject: [PATCH 265/269] locking/barriers: Convert users of lockless_dereference() to READ_ONCE() [ Note, this is a Git cherry-pick of the following commit: 506458efaf15 ("locking/barriers: Convert users of lockless_dereference() to READ_ONCE()") ... for easier x86 PTI code testing and back-porting. ] READ_ONCE() now has an implicit smp_read_barrier_depends() call, so it can be used instead of lockless_dereference() without any change in semantics. Signed-off-by: Will Deacon Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1508840570-22169-4-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 2 +- arch/x86/include/asm/mmu_context.h | 4 ++-- arch/x86/kernel/ldt.c | 2 +- drivers/md/dm-mpath.c | 20 ++++++++++---------- fs/dcache.c | 4 ++-- fs/overlayfs/ovl_entry.h | 2 +- fs/overlayfs/readdir.c | 2 +- include/linux/rculist.h | 4 ++-- include/linux/rcupdate.h | 4 ++-- kernel/events/core.c | 4 ++-- kernel/seccomp.c | 2 +- kernel/task_work.c | 2 +- mm/slab.h | 2 +- 13 files changed, 27 insertions(+), 27 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 80534d3c2480..589af1eec7c1 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2371,7 +2371,7 @@ static unsigned long get_segment_base(unsigned int segment) struct ldt_struct *ldt; /* IRQs are off, so this synchronizes with smp_store_release */ - ldt = lockless_dereference(current->active_mm->context.ldt); + ldt = READ_ONCE(current->active_mm->context.ldt); if (!ldt || idx >= ldt->nr_entries) return 0; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 6699fc441644..6d16d15d09a0 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -73,8 +73,8 @@ static inline void load_mm_ldt(struct mm_struct *mm) #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; - /* lockless_dereference synchronizes with smp_store_release */ - ldt = lockless_dereference(mm->context.ldt); + /* READ_ONCE synchronizes with smp_store_release */ + ldt = READ_ONCE(mm->context.ldt); /* * Any change to mm->context.ldt is followed by an IPI to all diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index ae5615b03def..1c1eae961340 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -103,7 +103,7 @@ static void finalize_ldt_struct(struct ldt_struct *ldt) static void install_ldt(struct mm_struct *current_mm, struct ldt_struct *ldt) { - /* Synchronizes with lockless_dereference in load_mm_ldt. */ + /* Synchronizes with READ_ONCE in load_mm_ldt. */ smp_store_release(¤t_mm->context.ldt, ldt); /* Activate the LDT for all CPUs using current_mm. */ diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 11f273d2f018..3f88c9d32f7e 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -366,7 +366,7 @@ static struct pgpath *choose_path_in_pg(struct multipath *m, pgpath = path_to_pgpath(path); - if (unlikely(lockless_dereference(m->current_pg) != pg)) { + if (unlikely(READ_ONCE(m->current_pg) != pg)) { /* Only update current_pgpath if pg changed */ spin_lock_irqsave(&m->lock, flags); m->current_pgpath = pgpath; @@ -390,7 +390,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) } /* Were we instructed to switch PG? */ - if (lockless_dereference(m->next_pg)) { + if (READ_ONCE(m->next_pg)) { spin_lock_irqsave(&m->lock, flags); pg = m->next_pg; if (!pg) { @@ -406,7 +406,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) /* Don't change PG until it has no remaining paths */ check_current_pg: - pg = lockless_dereference(m->current_pg); + pg = READ_ONCE(m->current_pg); if (pg) { pgpath = choose_path_in_pg(m, pg, nr_bytes); if (!IS_ERR_OR_NULL(pgpath)) @@ -473,7 +473,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, struct request *clone; /* Do we need to select a new pgpath? */ - pgpath = lockless_dereference(m->current_pgpath); + pgpath = READ_ONCE(m->current_pgpath); if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) pgpath = choose_pgpath(m, nr_bytes); @@ -535,7 +535,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m bool queue_io; /* Do we need to select a new pgpath? */ - pgpath = lockless_dereference(m->current_pgpath); + pgpath = READ_ONCE(m->current_pgpath); queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); if (!pgpath || !queue_io) pgpath = choose_pgpath(m, nr_bytes); @@ -1804,7 +1804,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, struct pgpath *current_pgpath; int r; - current_pgpath = lockless_dereference(m->current_pgpath); + current_pgpath = READ_ONCE(m->current_pgpath); if (!current_pgpath) current_pgpath = choose_pgpath(m, 0); @@ -1826,7 +1826,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, } if (r == -ENOTCONN) { - if (!lockless_dereference(m->current_pg)) { + if (!READ_ONCE(m->current_pg)) { /* Path status changed, redo selection */ (void) choose_pgpath(m, 0); } @@ -1895,9 +1895,9 @@ static int multipath_busy(struct dm_target *ti) return (m->queue_mode != DM_TYPE_MQ_REQUEST_BASED); /* Guess which priority_group will be used at next mapping time */ - pg = lockless_dereference(m->current_pg); - next_pg = lockless_dereference(m->next_pg); - if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg)) + pg = READ_ONCE(m->current_pg); + next_pg = READ_ONCE(m->next_pg); + if (unlikely(!READ_ONCE(m->current_pgpath) && next_pg)) pg = next_pg; if (!pg) { diff --git a/fs/dcache.c b/fs/dcache.c index f90141387f01..34c852af215c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -231,7 +231,7 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c { /* * Be careful about RCU walk racing with rename: - * use 'lockless_dereference' to fetch the name pointer. + * use 'READ_ONCE' to fetch the name pointer. * * NOTE! Even if a rename will mean that the length * was not loaded atomically, we don't care. The @@ -245,7 +245,7 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c * early because the data cannot match (there can * be no NUL in the ct/tcount data) */ - const unsigned char *cs = lockless_dereference(dentry->d_name.name); + const unsigned char *cs = READ_ONCE(dentry->d_name.name); return dentry_string_cmp(cs, ct, tcount); } diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 25d9b5adcd42..36b49bd09264 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -77,5 +77,5 @@ static inline struct ovl_inode *OVL_I(struct inode *inode) static inline struct dentry *ovl_upperdentry_dereference(struct ovl_inode *oi) { - return lockless_dereference(oi->__upperdentry); + return READ_ONCE(oi->__upperdentry); } diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 698b74dd750e..c310e3ff7f3f 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -754,7 +754,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) { struct inode *inode = file_inode(file); - realfile = lockless_dereference(od->upperfile); + realfile = READ_ONCE(od->upperfile); if (!realfile) { struct path upperpath; diff --git a/include/linux/rculist.h b/include/linux/rculist.h index c2cdd45a880a..127f534fec94 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -275,7 +275,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_entry_rcu(ptr, type, member) \ - container_of(lockless_dereference(ptr), type, member) + container_of(READ_ONCE(ptr), type, member) /* * Where are list_empty_rcu() and list_first_entry_rcu()? @@ -368,7 +368,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, * example is when items are added to the list, but never deleted. */ #define list_entry_lockless(ptr, type, member) \ - container_of((typeof(ptr))lockless_dereference(ptr), type, member) + container_of((typeof(ptr))READ_ONCE(ptr), type, member) /** * list_for_each_entry_lockless - iterate over rcu list of given type diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 1a9f70d44af9..a6ddc42f87a5 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -346,7 +346,7 @@ static inline void rcu_preempt_sleep_check(void) { } #define __rcu_dereference_check(p, c, space) \ ({ \ /* Dependency order vs. p above. */ \ - typeof(*p) *________p1 = (typeof(*p) *__force)lockless_dereference(p); \ + typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \ rcu_dereference_sparse(p, space); \ ((typeof(*p) __force __kernel *)(________p1)); \ @@ -360,7 +360,7 @@ static inline void rcu_preempt_sleep_check(void) { } #define rcu_dereference_raw(p) \ ({ \ /* Dependency order vs. p above. */ \ - typeof(p) ________p1 = lockless_dereference(p); \ + typeof(p) ________p1 = READ_ONCE(p); \ ((typeof(*p) __force __kernel *)(________p1)); \ }) diff --git a/kernel/events/core.c b/kernel/events/core.c index 10cdb9c26b5d..6eee4ed97af0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4233,7 +4233,7 @@ static void perf_remove_from_owner(struct perf_event *event) * indeed free this event, otherwise we need to serialize on * owner->perf_event_mutex. */ - owner = lockless_dereference(event->owner); + owner = READ_ONCE(event->owner); if (owner) { /* * Since delayed_put_task_struct() also drops the last @@ -4330,7 +4330,7 @@ again: * Cannot change, child events are not migrated, see the * comment with perf_event_ctx_lock_nested(). */ - ctx = lockless_dereference(child->ctx); + ctx = READ_ONCE(child->ctx); /* * Since child_mutex nests inside ctx::mutex, we must jump * through hoops. We start by grabbing a reference on the ctx. diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 418a1c045933..5f0dfb2abb8d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -190,7 +190,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, u32 ret = SECCOMP_RET_ALLOW; /* Make sure cross-thread synced filter points somewhere sane. */ struct seccomp_filter *f = - lockless_dereference(current->seccomp.filter); + READ_ONCE(current->seccomp.filter); /* Ensure unexpected behavior doesn't result in failing open. */ if (unlikely(WARN_ON(f == NULL))) diff --git a/kernel/task_work.c b/kernel/task_work.c index 5718b3ea202a..0fef395662a6 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -68,7 +68,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); - while ((work = lockless_dereference(*pprev))) { + while ((work = READ_ONCE(*pprev))) { if (work->func != func) pprev = &work->next; else if (cmpxchg(pprev, work, work->next) == work) diff --git a/mm/slab.h b/mm/slab.h index 028cdc7df67e..86d7c7d860f9 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -259,7 +259,7 @@ cache_from_memcg_idx(struct kmem_cache *s, int idx) * memcg_caches issues a write barrier to match this (see * memcg_create_kmem_cache()). */ - cachep = lockless_dereference(arr->entries[idx]); + cachep = READ_ONCE(arr->entries[idx]); rcu_read_unlock(); return cachep; From 2aeb07365bcd489620f71390a7d2031cd4dfb83e Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 15 Nov 2017 17:36:35 -0800 Subject: [PATCH 266/269] x86/mm/kasan: Don't use vmemmap_populate() to initialize shadow [ Note, this is a Git cherry-pick of the following commit: d17a1d97dc20: ("x86/mm/kasan: don't use vmemmap_populate() to initialize shadow") ... for easier x86 PTI code testing and back-porting. ] The KASAN shadow is currently mapped using vmemmap_populate() since that provides a semi-convenient way to map pages into init_top_pgt. However, since that no longer zeroes the mapped pages, it is not suitable for KASAN, which requires zeroed shadow memory. Add kasan_populate_shadow() interface and use it instead of vmemmap_populate(). Besides, this allows us to take advantage of gigantic pages and use them to populate the shadow, which should save us some memory wasted on page tables and reduce TLB pressure. Link: http://lkml.kernel.org/r/20171103185147.2688-2-pasha.tatashin@oracle.com Signed-off-by: Andrey Ryabinin Signed-off-by: Pavel Tatashin Cc: Andy Lutomirski Cc: Steven Sistare Cc: Daniel Jordan Cc: Bob Picco Cc: Michal Hocko Cc: Alexander Potapenko Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Mark Rutland Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Sam Ravnborg Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/mm/kasan_init_64.c | 143 ++++++++++++++++++++++++++++++++++-- 2 files changed, 137 insertions(+), 8 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4ae940a0ed3b..665eba1b6103 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -108,7 +108,7 @@ config X86 select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE select HAVE_ARCH_JUMP_LABEL - select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP + select HAVE_ARCH_KASAN if X86_64 select HAVE_ARCH_KGDB select HAVE_ARCH_KMEMCHECK select HAVE_ARCH_MMAP_RND_BITS if MMU diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 2b60dc6e64b1..99dfed6dfef8 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -4,12 +4,14 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -18,7 +20,134 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES]; static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); -static int __init map_range(struct range *range) +static __init void *early_alloc(size_t size, int nid) +{ + return memblock_virt_alloc_try_nid_nopanic(size, size, + __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); +} + +static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, + unsigned long end, int nid) +{ + pte_t *pte; + + if (pmd_none(*pmd)) { + void *p; + + if (boot_cpu_has(X86_FEATURE_PSE) && + ((end - addr) == PMD_SIZE) && + IS_ALIGNED(addr, PMD_SIZE)) { + p = early_alloc(PMD_SIZE, nid); + if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL)) + return; + else if (p) + memblock_free(__pa(p), PMD_SIZE); + } + + p = early_alloc(PAGE_SIZE, nid); + pmd_populate_kernel(&init_mm, pmd, p); + } + + pte = pte_offset_kernel(pmd, addr); + do { + pte_t entry; + void *p; + + if (!pte_none(*pte)) + continue; + + p = early_alloc(PAGE_SIZE, nid); + entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL); + set_pte_at(&init_mm, addr, pte, entry); + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +static void __init kasan_populate_pud(pud_t *pud, unsigned long addr, + unsigned long end, int nid) +{ + pmd_t *pmd; + unsigned long next; + + if (pud_none(*pud)) { + void *p; + + if (boot_cpu_has(X86_FEATURE_GBPAGES) && + ((end - addr) == PUD_SIZE) && + IS_ALIGNED(addr, PUD_SIZE)) { + p = early_alloc(PUD_SIZE, nid); + if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL)) + return; + else if (p) + memblock_free(__pa(p), PUD_SIZE); + } + + p = early_alloc(PAGE_SIZE, nid); + pud_populate(&init_mm, pud, p); + } + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (!pmd_large(*pmd)) + kasan_populate_pmd(pmd, addr, next, nid); + } while (pmd++, addr = next, addr != end); +} + +static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr, + unsigned long end, int nid) +{ + pud_t *pud; + unsigned long next; + + if (p4d_none(*p4d)) { + void *p = early_alloc(PAGE_SIZE, nid); + + p4d_populate(&init_mm, p4d, p); + } + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (!pud_large(*pud)) + kasan_populate_pud(pud, addr, next, nid); + } while (pud++, addr = next, addr != end); +} + +static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr, + unsigned long end, int nid) +{ + void *p; + p4d_t *p4d; + unsigned long next; + + if (pgd_none(*pgd)) { + p = early_alloc(PAGE_SIZE, nid); + pgd_populate(&init_mm, pgd, p); + } + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + kasan_populate_p4d(p4d, addr, next, nid); + } while (p4d++, addr = next, addr != end); +} + +static void __init kasan_populate_shadow(unsigned long addr, unsigned long end, + int nid) +{ + pgd_t *pgd; + unsigned long next; + + addr = addr & PAGE_MASK; + end = round_up(end, PAGE_SIZE); + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, end); + kasan_populate_pgd(pgd, addr, next, nid); + } while (pgd++, addr = next, addr != end); +} + +static void __init map_range(struct range *range) { unsigned long start; unsigned long end; @@ -26,7 +155,7 @@ static int __init map_range(struct range *range) start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start)); end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end)); - return vmemmap_populate(start, end, NUMA_NO_NODE); + kasan_populate_shadow(start, end, early_pfn_to_nid(range->start)); } static void __init clear_pgds(unsigned long start, @@ -189,16 +318,16 @@ void __init kasan_init(void) if (pfn_mapped[i].end == 0) break; - if (map_range(&pfn_mapped[i])) - panic("kasan: unable to allocate shadow!"); + map_range(&pfn_mapped[i]); } + kasan_populate_zero_shadow( kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), kasan_mem_to_shadow((void *)__START_KERNEL_map)); - vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext), - (unsigned long)kasan_mem_to_shadow(_end), - NUMA_NO_NODE); + kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), + (unsigned long)kasan_mem_to_shadow(_end), + early_pfn_to_nid(__pa(_stext))); kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), (void *)KASAN_SHADOW_END); From b9f5fb1800d8a4a3bc6cd3152c5f3d252986cf79 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 10 Nov 2017 15:57:21 +0100 Subject: [PATCH 267/269] cramfs: fix MTD dependency With CONFIG_MTD=m and CONFIG_CRAMFS=y, we now get a link failure: fs/cramfs/inode.o: In function `cramfs_mount': inode.c:(.text+0x220): undefined reference to `mount_mtd' fs/cramfs/inode.o: In function `cramfs_mtd_fill_super': inode.c:(.text+0x6d8): undefined reference to `mtd_point' inode.c:(.text+0xae4): undefined reference to `mtd_unpoint' This adds a more specific Kconfig dependency to avoid the broken configuration. Alternatively we could make CRAMFS itself depend on "MTD || !MTD" with a similar result. Fixes: 99c18ce580c6 ("cramfs: direct memory access support") Signed-off-by: Arnd Bergmann Signed-off-by: Nicolas Pitre Signed-off-by: Linus Torvalds --- fs/cramfs/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig index f937082f3244..58e2fe40b2a0 100644 --- a/fs/cramfs/Kconfig +++ b/fs/cramfs/Kconfig @@ -34,6 +34,7 @@ config CRAMFS_BLOCKDEV config CRAMFS_MTD bool "Support CramFs image directly mapped in physical memory" depends on CRAMFS && MTD + depends on CRAMFS=m || MTD=y default y if !CRAMFS_BLOCKDEV help This option allows the CramFs driver to load data directly from From 779f4e1c6c7c661db40dfebd6dd6bda7b5f88aa3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Dec 2017 11:28:38 -0800 Subject: [PATCH 268/269] Revert "exec: avoid RLIMIT_STACK races with prlimit()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 04e35f4495dd560db30c25efca4eecae8ec8c375. SELinux runs with secureexec for all non-"noatsecure" domain transitions, which means lots of processes end up hitting the stack hard-limit change that was introduced in order to fix a race with prlimit(). That race fix will need to be redesigned. Reported-by: Laura Abbott Reported-by: Tomáš Trnka Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Linus Torvalds --- fs/exec.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 156f56acfe8e..5688b5e1b937 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1339,15 +1339,10 @@ void setup_new_exec(struct linux_binprm * bprm) * avoid bad behavior from the prior rlimits. This has to * happen before arch_pick_mmap_layout(), which examines * RLIMIT_STACK, but after the point of no return to avoid - * races from other threads changing the limits. This also - * must be protected from races with prlimit() calls. + * needing to clean up the change on failure. */ - task_lock(current->group_leader); if (current->signal->rlim[RLIMIT_STACK].rlim_cur > _STK_LIM) current->signal->rlim[RLIMIT_STACK].rlim_cur = _STK_LIM; - if (current->signal->rlim[RLIMIT_STACK].rlim_max > _STK_LIM) - current->signal->rlim[RLIMIT_STACK].rlim_max = _STK_LIM; - task_unlock(current->group_leader); } arch_pick_mmap_layout(current->mm); From 1291a0d5049dbc06baaaf66a9ff3f53db493b19b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 17 Dec 2017 18:59:59 -0800 Subject: [PATCH 269/269] Linux 4.15-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 3f4d157add54..7e02f951b284 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 4 PATCHLEVEL = 15 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Fearless Coyote # *DOCUMENTATION*