From f9f0a7d0dcbd19e9705e8b96a4b408f035e25c93 Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Mon, 8 Jul 2013 15:59:35 -0700 Subject: [PATCH 001/118] drivers/dma/iop-adma.c: fix new warnings The recent "drivers/dma: remove unused support for MEMSET operations" change has fallout from lack of build testing by the author. This fixes: drivers/dma/iop-adma.c:1020:13: warning: unused variable 'dma_addr' [-Wunused-variable] drivers/dma/iop-adma.c:1519:2: warning: format '%s' expects a matching 'char *' argument [-Wformat=] Signed-off-by: Olof Johansson Cc: Bartlomiej Zolnierkiewicz Cc: Kyungmin Park Cc: Sebastian Hesselbarth Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dma/iop-adma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c index c9cc08c2dbba..cc727ec78c4e 100644 --- a/drivers/dma/iop-adma.c +++ b/drivers/dma/iop-adma.c @@ -1017,7 +1017,7 @@ iop_adma_xor_val_self_test(struct iop_adma_device *device) struct page *xor_srcs[IOP_ADMA_NUM_SRC_TEST]; struct page *zero_sum_srcs[IOP_ADMA_NUM_SRC_TEST + 1]; dma_addr_t dma_srcs[IOP_ADMA_NUM_SRC_TEST + 1]; - dma_addr_t dma_addr, dest_dma; + dma_addr_t dest_dma; struct dma_async_tx_descriptor *tx; struct dma_chan *dma_chan; dma_cookie_t cookie; @@ -1516,7 +1516,7 @@ static int iop_adma_probe(struct platform_device *pdev) goto err_free_iop_chan; } - dev_info(&pdev->dev, "Intel(R) IOP: ( %s%s%s%s%s%s%s)\n", + dev_info(&pdev->dev, "Intel(R) IOP: ( %s%s%s%s%s%s)\n", dma_has_cap(DMA_PQ, dma_dev->cap_mask) ? "pq " : "", dma_has_cap(DMA_PQ_VAL, dma_dev->cap_mask) ? "pq_val " : "", dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? 
"xor " : "", From 79f6530cb59e2a0af6953742a33cc29e98ca631c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 8 Jul 2013 15:59:36 -0700 Subject: [PATCH 002/118] audit: fix mq_open and mq_unlink to add the MQ root as a hidden parent audit_names record The old audit PATH records for mq_open looked like this: type=PATH msg=audit(1366282323.982:869): item=1 name=(null) inode=6777 dev=00:0c mode=041777 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:tmpfs_t:s15:c0.c1023 type=PATH msg=audit(1366282323.982:869): item=0 name="test_mq" inode=26732 dev=00:0c mode=0100700 ouid=0 ogid=0 rdev=00:00 obj=staff_u:object_r:user_tmpfs_t:s15:c0.c1023 ...with the audit related changes that went into 3.7, they now look like this: type=PATH msg=audit(1366282236.776:3606): item=2 name=(null) inode=66655 dev=00:0c mode=0100700 ouid=0 ogid=0 rdev=00:00 obj=staff_u:object_r:user_tmpfs_t:s15:c0.c1023 type=PATH msg=audit(1366282236.776:3606): item=1 name=(null) inode=6926 dev=00:0c mode=041777 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:tmpfs_t:s15:c0.c1023 type=PATH msg=audit(1366282236.776:3606): item=0 name="test_mq" Both of these look wrong to me. As Steve Grubb pointed out: "What we need is 1 PATH record that identifies the MQ. The other PATH records probably should not be there." Fix it to record the mq root as a parent, and flag it such that it should be hidden from view when the names are logged, since the root of the mq filesystem isn't terribly interesting. With this change, we get a single PATH record that looks more like this: type=PATH msg=audit(1368021604.836:484): item=0 name="test_mq" inode=16914 dev=00:0c mode=0100644 ouid=0 ogid=0 rdev=00:00 obj=unconfined_u:object_r:user_tmpfs_t:s0 In order to do this, a new audit_inode_parent_hidden() function is added. If we do it this way, then we avoid having the existing callers of audit_inode needing to do any sort of flag conversion if auditing is inactive. 
Signed-off-by: Jeff Layton Reported-by: Jiri Jaburek Cc: Steve Grubb Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/audit.h | 26 ++++++++++++++++++++++---- ipc/mqueue.c | 2 ++ kernel/audit.h | 1 + kernel/auditsc.c | 12 +++++++++--- 4 files changed, 34 insertions(+), 7 deletions(-) diff --git a/include/linux/audit.h b/include/linux/audit.h index b20b03852f21..729a4d165bcc 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -103,8 +103,11 @@ extern void __audit_syscall_exit(int ret_success, long ret_value); extern struct filename *__audit_reusename(const __user char *uptr); extern void __audit_getname(struct filename *name); extern void audit_putname(struct filename *name); + +#define AUDIT_INODE_PARENT 1 /* dentry represents the parent */ +#define AUDIT_INODE_HIDDEN 2 /* audit record should be hidden */ extern void __audit_inode(struct filename *name, const struct dentry *dentry, - unsigned int parent); + unsigned int flags); extern void __audit_inode_child(const struct inode *parent, const struct dentry *dentry, const unsigned char type); @@ -148,10 +151,22 @@ static inline void audit_getname(struct filename *name) if (unlikely(!audit_dummy_context())) __audit_getname(name); } -static inline void audit_inode(struct filename *name, const struct dentry *dentry, +static inline void audit_inode(struct filename *name, + const struct dentry *dentry, unsigned int parent) { + if (unlikely(!audit_dummy_context())) { + unsigned int flags = 0; + if (parent) + flags |= AUDIT_INODE_PARENT; + __audit_inode(name, dentry, flags); + } +} +static inline void audit_inode_parent_hidden(struct filename *name, + const struct dentry *dentry) +{ if (unlikely(!audit_dummy_context())) - __audit_inode(name, dentry, parent); + __audit_inode(name, dentry, + AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN); } static inline void audit_inode_child(const struct inode *parent, const struct dentry *dentry, @@ -311,7 +326,7 @@ static 
inline void audit_putname(struct filename *name) { } static inline void __audit_inode(struct filename *name, const struct dentry *dentry, - unsigned int parent) + unsigned int flags) { } static inline void __audit_inode_child(const struct inode *parent, const struct dentry *dentry, @@ -321,6 +336,9 @@ static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int parent) { } +static inline void audit_inode_parent_hidden(struct filename *name, + const struct dentry *dentry) +{ } static inline void audit_inode_child(const struct inode *parent, const struct dentry *dentry, const unsigned char type) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index e4e47f647446..ae1996d3c539 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -823,6 +823,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode, error = ro; goto out; } + audit_inode_parent_hidden(name, root); filp = do_create(ipc_ns, root->d_inode, &path, oflag, mode, u_attr ? &attr : NULL); @@ -868,6 +869,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) if (IS_ERR(name)) return PTR_ERR(name); + audit_inode_parent_hidden(name, mnt->mnt_root); err = mnt_want_write(mnt); if (err) goto out_name; diff --git a/kernel/audit.h b/kernel/audit.h index 1c95131ef760..123c9b7c3979 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -85,6 +85,7 @@ struct audit_names { struct filename *name; int name_len; /* number of chars to log */ + bool hidden; /* don't log this record */ bool name_put; /* call __putname()? 
*/ unsigned long ino; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3c8a601324a2..9845cb32b60a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1399,8 +1399,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts } i = 0; - list_for_each_entry(n, &context->names_list, list) + list_for_each_entry(n, &context->names_list, list) { + if (n->hidden) + continue; audit_log_name(context, n, NULL, i++, &call_panic); + } /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); @@ -1769,14 +1772,15 @@ void audit_putname(struct filename *name) * __audit_inode - store the inode and device from a lookup * @name: name being audited * @dentry: dentry being audited - * @parent: does this dentry represent the parent? + * @flags: attributes for this particular entry */ void __audit_inode(struct filename *name, const struct dentry *dentry, - unsigned int parent) + unsigned int flags) { struct audit_context *context = current->audit_context; const struct inode *inode = dentry->d_inode; struct audit_names *n; + bool parent = flags & AUDIT_INODE_PARENT; if (!context->in_syscall) return; @@ -1831,6 +1835,8 @@ out: if (parent) { n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; n->type = AUDIT_TYPE_PARENT; + if (flags & AUDIT_INODE_HIDDEN) + n->hidden = true; } else { n->name_len = AUDIT_NAME_FULL; n->type = AUDIT_TYPE_NORMAL; From 6beb8a23b50d38a003e80c5f16b50c56e8ae3387 Mon Sep 17 00:00:00 2001 From: "Raphael S. Carvalho" Date: Mon, 8 Jul 2013 15:59:37 -0700 Subject: [PATCH 003/118] kernel/auditfilter.c: fixing build warning kernel/auditfilter.c:426: warning: this decimal constant is unsigned only in ISO C90 Signed-off-by: Raphael S. 
Carvalho Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 6bd4a90d1991..0ee9eff866d6 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -423,7 +423,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->lsm_rule = NULL; /* Support legacy tests for a valid loginuid */ - if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) { + if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295U)) { f->type = AUDIT_LOGINUID_SET; f->val = 0; } From 2f992ee85aaa7dfd2bda43efe4493af1e108d054 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Mon, 8 Jul 2013 15:59:38 -0700 Subject: [PATCH 004/118] kernel/auditfilter.c: fix leak in audit_add_rule() error path If both 'tree' and 'watch' are valid we must call audit_put_tree(), just like the preceding code within audit_add_rule(). Signed-off-by: Chen Gang Cc: Al Viro Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditfilter.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 0ee9eff866d6..3d15c66b7f0b 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -865,6 +865,12 @@ static inline int audit_add_rule(struct audit_entry *entry) err = audit_add_watch(&entry->rule, &list); if (err) { mutex_unlock(&audit_filter_mutex); + /* + * normally audit_add_tree_rule() will free it + * on failure + */ + if (tree) + audit_put_tree(tree); goto error; } } From b9ce54c9f59894e787e3067d2f758c297fcd6fd0 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Mon, 8 Jul 2013 15:59:39 -0700 Subject: [PATCH 005/118] audit: Fix decimal constant description Use proper decimal type for comparison with u32. 
Compilation warning was introduced by 780a7654 ("audit: Make testing for a valid loginuid explicit.") kernel/auditfilter.c: In function 'audit_data_to_entry': kernel/auditfilter.c:426:3: warning: this decimal constant is unsigned only in ISO C90 [enabled by default] if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) { Signed-off-by: Michal Simek Cc: Al Viro Cc: Eric Paris Acked-by: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 3d15c66b7f0b..f7aee8be7fb2 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -423,7 +423,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->lsm_rule = NULL; /* Support legacy tests for a valid loginuid */ - if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295U)) { + if ((f->type == AUDIT_LOGINUID) && (f->val == ~0U)) { f->type = AUDIT_LOGINUID_SET; f->val = 0; } From de1e0c40aceb9d5bff09c3a3b97b2f1b178af53f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 8 Jul 2013 15:59:40 -0700 Subject: [PATCH 006/118] fanotify: info leak in copy_event_to_user() The ->reserved field isn't cleared so we leak one byte of stack information to userspace. 
Signed-off-by: Dan Carpenter Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify_user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 1ea52f7c031f..e16076d386c4 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -122,6 +122,7 @@ static int fill_event_metadata(struct fsnotify_group *group, metadata->event_len = FAN_EVENT_METADATA_LEN; metadata->metadata_len = FAN_EVENT_METADATA_LEN; metadata->vers = FANOTIFY_METADATA_VERSION; + metadata->reserved = 0; metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; metadata->pid = pid_vnr(event->tgid); if (unlikely(event->mask & FAN_Q_OVERFLOW)) From 7b18527c4a95397b443c8c22f75634d5d11c9d47 Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Mon, 8 Jul 2013 15:59:42 -0700 Subject: [PATCH 007/118] fanotify: fix races when adding/removing marks For both adding an event to an existing mark and destroying a mark we first have to find it via fsnotify_find_[inode|vfsmount]_mark(). But getting the mark and adding an event (or destroying it) is not done atomically. This opens a race where a thread is about to destroy a mark while another thread still finds the same mark and adds an event to its mask although it will be destroyed. Another race exists concerning the excess of a groups number of marks limit: When a mark is added the number of group marks is checked against the max number of marks per group and increased afterwards. Since check and increment is also not done atomically, this may result in 2 or more processes passing the check at the same time and increasing the number of group marks above the allowed limit. With this patch both races are avoided by doing the concerning operations with the groups mark mutex locked. 
Signed-off-by: Lino Sanfilippo Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify_user.c | 49 ++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index e16076d386c4..4e1d8ec77b04 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -524,14 +524,18 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, __u32 removed; int destroy_mark; + mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); - if (!fsn_mark) + if (!fsn_mark) { + mutex_unlock(&group->mark_mutex); return -ENOENT; + } removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); if (destroy_mark) - fsnotify_destroy_mark(fsn_mark, group); + fsnotify_destroy_mark_locked(fsn_mark, group); + mutex_unlock(&group->mark_mutex); fsnotify_put_mark(fsn_mark); if (removed & real_mount(mnt)->mnt_fsnotify_mask) @@ -548,14 +552,19 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, __u32 removed; int destroy_mark; + mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_inode_mark(group, inode); - if (!fsn_mark) + if (!fsn_mark) { + mutex_unlock(&group->mark_mutex); return -ENOENT; + } removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); if (destroy_mark) - fsnotify_destroy_mark(fsn_mark, group); + fsnotify_destroy_mark_locked(fsn_mark, group); + mutex_unlock(&group->mark_mutex); + /* matches the fsnotify_find_inode_mark() */ fsnotify_put_mark(fsn_mark); if (removed & inode->i_fsnotify_mask) @@ -599,21 +608,29 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, __u32 added; int ret = 0; + mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); if (!fsn_mark) { - if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) + if 
(atomic_read(&group->num_marks) > group->fanotify_data.max_marks) { + mutex_unlock(&group->mark_mutex); return -ENOSPC; + } fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); - if (!fsn_mark) + if (!fsn_mark) { + mutex_unlock(&group->mark_mutex); return -ENOMEM; + } fsnotify_init_mark(fsn_mark, fanotify_free_mark); - ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0); - if (ret) + ret = fsnotify_add_mark_locked(fsn_mark, group, NULL, mnt, 0); + if (ret) { + mutex_unlock(&group->mark_mutex); goto err; + } } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); + mutex_unlock(&group->mark_mutex); if (added & ~real_mount(mnt)->mnt_fsnotify_mask) fsnotify_recalc_vfsmount_mask(mnt); @@ -642,21 +659,29 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, (atomic_read(&inode->i_writecount) > 0)) return 0; + mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_inode_mark(group, inode); if (!fsn_mark) { - if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) + if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) { + mutex_unlock(&group->mark_mutex); return -ENOSPC; + } fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); - if (!fsn_mark) + if (!fsn_mark) { + mutex_unlock(&group->mark_mutex); return -ENOMEM; + } fsnotify_init_mark(fsn_mark, fanotify_free_mark); - ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0); - if (ret) + ret = fsnotify_add_mark_locked(fsn_mark, group, inode, NULL, 0); + if (ret) { + mutex_unlock(&group->mark_mutex); goto err; + } } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); + mutex_unlock(&group->mark_mutex); if (added & ~inode->i_fsnotify_mask) fsnotify_recalc_inode_mask(inode); From 5e9c070ca085439fbec9e9629dd6171ae325d4d8 Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Mon, 8 Jul 2013 15:59:43 -0700 Subject: [PATCH 008/118] fanotify: put duplicate code for adding vfsmount/inode marks into an own function The code under the groups mark_mutex in 
fanotify_add_inode_mark() and fanotify_add_vfsmount_mark() is almost identical. So put it into a separate function. Signed-off-by: Lino Sanfilippo Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify_user.c | 71 +++++++++++++++--------------- 1 file changed, 35 insertions(+), 36 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 4e1d8ec77b04..e44cb6427df3 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -600,33 +600,45 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, return mask & ~oldmask; } +static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, + struct inode *inode, + struct vfsmount *mnt) +{ + struct fsnotify_mark *mark; + int ret; + + if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) + return ERR_PTR(-ENOSPC); + + mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); + if (!mark) + return ERR_PTR(-ENOMEM); + + fsnotify_init_mark(mark, fanotify_free_mark); + ret = fsnotify_add_mark_locked(mark, group, inode, mnt, 0); + if (ret) { + fsnotify_put_mark(mark); + return ERR_PTR(ret); + } + + return mark; +} + + static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt, __u32 mask, unsigned int flags) { struct fsnotify_mark *fsn_mark; __u32 added; - int ret = 0; mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); if (!fsn_mark) { - if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) { + fsn_mark = fanotify_add_new_mark(group, NULL, mnt); + if (IS_ERR(fsn_mark)) { mutex_unlock(&group->mark_mutex); - return -ENOSPC; - } - - fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); - if (!fsn_mark) { - mutex_unlock(&group->mark_mutex); - return -ENOMEM; - } - - fsnotify_init_mark(fsn_mark, fanotify_free_mark); - ret = fsnotify_add_mark_locked(fsn_mark,
group, NULL, mnt, 0); - if (ret) { - mutex_unlock(&group->mark_mutex); - goto err; + return PTR_ERR(fsn_mark); } } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); @@ -634,9 +646,9 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, if (added & ~real_mount(mnt)->mnt_fsnotify_mask) fsnotify_recalc_vfsmount_mask(mnt); -err: + fsnotify_put_mark(fsn_mark); - return ret; + return 0; } static int fanotify_add_inode_mark(struct fsnotify_group *group, @@ -645,7 +657,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, { struct fsnotify_mark *fsn_mark; __u32 added; - int ret = 0; pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); @@ -662,22 +673,10 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_inode_mark(group, inode); if (!fsn_mark) { - if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) { + fsn_mark = fanotify_add_new_mark(group, inode, NULL); + if (IS_ERR(fsn_mark)) { mutex_unlock(&group->mark_mutex); - return -ENOSPC; - } - - fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); - if (!fsn_mark) { - mutex_unlock(&group->mark_mutex); - return -ENOMEM; - } - - fsnotify_init_mark(fsn_mark, fanotify_free_mark); - ret = fsnotify_add_mark_locked(fsn_mark, group, inode, NULL, 0); - if (ret) { - mutex_unlock(&group->mark_mutex); - goto err; + return PTR_ERR(fsn_mark); } } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); @@ -685,9 +684,9 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, if (added & ~inode->i_fsnotify_mask) fsnotify_recalc_inode_mask(inode); -err: + fsnotify_put_mark(fsn_mark); - return ret; + return 0; } /* fanotify syscalls */ From 52f85729805b7a0ec5a7a70e2c814193929de2f0 Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Mon, 8 Jul 2013 15:59:44 -0700 Subject: [PATCH 009/118] dnotify: replace dnotify_mark_mutex with mark mutex of dnotify_group There is no need to use a 
special mutex to protect against the fcntl/close race (see dnotify.c for a description of this race). Instead the dnotify_groups mark mutex can be used. Signed-off-by: Lino Sanfilippo Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/dnotify/dnotify.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 2bfe6dc413a0..1fedd5f7ccc4 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -31,7 +31,6 @@ int dir_notify_enable __read_mostly = 1; static struct kmem_cache *dnotify_struct_cache __read_mostly; static struct kmem_cache *dnotify_mark_cache __read_mostly; static struct fsnotify_group *dnotify_group __read_mostly; -static DEFINE_MUTEX(dnotify_mark_mutex); /* * dnotify will attach one of these to each inode (i_fsnotify_marks) which @@ -183,7 +182,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) return; dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); - mutex_lock(&dnotify_mark_mutex); + mutex_lock(&dnotify_group->mark_mutex); spin_lock(&fsn_mark->lock); prev = &dn_mark->dn; @@ -199,11 +198,12 @@ void dnotify_flush(struct file *filp, fl_owner_t id) spin_unlock(&fsn_mark->lock); - /* nothing else could have found us thanks to the dnotify_mark_mutex */ + /* nothing else could have found us thanks to the dnotify_groups + mark_mutex */ if (dn_mark->dn == NULL) - fsnotify_destroy_mark(fsn_mark, dnotify_group); + fsnotify_destroy_mark_locked(fsn_mark, dnotify_group); - mutex_unlock(&dnotify_mark_mutex); + mutex_unlock(&dnotify_group->mark_mutex); fsnotify_put_mark(fsn_mark); } @@ -326,7 +326,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) new_dn_mark->dn = NULL; /* this is needed to prevent the fcntl/close race described below */ - mutex_lock(&dnotify_mark_mutex); + mutex_lock(&dnotify_group->mark_mutex); /* add the new_fsn_mark or find an old one. 
*/ fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode); @@ -334,7 +334,8 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); spin_lock(&fsn_mark->lock); } else { - fsnotify_add_mark(new_fsn_mark, dnotify_group, inode, NULL, 0); + fsnotify_add_mark_locked(new_fsn_mark, dnotify_group, inode, + NULL, 0); spin_lock(&new_fsn_mark->lock); fsn_mark = new_fsn_mark; dn_mark = new_dn_mark; @@ -348,9 +349,9 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) /* if (f != filp) means that we lost a race and another task/thread * actually closed the fd we are still playing with before we grabbed - * the dnotify_mark_mutex and fsn_mark->lock. Since closing the fd is the - * only time we clean up the marks we need to get our mark off - * the list. */ + * the dnotify_groups mark_mutex and fsn_mark->lock. Since closing the + * fd is the only time we clean up the marks we need to get our mark + * off the list. */ if (f != filp) { /* if we added ourselves, shoot ourselves, it's possible that * the flush actually did shoot this fsn_mark. That's fine too @@ -385,9 +386,9 @@ out: spin_unlock(&fsn_mark->lock); if (destroy) - fsnotify_destroy_mark(fsn_mark, dnotify_group); + fsnotify_destroy_mark_locked(fsn_mark, dnotify_group); - mutex_unlock(&dnotify_mark_mutex); + mutex_unlock(&dnotify_group->mark_mutex); fsnotify_put_mark(fsn_mark); out_err: if (new_fsn_mark) From e1e5a9f84e4dbd3567bb8b0d5e79db6e1e5ebc35 Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Mon, 8 Jul 2013 15:59:45 -0700 Subject: [PATCH 010/118] inotify: fix race when adding a new watch In inotify_new_watch() the number of watches for a group is compared against the max number of allowed watches and increased afterwards. The check and incrementation is not done atomically, so it is possible for multiple concurrent threads to pass the check and increment the number of marks above the allowed max. 
This patch uses the inotify group's mark_mutex to ensure that both check and incrementation are done atomically. Furthermore, we don't have to worry about the race that allows a concurrent thread to add a watch just after inotify_update_existing_watch() returned with -ENOENT anymore, since this is also synchronized by the groups mark mutex now. Signed-off-by: Lino Sanfilippo Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/inotify/inotify_user.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 959815c1e017..60f954a891ab 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -636,7 +636,8 @@ static int inotify_new_watch(struct fsnotify_group *group, goto out_err; /* we are on the idr, now get on the inode */ - ret = fsnotify_add_mark(&tmp_i_mark->fsn_mark, group, inode, NULL, 0); + ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode, + NULL, 0); if (ret) { /* we failed to get on the inode, get off the idr */ inotify_remove_from_idr(group, tmp_i_mark); @@ -660,19 +661,13 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod { int ret = 0; -retry: + mutex_lock(&group->mark_mutex); /* try to update and existing watch with the new arg */ ret = inotify_update_existing_watch(group, inode, arg); /* no mark present, try to add a new one */ if (ret == -ENOENT) ret = inotify_new_watch(group, inode, arg); - /* - * inotify_new_watch could race with another thread which did an - * inotify_new_watch between the update_existing and the add watch - * here, go back and try to update an existing mark again.
- */ - if (ret == -EEXIST) - goto retry; + mutex_unlock(&group->mark_mutex); return ret; } From 9756b9187eebb093b9f6a154ecceb67648e53391 Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Mon, 8 Jul 2013 15:59:46 -0700 Subject: [PATCH 011/118] fsnotify: update comments concerning locking scheme There have been changes in the locking scheme of fsnotify but the comments in the source code have not been updated yet. This patch corrects this. Signed-off-by: Lino Sanfilippo Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/mark.c | 50 +++++++++++++++++++++--------------------------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/fs/notify/mark.c b/fs/notify/mark.c index fc6b49bf7360..923fe4a5f503 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -20,28 +20,29 @@ * fsnotify inode mark locking/lifetime/and refcnting * * REFCNT: - * The mark->refcnt tells how many "things" in the kernel currently are - * referencing this object. The object typically will live inside the kernel - * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task - * which can find this object holding the appropriete locks, can take a reference - * and the object itself is guaranteed to survive until the reference is dropped. + * The group->recnt and mark->refcnt tell how many "things" in the kernel + * currently are referencing the objects. Both kind of objects typically will + * live inside the kernel with a refcnt of 2, one for its creation and one for + * the reference a group and a mark hold to each other. + * If you are holding the appropriate locks, you can take a reference and the + * object itself is guaranteed to survive until the reference is dropped. 
* * LOCKING: - * There are 3 spinlocks involved with fsnotify inode marks and they MUST - * be taken in order as follows: + * There are 3 locks involved with fsnotify inode marks and they MUST be taken + * in order as follows: * + * group->mark_mutex * mark->lock - * group->mark_lock * inode->i_lock * - * mark->lock protects 2 things, mark->group and mark->inode. You must hold - * that lock to dereference either of these things (they could be NULL even with - * the lock) - * - * group->mark_lock protects the marks_list anchored inside a given group - * and each mark is hooked via the g_list. It also sorta protects the - * free_g_list, which when used is anchored by a private list on the stack of the - * task which held the group->mark_lock. + * group->mark_mutex protects the marks_list anchored inside a given group and + * each mark is hooked via the g_list. It also protects the groups private + * data (i.e group limits). + + * mark->lock protects the marks attributes like its masks and flags. + * Furthermore it protects the access to a reference of the group that the mark + * is assigned to as well as the access to a reference of the inode/vfsmount + * that is being watched by the mark. * * inode->i_lock protects the i_fsnotify_marks list anchored inside a * given inode and each mark is hooked via the i_list. (and sorta the @@ -64,18 +65,11 @@ * inode. We take i_lock and walk the i_fsnotify_marks safely. For each * mark on the list we take a reference (so the mark can't disappear under us). * We remove that mark form the inode's list of marks and we add this mark to a - * private list anchored on the stack using i_free_list; At this point we no - * longer fear anything finding the mark using the inode's list of marks. - * - * We can safely and locklessly run the private list on the stack of everything - * we just unattached from the original inode. For each mark on the private list - * we grab the mark-> and can thus dereference mark->group and mark->inode. 
If - * we see the group and inode are not NULL we take those locks. Now holding all - * 3 locks we can completely remove the mark from other tasks finding it in the - * future. Remember, 10 things might already be referencing this mark, but they - * better be holding a ref. We drop our reference we took before we unhooked it - * from the inode. When the ref hits 0 we can free the mark. - * + * private list anchored on the stack using i_free_list; we walk i_free_list + * and before we destroy the mark we make sure that we dont race with a + * concurrent destroy_group by getting a ref to the marks group and taking the + * groups mutex. + * Very similarly for freeing by group, except we use free_g_list. * * This has the very interesting property of being able to run concurrently with From 34e3a58c66aafd90cc16c061569fbefc3ff451e9 Mon Sep 17 00:00:00 2001 From: Libo Chen Date: Mon, 8 Jul 2013 15:59:47 -0700 Subject: [PATCH 012/118] drivers/iommu/msm_iommu_dev.c: fix leak and clean up error paths Fix two obvious problems: 1. We have registered msm_iommu_driver first, and need unregister it when registered msm_iommu_ctx_driver fail 2. We don't need to kfree drvdata before kzalloc was successful. 
[akpm@linux-foundation.org: remove now-unneeded initialization of ctx_drvdata, remove unneeded braces] Signed-off-by: Libo Chen Acked-by: David Brown Cc: David Woodhouse Cc: James Hogan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/iommu/msm_iommu_dev.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/msm_iommu_dev.c b/drivers/iommu/msm_iommu_dev.c index 9144a6beed92..6ba351477132 100644 --- a/drivers/iommu/msm_iommu_dev.c +++ b/drivers/iommu/msm_iommu_dev.c @@ -291,25 +291,20 @@ static int msm_iommu_ctx_probe(struct platform_device *pdev) { struct msm_iommu_ctx_dev *c = pdev->dev.platform_data; struct msm_iommu_drvdata *drvdata; - struct msm_iommu_ctx_drvdata *ctx_drvdata = NULL; + struct msm_iommu_ctx_drvdata *ctx_drvdata; int i, ret; - if (!c || !pdev->dev.parent) { - ret = -EINVAL; - goto fail; - } + + if (!c || !pdev->dev.parent) + return -EINVAL; drvdata = dev_get_drvdata(pdev->dev.parent); - - if (!drvdata) { - ret = -ENODEV; - goto fail; - } + if (!drvdata) + return -ENODEV; ctx_drvdata = kzalloc(sizeof(*ctx_drvdata), GFP_KERNEL); - if (!ctx_drvdata) { - ret = -ENOMEM; - goto fail; - } + if (!ctx_drvdata) + return -ENOMEM; + ctx_drvdata->num = c->num; ctx_drvdata->pdev = pdev; @@ -403,6 +398,7 @@ static int __init msm_iommu_driver_init(void) ret = platform_driver_register(&msm_iommu_ctx_driver); if (ret != 0) { + platform_driver_unregister(&msm_iommu_driver); pr_err("Failed to register IOMMU context driver\n"); goto error; } From 9a2458a633d4b3c9e0eae506da40cf44dc075314 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 8 Jul 2013 15:59:48 -0700 Subject: [PATCH 013/118] mm: mremap: validate input before taking lock This patch is very similar to commit 84d96d897671 ("mm: madvise: complete input validation before taking lock"): perform some basic validation of the input to mremap() before taking the &current->mm->mmap_sem lock.
This also makes the MREMAP_FIXED => MREMAP_MAYMOVE dependency slightly more explicit. Signed-off-by: Rasmus Villemoes Cc: KOSAKI Motohiro Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index 3708655378e9..457d34ef3bf2 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -456,13 +456,14 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long charged = 0; bool locked = false; - down_write(&current->mm->mmap_sem); - if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) - goto out; + return ret; + + if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE)) + return ret; if (addr & ~PAGE_MASK) - goto out; + return ret; old_len = PAGE_ALIGN(old_len); new_len = PAGE_ALIGN(new_len); @@ -473,12 +474,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, * a zero new-len is nonsensical. */ if (!new_len) - goto out; + return ret; + + down_write(&current->mm->mmap_sem); if (flags & MREMAP_FIXED) { - if (flags & MREMAP_MAYMOVE) - ret = mremap_to(addr, old_len, new_addr, new_len, - &locked); + ret = mremap_to(addr, old_len, new_addr, new_len, + &locked); goto out; } From 54f72fe022d9b2c4de40043a118881121190a117 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 8 Jul 2013 15:59:49 -0700 Subject: [PATCH 014/118] memcg: clean up memcg->nodeinfo Remove struct mem_cgroup_lru_info and fold its single member, the variably sized nodeinfo[0], directly into struct mem_cgroup. This should make it more obvious why it has to be the last member there. Also move the comment that's above that special last member below it, so it is more visible to somebody that considers appending to the struct mem_cgroup.
Signed-off-by: Johannes Weiner Cc: David Rientjes Acked-by: Michal Hocko Cc: Glauber Costa Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2e851f453814..2b7cd24d4cda 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -187,10 +187,6 @@ struct mem_cgroup_per_node { struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; }; -struct mem_cgroup_lru_info { - struct mem_cgroup_per_node *nodeinfo[0]; -}; - /* * Cgroups above their limits are maintained in a RB-Tree, independent of * their hierarchy representation @@ -366,14 +362,8 @@ struct mem_cgroup { atomic_t numainfo_updating; #endif - /* - * Per cgroup active and inactive list, similar to the - * per zone LRU lists. - * - * WARNING: This has to be the last element of the struct. Don't - * add new fields after this point. - */ - struct mem_cgroup_lru_info info; + struct mem_cgroup_per_node *nodeinfo[0]; + /* WARNING: nodeinfo must be the last member here */ }; static size_t memcg_size(void) @@ -683,7 +673,7 @@ static struct mem_cgroup_per_zone * mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) { VM_BUG_ON((unsigned)nid >= nr_node_ids); - return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; + return &memcg->nodeinfo[nid]->zoneinfo[zid]; } struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) @@ -6087,13 +6077,13 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) mz->on_tree = false; mz->memcg = memcg; } - memcg->info.nodeinfo[node] = pn; + memcg->nodeinfo[node] = pn; return 0; } static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) { - kfree(memcg->info.nodeinfo[node]); + kfree(memcg->nodeinfo[node]); } static struct mem_cgroup *mem_cgroup_alloc(void) From 609838cfed972d49a65aac7923a9ff5cbe482e30 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 8 Jul 2013 
15:59:50 -0700 Subject: [PATCH 015/118] mm: invoke oom-killer from remaining unconverted page fault handlers A few remaining architectures directly kill the page faulting task in an out of memory situation. This is usually not a good idea since that task might not even use a significant amount of memory and so may not be the optimal victim to resolve the situation. Since 2.6.29's 1c0fe6e ("mm: invoke oom-killer from page fault") there is a hook that architecture page fault handlers are supposed to call to invoke the OOM killer and let it pick the right task to kill. Convert the remaining architectures over to this hook. To have the previous behavior of simply taking out the faulting task the vm.oom_kill_allocating_task sysctl can be set to 1. Signed-off-by: Johannes Weiner Reviewed-by: Michal Hocko Cc: KAMEZAWA Hiroyuki Acked-by: David Rientjes Acked-by: Vineet Gupta [arch/arc bits] Cc: James Hogan Cc: David Howells Cc: Jonas Bonn Cc: Chen Liqin Cc: Lennox Wu Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arc/mm/fault.c | 6 ++++-- arch/metag/mm/fault.c | 6 ++++-- arch/mn10300/mm/fault.c | 7 ++++--- arch/openrisc/mm/fault.c | 8 ++++---- arch/score/mm/fault.c | 8 ++++---- arch/tile/mm/fault.c | 8 ++++---- 6 files changed, 24 insertions(+), 19 deletions(-) diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index 318164cabdfc..0fd1f0d515ff 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -207,8 +207,10 @@ out_of_memory: } up_read(&mm->mmap_sem); - if (user_mode(regs)) - do_group_exit(SIGKILL); /* This will never return */ + if (user_mode(regs)) { + pagefault_out_of_memory(); + return; + } goto no_context; diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c index 2c75bf7357c5..8fddf46e6c62 100644 --- a/arch/metag/mm/fault.c +++ b/arch/metag/mm/fault.c @@ -224,8 +224,10 @@ do_sigbus: */ out_of_memory: up_read(&mm->mmap_sem); - if (user_mode(regs)) - do_group_exit(SIGKILL); + if (user_mode(regs)) { + 
pagefault_out_of_memory(); + return 1; + } no_context: /* Are we prepared to handle this kernel fault? */ diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index d48a84fd7fae..8a2e6ded9a44 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c @@ -345,9 +345,10 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - printk(KERN_ALERT "VM: killing process %s\n", tsk->comm); - if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR) - do_exit(SIGKILL); + if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR) { + pagefault_out_of_memory(); + return; + } goto no_context; do_sigbus: diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index e2bfafce66c5..4a41f8493ab0 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -267,10 +267,10 @@ out_of_memory: __asm__ __volatile__("l.nop 1"); up_read(&mm->mmap_sem); - printk("VM: killing process %s\n", tsk->comm); - if (user_mode(regs)) - do_exit(SIGKILL); - goto no_context; + if (!user_mode(regs)) + goto no_context; + pagefault_out_of_memory(); + return; do_sigbus: up_read(&mm->mmap_sem); diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c index 47b600e4b2c5..6b18fb0189ae 100644 --- a/arch/score/mm/fault.c +++ b/arch/score/mm/fault.c @@ -172,10 +172,10 @@ out_of_memory: down_read(&mm->mmap_sem); goto survive; } - printk("VM: killing process %s\n", tsk->comm); - if (user_mode(regs)) - do_group_exit(SIGKILL); - goto no_context; + if (!user_mode(regs)) + goto no_context; + pagefault_out_of_memory(); + return; do_sigbus: up_read(&mm->mmap_sem); diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index 3d2b81c163a6..f7f99f90cbe0 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -573,10 +573,10 @@ out_of_memory: down_read(&mm->mmap_sem); goto survive; } - pr_alert("VM: killing process %s\n", tsk->comm); - if (!is_kernel_mode) - do_group_exit(SIGKILL); - goto no_context; + if (is_kernel_mode) + goto no_context; + 
pagefault_out_of_memory(); + return 0; do_sigbus: up_read(&mm->mmap_sem); From 7960aedde8cfa72e4caf488806ea7ea7d2fa8dba Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 15:59:52 -0700 Subject: [PATCH 016/118] mm: remove duplicated call of get_pfn_range_for_nid When calculating pages in a node, for each zone in that node, we will have zone_spanned_pages_in_node --> get_pfn_range_for_nid zone_absent_pages_in_node --> get_pfn_range_for_nid That is to say, we call the get_pfn_range_for_nid to get start_pfn and end_pfn of the node for MAX_NR_ZONES * 2 times. And this is totally unnecessary if we call the get_pfn_range_for_nid before zone_*_pages_in_node add two extra arguments node_start_pfn and node_end_pfn for zone_*_pages_in_node, then we can remove the get_pfn_range_in_node in zone_*_pages_in_node. [akpm@linux-foundation.org: make definitions more readable] Signed-off-by: Zhang Yanfei Cc: Michal Hocko Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 327516b7aee9..7d5e40fe0c29 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4421,13 +4421,13 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, */ static unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *ignored) { - unsigned long node_start_pfn, node_end_pfn; unsigned long zone_start_pfn, zone_end_pfn; - /* Get the start and end of the node and zone */ - get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); + /* Get the start and end of the zone */ zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; adjust_zone_range_for_zone_movable(nid, zone_type, @@ -4482,14 +4482,14 @@ unsigned long __init 
absent_pages_in_range(unsigned long start_pfn, /* Return the number of page frames in holes in a zone on a node */ static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *ignored) { unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; - unsigned long node_start_pfn, node_end_pfn; unsigned long zone_start_pfn, zone_end_pfn; - get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); @@ -4502,6 +4502,8 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *zones_size) { return zones_size[zone_type]; @@ -4509,6 +4511,8 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, static inline unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *zholes_size) { if (!zholes_size) @@ -4520,21 +4524,27 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid, #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, - unsigned long *zones_size, unsigned long *zholes_size) + unsigned long node_start_pfn, + unsigned long node_end_pfn, + unsigned long *zones_size, + unsigned long *zholes_size) { unsigned long realtotalpages, totalpages = 0; enum zone_type i; for (i = 0; i < MAX_NR_ZONES; i++) totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, - zones_size); + node_start_pfn, + node_end_pfn, + zones_size); pgdat->node_spanned_pages = 
totalpages; realtotalpages = totalpages; for (i = 0; i < MAX_NR_ZONES; i++) realtotalpages -= zone_absent_pages_in_node(pgdat->node_id, i, - zholes_size); + node_start_pfn, node_end_pfn, + zholes_size); pgdat->node_present_pages = realtotalpages; printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); @@ -4643,6 +4653,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, * NOTE: pgdat should get zeroed by caller. */ static void __paginginit free_area_init_core(struct pglist_data *pgdat, + unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zones_size, unsigned long *zholes_size) { enum zone_type j; @@ -4664,8 +4675,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; - size = zone_spanned_pages_in_node(nid, j, zones_size); + size = zone_spanned_pages_in_node(nid, j, node_start_pfn, + node_end_pfn, zones_size); realsize = freesize = size - zone_absent_pages_in_node(nid, j, + node_start_pfn, + node_end_pfn, zholes_size); /* @@ -4779,6 +4793,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) { pg_data_t *pgdat = NODE_DATA(nid); + unsigned long start_pfn = 0; + unsigned long end_pfn = 0; /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); @@ -4786,7 +4802,11 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; init_zone_allows_reclaim(nid); - calculate_node_totalpages(pgdat, zones_size, zholes_size); +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); +#endif + calculate_node_totalpages(pgdat, start_pfn, end_pfn, + zones_size, zholes_size); alloc_node_mem_map(pgdat); #ifdef CONFIG_FLAT_NODE_MEM_MAP @@ -4795,7 
+4815,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, (unsigned long)pgdat->node_mem_map); #endif - free_area_init_core(pgdat, zones_size, zholes_size); + free_area_init_core(pgdat, start_pfn, end_pfn, + zones_size, zholes_size); } #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP From ab15d9b4cbc2b6497023f554a152c2573ca53671 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 8 Jul 2013 15:59:53 -0700 Subject: [PATCH 017/118] mm/vmalloc.c: unbreak __vunmap() There is an extra semi-colon so the function always returns. Signed-off-by: Dan Carpenter Acked-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 91a10472a39a..96b77a982545 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1453,7 +1453,7 @@ static void __vunmap(const void *addr, int deallocate_pages) return; if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", - addr)); + addr)) return; area = remove_vm_area(addr); From 3fcd76e8028e0be37b02a2002b4f56755daeda06 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 15:59:54 -0700 Subject: [PATCH 018/118] mm/vmalloc.c: remove dead code in vb_alloc Space in a vmap block that was once allocated is considered dirty and not made available for allocation again before the whole block is recycled. The result is that free space within a vmap block is always contiguous. So if a vmap block has enough free space for allocation, the allocation is impossible to fail. Thus, the fragmented block purging was never invoked from vb_alloc(). So remove this dead code. 
[ Same patches also sent by: Chanho Min Johannes Weiner but git doesn't do "multiple authors" ] Signed-off-by: Zhang Yanfei Cc: Johannes Weiner Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 96b77a982545..a35f4f5bb908 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -910,7 +910,6 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) struct vmap_block *vb; unsigned long addr = 0; unsigned int order; - int purge = 0; BUG_ON(size & ~PAGE_MASK); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); @@ -934,17 +933,7 @@ again: if (vb->free < 1UL << order) goto next; - i = bitmap_find_free_region(vb->alloc_map, - VMAP_BBMAP_BITS, order); - - if (i < 0) { - if (vb->free + vb->dirty == VMAP_BBMAP_BITS) { - /* fragmented and no outstanding allocations */ - BUG_ON(vb->dirty != VMAP_BBMAP_BITS); - purge = 1; - } - goto next; - } + i = VMAP_BBMAP_BITS - vb->free; addr = vb->va->va_start + (i << PAGE_SHIFT); BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(vb->va->va_start)); @@ -960,9 +949,6 @@ next: spin_unlock(&vb->lock); } - if (purge) - purge_fragmented_blocks_thiscpu(); - put_cpu_var(vmap_block_queue); rcu_read_unlock(); From 9da3f59fbdb57c9447ddb42681f6ab98faef353a Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 15:59:55 -0700 Subject: [PATCH 019/118] mm/vmalloc.c: remove unused purge_fragmented_blocks_thiscpu This function is nowhere used now, so remove it. 
Signed-off-by: Zhang Yanfei Cc: Johannes Weiner Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a35f4f5bb908..99d045a0a0eb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -891,11 +891,6 @@ static void purge_fragmented_blocks(int cpu) } } -static void purge_fragmented_blocks_thiscpu(void) -{ - purge_fragmented_blocks(smp_processor_id()); -} - static void purge_fragmented_blocks_allcpus(void) { int cpu; From b8e748b6c32999f221ea4786557b8e7e6c4e4e7a Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 15:59:56 -0700 Subject: [PATCH 020/118] mm/vmalloc.c: remove alloc_map from vmap_block As we have removed the dead code in the vb_alloc, it seems there is no place to use the alloc_map. So there is no reason to maintain the alloc_map in vmap_block. Signed-off-by: Zhang Yanfei Cc: Johannes Weiner Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 99d045a0a0eb..7ac2a1f8358a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -754,7 +754,6 @@ struct vmap_block { struct vmap_area *va; struct vmap_block_queue *vbq; unsigned long free, dirty; - DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); struct list_head free_list; struct rcu_head rcu_head; @@ -820,7 +819,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) vb->va = va; vb->free = VMAP_BBMAP_BITS; vb->dirty = 0; - bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS); bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); INIT_LIST_HEAD(&vb->free_list); @@ -873,7 +871,6 @@ static void purge_fragmented_blocks(int cpu) if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { vb->free = 0; /* prevent further allocs after releasing lock */ vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ - 
bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS); bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); From 46c001a2753f47ffa621131baa3409e636515347 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 15:59:57 -0700 Subject: [PATCH 021/118] mm/vmalloc.c: emit the failure message before return Use goto to jump to the fail label to give a failure message before returning NULL. This makes the failure handling in this function consistent. Signed-off-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7ac2a1f8358a..d81b9f70d92f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1642,7 +1642,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); if (!addr) - return NULL; + goto fail; /* * In this function, newly allocated vm_struct has VM_UNLIST flag. From 20fc02b477c526c6a85f84e3770373778ff2f97e Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 15:59:58 -0700 Subject: [PATCH 022/118] mm/vmalloc.c: rename VM_UNLIST to VM_UNINITIALIZED VM_UNLIST was used to indicate that the vm_struct is not listed in vmlist. But after commit 4341fa454796 ("mm, vmalloc: remove list management of vmlist after initializing vmalloc"), the meaning of this flag changed. It now means the vm_struct is not fully initialized. So renaming it to VM_UNINITIALIZED seems more reasonable. Also change clear_vm_unlist to clear_vm_uninitialized_flag. 
Signed-off-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 12 ++++++------ mm/vmalloc.c | 18 +++++++++--------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index dd0a2c810529..4b8a89189a29 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -10,12 +10,12 @@ struct vm_area_struct; /* vma defining user mapping in mm_types.h */ /* bits in flags of vmalloc's vm_struct below */ -#define VM_IOREMAP 0x00000001 /* ioremap() and friends */ -#define VM_ALLOC 0x00000002 /* vmalloc() */ -#define VM_MAP 0x00000004 /* vmap()ed pages */ -#define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */ -#define VM_VPAGES 0x00000010 /* buffer for pages was vmalloc'ed */ -#define VM_UNLIST 0x00000020 /* vm_struct is not listed in vmlist */ +#define VM_IOREMAP 0x00000001 /* ioremap() and friends */ +#define VM_ALLOC 0x00000002 /* vmalloc() */ +#define VM_MAP 0x00000004 /* vmap()ed pages */ +#define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */ +#define VM_VPAGES 0x00000010 /* buffer for pages was vmalloc'ed */ +#define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ /* bits [20..32] reserved for arch specific ioremap internals */ /* diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d81b9f70d92f..af40068271c4 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1289,15 +1289,15 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, spin_unlock(&vmap_area_lock); } -static void clear_vm_unlist(struct vm_struct *vm) +static void clear_vm_uninitialized_flag(struct vm_struct *vm) { /* - * Before removing VM_UNLIST, + * Before removing VM_UNINITIALIZED, * we should make sure that vm has proper values. * Pair with smp_rmb() in show_numa_info(). 
*/ smp_wmb(); - vm->flags &= ~VM_UNLIST; + vm->flags &= ~VM_UNINITIALIZED; } static struct vm_struct *__get_vm_area_node(unsigned long size, @@ -1635,7 +1635,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!size || (size >> PAGE_SHIFT) > totalram_pages) goto fail; - area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST, + area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED, start, end, node, gfp_mask, caller); if (!area) goto fail; @@ -1645,11 +1645,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, goto fail; /* - * In this function, newly allocated vm_struct has VM_UNLIST flag. - * It means that vm_struct is not fully initialized. + * In this function, newly allocated vm_struct has VM_UNINITIALIZED + * flag. It means that vm_struct is not fully initialized. * Now, it is fully initialized, so remove this flag here. */ - clear_vm_unlist(area); + clear_vm_uninitialized_flag(area); /* * A ref_count = 3 is needed because the vm_struct and vmap_area @@ -2569,9 +2569,9 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) if (!counters) return; - /* Pair with smp_wmb() in clear_vm_unlist() */ + /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ smp_rmb(); - if (v->flags & VM_UNLIST) + if (v->flags & VM_UNINITIALIZED) return; memset(counters, 0, nr_node_ids * sizeof(unsigned int)); From d157a55815ffff48caec311dfb543ce8a79e283e Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 15:59:59 -0700 Subject: [PATCH 023/118] mm/vmalloc.c: check VM_UNINITIALIZED flag in s_show instead of show_numa_info We should check the VM_UNINITIALIZED flag in s_show(). If this flag is set, that said, the vm_struct is not fully initialized. So it is unnecessary to try to show the information contained in vm_struct. We checked this flag in show_numa_info(), but I think it's better to check it earlier.
Signed-off-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index af40068271c4..318c5007f226 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2569,11 +2569,6 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) if (!counters) return; - /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ - smp_rmb(); - if (v->flags & VM_UNINITIALIZED) - return; - memset(counters, 0, nr_node_ids * sizeof(unsigned int)); for (nr = 0; nr < v->nr_pages; nr++) @@ -2602,6 +2597,11 @@ static int s_show(struct seq_file *m, void *p) v = va->vm; + /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ + smp_rmb(); + if (v->flags & VM_UNINITIALIZED) + return 0; + seq_printf(m, "0x%pK-0x%pK %7ld", v->addr, v->addr + v->size, v->size); From 6d42c232bd1e77288b2660153299b7d12a5c8e15 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 8 Jul 2013 16:00:00 -0700 Subject: [PATCH 024/118] memcg: also test for skip accounting at the page allocation level The memory we used to hold the memcg arrays is currently accounted to the current memcg. But that creates a problem, because that memory can only be freed after the last user is gone. Our only way to know which is the last user, is to hook up to freeing time, but the fact that we still have some in flight kmallocs will prevent freeing to happen. I believe therefore to be just easier to account this memory as global overhead. This patch (of 2): Disabling accounting is only relevant for some specific memcg internal allocations. Therefore we would initially not have such check at memcg_kmem_newpage_charge, since direct calls to the page allocator that are marked with GFP_KMEMCG only happen outside memcg core. 
We are mostly concerned with cache allocations and by having this test at memcg_kmem_get_cache we are already able to relay the allocation to the root cache and bypass the memcg caches altogether. There is one exception, though: the SLUB allocator does not create large order caches, but rather service large kmallocs directly from the page allocator. Therefore, the following sequence, when backed by the SLUB allocator: memcg_stop_kmem_account(); kmalloc() memcg_resume_kmem_account(); would effectively ignore the fact that we should skip accounting, since it will drive us directly to this function without passing through the cache selector memcg_kmem_get_cache. Such large allocations are extremely rare but can happen, for instance, for the cache arrays. This was never a problem in practice, because we weren't skipping accounting for the cache arrays. All the allocations we were skipping were fairly small. However, the fact that we were not skipping those allocations are a problem and can prevent the memcgs from going away. As we fix that, we need to make sure that the fix will also work with the SLUB allocator. Signed-off-by: Glauber Costa Reported-by: Michal Hocko Cc: Johannes Weiner Cc: Kamezawa Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2b7cd24d4cda..06a595fd6400 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3637,6 +3637,34 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) int ret; *_memcg = NULL; + + /* + * Disabling accounting is only relevant for some specific memcg + * internal allocations. Therefore we would initially not have such + * check here, since direct calls to the page allocator that are marked + * with GFP_KMEMCG only happen outside memcg core. 
We are mostly + * concerned with cache allocations, and by having this test at + * memcg_kmem_get_cache, we are already able to relay the allocation to + * the root cache and bypass the memcg cache altogether. + * + * There is one exception, though: the SLUB allocator does not create + * large order caches, but rather service large kmallocs directly from + * the page allocator. Therefore, the following sequence when backed by + * the SLUB allocator: + * + * memcg_stop_kmem_account(); + * kmalloc() + * memcg_resume_kmem_account(); + * + * would effectively ignore the fact that we should skip accounting, + * since it will drive us directly to this function without passing + * through the cache selector memcg_kmem_get_cache. Such large + * allocations are extremely rare but can happen, for instance, for the + * cache arrays. We bring this test here. + */ + if (!current->mm || current->memcg_kmem_skip_account) + return true; + memcg = try_get_mem_cgroup_from_mm(current->mm); /* From 425c598d583883c33c75780225ba8e0794b43bd9 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 8 Jul 2013 16:00:01 -0700 Subject: [PATCH 025/118] memcg: do not account memory used for cache creation The memory we used to hold the memcg arrays is currently accounted to the current memcg. But that creates a problem, because that memory can only be freed after the last user is gone. Our only way to know which is the last user, is to hook up to freeing time, but the fact that we still have some in flight kmallocs will prevent freeing to happen. I believe therefore to be just easier to account this memory as global overhead. 
Signed-off-by: Glauber Costa Cc: Johannes Weiner Cc: Michal Hocko Cc: Kamezawa Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 06a595fd6400..64f726599ff0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5232,7 +5232,9 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) static_key_slow_inc(&memcg_kmem_enabled_key); mutex_lock(&set_limit_mutex); + memcg_stop_kmem_account(); ret = memcg_update_cache_sizes(memcg); + memcg_resume_kmem_account(); mutex_unlock(&set_limit_mutex); out: return ret; From 537926caedb335b198eb53930ebeeb6426a541f9 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 16:00:02 -0700 Subject: [PATCH 026/118] include/linux/gfp.h: fix the comment for GFP_ZONE_TABLE 0xc just means MOVABLE + DMA32, which results in zone DMA32. Signed-off-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 0f615eb23d05..9b4dd491f7e8 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -209,7 +209,7 @@ static inline int allocflags_to_migratetype(gfp_t gfp_flags) * 0x9 => DMA or NORMAL (MOVABLE+DMA) * 0xa => MOVABLE (Movable is valid only if HIGHMEM is set too) * 0xb => BAD (MOVABLE+HIGHMEM+DMA) - * 0xc => DMA32 (MOVABLE+HIGHMEM+DMA32) + * 0xc => DMA32 (MOVABLE+DMA32) * 0xd => BAD (MOVABLE+DMA32+DMA) * 0xe => BAD (MOVABLE+DMA32+HIGHMEM) * 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA) From 0cf31ec10e92253e2908cd830145a71043740d77 Mon Sep 17 00:00:00 2001 From: Seth Jennings Date: Mon, 8 Jul 2013 16:00:05 -0700 Subject: [PATCH 027/118] MAINTAINERS: add zswap and zbud maintainer Add maintainer information for zswap and zbud into the MAINTAINERS file. 
Signed-off-by: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 97762ad25300..70cf679d3904 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9263,6 +9263,13 @@ F: Documentation/networking/z8530drv.txt F: drivers/net/hamradio/*scc.c F: drivers/net/hamradio/z8530.h +ZBUD COMPRESSED PAGE ALLOCATOR +M: Seth Jennings +L: linux-mm@kvack.org +S: Maintained +F: mm/zbud.c +F: include/linux/zbud.h + ZD1211RW WIRELESS DRIVER M: Daniel Drake M: Ulrich Kunitz @@ -9285,6 +9292,12 @@ M: "Maciej W. Rozycki" S: Maintained F: drivers/tty/serial/zs.* +ZSWAP COMPRESSED SWAP CACHING +M: Seth Jennings +L: linux-mm@kvack.org +S: Maintained +F: mm/zswap.c + THE REST M: Linus Torvalds L: linux-kernel@vger.kernel.org From bc732f1d55cf41627ee4c64078812b2fa592b394 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 16:00:06 -0700 Subject: [PATCH 028/118] mm/page_alloc.c: remove zone_type argument of build_zonelists_node The callers of build_zonelists_node always pass MAX_NR_ZONES -1 as the zone_type argument, so we can directly use the value in build_zonelists_node and remove zone_type argument. Signed-off-by: Zhang Yanfei Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7d5e40fe0c29..27f9d4beac98 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3153,12 +3153,10 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) * Add all populated zones of a node to the zonelist. 
*/ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, - int nr_zones, enum zone_type zone_type) + int nr_zones) { struct zone *zone; - - BUG_ON(zone_type >= MAX_NR_ZONES); - zone_type++; + enum zone_type zone_type = MAX_NR_ZONES; do { zone_type--; @@ -3168,8 +3166,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, &zonelist->_zonerefs[nr_zones++]); check_highest_zone(zone_type); } - } while (zone_type); + return nr_zones; } @@ -3363,8 +3361,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) zonelist = &pgdat->node_zonelists[0]; for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) ; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, - MAX_NR_ZONES - 1); + j = build_zonelists_node(NODE_DATA(node), zonelist, j); zonelist->_zonerefs[j].zone = NULL; zonelist->_zonerefs[j].zone_idx = 0; } @@ -3378,7 +3375,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) struct zonelist *zonelist; zonelist = &pgdat->node_zonelists[1]; - j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); + j = build_zonelists_node(pgdat, zonelist, 0); zonelist->_zonerefs[j].zone = NULL; zonelist->_zonerefs[j].zone_idx = 0; } @@ -3586,7 +3583,7 @@ static void build_zonelists(pg_data_t *pgdat) local_node = pgdat->node_id; zonelist = &pgdat->node_zonelists[0]; - j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); + j = build_zonelists_node(pgdat, zonelist, 0); /* * Now we build the zonelist so that it contains the zones @@ -3599,14 +3596,12 @@ static void build_zonelists(pg_data_t *pgdat) for (node = local_node + 1; node < MAX_NUMNODES; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, - MAX_NR_ZONES - 1); + j = build_zonelists_node(NODE_DATA(node), zonelist, j); } for (node = 0; node < local_node; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, - MAX_NR_ZONES - 1); + j = 
build_zonelists_node(NODE_DATA(node), zonelist, j); } zonelist->_zonerefs[j].zone = NULL; From b21fbccd4b8aba805cbc231998ec7bf83616a79e Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 16:00:07 -0700 Subject: [PATCH 029/118] mm: remove unused functions is_{normal_idx, normal, dma32, dma} These functions are nowhere used, so remove them. Signed-off-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ae19af5ec02c..af4a3b77a8de 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -869,11 +869,6 @@ static inline int is_highmem_idx(enum zone_type idx) #endif } -static inline int is_normal_idx(enum zone_type idx) -{ - return (idx == ZONE_NORMAL); -} - /** * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references @@ -892,29 +887,6 @@ static inline int is_highmem(struct zone *zone) #endif } -static inline int is_normal(struct zone *zone) -{ - return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL; -} - -static inline int is_dma32(struct zone *zone) -{ -#ifdef CONFIG_ZONE_DMA32 - return zone == zone->zone_pgdat->node_zones + ZONE_DMA32; -#else - return 0; -#endif -} - -static inline int is_dma(struct zone *zone) -{ -#ifdef CONFIG_ZONE_DMA - return zone == zone->zone_pgdat->node_zones + ZONE_DMA; -#else - return 0; -#endif -} - /* These two functions are used to setup the per zone pages min values */ struct ctl_table; int min_free_kbytes_sysctl_handler(struct ctl_table *, int, From 345606d42971fc4ed164fbabac118708d51b8e0a Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 16:00:08 -0700 Subject: [PATCH 030/118] mm/page_alloc.c: remove unlikely() from the current_order test In __rmqueue_fallback(), current_order loops down from MAX_ORDER - 1 to the order passed. 
MAX_ORDER is typically 11 and pageblock_order is typically 9 on x86. Integer division truncates, so pageblock_order / 2 is 4. For the first eight iterations, it's guaranteed that current_order >= pageblock_order / 2 if it even gets that far! So just remove the unlikely(), it's completely bogus. Signed-off-by: Zhang Yanfei Suggested-by: David Rientjes Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 27f9d4beac98..b5855e545eec 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1046,7 +1046,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) * MIGRATE_CMA areas. */ if (!is_migrate_cma(migratetype) && - (unlikely(current_order >= pageblock_order / 2) || + (current_order >= pageblock_order / 2 || start_migratetype == MIGRATE_RECLAIMABLE || page_group_by_mobility_disabled)) { int pages; From 59d3132f8abdc18301898febf205d00db5f0458c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:00:08 -0700 Subject: [PATCH 031/118] vfree: don't schedule free_work() if llist_add() returns false vfree() only needs schedule_work(&p->wq) if p->list was empty, otherwise vfree_deferred->wq is already pending or it is running and didn't do llist_del_all() yet. 
Signed-off-by: Oleg Nesterov Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 318c5007f226..a649186669a3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1477,7 +1477,6 @@ static void __vunmap(const void *addr, int deallocate_pages) * conventions for vfree() arch-depenedent would be a really bad idea) * * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node) - * */ void vfree(const void *addr) { @@ -1489,8 +1488,8 @@ void vfree(const void *addr) return; if (unlikely(in_interrupt())) { struct vfree_deferred *p = &__get_cpu_var(vfree_deferred); - llist_add((struct llist_node *)addr, &p->list); - schedule_work(&p->wq); + if (llist_add((struct llist_node *)addr, &p->list)) + schedule_work(&p->wq); } else __vunmap(addr, 1); } From 929aaf56958ab2300919653b923413af695470a5 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 16:00:09 -0700 Subject: [PATCH 032/118] mm: remove unused __put_page() This function is nowhere used, and it has a confusing name with put_page in mm/swap.c. So better to remove it. 
Signed-off-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 8562de0a5197..4390ac6c106e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -32,11 +32,6 @@ static inline void set_page_refcounted(struct page *page) set_page_count(page, 1); } -static inline void __put_page(struct page *page) -{ - atomic_dec(&page->_count); -} - static inline void __get_page_tail_foll(struct page *page, bool get_page_head) { From f3deb6872b946a851a3799b315f3c85ce4c027fc Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 16:00:10 -0700 Subject: [PATCH 033/118] mm/sparse.c: put clear_hwpoisoned_pages within CONFIG_MEMORY_HOTREMOVE With CONFIG_MEMORY_HOTREMOVE unset, there is a compile warning: mm/sparse.c:755: warning: `clear_hwpoisoned_pages' defined but not used And Bisecting it ended up pointing to 4edd7ceff ("mm, hotplug: avoid compiling memory hotremove functions when disabled"). This is because the commit above put sparse_remove_one_section() within the protection of CONFIG_MEMORY_HOTREMOVE but the only user of clear_hwpoisoned_pages() is sparse_remove_one_section(), and it is not within the protection of CONFIG_MEMORY_HOTREMOVE. So put clear_hwpoisoned_pages within CONFIG_MEMORY_HOTREMOVE should fix the warning. 
Signed-off-by: Zhang Yanfei Cc: David Rientjes Acked-by: Toshi Kani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/sparse.c b/mm/sparse.c index b38400f0fb8d..308d50331bc3 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -753,6 +753,7 @@ out: return ret; } +#ifdef CONFIG_MEMORY_HOTREMOVE #ifdef CONFIG_MEMORY_FAILURE static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) { @@ -774,7 +775,6 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) } #endif -#ifdef CONFIG_MEMORY_HOTREMOVE static void free_section_usemap(struct page *memmap, unsigned long *usemap) { struct page *usemap_page; From 12057841008534236e52df3d3e63e089f27c5406 Mon Sep 17 00:00:00 2001 From: Haicheng Li Date: Mon, 8 Jul 2013 16:00:11 -0700 Subject: [PATCH 034/118] fs/fs-writeback.c: : make wb_do_writeback() as static It's not used globally and could be static. Signed-off-by: Haicheng Li Cc: Jan Kara Cc: Wu Fengguang Cc: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 2 +- include/linux/writeback.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a85ac4e33436..aca8835c8c17 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -963,7 +963,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) /* * Retrieve work items and do the writeback they describe */ -long wb_do_writeback(struct bdi_writeback *wb, int force_wait) +static long wb_do_writeback(struct bdi_writeback *wb, int force_wait) { struct backing_dev_info *bdi = wb->bdi; struct wb_writeback_work *work; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index abfe11787af3..e0efffafcd31 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -95,7 +95,6 @@ int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, void sync_inodes_sb(struct super_block *); long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, enum wb_reason reason); -long wb_do_writeback(struct bdi_writeback *wb, int force_wait); void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); From 6ce1bc86ae8b8f74095f2694732ccbab2f3849e5 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 8 Jul 2013 16:00:12 -0700 Subject: [PATCH 035/118] mm/writeback: remove wb_reason_name wb_reason_name is not used any more - remove it. 
Signed-off-by: Wanpeng Li Reviewed-by: Tejun Heo Reviewed-by: Fengguang Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/writeback.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index e0efffafcd31..e1703ded543a 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -51,7 +51,6 @@ enum wb_reason { WB_REASON_MAX, }; -extern const char *wb_reason_name[]; /* * A control structure which tells the writeback code what to do. These are From 25d130ba22362757a90135fd8a0f75cc7fc71e79 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 8 Jul 2013 16:00:14 -0700 Subject: [PATCH 036/118] mm/writeback: don't check force_wait to handle bdi->work_list After commit 839a8e8660b6 ("writeback: replace custom worker pool implementation with unbound workqueue"), bdi_writeback_workfn runs off bdi_writeback->dwork, on each execution, it processes bdi->work_list and reschedules if there are more things to do instead of flush any work that race with us existing. It is unnecessary to check force_wait in wb_do_writeback since it is always 0 after the mentioned commit. This patch removes the force_wait in wb_do_writeback.
Signed-off-by: Wanpeng Li Reviewed-by: Tejun Heo Reviewed-by: Fengguang Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index aca8835c8c17..68851ff2fd41 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -963,7 +963,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) /* * Retrieve work items and do the writeback they describe */ -static long wb_do_writeback(struct bdi_writeback *wb, int force_wait) +static long wb_do_writeback(struct bdi_writeback *wb) { struct backing_dev_info *bdi = wb->bdi; struct wb_writeback_work *work; @@ -971,12 +971,6 @@ static long wb_do_writeback(struct bdi_writeback *wb, int force_wait) set_bit(BDI_writeback_running, &wb->bdi->state); while ((work = get_next_work_item(bdi)) != NULL) { - /* - * Override sync mode, in case we must wait for completion - * because this thread is exiting now. - */ - if (force_wait) - work->sync_mode = WB_SYNC_ALL; trace_writeback_exec(bdi, work); @@ -1025,7 +1019,7 @@ void bdi_writeback_workfn(struct work_struct *work) * rescuer as work_list needs to be drained. */ do { - pages_written = wb_do_writeback(wb, 0); + pages_written = wb_do_writeback(wb); trace_writeback_pages_written(pages_written); } while (!list_empty(&bdi->work_list)); } else { From fc6df808aaf00eed564e2e7fc0f246691363cd12 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 8 Jul 2013 16:00:15 -0700 Subject: [PATCH 037/118] mm/writeback: commit reason of WB_REASON_FORKER_THREAD mismatch name After commit 839a8e8660b6 ("writeback: replace custom worker pool implementation with unbound workqueue"), there is no bdi forker thread any more. However, WB_REASON_FORKER_THREAD is still used due to it is TPs userland visible and we won't be exposing exactly the same information with just a different name. 
Signed-off-by: Wanpeng Li Reviewed-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/writeback.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index e1703ded543a..4e198ca1f685 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -47,6 +47,12 @@ enum wb_reason { WB_REASON_LAPTOP_TIMER, WB_REASON_FREE_MORE_MEM, WB_REASON_FS_FREE_SPACE, + /* + * There is no bdi forker thread any more and works are done + * by emergency worker, however, this is TPs userland visible + * and we'll be exposing exactly the same information, + * so it has a mismatch name. + */ WB_REASON_FORKER_THREAD, WB_REASON_MAX, From f8f191f1addf0b31f188fd88e71e97200871c99c Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 8 Jul 2013 16:00:16 -0700 Subject: [PATCH 038/118] mm/page_alloc: fix doc for numa_zonelist_order The default zonelist order selecter will select "node" order if any nodes DMA zone comprises greater than 70% of its local memory instead of 60%, according to default_zonelist_order::low_kmem_size > total * 70/100. Signed-off-by: Wanpeng Li Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index dcc75a9ed919..36ecc26c7433 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -510,7 +510,7 @@ Specify "[Dd]efault" to request automatic configuration. Autoconfiguration will select "node" order in following case. (1) if the DMA zone does not exist or (2) if the DMA zone comprises greater than 50% of the available memory or -(3) if any node's DMA zone comprises greater than 60% of its local memory and +(3) if any node's DMA zone comprises greater than 70% of its local memory and the amount of local memory is big enough. 
Otherwise, "zone" order will be selected. Default order is recommended unless From f49cbdde495f62e1c2d906b16e833cec27de5e59 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 8 Jul 2013 16:00:16 -0700 Subject: [PATCH 039/118] mm/thp: fix doc for transparent huge zero page Transparent huge zero page is used during the page fault instead of in khugepaged. # ls /sys/kernel/mm/transparent_hugepage/ defrag enabled khugepaged use_zero_page # ls /sys/kernel/mm/transparent_hugepage/khugepaged/ alloc_sleep_millisecs defrag full_scans max_ptes_none pages_collapsed pages_to_scan scan_sleep_millisecs This patch corrects the documentation just like the codes done. Signed-off-by: Wanpeng Li Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/transhuge.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index 8785fb87d9c7..4a63953a41f1 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt @@ -120,8 +120,8 @@ By default kernel tries to use huge zero page on read page fault. 
It's possible to disable huge zero page by writing 0 or enable it back by writing 1: -echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/use_zero_page -echo 1 >/sys/kernel/mm/transparent_hugepage/khugepaged/use_zero_page +echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page +echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page khugepaged will be automatically started when transparent_hugepage/enabled is set to "always" or "madvise, and it'll From 73b44ff43c4b3cf517826da03c51948593f88753 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 8 Jul 2013 16:00:17 -0700 Subject: [PATCH 040/118] mm/pgtable: don't accumulate addr during pgd prepopulate pmd The old codes accumulate addr to get right pmd, however, currently pmds are preallocated and transfered as a parameter, there is unnecessary to accumulate addr variable any more, this patch remove it. Signed-off-by: Wanpeng Li Reviewed-by: Michal Hocko Reviewed-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pgtable.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 17fda6a8b3c2..dfa537a03be1 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -240,7 +240,6 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) { pud_t *pud; - unsigned long addr; int i; if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ @@ -248,8 +247,7 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) pud = pud_offset(pgd, 0); - for (addr = i = 0; i < PREALLOCATED_PMDS; - i++, pud++, addr += PUD_SIZE) { + for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { pmd_t *pmd = pmds[i]; if (i >= KERNEL_PGD_BOUNDARY) From 64363aad5ff1b878230e91223038c26a2205bff3 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 8 Jul 2013 16:00:18 -0700 Subject: [PATCH 041/118] mm: remove unused VM_ 
macros and expand other in-place These VM_ macros aren't used very often and three of them aren't used at all. Expand the ones that are used in-place, and remove all the now unused #define VM_ macros. VM_READHINTMASK, VM_NormalReadHint and VM_ClearReadHint were added just before 2.4 and appears have never been used. Signed-off-by: Joe Perches Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 ------ mm/filemap.c | 6 +++--- mm/memory.c | 2 +- mm/rmap.c | 2 +- 4 files changed, 5 insertions(+), 11 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index b87681adf0ba..f0224608d15e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -151,12 +151,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_STACK_FLAGS (VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) #endif -#define VM_READHINTMASK (VM_SEQ_READ | VM_RAND_READ) -#define VM_ClearReadHint(v) (v)->vm_flags &= ~VM_READHINTMASK -#define VM_NormalReadHint(v) (!((v)->vm_flags & VM_READHINTMASK)) -#define VM_SequentialReadHint(v) ((v)->vm_flags & VM_SEQ_READ) -#define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ) - /* * Special vmas that are non-mergable, non-mlock()able. * Note: mm/huge_memory.c VM_NO_THP depends on this definition. 
diff --git a/mm/filemap.c b/mm/filemap.c index 7905fe721aa8..4b51ac1acae7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1539,12 +1539,12 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, struct address_space *mapping = file->f_mapping; /* If we don't want any read-ahead, don't bother */ - if (VM_RandomReadHint(vma)) + if (vma->vm_flags & VM_RAND_READ) return; if (!ra->ra_pages) return; - if (VM_SequentialReadHint(vma)) { + if (vma->vm_flags & VM_SEQ_READ) { page_cache_sync_readahead(mapping, ra, file, offset, ra->ra_pages); return; @@ -1584,7 +1584,7 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, struct address_space *mapping = file->f_mapping; /* If we don't want any read-ahead, don't bother */ - if (VM_RandomReadHint(vma)) + if (vma->vm_flags & VM_RAND_READ) return; if (ra->mmap_miss > 0) ra->mmap_miss--; diff --git a/mm/memory.c b/mm/memory.c index b68812d682b6..1ce2e2a734fc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1150,7 +1150,7 @@ again: if (pte_dirty(ptent)) set_page_dirty(page); if (pte_young(ptent) && - likely(!VM_SequentialReadHint(vma))) + likely(!(vma->vm_flags & VM_SEQ_READ))) mark_page_accessed(page); rss[MM_FILEPAGES]--; } diff --git a/mm/rmap.c b/mm/rmap.c index e22ceeb6e5ec..cd356df4f71a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -720,7 +720,7 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, * mapping is already gone, the unmap path will have * set PG_referenced or activated the page. */ - if (likely(!VM_SequentialReadHint(vma))) + if (likely(!(vma->vm_flags & VM_SEQ_READ))) referenced++; } pte_unmap_unlock(pte, ptl); From bcb615a81b1765864c71c50afb56631e7a1e5283 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 16:00:19 -0700 Subject: [PATCH 042/118] mm/vmalloc.c: fix an overflow bug in alloc_vmap_area() When searching a vmap area in the vmalloc space, we use (addr + size - 1) to check if the value is less than addr, which is an overflow. 
But we assign (addr + size) to vmap_area->va_end. So if we come across the below case: (addr + size - 1) : not overflow (addr + size) : overflow we will assign an overflow value (e.g 0) to vmap_area->va_end, And this will trigger BUG in __insert_vmap_area, causing system panic. So using (addr + size) to check the overflow should be the correct behaviour, not (addr + size - 1). Signed-off-by: Zhang Yanfei Reported-by: Ghennadi Procopciuc Tested-by: Daniel Baluta Cc: David Rientjes Cc: Minchan Kim Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a649186669a3..13a54953a273 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -388,12 +388,12 @@ nocache: addr = ALIGN(first->va_end, align); if (addr < vstart) goto nocache; - if (addr + size - 1 < addr) + if (addr + size < addr) goto overflow; } else { addr = ALIGN(vstart, align); - if (addr + size - 1 < addr) + if (addr + size < addr) goto overflow; n = vmap_area_root.rb_node; @@ -420,7 +420,7 @@ nocache: if (addr + cached_hole_size < first->va_start) cached_hole_size = first->va_start - addr; addr = ALIGN(first->va_end, align); - if (addr + size - 1 < addr) + if (addr + size < addr) goto overflow; if (list_is_last(&first->list, &vmap_area_list)) From ef277c73ca3b1aade278036ae11640090681d558 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Mon, 8 Jul 2013 16:00:21 -0700 Subject: [PATCH 043/118] page migration: fix wrong comment in address_space_operations.migratepage() There is no parameter "sync" in address_space_operations->migratepage(). It should be migrate_mode. And the comment is for MIGRATE_ASYNC. 
Signed-off-by: Tang Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 99be011e00de..cb771ecc2362 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -372,8 +372,8 @@ struct address_space_operations { int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **, unsigned long *); /* - * migrate the contents of a page to the specified target. If sync - * is false, it must not block. + * migrate the contents of a page to the specified target. If + * migrate_mode is MIGRATE_ASYNC, it must not block. */ int (*migratepage) (struct address_space *, struct page *, struct page *, enum migrate_mode); From d8bbdd773d64b30b6b36f027ad2e182ed2045f3c Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Mon, 8 Jul 2013 16:00:22 -0700 Subject: [PATCH 044/118] mm/memblock.c: fix wrong comment in __next_free_mem_range() Remove one redundant "nid" in the comment. 
Signed-off-by: Tang Chen Signed-off-by: Linus Torvalds --- mm/memblock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memblock.c b/mm/memblock.c index c5fad932fa51..a847bfe6f3ba 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -566,7 +566,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) /** * __next_free_mem_range - next function for for_each_free_mem_range() * @idx: pointer to u64 loop variable - * @nid: nid: node selector, %MAX_NUMNODES for all nodes + * @nid: node selector, %MAX_NUMNODES for all nodes * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL * @out_nid: ptr to int for nid of the range, can be %NULL From 7e9f5eb03d3762ec89dda1888c774ae7b4040af7 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Mon, 8 Jul 2013 16:00:23 -0700 Subject: [PATCH 045/118] mm/memory_hotplug.c: fix a comment typo in register_page_bootmem_info_node() Signed-off-by: Tang Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f5ba127b2051..cd2990fdf6c1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -208,13 +208,13 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) pfn = pgdat->node_start_pfn; end_pfn = pgdat_end_pfn(pgdat); - /* register_section info */ + /* register section info */ for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { /* * Some platforms can assign the same pfn to multiple nodes - on * node0 as well as nodeN. To avoid registering a pfn against * multiple nodes we check that this pfn does not already - * reside in some other node. + * reside in some other nodes. 
*/ if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node)) register_page_bootmem_info_section(pfn); From 5a1c9cbc1550f93335d7c03eb6c271e642deff04 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Jul 2013 16:00:24 -0700 Subject: [PATCH 046/118] mm: vmscan: do not continue scanning if reclaim was aborted for compaction Direct reclaim is not aborting to allow compaction to go ahead properly. do_try_to_free_pages is told to abort reclaim which is happily ignores and instead increases priority instead until it reaches 0 and starts shrinking file/anon equally. This patch corrects the situation by aborting reclaim when requested instead of raising priority. Signed-off-by: Mel Gorman Cc: Rik van Riel Cc: Johannes Weiner Cc: Michal Hocko Cc: Dave Chinner Cc: Kamezawa Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 99b3ac7771ad..2385663ae5e5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2361,8 +2361,10 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, aborted_reclaim = shrink_zones(zonelist, sc); /* - * Don't shrink slabs when reclaiming memory from - * over limit cgroups + * Don't shrink slabs when reclaiming memory from over limit + * cgroups but do shrink slab at least once when aborting + * reclaim for compaction to avoid unevenly scanning file/anon + * LRU pages over slab pages. 
*/ if (global_reclaim(sc)) { unsigned long lru_pages = 0; @@ -2404,7 +2406,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, WB_REASON_TRY_TO_FREE_PAGES); sc->may_writepage = 1; } - } while (--sc->priority >= 0); + } while (--sc->priority >= 0 && !aborted_reclaim); out: delayacct_freepages_end(); From 918fc718c5922520c499ad60f61b8df86b998ae9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Jul 2013 16:00:25 -0700 Subject: [PATCH 047/118] mm: vmscan: do not scale writeback pages when deciding whether to set ZONE_WRITEBACK After the patch "mm: vmscan: Flatten kswapd priority loop" was merged the scanning priority of kswapd changed. The priority now rises until it is scanning enough pages to meet the high watermark. shrink_inactive_list sets ZONE_WRITEBACK if a number of pages were encountered under writeback but this value is scaled based on the priority. As kswapd frequently scans with a higher priority now it is relatively easy to set ZONE_WRITEBACK. This patch removes the scaling and treates writeback pages similar to how it treats unqueued dirty pages and congested pages. The user-visible effect should be that kswapd will writeback fewer pages from reclaim context. Signed-off-by: Mel Gorman Cc: Rik van Riel Cc: Johannes Weiner Cc: Michal Hocko Cc: Dave Chinner Cc: Kamezawa Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 2385663ae5e5..2cff0d491c6d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1443,25 +1443,11 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * as there is no guarantee the dirtying process is throttled in the * same way balance_dirty_pages() manages. * - * This scales the number of dirty pages that must be under writeback - * before a zone gets flagged ZONE_WRITEBACK. 
It is a simple backoff - * function that has the most effect in the range DEF_PRIORITY to - * DEF_PRIORITY-2 which is the priority reclaim is considered to be - * in trouble and reclaim is considered to be in trouble. - * - * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle - * DEF_PRIORITY-1 50% must be PageWriteback - * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble - * ... - * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any - * isolated page is PageWriteback - * * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number * of pages under pages flagged for immediate reclaim and stall if any * are encountered in the nr_immediate check below. */ - if (nr_writeback && nr_writeback >= - (nr_taken >> (DEF_PRIORITY - sc->priority))) + if (nr_writeback && nr_writeback == nr_taken) zone_set_flag(zone, ZONE_WRITEBACK); /* From 493af578040e690f93f0fc8d9e7667ffff8155bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Engel?= Date: Mon, 8 Jul 2013 16:00:26 -0700 Subject: [PATCH 048/118] mmap: allow MAP_HUGETLB for hugetlbfs files v2 It is counterintuitive at best that mmap'ing a hugetlbfs file with MAP_HUGETLB fails, while mmap'ing it without will a) succeed and b) return huge pages. 
v2: use is_file_hugepages(), as suggested by Jianguo Signed-off-by: Joern Engel Cc: Jianguo Wu Signed-off-by: Linus Torvalds --- mm/mmap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 8468ffd05bae..0718c175db8f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1358,13 +1358,14 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, if (!(flags & MAP_ANONYMOUS)) { audit_mmap_fd(fd, flags); - if (unlikely(flags & MAP_HUGETLB)) - return -EINVAL; file = fget(fd); if (!file) goto out; if (is_file_hugepages(file)) len = ALIGN(len, huge_page_size(hstate_file(file))); + retval = -EINVAL; + if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file))) + goto out_fput; } else if (flags & MAP_HUGETLB) { struct user_struct *user = NULL; struct hstate *hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & @@ -1391,6 +1392,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); +out_fput: if (file) fput(file); out: From fa460c2d37870e0a6f94c70e8b76d05ca11b6db0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 Jul 2013 16:00:27 -0700 Subject: [PATCH 049/118] Revert "memcg: avoid dangling reference count in creation failure" This reverts commit e4715f01be697a. mem_cgroup_put is hierarchy aware so mem_cgroup_put(memcg) already drops an additional reference from all parents so the additional mem_cgroup_put(parent) potentially causes use-after-free.
Signed-off-by: Michal Hocko Signed-off-by: Li Zefan Acked-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Cc: [3.9+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 64f726599ff0..6b73d8657d64 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6352,8 +6352,6 @@ mem_cgroup_css_online(struct cgroup *cont) * call __mem_cgroup_free, so return directly */ mem_cgroup_put(memcg); - if (parent->use_hierarchy) - mem_cgroup_put(parent); } return error; } From f37a96914d1aea10fed8d9af10251f0b9caea31b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 Jul 2013 16:00:29 -0700 Subject: [PATCH 050/118] memcg, kmem: fix reference count handling on the error path mem_cgroup_css_online calls mem_cgroup_put if memcg_init_kmem fails. This is not correct because only memcg_propagate_kmem takes an additional reference while mem_cgroup_sockets_init is allowed to fail as well (although no current implementation fails) but it doesn't take any reference. This all suggests that it should be memcg_propagate_kmem that should clean up after itself so this patch moves mem_cgroup_put over there. Unfortunately this is not that easy (as pointed out by Li Zefan) because memcg_kmem_mark_dead marks the group dead (KMEM_ACCOUNTED_DEAD) if it is marked active (KMEM_ACCOUNTED_ACTIVE) which is the case even if memcg_propagate_kmem fails so the additional reference is dropped in that case in kmem_cgroup_destroy which means that the reference would be dropped two times. The easiest way then would be to simply remove mem_cgroup_put from mem_cgroup_css_online and rely on kmem_cgroup_destroy doing the right thing.
Signed-off-by: Michal Hocko Signed-off-by: Li Zefan Acked-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Cc: [3.8] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6b73d8657d64..bdeb82ca6c20 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6345,14 +6345,6 @@ mem_cgroup_css_online(struct cgroup *cont) error = memcg_init_kmem(memcg, &mem_cgroup_subsys); mutex_unlock(&memcg_create_mutex); - if (error) { - /* - * We call put now because our (and parent's) refcnts - * are already in place. mem_cgroup_put() will internally - * call __mem_cgroup_free, so return directly - */ - mem_cgroup_put(memcg); - } return error; } From 5347e5ae13710420eebbbd0b22c045685704da80 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 8 Jul 2013 16:00:30 -0700 Subject: [PATCH 051/118] memcg: use css_get() in sock_update_memcg() Use css_get/css_put instead of mem_cgroup_get/put. Note, if at the same time someone is moving @current to a different cgroup and removing the old cgroup, css_tryget() may return false, and sock->sk_cgrp won't be initialized, which is fine. 
Signed-off-by: Li Zefan Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bdeb82ca6c20..4c31a21a55be 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -551,15 +551,15 @@ void sock_update_memcg(struct sock *sk) */ if (sk->sk_cgrp) { BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); - mem_cgroup_get(sk->sk_cgrp->memcg); + css_get(&sk->sk_cgrp->memcg->css); return; } rcu_read_lock(); memcg = mem_cgroup_from_task(current); cg_proto = sk->sk_prot->proto_cgroup(memcg); - if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) { - mem_cgroup_get(memcg); + if (!mem_cgroup_is_root(memcg) && + memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) { sk->sk_cgrp = cg_proto; } rcu_read_unlock(); @@ -573,7 +573,7 @@ void sock_release_memcg(struct sock *sk) struct mem_cgroup *memcg; WARN_ON(!sk->sk_cgrp->memcg); memcg = sk->sk_cgrp->memcg; - mem_cgroup_put(memcg); + css_put(&sk->sk_cgrp->memcg->css); } } From 20f05310ba62d5816fb339d08effe78683137197 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 8 Jul 2013 16:00:31 -0700 Subject: [PATCH 052/118] memcg: don't use mem_cgroup_get() when creating a kmemcg cache Use css_get()/css_put() instead of mem_cgroup_get()/mem_cgroup_put(). There are two things being done in the current code: First, we acquired a css_ref to make sure that the underlying cgroup would not go away. That is a short lived reference, and it is put as soon as the cache is created. At this point, we acquire a long-lived per-cache memcg reference count to guarantee that the memcg will still be alive. so it is: enqueue: css_get create : memcg_get, css_put destroy: memcg_put So we only need to get rid of the memcg_get, change the memcg_put to css_put, and get rid of the now extra css_put. 
(This changelog is mostly written by Glauber) Signed-off-by: Li Zefan Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4c31a21a55be..80175ded718d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3242,7 +3242,7 @@ void memcg_release_cache(struct kmem_cache *s) list_del(&s->memcg_params->list); mutex_unlock(&memcg->slab_caches_mutex); - mem_cgroup_put(memcg); + css_put(&memcg->css); out: kfree(s->memcg_params); } @@ -3402,16 +3402,18 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&memcg_cache_mutex); new_cachep = cachep->memcg_params->memcg_caches[idx]; - if (new_cachep) + if (new_cachep) { + css_put(&memcg->css); goto out; + } new_cachep = kmem_cache_dup(memcg, cachep); if (new_cachep == NULL) { new_cachep = cachep; + css_put(&memcg->css); goto out; } - mem_cgroup_get(memcg); atomic_set(&new_cachep->memcg_params->nr_pages , 0); cachep->memcg_params->memcg_caches[idx] = new_cachep; @@ -3499,8 +3501,6 @@ static void memcg_create_cache_work_func(struct work_struct *w) cw = container_of(w, struct create_work, work); memcg_create_kmem_cache(cw->memcg, cw->cachep); - /* Drop the reference gotten when we enqueued. */ - css_put(&cw->memcg->css); kfree(cw); } From 10d5ebf40ff09db03b97cb177f24b9c7c8b4bb52 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 8 Jul 2013 16:00:33 -0700 Subject: [PATCH 053/118] memcg: use css_get/put when charging/uncharging kmem Use css_get/put instead of mem_cgroup_get/put. We can't do a simple replacement, because here mem_cgroup_put() is called during mem_cgroup_css_free(), while mem_cgroup_css_free() won't be called until css refcnt goes down to 0. 
Instead we increment css refcnt in mem_cgroup_css_offline(), and then check if there's still kmem charges. If not, css refcnt will be decremented immediately, otherwise the refcnt will be released after the last kmem allocation is uncharged. [akpm@linux-foundation.org: tweak comment] Signed-off-by: Li Zefan Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Tejun Heo Cc: Michal Hocko Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 80 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 54 insertions(+), 26 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 80175ded718d..bdc9582585af 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -406,6 +406,11 @@ static void memcg_kmem_clear_activated(struct mem_cgroup *memcg) static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) { + /* + * Our caller must use css_get() first, because memcg_uncharge_kmem() + * will call css_put() if it sees the memcg is dead. + */ + smp_wmb(); if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); } @@ -3050,8 +3055,16 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) if (res_counter_uncharge(&memcg->kmem, size)) return; + /* + * Releases a reference taken in kmem_cgroup_css_offline in case + * this last uncharge is racing with the offlining code or it is + * outliving the memcg existence. + * + * The memory barrier imposed by test&clear is paired with the + * explicit one in memcg_kmem_mark_dead().
+ */ if (memcg_kmem_test_and_clear_dead(memcg)) - mem_cgroup_put(memcg); + css_put(&memcg->css); } void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) @@ -5183,14 +5196,6 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) * starts accounting before all call sites are patched */ memcg_kmem_set_active(memcg); - - /* - * kmem charges can outlive the cgroup. In the case of slab - * pages, for instance, a page contain objects from various - * processes, so it is unfeasible to migrate them away. We - * need to reference count the memcg because of that. - */ - mem_cgroup_get(memcg); } else ret = res_counter_set_limit(&memcg->kmem, val); out: @@ -5223,12 +5228,10 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) goto out; /* - * destroy(), called if we fail, will issue static_key_slow_inc() and - * mem_cgroup_put() if kmem is enabled. We have to either call them - * unconditionally, or clear the KMEM_ACTIVE flag. I personally find - * this more consistent, since it always leads to the same destroy path + * __mem_cgroup_free() will issue static_key_slow_dec() because this + * memcg is active already. If the later initialization fails then the + * cgroup core triggers the cleanup so we do not have to do it here. */ - mem_cgroup_get(memcg); static_key_slow_inc(&memcg_kmem_enabled_key); mutex_lock(&set_limit_mutex); @@ -5913,23 +5916,43 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) return mem_cgroup_sockets_init(memcg, ss); } -static void kmem_cgroup_destroy(struct mem_cgroup *memcg) +static void memcg_destroy_kmem(struct mem_cgroup *memcg) { mem_cgroup_sockets_destroy(memcg); +} + +static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) +{ + if (!memcg_kmem_is_active(memcg)) + return; + + /* + * kmem charges can outlive the cgroup. In the case of slab + * pages, for instance, a page contain objects from various + * processes. 
As we prevent from taking a reference for every + * such allocation we have to be careful when doing uncharge + * (see memcg_uncharge_kmem) and here during offlining. + * + * The idea is that that only the _last_ uncharge which sees + * the dead memcg will drop the last reference. An additional + * reference is taken here before the group is marked dead + * which is then paired with css_put during uncharge resp. here. + * + * Although this might sound strange as this path is called from + * css_offline() when the referencemight have dropped down to 0 + * and shouldn't be incremented anymore (css_tryget would fail) + * we do not have other options because of the kmem allocations + * lifetime. + */ + css_get(&memcg->css); memcg_kmem_mark_dead(memcg); if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) return; - /* - * Charges already down to 0, undo mem_cgroup_get() done in the charge - * path here, being careful not to race with memcg_uncharge_kmem: it is - * possible that the charges went down to 0 between mark_dead and the - * res_counter read, so in that case, we don't need the put - */ if (memcg_kmem_test_and_clear_dead(memcg)) - mem_cgroup_put(memcg); + css_put(&memcg->css); } #else static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) @@ -5937,7 +5960,11 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) return 0; } -static void kmem_cgroup_destroy(struct mem_cgroup *memcg) +static void memcg_destroy_kmem(struct mem_cgroup *memcg) +{ +} + +static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) { } #endif @@ -6370,6 +6397,8 @@ static void mem_cgroup_css_offline(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + kmem_cgroup_css_offline(memcg); + mem_cgroup_invalidate_reclaim_iterators(memcg); mem_cgroup_reparent_charges(memcg); mem_cgroup_destroy_all_caches(memcg); @@ -6379,9 +6408,8 @@ static void mem_cgroup_css_free(struct cgroup *cont) { struct mem_cgroup *memcg = 
mem_cgroup_from_cont(cont); - kmem_cgroup_destroy(memcg); - - mem_cgroup_put(memcg); + memcg_destroy_kmem(memcg); + __mem_cgroup_free(memcg); } #ifdef CONFIG_MMU From 4050377b509b326c14b275fedb2f69b46f37a7a9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 8 Jul 2013 16:00:34 -0700 Subject: [PATCH 054/118] memcg: use css_get/put for swap memcg Use css_get/put instead of mem_cgroup_get/put. A simple replacement will do. The historical reason that memcg has its own refcnt instead of always using css_get/put, is that cgroup couldn't be removed if there're still css refs, so css refs can't be used as long-lived reference. The situation has changed so that rmdir a cgroup will succeed regardless css refs, but won't be freed until css refs goes down to 0. Signed-off-by: Li Zefan Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bdc9582585af..76c0c99b002f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4231,12 +4231,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, unlock_page_cgroup(pc); /* * even after unlock, we have memcg->res.usage here and this memcg - * will never be freed. + * will never be freed, so it's safe to call css_get(). */ memcg_check_events(memcg, page); if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { mem_cgroup_swap_statistics(memcg, true); - mem_cgroup_get(memcg); + css_get(&memcg->css); } /* * Migration does not charge the res_counter for the @@ -4348,7 +4348,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) /* * record memcg information, if swapout && memcg != NULL, - * mem_cgroup_get() was called in uncharge(). + * css_get() was called in uncharge(). 
*/ if (do_swap_account && swapout && memcg) swap_cgroup_record(ent, css_id(&memcg->css)); @@ -4379,7 +4379,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) if (!mem_cgroup_is_root(memcg)) res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_swap_statistics(memcg, false); - mem_cgroup_put(memcg); + css_put(&memcg->css); } rcu_read_unlock(); } @@ -4413,11 +4413,14 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, * This function is only called from task migration context now. * It postpones res_counter and refcount handling till the end * of task migration(mem_cgroup_clear_mc()) for performance - * improvement. But we cannot postpone mem_cgroup_get(to) - * because if the process that has been moved to @to does - * swap-in, the refcount of @to might be decreased to 0. + * improvement. But we cannot postpone css_get(to) because if + * the process that has been moved to @to does swap-in, the + * refcount of @to might be decreased to 0. + * + * We are in attach() phase, so the cgroup is guaranteed to be + * alive, so we can just call css_get(). 
*/ - mem_cgroup_get(to); + css_get(&to->css); return 0; } return -EINVAL; @@ -6718,6 +6721,7 @@ static void __mem_cgroup_clear_mc(void) { struct mem_cgroup *from = mc.from; struct mem_cgroup *to = mc.to; + int i; /* we must uncharge all the leftover precharges from mc.to */ if (mc.precharge) { @@ -6738,7 +6742,9 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.from)) res_counter_uncharge(&mc.from->memsw, PAGE_SIZE * mc.moved_swap); - __mem_cgroup_put(mc.from, mc.moved_swap); + + for (i = 0; i < mc.moved_swap; i++) + css_put(&mc.from->css); if (!mem_cgroup_is_root(mc.to)) { /* @@ -6748,7 +6754,7 @@ static void __mem_cgroup_clear_mc(void) res_counter_uncharge(&mc.to->res, PAGE_SIZE * mc.moved_swap); } - /* we've already done mem_cgroup_get(mc.to) */ + /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } memcg_oom_recover(from); From 8d76a9797882fc517d87e2b5db2a4f04edaeccec Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 8 Jul 2013 16:00:36 -0700 Subject: [PATCH 055/118] memcg: don't need to get a reference to the parent The cgroup core guarantees it's always safe to access the parent. 
Signed-off-by: Li Zefan Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 76c0c99b002f..c508258d61a1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -503,7 +503,6 @@ enum res_type { */ static DEFINE_MUTEX(memcg_create_mutex); -static void mem_cgroup_get(struct mem_cgroup *memcg); static void mem_cgroup_put(struct mem_cgroup *memcg); static inline @@ -6239,19 +6238,10 @@ static void free_rcu(struct rcu_head *rcu_head) schedule_work(&memcg->work_freeing); } -static void mem_cgroup_get(struct mem_cgroup *memcg) -{ - atomic_inc(&memcg->refcnt); -} - static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) { - if (atomic_sub_and_test(count, &memcg->refcnt)) { - struct mem_cgroup *parent = parent_mem_cgroup(memcg); + if (atomic_sub_and_test(count, &memcg->refcnt)) call_rcu(&memcg->rcu_freeing, free_rcu); - if (parent) - mem_cgroup_put(parent); - } } static void mem_cgroup_put(struct mem_cgroup *memcg) @@ -6354,12 +6344,9 @@ mem_cgroup_css_online(struct cgroup *cont) res_counter_init(&memcg->kmem, &parent->kmem); /* - * We increment refcnt of the parent to ensure that we can - * safely access it on res_counter_charge/uncharge. - * This refcnt will be decremented when freeing this - * mem_cgroup(see mem_cgroup_put). + * No need to take a reference to the parent because cgroup + * core guarantees its existence. */ - mem_cgroup_get(parent); } else { res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); From e0743e6bc5b7587dd0bfa902d67d3f81ef3f6618 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 8 Jul 2013 16:00:37 -0700 Subject: [PATCH 056/118] memcg: kill memcg refcnt Now memcg has the same life cycle as its corresponding cgroup. Kill the useless refcnt. 
Signed-off-by: Li Zefan Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c508258d61a1..fa521a2f4bf6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -299,8 +299,6 @@ struct mem_cgroup { bool oom_lock; atomic_t under_oom; - atomic_t refcnt; - int swappiness; /* OOM-Killer disable */ int oom_kill_disable; @@ -503,8 +501,6 @@ enum res_type { */ static DEFINE_MUTEX(memcg_create_mutex); -static void mem_cgroup_put(struct mem_cgroup *memcg); - static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) { @@ -6238,17 +6234,6 @@ static void free_rcu(struct rcu_head *rcu_head) schedule_work(&memcg->work_freeing); } -static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) -{ - if (atomic_sub_and_test(count, &memcg->refcnt)) - call_rcu(&memcg->rcu_freeing, free_rcu); -} - -static void mem_cgroup_put(struct mem_cgroup *memcg) -{ - __mem_cgroup_put(memcg, 1); -} - /* * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 
*/ @@ -6308,7 +6293,6 @@ mem_cgroup_css_alloc(struct cgroup *cont) memcg->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&memcg->oom_notify); - atomic_set(&memcg->refcnt, 1); memcg->move_charge_at_immigrate = 0; mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); @@ -6399,7 +6383,7 @@ static void mem_cgroup_css_free(struct cgroup *cont) struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); memcg_destroy_kmem(memcg); - __mem_cgroup_free(memcg); + call_rcu(&memcg->rcu_freeing, free_rcu); } #ifdef CONFIG_MMU From 465939a1fa283cf2a5194362c5accf4429c99c42 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 8 Jul 2013 16:00:38 -0700 Subject: [PATCH 057/118] memcg: don't need to free memcg via RCU or workqueue Now memcg has the same life cycle with its corresponding cgroup, and a cgroup is freed via RCU and then mem_cgroup_css_free() will be called in a work function, so we can simply call __mem_cgroup_free() in mem_cgroup_css_free(). This actually reverts commit 59927fb984d ("memcg: free mem_cgroup by RCU to fix oops"). Signed-off-by: Li Zefan Cc: Hugh Dickins Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 51 +++++-------------------------------------------- 1 file changed, 5 insertions(+), 46 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fa521a2f4bf6..d12ca6f3c293 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -263,28 +263,10 @@ struct mem_cgroup { /* vmpressure notifications */ struct vmpressure vmpressure; - union { - /* - * the counter to account for mem+swap usage. - */ - struct res_counter memsw; - - /* - * rcu_freeing is used only when freeing struct mem_cgroup, - * so put it into a union to avoid wasting more memory. - * It must be disjoint from the css field. 
It could be - * in a union with the res field, but res plays a much - * larger part in mem_cgroup life than memsw, and might - * be of interest, even at time of free, when debugging. - * So share rcu_head with the less interesting memsw. - */ - struct rcu_head rcu_freeing; - /* - * We also need some space for a worker in deferred freeing. - * By the time we call it, rcu_freeing is no longer in use. - */ - struct work_struct work_freeing; - }; + /* + * the counter to account for mem+swap usage. + */ + struct res_counter memsw; /* * the counter to account for kernel memory usage. @@ -6211,29 +6193,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) vfree(memcg); } - -/* - * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, - * but in process context. The work_freeing structure is overlaid - * on the rcu_freeing structure, which itself is overlaid on memsw. - */ -static void free_work(struct work_struct *work) -{ - struct mem_cgroup *memcg; - - memcg = container_of(work, struct mem_cgroup, work_freeing); - __mem_cgroup_free(memcg); -} - -static void free_rcu(struct rcu_head *rcu_head) -{ - struct mem_cgroup *memcg; - - memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); - INIT_WORK(&memcg->work_freeing, free_work); - schedule_work(&memcg->work_freeing); -} - /* * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 
*/ @@ -6383,7 +6342,7 @@ static void mem_cgroup_css_free(struct cgroup *cont) struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); memcg_destroy_kmem(memcg); - call_rcu(&memcg->rcu_freeing, free_rcu); + __mem_cgroup_free(memcg); } #ifdef CONFIG_MMU From 5f12733e9d976132e6cbbae9d08f71406fdacdfb Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 Jul 2013 16:00:40 -0700 Subject: [PATCH 058/118] mm: honor min_free_kbytes set by user min_free_kbytes is updated during memory hotplug (by init_per_zone_wmark_min) currently which is right thing to do in most cases but this could be unexpected if admin increased the value to prevent from allocation failures and the new min_free_kbytes would be decreased as a result of memory hotadd. This patch saves the user defined value and allows updating min_free_kbytes only if it is higher than the saved one. A warning is printed when the new value is ignored. Signed-off-by: Michal Hocko Cc: Mel Gorman Acked-by: Zhang Yanfei Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b5855e545eec..b100255dedda 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -204,6 +204,7 @@ static char * const zone_names[MAX_NR_ZONES] = { }; int min_free_kbytes = 1024; +int user_min_free_kbytes; static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; @@ -5589,14 +5590,21 @@ static void __meminit setup_per_zone_inactive_ratio(void) int __meminit init_per_zone_wmark_min(void) { unsigned long lowmem_kbytes; + int new_min_free_kbytes; lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); + new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); - min_free_kbytes = int_sqrt(lowmem_kbytes * 16); - if (min_free_kbytes < 128) - min_free_kbytes = 128; - if (min_free_kbytes > 65536) - min_free_kbytes = 65536; + if 
(new_min_free_kbytes > user_min_free_kbytes) { + min_free_kbytes = new_min_free_kbytes; + if (min_free_kbytes < 128) + min_free_kbytes = 128; + if (min_free_kbytes > 65536) + min_free_kbytes = 65536; + } else { + pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", + new_min_free_kbytes, user_min_free_kbytes); + } setup_per_zone_wmarks(); refresh_zone_stat_thresholds(); setup_per_zone_lowmem_reserve(); @@ -5614,8 +5622,10 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); - if (write) + if (write) { + user_min_free_kbytes = min_free_kbytes; setup_per_zone_wmarks(); + } return 0; } From 0a1be15097a5f5ee8cbaf7cf0a55146363db0e4d Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Mon, 8 Jul 2013 16:00:41 -0700 Subject: [PATCH 059/118] mm/memory_hotplug.c: fix return value of online_pages() online_pages() is called from memory_block_action() when a user requests to online a memory block via sysfs. This function needs to return a proper error value in case of error. 
Signed-off-by: Toshi Kani Cc: Yasuaki Ishimatsu Cc: Tang Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index cd2990fdf6c1..ca1dd3aa5eee 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -914,19 +914,19 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && !can_online_high_movable(zone)) { unlock_memory_hotplug(); - return -1; + return -EINVAL; } if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { unlock_memory_hotplug(); - return -1; + return -EINVAL; } } if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { unlock_memory_hotplug(); - return -1; + return -EINVAL; } } From dcb6b45254e2281b6f99ea7f2d51343954aa3ba8 Mon Sep 17 00:00:00 2001 From: Alex Thorlton Date: Mon, 8 Jul 2013 16:00:42 -0700 Subject: [PATCH 060/118] panic: add cpu/pid to warn_slowpath_common in WARNING printk()s Add the cpu/pid that called WARN() so that the stack traces can be matched up with the WARNING messages. 
[akpm@linux-foundation.org: remove stray quote] Signed-off-by: Alex Thorlton Reviewed-by: Robin Holt Cc: Stephen Boyd Cc: Vikram Mulukutla Cc: Rusty Russell Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/panic.c b/kernel/panic.c index 167ec097ce8b..97712319f128 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -399,8 +399,9 @@ struct slowpath_args { static void warn_slowpath_common(const char *file, int line, void *caller, unsigned taint, struct slowpath_args *args) { - printk(KERN_WARNING "------------[ cut here ]------------\n"); - printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); + pr_warn("------------[ cut here ]------------\n"); + pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n", + raw_smp_processor_id(), current->pid, file, line, caller); if (args) vprintk(args->fmt, args->args); From c707a81de71a27a499fde60fbb963f60602c1a94 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 8 Jul 2013 16:00:43 -0700 Subject: [PATCH 061/118] checkpatch: make the CamelCase cache work for non-git trees too Might as well check include timestamps and cache the include file CamelCase uses for the non-git case too. The camelcase cache file is now named: for git: .checkpatch-camelcase.git. for non-git: .checkpatch-camelcase.date. All .checkpatch-camelcase* files are deleted if not current. 
Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 54 ++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 6afcd1239ca5..2ee9eb750560 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6,6 +6,7 @@ # Licensed under the terms of the GNU GPL License version 2 use strict; +use POSIX; my $P = $0; $P =~ s@.*/@@g; @@ -399,37 +400,52 @@ sub seed_camelcase_includes { return if ($camelcase_seeded); my $files; - my $camelcase_git_file = ""; + my $camelcase_cache = ""; + my @include_files = (); + + $camelcase_seeded = 1; if (-d ".git") { my $git_last_include_commit = `git log --no-merges --pretty=format:"%h%n" -1 -- include`; chomp $git_last_include_commit; - $camelcase_git_file = ".checkpatch-camelcase.$git_last_include_commit"; - if (-f $camelcase_git_file) { - open(my $camelcase_file, '<', "$camelcase_git_file") - or warn "$P: Can't read '$camelcase_git_file' $!\n"; - while (<$camelcase_file>) { - chomp; - $camelcase{$_} = 1; - } - close($camelcase_file); - - return; - } - $files = `git ls-files include`; + $camelcase_cache = ".checkpatch-camelcase.git.$git_last_include_commit"; } else { + my $last_mod_date = 0; $files = `find $root/include -name "*.h"`; + @include_files = split('\n', $files); + foreach my $file (@include_files) { + my $date = POSIX::strftime("%Y%m%d%H%M", + localtime((stat $file)[9])); + $last_mod_date = $date if ($last_mod_date < $date); + } + $camelcase_cache = ".checkpatch-camelcase.date.$last_mod_date"; } - my @include_files = split('\n', $files); + + if ($camelcase_cache ne "" && -f $camelcase_cache) { + open(my $camelcase_file, '<', "$camelcase_cache") + or warn "$P: Can't read '$camelcase_cache' $!\n"; + while (<$camelcase_file>) { + chomp; + $camelcase{$_} = 1; + } + close($camelcase_file); + + return; + } + + if (-d ".git") { + $files = `git ls-files "include/*.h"`; 
+ @include_files = split('\n', $files); + } + foreach my $file (@include_files) { seed_camelcase_file($file); } - $camelcase_seeded = 1; - if ($camelcase_git_file ne "") { + if ($camelcase_cache ne "") { unlink glob ".checkpatch-camelcase.*"; - open(my $camelcase_file, '>', "$camelcase_git_file") - or warn "$P: Can't write '$camelcase_git_file' $!\n"; + open(my $camelcase_file, '>', "$camelcase_cache") + or warn "$P: Can't write '$camelcase_cache' $!\n"; foreach (sort { lc($a) cmp lc($b) } keys(%camelcase)) { print $camelcase_file ("$_\n"); } From 2417898b34ad3fbf2f31771c97ba87792bf97f0c Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 8 Jul 2013 16:00:44 -0700 Subject: [PATCH 062/118] ncpfs: fix error return code in ncp_parse_options() Fix to return -EINVAL from the option parse error handling case instead of 0, as done elsewhere in this function. Signed-off-by: Wei Yongjun Cc: Petr Vandrovec Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ncpfs/inode.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 0765ad12c382..4659da67e7f6 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -403,18 +403,24 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options) switch (optval) { case 'u': data->uid = make_kuid(current_user_ns(), optint); - if (!uid_valid(data->uid)) + if (!uid_valid(data->uid)) { + ret = -EINVAL; goto err; + } break; case 'g': data->gid = make_kgid(current_user_ns(), optint); - if (!gid_valid(data->gid)) + if (!gid_valid(data->gid)) { + ret = -EINVAL; goto err; + } break; case 'o': data->mounted_uid = make_kuid(current_user_ns(), optint); - if (!uid_valid(data->mounted_uid)) + if (!uid_valid(data->mounted_uid)) { + ret = -EINVAL; goto err; + } break; case 'm': data->file_mode = optint; From 4e80b1880c5a31d051d1e4a7377dec0a20701c23 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Mon, 8 Jul 2013 16:00:45 -0700 Subject: [PATCH 063/118] 
drivers/rtc/rtc-stmp3xxx.c: check the return value from stmp_reset_block() stmp_reset_block() may fail, so let's check its return value and propagate it in the case of error. Signed-off-by: Fabio Estevam Acked-by: Shawn Guo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-stmp3xxx.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-stmp3xxx.c b/drivers/rtc/rtc-stmp3xxx.c index 90a3e864b8fe..767fee2ab340 100644 --- a/drivers/rtc/rtc-stmp3xxx.c +++ b/drivers/rtc/rtc-stmp3xxx.c @@ -261,7 +261,12 @@ static int stmp3xxx_rtc_probe(struct platform_device *pdev) platform_set_drvdata(pdev, rtc_data); - stmp_reset_block(rtc_data->io); + err = stmp_reset_block(rtc_data->io); + if (err) { + dev_err(&pdev->dev, "stmp_reset_block failed: %d\n", err); + return err; + } + writel(STMP3XXX_RTC_PERSISTENT0_ALARM_EN | STMP3XXX_RTC_PERSISTENT0_ALARM_WAKE_EN | STMP3XXX_RTC_PERSISTENT0_ALARM_WAKE, From 6e5b93ee55d401f1619092fb675b57c28c9ed7ec Mon Sep 17 00:00:00 2001 From: Mike Lockwood Date: Mon, 8 Jul 2013 16:00:46 -0700 Subject: [PATCH 064/118] fatfs: add FAT_IOCTL_GET_VOLUME_ID This patch, originally from Android kernel, adds vfat ioctl command FAT_IOCTL_GET_VOLUME_ID, with this command we can get the vfat volume ID using following code: ioctl(fd, FAT_IOCTL_GET_VOLUME_ID, &volume_ID) This patch is a modified version of the patch by Mike Lockwood, with changes from Dmitry Pervushin, who noticed the original patch makes some volume IDs ambiguous with error returns: for example, if volume id is 0xFFFFFDAD, that matches -ENOIOCTLCMD, we get "FFFFFFFF" from the user space. So add a parameter to ioctl to get the correct volume ID. Android uses vfat volume ID to identify different sd card, when a new sd card is inserted to device, android can scan the media on it and pop up new contents.
Signed-off-by: Bintian Wang Cc: dmitry pervushin Cc: Mike Lockwood Cc: Colin Cross Acked-by: OGAWA Hirofumi Cc: John Stultz Cc: Sean McNeil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/fat.h | 1 + fs/fat/file.c | 8 ++++++++ fs/fat/inode.c | 12 ++++++++++++ include/uapi/linux/msdos_fs.h | 10 ++++++++++ 4 files changed, 31 insertions(+) diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 21664fcf3616..4241e6f39e86 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -86,6 +86,7 @@ struct msdos_sb_info { const void *dir_ops; /* Opaque; default directory operations */ int dir_per_block; /* dir entries per block */ int dir_per_block_bits; /* log2(dir_per_block) */ + unsigned int vol_id; /*volume ID*/ int fatent_shift; struct fatent_operations *fatent_ops; diff --git a/fs/fat/file.c b/fs/fat/file.c index b0b632e50ddb..9b104f543056 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -114,6 +114,12 @@ out: return err; } +static int fat_ioctl_get_volume_id(struct inode *inode, u32 __user *user_attr) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + return put_user(sbi->vol_id, user_attr); +} + long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -124,6 +130,8 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return fat_ioctl_get_attributes(inode, user_attr); case FAT_IOCTL_SET_ATTRIBUTES: return fat_ioctl_set_attributes(filp, user_attr); + case FAT_IOCTL_GET_VOLUME_ID: + return fat_ioctl_get_volume_id(inode, user_attr); default: return -ENOTTY; /* Inappropriate ioctl for device */ } diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 5d4513cb1b3c..11b51bb55b42 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1415,6 +1415,18 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, brelse(fsinfo_bh); } + /* interpret volume ID as a little endian 32 bit integer */ + if (sbi->fat_bits == 32) + sbi->vol_id = 
(((u32)b->fat32.vol_id[0]) | + ((u32)b->fat32.vol_id[1] << 8) | + ((u32)b->fat32.vol_id[2] << 16) | + ((u32)b->fat32.vol_id[3] << 24)); + else /* fat 16 or 12 */ + sbi->vol_id = (((u32)b->fat16.vol_id[0]) | + ((u32)b->fat16.vol_id[1] << 8) | + ((u32)b->fat16.vol_id[2] << 16) | + ((u32)b->fat16.vol_id[3] << 24)); + sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry); sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1; diff --git a/include/uapi/linux/msdos_fs.h b/include/uapi/linux/msdos_fs.h index f055e58b3147..e284ff919d6e 100644 --- a/include/uapi/linux/msdos_fs.h +++ b/include/uapi/linux/msdos_fs.h @@ -104,6 +104,8 @@ struct __fat_dirent { /* has used 0x72 ('r') in collision, so skip a few */ #define FAT_IOCTL_GET_ATTRIBUTES _IOR('r', 0x10, __u32) #define FAT_IOCTL_SET_ATTRIBUTES _IOW('r', 0x11, __u32) +/*Android kernel has used 0x12, so we use 0x13*/ +#define FAT_IOCTL_GET_VOLUME_ID _IOR('r', 0x13, __u32) struct fat_boot_sector { __u8 ignored[3]; /* Boot strap short or near jump */ @@ -128,6 +130,10 @@ struct fat_boot_sector { __u8 drive_number; /* Physical drive number */ __u8 state; /* undocumented, but used for mount state. */ + __u8 signature; /* extended boot signature */ + __u8 vol_id[4]; /* volume ID */ + __u8 vol_label[11]; /* volume label */ + __u8 fs_type[8]; /* file system type */ /* other fiealds are not added here */ } fat16; @@ -147,6 +153,10 @@ struct fat_boot_sector { __u8 drive_number; /* Physical drive number */ __u8 state; /* undocumented, but used for mount state. 
*/ + __u8 signature; /* extended boot signature */ + __u8 vol_id[4]; /* volume ID */ + __u8 vol_label[11]; /* volume label */ + __u8 fs_type[8]; /* file system type */ /* other fiealds are not added here */ } fat32; }; From 02be46fba4b154b4a201a729b2d2b4ff6affd031 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:00:47 -0700 Subject: [PATCH 065/118] ptrace/x86: revert "hw_breakpoints: Fix racy access to ptrace breakpoints" This reverts commit 87dc669ba257 ("hw_breakpoints: Fix racy access to ptrace breakpoints"). The patch was fine but we can no longer race with SIGKILL after commit 9899d11f6544 ("ptrace: ensure arch_ptrace/ptrace_request can never race with SIGKILL"), the __TASK_TRACED tracee can't be woken up and ->ptrace_bps[] can't go away. The patch only removes ptrace_get_breakpoints/ptrace_put_breakpoints and does a couple of "while at it" cleanups, it doesn't remove other changes from the reverted commit. Signed-off-by: Oleg Nesterov Acked-by: Ingo Molnar Acked-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/ptrace.c | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 29a8120e6fe8..7a98b21945aa 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -641,9 +641,6 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) unsigned len, type; struct perf_event *bp; - if (ptrace_get_breakpoints(tsk) < 0) - return -ESRCH; - data &= ~DR_CONTROL_RESERVED; old_dr7 = ptrace_get_dr7(thread->ptrace_bps); restore: @@ -692,9 +689,7 @@ restore: goto restore; } - ptrace_put_breakpoints(tsk); - - return ((orig_ret < 0) ? orig_ret : rc); + return orig_ret < 0 ? 
orig_ret : rc; } /* @@ -706,18 +701,10 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) unsigned long val = 0; if (n < HBP_NUM) { - struct perf_event *bp; + struct perf_event *bp = thread->ptrace_bps[n]; - if (ptrace_get_breakpoints(tsk) < 0) - return -ESRCH; - - bp = thread->ptrace_bps[n]; - if (!bp) - val = 0; - else + if (bp) val = bp->hw.info.address; - - ptrace_put_breakpoints(tsk); } else if (n == 6) { val = thread->debugreg6; } else if (n == 7) { @@ -734,9 +721,6 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, struct perf_event_attr attr; int err = 0; - if (ptrace_get_breakpoints(tsk) < 0) - return -ESRCH; - if (!t->ptrace_bps[nr]) { ptrace_breakpoint_init(&attr); /* @@ -762,7 +746,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, */ if (IS_ERR(bp)) { err = PTR_ERR(bp); - goto put; + goto out; } t->ptrace_bps[nr] = bp; @@ -773,9 +757,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, attr.bp_addr = addr; err = modify_user_hw_breakpoint(bp, &attr); } - -put: - ptrace_put_breakpoints(tsk); +out: return err; } From 6961ed96f14463d7c6e38d8c2093f5d53bd70574 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:00:49 -0700 Subject: [PATCH 066/118] ptrace/powerpc: revert "hw_breakpoints: Fix racy access to ptrace breakpoints" This reverts commit 07fa7a0a8a58 ("hw_breakpoints: Fix racy access to ptrace breakpoints") and removes ptrace_get/put_breakpoints() added by other commits. The patch was fine but we can no longer race with SIGKILL after commit 9899d11f6544 ("ptrace: ensure arch_ptrace/ptrace_request can never race with SIGKILL"), the __TASK_TRACED tracee can't be woken up and ->ptrace_bps[] can't go away. 
Signed-off-by: Oleg Nesterov Acked-by: Michael Neuling Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/ptrace.c | 30 ++++-------------------------- 1 file changed, 4 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 64f7bd5b1b0f..9a0d24c390a3 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -975,16 +975,12 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, hw_brk.type = (data & HW_BRK_TYPE_DABR) | HW_BRK_TYPE_PRIV_ALL; hw_brk.len = 8; #ifdef CONFIG_HAVE_HW_BREAKPOINT - if (ptrace_get_breakpoints(task) < 0) - return -ESRCH; - bp = thread->ptrace_bps[0]; if ((!data) || !(hw_brk.type & HW_BRK_TYPE_RDWR)) { if (bp) { unregister_hw_breakpoint(bp); thread->ptrace_bps[0] = NULL; } - ptrace_put_breakpoints(task); return 0; } if (bp) { @@ -997,11 +993,9 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, ret = modify_user_hw_breakpoint(bp, &attr); if (ret) { - ptrace_put_breakpoints(task); return ret; } thread->ptrace_bps[0] = bp; - ptrace_put_breakpoints(task); thread->hw_brk = hw_brk; return 0; } @@ -1016,12 +1010,9 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, ptrace_triggered, NULL, task); if (IS_ERR(bp)) { thread->ptrace_bps[0] = NULL; - ptrace_put_breakpoints(task); return PTR_ERR(bp); } - ptrace_put_breakpoints(task); - #endif /* CONFIG_HAVE_HW_BREAKPOINT */ task->thread.hw_brk = hw_brk; #else /* CONFIG_PPC_ADV_DEBUG_REGS */ @@ -1440,26 +1431,19 @@ static long ppc_set_hwdebug(struct task_struct *child, if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE) brk.type |= HW_BRK_TYPE_WRITE; #ifdef CONFIG_HAVE_HW_BREAKPOINT - if (ptrace_get_breakpoints(child) < 0) - return -ESRCH; - /* * Check if the request is for 
'range' breakpoints. We can * support it if range < 8 bytes. */ - if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) { + if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) len = bp_info->addr2 - bp_info->addr; - } else if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT) + else if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT) len = 1; - else { - ptrace_put_breakpoints(child); + else return -EINVAL; - } bp = thread->ptrace_bps[0]; - if (bp) { - ptrace_put_breakpoints(child); + if (bp) return -ENOSPC; - } /* Create a new breakpoint request if one doesn't exist already */ hw_breakpoint_init(&attr); @@ -1471,11 +1455,9 @@ static long ppc_set_hwdebug(struct task_struct *child, ptrace_triggered, NULL, child); if (IS_ERR(bp)) { thread->ptrace_bps[0] = NULL; - ptrace_put_breakpoints(child); return PTR_ERR(bp); } - ptrace_put_breakpoints(child); return 1; #endif /* CONFIG_HAVE_HW_BREAKPOINT */ @@ -1519,16 +1501,12 @@ static long ppc_del_hwdebug(struct task_struct *child, long data) return -EINVAL; #ifdef CONFIG_HAVE_HW_BREAKPOINT - if (ptrace_get_breakpoints(child) < 0) - return -ESRCH; - bp = thread->ptrace_bps[0]; if (bp) { unregister_hw_breakpoint(bp); thread->ptrace_bps[0] = NULL; } else ret = -ENOENT; - ptrace_put_breakpoints(child); return ret; #else /* CONFIG_HAVE_HW_BREAKPOINT */ if (child->thread.hw_brk.address == 0) From 6af9df7f5ba35806a5919d3a36d95fd40e210b89 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:00:51 -0700 Subject: [PATCH 067/118] ptrace/arm: revert "hw_breakpoints: Fix racy access to ptrace breakpoints" This reverts commit bf0b8f4b55e5 ("hw_breakpoints: Fix racy access to ptrace breakpoints"). The patch was fine but we can no longer race with SIGKILL after commit 9899d11f6544 ("ptrace: ensure arch_ptrace/ptrace_request can never race with SIGKILL"), the __TASK_TRACED tracee can't be woken up and ->ptrace_bps[] can't go away. 
Signed-off-by: Oleg Nesterov Acked-by: Will Deacon Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/ptrace.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 2bc1514d6dbe..0dd3b79b15c3 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -886,20 +886,12 @@ long arch_ptrace(struct task_struct *child, long request, #ifdef CONFIG_HAVE_HW_BREAKPOINT case PTRACE_GETHBPREGS: - if (ptrace_get_breakpoints(child) < 0) - return -ESRCH; - ret = ptrace_gethbpregs(child, addr, (unsigned long __user *)data); - ptrace_put_breakpoints(child); break; case PTRACE_SETHBPREGS: - if (ptrace_get_breakpoints(child) < 0) - return -ESRCH; - ret = ptrace_sethbpregs(child, addr, (unsigned long __user *)data); - ptrace_put_breakpoints(child); break; #endif From e8c073c4ff51207f5c1c37fb054360bbc0f38251 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:00:52 -0700 Subject: [PATCH 068/118] ptrace/sh: revert "hw_breakpoints: Fix racy access to ptrace breakpoints" This reverts commit e0ac8457d020 ("hw_breakpoints: Fix racy access to ptrace breakpoints"). The patch was fine but we can no longer race with SIGKILL after commit 9899d11f6544 ("ptrace: ensure arch_ptrace/ptrace_request can never race with SIGKILL"), the __TASK_TRACED tracee can't be woken up and ->ptrace_bps[] can't go away. 
Signed-off-by: Oleg Nesterov Cc: Paul Mundt Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/kernel/ptrace_32.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index 81f999a672f6..668c81631c08 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c @@ -117,11 +117,7 @@ void user_enable_single_step(struct task_struct *child) set_tsk_thread_flag(child, TIF_SINGLESTEP); - if (ptrace_get_breakpoints(child) < 0) - return; - set_single_step(child, pc); - ptrace_put_breakpoints(child); } void user_disable_single_step(struct task_struct *child) From 7c8df28633bf0b7eb253f866029be0ac59ddb062 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:00:54 -0700 Subject: [PATCH 069/118] ptrace: revert "Prepare to fix racy accesses on task breakpoints" This reverts commit bf26c018490c ("Prepare to fix racy accesses on task breakpoints"). The patch was fine but we can no longer race with SIGKILL after commit 9899d11f6544 ("ptrace: ensure arch_ptrace/ptrace_request can never race with SIGKILL"), the __TASK_TRACED tracee can't be woken up and ->ptrace_bps[] can't go away. Now that ptrace_get_breakpoints/ptrace_put_breakpoints have no callers, we can kill them and remove task->ptrace_bp_refcnt. 
Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Acked-by: Michael Neuling Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 10 ---------- include/linux/sched.h | 3 --- kernel/exit.c | 2 +- kernel/ptrace.c | 16 ---------------- 4 files changed, 1 insertion(+), 30 deletions(-) diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 89573a33ab3c..07d0df6bf768 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -142,9 +142,6 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) { INIT_LIST_HEAD(&child->ptrace_entry); INIT_LIST_HEAD(&child->ptraced); -#ifdef CONFIG_HAVE_HW_BREAKPOINT - atomic_set(&child->ptrace_bp_refcnt, 1); -#endif child->jobctl = 0; child->ptrace = 0; child->parent = child->real_parent; @@ -351,11 +348,4 @@ extern int task_current_syscall(struct task_struct *target, long *callno, unsigned long args[6], unsigned int maxargs, unsigned long *sp, unsigned long *pc); -#ifdef CONFIG_HAVE_HW_BREAKPOINT -extern int ptrace_get_breakpoints(struct task_struct *tsk); -extern void ptrace_put_breakpoints(struct task_struct *tsk); -#else -static inline void ptrace_put_breakpoints(struct task_struct *tsk) { } -#endif /* CONFIG_HAVE_HW_BREAKPOINT */ - #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index cdd5407b37e2..75324d8157e3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1401,9 +1401,6 @@ struct task_struct { } memcg_batch; unsigned int memcg_kmem_skip_account; #endif -#ifdef CONFIG_HAVE_HW_BREAKPOINT - atomic_t ptrace_bp_refcnt; -#endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif diff --git a/kernel/exit.c b/kernel/exit.c index fafe75d9e6f6..a949819055d5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -808,7 +808,7 @@ void do_exit(long code) /* * FIXME: do that only when 
needed, using sched_exit tracepoint */ - ptrace_put_breakpoints(tsk); + flush_ptrace_hw_breakpoint(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ba5e6cea181a..a146ee327f6a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1221,19 +1221,3 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, return ret; } #endif /* CONFIG_COMPAT */ - -#ifdef CONFIG_HAVE_HW_BREAKPOINT -int ptrace_get_breakpoints(struct task_struct *tsk) -{ - if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt)) - return 0; - - return -1; -} - -void ptrace_put_breakpoints(struct task_struct *tsk) -{ - if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt)) - flush_ptrace_hw_breakpoint(tsk); -} -#endif /* CONFIG_HAVE_HW_BREAKPOINT */ From e6a7d6077106e5c72f0519ec113d986df67ee001 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:00:56 -0700 Subject: [PATCH 070/118] ptrace/x86: simplify the "disable" logic in ptrace_write_dr7() ptrace_write_dr7() looks unnecessarily overcomplicated. We can factor out ptrace_modify_breakpoint() and do not do "continue" twice, just we need to pass the proper "disabled" argument to ptrace_modify_breakpoint(). 
Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/ptrace.c | 38 ++++++++++++++------------------------ 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 7a98b21945aa..0649f166d7c6 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -637,9 +637,7 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) struct thread_struct *thread = &(tsk->thread); unsigned long old_dr7; int i, orig_ret = 0, rc = 0; - int enabled, second_pass = 0; - unsigned len, type; - struct perf_event *bp; + int second_pass = 0; data &= ~DR_CONTROL_RESERVED; old_dr7 = ptrace_get_dr7(thread->ptrace_bps); @@ -649,30 +647,22 @@ restore: * appropriate changes to each. */ for (i = 0; i < HBP_NUM; i++) { - enabled = decode_dr7(data, i, &len, &type); - bp = thread->ptrace_bps[i]; + unsigned len, type; + bool disabled = !decode_dr7(data, i, &len, &type); + struct perf_event *bp = thread->ptrace_bps[i]; - if (!enabled) { - if (bp) { - /* - * Don't unregister the breakpoints right-away, - * unless all register_user_hw_breakpoint() - * requests have succeeded. This prevents - * any window of opportunity for debug - * register grabbing by other users. - */ - if (!second_pass) - continue; - - rc = ptrace_modify_breakpoint(bp, len, type, - tsk, 1); - if (rc) - break; - } - continue; + if (disabled) { + /* + * Don't unregister the breakpoints right-away, unless + * all register_user_hw_breakpoint() requests have + * succeeded. This prevents any window of opportunity + * for debug register grabbing by other users. 
+ */ + if (!bp || !second_pass) + continue; } - rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0); + rc = ptrace_modify_breakpoint(bp, len, type, tsk, disabled); if (rc) break; } From 29a55513414187b50d3cebb99884955a78d97283 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:00:58 -0700 Subject: [PATCH 071/118] ptrace/x86: dont delay "disable" till second pass in ptrace_write_dr7() ptrace_write_dr7() skips ptrace_modify_breakpoint(disabled => true) unless second_pass, this buys nothing but complicates the code and means that we always do the main loop twice even if "disabled" was never true. The comment says: Don't unregister the breakpoints right-away, unless all register_user_hw_breakpoint() requests have succeeded. Firstly, we do not do register_user_hw_breakpoint(), it was removed by commit 24f1e32c60c4 ("hw-breakpoints: Rewrite the hw-breakpoints layer on top of perf events"). We are going to restore register_user_hw_breakpoint() (see the next patch) but this doesn't matter: after commit 44234adcdce3 ("hw-breakpoints: Modify breakpoints without unregistering them") perf_event_disable() can not hurt, hw_breakpoint_del() does not free the slot. Remove the "second_pass" check from the main loop and simplify the code. Since we have to check "bp != NULL" anyway, the patch also removes the same check in ptrace_modify_breakpoint() and moves the comment into ptrace_write_dr7(). With this patch the second pass is only needed to restore the saved old_dr7. This should never fail, so the patch adds WARN_ON() to catch the potential problems as Frederic suggested. 
Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/ptrace.c | 55 +++++++++++++++------------------------- 1 file changed, 21 insertions(+), 34 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 0649f166d7c6..98b0a2ccc33c 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -609,14 +609,6 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, int gen_len, gen_type; struct perf_event_attr attr; - /* - * We should have at least an inactive breakpoint at this - * slot. It means the user is writing dr7 without having - * written the address register first - */ - if (!bp) - return -EINVAL; - err = arch_bp_generic_fields(len, type, &gen_len, &gen_type); if (err) return err; @@ -634,52 +626,47 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, */ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) { - struct thread_struct *thread = &(tsk->thread); + struct thread_struct *thread = &tsk->thread; unsigned long old_dr7; - int i, orig_ret = 0, rc = 0; - int second_pass = 0; + bool second_pass = false; + int i, rc, ret = 0; data &= ~DR_CONTROL_RESERVED; old_dr7 = ptrace_get_dr7(thread->ptrace_bps); + restore: - /* - * Loop through all the hardware breakpoints, making the - * appropriate changes to each. - */ + rc = 0; for (i = 0; i < HBP_NUM; i++) { unsigned len, type; bool disabled = !decode_dr7(data, i, &len, &type); struct perf_event *bp = thread->ptrace_bps[i]; - if (disabled) { - /* - * Don't unregister the breakpoints right-away, unless - * all register_user_hw_breakpoint() requests have - * succeeded. This prevents any window of opportunity - * for debug register grabbing by other users. 
- */ - if (!bp || !second_pass) + if (!bp) { + if (disabled) continue; + /* + * We should have at least an inactive breakpoint at + * this slot. It means the user is writing dr7 without + * having written the address register first. + */ + rc = -EINVAL; + break; } rc = ptrace_modify_breakpoint(bp, len, type, tsk, disabled); if (rc) break; } - /* - * Make a second pass to free the remaining unused breakpoints - * or to restore the original breakpoints if an error occurred. - */ - if (!second_pass) { - second_pass = 1; - if (rc < 0) { - orig_ret = rc; - data = old_dr7; - } + + /* Restore if the first pass failed, second_pass shouldn't fail. */ + if (rc && !WARN_ON(second_pass)) { + ret = rc; + data = old_dr7; + second_pass = true; goto restore; } - return orig_ret < 0 ? orig_ret : rc; + return ret; } /* From 9afe33ada275f2413dfeae27cc58fbb27474ac72 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:00:59 -0700 Subject: [PATCH 072/118] ptrace/x86: introduce ptrace_register_breakpoint() No functional changes, preparation. Extract the "register breakpoint" code from ptrace_get_debugreg() into the new/generic helper, ptrace_register_breakpoint(). It will have more users. The patch also adds another simple helper, ptrace_fill_bp_fields(), to factor out the arch_bp_generic_fields() logic in register/modify. 
Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/ptrace.c | 88 +++++++++++++++++++++++----------------- 1 file changed, 51 insertions(+), 37 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 98b0a2ccc33c..052636801b41 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -601,23 +601,49 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[]) return dr7; } -static int -ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, - struct task_struct *tsk, int disabled) +static int ptrace_fill_bp_fields(struct perf_event_attr *attr, + int len, int type, bool disabled) { - int err; - int gen_len, gen_type; - struct perf_event_attr attr; + int err, bp_len, bp_type; - err = arch_bp_generic_fields(len, type, &gen_len, &gen_type); + err = arch_bp_generic_fields(len, type, &bp_len, &bp_type); + if (!err) { + attr->bp_len = bp_len; + attr->bp_type = bp_type; + attr->disabled = disabled; + } + + return err; +} + +static struct perf_event * +ptrace_register_breakpoint(struct task_struct *tsk, int len, int type, + unsigned long addr, bool disabled) +{ + struct perf_event_attr attr; + int err; + + ptrace_breakpoint_init(&attr); + attr.bp_addr = addr; + + err = ptrace_fill_bp_fields(&attr, len, type, disabled); + if (err) + return ERR_PTR(err); + + return register_user_hw_breakpoint(&attr, ptrace_triggered, + NULL, tsk); +} + +static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, + int disabled) +{ + struct perf_event_attr attr = bp->attr; + int err; + + err = ptrace_fill_bp_fields(&attr, len, type, disabled); if (err) return err; - attr = bp->attr; - attr.bp_len = gen_len; - attr.bp_type = gen_type; - attr.disabled = disabled; - return 
modify_user_hw_breakpoint(bp, &attr); } @@ -653,7 +679,7 @@ restore: break; } - rc = ptrace_modify_breakpoint(bp, len, type, tsk, disabled); + rc = ptrace_modify_breakpoint(bp, len, type, disabled); if (rc) break; } @@ -693,26 +719,14 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, unsigned long addr) { - struct perf_event *bp; struct thread_struct *t = &tsk->thread; - struct perf_event_attr attr; + struct perf_event *bp = t->ptrace_bps[nr]; int err = 0; - if (!t->ptrace_bps[nr]) { - ptrace_breakpoint_init(&attr); - /* - * Put stub len and type to register (reserve) an inactive but - * correct bp - */ - attr.bp_addr = addr; - attr.bp_len = HW_BREAKPOINT_LEN_1; - attr.bp_type = HW_BREAKPOINT_W; - attr.disabled = 1; - - bp = register_user_hw_breakpoint(&attr, ptrace_triggered, - NULL, tsk); - + if (!bp) { /* + * Put stub len and type to create an inactive but correct bp. + * * CHECKME: the previous code returned -EIO if the addr wasn't * a valid task virtual addr. The new one will return -EINVAL in * this case. @@ -721,20 +735,20 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, * writing for the user. And anyway this is the previous * behaviour. 
*/ - if (IS_ERR(bp)) { + bp = ptrace_register_breakpoint(tsk, + X86_BREAKPOINT_LEN_1, X86_BREAKPOINT_WRITE, + addr, true); + if (IS_ERR(bp)) err = PTR_ERR(bp); - goto out; - } - - t->ptrace_bps[nr] = bp; + else + t->ptrace_bps[nr] = bp; } else { - bp = t->ptrace_bps[nr]; + struct perf_event_attr attr = bp->attr; - attr = bp->attr; attr.bp_addr = addr; err = modify_user_hw_breakpoint(bp, &attr); } -out: + return err; } From b87a95ad609619482df0690320d5ace33ace8e7a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:01:01 -0700 Subject: [PATCH 073/118] ptrace/x86: ptrace_write_dr7() should create bp if !disabled Commit 24f1e32c60c4 ("hw-breakpoints: Rewrite the hw-breakpoints layer on top of perf events") introduced the minor regression. Before this commit PTRACE_POKEUSER DR7, enableDR0 PTRACE_POKEUSER DR0, address was perfectly valid, now PTRACE_POKEUSER(DR7) fails if DR0 was not previously initialized by PTRACE_POKEUSER(DR0). Change ptrace_write_dr7() to do ptrace_register_breakpoint(addr => 0) if !bp && !disabled. This fixes watchpoint-zeroaddr from ptrace-tests, see https://bugzilla.redhat.com/show_bug.cgi?id=660204. Signed-off-by: Oleg Nesterov Reported-by: Jan Kratochvil Acked-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/ptrace.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 052636801b41..5c387b3dce3f 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -670,13 +670,16 @@ restore: if (!bp) { if (disabled) continue; - /* - * We should have at least an inactive breakpoint at - * this slot. It means the user is writing dr7 without - * having written the address register first. 
- */ - rc = -EINVAL; - break; + + bp = ptrace_register_breakpoint(tsk, + len, type, 0, disabled); + if (IS_ERR(bp)) { + rc = PTR_ERR(bp); + break; + } + + thread->ptrace_bps[i] = bp; + continue; } rc = ptrace_modify_breakpoint(bp, len, type, disabled); From 61e305c716c0737c97bd133313cc90e99a93712e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:01:03 -0700 Subject: [PATCH 074/118] ptrace/x86: cleanup ptrace_set_debugreg() ptrace_set_debugreg() is trivial but looks horrible. Kill the unnecessary goto's and return's to cleanup the code. This matches ptrace_get_debugreg() which also needs the trivial whitespace cleanups. Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/ptrace.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 5c387b3dce3f..7461f50d5bb1 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -703,7 +703,7 @@ restore: */ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) { - struct thread_struct *thread = &(tsk->thread); + struct thread_struct *thread = &tsk->thread; unsigned long val = 0; if (n < HBP_NUM) { @@ -713,7 +713,7 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) val = bp->hw.info.address; } else if (n == 6) { val = thread->debugreg6; - } else if (n == 7) { + } else if (n == 7) { val = thread->ptrace_dr7; } return val; @@ -761,30 +761,20 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, static int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) { - struct thread_struct *thread = &(tsk->thread); - int rc = 0; - + struct thread_struct *thread = &tsk->thread; /* There 
are no DR4 or DR5 registers */ - if (n == 4 || n == 5) - return -EIO; + int rc = -EIO; - if (n == 6) { - thread->debugreg6 = val; - goto ret_path; - } if (n < HBP_NUM) { rc = ptrace_set_breakpoint_addr(tsk, n, val); - if (rc) - return rc; - } - /* All that's left is DR7 */ - if (n == 7) { + } else if (n == 6) { + thread->debugreg6 = val; + rc = 0; + } else if (n == 7) { rc = ptrace_write_dr7(tsk, val); if (!rc) thread->ptrace_dr7 = val; } - -ret_path: return rc; } From fab840fc2d542fabcab903db8e03589a6702ba5f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:01:05 -0700 Subject: [PATCH 075/118] ptrace: PTRACE_DETACH should do flush_ptrace_hw_breakpoint(child) Change ptrace_detach() to call flush_ptrace_hw_breakpoint(child). This frees the slots for non-ptrace PERF_TYPE_BREAKPOINT users, and this ensures that the tracee won't be killed by SIGTRAP triggered by the active breakpoints. Test-case: unsigned long encode_dr7(int drnum, int enable, unsigned int type, unsigned int len) { unsigned long dr7; dr7 = ((len | type) & 0xf) << (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); if (enable) dr7 |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)); return dr7; } int write_dr(int pid, int dr, unsigned long val) { return ptrace(PTRACE_POKEUSER, pid, offsetof (struct user, u_debugreg[dr]), val); } void func(void) { } int main(void) { int pid, stat; unsigned long dr7; pid = fork(); if (!pid) { assert(ptrace(PTRACE_TRACEME, 0,0,0) == 0); kill(getpid(), SIGHUP); func(); return 0x13; } assert(pid == waitpid(-1, &stat, 0)); assert(WSTOPSIG(stat) == SIGHUP); assert(write_dr(pid, 0, (long)func) == 0); dr7 = encode_dr7(0, 1, DR_RW_EXECUTE, DR_LEN_1); assert(write_dr(pid, 7, dr7) == 0); assert(ptrace(PTRACE_DETACH, pid, 0,0) == 0); assert(pid == waitpid(-1, &stat, 0)); assert(stat == 0x1300); return 0; } Before this patch the child is killed after PTRACE_DETACH. 
Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a146ee327f6a..4041f5747e73 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -469,6 +469,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) /* Architecture-specific hardware disable .. */ ptrace_disable(child); clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + flush_ptrace_hw_breakpoint(child); write_lock_irq(&tasklist_lock); /* From f7da04c9e363e479258135ac825734d78aecd2b0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:01:06 -0700 Subject: [PATCH 076/118] ptrace/x86: flush_ptrace_hw_breakpoint() should clear the virtual debug registers flush_ptrace_hw_breakpoint() destroys the counters set by ptrace, but "leaks" ->debugreg6 and ->ptrace_dr7. The problem is minor, but still it doesn't look right and flush_thread() did this until commit 66cb59172959 ("hw-breakpoints: use the new wrapper routines to access debug registers in process/thread code"). Now that PTRACE_DETACH does flush_ too this makes even more sense.
Signed-off-by: Oleg Nesterov Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/hw_breakpoint.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 02f07634d265..f66ff162dce8 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -393,6 +393,9 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) unregister_hw_breakpoint(t->ptrace_bps[i]); t->ptrace_bps[i] = NULL; } + + t->debugreg6 = 0; + t->ptrace_dr7 = 0; } void hw_breakpoint_restore(void) From c103a4dc4a32f53f095b66cd798d648c652f05b4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 8 Jul 2013 16:01:08 -0700 Subject: [PATCH 077/118] ipc/shm.c: eliminate ugly 80-col tricks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 4 ++-- mm/mmap.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index 7e199fa1960f..85dc001634b1 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -491,10 +491,10 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) sprintf (name, "SYSV%08x", key); if (shmflg & SHM_HUGETLB) { - struct hstate *hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) - & SHM_HUGE_MASK); + struct hstate *hs; size_t hugesize; + hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); if (!hs) { error = -EINVAL; goto no_file; diff --git a/mm/mmap.c b/mm/mmap.c index 0718c175db8f..f81311173b4d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1368,9 +1368,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, goto out_fput; } else if (flags & MAP_HUGETLB) { struct user_struct *user = NULL; - struct hstate *hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & - SHM_HUGE_MASK); + struct hstate *hs; + hs =
hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK); if (!hs) return -EINVAL; From dbfcd91f06f0e2d5564b2fd184e9c2a43675f9ab Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Jul 2013 16:01:09 -0700 Subject: [PATCH 078/118] ipc: move rcu lock out of ipc_addid This patchset continues the work that began in the sysv ipc semaphore scaling series, see https://lkml.org/lkml/2013/3/20/546 Just like semaphores used to be, sysv shared memory and msg queues also abuse the ipc lock, unnecessarily holding it for operations such as permission and security checks. This patchset mostly deals with mqueues, and while shared mem can be done in a very similar way, I want to get these patches out in the open first. It also does some pending cleanups, mostly focused on the two level locking we have in ipc code, taking care of ipc_addid() and ipcctl_pre_down_nolock() - yes there are still functions that need to be updated as well. This patch: Make all callers explicitly take and release the RCU read lock. This addresses the two level locking seen in newary(), newseg() and newqueue(). For the last two, explicitly unlock the ipc object and the rcu lock, instead of calling the custom shm_unlock and msg_unlock functions. The next patch will deal with the open coded locking for ->perm.lock Signed-off-by: Davidlohr Bueso Cc: Andi Kleen Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 7 +++---- ipc/shm.c | 5 ++++- ipc/util.c | 3 +-- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index d0c6d967b390..996feb819248 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -199,9 +199,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) return retval; } - /* - * ipc_addid() locks msq - */ + /* ipc_addid() locks msq upon success. 
*/ id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni); if (id < 0) { security_msg_queue_free(msq); @@ -218,7 +216,8 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) INIT_LIST_HEAD(&msq->q_receivers); INIT_LIST_HEAD(&msq->q_senders); - msg_unlock(msq); + spin_unlock(&msq->q_perm.lock); + rcu_read_unlock(); return msq->q_perm.id; } diff --git a/ipc/shm.c b/ipc/shm.c index 85dc001634b1..bd2b14ef1bba 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -535,6 +535,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) shp->shm_nattch = 0; shp->shm_file = file; shp->shm_creator = current; + /* * shmid gets reported as "inode#" in /proc/pid/maps. * proc-ps tools use this. Changing this will break them. @@ -543,7 +544,9 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) ns->shm_tot += numpages; error = shp->shm_perm.id; - shm_unlock(shp); + + spin_unlock(&shp->shm_perm.lock); + rcu_read_unlock(); return error; no_id: diff --git a/ipc/util.c b/ipc/util.c index 809ec5ec8122..399821ac0a9a 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -246,9 +246,8 @@ int ipc_get_maxid(struct ipc_ids *ids) * is returned. The 'new' entry is returned in a locked state on success. * On failure the entry is not locked and a negative err-code is returned. * - * Called with ipc_ids.rw_mutex held as a writer. + * Called with writer ipc_ids.rw_mutex held. */ - int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) { kuid_t euid; From 1ca7003ab41152d673d9e359632283d05294f3d6 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Jul 2013 16:01:10 -0700 Subject: [PATCH 079/118] ipc: introduce ipc object locking helpers Simple helpers around the (kern_ipc_perm *)->lock spinlock. 
Signed-off-by: Davidlohr Bueso Cc: Andi Kleen Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/util.h | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/ipc/util.h b/ipc/util.h index 2b0bdd5d92ce..da65e8afb8f4 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -159,6 +159,21 @@ static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int uid) return uid / SEQ_MULTIPLIER != ipcp->seq; } +static inline void ipc_lock_object(struct kern_ipc_perm *perm) +{ + spin_lock(&perm->lock); +} + +static inline void ipc_unlock_object(struct kern_ipc_perm *perm) +{ + spin_unlock(&perm->lock); +} + +static inline void ipc_assert_locked_object(struct kern_ipc_perm *perm) +{ + assert_spin_locked(&perm->lock); +} + static inline void ipc_lock_by_ptr(struct kern_ipc_perm *perm) { rcu_read_lock(); @@ -171,11 +186,6 @@ static inline void ipc_unlock(struct kern_ipc_perm *perm) rcu_read_unlock(); } -static inline void ipc_lock_object(struct kern_ipc_perm *perm) -{ - spin_lock(&perm->lock); -} - struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id); struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id); int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids, From cf9d5d78d05bca96df7618dfc3a5ee4414dcae58 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Jul 2013 16:01:11 -0700 Subject: [PATCH 080/118] ipc: close open coded spin lock calls Signed-off-by: Davidlohr Bueso Cc: Andi Kleen Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 2 +- ipc/sem.c | 14 +++++++------- ipc/shm.c | 4 ++-- ipc/util.h | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index 996feb819248..7a3d6aab369d 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -216,7 +216,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) INIT_LIST_HEAD(&msq->q_receivers); INIT_LIST_HEAD(&msq->q_senders); - 
spin_unlock(&msq->q_perm.lock); + ipc_unlock_object(&msq->q_perm); rcu_read_unlock(); return msq->q_perm.id; diff --git a/ipc/sem.c b/ipc/sem.c index 70480a3aa698..92ec6c69bab5 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -246,7 +246,7 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, * their critical section while the array lock is held. */ lock_array: - spin_lock(&sma->sem_perm.lock); + ipc_lock_object(&sma->sem_perm); for (i = 0; i < sma->sem_nsems; i++) { struct sem *sem = sma->sem_base + i; spin_unlock_wait(&sem->lock); @@ -259,7 +259,7 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, static inline void sem_unlock(struct sem_array *sma, int locknum) { if (locknum == -1) { - spin_unlock(&sma->sem_perm.lock); + ipc_unlock_object(&sma->sem_perm); } else { struct sem *sem = sma->sem_base + locknum; spin_unlock(&sem->lock); @@ -872,7 +872,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) int i; /* Free the existing undo structures for this semaphore set. 
*/ - assert_spin_locked(&sma->sem_perm.lock); + ipc_assert_locked_object(&sma->sem_perm); list_for_each_entry_safe(un, tu, &sma->list_id, list_id) { list_del(&un->list_id); spin_lock(&un->ulp->lock); @@ -1070,7 +1070,7 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, curr = &sma->sem_base[semnum]; - assert_spin_locked(&sma->sem_perm.lock); + ipc_assert_locked_object(&sma->sem_perm); list_for_each_entry(un, &sma->list_id, list_id) un->semadj[semnum] = 0; @@ -1199,7 +1199,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, for (i = 0; i < nsems; i++) sma->sem_base[i].semval = sem_io[i]; - assert_spin_locked(&sma->sem_perm.lock); + ipc_assert_locked_object(&sma->sem_perm); list_for_each_entry(un, &sma->list_id, list_id) { for (i = 0; i < nsems; i++) un->semadj[i] = 0; @@ -1496,7 +1496,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) new->semid = semid; assert_spin_locked(&ulp->lock); list_add_rcu(&new->list_proc, &ulp->list_proc); - assert_spin_locked(&sma->sem_perm.lock); + ipc_assert_locked_object(&sma->sem_perm); list_add(&new->list_id, &sma->list_id); un = new; @@ -1833,7 +1833,7 @@ void exit_sem(struct task_struct *tsk) } /* remove un from the linked lists */ - assert_spin_locked(&sma->sem_perm.lock); + ipc_assert_locked_object(&sma->sem_perm); list_del(&un->list_id); spin_lock(&ulp->lock); diff --git a/ipc/shm.c b/ipc/shm.c index bd2b14ef1bba..e7d51072d1c7 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -141,7 +141,7 @@ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp) { rcu_read_lock(); - spin_lock(&ipcp->shm_perm.lock); + ipc_lock_object(&ipcp->shm_perm); } static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns, @@ -545,7 +545,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) ns->shm_tot += numpages; error = shp->shm_perm.id; - 
spin_unlock(&shp->shm_perm.lock); + ipc_unlock_object(&shp->shm_perm); rcu_read_unlock(); return error; diff --git a/ipc/util.h b/ipc/util.h index da65e8afb8f4..b6a6a88f3002 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -177,12 +177,12 @@ static inline void ipc_assert_locked_object(struct kern_ipc_perm *perm) static inline void ipc_lock_by_ptr(struct kern_ipc_perm *perm) { rcu_read_lock(); - spin_lock(&perm->lock); + ipc_lock_object(perm); } static inline void ipc_unlock(struct kern_ipc_perm *perm) { - spin_unlock(&perm->lock); + ipc_unlock_object(perm); rcu_read_unlock(); } From 7b4cc5d8411bd4e9d61d8714f53859740cf830c2 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Jul 2013 16:01:12 -0700 Subject: [PATCH 081/118] ipc: move locking out of ipcctl_pre_down_nolock This function currently acquires both the rw_mutex and the rcu lock on successful lookups, leaving the callers to explicitly unlock them, creating another two level locking situation. Make the callers (including those that still use ipcctl_pre_down()) explicitly lock and unlock the rwsem and rcu lock. 
Signed-off-by: Davidlohr Bueso Cc: Andi Kleen Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 24 +++++++++++++++++------- ipc/sem.c | 27 ++++++++++++++++----------- ipc/shm.c | 23 +++++++++++++++++------ ipc/util.c | 21 ++++++--------------- 4 files changed, 56 insertions(+), 39 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index 7a3d6aab369d..f62fa5eed847 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -407,31 +407,38 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, return -EFAULT; } + down_write(&msg_ids(ns).rw_mutex); + rcu_read_lock(); + ipcp = ipcctl_pre_down(ns, &msg_ids(ns), msqid, cmd, &msqid64.msg_perm, msqid64.msg_qbytes); - if (IS_ERR(ipcp)) - return PTR_ERR(ipcp); + if (IS_ERR(ipcp)) { + err = PTR_ERR(ipcp); + /* the ipc lock is not held upon failure */ + goto out_unlock1; + } msq = container_of(ipcp, struct msg_queue, q_perm); err = security_msg_queue_msgctl(msq, cmd); if (err) - goto out_unlock; + goto out_unlock0; switch (cmd) { case IPC_RMID: + /* freeque unlocks the ipc object and rcu */ freeque(ns, ipcp); goto out_up; case IPC_SET: if (msqid64.msg_qbytes > ns->msg_ctlmnb && !capable(CAP_SYS_RESOURCE)) { err = -EPERM; - goto out_unlock; + goto out_unlock0; } err = ipc_update_perm(&msqid64.msg_perm, ipcp); if (err) - goto out_unlock; + goto out_unlock0; msq->q_qbytes = msqid64.msg_qbytes; @@ -448,8 +455,11 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, default: err = -EINVAL; } -out_unlock: - msg_unlock(msq); + +out_unlock0: + ipc_unlock_object(&msq->q_perm); +out_unlock1: + rcu_read_unlock(); out_up: up_write(&msg_ids(ns).rw_mutex); return err; diff --git a/ipc/sem.c b/ipc/sem.c index 92ec6c69bab5..b4b892b5c5f8 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1289,39 +1289,44 @@ static int semctl_down(struct ipc_namespace *ns, int semid, return -EFAULT; } + down_write(&sem_ids(ns).rw_mutex); + rcu_read_lock(); + ipcp = ipcctl_pre_down_nolock(ns, &sem_ids(ns), semid, 
cmd, &semid64.sem_perm, 0); - if (IS_ERR(ipcp)) - return PTR_ERR(ipcp); + if (IS_ERR(ipcp)) { + err = PTR_ERR(ipcp); + /* the ipc lock is not held upon failure */ + goto out_unlock1; + } sma = container_of(ipcp, struct sem_array, sem_perm); err = security_sem_semctl(sma, cmd); - if (err) { - rcu_read_unlock(); - goto out_up; - } + if (err) + goto out_unlock1; - switch(cmd){ + switch (cmd) { case IPC_RMID: sem_lock(sma, NULL, -1); + /* freeary unlocks the ipc object and rcu */ freeary(ns, ipcp); goto out_up; case IPC_SET: sem_lock(sma, NULL, -1); err = ipc_update_perm(&semid64.sem_perm, ipcp); if (err) - goto out_unlock; + goto out_unlock0; sma->sem_ctime = get_seconds(); break; default: - rcu_read_unlock(); err = -EINVAL; - goto out_up; + goto out_unlock1; } -out_unlock: +out_unlock0: sem_unlock(sma, -1); +out_unlock1: rcu_read_unlock(); out_up: up_write(&sem_ids(ns).rw_mutex); diff --git a/ipc/shm.c b/ipc/shm.c index e7d51072d1c7..c6b4ad5ce3b7 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -757,31 +757,42 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, return -EFAULT; } + down_write(&shm_ids(ns).rw_mutex); + rcu_read_lock(); + ipcp = ipcctl_pre_down(ns, &shm_ids(ns), shmid, cmd, &shmid64.shm_perm, 0); - if (IS_ERR(ipcp)) - return PTR_ERR(ipcp); + if (IS_ERR(ipcp)) { + err = PTR_ERR(ipcp); + /* the ipc lock is not held upon failure */ + goto out_unlock1; + } shp = container_of(ipcp, struct shmid_kernel, shm_perm); err = security_shm_shmctl(shp, cmd); if (err) - goto out_unlock; + goto out_unlock0; + switch (cmd) { case IPC_RMID: + /* do_shm_rmid unlocks the ipc object and rcu */ do_shm_rmid(ns, ipcp); goto out_up; case IPC_SET: err = ipc_update_perm(&shmid64.shm_perm, ipcp); if (err) - goto out_unlock; + goto out_unlock0; shp->shm_ctim = get_seconds(); break; default: err = -EINVAL; } -out_unlock: - shm_unlock(shp); + +out_unlock0: + ipc_unlock_object(&shp->shm_perm); +out_unlock1: + rcu_read_unlock(); out_up: up_write(&shm_ids(ns).rw_mutex); 
return err; diff --git a/ipc/util.c b/ipc/util.c index 399821ac0a9a..a0c139f3d1f3 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -746,8 +746,10 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) * It must be called without any lock held and * - retrieves the ipc with the given id in the given table. * - performs some audit and permission check, depending on the given cmd - * - returns the ipc with both ipc and rw_mutex locks held in case of success + * - returns the ipc with the ipc lock held in case of success * or an err-code without any lock held otherwise. + * + * Call holding both the rw_mutex and the rcu read lock. */ struct kern_ipc_perm *ipcctl_pre_down(struct ipc_namespace *ns, struct ipc_ids *ids, int id, int cmd, @@ -772,13 +774,10 @@ struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, int err = -EPERM; struct kern_ipc_perm *ipcp; - down_write(&ids->rw_mutex); - rcu_read_lock(); - ipcp = ipc_obtain_object_check(ids, id); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); - goto out_up; + goto err; } audit_ipc_obj(ipcp); @@ -789,16 +788,8 @@ struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, euid = current_euid(); if (uid_eq(euid, ipcp->cuid) || uid_eq(euid, ipcp->uid) || ns_capable(ns->user_ns, CAP_SYS_ADMIN)) - return ipcp; - -out_up: - /* - * Unsuccessful lookup, unlock and return - * the corresponding error. - */ - rcu_read_unlock(); - up_write(&ids->rw_mutex); - + return ipcp; /* successful lookup */ +err: return ERR_PTR(err); } From 15724ecb7e9bab35fc694c666ad563adba820cc3 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Jul 2013 16:01:13 -0700 Subject: [PATCH 082/118] ipc,msg: shorten critical region in msgctl_down Instead of holding the ipc lock for the entire function, use the ipcctl_pre_down_nolock and only acquire the lock for specific commands: RMID and SET.
Signed-off-by: Davidlohr Bueso Cc: Andi Kleen Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index f62fa5eed847..de422ff71c87 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -410,11 +410,10 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, down_write(&msg_ids(ns).rw_mutex); rcu_read_lock(); - ipcp = ipcctl_pre_down(ns, &msg_ids(ns), msqid, cmd, - &msqid64.msg_perm, msqid64.msg_qbytes); + ipcp = ipcctl_pre_down_nolock(ns, &msg_ids(ns), msqid, cmd, + &msqid64.msg_perm, msqid64.msg_qbytes); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); - /* the ipc lock is not held upon failure */ goto out_unlock1; } @@ -422,10 +421,11 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, err = security_msg_queue_msgctl(msq, cmd); if (err) - goto out_unlock0; + goto out_unlock1; switch (cmd) { case IPC_RMID: + ipc_lock_object(&msq->q_perm); /* freeque unlocks the ipc object and rcu */ freeque(ns, ipcp); goto out_up; @@ -433,9 +433,10 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, if (msqid64.msg_qbytes > ns->msg_ctlmnb && !capable(CAP_SYS_RESOURCE)) { err = -EPERM; - goto out_unlock0; + goto out_unlock1; } + ipc_lock_object(&msq->q_perm); err = ipc_update_perm(&msqid64.msg_perm, ipcp); if (err) goto out_unlock0; @@ -454,6 +455,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, break; default: err = -EINVAL; + goto out_unlock1; } out_unlock0: From 2cafed30f150f7314f98717b372df8173516cae0 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Jul 2013 16:01:14 -0700 Subject: [PATCH 083/118] ipc,msg: introduce msgctl_nolock Similar to semctl, when calling msgctl, the *_INFO and *_STAT commands can be performed without acquiring the ipc object. Add a msgctl_nolock() function and move the logic of *_INFO and *_STAT out of msgctl(). 
This change still takes the lock and it will be properly lockless in the next patch Signed-off-by: Davidlohr Bueso Cc: Andi Kleen Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 49 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index de422ff71c87..f45be81f6de9 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -467,17 +467,11 @@ out_up: return err; } -SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) +static int msgctl_nolock(struct ipc_namespace *ns, int msqid, + int cmd, int version, void __user *buf) { + int err; struct msg_queue *msq; - int err, version; - struct ipc_namespace *ns; - - if (msqid < 0 || cmd < 0) - return -EINVAL; - - version = ipc_parse_version(&cmd); - ns = current->nsproxy->ipc_ns; switch (cmd) { case IPC_INFO: @@ -488,6 +482,7 @@ SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) if (!buf) return -EFAULT; + /* * We must not return kernel stack data. * due to padding, it's not enough @@ -519,7 +514,8 @@ SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) return -EFAULT; return (max_id < 0) ? 
0 : max_id; } - case MSG_STAT: /* msqid is an index rather than a msg queue id */ + + case MSG_STAT: case IPC_STAT: { struct msqid64_ds tbuf; @@ -563,19 +559,42 @@ SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) return -EFAULT; return success_return; } - case IPC_SET: - case IPC_RMID: - err = msgctl_down(ns, msqid, cmd, buf, version); - return err; + default: - return -EINVAL; + return -EINVAL; } + return err; out_unlock: msg_unlock(msq); return err; } +SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) +{ + int version; + struct ipc_namespace *ns; + + if (msqid < 0 || cmd < 0) + return -EINVAL; + + version = ipc_parse_version(&cmd); + ns = current->nsproxy->ipc_ns; + + switch (cmd) { + case IPC_INFO: + case MSG_INFO: + case MSG_STAT: /* msqid is an index rather than a msg queue id */ + case IPC_STAT: + return msgctl_nolock(ns, msqid, cmd, version, buf); + case IPC_SET: + case IPC_RMID: + return msgctl_down(ns, msqid, cmd, buf, version); + default: + return -EINVAL; + } +} + static int testmsg(struct msg_msg *msg, long type, int mode) { switch(mode) From a5001a0d9768568de5d613c3b3a5b9c7721299da Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Jul 2013 16:01:15 -0700 Subject: [PATCH 084/118] ipc,msg: introduce lockless functions to obtain the ipc object Add msq_obtain_object() and msq_obtain_object_check(), which will allow us to get the ipc object without acquiring the lock. Just as with semaphores, these functions are basically wrappers around ipc_obtain_object*(). 
Signed-off-by: Davidlohr Bueso Cc: Andi Kleen Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/ipc/msg.c b/ipc/msg.c index f45be81f6de9..c53c13716064 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -166,6 +166,27 @@ static inline struct msg_queue *msg_lock_check(struct ipc_namespace *ns, return container_of(ipcp, struct msg_queue, q_perm); } +static inline struct msg_queue *msq_obtain_object(struct ipc_namespace *ns, int id) +{ + struct kern_ipc_perm *ipcp = ipc_obtain_object(&msg_ids(ns), id); + + if (IS_ERR(ipcp)) + return ERR_CAST(ipcp); + + return container_of(ipcp, struct msg_queue, q_perm); +} + +static inline struct msg_queue *msq_obtain_object_check(struct ipc_namespace *ns, + int id) +{ + struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&msg_ids(ns), id); + + if (IS_ERR(ipcp)) + return ERR_CAST(ipcp); + + return container_of(ipcp, struct msg_queue, q_perm); +} + static inline void msg_rmid(struct ipc_namespace *ns, struct msg_queue *s) { ipc_rmid(&msg_ids(ns), &s->q_perm); From ac0ba20ea6f2201a1589d6dc26ad1a4f0f967bb8 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Jul 2013 16:01:16 -0700 Subject: [PATCH 085/118] ipc,msg: make msgctl_nolock lockless While the INFO cmd doesn't take the ipc lock, the STAT commands do acquire it unnecessarily. We can do the permissions and security checks only holding the rcu lock. This function now mimics semctl_nolock(). 
Signed-off-by: Davidlohr Bueso Cc: Andi Kleen Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index c53c13716064..c218328b5980 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -545,17 +545,25 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid, if (!buf) return -EFAULT; + memset(&tbuf, 0, sizeof(tbuf)); + + rcu_read_lock(); if (cmd == MSG_STAT) { - msq = msg_lock(ns, msqid); - if (IS_ERR(msq)) - return PTR_ERR(msq); + msq = msq_obtain_object(ns, msqid); + if (IS_ERR(msq)) { + err = PTR_ERR(msq); + goto out_unlock; + } success_return = msq->q_perm.id; } else { - msq = msg_lock_check(ns, msqid); - if (IS_ERR(msq)) - return PTR_ERR(msq); + msq = msq_obtain_object_check(ns, msqid); + if (IS_ERR(msq)) { + err = PTR_ERR(msq); + goto out_unlock; + } success_return = 0; } + err = -EACCES; if (ipcperms(ns, &msq->q_perm, S_IRUGO)) goto out_unlock; @@ -564,8 +572,6 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid, if (err) goto out_unlock; - memset(&tbuf, 0, sizeof(tbuf)); - kernel_to_ipc64_perm(&msq->q_perm, &tbuf.msg_perm); tbuf.msg_stime = msq->q_stime; tbuf.msg_rtime = msq->q_rtime; @@ -575,7 +581,8 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid, tbuf.msg_qbytes = msq->q_qbytes; tbuf.msg_lspid = msq->q_lspid; tbuf.msg_lrpid = msq->q_lrpid; - msg_unlock(msq); + rcu_read_unlock(); + if (copy_msqid_to_user(buf, &tbuf, version)) return -EFAULT; return success_return; @@ -587,7 +594,7 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid, return err; out_unlock: - msg_unlock(msq); + rcu_read_unlock(); return err; } From 3dd1f784ed6603d7ab1043e51e6371235edf2313 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Jul 2013 16:01:17 -0700 Subject: [PATCH 086/118] ipc,msg: shorten critical region in msgsnd do_msgsnd() is another function that does too many things with 
the ipc object lock acquired. Take it only when needed when actually updating msq. Signed-off-by: Davidlohr Bueso Cc: Andi Kleen Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index c218328b5980..f2a1a8f30cd4 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -698,10 +698,11 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, msg->m_type = mtype; msg->m_ts = msgsz; - msq = msg_lock_check(ns, msqid); + rcu_read_lock(); + msq = msq_obtain_object_check(ns, msqid); if (IS_ERR(msq)) { err = PTR_ERR(msq); - goto out_free; + goto out_unlock1; } for (;;) { @@ -709,11 +710,11 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, err = -EACCES; if (ipcperms(ns, &msq->q_perm, S_IWUGO)) - goto out_unlock_free; + goto out_unlock1; err = security_msg_queue_msgsnd(msq, msg, msgflg); if (err) - goto out_unlock_free; + goto out_unlock1; if (msgsz + msq->q_cbytes <= msq->q_qbytes && 1 + msq->q_qnum <= msq->q_qbytes) { @@ -723,32 +724,41 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, /* queue full, wait: */ if (msgflg & IPC_NOWAIT) { err = -EAGAIN; - goto out_unlock_free; + goto out_unlock1; } + + ipc_lock_object(&msq->q_perm); ss_add(msq, &s); if (!ipc_rcu_getref(msq)) { err = -EIDRM; - goto out_unlock_free; + goto out_unlock0; } - msg_unlock(msq); + ipc_unlock_object(&msq->q_perm); + rcu_read_unlock(); schedule(); - ipc_lock_by_ptr(&msq->q_perm); + rcu_read_lock(); + ipc_lock_object(&msq->q_perm); + ipc_rcu_putref(msq); if (msq->q_perm.deleted) { err = -EIDRM; - goto out_unlock_free; + goto out_unlock0; } + ss_del(&s); if (signal_pending(current)) { err = -ERESTARTNOHAND; - goto out_unlock_free; + goto out_unlock0; } + + ipc_unlock_object(&msq->q_perm); } + ipc_lock_object(&msq->q_perm); msq->q_lspid = task_tgid_vnr(current); msq->q_stime = get_seconds(); @@ -764,9 +774,10 @@ long 
do_msgsnd(int msqid, long mtype, void __user *mtext, err = 0; msg = NULL; -out_unlock_free: - msg_unlock(msq); -out_free: +out_unlock0: + ipc_unlock_object(&msq->q_perm); +out_unlock1: + rcu_read_unlock(); if (msg != NULL) free_msg(msg); return err; From 41a0d523d0f626e9da0dc01de47f1b89058033cf Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Jul 2013 16:01:18 -0700 Subject: [PATCH 087/118] ipc,msg: shorten critical region in msgrcv do_msgrcv() is the last msg queue function that abuses the ipc lock Take it only when needed when actually updating msq. Signed-off-by: Davidlohr Bueso Cc: Andi Kleen Cc: Rik van Riel Tested-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 58 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index f2a1a8f30cd4..a3c0dc40a0cf 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -885,21 +885,19 @@ static struct msg_msg *find_msg(struct msg_queue *msq, long *msgtyp, int mode) return ERR_PTR(-EAGAIN); } - -long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, - int msgflg, +long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgflg, long (*msg_handler)(void __user *, struct msg_msg *, size_t)) { - struct msg_queue *msq; - struct msg_msg *msg; int mode; + struct msg_queue *msq; struct ipc_namespace *ns; - struct msg_msg *copy = NULL; + struct msg_msg *msg, *copy = NULL; ns = current->nsproxy->ipc_ns; if (msqid < 0 || (long) bufsz < 0) return -EINVAL; + if (msgflg & MSG_COPY) { copy = prepare_copy(buf, min_t(size_t, bufsz, ns->msg_ctlmax)); if (IS_ERR(copy)) @@ -907,8 +905,10 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, } mode = convert_mode(&msgtyp, msgflg); - msq = msg_lock_check(ns, msqid); + rcu_read_lock(); + msq = msq_obtain_object_check(ns, msqid); if (IS_ERR(msq)) { + rcu_read_unlock(); free_copy(copy); return PTR_ERR(msq); } @@ -918,10 
+918,10 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, msg = ERR_PTR(-EACCES); if (ipcperms(ns, &msq->q_perm, S_IRUGO)) - goto out_unlock; + goto out_unlock1; + ipc_lock_object(&msq->q_perm); msg = find_msg(msq, &msgtyp, mode); - if (!IS_ERR(msg)) { /* * Found a suitable message. @@ -929,7 +929,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, */ if ((bufsz < msg->m_ts) && !(msgflg & MSG_NOERROR)) { msg = ERR_PTR(-E2BIG); - goto out_unlock; + goto out_unlock0; } /* * If we are copying, then do not unlink message and do @@ -937,8 +937,9 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, */ if (msgflg & MSG_COPY) { msg = copy_msg(msg, copy); - goto out_unlock; + goto out_unlock0; } + list_del(&msg->m_list); msq->q_qnum--; msq->q_rtime = get_seconds(); @@ -947,14 +948,16 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, atomic_sub(msg->m_ts, &ns->msg_bytes); atomic_dec(&ns->msg_hdrs); ss_wakeup(&msq->q_senders, 0); - msg_unlock(msq); - break; + + goto out_unlock0; } + /* No message waiting. Wait for a message */ if (msgflg & IPC_NOWAIT) { msg = ERR_PTR(-ENOMSG); - goto out_unlock; + goto out_unlock0; } + list_add_tail(&msr_d.r_list, &msq->q_receivers); msr_d.r_tsk = current; msr_d.r_msgtype = msgtyp; @@ -965,8 +968,9 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, msr_d.r_maxsize = bufsz; msr_d.r_msg = ERR_PTR(-EAGAIN); current->state = TASK_INTERRUPTIBLE; - msg_unlock(msq); + ipc_unlock_object(&msq->q_perm); + rcu_read_unlock(); schedule(); /* Lockless receive, part 1: @@ -977,7 +981,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, * Prior to destruction, expunge_all(-EIRDM) changes r_msg. * Thus if r_msg is -EAGAIN, then the queue not yet destroyed. * rcu_read_lock() prevents preemption between reading r_msg - * and the spin_lock() inside ipc_lock_by_ptr(). + * and acquiring the q_perm.lock in ipc_lock_object(). 
*/ rcu_read_lock(); @@ -996,32 +1000,34 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, * If there is a message or an error then accept it without * locking. */ - if (msg != ERR_PTR(-EAGAIN)) { - rcu_read_unlock(); - break; - } + if (msg != ERR_PTR(-EAGAIN)) + goto out_unlock1; /* Lockless receive, part 3: * Acquire the queue spinlock. */ - ipc_lock_by_ptr(&msq->q_perm); - rcu_read_unlock(); + ipc_lock_object(&msq->q_perm); /* Lockless receive, part 4: * Repeat test after acquiring the spinlock. */ msg = (struct msg_msg*)msr_d.r_msg; if (msg != ERR_PTR(-EAGAIN)) - goto out_unlock; + goto out_unlock0; list_del(&msr_d.r_list); if (signal_pending(current)) { msg = ERR_PTR(-ERESTARTNOHAND); -out_unlock: - msg_unlock(msq); - break; + goto out_unlock0; } + + ipc_unlock_object(&msq->q_perm); } + +out_unlock0: + ipc_unlock_object(&msq->q_perm); +out_unlock1: + rcu_read_unlock(); if (IS_ERR(msg)) { free_copy(copy); return PTR_ERR(msg); From 9ad66ae65fc8d3e7e3344310fb0aa835910264fe Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Jul 2013 16:01:19 -0700 Subject: [PATCH 088/118] ipc: remove unused functions We can now drop the msg_lock and msg_lock_check functions along with a bogus comment introduced previously in semctl_down. Signed-off-by: Davidlohr Bueso Cc: Andi Kleen Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 25 ------------------------- ipc/sem.c | 1 - 2 files changed, 26 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index a3c0dc40a0cf..bd60d7e159e8 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -141,31 +141,6 @@ void __init msg_init(void) IPC_MSG_IDS, sysvipc_msg_proc_show); } -/* - * msg_lock_(check_) routines are called in the paths where the rw_mutex - * is not held. 
- */ -static inline struct msg_queue *msg_lock(struct ipc_namespace *ns, int id) -{ - struct kern_ipc_perm *ipcp = ipc_lock(&msg_ids(ns), id); - - if (IS_ERR(ipcp)) - return (struct msg_queue *)ipcp; - - return container_of(ipcp, struct msg_queue, q_perm); -} - -static inline struct msg_queue *msg_lock_check(struct ipc_namespace *ns, - int id) -{ - struct kern_ipc_perm *ipcp = ipc_lock_check(&msg_ids(ns), id); - - if (IS_ERR(ipcp)) - return (struct msg_queue *)ipcp; - - return container_of(ipcp, struct msg_queue, q_perm); -} - static inline struct msg_queue *msq_obtain_object(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp = ipc_obtain_object(&msg_ids(ns), id); diff --git a/ipc/sem.c b/ipc/sem.c index b4b892b5c5f8..d3ad3573bc6f 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1296,7 +1296,6 @@ static int semctl_down(struct ipc_namespace *ns, int semid, &semid64.sem_perm, 0); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); - /* the ipc lock is not held upon failure */ goto out_unlock1; } From 196aa0132fc7261f34b10ae1bfb44abc1bc69b3c Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Mon, 8 Jul 2013 16:01:20 -0700 Subject: [PATCH 089/118] ipc/util.c, ipc_rcu_alloc: cacheline align allocation Enforce that ipc_rcu_alloc returns a cacheline aligned pointer on SMP. Rationale: The SysV sem code tries to move the main spinlock into a separate cacheline (____cacheline_aligned_in_smp). This works only if ipc_rcu_alloc returns cacheline aligned pointers. vmalloc and kmalloc return cacheline aligned pointers, the implementation of ipc_rcu_alloc breaks that. 
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Manfred Spraul Cc: Rik van Riel Cc: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/util.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ipc/util.c b/ipc/util.c index a0c139f3d1f3..4704223bfad4 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -468,9 +468,7 @@ void ipc_free(void* ptr, int size) struct ipc_rcu { struct rcu_head rcu; atomic_t refcount; - /* "void *" makes sure alignment of following data is sane. */ - void *data[0]; -}; +} ____cacheline_aligned_in_smp; /** * ipc_rcu_alloc - allocate ipc and rcu space @@ -488,12 +486,14 @@ void *ipc_rcu_alloc(int size) if (unlikely(!out)) return NULL; atomic_set(&out->refcount, 1); - return out->data; + return out + 1; } int ipc_rcu_getref(void *ptr) { - return atomic_inc_not_zero(&container_of(ptr, struct ipc_rcu, data)->refcount); + struct ipc_rcu *p = ((struct ipc_rcu *)ptr) - 1; + + return atomic_inc_not_zero(&p->refcount); } /** @@ -507,7 +507,7 @@ static void ipc_schedule_free(struct rcu_head *head) void ipc_rcu_putref(void *ptr) { - struct ipc_rcu *p = container_of(ptr, struct ipc_rcu, data); + struct ipc_rcu *p = ((struct ipc_rcu *)ptr) - 1; if (!atomic_dec_and_test(&p->refcount)) return; From f5c936c0f267ec58641451cf8b8d39b4c207ee4d Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Mon, 8 Jul 2013 16:01:22 -0700 Subject: [PATCH 090/118] ipc/sem.c: cacheline align the semaphore structures As now each semaphore has its own spinlock and parallel operations are possible, give each semaphore its own cacheline. 
On an i3 laptop, this gives up to 28% better performance: #semscale 10 | grep "interleave 2" - before: Cpus 1, interleave 2 delay 0: 36109234 in 10 secs Cpus 2, interleave 2 delay 0: 55276317 in 10 secs Cpus 3, interleave 2 delay 0: 62411025 in 10 secs Cpus 4, interleave 2 delay 0: 81963928 in 10 secs -after: Cpus 1, interleave 2 delay 0: 35527306 in 10 secs Cpus 2, interleave 2 delay 0: 70922909 in 10 secs <<< + 28% Cpus 3, interleave 2 delay 0: 80518538 in 10 secs Cpus 4, interleave 2 delay 0: 89115148 in 10 secs <<< + 8.7% i3, with 2 cores and with hyperthreading enabled. Interleave 2 in order to use the full cores first. HT partially hides the delay from cacheline thrashing, thus the improvement is "only" 8.7% if 4 threads are running. Signed-off-by: Manfred Spraul Cc: Rik van Riel Cc: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipc/sem.c b/ipc/sem.c index d3ad3573bc6f..8498b67a3b62 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -96,7 +96,7 @@ struct sem { int sempid; /* pid of last operation */ spinlock_t lock; /* spinlock for fine-grained semtimedop */ struct list_head sem_pending; /* pending single-sop operations */ -}; +} ____cacheline_aligned_in_smp; /* One queue for each sleeping process in the system. */ struct sem_queue { From 1a82e9e1d0f1b45f47a97c9e2349020536ff8987 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Mon, 8 Jul 2013 16:01:23 -0700 Subject: [PATCH 091/118] ipc/sem: separate wait-for-zero and alter tasks into separate queues Introduce separate queues for operations that do not modify the semaphore values. Advantages: - Simpler logic in check_restart(). - Faster update_queue(): Right now, all wait-for-zero operations are always tested, even if the semaphore value is not 0. 
- wait-for-zero gets again priority, as in linux <=3.0.9 Signed-off-by: Manfred Spraul Cc: Rik van Riel Cc: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sem.h | 5 +- ipc/sem.c | 211 +++++++++++++++++++++++++++++++------------- 2 files changed, 155 insertions(+), 61 deletions(-) diff --git a/include/linux/sem.h b/include/linux/sem.h index 53d42650b193..55e17f68d256 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -15,7 +15,10 @@ struct sem_array { time_t sem_otime; /* last semop time */ time_t sem_ctime; /* last change time */ struct sem *sem_base; /* ptr to first semaphore in array */ - struct list_head sem_pending; /* pending operations to be processed */ + struct list_head pending_alter; /* pending operations */ + /* that alter the array */ + struct list_head pending_const; /* pending complex operations */ + /* that do not alter semvals */ struct list_head list_id; /* undo requests on this array */ int sem_nsems; /* no. of semaphores in array */ int complex_count; /* pending complex operations */ diff --git a/ipc/sem.c b/ipc/sem.c index 8498b67a3b62..4d7f88cefada 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -95,7 +95,10 @@ struct sem { int semval; /* current value */ int sempid; /* pid of last operation */ spinlock_t lock; /* spinlock for fine-grained semtimedop */ - struct list_head sem_pending; /* pending single-sop operations */ + struct list_head pending_alter; /* pending single-sop operations */ + /* that alter the semaphore */ + struct list_head pending_const; /* pending single-sop operations */ + /* that do not alter the semaphore*/ } ____cacheline_aligned_in_smp; /* One queue for each sleeping process in the system. 
*/ @@ -152,7 +155,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it); /* * linked list protection: * sem_undo.id_next, - * sem_array.sem_pending{,last}, + * sem_array.pending{_alter,_cont}, * sem_array.sem_undo: sem_lock() for read/write * sem_undo.proc_next: only "current" is allowed to read/write that field. * @@ -337,7 +340,7 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) * Without the check/retry algorithm a lockless wakeup is possible: * - queue.status is initialized to -EINTR before blocking. * - wakeup is performed by - * * unlinking the queue entry from sma->sem_pending + * * unlinking the queue entry from the pending list * * setting queue.status to IN_WAKEUP * This is the notification for the blocked thread that a * result value is imminent. @@ -418,12 +421,14 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) sma->sem_base = (struct sem *) &sma[1]; for (i = 0; i < nsems; i++) { - INIT_LIST_HEAD(&sma->sem_base[i].sem_pending); + INIT_LIST_HEAD(&sma->sem_base[i].pending_alter); + INIT_LIST_HEAD(&sma->sem_base[i].pending_const); spin_lock_init(&sma->sem_base[i].lock); } sma->complex_count = 0; - INIT_LIST_HEAD(&sma->sem_pending); + INIT_LIST_HEAD(&sma->pending_alter); + INIT_LIST_HEAD(&sma->pending_const); INIT_LIST_HEAD(&sma->list_id); sma->sem_nsems = nsems; sma->sem_ctime = get_seconds(); @@ -609,60 +614,132 @@ static void unlink_queue(struct sem_array *sma, struct sem_queue *q) * update_queue is O(N^2) when it restarts scanning the whole queue of * waiting operations. Therefore this function checks if the restart is * really necessary. It is called after a previously waiting operation - * was completed. + * modified the array. + * Note that wait-for-zero operations are handled without restart. 
*/ static int check_restart(struct sem_array *sma, struct sem_queue *q) { - struct sem *curr; - struct sem_queue *h; - - /* if the operation didn't modify the array, then no restart */ - if (q->alter == 0) - return 0; - - /* pending complex operations are too difficult to analyse */ - if (sma->complex_count) + /* pending complex alter operations are too difficult to analyse */ + if (!list_empty(&sma->pending_alter)) return 1; /* we were a sleeping complex operation. Too difficult */ if (q->nsops > 1) return 1; - curr = sma->sem_base + q->sops[0].sem_num; + /* It is impossible that someone waits for the new value: + * - complex operations always restart. + * - wait-for-zero are handled seperately. + * - q is a previously sleeping simple operation that + * altered the array. It must be a decrement, because + * simple increments never sleep. + * - If there are older (higher priority) decrements + * in the queue, then they have observed the original + * semval value and couldn't proceed. The operation + * decremented to value - thus they won't proceed either. + */ + return 0; +} - /* No-one waits on this queue */ - if (list_empty(&curr->sem_pending)) - return 0; +/** + * wake_const_ops(sma, semnum, pt) - Wake up non-alter tasks + * @sma: semaphore array. + * @semnum: semaphore that was modified. + * @pt: list head for the tasks that must be woken up. + * + * wake_const_ops must be called after a semaphore in a semaphore array + * was set to 0. If complex const operations are pending, wake_const_ops must + * be called with semnum = -1, as well as with the number of each modified + * semaphore. + * The tasks that must be woken up are added to @pt. The return code + * is stored in q->pid. + * The function returns 1 if at least one operation was completed successfully. 
+ */ +static int wake_const_ops(struct sem_array *sma, int semnum, + struct list_head *pt) +{ + struct sem_queue *q; + struct list_head *walk; + struct list_head *pending_list; + int semop_completed = 0; - /* the new semaphore value */ - if (curr->semval) { - /* It is impossible that someone waits for the new value: - * - q is a previously sleeping simple operation that - * altered the array. It must be a decrement, because - * simple increments never sleep. - * - The value is not 0, thus wait-for-zero won't proceed. - * - If there are older (higher priority) decrements - * in the queue, then they have observed the original - * semval value and couldn't proceed. The operation - * decremented to value - thus they won't proceed either. + if (semnum == -1) + pending_list = &sma->pending_const; + else + pending_list = &sma->sem_base[semnum].pending_const; + + walk = pending_list->next; + while (walk != pending_list) { + int error; + + q = container_of(walk, struct sem_queue, list); + walk = walk->next; + + error = try_atomic_semop(sma, q->sops, q->nsops, + q->undo, q->pid); + + if (error <= 0) { + /* operation completed, remove from queue & wakeup */ + + unlink_queue(sma, q); + + wake_up_sem_queue_prepare(pt, q, error); + if (error == 0) + semop_completed = 1; + } + } + return semop_completed; +} + +/** + * do_smart_wakeup_zero(sma, sops, nsops, pt) - wakeup all wait for zero tasks + * @sma: semaphore array + * @sops: operations that were performed + * @nsops: number of operations + * @pt: list head of the tasks that must be woken up. + * + * do_smart_wakeup_zero() checks all required queue for wait-for-zero + * operations, based on the actual changes that were performed on the + * semaphore array. + * The function returns 1 if at least one operation was completed successfully. 
+ */ +static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, + int nsops, struct list_head *pt) +{ + int i; + int semop_completed = 0; + int got_zero = 0; + + /* first: the per-semaphore queues, if known */ + if (sops) { + for (i = 0; i < nsops; i++) { + int num = sops[i].sem_num; + + if (sma->sem_base[num].semval == 0) { + got_zero = 1; + semop_completed |= wake_const_ops(sma, num, pt); + } + } + } else { + /* + * No sops means modified semaphores not known. + * Assume all were changed. */ - BUG_ON(q->sops[0].sem_op >= 0); - return 0; + for (i = 0; i < sma->sem_nsems; i++) { + if (sma->sem_base[i].semval == 0) { + got_zero = 1; + semop_completed |= wake_const_ops(sma, i, pt); + } + } } /* - * semval is 0. Check if there are wait-for-zero semops. - * They must be the first entries in the per-semaphore queue + * If one of the modified semaphores got 0, + * then check the global queue, too. */ - h = list_first_entry(&curr->sem_pending, struct sem_queue, list); - BUG_ON(h->nsops != 1); - BUG_ON(h->sops[0].sem_num != q->sops[0].sem_num); + if (got_zero) + semop_completed |= wake_const_ops(sma, -1, pt); - /* Yes, there is a wait-for-zero semop. Restart */ - if (h->sops[0].sem_op == 0) - return 1; - - /* Again - no-one is waiting for the new value. */ - return 0; + return semop_completed; } @@ -678,6 +755,8 @@ static int check_restart(struct sem_array *sma, struct sem_queue *q) * semaphore. * The tasks that must be woken up are added to @pt. The return code * is stored in q->pid. + * The function internally checks if const operations can now succeed. + * * The function return 1 if at least one semop was completed successfully. 
*/ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) @@ -688,9 +767,9 @@ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) int semop_completed = 0; if (semnum == -1) - pending_list = &sma->sem_pending; + pending_list = &sma->pending_alter; else - pending_list = &sma->sem_base[semnum].sem_pending; + pending_list = &sma->sem_base[semnum].pending_alter; again: walk = pending_list->next; @@ -702,13 +781,12 @@ again: /* If we are scanning the single sop, per-semaphore list of * one semaphore and that semaphore is 0, then it is not - * necessary to scan the "alter" entries: simple increments + * necessary to scan further: simple increments * that affect only one entry succeed immediately and cannot * be in the per semaphore pending queue, and decrements * cannot be successful if the value is already 0. */ - if (semnum != -1 && sma->sem_base[semnum].semval == 0 && - q->alter) + if (semnum != -1 && sma->sem_base[semnum].semval == 0) break; error = try_atomic_semop(sma, q->sops, q->nsops, @@ -724,6 +802,7 @@ again: restart = 0; } else { semop_completed = 1; + do_smart_wakeup_zero(sma, q->sops, q->nsops, pt); restart = check_restart(sma, q); } @@ -742,8 +821,8 @@ again: * @otime: force setting otime * @pt: list head of the tasks that must be woken up. * - * do_smart_update() does the required called to update_queue, based on the - * actual changes that were performed on the semaphore array. + * do_smart_update() does the required calls to update_queue and wakeup_zero, + * based on the actual changes that were performed on the semaphore array. * Note that the function does not do the actual wake-up: the caller is * responsible for calling wake_up_sem_queue_do(@pt). * It is safe to perform this call after dropping all locks. 
@@ -754,6 +833,8 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop int i; int progress; + otime |= do_smart_wakeup_zero(sma, sops, nsops, pt); + progress = 1; retry_global: if (sma->complex_count) { @@ -813,14 +894,14 @@ static int count_semncnt (struct sem_array * sma, ushort semnum) struct sem_queue * q; semncnt = 0; - list_for_each_entry(q, &sma->sem_base[semnum].sem_pending, list) { + list_for_each_entry(q, &sma->sem_base[semnum].pending_alter, list) { struct sembuf * sops = q->sops; BUG_ON(sops->sem_num != semnum); if ((sops->sem_op < 0) && !(sops->sem_flg & IPC_NOWAIT)) semncnt++; } - list_for_each_entry(q, &sma->sem_pending, list) { + list_for_each_entry(q, &sma->pending_alter, list) { struct sembuf * sops = q->sops; int nsops = q->nsops; int i; @@ -839,14 +920,14 @@ static int count_semzcnt (struct sem_array * sma, ushort semnum) struct sem_queue * q; semzcnt = 0; - list_for_each_entry(q, &sma->sem_base[semnum].sem_pending, list) { + list_for_each_entry(q, &sma->sem_base[semnum].pending_const, list) { struct sembuf * sops = q->sops; BUG_ON(sops->sem_num != semnum); if ((sops->sem_op == 0) && !(sops->sem_flg & IPC_NOWAIT)) semzcnt++; } - list_for_each_entry(q, &sma->sem_pending, list) { + list_for_each_entry(q, &sma->pending_const, list) { struct sembuf * sops = q->sops; int nsops = q->nsops; int i; @@ -884,13 +965,22 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) /* Wake up all pending processes and let them fail with EIDRM. 
*/ INIT_LIST_HEAD(&tasks); - list_for_each_entry_safe(q, tq, &sma->sem_pending, list) { + list_for_each_entry_safe(q, tq, &sma->pending_const, list) { + unlink_queue(sma, q); + wake_up_sem_queue_prepare(&tasks, q, -EIDRM); + } + + list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { unlink_queue(sma, q); wake_up_sem_queue_prepare(&tasks, q, -EIDRM); } for (i = 0; i < sma->sem_nsems; i++) { struct sem *sem = sma->sem_base + i; - list_for_each_entry_safe(q, tq, &sem->sem_pending, list) { + list_for_each_entry_safe(q, tq, &sem->pending_const, list) { + unlink_queue(sma, q); + wake_up_sem_queue_prepare(&tasks, q, -EIDRM); + } + list_for_each_entry_safe(q, tq, &sem->pending_alter, list) { unlink_queue(sma, q); wake_up_sem_queue_prepare(&tasks, q, -EIDRM); } @@ -1658,14 +1748,15 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, curr = &sma->sem_base[sops->sem_num]; if (alter) - list_add_tail(&queue.list, &curr->sem_pending); + list_add_tail(&queue.list, &curr->pending_alter); else - list_add(&queue.list, &curr->sem_pending); + list_add_tail(&queue.list, &curr->pending_const); } else { if (alter) - list_add_tail(&queue.list, &sma->sem_pending); + list_add_tail(&queue.list, &sma->pending_alter); else - list_add(&queue.list, &sma->sem_pending); + list_add_tail(&queue.list, &sma->pending_const); + sma->complex_count++; } From f269f40ad5aeee229ed70044926f44318abe41ef Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Mon, 8 Jul 2013 16:01:24 -0700 Subject: [PATCH 092/118] ipc/sem.c: always use only one queue for alter operations There are two places that can contain alter operations: - the global queue: sma->pending_alter - the per-semaphore queues: sma->sem_base[].pending_alter. Since one of the queues must be processed first, this causes an odd priorization of the wakeups: complex operations have priority over simple ops. The patch restores the behavior of linux <=3.0.9: The longest waiting operation has the highest priority. 
This is done by using only one queue: - if there are complex ops, then sma->pending_alter is used. - otherwise, the per-semaphore queues are used. As a side effect, do_smart_update_queue() becomes much simpler: no more goto logic. Signed-off-by: Manfred Spraul Cc: Rik van Riel Cc: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 128 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 88 insertions(+), 40 deletions(-) diff --git a/ipc/sem.c b/ipc/sem.c index 4d7f88cefada..6291257ee049 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -192,6 +192,53 @@ void __init sem_init (void) IPC_SEM_IDS, sysvipc_sem_proc_show); } +/** + * unmerge_queues - unmerge queues, if possible. + * @sma: semaphore array + * + * The function unmerges the wait queues if complex_count is 0. + * It must be called prior to dropping the global semaphore array lock. + */ +static void unmerge_queues(struct sem_array *sma) +{ + struct sem_queue *q, *tq; + + /* complex operations still around? */ + if (sma->complex_count) + return; + /* + * We will switch back to simple mode. + * Move all pending operation back into the per-semaphore + * queues. + */ + list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { + struct sem *curr; + curr = &sma->sem_base[q->sops[0].sem_num]; + + list_add_tail(&q->list, &curr->pending_alter); + } + INIT_LIST_HEAD(&sma->pending_alter); +} + +/** + * merge_queues - Merge single semop queues into global queue + * @sma: semaphore array + * + * This function merges all per-semaphore queues into the global queue. + * It is necessary to achieve FIFO ordering for the pending single-sop + * operations when a multi-semop operation must sleep. + * Only the alter operations must be moved, the const operations can stay. 
+ */ +static void merge_queues(struct sem_array *sma) +{ + int i; + for (i = 0; i < sma->sem_nsems; i++) { + struct sem *sem = sma->sem_base + i; + + list_splice_init(&sem->pending_alter, &sma->pending_alter); + } +} + /* * If the request contains only one semaphore operation, and there are * no complex transactions pending, lock only the semaphore involved. @@ -262,6 +309,7 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, static inline void sem_unlock(struct sem_array *sma, int locknum) { if (locknum == -1) { + unmerge_queues(sma); ipc_unlock_object(&sma->sem_perm); } else { struct sem *sem = sma->sem_base + locknum; @@ -831,49 +879,38 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop int otime, struct list_head *pt) { int i; - int progress; otime |= do_smart_wakeup_zero(sma, sops, nsops, pt); - progress = 1; -retry_global: - if (sma->complex_count) { - if (update_queue(sma, -1, pt)) { - progress = 1; - otime = 1; - sops = NULL; - } - } - if (!progress) - goto done; - - if (!sops) { - /* No semops; something special is going on. */ - for (i = 0; i < sma->sem_nsems; i++) { - if (update_queue(sma, i, pt)) { - otime = 1; - progress = 1; + if (!list_empty(&sma->pending_alter)) { + /* semaphore array uses the global queue - just process it. */ + otime |= update_queue(sma, -1, pt); + } else { + if (!sops) { + /* + * No sops, thus the modified semaphores are not + * known. Check all. + */ + for (i = 0; i < sma->sem_nsems; i++) + otime |= update_queue(sma, i, pt); + } else { + /* + * Check the semaphores that were increased: + * - No complex ops, thus all sleeping ops are + * decrease. + * - if we decreased the value, then any sleeping + * semaphore ops wont be able to run: If the + * previous value was too small, then the new + * value will be too small, too. 
+ */ + for (i = 0; i < nsops; i++) { + if (sops[i].sem_op > 0) { + otime |= update_queue(sma, + sops[i].sem_num, pt); + } } } - goto done_checkretry; } - - /* Check the semaphores that were modified. */ - for (i = 0; i < nsops; i++) { - if (sops[i].sem_op > 0 || - (sops[i].sem_op < 0 && - sma->sem_base[sops[i].sem_num].semval == 0)) - if (update_queue(sma, sops[i].sem_num, pt)) { - otime = 1; - progress = 1; - } - } -done_checkretry: - if (progress) { - progress = 0; - goto retry_global; - } -done: if (otime) sma->sem_otime = get_seconds(); } @@ -1747,11 +1784,22 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, struct sem *curr; curr = &sma->sem_base[sops->sem_num]; - if (alter) - list_add_tail(&queue.list, &curr->pending_alter); - else + if (alter) { + if (sma->complex_count) { + list_add_tail(&queue.list, + &sma->pending_alter); + } else { + + list_add_tail(&queue.list, + &curr->pending_alter); + } + } else { list_add_tail(&queue.list, &curr->pending_const); + } } else { + if (!sma->complex_count) + merge_queues(sma); + if (alter) list_add_tail(&queue.list, &sma->pending_alter); else From d12e1e50e47e0900dbbf52237b7e171f4f15ea1e Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Mon, 8 Jul 2013 16:01:25 -0700 Subject: [PATCH 093/118] ipc/sem.c: replace shared sem_otime with per-semaphore value sem_otime contains the time of the last semaphore operation that completed successfully. Every operation updates this value, thus access from multiple cpus can cause thrashing. Therefore the patch replaces the variable with a per-semaphore variable. The per-array sem_otime is only calculated when required. No performance improvement on a single-socket i3 - only important for larger systems. 
Signed-off-by: Manfred Spraul Cc: Rik van Riel Cc: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sem.h | 1 - ipc/sem.c | 37 +++++++++++++++++++++++++++++++------ 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/include/linux/sem.h b/include/linux/sem.h index 55e17f68d256..976ce3a19f1b 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -12,7 +12,6 @@ struct task_struct; struct sem_array { struct kern_ipc_perm ____cacheline_aligned_in_smp sem_perm; /* permissions .. see ipc.h */ - time_t sem_otime; /* last semop time */ time_t sem_ctime; /* last change time */ struct sem *sem_base; /* ptr to first semaphore in array */ struct list_head pending_alter; /* pending operations */ diff --git a/ipc/sem.c b/ipc/sem.c index 6291257ee049..51352e1bfff9 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -99,6 +99,7 @@ struct sem { /* that alter the semaphore */ struct list_head pending_const; /* pending single-sop operations */ /* that do not alter the semaphore*/ + time_t sem_otime; /* candidate for sem_otime */ } ____cacheline_aligned_in_smp; /* One queue for each sleeping process in the system. 
*/ @@ -911,8 +912,14 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop } } } - if (otime) - sma->sem_otime = get_seconds(); + if (otime) { + if (sops == NULL) { + sma->sem_base[0].sem_otime = get_seconds(); + } else { + sma->sem_base[sops[0].sem_num].sem_otime = + get_seconds(); + } + } } @@ -1058,6 +1065,21 @@ static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, } } +static time_t get_semotime(struct sem_array *sma) +{ + int i; + time_t res; + + res = sma->sem_base[0].sem_otime; + for (i = 1; i < sma->sem_nsems; i++) { + time_t to = sma->sem_base[i].sem_otime; + + if (to > res) + res = to; + } + return res; +} + static int semctl_nolock(struct ipc_namespace *ns, int semid, int cmd, int version, void __user *p) { @@ -1131,9 +1153,9 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, goto out_unlock; kernel_to_ipc64_perm(&sma->sem_perm, &tbuf.sem_perm); - tbuf.sem_otime = sma->sem_otime; - tbuf.sem_ctime = sma->sem_ctime; - tbuf.sem_nsems = sma->sem_nsems; + tbuf.sem_otime = get_semotime(sma); + tbuf.sem_ctime = sma->sem_ctime; + tbuf.sem_nsems = sma->sem_nsems; rcu_read_unlock(); if (copy_semid_to_user(p, &tbuf, version)) return -EFAULT; @@ -2025,6 +2047,9 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it) { struct user_namespace *user_ns = seq_user_ns(s); struct sem_array *sma = it; + time_t sem_otime; + + sem_otime = get_semotime(sma); return seq_printf(s, "%10d %10d %4o %10u %5u %5u %5u %5u %10lu %10lu\n", @@ -2036,7 +2061,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it) from_kgid_munged(user_ns, sma->sem_perm.gid), from_kuid_munged(user_ns, sma->sem_perm.cuid), from_kgid_munged(user_ns, sma->sem_perm.cgid), - sma->sem_otime, + sem_otime, sma->sem_ctime); } #endif From 758a6ba39ef6df4cdc615e5edd7bd86eab81a5f7 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Mon, 8 Jul 2013 16:01:26 -0700 Subject: [PATCH 094/118] ipc/sem.c: rename try_atomic_semop() to 
perform_atomic_semop(), docu update Cleanup: Some minor points that I noticed while writing the previous patches 1) The name try_atomic_semop() is misleading: The function performs the operation (if it is possible). 2) Some documentation updates. No real code change, a rename and documentation changes. Signed-off-by: Manfred Spraul Cc: Rik van Riel Cc: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/ipc/sem.c b/ipc/sem.c index 51352e1bfff9..41088899783d 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -154,12 +154,15 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it); #define SEMOPM_FAST 64 /* ~ 372 bytes on stack */ /* - * linked list protection: + * Locking: * sem_undo.id_next, + * sem_array.complex_count, * sem_array.pending{_alter,_cont}, - * sem_array.sem_undo: sem_lock() for read/write + * sem_array.sem_undo: global sem_lock() for read/write * sem_undo.proc_next: only "current" is allowed to read/write that field. * + * sem_array.sem_base[i].pending_{const,alter}: + * global or semaphore sem_lock() for read/write */ #define sc_semmsl sem_ctls[0] @@ -536,12 +539,19 @@ SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg) return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params); } -/* - * Determine whether a sequence of semaphore operations would succeed - * all at once. Return 0 if yes, 1 if need to sleep, else return error code. +/** perform_atomic_semop - Perform (if possible) a semaphore operation + * @sma: semaphore array + * @sops: array with operations that should be checked + * @nsems: number of sops + * @un: undo array + * @pid: pid that did the change + * + * Returns 0 if the operation was possible. + * Returns 1 if the operation is impossible, the caller must sleep. + * Negative values are error codes. 
*/ -static int try_atomic_semop (struct sem_array * sma, struct sembuf * sops, +static int perform_atomic_semop(struct sem_array *sma, struct sembuf *sops, int nsops, struct sem_undo *un, int pid) { int result, sem_op; @@ -724,8 +734,8 @@ static int wake_const_ops(struct sem_array *sma, int semnum, q = container_of(walk, struct sem_queue, list); walk = walk->next; - error = try_atomic_semop(sma, q->sops, q->nsops, - q->undo, q->pid); + error = perform_atomic_semop(sma, q->sops, q->nsops, + q->undo, q->pid); if (error <= 0) { /* operation completed, remove from queue & wakeup */ @@ -838,7 +848,7 @@ again: if (semnum != -1 && sma->sem_base[semnum].semval == 0) break; - error = try_atomic_semop(sma, q->sops, q->nsops, + error = perform_atomic_semop(sma, q->sops, q->nsops, q->undo, q->pid); /* Does q->sleeper still need to sleep? */ @@ -1686,7 +1696,6 @@ static int get_queue_result(struct sem_queue *q) return error; } - SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, unsigned, nsops, const struct timespec __user *, timeout) { @@ -1784,7 +1793,8 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, if (un && un->semid == -1) goto out_unlock_free; - error = try_atomic_semop (sma, sops, nsops, un, task_tgid_vnr(current)); + error = perform_atomic_semop(sma, sops, nsops, un, + task_tgid_vnr(current)); if (error <= 0) { if (alter && error == 0) do_smart_update(sma, sops, nsops, 1, &tasks); From 026dadad6b44f0469a475efb4cae48269d8848bd Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 8 Jul 2013 16:01:27 -0700 Subject: [PATCH 095/118] mwave: fix info leak in mwave_ioctl() Smatch complains that on 64 bit systems, there is a hole in the MW_ABILITIES struct between ->component_count and ->component_list[]. It leaks stack information from the mwave_ioctl() function. I've added a memset() to initialize the struct to zero. 
Signed-off-by: Dan Carpenter Cc: Greg KH Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/mwave/tp3780i.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/char/mwave/tp3780i.c b/drivers/char/mwave/tp3780i.c index c68969708068..04e6d6a27994 100644 --- a/drivers/char/mwave/tp3780i.c +++ b/drivers/char/mwave/tp3780i.c @@ -479,6 +479,7 @@ int tp3780I_QueryAbilities(THINKPAD_BD_DATA * pBDData, MW_ABILITIES * pAbilities PRINTK_2(TRACE_TP3780I, "tp3780i::tp3780I_QueryAbilities entry pBDData %p\n", pBDData); + memset(pAbilities, 0, sizeof(*pAbilities)); /* fill out standard constant fields */ pAbilities->instr_per_sec = pBDData->rDspSettings.uIps; pAbilities->data_size = pBDData->rDspSettings.uDStoreSize; From 1d04f3c6ab6bbdc6187ba44b8a667a785b63c4f2 Mon Sep 17 00:00:00 2001 From: Philippe De Muyter Date: Mon, 8 Jul 2013 16:01:28 -0700 Subject: [PATCH 096/118] partitions/msdos.c: end-of-line whitespace and semicolon cleanup Signed-off-by: Philippe De Muyter Cc: Karel Zak Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/msdos.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 7681cd295ab8..9bf19e6fd949 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -90,7 +90,7 @@ static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') ret = 1; put_dev_sector(sect); - }; + } return ret; } @@ -142,7 +142,7 @@ static void parse_extended(struct parsed_partitions *state, return; if (!msdos_magic_present(data + 510)) - goto done; + goto done; p = (struct partition *) (data + 0x1be); @@ -155,7 +155,7 @@ static void parse_extended(struct parsed_partitions *state, * and OS/2 seems to use all four entries. 
*/ - /* + /* * First process the data partition(s) */ for (i=0; i<4; i++, p++) { @@ -263,7 +263,7 @@ static void parse_solaris_x86(struct parsed_partitions *state, } #if defined(CONFIG_BSD_DISKLABEL) -/* +/* * Create devices for BSD partitions listed in a disklabel, under a * dos-like partition. See parse_extended() for more information. */ @@ -294,7 +294,7 @@ static void parse_bsd(struct parsed_partitions *state, if (state->next == state->limit) break; - if (p->p_fstype == BSD_FS_UNUSED) + if (p->p_fstype == BSD_FS_UNUSED) continue; bsd_start = le32_to_cpu(p->p_offset); bsd_size = le32_to_cpu(p->p_size); @@ -441,7 +441,7 @@ static struct { {NEW_SOLARIS_X86_PARTITION, parse_solaris_x86}, {0, NULL}, }; - + int msdos_partition(struct parsed_partitions *state) { sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; From 6ceea22bbbc84fcf6bf0913bb3db8a657e9002f6 Mon Sep 17 00:00:00 2001 From: Philippe De Muyter Date: Mon, 8 Jul 2013 16:01:29 -0700 Subject: [PATCH 097/118] partitions: add aix lvm partition support files Add partitions/aix.h and partitions/aix.c. AIX LVM permits to make "logical volumes" which are made of multiple slices of multiple disks. The new code allows only access to the "logical volumes" which are made of one slice on the probed disk, a slice being a contiguous disk area. The code also detects "logical volumes" made of multiple slices on the probed disk, but can not describe them to the partition layer, because the partition layer generic code does not support that. When such non-contiguous "logical volumes" are detected, a diagnostic message is printed. 
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Philippe De Muyter Cc: Karel Zak Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/Kconfig | 11 ++ block/partitions/Makefile | 1 + block/partitions/aix.c | 293 ++++++++++++++++++++++++++++++++++++++ block/partitions/aix.h | 1 + 4 files changed, 306 insertions(+) create mode 100644 block/partitions/aix.c create mode 100644 block/partitions/aix.h diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig index 75a54e1adbb5..4cebb2f0d2f4 100644 --- a/block/partitions/Kconfig +++ b/block/partitions/Kconfig @@ -68,6 +68,17 @@ config ACORN_PARTITION_RISCIX of machines called RISCiX. If you say 'Y' here, Linux will be able to read disks partitioned under RISCiX. +config AIX_PARTITION + bool "AIX basic partition table support" if PARTITION_ADVANCED + help + Say Y here if you would like to be able to read the hard disk + partition table format used by IBM or Motorola PowerPC machines + running AIX. AIX actually uses a Logical Volume Manager, where + "logical volumes" can be spread across one or multiple disks, + but this driver works only for the simple case of partitions which + are contiguous. + Otherwise, say N. 
+ config OSF_PARTITION bool "Alpha OSF partition support" if PARTITION_ADVANCED default y if ALPHA diff --git a/block/partitions/Makefile b/block/partitions/Makefile index 03af8eac51da..2be4d7ba4e3a 100644 --- a/block/partitions/Makefile +++ b/block/partitions/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_BLOCK) := check.o obj-$(CONFIG_ACORN_PARTITION) += acorn.o obj-$(CONFIG_AMIGA_PARTITION) += amiga.o obj-$(CONFIG_ATARI_PARTITION) += atari.o +obj-$(CONFIG_AIX_PARTITION) += aix.o obj-$(CONFIG_MAC_PARTITION) += mac.o obj-$(CONFIG_LDM_PARTITION) += ldm.o obj-$(CONFIG_MSDOS_PARTITION) += msdos.o diff --git a/block/partitions/aix.c b/block/partitions/aix.c new file mode 100644 index 000000000000..43be471d9b1d --- /dev/null +++ b/block/partitions/aix.c @@ -0,0 +1,293 @@ +/* + * fs/partitions/aix.c + * + * Copyright (C) 2012-2013 Philippe De Muyter + */ + +#include "check.h" +#include "aix.h" + +struct lvm_rec { + char lvm_id[4]; /* "_LVM" */ + char reserved4[16]; + __be32 lvmarea_len; + __be32 vgda_len; + __be32 vgda_psn[2]; + char reserved36[10]; + __be16 pp_size; /* log2(pp_size) */ + char reserved46[12]; + __be16 version; + }; + +struct vgda { + __be32 secs; + __be32 usec; + char reserved8[16]; + __be16 numlvs; + __be16 maxlvs; + __be16 pp_size; + __be16 numpvs; + __be16 total_vgdas; + __be16 vgda_size; + }; + +struct lvd { + __be16 lv_ix; + __be16 res2; + __be16 res4; + __be16 maxsize; + __be16 lv_state; + __be16 mirror; + __be16 mirror_policy; + __be16 num_lps; + __be16 res10[8]; + }; + +struct lvname { + char name[64]; + }; + +struct ppe { + __be16 lv_ix; + unsigned short res2; + unsigned short res4; + __be16 lp_ix; + unsigned short res8[12]; + }; + +struct pvd { + char reserved0[16]; + __be16 pp_count; + char reserved18[2]; + __be32 psn_part1; + char reserved24[8]; + struct ppe ppe[1016]; + }; + +#define LVM_MAXLVS 256 + +/** + * last_lba(): return number of last logical block of device + * @bdev: block device + * + * Description: Returns last LBA value on success, 0 on 
error. + * This is stored (by sd and ide-geometry) in + * the part[0] entry for this disk, and is the number of + * physical sectors available on the disk. + */ +static u64 last_lba(struct block_device *bdev) +{ + if (!bdev || !bdev->bd_inode) + return 0; + return (bdev->bd_inode->i_size >> 9) - 1ULL; +} + +/** + * read_lba(): Read bytes from disk, starting at given LBA + * @state + * @lba + * @buffer + * @count + * + * Description: Reads @count bytes from @state->bdev into @buffer. + * Returns number of bytes read on success, 0 on error. + */ +static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, + size_t count) +{ + size_t totalreadcount = 0; + + if (!buffer || lba + count / 512 > last_lba(state->bdev)) + return 0; + + while (count) { + int copied = 512; + Sector sect; + unsigned char *data = read_part_sector(state, lba++, &sect); + if (!data) + break; + if (copied > count) + copied = count; + memcpy(buffer, data, copied); + put_dev_sector(sect); + buffer += copied; + totalreadcount += copied; + count -= copied; + } + return totalreadcount; +} + +/** + * alloc_pvd(): reads physical volume descriptor + * @state + * @lba + * + * Description: Returns pvd on success, NULL on error. + * Allocates space for pvd and fill it with disk blocks at @lba + * Notes: remember to free pvd when you're done! + */ +static struct pvd *alloc_pvd(struct parsed_partitions *state, u32 lba) +{ + size_t count = sizeof(struct pvd); + struct pvd *p; + + p = kmalloc(count, GFP_KERNEL); + if (!p) + return NULL; + + if (read_lba(state, lba, (u8 *) p, count) < count) { + kfree(p); + return NULL; + } + return p; +} + +/** + * alloc_lvn(): reads logical volume names + * @state + * @lba + * + * Description: Returns lvn on success, NULL on error. + * Allocates space for lvn and fill it with disk blocks at @lba + * Notes: remember to free lvn when you're done!
+ */ +static struct lvname *alloc_lvn(struct parsed_partitions *state, u32 lba) +{ + size_t count = sizeof(struct lvname) * LVM_MAXLVS; + struct lvname *p; + + p = kmalloc(count, GFP_KERNEL); + if (!p) + return NULL; + + if (read_lba(state, lba, (u8 *) p, count) < count) { + kfree(p); + return NULL; + } + return p; +} + +int aix_partition(struct parsed_partitions *state) +{ + int ret = 0; + Sector sect; + unsigned char *d; + u32 pp_bytes_size; + u32 pp_blocks_size = 0; + u32 vgda_sector = 0; + u32 vgda_len = 0; + int numlvs = 0; + struct pvd *pvd; + struct lv_info { + unsigned short pps_per_lv; + unsigned short pps_found; + unsigned char lv_is_contiguous; + } *lvip; + struct lvname *n = NULL; + + d = read_part_sector(state, 7, &sect); + if (d) { + struct lvm_rec *p = (struct lvm_rec *)d; + u16 lvm_version = be16_to_cpu(p->version); + char tmp[64]; + + if (lvm_version == 1) { + int pp_size_log2 = be16_to_cpu(p->pp_size); + + pp_bytes_size = 1 << pp_size_log2; + pp_blocks_size = pp_bytes_size / 512; + snprintf(tmp, sizeof(tmp), + " AIX LVM header version %u found\n", + lvm_version); + vgda_len = be32_to_cpu(p->vgda_len); + vgda_sector = be32_to_cpu(p->vgda_psn[0]); + } else { + snprintf(tmp, sizeof(tmp), + " unsupported AIX LVM version %d found\n", + lvm_version); + } + strlcat(state->pp_buf, tmp, PAGE_SIZE); + put_dev_sector(sect); + } + if (vgda_sector && (d = read_part_sector(state, vgda_sector, &sect))) { + struct vgda *p = (struct vgda *)d; + + numlvs = be16_to_cpu(p->numlvs); + put_dev_sector(sect); + } + lvip = kzalloc(sizeof(struct lv_info) * state->limit, GFP_KERNEL); + if (!lvip) + return 0; + if (numlvs && (d = read_part_sector(state, vgda_sector + 1, &sect))) { + struct lvd *p = (struct lvd *)d; + int i; + + n = alloc_lvn(state, vgda_sector + vgda_len - 33); + if (n) { + int foundlvs = 0; + + for (i = 0; foundlvs < numlvs && i < state->limit; i += 1) { + lvip[i].pps_per_lv = be16_to_cpu(p[i].num_lps); + if (lvip[i].pps_per_lv) + foundlvs += 1; + } + } +
put_dev_sector(sect); + } + pvd = alloc_pvd(state, vgda_sector + 17); + if (pvd) { + int numpps = be16_to_cpu(pvd->pp_count); + int psn_part1 = be32_to_cpu(pvd->psn_part1); + int i; + int cur_lv_ix = -1; + int next_lp_ix = 1; + int lp_ix; + + for (i = 0; i < numpps; i += 1) { + struct ppe *p = pvd->ppe + i; + unsigned int lv_ix; + + lp_ix = be16_to_cpu(p->lp_ix); + if (!lp_ix) { + next_lp_ix = 1; + continue; + } + lv_ix = be16_to_cpu(p->lv_ix) - 1; + if (lv_ix > state->limit) { + cur_lv_ix = -1; + continue; + } + lvip[lv_ix].pps_found += 1; + if (lp_ix == 1) { + cur_lv_ix = lv_ix; + next_lp_ix = 1; + } else if (lv_ix != cur_lv_ix || lp_ix != next_lp_ix) { + next_lp_ix = 1; + continue; + } + if (lp_ix == lvip[lv_ix].pps_per_lv) { + char tmp[70]; + + put_partition(state, lv_ix + 1, + (i + 1 - lp_ix) * pp_blocks_size + psn_part1, + lvip[lv_ix].pps_per_lv * pp_blocks_size); + snprintf(tmp, sizeof(tmp), " <%s>\n", + n[lv_ix].name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + lvip[lv_ix].lv_is_contiguous = 1; + ret = 1; + next_lp_ix = 1; + } else + next_lp_ix += 1; + } + for (i = 0; i < state->limit; i += 1) + if (lvip[i].pps_found && !lvip[i].lv_is_contiguous) + pr_warn("partition %s (%u pp's found) is " + "not contiguous\n", + n[i].name, lvip[i].pps_found); + kfree(pvd); + } + kfree(n); + kfree(lvip); + return ret; +} diff --git a/block/partitions/aix.h b/block/partitions/aix.h new file mode 100644 index 000000000000..e0c66a987523 --- /dev/null +++ b/block/partitions/aix.h @@ -0,0 +1 @@ +extern int aix_partition(struct parsed_partitions *state); From f8f066033b015a744065f6c7ed83741b4760376b Mon Sep 17 00:00:00 2001 From: Philippe De Muyter Date: Mon, 8 Jul 2013 16:01:30 -0700 Subject: [PATCH 098/118] partitions/msdos: enumerate also AIX LVM partitions Graft AIX partitions enumeration into partitions/msdos.c There is already a AIX disks detection logic in msdos.c. When an AIX disk has been found, and if configured to, call the aix partitions recognizer. 
This avoids removal of AIX disks protection from msdos.c, avoids code duplication, and ensures that AIX partitions enumeration is called before plain msdos partitions enumeration. Signed-off-by: Philippe De Muyter Cc: Karel Zak Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/msdos.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 9bf19e6fd949..9123f250b425 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -23,6 +23,7 @@ #include "check.h" #include "msdos.h" #include "efi.h" +#include "aix.h" /* * Many architectures don't like unaligned accesses, while @@ -462,8 +463,12 @@ int msdos_partition(struct parsed_partitions *state) */ if (aix_magic_present(state, data)) { put_dev_sector(sect); +#ifdef CONFIG_AIX_PARTITION + return aix_partition(state); +#else strlcat(state->pp_buf, " [AIX]", PAGE_SIZE); return 0; +#endif } if (!msdos_magic_present(data + 510)) { From 0efbee70890c992f31a7b294ac654ff6c62d51c5 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:31 -0700 Subject: [PATCH 099/118] reboot: remove -stable friendly PF_THREAD_BOUND define Remove the prior patch's #define for easier backporting to the stable releases. Signed-off-by: Robin Holt Cc: H. Peter Anvin Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index 071de900c824..b882440bd0c0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -362,11 +362,6 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); -/* Add backwards compatibility for stable trees. 
*/ -#ifndef PF_NO_SETAFFINITY -#define PF_NO_SETAFFINITY PF_THREAD_BOUND -#endif - static void migrate_to_reboot_cpu(void) { /* The boot cpu is always logical cpu 0 */ From 15d94b82565ebfb0cf27830b96e6cf5ed2d12a9a Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:32 -0700 Subject: [PATCH 100/118] reboot: move shutdown/reboot related functions to kernel/reboot.c This patch is preparatory. It moves reboot related syscall, etc functions from kernel/sys.c to kernel/reboot.c. Signed-off-by: Robin Holt Cc: H. Peter Anvin Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- kernel/reboot.c | 346 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sys.c | 331 --------------------------------------------- 3 files changed, 347 insertions(+), 332 deletions(-) create mode 100644 kernel/reboot.c diff --git a/kernel/Makefile b/kernel/Makefile index 271fd3119af9..470839d1a30e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ - notifier.o ksysfs.o cred.o \ + notifier.o ksysfs.o cred.o reboot.o \ async.o range.o groups.o lglock.o smpboot.o ifdef CONFIG_FUNCTION_TRACER diff --git a/kernel/reboot.c b/kernel/reboot.c new file mode 100644 index 000000000000..37d2636a65c2 --- /dev/null +++ b/kernel/reboot.c @@ -0,0 +1,346 @@ +/* + * linux/kernel/reboot.c + * + * Copyright (C) 2013 Linus Torvalds + */ + +#include <linux/export.h> +#include <linux/kexec.h> +#include <linux/kmod.h> +#include <linux/kmsg_dump.h> +#include <linux/reboot.h> +#include <linux/suspend.h> +#include <linux/syscalls.h> +#include <linux/syscore_ops.h> +#include <linux/uaccess.h> + +/* + * this indicates whether you can reboot with ctrl-alt-del: the default is yes + */ + +int C_A_D = 1; +struct pid *cad_pid; +EXPORT_SYMBOL(cad_pid); + +/* + * If set, this is used for preparing the system to power off.
+ */ + +void (*pm_power_off_prepare)(void); + +/** + * emergency_restart - reboot the system + * + * Without shutting down any hardware or taking any locks + * reboot the system. This is called when we know we are in + * trouble so this is our best effort to reboot. This is + * safe to call in interrupt context. + */ +void emergency_restart(void) +{ + kmsg_dump(KMSG_DUMP_EMERG); + machine_emergency_restart(); +} +EXPORT_SYMBOL_GPL(emergency_restart); + +void kernel_restart_prepare(char *cmd) +{ + blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); + system_state = SYSTEM_RESTART; + usermodehelper_disable(); + device_shutdown(); +} + +/** + * register_reboot_notifier - Register function to be called at reboot time + * @nb: Info about notifier function to be called + * + * Registers a function with the list of functions + * to be called at reboot time. + * + * Currently always returns zero, as blocking_notifier_chain_register() + * always returns zero. + */ +int register_reboot_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&reboot_notifier_list, nb); +} +EXPORT_SYMBOL(register_reboot_notifier); + +/** + * unregister_reboot_notifier - Unregister previously registered reboot notifier + * @nb: Hook to be unregistered + * + * Unregisters a previously registered reboot + * notifier function. + * + * Returns zero on success, or %-ENOENT on failure. 
+ */ +int unregister_reboot_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); +} +EXPORT_SYMBOL(unregister_reboot_notifier); + +static void migrate_to_reboot_cpu(void) +{ + /* The boot cpu is always logical cpu 0 */ + int cpu = 0; + + cpu_hotplug_disable(); + + /* Make certain the cpu I'm about to reboot on is online */ + if (!cpu_online(cpu)) + cpu = cpumask_first(cpu_online_mask); + + /* Prevent races with other tasks migrating this task */ + current->flags |= PF_NO_SETAFFINITY; + + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed_ptr(current, cpumask_of(cpu)); +} + +/** + * kernel_restart - reboot the system + * @cmd: pointer to buffer containing command to execute for restart + * or %NULL + * + * Shutdown everything and perform a clean reboot. + * This is not safe to call in interrupt context. + */ +void kernel_restart(char *cmd) +{ + kernel_restart_prepare(cmd); + migrate_to_reboot_cpu(); + syscore_shutdown(); + if (!cmd) + printk(KERN_EMERG "Restarting system.\n"); + else + printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); + kmsg_dump(KMSG_DUMP_RESTART); + machine_restart(cmd); +} +EXPORT_SYMBOL_GPL(kernel_restart); + +static void kernel_shutdown_prepare(enum system_states state) +{ + blocking_notifier_call_chain(&reboot_notifier_list, + (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); + system_state = state; + usermodehelper_disable(); + device_shutdown(); +} +/** + * kernel_halt - halt the system + * + * Shutdown everything and perform a clean system halt. + */ +void kernel_halt(void) +{ + kernel_shutdown_prepare(SYSTEM_HALT); + migrate_to_reboot_cpu(); + syscore_shutdown(); + printk(KERN_EMERG "System halted.\n"); + kmsg_dump(KMSG_DUMP_HALT); + machine_halt(); +} + +EXPORT_SYMBOL_GPL(kernel_halt); + +/** + * kernel_power_off - power_off the system + * + * Shutdown everything and perform a clean system power_off. 
+ */ +void kernel_power_off(void) +{ + kernel_shutdown_prepare(SYSTEM_POWER_OFF); + if (pm_power_off_prepare) + pm_power_off_prepare(); + migrate_to_reboot_cpu(); + syscore_shutdown(); + printk(KERN_EMERG "Power down.\n"); + kmsg_dump(KMSG_DUMP_POWEROFF); + machine_power_off(); +} +EXPORT_SYMBOL_GPL(kernel_power_off); + +static DEFINE_MUTEX(reboot_mutex); + +/* + * Reboot system call: for obvious reasons only root may call it, + * and even root needs to set up some magic numbers in the registers + * so that some mistake won't make this reboot the whole machine. + * You can also set the meaning of the ctrl-alt-del-key here. + * + * reboot doesn't sync: do that yourself before calling this. + */ +SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, + void __user *, arg) +{ + struct pid_namespace *pid_ns = task_active_pid_ns(current); + char buffer[256]; + int ret = 0; + + /* We only trust the superuser with rebooting the system. */ + if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) + return -EPERM; + + /* For safety, we require "magic" arguments. */ + if (magic1 != LINUX_REBOOT_MAGIC1 || + (magic2 != LINUX_REBOOT_MAGIC2 && + magic2 != LINUX_REBOOT_MAGIC2A && + magic2 != LINUX_REBOOT_MAGIC2B && + magic2 != LINUX_REBOOT_MAGIC2C)) + return -EINVAL; + + /* + * If pid namespaces are enabled and the current task is in a child + * pid_namespace, the command is handled by reboot_pid_ns() which will + * call do_exit(). + */ + ret = reboot_pid_ns(pid_ns, cmd); + if (ret) + return ret; + + /* Instead of trying to make the power_off code look like + * halt when pm_power_off is not set do it the easy way. 
+ */ + if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) + cmd = LINUX_REBOOT_CMD_HALT; + + mutex_lock(&reboot_mutex); + switch (cmd) { + case LINUX_REBOOT_CMD_RESTART: + kernel_restart(NULL); + break; + + case LINUX_REBOOT_CMD_CAD_ON: + C_A_D = 1; + break; + + case LINUX_REBOOT_CMD_CAD_OFF: + C_A_D = 0; + break; + + case LINUX_REBOOT_CMD_HALT: + kernel_halt(); + do_exit(0); + panic("cannot halt"); + + case LINUX_REBOOT_CMD_POWER_OFF: + kernel_power_off(); + do_exit(0); + break; + + case LINUX_REBOOT_CMD_RESTART2: + if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { + ret = -EFAULT; + break; + } + buffer[sizeof(buffer) - 1] = '\0'; + + kernel_restart(buffer); + break; + +#ifdef CONFIG_KEXEC + case LINUX_REBOOT_CMD_KEXEC: + ret = kernel_kexec(); + break; +#endif + +#ifdef CONFIG_HIBERNATION + case LINUX_REBOOT_CMD_SW_SUSPEND: + ret = hibernate(); + break; +#endif + + default: + ret = -EINVAL; + break; + } + mutex_unlock(&reboot_mutex); + return ret; +} + +static void deferred_cad(struct work_struct *dummy) +{ + kernel_restart(NULL); +} + +/* + * This function gets called by ctrl-alt-del - ie the keyboard interrupt. + * As it's called within an interrupt, it may NOT sync: the only choice + * is whether to reboot at once, or just ignore the ctrl-alt-del. 
+ */ +void ctrl_alt_del(void) +{ + static DECLARE_WORK(cad_work, deferred_cad); + + if (C_A_D) + schedule_work(&cad_work); + else + kill_cad_pid(SIGINT, 1); +} + +char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; + +static int __orderly_poweroff(bool force) +{ + char **argv; + static char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL + }; + int ret; + + argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); + if (argv) { + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + argv_free(argv); + } else { + printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", + __func__, poweroff_cmd); + ret = -ENOMEM; + } + + if (ret && force) { + printk(KERN_WARNING "Failed to start orderly shutdown: " + "forcing the issue\n"); + /* + * I guess this should try to kick off some daemon to sync and + * poweroff asap. Or not even bother syncing if we're doing an + * emergency shutdown? + */ + emergency_sync(); + kernel_power_off(); + } + + return ret; +} + +static bool poweroff_force; + +static void poweroff_work_func(struct work_struct *work) +{ + __orderly_poweroff(poweroff_force); +} + +static DECLARE_WORK(poweroff_work, poweroff_work_func); + +/** + * orderly_poweroff - Trigger an orderly system poweroff + * @force: force poweroff if command execution fails + * + * This may be called from any context to trigger a system shutdown. + * If the orderly shutdown fails, it will force an immediate shutdown. 
+ */ +int orderly_poweroff(bool force) +{ + if (force) /* do not override the pending "true" */ + poweroff_force = true; + schedule_work(&poweroff_work); + return 0; +} +EXPORT_SYMBOL_GPL(orderly_poweroff); diff --git a/kernel/sys.c b/kernel/sys.c index b882440bd0c0..771129b299f8 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -115,20 +115,6 @@ int fs_overflowgid = DEFAULT_FS_OVERFLOWUID; EXPORT_SYMBOL(fs_overflowuid); EXPORT_SYMBOL(fs_overflowgid); -/* - * this indicates whether you can reboot with ctrl-alt-del: the default is yes - */ - -int C_A_D = 1; -struct pid *cad_pid; -EXPORT_SYMBOL(cad_pid); - -/* - * If set, this is used for preparing the system to power off. - */ - -void (*pm_power_off_prepare)(void); - /* * Returns true if current's euid is same as p's uid or euid, * or has CAP_SYS_NICE to p's user_ns. @@ -308,261 +294,6 @@ out_unlock: return retval; } -/** - * emergency_restart - reboot the system - * - * Without shutting down any hardware or taking any locks - * reboot the system. This is called when we know we are in - * trouble so this is our best effort to reboot. This is - * safe to call in interrupt context. - */ -void emergency_restart(void) -{ - kmsg_dump(KMSG_DUMP_EMERG); - machine_emergency_restart(); -} -EXPORT_SYMBOL_GPL(emergency_restart); - -void kernel_restart_prepare(char *cmd) -{ - blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); - system_state = SYSTEM_RESTART; - usermodehelper_disable(); - device_shutdown(); -} - -/** - * register_reboot_notifier - Register function to be called at reboot time - * @nb: Info about notifier function to be called - * - * Registers a function with the list of functions - * to be called at reboot time. - * - * Currently always returns zero, as blocking_notifier_chain_register() - * always returns zero. 
- */ -int register_reboot_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_register(&reboot_notifier_list, nb); -} -EXPORT_SYMBOL(register_reboot_notifier); - -/** - * unregister_reboot_notifier - Unregister previously registered reboot notifier - * @nb: Hook to be unregistered - * - * Unregisters a previously registered reboot - * notifier function. - * - * Returns zero on success, or %-ENOENT on failure. - */ -int unregister_reboot_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); -} -EXPORT_SYMBOL(unregister_reboot_notifier); - -static void migrate_to_reboot_cpu(void) -{ - /* The boot cpu is always logical cpu 0 */ - int cpu = 0; - - cpu_hotplug_disable(); - - /* Make certain the cpu I'm about to reboot on is online */ - if (!cpu_online(cpu)) - cpu = cpumask_first(cpu_online_mask); - - /* Prevent races with other tasks migrating this task */ - current->flags |= PF_NO_SETAFFINITY; - - /* Make certain I only run on the appropriate processor */ - set_cpus_allowed_ptr(current, cpumask_of(cpu)); -} - -/** - * kernel_restart - reboot the system - * @cmd: pointer to buffer containing command to execute for restart - * or %NULL - * - * Shutdown everything and perform a clean reboot. - * This is not safe to call in interrupt context. 
- */ -void kernel_restart(char *cmd) -{ - kernel_restart_prepare(cmd); - migrate_to_reboot_cpu(); - syscore_shutdown(); - if (!cmd) - printk(KERN_EMERG "Restarting system.\n"); - else - printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); - kmsg_dump(KMSG_DUMP_RESTART); - machine_restart(cmd); -} -EXPORT_SYMBOL_GPL(kernel_restart); - -static void kernel_shutdown_prepare(enum system_states state) -{ - blocking_notifier_call_chain(&reboot_notifier_list, - (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); - system_state = state; - usermodehelper_disable(); - device_shutdown(); -} -/** - * kernel_halt - halt the system - * - * Shutdown everything and perform a clean system halt. - */ -void kernel_halt(void) -{ - kernel_shutdown_prepare(SYSTEM_HALT); - migrate_to_reboot_cpu(); - syscore_shutdown(); - printk(KERN_EMERG "System halted.\n"); - kmsg_dump(KMSG_DUMP_HALT); - machine_halt(); -} - -EXPORT_SYMBOL_GPL(kernel_halt); - -/** - * kernel_power_off - power_off the system - * - * Shutdown everything and perform a clean system power_off. - */ -void kernel_power_off(void) -{ - kernel_shutdown_prepare(SYSTEM_POWER_OFF); - if (pm_power_off_prepare) - pm_power_off_prepare(); - migrate_to_reboot_cpu(); - syscore_shutdown(); - printk(KERN_EMERG "Power down.\n"); - kmsg_dump(KMSG_DUMP_POWEROFF); - machine_power_off(); -} -EXPORT_SYMBOL_GPL(kernel_power_off); - -static DEFINE_MUTEX(reboot_mutex); - -/* - * Reboot system call: for obvious reasons only root may call it, - * and even root needs to set up some magic numbers in the registers - * so that some mistake won't make this reboot the whole machine. - * You can also set the meaning of the ctrl-alt-del-key here. - * - * reboot doesn't sync: do that yourself before calling this. 
- */ -SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, - void __user *, arg) -{ - struct pid_namespace *pid_ns = task_active_pid_ns(current); - char buffer[256]; - int ret = 0; - - /* We only trust the superuser with rebooting the system. */ - if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) - return -EPERM; - - /* For safety, we require "magic" arguments. */ - if (magic1 != LINUX_REBOOT_MAGIC1 || - (magic2 != LINUX_REBOOT_MAGIC2 && - magic2 != LINUX_REBOOT_MAGIC2A && - magic2 != LINUX_REBOOT_MAGIC2B && - magic2 != LINUX_REBOOT_MAGIC2C)) - return -EINVAL; - - /* - * If pid namespaces are enabled and the current task is in a child - * pid_namespace, the command is handled by reboot_pid_ns() which will - * call do_exit(). - */ - ret = reboot_pid_ns(pid_ns, cmd); - if (ret) - return ret; - - /* Instead of trying to make the power_off code look like - * halt when pm_power_off is not set do it the easy way. - */ - if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) - cmd = LINUX_REBOOT_CMD_HALT; - - mutex_lock(&reboot_mutex); - switch (cmd) { - case LINUX_REBOOT_CMD_RESTART: - kernel_restart(NULL); - break; - - case LINUX_REBOOT_CMD_CAD_ON: - C_A_D = 1; - break; - - case LINUX_REBOOT_CMD_CAD_OFF: - C_A_D = 0; - break; - - case LINUX_REBOOT_CMD_HALT: - kernel_halt(); - do_exit(0); - panic("cannot halt.\n"); - - case LINUX_REBOOT_CMD_POWER_OFF: - kernel_power_off(); - do_exit(0); - break; - - case LINUX_REBOOT_CMD_RESTART2: - if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { - ret = -EFAULT; - break; - } - buffer[sizeof(buffer) - 1] = '\0'; - - kernel_restart(buffer); - break; - -#ifdef CONFIG_KEXEC - case LINUX_REBOOT_CMD_KEXEC: - ret = kernel_kexec(); - break; -#endif - -#ifdef CONFIG_HIBERNATION - case LINUX_REBOOT_CMD_SW_SUSPEND: - ret = hibernate(); - break; -#endif - - default: - ret = -EINVAL; - break; - } - mutex_unlock(&reboot_mutex); - return ret; -} - -static void deferred_cad(struct work_struct *dummy) -{ - 
kernel_restart(NULL); -} - -/* - * This function gets called by ctrl-alt-del - ie the keyboard interrupt. - * As it's called within an interrupt, it may NOT sync: the only choice - * is whether to reboot at once, or just ignore the ctrl-alt-del. - */ -void ctrl_alt_del(void) -{ - static DECLARE_WORK(cad_work, deferred_cad); - - if (C_A_D) - schedule_work(&cad_work); - else - kill_cad_pid(SIGINT, 1); -} - /* * Unprivileged users may change the real gid to the effective gid * or vice versa. (BSD-style) @@ -2287,68 +2018,6 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, return err ? -EFAULT : 0; } -char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; - -static int __orderly_poweroff(bool force) -{ - char **argv; - static char *envp[] = { - "HOME=/", - "PATH=/sbin:/bin:/usr/sbin:/usr/bin", - NULL - }; - int ret; - - argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); - if (argv) { - ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - argv_free(argv); - } else { - printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", - __func__, poweroff_cmd); - ret = -ENOMEM; - } - - if (ret && force) { - printk(KERN_WARNING "Failed to start orderly shutdown: " - "forcing the issue\n"); - /* - * I guess this should try to kick off some daemon to sync and - * poweroff asap. Or not even bother syncing if we're doing an - * emergency shutdown? - */ - emergency_sync(); - kernel_power_off(); - } - - return ret; -} - -static bool poweroff_force; - -static void poweroff_work_func(struct work_struct *work) -{ - __orderly_poweroff(poweroff_force); -} - -static DECLARE_WORK(poweroff_work, poweroff_work_func); - -/** - * orderly_poweroff - Trigger an orderly system poweroff - * @force: force poweroff if command execution fails - * - * This may be called from any context to trigger a system shutdown. - * If the orderly shutdown fails, it will force an immediate shutdown. 
- */ -int orderly_poweroff(bool force) -{ - if (force) /* do not override the pending "true" */ - poweroff_force = true; - schedule_work(&poweroff_work); - return 0; -} -EXPORT_SYMBOL_GPL(orderly_poweroff); - /** * do_sysinfo - fill in sysinfo struct * @info: pointer to buffer to fill From 972ee83df88a7fd84c228a31b4f9611299898984 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:34 -0700 Subject: [PATCH 101/118] reboot: checkpatch.pl the new kernel/reboot.c file Get the new file to pass scripts/checkpatch.pl Signed-off-by: Robin Holt Cc: H. Peter Anvin Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/reboot.h | 2 +- kernel/reboot.c | 27 +++++++++++++-------------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 23b36304cd88..c6eba210e592 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -26,7 +26,7 @@ extern void machine_shutdown(void); struct pt_regs; extern void machine_crash_shutdown(struct pt_regs *); -/* +/* * Architecture independent implemenations of sys_reboot commands. 
*/ diff --git a/kernel/reboot.c b/kernel/reboot.c index 37d2636a65c2..abb6a0483716 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -4,6 +4,8 @@ * Copyright (C) 2013 Linus Torvalds */ +#define pr_fmt(fmt) "reboot: " fmt + #include #include #include @@ -114,9 +116,9 @@ void kernel_restart(char *cmd) migrate_to_reboot_cpu(); syscore_shutdown(); if (!cmd) - printk(KERN_EMERG "Restarting system.\n"); + pr_emerg("Restarting system\n"); else - printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); + pr_emerg("Restarting system with command '%s'\n", cmd); kmsg_dump(KMSG_DUMP_RESTART); machine_restart(cmd); } @@ -125,7 +127,7 @@ EXPORT_SYMBOL_GPL(kernel_restart); static void kernel_shutdown_prepare(enum system_states state) { blocking_notifier_call_chain(&reboot_notifier_list, - (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); + (state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL); system_state = state; usermodehelper_disable(); device_shutdown(); @@ -140,11 +142,10 @@ void kernel_halt(void) kernel_shutdown_prepare(SYSTEM_HALT); migrate_to_reboot_cpu(); syscore_shutdown(); - printk(KERN_EMERG "System halted.\n"); + pr_emerg("System halted\n"); kmsg_dump(KMSG_DUMP_HALT); machine_halt(); } - EXPORT_SYMBOL_GPL(kernel_halt); /** @@ -159,7 +160,7 @@ void kernel_power_off(void) pm_power_off_prepare(); migrate_to_reboot_cpu(); syscore_shutdown(); - printk(KERN_EMERG "Power down.\n"); + pr_emerg("Power down\n"); kmsg_dump(KMSG_DUMP_POWEROFF); machine_power_off(); } @@ -188,10 +189,10 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, /* For safety, we require "magic" arguments. 
*/ if (magic1 != LINUX_REBOOT_MAGIC1 || - (magic2 != LINUX_REBOOT_MAGIC2 && - magic2 != LINUX_REBOOT_MAGIC2A && + (magic2 != LINUX_REBOOT_MAGIC2 && + magic2 != LINUX_REBOOT_MAGIC2A && magic2 != LINUX_REBOOT_MAGIC2B && - magic2 != LINUX_REBOOT_MAGIC2C)) + magic2 != LINUX_REBOOT_MAGIC2C)) return -EINVAL; /* @@ -234,7 +235,8 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, break; case LINUX_REBOOT_CMD_RESTART2: - if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { + ret = strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1); + if (ret < 0) { ret = -EFAULT; break; } @@ -300,14 +302,11 @@ static int __orderly_poweroff(bool force) ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); argv_free(argv); } else { - printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", - __func__, poweroff_cmd); ret = -ENOMEM; } if (ret && force) { - printk(KERN_WARNING "Failed to start orderly shutdown: " - "forcing the issue\n"); + pr_warn("Failed to start orderly shutdown: forcing the issue\n"); /* * I guess this should try to kick off some daemon to sync and * poweroff asap. Or not even bother syncing if we're doing an From edf2b1394611fef7806d4af72179dc3ac101f275 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:35 -0700 Subject: [PATCH 102/118] reboot: x86: prepare reboot_mode for moving to generic kernel code Prepare for moving the parsing of reboot= to the generic kernel code by making reboot_mode into a more generic form. Signed-off-by: Robin Holt Cc: H. 
Peter Anvin Cc: Miguel Boton Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/reboot.c | 12 +++++++----- include/linux/reboot.h | 5 +++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 76fa1e9a2b39..f7703401d6cb 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -36,7 +36,7 @@ void (*pm_power_off)(void); EXPORT_SYMBOL(pm_power_off); static const struct desc_ptr no_idt = {}; -static int reboot_mode; +static enum reboot_mode reboot_mode; enum reboot_type reboot_type = BOOT_ACPI; int reboot_force; @@ -88,11 +88,11 @@ static int __init reboot_setup(char *str) switch (*str) { case 'w': - reboot_mode = 0x1234; + reboot_mode = REBOOT_WARM; break; case 'c': - reboot_mode = 0; + reboot_mode = REBOOT_COLD; break; #ifdef CONFIG_SMP @@ -536,6 +536,7 @@ static void native_machine_emergency_restart(void) int i; int attempt = 0; int orig_reboot_type = reboot_type; + unsigned short mode; if (reboot_emergency) emergency_vmx_disable_all(); @@ -543,7 +544,8 @@ static void native_machine_emergency_restart(void) tboot_shutdown(TB_SHUTDOWN_REBOOT); /* Tell the BIOS if we want cold or warm reboot */ - *((unsigned short *)__va(0x472)) = reboot_mode; + mode = reboot_mode == REBOOT_WARM ? 0x1234 : 0; + *((unsigned short *)__va(0x472)) = mode; for (;;) { /* Could also try the reset bit in the Hammer NB */ @@ -585,7 +587,7 @@ static void native_machine_emergency_restart(void) case BOOT_EFI: if (efi_enabled(EFI_RUNTIME_SERVICES)) - efi.reset_system(reboot_mode ? + efi.reset_system(reboot_mode == REBOOT_WARM ? 
EFI_RESET_WARM : EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); diff --git a/include/linux/reboot.h b/include/linux/reboot.h index c6eba210e592..37d56c356a06 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -10,6 +10,11 @@ #define SYS_HALT 0x0002 /* Notify of system halt */ #define SYS_POWER_OFF 0x0003 /* Notify of system power off */ +enum reboot_mode { + REBOOT_COLD = 0, + REBOOT_WARM, +}; + extern int register_reboot_notifier(struct notifier_block *); extern int unregister_reboot_notifier(struct notifier_block *); From c97a7008517abb7c805fbdd49410032a652def26 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:36 -0700 Subject: [PATCH 103/118] reboot: unicore32: prepare reboot_mode for moving to generic kernel code Prepare for moving the parsing of reboot= to the generic kernel code by making reboot_mode into a more generic form. Signed-off-by: Robin Holt Cc: Guan Xuetao Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: H. Peter Anvin Acked-by: Guan Xuetao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/unicore32/kernel/process.c | 10 +++++----- arch/unicore32/kernel/setup.h | 2 +- arch/unicore32/mm/mmu.c | 2 +- include/linux/reboot.h | 2 ++ 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index c9447691bdac..93dd035a8c33 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -51,14 +51,14 @@ void arch_cpu_idle(void) local_irq_enable(); } -static char reboot_mode = 'h'; +static enum reboot_mode reboot_mode = REBOOT_HARD; int __init reboot_setup(char *str) { - reboot_mode = str[0]; + if ('s' == str[0]) + reboot_mode = REBOOT_SOFT; return 1; } - __setup("reboot=", reboot_setup); void machine_halt(void) @@ -88,7 +88,7 @@ void machine_restart(char *cmd) * we may need it to insert some 1:1 mappings so that * soft boot works. 
*/ - setup_mm_for_reboot(reboot_mode); + setup_mm_for_reboot(); /* Clean and invalidate caches */ flush_cache_all(); @@ -102,7 +102,7 @@ void machine_restart(char *cmd) /* * Now handle reboot code. */ - if (reboot_mode == 's') { + if (reboot_mode == REBOOT_SOFT) { /* Jump into ROM at address 0xffff0000 */ cpu_reset(VECTORS_BASE); } else { diff --git a/arch/unicore32/kernel/setup.h b/arch/unicore32/kernel/setup.h index 30f749da8f73..f5c51b85ad24 100644 --- a/arch/unicore32/kernel/setup.h +++ b/arch/unicore32/kernel/setup.h @@ -22,7 +22,7 @@ extern void puv3_ps2_init(void); extern void pci_puv3_preinit(void); extern void __init puv3_init_gpio(void); -extern void setup_mm_for_reboot(char mode); +extern void setup_mm_for_reboot(void); extern char __stubs_start[], __stubs_end[]; extern char __vectors_start[], __vectors_end[]; diff --git a/arch/unicore32/mm/mmu.c b/arch/unicore32/mm/mmu.c index 43c20b40e444..4f5a532bee13 100644 --- a/arch/unicore32/mm/mmu.c +++ b/arch/unicore32/mm/mmu.c @@ -445,7 +445,7 @@ void __init paging_init(void) * the user-mode pages. This will then ensure that we have predictable * results when turning the mmu off */ -void setup_mm_for_reboot(char mode) +void setup_mm_for_reboot(void) { unsigned long base_pmdval; pgd_t *pgd; diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 37d56c356a06..ca29a6ffc08e 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -13,6 +13,8 @@ enum reboot_mode { REBOOT_COLD = 0, REBOOT_WARM, + REBOOT_HARD, + REBOOT_SOFT, }; extern int register_reboot_notifier(struct notifier_block *); From 58591942789abe1ea18e3fb1e8d8502c70060c29 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:38 -0700 Subject: [PATCH 104/118] reboot: arm: remove unused restart_mode fields from some arm subarchs These restart_mode fields are not used at all. Remove them to make moving the reboot= cmdline options to the general kernel easier. 
Signed-off-by: Robin Holt Cc: Russell King Cc: Russ Anderson Cc: Robin Holt Cc: H. Peter Anvin Cc: Guan Xuetao Acked-by: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mach-ebsa110/core.c | 1 - arch/arm/mach-pxa/mioa701.c | 1 - arch/arm/mach-pxa/spitz.c | 3 --- arch/arm/mach-pxa/tosa.c | 1 - 4 files changed, 6 deletions(-) diff --git a/arch/arm/mach-ebsa110/core.c b/arch/arm/mach-ebsa110/core.c index 8a53f346cdb3..41d2d90afaa3 100644 --- a/arch/arm/mach-ebsa110/core.c +++ b/arch/arm/mach-ebsa110/core.c @@ -321,7 +321,6 @@ MACHINE_START(EBSA110, "EBSA110") .atag_offset = 0x400, .reserve_lp0 = 1, .reserve_lp2 = 1, - .restart_mode = 's', .map_io = ebsa110_map_io, .init_early = ebsa110_init_early, .init_irq = ebsa110_init_irq, diff --git a/arch/arm/mach-pxa/mioa701.c b/arch/arm/mach-pxa/mioa701.c index 654b0ac84dea..e6b0a936c150 100644 --- a/arch/arm/mach-pxa/mioa701.c +++ b/arch/arm/mach-pxa/mioa701.c @@ -761,7 +761,6 @@ static void mioa701_machine_exit(void) MACHINE_START(MIOA701, "MIO A701") .atag_offset = 0x100, - .restart_mode = 's', .map_io = &pxa27x_map_io, .nr_irqs = PXA_NR_IRQS, .init_irq = &pxa27x_init_irq, diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c index 362726c49c70..c3c00424bb35 100644 --- a/arch/arm/mach-pxa/spitz.c +++ b/arch/arm/mach-pxa/spitz.c @@ -979,7 +979,6 @@ static void __init spitz_fixup(struct tag *tags, char **cmdline, #ifdef CONFIG_MACH_SPITZ MACHINE_START(SPITZ, "SHARP Spitz") - .restart_mode = 'g', .fixup = spitz_fixup, .map_io = pxa27x_map_io, .nr_irqs = PXA_NR_IRQS, @@ -993,7 +992,6 @@ MACHINE_END #ifdef CONFIG_MACH_BORZOI MACHINE_START(BORZOI, "SHARP Borzoi") - .restart_mode = 'g', .fixup = spitz_fixup, .map_io = pxa27x_map_io, .nr_irqs = PXA_NR_IRQS, @@ -1007,7 +1005,6 @@ MACHINE_END #ifdef CONFIG_MACH_AKITA MACHINE_START(AKITA, "SHARP Akita") - .restart_mode = 'g', .fixup = spitz_fixup, .map_io = pxa27x_map_io, .nr_irqs = PXA_NR_IRQS, diff --git a/arch/arm/mach-pxa/tosa.c 
b/arch/arm/mach-pxa/tosa.c index 3d91d2e5bf3a..a41992fea720 100644 --- a/arch/arm/mach-pxa/tosa.c +++ b/arch/arm/mach-pxa/tosa.c @@ -969,7 +969,6 @@ static void __init fixup_tosa(struct tag *tags, char **cmdline, } MACHINE_START(TOSA, "SHARP Tosa") - .restart_mode = 'g', .fixup = fixup_tosa, .map_io = pxa25x_map_io, .nr_irqs = TOSA_NR_IRQS, From 16d6d5b00ee75307bab7e4ede9452c97b28f30e2 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:39 -0700 Subject: [PATCH 105/118] reboot: arm: prepare reboot_mode for moving to generic kernel code Prepare for moving the parsing of reboot= to the generic kernel code by making reboot_mode into a more generic form. Signed-off-by: Robin Holt Cc: Russell King Cc: Russ Anderson Cc: Robin Holt Cc: H. Peter Anvin Cc: Guan Xuetao Acked-by: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/mach/arch.h | 3 ++- arch/arm/kernel/process.c | 8 ++++---- arch/arm/kernel/setup.c | 6 +++--- arch/arm/mach-footbridge/cats-hw.c | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/arm/include/asm/mach/arch.h b/arch/arm/include/asm/mach/arch.h index 75bf07910b81..fdf62b4246b0 100644 --- a/arch/arm/include/asm/mach/arch.h +++ b/arch/arm/include/asm/mach/arch.h @@ -11,6 +11,7 @@ #include #ifndef __ASSEMBLY__ +#include struct tag; struct meminfo; @@ -43,7 +44,7 @@ struct machine_desc { unsigned char reserve_lp0 :1; /* never has lp0 */ unsigned char reserve_lp1 :1; /* never has lp1 */ unsigned char reserve_lp2 :1; /* never has lp2 */ - char restart_mode; /* default restart mode */ + enum reboot_mode reboot_mode; /* default restart mode */ struct smp_operations *smp; /* SMP operations */ bool (*smp_init)(void); void (*fixup)(struct tag *, char **, diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 7f1efcd4a6e9..2d544062fd7d 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -175,14 +175,14 @@ void arch_cpu_idle(void) 
default_idle(); } -static char reboot_mode = 'h'; +enum reboot_mode reboot_mode = REBOOT_HARD; -int __init reboot_setup(char *str) +static int __init reboot_setup(char *str) { - reboot_mode = str[0]; + if ('s' == str[0]) + reboot_mode = REBOOT_SOFT; return 1; } - __setup("reboot=", reboot_setup); /* diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 9b653278c9e8..63af9a7ae512 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -74,7 +74,7 @@ __setup("fpe=", fpe_setup); extern void paging_init(struct machine_desc *desc); extern void sanity_check_meminfo(void); -extern void reboot_setup(char *str); +extern enum reboot_mode reboot_mode; extern void setup_dma_zone(struct machine_desc *desc); unsigned int processor_id; @@ -861,8 +861,8 @@ void __init setup_arch(char **cmdline_p) setup_dma_zone(mdesc); - if (mdesc->restart_mode) - reboot_setup(&mdesc->restart_mode); + if (mdesc->reboot_mode != REBOOT_HARD) + reboot_mode = mdesc->reboot_mode; init_mm.start_code = (unsigned long) _text; init_mm.end_code = (unsigned long) _etext; diff --git a/arch/arm/mach-footbridge/cats-hw.c b/arch/arm/mach-footbridge/cats-hw.c index 6987a09ec219..9669cc0b6318 100644 --- a/arch/arm/mach-footbridge/cats-hw.c +++ b/arch/arm/mach-footbridge/cats-hw.c @@ -86,7 +86,7 @@ fixup_cats(struct tag *tags, char **cmdline, struct meminfo *mi) MACHINE_START(CATS, "Chalice-CATS") /* Maintainer: Philip Blundell */ .atag_offset = 0x100, - .restart_mode = 's', + .reboot_mode = REBOOT_SOFT, .fixup = fixup_cats, .map_io = footbridge_map_io, .init_irq = footbridge_init_irq, From 7b6d864b48d95e6ea1df7df64475b9cb9616dcf9 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:40 -0700 Subject: [PATCH 106/118] reboot: arm: change reboot_mode to use enum reboot_mode Preparing to move the parsing of reboot= to generic kernel code forces the change in reboot_mode handling to use the enum. 
[akpm@linux-foundation.org: fix arch/arm/mach-socfpga/socfpga.c] Signed-off-by: Robin Holt Cc: Russell King Cc: Russ Anderson Cc: Robin Holt Cc: H. Peter Anvin Cc: Guan Xuetao Acked-by: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/hardware/iop3xx.h | 3 ++- arch/arm/include/asm/mach/arch.h | 2 +- arch/arm/include/asm/system_misc.h | 3 ++- arch/arm/kernel/process.c | 5 +++-- arch/arm/mach-at91/at91rm9200.c | 3 ++- arch/arm/mach-at91/generic.h | 5 +++-- arch/arm/mach-bcm2835/bcm2835.c | 4 ++-- arch/arm/mach-clps711x/common.c | 2 +- arch/arm/mach-clps711x/common.h | 4 +++- arch/arm/mach-cns3xxx/core.h | 4 +++- arch/arm/mach-cns3xxx/pm.c | 2 +- arch/arm/mach-davinci/devices-da8xx.c | 3 ++- arch/arm/mach-davinci/devices.c | 3 ++- arch/arm/mach-davinci/include/mach/common.h | 3 ++- arch/arm/mach-davinci/include/mach/da8xx.h | 3 ++- arch/arm/mach-davinci/include/mach/tnetv107x.h | 3 ++- arch/arm/mach-davinci/tnetv107x.c | 3 ++- arch/arm/mach-dove/common.c | 2 +- arch/arm/mach-dove/common.h | 4 +++- arch/arm/mach-ebsa110/core.c | 2 +- arch/arm/mach-ep93xx/core.c | 3 ++- arch/arm/mach-ep93xx/include/mach/platform.h | 4 +++- arch/arm/mach-exynos/common.c | 4 ++-- arch/arm/mach-exynos/common.h | 5 +++-- arch/arm/mach-footbridge/common.c | 4 ++-- arch/arm/mach-footbridge/common.h | 3 ++- arch/arm/mach-footbridge/netwinder-hw.c | 4 ++-- arch/arm/mach-highbank/core.h | 4 +++- arch/arm/mach-highbank/system.c | 5 +++-- arch/arm/mach-imx/common.h | 4 +++- arch/arm/mach-imx/mach-imx6q.c | 3 ++- arch/arm/mach-imx/system.c | 2 +- arch/arm/mach-integrator/common.h | 3 ++- arch/arm/mach-integrator/core.c | 2 +- arch/arm/mach-iop13xx/include/mach/iop13xx.h | 5 ++++- arch/arm/mach-iop13xx/setup.c | 2 +- arch/arm/mach-iop32x/n2100.c | 2 +- arch/arm/mach-ixp4xx/common.c | 4 ++-- arch/arm/mach-ixp4xx/dsmg600-setup.c | 1 + arch/arm/mach-ixp4xx/include/mach/platform.h | 4 +++- arch/arm/mach-kirkwood/common.c | 3 ++- 
arch/arm/mach-kirkwood/common.h | 4 +++- arch/arm/mach-ks8695/generic.h | 2 +- arch/arm/mach-ks8695/time.c | 4 ++-- arch/arm/mach-lpc32xx/common.c | 6 +++--- arch/arm/mach-lpc32xx/common.h | 3 ++- arch/arm/mach-mmp/common.c | 2 +- arch/arm/mach-mmp/common.h | 3 ++- arch/arm/mach-mmp/include/mach/pxa168.h | 4 +++- arch/arm/mach-mmp/pxa168.c | 2 +- arch/arm/mach-mv78xx0/common.c | 2 +- arch/arm/mach-mv78xx0/common.h | 4 +++- arch/arm/mach-mvebu/common.h | 4 +++- arch/arm/mach-mvebu/system-controller.c | 3 ++- arch/arm/mach-mxs/mach-mxs.c | 3 ++- arch/arm/mach-netx/generic.c | 3 ++- arch/arm/mach-netx/generic.h | 4 +++- arch/arm/mach-nomadik/cpu-8815.c | 2 +- arch/arm/mach-omap1/board-voiceblue.c | 3 ++- arch/arm/mach-omap1/common.h | 3 ++- arch/arm/mach-omap1/reset.c | 3 ++- arch/arm/mach-omap2/am33xx-restart.c | 3 ++- arch/arm/mach-omap2/common.h | 17 +++++++++-------- arch/arm/mach-omap2/omap2-restart.c | 2 +- arch/arm/mach-omap2/omap3-restart.c | 3 ++- arch/arm/mach-omap2/omap4-common.c | 1 + arch/arm/mach-omap2/omap4-restart.c | 3 ++- arch/arm/mach-orion5x/common.c | 2 +- arch/arm/mach-orion5x/common.h | 4 +++- arch/arm/mach-orion5x/ls-chl-setup.c | 2 +- arch/arm/mach-orion5x/ls_hgl-setup.c | 2 +- arch/arm/mach-orion5x/lsmini-setup.c | 2 +- arch/arm/mach-picoxcell/common.c | 3 ++- arch/arm/mach-prima2/common.h | 4 +++- arch/arm/mach-prima2/rstc.c | 3 ++- arch/arm/mach-pxa/corgi.c | 6 +++--- arch/arm/mach-pxa/generic.h | 4 +++- arch/arm/mach-pxa/mioa701.c | 7 ++++--- arch/arm/mach-pxa/poodle.c | 2 +- arch/arm/mach-pxa/reset.c | 8 ++++---- arch/arm/mach-pxa/spitz.c | 5 +++-- arch/arm/mach-pxa/tosa.c | 5 +++-- arch/arm/mach-realview/realview_eb.c | 3 ++- arch/arm/mach-realview/realview_pb1176.c | 3 ++- arch/arm/mach-realview/realview_pb11mp.c | 3 ++- arch/arm/mach-realview/realview_pba8.c | 3 ++- arch/arm/mach-realview/realview_pbx.c | 3 ++- arch/arm/mach-rpc/riscpc.c | 3 ++- arch/arm/mach-s3c24xx/common.h | 12 +++++++----- arch/arm/mach-s3c24xx/s3c2410.c | 5 +++-- 
arch/arm/mach-s3c24xx/s3c2412.c | 5 +++-- arch/arm/mach-s3c24xx/s3c2416.c | 5 +++-- arch/arm/mach-s3c24xx/s3c2443.c | 5 +++-- arch/arm/mach-s3c24xx/s3c244x.c | 5 +++-- arch/arm/mach-s3c64xx/common.c | 5 +++-- arch/arm/mach-s3c64xx/common.h | 4 +++- arch/arm/mach-s5p64x0/common.c | 5 +++-- arch/arm/mach-s5p64x0/common.h | 4 +++- arch/arm/mach-s5pc100/common.c | 5 +++-- arch/arm/mach-s5pc100/common.h | 4 +++- arch/arm/mach-s5pv210/common.c | 2 +- arch/arm/mach-s5pv210/common.h | 4 +++- arch/arm/mach-sa1100/generic.c | 5 +++-- arch/arm/mach-sa1100/generic.h | 3 ++- arch/arm/mach-shark/core.c | 3 ++- arch/arm/mach-shmobile/board-armadillo800eva.c | 3 ++- arch/arm/mach-shmobile/board-kzm9g.c | 3 ++- arch/arm/mach-socfpga/socfpga.c | 5 +++-- arch/arm/mach-spear/generic.h | 4 +++- arch/arm/mach-spear/restart.c | 5 +++-- arch/arm/mach-sunxi/sunxi.c | 3 ++- arch/arm/mach-tegra/board.h | 3 ++- arch/arm/mach-tegra/common.c | 3 ++- arch/arm/mach-u300/core.c | 6 +++--- arch/arm/mach-versatile/core.c | 3 ++- arch/arm/mach-versatile/core.h | 3 ++- arch/arm/mach-vt8500/vt8500.c | 3 ++- arch/arm/mach-w90x900/cpu.c | 4 ++-- arch/arm/mach-w90x900/nuc9xx.h | 5 ++++- arch/arm/plat-iop/gpio.c | 1 + arch/arm/plat-iop/restart.c | 2 +- drivers/power/reset/restart-poweroff.c | 3 ++- drivers/power/reset/vexpress-poweroff.c | 2 +- include/linux/reboot.h | 1 + include/linux/vexpress.h | 1 + 125 files changed, 282 insertions(+), 166 deletions(-) diff --git a/arch/arm/include/asm/hardware/iop3xx.h b/arch/arm/include/asm/hardware/iop3xx.h index ed94b1a366ae..423744bf18eb 100644 --- a/arch/arm/include/asm/hardware/iop3xx.h +++ b/arch/arm/include/asm/hardware/iop3xx.h @@ -223,11 +223,12 @@ extern int iop3xx_get_init_atu(void); #ifndef __ASSEMBLY__ #include +#include void iop3xx_map_io(void); void iop_init_cp6_handler(void); void iop_init_time(unsigned long tickrate); -void iop3xx_restart(char, const char *); +void iop3xx_restart(enum reboot_mode, const char *); static inline u32 read_tmr0(void) { 
diff --git a/arch/arm/include/asm/mach/arch.h b/arch/arm/include/asm/mach/arch.h index fdf62b4246b0..441efc491b50 100644 --- a/arch/arm/include/asm/mach/arch.h +++ b/arch/arm/include/asm/mach/arch.h @@ -59,7 +59,7 @@ struct machine_desc { #ifdef CONFIG_MULTI_IRQ_HANDLER void (*handle_irq)(struct pt_regs *); #endif - void (*restart)(char, const char *); + void (*restart)(enum reboot_mode, const char *); }; /* diff --git a/arch/arm/include/asm/system_misc.h b/arch/arm/include/asm/system_misc.h index 21a23e378bbe..a3d61ad984af 100644 --- a/arch/arm/include/asm/system_misc.h +++ b/arch/arm/include/asm/system_misc.h @@ -6,11 +6,12 @@ #include #include #include +#include extern void cpu_init(void); void soft_restart(unsigned long); -extern void (*arm_pm_restart)(char str, const char *cmd); +extern void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd); extern void (*arm_pm_idle)(void); #define UDBG_UNDEFINED (1 << 0) diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 2d544062fd7d..b7fdd864c839 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -113,7 +114,7 @@ void soft_restart(unsigned long addr) BUG(); } -static void null_restart(char mode, const char *cmd) +static void null_restart(enum reboot_mode reboot_mode, const char *cmd) { } @@ -123,7 +124,7 @@ static void null_restart(char mode, const char *cmd) void (*pm_power_off)(void); EXPORT_SYMBOL(pm_power_off); -void (*arm_pm_restart)(char str, const char *cmd) = null_restart; +void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd) = null_restart; EXPORT_SYMBOL_GPL(arm_pm_restart); /* diff --git a/arch/arm/mach-at91/at91rm9200.c b/arch/arm/mach-at91/at91rm9200.c index 9eb574397ee1..4aad93d54d6f 100644 --- a/arch/arm/mach-at91/at91rm9200.c +++ b/arch/arm/mach-at91/at91rm9200.c @@ -11,6 +11,7 @@ */ #include +#include #include #include @@ -304,7 +305,7 @@ static void 
at91rm9200_idle(void) at91_pmc_write(AT91_PMC_SCDR, AT91_PMC_PCK); } -static void at91rm9200_restart(char mode, const char *cmd) +static void at91rm9200_restart(enum reboot_mode reboot_mode, const char *cmd) { /* * Perform a hardware reset with the use of the Watchdog timer. diff --git a/arch/arm/mach-at91/generic.h b/arch/arm/mach-at91/generic.h index f6de36aefe85..dc6e2f5f804d 100644 --- a/arch/arm/mach-at91/generic.h +++ b/arch/arm/mach-at91/generic.h @@ -10,6 +10,7 @@ #include #include +#include /* Map io */ extern void __init at91_map_io(void); @@ -60,8 +61,8 @@ extern void at91sam9_idle(void); /* reset */ extern void at91_ioremap_rstc(u32 base_addr); -extern void at91sam9_alt_restart(char, const char *); -extern void at91sam9g45_restart(char, const char *); +extern void at91sam9_alt_restart(enum reboot_mode, const char *); +extern void at91sam9g45_restart(enum reboot_mode, const char *); /* shutdown */ extern void at91_ioremap_shdwc(u32 base_addr); diff --git a/arch/arm/mach-bcm2835/bcm2835.c b/arch/arm/mach-bcm2835/bcm2835.c index 740fa9ebe249..40686d7ef500 100644 --- a/arch/arm/mach-bcm2835/bcm2835.c +++ b/arch/arm/mach-bcm2835/bcm2835.c @@ -53,7 +53,7 @@ static void bcm2835_setup_restart(void) WARN(!wdt_regs, "failed to remap watchdog regs"); } -static void bcm2835_restart(char mode, const char *cmd) +static void bcm2835_restart(enum reboot_mode mode, const char *cmd) { u32 val; @@ -91,7 +91,7 @@ static void bcm2835_power_off(void) writel_relaxed(val, wdt_regs + PM_RSTS); /* Continue with normal reset mechanism */ - bcm2835_restart(0, ""); + bcm2835_restart(REBOOT_HARD, ""); } static struct map_desc io_map __initdata = { diff --git a/arch/arm/mach-clps711x/common.c b/arch/arm/mach-clps711x/common.c index f6d1746366d4..4ca2f3ca2de4 100644 --- a/arch/arm/mach-clps711x/common.c +++ b/arch/arm/mach-clps711x/common.c @@ -384,7 +384,7 @@ void __init clps711x_timer_init(void) setup_irq(IRQ_TC2OI, &clps711x_timer_irq); } -void clps711x_restart(char mode, const 
char *cmd) +void clps711x_restart(enum reboot_mode mode, const char *cmd) { soft_restart(0); } diff --git a/arch/arm/mach-clps711x/common.h b/arch/arm/mach-clps711x/common.h index 2a22f4c6cc75..9a6767bfdc47 100644 --- a/arch/arm/mach-clps711x/common.h +++ b/arch/arm/mach-clps711x/common.h @@ -4,6 +4,8 @@ * Common bits. */ +#include + #define CLPS711X_NR_IRQS (33) #define CLPS711X_NR_GPIO (4 * 8 + 3) #define CLPS711X_GPIO(prt, bit) ((prt) * 8 + (bit)) @@ -12,5 +14,5 @@ extern void clps711x_map_io(void); extern void clps711x_init_irq(void); extern void clps711x_timer_init(void); extern void clps711x_handle_irq(struct pt_regs *regs); -extern void clps711x_restart(char mode, const char *cmd); +extern void clps711x_restart(enum reboot_mode mode, const char *cmd); extern void clps711x_init_early(void); diff --git a/arch/arm/mach-cns3xxx/core.h b/arch/arm/mach-cns3xxx/core.h index b23b17b4da10..5218b6198dc2 100644 --- a/arch/arm/mach-cns3xxx/core.h +++ b/arch/arm/mach-cns3xxx/core.h @@ -11,6 +11,8 @@ #ifndef __CNS3XXX_CORE_H #define __CNS3XXX_CORE_H +#include + extern void cns3xxx_timer_init(void); #ifdef CONFIG_CACHE_L2X0 @@ -22,6 +24,6 @@ static inline void cns3xxx_l2x0_init(void) {} void __init cns3xxx_map_io(void); void __init cns3xxx_init_irq(void); void cns3xxx_power_off(void); -void cns3xxx_restart(char, const char *); +void cns3xxx_restart(enum reboot_mode, const char *); #endif /* __CNS3XXX_CORE_H */ diff --git a/arch/arm/mach-cns3xxx/pm.c b/arch/arm/mach-cns3xxx/pm.c index 79e3d47aad65..fb38c726e987 100644 --- a/arch/arm/mach-cns3xxx/pm.c +++ b/arch/arm/mach-cns3xxx/pm.c @@ -89,7 +89,7 @@ void cns3xxx_pwr_soft_rst(unsigned int block) } EXPORT_SYMBOL(cns3xxx_pwr_soft_rst); -void cns3xxx_restart(char mode, const char *cmd) +void cns3xxx_restart(enum reboot_mode mode, const char *cmd) { /* * To reset, we hit the on-board reset register diff --git a/arch/arm/mach-davinci/devices-da8xx.c b/arch/arm/mach-davinci/devices-da8xx.c index eb254fe861ac..71a46a348761 100644 
--- a/arch/arm/mach-davinci/devices-da8xx.c +++ b/arch/arm/mach-davinci/devices-da8xx.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -366,7 +367,7 @@ static struct platform_device da8xx_wdt_device = { .resource = da8xx_watchdog_resources, }; -void da8xx_restart(char mode, const char *cmd) +void da8xx_restart(enum reboot_mode mode, const char *cmd) { struct device *dev; diff --git a/arch/arm/mach-davinci/devices.c b/arch/arm/mach-davinci/devices.c index 90b83d00fe2b..111573c0aad1 100644 --- a/arch/arm/mach-davinci/devices.c +++ b/arch/arm/mach-davinci/devices.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -307,7 +308,7 @@ struct platform_device davinci_wdt_device = { .resource = wdt_resources, }; -void davinci_restart(char mode, const char *cmd) +void davinci_restart(enum reboot_mode mode, const char *cmd) { davinci_watchdog_reset(&davinci_wdt_device); } diff --git a/arch/arm/mach-davinci/include/mach/common.h b/arch/arm/mach-davinci/include/mach/common.h index b124b77c90c5..cce316b92c06 100644 --- a/arch/arm/mach-davinci/include/mach/common.h +++ b/arch/arm/mach-davinci/include/mach/common.h @@ -14,6 +14,7 @@ #include #include +#include extern void davinci_timer_init(void); @@ -81,7 +82,7 @@ extern struct davinci_soc_info davinci_soc_info; extern void davinci_common_init(struct davinci_soc_info *soc_info); extern void davinci_init_ide(void); -void davinci_restart(char mode, const char *cmd); +void davinci_restart(enum reboot_mode mode, const char *cmd); void davinci_init_late(void); #ifdef CONFIG_DAVINCI_RESET_CLOCKS diff --git a/arch/arm/mach-davinci/include/mach/da8xx.h b/arch/arm/mach-davinci/include/mach/da8xx.h index 3c797e2272f8..7b41a5e9bc31 100644 --- a/arch/arm/mach-davinci/include/mach/da8xx.h +++ b/arch/arm/mach-davinci/include/mach/da8xx.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -106,7 +107,7 @@ int da850_register_vpif_display (struct vpif_display_config 
*display_config); int da850_register_vpif_capture (struct vpif_capture_config *capture_config); -void da8xx_restart(char mode, const char *cmd); +void da8xx_restart(enum reboot_mode mode, const char *cmd); void da8xx_rproc_reserve_cma(void); int da8xx_register_rproc(void); diff --git a/arch/arm/mach-davinci/include/mach/tnetv107x.h b/arch/arm/mach-davinci/include/mach/tnetv107x.h index 366e975effa8..16314c64f755 100644 --- a/arch/arm/mach-davinci/include/mach/tnetv107x.h +++ b/arch/arm/mach-davinci/include/mach/tnetv107x.h @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -54,7 +55,7 @@ extern struct platform_device tnetv107x_serial_device; extern void tnetv107x_init(void); extern void tnetv107x_devices_init(struct tnetv107x_device_info *); extern void tnetv107x_irq_init(void); -void tnetv107x_restart(char mode, const char *cmd); +void tnetv107x_restart(enum reboot_mode mode, const char *cmd); #endif diff --git a/arch/arm/mach-davinci/tnetv107x.c b/arch/arm/mach-davinci/tnetv107x.c index 3b2a70d43efa..4545667ecd3c 100644 --- a/arch/arm/mach-davinci/tnetv107x.c +++ b/arch/arm/mach-davinci/tnetv107x.c @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -730,7 +731,7 @@ static void tnetv107x_watchdog_reset(struct platform_device *pdev) __raw_writel(1, &regs->kick); } -void tnetv107x_restart(char mode, const char *cmd) +void tnetv107x_restart(enum reboot_mode mode, const char *cmd) { tnetv107x_watchdog_reset(&tnetv107x_wdt_device); } diff --git a/arch/arm/mach-dove/common.c b/arch/arm/mach-dove/common.c index 2a9443d04d92..00247c771313 100644 --- a/arch/arm/mach-dove/common.c +++ b/arch/arm/mach-dove/common.c @@ -381,7 +381,7 @@ void __init dove_init(void) dove_xor1_init(); } -void dove_restart(char mode, const char *cmd) +void dove_restart(enum reboot_mode mode, const char *cmd) { /* * Enable soft reset to assert RSTOUTn. 
diff --git a/arch/arm/mach-dove/common.h b/arch/arm/mach-dove/common.h index e86347928b67..1d725224d146 100644 --- a/arch/arm/mach-dove/common.h +++ b/arch/arm/mach-dove/common.h @@ -11,6 +11,8 @@ #ifndef __ARCH_DOVE_COMMON_H #define __ARCH_DOVE_COMMON_H +#include + struct mv643xx_eth_platform_data; struct mv_sata_platform_data; @@ -42,6 +44,6 @@ void dove_spi1_init(void); void dove_i2c_init(void); void dove_sdio0_init(void); void dove_sdio1_init(void); -void dove_restart(char, const char *); +void dove_restart(enum reboot_mode, const char *); #endif diff --git a/arch/arm/mach-ebsa110/core.c b/arch/arm/mach-ebsa110/core.c index 41d2d90afaa3..68ac934d4565 100644 --- a/arch/arm/mach-ebsa110/core.c +++ b/arch/arm/mach-ebsa110/core.c @@ -311,7 +311,7 @@ static int __init ebsa110_init(void) arch_initcall(ebsa110_init); -static void ebsa110_restart(char mode, const char *cmd) +static void ebsa110_restart(enum reboot_mode mode, const char *cmd) { soft_restart(0x80000000); } diff --git a/arch/arm/mach-ep93xx/core.c b/arch/arm/mach-ep93xx/core.c index c49ed3dc1aea..df8612fbbc9c 100644 --- a/arch/arm/mach-ep93xx/core.c +++ b/arch/arm/mach-ep93xx/core.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -921,7 +922,7 @@ void __init ep93xx_init_devices(void) gpio_led_register_device(-1, &ep93xx_led_data); } -void ep93xx_restart(char mode, const char *cmd) +void ep93xx_restart(enum reboot_mode mode, const char *cmd) { /* * Set then clear the SWRST bit to initiate a software reset diff --git a/arch/arm/mach-ep93xx/include/mach/platform.h b/arch/arm/mach-ep93xx/include/mach/platform.h index a14e1b37beff..e256e0baec2e 100644 --- a/arch/arm/mach-ep93xx/include/mach/platform.h +++ b/arch/arm/mach-ep93xx/include/mach/platform.h @@ -4,6 +4,8 @@ #ifndef __ASSEMBLY__ +#include + struct i2c_gpio_platform_data; struct i2c_board_info; struct spi_board_info; @@ -55,7 +57,7 @@ void ep93xx_ide_release_gpio(struct platform_device *pdev); void ep93xx_init_devices(void); 
extern void ep93xx_timer_init(void); -void ep93xx_restart(char, const char *); +void ep93xx_restart(enum reboot_mode, const char *); void ep93xx_init_late(void); #ifdef CONFIG_CRUNCH diff --git a/arch/arm/mach-exynos/common.c b/arch/arm/mach-exynos/common.c index 2c655db4b78e..164685bd25c8 100644 --- a/arch/arm/mach-exynos/common.c +++ b/arch/arm/mach-exynos/common.c @@ -285,12 +285,12 @@ static struct map_desc exynos5440_iodesc0[] __initdata = { }, }; -void exynos4_restart(char mode, const char *cmd) +void exynos4_restart(enum reboot_mode mode, const char *cmd) { __raw_writel(0x1, S5P_SWRESET); } -void exynos5_restart(char mode, const char *cmd) +void exynos5_restart(enum reboot_mode mode, const char *cmd) { struct device_node *np; u32 val; diff --git a/arch/arm/mach-exynos/common.h b/arch/arm/mach-exynos/common.h index 38d45fd23be4..3e156bcddcb4 100644 --- a/arch/arm/mach-exynos/common.h +++ b/arch/arm/mach-exynos/common.h @@ -12,6 +12,7 @@ #ifndef __ARCH_ARM_MACH_EXYNOS_COMMON_H #define __ARCH_ARM_MACH_EXYNOS_COMMON_H +#include #include void mct_init(void __iomem *base, int irq_g0, int irq_l0, int irq_l1); @@ -20,8 +21,8 @@ extern unsigned long xxti_f, xusbxti_f; struct map_desc; void exynos_init_io(void); -void exynos4_restart(char mode, const char *cmd); -void exynos5_restart(char mode, const char *cmd); +void exynos4_restart(enum reboot_mode mode, const char *cmd); +void exynos5_restart(enum reboot_mode mode, const char *cmd); void exynos_init_late(void); /* ToDo: remove these after migrating legacy exynos4 platforms to dt */ diff --git a/arch/arm/mach-footbridge/common.c b/arch/arm/mach-footbridge/common.c index a42b369bc439..2739ca2c1334 100644 --- a/arch/arm/mach-footbridge/common.c +++ b/arch/arm/mach-footbridge/common.c @@ -198,9 +198,9 @@ void __init footbridge_map_io(void) } } -void footbridge_restart(char mode, const char *cmd) +void footbridge_restart(enum reboot_mode mode, const char *cmd) { - if (mode == 's') { + if (mode == REBOOT_SOFT) { /* Jump 
into the ROM */ soft_restart(0x41000000); } else { diff --git a/arch/arm/mach-footbridge/common.h b/arch/arm/mach-footbridge/common.h index a846e50a07b8..56607b3a773e 100644 --- a/arch/arm/mach-footbridge/common.h +++ b/arch/arm/mach-footbridge/common.h @@ -1,3 +1,4 @@ +#include extern void footbridge_timer_init(void); extern void isa_timer_init(void); @@ -8,4 +9,4 @@ extern void footbridge_map_io(void); extern void footbridge_init_irq(void); extern void isa_init_irq(unsigned int irq); -extern void footbridge_restart(char, const char *); +extern void footbridge_restart(enum reboot_mode, const char *); diff --git a/arch/arm/mach-footbridge/netwinder-hw.c b/arch/arm/mach-footbridge/netwinder-hw.c index 90ea23fdce4c..1fd2cf097e30 100644 --- a/arch/arm/mach-footbridge/netwinder-hw.c +++ b/arch/arm/mach-footbridge/netwinder-hw.c @@ -634,9 +634,9 @@ fixup_netwinder(struct tag *tags, char **cmdline, struct meminfo *mi) #endif } -static void netwinder_restart(char mode, const char *cmd) +static void netwinder_restart(enum reboot_mode mode, const char *cmd) { - if (mode == 's') { + if (mode == REBOOT_SOFT) { /* Jump into the ROM */ soft_restart(0x41000000); } else { diff --git a/arch/arm/mach-highbank/core.h b/arch/arm/mach-highbank/core.h index 3f65206a9b92..aea1ec5ab6f8 100644 --- a/arch/arm/mach-highbank/core.h +++ b/arch/arm/mach-highbank/core.h @@ -1,8 +1,10 @@ #ifndef __HIGHBANK_CORE_H #define __HIGHBANK_CORE_H +#include + extern void highbank_set_cpu_jump(int cpu, void *jump_addr); -extern void highbank_restart(char, const char *); +extern void highbank_restart(enum reboot_mode, const char *); extern void __iomem *scu_base_addr; #ifdef CONFIG_PM_SLEEP diff --git a/arch/arm/mach-highbank/system.c b/arch/arm/mach-highbank/system.c index 37d8384dcf19..2df5870b7583 100644 --- a/arch/arm/mach-highbank/system.c +++ b/arch/arm/mach-highbank/system.c @@ -15,13 +15,14 @@ */ #include #include +#include #include "core.h" #include "sysregs.h" -void highbank_restart(char mode, 
const char *cmd) +void highbank_restart(enum reboot_mode mode, const char *cmd) { - if (mode == 'h') + if (mode == REBOOT_HARD) highbank_set_pwr_hard_reset(); else highbank_set_pwr_soft_reset(); diff --git a/arch/arm/mach-imx/common.h b/arch/arm/mach-imx/common.h index ee78847abf47..cb6c838b63ed 100644 --- a/arch/arm/mach-imx/common.h +++ b/arch/arm/mach-imx/common.h @@ -11,6 +11,8 @@ #ifndef __ASM_ARCH_MXC_COMMON_H__ #define __ASM_ARCH_MXC_COMMON_H__ +#include + struct platform_device; struct pt_regs; struct clk; @@ -71,7 +73,7 @@ extern int mx53_clocks_init_dt(void); extern struct platform_device *mxc_register_gpio(char *name, int id, resource_size_t iobase, resource_size_t iosize, int irq, int irq_high); extern void mxc_set_cpu_type(unsigned int type); -extern void mxc_restart(char, const char *); +extern void mxc_restart(enum reboot_mode, const char *); extern void mxc_arch_reset_init(void __iomem *); extern void mxc_arch_reset_init_dt(void); extern int mx53_revision(void); diff --git a/arch/arm/mach-imx/mach-imx6q.c b/arch/arm/mach-imx/mach-imx6q.c index f5965220a4d8..7be13f8e69a0 100644 --- a/arch/arm/mach-imx/mach-imx6q.c +++ b/arch/arm/mach-imx/mach-imx6q.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -67,7 +68,7 @@ static void __init imx6q_init_revision(void) mxc_set_cpu_type(rev >> 16 & 0xff); } -static void imx6q_restart(char mode, const char *cmd) +static void imx6q_restart(enum reboot_mode mode, const char *cmd) { struct device_node *np; void __iomem *wdog_base; diff --git a/arch/arm/mach-imx/system.c b/arch/arm/mach-imx/system.c index 7cdc79a9657c..6fe81bb4d3c9 100644 --- a/arch/arm/mach-imx/system.c +++ b/arch/arm/mach-imx/system.c @@ -37,7 +37,7 @@ static struct clk *wdog_clk; /* * Reset the system. It is called by machine_restart(). 
*/ -void mxc_restart(char mode, const char *cmd) +void mxc_restart(enum reboot_mode mode, const char *cmd) { unsigned int wcr_enable; diff --git a/arch/arm/mach-integrator/common.h b/arch/arm/mach-integrator/common.h index 72516658be1e..ad0ac5547b2c 100644 --- a/arch/arm/mach-integrator/common.h +++ b/arch/arm/mach-integrator/common.h @@ -1,7 +1,8 @@ +#include #include extern struct amba_pl010_data ap_uart_data; void integrator_init_early(void); int integrator_init(bool is_cp); void integrator_reserve(void); -void integrator_restart(char, const char *); +void integrator_restart(enum reboot_mode, const char *); void integrator_init_sysfs(struct device *parent, u32 id); diff --git a/arch/arm/mach-integrator/core.c b/arch/arm/mach-integrator/core.c index 81461d218717..4cdfd7365925 100644 --- a/arch/arm/mach-integrator/core.c +++ b/arch/arm/mach-integrator/core.c @@ -124,7 +124,7 @@ void __init integrator_reserve(void) /* * To reset, we hit the on-board reset register in the system FPGA */ -void integrator_restart(char mode, const char *cmd) +void integrator_restart(enum reboot_mode mode, const char *cmd) { cm_control(CM_CTRL_RESET, CM_CTRL_RESET); } diff --git a/arch/arm/mach-iop13xx/include/mach/iop13xx.h b/arch/arm/mach-iop13xx/include/mach/iop13xx.h index 7480f58267aa..17b40279e0a4 100644 --- a/arch/arm/mach-iop13xx/include/mach/iop13xx.h +++ b/arch/arm/mach-iop13xx/include/mach/iop13xx.h @@ -2,6 +2,9 @@ #define _IOP13XX_HW_H_ #ifndef __ASSEMBLY__ + +#include + /* The ATU offsets can change based on the strapping */ extern u32 iop13xx_atux_pmmr_offset; extern u32 iop13xx_atue_pmmr_offset; @@ -11,7 +14,7 @@ void iop13xx_map_io(void); void iop13xx_platform_init(void); void iop13xx_add_tpmi_devices(void); void iop13xx_init_irq(void); -void iop13xx_restart(char, const char *); +void iop13xx_restart(enum reboot_mode, const char *); /* CPUID CP6 R0 Page 0 */ static inline int iop13xx_cpu_id(void) diff --git a/arch/arm/mach-iop13xx/setup.c b/arch/arm/mach-iop13xx/setup.c 
index 1c5bd7637b05..96e6c7a6793b 100644 --- a/arch/arm/mach-iop13xx/setup.c +++ b/arch/arm/mach-iop13xx/setup.c @@ -594,7 +594,7 @@ __setup("iop13xx_init_adma", iop13xx_init_adma_setup); __setup("iop13xx_init_uart", iop13xx_init_uart_setup); __setup("iop13xx_init_i2c", iop13xx_init_i2c_setup); -void iop13xx_restart(char mode, const char *cmd) +void iop13xx_restart(enum reboot_mode mode, const char *cmd) { /* * Reset the internal bus (warning both cores are reset) diff --git a/arch/arm/mach-iop32x/n2100.c b/arch/arm/mach-iop32x/n2100.c index ea0984a7449e..069144300b77 100644 --- a/arch/arm/mach-iop32x/n2100.c +++ b/arch/arm/mach-iop32x/n2100.c @@ -286,7 +286,7 @@ static void n2100_power_off(void) ; } -static void n2100_restart(char mode, const char *cmd) +static void n2100_restart(enum reboot_mode mode, const char *cmd) { gpio_line_set(N2100_HARDWARE_RESET, GPIO_LOW); gpio_line_config(N2100_HARDWARE_RESET, GPIO_OUT); diff --git a/arch/arm/mach-ixp4xx/common.c b/arch/arm/mach-ixp4xx/common.c index 1f6c1fb353ad..5327decde5a0 100644 --- a/arch/arm/mach-ixp4xx/common.c +++ b/arch/arm/mach-ixp4xx/common.c @@ -531,9 +531,9 @@ static void __init ixp4xx_clockevent_init(void) 0xf, 0xfffffffe); } -void ixp4xx_restart(char mode, const char *cmd) +void ixp4xx_restart(enum reboot_mode mode, const char *cmd) { - if ( 1 && mode == 's') { + if ( 1 && mode == REBOOT_SOFT) { /* Jump into ROM at address 0 */ soft_restart(0); } else { diff --git a/arch/arm/mach-ixp4xx/dsmg600-setup.c b/arch/arm/mach-ixp4xx/dsmg600-setup.c index 5d413f8c5700..686ef34c69f5 100644 --- a/arch/arm/mach-ixp4xx/dsmg600-setup.c +++ b/arch/arm/mach-ixp4xx/dsmg600-setup.c @@ -27,6 +27,7 @@ #include #include +#include #include #include #include diff --git a/arch/arm/mach-ixp4xx/include/mach/platform.h b/arch/arm/mach-ixp4xx/include/mach/platform.h index db5afb69c123..4c4c6a6f4526 100644 --- a/arch/arm/mach-ixp4xx/include/mach/platform.h +++ b/arch/arm/mach-ixp4xx/include/mach/platform.h @@ -13,6 +13,8 @@ #ifndef 
__ASSEMBLY__ +#include + #include #ifndef __ARMEB__ @@ -123,7 +125,7 @@ extern void ixp4xx_init_early(void); extern void ixp4xx_init_irq(void); extern void ixp4xx_sys_init(void); extern void ixp4xx_timer_init(void); -extern void ixp4xx_restart(char, const char *); +extern void ixp4xx_restart(enum reboot_mode, const char *); extern void ixp4xx_pci_preinit(void); struct pci_sys_data; extern int ixp4xx_setup(int nr, struct pci_sys_data *sys); diff --git a/arch/arm/mach-kirkwood/common.c b/arch/arm/mach-kirkwood/common.c index 7c72c725b711..e9238b5567ee 100644 --- a/arch/arm/mach-kirkwood/common.c +++ b/arch/arm/mach-kirkwood/common.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -722,7 +723,7 @@ void __init kirkwood_init(void) #endif } -void kirkwood_restart(char mode, const char *cmd) +void kirkwood_restart(enum reboot_mode mode, const char *cmd) { /* * Enable soft reset to assert RSTOUTn. diff --git a/arch/arm/mach-kirkwood/common.h b/arch/arm/mach-kirkwood/common.h index 1c09f3f93fbb..fcf3ba682e24 100644 --- a/arch/arm/mach-kirkwood/common.h +++ b/arch/arm/mach-kirkwood/common.h @@ -11,6 +11,8 @@ #ifndef __ARCH_KIRKWOOD_COMMON_H #define __ARCH_KIRKWOOD_COMMON_H +#include + struct dsa_platform_data; struct mv643xx_eth_platform_data; struct mv_sata_platform_data; @@ -53,7 +55,7 @@ void kirkwood_audio_init(void); void kirkwood_cpuidle_init(void); void kirkwood_cpufreq_init(void); -void kirkwood_restart(char, const char *); +void kirkwood_restart(enum reboot_mode, const char *); void kirkwood_clk_init(void); /* board init functions for boards not fully converted to fdt */ diff --git a/arch/arm/mach-ks8695/generic.h b/arch/arm/mach-ks8695/generic.h index 6e97ce462d73..43253f8e6de4 100644 --- a/arch/arm/mach-ks8695/generic.h +++ b/arch/arm/mach-ks8695/generic.h @@ -12,5 +12,5 @@ extern __init void ks8695_map_io(void); extern __init void ks8695_init_irq(void); -extern void ks8695_restart(char, const char *); +extern void 
ks8695_restart(enum reboot_mode, const char *); extern void ks8695_timer_init(void); diff --git a/arch/arm/mach-ks8695/time.c b/arch/arm/mach-ks8695/time.c index c272a3863d5f..426c97662f5b 100644 --- a/arch/arm/mach-ks8695/time.c +++ b/arch/arm/mach-ks8695/time.c @@ -154,11 +154,11 @@ void __init ks8695_timer_init(void) setup_irq(KS8695_IRQ_TIMER1, &ks8695_timer_irq); } -void ks8695_restart(char mode, const char *cmd) +void ks8695_restart(enum reboot_mode reboot_mode, const char *cmd) { unsigned int reg; - if (mode == 's') + if (reboot_mode == REBOOT_SOFT) soft_restart(0); /* disable timer0 */ diff --git a/arch/arm/mach-lpc32xx/common.c b/arch/arm/mach-lpc32xx/common.c index 0d4db8c544b5..d7aa54c25c59 100644 --- a/arch/arm/mach-lpc32xx/common.c +++ b/arch/arm/mach-lpc32xx/common.c @@ -207,11 +207,11 @@ void __init lpc32xx_map_io(void) iotable_init(lpc32xx_io_desc, ARRAY_SIZE(lpc32xx_io_desc)); } -void lpc23xx_restart(char mode, const char *cmd) +void lpc23xx_restart(enum reboot_mode mode, const char *cmd) { switch (mode) { - case 's': - case 'h': + case REBOOT_SOFT: + case REBOOT_HARD: lpc32xx_watchdog_reset(); break; diff --git a/arch/arm/mach-lpc32xx/common.h b/arch/arm/mach-lpc32xx/common.h index e0b26062a272..1cd8853b2f9b 100644 --- a/arch/arm/mach-lpc32xx/common.h +++ b/arch/arm/mach-lpc32xx/common.h @@ -21,6 +21,7 @@ #include #include +#include /* * Other arch specific structures and functions @@ -29,7 +30,7 @@ extern void lpc32xx_timer_init(void); extern void __init lpc32xx_init_irq(void); extern void __init lpc32xx_map_io(void); extern void __init lpc32xx_serial_init(void); -extern void lpc23xx_restart(char, const char *); +extern void lpc23xx_restart(enum reboot_mode, const char *); /* diff --git a/arch/arm/mach-mmp/common.c b/arch/arm/mach-mmp/common.c index 9292b7966e3b..c03b4ab582db 100644 --- a/arch/arm/mach-mmp/common.c +++ b/arch/arm/mach-mmp/common.c @@ -47,7 +47,7 @@ void __init mmp_map_io(void) mmp_chip_id = __raw_readl(MMP_CHIPID); } -void 
mmp_restart(char mode, const char *cmd) +void mmp_restart(enum reboot_mode mode, const char *cmd) { soft_restart(0); } diff --git a/arch/arm/mach-mmp/common.h b/arch/arm/mach-mmp/common.h index 0bdc50b134ce..991d7e9877de 100644 --- a/arch/arm/mach-mmp/common.h +++ b/arch/arm/mach-mmp/common.h @@ -1,10 +1,11 @@ +#include #define ARRAY_AND_SIZE(x) (x), ARRAY_SIZE(x) extern void timer_init(int irq); extern void __init icu_init_irq(void); extern void __init mmp_map_io(void); -extern void mmp_restart(char, const char *); +extern void mmp_restart(enum reboot_mode, const char *); extern void __init pxa168_clk_init(void); extern void __init pxa910_clk_init(void); extern void __init mmp2_clk_init(void); diff --git a/arch/arm/mach-mmp/include/mach/pxa168.h b/arch/arm/mach-mmp/include/mach/pxa168.h index 7ed1df21ea1c..459c2d03eb5c 100644 --- a/arch/arm/mach-mmp/include/mach/pxa168.h +++ b/arch/arm/mach-mmp/include/mach/pxa168.h @@ -1,9 +1,11 @@ #ifndef __ASM_MACH_PXA168_H #define __ASM_MACH_PXA168_H +#include + extern void pxa168_timer_init(void); extern void __init pxa168_init_irq(void); -extern void pxa168_restart(char, const char *); +extern void pxa168_restart(enum reboot_mode, const char *); extern void pxa168_clear_keypad_wakeup(void); #include diff --git a/arch/arm/mach-mmp/pxa168.c b/arch/arm/mach-mmp/pxa168.c index a30dcf3b7d9e..144e997624c0 100644 --- a/arch/arm/mach-mmp/pxa168.c +++ b/arch/arm/mach-mmp/pxa168.c @@ -172,7 +172,7 @@ int __init pxa168_add_usb_host(struct mv_usb_platform_data *pdata) return platform_device_register(&pxa168_device_usb_host); } -void pxa168_restart(char mode, const char *cmd) +void pxa168_restart(enum reboot_mode mode, const char *cmd) { soft_restart(0xffff0000); } diff --git a/arch/arm/mach-mv78xx0/common.c b/arch/arm/mach-mv78xx0/common.c index 749a7f8c4992..75062eff2494 100644 --- a/arch/arm/mach-mv78xx0/common.c +++ b/arch/arm/mach-mv78xx0/common.c @@ -413,7 +413,7 @@ void __init mv78xx0_init(void) clk_init(); } -void 
mv78xx0_restart(char mode, const char *cmd) +void mv78xx0_restart(enum reboot_mode mode, const char *cmd) { /* * Enable soft reset to assert RSTOUTn. diff --git a/arch/arm/mach-mv78xx0/common.h b/arch/arm/mach-mv78xx0/common.h index 5e9485bad0ac..6889af26077d 100644 --- a/arch/arm/mach-mv78xx0/common.h +++ b/arch/arm/mach-mv78xx0/common.h @@ -11,6 +11,8 @@ #ifndef __ARCH_MV78XX0_COMMON_H #define __ARCH_MV78XX0_COMMON_H +#include + struct mv643xx_eth_platform_data; struct mv_sata_platform_data; @@ -45,7 +47,7 @@ void mv78xx0_uart1_init(void); void mv78xx0_uart2_init(void); void mv78xx0_uart3_init(void); void mv78xx0_i2c_init(void); -void mv78xx0_restart(char, const char *); +void mv78xx0_restart(enum reboot_mode, const char *); extern void mv78xx0_timer_init(void); diff --git a/arch/arm/mach-mvebu/common.h b/arch/arm/mach-mvebu/common.h index 98defd5e92cd..e366010e1d91 100644 --- a/arch/arm/mach-mvebu/common.h +++ b/arch/arm/mach-mvebu/common.h @@ -17,7 +17,9 @@ #define ARMADA_XP_MAX_CPUS 4 -void mvebu_restart(char mode, const char *cmd); +#include + +void mvebu_restart(enum reboot_mode mode, const char *cmd); void armada_370_xp_init_irq(void); void armada_370_xp_handle_irq(struct pt_regs *regs); diff --git a/arch/arm/mach-mvebu/system-controller.c b/arch/arm/mach-mvebu/system-controller.c index b8079df8c986..f875124ff4f9 100644 --- a/arch/arm/mach-mvebu/system-controller.c +++ b/arch/arm/mach-mvebu/system-controller.c @@ -26,6 +26,7 @@ #include #include #include +#include static void __iomem *system_controller_base; @@ -63,7 +64,7 @@ static struct of_device_id of_system_controller_table[] = { { /* end of list */ }, }; -void mvebu_restart(char mode, const char *cmd) +void mvebu_restart(enum reboot_mode mode, const char *cmd) { if (!system_controller_base) { pr_err("Cannot restart, system-controller not available: check the device tree\n"); diff --git a/arch/arm/mach-mxs/mach-mxs.c b/arch/arm/mach-mxs/mach-mxs.c index 7fa611c1b287..6298adb8d335 100644 --- 
a/arch/arm/mach-mxs/mach-mxs.c +++ b/arch/arm/mach-mxs/mach-mxs.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -500,7 +501,7 @@ static void __init mxs_machine_init(void) /* * Reset the system. It is called by machine_restart(). */ -static void mxs_restart(char mode, const char *cmd) +static void mxs_restart(enum reboot_mode mode, const char *cmd) { struct device_node *np; void __iomem *reset_addr; diff --git a/arch/arm/mach-netx/generic.c b/arch/arm/mach-netx/generic.c index 1504b68f4c66..db25b0cef3a7 100644 --- a/arch/arm/mach-netx/generic.c +++ b/arch/arm/mach-netx/generic.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -187,7 +188,7 @@ static int __init netx_init(void) subsys_initcall(netx_init); -void netx_restart(char mode, const char *cmd) +void netx_restart(enum reboot_mode mode, const char *cmd) { writel(NETX_SYSTEM_RES_CR_FIRMW_RES_EN | NETX_SYSTEM_RES_CR_FIRMW_RES, NETX_SYSTEM_RES_CR); diff --git a/arch/arm/mach-netx/generic.h b/arch/arm/mach-netx/generic.h index 768b26bbb42b..bb2ce471cc28 100644 --- a/arch/arm/mach-netx/generic.h +++ b/arch/arm/mach-netx/generic.h @@ -17,8 +17,10 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include + extern void __init netx_map_io(void); extern void __init netx_init_irq(void); -extern void netx_restart(char, const char *); +extern void netx_restart(enum reboot_mode, const char *); extern void netx_timer_init(void); diff --git a/arch/arm/mach-nomadik/cpu-8815.c b/arch/arm/mach-nomadik/cpu-8815.c index 2df209ed1a07..13e0df9c11ce 100644 --- a/arch/arm/mach-nomadik/cpu-8815.c +++ b/arch/arm/mach-nomadik/cpu-8815.c @@ -103,7 +103,7 @@ static void __init cpu8815_map_io(void) iotable_init(cpu8815_io_desc, ARRAY_SIZE(cpu8815_io_desc)); } -static void cpu8815_restart(char mode, const char *cmd) +static void cpu8815_restart(enum reboot_mode mode, const char *cmd) { void __iomem *srcbase = ioremap(NOMADIK_SRC_BASE, 
SZ_4K); diff --git a/arch/arm/mach-omap1/board-voiceblue.c b/arch/arm/mach-omap1/board-voiceblue.c index 6c116e1a4b01..4677a9ccb3cb 100644 --- a/arch/arm/mach-omap1/board-voiceblue.c +++ b/arch/arm/mach-omap1/board-voiceblue.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -215,7 +216,7 @@ void voiceblue_wdt_ping(void) gpio_set_value(0, wdt_gpio_state); } -static void voiceblue_restart(char mode, const char *cmd) +static void voiceblue_restart(enum reboot_mode mode, const char *cmd) { /* * Workaround for 5912/1611b bug mentioned in sprz209d.pdf p. 28 diff --git a/arch/arm/mach-omap1/common.h b/arch/arm/mach-omap1/common.h index 14f7e9920479..abec019a5281 100644 --- a/arch/arm/mach-omap1/common.h +++ b/arch/arm/mach-omap1/common.h @@ -28,6 +28,7 @@ #include #include +#include #include @@ -70,7 +71,7 @@ static inline int omap_serial_wakeup_init(void) void omap1_init_early(void); void omap1_init_irq(void); void omap1_init_late(void); -void omap1_restart(char, const char *); +void omap1_restart(enum reboot_mode, const char *); extern void __init omap_check_revision(void); diff --git a/arch/arm/mach-omap1/reset.c b/arch/arm/mach-omap1/reset.c index 5eebd7e889d0..72bf4bf4a702 100644 --- a/arch/arm/mach-omap1/reset.c +++ b/arch/arm/mach-omap1/reset.c @@ -3,6 +3,7 @@ */ #include #include +#include #include @@ -22,7 +23,7 @@ #define OMAP_EXTWARM_RST_SRC_ID_SHIFT 5 -void omap1_restart(char mode, const char *cmd) +void omap1_restart(enum reboot_mode mode, const char *cmd) { /* * Workaround for 5912/1611b bug mentioned in sprz209d.pdf p. 28 diff --git a/arch/arm/mach-omap2/am33xx-restart.c b/arch/arm/mach-omap2/am33xx-restart.c index 88e4fa8af031..1eae96212315 100644 --- a/arch/arm/mach-omap2/am33xx-restart.c +++ b/arch/arm/mach-omap2/am33xx-restart.c @@ -6,6 +6,7 @@ * published by the Free Software Foundation. */ #include +#include #include "common.h" #include "prm-regbits-33xx.h" @@ -19,7 +20,7 @@ * Resets the SoC. 
For @cmd, see the 'reboot' syscall in * kernel/sys.c. No return value. */ -void am33xx_restart(char mode, const char *cmd) +void am33xx_restart(enum reboot_mode mode, const char *cmd) { /* TODO: Handle mode and cmd if necessary */ diff --git a/arch/arm/mach-omap2/common.h b/arch/arm/mach-omap2/common.h index 72cab3f4f16d..dfcc182ecff9 100644 --- a/arch/arm/mach-omap2/common.h +++ b/arch/arm/mach-omap2/common.h @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -119,33 +120,33 @@ static inline void omap_soc_device_init(void) #endif #if defined(CONFIG_SOC_OMAP2420) || defined(CONFIG_SOC_OMAP2430) -void omap2xxx_restart(char mode, const char *cmd); +void omap2xxx_restart(enum reboot_mode mode, const char *cmd); #else -static inline void omap2xxx_restart(char mode, const char *cmd) +static inline void omap2xxx_restart(enum reboot_mode mode, const char *cmd) { } #endif #ifdef CONFIG_SOC_AM33XX -void am33xx_restart(char mode, const char *cmd); +void am33xx_restart(enum reboot_mode mode, const char *cmd); #else -static inline void am33xx_restart(char mode, const char *cmd) +static inline void am33xx_restart(enum reboot_mode mode, const char *cmd) { } #endif #ifdef CONFIG_ARCH_OMAP3 -void omap3xxx_restart(char mode, const char *cmd); +void omap3xxx_restart(enum reboot_mode mode, const char *cmd); #else -static inline void omap3xxx_restart(char mode, const char *cmd) +static inline void omap3xxx_restart(enum reboot_mode mode, const char *cmd) { } #endif #if defined(CONFIG_ARCH_OMAP4) || defined(CONFIG_SOC_OMAP5) -void omap44xx_restart(char mode, const char *cmd); +void omap44xx_restart(enum reboot_mode mode, const char *cmd); #else -static inline void omap44xx_restart(char mode, const char *cmd) +static inline void omap44xx_restart(enum reboot_mode mode, const char *cmd) { } #endif diff --git a/arch/arm/mach-omap2/omap2-restart.c b/arch/arm/mach-omap2/omap2-restart.c index 719b716a4494..68423e26399d 100644 --- a/arch/arm/mach-omap2/omap2-restart.c +++ 
b/arch/arm/mach-omap2/omap2-restart.c @@ -31,7 +31,7 @@ static struct clk *reset_virt_prcm_set_ck, *reset_sys_ck; * Set the DPLL to bypass so that reboot completes successfully. No * return value. */ -void omap2xxx_restart(char mode, const char *cmd) +void omap2xxx_restart(enum reboot_mode mode, const char *cmd) { u32 rate; diff --git a/arch/arm/mach-omap2/omap3-restart.c b/arch/arm/mach-omap2/omap3-restart.c index 923c582189e5..5de2a0c2979d 100644 --- a/arch/arm/mach-omap2/omap3-restart.c +++ b/arch/arm/mach-omap2/omap3-restart.c @@ -12,6 +12,7 @@ */ #include #include +#include #include "iomap.h" #include "common.h" @@ -28,7 +29,7 @@ * Resets the SoC. For @cmd, see the 'reboot' syscall in * kernel/sys.c. No return value. */ -void omap3xxx_restart(char mode, const char *cmd) +void omap3xxx_restart(enum reboot_mode mode, const char *cmd) { omap3_ctrl_write_boot_mode((cmd ? (u8)*cmd : 0)); omap3xxx_prm_dpll3_reset(); /* never returns */ diff --git a/arch/arm/mach-omap2/omap4-common.c b/arch/arm/mach-omap2/omap4-common.c index 38cd3a69cff3..57911430324e 100644 --- a/arch/arm/mach-omap2/omap4-common.c +++ b/arch/arm/mach-omap2/omap4-common.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm/mach-omap2/omap4-restart.c b/arch/arm/mach-omap2/omap4-restart.c index f90e02e11898..41dfd7da8170 100644 --- a/arch/arm/mach-omap2/omap4-restart.c +++ b/arch/arm/mach-omap2/omap4-restart.c @@ -8,6 +8,7 @@ */ #include +#include #include "prminst44xx.h" /** @@ -18,7 +19,7 @@ * Resets the SoC. For @cmd, see the 'reboot' syscall in * kernel/sys.c. No return value. 
*/ -void omap44xx_restart(char mode, const char *cmd) +void omap44xx_restart(enum reboot_mode mode, const char *cmd) { /* XXX Should save 'cmd' into scratchpad for use after reboot */ omap4_prminst_global_warm_sw_reset(); /* never returns */ diff --git a/arch/arm/mach-orion5x/common.c b/arch/arm/mach-orion5x/common.c index f8a6db9239bf..b41599f98a8e 100644 --- a/arch/arm/mach-orion5x/common.c +++ b/arch/arm/mach-orion5x/common.c @@ -347,7 +347,7 @@ void __init orion5x_init(void) orion5x_wdt_init(); } -void orion5x_restart(char mode, const char *cmd) +void orion5x_restart(enum reboot_mode mode, const char *cmd) { /* * Enable and issue soft reset diff --git a/arch/arm/mach-orion5x/common.h b/arch/arm/mach-orion5x/common.h index cdaa01f3d186..a909afb384fb 100644 --- a/arch/arm/mach-orion5x/common.h +++ b/arch/arm/mach-orion5x/common.h @@ -1,6 +1,8 @@ #ifndef __ARCH_ORION5X_COMMON_H #define __ARCH_ORION5X_COMMON_H +#include + struct dsa_platform_data; struct mv643xx_eth_platform_data; struct mv_sata_platform_data; @@ -29,7 +31,7 @@ void orion5x_spi_init(void); void orion5x_uart0_init(void); void orion5x_uart1_init(void); void orion5x_xor_init(void); -void orion5x_restart(char, const char *); +void orion5x_restart(enum reboot_mode, const char *); /* * PCIe/PCI functions. 
diff --git a/arch/arm/mach-orion5x/ls-chl-setup.c b/arch/arm/mach-orion5x/ls-chl-setup.c index 24f4e14e5893..6234977b5aea 100644 --- a/arch/arm/mach-orion5x/ls-chl-setup.c +++ b/arch/arm/mach-orion5x/ls-chl-setup.c @@ -139,7 +139,7 @@ static struct mv_sata_platform_data lschl_sata_data = { static void lschl_power_off(void) { - orion5x_restart('h', NULL); + orion5x_restart(REBOOT_HARD, NULL); } /***************************************************************************** diff --git a/arch/arm/mach-orion5x/ls_hgl-setup.c b/arch/arm/mach-orion5x/ls_hgl-setup.c index fc653bb41e78..fe04c4b64569 100644 --- a/arch/arm/mach-orion5x/ls_hgl-setup.c +++ b/arch/arm/mach-orion5x/ls_hgl-setup.c @@ -185,7 +185,7 @@ static struct mv_sata_platform_data ls_hgl_sata_data = { static void ls_hgl_power_off(void) { - orion5x_restart('h', NULL); + orion5x_restart(REBOOT_HARD, NULL); } diff --git a/arch/arm/mach-orion5x/lsmini-setup.c b/arch/arm/mach-orion5x/lsmini-setup.c index 18e66e617dc2..ca4dbe973daf 100644 --- a/arch/arm/mach-orion5x/lsmini-setup.c +++ b/arch/arm/mach-orion5x/lsmini-setup.c @@ -185,7 +185,7 @@ static struct mv_sata_platform_data lsmini_sata_data = { static void lsmini_power_off(void) { - orion5x_restart('h', NULL); + orion5x_restart(REBOOT_HARD, NULL); } diff --git a/arch/arm/mach-picoxcell/common.c b/arch/arm/mach-picoxcell/common.c index b13f51bc35cf..ec79fea82704 100644 --- a/arch/arm/mach-picoxcell/common.c +++ b/arch/arm/mach-picoxcell/common.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -63,7 +64,7 @@ static const char *picoxcell_dt_match[] = { NULL }; -static void picoxcell_wdt_restart(char mode, const char *cmd) +static void picoxcell_wdt_restart(enum reboot_mode mode, const char *cmd) { /* * Configure the watchdog to reset with the shortest possible timeout diff --git a/arch/arm/mach-prima2/common.h b/arch/arm/mach-prima2/common.h index 81135cd88e54..a6304858474a 100644 --- a/arch/arm/mach-prima2/common.h +++ 
b/arch/arm/mach-prima2/common.h @@ -10,6 +10,8 @@ #define __MACH_PRIMA2_COMMON_H__ #include +#include + #include #include @@ -22,7 +24,7 @@ extern void sirfsoc_cpu_die(unsigned int cpu); extern void __init sirfsoc_of_irq_init(void); extern void __init sirfsoc_of_clk_init(void); -extern void sirfsoc_restart(char, const char *); +extern void sirfsoc_restart(enum reboot_mode, const char *); extern asmlinkage void __exception_irq_entry sirfsoc_handle_irq(struct pt_regs *regs); #ifndef CONFIG_DEBUG_LL diff --git a/arch/arm/mach-prima2/rstc.c b/arch/arm/mach-prima2/rstc.c index d5e0cbc934c0..ccb53391147a 100644 --- a/arch/arm/mach-prima2/rstc.c +++ b/arch/arm/mach-prima2/rstc.c @@ -13,6 +13,7 @@ #include #include #include +#include void __iomem *sirfsoc_rstc_base; static DEFINE_MUTEX(rstc_lock); @@ -84,7 +85,7 @@ int sirfsoc_reset_device(struct device *dev) #define SIRFSOC_SYS_RST_BIT BIT(31) -void sirfsoc_restart(char mode, const char *cmd) +void sirfsoc_restart(enum reboot_mode mode, const char *cmd) { writel(SIRFSOC_SYS_RST_BIT, sirfsoc_rstc_base); } diff --git a/arch/arm/mach-pxa/corgi.c b/arch/arm/mach-pxa/corgi.c index a5b8fead7d61..f162f1b77cd2 100644 --- a/arch/arm/mach-pxa/corgi.c +++ b/arch/arm/mach-pxa/corgi.c @@ -663,16 +663,16 @@ static void corgi_poweroff(void) /* Green LED off tells the bootloader to halt */ gpio_set_value(CORGI_GPIO_LED_GREEN, 0); - pxa_restart('h', NULL); + pxa_restart(REBOOT_HARD, NULL); } -static void corgi_restart(char mode, const char *cmd) +static void corgi_restart(enum reboot_mode mode, const char *cmd) { if (!machine_is_corgi()) /* Green LED on tells the bootloader to reboot */ gpio_set_value(CORGI_GPIO_LED_GREEN, 1); - pxa_restart('h', cmd); + pxa_restart(REBOOT_HARD, cmd); } static void __init corgi_init(void) diff --git a/arch/arm/mach-pxa/generic.h b/arch/arm/mach-pxa/generic.h index fd7ea39b78c0..8963984d1f43 100644 --- a/arch/arm/mach-pxa/generic.h +++ b/arch/arm/mach-pxa/generic.h @@ -9,6 +9,8 @@ * published by the Free 
Software Foundation. */ +#include + struct irq_data; extern void pxa_timer_init(void); @@ -56,4 +58,4 @@ void __init pxa_set_btuart_info(void *info); void __init pxa_set_stuart_info(void *info); void __init pxa_set_hwuart_info(void *info); -void pxa_restart(char, const char *); +void pxa_restart(enum reboot_mode, const char *); diff --git a/arch/arm/mach-pxa/mioa701.c b/arch/arm/mach-pxa/mioa701.c index e6b0a936c150..acc9d3cc0762 100644 --- a/arch/arm/mach-pxa/mioa701.c +++ b/arch/arm/mach-pxa/mioa701.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -696,13 +697,13 @@ static void mioa701_machine_exit(void); static void mioa701_poweroff(void) { mioa701_machine_exit(); - pxa_restart('s', NULL); + pxa_restart(REBOOT_SOFT, NULL); } -static void mioa701_restart(char c, const char *cmd) +static void mioa701_restart(enum reboot_mode c, const char *cmd) { mioa701_machine_exit(); - pxa_restart('s', cmd); + pxa_restart(REBOOT_SOFT, cmd); } static struct gpio global_gpios[] = { diff --git a/arch/arm/mach-pxa/poodle.c b/arch/arm/mach-pxa/poodle.c index 50ccd5f1d560..711d37e26bd8 100644 --- a/arch/arm/mach-pxa/poodle.c +++ b/arch/arm/mach-pxa/poodle.c @@ -422,7 +422,7 @@ static struct i2c_board_info __initdata poodle_i2c_devices[] = { static void poodle_poweroff(void) { - pxa_restart('h', NULL); + pxa_restart(REBOOT_HARD, NULL); } static void __init poodle_init(void) diff --git a/arch/arm/mach-pxa/reset.c b/arch/arm/mach-pxa/reset.c index 3fab583755d4..0d5dd646f61f 100644 --- a/arch/arm/mach-pxa/reset.c +++ b/arch/arm/mach-pxa/reset.c @@ -83,7 +83,7 @@ static void do_hw_reset(void) writel_relaxed(readl_relaxed(OSCR) + 368640, OSMR3); } -void pxa_restart(char mode, const char *cmd) +void pxa_restart(enum reboot_mode mode, const char *cmd) { local_irq_disable(); local_fiq_disable(); @@ -91,14 +91,14 @@ void pxa_restart(char mode, const char *cmd) clear_reset_status(RESET_STATUS_ALL); switch (mode) { - case 's': + case REBOOT_SOFT: /* Jump 
into ROM at address 0 */ soft_restart(0); break; - case 'g': + case REBOOT_GPIO: do_gpio_reset(); break; - case 'h': + case REBOOT_HARD: default: do_hw_reset(); break; diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c index c3c00424bb35..2125df0444e7 100644 --- a/arch/arm/mach-pxa/spitz.c +++ b/arch/arm/mach-pxa/spitz.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -924,10 +925,10 @@ static inline void spitz_i2c_init(void) {} ******************************************************************************/ static void spitz_poweroff(void) { - pxa_restart('g', NULL); + pxa_restart(REBOOT_GPIO, NULL); } -static void spitz_restart(char mode, const char *cmd) +static void spitz_restart(enum reboot_mode mode, const char *cmd) { uint32_t msc0 = __raw_readl(MSC0); /* Bootloader magic for a reboot */ diff --git a/arch/arm/mach-pxa/tosa.c b/arch/arm/mach-pxa/tosa.c index a41992fea720..0206b915a6f6 100644 --- a/arch/arm/mach-pxa/tosa.c +++ b/arch/arm/mach-pxa/tosa.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -911,10 +912,10 @@ static struct platform_device *devices[] __initdata = { static void tosa_poweroff(void) { - pxa_restart('g', NULL); + pxa_restart(REBOOT_GPIO, NULL); } -static void tosa_restart(char mode, const char *cmd) +static void tosa_restart(enum reboot_mode mode, const char *cmd) { uint32_t msc0 = __raw_readl(MSC0); diff --git a/arch/arm/mach-realview/realview_eb.c b/arch/arm/mach-realview/realview_eb.c index 5b1c8bfe6fa9..c85ddb2a0ad0 100644 --- a/arch/arm/mach-realview/realview_eb.c +++ b/arch/arm/mach-realview/realview_eb.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -418,7 +419,7 @@ static void __init realview_eb_timer_init(void) realview_eb_twd_init(); } -static void realview_eb_restart(char mode, const char *cmd) +static void realview_eb_restart(enum reboot_mode mode, const char *cmd) { void __iomem *reset_ctrl = 
__io_address(REALVIEW_SYS_RESETCTL); void __iomem *lock_ctrl = __io_address(REALVIEW_SYS_LOCK); diff --git a/arch/arm/mach-realview/realview_pb1176.c b/arch/arm/mach-realview/realview_pb1176.c index d5e83a1f6982..c5eade76461b 100644 --- a/arch/arm/mach-realview/realview_pb1176.c +++ b/arch/arm/mach-realview/realview_pb1176.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -329,7 +330,7 @@ static void __init realview_pb1176_timer_init(void) realview_timer_init(IRQ_DC1176_TIMER0); } -static void realview_pb1176_restart(char mode, const char *cmd) +static void realview_pb1176_restart(enum reboot_mode mode, const char *cmd) { void __iomem *reset_ctrl = __io_address(REALVIEW_SYS_RESETCTL); void __iomem *lock_ctrl = __io_address(REALVIEW_SYS_LOCK); diff --git a/arch/arm/mach-realview/realview_pb11mp.c b/arch/arm/mach-realview/realview_pb11mp.c index c3cfe213b5e6..f4b0962578fe 100644 --- a/arch/arm/mach-realview/realview_pb11mp.c +++ b/arch/arm/mach-realview/realview_pb11mp.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -316,7 +317,7 @@ static void __init realview_pb11mp_timer_init(void) realview_pb11mp_twd_init(); } -static void realview_pb11mp_restart(char mode, const char *cmd) +static void realview_pb11mp_restart(enum reboot_mode mode, const char *cmd) { void __iomem *reset_ctrl = __io_address(REALVIEW_SYS_RESETCTL); void __iomem *lock_ctrl = __io_address(REALVIEW_SYS_LOCK); diff --git a/arch/arm/mach-realview/realview_pba8.c b/arch/arm/mach-realview/realview_pba8.c index dde652a59620..10a3e1d76891 100644 --- a/arch/arm/mach-realview/realview_pba8.c +++ b/arch/arm/mach-realview/realview_pba8.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -264,7 +265,7 @@ static void __init realview_pba8_timer_init(void) realview_timer_init(IRQ_PBA8_TIMER0_1); } -static void realview_pba8_restart(char mode, const char *cmd) +static void realview_pba8_restart(enum reboot_mode mode, const char *cmd) 
{ void __iomem *reset_ctrl = __io_address(REALVIEW_SYS_RESETCTL); void __iomem *lock_ctrl = __io_address(REALVIEW_SYS_LOCK); diff --git a/arch/arm/mach-realview/realview_pbx.c b/arch/arm/mach-realview/realview_pbx.c index 54f0185b01e3..9d75493e3f0c 100644 --- a/arch/arm/mach-realview/realview_pbx.c +++ b/arch/arm/mach-realview/realview_pbx.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -344,7 +345,7 @@ static void realview_pbx_fixup(struct tag *tags, char **from, #endif } -static void realview_pbx_restart(char mode, const char *cmd) +static void realview_pbx_restart(enum reboot_mode mode, const char *cmd) { void __iomem *reset_ctrl = __io_address(REALVIEW_SYS_RESETCTL); void __iomem *lock_ctrl = __io_address(REALVIEW_SYS_LOCK); diff --git a/arch/arm/mach-rpc/riscpc.c b/arch/arm/mach-rpc/riscpc.c index a302cf5e0fc7..09d602b10d57 100644 --- a/arch/arm/mach-rpc/riscpc.c +++ b/arch/arm/mach-rpc/riscpc.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -201,7 +202,7 @@ static int __init rpc_init(void) arch_initcall(rpc_init); -static void rpc_restart(char mode, const char *cmd) +static void rpc_restart(enum reboot_mode mode, const char *cmd) { iomd_writeb(0, IOMD_ROMCR0); diff --git a/arch/arm/mach-s3c24xx/common.h b/arch/arm/mach-s3c24xx/common.h index 307c3714be55..84b280654f4c 100644 --- a/arch/arm/mach-s3c24xx/common.h +++ b/arch/arm/mach-s3c24xx/common.h @@ -12,6 +12,8 @@ #ifndef __ARCH_ARM_MACH_S3C24XX_COMMON_H #define __ARCH_ARM_MACH_S3C24XX_COMMON_H __FILE__ +#include + struct s3c2410_uartcfg; #ifdef CONFIG_CPU_S3C2410 @@ -20,7 +22,7 @@ extern int s3c2410a_init(void); extern void s3c2410_map_io(void); extern void s3c2410_init_uarts(struct s3c2410_uartcfg *cfg, int no); extern void s3c2410_init_clocks(int xtal); -extern void s3c2410_restart(char mode, const char *cmd); +extern void s3c2410_restart(enum reboot_mode mode, const char *cmd); extern void s3c2410_init_irq(void); #else #define s3c2410_init_clocks 
NULL @@ -36,7 +38,7 @@ extern void s3c2412_map_io(void); extern void s3c2412_init_uarts(struct s3c2410_uartcfg *cfg, int no); extern void s3c2412_init_clocks(int xtal); extern int s3c2412_baseclk_add(void); -extern void s3c2412_restart(char mode, const char *cmd); +extern void s3c2412_restart(enum reboot_mode mode, const char *cmd); extern void s3c2412_init_irq(void); #else #define s3c2412_init_clocks NULL @@ -51,7 +53,7 @@ extern void s3c2416_map_io(void); extern void s3c2416_init_uarts(struct s3c2410_uartcfg *cfg, int no); extern void s3c2416_init_clocks(int xtal); extern int s3c2416_baseclk_add(void); -extern void s3c2416_restart(char mode, const char *cmd); +extern void s3c2416_restart(enum reboot_mode mode, const char *cmd); extern void s3c2416_init_irq(void); extern struct syscore_ops s3c2416_irq_syscore_ops; @@ -66,7 +68,7 @@ extern struct syscore_ops s3c2416_irq_syscore_ops; extern void s3c244x_map_io(void); extern void s3c244x_init_uarts(struct s3c2410_uartcfg *cfg, int no); extern void s3c244x_init_clocks(int xtal); -extern void s3c244x_restart(char mode, const char *cmd); +extern void s3c244x_restart(enum reboot_mode mode, const char *cmd); #else #define s3c244x_init_clocks NULL #define s3c244x_init_uarts NULL @@ -96,7 +98,7 @@ extern void s3c2443_map_io(void); extern void s3c2443_init_uarts(struct s3c2410_uartcfg *cfg, int no); extern void s3c2443_init_clocks(int xtal); extern int s3c2443_baseclk_add(void); -extern void s3c2443_restart(char mode, const char *cmd); +extern void s3c2443_restart(enum reboot_mode mode, const char *cmd); extern void s3c2443_init_irq(void); #else #define s3c2443_init_clocks NULL diff --git a/arch/arm/mach-s3c24xx/s3c2410.c b/arch/arm/mach-s3c24xx/s3c2410.c index ff384acc65b2..34676d1d5fec 100644 --- a/arch/arm/mach-s3c24xx/s3c2410.c +++ b/arch/arm/mach-s3c24xx/s3c2410.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -196,9 +197,9 @@ int __init s3c2410a_init(void) return s3c2410_init(); } -void 
s3c2410_restart(char mode, const char *cmd) +void s3c2410_restart(enum reboot_mode mode, const char *cmd) { - if (mode == 's') { + if (mode == REBOOT_SOFT) { soft_restart(0); } diff --git a/arch/arm/mach-s3c24xx/s3c2412.c b/arch/arm/mach-s3c24xx/s3c2412.c index 0f864d4c97de..0251650cbf80 100644 --- a/arch/arm/mach-s3c24xx/s3c2412.c +++ b/arch/arm/mach-s3c24xx/s3c2412.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -129,9 +130,9 @@ static void s3c2412_idle(void) cpu_do_idle(); } -void s3c2412_restart(char mode, const char *cmd) +void s3c2412_restart(enum reboot_mode mode, const char *cmd) { - if (mode == 's') + if (mode == REBOOT_SOFT) soft_restart(0); /* errata "Watch-dog/Software Reset Problem" specifies that diff --git a/arch/arm/mach-s3c24xx/s3c2416.c b/arch/arm/mach-s3c24xx/s3c2416.c index b9c5d382dafb..9ef3ccfbe196 100644 --- a/arch/arm/mach-s3c24xx/s3c2416.c +++ b/arch/arm/mach-s3c24xx/s3c2416.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -79,9 +80,9 @@ static struct device s3c2416_dev = { .bus = &s3c2416_subsys, }; -void s3c2416_restart(char mode, const char *cmd) +void s3c2416_restart(enum reboot_mode mode, const char *cmd) { - if (mode == 's') + if (mode == REBOOT_SOFT) soft_restart(0); __raw_writel(S3C2443_SWRST_RESET, S3C2443_SWRST); diff --git a/arch/arm/mach-s3c24xx/s3c2443.c b/arch/arm/mach-s3c24xx/s3c2443.c index 8328cd65bf3d..b6c71918b25c 100644 --- a/arch/arm/mach-s3c24xx/s3c2443.c +++ b/arch/arm/mach-s3c24xx/s3c2443.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -59,9 +60,9 @@ static struct device s3c2443_dev = { .bus = &s3c2443_subsys, }; -void s3c2443_restart(char mode, const char *cmd) +void s3c2443_restart(enum reboot_mode mode, const char *cmd) { - if (mode == 's') + if (mode == REBOOT_SOFT) soft_restart(0); __raw_writel(S3C2443_SWRST_RESET, S3C2443_SWRST); diff --git a/arch/arm/mach-s3c24xx/s3c244x.c b/arch/arm/mach-s3c24xx/s3c244x.c index 
d0423e2544c1..911b555029fc 100644 --- a/arch/arm/mach-s3c24xx/s3c244x.c +++ b/arch/arm/mach-s3c24xx/s3c244x.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -198,9 +199,9 @@ struct syscore_ops s3c244x_pm_syscore_ops = { .resume = s3c244x_resume, }; -void s3c244x_restart(char mode, const char *cmd) +void s3c244x_restart(enum reboot_mode mode, const char *cmd) { - if (mode == 's') + if (mode == REBOOT_SOFT) soft_restart(0); samsung_wdt_reset(); diff --git a/arch/arm/mach-s3c64xx/common.c b/arch/arm/mach-s3c64xx/common.c index 1aed6f4be1ce..3f62e467b129 100644 --- a/arch/arm/mach-s3c64xx/common.c +++ b/arch/arm/mach-s3c64xx/common.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -381,9 +382,9 @@ static int __init s3c64xx_init_irq_eint(void) } arch_initcall(s3c64xx_init_irq_eint); -void s3c64xx_restart(char mode, const char *cmd) +void s3c64xx_restart(enum reboot_mode mode, const char *cmd) { - if (mode != 's') + if (mode != REBOOT_SOFT) samsung_wdt_reset(); /* if all else fails, or mode was for soft, jump to 0 */ diff --git a/arch/arm/mach-s3c64xx/common.h b/arch/arm/mach-s3c64xx/common.h index 6cfc99bdfb37..e8f990b37665 100644 --- a/arch/arm/mach-s3c64xx/common.h +++ b/arch/arm/mach-s3c64xx/common.h @@ -17,13 +17,15 @@ #ifndef __ARCH_ARM_MACH_S3C64XX_COMMON_H #define __ARCH_ARM_MACH_S3C64XX_COMMON_H +#include + void s3c64xx_init_irq(u32 vic0, u32 vic1); void s3c64xx_init_io(struct map_desc *mach_desc, int size); void s3c64xx_register_clocks(unsigned long xtal, unsigned armclk_limit); void s3c64xx_setup_clocks(void); -void s3c64xx_restart(char mode, const char *cmd); +void s3c64xx_restart(enum reboot_mode mode, const char *cmd); void s3c64xx_init_late(void); #ifdef CONFIG_CPU_S3C6400 diff --git a/arch/arm/mach-s5p64x0/common.c b/arch/arm/mach-s5p64x0/common.c index 76d0053bf564..dfdfdc320ce7 100644 --- a/arch/arm/mach-s5p64x0/common.c +++ b/arch/arm/mach-s5p64x0/common.c @@ -24,6 +24,7 @@ 
#include #include #include +#include #include #include @@ -439,9 +440,9 @@ static int __init s5p64x0_init_irq_eint(void) } arch_initcall(s5p64x0_init_irq_eint); -void s5p64x0_restart(char mode, const char *cmd) +void s5p64x0_restart(enum reboot_mode mode, const char *cmd) { - if (mode != 's') + if (mode != REBOOT_SOFT) samsung_wdt_reset(); soft_restart(0); diff --git a/arch/arm/mach-s5p64x0/common.h b/arch/arm/mach-s5p64x0/common.h index f8a60fdc5884..f3a9b43cba4a 100644 --- a/arch/arm/mach-s5p64x0/common.h +++ b/arch/arm/mach-s5p64x0/common.h @@ -12,6 +12,8 @@ #ifndef __ARCH_ARM_MACH_S5P64X0_COMMON_H #define __ARCH_ARM_MACH_S5P64X0_COMMON_H +#include + void s5p6440_init_irq(void); void s5p6450_init_irq(void); void s5p64x0_init_io(struct map_desc *mach_desc, int size); @@ -22,7 +24,7 @@ void s5p6440_setup_clocks(void); void s5p6450_register_clocks(void); void s5p6450_setup_clocks(void); -void s5p64x0_restart(char mode, const char *cmd); +void s5p64x0_restart(enum reboot_mode mode, const char *cmd); #ifdef CONFIG_CPU_S5P6440 diff --git a/arch/arm/mach-s5pc100/common.c b/arch/arm/mach-s5pc100/common.c index 511031564d35..4bdfecf6d024 100644 --- a/arch/arm/mach-s5pc100/common.c +++ b/arch/arm/mach-s5pc100/common.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -217,9 +218,9 @@ void __init s5pc100_init_uarts(struct s3c2410_uartcfg *cfg, int no) s3c24xx_init_uartdevs("s3c6400-uart", s5p_uart_resources, cfg, no); } -void s5pc100_restart(char mode, const char *cmd) +void s5pc100_restart(enum reboot_mode mode, const char *cmd) { - if (mode != 's') + if (mode != REBOOT_SOFT) samsung_wdt_reset(); soft_restart(0); diff --git a/arch/arm/mach-s5pc100/common.h b/arch/arm/mach-s5pc100/common.h index c41f912e9e1f..08d782d65d7b 100644 --- a/arch/arm/mach-s5pc100/common.h +++ b/arch/arm/mach-s5pc100/common.h @@ -12,13 +12,15 @@ #ifndef __ARCH_ARM_MACH_S5PC100_COMMON_H #define __ARCH_ARM_MACH_S5PC100_COMMON_H +#include + void s5pc100_init_io(struct 
map_desc *mach_desc, int size); void s5pc100_init_irq(void); void s5pc100_register_clocks(void); void s5pc100_setup_clocks(void); -void s5pc100_restart(char mode, const char *cmd); +void s5pc100_restart(enum reboot_mode mode, const char *cmd); extern int s5pc100_init(void); extern void s5pc100_map_io(void); diff --git a/arch/arm/mach-s5pv210/common.c b/arch/arm/mach-s5pv210/common.c index 9dfe93e2624d..023f1a796a9c 100644 --- a/arch/arm/mach-s5pv210/common.c +++ b/arch/arm/mach-s5pv210/common.c @@ -143,7 +143,7 @@ static struct map_desc s5pv210_iodesc[] __initdata = { } }; -void s5pv210_restart(char mode, const char *cmd) +void s5pv210_restart(enum reboot_mode mode, const char *cmd) { __raw_writel(0x1, S5P_SWRESET); } diff --git a/arch/arm/mach-s5pv210/common.h b/arch/arm/mach-s5pv210/common.h index 0a1cc0aef720..fe1beb54e548 100644 --- a/arch/arm/mach-s5pv210/common.h +++ b/arch/arm/mach-s5pv210/common.h @@ -12,13 +12,15 @@ #ifndef __ARCH_ARM_MACH_S5PV210_COMMON_H #define __ARCH_ARM_MACH_S5PV210_COMMON_H +#include + void s5pv210_init_io(struct map_desc *mach_desc, int size); void s5pv210_init_irq(void); void s5pv210_register_clocks(void); void s5pv210_setup_clocks(void); -void s5pv210_restart(char mode, const char *cmd); +void s5pv210_restart(enum reboot_mode mode, const char *cmd); extern int s5pv210_init(void); extern void s5pv210_map_io(void); diff --git a/arch/arm/mach-sa1100/generic.c b/arch/arm/mach-sa1100/generic.c index 9db3e98e8b85..f25b6119e028 100644 --- a/arch/arm/mach-sa1100/generic.c +++ b/arch/arm/mach-sa1100/generic.c @@ -19,6 +19,7 @@ #include #include #include +#include #include