From 1404ff3cc3a14cb1fe8535e30b87d20da9513767 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 7 Oct 2016 16:56:49 -0700 Subject: [PATCH 001/127] fsnotify: drop notification_mutex before destroying event fsnotify_flush_notify() and fanotify_release() destroy notification event while holding notification_mutex. The destruction of fanotify event includes a path_put() call which may end up calling into a filesystem to delete an inode if we happen to be the last holders of dentry reference which happens to be the last holder of inode reference. That in turn may violate lock ordering for some filesystems since notification_mutex is also acquired e. g. during write when generating fanotify event. Also this is the only thing that forces notification_mutex to be a sleeping lock. So drop notification_mutex before destroying a notification event. Link: http://lkml.kernel.org/r/1473797711-14111-4-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Cc: Miklos Szeredi Cc: Lino Sanfilippo Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify_user.c | 6 ++++-- fs/notify/notification.c | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index a64313868d3a..46d135c4988f 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -390,9 +390,11 @@ static int fanotify_release(struct inode *ignored, struct file *file) mutex_lock(&group->notification_mutex); while (!fsnotify_notify_queue_is_empty(group)) { fsn_event = fsnotify_remove_first_event(group); - if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) + if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) { + mutex_unlock(&group->notification_mutex); fsnotify_destroy_event(group, fsn_event); - else + mutex_lock(&group->notification_mutex); + } else FANOTIFY_PE(fsn_event)->response = FAN_ALLOW; } mutex_unlock(&group->notification_mutex); diff --git a/fs/notify/notification.c b/fs/notify/notification.c index e455e83ceeeb..7d563dea52a4 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -178,7 +178,9 @@ void fsnotify_flush_notify(struct fsnotify_group *group) mutex_lock(&group->notification_mutex); while (!fsnotify_notify_queue_is_empty(group)) { event = fsnotify_remove_first_event(group); + mutex_unlock(&group->notification_mutex); fsnotify_destroy_event(group, event); + mutex_lock(&group->notification_mutex); } mutex_unlock(&group->notification_mutex); } From c21dbe20f606219fe54faf555b7bc5565487c58f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 7 Oct 2016 16:56:52 -0700 Subject: [PATCH 002/127] fsnotify: convert notification_mutex to a spinlock notification_mutex is used to protect the list of pending events. As such there's no reason to use a sleeping lock for it. Convert it to a spinlock. 
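A minimal sketch of the flush loop once both changes (lock drop before destruction, then the mutex-to-spinlock conversion) are applied; this is only an illustration of the resulting fs/notify/notification.c code, not an additional hunk:

	void fsnotify_flush_notify(struct fsnotify_group *group)
	{
		struct fsnotify_event *event;

		spin_lock(&group->notification_lock);
		while (!fsnotify_notify_queue_is_empty(group)) {
			event = fsnotify_remove_first_event(group);
			/*
			 * Drop the lock: destroying the event may path_put()
			 * the last reference and call back into a filesystem,
			 * which must not happen under the queue lock.
			 */
			spin_unlock(&group->notification_lock);
			fsnotify_destroy_event(group, event);
			spin_lock(&group->notification_lock);
		}
		spin_unlock(&group->notification_lock);
	}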
[jack@suse.cz: fixed version] Link: http://lkml.kernel.org/r/1474031567-1831-1-git-send-email-jack@suse.cz Link: http://lkml.kernel.org/r/1473797711-14111-5-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Lino Sanfilippo Tested-by: Guenter Roeck Cc: Miklos Szeredi Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify_user.c | 27 ++++++++++++++------------- fs/notify/group.c | 6 +++--- fs/notify/inotify/inotify_user.c | 16 ++++++++-------- fs/notify/notification.c | 27 +++++++++++++++------------ include/linux/fsnotify_backend.h | 2 +- 5 files changed, 41 insertions(+), 37 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 46d135c4988f..80091a5dc8c0 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -49,12 +49,13 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly; * enough to fit in "count". Return an error pointer if the count * is not large enough. * - * Called with the group->notification_mutex held. + * Called with the group->notification_lock held. */ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, size_t count) { - BUG_ON(!mutex_is_locked(&group->notification_mutex)); + BUG_ON(IS_ENABLED(CONFIG_SMP) && + !spin_is_locked(&group->notification_lock)); pr_debug("%s: group=%p count=%zd\n", __func__, group, count); @@ -64,7 +65,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, if (FAN_EVENT_METADATA_LEN > count) return ERR_PTR(-EINVAL); - /* held the notification_mutex the whole time, so this is the + /* held the notification_lock the whole time, so this is the * same event we peeked above */ return fsnotify_remove_first_event(group); } @@ -244,10 +245,10 @@ static unsigned int fanotify_poll(struct file *file, poll_table *wait) int ret = 0; poll_wait(file, &group->notification_waitq, wait); - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); if (!fsnotify_notify_queue_is_empty(group)) ret = POLLIN | POLLRDNORM; - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); return ret; } @@ -268,9 +269,9 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, add_wait_queue(&group->notification_waitq, &wait); while (1) { - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); kevent = get_one_event(group, count); - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); if (IS_ERR(kevent)) { ret = PTR_ERR(kevent); @@ -387,17 +388,17 @@ static int fanotify_release(struct inode *ignored, struct file *file) * dequeue them and set the response. They will be freed once the * response is consumed and fanotify_get_response() returns. 
*/ - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); while (!fsnotify_notify_queue_is_empty(group)) { fsn_event = fsnotify_remove_first_event(group); if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) { - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); fsnotify_destroy_event(group, fsn_event); - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); } else FANOTIFY_PE(fsn_event)->response = FAN_ALLOW; } - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); /* Response for all permission events it set, wakeup waiters */ wake_up(&group->fanotify_data.access_waitq); @@ -423,10 +424,10 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar switch (cmd) { case FIONREAD: - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); list_for_each_entry(fsn_event, &group->notification_list, list) send_len += FAN_EVENT_METADATA_LEN; - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); ret = put_user(send_len, (int __user *) p); break; } diff --git a/fs/notify/group.c b/fs/notify/group.c index b47f7cfdcaa4..fbe3cbebec16 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -45,9 +45,9 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) */ void fsnotify_group_stop_queueing(struct fsnotify_group *group) { - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); group->shutdown = true; - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); } /* @@ -125,7 +125,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) atomic_set(&group->refcnt, 1); atomic_set(&group->num_marks, 0); - mutex_init(&group->notification_mutex); + spin_lock_init(&group->notification_lock); INIT_LIST_HEAD(&group->notification_list); init_waitqueue_head(&group->notification_waitq); group->max_events = UINT_MAX; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index b8d08d0d0a4d..69d1ea3d292a 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -115,10 +115,10 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait) int ret = 0; poll_wait(file, &group->notification_waitq, wait); - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); if (!fsnotify_notify_queue_is_empty(group)) ret = POLLIN | POLLRDNORM; - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); return ret; } @@ -138,7 +138,7 @@ static int round_event_name_len(struct fsnotify_event *fsn_event) * enough to fit in "count". Return an error pointer if * not large enough. * - * Called with the group->notification_mutex held. + * Called with the group->notification_lock held. 
*/ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, size_t count) @@ -157,7 +157,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, if (event_size > count) return ERR_PTR(-EINVAL); - /* held the notification_mutex the whole time, so this is the + /* held the notification_lock the whole time, so this is the * same event we peeked above */ fsnotify_remove_first_event(group); @@ -234,9 +234,9 @@ static ssize_t inotify_read(struct file *file, char __user *buf, add_wait_queue(&group->notification_waitq, &wait); while (1) { - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); kevent = get_one_event(group, count); - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); pr_debug("%s: group=%p kevent=%p\n", __func__, group, kevent); @@ -300,13 +300,13 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case FIONREAD: - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); list_for_each_entry(fsn_event, &group->notification_list, list) { send_len += sizeof(struct inotify_event); send_len += round_event_name_len(fsn_event); } - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); ret = put_user(send_len, (int __user *) p); break; } diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 7d563dea52a4..8a7a8cd041e8 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -63,7 +63,8 @@ EXPORT_SYMBOL_GPL(fsnotify_get_cookie); /* return true if the notify queue is empty, false otherwise */ bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) { - BUG_ON(!mutex_is_locked(&group->notification_mutex)); + BUG_ON(IS_ENABLED(CONFIG_SMP) && + !spin_is_locked(&group->notification_lock)); return list_empty(&group->notification_list) ? 
true : false; } @@ -95,10 +96,10 @@ int fsnotify_add_event(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); if (group->shutdown) { - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); return 2; } @@ -106,7 +107,7 @@ int fsnotify_add_event(struct fsnotify_group *group, ret = 2; /* Queue overflow event only if it isn't already queued */ if (!list_empty(&group->overflow_event->list)) { - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); return ret; } event = group->overflow_event; @@ -116,7 +117,7 @@ int fsnotify_add_event(struct fsnotify_group *group, if (!list_empty(list) && merge) { ret = merge(list, event); if (ret) { - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); return ret; } } @@ -124,7 +125,7 @@ int fsnotify_add_event(struct fsnotify_group *group, queue: group->q_len++; list_add_tail(&event->list, list); - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); wake_up(&group->notification_waitq); kill_fasync(&group->fsn_fa, SIGIO, POLL_IN); @@ -139,7 +140,8 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) { struct fsnotify_event *event; - BUG_ON(!mutex_is_locked(&group->notification_mutex)); + BUG_ON(IS_ENABLED(CONFIG_SMP) && + !spin_is_locked(&group->notification_lock)); pr_debug("%s: group=%p\n", __func__, group); @@ -161,7 +163,8 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) */ struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group) { - BUG_ON(!mutex_is_locked(&group->notification_mutex)); + BUG_ON(IS_ENABLED(CONFIG_SMP) && + !spin_is_locked(&group->notification_lock)); return list_first_entry(&group->notification_list, struct fsnotify_event, list); @@ -175,14 +178,14 @@ void fsnotify_flush_notify(struct fsnotify_group *group) { struct fsnotify_event *event; - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); while (!fsnotify_notify_queue_is_empty(group)) { event = fsnotify_remove_first_event(group); - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); fsnotify_destroy_event(group, event); - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); } - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); } /* diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 7268ed076be8..0713e873b1c9 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -135,7 +135,7 @@ struct fsnotify_group { const struct fsnotify_ops *ops; /* how this group handles things */ /* needed to send notification to userspace */ - struct mutex notification_mutex; /* protect the notification_list */ + spinlock_t notification_lock; /* protect the notification_list */ struct list_head notification_list; /* list of event_holder this group needs to send to userspace */ wait_queue_head_t notification_waitq; /* read() on the notification file blocks on this waitq */ unsigned int q_len; /* events on the queue */ From 073f65522aeb23e46fc8a809d69513132d3acc81 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 7 Oct 2016 16:56:55 -0700 Subject: [PATCH 003/127] fanotify: use notification_lock instead of access_lock Fanotify code has its own lock (access_lock) to protect a list of events waiting for a 
response from userspace. However this is somewhat awkward as the same list_head in the event is protected by notification_lock if it is part of the notification queue and by access_lock if it is part of the fanotify private queue which makes it difficult for any reliable checks in the generic code. So make fanotify use the same lock - notification_lock - for protecting its private event list. Link: http://lkml.kernel.org/r/1473797711-14111-6-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Lino Sanfilippo Cc: Miklos Szeredi Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify_user.c | 13 +++++-------- include/linux/fsnotify_backend.h | 1 - 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 80091a5dc8c0..189fab3ac4e6 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -148,7 +148,7 @@ static struct fanotify_perm_event_info *dequeue_event( { struct fanotify_perm_event_info *event, *return_e = NULL; - spin_lock(&group->fanotify_data.access_lock); + spin_lock(&group->notification_lock); list_for_each_entry(event, &group->fanotify_data.access_list, fae.fse.list) { if (event->fd != fd) @@ -158,7 +158,7 @@ static struct fanotify_perm_event_info *dequeue_event( return_e = event; break; } - spin_unlock(&group->fanotify_data.access_lock); + spin_unlock(&group->notification_lock); pr_debug("%s: found return_re=%p\n", __func__, return_e); @@ -310,10 +310,10 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, wake_up(&group->fanotify_data.access_waitq); break; } - spin_lock(&group->fanotify_data.access_lock); + spin_lock(&group->notification_lock); list_add_tail(&kevent->list, &group->fanotify_data.access_list); - spin_unlock(&group->fanotify_data.access_lock); + spin_unlock(&group->notification_lock); #endif } buf += ret; @@ -372,7 +372,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) * Process all permission events on access_list and notification queue * and simulate reply from userspace. */ - spin_lock(&group->fanotify_data.access_lock); + spin_lock(&group->notification_lock); list_for_each_entry_safe(event, next, &group->fanotify_data.access_list, fae.fse.list) { pr_debug("%s: found group=%p event=%p\n", __func__, group, @@ -381,14 +381,12 @@ static int fanotify_release(struct inode *ignored, struct file *file) list_del_init(&event->fae.fse.list); event->response = FAN_ALLOW; } - spin_unlock(&group->fanotify_data.access_lock); /* * Destroy all non-permission events. For permission events just * dequeue them and set the response. They will be freed once the * response is consumed and fanotify_get_response() returns. 
*/ - spin_lock(&group->notification_lock); while (!fsnotify_notify_queue_is_empty(group)) { fsn_event = fsnotify_remove_first_event(group); if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) { @@ -768,7 +766,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) event_f_flags |= O_LARGEFILE; group->fanotify_data.f_flags = event_f_flags; #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - spin_lock_init(&group->fanotify_data.access_lock); init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); #endif diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 0713e873b1c9..79467b239fcf 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -177,7 +177,6 @@ struct fsnotify_group { struct fanotify_group_private_data { #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS /* allows a group to block waiting for a userspace response */ - spinlock_t access_lock; struct list_head access_list; wait_queue_head_t access_waitq; #endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */ From 0b1b86527df4b1f398266c23e926dd788925bb69 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 7 Oct 2016 16:56:58 -0700 Subject: [PATCH 004/127] fanotify: fix possible false warning when freeing events When freeing permission events by fsnotify_destroy_event(), the warning WARN_ON(!list_empty(&event->list)); may falsely hit. This is because although fanotify_get_response() saw event->response set, there is nothing to make sure the current CPU also sees the removal of the event from the list. Add proper locking around the WARN_ON() to avoid the false warning. Link: http://lkml.kernel.org/r/1473797711-14111-7-git-send-email-jack@suse.cz Reported-by: Miklos Szeredi Signed-off-by: Jan Kara Reviewed-by: Lino Sanfilippo Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/notification.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 8a7a8cd041e8..1a8010e7a2a0 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -74,8 +74,17 @@ void fsnotify_destroy_event(struct fsnotify_group *group, /* Overflow events are per-group and we don't want to free them */ if (!event || event->mask == FS_Q_OVERFLOW) return; - /* If the event is still queued, we have a problem... */ - WARN_ON(!list_empty(&event->list)); + /* + * If the event is still queued, we have a problem... Do an unreliable + * lockless check first to avoid locking in the common case. The + * locking may be necessary for permission events which got removed + * from the list by a different CPU than the one freeing the event. + */ + if (!list_empty(&event->list)) { + spin_lock(&group->notification_lock); + WARN_ON(!list_empty(&event->list)); + spin_unlock(&group->notification_lock); + } group->ops->free_event(event); } From ed2726406c6a71f5da63719c0ba7d9e21dd9581c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 7 Oct 2016 16:57:01 -0700 Subject: [PATCH 005/127] fsnotify: clean up spinlock assertions Use assert_spin_locked() macro instead of hand-made BUG_ON statements. 
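For quick reference, the two forms of the assertion side by side (condensed from the hunks below; the IS_ENABLED(CONFIG_SMP) guard was needed because spin_is_locked() always reports "unlocked" on uniprocessor builds without spinlock debugging):

	/* open-coded check being removed; needs the SMP guard */
	BUG_ON(IS_ENABLED(CONFIG_SMP) &&
	       !spin_is_locked(&group->notification_lock));

	/* idiomatic replacement */
	assert_spin_locked(&group->notification_lock);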
Link: http://lkml.kernel.org/r/1474537439-18919-1-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Suggested-by: Heiner Kallweit Reviewed-by: Jeff Layton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify_user.c | 3 +-- fs/notify/notification.c | 9 +++------ 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 189fab3ac4e6..7ebfca6a1427 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -54,8 +54,7 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly; static struct fsnotify_event *get_one_event(struct fsnotify_group *group, size_t count) { - BUG_ON(IS_ENABLED(CONFIG_SMP) && - !spin_is_locked(&group->notification_lock)); + assert_spin_locked(&group->notification_lock); pr_debug("%s: group=%p count=%zd\n", __func__, group, count); diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 1a8010e7a2a0..66f85c651c52 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -63,8 +63,7 @@ EXPORT_SYMBOL_GPL(fsnotify_get_cookie); /* return true if the notify queue is empty, false otherwise */ bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) { - BUG_ON(IS_ENABLED(CONFIG_SMP) && - !spin_is_locked(&group->notification_lock)); + assert_spin_locked(&group->notification_lock); return list_empty(&group->notification_list) ? true : false; } @@ -149,8 +148,7 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) { struct fsnotify_event *event; - BUG_ON(IS_ENABLED(CONFIG_SMP) && - !spin_is_locked(&group->notification_lock)); + assert_spin_locked(&group->notification_lock); pr_debug("%s: group=%p\n", __func__, group); @@ -172,8 +170,7 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) */ struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group) { - BUG_ON(IS_ENABLED(CONFIG_SMP) && - !spin_is_locked(&group->notification_lock)); + assert_spin_locked(&group->notification_lock); return list_first_entry(&group->notification_list, struct fsnotify_event, list); From 3740dcdf8a77ae6a66e99350e9fbd8a6ce4d493a Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 7 Oct 2016 16:57:04 -0700 Subject: [PATCH 006/127] jiffies: add time comparison functions for 64 bit jiffies Though the time_before and time_after family of functions were nicely extended to support jiffies64, so that the interface would be consistent, it was forgotten to also extend the before/after jiffies functions to support jiffies64. This commit brings the interface to parity between jiffies and jiffies64, which is quite convenient. Link: http://lkml.kernel.org/r/20160929033319.12188-1-Jason@zx2c4.com Signed-off-by: Jason A. 
Donenfeld Cc: Thomas Gleixner Cc: John Stultz Signed-off-by: Linus Torvalds --- include/linux/jiffies.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 5fdc55312334..589d14e970ad 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -150,15 +150,19 @@ static inline u64 get_jiffies_64(void) /* time_is_before_jiffies(a) return true if a is before jiffies */ #define time_is_before_jiffies(a) time_after(jiffies, a) +#define time_is_before_jiffies64(a) time_after64(get_jiffies_64(), a) /* time_is_after_jiffies(a) return true if a is after jiffies */ #define time_is_after_jiffies(a) time_before(jiffies, a) +#define time_is_after_jiffies64(a) time_before64(get_jiffies_64(), a) /* time_is_before_eq_jiffies(a) return true if a is before or equal to jiffies*/ #define time_is_before_eq_jiffies(a) time_after_eq(jiffies, a) +#define time_is_before_eq_jiffies64(a) time_after_eq64(get_jiffies_64(), a) /* time_is_after_eq_jiffies(a) return true if a is after or equal to jiffies*/ #define time_is_after_eq_jiffies(a) time_before_eq(jiffies, a) +#define time_is_after_eq_jiffies64(a) time_before_eq64(get_jiffies_64(), a) /* * Have the 32 bit jiffies value wrap 5 minutes after boot From 0b41be07630ef4ef45e082fed530c633e6cdce88 Mon Sep 17 00:00:00 2001 From: Bhaktipriya Shridhar Date: Fri, 7 Oct 2016 16:57:07 -0700 Subject: [PATCH 007/127] fs/ocfs2/dlmfs: remove deprecated create_singlethread_workqueue() The workqueue "user_dlm_worker" queues a single work item &lockres->l_work per user_lock_res instance and so it doesn't require execution ordering. Hence, alloc_workqueue has been used to replace the deprecated create_singlethread_workqueue instance. The WQ_MEM_RECLAIM flag has been set to ensure forward progress under memory pressure. Since there are fixed number of work items, explicit concurrency limit is unnecessary here. Link: http://lkml.kernel.org/r/9748136d3a3b18138ad1d6ba708367aa1fe9f98c.1472590094.git.bhaktipriya96@gmail.com Signed-off-by: Bhaktipriya Shridhar Acked-by: Tejun Heo Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmfs/dlmfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index ef474cdd6404..354cdf9714aa 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -646,7 +646,7 @@ static int __init init_dlmfs_fs(void) } cleanup_inode = 1; - user_dlm_worker = create_singlethread_workqueue("user_dlm"); + user_dlm_worker = alloc_workqueue("user_dlm", WQ_MEM_RECLAIM, 0); if (!user_dlm_worker) { status = -ENOMEM; goto bail; From bf940776c051ad6226567d117f8102f1c09f0431 Mon Sep 17 00:00:00 2001 From: Bhaktipriya Shridhar Date: Fri, 7 Oct 2016 16:57:10 -0700 Subject: [PATCH 008/127] fs/ocfs2/cluster: remove deprecated create_singlethread_workqueue() The workqueue "o2net_wq" queues multiple work items viz &old_sc->sc_shutdown_work, &sc->sc_rx_work, &sc->sc_connect_work which require strict execution ordering. Hence, an ordered dedicated workqueue has been used. WQ_MEM_RECLAIM has been set to ensure forward progress under memory pressure. 
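The conversion pattern for the strictly ordered case, condensed from the hunk below (sketch only):

	/* before: deprecated helper */
	o2net_wq = create_singlethread_workqueue("o2net");

	/*
	 * after: an ordered workqueue runs work items one at a time in
	 * queueing order, and WQ_MEM_RECLAIM provides a rescuer thread so
	 * the queue keeps making progress under memory pressure.
	 */
	o2net_wq = alloc_ordered_workqueue("o2net", WQ_MEM_RECLAIM);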
Link: http://lkml.kernel.org/r/ddc12e5766c79ba26f8a00d98049107f8a1d4866.1472590094.git.bhaktipriya96@gmail.com Signed-off-by: Bhaktipriya Shridhar Acked-by: Tejun Heo Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 1d67fcbf7160..8abab16b4602 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -2104,7 +2104,7 @@ int o2net_start_listening(struct o2nm_node *node) BUG_ON(o2net_listen_sock != NULL); mlog(ML_KTHREAD, "starting o2net thread...\n"); - o2net_wq = create_singlethread_workqueue("o2net"); + o2net_wq = alloc_ordered_workqueue("o2net", WQ_MEM_RECLAIM); if (o2net_wq == NULL) { mlog(ML_ERROR, "unable to launch o2net thread\n"); return -ENOMEM; /* ? */ From 44be975691703cda8e23fe45296f78b287e40232 Mon Sep 17 00:00:00 2001 From: Bhaktipriya Shridhar Date: Fri, 7 Oct 2016 16:57:13 -0700 Subject: [PATCH 009/127] fs/ocfs2/super: remove deprecated create_singlethread_workqueue() The workqueue "ocfs2_wq" queues multiple work items viz &osb->la_enable_wq, &journal->j_recovery_work, &os->os_orphan_scan_work, &osb->osb_truncate_log_wq which require strict execution ordering. Hence, an ordered dedicated workqueue has been used. WQ_MEM_RECLAIM has been set to ensure forward progress under memory pressure because the workqueue is being used on a memory reclaim path. Link: http://lkml.kernel.org/r/66279de510a7f4cfc6e386d99b7e04b3f65fb11b.1472590094.git.bhaktipriya96@gmail.com Signed-off-by: Bhaktipriya Shridhar Acked-by: Tejun Heo Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 603b28d6f008..f56fe39fab04 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2329,7 +2329,7 @@ static int ocfs2_initialize_super(struct super_block *sb, } cleancache_init_shared_fs(sb); - osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); + osb->ocfs2_wq = alloc_ordered_workqueue("ocfs2_wq", WQ_MEM_RECLAIM); if (!osb->ocfs2_wq) { status = -ENOMEM; mlog_errno(status); From 055fdcff35e5e688f5002e7ab330469a58883ff7 Mon Sep 17 00:00:00 2001 From: Bhaktipriya Shridhar Date: Fri, 7 Oct 2016 16:57:17 -0700 Subject: [PATCH 010/127] fs/ocfs2/dlm: remove deprecated create_singlethread_workqueue() The workqueue "dlm_worker" queues a single work item &dlm->dispatched_work and thus it doesn't require execution ordering. Hence, alloc_workqueue has been used to replace the deprecated create_singlethread_workqueue instance. The WQ_MEM_RECLAIM flag has been set to ensure forward progress under memory pressure. Since there are fixed number of work items, explicit concurrency limit is unnecessary here. 
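When ordering is not required (a single fixed work item, as here and in the dlmfs patch earlier), the plain variant is used instead. A brief sketch, not an extra hunk; passing 0 for max_active selects the default concurrency limit:

	/* one fixed work item, no ordering requirement */
	dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM, 0);
	if (!dlm->dlm_worker)
		status = -ENOMEM;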
Link: http://lkml.kernel.org/r/2b5ad8d6688effe1a9ddb2bc2082d26fbbe00302.1472590094.git.bhaktipriya96@gmail.com Signed-off-by: Bhaktipriya Shridhar Acked-by: Tejun Heo Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmdomain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 533bd524e41e..733e4e79c8e2 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1904,7 +1904,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) } snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name); - dlm->dlm_worker = create_singlethread_workqueue(wq_name); + dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM, 0); if (!dlm->dlm_worker) { status = -ENOMEM; mlog_errno(status); From 48e509ece97e00b68e52d1d18e3e4b809c5b3991 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 7 Oct 2016 16:57:20 -0700 Subject: [PATCH 011/127] ocfs2: fix undefined struct variable in inode.h The extern struct variable ocfs2_inode_cache is not defined. It meant to use ocfs2_inode_cachep defined in super.c, I think. Fortunately it is not used anywhere now, so no impact actually. Clean it up to fix this mistake. Link: http://lkml.kernel.org/r/57E1E49D.8050503@huawei.com Signed-off-by: Joseph Qi Reviewed-by: Eric Ren Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/inode.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 50cc55047443..5af68fcdf9d3 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -123,8 +123,6 @@ static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) #define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL) #define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL) -extern struct kmem_cache *ocfs2_inode_cache; - extern const struct address_space_operations ocfs2_aops; extern const struct ocfs2_caching_operations ocfs2_inode_caching_ops; From 7c5f64f84483bd13886348edda8b3e7b799a7fdb Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 7 Oct 2016 16:57:23 -0700 Subject: [PATCH 012/127] mm: oom: deduplicate victim selection code for memcg and global oom When selecting an oom victim, we use the same heuristic for both memory cgroup and global oom. The only difference is the scope of tasks to select the victim from. So we could just export an iterator over all memcg tasks and keep all oom related logic in oom_kill.c, but instead we duplicate pieces of it in memcontrol.c reusing some initially private functions of oom_kill.c in order to not duplicate all of it. That looks ugly and error prone, because any modification of select_bad_process should also be propagated to mem_cgroup_out_of_memory. Let's rework this as follows: keep all oom heuristic related code private to oom_kill.c and make oom_kill.c use exported memcg functions when it's really necessary (like in case of iterating over memcg tasks). 
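The resulting split can be summarised with the following sketch, condensed from the oom_kill.c hunks below (not an additional change): the scoring policy lives in one callback, and the memcg case only differs in which iterator feeds tasks to it.

	static int oom_evaluate_task(struct task_struct *task, void *arg)
	{
		struct oom_control *oc = arg;

		/* ... badness heuristics; may update oc->chosen and oc->chosen_points ... */
		return 0;	/* non-zero aborts the iteration */
	}

	static void select_bad_process(struct oom_control *oc)
	{
		if (is_memcg_oom(oc))
			mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
		else {
			struct task_struct *p;

			rcu_read_lock();
			for_each_process(p)
				if (oom_evaluate_task(p, oc))
					break;
			rcu_read_unlock();
		}
	}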
Link: http://lkml.kernel.org/r/1470056933-7505-1-git-send-email-vdavydov@virtuozzo.com Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Tetsuo Handa Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 15 +++ include/linux/oom.h | 43 +------- mm/memcontrol.c | 114 ++++++++------------- mm/oom_kill.c | 200 ++++++++++++++++++++----------------- 4 files changed, 167 insertions(+), 205 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5d8ca6e02e39..0710143723bc 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -366,6 +366,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, struct mem_cgroup *, struct mem_cgroup_reclaim_cookie *); void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); +int mem_cgroup_scan_tasks(struct mem_cgroup *, + int (*)(struct task_struct *, void *), void *); static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { @@ -446,6 +448,8 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) void mem_cgroup_handle_over_high(void); +unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg); + void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p); @@ -639,6 +643,12 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root, { } +static inline int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, + int (*fn)(struct task_struct *, void *), void *arg) +{ + return 0; +} + static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { return 0; @@ -669,6 +679,11 @@ mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, return 0; } +static inline unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) +{ + return 0; +} + static inline void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { diff --git a/include/linux/oom.h b/include/linux/oom.h index 5bc0457ee3a8..17946e5121b6 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -34,23 +34,11 @@ struct oom_control { * for display purposes. 
*/ const int order; -}; -/* - * Types of limitations to the nodes from which allocations may occur - */ -enum oom_constraint { - CONSTRAINT_NONE, - CONSTRAINT_CPUSET, - CONSTRAINT_MEMORY_POLICY, - CONSTRAINT_MEMCG, -}; - -enum oom_scan_t { - OOM_SCAN_OK, /* scan thread and find its badness */ - OOM_SCAN_CONTINUE, /* do not consider thread for oom kill */ - OOM_SCAN_ABORT, /* abort the iteration and return */ - OOM_SCAN_SELECT, /* always select this thread first */ + /* Used by oom implementation, do not set */ + unsigned long totalpages; + struct task_struct *chosen; + unsigned long chosen_points; }; extern struct mutex oom_lock; @@ -70,30 +58,10 @@ static inline bool oom_task_origin(const struct task_struct *p) return p->signal->oom_flag_origin; } -extern void mark_oom_victim(struct task_struct *tsk); - -#ifdef CONFIG_MMU -extern void wake_oom_reaper(struct task_struct *tsk); -#else -static inline void wake_oom_reaper(struct task_struct *tsk) -{ -} -#endif - extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); -extern void oom_kill_process(struct oom_control *oc, struct task_struct *p, - unsigned int points, unsigned long totalpages, - const char *message); - -extern void check_panic_on_oom(struct oom_control *oc, - enum oom_constraint constraint); - -extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, - struct task_struct *task); - extern bool out_of_memory(struct oom_control *oc); extern void exit_oom_victim(struct task_struct *tsk); @@ -101,14 +69,11 @@ extern void exit_oom_victim(struct task_struct *tsk); extern int register_oom_notifier(struct notifier_block *nb); extern int unregister_oom_notifier(struct notifier_block *nb); -extern bool oom_killer_disabled; extern bool oom_killer_disable(void); extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); -bool task_will_free_mem(struct task_struct *task); - /* sysctls */ extern int sysctl_oom_dump_tasks; extern int sysctl_oom_kill_allocating_task; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4be518d4e68a..48747ef5b88f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -920,6 +920,43 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) iter != NULL; \ iter = mem_cgroup_iter(NULL, iter, NULL)) +/** + * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy + * @memcg: hierarchy root + * @fn: function to call for each task + * @arg: argument passed to @fn + * + * This function iterates over tasks attached to @memcg or to any of its + * descendants and calls @fn for each task. If @fn returns a non-zero + * value, the function breaks the iteration loop and returns the value. + * Otherwise, it will iterate over all tasks and return 0. + * + * This function must not be called for the root memory cgroup. 
+ */ +int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, + int (*fn)(struct task_struct *, void *), void *arg) +{ + struct mem_cgroup *iter; + int ret = 0; + + BUG_ON(memcg == root_mem_cgroup); + + for_each_mem_cgroup_tree(iter, memcg) { + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&iter->css, &it); + while (!ret && (task = css_task_iter_next(&it))) + ret = fn(task, arg); + css_task_iter_end(&it); + if (ret) { + mem_cgroup_iter_break(memcg, iter); + break; + } + } + return ret; +} + /** * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page * @page: the page @@ -1178,7 +1215,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg) /* * Return the memory (and swap, if configured) limit for a memcg. */ -static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) +unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) { unsigned long limit; @@ -1205,79 +1242,12 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, .gfp_mask = gfp_mask, .order = order, }; - struct mem_cgroup *iter; - unsigned long chosen_points = 0; - unsigned long totalpages; - unsigned int points = 0; - struct task_struct *chosen = NULL; + bool ret; mutex_lock(&oom_lock); - - /* - * If current has a pending SIGKILL or is exiting, then automatically - * select it. The goal is to allow it to allocate so that it may - * quickly exit and free its memory. - */ - if (task_will_free_mem(current)) { - mark_oom_victim(current); - wake_oom_reaper(current); - goto unlock; - } - - check_panic_on_oom(&oc, CONSTRAINT_MEMCG); - totalpages = mem_cgroup_get_limit(memcg) ? : 1; - for_each_mem_cgroup_tree(iter, memcg) { - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&iter->css, &it); - while ((task = css_task_iter_next(&it))) { - switch (oom_scan_process_thread(&oc, task)) { - case OOM_SCAN_SELECT: - if (chosen) - put_task_struct(chosen); - chosen = task; - chosen_points = ULONG_MAX; - get_task_struct(chosen); - /* fall through */ - case OOM_SCAN_CONTINUE: - continue; - case OOM_SCAN_ABORT: - css_task_iter_end(&it); - mem_cgroup_iter_break(memcg, iter); - if (chosen) - put_task_struct(chosen); - /* Set a dummy value to return "true". 
*/ - chosen = (void *) 1; - goto unlock; - case OOM_SCAN_OK: - break; - }; - points = oom_badness(task, memcg, NULL, totalpages); - if (!points || points < chosen_points) - continue; - /* Prefer thread group leaders for display purposes */ - if (points == chosen_points && - thread_group_leader(chosen)) - continue; - - if (chosen) - put_task_struct(chosen); - chosen = task; - chosen_points = points; - get_task_struct(chosen); - } - css_task_iter_end(&it); - } - - if (chosen) { - points = chosen_points * 1000 / totalpages; - oom_kill_process(&oc, chosen, points, totalpages, - "Memory cgroup out of memory"); - } -unlock: + ret = out_of_memory(&oc); mutex_unlock(&oom_lock); - return chosen; + return ret; } #if MAX_NUMNODES > 1 @@ -1600,7 +1570,7 @@ bool mem_cgroup_oom_synchronize(bool handle) if (!memcg) return false; - if (!handle || oom_killer_disabled) + if (!handle) goto cleanup; owait.memcg = memcg; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d53a9aa00977..ef175518f05f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -132,6 +132,11 @@ static inline bool is_sysrq_oom(struct oom_control *oc) return oc->order == -1; } +static inline bool is_memcg_oom(struct oom_control *oc) +{ + return oc->memcg != NULL; +} + /* return true if the task is not adequate as candidate victim task. */ static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask) @@ -213,12 +218,17 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, return points > 0 ? points : 1; } +enum oom_constraint { + CONSTRAINT_NONE, + CONSTRAINT_CPUSET, + CONSTRAINT_MEMORY_POLICY, + CONSTRAINT_MEMCG, +}; + /* * Determine the type of allocation constraint. */ -#ifdef CONFIG_NUMA -static enum oom_constraint constrained_alloc(struct oom_control *oc, - unsigned long *totalpages) +static enum oom_constraint constrained_alloc(struct oom_control *oc) { struct zone *zone; struct zoneref *z; @@ -226,8 +236,16 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc, bool cpuset_limited = false; int nid; + if (is_memcg_oom(oc)) { + oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1; + return CONSTRAINT_MEMCG; + } + /* Default to all available memory */ - *totalpages = totalram_pages + total_swap_pages; + oc->totalpages = totalram_pages + total_swap_pages; + + if (!IS_ENABLED(CONFIG_NUMA)) + return CONSTRAINT_NONE; if (!oc->zonelist) return CONSTRAINT_NONE; @@ -246,9 +264,9 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc, */ if (oc->nodemask && !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) { - *totalpages = total_swap_pages; + oc->totalpages = total_swap_pages; for_each_node_mask(nid, *oc->nodemask) - *totalpages += node_spanned_pages(nid); + oc->totalpages += node_spanned_pages(nid); return CONSTRAINT_MEMORY_POLICY; } @@ -259,27 +277,21 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc, cpuset_limited = true; if (cpuset_limited) { - *totalpages = total_swap_pages; + oc->totalpages = total_swap_pages; for_each_node_mask(nid, cpuset_current_mems_allowed) - *totalpages += node_spanned_pages(nid); + oc->totalpages += node_spanned_pages(nid); return CONSTRAINT_CPUSET; } return CONSTRAINT_NONE; } -#else -static enum oom_constraint constrained_alloc(struct oom_control *oc, - unsigned long *totalpages) -{ - *totalpages = totalram_pages + total_swap_pages; - return CONSTRAINT_NONE; -} -#endif -enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, - struct task_struct *task) +static int 
oom_evaluate_task(struct task_struct *task, void *arg) { + struct oom_control *oc = arg; + unsigned long points; + if (oom_unkillable_task(task, NULL, oc->nodemask)) - return OOM_SCAN_CONTINUE; + goto next; /* * This task already has access to memory reserves and is being killed. @@ -289,68 +301,67 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, */ if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) { struct task_struct *p = find_lock_task_mm(task); - enum oom_scan_t ret = OOM_SCAN_ABORT; + bool reaped = false; if (p) { - if (test_bit(MMF_OOM_REAPED, &p->mm->flags)) - ret = OOM_SCAN_CONTINUE; + reaped = test_bit(MMF_OOM_REAPED, &p->mm->flags); task_unlock(p); } - - return ret; + if (reaped) + goto next; + goto abort; } /* * If task is allocating a lot of memory and has been marked to be * killed first if it triggers an oom, then select it. */ - if (oom_task_origin(task)) - return OOM_SCAN_SELECT; + if (oom_task_origin(task)) { + points = ULONG_MAX; + goto select; + } - return OOM_SCAN_OK; + points = oom_badness(task, NULL, oc->nodemask, oc->totalpages); + if (!points || points < oc->chosen_points) + goto next; + + /* Prefer thread group leaders for display purposes */ + if (points == oc->chosen_points && thread_group_leader(oc->chosen)) + goto next; +select: + if (oc->chosen) + put_task_struct(oc->chosen); + get_task_struct(task); + oc->chosen = task; + oc->chosen_points = points; +next: + return 0; +abort: + if (oc->chosen) + put_task_struct(oc->chosen); + oc->chosen = (void *)-1UL; + return 1; } /* - * Simple selection loop. We chose the process with the highest - * number of 'points'. Returns -1 on scan abort. + * Simple selection loop. We choose the process with the highest number of + * 'points'. In case scan was aborted, oc->chosen is set to -1. 
*/ -static struct task_struct *select_bad_process(struct oom_control *oc, - unsigned int *ppoints, unsigned long totalpages) +static void select_bad_process(struct oom_control *oc) { - struct task_struct *p; - struct task_struct *chosen = NULL; - unsigned long chosen_points = 0; + if (is_memcg_oom(oc)) + mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc); + else { + struct task_struct *p; - rcu_read_lock(); - for_each_process(p) { - unsigned int points; - - switch (oom_scan_process_thread(oc, p)) { - case OOM_SCAN_SELECT: - chosen = p; - chosen_points = ULONG_MAX; - /* fall through */ - case OOM_SCAN_CONTINUE: - continue; - case OOM_SCAN_ABORT: - rcu_read_unlock(); - return (struct task_struct *)(-1UL); - case OOM_SCAN_OK: - break; - }; - points = oom_badness(p, NULL, oc->nodemask, totalpages); - if (!points || points < chosen_points) - continue; - - chosen = p; - chosen_points = points; + rcu_read_lock(); + for_each_process(p) + if (oom_evaluate_task(p, oc)) + break; + rcu_read_unlock(); } - if (chosen) - get_task_struct(chosen); - rcu_read_unlock(); - *ppoints = chosen_points * 1000 / totalpages; - return chosen; + oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages; } /** @@ -419,7 +430,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) static atomic_t oom_victims = ATOMIC_INIT(0); static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); -bool oom_killer_disabled __read_mostly; +static bool oom_killer_disabled __read_mostly; #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -627,7 +638,7 @@ static int oom_reaper(void *unused) return 0; } -void wake_oom_reaper(struct task_struct *tsk) +static void wake_oom_reaper(struct task_struct *tsk) { if (!oom_reaper_th) return; @@ -656,7 +667,11 @@ static int __init oom_init(void) return 0; } subsys_initcall(oom_init) -#endif +#else +static inline void wake_oom_reaper(struct task_struct *tsk) +{ +} +#endif /* CONFIG_MMU */ /** * mark_oom_victim - mark the given task as OOM victim @@ -665,7 +680,7 @@ subsys_initcall(oom_init) * Has to be called with oom_lock held and never after * oom has been disabled already. */ -void mark_oom_victim(struct task_struct *tsk) +static void mark_oom_victim(struct task_struct *tsk) { WARN_ON(oom_killer_disabled); /* OOM killer might race with memcg OOM */ @@ -760,7 +775,7 @@ static inline bool __task_will_free_mem(struct task_struct *task) * Caller has to make sure that task->mm is stable (hold task_lock or * it operates on the current). */ -bool task_will_free_mem(struct task_struct *task) +static bool task_will_free_mem(struct task_struct *task) { struct mm_struct *mm = task->mm; struct task_struct *p; @@ -806,14 +821,10 @@ bool task_will_free_mem(struct task_struct *task) return ret; } -/* - * Must be called while holding a reference to p, which will be released upon - * returning. 
- */ -void oom_kill_process(struct oom_control *oc, struct task_struct *p, - unsigned int points, unsigned long totalpages, - const char *message) +static void oom_kill_process(struct oom_control *oc, const char *message) { + struct task_struct *p = oc->chosen; + unsigned int points = oc->chosen_points; struct task_struct *victim = p; struct task_struct *child; struct task_struct *t; @@ -860,7 +871,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, * oom_badness() returns 0 if the thread is unkillable */ child_points = oom_badness(child, - oc->memcg, oc->nodemask, totalpages); + oc->memcg, oc->nodemask, oc->totalpages); if (child_points > victim_points) { put_task_struct(victim); victim = child; @@ -942,7 +953,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, /* * Determines whether the kernel must panic because of the panic_on_oom sysctl. */ -void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint) +static void check_panic_on_oom(struct oom_control *oc, + enum oom_constraint constraint) { if (likely(!sysctl_panic_on_oom)) return; @@ -988,19 +1000,18 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier); */ bool out_of_memory(struct oom_control *oc) { - struct task_struct *p; - unsigned long totalpages; unsigned long freed = 0; - unsigned int uninitialized_var(points); enum oom_constraint constraint = CONSTRAINT_NONE; if (oom_killer_disabled) return false; - blocking_notifier_call_chain(&oom_notify_list, 0, &freed); - if (freed > 0) - /* Got some memory back in the last second. */ - return true; + if (!is_memcg_oom(oc)) { + blocking_notifier_call_chain(&oom_notify_list, 0, &freed); + if (freed > 0) + /* Got some memory back in the last second. */ + return true; + } /* * If current has a pending SIGKILL or is exiting, then automatically @@ -1024,37 +1035,38 @@ bool out_of_memory(struct oom_control *oc) /* * Check if there were limitations on the allocation (only relevant for - * NUMA) that may require different handling. + * NUMA and memcg) that may require different handling. */ - constraint = constrained_alloc(oc, &totalpages); + constraint = constrained_alloc(oc); if (constraint != CONSTRAINT_MEMORY_POLICY) oc->nodemask = NULL; check_panic_on_oom(oc, constraint); - if (sysctl_oom_kill_allocating_task && current->mm && - !oom_unkillable_task(current, NULL, oc->nodemask) && + if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task && + current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { get_task_struct(current); - oom_kill_process(oc, current, 0, totalpages, - "Out of memory (oom_kill_allocating_task)"); + oc->chosen = current; + oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)"); return true; } - p = select_bad_process(oc, &points, totalpages); + select_bad_process(oc); /* Found nothing?!?! Either we hang forever, or we panic. */ - if (!p && !is_sysrq_oom(oc)) { + if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) { dump_header(oc, NULL); panic("Out of memory and no killable processes...\n"); } - if (p && p != (void *)-1UL) { - oom_kill_process(oc, p, points, totalpages, "Out of memory"); + if (oc->chosen && oc->chosen != (void *)-1UL) { + oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" : + "Memory cgroup out of memory"); /* * Give the killed process a good chance to exit before trying * to allocate memory again. 
*/ schedule_timeout_killable(1); } - return true; + return !!oc->chosen; } /* From 252e5c6e2e5b4557599ef86ea5d02b0395e9056c Mon Sep 17 00:00:00 2001 From: zijun_hu Date: Fri, 7 Oct 2016 16:57:26 -0700 Subject: [PATCH 013/127] mm/vmalloc.c: fix align value calculation error It causes double align requirement for __get_vm_area_node() if parameter size is power of 2 and VM_IOREMAP is set in parameter flags, for example size=0x10000 -> fls_long(0x10000)=17 -> align=0x20000 get_count_order_long() is implemented and can be used instead of fls_long() for fixing the bug, for example size=0x10000 -> get_count_order_long(0x10000)=16 -> align=0x10000 [akpm@linux-foundation.org: s/get_order_long()/get_count_order_long()/] [zijun_hu@zoho.com: fixes] Link: http://lkml.kernel.org/r/57AABC8B.1040409@zoho.com [akpm@linux-foundation.org: locate get_count_order_long() next to get_count_order()] [akpm@linux-foundation.org: move get_count_order[_long] definitions to pick up fls_long()] [zijun_hu@htc.com: move out get_count_order[_long]() from __KERNEL__ scope] Link: http://lkml.kernel.org/r/57B2C4CE.80303@zoho.com Link: http://lkml.kernel.org/r/fc045ecf-20fa-0722-b3ac-9a6140488fad@zoho.com Signed-off-by: zijun_hu Cc: Tejun Heo Cc: Johannes Weiner Cc: Minchan Kim Cc: David Rientjes Signed-off-by: zijun_hu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 36 ++++++++++++++++++++++++++---------- mm/vmalloc.c | 8 ++++---- 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 299e76b59fe9..a83c822c35c2 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -65,16 +65,6 @@ static inline int get_bitmask_order(unsigned int count) return order; /* We could be slightly more clever with -1 here... */ } -static inline int get_count_order(unsigned int count) -{ - int order; - - order = fls(count) - 1; - if (count & (count - 1)) - order++; - return order; -} - static __always_inline unsigned long hweight_long(unsigned long w) { return sizeof(w) == 4 ? 
hweight32(w) : hweight64(w); @@ -191,6 +181,32 @@ static inline unsigned fls_long(unsigned long l) return fls64(l); } +static inline int get_count_order(unsigned int count) +{ + int order; + + order = fls(count) - 1; + if (count & (count - 1)) + order++; + return order; +} + +/** + * get_count_order_long - get order after rounding @l up to power of 2 + * @l: parameter + * + * it is same as get_count_order() but with long type parameter + */ +static inline int get_count_order_long(unsigned long l) +{ + if (l == 0UL) + return -1; + else if (l & (l - 1UL)) + return (int)fls_long(l); + else + return (int)fls_long(l) - 1; +} + /** * __ffs64 - find first set bit in a 64 bit word * @word: The 64 bit word diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 91f44e78c516..80660a0f989b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1359,14 +1359,14 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, struct vm_struct *area; BUG_ON(in_interrupt()); - if (flags & VM_IOREMAP) - align = 1ul << clamp_t(int, fls_long(size), - PAGE_SHIFT, IOREMAP_MAX_ORDER); - size = PAGE_ALIGN(size); if (unlikely(!size)) return NULL; + if (flags & VM_IOREMAP) + align = 1ul << clamp_t(int, get_count_order_long(size), + PAGE_SHIFT, IOREMAP_MAX_ORDER); + area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!area)) return NULL; From 58fa2a5512d9f224775fb01433f195e639953c5f Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 7 Oct 2016 16:57:29 -0700 Subject: [PATCH 014/127] mm: memcontrol: add sanity checks for memcg->id.ref on get/put Link: http://lkml.kernel.org/r/1c5ddb1c171dbdfc3262252769d6138a29b35b70.1470219853.git.vdavydov@virtuozzo.com Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 48747ef5b88f..5579e762b1ce 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4062,11 +4062,13 @@ static DEFINE_IDR(mem_cgroup_idr); static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) { + VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0); atomic_add(n, &memcg->id.ref); } static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { + VM_BUG_ON(atomic_read(&memcg->id.ref) < n); if (atomic_sub_and_test(n, &memcg->id.ref)) { idr_remove(&mem_cgroup_idr, memcg->id.id); memcg->id.id = 0; @@ -4255,8 +4257,10 @@ fail: static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + /* Online state pins memcg ID, memcg ID pins CSS */ - mem_cgroup_id_get(mem_cgroup_from_css(css)); + atomic_set(&memcg->id.ref, 1); css_get(css); return 0; } From 5870c2e1d78b043b69de3199469c056ca3b05102 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:57:32 -0700 Subject: [PATCH 015/127] mm/oom_kill.c: fix task_will_free_mem() comment Attempt to demystify the task_will_free_mem() loop. 
Cc: Tetsuo Handa
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/oom_kill.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ef175518f05f..463cdd22d4e0 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -803,8 +803,9 @@ static bool task_will_free_mem(struct task_struct *task)
 		return true;
 
 	/*
-	 * This is really pessimistic but we do not have any reliable way
-	 * to check that external processes share with our mm
+	 * Make sure that all tasks which share the mm with the given tasks
+	 * are dying as well to make sure that a) nobody pins its mm and
+	 * b) the task is also reapable by the oom reaper.
 	 */
 	rcu_read_lock();
 	for_each_process(p) {

From 06ed29989f39f5129d4f76f4a2d7ce2efa46a6a1 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka
Date: Fri, 7 Oct 2016 16:57:35 -0700
Subject: [PATCH 016/127] mm, compaction: make whole_zone flag ignore cached scanner positions

Patch series "make direct compaction more deterministic".

This is mostly a followup to Michal's oom detection rework, which
highlighted the need for direct compaction to provide better feedback in
reclaim/compaction loop, so that it can reliably recognize when compaction
cannot make further progress, and allocation should invoke OOM killer or
fail.  We've discussed this at LSF/MM [1] where I proposed expanding the
async/sync migration mode used in compaction to more general "priorities".
This patchset adds one new priority that just overrides all the heuristics
and makes compaction fully scan all zones.  I don't currently think that
we need more fine-grained priorities, but we'll see.  Other than that
there's some smaller fixes and cleanups, mainly related to the
THP-specific hacks.

I've tested this with stress-highalloc in GFP_KERNEL order-4 and THP-like
order-9 scenarios.  There's some improvement for compaction stats for the
order-4, which is likely due to the better watermarks handling.  In the
previous version I reported mostly noise wrt compaction stats, and
decreased direct reclaim - now the reclaim is without difference.  I
believe this is due to the less aggressive compaction priority increase
in patch 6.

"before" is a mmotm tree prior to 4.7 release plus the first part of the
series that was sent and merged separately.

                                    before       after
order-4:
Compaction stalls                    27216       30759
Compaction success                   19598       25475
Compaction failures                   7617        5283
Page migrate success                370510      464919
Page migrate failure                 25712       27987
Compaction pages isolated           849601     1041581
Compaction migrate scanned       143146541   101084990
Compaction free scanned          208355124   144863510
Compaction cost                       1403        1210

order-9:
Compaction stalls                     7311        7401
Compaction success                    1634        1683
Compaction failures                   5677        5718
Page migrate success                194657      183988
Page migrate failure                  4753        4170
Compaction pages isolated           498790      456130
Compaction migrate scanned          565371      524174
Compaction free scanned            4230296     4250744
Compaction cost                        215         203

[1] https://lwn.net/Articles/684611/

This patch (of 11):

A recent patch has added whole_zone flag that compaction sets when
scanning starts from the zone boundary, in order to report that zone has
been fully scanned in one attempt.  For allocations that want to try
really hard or cannot fail, we will want to introduce a mode where
scanning whole zone is guaranteed regardless of the cached positions.

This patch reuses the whole_zone flag in a way that if it's already passed
true to compaction, the cached scanner positions are ignored.  Employing
this flag during reclaim/compaction loop will be done in the next patch.
This patch however converts compaction invoked from userspace via procfs to use this flag. Before this patch, the cached positions were first reset to zone boundaries and then read back from struct zone, so there was a window where a parallel compaction could replace the reset values, making the manual compaction less effective. Using the flag instead of performing reset is more robust. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20160810091226.6709-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Tested-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 43 +++++++++++++++++++++---------------------- mm/internal.h | 2 +- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 9affb2908304..c684ca141e4b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1492,23 +1492,29 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro /* * Setup to move all movable pages to the end of the zone. Used cached - * information on where the scanners should start but check that it - * is initialised by ensuring the values are within zone boundaries. + * information on where the scanners should start (unless we explicitly + * want to compact the whole zone), but check that it is initialised + * by ensuring the values are within zone boundaries. */ - cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; - cc->free_pfn = zone->compact_cached_free_pfn; - if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { - cc->free_pfn = pageblock_start_pfn(end_pfn - 1); - zone->compact_cached_free_pfn = cc->free_pfn; - } - if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { + if (cc->whole_zone) { cc->migrate_pfn = start_pfn; - zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; - zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; - } + cc->free_pfn = pageblock_start_pfn(end_pfn - 1); + } else { + cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; + cc->free_pfn = zone->compact_cached_free_pfn; + if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { + cc->free_pfn = pageblock_start_pfn(end_pfn - 1); + zone->compact_cached_free_pfn = cc->free_pfn; + } + if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { + cc->migrate_pfn = start_pfn; + zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; + zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; + } - if (cc->migrate_pfn == start_pfn) - cc->whole_zone = true; + if (cc->migrate_pfn == start_pfn) + cc->whole_zone = true; + } cc->last_migrated_pfn = 0; @@ -1747,14 +1753,6 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) INIT_LIST_HEAD(&cc->freepages); INIT_LIST_HEAD(&cc->migratepages); - /* - * When called via /proc/sys/vm/compact_memory - * this makes sure we compact the whole zone regardless of - * cached scanner positions. 
- */ - if (is_via_compact_memory(cc->order)) - __reset_isolation_suitable(zone); - if (is_via_compact_memory(cc->order) || !compaction_deferred(zone, cc->order)) compact_zone(zone, cc); @@ -1790,6 +1788,7 @@ static void compact_node(int nid) .order = -1, .mode = MIGRATE_SYNC, .ignore_skip_hint = true, + .whole_zone = true, }; __compact_pgdat(NODE_DATA(nid), &cc); diff --git a/mm/internal.h b/mm/internal.h index 1501304f87a4..5214bf8e3171 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -179,7 +179,7 @@ struct compact_control { enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool direct_compaction; /* False from kcompactd or /proc/... */ - bool whole_zone; /* Whole zone has been scanned */ + bool whole_zone; /* Whole zone should/has been scanned */ int order; /* order a direct compactor needs */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ From 791cae9620e35d18df2cedf2bd444920c3ecf04a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 16:57:38 -0700 Subject: [PATCH 017/127] mm, compaction: cleanup unused functions Since kswapd compaction moved to kcompactd, compact_pgdat() is not called anymore, so we remove it. The only caller of __compact_pgdat() is compact_node(), so we merge them and remove code that was only reachable from kswapd. Link: http://lkml.kernel.org/r/20160810091226.6709-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Tested-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 5 --- mm/compaction.c | 68 ++++++++++++-------------------------- 2 files changed, 21 insertions(+), 52 deletions(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index d4e106b5dc27..1bb58581301c 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -70,7 +70,6 @@ extern int fragmentation_index(struct zone *zone, unsigned int order); extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum compact_priority prio); -extern void compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); extern enum compact_result compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int classzone_idx); @@ -154,10 +153,6 @@ extern void kcompactd_stop(int nid); extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx); #else -static inline void compact_pgdat(pg_data_t *pgdat, int order) -{ -} - static inline void reset_isolation_suitable(pg_data_t *pgdat) { } diff --git a/mm/compaction.c b/mm/compaction.c index c684ca141e4b..8e32778fba5b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1736,54 +1736,11 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, /* Compact all zones within a node */ -static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) -{ - int zoneid; - struct zone *zone; - - for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { - - zone = &pgdat->node_zones[zoneid]; - if (!populated_zone(zone)) - continue; - - cc->nr_freepages = 0; - cc->nr_migratepages = 0; - cc->zone = zone; - INIT_LIST_HEAD(&cc->freepages); - INIT_LIST_HEAD(&cc->migratepages); - - if (is_via_compact_memory(cc->order) || - !compaction_deferred(zone, cc->order)) 
- compact_zone(zone, cc); - - VM_BUG_ON(!list_empty(&cc->freepages)); - VM_BUG_ON(!list_empty(&cc->migratepages)); - - if (is_via_compact_memory(cc->order)) - continue; - - if (zone_watermark_ok(zone, cc->order, - low_wmark_pages(zone), 0, 0)) - compaction_defer_reset(zone, cc->order, false); - } -} - -void compact_pgdat(pg_data_t *pgdat, int order) -{ - struct compact_control cc = { - .order = order, - .mode = MIGRATE_ASYNC, - }; - - if (!order) - return; - - __compact_pgdat(pgdat, &cc); -} - static void compact_node(int nid) { + pg_data_t *pgdat = NODE_DATA(nid); + int zoneid; + struct zone *zone; struct compact_control cc = { .order = -1, .mode = MIGRATE_SYNC, @@ -1791,7 +1748,24 @@ static void compact_node(int nid) .whole_zone = true, }; - __compact_pgdat(NODE_DATA(nid), &cc); + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + cc.nr_freepages = 0; + cc.nr_migratepages = 0; + cc.zone = zone; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); + + compact_zone(zone, &cc); + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + } } /* Compact all nodes in the system */ From cf378319d335663b6722e74db0211b8af55049d5 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 16:57:41 -0700 Subject: [PATCH 018/127] mm, compaction: rename COMPACT_PARTIAL to COMPACT_SUCCESS COMPACT_PARTIAL has historically meant that compaction returned after doing some work without fully compacting a zone. It however didn't distinguish if compaction terminated because it succeeded in creating the requested high-order page. This has changed recently and now we only return COMPACT_PARTIAL when compaction thinks it succeeded, or the high-order watermark check in compaction_suitable() passes and no compaction needs to be done. So at this point we can make the return value clearer by renaming it to COMPACT_SUCCESS. The next patch will remove some redundant tests for success where compaction just returned COMPACT_SUCCESS. Link: http://lkml.kernel.org/r/20160810091226.6709-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Tested-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 8 ++++---- include/trace/events/compaction.h | 2 +- mm/compaction.c | 12 ++++++------ mm/vmscan.c | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 1bb58581301c..e88c037afe47 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -49,10 +49,10 @@ enum compact_result { COMPACT_CONTENDED, /* - * direct compaction partially compacted a zone and there might be - * suitable pages + * direct compaction terminated after concluding that the allocation + * should now succeed */ - COMPACT_PARTIAL, + COMPACT_SUCCESS, }; struct alloc_context; /* in mm/internal.h */ @@ -88,7 +88,7 @@ static inline bool compaction_made_progress(enum compact_result result) * that the compaction successfully isolated and migrated some * pageblocks. 
*/ - if (result == COMPACT_PARTIAL) + if (result == COMPACT_SUCCESS) return true; return false; diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index c2ba402ab256..cbdb90b6b308 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -13,7 +13,7 @@ EM( COMPACT_SKIPPED, "skipped") \ EM( COMPACT_DEFERRED, "deferred") \ EM( COMPACT_CONTINUE, "continue") \ - EM( COMPACT_PARTIAL, "partial") \ + EM( COMPACT_SUCCESS, "success") \ EM( COMPACT_PARTIAL_SKIPPED, "partial_skipped") \ EM( COMPACT_COMPLETE, "complete") \ EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \ diff --git a/mm/compaction.c b/mm/compaction.c index 8e32778fba5b..335eeeed0c91 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1329,13 +1329,13 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ /* Job done if page is free of the right migratetype */ if (!list_empty(&area->free_list[migratetype])) - return COMPACT_PARTIAL; + return COMPACT_SUCCESS; #ifdef CONFIG_CMA /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */ if (migratetype == MIGRATE_MOVABLE && !list_empty(&area->free_list[MIGRATE_CMA])) - return COMPACT_PARTIAL; + return COMPACT_SUCCESS; #endif /* * Job done if allocation would steal freepages from @@ -1343,7 +1343,7 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ */ if (find_suitable_fallback(area, order, migratetype, true, &can_steal) != -1) - return COMPACT_PARTIAL; + return COMPACT_SUCCESS; } return COMPACT_NO_SUITABLE_PAGE; @@ -1367,7 +1367,7 @@ static enum compact_result compact_finished(struct zone *zone, * compaction_suitable: Is this suitable to run compaction on this zone now? * Returns * COMPACT_SKIPPED - If there are too few free pages for compaction - * COMPACT_PARTIAL - If the allocation would succeed without compaction + * COMPACT_SUCCESS - If the allocation would succeed without compaction * COMPACT_CONTINUE - If compaction should run now */ static enum compact_result __compaction_suitable(struct zone *zone, int order, @@ -1388,7 +1388,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, */ if (zone_watermark_ok(zone, order, watermark, classzone_idx, alloc_flags)) - return COMPACT_PARTIAL; + return COMPACT_SUCCESS; /* * Watermarks for order-0 must be met for compaction. Note the 2UL. @@ -1477,7 +1477,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro ret = compaction_suitable(zone, cc->order, cc->alloc_flags, cc->classzone_idx); /* Compaction is likely to fail */ - if (ret == COMPACT_PARTIAL || ret == COMPACT_SKIPPED) + if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) return ret; /* huh, compaction_suitable is returning something unexpected */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 0fe8b7113868..981fc84e7434 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2495,7 +2495,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, continue; switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { - case COMPACT_PARTIAL: + case COMPACT_SUCCESS: case COMPACT_CONTINUE: return false; default: From 7ceb009a22517297ae0e32863eb86ec766782263 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 16:57:44 -0700 Subject: [PATCH 019/127] mm, compaction: don't recheck watermarks after COMPACT_SUCCESS Joonsoo has reminded me that in a later patch changing watermark checks throughout compaction I forgot to update checks in try_to_compact_pages() and compactd_do_work(). 
Closer inspection however shows that they are redundant now in the success case, because compact_zone() now reliably reports this with COMPACT_SUCCESS. So effectively the checks just repeat (a subset) of checks that have just passed. So instead of checking watermarks again, just test the return value. Note it's also possible that compaction would declare failure e.g. because its find_suitable_fallback() is more strict than simple watermark check, and then the watermark check we are removing would then still succeed. After this patch this is not possible and it's arguably better, because for long-term fragmentation avoidance we should rather try a different zone than allocate with the unsuitable fallback. If compaction of all zones fail and the allocation is important enough, it will retry and succeed anyway. Also remove the stray "bool success" variable from kcompactd_do_work(). Link: http://lkml.kernel.org/r/20160810091226.6709-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reported-by: Joonsoo Kim Tested-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Mel Gorman Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 335eeeed0c91..2e1113ff7a03 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1698,9 +1698,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, alloc_flags, ac_classzone_idx(ac)); rc = max(status, rc); - /* If a normal allocation would succeed, stop compacting */ - if (zone_watermark_ok(zone, order, low_wmark_pages(zone), - ac_classzone_idx(ac), alloc_flags)) { + /* The allocation should succeed, stop compacting */ + if (status == COMPACT_SUCCESS) { /* * We think the allocation will succeed in this zone, * but it is not certain, hence the false. The caller @@ -1873,8 +1872,6 @@ static void kcompactd_do_work(pg_data_t *pgdat) .ignore_skip_hint = true, }; - bool success = false; - trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, cc.classzone_idx); count_vm_event(KCOMPACTD_WAKE); @@ -1903,9 +1900,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) return; status = compact_zone(zone, &cc); - if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone), - cc.classzone_idx, 0)) { - success = true; + if (status == COMPACT_SUCCESS) { compaction_defer_reset(zone, cc.order, false); } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) { /* From a8e025e55b35f7eaf6c6c011de1f98d47ddf0843 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 16:57:47 -0700 Subject: [PATCH 020/127] mm, compaction: add the ultimate direct compaction priority During reclaim/compaction loop, it's desirable to get a final answer from unsuccessful compaction so we can either fail the allocation or invoke the OOM killer. However, heuristics such as deferred compaction or pageblock skip bits can cause compaction to skip parts or whole zones and lead to premature OOM's, failures or excessive reclaim/compaction retries. 
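For readers less familiar with these heuristics, a rough sketch of exponential compaction deferral follows; the field and helper names are simplified inventions for this illustration, not the kernel's compaction_deferred() itself, but the back-off idea is the same: after repeated failures, most attempts give up on a zone without scanning anything.

/* Illustrative sketch of exponential deferral: once a zone has failed
 * compaction a few times, most subsequent attempts skip it outright. */
struct zone_defer_sketch {
        unsigned int considered;        /* attempts since the last real try */
        unsigned int defer_shift;       /* back-off exponent, grows on failure */
};

static int defer_compaction_sketch(struct zone_defer_sketch *z)
{
        unsigned long limit = 1UL << z->defer_shift;

        if (++z->considered < limit)
                return 1;       /* defer: the zone is skipped entirely */

        z->considered = 0;
        return 0;               /* enough attempts have passed, try again */
}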
To remedy this, we introduce a new direct compaction priority called COMPACT_PRIO_SYNC_FULL, which instructs direct compaction to: - ignore deferred compaction status for a zone - ignore pageblock skip hints - ignore cached scanner positions and scan the whole zone The new priority should get eventually picked up by should_compact_retry() and this should improve success rates for costly allocations using __GFP_REPEAT, such as hugetlbfs allocations, and reduce some corner-case OOM's for non-costly allocations. Link: http://lkml.kernel.org/r/20160810091226.6709-6-vbabka@suse.cz [vbabka@suse.cz: use the MIN_COMPACT_PRIORITY alias] Link: http://lkml.kernel.org/r/d443b884-87e7-1c93-8684-3a3a35759fb1@suse.cz Signed-off-by: Vlastimil Babka Tested-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 3 ++- mm/compaction.c | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index e88c037afe47..a1fba9994728 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -6,8 +6,9 @@ * Lower value means higher priority, analogically to reclaim priority. */ enum compact_priority { + COMPACT_PRIO_SYNC_FULL, + MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_FULL, COMPACT_PRIO_SYNC_LIGHT, - MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT, DEF_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT, COMPACT_PRIO_ASYNC, INIT_COMPACT_PRIORITY = COMPACT_PRIO_ASYNC diff --git a/mm/compaction.c b/mm/compaction.c index 2e1113ff7a03..21040304f4d2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1644,6 +1644,8 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .alloc_flags = alloc_flags, .classzone_idx = classzone_idx, .direct_compaction = true, + .whole_zone = (prio == MIN_COMPACT_PRIORITY), + .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY) }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1689,7 +1691,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, ac->nodemask) { enum compact_result status; - if (compaction_deferred(zone, order)) { + if (prio > MIN_COMPACT_PRIORITY + && compaction_deferred(zone, order)) { rc = max_t(enum compact_result, COMPACT_DEFERRED, rc); continue; } From f2b8228c5f99a92bc07efd36f8dc840e0705a266 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 16:57:50 -0700 Subject: [PATCH 021/127] mm, compaction: use correct watermark when checking compaction success The __compact_finished() function uses low watermark in a check that has to pass if the direct compaction is to finish and allocation should succeed. This is too pessimistic, as the allocation will typically use min watermark. It may happen that during compaction, we drop below the low watermark (due to parallel activity), but still form the target high-order page. By checking against low watermark, we might needlessly continue compaction. Similarly, __compaction_suitable() uses low watermark in a check whether allocation can succeed without compaction. Again, this is unnecessarily pessimistic. After this patch, these check will use direct compactor's alloc_flags to determine the watermark, which is effectively the min watermark. 
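The watermark selection itself is a one-line change, but the indexing trick deserves a short illustration. The sketch below restates the idea with invented names; the assumption (consistent with the kernel of this era, but stated here as an assumption) is that the low bits of alloc_flags encode a WMARK_* index into the per-zone watermark array.

/* Illustrative only: why "alloc_flags & ALLOC_WMARK_MASK" picks the right
 * watermark. The enum and mask values here are restated assumptions. */
enum wmark_idx { WMARK_MIN_IDX, WMARK_LOW_IDX, WMARK_HIGH_IDX, NR_WMARK_IDX };

#define ALLOC_WMARK_MASK_SKETCH 0x03    /* low bits of alloc_flags */

static unsigned long pick_watermark(const unsigned long wmark[NR_WMARK_IDX],
                                    unsigned int alloc_flags)
{
        /*
         * A direct compactor normally carries the min-watermark flag in
         * alloc_flags, so this resolves to the min watermark rather than
         * the low watermark that the old code hard-coded.
         */
        return wmark[alloc_flags & ALLOC_WMARK_MASK_SKETCH];
}

This is the substitution the two hunks below perform in __compact_finished() and __compaction_suitable().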
Link: http://lkml.kernel.org/r/20160810091226.6709-8-vbabka@suse.cz Signed-off-by: Vlastimil Babka Tested-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 21040304f4d2..e2618ac062a6 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1316,7 +1316,7 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ return COMPACT_CONTINUE; /* Compaction run is not finished if the watermark is not met */ - watermark = low_wmark_pages(zone); + watermark = zone->watermark[cc->alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx, cc->alloc_flags)) @@ -1381,7 +1381,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, if (is_via_compact_memory(order)) return COMPACT_CONTINUE; - watermark = low_wmark_pages(zone); + watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; /* * If watermarks for high-order allocation are already met, there * should be no need for compaction at all. @@ -1395,7 +1395,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * This is because during migration, copies of pages need to be * allocated and for a short time, the footprint is higher */ - watermark += (2UL << order); + watermark = low_wmark_pages(zone) + (2UL << order); if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags, wmark_target)) return COMPACT_SKIPPED; From 9861a62c335cd34a2b6b25aaaf5898e8370299ec Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 16:57:53 -0700 Subject: [PATCH 022/127] mm, compaction: create compact_gap wrapper Compaction uses a watermark gap of (2UL << order) pages at various places and it's not immediately obvious why. Abstract it through a compact_gap() wrapper to create a single place with a thorough explanation. [vbabka@suse.cz: clarify the comment of compact_gap()] Link: http://lkml.kernel.org/r/7b6aed1f-fdf8-2063-9ff4-bbe4de712d37@suse.cz Link: http://lkml.kernel.org/r/20160810091226.6709-9-vbabka@suse.cz Signed-off-by: Vlastimil Babka Tested-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 23 +++++++++++++++++++++++ mm/compaction.c | 7 +++---- mm/vmscan.c | 6 +++--- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index a1fba9994728..585d55cb0dc0 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -58,6 +58,29 @@ enum compact_result { struct alloc_context; /* in mm/internal.h */ +/* + * Number of free order-0 pages that should be available above given watermark + * to make sure compaction has reasonable chance of not running out of free + * pages that it needs to isolate as migration target during its work. + */ +static inline unsigned long compact_gap(unsigned int order) +{ + /* + * Although all the isolations for migration are temporary, compaction + * free scanner may have up to 1 << order pages on its list and then + * try to split an (order - 1) free page. At that point, a gap of + * 1 << order might not be enough, so it's safer to require twice that + * amount. 
Note that the number of pages on the list is also + * effectively limited by COMPACT_CLUSTER_MAX, as that's the maximum + * that the migrate scanner can have isolated on migrate list, and free + * scanner is only invoked when the number of isolated free pages is + * lower than that. But it's not worth to complicate the formula here + * as a bigger gap for higher orders than strictly necessary can also + * improve chances of compaction success. + */ + return 2UL << order; +} + #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, diff --git a/mm/compaction.c b/mm/compaction.c index e2618ac062a6..bbf41ee99142 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1391,11 +1391,10 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, return COMPACT_SUCCESS; /* - * Watermarks for order-0 must be met for compaction. Note the 2UL. - * This is because during migration, copies of pages need to be - * allocated and for a short time, the footprint is higher + * Watermarks for order-0 must be met for compaction to be able to + * isolate free pages for migration targets. */ - watermark = low_wmark_pages(zone) + (2UL << order); + watermark = low_wmark_pages(zone) + compact_gap(order); if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags, wmark_target)) return COMPACT_SKIPPED; diff --git a/mm/vmscan.c b/mm/vmscan.c index 981fc84e7434..2a6978a07d56 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2480,7 +2480,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, * If we have not reclaimed enough pages for compaction and the * inactive lists are large enough, continue reclaiming */ - pages_for_compaction = (2UL << sc->order); + pages_for_compaction = compact_gap(sc->order); inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); if (get_nr_swap_pages() > 0) inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); @@ -2612,7 +2612,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * there is a buffer of free pages available to give compaction * a reasonable chance of completing and allocating the page */ - watermark = high_wmark_pages(zone) + (2UL << sc->order); + watermark = high_wmark_pages(zone) + compact_gap(sc->order); watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); /* @@ -3169,7 +3169,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, * excessive reclaim. Assume that a process requested a high-order * can direct reclaim/compact. */ - if (sc->order && sc->nr_reclaimed >= 2UL << sc->order) + if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order)) sc->order = 0; return sc->nr_scanned >= sc->nr_to_reclaim; From 984fdba6a32e4e9819ebc06ca3acec6582ffd99f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 16:57:57 -0700 Subject: [PATCH 023/127] mm, compaction: use proper alloc_flags in __compaction_suitable() The __compaction_suitable() function checks the low watermark plus a compact_gap() gap to decide if there's enough free memory to perform compaction. This check uses direct compactor's alloc_flags, but that's wrong, since these flags are not applicable for freepage isolation. For example, alloc_flags may indicate access to memory reserves, making compaction proceed, and then fail watermark check during the isolation. A similar problem exists for ALLOC_CMA, which may be part of alloc_flags, but not during freepage isolation. 
In this case however it makes sense to use ALLOC_CMA both in __compaction_suitable() and __isolate_free_page(), since there's actually nothing preventing the freepage scanner to isolate from CMA pageblocks, with the assumption that a page that could be migrated once by compaction can be migrated also later by CMA allocation. Thus we should count pages in CMA pageblocks when considering compaction suitability and when isolating freepages. To sum up, this patch should remove some false positives from __compaction_suitable(), and allow compaction to proceed when free pages required for compaction reside in the CMA pageblocks. Link: http://lkml.kernel.org/r/20160810091226.6709-10-vbabka@suse.cz Signed-off-by: Vlastimil Babka Tested-by: Lorenzo Stoakes Cc: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 12 ++++++++++-- mm/page_alloc.c | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index bbf41ee99142..658c009d60cc 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1392,11 +1392,19 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, /* * Watermarks for order-0 must be met for compaction to be able to - * isolate free pages for migration targets. + * isolate free pages for migration targets. This means that the + * watermark and alloc_flags have to match, or be more pessimistic than + * the check in __isolate_free_page(). We don't use the direct + * compactor's alloc_flags, as they are not relevant for freepage + * isolation. We however do use the direct compactor's classzone_idx to + * skip over zones where lowmem reserves would prevent allocation even + * if compaction succeeds. + * ALLOC_CMA is used, as pages in CMA pageblocks are considered + * suitable migration targets */ watermark = low_wmark_pages(zone) + compact_gap(order); if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, - alloc_flags, wmark_target)) + ALLOC_CMA, wmark_target)) return COMPACT_SKIPPED; /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a2214c64ed3c..637b0e907df0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2491,7 +2491,7 @@ int __isolate_free_page(struct page *page, unsigned int order) if (!is_migrate_isolate(mt)) { /* Obey watermarks as if the page was being allocated */ watermark = low_wmark_pages(zone) + (1 << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) return 0; __mod_zone_freepage_state(zone, -(1UL << order), mt); From 8348faf91f56371d4bada6fc5915e19580a15ffe Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 16:58:00 -0700 Subject: [PATCH 024/127] mm, compaction: require only min watermarks for non-costly orders The __compaction_suitable() function checks the low watermark plus a compact_gap() gap to decide if there's enough free memory to perform compaction. Then __isolate_free_page uses low watermark check to decide if particular free page can be isolated. In the latter case, using low watermark is needlessly pessimistic, as the free page isolations are only temporary. For __compaction_suitable() the higher watermark makes sense for high-order allocations where more freepages increase the chance of success, and we can typically fail with some order-0 fallback when the system is struggling to reach that watermark. But for low-order allocation, forming the page should not be that hard. 
So using low watermark here might just prevent compaction from even trying, and eventually lead to OOM killer even if we are above min watermarks. So after this patch, we use min watermark for non-costly orders in __compaction_suitable(), and for all orders in __isolate_free_page(). [vbabka@suse.cz: clarify __isolate_free_page() comment] Link: http://lkml.kernel.org/r/7ae4baec-4eca-e70b-2a69-94bea4fb19fa@suse.cz Link: http://lkml.kernel.org/r/20160810091226.6709-11-vbabka@suse.cz Signed-off-by: Vlastimil Babka Tested-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Vlastimil Babka Tested-by: Lorenzo Stoakes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 6 +++++- mm/page_alloc.c | 9 +++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 658c009d60cc..29f6c49dc9c2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1399,10 +1399,14 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * isolation. We however do use the direct compactor's classzone_idx to * skip over zones where lowmem reserves would prevent allocation even * if compaction succeeds. + * For costly orders, we require low watermark instead of min for + * compaction to proceed to increase its chances. * ALLOC_CMA is used, as pages in CMA pageblocks are considered * suitable migration targets */ - watermark = low_wmark_pages(zone) + compact_gap(order); + watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? + low_wmark_pages(zone) : min_wmark_pages(zone); + watermark += compact_gap(order); if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, ALLOC_CMA, wmark_target)) return COMPACT_SKIPPED; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 637b0e907df0..c988d324e3f6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2489,8 +2489,13 @@ int __isolate_free_page(struct page *page, unsigned int order) mt = get_pageblock_migratetype(page); if (!is_migrate_isolate(mt)) { - /* Obey watermarks as if the page was being allocated */ - watermark = low_wmark_pages(zone) + (1 << order); + /* + * Obey watermarks as if the page was being allocated. We can + * emulate a high-order watermark check with a raised order-0 + * watermark, because we already know our high-order page + * exists. + */ + watermark = min_wmark_pages(zone) + (1UL << order); if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) return 0; From fdd4c6149a71ff1da98317adb6f18c28f75a6e3f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 16:58:03 -0700 Subject: [PATCH 025/127] mm, vmscan: make compaction_ready() more accurate and readable The compaction_ready() is used during direct reclaim for costly order allocations to skip reclaim for zones where compaction should be attempted instead. It's combining the standard compaction_suitable() check with its own watermark check based on high watermark with extra gap, and the result is confusing at best. This patch attempts to better structure and document the checks involved. First, compaction_suitable() can determine that the allocation should either succeed already, or that compaction doesn't have enough free pages to proceed. The third possibility is that compaction has enough free pages, but we still decide to reclaim first - unless we are already above the high watermark with gap. 
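Condensed into a sketch (an illustrative helper with local stand-in names, not the patch itself; the real rewrite is the mm/vmscan.c hunk below), the resulting decision is roughly:

/* Illustrative condensation of the three-way decision described above. */
enum suitable_sketch { SUITABLE_SUCCESS, SUITABLE_SKIPPED, SUITABLE_CONTINUE };

static int compaction_ready_sketch(enum suitable_sketch suitable,
                                   unsigned long free_pages,
                                   unsigned long high_wmark, unsigned long gap)
{
        if (suitable == SUITABLE_SUCCESS)       /* COMPACT_SUCCESS */
                return 1;       /* allocation should already succeed: skip reclaim */
        if (suitable == SUITABLE_SKIPPED)       /* COMPACT_SKIPPED */
                return 0;       /* compaction lacks free pages: reclaim first */
        /* compaction could run, but keep reclaiming until free pages are
         * above the high watermark plus compact_gap() */
        return free_pages >= high_wmark + gap;
}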
This does not mean that the reclaim will actually reach this watermark during single attempt, this is rather an over-reclaim protection. So document the code as such. The check for compaction_deferred() is removed completely, as it in fact had no proper role here. The result after this patch is mainly a less confusing code. We also skip some over-reclaim in cases where the allocation should already succed. Link: http://lkml.kernel.org/r/20160810091226.6709-12-vbabka@suse.cz Signed-off-by: Vlastimil Babka Tested-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 2a6978a07d56..f406e6fbaaa5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2598,38 +2598,35 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) } /* - * Returns true if compaction should go ahead for a high-order request, or - * the high-order allocation would succeed without compaction. + * Returns true if compaction should go ahead for a costly-order request, or + * the allocation would already succeed without compaction. Return false if we + * should reclaim first. */ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { unsigned long watermark; - bool watermark_ok; + enum compact_result suitable; - /* - * Compaction takes time to run and there are potentially other - * callers using the pages just freed. Continue reclaiming until - * there is a buffer of free pages available to give compaction - * a reasonable chance of completing and allocating the page - */ - watermark = high_wmark_pages(zone) + compact_gap(sc->order); - watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); - - /* - * If compaction is deferred, reclaim up to a point where - * compaction will have a chance of success when re-enabled - */ - if (compaction_deferred(zone, sc->order)) - return watermark_ok; - - /* - * If compaction is not ready to start and allocation is not likely - * to succeed without it, then keep reclaiming. - */ - if (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx) == COMPACT_SKIPPED) + suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx); + if (suitable == COMPACT_SUCCESS) + /* Allocation should succeed already. Don't reclaim. */ + return true; + if (suitable == COMPACT_SKIPPED) + /* Compaction cannot yet proceed. Do reclaim. */ return false; - return watermark_ok; + /* + * Compaction is already possible, but it takes time to run and there + * are potentially other callers using the pages just freed. So proceed + * with reclaim to make a buffer of free pages available to give + * compaction a reasonable chance of completing and allocating the page. + * Note that we won't actually reclaim the whole buffer in one attempt + * as the target watermark in should_continue_reclaim() is lower. But if + * we are already above the high+gap watermark, don't reclaim at all. 
+ */ + watermark = high_wmark_pages(zone) + compact_gap(sc->order); + + return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); } /* From e506b99696a296e9aba2e5f3bc5768aa7d8e2396 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Fri, 7 Oct 2016 16:58:06 -0700 Subject: [PATCH 026/127] mem-hotplug: fix node spanned pages when we have a movable node Commit 342332e6a925 ("mm/page_alloc.c: introduce kernelcore=mirror option") rewrote the calculation of node spanned pages. But when we have a movable node, the size of node spanned pages is double added. That's because we have an empty normal zone, the present pages is zero, but its spanned pages is not zero. e.g. Zone ranges: DMA [mem 0x0000000000001000-0x0000000000ffffff] DMA32 [mem 0x0000000001000000-0x00000000ffffffff] Normal [mem 0x0000000100000000-0x0000007c7fffffff] Movable zone start for each node Node 1: 0x0000001080000000 Node 2: 0x0000002080000000 Node 3: 0x0000003080000000 Node 4: 0x0000003c80000000 Node 5: 0x0000004c80000000 Node 6: 0x0000005c80000000 Early memory node ranges node 0: [mem 0x0000000000001000-0x000000000009ffff] node 0: [mem 0x0000000000100000-0x000000007552afff] node 0: [mem 0x000000007bd46000-0x000000007bd46fff] node 0: [mem 0x000000007bdcd000-0x000000007bffffff] node 0: [mem 0x0000000100000000-0x000000107fffffff] node 1: [mem 0x0000001080000000-0x000000207fffffff] node 2: [mem 0x0000002080000000-0x000000307fffffff] node 3: [mem 0x0000003080000000-0x0000003c7fffffff] node 4: [mem 0x0000003c80000000-0x0000004c7fffffff] node 5: [mem 0x0000004c80000000-0x0000005c7fffffff] node 6: [mem 0x0000005c80000000-0x0000006c7fffffff] node 7: [mem 0x0000006c80000000-0x0000007c7fffffff] node1: Normal, start=0x1080000, present=0x0, spanned=0x1000000 Movable, start=0x1080000, present=0x1000000, spanned=0x1000000 pgdat, start=0x1080000, present=0x1000000, spanned=0x2000000 After this patch, the problem is fixed. node1: Normal, start=0x0, present=0x0, spanned=0x0 Movable, start=0x1080000, present=0x1000000, spanned=0x1000000 pgdat, start=0x1080000, present=0x1000000, spanned=0x1000000 Link: http://lkml.kernel.org/r/57A325E8.6070100@huawei.com Signed-off-by: Xishi Qiu Cc: Taku Izumi Cc: Vlastimil Babka Cc: Mel Gorman Cc: Michal Hocko Cc: David Rientjes Cc: Joonsoo Kim Cc: "Kirill A . Shutemov" Cc: Kamezawa Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 48 ++++++++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c988d324e3f6..26246fdf45b5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5004,15 +5004,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, break; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - /* - * If not mirrored_kernelcore and ZONE_MOVABLE exists, range - * from zone_movable_pfn[nid] to end of each node should be - * ZONE_MOVABLE not ZONE_NORMAL. skip it. - */ - if (!mirrored_kernelcore && zone_movable_pfn[nid]) - if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid]) - continue; - /* * Check given memblock attribute by firmware which can affect * kernel memory layout. 
If zone==ZONE_MOVABLE but memory is @@ -5456,6 +5447,12 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, *zone_end_pfn = min(node_end_pfn, arch_zone_highest_possible_pfn[movable_zone]); + /* Adjust for ZONE_MOVABLE starting within this range */ + } else if (!mirrored_kernelcore && + *zone_start_pfn < zone_movable_pfn[nid] && + *zone_end_pfn > zone_movable_pfn[nid]) { + *zone_end_pfn = zone_movable_pfn[nid]; + /* Check if this whole range is within ZONE_MOVABLE */ } else if (*zone_start_pfn >= zone_movable_pfn[nid]) *zone_start_pfn = *zone_end_pfn; @@ -5559,28 +5556,23 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages * and vice versa. */ - if (zone_movable_pfn[nid]) { - if (mirrored_kernelcore) { - unsigned long start_pfn, end_pfn; - struct memblock_region *r; + if (mirrored_kernelcore && zone_movable_pfn[nid]) { + unsigned long start_pfn, end_pfn; + struct memblock_region *r; - for_each_memblock(memory, r) { - start_pfn = clamp(memblock_region_memory_base_pfn(r), - zone_start_pfn, zone_end_pfn); - end_pfn = clamp(memblock_region_memory_end_pfn(r), - zone_start_pfn, zone_end_pfn); + for_each_memblock(memory, r) { + start_pfn = clamp(memblock_region_memory_base_pfn(r), + zone_start_pfn, zone_end_pfn); + end_pfn = clamp(memblock_region_memory_end_pfn(r), + zone_start_pfn, zone_end_pfn); - if (zone_type == ZONE_MOVABLE && - memblock_is_mirror(r)) - nr_absent += end_pfn - start_pfn; + if (zone_type == ZONE_MOVABLE && + memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; - if (zone_type == ZONE_NORMAL && - !memblock_is_mirror(r)) - nr_absent += end_pfn - start_pfn; - } - } else { - if (zone_type == ZONE_NORMAL) - nr_absent += node_end_pfn - zone_movable_pfn[nid]; + if (zone_type == ZONE_NORMAL && + !memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; } } From e780149bcd4be171421535db0514fa9ff556cb87 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Fri, 7 Oct 2016 16:58:09 -0700 Subject: [PATCH 027/127] mm: fix set pageblock migratetype in deferred struct page init On x86_64 MAX_ORDER_NR_PAGES is usually 4M, and a pageblock is usually 2M, so we only set one pageblock's migratetype in deferred_free_range() if pfn is aligned to MAX_ORDER_NR_PAGES. That means it causes uninitialized migratetype blocks, you can see from "cat /proc/pagetypeinfo", almost half blocks are Unmovable. Also we missed freeing the last block in deferred_init_memmap(), it causes memory leak. Fixes: ac5d2539b238 ("mm: meminit: reduce number of times pageblocks are set during struct page init") Link: http://lkml.kernel.org/r/57A3260F.4050709@huawei.com Signed-off-by: Xishi Qiu Cc: Taku Izumi Cc: Vlastimil Babka Cc: Mel Gorman Cc: Michal Hocko Cc: David Rientjes Cc: Joonsoo Kim Cc: "Kirill A . 
Shutemov" Cc: Kamezawa Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 26246fdf45b5..0c34633720c0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1393,15 +1393,18 @@ static void __init deferred_free_range(struct page *page, return; /* Free a large naturally-aligned chunk if possible */ - if (nr_pages == MAX_ORDER_NR_PAGES && - (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) { + if (nr_pages == pageblock_nr_pages && + (pfn & (pageblock_nr_pages - 1)) == 0) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); - __free_pages_boot_core(page, MAX_ORDER-1); + __free_pages_boot_core(page, pageblock_order); return; } - for (i = 0; i < nr_pages; i++, page++) + for (i = 0; i < nr_pages; i++, page++, pfn++) { + if ((pfn & (pageblock_nr_pages - 1)) == 0) + set_pageblock_migratetype(page, MIGRATE_MOVABLE); __free_pages_boot_core(page, 0); + } } /* Completion tracking for deferred_init_memmap() threads */ @@ -1469,9 +1472,9 @@ static int __init deferred_init_memmap(void *data) /* * Ensure pfn_valid is checked every - * MAX_ORDER_NR_PAGES for memory holes + * pageblock_nr_pages for memory holes */ - if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { + if ((pfn & (pageblock_nr_pages - 1)) == 0) { if (!pfn_valid(pfn)) { page = NULL; goto free_range; @@ -1484,7 +1487,7 @@ static int __init deferred_init_memmap(void *data) } /* Minimise pfn page lookups and scheduler checks */ - if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) { + if (page && (pfn & (pageblock_nr_pages - 1)) != 0) { page++; } else { nr_pages += nr_to_free; @@ -1520,6 +1523,9 @@ free_range: free_base_page = NULL; free_base_pfn = nr_to_free = 0; } + /* Free the last block of pages to allocator */ + nr_pages += nr_to_free; + deferred_free_range(free_base_page, free_base_pfn, nr_to_free); first_init_pfn = max(end_pfn, first_init_pfn); } From bf48438354a79df50fadd2e1c0b81baa2619a8b6 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:58:12 -0700 Subject: [PATCH 028/127] mm, vmscan: get rid of throttle_vm_writeout throttle_vm_writeout() was introduced back in 2005 to fix OOMs caused by excessive pageout activity during the reclaim. Too many pages could be put under writeback therefore LRUs would be full of unreclaimable pages until the IO completes and in turn the OOM killer could be invoked. There have been some important changes introduced since then in the reclaim path though. Writers are throttled by balance_dirty_pages when initiating the buffered IO and later during the memory pressure, the direct reclaim is throttled by wait_iff_congested if the node is considered congested by dirty pages on LRUs and the underlying bdi is congested by the queued IO. The kswapd is throttled as well if it encounters pages marked for immediate reclaim or under writeback which signals that that there are too many pages under writeback already. Finally should_reclaim_retry does congestion_wait if the reclaim cannot make any progress and there are too many dirty/writeback pages. Another important aspect is that we do not issue any IO from the direct reclaim context anymore. In a heavy parallel load this could queue a lot of IO which would be very scattered and thus unefficient which would just make the problem worse. 
This three mechanisms should throttle and keep the amount of IO in a steady state even under heavy IO and memory pressure so yet another throttling point doesn't really seem helpful. Quite contrary, Mikulas Patocka has reported that swap backed by dm-crypt doesn't work properly because the swapout IO cannot make sufficient progress as the writeout path depends on dm_crypt worker which has to allocate memory to perform the encryption. In order to guarantee a forward progress it relies on the mempool allocator. mempool_alloc(), however, prefers to use the underlying (usually page) allocator before it grabs objects from the pool. Such an allocation can dive into the memory reclaim and consequently to throttle_vm_writeout. If there are too many dirty or pages under writeback it will get throttled even though it is in fact a flusher to clear pending pages. kworker/u4:0 D ffff88003df7f438 10488 6 2 0x00000000 Workqueue: kcryptd kcryptd_crypt [dm_crypt] Call Trace: schedule+0x3c/0x90 schedule_timeout+0x1d8/0x360 io_schedule_timeout+0xa4/0x110 congestion_wait+0x86/0x1f0 throttle_vm_writeout+0x44/0xd0 shrink_zone_memcg+0x613/0x720 shrink_zone+0xe0/0x300 do_try_to_free_pages+0x1ad/0x450 try_to_free_pages+0xef/0x300 __alloc_pages_nodemask+0x879/0x1210 alloc_pages_current+0xa1/0x1f0 new_slab+0x2d7/0x6a0 ___slab_alloc+0x3fb/0x5c0 __slab_alloc+0x51/0x90 kmem_cache_alloc+0x27b/0x310 mempool_alloc_slab+0x1d/0x30 mempool_alloc+0x91/0x230 bio_alloc_bioset+0xbd/0x260 kcryptd_crypt+0x114/0x3b0 [dm_crypt] Let's just drop throttle_vm_writeout altogether. It is not very much helpful anymore. I have tried to test a potential writeback IO runaway similar to the one described in the original patch which has introduced that [1]. Small virtual machine (512MB RAM, 4 CPUs, 2G of swap space and disk image on a rather slow NFS in a sync mode on the host) with 8 parallel writers each writing 1G worth of data. As soon as the pagecache fills up and the direct reclaim hits then I start anon memory consumer in a loop (allocating 300M and exiting after populating it) in the background to make the memory pressure even stronger as well as to disrupt the steady state for the IO. The direct reclaim is throttled because of the congestion as well as kswapd hitting congestion_wait due to nr_immediate but throttle_vm_writeout doesn't ever trigger the sleep throughout the test. Dirty+writeback are close to nr_dirty_threshold with some fluctuations caused by the anon consumer. 
[1] https://www2.kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.9-rc1/2.6.9-rc1-mm3/broken-out/vm-pageout-throttling.patch Link: http://lkml.kernel.org/r/1471171473-21418-1-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Mikulas Patocka Cc: Marcelo Tosatti Cc: NeilBrown Cc: Ondrej Kozina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/writeback.h | 1 - mm/page-writeback.c | 30 ------------------------------ mm/vmscan.c | 2 -- 3 files changed, 33 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index fc1e16c25a29..797100e10010 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -319,7 +319,6 @@ void laptop_mode_timer_fn(unsigned long data); #else static inline void laptop_sync_completion(void) { } #endif -void throttle_vm_writeout(gfp_t gfp_mask); bool node_dirty_ok(struct pglist_data *pgdat); int wb_domain_init(struct wb_domain *dom, gfp_t gfp); #ifdef CONFIG_CGROUP_WRITEBACK diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 28d6f36a2d79..5ed3381818ec 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1965,36 +1965,6 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) return false; } -void throttle_vm_writeout(gfp_t gfp_mask) -{ - unsigned long background_thresh; - unsigned long dirty_thresh; - - for ( ; ; ) { - global_dirty_limits(&background_thresh, &dirty_thresh); - dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh); - - /* - * Boost the allowable dirty threshold a bit for page - * allocators so they don't get DoS'ed by heavy writers - */ - dirty_thresh += dirty_thresh / 10; /* wheeee... */ - - if (global_node_page_state(NR_UNSTABLE_NFS) + - global_node_page_state(NR_WRITEBACK) <= dirty_thresh) - break; - congestion_wait(BLK_RW_ASYNC, HZ/10); - - /* - * The caller might hold locks which can prevent IO completion - * or progress in the filesystem. So we cannot just sit here - * waiting for IO to complete. - */ - if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) - break; - } -} - /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ diff --git a/mm/vmscan.c b/mm/vmscan.c index f406e6fbaaa5..d3715c1b2e36 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2418,8 +2418,6 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc if (inactive_list_is_low(lruvec, false, sc)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); - - throttle_vm_writeout(sc->gfp_mask); } /* Use reclaim/compaction for costly allocs or under memory pressure */ From acbc15a4b397f86d39416df143e30982b1da528b Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 7 Oct 2016 16:58:15 -0700 Subject: [PATCH 029/127] mm/debug_pagealloc.c: clean-up guard page handling code Patch series "Reduce memory waste by page extension user". This patchset tries to reduce memory waste by page extension user. First case is architecture supported debug_pagealloc. It doesn't requires additional memory if guard page isn't used. 8 bytes per page will be saved in this case. Second case is related to page owner feature. Until now, if page_ext users want to use it's own fields on page_ext, fields should be defined in struct page_ext by hard-coding. It has a following problem. struct page_ext { #ifdef CONFIG_A int a; #endif #ifdef CONFIG_B int b; #endif }; Assume that kernel is built with both CONFIG_A and CONFIG_B. 
Even if we enable feature A and doesn't enable feature B at runtime, each entry of struct page_ext takes two int rather than one int. It's undesirable waste so this patch tries to reduce it. By this patchset, we can save 20 bytes per page dedicated for page owner feature in some configurations. This patch (of 6): We can make code clean by moving decision condition for set_page_guard() into set_page_guard() itself. It will help code readability. There is no functional change. Link: http://lkml.kernel.org/r/1471315879-32294-2-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Michal Hocko Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0c34633720c0..e150ba9e1311 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -637,17 +637,20 @@ static int __init debug_guardpage_minorder_setup(char *buf) } __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); -static inline void set_page_guard(struct zone *zone, struct page *page, +static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) { struct page_ext *page_ext; if (!debug_guardpage_enabled()) - return; + return false; + + if (order >= debug_guardpage_minorder()) + return false; page_ext = lookup_page_ext(page); if (unlikely(!page_ext)) - return; + return false; __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); @@ -655,6 +658,8 @@ static inline void set_page_guard(struct zone *zone, struct page *page, set_page_private(page, order); /* Guard pages are not available for any usage */ __mod_zone_freepage_state(zone, -(1 << order), migratetype); + + return true; } static inline void clear_page_guard(struct zone *zone, struct page *page, @@ -677,8 +682,8 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, } #else struct page_ext_operations debug_guardpage_ops = { NULL, }; -static inline void set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) {} +static inline bool set_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) {} #endif @@ -1622,18 +1627,15 @@ static inline void expand(struct zone *zone, struct page *page, size >>= 1; VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); - if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && - debug_guardpage_enabled() && - high < debug_guardpage_minorder()) { - /* - * Mark as guard pages (or page), that will allow to - * merge back to allocator when buddy will be freed. - * Corresponding page table entries will not be touched, - * pages will stay not present in virtual address space - */ - set_page_guard(zone, &page[size], high, migratetype); + /* + * Mark as guard pages (or page), that will allow to + * merge back to allocator when buddy will be freed. 
+ * Corresponding page table entries will not be touched, + * pages will stay not present in virtual address space + */ + if (set_page_guard(zone, &page[size], high, migratetype)) continue; - } + list_add(&page[size].lru, &area->free_list[migratetype]); area->nr_free++; set_page_order(&page[size], high); From f1c1e9f7b5b3ddce6b4f1986939ec87b27515086 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 7 Oct 2016 16:58:18 -0700 Subject: [PATCH 030/127] mm/debug_pagealloc.c: don't allocate page_ext if we don't use guard page What debug_pagealloc does is just mapping/unmapping page table. Basically, it doesn't need additional memory space to memorize something. But, with guard page feature, it requires additional memory to distinguish if the page is for guard or not. Guard page is only used when debug_guardpage_minorder is non-zero so this patch removes additional memory allocation (page_ext) if debug_guardpage_minorder is zero. It saves memory if we just use debug_pagealloc and not guard page. Link: http://lkml.kernel.org/r/1471315879-32294-3-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Reviewed-by: Sergey Senozhatsky Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e150ba9e1311..06ea805d1b14 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -607,6 +607,9 @@ static bool need_debug_guardpage(void) if (!debug_pagealloc_enabled()) return false; + if (!debug_guardpage_minorder()) + return false; + return true; } @@ -615,6 +618,9 @@ static void init_debug_guardpage(void) if (!debug_pagealloc_enabled()) return; + if (!debug_guardpage_minorder()) + return; + _debug_guardpage_enabled = true; } @@ -635,7 +641,7 @@ static int __init debug_guardpage_minorder_setup(char *buf) pr_info("Setting debug_guardpage_minorder to %lu\n", res); return 0; } -__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); +early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) From e2f612e673f61931b2fe62722832cf5fcf6b3313 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 7 Oct 2016 16:58:21 -0700 Subject: [PATCH 031/127] mm/page_owner: move page_owner specific function to page_owner.c There is no reason that page_owner specific function resides on vmstat.c. 
Link: http://lkml.kernel.org/r/1471315879-32294-4-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Reviewed-by: Sergey Senozhatsky Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_owner.h | 2 + mm/page_owner.c | 77 +++++++++++++++++++++++++++++++++++++ mm/vmstat.c | 79 -------------------------------------- 3 files changed, 79 insertions(+), 79 deletions(-) diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 30583ab0ffb1..2be728d156b5 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -14,6 +14,8 @@ extern void __split_page_owner(struct page *page, unsigned int order); extern void __copy_page_owner(struct page *oldpage, struct page *newpage); extern void __set_page_owner_migrate_reason(struct page *page, int reason); extern void __dump_page_owner(struct page *page); +extern void pagetypeinfo_showmixedcount_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone); static inline void reset_page_owner(struct page *page, unsigned int order) { diff --git a/mm/page_owner.c b/mm/page_owner.c index ec6dc1886f71..0f4246d109a0 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "internal.h" @@ -214,6 +215,82 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) __set_bit(PAGE_EXT_OWNER, &new_ext->flags); } +void pagetypeinfo_showmixedcount_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone) +{ + struct page *page; + struct page_ext *page_ext; + unsigned long pfn = zone->zone_start_pfn, block_end_pfn; + unsigned long end_pfn = pfn + zone->spanned_pages; + unsigned long count[MIGRATE_TYPES] = { 0, }; + int pageblock_mt, page_mt; + int i; + + /* Scan block by block. First and last block may be incomplete */ + pfn = zone->zone_start_pfn; + + /* + * Walk the zone in pageblock_nr_pages steps. If a page block spans + * a zone boundary, it will be double counted between zones. 
This does + * not matter as the mixed block count will still be correct + */ + for (; pfn < end_pfn; ) { + if (!pfn_valid(pfn)) { + pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + continue; + } + + block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = min(block_end_pfn, end_pfn); + + page = pfn_to_page(pfn); + pageblock_mt = get_pageblock_migratetype(page); + + for (; pfn < block_end_pfn; pfn++) { + if (!pfn_valid_within(pfn)) + continue; + + page = pfn_to_page(pfn); + + if (page_zone(page) != zone) + continue; + + if (PageBuddy(page)) { + pfn += (1UL << page_order(page)) - 1; + continue; + } + + if (PageReserved(page)) + continue; + + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + continue; + + if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) + continue; + + page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); + if (pageblock_mt != page_mt) { + if (is_migrate_cma(pageblock_mt)) + count[MIGRATE_MOVABLE]++; + else + count[pageblock_mt]++; + + pfn = block_end_pfn; + break; + } + pfn += (1UL << page_ext->order) - 1; + } + } + + /* Print counts */ + seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); + for (i = 0; i < MIGRATE_TYPES; i++) + seq_printf(m, "%12lu ", count[i]); + seq_putc(m, '\n'); +} + static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, struct page *page, struct page_ext *page_ext, diff --git a/mm/vmstat.c b/mm/vmstat.c index 89cec42d19ff..dc04e76c7950 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1254,85 +1254,6 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) return 0; } -#ifdef CONFIG_PAGE_OWNER -static void pagetypeinfo_showmixedcount_print(struct seq_file *m, - pg_data_t *pgdat, - struct zone *zone) -{ - struct page *page; - struct page_ext *page_ext; - unsigned long pfn = zone->zone_start_pfn, block_end_pfn; - unsigned long end_pfn = pfn + zone->spanned_pages; - unsigned long count[MIGRATE_TYPES] = { 0, }; - int pageblock_mt, page_mt; - int i; - - /* Scan block by block. First and last block may be incomplete */ - pfn = zone->zone_start_pfn; - - /* - * Walk the zone in pageblock_nr_pages steps. If a page block spans - * a zone boundary, it will be double counted between zones. 
This does - * not matter as the mixed block count will still be correct - */ - for (; pfn < end_pfn; ) { - if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); - continue; - } - - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); - block_end_pfn = min(block_end_pfn, end_pfn); - - page = pfn_to_page(pfn); - pageblock_mt = get_pageblock_migratetype(page); - - for (; pfn < block_end_pfn; pfn++) { - if (!pfn_valid_within(pfn)) - continue; - - page = pfn_to_page(pfn); - - if (page_zone(page) != zone) - continue; - - if (PageBuddy(page)) { - pfn += (1UL << page_order(page)) - 1; - continue; - } - - if (PageReserved(page)) - continue; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - continue; - - if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) - continue; - - page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); - if (pageblock_mt != page_mt) { - if (is_migrate_cma(pageblock_mt)) - count[MIGRATE_MOVABLE]++; - else - count[pageblock_mt]++; - - pfn = block_end_pfn; - break; - } - pfn += (1UL << page_ext->order) - 1; - } - } - - /* Print counts */ - seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (i = 0; i < MIGRATE_TYPES; i++) - seq_printf(m, "%12lu ", count[i]); - seq_putc(m, '\n'); -} -#endif /* CONFIG_PAGE_OWNER */ - /* * Print out the number of pageblocks for each migratetype that contain pages * of other types. This gives an indication of how well fallbacks are being From 0b06bb3f6075803a92a0075ba4eb44888dd8a68a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 7 Oct 2016 16:58:24 -0700 Subject: [PATCH 032/127] mm/page_ext: rename offset to index Here, 'offset' means entry index in page_ext array. Following patch will use 'offset' for field offset in each entry so rename current 'offset' to prevent confusion. Link: http://lkml.kernel.org/r/1471315879-32294-5-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Michal Hocko Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_ext.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_ext.c b/mm/page_ext.c index 44a4c029c8e7..16292829c5c5 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -102,7 +102,7 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) struct page_ext *lookup_page_ext(struct page *page) { unsigned long pfn = page_to_pfn(page); - unsigned long offset; + unsigned long index; struct page_ext *base; base = NODE_DATA(page_to_nid(page))->node_page_ext; @@ -119,9 +119,9 @@ struct page_ext *lookup_page_ext(struct page *page) if (unlikely(!base)) return NULL; #endif - offset = pfn - round_down(node_start_pfn(page_to_nid(page)), + index = pfn - round_down(node_start_pfn(page_to_nid(page)), MAX_ORDER_NR_PAGES); - return base + offset; + return base + index; } static int __init alloc_node_page_ext(int nid) From 980ac1672e7edaa927557a5186f1967cd45afcf5 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 7 Oct 2016 16:58:27 -0700 Subject: [PATCH 033/127] mm/page_ext: support extra space allocation by page_ext user Until now, if some page_ext users want to use it's own field on page_ext, it should be defined in struct page_ext by hard-coding. It has a problem that wastes memory in following situation. struct page_ext { #ifdef CONFIG_A int a; #endif #ifdef CONFIG_B int b; #endif }; Assume that kernel is built with both CONFIG_A and CONFIG_B. 
Even if we enable feature A but do not enable feature B at runtime, each entry of struct page_ext still takes two ints rather than one. This is an undesirable result, so this patch fixes it. To solve the above problem, this patch adds support for extra space allocation at runtime. When the need() callback returns true, its extra memory requirement is added to the entry size of page_ext. The offset of each user's extra memory space is also returned. With this offset, a user can access its extra space without hard-coding its fields in struct page_ext. This patch only implements the infrastructure; a following patch will use it for page_owner, which is the only user with its own fields on page_ext. Link: http://lkml.kernel.org/r/1471315879-32294-6-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Michal Hocko Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_ext.h | 2 ++ mm/page_alloc.c | 2 +- mm/page_ext.c | 41 +++++++++++++++++++++++++++++++++++---------- 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index 03f2a3e7d76d..179bdc4a470c 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -7,6 +7,8 @@ struct pglist_data; struct page_ext_operations { + size_t offset; + size_t size; bool (*need)(void); void (*init)(void); }; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 06ea805d1b14..b0f133f2c655 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -687,7 +687,7 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, __mod_zone_freepage_state(zone, (1 << order), migratetype); } #else -struct page_ext_operations debug_guardpage_ops = { NULL, }; +struct page_ext_operations debug_guardpage_ops; static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, diff --git a/mm/page_ext.c b/mm/page_ext.c index 16292829c5c5..121dcffc4ec1 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -42,6 +42,11 @@ * and page extension core can skip to allocate memory. As result, * none of memory is wasted. * + * When need callback returns true, page_ext checks if there is a request for + * extra memory through size in struct page_ext_operations. If it is non-zero, + * extra space is allocated for each page_ext entry and offset is returned to + * user through offset in struct page_ext_operations. + * * The init callback is used to do proper initialization after page extension * is completely initialized. In sparse memory system, extra memory is * allocated some time later than memmap is allocated.
In other words, lifetime @@ -66,18 +71,24 @@ static struct page_ext_operations *page_ext_ops[] = { }; static unsigned long total_usage; +static unsigned long extra_mem; static bool __init invoke_need_callbacks(void) { int i; int entries = ARRAY_SIZE(page_ext_ops); + bool need = false; for (i = 0; i < entries; i++) { - if (page_ext_ops[i]->need && page_ext_ops[i]->need()) - return true; + if (page_ext_ops[i]->need && page_ext_ops[i]->need()) { + page_ext_ops[i]->offset = sizeof(struct page_ext) + + extra_mem; + extra_mem += page_ext_ops[i]->size; + need = true; + } } - return false; + return need; } static void __init invoke_init_callbacks(void) @@ -91,6 +102,16 @@ static void __init invoke_init_callbacks(void) } } +static unsigned long get_entry_size(void) +{ + return sizeof(struct page_ext) + extra_mem; +} + +static inline struct page_ext *get_entry(void *base, unsigned long index) +{ + return base + get_entry_size() * index; +} + #if !defined(CONFIG_SPARSEMEM) @@ -121,7 +142,7 @@ struct page_ext *lookup_page_ext(struct page *page) #endif index = pfn - round_down(node_start_pfn(page_to_nid(page)), MAX_ORDER_NR_PAGES); - return base + index; + return get_entry(base, index); } static int __init alloc_node_page_ext(int nid) @@ -143,7 +164,7 @@ static int __init alloc_node_page_ext(int nid) !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES)) nr_pages += MAX_ORDER_NR_PAGES; - table_size = sizeof(struct page_ext) * nr_pages; + table_size = get_entry_size() * nr_pages; base = memblock_virt_alloc_try_nid_nopanic( table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), @@ -196,7 +217,7 @@ struct page_ext *lookup_page_ext(struct page *page) if (!section->page_ext) return NULL; #endif - return section->page_ext + pfn; + return get_entry(section->page_ext, pfn); } static void *__meminit alloc_page_ext(size_t size, int nid) @@ -229,7 +250,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid) if (section->page_ext) return 0; - table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; + table_size = get_entry_size() * PAGES_PER_SECTION; base = alloc_page_ext(table_size, nid); /* @@ -249,7 +270,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid) * we need to apply a mask. */ pfn &= PAGE_SECTION_MASK; - section->page_ext = base - pfn; + section->page_ext = (void *)base - get_entry_size() * pfn; total_usage += table_size; return 0; } @@ -262,7 +283,7 @@ static void free_page_ext(void *addr) struct page *page = virt_to_page(addr); size_t table_size; - table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; + table_size = get_entry_size() * PAGES_PER_SECTION; BUG_ON(PageReserved(page)); free_pages_exact(addr, table_size); @@ -277,7 +298,7 @@ static void __free_page_ext(unsigned long pfn) ms = __pfn_to_section(pfn); if (!ms || !ms->page_ext) return; - base = ms->page_ext + pfn; + base = get_entry(ms->page_ext, pfn); free_page_ext(base); ms->page_ext = NULL; } From 9300d8dfd282bd1473395c5c4c76bfdc90b05978 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 7 Oct 2016 16:58:30 -0700 Subject: [PATCH 034/127] mm/page_owner: don't define fields on struct page_ext by hard-coding There is a memory waste problem if we define field on struct page_ext by hard-coding. Entry size of struct page_ext includes the size of those fields even if it is disabled at runtime. Now, extra memory request at runtime is possible so page_owner don't need to define it's own fields by hard-coding. 
This patch removes hard-coded define and uses extra memory for storing page_owner information in page_owner. Most of code are just mechanical changes. Link: http://lkml.kernel.org/r/1471315879-32294-7-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Michal Hocko Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_ext.h | 6 --- mm/page_owner.c | 83 ++++++++++++++++++++++++++++------------ 2 files changed, 58 insertions(+), 31 deletions(-) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index 179bdc4a470c..9298c393ddaa 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -44,12 +44,6 @@ enum page_ext_flags { */ struct page_ext { unsigned long flags; -#ifdef CONFIG_PAGE_OWNER - unsigned int order; - gfp_t gfp_mask; - int last_migrate_reason; - depot_stack_handle_t handle; -#endif }; extern void pgdat_page_ext_init(struct pglist_data *pgdat); diff --git a/mm/page_owner.c b/mm/page_owner.c index 0f4246d109a0..60634dc53a88 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -18,6 +18,13 @@ */ #define PAGE_OWNER_STACK_DEPTH (16) +struct page_owner { + unsigned int order; + gfp_t gfp_mask; + int last_migrate_reason; + depot_stack_handle_t handle; +}; + static bool page_owner_disabled = true; DEFINE_STATIC_KEY_FALSE(page_owner_inited); @@ -86,10 +93,16 @@ static void init_page_owner(void) } struct page_ext_operations page_owner_ops = { + .size = sizeof(struct page_owner), .need = need_page_owner, .init = init_page_owner, }; +static inline struct page_owner *get_page_owner(struct page_ext *page_ext) +{ + return (void *)page_ext + page_owner_ops.offset; +} + void __reset_page_owner(struct page *page, unsigned int order) { int i; @@ -156,14 +169,16 @@ noinline void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) { struct page_ext *page_ext = lookup_page_ext(page); + struct page_owner *page_owner; if (unlikely(!page_ext)) return; - page_ext->handle = save_stack(gfp_mask); - page_ext->order = order; - page_ext->gfp_mask = gfp_mask; - page_ext->last_migrate_reason = -1; + page_owner = get_page_owner(page_ext); + page_owner->handle = save_stack(gfp_mask); + page_owner->order = order; + page_owner->gfp_mask = gfp_mask; + page_owner->last_migrate_reason = -1; __set_bit(PAGE_EXT_OWNER, &page_ext->flags); } @@ -171,21 +186,26 @@ noinline void __set_page_owner(struct page *page, unsigned int order, void __set_page_owner_migrate_reason(struct page *page, int reason) { struct page_ext *page_ext = lookup_page_ext(page); + struct page_owner *page_owner; + if (unlikely(!page_ext)) return; - page_ext->last_migrate_reason = reason; + page_owner = get_page_owner(page_ext); + page_owner->last_migrate_reason = reason; } void __split_page_owner(struct page *page, unsigned int order) { int i; struct page_ext *page_ext = lookup_page_ext(page); + struct page_owner *page_owner; if (unlikely(!page_ext)) return; - page_ext->order = 0; + page_owner = get_page_owner(page_ext); + page_owner->order = 0; for (i = 1; i < (1 << order); i++) __copy_page_owner(page, page + i); } @@ -194,14 +214,18 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) { struct page_ext *old_ext = lookup_page_ext(oldpage); struct page_ext *new_ext = lookup_page_ext(newpage); + struct page_owner *old_page_owner, *new_page_owner; if (unlikely(!old_ext || !new_ext)) return; - new_ext->order = old_ext->order; - new_ext->gfp_mask = old_ext->gfp_mask; - 
new_ext->last_migrate_reason = old_ext->last_migrate_reason; - new_ext->handle = old_ext->handle; + old_page_owner = get_page_owner(old_ext); + new_page_owner = get_page_owner(new_ext); + new_page_owner->order = old_page_owner->order; + new_page_owner->gfp_mask = old_page_owner->gfp_mask; + new_page_owner->last_migrate_reason = + old_page_owner->last_migrate_reason; + new_page_owner->handle = old_page_owner->handle; /* * We don't clear the bit on the oldpage as it's going to be freed @@ -220,6 +244,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, { struct page *page; struct page_ext *page_ext; + struct page_owner *page_owner; unsigned long pfn = zone->zone_start_pfn, block_end_pfn; unsigned long end_pfn = pfn + zone->spanned_pages; unsigned long count[MIGRATE_TYPES] = { 0, }; @@ -270,7 +295,9 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) continue; - page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); + page_owner = get_page_owner(page_ext); + page_mt = gfpflags_to_migratetype( + page_owner->gfp_mask); if (pageblock_mt != page_mt) { if (is_migrate_cma(pageblock_mt)) count[MIGRATE_MOVABLE]++; @@ -280,7 +307,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, pfn = block_end_pfn; break; } - pfn += (1UL << page_ext->order) - 1; + pfn += (1UL << page_owner->order) - 1; } } @@ -293,7 +320,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, - struct page *page, struct page_ext *page_ext, + struct page *page, struct page_owner *page_owner, depot_stack_handle_t handle) { int ret; @@ -313,15 +340,15 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, ret = snprintf(kbuf, count, "Page allocated via order %u, mask %#x(%pGg)\n", - page_ext->order, page_ext->gfp_mask, - &page_ext->gfp_mask); + page_owner->order, page_owner->gfp_mask, + &page_owner->gfp_mask); if (ret >= count) goto err; /* Print information relevant to grouping pages by mobility */ pageblock_mt = get_pageblock_migratetype(page); - page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); + page_mt = gfpflags_to_migratetype(page_owner->gfp_mask); ret += snprintf(kbuf + ret, count - ret, "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", pfn, @@ -338,10 +365,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, if (ret >= count) goto err; - if (page_ext->last_migrate_reason != -1) { + if (page_owner->last_migrate_reason != -1) { ret += snprintf(kbuf + ret, count - ret, "Page has been migrated, last migrate reason: %s\n", - migrate_reason_names[page_ext->last_migrate_reason]); + migrate_reason_names[page_owner->last_migrate_reason]); if (ret >= count) goto err; } @@ -364,6 +391,7 @@ err: void __dump_page_owner(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); + struct page_owner *page_owner; unsigned long entries[PAGE_OWNER_STACK_DEPTH]; struct stack_trace trace = { .nr_entries = 0, @@ -379,7 +407,9 @@ void __dump_page_owner(struct page *page) pr_alert("There is not page extension available.\n"); return; } - gfp_mask = page_ext->gfp_mask; + + page_owner = get_page_owner(page_ext); + gfp_mask = page_owner->gfp_mask; mt = gfpflags_to_migratetype(gfp_mask); if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { @@ -387,7 +417,7 @@ void __dump_page_owner(struct page *page) return; } - handle = READ_ONCE(page_ext->handle); + handle = READ_ONCE(page_owner->handle); if (!handle) { pr_alert("page_owner 
info is not active (free page?)\n"); return; @@ -395,12 +425,12 @@ void __dump_page_owner(struct page *page) depot_fetch_stack(handle, &trace); pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", - page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask); + page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask); print_stack_trace(&trace, 0); - if (page_ext->last_migrate_reason != -1) + if (page_owner->last_migrate_reason != -1) pr_alert("page has been migrated, last migrate reason: %s\n", - migrate_reason_names[page_ext->last_migrate_reason]); + migrate_reason_names[page_owner->last_migrate_reason]); } static ssize_t @@ -409,6 +439,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) unsigned long pfn; struct page *page; struct page_ext *page_ext; + struct page_owner *page_owner; depot_stack_handle_t handle; if (!static_branch_unlikely(&page_owner_inited)) @@ -458,11 +489,13 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) continue; + page_owner = get_page_owner(page_ext); + /* * Access to page_ext->handle isn't synchronous so we should * be careful to access it. */ - handle = READ_ONCE(page_ext->handle); + handle = READ_ONCE(page_owner->handle); if (!handle) continue; @@ -470,7 +503,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) *ppos = (pfn - min_low_pfn) + 1; return print_page_owner(buf, count, pfn, page, - page_ext, handle); + page_owner, handle); } return 0; From c4b209a426847b55c40360c1d04dc7986b55ddc7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 7 Oct 2016 16:58:33 -0700 Subject: [PATCH 035/127] do_generic_file_read(): fail immediately if killed If a fatal signal has been received, fail immediately instead of trying to read more data. If wait_on_page_locked_killable() was interrupted then this page is most likely is not PageUptodate() and in this case do_generic_file_read() will fail after lock_page_killable(). See also commit ebded02788b5 ("mm: filemap: avoid unnecessary calls to lock_page when waiting for IO to complete during a read") [oleg@redhat.com: changelog addition] Link: http://lkml.kernel.org/r/63068e8e-8bee-b208-8441-a3c39a9d9eb6@sandisk.com Signed-off-by: Bart Van Assche Reviewed-by: Jan Kara Acked-by: Oleg Nesterov Acked-by: Michal Hocko Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 68f1813fbdc3..1b05f75aea0f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1721,7 +1721,9 @@ find_page: * wait_on_page_locked is used to avoid unnecessarily * serialisations and why it's safe. */ - wait_on_page_locked_killable(page); + error = wait_on_page_locked_killable(page); + if (unlikely(error)) + goto readpage_error; if (PageUptodate(page)) goto page_ok; From f7e2355f0f8635ddcfd26858f58732b7bf85f9f4 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 7 Oct 2016 16:58:36 -0700 Subject: [PATCH 036/127] mm: pagewalk: fix the comment for test_walk Modify the comment describing struct mm_walk->test_walk()s behaviour to match the comment on walk_page_test() and the behaviour of walk_page_vma(). 
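For illustration only (not part of this patch), a ->test_walk callback following the convention that the corrected comment in the hunk below documents might look like the minimal sketch here; the function name and the skip policy are made up for the example, and it assumes the struct mm_walk definitions from <linux/mm.h>:

        /*
         * Illustrative only: return 1 to skip the current vma, 0 to walk
         * its page tables, and a negative errno to abort the whole walk.
         */
        static int example_test_walk(unsigned long start, unsigned long end,
                                     struct mm_walk *walk)
        {
                struct vm_area_struct *vma = walk->vma;

                if (vma->vm_flags & VM_PFNMAP)
                        return 1;       /* skip this vma, keep walking */
                if (fatal_signal_pending(current))
                        return -EINTR;  /* abort the page table walk */
                return 0;               /* walk this vma's page tables */
        }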
Fixes: fafaa4264eba4 ("pagewalk: improve vma handling") Link: http://lkml.kernel.org/r/1471622518-21980-1-git-send-email-james.morse@arm.com Signed-off-by: James Morse Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5f14534f0c90..0a063b4e4456 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1197,10 +1197,10 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, * @pte_hole: if set, called for each hole at all levels * @hugetlb_entry: if set, called for each hugetlb entry * @test_walk: caller specific callback function to determine whether - * we walk over the current vma or not. A positive returned + * we walk over the current vma or not. Returning 0 * value means "do page table walk over the current vma," * and a negative one means "abort current page table walk - * right now." 0 means "skip the current vma." + * right now." 1 means "skip the current vma." * @mm: mm_struct representing the target process of page table walk * @vma: vma currently walked (NULL if walking outside vmas) * @private: private data for callbacks' usage From 131ddc5c7d814d61f945b6322019e5148f6d39f0 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 7 Oct 2016 16:58:39 -0700 Subject: [PATCH 037/127] mm: unrig VMA cache hit ratio Current code doesn't count the first FIND operation after a VMA cache flush (which happens surprisingly often), artificially increasing the cache hit ratio. On my regular setup the difference is:

Before                                        After
==========================================================
* boot, login into KDE
vmacache_find_calls 446216                    vmacache_find_calls 492741
vmacache_find_hits 277596                     vmacache_find_hits 276096
~62.2%                                        ~56.0%

* rebuild kernel (no changes to code, usual config)
vmacache_find_calls 1943007                   vmacache_find_calls 2083718
vmacache_find_hits 1246123                    vmacache_find_hits 1244146
~64.1%                                        ~59.7%

* rebuild kernel (full rebuild, usual config)
vmacache_find_calls 32163155                  vmacache_find_calls 33677183
vmacache_find_hits 27889956                   vmacache_find_hits 27877591
~88.2%                                        ~84.3%

Total: the reported cache hit ratio was inflated by roughly 4 percentage points. If someone is counting the _relative_ cache _miss_ ratio, the misreporting is much higher.
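For concreteness, the percentages above are simply vmacache_find_hits / vmacache_find_calls from the listed counters: for the KDE boot case, 277596 / 446216 ≈ 62.2% before the change versus 276096 / 492741 ≈ 56.0% after, i.e. the previously uncounted first-lookup-after-flush calls were inflating the reported hit ratio.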
Link: http://lkml.kernel.org/r/20160822225009.GA3934@p183.telecom.by Signed-off-by: Alexey Dobriyan Cc: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmacache.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/vmacache.c b/mm/vmacache.c index fd09dc9c6812..035fdeb35b43 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -87,11 +87,11 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) { int i; + count_vm_vmacache_event(VMACACHE_FIND_CALLS); + if (!vmacache_valid(mm)) return NULL; - count_vm_vmacache_event(VMACACHE_FIND_CALLS); - for (i = 0; i < VMACACHE_SIZE; i++) { struct vm_area_struct *vma = current->vmacache[i]; @@ -115,11 +115,11 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, { int i; + count_vm_vmacache_event(VMACACHE_FIND_CALLS); + if (!vmacache_valid(mm)) return NULL; - count_vm_vmacache_event(VMACACHE_FIND_CALLS); - for (i = 0; i < VMACACHE_SIZE; i++) { struct vm_area_struct *vma = current->vmacache[i]; From 6b53491598a4d9694318e6e2b11d8c9988a483d4 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 7 Oct 2016 16:58:42 -0700 Subject: [PATCH 038/127] mm, swap: add swap_cluster_list This is a code clean up patch without functionality changes. The swap_cluster_list data structure and its operations are introduced to provide some better encapsulation for the free cluster and discard cluster list operations. This avoid some code duplication, improved the code readability, and reduced the total line number. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1472067356-16004-1-git-send-email-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Minchan Kim Acked-by: Rik van Riel Cc: Tim Chen Cc: Hugh Dickins Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 11 ++-- mm/swapfile.c | 133 ++++++++++++++++++++----------------------- 2 files changed, 70 insertions(+), 74 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index e1d761463243..a56523cefb9b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -191,6 +191,11 @@ struct percpu_cluster { unsigned int next; /* Likely next allocation offset */ }; +struct swap_cluster_list { + struct swap_cluster_info head; + struct swap_cluster_info tail; +}; + /* * The in-memory structure used to track swap areas. */ @@ -203,8 +208,7 @@ struct swap_info_struct { unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ - struct swap_cluster_info free_cluster_head; /* free cluster list head */ - struct swap_cluster_info free_cluster_tail; /* free cluster list tail */ + struct swap_cluster_list free_clusters; /* free clusters list */ unsigned int lowest_bit; /* index of first free in swap_map */ unsigned int highest_bit; /* index of last free in swap_map */ unsigned int pages; /* total of usable pages of swap */ @@ -235,8 +239,7 @@ struct swap_info_struct { * first. 
*/ struct work_struct discard_work; /* discard worker */ - struct swap_cluster_info discard_cluster_head; /* list head of discard clusters */ - struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */ + struct swap_cluster_list discard_clusters; /* discard clusters list */ }; /* linux/mm/workingset.c */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 2657accc6e2b..134c085d0d7b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -257,6 +257,53 @@ static inline void cluster_set_null(struct swap_cluster_info *info) info->data = 0; } +static inline bool cluster_list_empty(struct swap_cluster_list *list) +{ + return cluster_is_null(&list->head); +} + +static inline unsigned int cluster_list_first(struct swap_cluster_list *list) +{ + return cluster_next(&list->head); +} + +static void cluster_list_init(struct swap_cluster_list *list) +{ + cluster_set_null(&list->head); + cluster_set_null(&list->tail); +} + +static void cluster_list_add_tail(struct swap_cluster_list *list, + struct swap_cluster_info *ci, + unsigned int idx) +{ + if (cluster_list_empty(list)) { + cluster_set_next_flag(&list->head, idx, 0); + cluster_set_next_flag(&list->tail, idx, 0); + } else { + unsigned int tail = cluster_next(&list->tail); + + cluster_set_next(&ci[tail], idx); + cluster_set_next_flag(&list->tail, idx, 0); + } +} + +static unsigned int cluster_list_del_first(struct swap_cluster_list *list, + struct swap_cluster_info *ci) +{ + unsigned int idx; + + idx = cluster_next(&list->head); + if (cluster_next(&list->tail) == idx) { + cluster_set_null(&list->head); + cluster_set_null(&list->tail); + } else + cluster_set_next_flag(&list->head, + cluster_next(&ci[idx]), 0); + + return idx; +} + /* Add a cluster to discard list and schedule it to do discard */ static void swap_cluster_schedule_discard(struct swap_info_struct *si, unsigned int idx) @@ -270,17 +317,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, memset(si->swap_map + idx * SWAPFILE_CLUSTER, SWAP_MAP_BAD, SWAPFILE_CLUSTER); - if (cluster_is_null(&si->discard_cluster_head)) { - cluster_set_next_flag(&si->discard_cluster_head, - idx, 0); - cluster_set_next_flag(&si->discard_cluster_tail, - idx, 0); - } else { - unsigned int tail = cluster_next(&si->discard_cluster_tail); - cluster_set_next(&si->cluster_info[tail], idx); - cluster_set_next_flag(&si->discard_cluster_tail, - idx, 0); - } + cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); schedule_work(&si->discard_work); } @@ -296,15 +333,8 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) info = si->cluster_info; - while (!cluster_is_null(&si->discard_cluster_head)) { - idx = cluster_next(&si->discard_cluster_head); - - cluster_set_next_flag(&si->discard_cluster_head, - cluster_next(&info[idx]), 0); - if (cluster_next(&si->discard_cluster_tail) == idx) { - cluster_set_null(&si->discard_cluster_head); - cluster_set_null(&si->discard_cluster_tail); - } + while (!cluster_list_empty(&si->discard_clusters)) { + idx = cluster_list_del_first(&si->discard_clusters, info); spin_unlock(&si->lock); discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, @@ -312,19 +342,7 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) spin_lock(&si->lock); cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE); - if (cluster_is_null(&si->free_cluster_head)) { - cluster_set_next_flag(&si->free_cluster_head, - idx, 0); - cluster_set_next_flag(&si->free_cluster_tail, - idx, 0); - } else { - unsigned int tail; - - tail = 
cluster_next(&si->free_cluster_tail); - cluster_set_next(&info[tail], idx); - cluster_set_next_flag(&si->free_cluster_tail, - idx, 0); - } + cluster_list_add_tail(&si->free_clusters, info, idx); memset(si->swap_map + idx * SWAPFILE_CLUSTER, 0, SWAPFILE_CLUSTER); } @@ -353,13 +371,8 @@ static void inc_cluster_info_page(struct swap_info_struct *p, if (!cluster_info) return; if (cluster_is_free(&cluster_info[idx])) { - VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx); - cluster_set_next_flag(&p->free_cluster_head, - cluster_next(&cluster_info[idx]), 0); - if (cluster_next(&p->free_cluster_tail) == idx) { - cluster_set_null(&p->free_cluster_tail); - cluster_set_null(&p->free_cluster_head); - } + VM_BUG_ON(cluster_list_first(&p->free_clusters) != idx); + cluster_list_del_first(&p->free_clusters, cluster_info); cluster_set_count_flag(&cluster_info[idx], 0, 0); } @@ -398,14 +411,7 @@ static void dec_cluster_info_page(struct swap_info_struct *p, } cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); - if (cluster_is_null(&p->free_cluster_head)) { - cluster_set_next_flag(&p->free_cluster_head, idx, 0); - cluster_set_next_flag(&p->free_cluster_tail, idx, 0); - } else { - unsigned int tail = cluster_next(&p->free_cluster_tail); - cluster_set_next(&cluster_info[tail], idx); - cluster_set_next_flag(&p->free_cluster_tail, idx, 0); - } + cluster_list_add_tail(&p->free_clusters, cluster_info, idx); } } @@ -421,8 +427,8 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, bool conflict; offset /= SWAPFILE_CLUSTER; - conflict = !cluster_is_null(&si->free_cluster_head) && - offset != cluster_next(&si->free_cluster_head) && + conflict = !cluster_list_empty(&si->free_clusters) && + offset != cluster_list_first(&si->free_clusters) && cluster_is_free(&si->cluster_info[offset]); if (!conflict) @@ -447,11 +453,11 @@ static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, new_cluster: cluster = this_cpu_ptr(si->percpu_cluster); if (cluster_is_null(&cluster->index)) { - if (!cluster_is_null(&si->free_cluster_head)) { - cluster->index = si->free_cluster_head; + if (!cluster_list_empty(&si->free_clusters)) { + cluster->index = si->free_clusters.head; cluster->next = cluster_next(&cluster->index) * SWAPFILE_CLUSTER; - } else if (!cluster_is_null(&si->discard_cluster_head)) { + } else if (!cluster_list_empty(&si->discard_clusters)) { /* * we don't have free cluster but have some clusters in * discarding, do discard now and reclaim them @@ -2292,10 +2298,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, nr_good_pages = maxpages - 1; /* omit header page */ - cluster_set_null(&p->free_cluster_head); - cluster_set_null(&p->free_cluster_tail); - cluster_set_null(&p->discard_cluster_head); - cluster_set_null(&p->discard_cluster_tail); + cluster_list_init(&p->free_clusters); + cluster_list_init(&p->discard_clusters); for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; @@ -2341,19 +2345,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, for (i = 0; i < nr_clusters; i++) { if (!cluster_count(&cluster_info[idx])) { cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); - if (cluster_is_null(&p->free_cluster_head)) { - cluster_set_next_flag(&p->free_cluster_head, - idx, 0); - cluster_set_next_flag(&p->free_cluster_tail, - idx, 0); - } else { - unsigned int tail; - - tail = cluster_next(&p->free_cluster_tail); - cluster_set_next(&cluster_info[tail], idx); - 
cluster_set_next_flag(&p->free_cluster_tail, - idx, 0); - } + cluster_list_add_tail(&p->free_clusters, cluster_info, + idx); } idx++; if (idx == nr_clusters) From 7ebffa45551fe7db86a2b32bf586f124ef484e6e Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 7 Oct 2016 16:58:45 -0700 Subject: [PATCH 039/127] mm,oom_reaper: reduce find_lock_task_mm() usage Patch series "fortify oom killer even more", v2. This patch (of 9): __oom_reap_task() can be simplified a bit if it receives a valid mm from oom_reap_task() which also uses that mm when __oom_reap_task() failed. We can drop one find_lock_task_mm() call and also make the __oom_reap_task() code flow easier to follow. Moreover, this will make later patch in the series easier to review. Pinning mm's mm_count for longer time is not really harmful because this will not pin much memory. This patch doesn't introduce any functional change. Link: http://lkml.kernel.org/r/1472119394-11342-2-git-send-email-mhocko@kernel.org Signed-off-by: Tetsuo Handa Signed-off-by: Michal Hocko Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 81 ++++++++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 43 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 463cdd22d4e0..87fad956c96b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -463,12 +463,10 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); -static bool __oom_reap_task(struct task_struct *tsk) +static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) { struct mmu_gather tlb; struct vm_area_struct *vma; - struct mm_struct *mm = NULL; - struct task_struct *p; struct zap_details details = {.check_swap_entries = true, .ignore_dirty = true}; bool ret = true; @@ -476,7 +474,7 @@ static bool __oom_reap_task(struct task_struct *tsk) /* * We have to make sure to not race with the victim exit path * and cause premature new oom victim selection: - * __oom_reap_task exit_mm + * __oom_reap_task_mm exit_mm * mmget_not_zero * mmput * atomic_dec_and_test @@ -489,22 +487,9 @@ static bool __oom_reap_task(struct task_struct *tsk) */ mutex_lock(&oom_lock); - /* - * Make sure we find the associated mm_struct even when the particular - * thread has already terminated and cleared its mm. - * We might have race with exit path so consider our work done if there - * is no mm. - */ - p = find_lock_task_mm(tsk); - if (!p) - goto unlock_oom; - mm = p->mm; - atomic_inc(&mm->mm_count); - task_unlock(p); - if (!down_read_trylock(&mm->mmap_sem)) { ret = false; - goto mm_drop; + goto unlock_oom; } /* @@ -514,7 +499,7 @@ static bool __oom_reap_task(struct task_struct *tsk) */ if (!mmget_not_zero(mm)) { up_read(&mm->mmap_sem); - goto mm_drop; + goto unlock_oom; } tlb_gather_mmu(&tlb, mm, 0, -1); @@ -562,8 +547,6 @@ static bool __oom_reap_task(struct task_struct *tsk) * put the oom_reaper out of the way. */ mmput_async(mm); -mm_drop: - mmdrop(mm); unlock_oom: mutex_unlock(&oom_lock); return ret; @@ -573,36 +556,45 @@ unlock_oom: static void oom_reap_task(struct task_struct *tsk) { int attempts = 0; + struct mm_struct *mm = NULL; + struct task_struct *p = find_lock_task_mm(tsk); + + /* + * Make sure we find the associated mm_struct even when the particular + * thread has already terminated and cleared its mm. + * We might have race with exit path so consider our work done if there + * is no mm. 
+ */ + if (!p) + goto done; + mm = p->mm; + atomic_inc(&mm->mm_count); + task_unlock(p); /* Retry the down_read_trylock(mmap_sem) a few times */ - while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk)) + while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm)) schedule_timeout_idle(HZ/10); - if (attempts > MAX_OOM_REAP_RETRIES) { - struct task_struct *p; + if (attempts <= MAX_OOM_REAP_RETRIES) + goto done; - pr_info("oom_reaper: unable to reap pid:%d (%s)\n", - task_pid_nr(tsk), tsk->comm); + pr_info("oom_reaper: unable to reap pid:%d (%s)\n", + task_pid_nr(tsk), tsk->comm); - /* - * If we've already tried to reap this task in the past and - * failed it probably doesn't make much sense to try yet again - * so hide the mm from the oom killer so that it can move on - * to another task with a different mm struct. - */ - p = find_lock_task_mm(tsk); - if (p) { - if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &p->mm->flags)) { - pr_info("oom_reaper: giving up pid:%d (%s)\n", - task_pid_nr(tsk), tsk->comm); - set_bit(MMF_OOM_REAPED, &p->mm->flags); - } - task_unlock(p); - } - - debug_show_all_locks(); + /* + * If we've already tried to reap this task in the past and + * failed it probably doesn't make much sense to try yet again + * so hide the mm from the oom killer so that it can move on + * to another task with a different mm struct. + */ + if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &mm->flags)) { + pr_info("oom_reaper: giving up pid:%d (%s)\n", + task_pid_nr(tsk), tsk->comm); + set_bit(MMF_OOM_REAPED, &mm->flags); } + debug_show_all_locks(); +done: /* * Clear TIF_MEMDIE because the task shouldn't be sitting on a * reasonably reclaimable memory anymore or it is not a good candidate @@ -614,6 +606,9 @@ static void oom_reap_task(struct task_struct *tsk) /* Drop a reference taken by wake_oom_reaper */ put_task_struct(tsk); + /* Drop a reference taken above. */ + if (mm) + mmdrop(mm); } static int oom_reaper(void *unused) From 8496afaba93ece80a83cbd096f0675a1020ddfc4 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 7 Oct 2016 16:58:48 -0700 Subject: [PATCH 040/127] mm,oom_reaper: do not attempt to reap a task twice "mm, oom_reaper: do not attempt to reap a task twice" tried to give the OOM reaper one more chance to retry using MMF_OOM_NOT_REAPABLE flag. But the usefulness of the flag is rather limited and actually never shown in practice. If the flag is set, it means that the holder of mm->mmap_sem cannot call up_write() due to presumably being blocked at unkillable wait waiting for other thread's memory allocation. But since one of threads sharing that mm will queue that mm immediately via task_will_free_mem() shortcut (otherwise, oom_badness() will select the same mm again due to oom_score_adj value unchanged), retrying MMF_OOM_NOT_REAPABLE mm is unlikely helpful. Let's always set MMF_OOM_REAPED. 
Link: http://lkml.kernel.org/r/1472119394-11342-3-git-send-email-mhocko@kernel.org Signed-off-by: Tetsuo Handa Signed-off-by: Michal Hocko Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 - mm/oom_kill.c | 15 +++------------ 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 7543a476178b..b48cd32be445 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -525,7 +525,6 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ #define MMF_OOM_REAPED 21 /* mm has been already reaped */ -#define MMF_OOM_NOT_REAPABLE 22 /* mm couldn't be reaped */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 87fad956c96b..45097f5a8f30 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -578,20 +578,11 @@ static void oom_reap_task(struct task_struct *tsk) if (attempts <= MAX_OOM_REAP_RETRIES) goto done; + /* Ignore this mm because somebody can't call up_write(mmap_sem). */ + set_bit(MMF_OOM_REAPED, &mm->flags); + pr_info("oom_reaper: unable to reap pid:%d (%s)\n", task_pid_nr(tsk), tsk->comm); - - /* - * If we've already tried to reap this task in the past and - * failed it probably doesn't make much sense to try yet again - * so hide the mm from the oom killer so that it can move on - * to another task with a different mm struct. - */ - if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &mm->flags)) { - pr_info("oom_reaper: giving up pid:%d (%s)\n", - task_pid_nr(tsk), tsk->comm); - set_bit(MMF_OOM_REAPED, &mm->flags); - } debug_show_all_locks(); done: From 26db62f179d112d345031e14926a4cda9cd40d6e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:58:51 -0700 Subject: [PATCH 041/127] oom: keep mm of the killed task available oom_reap_task has to call exit_oom_victim in order to make sure that the oom vicim will not block the oom killer for ever. This is, however, opening new problems (e.g oom_killer_disable exclusion - see commit 74070542099c ("oom, suspend: fix oom_reaper vs. oom_killer_disable race")). exit_oom_victim should be only called from the victim's context ideally. One way to achieve this would be to rely on per mm_struct flags. We already have MMF_OOM_REAPED to hide a task from the oom killer since "mm, oom: hide mm which is shared with kthread or global init". The problem is that the exit path: do_exit exit_mm tsk->mm = NULL; mmput __mmput exit_oom_victim doesn't guarantee that exit_oom_victim will get called in a bounded amount of time. At least exit_aio depends on IO which might get blocked due to lack of memory and who knows what else is lurking there. This patch takes a different approach. We remember tsk->mm into the signal_struct and bind it to the signal struct life time for all oom victims. __oom_reap_task_mm as well as oom_scan_process_thread do not have to rely on find_lock_task_mm anymore and they will have a reliable reference to the mm struct. As a result all the oom specific communication inside the OOM killer can be done via tsk->signal->oom_mm. Increasing the signal_struct for something as unlikely as the oom killer is far from ideal but this approach will make the code much more reasonable and long term we even might want to move task->mm into the signal_struct anyway. 
In the next step we might want to make the oom killer exclusion and access to memory reserves completely independent which would be also nice. Link: http://lkml.kernel.org/r/1472119394-11342-4-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 ++ kernel/fork.c | 2 ++ mm/oom_kill.c | 51 ++++++++++++++++--------------------------- 3 files changed, 23 insertions(+), 32 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index b48cd32be445..67ea79610e67 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -805,6 +805,8 @@ struct signal_struct { short oom_score_adj; /* OOM kill score adjustment */ short oom_score_adj_min; /* OOM kill score adjustment min value. * Only settable by CAP_SYS_RESOURCE. */ + struct mm_struct *oom_mm; /* recorded mm when the thread group got + * killed by the oom killer */ struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations diff --git a/kernel/fork.c b/kernel/fork.c index 9a05bd93f8e7..48cafe787b75 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -359,6 +359,8 @@ static inline void free_signal_struct(struct signal_struct *sig) { taskstats_tgid_free(sig); sched_autogroup_exit(sig); + if (sig->oom_mm) + mmdrop(sig->oom_mm); kmem_cache_free(signal_cachep, sig); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 45097f5a8f30..f16ec0840a0e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -300,14 +300,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) * any memory is quite low. */ if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) { - struct task_struct *p = find_lock_task_mm(task); - bool reaped = false; - - if (p) { - reaped = test_bit(MMF_OOM_REAPED, &p->mm->flags); - task_unlock(p); - } - if (reaped) + if (test_bit(MMF_OOM_REAPED, &task->signal->oom_mm->flags)) goto next; goto abort; } @@ -536,11 +529,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) K(get_mm_counter(mm, MM_SHMEMPAGES))); up_read(&mm->mmap_sem); - /* - * This task can be safely ignored because we cannot do much more - * to release its memory. - */ - set_bit(MMF_OOM_REAPED, &mm->flags); /* * Drop our reference but make sure the mmput slow path is called from a * different context because we shouldn't risk we get stuck there and @@ -556,20 +544,7 @@ unlock_oom: static void oom_reap_task(struct task_struct *tsk) { int attempts = 0; - struct mm_struct *mm = NULL; - struct task_struct *p = find_lock_task_mm(tsk); - - /* - * Make sure we find the associated mm_struct even when the particular - * thread has already terminated and cleared its mm. - * We might have race with exit path so consider our work done if there - * is no mm. - */ - if (!p) - goto done; - mm = p->mm; - atomic_inc(&mm->mm_count); - task_unlock(p); + struct mm_struct *mm = tsk->signal->oom_mm; /* Retry the down_read_trylock(mmap_sem) a few times */ while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm)) @@ -578,8 +553,6 @@ static void oom_reap_task(struct task_struct *tsk) if (attempts <= MAX_OOM_REAP_RETRIES) goto done; - /* Ignore this mm because somebody can't call up_write(mmap_sem). 
*/ - set_bit(MMF_OOM_REAPED, &mm->flags); pr_info("oom_reaper: unable to reap pid:%d (%s)\n", task_pid_nr(tsk), tsk->comm); @@ -595,11 +568,14 @@ done: tsk->oom_reaper_list = NULL; exit_oom_victim(tsk); + /* + * Hide this mm from OOM killer because it has been either reaped or + * somebody can't call up_write(mmap_sem). + */ + set_bit(MMF_OOM_REAPED, &mm->flags); + /* Drop a reference taken by wake_oom_reaper */ put_task_struct(tsk); - /* Drop a reference taken above. */ - if (mm) - mmdrop(mm); } static int oom_reaper(void *unused) @@ -665,14 +641,25 @@ static inline void wake_oom_reaper(struct task_struct *tsk) * * Has to be called with oom_lock held and never after * oom has been disabled already. + * + * tsk->mm has to be non NULL and caller has to guarantee it is stable (either + * under task_lock or operate on the current). */ static void mark_oom_victim(struct task_struct *tsk) { + struct mm_struct *mm = tsk->mm; + WARN_ON(oom_killer_disabled); /* OOM killer might race with memcg OOM */ if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) return; + atomic_inc(&tsk->signal->oom_victims); + + /* oom_mm is bound to the signal struct life time. */ + if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) + atomic_inc(&tsk->signal->oom_mm->mm_count); + /* * Make sure that the task is woken up from uninterruptible sleep * if it is frozen because OOM killer wouldn't be able to free From 7283094ec3db318e87ec9e31cf75f136ac2a4dd3 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:58:54 -0700 Subject: [PATCH 042/127] kernel, oom: fix potential pgd_lock deadlock from __mmdrop Lockdep complains that __mmdrop is not safe from the softirq context: ================================= [ INFO: inconsistent lock state ] 4.6.0-oomfortification2-00011-geeb3eadeab96-dirty #949 Tainted: G W --------------------------------- inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. 
swapper/1/0 [HC0[0]:SC1[1]:HE1:SE0] takes: (pgd_lock){+.?...}, at: pgd_free+0x19/0x6b {SOFTIRQ-ON-W} state was registered at: __lock_acquire+0xa06/0x196e lock_acquire+0x139/0x1e1 _raw_spin_lock+0x32/0x41 __change_page_attr_set_clr+0x2a5/0xacd change_page_attr_set_clr+0x16f/0x32c set_memory_nx+0x37/0x3a free_init_pages+0x9e/0xc7 alternative_instructions+0xa2/0xb3 check_bugs+0xe/0x2d start_kernel+0x3ce/0x3ea x86_64_start_reservations+0x2a/0x2c x86_64_start_kernel+0x17a/0x18d irq event stamp: 105916 hardirqs last enabled at (105916): free_hot_cold_page+0x37e/0x390 hardirqs last disabled at (105915): free_hot_cold_page+0x2c1/0x390 softirqs last enabled at (105878): _local_bh_enable+0x42/0x44 softirqs last disabled at (105879): irq_exit+0x6f/0xd1 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(pgd_lock); lock(pgd_lock); *** DEADLOCK *** 1 lock held by swapper/1/0: #0: (rcu_callback){......}, at: rcu_process_callbacks+0x390/0x800 stack backtrace: CPU: 1 PID: 0 Comm: swapper/1 Tainted: G W 4.6.0-oomfortification2-00011-geeb3eadeab96-dirty #949 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Debian-1.8.2-1 04/01/2014 Call Trace: print_usage_bug.part.25+0x259/0x268 mark_lock+0x381/0x567 __lock_acquire+0x993/0x196e lock_acquire+0x139/0x1e1 _raw_spin_lock+0x32/0x41 pgd_free+0x19/0x6b __mmdrop+0x25/0xb9 __put_task_struct+0x103/0x11e delayed_put_task_struct+0x157/0x15e rcu_process_callbacks+0x660/0x800 __do_softirq+0x1ec/0x4d5 irq_exit+0x6f/0xd1 smp_apic_timer_interrupt+0x42/0x4d apic_timer_interrupt+0x8e/0xa0 arch_cpu_idle+0xf/0x11 default_idle_call+0x32/0x34 cpu_startup_entry+0x20c/0x399 start_secondary+0xfe/0x101 More over commit a79e53d85683 ("x86/mm: Fix pgd_lock deadlock") was explicit about pgd_lock not to be called from the irq context. This means that __mmdrop called from free_signal_struct has to be postponed to a user context. We already have a similar mechanism for mmput_async so we can use it here as well. This is safe because mm_count is pinned by mm_users. 
This fixes bug introduced by "oom: keep mm of the killed task available" Link: http://lkml.kernel.org/r/1472119394-11342-5-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 -- include/linux/sched.h | 14 ++++++++++++++ kernel/fork.c | 6 +++++- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 903200f4ec41..4a8acedf4b7d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -515,9 +515,7 @@ struct mm_struct { #ifdef CONFIG_HUGETLB_PAGE atomic_long_t hugetlb_usage; #endif -#ifdef CONFIG_MMU struct work_struct async_put_work; -#endif }; static inline void mm_init_cpumask(struct mm_struct *mm) diff --git a/include/linux/sched.h b/include/linux/sched.h index 67ea79610e67..c4b588358296 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2877,6 +2877,20 @@ static inline void mmdrop(struct mm_struct *mm) __mmdrop(mm); } +static inline void mmdrop_async_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); + __mmdrop(mm); +} + +static inline void mmdrop_async(struct mm_struct *mm) +{ + if (unlikely(atomic_dec_and_test(&mm->mm_count))) { + INIT_WORK(&mm->async_put_work, mmdrop_async_fn); + schedule_work(&mm->async_put_work); + } +} + static inline bool mmget_not_zero(struct mm_struct *mm) { return atomic_inc_not_zero(&mm->mm_users); diff --git a/kernel/fork.c b/kernel/fork.c index 48cafe787b75..5650e35dda43 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -359,8 +359,12 @@ static inline void free_signal_struct(struct signal_struct *sig) { taskstats_tgid_free(sig); sched_autogroup_exit(sig); + /* + * __mmdrop is not safe to call from softirq context on x86 due to + * pgd_dtor so postpone it to the async context + */ if (sig->oom_mm) - mmdrop(sig->oom_mm); + mmdrop_async(sig->oom_mm); kmem_cache_free(signal_cachep, sig); } From 862e3073b3eed13f17bd6be6ca6052db15c0b728 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:58:57 -0700 Subject: [PATCH 043/127] mm, oom: get rid of signal_struct::oom_victims After "oom: keep mm of the killed task available" we can safely detect an oom victim by checking task->signal->oom_mm so we do not need the signal_struct counter anymore so let's get rid of it. This alone wouldn't be sufficient for nommu archs because exit_oom_victim doesn't hide the process from the oom killer anymore. We can, however, mark the mm with a MMF flag in __mmput. We can reuse MMF_OOM_REAPED and rename it to a more generic MMF_OOM_SKIP. 
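As a rough, purely illustrative sketch of how the pieces introduced below fit together (the helper name is made up and is not part of the patch): once signal->oom_mm pins the victim's mm, an OOM-killer path can check victim state without find_lock_task_mm():

        /*
         * Illustrative helper only: mirrors the oom_evaluate_task() check in
         * the diff below, combining tsk_is_oom_victim() with MMF_OOM_SKIP.
         */
        static bool example_victim_already_handled(struct task_struct *tsk)
        {
                return tsk_is_oom_victim(tsk) &&
                       test_bit(MMF_OOM_SKIP, &tsk->signal->oom_mm->flags);
        }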
Link: http://lkml.kernel.org/r/1472119394-11342-6-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 5 +++++ include/linux/sched.h | 3 +-- kernel/fork.c | 1 + mm/oom_kill.c | 17 +++++++---------- 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/include/linux/oom.h b/include/linux/oom.h index 17946e5121b6..b61357d07170 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -58,6 +58,11 @@ static inline bool oom_task_origin(const struct task_struct *p) return p->signal->oom_flag_origin; } +static inline bool tsk_is_oom_victim(struct task_struct * tsk) +{ + return tsk->signal->oom_mm; +} + extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); diff --git a/include/linux/sched.h b/include/linux/sched.h index c4b588358296..af0721364788 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -524,7 +524,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ -#define MMF_OOM_REAPED 21 /* mm has been already reaped */ +#define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) @@ -672,7 +672,6 @@ struct signal_struct { atomic_t sigcnt; atomic_t live; int nr_threads; - atomic_t oom_victims; /* # of TIF_MEDIE threads in this thread group */ struct list_head thread_head; wait_queue_head_t wait_chldexit; /* for wait4() */ diff --git a/kernel/fork.c b/kernel/fork.c index 5650e35dda43..9a8ec66cd4df 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -862,6 +862,7 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); + set_bit(MMF_OOM_SKIP, &mm->flags); mmdrop(mm); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f16ec0840a0e..e2a2c35dd493 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -186,7 +186,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, */ adj = (long)p->signal->oom_score_adj; if (adj == OOM_SCORE_ADJ_MIN || - test_bit(MMF_OOM_REAPED, &p->mm->flags) || + test_bit(MMF_OOM_SKIP, &p->mm->flags) || in_vfork(p)) { task_unlock(p); return 0; @@ -296,11 +296,11 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) /* * This task already has access to memory reserves and is being killed. * Don't allow any other task to have access to the reserves unless - * the task has MMF_OOM_REAPED because chances that it would release + * the task has MMF_OOM_SKIP because chances that it would release * any memory is quite low. */ - if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) { - if (test_bit(MMF_OOM_REAPED, &task->signal->oom_mm->flags)) + if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) { + if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) goto next; goto abort; } @@ -572,7 +572,7 @@ done: * Hide this mm from OOM killer because it has been either reaped or * somebody can't call up_write(mmap_sem). 
*/ - set_bit(MMF_OOM_REAPED, &mm->flags); + set_bit(MMF_OOM_SKIP, &mm->flags); /* Drop a reference taken by wake_oom_reaper */ put_task_struct(tsk); @@ -654,8 +654,6 @@ static void mark_oom_victim(struct task_struct *tsk) if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) return; - atomic_inc(&tsk->signal->oom_victims); - /* oom_mm is bound to the signal struct life time. */ if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) atomic_inc(&tsk->signal->oom_mm->mm_count); @@ -677,7 +675,6 @@ void exit_oom_victim(struct task_struct *tsk) { if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) return; - atomic_dec(&tsk->signal->oom_victims); if (!atomic_dec_return(&oom_victims)) wake_up_all(&oom_victims_wait); @@ -769,7 +766,7 @@ static bool task_will_free_mem(struct task_struct *task) * This task has already been drained by the oom reaper so there are * only small chances it will free some more */ - if (test_bit(MMF_OOM_REAPED, &mm->flags)) + if (test_bit(MMF_OOM_SKIP, &mm->flags)) return false; if (atomic_read(&mm->mm_users) <= 1) @@ -906,7 +903,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) * killer to guarantee OOM forward progress. */ can_oom_reap = false; - set_bit(MMF_OOM_REAPED, &mm->flags); + set_bit(MMF_OOM_SKIP, &mm->flags); pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n", task_pid_nr(victim), victim->comm, task_pid_nr(p), p->comm); From 7d2e7a22cf27e7569e6816ccc05dd74248048b30 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:59:00 -0700 Subject: [PATCH 044/127] oom, suspend: fix oom_killer_disable vs. pm suspend properly Commit 74070542099c ("oom, suspend: fix oom_reaper vs. oom_killer_disable race") has workaround an existing race between oom_killer_disable and oom_reaper by adding another round of try_to_freeze_tasks after the oom killer was disabled. This was the easiest thing to do for a late 4.7 fix. Let's fix it properly now. After "oom: keep mm of the killed task available" we no longer have to call exit_oom_victim from the oom reaper because we have stable mm available and hide the oom_reaped mm by MMF_OOM_SKIP flag. So let's remove exit_oom_victim and the race described in the above commit doesn't exist anymore if. Unfortunately this alone is not sufficient for the oom_killer_disable usecase because now we do not have any reliable way to reach exit_oom_victim (the victim might get stuck on a way to exit for an unbounded amount of time). OOM killer can cope with that by checking mm flags and move on to another victim but we cannot do the same for oom_killer_disable as we would lose the guarantee of no further interference of the victim with the rest of the system. What we can do instead is to cap the maximum time the oom_killer_disable waits for victims. The only current user of this function (pm suspend) already has a concept of timeout for back off so we can reuse the same value there. Let's drop set_freezable for the oom_reaper kthread because it is no longer needed as the reaper doesn't wake or thaw any processes. 
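The core of the change is replacing an unbounded wait on oom_victims with a timed wait that backs off, re-enabling the OOM killer, when the timeout expires. A rough userspace analogue of that wait-with-deadline pattern, using a condition variable (illustrative only; every name below is invented, build with -pthread):

	#include <errno.h>
	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <time.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
	static int victims = 1;		/* pretend one victim is still exiting */
	static bool disabled;

	/* Returns true only if all victims went away before the timeout. */
	static bool killer_disable(long timeout_ms)
	{
		struct timespec deadline;
		bool ok = true;

		clock_gettime(CLOCK_REALTIME, &deadline);
		deadline.tv_sec += timeout_ms / 1000;
		deadline.tv_nsec += (timeout_ms % 1000) * 1000000L;
		if (deadline.tv_nsec >= 1000000000L) {
			deadline.tv_sec++;
			deadline.tv_nsec -= 1000000000L;
		}

		pthread_mutex_lock(&lock);
		disabled = true;
		while (victims > 0 && ok) {
			if (pthread_cond_timedwait(&drained, &lock, &deadline) == ETIMEDOUT)
				ok = false;
		}
		if (!ok)
			disabled = false;	/* back off, like oom_killer_enable() */
		pthread_mutex_unlock(&lock);
		return ok;
	}

	int main(void)
	{
		printf("disable %s\n", killer_disable(100) ? "succeeded" : "timed out");
		return 0;
	}

In the patch itself the same roles are played by wait_event_interruptible_timeout() on oom_victims_wait and by oom_killer_enable() in the failure path.
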
Link: http://lkml.kernel.org/r/1472119394-11342-7-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 2 +- kernel/power/process.c | 17 +++-------------- mm/oom_kill.c | 40 ++++++++++++++++++++-------------------- 3 files changed, 24 insertions(+), 35 deletions(-) diff --git a/include/linux/oom.h b/include/linux/oom.h index b61357d07170..0f1b9da108e4 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -74,7 +74,7 @@ extern void exit_oom_victim(struct task_struct *tsk); extern int register_oom_notifier(struct notifier_block *nb); extern int unregister_oom_notifier(struct notifier_block *nb); -extern bool oom_killer_disable(void); +extern bool oom_killer_disable(signed long timeout); extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); diff --git a/kernel/power/process.c b/kernel/power/process.c index 8f27d5a8adf6..2fba066e125f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -144,23 +144,12 @@ int freeze_processes(void) /* * Now that the whole userspace is frozen we need to disbale * the OOM killer to disallow any further interference with - * killable tasks. + * killable tasks. There is no guarantee oom victims will + * ever reach a point they go away we have to wait with a timeout. */ - if (!error && !oom_killer_disable()) + if (!error && !oom_killer_disable(msecs_to_jiffies(freeze_timeout_msecs))) error = -EBUSY; - /* - * There is a hard to fix race between oom_reaper kernel thread - * and oom_killer_disable. oom_reaper calls exit_oom_victim - * before the victim reaches exit_mm so try to freeze all the tasks - * again and catch such a left over task. - */ - if (!error) { - pr_info("Double checking all user space processes after OOM killer disable... "); - error = try_to_freeze_tasks(true); - pr_cont("\n"); - } - if (error) thaw_processes(); return error; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e2a2c35dd493..895a51fe8e18 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -559,14 +559,7 @@ static void oom_reap_task(struct task_struct *tsk) debug_show_all_locks(); done: - /* - * Clear TIF_MEMDIE because the task shouldn't be sitting on a - * reasonably reclaimable memory anymore or it is not a good candidate - * for the oom victim right now because it cannot release its memory - * itself nor by the oom reaper. - */ tsk->oom_reaper_list = NULL; - exit_oom_victim(tsk); /* * Hide this mm from OOM killer because it has been either reaped or @@ -580,8 +573,6 @@ done: static int oom_reaper(void *unused) { - set_freezable(); - while (true) { struct task_struct *tsk = NULL; @@ -680,11 +671,21 @@ void exit_oom_victim(struct task_struct *tsk) wake_up_all(&oom_victims_wait); } +/** + * oom_killer_enable - enable OOM killer + */ +void oom_killer_enable(void) +{ + oom_killer_disabled = false; +} + /** * oom_killer_disable - disable OOM killer + * @timeout: maximum timeout to wait for oom victims in jiffies * * Forces all page allocations to fail rather than trigger OOM killer. - * Will block and wait until all OOM victims are killed. + * Will block and wait until all OOM victims are killed or the given + * timeout expires. * * The function cannot be called when there are runnable user tasks because * the userspace would see unexpected allocation failures as a result. 
Any @@ -693,8 +694,10 @@ void exit_oom_victim(struct task_struct *tsk) * Returns true if successful and false if the OOM killer cannot be * disabled. */ -bool oom_killer_disable(void) +bool oom_killer_disable(signed long timeout) { + signed long ret; + /* * Make sure to not race with an ongoing OOM killer. Check that the * current is not killed (possibly due to sharing the victim's memory). @@ -704,19 +707,16 @@ bool oom_killer_disable(void) oom_killer_disabled = true; mutex_unlock(&oom_lock); - wait_event(oom_victims_wait, !atomic_read(&oom_victims)); + ret = wait_event_interruptible_timeout(oom_victims_wait, + !atomic_read(&oom_victims), timeout); + if (ret <= 0) { + oom_killer_enable(); + return false; + } return true; } -/** - * oom_killer_enable - enable OOM killer - */ -void oom_killer_enable(void) -{ - oom_killer_disabled = false; -} - static inline bool __task_will_free_mem(struct task_struct *task) { struct signal_struct *sig = task->signal; From 38531201c12144cd7d96abfdfe7449c2b01375e8 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 7 Oct 2016 16:59:03 -0700 Subject: [PATCH 045/127] mm, oom: enforce exit_oom_victim on current task There are no users of exit_oom_victim on !current task anymore so enforce the API to always work on the current. Link: http://lkml.kernel.org/r/1472119394-11342-8-git-send-email-mhocko@kernel.org Signed-off-by: Tetsuo Handa Signed-off-by: Michal Hocko Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 2 +- kernel/exit.c | 2 +- mm/oom_kill.c | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/linux/oom.h b/include/linux/oom.h index 0f1b9da108e4..b4e36e92bc87 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -69,7 +69,7 @@ extern unsigned long oom_badness(struct task_struct *p, extern bool out_of_memory(struct oom_control *oc); -extern void exit_oom_victim(struct task_struct *tsk); +extern void exit_oom_victim(void); extern int register_oom_notifier(struct notifier_block *nb); extern int unregister_oom_notifier(struct notifier_block *nb); diff --git a/kernel/exit.c b/kernel/exit.c index 1e1d913914c0..9d68c45ebbe3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -511,7 +511,7 @@ static void exit_mm(struct task_struct *tsk) mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) - exit_oom_victim(tsk); + exit_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 895a51fe8e18..3b990544db6d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -662,10 +662,9 @@ static void mark_oom_victim(struct task_struct *tsk) /** * exit_oom_victim - note the exit of an OOM victim */ -void exit_oom_victim(struct task_struct *tsk) +void exit_oom_victim(void) { - if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) - return; + clear_thread_flag(TIF_MEMDIE); if (!atomic_dec_return(&oom_victims)) wake_up_all(&oom_victims_wait); From 3f70dc38cec2ad6e5355f80c4c7a15a3f7e97a19 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:59:06 -0700 Subject: [PATCH 046/127] mm: make sure that kthreads will not refault oom reaped memory There are only few use_mm() users in the kernel right now. Most of them write to the target memory but vhost driver relies on copy_from_user/get_user from a kernel thread context. 
This makes it impossible to reap the memory of an oom victim which shares the mm with the vhost kernel thread because it could see a zero page unexpectedly and theoretically make an incorrect decision visible outside of the killed task context. To quote Michael S. Tsirkin: : Getting an error from __get_user and friends is handled gracefully. : Getting zero instead of a real value will cause userspace : memory corruption. The vhost kernel thread is bound to an open fd of the vhost device which is not tight to the mm owner life cycle in general. The device fd can be inherited or passed over to another process which means that we really have to be careful about unexpected memory corruption because unlike for normal oom victims the result will be visible outside of the oom victim context. Make sure that no kthread context (users of use_mm) can ever see corrupted data because of the oom reaper and hook into the page fault path by checking MMF_UNSTABLE mm flag. __oom_reap_task_mm will set the flag before it starts unmapping the address space while the flag is checked after the page fault has been handled. If the flag is set then SIGBUS is triggered so any g-u-p user will get a error code. Regular tasks do not need this protection because all which share the mm are killed when the mm is reaped and so the corruption will not outlive them. This patch shouldn't have any visible effect at this moment because the OOM killer doesn't invoke oom reaper for tasks with mm shared with kthreads yet. Link: http://lkml.kernel.org/r/1472119394-11342-9-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: "Michael S. Tsirkin" Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + mm/memory.c | 13 +++++++++++++ mm/oom_kill.c | 8 ++++++++ 3 files changed, 22 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index af0721364788..6bee6f988912 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -525,6 +525,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ #define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ +#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) diff --git a/mm/memory.c b/mm/memory.c index f1a68049edff..4bfc3a9c3b18 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3658,6 +3658,19 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, mem_cgroup_oom_synchronize(false); } + /* + * This mm has been already reaped by the oom reaper and so the + * refault cannot be trusted in general. Anonymous refaults would + * lose data and give a zero page instead e.g. This is especially + * problem for use_mm() because regular tasks will just die and + * the corrupted data will not be visible anywhere while kthread + * will outlive the oom victim and potentially propagate the date + * further. 
+ */ + if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR) + && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags))) + ret = VM_FAULT_SIGBUS; + return ret; } EXPORT_SYMBOL_GPL(handle_mm_fault); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3b990544db6d..5a3ba96c8338 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -495,6 +495,14 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) goto unlock_oom; } + /* + * Tell all users of get_user/copy_from_user etc... that the content + * is no longer stable. No barriers really needed because unmapping + * should imply barriers already and the reader would hit a page fault + * if it stumbled over a reaped memory. + */ + set_bit(MMF_UNSTABLE, &mm->flags); + tlb_gather_mmu(&tlb, mm, 0, -1); for (vma = mm->mmap ; vma; vma = vma->vm_next) { if (is_vm_hugetlb_page(vma)) From 1b51e65eab64fac72cab009691e8ca9915624876 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:59:09 -0700 Subject: [PATCH 047/127] oom, oom_reaper: allow to reap mm shared by the kthreads oom reaper was skipped for an mm which is shared with the kernel thread (aka use_mm()). The primary concern was that such a kthread might want to read from the userspace memory and see zero page as a result of the oom reaper action. This is no longer a problem after "mm: make sure that kthreads will not refault oom reaped memory" because any attempt to fault in when the MMF_UNSTABLE is set will result in SIGBUS and so the target user should see an error. This means that we can finally allow oom reaper also to tasks which share their mm with kthreads. Link: http://lkml.kernel.org/r/1472119394-11342-10-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 5a3ba96c8338..10f686969fc4 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -902,13 +902,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) continue; if (same_thread_group(p, victim)) continue; - if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p)) { - /* - * We cannot use oom_reaper for the mm shared by this - * process because it wouldn't get killed and so the - * memory might be still used. Hide the mm from the oom - * killer to guarantee OOM forward progress. - */ + if (is_global_init(p)) { can_oom_reap = false; set_bit(MMF_OOM_SKIP, &mm->flags); pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n", @@ -916,6 +910,12 @@ static void oom_kill_process(struct oom_control *oc, const char *message) task_pid_nr(p), p->comm); continue; } + /* + * No use_mm() user needs to read from the userspace so we are + * ok to reap it. + */ + if (unlikely(p->flags & PF_KTHREAD)) + continue; do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); } rcu_read_unlock(); From c9634cf012321243ee8e4ea0fb0709904cd58395 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 7 Oct 2016 16:59:12 -0700 Subject: [PATCH 048/127] mm: use zonelist name instead of using hardcoded index Use the existing enums instead of hardcoded index when looking at the zonelist. This makes it more readable. No functionality change by this patch. 
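The readability gain is the usual one from replacing a magic array index with a named constant. A tiny standalone sketch (the enum names mirror the kernel's zonelist indices, but the program itself is only an illustration):

	#include <stdio.h>

	/* Mirrors the enum used to index pgdat->node_zonelists. */
	enum {
		ZONELIST_FALLBACK,	/* zonelist that may fall back to other nodes */
		ZONELIST_NOFALLBACK,	/* this node only */
		MAX_ZONELISTS
	};

	int main(void)
	{
		const char *node_zonelists[MAX_ZONELISTS] = {
			[ZONELIST_FALLBACK]   = "fallback zonelist",
			[ZONELIST_NOFALLBACK] = "no-fallback zonelist",
		};

		/* node_zonelists[ZONELIST_FALLBACK] says more than node_zonelists[0]. */
		printf("%s\n", node_zonelists[ZONELIST_FALLBACK]);
		return 0;
	}
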
Link: http://lkml.kernel.org/r/1472227078-24852-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- mm/page_alloc.c | 8 ++++---- mm/vmscan.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2da72a5b6ecc..ad1c96ac313c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1749,7 +1749,7 @@ unsigned int mempolicy_slab_node(void) */ struct zonelist *zonelist; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); - zonelist = &NODE_DATA(node)->node_zonelists[0]; + zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, highest_zoneidx, &policy->v.nodes); return z->zone ? z->zone->node : node; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b0f133f2c655..f6a5a221496a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4574,7 +4574,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) int j; struct zonelist *zonelist; - zonelist = &pgdat->node_zonelists[0]; + zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) ; j = build_zonelists_node(NODE_DATA(node), zonelist, j); @@ -4590,7 +4590,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) int j; struct zonelist *zonelist; - zonelist = &pgdat->node_zonelists[1]; + zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK]; j = build_zonelists_node(pgdat, zonelist, 0); zonelist->_zonerefs[j].zone = NULL; zonelist->_zonerefs[j].zone_idx = 0; @@ -4611,7 +4611,7 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) struct zone *z; struct zonelist *zonelist; - zonelist = &pgdat->node_zonelists[0]; + zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; pos = 0; for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { for (j = 0; j < nr_nodes; j++) { @@ -4746,7 +4746,7 @@ static void build_zonelists(pg_data_t *pgdat) local_node = pgdat->node_id; - zonelist = &pgdat->node_zonelists[0]; + zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; j = build_zonelists_node(pgdat, zonelist, 0); /* diff --git a/mm/vmscan.c b/mm/vmscan.c index d3715c1b2e36..744f926af442 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3036,7 +3036,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, */ nid = mem_cgroup_select_victim_node(memcg); - zonelist = NODE_DATA(nid)->node_zonelists; + zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; trace_mm_vmscan_memcg_reclaim_begin(0, sc.may_writepage, From f6f34b4387d9e18304451a131b35d7c4f27a0b5a Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 7 Oct 2016 16:59:15 -0700 Subject: [PATCH 049/127] mm: introduce arch_reserved_kernel_pages() Currently arch specific code can reserve memory blocks but alloc_large_system_hash() may not take it into consideration when sizing the hashes. This can lead to bigger hash than required and lead to no available memory for other purposes. This is specifically true for systems with CONFIG_DEFERRED_STRUCT_PAGE_INIT enabled. One approach to solve this problem would be to walk through the memblock regions and calculate the available memory and base the size of hash system on the available memory. The other approach would be to depend on the architecture to provide the number of pages that are reserved. This change provides hooks to allow the architecture to provide the required info. 
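The hook follows the familiar __HAVE_ARCH_* pattern: generic code compiles a stub returning 0 unless the architecture defines the macro and supplies its own arch_reserved_kernel_pages() (powerpc does so for fadump two patches later). A compressed, compilable sketch of that pattern outside the kernel build (all page counts below are made up):

	#include <stdio.h>

	/* Pretend this comes from the architecture's mmzone.h; drop the
	 * define to fall back to the generic stub. */
	#define __HAVE_ARCH_RESERVED_KERNEL_PAGES

	#ifdef __HAVE_ARCH_RESERVED_KERNEL_PAGES
	/* Arch override, e.g. powerpc returns memblock_reserved_size() / PAGE_SIZE. */
	static unsigned long arch_reserved_kernel_pages(void)
	{
		return 4096;	/* made-up number of reserved pages */
	}
	#else
	/* Generic default: nothing reserved that hash sizing needs to know about. */
	static unsigned long arch_reserved_kernel_pages(void)
	{
		return 0;
	}
	#endif

	int main(void)
	{
		unsigned long nr_kernel_pages = 1UL << 20;	/* made-up total */

		/* alloc_large_system_hash() sizes hashes from this value. */
		nr_kernel_pages -= arch_reserved_kernel_pages();
		printf("pages available for hash sizing: %lu\n", nr_kernel_pages);
		return 0;
	}
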
Link: http://lkml.kernel.org/r/1472476010-4709-2-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Srikar Dronamraju Suggested-by: Mel Gorman Cc: Vlastimil Babka Cc: Michal Hocko Cc: Michael Ellerman Cc: Mahesh Salgaonkar Cc: Hari Bathini Cc: Dave Hansen Cc: Balbir Singh Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +++ mm/page_alloc.c | 12 ++++++++++++ 2 files changed, 15 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 0a063b4e4456..046077b4209d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1924,6 +1924,9 @@ extern void show_mem(unsigned int flags); extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); +#ifdef __HAVE_ARCH_RESERVED_KERNEL_PAGES +extern unsigned long arch_reserved_kernel_pages(void); +#endif extern __printf(3, 4) void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f6a5a221496a..e00f545c2398 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6940,6 +6940,17 @@ static int __init set_hashdist(char *str) __setup("hashdist=", set_hashdist); #endif +#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES +/* + * Returns the number of pages that arch has reserved but + * is not known to alloc_large_system_hash(). + */ +static unsigned long __init arch_reserved_kernel_pages(void) +{ + return 0; +} +#endif + /* * allocate a large system hash table from bootmem * - it is assumed that the hash table must contain an exact power-of-2 @@ -6964,6 +6975,7 @@ void *__init alloc_large_system_hash(const char *tablename, if (!numentries) { /* round applicable memory size up to nearest megabyte */ numentries = nr_kernel_pages; + numentries -= arch_reserved_kernel_pages(); /* It isn't necessary when PAGE_SIZE >= 1MB */ if (PAGE_SHIFT < 20) From 8907de5dc6e9d5925cf3b0a698cc3a4272fda073 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 7 Oct 2016 16:59:18 -0700 Subject: [PATCH 050/127] mm/memblock.c: expose total reserved memory The total reserved memory in a system is accounted but not available for use use outside mm/memblock.c. By exposing the total reserved memory, systems can better calculate the size of large hashes. 
Link: http://lkml.kernel.org/r/1472476010-4709-3-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Srikar Dronamraju Suggested-by: Mel Gorman Cc: Vlastimil Babka Cc: Michal Hocko Cc: Michael Ellerman Cc: Mahesh Salgaonkar Cc: Hari Bathini Cc: Dave Hansen Cc: Balbir Singh Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 1 + mm/memblock.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 2925da23505d..5b759c9acf97 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -328,6 +328,7 @@ phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr); phys_addr_t memblock_phys_mem_size(void); +phys_addr_t memblock_reserved_size(void); phys_addr_t memblock_mem_size(unsigned long limit_pfn); phys_addr_t memblock_start_of_DRAM(void); phys_addr_t memblock_end_of_DRAM(void); diff --git a/mm/memblock.c b/mm/memblock.c index 483197ef613f..c8dfa430342b 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1438,6 +1438,11 @@ phys_addr_t __init_memblock memblock_phys_mem_size(void) return memblock.memory.total_size; } +phys_addr_t __init_memblock memblock_reserved_size(void) +{ + return memblock.reserved.total_size; +} + phys_addr_t __init memblock_mem_size(unsigned long limit_pfn) { unsigned long pages = 0; From 1e76609cc1646c5222feefe86a3433a79be4fb73 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 7 Oct 2016 16:59:21 -0700 Subject: [PATCH 051/127] powerpc: implement arch_reserved_kernel_pages Currently significant amount of memory is reserved only in kernel booted to capture kernel dump using the fa_dump method. Kernels compiled with CONFIG_DEFERRED_STRUCT_PAGE_INIT will initialize only certain size memory per node. The certain size takes into account the dentry and inode cache sizes. Currently the cache sizes are calculated based on the total system memory including the reserved memory. However such a kernel when booting the same kernel as fadump kernel will not be able to allocate the required amount of memory to suffice for the dentry and inode caches. This results in crashes like Hence only implement arch_reserved_kernel_pages() for CONFIG_FA_DUMP configurations. The amount reserved will be reduced while calculating the large caches and will avoid crashes like the below on large systems such as 32 TB systems. 
Dentry cache hash table entries: 536870912 (order: 16, 4294967296 bytes) vmalloc: allocation failure, allocated 4097114112 of 17179934720 bytes swapper/0: page allocation failure: order:0, mode:0x2080020(GFP_ATOMIC) CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.6-master+ #3 Call Trace: dump_stack+0xb0/0xf0 (unreliable) warn_alloc_failed+0x114/0x160 __vmalloc_node_range+0x304/0x340 __vmalloc+0x6c/0x90 alloc_large_system_hash+0x1b8/0x2c0 inode_init+0x94/0xe4 vfs_caches_init+0x8c/0x13c start_kernel+0x50c/0x578 start_here_common+0x20/0xa8 Link: http://lkml.kernel.org/r/1472476010-4709-4-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Srikar Dronamraju Suggested-by: Mel Gorman Cc: Vlastimil Babka Cc: Michal Hocko Cc: Michael Ellerman Cc: Mahesh Salgaonkar Cc: Hari Bathini Cc: Dave Hansen Cc: Balbir Singh Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/mmzone.h | 3 +++ arch/powerpc/kernel/fadump.c | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/arch/powerpc/include/asm/mmzone.h b/arch/powerpc/include/asm/mmzone.h index 7b589178be46..4d52ccfc2366 100644 --- a/arch/powerpc/include/asm/mmzone.h +++ b/arch/powerpc/include/asm/mmzone.h @@ -41,6 +41,9 @@ u64 memory_hotplug_max(void); #else #define memory_hotplug_max() memblock_end_of_DRAM() #endif /* CONFIG_NEED_MULTIPLE_NODES */ +#ifdef CONFIG_FA_DUMP +#define __HAVE_ARCH_RESERVED_KERNEL_PAGES +#endif #endif /* __KERNEL__ */ #endif /* _ASM_MMZONE_H_ */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index b3a663333d36..eeb80de1f982 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -333,6 +333,11 @@ int __init fadump_reserve_mem(void) return 1; } +unsigned long __init arch_reserved_kernel_pages(void) +{ + return memblock_reserved_size() / PAGE_SIZE; +} + /* Look for fadump= cmdline option. */ static int __init early_fadump_param(char *p) { From 2382705f22c1436a153800cf6051b08f0ea14838 Mon Sep 17 00:00:00 2001 From: zijun_hu Date: Fri, 7 Oct 2016 16:59:24 -0700 Subject: [PATCH 052/127] mm/nobootmem.c: remove duplicate macro ARCH_LOW_ADDRESS_LIMIT statements Fix the following bugs: - the same ARCH_LOW_ADDRESS_LIMIT statements are duplicated between header and relevant source - don't ensure ARCH_LOW_ADDRESS_LIMIT perhaps defined by ARCH in asm/processor.h is preferred over default in linux/bootmem.h completely since the former header isn't included by the latter Link: http://lkml.kernel.org/r/e046aeaa-e160-6d9e-dc1b-e084c2fd999f@zoho.com Signed-off-by: zijun_hu Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 9 +++++---- mm/nobootmem.c | 10 +++++----- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index f9be32691718..962164d36506 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -7,6 +7,7 @@ #include #include #include +#include /* * simple boot-time physical memory area allocator. 
@@ -119,6 +120,10 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, #define BOOTMEM_LOW_LIMIT __pa(MAX_DMA_ADDRESS) #endif +#ifndef ARCH_LOW_ADDRESS_LIMIT +#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL +#endif + #define alloc_bootmem(x) \ __alloc_bootmem(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT) #define alloc_bootmem_align(x, align) \ @@ -180,10 +185,6 @@ static inline void * __init memblock_virt_alloc_nopanic( NUMA_NO_NODE); } -#ifndef ARCH_LOW_ADDRESS_LIMIT -#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL -#endif - static inline void * __init memblock_virt_alloc_low( phys_addr_t size, phys_addr_t align) { diff --git a/mm/nobootmem.c b/mm/nobootmem.c index bd05a70f44b9..490d46abddad 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -11,18 +11,21 @@ #include #include #include -#include #include #include #include #include +#include #include #include -#include #include "internal.h" +#ifndef CONFIG_HAVE_MEMBLOCK +#error CONFIG_HAVE_MEMBLOCK not defined +#endif + #ifndef CONFIG_NEED_MULTIPLE_NODES struct pglist_data __refdata contig_page_data; EXPORT_SYMBOL(contig_page_data); @@ -395,9 +398,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, return __alloc_bootmem_node(pgdat, size, align, goal); } -#ifndef ARCH_LOW_ADDRESS_LIMIT -#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL -#endif /** * __alloc_bootmem_low - allocate low boot memory From 1d8bf926f8739bd35d054097907fef35d881e403 Mon Sep 17 00:00:00 2001 From: zijun_hu Date: Fri, 7 Oct 2016 16:59:27 -0700 Subject: [PATCH 053/127] mm/bootmem.c: replace kzalloc() by kzalloc_node() In ___alloc_bootmem_node_nopanic(), replace kzalloc() by kzalloc_node() in order to allocate memory within given node preferentially when slab is available Link: http://lkml.kernel.org/r/1f487f12-6af4-5e4f-a28c-1de2361cdcd8@zoho.com Signed-off-by: zijun_hu Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/mm/bootmem.c b/mm/bootmem.c index 0aa7dda52402..a869f84f44d3 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -11,15 +11,12 @@ #include #include #include -#include #include #include #include -#include #include #include - -#include +#include #include "internal.h" @@ -712,7 +709,7 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, void *ptr; if (WARN_ON_ONCE(slab_is_available())) - return kzalloc(size, GFP_NOWAIT); + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); again: /* do not panic in alloc_bootmem_bdata() */ @@ -738,9 +735,6 @@ again: void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); } @@ -812,10 +806,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, } -#ifndef ARCH_LOW_ADDRESS_LIMIT -#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL -#endif - /** * __alloc_bootmem_low - allocate low boot memory * @size: size of the request in bytes From 371a096edf43a8c71844cf71c20765c8b21d07d9 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 7 Oct 2016 16:59:30 -0700 Subject: [PATCH 054/127] mm: don't use radix tree writeback tags for pages in swap cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit File pages use a set of radix tree tags (DIRTY, TOWRITE, WRITEBACK, etc.) 
to accelerate finding the pages with a specific tag in the radix tree during inode writeback. But for anonymous pages in the swap cache, there is no inode writeback. So there is no need to find the pages with some writeback tags in the radix tree. It is not necessary to touch radix tree writeback tags for pages in the swap cache. Per Rik van Riel's suggestion, a new flag AS_NO_WRITEBACK_TAGS is introduced for address spaces which don't need to update the writeback tags. The flag is set for swap caches. It may be used for DAX file systems, etc. With this patch, the swap out bandwidth improved 22.3% (from ~1.2GB/s to ~1.48GBps) in the vm-scalability swap-w-seq test case with 8 processes. The test is done on a Xeon E5 v3 system. The swap device used is a RAM simulated PMEM (persistent memory) device. The improvement comes from the reduced contention on the swap cache radix tree lock. To test sequential swapping out, the test case uses 8 processes, which sequentially allocate and write to the anonymous pages until RAM and part of the swap device is used up. Details of comparison is as follow, base base+patch ---------------- -------------------------- %stddev %change %stddev \ | \ 2506952 ± 2% +28.1% 3212076 ± 7% vm-scalability.throughput 1207402 ± 7% +22.3% 1476578 ± 6% vmstat.swap.so 10.86 ± 12% -23.4% 8.31 ± 16% perf-profile.cycles-pp._raw_spin_lock_irq.__add_to_swap_cache.add_to_swap_cache.add_to_swap.shrink_page_list 10.82 ± 13% -33.1% 7.24 ± 14% perf-profile.cycles-pp._raw_spin_lock_irqsave.__remove_mapping.shrink_page_list.shrink_inactive_list.shrink_zone_memcg 10.36 ± 11% -100.0% 0.00 ± -1% perf-profile.cycles-pp._raw_spin_lock_irqsave.__test_set_page_writeback.bdev_write_page.__swap_writepage.swap_writepage 10.52 ± 12% -100.0% 0.00 ± -1% perf-profile.cycles-pp._raw_spin_lock_irqsave.test_clear_page_writeback.end_page_writeback.page_endio.pmem_rw_page Link: http://lkml.kernel.org/r/1472578089-5560-1-git-send-email-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Rik van Riel Cc: Hugh Dickins Cc: Shaohua Li Cc: Minchan Kim Cc: Mel Gorman Cc: Tejun Heo Cc: Wu Fengguang Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 12 ++++++++++++ mm/page-writeback.c | 4 ++-- mm/swap_state.c | 2 ++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 01e84436cddf..48d9cf04337c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -25,6 +25,8 @@ enum mapping_flags { AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */ AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */ AS_EXITING = __GFP_BITS_SHIFT + 4, /* final truncate in progress */ + /* writeback related tags are not used */ + AS_NO_WRITEBACK_TAGS = __GFP_BITS_SHIFT + 5, }; static inline void mapping_set_error(struct address_space *mapping, int error) @@ -64,6 +66,16 @@ static inline int mapping_exiting(struct address_space *mapping) return test_bit(AS_EXITING, &mapping->flags); } +static inline void mapping_set_no_writeback_tags(struct address_space *mapping) +{ + set_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); +} + +static inline int mapping_use_writeback_tags(struct address_space *mapping) +{ + return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); +} + static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { return (__force gfp_t)mapping->flags & __GFP_BITS_MASK; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 
5ed3381818ec..439cc63ad903 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2716,7 +2716,7 @@ int test_clear_page_writeback(struct page *page) int ret; lock_page_memcg(page); - if (mapping) { + if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; @@ -2759,7 +2759,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) int ret; lock_page_memcg(page); - if (mapping) { + if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; diff --git a/mm/swap_state.c b/mm/swap_state.c index c8310a37be3a..268b8191982b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -37,6 +37,8 @@ struct address_space swapper_spaces[MAX_SWAPFILES] = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), .i_mmap_writable = ATOMIC_INIT(0), .a_ops = &swap_aops, + /* swap cache doesn't use writeback related tags */ + .flags = 1 << AS_NO_WRITEBACK_TAGS, } }; From 9254990fb9f0f15f25605748da20cfbeced7c816 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:59:33 -0700 Subject: [PATCH 055/127] oom: warn if we go OOM for higher order and compaction is disabled Since the lumpy reclaim is gone there is no source of higher order pages if CONFIG_COMPACTION=n except for the order-0 pages reclaim which is unreliable for that purpose to say the least. Hitting an OOM for !costly higher order requests is therefore all not that hard to imagine. We are trying hard to not invoke OOM killer as much as possible but there is simply no reliable way to detect whether more reclaim retries make sense. Disabling COMPACTION is not widespread but it seems that some users might have disable the feature without realizing full consequences (mostly along with disabling THP because compaction used to be THP mainly thing). This patch just adds a note if the OOM killer was triggered by higher order request with compaction disabled. This will help us identifying possible misconfiguration right from the oom report which is easier than to always keep in mind that somebody might have disabled COMPACTION without a good reason. Link: http://lkml.kernel.org/r/20160830111632.GD23963@dhcp22.suse.cz Signed-off-by: Michal Hocko Cc: David Rientjes Cc: Johannes Weiner Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 10f686969fc4..0034baf35f0c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -406,6 +406,8 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, current->signal->oom_score_adj); + if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) + pr_warn("COMPACTION is disabled!!!\n"); cpuset_print_current_mems_allowed(); dump_stack(); From 0cf2f6f6dc605e587d2c1120f295934c77e810e8 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Fri, 7 Oct 2016 16:59:36 -0700 Subject: [PATCH 056/127] mm: mlock: check against vma for actual mlock() size In do_mlock(), the check against locked memory limitation has a hole which will fail following cases at step 3): 1) User has a memory chunk from addressA with 50k, and user mem lock rlimit is 64k. 
2) mlock(addressA, 30k) 3) mlock(addressA, 40k) The 3rd step should have been allowed since the 40k request is intersected with the previous 30k at step 2), and the 3rd step is actually for mlock on the extra 10k memory. This patch checks vma to caculate the actual "new" mlock size, if necessary, and ajust the logic to fix this issue. [akpm@linux-foundation.org: clean up comment layout] [wei.guo.simon@gmail.com: correct a typo in count_mm_mlocked_page_nr()] Link: http://lkml.kernel.org/r/1473325970-11393-2-git-send-email-wei.guo.simon@gmail.com Link: http://lkml.kernel.org/r/1472554781-9835-2-git-send-email-wei.guo.simon@gmail.com Signed-off-by: Simon Guo Cc: Alexey Klimov Cc: Eric B Munson Cc: Geert Uytterhoeven Cc: "Kirill A. Shutemov" Cc: Mel Gorman Cc: Michal Hocko Cc: Shuah Khan Cc: Simon Guo Cc: Thierry Reding Cc: Vlastimil Babka Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/mm/mlock.c b/mm/mlock.c index 14645be06e30..b1fec89bd1c5 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -617,6 +617,45 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, return error; } +/* + * Go through vma areas and sum size of mlocked + * vma pages, as return value. + * Note deferred memory locking case(mlock2(,,MLOCK_ONFAULT) + * is also counted. + * Return value: previously mlocked page counts + */ +static int count_mm_mlocked_page_nr(struct mm_struct *mm, + unsigned long start, size_t len) +{ + struct vm_area_struct *vma; + int count = 0; + + if (mm == NULL) + mm = current->mm; + + vma = find_vma(mm, start); + if (vma == NULL) + vma = mm->mmap; + + for (; vma ; vma = vma->vm_next) { + if (start >= vma->vm_end) + continue; + if (start + len <= vma->vm_start) + break; + if (vma->vm_flags & VM_LOCKED) { + if (start > vma->vm_start) + count -= (start - vma->vm_start); + if (start + len < vma->vm_end) { + count += start + len - vma->vm_start; + break; + } + count += vma->vm_end - vma->vm_start; + } + } + + return count >> PAGE_SHIFT; +} + static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags) { unsigned long locked; @@ -639,6 +678,16 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla return -EINTR; locked += current->mm->locked_vm; + if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) { + /* + * It is possible that the regions requested intersect with + * previously mlocked areas, that part area in "mm->locked_vm" + * should not be counted to new mlock increment count. So check + * and adjust locked count if necessary. + */ + locked -= count_mm_mlocked_page_nr(current->mm, + start, len); + } /* check against resource limits */ if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) From b155b4fde5bdde9fed439cd1f5ea07173df2ed31 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Fri, 7 Oct 2016 16:59:40 -0700 Subject: [PATCH 057/127] mm: mlock: avoid increase mm->locked_vm on mlock() when already mlock2(,MLOCK_ONFAULT) When one vma was with flag VM_LOCKED|VM_LOCKONFAULT (by invoking mlock2(,MLOCK_ONFAULT)), it can again be populated with mlock() with VM_LOCKED flag only. There is a hole in mlock_fixup() which increase mm->locked_vm twice even the two operations are on the same vma and both with VM_LOCKED flags. 
The issue can be reproduced by following code: mlock2(p, 1024 * 64, MLOCK_ONFAULT); //VM_LOCKED|VM_LOCKONFAULT mlock(p, 1024 * 64); //VM_LOCKED Then check the increase VmLck field in /proc/pid/status(to 128k). When vma is set with different vm_flags, and the new vm_flags is with VM_LOCKED, it is not necessarily be a "new locked" vma. This patch corrects this bug by prevent mm->locked_vm from increment when old vm_flags is already VM_LOCKED. Link: http://lkml.kernel.org/r/1472554781-9835-3-git-send-email-wei.guo.simon@gmail.com Signed-off-by: Simon Guo Acked-by: Kirill A. Shutemov Cc: Alexey Klimov Cc: Eric B Munson Cc: Geert Uytterhoeven Cc: Mel Gorman Cc: Michal Hocko Cc: Shuah Khan Cc: Simon Guo Cc: Thierry Reding Cc: Vlastimil Babka Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/mlock.c b/mm/mlock.c index b1fec89bd1c5..145a4258ddbc 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -516,6 +516,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, int nr_pages; int ret = 0; int lock = !!(newflags & VM_LOCKED); + vm_flags_t old_flags = vma->vm_flags; if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) @@ -550,6 +551,8 @@ success: nr_pages = (end - start) >> PAGE_SHIFT; if (!lock) nr_pages = -nr_pages; + else if (old_flags & VM_LOCKED) + nr_pages = 0; mm->locked_vm += nr_pages; /* From c7f032bbe4cacb57e49d5c48bf06de8dc28a449f Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Fri, 7 Oct 2016 16:59:43 -0700 Subject: [PATCH 058/127] selftest: split mlock2_ funcs into separate mlock2.h To prepare mlock2.h whose functionality will be reused. Link: http://lkml.kernel.org/r/1472554781-9835-4-git-send-email-wei.guo.simon@gmail.com Signed-off-by: Simon Guo Cc: Alexey Klimov Cc: Eric B Munson Cc: Geert Uytterhoeven Cc: "Kirill A. 
Shutemov" Cc: Mel Gorman Cc: Michal Hocko Cc: Shuah Khan Cc: Simon Guo Cc: Thierry Reding Cc: Vlastimil Babka Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/mlock2-tests.c | 21 +-------------------- tools/testing/selftests/vm/mlock2.h | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 20 deletions(-) create mode 100644 tools/testing/selftests/vm/mlock2.h diff --git a/tools/testing/selftests/vm/mlock2-tests.c b/tools/testing/selftests/vm/mlock2-tests.c index 02ca5e0177c5..7cb13cede483 100644 --- a/tools/testing/selftests/vm/mlock2-tests.c +++ b/tools/testing/selftests/vm/mlock2-tests.c @@ -7,27 +7,8 @@ #include #include #include -#include -#include #include - -#ifndef MLOCK_ONFAULT -#define MLOCK_ONFAULT 1 -#endif - -#ifndef MCL_ONFAULT -#define MCL_ONFAULT (MCL_FUTURE << 1) -#endif - -static int mlock2_(void *start, size_t len, int flags) -{ -#ifdef __NR_mlock2 - return syscall(__NR_mlock2, start, len, flags); -#else - errno = ENOSYS; - return -1; -#endif -} +#include "mlock2.h" struct vm_boundaries { unsigned long start; diff --git a/tools/testing/selftests/vm/mlock2.h b/tools/testing/selftests/vm/mlock2.h new file mode 100644 index 000000000000..b9c6d9fe372f --- /dev/null +++ b/tools/testing/selftests/vm/mlock2.h @@ -0,0 +1,20 @@ +#include +#include + +#ifndef MLOCK_ONFAULT +#define MLOCK_ONFAULT 1 +#endif + +#ifndef MCL_ONFAULT +#define MCL_ONFAULT (MCL_FUTURE << 1) +#endif + +static int mlock2_(void *start, size_t len, int flags) +{ +#ifdef __NR_mlock2 + return syscall(__NR_mlock2, start, len, flags); +#else + errno = ENOSYS; + return -1; +#endif +} From 1448d4d8931c53a9b965a6883055a4c6150f859a Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Fri, 7 Oct 2016 16:59:46 -0700 Subject: [PATCH 059/127] selftests/vm: add test for mlock() when areas are intersected This patch adds mlock() test for multiple invocation on the same address area, and verify it doesn't mess the rlimit mlock limitation. Link: http://lkml.kernel.org/r/1472554781-9835-5-git-send-email-wei.guo.simon@gmail.com Signed-off-by: Simon Guo Cc: Alexey Klimov Cc: Eric B Munson Cc: Geert Uytterhoeven Cc: "Kirill A. 
Shutemov" Cc: Mel Gorman Cc: Michal Hocko Cc: Shuah Khan Cc: Simon Guo Cc: Thierry Reding Cc: Vlastimil Babka Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 4 + .../selftests/vm/mlock-intersect-test.c | 76 +++++++++++++++++++ 3 files changed, 81 insertions(+) create mode 100644 tools/testing/selftests/vm/mlock-intersect-test.c diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index a937a9d26b60..142c565bb351 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -7,3 +7,4 @@ mlock2-tests on-fault-limit transhuge-stress userfaultfd +mlock-intersect-test diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index e4bb1de1d526..a0412a80b679 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -10,6 +10,7 @@ BINARIES += on-fault-limit BINARIES += thuge-gen BINARIES += transhuge-stress BINARIES += userfaultfd +BINARIES += mlock-intersect-test all: $(BINARIES) %: %.c @@ -17,6 +18,9 @@ all: $(BINARIES) userfaultfd: userfaultfd.c ../../../../usr/include/linux/kernel.h $(CC) $(CFLAGS) -O2 -o $@ $< -lpthread +mlock-intersect-test: mlock-intersect-test.c + $(CC) $(CFLAGS) -o $@ $< -lcap + ../../../../usr/include/linux/kernel.h: make -C ../../../.. headers_install diff --git a/tools/testing/selftests/vm/mlock-intersect-test.c b/tools/testing/selftests/vm/mlock-intersect-test.c new file mode 100644 index 000000000000..f78e68a0967c --- /dev/null +++ b/tools/testing/selftests/vm/mlock-intersect-test.c @@ -0,0 +1,76 @@ +/* + * It tests the duplicate mlock result: + * - the ulimit of lock page is 64k + * - allocate address area 64k starting from p + * - mlock [p -- p + 30k] + * - Then mlock address [ p -- p + 40k ] + * + * It should succeed since totally we locked + * 40k < 64k limitation. + * + * It should not be run with CAP_IPC_LOCK. 
+ */ +#include +#include +#include +#include +#include +#include +#include "mlock2.h" + +int main(int argc, char **argv) +{ + struct rlimit new; + char *p = NULL; + cap_t cap = cap_init(); + int i; + + /* drop capabilities including CAP_IPC_LOCK */ + if (cap_set_proc(cap)) + return -1; + + /* set mlock limits to 64k */ + new.rlim_cur = 65536; + new.rlim_max = 65536; + setrlimit(RLIMIT_MEMLOCK, &new); + + /* test VM_LOCK */ + p = malloc(1024 * 64); + if (mlock(p, 1024 * 30)) { + printf("mlock() 30k return failure.\n"); + return -1; + } + for (i = 0; i < 10; i++) { + if (mlock(p, 1024 * 40)) { + printf("mlock() #%d 40k returns failure.\n", i); + return -1; + } + } + for (i = 0; i < 10; i++) { + if (mlock2_(p, 1024 * 40, MLOCK_ONFAULT)) { + printf("mlock2_() #%d 40k returns failure.\n", i); + return -1; + } + } + free(p); + + /* Test VM_LOCKONFAULT */ + p = malloc(1024 * 64); + if (mlock2_(p, 1024 * 30, MLOCK_ONFAULT)) { + printf("mlock2_() 30k return failure.\n"); + return -1; + } + for (i = 0; i < 10; i++) { + if (mlock2_(p, 1024 * 40, MLOCK_ONFAULT)) { + printf("mlock2_() #%d 40k returns failure.\n", i); + return -1; + } + } + for (i = 0; i < 10; i++) { + if (mlock(p, 1024 * 40)) { + printf("mlock() #%d 40k returns failure.\n", i); + return -1; + } + } + return 0; +} From d5aed9c06712520a6e919dc5c0525e39d9795124 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Fri, 7 Oct 2016 16:59:49 -0700 Subject: [PATCH 060/127] selftest: move seek_to_smaps_entry() out of mlock2-tests.c Function seek_to_smaps_entry() can be useful for other selftest functionalities, so move it out to header file. Link: http://lkml.kernel.org/r/1473325970-11393-3-git-send-email-wei.guo.simon@gmail.com Signed-off-by: Simon Guo Cc: Shuah Khan Cc: Vlastimil Babka Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Eric B Munson Cc: Simon Guo Cc: Mel Gorman Cc: Alexey Klimov Cc: Andrea Arcangeli Cc: Thierry Reding Cc: Mike Kravetz Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/mlock2-tests.c | 42 ----------------------- tools/testing/selftests/vm/mlock2.h | 42 +++++++++++++++++++++++ 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/tools/testing/selftests/vm/mlock2-tests.c b/tools/testing/selftests/vm/mlock2-tests.c index 7cb13cede483..ff0cda2b19c9 100644 --- a/tools/testing/selftests/vm/mlock2-tests.c +++ b/tools/testing/selftests/vm/mlock2-tests.c @@ -1,8 +1,6 @@ #define _GNU_SOURCE #include #include -#include -#include #include #include #include @@ -119,46 +117,6 @@ static uint64_t get_kpageflags(unsigned long pfn) return flags; } -static FILE *seek_to_smaps_entry(unsigned long addr) -{ - FILE *file; - char *line = NULL; - size_t size = 0; - unsigned long start, end; - char perms[5]; - unsigned long offset; - char dev[32]; - unsigned long inode; - char path[BUFSIZ]; - - file = fopen("/proc/self/smaps", "r"); - if (!file) { - perror("fopen smaps"); - _exit(1); - } - - while (getline(&line, &size, file) > 0) { - if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n", - &start, &end, perms, &offset, dev, &inode, path) < 6) - goto next; - - if (start <= addr && addr < end) - goto out; - -next: - free(line); - line = NULL; - size = 0; - } - - fclose(file); - file = NULL; - -out: - free(line); - return file; -} - #define VMFLAGS "VmFlags:" static bool is_vmflag_set(unsigned long addr, const char *vmflag) diff --git a/tools/testing/selftests/vm/mlock2.h b/tools/testing/selftests/vm/mlock2.h index b9c6d9fe372f..7ee062929d3e 100644 --- 
a/tools/testing/selftests/vm/mlock2.h +++ b/tools/testing/selftests/vm/mlock2.h @@ -1,5 +1,7 @@ #include #include +#include +#include #ifndef MLOCK_ONFAULT #define MLOCK_ONFAULT 1 @@ -18,3 +20,43 @@ static int mlock2_(void *start, size_t len, int flags) return -1; #endif } + +static FILE *seek_to_smaps_entry(unsigned long addr) +{ + FILE *file; + char *line = NULL; + size_t size = 0; + unsigned long start, end; + char perms[5]; + unsigned long offset; + char dev[32]; + unsigned long inode; + char path[BUFSIZ]; + + file = fopen("/proc/self/smaps", "r"); + if (!file) { + perror("fopen smaps"); + _exit(1); + } + + while (getline(&line, &size, file) > 0) { + if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n", + &start, &end, perms, &offset, dev, &inode, path) < 6) + goto next; + + if (start <= addr && addr < end) + goto out; + +next: + free(line); + line = NULL; + size = 0; + } + + fclose(file); + file = NULL; + +out: + free(line); + return file; +} From 26b4224d99615a19c002508c6e80bd3d1d783b64 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Fri, 7 Oct 2016 16:59:52 -0700 Subject: [PATCH 061/127] selftests: expanding more mlock selftest This patch will randomly perform mlock/mlock2 on a given memory region, and verify the RLIMIT_MEMLOCK limitation works properly. Suggested-by: David Rientjes Link: http://lkml.kernel.org/r/1473325970-11393-4-git-send-email-wei.guo.simon@gmail.com Signed-off-by: Simon Guo Cc: Shuah Khan Cc: Vlastimil Babka Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Eric B Munson Cc: Simon Guo Cc: Mel Gorman Cc: Alexey Klimov Cc: Andrea Arcangeli Cc: Thierry Reding Cc: Mike Kravetz Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/Makefile | 4 +- .../selftests/vm/mlock-intersect-test.c | 76 ----- .../testing/selftests/vm/mlock-random-test.c | 293 ++++++++++++++++++ 3 files changed, 295 insertions(+), 78 deletions(-) delete mode 100644 tools/testing/selftests/vm/mlock-intersect-test.c create mode 100644 tools/testing/selftests/vm/mlock-random-test.c diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index a0412a80b679..bbab7f4664ac 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -10,7 +10,7 @@ BINARIES += on-fault-limit BINARIES += thuge-gen BINARIES += transhuge-stress BINARIES += userfaultfd -BINARIES += mlock-intersect-test +BINARIES += mlock-random-test all: $(BINARIES) %: %.c @@ -18,7 +18,7 @@ all: $(BINARIES) userfaultfd: userfaultfd.c ../../../../usr/include/linux/kernel.h $(CC) $(CFLAGS) -O2 -o $@ $< -lpthread -mlock-intersect-test: mlock-intersect-test.c +mlock-random-test: mlock-random-test.c $(CC) $(CFLAGS) -o $@ $< -lcap ../../../../usr/include/linux/kernel.h: diff --git a/tools/testing/selftests/vm/mlock-intersect-test.c b/tools/testing/selftests/vm/mlock-intersect-test.c deleted file mode 100644 index f78e68a0967c..000000000000 --- a/tools/testing/selftests/vm/mlock-intersect-test.c +++ /dev/null @@ -1,76 +0,0 @@ -/* - * It tests the duplicate mlock result: - * - the ulimit of lock page is 64k - * - allocate address area 64k starting from p - * - mlock [p -- p + 30k] - * - Then mlock address [ p -- p + 40k ] - * - * It should succeed since totally we locked - * 40k < 64k limitation. - * - * It should not be run with CAP_IPC_LOCK. 
- */ -#include -#include -#include -#include -#include -#include -#include "mlock2.h" - -int main(int argc, char **argv) -{ - struct rlimit new; - char *p = NULL; - cap_t cap = cap_init(); - int i; - - /* drop capabilities including CAP_IPC_LOCK */ - if (cap_set_proc(cap)) - return -1; - - /* set mlock limits to 64k */ - new.rlim_cur = 65536; - new.rlim_max = 65536; - setrlimit(RLIMIT_MEMLOCK, &new); - - /* test VM_LOCK */ - p = malloc(1024 * 64); - if (mlock(p, 1024 * 30)) { - printf("mlock() 30k return failure.\n"); - return -1; - } - for (i = 0; i < 10; i++) { - if (mlock(p, 1024 * 40)) { - printf("mlock() #%d 40k returns failure.\n", i); - return -1; - } - } - for (i = 0; i < 10; i++) { - if (mlock2_(p, 1024 * 40, MLOCK_ONFAULT)) { - printf("mlock2_() #%d 40k returns failure.\n", i); - return -1; - } - } - free(p); - - /* Test VM_LOCKONFAULT */ - p = malloc(1024 * 64); - if (mlock2_(p, 1024 * 30, MLOCK_ONFAULT)) { - printf("mlock2_() 30k return failure.\n"); - return -1; - } - for (i = 0; i < 10; i++) { - if (mlock2_(p, 1024 * 40, MLOCK_ONFAULT)) { - printf("mlock2_() #%d 40k returns failure.\n", i); - return -1; - } - } - for (i = 0; i < 10; i++) { - if (mlock(p, 1024 * 40)) { - printf("mlock() #%d 40k returns failure.\n", i); - return -1; - } - } - return 0; -} diff --git a/tools/testing/selftests/vm/mlock-random-test.c b/tools/testing/selftests/vm/mlock-random-test.c new file mode 100644 index 000000000000..83de4f58d262 --- /dev/null +++ b/tools/testing/selftests/vm/mlock-random-test.c @@ -0,0 +1,293 @@ +/* + * It tests the mlock/mlock2() when they are invoked + * on randomly memory region. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mlock2.h" + +#define CHUNK_UNIT (128 * 1024) +#define MLOCK_RLIMIT_SIZE (CHUNK_UNIT * 2) +#define MLOCK_WITHIN_LIMIT_SIZE CHUNK_UNIT +#define MLOCK_OUTOF_LIMIT_SIZE (CHUNK_UNIT * 3) + +#define TEST_LOOP 100 +#define PAGE_ALIGN(size, ps) (((size) + ((ps) - 1)) & ~((ps) - 1)) + +int set_cap_limits(rlim_t max) +{ + struct rlimit new; + cap_t cap = cap_init(); + + new.rlim_cur = max; + new.rlim_max = max; + if (setrlimit(RLIMIT_MEMLOCK, &new)) { + perror("setrlimit() returns error\n"); + return -1; + } + + /* drop capabilities including CAP_IPC_LOCK */ + if (cap_set_proc(cap)) { + perror("cap_set_proc() returns error\n"); + return -2; + } + + return 0; +} + +int get_proc_locked_vm_size(void) +{ + FILE *f; + int ret = -1; + char line[1024] = {0}; + unsigned long lock_size = 0; + + f = fopen("/proc/self/status", "r"); + if (!f) { + perror("fopen"); + return -1; + } + + while (fgets(line, 1024, f)) { + if (strstr(line, "VmLck")) { + ret = sscanf(line, "VmLck:\t%8lu kB", &lock_size); + if (ret <= 0) { + printf("sscanf() on VmLck error: %s: %d\n", + line, ret); + fclose(f); + return -1; + } + fclose(f); + return (int)(lock_size << 10); + } + } + + perror("cann't parse VmLck in /proc/self/status\n"); + fclose(f); + return -1; +} + +/* + * Get the MMUPageSize of the memory region including input + * address from proc file. + * + * return value: on error case, 0 will be returned. + * Otherwise the page size(in bytes) is returned. 
+ */ +int get_proc_page_size(unsigned long addr) +{ + FILE *smaps; + char *line; + unsigned long mmupage_size = 0; + size_t size; + + smaps = seek_to_smaps_entry(addr); + if (!smaps) { + printf("Unable to parse /proc/self/smaps\n"); + return 0; + } + + while (getline(&line, &size, smaps) > 0) { + if (!strstr(line, "MMUPageSize")) { + free(line); + line = NULL; + size = 0; + continue; + } + + /* found the MMUPageSize of this section */ + if (sscanf(line, "MMUPageSize: %8lu kB", + &mmupage_size) < 1) { + printf("Unable to parse smaps entry for Size:%s\n", + line); + break; + } + + } + free(line); + if (smaps) + fclose(smaps); + return mmupage_size << 10; +} + +/* + * Test mlock/mlock2() on provided memory chunk. + * It expects the mlock/mlock2() to be successful (within rlimit) + * + * With allocated memory chunk [p, p + alloc_size), this + * test will choose start/len randomly to perform mlock/mlock2 + * [start, start + len] memory range. The range is within range + * of the allocated chunk. + * + * The memory region size alloc_size is within the rlimit. + * So we always expect a success of mlock/mlock2. + * + * VmLck is assumed to be 0 before this test. + * + * return value: 0 - success + * else: failure + */ +int test_mlock_within_limit(char *p, int alloc_size) +{ + int i; + int ret = 0; + int locked_vm_size = 0; + struct rlimit cur; + int page_size = 0; + + getrlimit(RLIMIT_MEMLOCK, &cur); + if (cur.rlim_cur < alloc_size) { + printf("alloc_size[%d] < %u rlimit,lead to mlock failure\n", + alloc_size, (unsigned int)cur.rlim_cur); + return -1; + } + + srand(time(NULL)); + for (i = 0; i < TEST_LOOP; i++) { + /* + * - choose mlock/mlock2 randomly + * - choose lock_size randomly but lock_size < alloc_size + * - choose start_offset randomly but p+start_offset+lock_size + * < p+alloc_size + */ + int is_mlock = !!(rand() % 2); + int lock_size = rand() % alloc_size; + int start_offset = rand() % (alloc_size - lock_size); + + if (is_mlock) + ret = mlock(p + start_offset, lock_size); + else + ret = mlock2_(p + start_offset, lock_size, + MLOCK_ONFAULT); + + if (ret) { + printf("%s() failure at |%p(%d)| mlock:|%p(%d)|\n", + is_mlock ? "mlock" : "mlock2", + p, alloc_size, + p + start_offset, lock_size); + return ret; + } + } + + /* + * Check VmLck left by the tests. + */ + locked_vm_size = get_proc_locked_vm_size(); + page_size = get_proc_page_size((unsigned long)p); + if (page_size == 0) { + printf("cannot get proc MMUPageSize\n"); + return -1; + } + + if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size) { + printf("test_mlock_within_limit() left VmLck:%d on %d chunk\n", + locked_vm_size, alloc_size); + return -1; + } + + return 0; +} + + +/* + * We expect the mlock/mlock2() to be fail (outof limitation) + * + * With allocated memory chunk [p, p + alloc_size), this + * test will randomly choose start/len and perform mlock/mlock2 + * on [start, start+len] range. + * + * The memory region size alloc_size is above the rlimit. + * And the len to be locked is higher than rlimit. + * So we always expect a failure of mlock/mlock2. + * No locked page number should be increased as a side effect. 
+ * + * return value: 0 - success + * else: failure + */ +int test_mlock_outof_limit(char *p, int alloc_size) +{ + int i; + int ret = 0; + int locked_vm_size = 0, old_locked_vm_size = 0; + struct rlimit cur; + + getrlimit(RLIMIT_MEMLOCK, &cur); + if (cur.rlim_cur >= alloc_size) { + printf("alloc_size[%d] >%u rlimit, violates test condition\n", + alloc_size, (unsigned int)cur.rlim_cur); + return -1; + } + + old_locked_vm_size = get_proc_locked_vm_size(); + srand(time(NULL)); + for (i = 0; i < TEST_LOOP; i++) { + int is_mlock = !!(rand() % 2); + int lock_size = (rand() % (alloc_size - cur.rlim_cur)) + + cur.rlim_cur; + int start_offset = rand() % (alloc_size - lock_size); + + if (is_mlock) + ret = mlock(p + start_offset, lock_size); + else + ret = mlock2_(p + start_offset, lock_size, + MLOCK_ONFAULT); + if (ret == 0) { + printf("%s() succeeds? on %p(%d) mlock%p(%d)\n", + is_mlock ? "mlock" : "mlock2", + p, alloc_size, + p + start_offset, lock_size); + return -1; + } + } + + locked_vm_size = get_proc_locked_vm_size(); + if (locked_vm_size != old_locked_vm_size) { + printf("tests leads to new mlocked page: old[%d], new[%d]\n", + old_locked_vm_size, + locked_vm_size); + return -1; + } + + return 0; +} + +int main(int argc, char **argv) +{ + char *p = NULL; + int ret = 0; + + if (set_cap_limits(MLOCK_RLIMIT_SIZE)) + return -1; + + p = malloc(MLOCK_WITHIN_LIMIT_SIZE); + if (p == NULL) { + perror("malloc() failure\n"); + return -1; + } + ret = test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE); + if (ret) + return ret; + munlock(p, MLOCK_WITHIN_LIMIT_SIZE); + free(p); + + + p = malloc(MLOCK_OUTOF_LIMIT_SIZE); + if (p == NULL) { + perror("malloc() failure\n"); + return -1; + } + ret = test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE); + if (ret) + return ret; + munlock(p, MLOCK_OUTOF_LIMIT_SIZE); + free(p); + + return 0; +} From 74d2fad1334d12bac8fe017aba598dd66c86628b Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Fri, 7 Oct 2016 16:59:56 -0700 Subject: [PATCH 062/127] thp, dax: add thp_get_unmapped_area for pmd mappings When CONFIG_FS_DAX_PMD is set, DAX supports mmap() using pmd page size. This feature relies on both mmap virtual address and FS block (i.e. physical address) to be aligned by the pmd page size. Users can use mkfs options to specify FS to align block allocations. However, aligning mmap address requires code changes to existing applications for providing a pmd-aligned address to mmap(). For instance, fio with "ioengine=mmap" performs I/Os with mmap() [1]. It calls mmap() with a NULL address, which needs to be changed to provide a pmd-aligned address for testing with DAX pmd mappings. Changing all applications that call mmap() with NULL is undesirable. Add thp_get_unmapped_area(), which can be called by filesystem's get_unmapped_area to align an mmap address by the pmd size for a DAX file. It calls the default handler, mm->get_unmapped_area(), to find a range and then aligns it for a DAX file. The patch is based on Matthew Wilcox's change that allows adding support of the pud page size easily. [1]: https://github.com/axboe/fio/blob/master/engines/mmap.c Link: http://lkml.kernel.org/r/1472497881-9323-2-git-send-email-toshi.kani@hpe.com Signed-off-by: Toshi Kani Reviewed-by: Dan Williams Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Kirill A. Shutemov Cc: Dave Chinner Cc: Jan Kara Cc: Theodore Ts'o Cc: Andreas Dilger Cc: Mike Kravetz Cc: "Kirill A. 
Shutemov" Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 7 +++++++ mm/huge_memory.c | 43 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 6f14de45b5ce..4fca5263fd42 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -87,6 +87,10 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma); extern unsigned long transparent_hugepage_flags; +extern unsigned long thp_get_unmapped_area(struct file *filp, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags); + extern void prep_transhuge_page(struct page *page); extern void free_transhuge_page(struct page *page); @@ -169,6 +173,9 @@ void put_huge_zero_page(void); static inline void prep_transhuge_page(struct page *page) {} #define transparent_hugepage_flags 0UL + +#define thp_get_unmapped_area NULL + static inline int split_huge_page_to_list(struct page *page, struct list_head *list) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 283583fcb1e7..a0b0e562407d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -469,6 +469,49 @@ void prep_transhuge_page(struct page *page) set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); } +unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len, + loff_t off, unsigned long flags, unsigned long size) +{ + unsigned long addr; + loff_t off_end = off + len; + loff_t off_align = round_up(off, size); + unsigned long len_pad; + + if (off_end <= off_align || (off_end - off_align) < size) + return 0; + + len_pad = len + size; + if (len_pad < len || (off + len_pad) < off) + return 0; + + addr = current->mm->get_unmapped_area(filp, 0, len_pad, + off >> PAGE_SHIFT, flags); + if (IS_ERR_VALUE(addr)) + return 0; + + addr += (off - addr) & (size - 1); + return addr; +} + +unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + loff_t off = (loff_t)pgoff << PAGE_SHIFT; + + if (addr) + goto out; + if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD)) + goto out; + + addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE); + if (addr) + return addr; + + out: + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); +} +EXPORT_SYMBOL_GPL(thp_get_unmapped_area); + static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, gfp_t gfp) { From dbe6ec815641aa22b50775aaeb47fa3a8d04ccf1 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Fri, 7 Oct 2016 16:59:59 -0700 Subject: [PATCH 063/127] ext2/4, xfs: call thp_get_unmapped_area() for pmd mappings To support DAX pmd mappings with unmodified applications, filesystems need to align an mmap address by the pmd size. Call thp_get_unmapped_area() from f_op->get_unmapped_area. Note, there is no change in behavior for a non-DAX file. Link: http://lkml.kernel.org/r/1472497881-9323-3-git-send-email-toshi.kani@hpe.com Signed-off-by: Toshi Kani Cc: Dan Williams Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Kirill A. Shutemov Cc: Dave Chinner Cc: Jan Kara Cc: Theodore Ts'o Cc: Andreas Dilger Cc: Mike Kravetz Cc: "Kirill A. 
Shutemov" Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/file.c | 1 + fs/ext4/file.c | 1 + fs/xfs/xfs_file.c | 1 + 3 files changed, 3 insertions(+) diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 423cc01c9d41..0ca363d1341c 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -234,6 +234,7 @@ const struct file_operations ext2_file_operations = { .open = dquot_file_open, .release = ext2_release_file, .fsync = ext2_fsync, + .get_unmapped_area = thp_get_unmapped_area, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, }; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 261ac3734c58..28f542bb0bda 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -703,6 +703,7 @@ const struct file_operations ext4_file_operations = { .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, + .get_unmapped_area = thp_get_unmapped_area, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c68517b0f248..bac55c687085 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1618,6 +1618,7 @@ const struct file_operations xfs_file_operations = { .open = xfs_file_open, .release = xfs_file_release, .fsync = xfs_file_fsync, + .get_unmapped_area = thp_get_unmapped_area, .fallocate = xfs_file_fallocate, }; From 03e86dba5b628a13a58adae62e5b918b969ae93e Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Fri, 7 Oct 2016 17:00:02 -0700 Subject: [PATCH 064/127] cpu: fix node state for whether it contains CPU In current kernel code, we only call node_set_state(cpu_to_node(cpu), N_CPU) when a cpu is hot plugged. But we do not set the node state for N_CPU when the cpus are brought online during boot. So this could lead to failure when we check to see if a node contains cpu with node_state(node_id, N_CPU). One use case is in the node_reclaime function: /* * Only run node reclaim on the local node or on nodes that do * not * have associated processors. This will favor the local * processor * over remote processors and spread off node memory allocations * as wide as possible. */ if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id()) return NODE_RECLAIM_NOSCAN; I instrumented the kernel to call this function after boot and it always returns 0 on a x86 desktop machine until I apply the attached patch. int num_cpu_node(void) { int i, nr_cpu_nodes = 0; for_each_node(i) { if (node_state(i, N_CPU)) ++ nr_cpu_nodes; } return nr_cpu_nodes; } Fix this by checking each node for online CPU when we initialize vmstat that's responsible for maintaining node state. Link: http://lkml.kernel.org/r/20160829175922.GA21775@linux.intel.com Signed-off-by: Tim Chen Acked-by: David Rientjes Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Peter Zijlstra Cc: Tim Chen Cc: Cc: Ying Cc: Andi Kleen Cc: Dave Hansen Cc: Dan Williams Cc: "Rafael J. 
Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mm/vmstat.c b/mm/vmstat.c index dc04e76c7950..73aab319969d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1715,6 +1715,16 @@ static void __init start_shepherd_timer(void) round_jiffies_relative(sysctl_stat_interval)); } +static void __init init_cpu_node_state(void) +{ + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) + node_set_state(cpu_to_node(cpu), N_CPU); + put_online_cpus(); +} + static void vmstat_cpu_dead(int node) { int cpu; @@ -1772,6 +1782,7 @@ static int __init setup_vmstat(void) #ifdef CONFIG_SMP cpu_notifier_register_begin(); __register_cpu_notifier(&vmstat_notifier); + init_cpu_node_state(); start_shepherd_timer(); cpu_notifier_register_done(); From 0f30206bf2a42e278c2cec32e4b722626458c75b Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 7 Oct 2016 17:00:06 -0700 Subject: [PATCH 065/127] fs/proc/task_mmu.c: make the task_mmu walk_page_range() limit in clear_refs_write() obvious Trying to walk all of virtual memory requires architecture specific knowledge. On x86_64, addresses must be sign extended from bit 48, whereas on arm64 the top VA_BITS of address space have their own set of page tables. clear_refs_write() calls walk_page_range() on the range 0 to ~0UL, it provides a test_walk() callback that only expects to be walking over VMAs. Currently walk_pmd_range() will skip memory regions that don't have a VMA, reporting them as a hole. As this call only expects to walk user address space, make it walk 0 to 'highest_vm_end'. Link: http://lkml.kernel.org/r/1472655792-22439-1-git-send-email-james.morse@arm.com Signed-off-by: James Morse Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f6fa99eca515..d2a70cf2154e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1070,7 +1070,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, } mmu_notifier_invalidate_range_start(mm, 0, -1); } - walk_page_range(0, ~0UL, &clear_refs_walk); + walk_page_range(0, mm->highest_vm_end, &clear_refs_walk); if (type == CLEAR_REFS_SOFT_DIRTY) mmu_notifier_invalidate_range_end(mm, 0, -1); flush_tlb_mm(mm); From 6fcb52a56ff60d240f06296b12827e7f20d45f63 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Fri, 7 Oct 2016 17:00:08 -0700 Subject: [PATCH 066/127] thp: reduce usage of huge zero page's atomic counter The global zero page is used to satisfy an anonymous read fault. If THP(Transparent HugePage) is enabled then the global huge zero page is used. The global huge zero page uses an atomic counter for reference counting and is allocated/freed dynamically according to its counter value. CPU time spent on that counter will greatly increase if there are a lot of processes doing anonymous read faults. This patch proposes a way to reduce the access to the global counter so that the CPU load can be reduced accordingly. To do this, a new flag of the mm_struct is introduced: MMF_USED_HUGE_ZERO_PAGE. With this flag, the process only need to touch the global counter in two cases: 1 The first time it uses the global huge zero page; 2 The time when mm_user of its mm_struct reaches zero. Note that right now, the huge zero page is eligible to be freed as soon as its last use goes away. 
With this patch, the page will not be eligible to be freed until the exit of the last process from which it was ever used. And with the use of mm_user, the kthread is not eligible to use huge zero page either. Since no kthread is using huge zero page today, there is no difference after applying this patch. But if that is not desired, I can change it to when mm_count reaches zero. Case used for test on Haswell EP: usemem -n 72 --readonly -j 0x200000 100G Which spawns 72 processes and each will mmap 100G anonymous space and then do read only access to that space sequentially with a step of 2MB. CPU cycles from perf report for base commit: 54.03% usemem [kernel.kallsyms] [k] get_huge_zero_page CPU cycles from perf report for this commit: 0.11% usemem [kernel.kallsyms] [k] mm_get_huge_zero_page Performance(throughput) of the workload for base commit: 1784430792 Performance(throughput) of the workload for this commit: 4726928591 164% increase. Runtime of the workload for base commit: 707592 us Runtime of the workload for this commit: 303970 us 50% drop. Link: http://lkml.kernel.org/r/fe51a88f-446a-4622-1363-ad1282d71385@intel.com Signed-off-by: Aaron Lu Cc: Sergey Senozhatsky Cc: "Kirill A. Shutemov" Cc: Dave Hansen Cc: Tim Chen Cc: Huang Ying Cc: Vlastimil Babka Cc: Jerome Marchand Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Ebru Akagunduz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 2 +- include/linux/huge_mm.h | 8 ++++---- include/linux/sched.h | 1 + kernel/fork.c | 1 + mm/huge_memory.c | 36 +++++++++++++++++++++++++----------- mm/swap.c | 4 +--- mm/swap_state.c | 4 +--- 7 files changed, 34 insertions(+), 22 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index cc025f82ef07..014defd2e744 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1036,7 +1036,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, if (!write && !buffer_mapped(&bh)) { spinlock_t *ptl; pmd_t entry; - struct page *zero_page = get_huge_zero_page(); + struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm); if (unlikely(!zero_page)) { dax_pmd_dbg(&bh, address, "no zero page"); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 4fca5263fd42..9b9f65d99873 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -156,8 +156,8 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) return is_huge_zero_page(pmd_page(pmd)); } -struct page *get_huge_zero_page(void); -void put_huge_zero_page(void); +struct page *mm_get_huge_zero_page(struct mm_struct *mm); +void mm_put_huge_zero_page(struct mm_struct *mm); #define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot)) @@ -220,9 +220,9 @@ static inline bool is_huge_zero_page(struct page *page) return false; } -static inline void put_huge_zero_page(void) +static inline void mm_put_huge_zero_page(struct mm_struct *mm) { - BUILD_BUG(); + return; } static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, diff --git a/include/linux/sched.h b/include/linux/sched.h index 6bee6f988912..348f51b0ec92 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -526,6 +526,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ #define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ +#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) diff --git a/kernel/fork.c b/kernel/fork.c 
index 9a8ec66cd4df..6d42242485cb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -854,6 +854,7 @@ static inline void __mmput(struct mm_struct *mm) ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); + mm_put_huge_zero_page(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a0b0e562407d..12b9f1a39b63 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -59,7 +59,7 @@ static struct shrinker deferred_split_shrinker; static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; -struct page *get_huge_zero_page(void) +static struct page *get_huge_zero_page(void) { struct page *zero_page; retry: @@ -86,7 +86,7 @@ retry: return READ_ONCE(huge_zero_page); } -void put_huge_zero_page(void) +static void put_huge_zero_page(void) { /* * Counter should never go to zero here. Only shrinker can put @@ -95,6 +95,26 @@ void put_huge_zero_page(void) BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); } +struct page *mm_get_huge_zero_page(struct mm_struct *mm) +{ + if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + return READ_ONCE(huge_zero_page); + + if (!get_huge_zero_page()) + return NULL; + + if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + put_huge_zero_page(); + + return READ_ONCE(huge_zero_page); +} + +void mm_put_huge_zero_page(struct mm_struct *mm) +{ + if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + put_huge_zero_page(); +} + static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink, struct shrink_control *sc) { @@ -644,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) pgtable = pte_alloc_one(vma->vm_mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; - zero_page = get_huge_zero_page(); + zero_page = mm_get_huge_zero_page(vma->vm_mm); if (unlikely(!zero_page)) { pte_free(vma->vm_mm, pgtable); count_vm_event(THP_FAULT_FALLBACK); @@ -666,10 +686,8 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) } } else spin_unlock(fe->ptl); - if (!set) { + if (!set) pte_free(vma->vm_mm, pgtable); - put_huge_zero_page(); - } return ret; } gfp = alloc_hugepage_direct_gfpmask(vma); @@ -823,7 +841,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, * since we already have a zero page to copy. It just takes a * reference. 
*/ - zero_page = get_huge_zero_page(); + zero_page = mm_get_huge_zero_page(dst_mm); set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, zero_page); ret = 0; @@ -1081,7 +1099,6 @@ alloc: update_mmu_cache_pmd(vma, fe->address, fe->pmd); if (!page) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); - put_huge_zero_page(); } else { VM_BUG_ON_PAGE(!PageHead(page), page); page_remove_rmap(page, true); @@ -1542,7 +1559,6 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, } smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); - put_huge_zero_page(); } static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, @@ -1565,8 +1581,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (!vma_is_anonymous(vma)) { _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); - if (is_huge_zero_pmd(_pmd)) - put_huge_zero_page(); if (vma_is_dax(vma)) return; page = pmd_page(_pmd); diff --git a/mm/swap.c b/mm/swap.c index 75c63bb2a1da..4dcf852e1e6d 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -748,10 +748,8 @@ void release_pages(struct page **pages, int nr, bool cold) locked_pgdat = NULL; } - if (is_huge_zero_page(page)) { - put_huge_zero_page(); + if (is_huge_zero_page(page)) continue; - } page = compound_head(page); if (!put_page_testzero(page)) diff --git a/mm/swap_state.c b/mm/swap_state.c index 268b8191982b..8679c997eab6 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -254,9 +254,7 @@ static inline void free_swap_cache(struct page *page) void free_page_and_swap_cache(struct page *page) { free_swap_cache(page); - if (is_huge_zero_page(page)) - put_huge_zero_page(); - else + if (!is_huge_zero_page(page)) put_page(page); } From 0247f3f4d78a475cd3181dc9fc162fdef773aaaa Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 7 Oct 2016 17:00:12 -0700 Subject: [PATCH 067/127] mm/memcontrol.c: make the walk_page_range() limit obvious mem_cgroup_count_precharge() and mem_cgroup_move_charge() both call walk_page_range() on the range 0 to ~0UL, neither provide a pte_hole callback, which causes the current implementation to skip non-vma regions. This is all fine but follow up changes would like to make walk_page_range more generic so it is better to be explicit about which range to traverse so let's use highest_vm_end to explicitly traverse only user mmaped memory. [mhocko@kernel.org: rewrote changelog] Link: http://lkml.kernel.org/r/1472655897-22532-1-git-send-email-james.morse@arm.com Signed-off-by: James Morse Acked-by: Naoya Horiguchi Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5579e762b1ce..0739d4129a93 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4681,7 +4681,8 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) .mm = mm, }; down_read(&mm->mmap_sem); - walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk); + walk_page_range(0, mm->highest_vm_end, + &mem_cgroup_count_precharge_walk); up_read(&mm->mmap_sem); precharge = mc.precharge; @@ -4969,7 +4970,8 @@ retry: * When we have consumed all precharges and failed in doing * additional charge, the page walk just aborts. 
*/ - walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk); + walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk); + up_read(&mc.mm->mmap_sem); atomic_dec(&mc.from->moving_account); } From d66ba15bde22703b3c0cec6782519cb0765a6777 Mon Sep 17 00:00:00 2001 From: Reza Arbab Date: Fri, 7 Oct 2016 17:00:15 -0700 Subject: [PATCH 068/127] memory-hotplug: fix store_mem_state() return value If store_mem_state() is called to online memory which is already online, it will return 1, the value it got from device_online(). This is wrong because store_mem_state() is a device_attribute .store function. Thus a non-negative return value represents input bytes read. Set the return value to -EINVAL in this case. Link: http://lkml.kernel.org/r/1472743777-24266-1-git-send-email-arbab@linux.vnet.ibm.com Signed-off-by: Reza Arbab Cc: Greg Kroah-Hartman Cc: Vlastimil Babka Cc: Vitaly Kuznetsov Cc: David Rientjes Cc: Yaowei Bai Cc: Joonsoo Kim Cc: Dan Williams Cc: Xishi Qiu Cc: David Vrabel Cc: Chen Yucong Cc: Andrew Banman Cc: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index dc75de9059cd..62c63c0c5c22 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -361,8 +361,11 @@ store_mem_state(struct device *dev, err: unlock_device_hotplug(); - if (ret) + if (ret < 0) return ret; + if (ret) + return -EINVAL; + return count; } From 87744ab3832b83ba71b931f86f9cfdb000d07da5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 7 Oct 2016 17:00:18 -0700 Subject: [PATCH 069/127] mm: fix cache mode tracking in vm_insert_mixed() vm_insert_mixed() unlike vm_insert_pfn_prot() and vmf_insert_pfn_pmd(), fails to check the pgprot_t it uses for the mapping against the one recorded in the memtype tracking tree. Add the missing call to track_pfn_insert() to preclude cases where incompatible aliased mappings are established for a given physical address range. Link: http://lkml.kernel.org/r/147328717909.35069.14256589123570653697.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Cc: David Airlie Cc: Matthew Wilcox Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 4bfc3a9c3b18..fc1987dfd8cc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1649,10 +1649,14 @@ EXPORT_SYMBOL(vm_insert_pfn_prot); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn) { + pgprot_t pgprot = vma->vm_page_prot; + BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; + if (track_pfn_insert(vma, &pgprot, pfn)) + return -EINVAL; /* * If we don't have pte special, then we have to use the pfn_valid() @@ -1670,9 +1674,9 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, * result in pfn_t_has_page() == false. 
*/ page = pfn_to_page(pfn_t_to_pfn(pfn)); - return insert_page(vma, addr, page, vma->vm_page_prot); + return insert_page(vma, addr, page, pgprot); } - return insert_pfn(vma, addr, pfn, vma->vm_page_prot); + return insert_pfn(vma, addr, pfn, pgprot); } EXPORT_SYMBOL(vm_insert_mixed); From f6ab1f7f6b2d8e48c5fc47746a67363b20d79a1d Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 7 Oct 2016 17:00:21 -0700 Subject: [PATCH 070/127] mm, swap: use offset of swap entry as key of swap cache This patch is to improve the performance of swap cache operations when the type of the swap device is not 0. Originally, the whole swap entry value is used as the key of the swap cache, even though there is one radix tree for each swap device. If the type of the swap device is not 0, the height of the radix tree of the swap cache will be increased unnecessary, especially on 64bit architecture. For example, for a 1GB swap device on the x86_64 architecture, the height of the radix tree of the swap cache is 11. But if the offset of the swap entry is used as the key of the swap cache, the height of the radix tree of the swap cache is 4. The increased height causes unnecessary radix tree descending and increased cache footprint. This patch reduces the height of the radix tree of the swap cache via using the offset of the swap entry instead of the whole swap entry value as the key of the swap cache. In 32 processes sequential swap out test case on a Xeon E5 v3 system with RAM disk as swap, the lock contention for the spinlock of the swap cache is reduced from 20.15% to 12.19%, when the type of the swap device is 1. Use the whole swap entry as key, perf-profile.calltrace.cycles-pp._raw_spin_lock_irq.__add_to_swap_cache.add_to_swap_cache.add_to_swap.shrink_page_list: 10.37, perf-profile.calltrace.cycles-pp._raw_spin_lock_irqsave.__remove_mapping.shrink_page_list.shrink_inactive_list.shrink_node_memcg: 9.78, Use the swap offset as key, perf-profile.calltrace.cycles-pp._raw_spin_lock_irq.__add_to_swap_cache.add_to_swap_cache.add_to_swap.shrink_page_list: 6.25, perf-profile.calltrace.cycles-pp._raw_spin_lock_irqsave.__remove_mapping.shrink_page_list.shrink_inactive_list.shrink_node_memcg: 5.94, Link: http://lkml.kernel.org/r/1473270649-27229-1-git-send-email-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: "Kirill A. Shutemov" Cc: Dave Hansen Cc: Dan Williams Cc: Joonsoo Kim Cc: Hugh Dickins Cc: Mel Gorman Cc: Minchan Kim Cc: Aaron Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 8 ++++---- mm/memcontrol.c | 5 +++-- mm/mincore.c | 5 +++-- mm/swap_state.c | 8 ++++---- mm/swapfile.c | 4 ++-- 5 files changed, 16 insertions(+), 14 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 046077b4209d..028e84e2ab42 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1048,19 +1048,19 @@ struct address_space *page_file_mapping(struct page *page) return page->mapping; } +extern pgoff_t __page_file_index(struct page *page); + /* * Return the pagecache index of the passed page. Regular pagecache pages - * use ->index whereas swapcache pages use ->private + * use ->index whereas swapcache pages use swp_offset(->private) */ static inline pgoff_t page_index(struct page *page) { if (unlikely(PageSwapCache(page))) - return page_private(page); + return __page_file_index(page); return page->index; } -extern pgoff_t __page_file_index(struct page *page); - /* * Return the file index of the page. 
Regular pagecache pages use ->index * whereas swapcache pages use swp_offset(->private) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0739d4129a93..60bb830abc34 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4408,7 +4408,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, * Because lookup_swap_cache() updates some statistics counter, * we call find_get_page() with swapper_space directly. */ - page = find_get_page(swap_address_space(ent), ent.val); + page = find_get_page(swap_address_space(ent), swp_offset(ent)); if (do_memsw_account()) entry->val = ent.val; @@ -4446,7 +4446,8 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, swp_entry_t swp = radix_to_swp_entry(page); if (do_memsw_account()) *entry = swp; - page = find_get_page(swap_address_space(swp), swp.val); + page = find_get_page(swap_address_space(swp), + swp_offset(swp)); } } else page = find_get_page(mapping, pgoff); diff --git a/mm/mincore.c b/mm/mincore.c index c0b5ba965200..bfb866435478 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -66,7 +66,8 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) */ if (radix_tree_exceptional_entry(page)) { swp_entry_t swp = radix_to_swp_entry(page); - page = find_get_page(swap_address_space(swp), swp.val); + page = find_get_page(swap_address_space(swp), + swp_offset(swp)); } } else page = find_get_page(mapping, pgoff); @@ -150,7 +151,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } else { #ifdef CONFIG_SWAP *vec = mincore_page(swap_address_space(entry), - entry.val); + swp_offset(entry)); #else WARN_ON(1); *vec = 1; diff --git a/mm/swap_state.c b/mm/swap_state.c index 8679c997eab6..35d7e0ee1c77 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -94,7 +94,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) address_space = swap_address_space(entry); spin_lock_irq(&address_space->tree_lock); error = radix_tree_insert(&address_space->page_tree, - entry.val, page); + swp_offset(entry), page); if (likely(!error)) { address_space->nrpages++; __inc_node_page_state(page, NR_FILE_PAGES); @@ -145,7 +145,7 @@ void __delete_from_swap_cache(struct page *page) entry.val = page_private(page); address_space = swap_address_space(entry); - radix_tree_delete(&address_space->page_tree, page_private(page)); + radix_tree_delete(&address_space->page_tree, swp_offset(entry)); set_page_private(page, 0); ClearPageSwapCache(page); address_space->nrpages--; @@ -283,7 +283,7 @@ struct page * lookup_swap_cache(swp_entry_t entry) { struct page *page; - page = find_get_page(swap_address_space(entry), entry.val); + page = find_get_page(swap_address_space(entry), swp_offset(entry)); if (page) { INC_CACHE_INFO(find_success); @@ -310,7 +310,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. 
*/ - found_page = find_get_page(swapper_space, entry.val); + found_page = find_get_page(swapper_space, swp_offset(entry)); if (found_page) break; diff --git a/mm/swapfile.c b/mm/swapfile.c index 134c085d0d7b..2210de290b54 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -105,7 +105,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) struct page *page; int ret = 0; - page = find_get_page(swap_address_space(entry), entry.val); + page = find_get_page(swap_address_space(entry), swp_offset(entry)); if (!page) return 0; /* @@ -1005,7 +1005,7 @@ int free_swap_and_cache(swp_entry_t entry) if (p) { if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { page = find_get_page(swap_address_space(entry), - entry.val); + swp_offset(entry)); if (page && !trylock_page(page)) { put_page(page); page = NULL; From 8cd797887ae0a73313ba248e027e59c0a597d693 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 7 Oct 2016 17:00:24 -0700 Subject: [PATCH 071/127] mm: remove page_file_index After using the offset of the swap entry as the key of the swap cache, the page_index() becomes exactly same as page_file_index(). So the page_file_index() is removed and the callers are changed to use page_index() instead. Link: http://lkml.kernel.org/r/1473270649-27229-2-git-send-email-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Trond Myklebust Cc: Anna Schumaker Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Dave Hansen Cc: Johannes Weiner Cc: Dan Williams Cc: Joonsoo Kim Cc: Ross Zwisler Cc: Eric Dumazet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfs/internal.h | 6 +++--- fs/nfs/pagelist.c | 2 +- fs/nfs/read.c | 2 +- fs/nfs/write.c | 4 ++-- include/linux/mm.h | 12 ------------ include/linux/pagemap.h | 2 +- 6 files changed, 8 insertions(+), 20 deletions(-) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 74935a19e4bf..da9e5584bfdc 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -681,11 +681,11 @@ unsigned int nfs_page_length(struct page *page) loff_t i_size = i_size_read(page_file_mapping(page)->host); if (i_size > 0) { - pgoff_t page_index = page_file_index(page); + pgoff_t index = page_index(page); pgoff_t end_index = (i_size - 1) >> PAGE_SHIFT; - if (page_index < end_index) + if (index < end_index) return PAGE_SIZE; - if (page_index == end_index) + if (index == end_index) return ((i_size - 1) & ~PAGE_MASK) + 1; } return 0; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 174dd4cf5747..965db474f4b0 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -342,7 +342,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page, * update_nfs_request below if the region is not locked. 
*/ req->wb_page = page; if (page) { - req->wb_index = page_file_index(page); + req->wb_index = page_index(page); get_page(page); } req->wb_offset = offset; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 572e5b3b06f1..defc9233e985 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -295,7 +295,7 @@ int nfs_readpage(struct file *file, struct page *page) int error; dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", - page, PAGE_SIZE, page_file_index(page)); + page, PAGE_SIZE, page_index(page)); nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); nfs_add_stats(inode, NFSIOS_READPAGES, 1); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3a6724c6eb5f..53211838f72a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -151,7 +151,7 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c spin_lock(&inode->i_lock); i_size = i_size_read(inode); end_index = (i_size - 1) >> PAGE_SHIFT; - if (i_size > 0 && page_file_index(page) < end_index) + if (i_size > 0 && page_index(page) < end_index) goto out; end = page_file_offset(page) + ((loff_t)offset+count); if (i_size >= end) @@ -603,7 +603,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, { int ret; - nfs_pageio_cond_complete(pgio, page_file_index(page)); + nfs_pageio_cond_complete(pgio, page_index(page)); ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE, launder); if (ret == -EAGAIN) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 028e84e2ab42..3e8807e0b9d2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1061,18 +1061,6 @@ static inline pgoff_t page_index(struct page *page) return page->index; } -/* - * Return the file index of the page. Regular pagecache pages use ->index - * whereas swapcache pages use swp_offset(->private) - */ -static inline pgoff_t page_file_index(struct page *page) -{ - if (unlikely(PageSwapCache(page))) - return __page_file_index(page); - - return page->index; -} - bool page_mapped(struct page *page); struct address_space *page_mapping(struct page *page); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 48d9cf04337c..794dbcb91084 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -408,7 +408,7 @@ static inline loff_t page_offset(struct page *page) static inline loff_t page_file_offset(struct page *page) { - return ((loff_t)page_file_index(page)) << PAGE_SHIFT; + return ((loff_t)page_index(page)) << PAGE_SHIFT; } extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma, From 3250845d0526407330592dd43b9f1354b6fe7a14 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 17:00:28 -0700 Subject: [PATCH 072/127] Revert "mm, oom: prevent premature OOM killer invocation for high order request" Patch series "reintroduce compaction feedback for OOM decisions". After several people reported OOM's for order-2 allocations in 4.7 due to Michal Hocko's OOM rework, he reverted the part that considered compaction feedback [1] in the decisions to retry reclaim/compaction. This was to provide a fix quickly for 4.8 rc and 4.7 stable series, while mmotm had an almost complete solution that instead improved compaction reliability. This series completes the mmotm solution and reintroduces the compaction feedback into OOM decisions. The first two patches restore the state of mmotm before the temporary solution was merged, the last patch should be the missing piece for reliability. 
The third patch restricts the hardened compaction to non-costly orders, since costly orders don't result in OOMs in the first place. [1] http://marc.info/?i=20160822093249.GA14916%40dhcp22.suse.cz%3E This patch (of 4): Commit 6b4e3181d7bd ("mm, oom: prevent premature OOM killer invocation for high order request") was intended as a quick fix of OOM regressions for 4.8 and stable 4.7.x kernels. For a better long-term solution, we still want to consider compaction feedback, which should be possible after some more improvements in the following patches. This reverts commit 6b4e3181d7bd5ca5ab6f45929e4a5ffa7ab4ab7f. Link: http://lkml.kernel.org/r/20160906135258.18335-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e00f545c2398..634806f55120 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3156,6 +3156,54 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; } +static inline bool +should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, + enum compact_result compact_result, + enum compact_priority *compact_priority, + int compaction_retries) +{ + int max_retries = MAX_COMPACT_RETRIES; + + if (!order) + return false; + + /* + * compaction considers all the zone as desperately out of memory + * so it doesn't really make much sense to retry except when the + * failure could be caused by insufficient priority + */ + if (compaction_failed(compact_result)) { + if (*compact_priority > MIN_COMPACT_PRIORITY) { + (*compact_priority)--; + return true; + } + return false; + } + + /* + * make sure the compaction wasn't deferred or didn't bail out early + * due to locks contention before we declare that we should give up. + * But do not retry if the given zonelist is not suitable for + * compaction. + */ + if (compaction_withdrawn(compact_result)) + return compaction_zonelist_suitable(ac, order, alloc_flags); + + /* + * !costly requests are much more important than __GFP_REPEAT + * costly ones because they are de facto nofail and invoke OOM + * killer to move on while costly can fail and users are ready + * to cope with that. 1/4 retries is rather arbitrary but we + * would need much more detailed feedback from compaction to + * make a better decision. 
+ */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + max_retries /= 4; + if (compaction_retries <= max_retries) + return true; + + return false; +} #else static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, @@ -3166,8 +3214,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; } -#endif /* CONFIG_COMPACTION */ - static inline bool should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, enum compact_result compact_result, @@ -3194,6 +3240,7 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla } return false; } +#endif /* CONFIG_COMPACTION */ /* Perform direct synchronous page reclaim */ static int From d943649831aba0fcdda37a0e9e25b332a634cf5e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 17:00:31 -0700 Subject: [PATCH 073/127] mm, compaction: more reliably increase direct compaction priority During reclaim/compaction loop, compaction priority can be increased by the should_compact_retry() function, but the current code is not optimal. Priority is only increased when compaction_failed() is true, which means that compaction has scanned the whole zone. This may not happen even after multiple attempts with a lower priority due to parallel activity, so we might needlessly struggle on the lower priorities and possibly run out of compaction retry attempts in the process. After this patch we are guaranteed at least one attempt at the highest compaction priority even if we exhaust all retries at the lower priorities. Link: http://lkml.kernel.org/r/20160906135258.18335-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 634806f55120..a8703b592c39 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3160,25 +3160,23 @@ static inline bool should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, enum compact_result compact_result, enum compact_priority *compact_priority, - int compaction_retries) + int *compaction_retries) { int max_retries = MAX_COMPACT_RETRIES; if (!order) return false; + if (compaction_made_progress(compact_result)) + (*compaction_retries)++; + /* * compaction considers all the zone as desperately out of memory * so it doesn't really make much sense to retry except when the * failure could be caused by insufficient priority */ - if (compaction_failed(compact_result)) { - if (*compact_priority > MIN_COMPACT_PRIORITY) { - (*compact_priority)--; - return true; - } - return false; - } + if (compaction_failed(compact_result)) + goto check_priority; /* * make sure the compaction wasn't deferred or didn't bail out early @@ -3199,9 +3197,19 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, */ if (order > PAGE_ALLOC_COSTLY_ORDER) max_retries /= 4; - if (compaction_retries <= max_retries) + if (*compaction_retries <= max_retries) return true; + /* + * Make sure there are attempts at the highest priority if we exhausted + * all retries or failed at the lower priorities. 
+ */ +check_priority: + if (*compact_priority > MIN_COMPACT_PRIORITY) { + (*compact_priority)--; + *compaction_retries = 0; + return true; + } return false; } #else @@ -3218,7 +3226,7 @@ static inline bool should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, enum compact_result compact_result, enum compact_priority *compact_priority, - int compaction_retries) + int *compaction_retries) { struct zone *zone; struct zoneref *z; @@ -3620,9 +3628,6 @@ retry: if (page) goto got_pg; - if (order && compaction_made_progress(compact_result)) - compaction_retries++; - /* Do not loop if specifically requested */ if (gfp_mask & __GFP_NORETRY) goto nopage; @@ -3657,7 +3662,7 @@ retry: if (did_some_progress > 0 && should_compact_retry(ac, order, alloc_flags, compact_result, &compact_priority, - compaction_retries)) + &compaction_retries)) goto retry; /* Reclaim has failed us, start killing things */ From c2033b00dbe856909fcaccf038e4e0d3dcfb85af Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 17:00:34 -0700 Subject: [PATCH 074/127] mm, compaction: restrict full priority to non-costly orders The new ultimate compaction priority disables some heuristics, which may result in excessive cost. This is fine for non-costly orders where we want to try hard before resulting for OOM, but might be disruptive for costly orders which do not trigger OOM and should generally have some fallback. Thus, we disable the full priority for costly orders. Suggested-by: Michal Hocko Link: http://lkml.kernel.org/r/20160906135258.18335-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 1 + mm/page_alloc.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 585d55cb0dc0..0d8415820fc3 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -9,6 +9,7 @@ enum compact_priority { COMPACT_PRIO_SYNC_FULL, MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_FULL, COMPACT_PRIO_SYNC_LIGHT, + MIN_COMPACT_COSTLY_PRIORITY = COMPACT_PRIO_SYNC_LIGHT, DEF_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT, COMPACT_PRIO_ASYNC, INIT_COMPACT_PRIORITY = COMPACT_PRIO_ASYNC diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a8703b592c39..891e3881a6e0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3163,6 +3163,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, int *compaction_retries) { int max_retries = MAX_COMPACT_RETRIES; + int min_priority; if (!order) return false; @@ -3205,7 +3206,9 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, * all retries or failed at the lower priorities. */ check_priority: - if (*compact_priority > MIN_COMPACT_PRIORITY) { + min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? + MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; + if (*compact_priority > min_priority) { (*compact_priority)--; *compaction_retries = 0; return true; From 9f7e3387939b036faacf4e7f32de7bb92a6635d6 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 17:00:37 -0700 Subject: [PATCH 075/127] mm, compaction: make full priority ignore pageblock suitability Several people have reported premature OOMs for order-2 allocations (stack) due to OOM rework in 4.7. 
In the scenario (parallel kernel build and dd writing to two drives) many pageblocks get marked as Unmovable and compaction free scanner struggles to isolate free pages. Joonsoo Kim pointed out that the free scanner skips pageblocks that are not movable to prevent filling them and forcing non-movable allocations to fallback to other pageblocks. Such heuristic makes sense to help prevent long-term fragmentation, but premature OOMs are relatively more urgent problem. As a compromise, this patch disables the heuristic only for the ultimate compaction priority. Link: http://lkml.kernel.org/r/20160906135258.18335-5-vbabka@suse.cz Reported-by: Ralf-Peter Rohbeck Reported-by: Arkadiusz Miskiewicz Reported-by: Olaf Hering Suggested-by: Joonsoo Kim Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 11 ++++++++--- mm/internal.h | 1 + 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 29f6c49dc9c2..86d4d0bbfc7c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -997,8 +997,12 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, #ifdef CONFIG_COMPACTION /* Returns true if the page is within a block suitable for migration to */ -static bool suitable_migration_target(struct page *page) +static bool suitable_migration_target(struct compact_control *cc, + struct page *page) { + if (cc->ignore_block_suitable) + return true; + /* If the page is a large free page, then disallow migration */ if (PageBuddy(page)) { /* @@ -1083,7 +1087,7 @@ static void isolate_freepages(struct compact_control *cc) continue; /* Check the block is suitable for migration */ - if (!suitable_migration_target(page)) + if (!suitable_migration_target(cc, page)) continue; /* If isolation recently failed, do not retry */ @@ -1656,7 +1660,8 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .classzone_idx = classzone_idx, .direct_compaction = true, .whole_zone = (prio == MIN_COMPACT_PRIORITY), - .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY) + .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY), + .ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY) }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); diff --git a/mm/internal.h b/mm/internal.h index 5214bf8e3171..537ac9951f5f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -178,6 +178,7 @@ struct compact_control { unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ + bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... */ bool whole_zone; /* Whole zone should/has been scanned */ int order; /* order a direct compactor needs */ From 423b452e1553e3d19b632880bf2adf1f058ab267 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 17:00:40 -0700 Subject: [PATCH 076/127] mm, page_alloc: pull no_progress_loops update to should_reclaim_retry() The should_reclaim_retry() makes decisions based on no_progress_loops, so it makes sense to also update the counter there. It will be also consistent with should_compact_retry() and compaction_retries. No functional change. 
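For illustration only: a minimal user-space sketch of the counter-ownership change described above, where the retry predicate updates the no-progress counter itself instead of every caller doing it. The constants, the made_progress flag and the function names below are stand-ins for this sketch, not the kernel's actual interfaces.

/* Sketch: the retry predicate owns the counter via a pointer. */
#include <stdbool.h>
#include <stdio.h>

#define MAX_RETRIES  16		/* stand-in for MAX_RECLAIM_RETRIES */
#define COSTLY_ORDER 3		/* stand-in for PAGE_ALLOC_COSTLY_ORDER */

static bool should_retry(unsigned int order, bool made_progress,
			 int *no_progress_loops)
{
	/*
	 * Progress on a costly order does not mean that order will become
	 * available, so only non-costly progress resets the counter.
	 */
	if (made_progress && order <= COSTLY_ORDER)
		*no_progress_loops = 0;
	else
		(*no_progress_loops)++;

	/* Converge to "give up" after too many fruitless loops. */
	return *no_progress_loops <= MAX_RETRIES;
}

int main(void)
{
	int loops = 0;

	/* Costly order: progress never resets the counter. */
	while (should_retry(4, true, &loops))
		;
	printf("costly order gave up after %d loops\n", loops);

	/* Non-costly order with progress: counter keeps resetting. */
	loops = 0;
	should_retry(1, true, &loops);
	printf("non-costly loops after progress: %d\n", loops);
	return 0;
}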
[hillf.zj@alibaba-inc.com: fix missing pointer dereferences] Link: http://lkml.kernel.org/r/20160926162025.21555-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 891e3881a6e0..bcfa647c1752 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3402,16 +3402,26 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) static inline bool should_reclaim_retry(gfp_t gfp_mask, unsigned order, struct alloc_context *ac, int alloc_flags, - bool did_some_progress, int no_progress_loops) + bool did_some_progress, int *no_progress_loops) { struct zone *zone; struct zoneref *z; + /* + * Costly allocations might have made a progress but this doesn't mean + * their order will become available due to high fragmentation so + * always increment the no progress counter for them + */ + if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) + *no_progress_loops = 0; + else + (*no_progress_loops)++; + /* * Make sure we converge to OOM if we cannot make any progress * several times in the row. */ - if (no_progress_loops > MAX_RECLAIM_RETRIES) + if (*no_progress_loops > MAX_RECLAIM_RETRIES) return false; /* @@ -3426,7 +3436,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, unsigned long reclaimable; available = reclaimable = zone_reclaimable_pages(zone); - available -= DIV_ROUND_UP(no_progress_loops * available, + available -= DIV_ROUND_UP((*no_progress_loops) * available, MAX_RECLAIM_RETRIES); available += zone_page_state_snapshot(zone, NR_FREE_PAGES); @@ -3642,18 +3652,8 @@ retry: if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) goto nopage; - /* - * Costly allocations might have made a progress but this doesn't mean - * their order will become available due to high fragmentation so - * always increment the no progress counter for them - */ - if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) - no_progress_loops = 0; - else - no_progress_loops++; - if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, - did_some_progress > 0, no_progress_loops)) + did_some_progress > 0, &no_progress_loops)) goto retry; /* From cc5c9f098fe48a8736add8a23c983524ca16cea5 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 17:00:43 -0700 Subject: [PATCH 077/127] mm, compaction: ignore fragindex from compaction_zonelist_suitable() The compaction_zonelist_suitable() function tries to determine if compaction will be able to proceed after sufficient reclaim, i.e. whether there are enough reclaimable pages to provide enough order-0 freepages for compaction. This addition of reclaimable pages to the free pages works well for the order-0 watermark check, but in the fragmentation index check we only consider truly free pages. Thus we can get fragindex value close to 0 which indicates failure do to lack of memory, and wrongly decide that compaction won't be suitable even after reclaim. Instead of trying to somehow adjust fragindex for reclaimable pages, let's just skip it from compaction_zonelist_suitable(). 
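For illustration only: a compact sketch of the split described above, a shared core watermark check plus a wrapper that applies the fragindex heuristic, so the OOM-decision path (which also counts reclaimable pages) never consults fragindex. The names, numbers and thresholds below are invented for the sketch and do not match the kernel's types or values.

#include <stdbool.h>
#include <stdio.h>

enum result { SKIPPED, CONTINUE };

/* Core check: order-0 watermark only, no fragmentation heuristic. */
static enum result core_suitable(long free_target, long watermark)
{
	return free_target >= watermark ? CONTINUE : SKIPPED;
}

/* Direct-compaction path: core check plus the fragindex heuristic. */
static enum result suitable(long free_pages, long watermark,
			    int fragindex, int threshold)
{
	enum result ret = core_suitable(free_pages, watermark);

	/* A fragindex near 0 means "failure is lack of memory, so
	 * compaction will not help"; treat that as a skip. */
	if (ret == CONTINUE && fragindex >= 0 && fragindex <= threshold)
		ret = SKIPPED;
	return ret;
}

/* OOM-decision path: add reclaimable pages, skip fragindex entirely. */
static bool zonelist_suitable(long free_pages, long reclaimable,
			      long watermark)
{
	return core_suitable(free_pages + reclaimable, watermark) == CONTINUE;
}

int main(void)
{
	/* Few truly free pages but plenty reclaimable: the zonelist path
	 * is no longer vetoed by a fragindex computed from free pages. */
	printf("direct: %d  zonelist: %d\n",
	       suitable(100, 1000, 10, 500),
	       zonelist_suitable(100, 2000, 1000));
	return 0;
}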
Link: http://lkml.kernel.org/r/20160926162025.21555-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 86d4d0bbfc7c..b918bdb28aed 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1379,7 +1379,6 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, int classzone_idx, unsigned long wmark_target) { - int fragindex; unsigned long watermark; if (is_via_compact_memory(order)) @@ -1415,6 +1414,18 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, ALLOC_CMA, wmark_target)) return COMPACT_SKIPPED; + return COMPACT_CONTINUE; +} + +enum compact_result compaction_suitable(struct zone *zone, int order, + unsigned int alloc_flags, + int classzone_idx) +{ + enum compact_result ret; + int fragindex; + + ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx, + zone_page_state(zone, NR_FREE_PAGES)); /* * fragmentation index determines if allocation failures are due to * low memory or external fragmentation @@ -1426,21 +1437,12 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * * Only compact if a failure would be due to fragmentation. */ - fragindex = fragmentation_index(zone, order); - if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) - return COMPACT_NOT_SUITABLE_ZONE; + if (ret == COMPACT_CONTINUE) { + fragindex = fragmentation_index(zone, order); + if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) + ret = COMPACT_NOT_SUITABLE_ZONE; + } - return COMPACT_CONTINUE; -} - -enum compact_result compaction_suitable(struct zone *zone, int order, - unsigned int alloc_flags, - int classzone_idx) -{ - enum compact_result ret; - - ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx, - zone_page_state(zone, NR_FREE_PAGES)); trace_mm_compaction_suitable(zone, order, ret); if (ret == COMPACT_NOT_SUITABLE_ZONE) ret = COMPACT_SKIPPED; @@ -1473,8 +1475,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, available += zone_page_state_snapshot(zone, NR_FREE_PAGES); compact_result = __compaction_suitable(zone, order, alloc_flags, ac_classzone_idx(ac), available); - if (compact_result != COMPACT_SKIPPED && - compact_result != COMPACT_NOT_SUITABLE_ZONE) + if (compact_result != COMPACT_SKIPPED) return true; } From 20311420282f3402888f1d9b8b80d924d491aadf Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 17:00:46 -0700 Subject: [PATCH 078/127] mm, compaction: restrict fragindex to costly orders Fragmentation index and the vm.extfrag_threshold sysctl is meant as a heuristic to prevent excessive compaction for costly orders (i.e. THP). It's unlikely to make any difference for non-costly orders, especially with the default threshold. But we cannot afford any uncertainty for the non-costly orders where the only alternative to successful reclaim/compaction is OOM. After the recent patches we are guaranteed maximum effort without heuristics from compaction before deciding OOM, and fragindex is the last remaining heuristic. Therefore skip fragindex altogether for non-costly orders. 
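The additional gate added by this patch can be sketched in a single helper: the heuristic above is consulted only for costly orders, so low-order allocations keep the full reclaim/compaction effort before OOM is considered. PAGE_ALLOC_COSTLY_ORDER is the kernel's constant (3); the helper name is made up.

    #define PAGE_ALLOC_COSTLY_ORDER 3

    /* Only apply the fragindex heuristic where giving up early is tolerable. */
    static bool fragindex_applies(unsigned int order)
    {
            return order > PAGE_ALLOC_COSTLY_ORDER;
    }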
Suggested-by: Michal Hocko Link: http://lkml.kernel.org/r/20160926162025.21555-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index b918bdb28aed..0409a4ad6ea1 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1435,9 +1435,14 @@ enum compact_result compaction_suitable(struct zone *zone, int order, * index towards 0 implies failure is due to lack of memory * index towards 1000 implies failure is due to fragmentation * - * Only compact if a failure would be due to fragmentation. + * Only compact if a failure would be due to fragmentation. Also + * ignore fragindex for non-costly orders where the alternative to + * a successful reclaim/compaction is OOM. Fragindex and the + * vm.extfrag_threshold sysctl is meant as a heuristic to prevent + * excessive compaction for costly orders, but it should not be at the + * expense of system stability. */ - if (ret == COMPACT_CONTINUE) { + if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) { fragindex = fragmentation_index(zone, order); if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) ret = COMPACT_NOT_SUITABLE_ZONE; From a104808e212a9ee97e6b9cb6945185e50905f009 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 7 Oct 2016 17:00:49 -0700 Subject: [PATCH 079/127] mm: don't emit warning from pagefault_out_of_memory() Commit c32b3cbe0d06 ("oom, PM: make OOM detection in the freezer path raceless") inserted a WARN_ON() into pagefault_out_of_memory() in order to warn when we raced with disabling the OOM killer. Now, patch "oom, suspend: fix oom_killer_disable vs. pm suspend properly" introduced a timeout for oom_killer_disable(). Even if we raced with disabling the OOM killer and the system is OOM livelocked, the OOM killer will be enabled eventually (in 20 seconds by default) and the OOM livelock will be solved. Therefore, we no longer need to warn when we raced with disabling the OOM killer. Link: http://lkml.kernel.org/r/1473442120-7246-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Cc: David Rientjes Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 0034baf35f0c..f284e92a71f0 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1069,16 +1069,6 @@ void pagefault_out_of_memory(void) if (!mutex_trylock(&oom_lock)) return; - - if (!out_of_memory(&oc)) { - /* - * There shouldn't be any user tasks runnable while the - * OOM killer is disabled, so the current task has to - * be a racing OOM victim for which oom_killer_disable() - * is waiting for. - */ - WARN_ON(test_thread_flag(TIF_MEMDIE)); - } - + out_of_memory(&oc); mutex_unlock(&oom_lock); } From cc30c5d6461a2813406f7f84d581643781922a82 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 7 Oct 2016 17:00:52 -0700 Subject: [PATCH 080/127] mm/page_io.c: replace some BUG_ON()s with VM_BUG_ON_PAGE() So they are CONFIG_DEBUG_VM-only and more informative. Cc: Al Viro Cc: David S. 
Miller Cc: Hugh Dickins Cc: Jens Axboe Cc: Joe Perches Cc: Mel Gorman Cc: Michal Hocko Cc: Peter Zijlstra Cc: Rik van Riel Cc: Santosh Shilimkar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_io.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index eafe5ddc2b54..a2651f58c86a 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -264,7 +264,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, int ret; struct swap_info_struct *sis = page_swap_info(page); - BUG_ON(!PageSwapCache(page)); + VM_BUG_ON_PAGE(!PageSwapCache(page), page); if (sis->flags & SWP_FILE) { struct kiocb kiocb; struct file *swap_file = sis->swap_file; @@ -338,7 +338,7 @@ int swap_readpage(struct page *page) int ret = 0; struct swap_info_struct *sis = page_swap_info(page); - BUG_ON(!PageSwapCache(page)); + VM_BUG_ON_PAGE(!PageSwapCache(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageUptodate(page), page); if (frontswap_load(page) == 0) { @@ -388,7 +388,8 @@ int swap_set_page_dirty(struct page *page) if (sis->flags & SWP_FILE) { struct address_space *mapping = sis->swap_file->f_mapping; - BUG_ON(!PageSwapCache(page)); + + VM_BUG_ON_PAGE(!PageSwapCache(page), page); return mapping->a_ops->set_page_dirty(page); } else { return __set_page_dirty_no_writeback(page); From 08ea8c07fb56d6eb8194d8ad408b469544bf2c29 Mon Sep 17 00:00:00 2001 From: Baoyou Xie Date: Fri, 7 Oct 2016 17:00:55 -0700 Subject: [PATCH 081/127] mm: move phys_mem_access_prot_allowed() declaration to pgtable.h We get 1 warning when building kernel with W=1: drivers/char/mem.c:220:12: warning: no previous prototype for 'phys_mem_access_prot_allowed' [-Wmissing-prototypes] int __weak phys_mem_access_prot_allowed(struct file *file, In fact, its declaration is spreading to several header files in different architecture, but need to be declare in common header file. So this patch moves phys_mem_access_prot_allowed() to pgtable.h. Link: http://lkml.kernel.org/r/1473751597-12139-1-git-send-email-baoyou.xie@linaro.org Signed-off-by: Baoyou Xie Acked-by: Thomas Gleixner Acked-by: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/include/asm/pgtable.h | 2 -- arch/x86/include/asm/pgtable_types.h | 2 -- include/asm-generic/pgtable.h | 3 +++ 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 70128d3f770a..9e9e94415d08 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -673,8 +673,6 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, struct file; pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot); -int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, - unsigned long size, pgprot_t *vma_prot); #endif /* diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index f1218f512f62..8b4de22d6429 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -439,8 +439,6 @@ extern pgprot_t pgprot_writethrough(pgprot_t prot); struct file; pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot); -int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, - unsigned long size, pgprot_t *vma_prot); /* Install a pte for a particular vaddr in kernel space. 
*/ void set_pte_vaddr(unsigned long vaddr, pte_t pte); diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index d4458b6dbfb4..c4f8fd2fd384 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -800,6 +800,9 @@ static inline int pmd_clear_huge(pmd_t *pmd) #endif #endif +struct file; +int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t *vma_prot); #endif /* !__ASSEMBLY__ */ #ifndef io_remap_pfn_range From 2d75807383459c04d457bf2d295fa6ad858507d2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 7 Oct 2016 17:00:58 -0700 Subject: [PATCH 082/127] mm: memcontrol: consolidate cgroup socket tracking The cgroup core and the memory controller need to track socket ownership for different purposes, but the tracking sites being entirely different is kind of ugly. Be a better citizen and rename the memory controller callbacks to match the cgroup core callbacks, then move them to the same place. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20160914194846.11153-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Tejun Heo Cc: "David S. Miller" Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++++-- mm/memcontrol.c | 23 +++++++++++++---------- net/core/sock.c | 6 +++--- net/ipv4/tcp.c | 2 -- net/ipv4/tcp_ipv4.c | 3 --- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0710143723bc..61d20c17f3b7 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -773,13 +773,13 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb, #endif /* CONFIG_CGROUP_WRITEBACK */ struct sock; -void sock_update_memcg(struct sock *sk); -void sock_release_memcg(struct sock *sk); bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); #ifdef CONFIG_MEMCG extern struct static_key_false memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) +void mem_cgroup_sk_alloc(struct sock *sk); +void mem_cgroup_sk_free(struct sock *sk); static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure) @@ -792,6 +792,8 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) } #else #define mem_cgroup_sockets_enabled 0 +static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; +static inline void mem_cgroup_sk_free(struct sock *sk) { }; static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { return false; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 60bb830abc34..ae052b5e3315 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2939,16 +2939,16 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) /* * The active flag needs to be written after the static_key * update. This is what guarantees that the socket activation - * function is the last one to run. See sock_update_memcg() for - * details, and note that we don't mark any socket as belonging - * to this memcg until that flag is up. + * function is the last one to run. See mem_cgroup_sk_alloc() + * for details, and note that we don't mark any socket as + * belonging to this memcg until that flag is up. 
* * We need to do this, because static_keys will span multiple * sites, but we can't control their order. If we mark a socket * as accounted, but the accounting functions are not patched in * yet, we'll lose accounting. * - * We never race with the readers in sock_update_memcg(), + * We never race with the readers in mem_cgroup_sk_alloc(), * because when this value change, the code to process it is not * patched in yet. */ @@ -5651,11 +5651,15 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); EXPORT_SYMBOL(memcg_sockets_enabled_key); -void sock_update_memcg(struct sock *sk) +void mem_cgroup_sk_alloc(struct sock *sk) { struct mem_cgroup *memcg; - /* Socket cloning can throw us here with sk_cgrp already + if (!mem_cgroup_sockets_enabled) + return; + + /* + * Socket cloning can throw us here with sk_memcg already * filled. It won't however, necessarily happen from * process context. So the test for root memcg given * the current task's memcg won't help us in this case. @@ -5680,12 +5684,11 @@ void sock_update_memcg(struct sock *sk) out: rcu_read_unlock(); } -EXPORT_SYMBOL(sock_update_memcg); -void sock_release_memcg(struct sock *sk) +void mem_cgroup_sk_free(struct sock *sk) { - WARN_ON(!sk->sk_memcg); - css_put(&sk->sk_memcg->css); + if (sk->sk_memcg) + css_put(&sk->sk_memcg->css); } /** diff --git a/net/core/sock.c b/net/core/sock.c index 038e660ef844..c73e28fc9c2a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1363,6 +1363,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) slab = prot->slab; cgroup_sk_free(&sk->sk_cgrp_data); + mem_cgroup_sk_free(sk); security_sk_free(sk); if (slab != NULL) kmem_cache_free(slab, sk); @@ -1399,6 +1400,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_net_set(sk, net); atomic_set(&sk->sk_wmem_alloc, 1); + mem_cgroup_sk_alloc(sk); cgroup_sk_alloc(&sk->sk_cgrp_data); sock_update_classid(&sk->sk_cgrp_data); sock_update_netprioidx(&sk->sk_cgrp_data); @@ -1545,6 +1547,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_incoming_cpu = raw_smp_processor_id(); atomic64_set(&newsk->sk_cookie, 0); + mem_cgroup_sk_alloc(newsk); cgroup_sk_alloc(&newsk->sk_cgrp_data); /* @@ -1569,9 +1572,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sk_set_socket(newsk, NULL); newsk->sk_wq = NULL; - if (mem_cgroup_sockets_enabled && sk->sk_memcg) - sock_update_memcg(newsk); - if (newsk->sk_prot->sockets_allocated) sk_sockets_allocated_inc(newsk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f253e5019d22..ab984d2ff88a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -424,8 +424,6 @@ void tcp_init_sock(struct sock *sk) sk->sk_rcvbuf = sysctl_tcp_rmem[1]; local_bh_disable(); - if (mem_cgroup_sockets_enabled) - sock_update_memcg(sk); sk_sockets_allocated_inc(sk); local_bh_enable(); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7ac37c314312..bd5e8d10893f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1871,9 +1871,6 @@ void tcp_v4_destroy_sock(struct sock *sk) local_bh_disable(); sk_sockets_allocated_dec(sk); local_bh_enable(); - - if (mem_cgroup_sockets_enabled && sk->sk_memcg) - sock_release_memcg(sk); } EXPORT_SYMBOL(tcp_v4_destroy_sock); From 19938e350adc60f3b9381ae6fc68da40f7d1a9f6 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 7 Oct 2016 17:01:01 -0700 Subject: [PATCH 083/127] mm/shmem.c: constify anon_ops Every other dentry_operations instance is 
const, and this one might as well be. Link: http://lkml.kernel.org/r/1473890528-7009-1-git-send-email-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 971fc83e6402..dee06310c9c8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4078,7 +4078,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); /* common code */ -static struct dentry_operations anon_ops = { +static const struct dentry_operations anon_ops = { .d_dname = simple_dname }; From 914a051654c5401cb216a939e214e17ec018b6a9 Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Fri, 7 Oct 2016 17:01:04 -0700 Subject: [PATCH 084/127] mm: nobootmem: move the comment of free_all_bootmem Commit b4def3509d18 ("mm, nobootmem: clean-up of free_low_memory_core_early()") removed the unnecessary nodeid argument, after that, this comment becomes more confused. We should move it to the right place. Fixes: b4def3509d18c1db9 ("mm, nobootmem: clean-up of free_low_memory_core_early()") Link: http://lkml.kernel.org/r/1473996082-14603-1-git-send-email-wanlong.gao@gmail.com Signed-off-by: Wanlong Gao Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nobootmem.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 490d46abddad..ba609b684d7a 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -137,6 +137,11 @@ static unsigned long __init free_low_memory_core_early(void) for_each_reserved_mem_region(i, &start, &end) reserve_bootmem_region(start, end); + /* + * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id + * because in some case like Node0 doesn't have RAM installed + * low ram will be on Node1 + */ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) count += __free_memory_core(start, end); @@ -194,11 +199,6 @@ unsigned long __init free_all_bootmem(void) reset_all_zones_managed_pages(); - /* - * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id - * because in some case like Node0 doesn't have RAM installed - * low ram will be on Node1 - */ pages = free_low_memory_core_early(); totalram_pages += pages; From 2247bb335ab9c40058484cac36ea74ee652f3b7b Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Fri, 7 Oct 2016 17:01:07 -0700 Subject: [PATCH 085/127] mm/hugetlb: fix memory offline with hugepage size > memory block size Patch series "mm/hugetlb: memory offline issues with hugepages", v4. This addresses several issues with hugepages and memory offline. While the first patch fixes a panic, and is therefore rather important, the last patch is just a performance optimization. The second patch fixes a theoretical issue with reserved hugepages, while still leaving some ugly usability issue, see description. This patch (of 3): dissolve_free_huge_pages() will either run into the VM_BUG_ON() or a list corruption and addressing exception when trying to set a memory block offline that is part (but not the first part) of a "gigantic" hugetlb page with a size > memory block size. When no other smaller hugetlb page sizes are present, the VM_BUG_ON() will trigger directly. In the other case we will run into an addressing exception later, because dissolve_free_huge_page() will not work on the head page of the compound hugetlb page which will result in a NULL hstate from page_hstate(). 
To fix this, first remove the VM_BUG_ON() because it is wrong, and then use the compound head page in dissolve_free_huge_page(). This means that an unused pre-allocated gigantic page that has any part of itself inside the memory block that is going offline will be dissolved completely. Losing an unused gigantic hugepage is preferable to failing the memory offline, for example in the situation where a (possibly faulty) memory DIMM needs to go offline. Fixes: c8721bbb ("mm: memory-hotplug: enable memory hotplug to handle hugepage") Link: http://lkml.kernel.org/r/20160926172811.94033-2-gerald.schaefer@de.ibm.com Signed-off-by: Gerald Schaefer Acked-by: Michal Hocko Acked-by: Naoya Horiguchi Cc: "Kirill A . Shutemov" Cc: Vlastimil Babka Cc: Mike Kravetz Cc: "Aneesh Kumar K . V" Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Rui Teng Cc: Dave Hansen Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 87e11d8ad536..603bdd01ec2c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1443,13 +1443,14 @@ static void dissolve_free_huge_page(struct page *page) { spin_lock(&hugetlb_lock); if (PageHuge(page) && !page_count(page)) { - struct hstate *h = page_hstate(page); - int nid = page_to_nid(page); - list_del(&page->lru); + struct page *head = compound_head(page); + struct hstate *h = page_hstate(head); + int nid = page_to_nid(head); + list_del(&head->lru); h->free_huge_pages--; h->free_huge_pages_node[nid]--; h->max_huge_pages--; - update_and_free_page(h, page); + update_and_free_page(h, head); } spin_unlock(&hugetlb_lock); } @@ -1457,7 +1458,8 @@ static void dissolve_free_huge_page(struct page *page) /* * Dissolve free hugepages in a given pfn range. Used by memory hotplug to * make specified memory blocks removable from the system. - * Note that start_pfn should aligned with (minimum) hugepage size. + * Note that this will dissolve a free gigantic hugepage completely, if any + * part of it lies within the given range. */ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) { @@ -1466,7 +1468,6 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) if (!hugepages_supported()) return; - VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order)); for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) dissolve_free_huge_page(pfn_to_page(pfn)); } From 082d5b6b60e9f25e1511557fcfcb21eedd267446 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Fri, 7 Oct 2016 17:01:10 -0700 Subject: [PATCH 086/127] mm/hugetlb: check for reserved hugepages during memory offline In dissolve_free_huge_pages(), free hugepages will be dissolved without making sure that there are enough of them left to satisfy hugepage reservations. Fix this by adding a return value to dissolve_free_huge_pages() and checking h->free_huge_pages vs. h->resv_huge_pages. Note that this may lead to the situation where dissolve_free_huge_page() returns an error and all free hugepages that were dissolved before that error are lost, while the memory block still cannot be set offline. Fixes: c8721bbb ("mm: memory-hotplug: enable memory hotplug to handle hugepage") Link: http://lkml.kernel.org/r/20160926172811.94033-3-gerald.schaefer@de.ibm.com Signed-off-by: Gerald Schaefer Acked-by: Michal Hocko Acked-by: Naoya Horiguchi Cc: "Kirill A . Shutemov" Cc: Vlastimil Babka Cc: Mike Kravetz Cc: "Aneesh Kumar K . 
V" Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Rui Teng Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 +++--- mm/hugetlb.c | 26 +++++++++++++++++++++----- mm/memory_hotplug.c | 4 +++- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c26d4638f665..fe99e6f956e2 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -450,8 +450,8 @@ static inline pgoff_t basepage_index(struct page *page) return __basepage_index(page); } -extern void dissolve_free_huge_pages(unsigned long start_pfn, - unsigned long end_pfn); +extern int dissolve_free_huge_pages(unsigned long start_pfn, + unsigned long end_pfn); static inline bool hugepage_migration_supported(struct hstate *h) { #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION @@ -518,7 +518,7 @@ static inline pgoff_t basepage_index(struct page *page) { return page->index; } -#define dissolve_free_huge_pages(s, e) do {} while (0) +#define dissolve_free_huge_pages(s, e) 0 #define hugepage_migration_supported(h) false static inline spinlock_t *huge_pte_lockptr(struct hstate *h, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 603bdd01ec2c..91ae1f567997 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1437,22 +1437,32 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, /* * Dissolve a given free hugepage into free buddy pages. This function does - * nothing for in-use (including surplus) hugepages. + * nothing for in-use (including surplus) hugepages. Returns -EBUSY if the + * number of free hugepages would be reduced below the number of reserved + * hugepages. */ -static void dissolve_free_huge_page(struct page *page) +static int dissolve_free_huge_page(struct page *page) { + int rc = 0; + spin_lock(&hugetlb_lock); if (PageHuge(page) && !page_count(page)) { struct page *head = compound_head(page); struct hstate *h = page_hstate(head); int nid = page_to_nid(head); + if (h->free_huge_pages - h->resv_huge_pages == 0) { + rc = -EBUSY; + goto out; + } list_del(&head->lru); h->free_huge_pages--; h->free_huge_pages_node[nid]--; h->max_huge_pages--; update_and_free_page(h, head); } +out: spin_unlock(&hugetlb_lock); + return rc; } /* @@ -1460,16 +1470,22 @@ static void dissolve_free_huge_page(struct page *page) * make specified memory blocks removable from the system. * Note that this will dissolve a free gigantic hugepage completely, if any * part of it lies within the given range. + * Also note that if dissolve_free_huge_page() returns with an error, all + * free hugepages that were dissolved before that error are lost. */ -void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) +int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; + int rc = 0; if (!hugepages_supported()) - return; + return rc; for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) - dissolve_free_huge_page(pfn_to_page(pfn)); + if (rc = dissolve_free_huge_page(pfn_to_page(pfn))) + break; + + return rc; } /* diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9d29ba0f7192..962927309b6e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1945,7 +1945,9 @@ repeat: * dissolve free hugepages in the memory block before doing offlining * actually in order to make hugetlbfs's object counting consistent. 
*/ - dissolve_free_huge_pages(start_pfn, end_pfn); + ret = dissolve_free_huge_pages(start_pfn, end_pfn); + if (ret) + goto failed_removal; /* check again */ offlined_pages = check_pages_isolated(start_pfn, end_pfn); if (offlined_pages < 0) { From eb03aa008561004257900983193d024e57abdd96 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Fri, 7 Oct 2016 17:01:13 -0700 Subject: [PATCH 087/127] mm/hugetlb: improve locking in dissolve_free_huge_pages() For every pfn aligned to minimum_order, dissolve_free_huge_pages() will call dissolve_free_huge_page() which takes the hugetlb spinlock, even if the page is not huge at all or a hugepage that is in-use. Improve this by doing the PageHuge() and page_count() checks already in dissolve_free_huge_pages() before calling dissolve_free_huge_page(). In dissolve_free_huge_page(), when holding the spinlock, those checks need to be revalidated. Link: http://lkml.kernel.org/r/20160926172811.94033-4-gerald.schaefer@de.ibm.com Signed-off-by: Gerald Schaefer Acked-by: Michal Hocko Acked-by: Naoya Horiguchi Cc: "Kirill A . Shutemov" Cc: Vlastimil Babka Cc: Mike Kravetz Cc: "Aneesh Kumar K . V" Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Rui Teng Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 91ae1f567997..770d83eb3f48 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1476,14 +1476,20 @@ out: int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; + struct page *page; int rc = 0; if (!hugepages_supported()) return rc; - for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) - if (rc = dissolve_free_huge_page(pfn_to_page(pfn))) - break; + for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) { + page = pfn_to_page(pfn); + if (PageHuge(page) && !page_count(page)) { + rc = dissolve_free_huge_page(page); + if (rc) + break; + } + } return rc; } From ac34dcd263a3afe9a2e4d58a2d93bb66d700ac7c Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Fri, 7 Oct 2016 17:01:16 -0700 Subject: [PATCH 088/127] mm/page_isolation: fix typo: "paes" -> "pages" Fix typo in comment. Link: http://lkml.kernel.org/r/1474788764-5774-1-git-send-email-ysxie@foxmail.com Signed-off-by: Yisheng Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 064b7fb6e0b5..a5594bfcc5ed 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -55,7 +55,7 @@ static int set_migratetype_isolate(struct page *page, ret = 0; /* - * immobile means "not-on-lru" paes. If immobile is larger than + * immobile means "not-on-lru" pages. If immobile is larger than * removable-by-driver pages reported by notifier, we'll fail. */ From 6213055f2c068b63078649457391ecea9b489ea3 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Fri, 7 Oct 2016 17:01:19 -0700 Subject: [PATCH 089/127] mm,ksm: add __GFP_HIGH to the allocation in alloc_stable_node() According to Hugh's suggestion, alloc_stable_node() with GFP_KERNEL can in rare cases cause a hung task warning. At present, if alloc_stable_node() allocation fails, two break_cows may want to allocate a couple of pages, and the issue will come up when free memory is under pressure. We fix it by adding __GFP_HIGH to GFP, to grant access to memory reserves, increasing the likelihood of allocation success. 
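For readers less familiar with GFP modifiers, the effect of the one-line change is roughly the following; the comments paraphrase the flag semantics rather than quote include/linux/gfp.h, and the snippet simply mirrors the hunk below.

    /*
     * GFP_KERNEL: the allocation may sleep and enter direct reclaim; under
     *             heavy memory pressure this is where it can stall long
     *             enough to trip the hung task detector.
     * __GFP_HIGH: marks the caller as high priority, allowing part of the
     *             emergency reserves to be used instead of waiting.
     */
    return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);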
[akpm@linux-foundation.org: tweak comment] Link: http://lkml.kernel.org/r/1474354484-58233-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Suggested-by: Hugh Dickins Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/ksm.c b/mm/ksm.c index 5048083b60f2..9ae6011a41f8 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -299,7 +299,12 @@ static inline void free_rmap_item(struct rmap_item *rmap_item) static inline struct stable_node *alloc_stable_node(void) { - return kmem_cache_alloc(stable_node_cache, GFP_KERNEL); + /* + * The allocation can take too long with GFP_KERNEL when memory is under + * pressure, which may lead to hung task warnings. Adding __GFP_HIGH + * grants access to memory reserves, helping to avoid this problem. + */ + return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH); } static inline void free_stable_node(struct stable_node *stable_node) From 6d2329f8872f23e46a19d240930571510ce525eb Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 7 Oct 2016 17:01:22 -0700 Subject: [PATCH 090/127] mm: vm_page_prot: update with WRITE_ONCE/READ_ONCE vma->vm_page_prot is read lockless from the rmap_walk, it may be updated concurrently and this prevents the risk of reading intermediate values. Link: http://lkml.kernel.org/r/1474660305-19222-1-git-send-email-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: Rik van Riel Cc: Hugh Dickins Cc: Mel Gorman Cc: Jan Vorlicek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/huge_memory.c | 2 +- mm/migrate.c | 2 +- mm/mmap.c | 16 +++++++++------- mm/mprotect.c | 2 +- 5 files changed, 13 insertions(+), 11 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 3e8807e0b9d2..040a04a88996 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1517,7 +1517,7 @@ static inline int pte_devmap(pte_t pte) } #endif -int vma_wants_writenotify(struct vm_area_struct *vma); +int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 12b9f1a39b63..cdcd25cb30fe 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1620,7 +1620,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (soft_dirty) entry = pte_swp_mksoft_dirty(entry); } else { - entry = mk_pte(page + i, vma->vm_page_prot); + entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); entry = maybe_mkwrite(entry, vma); if (!write) entry = pte_wrprotect(entry); diff --git a/mm/migrate.c b/mm/migrate.c index f7ee04a5ae27..99250aee1ac1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -234,7 +234,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, goto unlock; get_page(new); - pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); + pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); if (pte_swp_soft_dirty(*ptep)) pte = pte_mksoft_dirty(pte); diff --git a/mm/mmap.c b/mm/mmap.c index 7a0707a48047..b3b74cc705ae 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -116,13 +116,15 @@ static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) void vma_set_page_prot(struct vm_area_struct *vma) { unsigned long vm_flags = vma->vm_flags; + pgprot_t vm_page_prot; - vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); - if (vma_wants_writenotify(vma)) { + 
vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); + if (vma_wants_writenotify(vma, vm_page_prot)) { vm_flags &= ~VM_SHARED; - vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, - vm_flags); + vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags); } + /* remove_protection_ptes reads vma->vm_page_prot without mmap_sem */ + WRITE_ONCE(vma->vm_page_prot, vm_page_prot); } /* @@ -1386,7 +1388,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) * to the private version (using protection_map[] without the * VM_SHARED bit). */ -int vma_wants_writenotify(struct vm_area_struct *vma) +int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) { vm_flags_t vm_flags = vma->vm_flags; const struct vm_operations_struct *vm_ops = vma->vm_ops; @@ -1401,8 +1403,8 @@ int vma_wants_writenotify(struct vm_area_struct *vma) /* The open routine did something to the protections that pgprot_modify * won't preserve? */ - if (pgprot_val(vma->vm_page_prot) != - pgprot_val(vm_pgprot_modify(vma->vm_page_prot, vm_flags))) + if (pgprot_val(vm_page_prot) != + pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags))) return 0; /* Do we need to track softdirty? */ diff --git a/mm/mprotect.c b/mm/mprotect.c index a4830f0325fe..063bbed22c7b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -327,7 +327,7 @@ success: * held in write mode. */ vma->vm_flags = newflags; - dirty_accountable = vma_wants_writenotify(vma); + dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot); vma_set_page_prot(vma); change_protection(vma, start, end, vma->vm_page_prot, From fb8c41e9ad1f356b06b46a63ada10b7dce2a5d94 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 7 Oct 2016 17:01:25 -0700 Subject: [PATCH 091/127] mm: vma_adjust: remove superfluous confusing update in remove_next == 1 case mm->highest_vm_end doesn't need any update. After finally removing the oddness from vma_merge case 8 that was causing: 1) constant risk of trouble whenever anybody would check vma fields from rmap_walks, like it happened when page migration was introduced and it read the vma->vm_page_prot from a rmap_walk 2) the callers of vma_merge to re-initialize any value different from the current vma, instead of vma_merge() more reliably returning a vma that already matches all fields passed as parameter .. it is also worth to take the opportunity of cleaning up superfluous code in vma_adjust(), that if not removed adds up to the hard readability of the function. Link: http://lkml.kernel.org/r/1474492522-2261-5-git-send-email-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: Rik van Riel Cc: Hugh Dickins Cc: Mel Gorman Cc: Jan Vorlicek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index b3b74cc705ae..183694b80bcc 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -817,8 +817,28 @@ again: } else if (next) vma_gap_update(next); - else - mm->highest_vm_end = end; + else { + /* + * If remove_next == 2 we obviously can't + * reach this path. + * + * If remove_next == 3 we can't reach this + * path because pre-swap() next is always not + * NULL. pre-swap() "next" is not being + * removed and its next->vm_end is not altered + * (and furthermore "end" already matches + * next->vm_end in remove_next == 3). + * + * We reach this only in the remove_next == 1 + * case if the "next" vma that was removed was + * the highest vma of the mm. 
However in such + * case next->vm_end == "end" and the extended + * "vma" has vma->vm_end == next->vm_end so + * mm->highest_vm_end doesn't need any update + * in remove_next == 1 case. + */ + VM_WARN_ON(mm->highest_vm_end != end); + } } if (insert && file) uprobe_mmap(insert); From e86f15ee64d8ee46255d964d55f74f5ba9af8c36 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 7 Oct 2016 17:01:28 -0700 Subject: [PATCH 092/127] mm: vma_merge: fix vm_page_prot SMP race condition against rmap_walk The rmap_walk can access vm_page_prot (and potentially vm_flags in the pte/pmd manipulations). So it's not safe to wait the caller to update the vm_page_prot/vm_flags after vma_merge returned potentially removing the "next" vma and extending the "current" vma over the next->vm_start,vm_end range, but still with the "current" vma vm_page_prot, after releasing the rmap locks. The vm_page_prot/vm_flags must be transferred from the "next" vma to the current vma while vma_merge still holds the rmap locks. The side effect of this race condition is pte corruption during migrate as remove_migration_ptes when run on a address of the "next" vma that got removed, used the vm_page_prot of the current vma. migrate mprotect ------------ ------------- migrating in "next" vma vma_merge() # removes "next" vma and # extends "current" vma # current vma is not with # vm_page_prot updated remove_migration_ptes read vm_page_prot of current "vma" establish pte with wrong permissions vm_set_page_prot(vma) # too late! change_protection in the old vma range only, next range is not updated This caused segmentation faults and potentially memory corruption in heavy mprotect loads with some light page migration caused by compaction in the background. Hugh Dickins pointed out the comment about the Odd case 8 in vma_merge which confirms the case 8 is only buggy one where the race can trigger, in all other vma_merge cases the above cannot happen. This fix removes the oddness factor from case 8 and it converts it from: AAAA PPPPNNNNXXXX -> PPPPNNNNNNNN to: AAAA PPPPNNNNXXXX -> PPPPXXXXXXXX XXXX has the right vma properties for the whole merged vma returned by vma_adjust, so it solves the problem fully. It has the added benefits that the callers could stop updating vma properties when vma_merge succeeds however the callers are not updated by this patch (there are bits like VM_SOFTDIRTY that still need special care for the whole range, as the vma merging ignores them, but as long as they're not processed by rmap walks and instead they're accessed with the mmap_sem at least for reading, they are fine not to be updated within vma_adjust before releasing the rmap_locks). 
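The ordering matters because rmap walkers sample vma->vm_page_prot without holding mmap_sem, relying on the READ_ONCE/WRITE_ONCE discipline introduced earlier in the series. A self-contained userspace rendition of that discipline follows; the macros are simplified stand-ins for the kernel's (word-sized fields only, no barriers), and the field name is invented.

    #include <pthread.h>
    #include <stdio.h>

    /* Simplified stand-ins for the kernel's READ_ONCE/WRITE_ONCE. */
    #define WRITE_ONCE(x, val)      (*(volatile __typeof__(x) *)&(x) = (val))
    #define READ_ONCE(x)            (*(volatile __typeof__(x) *)&(x))

    static unsigned long page_prot;         /* stand-in for vma->vm_page_prot */

    static void *rmap_walker(void *arg)
    {
            /* a lockless reader: sample the field exactly once */
            unsigned long prot = READ_ONCE(page_prot);

            (void)arg;
            printf("observed prot=%#lx\n", prot);
            return NULL;
    }

    int main(void)
    {
            pthread_t t;

            pthread_create(&t, NULL, rmap_walker, NULL);
            WRITE_ONCE(page_prot, 0x25);    /* the vma_set_page_prot() side */
            pthread_join(t, NULL);
            return 0;
    }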
Link: http://lkml.kernel.org/r/1474309513-20313-1-git-send-email-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reported-by: Aditya Mandaleeka Cc: Rik van Riel Cc: Hugh Dickins Cc: Mel Gorman Cc: Jan Vorlicek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 10 ++- mm/mmap.c | 157 +++++++++++++++++++++++++++++++++++++-------- mm/mprotect.c | 1 + 3 files changed, 139 insertions(+), 29 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 040a04a88996..2c8ed8a894c8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1968,8 +1968,14 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); -extern int vma_adjust(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert); +extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, + struct vm_area_struct *expand); +static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) +{ + return __vma_adjust(vma, start, end, pgoff, insert, NULL); +} extern struct vm_area_struct *vma_merge(struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, diff --git a/mm/mmap.c b/mm/mmap.c index 183694b80bcc..e53637f8ac42 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -601,14 +601,24 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) mm->map_count++; } -static inline void -__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev) +static __always_inline void __vma_unlink_common(struct mm_struct *mm, + struct vm_area_struct *vma, + struct vm_area_struct *prev, + bool has_prev) { struct vm_area_struct *next; vma_rb_erase(vma, &mm->mm_rb); - prev->vm_next = next = vma->vm_next; + next = vma->vm_next; + if (has_prev) + prev->vm_next = next; + else { + prev = vma->vm_prev; + if (prev) + prev->vm_next = next; + else + mm->mmap = next; + } if (next) next->vm_prev = prev; @@ -616,6 +626,19 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, vmacache_invalidate(mm); } +static inline void __vma_unlink_prev(struct mm_struct *mm, + struct vm_area_struct *vma, + struct vm_area_struct *prev) +{ + __vma_unlink_common(mm, vma, prev, true); +} + +static inline void __vma_unlink(struct mm_struct *mm, + struct vm_area_struct *vma) +{ + __vma_unlink_common(mm, vma, NULL, false); +} + /* * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that * is already present in an i_mmap tree without adjusting the tree. @@ -623,11 +646,12 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, * are necessary. The "insert" vma (if any) is to be inserted * before we drop the necessary locks. 
*/ -int vma_adjust(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) +int __vma_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, + struct vm_area_struct *expand) { struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *next = vma->vm_next; + struct vm_area_struct *next = vma->vm_next, *orig_vma = vma; struct address_space *mapping = NULL; struct rb_root *root = NULL; struct anon_vma *anon_vma = NULL; @@ -643,9 +667,38 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, /* * vma expands, overlapping all the next, and * perhaps the one after too (mprotect case 6). + * The only two other cases that gets here are + * case 1, case 7 and case 8. */ - remove_next = 1 + (end > next->vm_end); - end = next->vm_end; + if (next == expand) { + /* + * The only case where we don't expand "vma" + * and we expand "next" instead is case 8. + */ + VM_WARN_ON(end != next->vm_end); + /* + * remove_next == 3 means we're + * removing "vma" and that to do so we + * swapped "vma" and "next". + */ + remove_next = 3; + VM_WARN_ON(file != next->vm_file); + swap(vma, next); + } else { + VM_WARN_ON(expand != vma); + /* + * case 1, 6, 7, remove_next == 2 is case 6, + * remove_next == 1 is case 1 or 7. + */ + remove_next = 1 + (end > next->vm_end); + VM_WARN_ON(remove_next == 2 && + end != next->vm_next->vm_end); + VM_WARN_ON(remove_next == 1 && + end != next->vm_end); + /* trim end to next, for case 6 first pass */ + end = next->vm_end; + } + exporter = next; importer = vma; @@ -664,6 +717,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, adjust_next = (end - next->vm_start) >> PAGE_SHIFT; exporter = next; importer = vma; + VM_WARN_ON(expand != importer); } else if (end < vma->vm_end) { /* * vma shrinks, and !insert tells it's not @@ -673,6 +727,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT); exporter = vma; importer = next; + VM_WARN_ON(expand != importer); } /* @@ -690,7 +745,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, } } again: - vma_adjust_trans_huge(vma, start, end, adjust_next); + vma_adjust_trans_huge(orig_vma, start, end, adjust_next); if (file) { mapping = file->f_mapping; @@ -716,8 +771,8 @@ again: if (!anon_vma && adjust_next) anon_vma = next->anon_vma; if (anon_vma) { - VM_BUG_ON_VMA(adjust_next && next->anon_vma && - anon_vma != next->anon_vma, next); + VM_WARN_ON(adjust_next && next->anon_vma && + anon_vma != next->anon_vma); anon_vma_lock_write(anon_vma); anon_vma_interval_tree_pre_update_vma(vma); if (adjust_next) @@ -757,7 +812,11 @@ again: * vma_merge has merged next into vma, and needs * us to remove next before dropping the locks. */ - __vma_unlink(mm, next, vma); + if (remove_next != 3) + __vma_unlink_prev(mm, next, vma); + else + /* vma is not before next if they've been swapped */ + __vma_unlink(mm, next); if (file) __remove_shared_vm_struct(next, file, mapping); } else if (insert) { @@ -809,7 +868,27 @@ again: * we must remove another next too. It would clutter * up the code too much to do both in one go. */ - next = vma->vm_next; + if (remove_next != 3) { + /* + * If "next" was removed and vma->vm_end was + * expanded (up) over it, in turn + * "next->vm_prev->vm_end" changed and the + * "vma->vm_next" gap must be updated. 
+ */ + next = vma->vm_next; + } else { + /* + * For the scope of the comment "next" and + * "vma" considered pre-swap(): if "vma" was + * removed, next->vm_start was expanded (down) + * over it and the "next" gap must be updated. + * Because of the swap() the post-swap() "vma" + * actually points to pre-swap() "next" + * (post-swap() "next" as opposed is now a + * dangling pointer). + */ + next = vma; + } if (remove_next == 2) { remove_next = 1; end = next->vm_end; @@ -958,13 +1037,24 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * cannot merge might become might become might become * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or - * mremap move: PPPPNNNNNNNN 8 + * mremap move: PPPPXXXXXXXX 8 * AAAA * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN * might become case 1 below case 2 below case 3 below * - * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX: - * mprotect_fixup updates vm_flags & vm_page_prot on successful return. + * It is important for case 8 that the the vma NNNN overlapping the + * region AAAA is never going to extended over XXXX. Instead XXXX must + * be extended in region AAAA and NNNN must be removed. This way in + * all cases where vma_merge succeeds, the moment vma_adjust drops the + * rmap_locks, the properties of the merged vma will be already + * correct for the whole merged range. Some of those properties like + * vm_page_prot/vm_flags may be accessed by rmap_walks and they must + * be correct for the whole merged range immediately after the + * rmap_locks are released. Otherwise if XXXX would be removed and + * NNNN would be extended over the XXXX range, remove_migration_ptes + * or other rmap walkers (if working on addresses beyond the "end" + * parameter) may establish ptes with the wrong permissions of NNNN + * instead of the right permissions of XXXX. */ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, @@ -989,9 +1079,14 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, else next = mm->mmap; area = next; - if (next && next->vm_end == end) /* cases 6, 7, 8 */ + if (area && area->vm_end == end) /* cases 6, 7, 8 */ next = next->vm_next; + /* verify some invariant that must be enforced by the caller */ + VM_WARN_ON(prev && addr <= prev->vm_start); + VM_WARN_ON(area && end > area->vm_end); + VM_WARN_ON(addr >= end); + /* * Can it merge with the predecessor? 
*/ @@ -1012,11 +1107,12 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ - err = vma_adjust(prev, prev->vm_start, - next->vm_end, prev->vm_pgoff, NULL); + err = __vma_adjust(prev, prev->vm_start, + next->vm_end, prev->vm_pgoff, NULL, + prev); } else /* cases 2, 5, 7 */ - err = vma_adjust(prev, prev->vm_start, - end, prev->vm_pgoff, NULL); + err = __vma_adjust(prev, prev->vm_start, + end, prev->vm_pgoff, NULL, prev); if (err) return NULL; khugepaged_enter_vma_merge(prev, vm_flags); @@ -1032,11 +1128,18 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, anon_vma, file, pgoff+pglen, vm_userfaultfd_ctx)) { if (prev && addr < prev->vm_end) /* case 4 */ - err = vma_adjust(prev, prev->vm_start, - addr, prev->vm_pgoff, NULL); - else /* cases 3, 8 */ - err = vma_adjust(area, addr, next->vm_end, - next->vm_pgoff - pglen, NULL); + err = __vma_adjust(prev, prev->vm_start, + addr, prev->vm_pgoff, NULL, next); + else { /* cases 3, 8 */ + err = __vma_adjust(area, addr, next->vm_end, + next->vm_pgoff - pglen, NULL, next); + /* + * In case 3 area is already equal to next and + * this is a noop, but in case 8 "area" has + * been removed and next was expanded over it. + */ + area = next; + } if (err) return NULL; khugepaged_enter_vma_merge(area, vm_flags); diff --git a/mm/mprotect.c b/mm/mprotect.c index 063bbed22c7b..ec91dfd3f900 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -304,6 +304,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, vma->vm_userfaultfd_ctx); if (*pprev) { vma = *pprev; + VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY); goto success; } From 97a42cd4398162aba77da55b568d85e5ec6b7705 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 7 Oct 2016 17:01:31 -0700 Subject: [PATCH 093/127] mm: vma_adjust: remove superfluous check for next not NULL If next would be NULL we couldn't reach such code path. Link: http://lkml.kernel.org/r/1474309513-20313-2-git-send-email-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: Rik van Riel Cc: Hugh Dickins Cc: Mel Gorman Cc: Jan Vorlicek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index e53637f8ac42..aa29d43130b3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -706,7 +706,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * If next doesn't have anon_vma, import from vma after * next, if the vma overlaps with it. */ - if (remove_next == 2 && next && !next->anon_vma) + if (remove_next == 2 && !next->anon_vma) exporter = next->vm_next; } else if (end > next->vm_start) { From 86d12e471d9f152217744f2054e63e3742949879 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 7 Oct 2016 17:01:34 -0700 Subject: [PATCH 094/127] mm: vma_adjust: minor comment correction The cases are three not two. Link: http://lkml.kernel.org/r/1474492522-2261-3-git-send-email-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: Rik van Riel Cc: Hugh Dickins Cc: Mel Gorman Cc: Jan Vorlicek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index aa29d43130b3..4dc65be4766f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -667,7 +667,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, /* * vma expands, overlapping all the next, and * perhaps the one after too (mprotect case 6). 
- * The only two other cases that gets here are + * The only other cases that gets here are * case 1, case 7 and case 8. */ if (next == expand) { From 8f26e0b176f3484c49d55d88fe6083a9cf9ff443 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 7 Oct 2016 17:01:37 -0700 Subject: [PATCH 095/127] mm: vma_merge: correct false positive from __vma_unlink->validate_mm_rb The old code was always doing: vma->vm_end = next->vm_end vma_rb_erase(next) // in __vma_unlink vma->vm_next = next->vm_next // in __vma_unlink next = vma->vm_next vma_gap_update(next) The new code still does the above for remove_next == 1 and 2, but for remove_next == 3 it has been changed and it does: next->vm_start = vma->vm_start vma_rb_erase(vma) // in __vma_unlink vma_gap_update(next) In the latter case, while unlinking "vma", validate_mm_rb() is told to ignore "vma" that is being removed, but next->vm_start was reduced instead. So for the new case, to avoid the false positive from validate_mm_rb, it should be "next" that is ignored when "vma" is being unlinked. "vma" and "next" in the above comment, considered pre-swap(). Link: http://lkml.kernel.org/r/1474492522-2261-4-git-send-email-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Tested-by: Shaun Tancheff Cc: Rik van Riel Cc: Hugh Dickins Cc: Mel Gorman Cc: Jan Vorlicek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 59 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 18 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 4dc65be4766f..1af87c14183d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -402,7 +402,32 @@ static inline void vma_rb_insert(struct vm_area_struct *vma, rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); } -static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) +static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) +{ + /* + * Note rb_erase_augmented is a fairly large inline function, + * so make sure we instantiate it only once with our desired + * augmented rbtree callbacks. + */ + rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); +} + +static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma, + struct rb_root *root, + struct vm_area_struct *ignore) +{ + /* + * All rb_subtree_gap values must be consistent prior to erase, + * with the possible exception of the "next" vma being erased if + * next->vm_start was reduced. + */ + validate_mm_rb(root, ignore); + + __vma_rb_erase(vma, root); +} + +static __always_inline void vma_rb_erase(struct vm_area_struct *vma, + struct rb_root *root) { /* * All rb_subtree_gap values must be consistent prior to erase, @@ -410,12 +435,7 @@ static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) */ validate_mm_rb(root, vma); - /* - * Note rb_erase_augmented is a fairly large inline function, - * so make sure we instantiate it only once with our desired - * augmented rbtree callbacks. 
- */ - rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); + __vma_rb_erase(vma, root); } /* @@ -604,11 +624,12 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) static __always_inline void __vma_unlink_common(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, - bool has_prev) + bool has_prev, + struct vm_area_struct *ignore) { struct vm_area_struct *next; - vma_rb_erase(vma, &mm->mm_rb); + vma_rb_erase_ignore(vma, &mm->mm_rb, ignore); next = vma->vm_next; if (has_prev) prev->vm_next = next; @@ -630,13 +651,7 @@ static inline void __vma_unlink_prev(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev) { - __vma_unlink_common(mm, vma, prev, true); -} - -static inline void __vma_unlink(struct mm_struct *mm, - struct vm_area_struct *vma) -{ - __vma_unlink_common(mm, vma, NULL, false); + __vma_unlink_common(mm, vma, prev, true, vma); } /* @@ -815,8 +830,16 @@ again: if (remove_next != 3) __vma_unlink_prev(mm, next, vma); else - /* vma is not before next if they've been swapped */ - __vma_unlink(mm, next); + /* + * vma is not before next if they've been + * swapped. + * + * pre-swap() next->vm_start was reduced so + * tell validate_mm_rb to ignore pre-swap() + * "next" (which is stored in post-swap() + * "vma"). + */ + __vma_unlink_common(mm, next, NULL, false, vma); if (file) __remove_shared_vm_struct(next, file, mapping); } else if (insert) { From 9996f05eac09815121bb718249f21914a667791f Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 7 Oct 2016 17:01:40 -0700 Subject: [PATCH 096/127] mm: clarify why we avoid page_mapcount() for slab pages in dump_page() Let's add comment on why we skip page_mapcount() for sl[aou]b pages. Link: http://lkml.kernel.org/r/20160922105532.GB24593@node Signed-off-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/debug.c b/mm/debug.c index 74c7cae4f683..9feb699c5d25 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -42,6 +42,11 @@ const struct trace_print_flags vmaflag_names[] = { void __dump_page(struct page *page, const char *reason) { + /* + * Avoid VM_BUG_ON() in page_mapcount(). + * page->_mapcount space in struct page is used by sl[aou]b pages to + * encode own info. + */ int mapcount = PageSlab(page) ? 0 : page_mapcount(page); pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", From 82e7d3abec86cba9df945a765bba384f8ac113a7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 17:01:43 -0700 Subject: [PATCH 097/127] oom: print nodemask in the oom report We have received a hard to explain oom report from a customer. The oom triggered regardless there is a lot of free memory: PoolThread invoked oom-killer: gfp_mask=0x280da, order=0, oom_adj=0, oom_score_adj=0 PoolThread cpuset=/ mems_allowed=0-7 Pid: 30055, comm: PoolThread Tainted: G E X 3.0.101-80-default #1 Call Trace: dump_trace+0x75/0x300 dump_stack+0x69/0x6f dump_header+0x8e/0x110 oom_kill_process+0xa6/0x350 out_of_memory+0x2b7/0x310 __alloc_pages_slowpath+0x7dd/0x820 __alloc_pages_nodemask+0x1e9/0x200 alloc_pages_vma+0xe1/0x290 do_anonymous_page+0x13e/0x300 do_page_fault+0x1fd/0x4c0 page_fault+0x25/0x30 [...] 
active_anon:1135959151 inactive_anon:1051962 isolated_anon:0 active_file:13093 inactive_file:222506 isolated_file:0 unevictable:262144 dirty:2 writeback:0 unstable:0 free:432672819 slab_reclaimable:7917 slab_unreclaimable:95308 mapped:261139 shmem:166297 pagetables:2228282 bounce:0 [...] Node 0 DMA free:15896kB min:0kB low:0kB high:0kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15672kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes lowmem_reserve[]: 0 2892 775542 775542 Node 0 DMA32 free:2783784kB min:28kB low:32kB high:40kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:2961572kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes lowmem_reserve[]: 0 0 772650 772650 Node 0 Normal free:8120kB min:8160kB low:10200kB high:12240kB active_anon:779334960kB inactive_anon:2198744kB active_file:0kB inactive_file:180kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:791193600kB mlocked:131072kB dirty:0kB writeback:0kB mapped:372940kB shmem:361480kB slab_reclaimable:4536kB slab_unreclaimable:68472kB kernel_stack:10104kB pagetables:1414820kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:2280 all_unreclaimable? yes lowmem_reserve[]: 0 0 0 0 Node 1 Normal free:476718144kB min:8192kB low:10240kB high:12288kB active_anon:307623696kB inactive_anon:283620kB active_file:10392kB inactive_file:69908kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:794296320kB mlocked:131072kB dirty:4kB writeback:0kB mapped:257208kB shmem:189896kB slab_reclaimable:3868kB slab_unreclaimable:44756kB kernel_stack:1848kB pagetables:1369432kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 Node 2 Normal free:386002452kB min:8192kB low:10240kB high:12288kB active_anon:398563752kB inactive_anon:68184kB active_file:10292kB inactive_file:29936kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:794296320kB mlocked:131072kB dirty:0kB writeback:0kB mapped:32084kB shmem:776kB slab_reclaimable:6888kB slab_unreclaimable:60056kB kernel_stack:8208kB pagetables:1282880kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 Node 3 Normal free:196406760kB min:8192kB low:10240kB high:12288kB active_anon:587445640kB inactive_anon:164396kB active_file:5716kB inactive_file:709844kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:794296320kB mlocked:131072kB dirty:0kB writeback:0kB mapped:291776kB shmem:111416kB slab_reclaimable:5152kB slab_unreclaimable:44516kB kernel_stack:2168kB pagetables:1455956kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? 
no lowmem_reserve[]: 0 0 0 0 Node 4 Normal free:425338880kB min:8192kB low:10240kB high:12288kB active_anon:359695204kB inactive_anon:43216kB active_file:5748kB inactive_file:14772kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:794296320kB mlocked:131072kB dirty:0kB writeback:0kB mapped:24708kB shmem:1120kB slab_reclaimable:1884kB slab_unreclaimable:41060kB kernel_stack:1856kB pagetables:1100208kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 Node 5 Normal free:11140kB min:8192kB low:10240kB high:12288kB active_anon:784240872kB inactive_anon:1217164kB active_file:28kB inactive_file:48kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:794296320kB mlocked:131072kB dirty:0kB writeback:0kB mapped:11408kB shmem:0kB slab_reclaimable:2008kB slab_unreclaimable:49220kB kernel_stack:1360kB pagetables:531600kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:1202 all_unreclaimable? yes lowmem_reserve[]: 0 0 0 0 Node 6 Normal free:243395332kB min:8192kB low:10240kB high:12288kB active_anon:542015544kB inactive_anon:40208kB active_file:968kB inactive_file:8484kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:794296320kB mlocked:131072kB dirty:0kB writeback:0kB mapped:19992kB shmem:496kB slab_reclaimable:1672kB slab_unreclaimable:37052kB kernel_stack:2088kB pagetables:750264kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 Node 7 Normal free:10768kB min:8192kB low:10240kB high:12288kB active_anon:784916936kB inactive_anon:192316kB active_file:19228kB inactive_file:56852kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:794296320kB mlocked:131072kB dirty:4kB writeback:0kB mapped:34440kB shmem:4kB slab_reclaimable:5660kB slab_unreclaimable:36100kB kernel_stack:1328kB pagetables:1007968kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 So all nodes but Node 0 have a lot of free memory, which should suggest that there is available memory, especially when mems_allowed=0-7. One could speculate that a massive process has managed to terminate and free up a lot of memory while racing with the above allocation request. Although this is highly unlikely, it cannot be ruled out. Further debugging, however, showed that the faulting process had a mempolicy (not cpuset) to bind to Node 0. We cannot see that information from the report though. mems_allowed turned out to be more confusing than really helpful. Fix this by always printing the nodemask. It is either the mempolicy mask (and non-null) or the one defined by the cpusets. The new output for the above oom report would be PoolThread invoked oom-killer: gfp_mask=0x280da(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), nodemask=0, order=0, oom_adj=0, oom_score_adj=0 This patch doesn't touch show_mem and the node filtering based on the cpuset node mask because mempolicy is always a subset of cpusets and seeing the full cpuset oom context might be helpful for tuning more specific mempolicies inside cpusets (e.g. when they turn out to be too restrictive). To prevent ugly ifdefs the mask is printed even for !NUMA configurations, but this should be OK (a single node will be printed).
Link: http://lkml.kernel.org/r/20160930214146.28600-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Sellami Abdelkader Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Sellami Abdelkader Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f284e92a71f0..ec9f11d4f094 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -403,8 +403,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) static void dump_header(struct oom_control *oc, struct task_struct *p) { - pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", - current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, + nodemask_t *nm = (oc->nodemask) ? oc->nodemask : &cpuset_current_mems_allowed; + + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n", + current->comm, oc->gfp_mask, &oc->gfp_mask, + nodemask_pr_args(nm), oc->order, current->signal->oom_score_adj); if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) pr_warn("COMPACTION is disabled!!!\n"); From 461a7184320a1b4d2c12ad538354062fef4ee0f1 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Fri, 7 Oct 2016 17:01:46 -0700 Subject: [PATCH 098/127] mm/hugetlb: introduce ARCH_HAS_GIGANTIC_PAGE Avoid making ifdef get pretty unwieldy if many ARCHs support gigantic page. No functional change with this patch. Link: http://lkml.kernel.org/r/1475227569-63446-2-git-send-email-xieyisheng1@huawei.com Signed-off-by: Yisheng Xie Suggested-by: Michal Hocko Acked-by: Michal Hocko Acked-by: Naoya Horiguchi Acked-by: Hillf Danton Cc: Hanjun Guo Cc: Will Deacon Cc: Dave Hansen Cc: Sudeep Holla Cc: Catalin Marinas Cc: Mark Rutland Cc: Rob Herring Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/Kconfig | 1 + arch/x86/Kconfig | 1 + fs/Kconfig | 3 +++ mm/hugetlb.c | 2 +- 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index deeadfa291ba..fb538031569a 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -71,6 +71,7 @@ config S390 select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_KCOV select ARCH_HAS_SG_CHAIN select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 58bec8f9641a..827273c56ae9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -28,6 +28,7 @@ config X86 select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_GIGANTIC_PAGE if X86_64 select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_MMIO_FLUSH diff --git a/fs/Kconfig b/fs/Kconfig index 3ef62bad8f2b..4bd03a2b0518 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -200,6 +200,9 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS +config ARCH_HAS_GIGANTIC_PAGE + bool + source "fs/configfs/Kconfig" source "fs/efivarfs/Kconfig" diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 770d83eb3f48..e4a4500758f2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1022,7 +1022,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -#if (defined(CONFIG_X86_64) || defined(CONFIG_S390)) && \ +#if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && \ ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \ 
defined(CONFIG_CMA)) static void destroy_compound_gigantic_page(struct page *page, From 14f099107aacea5ffa77d4fd3be52671e3925b93 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Fri, 7 Oct 2016 17:01:49 -0700 Subject: [PATCH 099/127] arm64 Kconfig: select gigantic page Arm64 has supported gigantic pages since commit 084bd29810a5 ("ARM64: mm: HugeTLB support."); however, they can only be allocated at boot time and can't be freed. This patch selects ARCH_HAS_GIGANTIC_PAGE so that gigantic pages can be allocated and freed at runtime on arm64. Link: http://lkml.kernel.org/r/1475227569-63446-3-git-send-email-xieyisheng1@huawei.com Signed-off-by: Yisheng Xie Acked-by: Michal Hocko Acked-by: Catalin Marinas Acked-by: Hillf Danton Cc: Hanjun Guo Cc: Will Deacon Cc: Dave Hansen Cc: Sudeep Holla Cc: Mark Rutland Cc: Rob Herring Cc: Mike Kravetz Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 01600401a53e..c068aa689563 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -11,6 +11,7 @@ config ARM64 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_KCOV select ARCH_HAS_SG_CHAIN select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST From c2a9737f45e27d8263ff9643f994bda9bac0b944 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Fri, 7 Oct 2016 17:01:52 -0700 Subject: [PATCH 100/127] vfs,mm: fix a dead loop in truncate_inode_pages_range() We triggered a dead loop in truncate_inode_pages_range() on a 32-bit architecture with the test case below: ... fd = open(); write(fd, buf, 4096); preadv64(fd, &iovec, 1, 0xffffffff000); ftruncate(fd, 0); ... Then ftruncate() will never return. The filesystem used in this case is ubifs, but it can be triggered on many other filesystems. When preadv64() is called with offset=0xffffffff000, a page with index=0xffffffff will be added to the radix tree of ->mapping. Then this page can be found in ->mapping with pagevec_lookup(). After that, truncate_inode_pages_range(), which is called in ftruncate(), will fall into an infinite loop: - find a page with index=0xffffffff; since index>=end, this page won't be truncated - index++, and index becomes 0 - the page with index=0xffffffff will be found again The data type of index is unsigned long, so index won't overflow to 0 on a 64-bit architecture in this case, and the dead loop won't happen. Since truncate_inode_pages_range() is executed while holding inode->i_rwsem, any operation related to this lock will be blocked, and a hung task will happen, e.g.: INFO: task truncate_test:3364 blocked for more than 120 seconds. ... call_rwsem_down_write_failed+0x17/0x30 generic_file_write_iter+0x32/0x1c0 ubifs_write_iter+0xcc/0x170 __vfs_write+0xc4/0x120 vfs_write+0xb2/0x1b0 SyS_write+0x46/0xa0 The page with index=0xffffffff added to ->mapping is useless. Fix this by checking the read position before allocating pages.
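[Editorial illustration, not part of the patch: the wrap-around that drives the loop can be reproduced with a few lines of C, using a 32-bit unsigned integer to stand in for pgoff_t on a 32-bit kernel.]

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	/* stands in for pgoff_t on a 32-bit kernel, holding page index 0xffffffff */
	uint32_t index = UINT32_MAX;

	index++;	/* wraps to 0, so the truncate loop rescans the same page forever */
	printf("index after increment: %" PRIu32 "\n", index);	/* prints 0 */
	return 0;
}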
Link: http://lkml.kernel.org/r/1475151010-40166-1-git-send-email-fangwei1@huawei.com Signed-off-by: Wei Fang Cc: Christoph Hellwig Cc: Dave Chinner Cc: Al Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/filemap.c b/mm/filemap.c index 1b05f75aea0f..2f7b7783bd6b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1687,6 +1687,10 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos, unsigned int prev_offset; int error = 0; + if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) + return -EINVAL; + iov_iter_truncate(iter, inode->i_sb->s_maxbytes); + index = *ppos >> PAGE_SHIFT; prev_index = ra->prev_pos >> PAGE_SHIFT; prev_offset = ra->prev_pos & (PAGE_SIZE-1); From 7877cdcc3893c1bd9a833b2f0398e7320794c6e6 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 17:01:55 -0700 Subject: [PATCH 101/127] mm: consolidate warn_alloc_failed users warn_alloc_failed is currently used from the page and vmalloc allocators. This is a good reuse of the code except that vmalloc would appreciate a slightly different warning message. This is already handled by the fmt parameter except that "%s: page allocation failure: order:%u, mode:%#x(%pGg)" is printed anyway. This might be quite misleading because it might be a vmalloc failure which leads to the warning while the page allocator is not the culprit here. Fix this by always using the fmt string and only print the context that makes sense for the particular context (e.g. order makes only very little sense for the vmalloc context). Rename the function to not miss any user and also because a later patch will reuse it also for !failure cases. Link: http://lkml.kernel.org/r/20160929084407.7004-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Tetsuo Handa Cc: Johannes Weiner Cc: Mel Gorman Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 5 ++--- mm/page_alloc.c | 27 ++++++++++++--------------- mm/vmalloc.c | 14 ++++++-------- 3 files changed, 20 insertions(+), 26 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 2c8ed8a894c8..f7231411ad5a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1916,9 +1916,8 @@ extern void si_meminfo_node(struct sysinfo *val, int nid); extern unsigned long arch_reserved_kernel_pages(void); #endif -extern __printf(3, 4) -void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, - const char *fmt, ...); +extern __printf(2, 3) +void warn_alloc(gfp_t gfp_mask, const char *fmt, ...); extern void setup_per_cpu_pageset(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bcfa647c1752..5ab2e30a1006 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2979,9 +2979,11 @@ static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); -void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) +void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) { unsigned int filter = SHOW_MEM_FILTER_NODES; + struct va_format vaf; + va_list args; if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || debug_guardpage_minorder() > 0) @@ -2999,22 +3001,16 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) 
if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; - if (fmt) { - struct va_format vaf; - va_list args; + pr_warn("%s: ", current->comm); - va_start(args, fmt); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_cont("%pV", &vaf); + va_end(args); - vaf.fmt = fmt; - vaf.va = &args; + pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask); - pr_warn("%pV", &vaf); - - va_end(args); - } - - pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n", - current->comm, order, gfp_mask, &gfp_mask); dump_stack(); if (!should_suppress_show_mem()) show_mem(filter); @@ -3680,7 +3676,8 @@ retry: } nopage: - warn_alloc_failed(gfp_mask, order, NULL); + warn_alloc(gfp_mask, + "page allocation failure: order:%u", order); got_pg: return page; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 80660a0f989b..f2481cb4e6b2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1601,7 +1601,6 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node) { - const int order = 0; struct page **pages; unsigned int nr_pages, array_size, i; const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; @@ -1629,9 +1628,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page *page; if (node == NUMA_NO_NODE) - page = alloc_pages(alloc_mask, order); + page = alloc_page(alloc_mask); else - page = alloc_pages_node(node, alloc_mask, order); + page = alloc_pages_node(node, alloc_mask, 0); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ @@ -1648,8 +1647,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return area->addr; fail: - warn_alloc_failed(gfp_mask, order, - "vmalloc: allocation failure, allocated %ld of %ld bytes\n", + warn_alloc(gfp_mask, + "vmalloc: allocation failure, allocated %ld of %ld bytes", (area->nr_pages*PAGE_SIZE), area->size); vfree(area->addr); return NULL; @@ -1710,9 +1709,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, return addr; fail: - warn_alloc_failed(gfp_mask, 0, - "vmalloc: allocation failure: %lu bytes\n", - real_size); + warn_alloc(gfp_mask, + "vmalloc: allocation failure: %lu bytes", real_size); return NULL; } From 63f53dea0c9866e93802d50a230c460a024c44e5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 17:01:58 -0700 Subject: [PATCH 102/127] mm: warn about allocations which stall for too long Currently we do warn only about allocation failures but small allocations are basically nofail and they might loop in the page allocator for a long time. Especially when the reclaim cannot make any progress - e.g. GFP_NOFS cannot invoke the oom killer and rely on a different context to make a forward progress in case there is a lot memory used by filesystems. Give us at least a clue when something like this happens and warn about allocations which take more than 10s. Print the basic allocation context information along with the cumulative time spent in the allocation as well as the allocation stack. Repeat the warning after every 10 seconds so that we know that the problem is permanent rather than ephemeral. 
Link: http://lkml.kernel.org/r/20160929084407.7004-3-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Vlastimil Babka Cc: Tetsuo Handa Cc: Johannes Weiner Cc: Mel Gorman Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5ab2e30a1006..ca423cc20b59 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3493,6 +3493,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, enum compact_result compact_result; int compaction_retries = 0; int no_progress_loops = 0; + unsigned long alloc_start = jiffies; + unsigned int stall_timeout = 10 * HZ; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -3648,6 +3650,14 @@ retry: if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) goto nopage; + /* Make sure we know about allocations which stall for too long */ + if (time_after(jiffies, alloc_start + stall_timeout)) { + warn_alloc(gfp_mask, + "page allocation stalls for %ums, order:%u\n", + jiffies_to_msecs(jiffies-alloc_start), order); + stall_timeout += 10 * HZ; + } + if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, did_some_progress > 0, &no_progress_loops)) goto retry; From 72e2936c04f7d2a4bf87d7f72d3bf11cf91ebb47 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Fri, 7 Oct 2016 17:02:01 -0700 Subject: [PATCH 103/127] mm: remove unnecessary condition in remove_inode_hugepages When the huge page is added to the page cache (huge_add_to_page_cache), the page private flag will be cleared. Since this code (remove_inode_hugepages) will only be called for pages in the page cache, PagePrivate(page) will always be false. The patch removes the code without any functional change. Link: http://lkml.kernel.org/r/1475113323-29368-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Reviewed-by: Naoya Horiguchi Reviewed-by: Mike Kravetz Tested-by: Mike Kravetz Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 12 +++++------- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 4 ++-- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 4ea71eba40a5..7337cac29e9e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -416,7 +416,6 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, for (i = 0; i < pagevec_count(&pvec); ++i) { struct page *page = pvec.pages[i]; - bool rsv_on_error; u32 hash; /* @@ -458,18 +457,17 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, * cache (remove_huge_page) BEFORE removing the * region/reserve map (hugetlb_unreserve_pages). In * rare out of memory conditions, removal of the - * region/reserve map could fail. Before free'ing - * the page, note PagePrivate which is used in case - * of error. + * region/reserve map could fail. Correspondingly, + * the subpool and global reserve usage count can need + * to be adjusted.
*/ - rsv_on_error = !PagePrivate(page); + VM_BUG_ON(PagePrivate(page)); remove_huge_page(page); freed++; if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, next, next + 1, 1))) - hugetlb_fix_reserve_counts(inode, - rsv_on_error); + hugetlb_fix_reserve_counts(inode); } unlock_page(page); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index fe99e6f956e2..48c76d612d40 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -90,7 +90,7 @@ int dequeue_hwpoisoned_huge_page(struct page *page); bool isolate_huge_page(struct page *page, struct list_head *list); void putback_active_hugepage(struct page *page); void free_huge_page(struct page *page); -void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve); +void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e4a4500758f2..ec49d9ef1eef 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -567,13 +567,13 @@ retry: * appear as a "reserved" entry instead of simply dangling with incorrect * counts. */ -void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve) +void hugetlb_fix_reserve_counts(struct inode *inode) { struct hugepage_subpool *spool = subpool_inode(inode); long rsv_adjust; rsv_adjust = hugepage_subpool_get_pages(spool, 1); - if (restore_reserve && rsv_adjust) { + if (rsv_adjust) { struct hstate *h = hstate_inode(inode); hugetlb_acct_memory(h, 1); From 1061b0d21e16550e7d7893a5deee2e49ea3990ad Mon Sep 17 00:00:00 2001 From: zijun_hu Date: Fri, 7 Oct 2016 17:02:04 -0700 Subject: [PATCH 104/127] linux/mm.h: canonicalize macro PAGE_ALIGNED() definition The macro PAGE_ALIGNED() is prone to cause error because it doesn't follow convention to parenthesize parameter @addr within macro body, for example unsigned long *ptr = kmalloc(...); PAGE_ALIGNED(ptr + 16); for the left parameter of macro IS_ALIGNED(), (unsigned long)(ptr + 16) is desired but the actual one is (unsigned long)ptr + 16. It is fixed by simply canonicalizing macro PAGE_ALIGNED() definition. Link: http://lkml.kernel.org/r/57EA6AE7.7090807@zoho.com Signed-off-by: zijun_hu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f7231411ad5a..e9caec6a51e9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -126,7 +126,7 @@ extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ -#define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)addr, PAGE_SIZE) +#define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) /* * Linux kernel virtual memory manager primitives. 
From 445ed0a0eac50d2a76441278a084554a4b9dcfda Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Fri, 7 Oct 2016 17:02:07 -0700 Subject: [PATCH 105/127] ia64: implement atomic64_dec_if_positive This is based on s390 version and needed to get rid of CONFIG_ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE Link: http://lkml.kernel.org/r/1473703083-8625-2-git-send-email-vgupta@synopsys.com Signed-off-by: Vineet Gupta Reported-by: kbuild test robot Cc: Tony Luck Cc: Fenghua Yu Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/atomic.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h index f565ad376142..65d4bb2b6685 100644 --- a/arch/ia64/include/asm/atomic.h +++ b/arch/ia64/include/asm/atomic.h @@ -269,6 +269,22 @@ static __inline__ long atomic64_add_unless(atomic64_t *v, long a, long u) #define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0) +static __inline__ long atomic64_dec_if_positive(atomic64_t *v) +{ + long c, old, dec; + c = atomic64_read(v); + for (;;) { + dec = c - 1; + if (unlikely(dec < 0)) + break; + old = atomic64_cmpxchg((v), c, dec); + if (likely(old == c)) + break; + c = old; + } + return dec; +} + /* * Atomically add I to V and return TRUE if the resulting value is * negative. From 51a021244b9d579be6b4f8c15c493a76deb2a79e Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Fri, 7 Oct 2016 17:02:10 -0700 Subject: [PATCH 106/127] atomic64: no need for CONFIG_ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE This came to light when implementing native 64-bit atomics for ARCv2. The atomic64 self-test code uses CONFIG_ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE to check whether atomic64_dec_if_positive() is available. It seems it was needed when not every arch defined it. However as of current code the Kconfig option seems needless - for CONFIG_GENERIC_ATOMIC64 it is auto-enabled in lib/Kconfig and a generic definition of API is present lib/atomic64.c - arches with native 64-bit atomics select it in arch/*/Kconfig and define the API in their headers So I see no point in keeping the Kconfig option Compile tested for: - blackfin (CONFIG_GENERIC_ATOMIC64) - x86 (!CONFIG_GENERIC_ATOMIC64) - ia64 Link: http://lkml.kernel.org/r/1473703083-8625-3-git-send-email-vgupta@synopsys.com Signed-off-by: Vineet Gupta Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Ralf Baechle Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: "David S. Miller" Cc: Chris Metcalf Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Vineet Gupta Cc: Zhaoxiu Zeng Cc: Linus Walleij Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Herbert Xu Cc: Ming Lin Cc: Arnd Bergmann Cc: Geert Uytterhoeven Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Andi Kleen Cc: Boqun Feng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/Kconfig | 1 - arch/arm/Kconfig | 1 - arch/arm64/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/parisc/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/sparc/Kconfig | 1 - arch/tile/Kconfig | 1 - arch/x86/Kconfig | 1 - lib/Kconfig | 3 --- lib/atomic64_test.c | 4 ---- 12 files changed, 17 deletions(-) diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 7f312d80b43b..0e49d39ea74a 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -15,7 +15,6 @@ config ALPHA select GENERIC_IRQ_SHOW select ARCH_WANT_IPC_PARSE_VERSION select ARCH_HAVE_NMI_SAFE_CMPXCHG - select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select AUDIT_ARCH select GENERIC_CLOCKEVENTS select GENERIC_SMP_IDLE_THREAD diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 3cd9042fbb62..c297bc5e341c 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -2,7 +2,6 @@ config ARM bool default y select ARCH_CLOCKSOURCE_DATA - select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c068aa689563..30398dbc940a 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -8,7 +8,6 @@ config ARM64 select ARCH_CLOCKSOURCE_DATA select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI - select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 212ff92920d2..1a322c807f22 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -30,7 +30,6 @@ config MIPS select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES && 64BIT select RTC_LIB if !MACH_LOONGSON64 select GENERIC_ATOMIC64 if !64BIT - select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select HAVE_DMA_CONTIGUOUS select HAVE_DMA_API_DEBUG select GENERIC_IRQ_PROBE diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index af12c2db9bb8..8a96bdcc3807 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -14,7 +14,6 @@ config PARISC select BUILDTIME_EXTABLE_SORT select HAVE_PERF_EVENTS select GENERIC_ATOMIC64 if !64BIT - select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select BROKEN_RODATA select GENERIC_IRQ_PROBE select GENERIC_PCI_IOMAP diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 927d2ab2ce08..18d1b42cf545 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -113,7 +113,6 @@ config PPC select HAVE_DEBUG_KMEMLEAK select ARCH_HAS_SG_CHAIN select GENERIC_ATOMIC64 if PPC32 - select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index fb538031569a..426481d4cc86 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -67,7 +67,6 @@ config DEBUG_RODATA config S390 def_bool y - select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index f5d60f14a0bc..b23c76b42d6e 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -22,7 +22,6 @@ config SPARC select 
HAVE_ARCH_TRACEHOOK select HAVE_EXIT_THREAD select SYSCTL_EXCEPTION_TRACE - select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select RTC_CLASS select RTC_DRV_M48T59 select RTC_SYSTOHC diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 78da75b670bc..4583c0320059 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -3,7 +3,6 @@ config TILE def_bool y - select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_WANT_FRAME_POINTERS diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 827273c56ae9..bada636d1065 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -23,7 +23,6 @@ config X86 select ARCH_CLOCKSOURCE_DATA select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI - select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER diff --git a/lib/Kconfig b/lib/Kconfig index d79909dc01ec..0e74df3c5441 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -457,9 +457,6 @@ config NLATTR config GENERIC_ATOMIC64 bool -config ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE - def_bool y if GENERIC_ATOMIC64 - config LRU_CACHE tristate diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c index dbb369145dda..46042901130f 100644 --- a/lib/atomic64_test.c +++ b/lib/atomic64_test.c @@ -213,7 +213,6 @@ static __init void test_atomic64(void) r += one; BUG_ON(v.counter != r); -#ifdef CONFIG_ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE INIT(onestwos); BUG_ON(atomic64_dec_if_positive(&v) != (onestwos - 1)); r -= one; @@ -226,9 +225,6 @@ static __init void test_atomic64(void) INIT(-one); BUG_ON(atomic64_dec_if_positive(&v) != (-one - one)); BUG_ON(v.counter != r); -#else -#warning Please implement atomic64_dec_if_positive for your architecture and select the above Kconfig symbol -#endif INIT(onestwos); BUG_ON(!atomic64_inc_not_zero(&v)); From 68ba0326b4e14988f9e0c24a6e12a85cf2acd1ca Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 7 Oct 2016 17:02:14 -0700 Subject: [PATCH 107/127] proc: much faster /proc/vmstat Every current KDE system has process named ksysguardd polling files below once in several seconds: $ strace -e trace=open -p $(pidof ksysguardd) Process 1812 attached open("/etc/mtab", O_RDONLY|O_CLOEXEC) = 8 open("/etc/mtab", O_RDONLY|O_CLOEXEC) = 8 open("/proc/net/dev", O_RDONLY) = 8 open("/proc/net/wireless", O_RDONLY) = -1 ENOENT (No such file or directory) open("/proc/stat", O_RDONLY) = 8 open("/proc/vmstat", O_RDONLY) = 8 Hell knows what it is doing but speed up reading /proc/vmstat by 33%! Benchmark is open+read+close 1.000.000 times. 
BEFORE $ perf stat -r 10 taskset -c 3 ./proc-vmstat Performance counter stats for 'taskset -c 3 ./proc-vmstat' (10 runs): 13146.768464 task-clock (msec) # 0.960 CPUs utilized ( +- 0.60% ) 15 context-switches # 0.001 K/sec ( +- 1.41% ) 1 cpu-migrations # 0.000 K/sec ( +- 11.11% ) 104 page-faults # 0.008 K/sec ( +- 0.57% ) 45,489,799,349 cycles # 3.460 GHz ( +- 0.03% ) 9,970,175,743 stalled-cycles-frontend # 21.92% frontend cycles idle ( +- 0.10% ) 2,800,298,015 stalled-cycles-backend # 6.16% backend cycles idle ( +- 0.32% ) 79,241,190,850 instructions # 1.74 insn per cycle # 0.13 stalled cycles per insn ( +- 0.00% ) 17,616,096,146 branches # 1339.956 M/sec ( +- 0.00% ) 176,106,232 branch-misses # 1.00% of all branches ( +- 0.18% ) 13.691078109 seconds time elapsed ( +- 0.03% ) ^^^^^^^^^^^^ AFTER $ perf stat -r 10 taskset -c 3 ./proc-vmstat Performance counter stats for 'taskset -c 3 ./proc-vmstat' (10 runs): 8688.353749 task-clock (msec) # 0.950 CPUs utilized ( +- 1.25% ) 10 context-switches # 0.001 K/sec ( +- 2.13% ) 1 cpu-migrations # 0.000 K/sec 104 page-faults # 0.012 K/sec ( +- 0.56% ) 30,384,010,730 cycles # 3.497 GHz ( +- 0.07% ) 12,296,259,407 stalled-cycles-frontend # 40.47% frontend cycles idle ( +- 0.13% ) 3,370,668,651 stalled-cycles-backend # 11.09% backend cycles idle ( +- 0.69% ) 28,969,052,879 instructions # 0.95 insn per cycle # 0.42 stalled cycles per insn ( +- 0.01% ) 6,308,245,891 branches # 726.058 M/sec ( +- 0.00% ) 214,685,502 branch-misses # 3.40% of all branches ( +- 0.26% ) 9.146081052 seconds time elapsed ( +- 0.07% ) ^^^^^^^^^^^ vsnprintf() is slow because: 1. format_decode() is busy looking for format specifier: 2 branches per character (not in this case, but in others) 2. approximately million branches while parsing format mini language and everywhere 3. just look at what string() does /proc/vmstat is good case because most of its content are strings Link: http://lkml.kernel.org/r/20160806125455.GA1187@p183.telecom.by Signed-off-by: Alexey Dobriyan Cc: Joe Perches Cc: Andi Kleen Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 73aab319969d..8857e0eee1e1 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1513,7 +1513,10 @@ static int vmstat_show(struct seq_file *m, void *arg) { unsigned long *l = arg; unsigned long off = l - (unsigned long *)m->private; - seq_printf(m, "%s %lu\n", vmstat_text[off], *l); + + seq_puts(m, vmstat_text[off]); + seq_put_decimal_ull(m, ' ', *l); + seq_putc(m, '\n'); return 0; } From f7a5f132b447cb6301ab3f0b0468a63db29e41f5 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 7 Oct 2016 17:02:17 -0700 Subject: [PATCH 108/127] proc: faster /proc/*/status top(1) opens the following files for every PID: /proc/*/stat /proc/*/statm /proc/*/status This patch switches /proc/*/status away from seq_printf(). The result is 13.5% speedup. Benchmark is open("/proc/self/status")+read+close 1.000.000 million times. 
BEFORE $ perf stat -r 10 taskset -c 3 ./proc-self-status Performance counter stats for 'taskset -c 3 ./proc-self-status' (10 runs): 10748.474301 task-clock (msec) # 0.954 CPUs utilized ( +- 0.91% ) 12 context-switches # 0.001 K/sec ( +- 1.09% ) 1 cpu-migrations # 0.000 K/sec 104 page-faults # 0.010 K/sec ( +- 0.45% ) 37,424,127,876 cycles # 3.482 GHz ( +- 0.04% ) 8,453,010,029 stalled-cycles-frontend # 22.59% frontend cycles idle ( +- 0.12% ) 3,747,609,427 stalled-cycles-backend # 10.01% backend cycles idle ( +- 0.68% ) 65,632,764,147 instructions # 1.75 insn per cycle # 0.13 stalled cycles per insn ( +- 0.00% ) 13,981,324,775 branches # 1300.773 M/sec ( +- 0.00% ) 138,967,110 branch-misses # 0.99% of all branches ( +- 0.18% ) 11.263885428 seconds time elapsed ( +- 0.04% ) ^^^^^^^^^^^^ AFTER $ perf stat -r 10 taskset -c 3 ./proc-self-status Performance counter stats for 'taskset -c 3 ./proc-self-status' (10 runs): 9010.521776 task-clock (msec) # 0.925 CPUs utilized ( +- 1.54% ) 11 context-switches # 0.001 K/sec ( +- 1.54% ) 1 cpu-migrations # 0.000 K/sec ( +- 11.11% ) 103 page-faults # 0.011 K/sec ( +- 0.60% ) 32,352,310,603 cycles # 3.591 GHz ( +- 0.07% ) 7,849,199,578 stalled-cycles-frontend # 24.26% frontend cycles idle ( +- 0.27% ) 3,269,738,842 stalled-cycles-backend # 10.11% backend cycles idle ( +- 0.73% ) 56,012,163,567 instructions # 1.73 insn per cycle # 0.14 stalled cycles per insn ( +- 0.00% ) 11,735,778,795 branches # 1302.453 M/sec ( +- 0.00% ) 98,084,459 branch-misses # 0.84% of all branches ( +- 0.28% ) 9.741247736 seconds time elapsed ( +- 0.07% ) ^^^^^^^^^^^ Link: http://lkml.kernel.org/r/20160806125608.GB1187@p183.telecom.by Signed-off-by: Alexey Dobriyan Cc: Joe Perches Cc: Andi Kleen Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 85 ++++++++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 39 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 88c7de12197b..5e7d2521d496 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -186,51 +186,52 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, task_unlock(p); rcu_read_unlock(); - seq_printf(m, - "State:\t%s\n" - "Tgid:\t%d\n" - "Ngid:\t%d\n" - "Pid:\t%d\n" - "PPid:\t%d\n" - "TracerPid:\t%d\n" - "Uid:\t%d\t%d\t%d\t%d\n" - "Gid:\t%d\t%d\t%d\t%d\n" - "FDSize:\t%d\nGroups:\t", - get_task_state(p), - tgid, ngid, pid_nr_ns(pid, ns), ppid, tpid, - from_kuid_munged(user_ns, cred->uid), - from_kuid_munged(user_ns, cred->euid), - from_kuid_munged(user_ns, cred->suid), - from_kuid_munged(user_ns, cred->fsuid), - from_kgid_munged(user_ns, cred->gid), - from_kgid_munged(user_ns, cred->egid), - from_kgid_munged(user_ns, cred->sgid), - from_kgid_munged(user_ns, cred->fsgid), - max_fds); + seq_printf(m, "State:\t%s", get_task_state(p)); + seq_puts(m, "\nTgid:\t"); + seq_put_decimal_ull(m, 0, tgid); + seq_puts(m, "\nNgid:\t"); + seq_put_decimal_ull(m, 0, ngid); + seq_puts(m, "\nPid:\t"); + seq_put_decimal_ull(m, 0, pid_nr_ns(pid, ns)); + seq_puts(m, "\nPPid:\t"); + seq_put_decimal_ull(m, 0, ppid); + seq_puts(m, "\nTracerPid:\t"); + seq_put_decimal_ull(m, 0, tpid); + seq_puts(m, "\nUid:"); + seq_put_decimal_ull(m, '\t', from_kuid_munged(user_ns, cred->uid)); + seq_put_decimal_ull(m, '\t', from_kuid_munged(user_ns, cred->euid)); + seq_put_decimal_ull(m, '\t', from_kuid_munged(user_ns, cred->suid)); + seq_put_decimal_ull(m, '\t', from_kuid_munged(user_ns, cred->fsuid)); + seq_puts(m, "\nGid:"); + seq_put_decimal_ull(m, '\t', 
from_kgid_munged(user_ns, cred->gid)); + seq_put_decimal_ull(m, '\t', from_kgid_munged(user_ns, cred->egid)); + seq_put_decimal_ull(m, '\t', from_kgid_munged(user_ns, cred->sgid)); + seq_put_decimal_ull(m, '\t', from_kgid_munged(user_ns, cred->fsgid)); + seq_puts(m, "\nFDSize:\t"); + seq_put_decimal_ull(m, 0, max_fds); + + seq_puts(m, "\nGroups:\t"); group_info = cred->group_info; for (g = 0; g < group_info->ngroups; g++) - seq_printf(m, "%d ", - from_kgid_munged(user_ns, GROUP_AT(group_info, g))); + seq_put_decimal_ull(m, g ? ' ' : 0, from_kgid_munged(user_ns, GROUP_AT(group_info, g))); put_cred(cred); + /* Trailing space shouldn't have been added in the first place. */ + seq_putc(m, ' '); #ifdef CONFIG_PID_NS seq_puts(m, "\nNStgid:"); for (g = ns->level; g <= pid->level; g++) - seq_printf(m, "\t%d", - task_tgid_nr_ns(p, pid->numbers[g].ns)); + seq_put_decimal_ull(m, '\t', task_tgid_nr_ns(p, pid->numbers[g].ns)); seq_puts(m, "\nNSpid:"); for (g = ns->level; g <= pid->level; g++) - seq_printf(m, "\t%d", - task_pid_nr_ns(p, pid->numbers[g].ns)); + seq_put_decimal_ull(m, '\t', task_pid_nr_ns(p, pid->numbers[g].ns)); seq_puts(m, "\nNSpgid:"); for (g = ns->level; g <= pid->level; g++) - seq_printf(m, "\t%d", - task_pgrp_nr_ns(p, pid->numbers[g].ns)); + seq_put_decimal_ull(m, '\t', task_pgrp_nr_ns(p, pid->numbers[g].ns)); seq_puts(m, "\nNSsid:"); for (g = ns->level; g <= pid->level; g++) - seq_printf(m, "\t%d", - task_session_nr_ns(p, pid->numbers[g].ns)); + seq_put_decimal_ull(m, '\t', task_session_nr_ns(p, pid->numbers[g].ns)); #endif seq_putc(m, '\n'); } @@ -299,11 +300,14 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) unlock_task_sighand(p, &flags); } - seq_printf(m, "Threads:\t%d\n", num_threads); - seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim); + seq_puts(m, "Threads:\t"); + seq_put_decimal_ull(m, 0, num_threads); + seq_puts(m, "\nSigQ:\t"); + seq_put_decimal_ull(m, 0, qsize); + seq_put_decimal_ull(m, '/', qlim); /* render them all */ - render_sigset_t(m, "SigPnd:\t", &pending); + render_sigset_t(m, "\nSigPnd:\t", &pending); render_sigset_t(m, "ShdPnd:\t", &shpending); render_sigset_t(m, "SigBlk:\t", &blocked); render_sigset_t(m, "SigIgn:\t", &ignored); @@ -348,17 +352,20 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) static inline void task_seccomp(struct seq_file *m, struct task_struct *p) { #ifdef CONFIG_SECCOMP - seq_printf(m, "Seccomp:\t%d\n", p->seccomp.mode); + seq_puts(m, "Seccomp:\t"); + seq_put_decimal_ull(m, 0, p->seccomp.mode); + seq_putc(m, '\n'); #endif } static inline void task_context_switch_counts(struct seq_file *m, struct task_struct *p) { - seq_printf(m, "voluntary_ctxt_switches:\t%lu\n" - "nonvoluntary_ctxt_switches:\t%lu\n", - p->nvcsw, - p->nivcsw); + seq_puts(m, "voluntary_ctxt_switches:\t"); + seq_put_decimal_ull(m, 0, p->nvcsw); + seq_puts(m, "\nnonvoluntary_ctxt_switches:\t"); + seq_put_decimal_ull(m, 0, p->nivcsw); + seq_putc(m, '\n'); } static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) From 75ba1d07fd6a494851db5132612944a9d4773f9c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 7 Oct 2016 17:02:20 -0700 Subject: [PATCH 109/127] seq/proc: modify seq_put_decimal_[u]ll to take a const char *, not char Allow some seq_puts removals by taking a string instead of a single char. 
[akpm@linux-foundation.org: update vmstat_show(), per Joe] Link: http://lkml.kernel.org/r/667e1cf3d436de91a5698170a1e98d882905e956.1470704995.git.joe@perches.com Signed-off-by: Joe Perches Cc: Joe Perches Cc: Andi Kleen Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 178 ++++++++++++++++++--------------------- fs/proc/stat.c | 49 ++++++----- fs/seq_file.c | 57 ++++++++++--- include/linux/seq_file.h | 4 +- mm/vmstat.c | 2 +- 5 files changed, 154 insertions(+), 136 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 5e7d2521d496..d25b44601b30 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -188,33 +188,26 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "State:\t%s", get_task_state(p)); - seq_puts(m, "\nTgid:\t"); - seq_put_decimal_ull(m, 0, tgid); - seq_puts(m, "\nNgid:\t"); - seq_put_decimal_ull(m, 0, ngid); - seq_puts(m, "\nPid:\t"); - seq_put_decimal_ull(m, 0, pid_nr_ns(pid, ns)); - seq_puts(m, "\nPPid:\t"); - seq_put_decimal_ull(m, 0, ppid); - seq_puts(m, "\nTracerPid:\t"); - seq_put_decimal_ull(m, 0, tpid); - seq_puts(m, "\nUid:"); - seq_put_decimal_ull(m, '\t', from_kuid_munged(user_ns, cred->uid)); - seq_put_decimal_ull(m, '\t', from_kuid_munged(user_ns, cred->euid)); - seq_put_decimal_ull(m, '\t', from_kuid_munged(user_ns, cred->suid)); - seq_put_decimal_ull(m, '\t', from_kuid_munged(user_ns, cred->fsuid)); - seq_puts(m, "\nGid:"); - seq_put_decimal_ull(m, '\t', from_kgid_munged(user_ns, cred->gid)); - seq_put_decimal_ull(m, '\t', from_kgid_munged(user_ns, cred->egid)); - seq_put_decimal_ull(m, '\t', from_kgid_munged(user_ns, cred->sgid)); - seq_put_decimal_ull(m, '\t', from_kgid_munged(user_ns, cred->fsgid)); - seq_puts(m, "\nFDSize:\t"); - seq_put_decimal_ull(m, 0, max_fds); + seq_put_decimal_ull(m, "\nTgid:\t", tgid); + seq_put_decimal_ull(m, "\nNgid:\t", ngid); + seq_put_decimal_ull(m, "\nPid:\t", pid_nr_ns(pid, ns)); + seq_put_decimal_ull(m, "\nPPid:\t", ppid); + seq_put_decimal_ull(m, "\nTracerPid:\t", tpid); + seq_put_decimal_ull(m, "\nUid:\t", from_kuid_munged(user_ns, cred->uid)); + seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->euid)); + seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->suid)); + seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->fsuid)); + seq_put_decimal_ull(m, "\nGid:\t", from_kgid_munged(user_ns, cred->gid)); + seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->egid)); + seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->sgid)); + seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->fsgid)); + seq_put_decimal_ull(m, "\nFDSize:\t", max_fds); seq_puts(m, "\nGroups:\t"); group_info = cred->group_info; for (g = 0; g < group_info->ngroups; g++) - seq_put_decimal_ull(m, g ? ' ' : 0, from_kgid_munged(user_ns, GROUP_AT(group_info, g))); + seq_put_decimal_ull(m, g ? " " : "", + from_kgid_munged(user_ns, GROUP_AT(group_info, g))); put_cred(cred); /* Trailing space shouldn't have been added in the first place. 
*/ seq_putc(m, ' '); @@ -222,16 +215,16 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, #ifdef CONFIG_PID_NS seq_puts(m, "\nNStgid:"); for (g = ns->level; g <= pid->level; g++) - seq_put_decimal_ull(m, '\t', task_tgid_nr_ns(p, pid->numbers[g].ns)); + seq_put_decimal_ull(m, "\t", task_tgid_nr_ns(p, pid->numbers[g].ns)); seq_puts(m, "\nNSpid:"); for (g = ns->level; g <= pid->level; g++) - seq_put_decimal_ull(m, '\t', task_pid_nr_ns(p, pid->numbers[g].ns)); + seq_put_decimal_ull(m, "\t", task_pid_nr_ns(p, pid->numbers[g].ns)); seq_puts(m, "\nNSpgid:"); for (g = ns->level; g <= pid->level; g++) - seq_put_decimal_ull(m, '\t', task_pgrp_nr_ns(p, pid->numbers[g].ns)); + seq_put_decimal_ull(m, "\t", task_pgrp_nr_ns(p, pid->numbers[g].ns)); seq_puts(m, "\nNSsid:"); for (g = ns->level; g <= pid->level; g++) - seq_put_decimal_ull(m, '\t', task_session_nr_ns(p, pid->numbers[g].ns)); + seq_put_decimal_ull(m, "\t", task_session_nr_ns(p, pid->numbers[g].ns)); #endif seq_putc(m, '\n'); } @@ -300,11 +293,9 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) unlock_task_sighand(p, &flags); } - seq_puts(m, "Threads:\t"); - seq_put_decimal_ull(m, 0, num_threads); - seq_puts(m, "\nSigQ:\t"); - seq_put_decimal_ull(m, 0, qsize); - seq_put_decimal_ull(m, '/', qlim); + seq_put_decimal_ull(m, "Threads:\t", num_threads); + seq_put_decimal_ull(m, "\nSigQ:\t", qsize); + seq_put_decimal_ull(m, "/", qlim); /* render them all */ render_sigset_t(m, "\nSigPnd:\t", &pending); @@ -352,8 +343,7 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) static inline void task_seccomp(struct seq_file *m, struct task_struct *p) { #ifdef CONFIG_SECCOMP - seq_puts(m, "Seccomp:\t"); - seq_put_decimal_ull(m, 0, p->seccomp.mode); + seq_put_decimal_ull(m, "Seccomp:\t", p->seccomp.mode); seq_putc(m, '\n'); #endif } @@ -361,10 +351,8 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p) static inline void task_context_switch_counts(struct seq_file *m, struct task_struct *p) { - seq_puts(m, "voluntary_ctxt_switches:\t"); - seq_put_decimal_ull(m, 0, p->nvcsw); - seq_puts(m, "\nnonvoluntary_ctxt_switches:\t"); - seq_put_decimal_ull(m, 0, p->nivcsw); + seq_put_decimal_ull(m, "voluntary_ctxt_switches:\t", p->nvcsw); + seq_put_decimal_ull(m, "\nnonvoluntary_ctxt_switches:\t", p->nivcsw); seq_putc(m, '\n'); } @@ -497,41 +485,41 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, start_time = nsec_to_clock_t(task->real_start_time); seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state); - seq_put_decimal_ll(m, ' ', ppid); - seq_put_decimal_ll(m, ' ', pgid); - seq_put_decimal_ll(m, ' ', sid); - seq_put_decimal_ll(m, ' ', tty_nr); - seq_put_decimal_ll(m, ' ', tty_pgrp); - seq_put_decimal_ull(m, ' ', task->flags); - seq_put_decimal_ull(m, ' ', min_flt); - seq_put_decimal_ull(m, ' ', cmin_flt); - seq_put_decimal_ull(m, ' ', maj_flt); - seq_put_decimal_ull(m, ' ', cmaj_flt); - seq_put_decimal_ull(m, ' ', cputime_to_clock_t(utime)); - seq_put_decimal_ull(m, ' ', cputime_to_clock_t(stime)); - seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cutime)); - seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cstime)); - seq_put_decimal_ll(m, ' ', priority); - seq_put_decimal_ll(m, ' ', nice); - seq_put_decimal_ll(m, ' ', num_threads); - seq_put_decimal_ull(m, ' ', 0); - seq_put_decimal_ull(m, ' ', start_time); - seq_put_decimal_ull(m, ' ', vsize); - seq_put_decimal_ull(m, ' ', mm ? 
get_mm_rss(mm) : 0); - seq_put_decimal_ull(m, ' ', rsslim); - seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->start_code : 1) : 0); - seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->end_code : 1) : 0); - seq_put_decimal_ull(m, ' ', (permitted && mm) ? mm->start_stack : 0); - seq_put_decimal_ull(m, ' ', esp); - seq_put_decimal_ull(m, ' ', eip); + seq_put_decimal_ll(m, " ", ppid); + seq_put_decimal_ll(m, " ", pgid); + seq_put_decimal_ll(m, " ", sid); + seq_put_decimal_ll(m, " ", tty_nr); + seq_put_decimal_ll(m, " ", tty_pgrp); + seq_put_decimal_ull(m, " ", task->flags); + seq_put_decimal_ull(m, " ", min_flt); + seq_put_decimal_ull(m, " ", cmin_flt); + seq_put_decimal_ull(m, " ", maj_flt); + seq_put_decimal_ull(m, " ", cmaj_flt); + seq_put_decimal_ull(m, " ", cputime_to_clock_t(utime)); + seq_put_decimal_ull(m, " ", cputime_to_clock_t(stime)); + seq_put_decimal_ll(m, " ", cputime_to_clock_t(cutime)); + seq_put_decimal_ll(m, " ", cputime_to_clock_t(cstime)); + seq_put_decimal_ll(m, " ", priority); + seq_put_decimal_ll(m, " ", nice); + seq_put_decimal_ll(m, " ", num_threads); + seq_put_decimal_ull(m, " ", 0); + seq_put_decimal_ull(m, " ", start_time); + seq_put_decimal_ull(m, " ", vsize); + seq_put_decimal_ull(m, " ", mm ? get_mm_rss(mm) : 0); + seq_put_decimal_ull(m, " ", rsslim); + seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->start_code : 1) : 0); + seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->end_code : 1) : 0); + seq_put_decimal_ull(m, " ", (permitted && mm) ? mm->start_stack : 0); + seq_put_decimal_ull(m, " ", esp); + seq_put_decimal_ull(m, " ", eip); /* The signal information here is obsolete. * It must be decimal for Linux 2.0 compatibility. * Use /proc/#/status for real-time signals. */ - seq_put_decimal_ull(m, ' ', task->pending.signal.sig[0] & 0x7fffffffUL); - seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL); - seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL); - seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, " ", task->pending.signal.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, " ", task->blocked.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, " ", sigign.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, " ", sigcatch.sig[0] & 0x7fffffffUL); /* * We used to output the absolute kernel address, but that's an @@ -545,31 +533,31 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, else seq_puts(m, " 0"); - seq_put_decimal_ull(m, ' ', 0); - seq_put_decimal_ull(m, ' ', 0); - seq_put_decimal_ll(m, ' ', task->exit_signal); - seq_put_decimal_ll(m, ' ', task_cpu(task)); - seq_put_decimal_ull(m, ' ', task->rt_priority); - seq_put_decimal_ull(m, ' ', task->policy); - seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task)); - seq_put_decimal_ull(m, ' ', cputime_to_clock_t(gtime)); - seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cgtime)); + seq_put_decimal_ull(m, " ", 0); + seq_put_decimal_ull(m, " ", 0); + seq_put_decimal_ll(m, " ", task->exit_signal); + seq_put_decimal_ll(m, " ", task_cpu(task)); + seq_put_decimal_ull(m, " ", task->rt_priority); + seq_put_decimal_ull(m, " ", task->policy); + seq_put_decimal_ull(m, " ", delayacct_blkio_ticks(task)); + seq_put_decimal_ull(m, " ", cputime_to_clock_t(gtime)); + seq_put_decimal_ll(m, " ", cputime_to_clock_t(cgtime)); if (mm && permitted) { - seq_put_decimal_ull(m, ' ', mm->start_data); - seq_put_decimal_ull(m, ' ', mm->end_data); - seq_put_decimal_ull(m, ' ', mm->start_brk); - seq_put_decimal_ull(m, ' ', mm->arg_start); - seq_put_decimal_ull(m, ' ', 
mm->arg_end); - seq_put_decimal_ull(m, ' ', mm->env_start); - seq_put_decimal_ull(m, ' ', mm->env_end); + seq_put_decimal_ull(m, " ", mm->start_data); + seq_put_decimal_ull(m, " ", mm->end_data); + seq_put_decimal_ull(m, " ", mm->start_brk); + seq_put_decimal_ull(m, " ", mm->arg_start); + seq_put_decimal_ull(m, " ", mm->arg_end); + seq_put_decimal_ull(m, " ", mm->env_start); + seq_put_decimal_ull(m, " ", mm->env_end); } else - seq_printf(m, " 0 0 0 0 0 0 0"); + seq_puts(m, " 0 0 0 0 0 0 0"); if (permitted) - seq_put_decimal_ll(m, ' ', task->exit_code); + seq_put_decimal_ll(m, " ", task->exit_code); else - seq_put_decimal_ll(m, ' ', 0); + seq_puts(m, " 0"); seq_putc(m, '\n'); if (mm) @@ -605,13 +593,13 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", * size, resident, shared, text, data); */ - seq_put_decimal_ull(m, 0, size); - seq_put_decimal_ull(m, ' ', resident); - seq_put_decimal_ull(m, ' ', shared); - seq_put_decimal_ull(m, ' ', text); - seq_put_decimal_ull(m, ' ', 0); - seq_put_decimal_ull(m, ' ', data); - seq_put_decimal_ull(m, ' ', 0); + seq_put_decimal_ull(m, "", size); + seq_put_decimal_ull(m, " ", resident); + seq_put_decimal_ull(m, " ", shared); + seq_put_decimal_ull(m, " ", text); + seq_put_decimal_ull(m, " ", 0); + seq_put_decimal_ull(m, " ", data); + seq_put_decimal_ull(m, " ", 0); seq_putc(m, '\n'); return 0; diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 7907e456ac4f..d700c42b3572 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -115,17 +115,16 @@ static int show_stat(struct seq_file *p, void *v) } sum += arch_irq_stat(); - seq_puts(p, "cpu "); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice)); + seq_put_decimal_ull(p, "cpu ", cputime64_to_clock_t(user)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(nice)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(system)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(idle)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(iowait)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(irq)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(softirq)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(steal)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest_nice)); seq_putc(p, '\n'); for_each_online_cpu(i) { @@ -141,23 +140,23 @@ static int show_stat(struct seq_file *p, void *v) guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d", i); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq)); - 
seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(user)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(nice)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(system)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(idle)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(iowait)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(irq)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(softirq)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(steal)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest_nice)); seq_putc(p, '\n'); } - seq_printf(p, "intr %llu", (unsigned long long)sum); + seq_put_decimal_ull(p, "intr ", (unsigned long long)sum); /* sum again ? it could be updated? */ for_each_irq_nr(j) - seq_put_decimal_ull(p, ' ', kstat_irqs_usr(j)); + seq_put_decimal_ull(p, " ", kstat_irqs_usr(j)); seq_printf(p, "\nctxt %llu\n" @@ -171,10 +170,10 @@ static int show_stat(struct seq_file *p, void *v) nr_running(), nr_iowait()); - seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq); + seq_put_decimal_ull(p, "softirq ", (unsigned long long)sum_softirq); for (i = 0; i < NR_SOFTIRQS; i++) - seq_put_decimal_ull(p, ' ', per_softirq_sums[i]); + seq_put_decimal_ull(p, " ", per_softirq_sums[i]); seq_putc(p, '\n'); return 0; diff --git a/fs/seq_file.c b/fs/seq_file.c index 6dc4296eed62..368bfb92b115 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -679,11 +679,11 @@ EXPORT_SYMBOL(seq_puts); /* * A helper routine for putting decimal numbers without rich format of printf(). * only 'unsigned long long' is supported. - * This routine will put one byte delimiter + number into seq_file. + * This routine will put strlen(delimiter) + number into seq_file. * This routine is very quick when you show lots of numbers. * In usual cases, it will be better to use seq_printf(). It's easier to read. 
*/ -void seq_put_decimal_ull(struct seq_file *m, char delimiter, +void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, unsigned long long num) { int len; @@ -691,8 +691,15 @@ void seq_put_decimal_ull(struct seq_file *m, char delimiter, if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */ goto overflow; - if (delimiter) - m->buf[m->count++] = delimiter; + len = strlen(delimiter); + if (m->count + len >= m->size) + goto overflow; + + memcpy(m->buf + m->count, delimiter, len); + m->count += len; + + if (m->count + 1 >= m->size) + goto overflow; if (num < 10) { m->buf[m->count++] = num + '0'; @@ -702,6 +709,7 @@ void seq_put_decimal_ull(struct seq_file *m, char delimiter, len = num_to_str(m->buf + m->count, m->size - m->count, num); if (!len) goto overflow; + m->count += len; return; @@ -710,19 +718,42 @@ overflow: } EXPORT_SYMBOL(seq_put_decimal_ull); -void seq_put_decimal_ll(struct seq_file *m, char delimiter, long long num) +void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num) { + int len; + + if (m->count + 3 >= m->size) /* we'll write 2 bytes at least */ + goto overflow; + + len = strlen(delimiter); + if (m->count + len >= m->size) + goto overflow; + + memcpy(m->buf + m->count, delimiter, len); + m->count += len; + + if (m->count + 2 >= m->size) + goto overflow; + if (num < 0) { - if (m->count + 3 >= m->size) { - seq_set_overflow(m); - return; - } - if (delimiter) - m->buf[m->count++] = delimiter; + m->buf[m->count++] = '-'; num = -num; - delimiter = '-'; } - seq_put_decimal_ull(m, delimiter, num); + + if (num < 10) { + m->buf[m->count++] = num + '0'; + return; + } + + len = num_to_str(m->buf + m->count, m->size - m->count, num); + if (!len) + goto overflow; + + m->count += len; + return; + +overflow: + seq_set_overflow(m); } EXPORT_SYMBOL(seq_put_decimal_ll); diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index f3d45dd42695..e305b66a9fb9 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -117,9 +117,9 @@ __printf(2, 3) void seq_printf(struct seq_file *m, const char *fmt, ...); void seq_putc(struct seq_file *m, char c); void seq_puts(struct seq_file *m, const char *s); -void seq_put_decimal_ull(struct seq_file *m, char delimiter, +void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, unsigned long long num); -void seq_put_decimal_ll(struct seq_file *m, char delimiter, long long num); +void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num); void seq_escape(struct seq_file *m, const char *s, const char *esc); void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, diff --git a/mm/vmstat.c b/mm/vmstat.c index 8857e0eee1e1..604f26a4f696 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1515,7 +1515,7 @@ static int vmstat_show(struct seq_file *m, void *arg) unsigned long off = l - (unsigned long *)m->private; seq_puts(m, vmstat_text[off]); - seq_put_decimal_ull(m, ' ', *l); + seq_put_decimal_ull(m, " ", *l); seq_putc(m, '\n'); return 0; } From e16e2d8e14a14bd87df8482c637dde8f760a8d5f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 7 Oct 2016 17:02:23 -0700 Subject: [PATCH 110/127] meminfo: break apart a very long seq_printf with #ifdefs Use a specific routine to emit most lines so that the code is easier to read and maintain. 
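For illustration only (not part of the patch): a minimal userspace sketch of the output format the show_val_kb() helper below produces. The kernel helper deliberately avoids printf() and uses seq_write()/num_to_str() instead; this sketch only reproduces the formatting, and the 16-column label plus 8-column value widths are taken from the diff that follows. The PAGE_SHIFT value is an assumption for the example.

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumed: 4 KiB pages, for this example only */

/* Emit "<16-char label><value right-aligned in 8 columns> kB\n". */
static void show_val_kb(FILE *out, const char *label16, unsigned long pages)
{
	fprintf(out, "%-16.16s%8lu kB\n", label16, pages << (PAGE_SHIFT - 10));
}

int main(void)
{
	show_val_kb(stdout, "MemTotal:       ", 1024000);
	show_val_kb(stdout, "MemFree:        ", 123456);
	return 0;
}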
akpm: text data bss dec hex filename 2976 8 0 2984 ba8 fs/proc/meminfo.o before 2669 8 0 2677 a75 fs/proc/meminfo.o after Link: http://lkml.kernel.org/r/8fce7fdef2ba081a4ef531594e97da8a9feebb58.1470810406.git.joe@perches.com Signed-off-by: Joe Perches Cc: Andi Kleen Cc: Alexey Dobriyan Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 211 +++++++++++++++++++++------------------------- 1 file changed, 95 insertions(+), 116 deletions(-) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index b9a8c813e5e6..8a428498d6b2 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -23,6 +23,25 @@ void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) { } +static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) +{ + char v[32]; + static const char blanks[7] = {' ', ' ', ' ', ' ',' ', ' ', ' '}; + int len; + + len = num_to_str(v, sizeof(v), num << (PAGE_SHIFT - 10)); + + seq_write(m, s, 16); + + if (len > 0) { + if (len < 8) + seq_write(m, blanks, 8 - len); + + seq_write(m, v, len); + } + seq_write(m, " kB\n", 4); +} + static int meminfo_proc_show(struct seq_file *m, void *v) { struct sysinfo i; @@ -32,10 +51,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v) unsigned long pages[NR_LRU_LISTS]; int lru; -/* - * display in kilobytes. - */ -#define K(x) ((x) << (PAGE_SHIFT - 10)) si_meminfo(&i); si_swapinfo(&i); committed = percpu_counter_read_positive(&vm_committed_as); @@ -50,136 +65,100 @@ static int meminfo_proc_show(struct seq_file *m, void *v) available = si_mem_available(); - /* - * Tagged format, for easy grepping and expansion. - */ - seq_printf(m, - "MemTotal: %8lu kB\n" - "MemFree: %8lu kB\n" - "MemAvailable: %8lu kB\n" - "Buffers: %8lu kB\n" - "Cached: %8lu kB\n" - "SwapCached: %8lu kB\n" - "Active: %8lu kB\n" - "Inactive: %8lu kB\n" - "Active(anon): %8lu kB\n" - "Inactive(anon): %8lu kB\n" - "Active(file): %8lu kB\n" - "Inactive(file): %8lu kB\n" - "Unevictable: %8lu kB\n" - "Mlocked: %8lu kB\n" + show_val_kb(m, "MemTotal: ", i.totalram); + show_val_kb(m, "MemFree: ", i.freeram); + show_val_kb(m, "MemAvailable: ", available); + show_val_kb(m, "Buffers: ", i.bufferram); + show_val_kb(m, "Cached: ", cached); + show_val_kb(m, "SwapCached: ", total_swapcache_pages()); + show_val_kb(m, "Active: ", pages[LRU_ACTIVE_ANON] + + pages[LRU_ACTIVE_FILE]); + show_val_kb(m, "Inactive: ", pages[LRU_INACTIVE_ANON] + + pages[LRU_INACTIVE_FILE]); + show_val_kb(m, "Active(anon): ", pages[LRU_ACTIVE_ANON]); + show_val_kb(m, "Inactive(anon): ", pages[LRU_INACTIVE_ANON]); + show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]); + show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]); + show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]); + show_val_kb(m, "Mlocked: ", global_page_state(NR_MLOCK)); + #ifdef CONFIG_HIGHMEM - "HighTotal: %8lu kB\n" - "HighFree: %8lu kB\n" - "LowTotal: %8lu kB\n" - "LowFree: %8lu kB\n" + show_val_kb(m, "HighTotal: ", i.totalhigh); + show_val_kb(m, "HighFree: ", i.freehigh); + show_val_kb(m, "LowTotal: ", i.totalram - i.totalhigh); + show_val_kb(m, "LowFree: ", i.freeram - i.freehigh); #endif + #ifndef CONFIG_MMU - "MmapCopy: %8lu kB\n" + show_val_kb(m, "MmapCopy: ", + (unsigned long)atomic_long_read(&mmap_pages_allocated)); #endif - "SwapTotal: %8lu kB\n" - "SwapFree: %8lu kB\n" - "Dirty: %8lu kB\n" - "Writeback: %8lu kB\n" - "AnonPages: %8lu kB\n" - "Mapped: %8lu kB\n" - "Shmem: %8lu kB\n" - "Slab: %8lu kB\n" - "SReclaimable: %8lu kB\n" - "SUnreclaim: %8lu kB\n" - "KernelStack: 
%8lu kB\n" - "PageTables: %8lu kB\n" + + show_val_kb(m, "SwapTotal: ", i.totalswap); + show_val_kb(m, "SwapFree: ", i.freeswap); + show_val_kb(m, "Dirty: ", + global_node_page_state(NR_FILE_DIRTY)); + show_val_kb(m, "Writeback: ", + global_node_page_state(NR_WRITEBACK)); + show_val_kb(m, "AnonPages: ", + global_node_page_state(NR_ANON_MAPPED)); + show_val_kb(m, "Mapped: ", + global_node_page_state(NR_FILE_MAPPED)); + show_val_kb(m, "Shmem: ", i.sharedram); + show_val_kb(m, "Slab: ", + global_page_state(NR_SLAB_RECLAIMABLE) + + global_page_state(NR_SLAB_UNRECLAIMABLE)); + + show_val_kb(m, "SReclaimable: ", + global_page_state(NR_SLAB_RECLAIMABLE)); + show_val_kb(m, "SUnreclaim: ", + global_page_state(NR_SLAB_UNRECLAIMABLE)); + seq_printf(m, "KernelStack: %8lu kB\n", + global_page_state(NR_KERNEL_STACK_KB)); + show_val_kb(m, "PageTables: ", + global_page_state(NR_PAGETABLE)); #ifdef CONFIG_QUICKLIST - "Quicklists: %8lu kB\n" + show_val_kb(m, "Quicklists: ", quicklist_total_size()); #endif - "NFS_Unstable: %8lu kB\n" - "Bounce: %8lu kB\n" - "WritebackTmp: %8lu kB\n" - "CommitLimit: %8lu kB\n" - "Committed_AS: %8lu kB\n" - "VmallocTotal: %8lu kB\n" - "VmallocUsed: %8lu kB\n" - "VmallocChunk: %8lu kB\n" + + show_val_kb(m, "NFS_Unstable: ", + global_node_page_state(NR_UNSTABLE_NFS)); + show_val_kb(m, "Bounce: ", + global_page_state(NR_BOUNCE)); + show_val_kb(m, "WritebackTmp: ", + global_node_page_state(NR_WRITEBACK_TEMP)); + show_val_kb(m, "CommitLimit: ", vm_commit_limit()); + show_val_kb(m, "Committed_AS: ", committed); + seq_printf(m, "VmallocTotal: %8lu kB\n", + (unsigned long)VMALLOC_TOTAL >> 10); + show_val_kb(m, "VmallocUsed: ", 0ul); + show_val_kb(m, "VmallocChunk: ", 0ul); + #ifdef CONFIG_MEMORY_FAILURE - "HardwareCorrupted: %5lu kB\n" + seq_printf(m, "HardwareCorrupted: %5lu kB\n", + atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)); #endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE - "AnonHugePages: %8lu kB\n" - "ShmemHugePages: %8lu kB\n" - "ShmemPmdMapped: %8lu kB\n" + show_val_kb(m, "AnonHugePages: ", + global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR); + show_val_kb(m, "ShmemHugePages: ", + global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR); + show_val_kb(m, "ShmemPmdMapped: ", + global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR); #endif + #ifdef CONFIG_CMA - "CmaTotal: %8lu kB\n" - "CmaFree: %8lu kB\n" + show_val_kb(m, "CmaTotal: ", totalcma_pages); + show_val_kb(m, "CmaFree: ", + global_page_state(NR_FREE_CMA_PAGES)); #endif - , - K(i.totalram), - K(i.freeram), - K(available), - K(i.bufferram), - K(cached), - K(total_swapcache_pages()), - K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), - K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), - K(pages[LRU_ACTIVE_ANON]), - K(pages[LRU_INACTIVE_ANON]), - K(pages[LRU_ACTIVE_FILE]), - K(pages[LRU_INACTIVE_FILE]), - K(pages[LRU_UNEVICTABLE]), - K(global_page_state(NR_MLOCK)), -#ifdef CONFIG_HIGHMEM - K(i.totalhigh), - K(i.freehigh), - K(i.totalram-i.totalhigh), - K(i.freeram-i.freehigh), -#endif -#ifndef CONFIG_MMU - K((unsigned long) atomic_long_read(&mmap_pages_allocated)), -#endif - K(i.totalswap), - K(i.freeswap), - K(global_node_page_state(NR_FILE_DIRTY)), - K(global_node_page_state(NR_WRITEBACK)), - K(global_node_page_state(NR_ANON_MAPPED)), - K(global_node_page_state(NR_FILE_MAPPED)), - K(i.sharedram), - K(global_page_state(NR_SLAB_RECLAIMABLE) + - global_page_state(NR_SLAB_UNRECLAIMABLE)), - K(global_page_state(NR_SLAB_RECLAIMABLE)), - K(global_page_state(NR_SLAB_UNRECLAIMABLE)), - 
global_page_state(NR_KERNEL_STACK_KB), - K(global_page_state(NR_PAGETABLE)), -#ifdef CONFIG_QUICKLIST - K(quicklist_total_size()), -#endif - K(global_node_page_state(NR_UNSTABLE_NFS)), - K(global_page_state(NR_BOUNCE)), - K(global_node_page_state(NR_WRITEBACK_TEMP)), - K(vm_commit_limit()), - K(committed), - (unsigned long)VMALLOC_TOTAL >> 10, - 0ul, // used to be vmalloc 'used' - 0ul // used to be vmalloc 'largest_chunk' -#ifdef CONFIG_MEMORY_FAILURE - , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) -#endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - , K(global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR) - , K(global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR) - , K(global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR) -#endif -#ifdef CONFIG_CMA - , K(totalcma_pages) - , K(global_page_state(NR_FREE_CMA_PAGES)) -#endif - ); hugetlb_report_meminfo(m); arch_report_meminfo(m); return 0; -#undef K } static int meminfo_proc_open(struct inode *inode, struct file *file) From 7abbaf94049914f074306d960b0f968ffe52e59f Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 7 Oct 2016 17:02:26 -0700 Subject: [PATCH 111/127] proc: relax /proc//timerslack_ns capability requirements When an interface to allow a task to change another task's timerslack was first proposed, it was suggested that something greater than CAP_SYS_NICE would be needed, as a task could be delayed further than what normally could be done with nice adjustments. So CAP_SYS_PTRACE was adopted instead for what became the /proc//timerslack_ns interface. However, for Android (where this feature originates), giving the system_server CAP_SYS_PTRACE would allow it to observe and modify all tasks' memory. This is considered too high a privilege level for only needing to change the timerslack. After some discussion, it was realized that a CAP_SYS_NICE process can set a task as SCHED_FIFO, so they could fork some spinning processes and set them all SCHED_FIFO 99, in effect delaying all other tasks for an infinite amount of time. So as a CAP_SYS_NICE task can already cause trouble for other tasks, using it as a required capability for accessing and modifying /proc//timerslack_ns seems sufficient. Thus, this patch loosens the capability requirements to CAP_SYS_NICE and removes CAP_SYS_PTRACE, simplifying some of the code flow as well. This is technically an ABI change, but as the feature just landed in 4.6, I suspect no one is yet using it. Link: http://lkml.kernel.org/r/1469132667-17377-1-git-send-email-john.stultz@linaro.org Signed-off-by: John Stultz Reviewed-by: Nick Kralevich Acked-by: Serge Hallyn Acked-by: Kees Cook Cc: Kees Cook Cc: "Serge E. 
Hallyn" Cc: Thomas Gleixner Cc: Arjan van de Ven Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Todd Kjos Cc: Colin Cross Cc: Nick Kralevich Cc: Dmitry Shmidt Cc: Elliott Hughes Cc: Android Kernel Team Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 3b792ab3c0dc..248f008d46b8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2280,16 +2280,19 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, if (!p) return -ESRCH; - if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) { - task_lock(p); - if (slack_ns == 0) - p->timer_slack_ns = p->default_timer_slack_ns; - else - p->timer_slack_ns = slack_ns; - task_unlock(p); - } else + if (!capable(CAP_SYS_NICE)) { count = -EPERM; + goto out; + } + task_lock(p); + if (slack_ns == 0) + p->timer_slack_ns = p->default_timer_slack_ns; + else + p->timer_slack_ns = slack_ns; + task_unlock(p); + +out: put_task_struct(p); return count; @@ -2299,19 +2302,22 @@ static int timerslack_ns_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct task_struct *p; - int err = 0; + int err = 0; p = get_proc_task(inode); if (!p) return -ESRCH; - if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) { - task_lock(p); - seq_printf(m, "%llu\n", p->timer_slack_ns); - task_unlock(p); - } else + if (!capable(CAP_SYS_NICE)) { err = -EPERM; + goto out; + } + task_lock(p); + seq_printf(m, "%llu\n", p->timer_slack_ns); + task_unlock(p); + +out: put_task_struct(p); return err; From 904763e1fb5eebf8249ec41a2019e5e32246df2f Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 7 Oct 2016 17:02:29 -0700 Subject: [PATCH 112/127] proc: add LSM hook checks to /proc//timerslack_ns As requested, this patch checks the existing LSM hooks task_getscheduler/task_setscheduler when reading or modifying the task's timerslack value. Previous versions added new get/settimerslack LSM hooks, but since they checked the same PROCESS__SET/GETSCHED values as existing hooks, it was suggested we just use the existing ones. Link: http://lkml.kernel.org/r/1469132667-17377-2-git-send-email-john.stultz@linaro.org Signed-off-by: John Stultz Cc: Kees Cook Cc: "Serge E. 
Hallyn" Cc: Thomas Gleixner Cc: Arjan van de Ven Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Todd Kjos Cc: Colin Cross Cc: Nick Kralevich Cc: Dmitry Shmidt Cc: Elliott Hughes Cc: James Morris Cc: Android Kernel Team Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/proc/base.c b/fs/proc/base.c index 248f008d46b8..ebccdc192830 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2285,6 +2285,12 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, goto out; } + err = security_task_setscheduler(p); + if (err) { + count = err; + goto out; + } + task_lock(p); if (slack_ns == 0) p->timer_slack_ns = p->default_timer_slack_ns; @@ -2313,6 +2319,10 @@ static int timerslack_ns_show(struct seq_file *m, void *v) goto out; } + err = security_task_getscheduler(p); + if (err) + goto out; + task_lock(p); seq_printf(m, "%llu\n", p->timer_slack_ns); task_unlock(p); From 4b2bd5fec007a4fd3fc82474b9199af25013de4c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 7 Oct 2016 17:02:33 -0700 Subject: [PATCH 113/127] proc: fix timerslack_ns CAP_SYS_NICE check when adjusting self In changing from checking ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS) to capable(CAP_SYS_NICE), I missed that ptrace_my_access succeeds when p == current, but the CAP_SYS_NICE doesn't. Thus while the previous commit was intended to loosen the needed privileges to modify a processes timerslack, it needlessly restricted a task modifying its own timerslack via the proc//timerslack_ns (which is permitted also via the PR_SET_TIMERSLACK method). This patch corrects this by checking if p == current before checking the CAP_SYS_NICE value. This patch applies on top of my two previous patches currently in -mm Link: http://lkml.kernel.org/r/1471906870-28624-1-git-send-email-john.stultz@linaro.org Signed-off-by: John Stultz Acked-by: Kees Cook Cc: "Serge E. 
Hallyn" Cc: Thomas Gleixner Cc: Arjan van de Ven Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Todd Kjos Cc: Colin Cross Cc: Nick Kralevich Cc: Dmitry Shmidt Cc: Elliott Hughes Cc: Android Kernel Team Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index ebccdc192830..dc7fe5f3a53c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2280,15 +2280,17 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, if (!p) return -ESRCH; - if (!capable(CAP_SYS_NICE)) { - count = -EPERM; - goto out; - } + if (p != current) { + if (!capable(CAP_SYS_NICE)) { + count = -EPERM; + goto out; + } - err = security_task_setscheduler(p); - if (err) { - count = err; - goto out; + err = security_task_setscheduler(p); + if (err) { + count = err; + goto out; + } } task_lock(p); @@ -2314,14 +2316,16 @@ static int timerslack_ns_show(struct seq_file *m, void *v) if (!p) return -ESRCH; - if (!capable(CAP_SYS_NICE)) { - err = -EPERM; - goto out; - } + if (p != current) { - err = security_task_getscheduler(p); - if (err) - goto out; + if (!capable(CAP_SYS_NICE)) { + err = -EPERM; + goto out; + } + err = security_task_getscheduler(p); + if (err) + goto out; + } task_lock(p); seq_printf(m, "%llu\n", p->timer_slack_ns); From 855af072b6c40aeb266f4dc98fd9a6a49edf22af Mon Sep 17 00:00:00 2001 From: Robert Ho Date: Fri, 7 Oct 2016 17:02:36 -0700 Subject: [PATCH 114/127] mm, proc: fix region lost in /proc/self/smaps Recently, Redhat reported that nvml test suite failed on QEMU/KVM, more detailed info please refer to: https://bugzilla.redhat.com/show_bug.cgi?id=1365721 Actually, this bug is not only for NVDIMM/DAX but also for any other file systems. 
This simple test case abstracted from nvml can easily reproduce this bug in common environment: -------------------------- testcase.c ----------------------------- int is_pmem_proc(const void *addr, size_t len) { const char *caddr = addr; FILE *fp; if ((fp = fopen("/proc/self/smaps", "r")) == NULL) { printf("!/proc/self/smaps"); return 0; } int retval = 0; /* assume false until proven otherwise */ char line[PROCMAXLEN]; /* for fgets() */ char *lo = NULL; /* beginning of current range in smaps file */ char *hi = NULL; /* end of current range in smaps file */ int needmm = 0; /* looking for mm flag for current range */ while (fgets(line, PROCMAXLEN, fp) != NULL) { static const char vmflags[] = "VmFlags:"; static const char mm[] = " wr"; /* check for range line */ if (sscanf(line, "%p-%p", &lo, &hi) == 2) { if (needmm) { /* last range matched, but no mm flag found */ printf("never found mm flag.\n"); break; } else if (caddr < lo) { /* never found the range for caddr */ printf("#######no match for addr %p.\n", caddr); break; } else if (caddr < hi) { /* start address is in this range */ size_t rangelen = (size_t)(hi - caddr); /* remember that matching has started */ needmm = 1; /* calculate remaining range to search for */ if (len > rangelen) { len -= rangelen; caddr += rangelen; printf("matched %zu bytes in range " "%p-%p, %zu left over.\n", rangelen, lo, hi, len); } else { len = 0; printf("matched all bytes in range " "%p-%p.\n", lo, hi); } } } else if (needmm && strncmp(line, vmflags, sizeof(vmflags) - 1) == 0) { if (strstr(&line[sizeof(vmflags) - 1], mm) != NULL) { printf("mm flag found.\n"); if (len == 0) { /* entire range matched */ retval = 1; break; } needmm = 0; /* saw what was needed */ } else { /* mm flag not set for some or all of range */ printf("range has no mm flag.\n"); break; } } } fclose(fp); printf("returning %d.\n", retval); return retval; } void *Addr; size_t Size; /* * worker -- the work each thread performs */ static void * worker(void *arg) { int *ret = (int *)arg; *ret = is_pmem_proc(Addr, Size); return NULL; } int main(int argc, char *argv[]) { if (argc < 2 || argc > 3) { printf("usage: %s file [env].\n", argv[0]); return -1; } int fd = open(argv[1], O_RDWR); struct stat stbuf; fstat(fd, &stbuf); Size = stbuf.st_size; Addr = mmap(0, stbuf.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); close(fd); pthread_t threads[NTHREAD]; int ret[NTHREAD]; /* kick off NTHREAD threads */ for (int i = 0; i < NTHREAD; i++) pthread_create(&threads[i], NULL, worker, &ret[i]); /* wait for all the threads to complete */ for (int i = 0; i < NTHREAD; i++) pthread_join(threads[i], NULL); /* verify that all the threads return the same value */ for (int i = 1; i < NTHREAD; i++) { if (ret[0] != ret[i]) { printf("Error i %d ret[0] = %d ret[i] = %d.\n", i, ret[0], ret[i]); } } printf("%d", ret[0]); return 0; } It failed as some threads can not find the memory region in "/proc/self/smaps" which is allocated in the main process It is caused by proc fs which uses 'file->version' to indicate the VMA that is the last one has already been handled by read() system call. 
When the next read() issues, it uses the 'version' to find the VMA, then the next VMA is what we want to handle, the related code is as follows: if (last_addr) { vma = find_vma(mm, last_addr); if (vma && (vma = m_next_vma(priv, vma))) return vma; } However, VMA will be lost if the last VMA is gone, e.g: The process VMA list is A->B->C->D CPU 0 CPU 1 read() system call handle VMA B version = B return to userspace unmap VMA B issue read() again to continue to get the region info find_vma(version) will get VMA C m_next_vma(C) will get VMA D handle D !!! VMA C is lost !!! In order to fix this bug, we make 'file->version' indicate the end address of the current VMA. m_start will then look up a vma which with vma_start < last_vm_end and moves on to the next vma if we found the same or an overlapping vma. This will guarantee that we will not miss an exclusive vma but we can still miss one if the previous vma was shrunk. This is acceptable because guaranteeing "never miss a vma" is simply not feasible. User has to cope with some inconsistencies if the file is not read in one go. [mhocko@suse.com: changelog fixes] Link: http://lkml.kernel.org/r/1475296958-27652-1-git-send-email-robert.hu@intel.com Acked-by: Dave Hansen Signed-off-by: Xiao Guangrong Signed-off-by: Robert Hu Acked-by: Michal Hocko Acked-by: Oleg Nesterov Cc: Paolo Bonzini Cc: Dan Williams Cc: Gleb Natapov Cc: Marcelo Tosatti Cc: Stefan Hajnoczi Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index d2a70cf2154e..6909582ce5e5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -147,7 +147,7 @@ m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma) static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma) { if (m->count < m->size) /* vma is copied successfully */ - m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL; + m->version = m_next_vma(m->private, vma) ? vma->vm_end : -1UL; } static void *m_start(struct seq_file *m, loff_t *ppos) @@ -175,8 +175,10 @@ static void *m_start(struct seq_file *m, loff_t *ppos) priv->tail_vma = get_gate_vma(mm); if (last_addr) { - vma = find_vma(mm, last_addr); - if (vma && (vma = m_next_vma(priv, vma))) + vma = find_vma(mm, last_addr - 1); + if (vma && vma->vm_start <= last_addr) + vma = m_next_vma(priv, vma); + if (vma) return vma; } From 53aeee7a86620b4dca81f6b807b37f36e7f99b09 Mon Sep 17 00:00:00 2001 From: Robert Ho Date: Fri, 7 Oct 2016 17:02:39 -0700 Subject: [PATCH 115/127] Documentation/filesystems/proc.txt: add more description for maps/smaps Add some more description on the limitations for smaps/maps readings, as well as some guaruntees we can make. Link: http://lkml.kernel.org/r/1475296958-27652-2-git-send-email-robert.hu@intel.com Signed-off-by: Robert Ho Acked-by: Michal Hocko Cc: Dave Hansen Cc: Xiao Guangrong Cc: Robert Hu Cc: Oleg Nesterov Cc: Paolo Bonzini Cc: Dan Williams Cc: Gleb Natapov Cc: Marcelo Tosatti Cc: Stefan Hajnoczi Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index fcc1ac094282..219ffd41a911 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -515,6 +515,18 @@ be vanished or the reverse -- new added. 
This file is only present if the CONFIG_MMU kernel configuration option is enabled. +Note: reading /proc/PID/maps or /proc/PID/smaps is inherently racy (consistent +output can be achieved only in the single read call). +This typically manifests when doing partial reads of these files while the +memory map is being modified. Despite the races, we do provide the following +guarantees: + +1) The mapped addresses never go backwards, which implies no two + regions will ever overlap. +2) If there is something at a given vaddr during the entirety of the + life of the smaps/maps walk, there will be some output for it. + + The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG bits on both physical and virtual pages associated with a process, and the soft-dirty bit on pte (see Documentation/vm/soft-dirty.txt for details). From 589a9785ee3a7cb85f1dedc3dad1c9754c691880 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 7 Oct 2016 17:02:42 -0700 Subject: [PATCH 116/127] min/max: remove sparse warnings when they're nested Currently, when min/max are nested within themselves, sparse will warn: warning: symbol '_min1' shadows an earlier one originally declared here warning: symbol '_min1' shadows an earlier one originally declared here warning: symbol '_min2' shadows an earlier one originally declared here This also immediately happens when min3() or max3() are used. Since sparse implements __COUNTER__, we can use __UNIQUE_ID() to generate unique variable names, avoiding this. Link: http://lkml.kernel.org/r/1471519773-29882-1-git-send-email-johannes@sipsolutions.net Signed-off-by: Johannes Berg Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 44 +++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 74fd6f05bc5b..bc6ed52a39b9 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -733,17 +733,25 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } * strict type-checking.. See the * "unnecessary" pointer comparison. */ -#define min(x, y) ({ \ - typeof(x) _min1 = (x); \ - typeof(y) _min2 = (y); \ - (void) (&_min1 == &_min2); \ - _min1 < _min2 ? _min1 : _min2; }) +#define __min(t1, t2, min1, min2, x, y) ({ \ + t1 min1 = (x); \ + t2 min2 = (y); \ + (void) (&min1 == &min2); \ + min1 < min2 ? min1 : min2; }) +#define min(x, y) \ + __min(typeof(x), typeof(y), \ + __UNIQUE_ID(min1_), __UNIQUE_ID(min2_), \ + x, y) -#define max(x, y) ({ \ - typeof(x) _max1 = (x); \ - typeof(y) _max2 = (y); \ - (void) (&_max1 == &_max2); \ - _max1 > _max2 ? _max1 : _max2; }) +#define __max(t1, t2, max1, max2, x, y) ({ \ + t1 max1 = (x); \ + t2 max2 = (y); \ + (void) (&max1 == &max2); \ + max1 > max2 ? max1 : max2; }) +#define max(x, y) \ + __max(typeof(x), typeof(y), \ + __UNIQUE_ID(max1_), __UNIQUE_ID(max2_), \ + x, y) #define min3(x, y, z) min((typeof(x))min(x, y), z) #define max3(x, y, z) max((typeof(x))max(x, y), z) @@ -775,15 +783,15 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } * * Or not use min/max/clamp at all, of course. */ -#define min_t(type, x, y) ({ \ - type __min1 = (x); \ - type __min2 = (y); \ - __min1 < __min2 ? __min1: __min2; }) +#define min_t(type, x, y) \ + __min(type, type, \ + __UNIQUE_ID(min1_), __UNIQUE_ID(min2_), \ + x, y) -#define max_t(type, x, y) ({ \ - type __max1 = (x); \ - type __max2 = (y); \ - __max1 > __max2 ? 
__max1: __max2; }) +#define max_t(type, x, y) \ + __max(type, type, \ + __UNIQUE_ID(min1_), __UNIQUE_ID(min2_), \ + x, y) /** * clamp_t - return a value clamped to a given range using a given type From 9a01c3ed5cdb35d9004eb92510ee6ea11b4a5f16 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Fri, 7 Oct 2016 17:02:45 -0700 Subject: [PATCH 117/127] nmi_backtrace: add more trigger_*_cpu_backtrace() methods Patch series "improvements to the nmi_backtrace code" v9. This patch series modifies the trigger_xxx_backtrace() NMI-based remote backtracing code to make it more flexible, and makes a few small improvements along the way. The motivation comes from the task isolation code, where there are scenarios where we want to be able to diagnose a case where some cpu is about to interrupt a task-isolated cpu. It can be helpful to see both where the interrupting cpu is, and also an approximation of where the cpu that is being interrupted is. The nmi_backtrace framework allows us to discover the stack of the interrupted cpu. I've tested that the change works as desired on tile, and build-tested x86, arm, mips, and sparc64. For x86 I confirmed that the generic cpuidle stuff as well as the architecture-specific routines are in the new cpuidle section. For arm, mips, and sparc I just build-tested it and made sure the generic cpuidle routines were in the new cpuidle section, but I didn't attempt to figure out which the platform-specific idle routines might be. That might be more usefully done by someone with platform experience in follow-up patches. This patch (of 4): Currently you can only request a backtrace of either all cpus, or all cpus but yourself. It can also be helpful to request a remote backtrace of a single cpu, and since we want that, the logical extension is to support a cpumask as the underlying primitive. This change modifies the existing lib/nmi_backtrace.c code to take a cpumask as its basic primitive, and modifies the linux/nmi.h code to use the new "cpumask" method instead. The existing clients of nmi_backtrace (arm and x86) are converted to using the new cpumask approach in this change. The other users of the backtracing API (sparc64 and mips) are converted to use the cpumask approach rather than the all/allbutself approach. The mips code ignored the "include_self" boolean but with this change it will now also dump a local backtrace if requested. Link: http://lkml.kernel.org/r/1472487169-14923-2-git-send-email-cmetcalf@mellanox.com Signed-off-by: Chris Metcalf Tested-by: Daniel Thompson [arm] Reviewed-by: Aaron Tomlin Reviewed-by: Petr Mladek Cc: "Rafael J. 
Wysocki" Cc: Russell King Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Ralf Baechle Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/irq.h | 5 +++-- arch/arm/kernel/smp.c | 4 ++-- arch/mips/include/asm/irq.h | 5 +++-- arch/mips/kernel/process.c | 11 +++++++++-- arch/sparc/include/asm/irq_64.h | 5 +++-- arch/sparc/kernel/process_64.c | 10 +++++----- arch/x86/include/asm/irq.h | 5 +++-- arch/x86/kernel/apic/hw_nmi.c | 18 +++++++++--------- include/linux/nmi.h | 31 ++++++++++++++++++++++++++----- lib/nmi_backtrace.c | 17 +++++++++-------- 10 files changed, 72 insertions(+), 39 deletions(-) diff --git a/arch/arm/include/asm/irq.h b/arch/arm/include/asm/irq.h index 1bd9510de1b9..e53638c8ed8a 100644 --- a/arch/arm/include/asm/irq.h +++ b/arch/arm/include/asm/irq.h @@ -36,8 +36,9 @@ extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); #endif #ifdef CONFIG_SMP -extern void arch_trigger_all_cpu_backtrace(bool); -#define arch_trigger_all_cpu_backtrace(x) arch_trigger_all_cpu_backtrace(x) +extern void arch_trigger_cpumask_backtrace(const cpumask_t *mask, + bool exclude_self); +#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace #endif static inline int nr_legacy_irqs(void) diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 937c8920d741..5abc5697e4e5 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -760,7 +760,7 @@ static void raise_nmi(cpumask_t *mask) smp_cross_call(mask, IPI_CPU_BACKTRACE); } -void arch_trigger_all_cpu_backtrace(bool include_self) +void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self) { - nmi_trigger_all_cpu_backtrace(include_self, raise_nmi); + nmi_trigger_cpumask_backtrace(mask, exclude_self, raise_nmi); } diff --git a/arch/mips/include/asm/irq.h b/arch/mips/include/asm/irq.h index 15e0fecbc300..6bf10e796553 100644 --- a/arch/mips/include/asm/irq.h +++ b/arch/mips/include/asm/irq.h @@ -51,7 +51,8 @@ extern int cp0_fdc_irq; extern int get_c0_fdc_int(void); -void arch_trigger_all_cpu_backtrace(bool); -#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace +void arch_trigger_cpumask_backtrace(const struct cpumask *mask, + bool exclude_self); +#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace #endif /* _ASM_IRQ_H */ diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index d2d061520a23..9514e5f2209f 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -569,9 +569,16 @@ static void arch_dump_stack(void *info) dump_stack(); } -void arch_trigger_all_cpu_backtrace(bool include_self) +void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self) { - smp_call_function(arch_dump_stack, NULL, 1); + long this_cpu = get_cpu(); + + if (cpumask_test_cpu(this_cpu, mask) && !exclude_self) + dump_stack(); + + smp_call_function_many(mask, arch_dump_stack, NULL, 1); + + put_cpu(); } int mips_get_process_fp_mode(struct task_struct *task) diff --git a/arch/sparc/include/asm/irq_64.h b/arch/sparc/include/asm/irq_64.h index 3f70f900e834..1d51a11fb261 100644 --- a/arch/sparc/include/asm/irq_64.h +++ b/arch/sparc/include/asm/irq_64.h @@ -86,8 +86,9 @@ static inline unsigned long get_softint(void) return retval; } -void arch_trigger_all_cpu_backtrace(bool); -#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace +void arch_trigger_cpumask_backtrace(const struct cpumask *mask, + bool exclude_self); +#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace 
extern void *hardirq_stack[NR_CPUS]; extern void *softirq_stack[NR_CPUS]; diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index fa14402b33f9..47ff5588e521 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -239,7 +239,7 @@ static void __global_reg_poll(struct global_reg_snapshot *gp) } } -void arch_trigger_all_cpu_backtrace(bool include_self) +void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self) { struct thread_info *tp = current_thread_info(); struct pt_regs *regs = get_irq_regs(); @@ -255,15 +255,15 @@ void arch_trigger_all_cpu_backtrace(bool include_self) memset(global_cpu_snapshot, 0, sizeof(global_cpu_snapshot)); - if (include_self) + if (cpumask_test_cpu(this_cpu, mask) && !exclude_self) __global_reg_self(tp, regs, this_cpu); smp_fetch_global_regs(); - for_each_online_cpu(cpu) { + for_each_cpu(cpu, mask) { struct global_reg_snapshot *gp; - if (!include_self && cpu == this_cpu) + if (exclude_self && cpu == this_cpu) continue; gp = &global_cpu_snapshot[cpu].reg; @@ -300,7 +300,7 @@ void arch_trigger_all_cpu_backtrace(bool include_self) static void sysrq_handle_globreg(int key) { - arch_trigger_all_cpu_backtrace(true); + trigger_all_cpu_backtrace(); } static struct sysrq_key_op sparc_globalreg_op = { diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index e7de5c9a4fbd..16d3fa211962 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -50,8 +50,9 @@ extern int vector_used_by_percpu_irq(unsigned int vector); extern void init_ISA_irqs(void); #ifdef CONFIG_X86_LOCAL_APIC -void arch_trigger_all_cpu_backtrace(bool); -#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace +void arch_trigger_cpumask_backtrace(const struct cpumask *mask, + bool exclude_self); +#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace #endif #endif /* _ASM_X86_IRQ_H */ diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index f29501e1a5c1..c73c9fb281e1 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -26,32 +26,32 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh) } #endif -#ifdef arch_trigger_all_cpu_backtrace +#ifdef arch_trigger_cpumask_backtrace static void nmi_raise_cpu_backtrace(cpumask_t *mask) { apic->send_IPI_mask(mask, NMI_VECTOR); } -void arch_trigger_all_cpu_backtrace(bool include_self) +void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self) { - nmi_trigger_all_cpu_backtrace(include_self, nmi_raise_cpu_backtrace); + nmi_trigger_cpumask_backtrace(mask, exclude_self, + nmi_raise_cpu_backtrace); } -static int -arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) +static int nmi_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) { if (nmi_cpu_backtrace(regs)) return NMI_HANDLED; return NMI_DONE; } -NOKPROBE_SYMBOL(arch_trigger_all_cpu_backtrace_handler); +NOKPROBE_SYMBOL(nmi_cpu_backtrace_handler); -static int __init register_trigger_all_cpu_backtrace(void) +static int __init register_nmi_cpu_backtrace_handler(void) { - register_nmi_handler(NMI_LOCAL, arch_trigger_all_cpu_backtrace_handler, + register_nmi_handler(NMI_LOCAL, nmi_cpu_backtrace_handler, 0, "arch_bt"); return 0; } -early_initcall(register_trigger_all_cpu_backtrace); +early_initcall(register_nmi_cpu_backtrace_handler); #endif diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 4630eeae18e0..a78c35cff1ae 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ 
-35,21 +35,34 @@ static inline void hardlockup_detector_disable(void) {} * base function. Return whether such support was available, * to allow calling code to fall back to some other mechanism: */ -#ifdef arch_trigger_all_cpu_backtrace +#ifdef arch_trigger_cpumask_backtrace static inline bool trigger_all_cpu_backtrace(void) { - arch_trigger_all_cpu_backtrace(true); - + arch_trigger_cpumask_backtrace(cpu_online_mask, false); return true; } + static inline bool trigger_allbutself_cpu_backtrace(void) { - arch_trigger_all_cpu_backtrace(false); + arch_trigger_cpumask_backtrace(cpu_online_mask, true); + return true; +} + +static inline bool trigger_cpumask_backtrace(struct cpumask *mask) +{ + arch_trigger_cpumask_backtrace(mask, false); + return true; +} + +static inline bool trigger_single_cpu_backtrace(int cpu) +{ + arch_trigger_cpumask_backtrace(cpumask_of(cpu), false); return true; } /* generic implementation */ -void nmi_trigger_all_cpu_backtrace(bool include_self, +void nmi_trigger_cpumask_backtrace(const cpumask_t *mask, + bool exclude_self, void (*raise)(cpumask_t *mask)); bool nmi_cpu_backtrace(struct pt_regs *regs); @@ -62,6 +75,14 @@ static inline bool trigger_allbutself_cpu_backtrace(void) { return false; } +static inline bool trigger_cpumask_backtrace(struct cpumask *mask) +{ + return false; +} +static inline bool trigger_single_cpu_backtrace(int cpu) +{ + return false; +} #endif #ifdef CONFIG_LOCKUP_DETECTOR diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index 26caf51cc238..df347e355267 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -17,20 +17,21 @@ #include #include -#ifdef arch_trigger_all_cpu_backtrace +#ifdef arch_trigger_cpumask_backtrace /* For reliability, we're prepared to waste bits here. */ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; -/* "in progress" flag of arch_trigger_all_cpu_backtrace */ +/* "in progress" flag of arch_trigger_cpumask_backtrace */ static unsigned long backtrace_flag; /* - * When raise() is called it will be is passed a pointer to the + * When raise() is called it will be passed a pointer to the * backtrace_mask. Architectures that call nmi_cpu_backtrace() * directly from their raise() functions may rely on the mask * they are passed being updated as a side effect of this call. */ -void nmi_trigger_all_cpu_backtrace(bool include_self, +void nmi_trigger_cpumask_backtrace(const cpumask_t *mask, + bool exclude_self, void (*raise)(cpumask_t *mask)) { int i, this_cpu = get_cpu(); @@ -44,13 +45,13 @@ void nmi_trigger_all_cpu_backtrace(bool include_self, return; } - cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); - if (!include_self) + cpumask_copy(to_cpumask(backtrace_mask), mask); + if (exclude_self) cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask)); if (!cpumask_empty(to_cpumask(backtrace_mask))) { - pr_info("Sending NMI to %s CPUs:\n", - (include_self ? "all" : "other")); + pr_info("Sending NMI from CPU %d to CPUs %*pbl:\n", + this_cpu, nr_cpumask_bits, to_cpumask(backtrace_mask)); raise(to_cpumask(backtrace_mask)); } From 677664895278267a80bda0e3b26821d60cdbebf5 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Fri, 7 Oct 2016 17:02:49 -0700 Subject: [PATCH 118/127] nmi_backtrace: do a local dump_stack() instead of a self-NMI Currently on arm there is code that checks whether it should call dump_stack() explicitly, to avoid trying to raise an NMI when the current context is not preemptible by the backtrace IPI. 
Similarly, the forthcoming arch/tile support uses an IPI mechanism that does not support generating an NMI to self. Accordingly, move the code that guards this case into the generic mechanism, and invoke it unconditionally whenever we want a backtrace of the current cpu. It seems plausible that in all cases, dump_stack() will generate better information than generating a stack from the NMI handler. The register state will be missing, but that state is likely not particularly helpful in any case. Or, if we think it is helpful, we should be capturing and emitting the current register state in all cases when regs == NULL is passed to nmi_cpu_backtrace(). Link: http://lkml.kernel.org/r/1472487169-14923-3-git-send-email-cmetcalf@mellanox.com Signed-off-by: Chris Metcalf Tested-by: Daniel Thompson [arm] Reviewed-by: Petr Mladek Acked-by: Aaron Tomlin Cc: "Rafael J. Wysocki" Cc: Russell King Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/smp.c | 9 --------- lib/nmi_backtrace.c | 9 +++++++++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 5abc5697e4e5..7dd14e8395e6 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -748,15 +748,6 @@ core_initcall(register_cpufreq_notifier); static void raise_nmi(cpumask_t *mask) { - /* - * Generate the backtrace directly if we are running in a calling - * context that is not preemptible by the backtrace IPI. Note - * that nmi_cpu_backtrace() automatically removes the current cpu - * from mask. - */ - if (cpumask_test_cpu(smp_processor_id(), mask) && irqs_disabled()) - nmi_cpu_backtrace(NULL); - smp_cross_call(mask, IPI_CPU_BACKTRACE); } diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index df347e355267..393a3cca1f47 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -49,6 +49,15 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask, if (exclude_self) cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask)); + /* + * Don't try to send an NMI to this cpu; it may work on some + * architectures, but on others it may not, and we'll get + * information at least as useful just by doing a dump_stack() here. + * Note that nmi_cpu_backtrace(NULL) will clear the cpu bit. + */ + if (cpumask_test_cpu(this_cpu, to_cpumask(backtrace_mask))) + nmi_cpu_backtrace(NULL); + if (!cpumask_empty(to_cpumask(backtrace_mask))) { pr_info("Sending NMI from CPU %d to CPUs %*pbl:\n", this_cpu, nr_cpumask_bits, to_cpumask(backtrace_mask)); From 511f8389454e55ece5115dc3bc84a0947788ff4f Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Fri, 7 Oct 2016 17:02:52 -0700 Subject: [PATCH 119/127] arch/tile: adopt the new nmi_backtrace framework Previously tile was rolling its own method of capturing backtrace data in the NMI handlers, but it was relying on running printk() from the NMI handler, which is not always safe. So adopt the nmi_backtrace model (with the new cpumask extension) instead. So we can call the nmi_backtrace code directly from the nmi handler, move the nmi_enter()/exit() into the top-level tile NMI handler. The semantics of the routine change slightly since it is now synchronous with the remote cores completing the backtraces. Previously it was asynchronous, but with protection to avoid starting a new remote backtrace if the old one was still in progress. 
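For reference only (not part of the patch): a condensed, kernel-context sketch of the shape an architecture ends up with after adopting the generic helper, drawn from the x86 and tile hunks in this series. Only arch_trigger_cpumask_backtrace(), nmi_trigger_cpumask_backtrace() and nmi_cpu_backtrace() are names from the series; the raise callback and NMI handler names are placeholders.

#include <linux/nmi.h>
#include <linux/cpumask.h>

/* Placeholder: arch-specific delivery of the backtrace NMI/IPI to @mask. */
static void arch_raise_backtrace_ipi(cpumask_t *mask)
{
	/* e.g. apic->send_IPI_mask(mask, NMI_VECTOR) on x86 */
}

void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
{
	/* The generic code manages the mask, serialization and timeout. */
	nmi_trigger_cpumask_backtrace(mask, exclude_self,
				      arch_raise_backtrace_ipi);
}

/* Called from the architecture's NMI handler (placeholder name): */
static void arch_backtrace_nmi_handler(struct pt_regs *regs)
{
	if (nmi_cpu_backtrace(regs))
		return;	/* this cpu was asked for a backtrace and dumped it */
	/* otherwise: not a backtrace request, handle the NMI normally */
}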
Link: http://lkml.kernel.org/r/1472487169-14923-4-git-send-email-cmetcalf@mellanox.com Signed-off-by: Chris Metcalf Cc: Daniel Thompson [arm] Cc: Petr Mladek Cc: Aaron Tomlin Cc: Peter Zijlstra (Intel) Cc: "Rafael J. Wysocki" Cc: Russell King Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/tile/include/asm/irq.h | 5 ++- arch/tile/kernel/pmc.c | 3 -- arch/tile/kernel/process.c | 73 +++++++++---------------------------- arch/tile/kernel/traps.c | 9 ++++- 4 files changed, 27 insertions(+), 63 deletions(-) diff --git a/arch/tile/include/asm/irq.h b/arch/tile/include/asm/irq.h index 84a924034bdb..1fa1f2544ff9 100644 --- a/arch/tile/include/asm/irq.h +++ b/arch/tile/include/asm/irq.h @@ -79,8 +79,9 @@ void tile_irq_activate(unsigned int irq, int tile_irq_type); void setup_irq_regs(void); #ifdef __tilegx__ -void arch_trigger_all_cpu_backtrace(bool self); -#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace +void arch_trigger_cpumask_backtrace(const struct cpumask *mask, + bool exclude_self); +#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace #endif #endif /* _ASM_TILE_IRQ_H */ diff --git a/arch/tile/kernel/pmc.c b/arch/tile/kernel/pmc.c index db62cc34b955..81cf8743a3f3 100644 --- a/arch/tile/kernel/pmc.c +++ b/arch/tile/kernel/pmc.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -29,9 +28,7 @@ int handle_perf_interrupt(struct pt_regs *regs, int fault) if (!perf_irq) panic("Unexpected PERF_COUNT interrupt %d\n", fault); - nmi_enter(); retval = perf_irq(regs, fault); - nmi_exit(); return retval; } diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index a465d8372edd..9f37106ef93a 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include @@ -594,66 +594,18 @@ void show_regs(struct pt_regs *regs) tile_show_stack(&kbt); } -/* To ensure stack dump on tiles occurs one by one. */ -static DEFINE_SPINLOCK(backtrace_lock); -/* To ensure no backtrace occurs before all of the stack dump are done. */ -static atomic_t backtrace_cpus; -/* The cpu mask to avoid reentrance. */ -static struct cpumask backtrace_mask; - -void do_nmi_dump_stack(struct pt_regs *regs) -{ - int is_idle = is_idle_task(current) && !in_interrupt(); - int cpu; - - nmi_enter(); - cpu = smp_processor_id(); - if (WARN_ON_ONCE(!cpumask_test_and_clear_cpu(cpu, &backtrace_mask))) - goto done; - - spin_lock(&backtrace_lock); - if (is_idle) - pr_info("CPU: %d idle\n", cpu); - else - show_regs(regs); - spin_unlock(&backtrace_lock); - atomic_dec(&backtrace_cpus); -done: - nmi_exit(); -} - #ifdef __tilegx__ -void arch_trigger_all_cpu_backtrace(bool self) +void nmi_raise_cpu_backtrace(struct cpumask *in_mask) { struct cpumask mask; HV_Coord tile; unsigned int timeout; int cpu; - int ongoing; HV_NMI_Info info[NR_CPUS]; - ongoing = atomic_cmpxchg(&backtrace_cpus, 0, num_online_cpus() - 1); - if (ongoing != 0) { - pr_err("Trying to do all-cpu backtrace.\n"); - pr_err("But another all-cpu backtrace is ongoing (%d cpus left)\n", - ongoing); - if (self) { - pr_err("Reporting the stack on this cpu only.\n"); - dump_stack(); - } - return; - } - - cpumask_copy(&mask, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), &mask); - cpumask_copy(&backtrace_mask, &mask); - - /* Backtrace for myself first. */ - if (self) - dump_stack(); - /* Tentatively dump stack on remote tiles via NMI. 
*/ timeout = 100; + cpumask_copy(&mask, in_mask); while (!cpumask_empty(&mask) && timeout) { for_each_cpu(cpu, &mask) { tile.x = cpu_x(cpu); @@ -664,12 +616,17 @@ void arch_trigger_all_cpu_backtrace(bool self) } mdelay(10); + touch_softlockup_watchdog(); timeout--; } - /* Warn about cpus stuck in ICS and decrement their counts here. */ + /* Warn about cpus stuck in ICS. */ if (!cpumask_empty(&mask)) { for_each_cpu(cpu, &mask) { + + /* Clear the bit as if nmi_cpu_backtrace() ran. */ + cpumask_clear_cpu(cpu, in_mask); + switch (info[cpu].result) { case HV_NMI_RESULT_FAIL_ICS: pr_warn("Skipping stack dump of cpu %d in ICS at pc %#llx\n", @@ -680,16 +637,20 @@ void arch_trigger_all_cpu_backtrace(bool self) cpu); break; case HV_ENOSYS: - pr_warn("Hypervisor too old to allow remote stack dumps.\n"); - goto skip_for_each; + WARN_ONCE(1, "Hypervisor too old to allow remote stack dumps.\n"); + break; default: /* should not happen */ pr_warn("Skipping stack dump of cpu %d [%d,%#llx]\n", cpu, info[cpu].result, info[cpu].pc); break; } } -skip_for_each: - atomic_sub(cpumask_weight(&mask), &backtrace_cpus); } } + +void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self) +{ + nmi_trigger_cpumask_backtrace(mask, exclude_self, + nmi_raise_cpu_backtrace); +} #endif /* __tilegx_ */ diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c index 4d9651c5b1ad..39f427bb0de2 100644 --- a/arch/tile/kernel/traps.c +++ b/arch/tile/kernel/traps.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include #include #include @@ -392,14 +394,17 @@ void __kprobes do_trap(struct pt_regs *regs, int fault_num, void do_nmi(struct pt_regs *regs, int fault_num, unsigned long reason) { + nmi_enter(); switch (reason) { +#ifdef arch_trigger_cpumask_backtrace case TILE_NMI_DUMP_STACK: - do_nmi_dump_stack(regs); + nmi_cpu_backtrace(regs); break; +#endif default: panic("Unexpected do_nmi type %ld", reason); - return; } + nmi_exit(); } /* Deprecated function currently only used here. */ From 6727ad9e206cc08b80d8000a4d67f8417e53539d Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Fri, 7 Oct 2016 17:02:55 -0700 Subject: [PATCH 120/127] nmi_backtrace: generate one-line reports for idle cpus When doing an nmi backtrace of many cores, most of which are idle, the output is a little overwhelming and very uninformative. Suppress messages for cpus that are idling when they are interrupted and just emit one line, "NMI backtrace for N skipped: idling at pc 0xNNN". We do this by grouping all the cpuidle code together into a new .cpuidle.text section, and then checking the address of the interrupted PC to see if it lies within that section. This commit suitably tags x86 and tile idle routines, and only adds in the minimal framework for other architectures. Link: http://lkml.kernel.org/r/1472487169-14923-5-git-send-email-cmetcalf@mellanox.com Signed-off-by: Chris Metcalf Acked-by: Peter Zijlstra (Intel) Tested-by: Peter Zijlstra (Intel) Tested-by: Daniel Thompson [arm] Tested-by: Petr Mladek Cc: Aaron Tomlin Cc: Peter Zijlstra (Intel) Cc: "Rafael J. 
Wysocki" Cc: Russell King Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/vmlinux.lds.S | 1 + arch/arc/kernel/vmlinux.lds.S | 1 + arch/arm/kernel/vmlinux-xip.lds.S | 1 + arch/arm/kernel/vmlinux.lds.S | 1 + arch/arm64/kernel/vmlinux.lds.S | 1 + arch/avr32/kernel/vmlinux.lds.S | 1 + arch/blackfin/kernel/vmlinux.lds.S | 1 + arch/c6x/kernel/vmlinux.lds.S | 1 + arch/cris/kernel/vmlinux.lds.S | 1 + arch/frv/kernel/vmlinux.lds.S | 1 + arch/h8300/kernel/vmlinux.lds.S | 1 + arch/hexagon/kernel/vmlinux.lds.S | 1 + arch/ia64/kernel/vmlinux.lds.S | 1 + arch/m32r/kernel/vmlinux.lds.S | 1 + arch/m68k/kernel/vmlinux-nommu.lds | 1 + arch/m68k/kernel/vmlinux-std.lds | 1 + arch/m68k/kernel/vmlinux-sun3.lds | 1 + arch/metag/kernel/vmlinux.lds.S | 1 + arch/microblaze/kernel/vmlinux.lds.S | 1 + arch/mips/kernel/vmlinux.lds.S | 1 + arch/mn10300/kernel/vmlinux.lds.S | 1 + arch/nios2/kernel/vmlinux.lds.S | 1 + arch/openrisc/kernel/vmlinux.lds.S | 1 + arch/parisc/kernel/vmlinux.lds.S | 1 + arch/powerpc/kernel/vmlinux.lds.S | 1 + arch/s390/kernel/vmlinux.lds.S | 1 + arch/score/kernel/vmlinux.lds.S | 1 + arch/sh/kernel/vmlinux.lds.S | 1 + arch/sparc/kernel/vmlinux.lds.S | 1 + arch/tile/kernel/entry.S | 2 +- arch/tile/kernel/vmlinux.lds.S | 1 + arch/um/kernel/dyn.lds.S | 1 + arch/um/kernel/uml.lds.S | 1 + arch/unicore32/kernel/vmlinux.lds.S | 1 + arch/x86/include/asm/irqflags.h | 12 ++++++++---- arch/x86/kernel/acpi/cstate.c | 2 +- arch/x86/kernel/process.c | 4 ++-- arch/x86/kernel/vmlinux.lds.S | 1 + arch/xtensa/kernel/vmlinux.lds.S | 3 +++ drivers/acpi/processor_idle.c | 5 +++-- drivers/cpuidle/driver.c | 5 +++-- drivers/idle/intel_idle.c | 4 ++-- include/asm-generic/vmlinux.lds.h | 6 ++++++ include/linux/cpu.h | 5 +++++ kernel/sched/idle.c | 13 +++++++++++-- lib/nmi_backtrace.c | 16 +++++++++++----- scripts/mod/modpost.c | 2 +- scripts/recordmcount.c | 1 + scripts/recordmcount.pl | 1 + 49 files changed, 93 insertions(+), 22 deletions(-) diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S index 647b84c15382..cebecfb76fbf 100644 --- a/arch/alpha/kernel/vmlinux.lds.S +++ b/arch/alpha/kernel/vmlinux.lds.S @@ -22,6 +22,7 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT *(.fixup) *(.gnu.warning) diff --git a/arch/arc/kernel/vmlinux.lds.S b/arch/arc/kernel/vmlinux.lds.S index 36611072305f..f35ed578e007 100644 --- a/arch/arc/kernel/vmlinux.lds.S +++ b/arch/arc/kernel/vmlinux.lds.S @@ -89,6 +89,7 @@ SECTIONS _text = .; TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT *(.fixup) diff --git a/arch/arm/kernel/vmlinux-xip.lds.S b/arch/arm/kernel/vmlinux-xip.lds.S index cba1ec899a69..7fa487ef7e2f 100644 --- a/arch/arm/kernel/vmlinux-xip.lds.S +++ b/arch/arm/kernel/vmlinux-xip.lds.S @@ -98,6 +98,7 @@ SECTIONS IRQENTRY_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT *(.gnu.warning) diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index d24e5dd2aa7a..f7f55df0bf7b 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -111,6 +111,7 @@ SECTIONS SOFTIRQENTRY_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT HYPERVISOR_TEXT KPROBES_TEXT diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 5ce9b2929e0d..1105aab1e6d6 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -122,6 +122,7 @@ SECTIONS ENTRY_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT HYPERVISOR_TEXT 
diff --git a/arch/avr32/kernel/vmlinux.lds.S b/arch/avr32/kernel/vmlinux.lds.S index a4589176bed5..17f2730eb497 100644 --- a/arch/avr32/kernel/vmlinux.lds.S +++ b/arch/avr32/kernel/vmlinux.lds.S @@ -52,6 +52,7 @@ SECTIONS KPROBES_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT *(.fixup) *(.gnu.warning) diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S index d920b959ff3a..68069a120055 100644 --- a/arch/blackfin/kernel/vmlinux.lds.S +++ b/arch/blackfin/kernel/vmlinux.lds.S @@ -33,6 +33,7 @@ SECTIONS #ifndef CONFIG_SCHEDULE_L1 SCHED_TEXT #endif + CPUIDLE_TEXT LOCK_TEXT IRQENTRY_TEXT SOFTIRQENTRY_TEXT diff --git a/arch/c6x/kernel/vmlinux.lds.S b/arch/c6x/kernel/vmlinux.lds.S index 50bc10f97bcb..a1a5c166bc9b 100644 --- a/arch/c6x/kernel/vmlinux.lds.S +++ b/arch/c6x/kernel/vmlinux.lds.S @@ -70,6 +70,7 @@ SECTIONS _stext = .; TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT IRQENTRY_TEXT SOFTIRQENTRY_TEXT diff --git a/arch/cris/kernel/vmlinux.lds.S b/arch/cris/kernel/vmlinux.lds.S index 7552c2557506..979586261520 100644 --- a/arch/cris/kernel/vmlinux.lds.S +++ b/arch/cris/kernel/vmlinux.lds.S @@ -43,6 +43,7 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT *(.fixup) *(.text.__*) diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S index 7e958d829ec9..aa6e573d57da 100644 --- a/arch/frv/kernel/vmlinux.lds.S +++ b/arch/frv/kernel/vmlinux.lds.S @@ -63,6 +63,7 @@ SECTIONS *(.text..tlbmiss) TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT #ifdef CONFIG_DEBUG_INFO INIT_TEXT diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S index cb5dfb02c88d..7f11da1b895e 100644 --- a/arch/h8300/kernel/vmlinux.lds.S +++ b/arch/h8300/kernel/vmlinux.lds.S @@ -29,6 +29,7 @@ SECTIONS _stext = . ; TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT #if defined(CONFIG_ROMKERNEL) *(.int_redirect) diff --git a/arch/hexagon/kernel/vmlinux.lds.S b/arch/hexagon/kernel/vmlinux.lds.S index 5f268c1071b3..ec87e67feb19 100644 --- a/arch/hexagon/kernel/vmlinux.lds.S +++ b/arch/hexagon/kernel/vmlinux.lds.S @@ -50,6 +50,7 @@ SECTIONS _text = .; TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT *(.fixup) diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index dc506b05ffbd..f89d20c97412 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -46,6 +46,7 @@ SECTIONS { __end_ivt_text = .; TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT *(.gnu.linkonce.t*) diff --git a/arch/m32r/kernel/vmlinux.lds.S b/arch/m32r/kernel/vmlinux.lds.S index 018e4a711d79..ad1fe56455aa 100644 --- a/arch/m32r/kernel/vmlinux.lds.S +++ b/arch/m32r/kernel/vmlinux.lds.S @@ -31,6 +31,7 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT *(.fixup) *(.gnu.warning) diff --git a/arch/m68k/kernel/vmlinux-nommu.lds b/arch/m68k/kernel/vmlinux-nommu.lds index 06a763f49fd3..d2c8abf1c8c4 100644 --- a/arch/m68k/kernel/vmlinux-nommu.lds +++ b/arch/m68k/kernel/vmlinux-nommu.lds @@ -45,6 +45,7 @@ SECTIONS { HEAD_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT *(.fixup) . 
= ALIGN(16); diff --git a/arch/m68k/kernel/vmlinux-std.lds b/arch/m68k/kernel/vmlinux-std.lds index d0993594f558..5b5ce1e4d1ed 100644 --- a/arch/m68k/kernel/vmlinux-std.lds +++ b/arch/m68k/kernel/vmlinux-std.lds @@ -16,6 +16,7 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT *(.fixup) *(.gnu.warning) diff --git a/arch/m68k/kernel/vmlinux-sun3.lds b/arch/m68k/kernel/vmlinux-sun3.lds index 8080469ee6c1..fe5ea1974b16 100644 --- a/arch/m68k/kernel/vmlinux-sun3.lds +++ b/arch/m68k/kernel/vmlinux-sun3.lds @@ -16,6 +16,7 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT *(.fixup) *(.gnu.warning) diff --git a/arch/metag/kernel/vmlinux.lds.S b/arch/metag/kernel/vmlinux.lds.S index 150ace92c7ad..e6c700eaf207 100644 --- a/arch/metag/kernel/vmlinux.lds.S +++ b/arch/metag/kernel/vmlinux.lds.S @@ -21,6 +21,7 @@ SECTIONS .text : { TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT diff --git a/arch/microblaze/kernel/vmlinux.lds.S b/arch/microblaze/kernel/vmlinux.lds.S index 0a47f0410554..289d0e7f3e3a 100644 --- a/arch/microblaze/kernel/vmlinux.lds.S +++ b/arch/microblaze/kernel/vmlinux.lds.S @@ -33,6 +33,7 @@ SECTIONS { EXIT_TEXT EXIT_CALL SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index a82c178d0bb9..d5de67591735 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -55,6 +55,7 @@ SECTIONS .text : { TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT diff --git a/arch/mn10300/kernel/vmlinux.lds.S b/arch/mn10300/kernel/vmlinux.lds.S index 13c4814c29f8..2d5f1c3f1afb 100644 --- a/arch/mn10300/kernel/vmlinux.lds.S +++ b/arch/mn10300/kernel/vmlinux.lds.S @@ -30,6 +30,7 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT *(.fixup) diff --git a/arch/nios2/kernel/vmlinux.lds.S b/arch/nios2/kernel/vmlinux.lds.S index e23e89539967..6a8045bb1a77 100644 --- a/arch/nios2/kernel/vmlinux.lds.S +++ b/arch/nios2/kernel/vmlinux.lds.S @@ -37,6 +37,7 @@ SECTIONS .text : { TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT IRQENTRY_TEXT SOFTIRQENTRY_TEXT diff --git a/arch/openrisc/kernel/vmlinux.lds.S b/arch/openrisc/kernel/vmlinux.lds.S index d936de4c07ca..d68b9ede8423 100644 --- a/arch/openrisc/kernel/vmlinux.lds.S +++ b/arch/openrisc/kernel/vmlinux.lds.S @@ -47,6 +47,7 @@ SECTIONS _stext = .; TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index f3ead0b6ce46..9ec8ec075dae 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -69,6 +69,7 @@ SECTIONS .text ALIGN(PAGE_SIZE) : { TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index b5fba689fca6..7ed59f0d947f 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -52,6 +52,7 @@ SECTIONS /* careful! 
__ftr_alt_* sections need to be close to .text */ *(.text .fixup __ftr_alt_* .ref.text) SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 429bfd111961..000e6e91f6a0 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -35,6 +35,7 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT diff --git a/arch/score/kernel/vmlinux.lds.S b/arch/score/kernel/vmlinux.lds.S index 7274b5c4287e..4117890b1db1 100644 --- a/arch/score/kernel/vmlinux.lds.S +++ b/arch/score/kernel/vmlinux.lds.S @@ -40,6 +40,7 @@ SECTIONS _text = .; /* Text and read-only data */ TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT *(.text.*) diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index 235a4101999f..5b9a3cc90c58 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -36,6 +36,7 @@ SECTIONS TEXT_TEXT EXTRA_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S index d79b3b734245..572db686f845 100644 --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -49,6 +49,7 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT diff --git a/arch/tile/kernel/entry.S b/arch/tile/kernel/entry.S index 670a3569450f..101de132e363 100644 --- a/arch/tile/kernel/entry.S +++ b/arch/tile/kernel/entry.S @@ -50,7 +50,7 @@ STD_ENTRY(smp_nap) * When interrupted at _cpu_idle_nap, we bump the PC forward 8, and * as a result return to the function that called _cpu_idle(). */ -STD_ENTRY(_cpu_idle) +STD_ENTRY_SECTION(_cpu_idle, .cpuidle.text) movei r1, 1 IRQ_ENABLE_LOAD(r2, r3) mtspr INTERRUPT_CRITICAL_SECTION, r1 diff --git a/arch/tile/kernel/vmlinux.lds.S b/arch/tile/kernel/vmlinux.lds.S index 9d449caf8910..e1baf094fba4 100644 --- a/arch/tile/kernel/vmlinux.lds.S +++ b/arch/tile/kernel/vmlinux.lds.S @@ -42,6 +42,7 @@ SECTIONS .text : AT (ADDR(.text) - LOAD_OFFSET) { HEAD_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S index adde088aeeff..4fdbcf958cd5 100644 --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -68,6 +68,7 @@ SECTIONS _stext = .; TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT *(.fixup) *(.stub .text.* .gnu.linkonce.t.*) diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index 6899195602b7..1840f55ed042 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -28,6 +28,7 @@ SECTIONS _stext = .; TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT *(.fixup) /* .gnu.warning sections are handled specially by elf32.em. 
*/ diff --git a/arch/unicore32/kernel/vmlinux.lds.S b/arch/unicore32/kernel/vmlinux.lds.S index 77e407e49a63..56e788e8ee83 100644 --- a/arch/unicore32/kernel/vmlinux.lds.S +++ b/arch/unicore32/kernel/vmlinux.lds.S @@ -37,6 +37,7 @@ SECTIONS .text : { /* Real text segment */ TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT *(.fixup) diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index b77f5edb03b0..ac7692dcfa2e 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -4,6 +4,10 @@ #include #ifndef __ASSEMBLY__ + +/* Provide __cpuidle; we can't safely include */ +#define __cpuidle __attribute__((__section__(".cpuidle.text"))) + /* * Interrupt control: */ @@ -44,12 +48,12 @@ static inline void native_irq_enable(void) asm volatile("sti": : :"memory"); } -static inline void native_safe_halt(void) +static inline __cpuidle void native_safe_halt(void) { asm volatile("sti; hlt": : :"memory"); } -static inline void native_halt(void) +static inline __cpuidle void native_halt(void) { asm volatile("hlt": : :"memory"); } @@ -86,7 +90,7 @@ static inline notrace void arch_local_irq_enable(void) * Used in the idle loop; sti takes one instruction cycle * to complete: */ -static inline void arch_safe_halt(void) +static inline __cpuidle void arch_safe_halt(void) { native_safe_halt(); } @@ -95,7 +99,7 @@ static inline void arch_safe_halt(void) * Used when interrupts are already enabled or to * shutdown the processor: */ -static inline void halt(void) +static inline __cpuidle void halt(void) { native_halt(); } diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index bdfad642123f..af15f4444330 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -152,7 +152,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, } EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); -void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) +void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) { unsigned int cpu = smp_processor_id(); struct cstate_entry *percpu_entry; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 4002b475171c..28cea7802ecb 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -302,7 +302,7 @@ void arch_cpu_idle(void) /* * We use this if we don't have any better idle routine.. */ -void default_idle(void) +void __cpuidle default_idle(void) { trace_cpu_idle_rcuidle(1, smp_processor_id()); safe_halt(); @@ -417,7 +417,7 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) * with interrupts enabled and no flags, which is backwards compatible with the * original MWAIT implementation. 
*/ -static void mwait_idle(void) +static __cpuidle void mwait_idle(void) { if (!current_set_polling_and_test()) { trace_cpu_idle_rcuidle(1, smp_processor_id()); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 9297a002d8e5..dbf67f64d5ec 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -97,6 +97,7 @@ SECTIONS _stext = .; TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT ENTRY_TEXT diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S index 72cfe3587dd8..31411fc82662 100644 --- a/arch/xtensa/kernel/vmlinux.lds.S +++ b/arch/xtensa/kernel/vmlinux.lds.S @@ -89,6 +89,9 @@ SECTIONS VMLINUX_SYMBOL(__sched_text_start) = .; *(.sched.literal .sched.text) VMLINUX_SYMBOL(__sched_text_end) = .; + VMLINUX_SYMBOL(__cpuidle_text_start) = .; + *(.cpuidle.literal .cpuidle.text) + VMLINUX_SYMBOL(__cpuidle_text_end) = .; VMLINUX_SYMBOL(__lock_text_start) = .; *(.spinlock.literal .spinlock.text) VMLINUX_SYMBOL(__lock_text_end) = .; diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index cea52528aa18..2237d3f24f0e 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -31,6 +31,7 @@ #include /* need_resched() */ #include #include +#include #include /* @@ -115,7 +116,7 @@ static const struct dmi_system_id processor_power_dmi_table[] = { * Callers should disable interrupts before the call and enable * interrupts after return. */ -static void acpi_safe_halt(void) +static void __cpuidle acpi_safe_halt(void) { if (!tif_need_resched()) { safe_halt(); @@ -645,7 +646,7 @@ static int acpi_idle_bm_check(void) * * Caller disables interrupt before call and enables interrupt after return. */ -static void acpi_idle_do_entry(struct acpi_processor_cx *cx) +static void __cpuidle acpi_idle_do_entry(struct acpi_processor_cx *cx) { if (cx->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index 389ade4572be..ab264d393233 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "cpuidle.h" @@ -178,8 +179,8 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv) } #ifdef CONFIG_ARCH_HAS_CPU_RELAX -static int poll_idle(struct cpuidle_device *dev, - struct cpuidle_driver *drv, int index) +static int __cpuidle poll_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) { local_irq_enable(); if (!current_set_polling_and_test()) { diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 67ec58f9ef99..4466a2f969d7 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -863,8 +863,8 @@ static struct cpuidle_state dnv_cstates[] = { * * Must be called under local_irq_disable(). 
*/ -static int intel_idle(struct cpuidle_device *dev, - struct cpuidle_driver *drv, int index) +static __cpuidle int intel_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) { unsigned long ecx = 1; /* break on interrupt flag */ struct cpuidle_state *state = &drv->states[index]; diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 24563970ff7b..3e42bcdd014b 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -454,6 +454,12 @@ *(.spinlock.text) \ VMLINUX_SYMBOL(__lock_text_end) = .; +#define CPUIDLE_TEXT \ + ALIGN_FUNCTION(); \ + VMLINUX_SYMBOL(__cpuidle_text_start) = .; \ + *(.cpuidle.text) \ + VMLINUX_SYMBOL(__cpuidle_text_end) = .; + #define KPROBES_TEXT \ ALIGN_FUNCTION(); \ VMLINUX_SYMBOL(__kprobes_text_start) = .; \ diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 7572d9e9dced..b886dc17f2f3 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -231,6 +231,11 @@ void cpu_startup_entry(enum cpuhp_state state); void cpu_idle_poll_ctrl(bool enable); +/* Attach to any functions which should be considered cpuidle. */ +#define __cpuidle __attribute__((__section__(".cpuidle.text"))) + +bool cpu_in_idle(unsigned long pc); + void arch_cpu_idle(void); void arch_cpu_idle_prepare(void); void arch_cpu_idle_enter(void); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 9fb873cfc75c..1d8718d5300d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -16,6 +16,9 @@ #include "sched.h" +/* Linker adds these: start and end of __cpuidle functions */ +extern char __cpuidle_text_start[], __cpuidle_text_end[]; + /** * sched_idle_set_state - Record idle state for the current CPU. * @idle_state: State to record. @@ -53,7 +56,7 @@ static int __init cpu_idle_nopoll_setup(char *__unused) __setup("hlt", cpu_idle_nopoll_setup); #endif -static inline int cpu_idle_poll(void) +static noinline int __cpuidle cpu_idle_poll(void) { rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); @@ -84,7 +87,7 @@ void __weak arch_cpu_idle(void) * * To use when the cpuidle framework cannot be used. */ -void default_idle_call(void) +void __cpuidle default_idle_call(void) { if (current_clr_polling_and_test()) { local_irq_enable(); @@ -271,6 +274,12 @@ static void cpu_idle_loop(void) } } +bool cpu_in_idle(unsigned long pc) +{ + return pc >= (unsigned long)__cpuidle_text_start && + pc < (unsigned long)__cpuidle_text_end; +} + void cpu_startup_entry(enum cpuhp_state state) { /* diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index 393a3cca1f47..75554754eadf 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef arch_trigger_cpumask_backtrace /* For reliability, we're prepared to waste bits here. 
*/ @@ -87,11 +88,16 @@ bool nmi_cpu_backtrace(struct pt_regs *regs) int cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - pr_warn("NMI backtrace for cpu %d\n", cpu); - if (regs) - show_regs(regs); - else - dump_stack(); + if (regs && cpu_in_idle(instruction_pointer(regs))) { + pr_warn("NMI backtrace for cpu %d skipped: idling at pc %#lx\n", + cpu, instruction_pointer(regs)); + } else { + pr_warn("NMI backtrace for cpu %d\n", cpu); + if (regs) + show_regs(regs); + else + dump_stack(); + } cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return true; } diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 48958d3cec9e..bd8349759095 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -888,7 +888,7 @@ static void check_section(const char *modname, struct elf_info *elf, #define DATA_SECTIONS ".data", ".data.rel" #define TEXT_SECTIONS ".text", ".text.unlikely", ".sched.text", \ - ".kprobes.text" + ".kprobes.text", ".cpuidle.text" #define OTHER_TEXT_SECTIONS ".ref.text", ".head.text", ".spinlock.text", \ ".fixup", ".entry.text", ".exception.text", ".text.*", \ ".coldtext" diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index a68f03133df9..5423a58d1b06 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -365,6 +365,7 @@ is_mcounted_section_name(char const *const txtname) strcmp(".irqentry.text", txtname) == 0 || strcmp(".softirqentry.text", txtname) == 0 || strcmp(".kprobes.text", txtname) == 0 || + strcmp(".cpuidle.text", txtname) == 0 || strcmp(".text.unlikely", txtname) == 0; } diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 2d48011bc362..faac4b10d8ea 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -136,6 +136,7 @@ my %text_sections = ( ".irqentry.text" => 1, ".softirqentry.text" => 1, ".kprobes.text" => 1, + ".cpuidle.text" => 1, ".text.unlikely" => 1, ); From 470164572dc2f9e76e078e08498dfe32ae2faa6f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 7 Oct 2016 17:02:58 -0700 Subject: [PATCH 121/127] spelling.txt: "modeled" is spelt correctly No need to correct the correct. Link: http://lkml.kernel.org/r/1472490791.3425.38.camel@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index fa79c6d2a5b8..163c720d3f2b 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -629,7 +629,6 @@ mispelt||misspelt miximum||maximum mmnemonic||mnemonic mnay||many -modeled||modelled modulues||modules monochorome||monochrome monochromo||monochrome From ea036230f7c604e3c90fbf57f4643bd733034af9 Mon Sep 17 00:00:00 2001 From: Marcin Nowakowski Date: Fri, 7 Oct 2016 17:03:01 -0700 Subject: [PATCH 122/127] uprobes: remove function declarations from arch/{mips,s390} The declarations of arch-specific functions have been moved to a common header in commit 3820b4d2789f ('uprobes: Move function declarations out of arch'), but MIPS and S390 has added them to their own trees later. Remove the unnecessary duplicates. 
Link: http://lkml.kernel.org/r/1472804384-17830-1-git-send-email-marcin.nowakowski@imgtec.com Signed-off-by: Marcin Nowakowski Acked-by: Heiko Carstens Cc: Martin Schwidefsky Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/include/asm/uprobes.h | 12 ------------ arch/s390/include/asm/uprobes.h | 10 ---------- 2 files changed, 22 deletions(-) diff --git a/arch/mips/include/asm/uprobes.h b/arch/mips/include/asm/uprobes.h index 70a4a2f173ff..b86d1ae07125 100644 --- a/arch/mips/include/asm/uprobes.h +++ b/arch/mips/include/asm/uprobes.h @@ -42,16 +42,4 @@ struct arch_uprobe_task { unsigned long saved_trap_nr; }; -extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, - struct mm_struct *mm, unsigned long addr); -extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs); -extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs); -extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); -extern int arch_uprobe_exception_notify(struct notifier_block *self, - unsigned long val, void *data); -extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, - struct pt_regs *regs); -extern unsigned long arch_uretprobe_hijack_return_addr( - unsigned long trampoline_vaddr, struct pt_regs *regs); - #endif /* __ASM_UPROBES_H */ diff --git a/arch/s390/include/asm/uprobes.h b/arch/s390/include/asm/uprobes.h index 1411dff7fea7..658393c65d7e 100644 --- a/arch/s390/include/asm/uprobes.h +++ b/arch/s390/include/asm/uprobes.h @@ -29,14 +29,4 @@ struct arch_uprobe { struct arch_uprobe_task { }; -int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, - unsigned long addr); -int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs); -int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs); -bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); -int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, - void *data); -void arch_uprobe_abort_xol(struct arch_uprobe *ap, struct pt_regs *regs); -unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline, - struct pt_regs *regs); #endif /* _ASM_UPROBES_H */ From 218dd85887da3d7d08119de18e9d325fcf30d7a4 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 7 Oct 2016 17:03:04 -0700 Subject: [PATCH 123/127] .gitattributes: set git diff driver for C source code files Git can be told to apply language-specific rules when generating diffs. Enable this for C source code files (*.c and *.h) so that function names are printed right. Specifically, doing so prevents "git diff" from mistakenly considering unindented goto labels as function names. Link: http://lkml.kernel.org/r/20160907143403.1449324f@endymion Signed-off-by: Jean Delvare Cc: Peter Zijlstra Cc: Joe Perches Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .gitattributes | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000000..89c411b5ce6b --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +*.c diff=cpp +*.h diff=cpp From 69474afb042337f5f189cd15de5eb49923ce20dc Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 7 Oct 2016 17:03:07 -0700 Subject: [PATCH 124/127] mailmap: add Johan Hovold Add two entries to map to my primary address. 
Link: http://lkml.kernel.org/r/1473850348-19177-1-git-send-email-johan@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .mailmap | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index 967f88210b12..2408e56e241b 100644 --- a/.mailmap +++ b/.mailmap @@ -75,6 +75,8 @@ Jean Tourrilhes Jeff Garzik Jens Axboe Jens Osterkamp +Johan Hovold +Johan Hovold John Paul Adrian Glaubitz John Stultz From 954f74bf45268bcee0af21b6393c9c8acca7e075 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Fri, 7 Oct 2016 17:03:09 -0700 Subject: [PATCH 125/127] CREDITS: update Pavel's information, add GPG key, remove snail mail address Link: http://lkml.kernel.org/r/20161003082312.GA20634@amd Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- CREDITS | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CREDITS b/CREDITS index df0a50eb3f64..513aaa3546bf 100644 --- a/CREDITS +++ b/CREDITS @@ -2296,11 +2296,11 @@ D: Initial implementation of VC's, pty's and select() N: Pavel Machek E: pavel@ucw.cz -D: Softcursor for vga, hypertech cdrom support, vcsa bugfix, nbd +P: 4096R/92DFCE96 4FA7 9EEF FCD4 C44F C585 B8C7 C060 2241 92DF CE96 +D: Softcursor for vga, hypertech cdrom support, vcsa bugfix, nbd, D: sun4/330 port, capabilities for elf, speedup for rm on ext2, USB, -D: work on suspend-to-ram/disk, killing duplicates from ioctl32 -S: Volkova 1131 -S: 198 00 Praha 9 +D: work on suspend-to-ram/disk, killing duplicates from ioctl32, +D: Altera SoCFPGA and Nokia N900 support. S: Czech Republic N: Paul Mackerras From 81243eacfa400f5f7b89f4c2323d0de9982bb0fb Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 7 Oct 2016 17:03:12 -0700 Subject: [PATCH 126/127] cred: simpler, 1D supplementary groups Current supplementary groups code can massively overallocate memory and is implemented in a way so that access to individual gid is done via 2D array. If number of gids is <= 32, memory allocation is more or less tolerable (140/148 bytes). But if it is not, code allocates full page (!) regardless and, what's even more fun, doesn't reuse small 32-entry array. 2D array means dependent shifts, loads and LEAs without possibility to optimize them (gid is never known at compile time). All of the above is unnecessary. Switch to the usual trailing-zero-len-array scheme. Memory is allocated with kmalloc/vmalloc() and only as much as needed. Accesses become simpler (LEA 8(gi,idx,4) or even without displacement). Maximum number of gids is 65536 which translates to 256KB+8 bytes. I think kernel can handle such allocation. On my usual desktop system with whole 9 (nine) aux groups, struct group_info shrinks from 148 bytes to 44 bytes, yay! Nice side effects: - "gi->gid[i]" is shorter than "GROUP_AT(gi, i)", less typing, - fix little mess in net/ipv4/ping.c should have been using GROUP_AT macro but this point becomes moot, - aux group allocation is persistent and should be accounted as such. 
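A minimal userspace sketch of the new trailing-array layout and 1D access pattern described above, with stand-in definitions rather than kernel code: kgid_t is mocked as a plain struct, the usage count is an int instead of atomic_t, and malloc() stands in for the kmalloc()-with-__vmalloc()-fallback that the patch itself uses.

#include <stdio.h>
#include <stdlib.h>

/* Userspace stand-ins; not the real kernel definitions. */
typedef struct { unsigned int val; } kgid_t;

struct group_info {
	int usage;		/* stand-in for atomic_t usage */
	int ngroups;
	kgid_t gid[];		/* flexible trailing array, one slot per gid */
};

/* Allocate exactly what is needed: header plus gidsetsize gid slots. */
static struct group_info *groups_alloc_sketch(int gidsetsize)
{
	size_t len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize;
	struct group_info *gi = malloc(len);

	if (!gi)
		return NULL;
	gi->usage = 1;
	gi->ngroups = gidsetsize;
	return gi;
}

int main(void)
{
	struct group_info *gi = groups_alloc_sketch(3);
	int i;

	if (!gi)
		return 1;
	/* Direct 1D indexing replaces the old GROUP_AT(gi, i) 2D lookup. */
	for (i = 0; i < gi->ngroups; i++)
		gi->gid[i] = (kgid_t){ .val = 1000u + i };
	for (i = 0; i < gi->ngroups; i++)
		printf("gid[%d] = %u\n", i, gi->gid[i].val);
	free(gi);
	return 0;
}

The point of the sketch is only the sizing arithmetic and the gi->gid[i] access; the real allocation additionally uses GFP_KERNEL_ACCOUNT and falls back to __vmalloc() for large gid sets, as the kernel/groups.c hunk below shows.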
Link: http://lkml.kernel.org/r/20160817201927.GA2096@p183.telecom.by Signed-off-by: Alexey Dobriyan Cc: Vasily Kulikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/kernel/compat_linux.c | 4 +- drivers/staging/lustre/lustre/ptlrpc/sec.c | 2 +- fs/nfsd/auth.c | 6 +- fs/nfsd/nfs4state.c | 2 +- fs/proc/array.c | 2 +- include/linux/cred.h | 11 +--- kernel/groups.c | 65 +++++++--------------- kernel/uid16.c | 4 +- net/ipv4/ping.c | 15 ++--- net/sunrpc/auth_generic.c | 4 +- net/sunrpc/auth_gss/gss_rpc_xdr.c | 2 +- net/sunrpc/auth_gss/svcauth_gss.c | 2 +- net/sunrpc/auth_unix.c | 4 +- net/sunrpc/svcauth_unix.c | 6 +- 14 files changed, 45 insertions(+), 84 deletions(-) diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index 437e61159279..0f9cd90c11af 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -189,7 +189,7 @@ static int groups16_to_user(u16 __user *grouplist, struct group_info *group_info kgid_t kgid; for (i = 0; i < group_info->ngroups; i++) { - kgid = GROUP_AT(group_info, i); + kgid = group_info->gid[i]; group = (u16)from_kgid_munged(user_ns, kgid); if (put_user(group, grouplist+i)) return -EFAULT; @@ -213,7 +213,7 @@ static int groups16_from_user(struct group_info *group_info, u16 __user *groupli if (!gid_valid(kgid)) return -EINVAL; - GROUP_AT(group_info, i) = kgid; + group_info->gid[i] = kgid; } return 0; diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec.c b/drivers/staging/lustre/lustre/ptlrpc/sec.c index 5d3995d5c69a..a7416cd9ac71 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/sec.c +++ b/drivers/staging/lustre/lustre/ptlrpc/sec.c @@ -2220,7 +2220,7 @@ int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset) task_lock(current); if (pud->pud_ngroups > current_ngroups) pud->pud_ngroups = current_ngroups; - memcpy(pud->pud_groups, current_cred()->group_info->blocks[0], + memcpy(pud->pud_groups, current_cred()->group_info->gid, pud->pud_ngroups * sizeof(__u32)); task_unlock(current); diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 9d46a0bdd9f9..62469c60be23 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -55,10 +55,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) goto oom; for (i = 0; i < rqgi->ngroups; i++) { - if (gid_eq(GLOBAL_ROOT_GID, GROUP_AT(rqgi, i))) - GROUP_AT(gi, i) = exp->ex_anon_gid; + if (gid_eq(GLOBAL_ROOT_GID, rqgi->gid[i])) + gi->gid[i] = exp->ex_anon_gid; else - GROUP_AT(gi, i) = GROUP_AT(rqgi, i); + gi->gid[i] = rqgi->gid[i]; } } else { gi = get_group_info(rqgi); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a204d7e109d4..39bfaba9c99c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1903,7 +1903,7 @@ static bool groups_equal(struct group_info *g1, struct group_info *g2) if (g1->ngroups != g2->ngroups) return false; for (i=0; ingroups; i++) - if (!gid_eq(GROUP_AT(g1, i), GROUP_AT(g2, i))) + if (!gid_eq(g1->gid[i], g2->gid[i])) return false; return true; } diff --git a/fs/proc/array.c b/fs/proc/array.c index d25b44601b30..89600fd5963d 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -207,7 +207,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, group_info = cred->group_info; for (g = 0; g < group_info->ngroups; g++) seq_put_decimal_ull(m, g ? " " : "", - from_kgid_munged(user_ns, GROUP_AT(group_info, g))); + from_kgid_munged(user_ns, group_info->gid[g])); put_cred(cred); /* Trailing space shouldn't have been added in the first place. 
*/ seq_putc(m, ' '); diff --git a/include/linux/cred.h b/include/linux/cred.h index 257db64562e5..f0e70a1bb3ac 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -26,15 +26,10 @@ struct inode; /* * COW Supplementary groups list */ -#define NGROUPS_SMALL 32 -#define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(kgid_t))) - struct group_info { atomic_t usage; int ngroups; - int nblocks; - kgid_t small_block[NGROUPS_SMALL]; - kgid_t *blocks[0]; + kgid_t gid[0]; }; /** @@ -88,10 +83,6 @@ extern void set_groups(struct cred *, struct group_info *); extern int groups_search(const struct group_info *, kgid_t); extern bool may_setgroups(void); -/* access the groups "array" with this macro */ -#define GROUP_AT(gi, i) \ - ((gi)->blocks[(i) / NGROUPS_PER_BLOCK][(i) % NGROUPS_PER_BLOCK]) - /* * The security context of a task * diff --git a/kernel/groups.c b/kernel/groups.c index 74d431d25251..2fcadd66a8fd 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -7,55 +7,31 @@ #include #include #include +#include #include struct group_info *groups_alloc(int gidsetsize) { - struct group_info *group_info; - int nblocks; - int i; + struct group_info *gi; + unsigned int len; - nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK; - /* Make sure we always allocate at least one indirect block pointer */ - nblocks = nblocks ? : 1; - group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER); - if (!group_info) + len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; + gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); + if (!gi) + gi = __vmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_HIGHMEM, PAGE_KERNEL); + if (!gi) return NULL; - group_info->ngroups = gidsetsize; - group_info->nblocks = nblocks; - atomic_set(&group_info->usage, 1); - if (gidsetsize <= NGROUPS_SMALL) - group_info->blocks[0] = group_info->small_block; - else { - for (i = 0; i < nblocks; i++) { - kgid_t *b; - b = (void *)__get_free_page(GFP_USER); - if (!b) - goto out_undo_partial_alloc; - group_info->blocks[i] = b; - } - } - return group_info; - -out_undo_partial_alloc: - while (--i >= 0) { - free_page((unsigned long)group_info->blocks[i]); - } - kfree(group_info); - return NULL; + atomic_set(&gi->usage, 1); + gi->ngroups = gidsetsize; + return gi; } EXPORT_SYMBOL(groups_alloc); void groups_free(struct group_info *group_info) { - if (group_info->blocks[0] != group_info->small_block) { - int i; - for (i = 0; i < group_info->nblocks; i++) - free_page((unsigned long)group_info->blocks[i]); - } - kfree(group_info); + kvfree(group_info); } EXPORT_SYMBOL(groups_free); @@ -70,7 +46,7 @@ static int groups_to_user(gid_t __user *grouplist, for (i = 0; i < count; i++) { gid_t gid; - gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i)); + gid = from_kgid_munged(user_ns, group_info->gid[i]); if (put_user(gid, grouplist+i)) return -EFAULT; } @@ -95,7 +71,7 @@ static int groups_from_user(struct group_info *group_info, if (!gid_valid(kgid)) return -EINVAL; - GROUP_AT(group_info, i) = kgid; + group_info->gid[i] = kgid; } return 0; } @@ -115,15 +91,14 @@ static void groups_sort(struct group_info *group_info) for (base = 0; base < max; base++) { int left = base; int right = left + stride; - kgid_t tmp = GROUP_AT(group_info, right); + kgid_t tmp = group_info->gid[right]; - while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) { - GROUP_AT(group_info, right) = - GROUP_AT(group_info, left); + while (left >= 0 && gid_gt(group_info->gid[left], tmp)) { + group_info->gid[right] = 
group_info->gid[left]; right = left; left -= stride; } - GROUP_AT(group_info, right) = tmp; + group_info->gid[right] = tmp; } stride /= 3; } @@ -141,9 +116,9 @@ int groups_search(const struct group_info *group_info, kgid_t grp) right = group_info->ngroups; while (left < right) { unsigned int mid = (left+right)/2; - if (gid_gt(grp, GROUP_AT(group_info, mid))) + if (gid_gt(grp, group_info->gid[mid])) left = mid + 1; - else if (gid_lt(grp, GROUP_AT(group_info, mid))) + else if (gid_lt(grp, group_info->gid[mid])) right = mid; else return 1; diff --git a/kernel/uid16.c b/kernel/uid16.c index d58cc4d8f0d1..cc40793464e3 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -117,7 +117,7 @@ static int groups16_to_user(old_gid_t __user *grouplist, kgid_t kgid; for (i = 0; i < group_info->ngroups; i++) { - kgid = GROUP_AT(group_info, i); + kgid = group_info->gid[i]; group = high2lowgid(from_kgid_munged(user_ns, kgid)); if (put_user(group, grouplist+i)) return -EFAULT; @@ -142,7 +142,7 @@ static int groups16_from_user(struct group_info *group_info, if (!gid_valid(kgid)) return -EINVAL; - GROUP_AT(group_info, i) = kgid; + group_info->gid[i] = kgid; } return 0; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 66ddcb60519a..7cf7d6e380c2 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -258,7 +258,7 @@ int ping_init_sock(struct sock *sk) struct net *net = sock_net(sk); kgid_t group = current_egid(); struct group_info *group_info; - int i, j, count; + int i; kgid_t low, high; int ret = 0; @@ -270,16 +270,11 @@ int ping_init_sock(struct sock *sk) return 0; group_info = get_current_groups(); - count = group_info->ngroups; - for (i = 0; i < group_info->nblocks; i++) { - int cp_count = min_t(int, NGROUPS_PER_BLOCK, count); - for (j = 0; j < cp_count; j++) { - kgid_t gid = group_info->blocks[i][j]; - if (gid_lte(low, gid) && gid_lte(gid, high)) - goto out_release_group; - } + for (i = 0; i < group_info->ngroups; i++) { + kgid_t gid = group_info->gid[i]; - count -= cp_count; + if (gid_lte(low, gid) && gid_lte(gid, high)) + goto out_release_group; } ret = -EACCES; diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 168219535a34..83dffeadf20a 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -176,8 +176,8 @@ generic_match(struct auth_cred *acred, struct rpc_cred *cred, int flags) if (gcred->acred.group_info->ngroups != acred->group_info->ngroups) goto out_nomatch; for (i = 0; i < gcred->acred.group_info->ngroups; i++) { - if (!gid_eq(GROUP_AT(gcred->acred.group_info, i), - GROUP_AT(acred->group_info, i))) + if (!gid_eq(gcred->acred.group_info->gid[i], + acred->group_info->gid[i])) goto out_nomatch; } out_match: diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index eeeba5adee6d..dc6fb79a361f 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -229,7 +229,7 @@ static int gssx_dec_linux_creds(struct xdr_stream *xdr, kgid = make_kgid(&init_user_ns, tmp); if (!gid_valid(kgid)) goto out_free_groups; - GROUP_AT(creds->cr_group_info, i) = kgid; + creds->cr_group_info->gid[i] = kgid; } return 0; diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index d8582028b346..d67f7e1bc82d 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -479,7 +479,7 @@ static int rsc_parse(struct cache_detail *cd, kgid = make_kgid(&init_user_ns, id); if (!gid_valid(kgid)) goto out; - GROUP_AT(rsci.cred.cr_group_info, i) = kgid; + 
rsci.cred.cr_group_info->gid[i] = kgid; } /* mech name */ diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index a99278c984e8..a1d768a973f5 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -79,7 +79,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t cred->uc_gid = acred->gid; for (i = 0; i < groups; i++) - cred->uc_gids[i] = GROUP_AT(acred->group_info, i); + cred->uc_gids[i] = acred->group_info->gid[i]; if (i < NFS_NGROUPS) cred->uc_gids[i] = INVALID_GID; @@ -127,7 +127,7 @@ unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags) if (groups > NFS_NGROUPS) groups = NFS_NGROUPS; for (i = 0; i < groups ; i++) - if (!gid_eq(cred->uc_gids[i], GROUP_AT(acred->group_info, i))) + if (!gid_eq(cred->uc_gids[i], acred->group_info->gid[i])) return 0; if (groups < NFS_NGROUPS && gid_valid(cred->uc_gids[groups])) return 0; diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index dfacdc95b3f5..64af4f034de6 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -517,7 +517,7 @@ static int unix_gid_parse(struct cache_detail *cd, kgid = make_kgid(&init_user_ns, gid); if (!gid_valid(kgid)) goto out; - GROUP_AT(ug.gi, i) = kgid; + ug.gi->gid[i] = kgid; } ugp = unix_gid_lookup(cd, uid); @@ -564,7 +564,7 @@ static int unix_gid_show(struct seq_file *m, seq_printf(m, "%u %d:", from_kuid_munged(user_ns, ug->uid), glen); for (i = 0; i < glen; i++) - seq_printf(m, " %d", from_kgid_munged(user_ns, GROUP_AT(ug->gi, i))); + seq_printf(m, " %d", from_kgid_munged(user_ns, ug->gi->gid[i])); seq_printf(m, "\n"); return 0; } @@ -817,7 +817,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) return SVC_CLOSE; for (i = 0; i < slen; i++) { kgid_t kgid = make_kgid(&init_user_ns, svc_getnl(argv)); - GROUP_AT(cred->cr_group_info, i) = kgid; + cred->cr_group_info->gid[i] = kgid; } if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { *authp = rpc_autherr_badverf; From 05fd007e46296afb24d15c7d589d535e5a5b9d5c Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Fri, 7 Oct 2016 17:03:15 -0700 Subject: [PATCH 127/127] console: don't prefer first registered if DT specifies stdout-path If a device tree specifies a preferred device for kernel console output via the stdout-path or linux,stdout-path chosen node properties or the stdout alias then the kernel ought to honor it & output the kernel console to that device. As it stands, this isn't the case. Whilst we parse the stdout-path properties & set an of_stdout variable from of_alias_scan(), and use that from of_console_check() to determine whether to add a console device as a preferred console whilst registering it, we also prefer the first registered console if no other has been selected at the time of its registration. This means that if a console other than the one the device tree selects via stdout-path is registered first, we will switch to using it & when the stdout-path console is later registered the call to add_preferred_console() via of_console_check() is too late to do anything useful. In practice this seems to mean that we switch to the dummy console device fairly early & see no further console output: Console: colour dummy device 80x25 console [tty0] enabled bootconsole [ns16550a0] disabled Fix this by not automatically preferring the first registered console if one is specified by the device tree. 
This allows consoles to be registered but not enabled, and once the driver for the console selected by stdout-path calls of_console_check() the driver will be added to the list of preferred consoles before any other console has been enabled. When that console is then registered via register_console() it will be enabled as expected. Link: http://lkml.kernel.org/r/20160809151937.26118-1-paul.burton@imgtec.com Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: Paul Burton Cc: Tejun Heo Cc: Sergey Senozhatsky Cc: Jiri Slaby Cc: Daniel Vetter Cc: Ivan Delalande Cc: Thierry Reding Cc: Borislav Petkov Cc: Jan Kara Cc: Petr Mladek Cc: Joe Perches Cc: Greg Kroah-Hartman Cc: Rob Herring Cc: Frank Rowand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/of/base.c | 2 ++ include/linux/console.h | 6 ++++++ kernel/printk/printk.c | 13 ++++++++++++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/of/base.c b/drivers/of/base.c index a0bccb54a9bd..d687e6de24a0 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -2077,6 +2077,8 @@ void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align)) name = of_get_property(of_aliases, "stdout", NULL); if (name) of_stdout = of_find_node_opts_by_path(name, &of_stdout_options); + if (of_stdout) + console_set_by_of(); } if (!of_aliases) diff --git a/include/linux/console.h b/include/linux/console.h index d530c4627e54..3672809234a7 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -173,6 +173,12 @@ static inline void console_sysfs_notify(void) #endif extern bool console_suspend_enabled; +#ifdef CONFIG_OF +extern void console_set_by_of(void); +#else +static inline void console_set_by_of(void) {} +#endif + /* Suspend and resume console messages over PM events */ extern void suspend_console(void); extern void resume_console(void); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index eea6dbc2d8cf..8019cc0d3a73 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -253,6 +253,17 @@ static int preferred_console = -1; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); +#ifdef CONFIG_OF +static bool of_specified_console; + +void console_set_by_of(void) +{ + of_specified_console = true; +} +#else +# define of_specified_console false +#endif + /* Flag: console code may call schedule() */ static int console_may_schedule; @@ -2647,7 +2658,7 @@ void register_console(struct console *newcon) * didn't select a console we take the first one * that registers here. */ - if (preferred_console < 0) { + if (preferred_console < 0 && !of_specified_console) { if (newcon->index < 0) newcon->index = 0; if (newcon->setup == NULL ||