2017-02-23 07:45:39 +08:00
|
|
|
/*
|
|
|
|
* Manage cache of swap slots to be used for and returned from
|
|
|
|
* swap.
|
|
|
|
*
|
|
|
|
* Copyright(c) 2016 Intel Corporation.
|
|
|
|
*
|
|
|
|
* Author: Tim Chen <tim.c.chen@linux.intel.com>
|
|
|
|
*
|
|
|
|
* We allocate the swap slots from the global pool and put
|
|
|
|
* it into local per cpu caches. This has the advantage
|
|
|
|
* of not needing to acquire the swap_info lock every time
|
|
|
|
* we need a new slot.
|
|
|
|
*
|
|
|
|
* There is also opportunity to simply return the slot
|
|
|
|
* to local caches without needing to acquire swap_info
|
|
|
|
* lock. We do not reuse the returned slots directly but
|
|
|
|
* move them back to the global pool in a batch. This
|
|
|
|
* allows the slots to coalesce and reduce fragmentation.
|
|
|
|
*
|
|
|
|
* The swap entry allocated is marked with SWAP_HAS_CACHE
|
|
|
|
* flag in map_count that prevents it from being allocated
|
|
|
|
* again from the global pool.
|
|
|
|
*
|
|
|
|
* The swap slots cache is protected by a mutex instead of
|
|
|
|
* a spin lock as when we search for slots with scan_swap_map,
|
|
|
|
* we can possibly sleep.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/swap_slots.h>
|
|
|
|
#include <linux/cpu.h>
|
|
|
|
#include <linux/cpumask.h>
|
|
|
|
#include <linux/vmalloc.h>
|
|
|
|
#include <linux/mutex.h>
|
mm, swap: use kvzalloc to allocate some swap data structures
Currently vzalloc() is used in swap code to allocate various data structures,
such as swap cache, swap slots cache, cluster info, etc., because their
size may be too large on some systems for a normal kzalloc() to succeed.
But using kzalloc() has some advantages, for example, less memory
fragmentation, less TLB pressure, etc. So change the data structure
allocation in swap code to use kvzalloc(), which will try kzalloc()
first and fall back to vzalloc() if kzalloc() fails.
In general, although kmalloc() will reduce the number of high-order
pages in short term, vmalloc() will cause more pain for memory
fragmentation in the long term. And the swap data structure allocation
that is changed in this patch is expected to be long term allocation.
From Dave Hansen:
"for example, we have a two-page data structure. vmalloc() takes two
effectively random order-0 pages, probably from two different 2M pages
and pins them. That "kills" two 2M pages. kmalloc(), allocating two
*contiguous* pages, will not cross a 2M boundary. That means it will
only "kill" the possibility of a single 2M page. More 2M pages == less
fragmentation."
The allocation in this patch occurs during swap on time, which is
usually done during system boot, so usually we have high opportunity to
allocate the contiguous pages successfully.
The allocation for swap_map[] in struct swap_info_struct is not changed,
because that is usually quite large and vmalloc_to_page() is used for
it. That makes it a little harder to change.
Link: http://lkml.kernel.org/r/20170407064911.25447-1-ying.huang@intel.com
Signed-off-by: Huang Ying <ying.huang@intel.com>
Acked-by: Tim Chen <tim.c.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-05-09 06:57:40 +08:00
|
|
|
#include <linux/mm.h>
|
2017-02-23 07:45:39 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_SWAP
|
|
|
|
|
|
|
|
static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots);
|
|
|
|
static bool swap_slot_cache_active;
|
2017-02-23 07:45:46 +08:00
|
|
|
bool swap_slot_cache_enabled;
|
2017-02-23 07:45:39 +08:00
|
|
|
static bool swap_slot_cache_initialized;
|
|
|
|
DEFINE_MUTEX(swap_slots_cache_mutex);
|
|
|
|
/* Serialize swap slots cache enable/disable operations */
|
|
|
|
DEFINE_MUTEX(swap_slots_cache_enable_mutex);
|
|
|
|
|
|
|
|
static void __drain_swap_slots_cache(unsigned int type);
|
|
|
|
static void deactivate_swap_slots_cache(void);
|
|
|
|
static void reactivate_swap_slots_cache(void);
|
|
|
|
|
|
|
|
#define use_swap_slot_cache (swap_slot_cache_active && \
|
|
|
|
swap_slot_cache_enabled && swap_slot_cache_initialized)
|
|
|
|
#define SLOTS_CACHE 0x1
|
|
|
|
#define SLOTS_CACHE_RET 0x2
|
|
|
|
|
|
|
|
static void deactivate_swap_slots_cache(void)
|
|
|
|
{
|
|
|
|
mutex_lock(&swap_slots_cache_mutex);
|
|
|
|
swap_slot_cache_active = false;
|
|
|
|
__drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET);
|
|
|
|
mutex_unlock(&swap_slots_cache_mutex);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void reactivate_swap_slots_cache(void)
|
|
|
|
{
|
|
|
|
mutex_lock(&swap_slots_cache_mutex);
|
|
|
|
swap_slot_cache_active = true;
|
|
|
|
mutex_unlock(&swap_slots_cache_mutex);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Must not be called with cpu hot plug lock */
|
|
|
|
void disable_swap_slots_cache_lock(void)
|
|
|
|
{
|
|
|
|
mutex_lock(&swap_slots_cache_enable_mutex);
|
|
|
|
swap_slot_cache_enabled = false;
|
|
|
|
if (swap_slot_cache_initialized) {
|
|
|
|
/* serialize with cpu hotplug operations */
|
|
|
|
get_online_cpus();
|
|
|
|
__drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET);
|
|
|
|
put_online_cpus();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __reenable_swap_slots_cache(void)
|
|
|
|
{
|
|
|
|
swap_slot_cache_enabled = has_usable_swap();
|
|
|
|
}
|
|
|
|
|
|
|
|
void reenable_swap_slots_cache_unlock(void)
|
|
|
|
{
|
|
|
|
__reenable_swap_slots_cache();
|
|
|
|
mutex_unlock(&swap_slots_cache_enable_mutex);
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool check_cache_active(void)
|
|
|
|
{
|
|
|
|
long pages;
|
|
|
|
|
|
|
|
if (!swap_slot_cache_enabled || !swap_slot_cache_initialized)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
pages = get_nr_swap_pages();
|
|
|
|
if (!swap_slot_cache_active) {
|
|
|
|
if (pages > num_online_cpus() *
|
|
|
|
THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE)
|
|
|
|
reactivate_swap_slots_cache();
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* if global pool of slot caches too low, deactivate cache */
|
|
|
|
if (pages < num_online_cpus() * THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE)
|
|
|
|
deactivate_swap_slots_cache();
|
|
|
|
out:
|
|
|
|
return swap_slot_cache_active;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int alloc_swap_slot_cache(unsigned int cpu)
|
|
|
|
{
|
|
|
|
struct swap_slots_cache *cache;
|
|
|
|
swp_entry_t *slots, *slots_ret;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Do allocation outside swap_slots_cache_mutex
|
mm, swap: use kvzalloc to allocate some swap data structures
Now vzalloc() is used in swap code to allocate various data structures,
such as swap cache, swap slots cache, cluster info, etc. Because the
size may be too large on some system, so that normal kzalloc() may fail.
But using kzalloc() has some advantages, for example, less memory
fragmentation, less TLB pressure, etc. So change the data structure
allocation in swap code to use kvzalloc() which will try kzalloc()
firstly, and fallback to vzalloc() if kzalloc() failed.
In general, although kmalloc() will reduce the number of high-order
pages in short term, vmalloc() will cause more pain for memory
fragmentation in the long term. And the swap data structure allocation
that is changed in this patch is expected to be long term allocation.
From Dave Hansen:
"for example, we have a two-page data structure. vmalloc() takes two
effectively random order-0 pages, probably from two different 2M pages
and pins them. That "kills" two 2M pages. kmalloc(), allocating two
*contiguous* pages, will not cross a 2M boundary. That means it will
only "kill" the possibility of a single 2M page. More 2M pages == less
fragmentation.
The allocation in this patch occurs during swap on time, which is
usually done during system boot, so usually we have high opportunity to
allocate the contiguous pages successfully.
The allocation for swap_map[] in struct swap_info_struct is not changed,
because that is usually quite large and vmalloc_to_page() is used for
it. That makes it a little harder to change.
Link: http://lkml.kernel.org/r/20170407064911.25447-1-ying.huang@intel.com
Signed-off-by: Huang Ying <ying.huang@intel.com>
Acked-by: Tim Chen <tim.c.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-05-09 06:57:40 +08:00
|
|
|
* as kvzalloc could trigger reclaim and get_swap_page,
|
2017-02-23 07:45:39 +08:00
|
|
|
* which can lock swap_slots_cache_mutex.
|
|
|
|
*/
|
mm, swap: use kvzalloc to allocate some swap data structures
Now vzalloc() is used in swap code to allocate various data structures,
such as swap cache, swap slots cache, cluster info, etc. Because the
size may be too large on some system, so that normal kzalloc() may fail.
But using kzalloc() has some advantages, for example, less memory
fragmentation, less TLB pressure, etc. So change the data structure
allocation in swap code to use kvzalloc() which will try kzalloc()
firstly, and fallback to vzalloc() if kzalloc() failed.
In general, although kmalloc() will reduce the number of high-order
pages in short term, vmalloc() will cause more pain for memory
fragmentation in the long term. And the swap data structure allocation
that is changed in this patch is expected to be long term allocation.
From Dave Hansen:
"for example, we have a two-page data structure. vmalloc() takes two
effectively random order-0 pages, probably from two different 2M pages
and pins them. That "kills" two 2M pages. kmalloc(), allocating two
*contiguous* pages, will not cross a 2M boundary. That means it will
only "kill" the possibility of a single 2M page. More 2M pages == less
fragmentation.
The allocation in this patch occurs during swap on time, which is
usually done during system boot, so usually we have high opportunity to
allocate the contiguous pages successfully.
The allocation for swap_map[] in struct swap_info_struct is not changed,
because that is usually quite large and vmalloc_to_page() is used for
it. That makes it a little harder to change.
Link: http://lkml.kernel.org/r/20170407064911.25447-1-ying.huang@intel.com
Signed-off-by: Huang Ying <ying.huang@intel.com>
Acked-by: Tim Chen <tim.c.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-05-09 06:57:40 +08:00
|
|
|
slots = kvzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE,
|
|
|
|
GFP_KERNEL);
|
2017-02-23 07:45:39 +08:00
|
|
|
if (!slots)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
mm, swap: use kvzalloc to allocate some swap data structures
Now vzalloc() is used in swap code to allocate various data structures,
such as swap cache, swap slots cache, cluster info, etc. Because the
size may be too large on some system, so that normal kzalloc() may fail.
But using kzalloc() has some advantages, for example, less memory
fragmentation, less TLB pressure, etc. So change the data structure
allocation in swap code to use kvzalloc() which will try kzalloc()
firstly, and fallback to vzalloc() if kzalloc() failed.
In general, although kmalloc() will reduce the number of high-order
pages in short term, vmalloc() will cause more pain for memory
fragmentation in the long term. And the swap data structure allocation
that is changed in this patch is expected to be long term allocation.
From Dave Hansen:
"for example, we have a two-page data structure. vmalloc() takes two
effectively random order-0 pages, probably from two different 2M pages
and pins them. That "kills" two 2M pages. kmalloc(), allocating two
*contiguous* pages, will not cross a 2M boundary. That means it will
only "kill" the possibility of a single 2M page. More 2M pages == less
fragmentation.
The allocation in this patch occurs during swap on time, which is
usually done during system boot, so usually we have high opportunity to
allocate the contiguous pages successfully.
The allocation for swap_map[] in struct swap_info_struct is not changed,
because that is usually quite large and vmalloc_to_page() is used for
it. That makes it a little harder to change.
Link: http://lkml.kernel.org/r/20170407064911.25447-1-ying.huang@intel.com
Signed-off-by: Huang Ying <ying.huang@intel.com>
Acked-by: Tim Chen <tim.c.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-05-09 06:57:40 +08:00
|
|
|
slots_ret = kvzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE,
|
|
|
|
GFP_KERNEL);
|
2017-02-23 07:45:39 +08:00
|
|
|
if (!slots_ret) {
|
mm, swap: use kvzalloc to allocate some swap data structures
Now vzalloc() is used in swap code to allocate various data structures,
such as swap cache, swap slots cache, cluster info, etc. Because the
size may be too large on some system, so that normal kzalloc() may fail.
But using kzalloc() has some advantages, for example, less memory
fragmentation, less TLB pressure, etc. So change the data structure
allocation in swap code to use kvzalloc() which will try kzalloc()
firstly, and fallback to vzalloc() if kzalloc() failed.
In general, although kmalloc() will reduce the number of high-order
pages in short term, vmalloc() will cause more pain for memory
fragmentation in the long term. And the swap data structure allocation
that is changed in this patch is expected to be long term allocation.
From Dave Hansen:
"for example, we have a two-page data structure. vmalloc() takes two
effectively random order-0 pages, probably from two different 2M pages
and pins them. That "kills" two 2M pages. kmalloc(), allocating two
*contiguous* pages, will not cross a 2M boundary. That means it will
only "kill" the possibility of a single 2M page. More 2M pages == less
fragmentation.
The allocation in this patch occurs during swap on time, which is
usually done during system boot, so usually we have high opportunity to
allocate the contiguous pages successfully.
The allocation for swap_map[] in struct swap_info_struct is not changed,
because that is usually quite large and vmalloc_to_page() is used for
it. That makes it a little harder to change.
Link: http://lkml.kernel.org/r/20170407064911.25447-1-ying.huang@intel.com
Signed-off-by: Huang Ying <ying.huang@intel.com>
Acked-by: Tim Chen <tim.c.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-05-09 06:57:40 +08:00
|
|
|
kvfree(slots);
|
2017-02-23 07:45:39 +08:00
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_lock(&swap_slots_cache_mutex);
|
|
|
|
cache = &per_cpu(swp_slots, cpu);
|
|
|
|
if (cache->slots || cache->slots_ret)
|
|
|
|
/* cache already allocated */
|
|
|
|
goto out;
|
|
|
|
if (!cache->lock_initialized) {
|
|
|
|
mutex_init(&cache->alloc_lock);
|
|
|
|
spin_lock_init(&cache->free_lock);
|
|
|
|
cache->lock_initialized = true;
|
|
|
|
}
|
|
|
|
cache->nr = 0;
|
|
|
|
cache->cur = 0;
|
|
|
|
cache->n_ret = 0;
|
|
|
|
cache->slots = slots;
|
|
|
|
slots = NULL;
|
|
|
|
cache->slots_ret = slots_ret;
|
|
|
|
slots_ret = NULL;
|
|
|
|
out:
|
|
|
|
mutex_unlock(&swap_slots_cache_mutex);
|
|
|
|
if (slots)
|
mm, swap: use kvzalloc to allocate some swap data structures
Now vzalloc() is used in swap code to allocate various data structures,
such as swap cache, swap slots cache, cluster info, etc. Because the
size may be too large on some system, so that normal kzalloc() may fail.
But using kzalloc() has some advantages, for example, less memory
fragmentation, less TLB pressure, etc. So change the data structure
allocation in swap code to use kvzalloc() which will try kzalloc()
firstly, and fallback to vzalloc() if kzalloc() failed.
In general, although kmalloc() will reduce the number of high-order
pages in short term, vmalloc() will cause more pain for memory
fragmentation in the long term. And the swap data structure allocation
that is changed in this patch is expected to be long term allocation.
From Dave Hansen:
"for example, we have a two-page data structure. vmalloc() takes two
effectively random order-0 pages, probably from two different 2M pages
and pins them. That "kills" two 2M pages. kmalloc(), allocating two
*contiguous* pages, will not cross a 2M boundary. That means it will
only "kill" the possibility of a single 2M page. More 2M pages == less
fragmentation.
The allocation in this patch occurs during swap on time, which is
usually done during system boot, so usually we have high opportunity to
allocate the contiguous pages successfully.
The allocation for swap_map[] in struct swap_info_struct is not changed,
because that is usually quite large and vmalloc_to_page() is used for
it. That makes it a little harder to change.
Link: http://lkml.kernel.org/r/20170407064911.25447-1-ying.huang@intel.com
Signed-off-by: Huang Ying <ying.huang@intel.com>
Acked-by: Tim Chen <tim.c.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-05-09 06:57:40 +08:00
|
|
|
kvfree(slots);
|
2017-02-23 07:45:39 +08:00
|
|
|
if (slots_ret)
|
mm, swap: use kvzalloc to allocate some swap data structures
Now vzalloc() is used in swap code to allocate various data structures,
such as swap cache, swap slots cache, cluster info, etc. Because the
size may be too large on some system, so that normal kzalloc() may fail.
But using kzalloc() has some advantages, for example, less memory
fragmentation, less TLB pressure, etc. So change the data structure
allocation in swap code to use kvzalloc() which will try kzalloc()
firstly, and fallback to vzalloc() if kzalloc() failed.
In general, although kmalloc() will reduce the number of high-order
pages in short term, vmalloc() will cause more pain for memory
fragmentation in the long term. And the swap data structure allocation
that is changed in this patch is expected to be long term allocation.
From Dave Hansen:
"for example, we have a two-page data structure. vmalloc() takes two
effectively random order-0 pages, probably from two different 2M pages
and pins them. That "kills" two 2M pages. kmalloc(), allocating two
*contiguous* pages, will not cross a 2M boundary. That means it will
only "kill" the possibility of a single 2M page. More 2M pages == less
fragmentation.
The allocation in this patch occurs during swap on time, which is
usually done during system boot, so usually we have high opportunity to
allocate the contiguous pages successfully.
The allocation for swap_map[] in struct swap_info_struct is not changed,
because that is usually quite large and vmalloc_to_page() is used for
it. That makes it a little harder to change.
Link: http://lkml.kernel.org/r/20170407064911.25447-1-ying.huang@intel.com
Signed-off-by: Huang Ying <ying.huang@intel.com>
Acked-by: Tim Chen <tim.c.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-05-09 06:57:40 +08:00
|
|
|
kvfree(slots_ret);
|
2017-02-23 07:45:39 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
|
|
|
|
bool free_slots)
|
|
|
|
{
|
|
|
|
struct swap_slots_cache *cache;
|
|
|
|
swp_entry_t *slots = NULL;
|
|
|
|
|
|
|
|
cache = &per_cpu(swp_slots, cpu);
|
|
|
|
if ((type & SLOTS_CACHE) && cache->slots) {
|
|
|
|
mutex_lock(&cache->alloc_lock);
|
|
|
|
swapcache_free_entries(cache->slots + cache->cur, cache->nr);
|
|
|
|
cache->cur = 0;
|
|
|
|
cache->nr = 0;
|
|
|
|
if (free_slots && cache->slots) {
|
mm, swap: use kvzalloc to allocate some swap data structures
Now vzalloc() is used in swap code to allocate various data structures,
such as swap cache, swap slots cache, cluster info, etc. Because the
size may be too large on some system, so that normal kzalloc() may fail.
But using kzalloc() has some advantages, for example, less memory
fragmentation, less TLB pressure, etc. So change the data structure
allocation in swap code to use kvzalloc() which will try kzalloc()
firstly, and fallback to vzalloc() if kzalloc() failed.
In general, although kmalloc() will reduce the number of high-order
pages in short term, vmalloc() will cause more pain for memory
fragmentation in the long term. And the swap data structure allocation
that is changed in this patch is expected to be long term allocation.
From Dave Hansen:
"for example, we have a two-page data structure. vmalloc() takes two
effectively random order-0 pages, probably from two different 2M pages
and pins them. That "kills" two 2M pages. kmalloc(), allocating two
*contiguous* pages, will not cross a 2M boundary. That means it will
only "kill" the possibility of a single 2M page. More 2M pages == less
fragmentation.
The allocation in this patch occurs during swap on time, which is
usually done during system boot, so usually we have high opportunity to
allocate the contiguous pages successfully.
The allocation for swap_map[] in struct swap_info_struct is not changed,
because that is usually quite large and vmalloc_to_page() is used for
it. That makes it a little harder to change.
Link: http://lkml.kernel.org/r/20170407064911.25447-1-ying.huang@intel.com
Signed-off-by: Huang Ying <ying.huang@intel.com>
Acked-by: Tim Chen <tim.c.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-05-09 06:57:40 +08:00
|
|
|
kvfree(cache->slots);
|
2017-02-23 07:45:39 +08:00
|
|
|
cache->slots = NULL;
|
|
|
|
}
|
|
|
|
mutex_unlock(&cache->alloc_lock);
|
|
|
|
}
|
|
|
|
if ((type & SLOTS_CACHE_RET) && cache->slots_ret) {
|
|
|
|
spin_lock_irq(&cache->free_lock);
|
|
|
|
swapcache_free_entries(cache->slots_ret, cache->n_ret);
|
|
|
|
cache->n_ret = 0;
|
|
|
|
if (free_slots && cache->slots_ret) {
|
|
|
|
slots = cache->slots_ret;
|
|
|
|
cache->slots_ret = NULL;
|
|
|
|
}
|
|
|
|
spin_unlock_irq(&cache->free_lock);
|
|
|
|
if (slots)
|
mm, swap: use kvzalloc to allocate some swap data structures
Now vzalloc() is used in swap code to allocate various data structures,
such as swap cache, swap slots cache, cluster info, etc. Because the
size may be too large on some system, so that normal kzalloc() may fail.
But using kzalloc() has some advantages, for example, less memory
fragmentation, less TLB pressure, etc. So change the data structure
allocation in swap code to use kvzalloc() which will try kzalloc()
firstly, and fallback to vzalloc() if kzalloc() failed.
In general, although kmalloc() will reduce the number of high-order
pages in short term, vmalloc() will cause more pain for memory
fragmentation in the long term. And the swap data structure allocation
that is changed in this patch is expected to be long term allocation.
From Dave Hansen:
"for example, we have a two-page data structure. vmalloc() takes two
effectively random order-0 pages, probably from two different 2M pages
and pins them. That "kills" two 2M pages. kmalloc(), allocating two
*contiguous* pages, will not cross a 2M boundary. That means it will
only "kill" the possibility of a single 2M page. More 2M pages == less
fragmentation.
The allocation in this patch occurs during swap on time, which is
usually done during system boot, so usually we have high opportunity to
allocate the contiguous pages successfully.
The allocation for swap_map[] in struct swap_info_struct is not changed,
because that is usually quite large and vmalloc_to_page() is used for
it. That makes it a little harder to change.
Link: http://lkml.kernel.org/r/20170407064911.25447-1-ying.huang@intel.com
Signed-off-by: Huang Ying <ying.huang@intel.com>
Acked-by: Tim Chen <tim.c.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-05-09 06:57:40 +08:00
|
|
|
kvfree(slots);
|
2017-02-23 07:45:39 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __drain_swap_slots_cache(unsigned int type)
|
|
|
|
{
|
|
|
|
unsigned int cpu;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This function is called during
|
|
|
|
* 1) swapoff, when we have to make sure no
|
|
|
|
* left over slots are in cache when we remove
|
|
|
|
* a swap device;
|
|
|
|
* 2) disabling of swap slot cache, when we run low
|
|
|
|
* on swap slots when allocating memory and need
|
|
|
|
* to return swap slots to global pool.
|
|
|
|
*
|
|
|
|
* We cannot acquire cpu hot plug lock here as
|
|
|
|
* this function can be invoked in the cpu
|
|
|
|
* hot plug path:
|
|
|
|
* cpu_up -> lock cpu_hotplug -> cpu hotplug state callback
|
|
|
|
* -> memory allocation -> direct reclaim -> get_swap_page
|
|
|
|
* -> drain_swap_slots_cache
|
|
|
|
*
|
|
|
|
* Hence the loop over current online cpu below could miss cpu that
|
|
|
|
* is being brought online but not yet marked as online.
|
|
|
|
* That is okay as we do not schedule and run anything on a
|
|
|
|
* cpu before it has been marked online. Hence, we will not
|
|
|
|
* fill any swap slots in slots cache of such cpu.
|
|
|
|
* There are no slots on such cpu that need to be drained.
|
|
|
|
*/
|
|
|
|
for_each_online_cpu(cpu)
|
|
|
|
drain_slots_cache_cpu(cpu, type, false);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int free_slot_cache(unsigned int cpu)
|
|
|
|
{
|
|
|
|
mutex_lock(&swap_slots_cache_mutex);
|
|
|
|
drain_slots_cache_cpu(cpu, SLOTS_CACHE | SLOTS_CACHE_RET, true);
|
|
|
|
mutex_unlock(&swap_slots_cache_mutex);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int enable_swap_slots_cache(void)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
mutex_lock(&swap_slots_cache_enable_mutex);
|
|
|
|
if (swap_slot_cache_initialized) {
|
|
|
|
__reenable_swap_slots_cache();
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache",
|
|
|
|
alloc_swap_slot_cache, free_slot_cache);
|
2017-05-04 05:54:48 +08:00
|
|
|
if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating "
|
|
|
|
"without swap slots cache.\n", __func__))
|
2017-02-23 07:45:39 +08:00
|
|
|
goto out_unlock;
|
2017-05-04 05:54:48 +08:00
|
|
|
|
2017-02-23 07:45:39 +08:00
|
|
|
swap_slot_cache_initialized = true;
|
|
|
|
__reenable_swap_slots_cache();
|
|
|
|
out_unlock:
|
|
|
|
mutex_unlock(&swap_slots_cache_enable_mutex);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* called with swap slot cache's alloc lock held */
|
|
|
|
static int refill_swap_slots_cache(struct swap_slots_cache *cache)
|
|
|
|
{
|
|
|
|
if (!use_swap_slot_cache || cache->nr)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
cache->cur = 0;
|
|
|
|
if (swap_slot_cache_active)
|
|
|
|
cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, cache->slots);
|
|
|
|
|
|
|
|
return cache->nr;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Release swap slot @entry.
 *
 * When the slots cache is in use and this cpu has a return array, the
 * entry is queued in cache->slots_ret; a full array is first flushed with
 * swapcache_free_entries().  Otherwise the entry is freed directly.
 *
 * Always returns 0.
 */
int free_swap_slot(swp_entry_t entry)
{
	struct swap_slots_cache *cache;

	cache = &get_cpu_var(swp_slots);
	if (use_swap_slot_cache && cache->slots_ret) {
		spin_lock_irq(&cache->free_lock);
		/*
		 * Swap slots cache may be deactivated before acquiring lock.
		 * The return array may also have been detached meanwhile:
		 * drain_slots_cache_cpu() clears cache->slots_ret under
		 * free_lock, so it has to be re-checked here before it is
		 * dereferenced below.
		 */
		if (!use_swap_slot_cache || !cache->slots_ret) {
			spin_unlock_irq(&cache->free_lock);
			goto direct_free;
		}
		if (cache->n_ret >= SWAP_SLOTS_CACHE_SIZE) {
			/*
			 * Return slots to global pool.
			 * The current swap_map value is SWAP_HAS_CACHE.
			 * Set it to 0 to indicate it is available for
			 * allocation in global pool
			 */
			swapcache_free_entries(cache->slots_ret, cache->n_ret);
			cache->n_ret = 0;
		}
		cache->slots_ret[cache->n_ret++] = entry;
		spin_unlock_irq(&cache->free_lock);
	} else {
direct_free:
		swapcache_free_entries(&entry, 1);
	}
	put_cpu_var(swp_slots);

	return 0;
}
|
|
|
|
|
|
|
|
swp_entry_t get_swap_page(void)
|
|
|
|
{
|
|
|
|
swp_entry_t entry, *pentry;
|
|
|
|
struct swap_slots_cache *cache;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Preemption is allowed here, because we may sleep
|
|
|
|
* in refill_swap_slots_cache(). But it is safe, because
|
|
|
|
* accesses to the per-CPU data structure are protected by the
|
|
|
|
* mutex cache->alloc_lock.
|
|
|
|
*
|
|
|
|
* The alloc path here does not touch cache->slots_ret
|
|
|
|
* so cache->free_lock is not taken.
|
|
|
|
*/
|
|
|
|
cache = raw_cpu_ptr(&swp_slots);
|
|
|
|
|
|
|
|
entry.val = 0;
|
|
|
|
if (check_cache_active()) {
|
|
|
|
mutex_lock(&cache->alloc_lock);
|
|
|
|
if (cache->slots) {
|
|
|
|
repeat:
|
|
|
|
if (cache->nr) {
|
|
|
|
pentry = &cache->slots[cache->cur++];
|
|
|
|
entry = *pentry;
|
|
|
|
pentry->val = 0;
|
|
|
|
cache->nr--;
|
|
|
|
} else {
|
|
|
|
if (refill_swap_slots_cache(cache))
|
|
|
|
goto repeat;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
mutex_unlock(&cache->alloc_lock);
|
|
|
|
if (entry.val)
|
|
|
|
return entry;
|
|
|
|
}
|
|
|
|
|
|
|
|
get_swap_pages(1, &entry);
|
|
|
|
|
|
|
|
return entry;
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* CONFIG_SWAP */
|