2008-07-26 10:44:36 +08:00
|
|
|
#include <linux/mm.h>
|
2006-01-08 17:01:43 +08:00
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/string.h>
|
2014-04-08 06:37:26 +08:00
|
|
|
#include <linux/compiler.h>
|
2011-10-16 14:01:52 +08:00
|
|
|
#include <linux/export.h>
|
2006-03-24 19:18:42 +08:00
|
|
|
#include <linux/err.h>
|
2008-07-27 06:22:28 +08:00
|
|
|
#include <linux/sched.h>
|
2012-05-31 08:17:35 +08:00
|
|
|
#include <linux/security.h>
|
2013-02-23 08:34:35 +08:00
|
|
|
#include <linux/swap.h>
|
2013-02-23 08:34:37 +08:00
|
|
|
#include <linux/swapops.h>
|
2013-11-13 07:08:31 +08:00
|
|
|
#include <linux/mman.h>
|
|
|
|
#include <linux/hugetlb.h>
|
2014-05-07 02:02:53 +08:00
|
|
|
#include <linux/vmalloc.h>
|
2013-11-13 07:08:31 +08:00
|
|
|
|
2015-02-14 06:36:24 +08:00
|
|
|
#include <asm/sections.h>
|
2006-03-24 19:18:42 +08:00
|
|
|
#include <asm/uaccess.h>
|
2006-01-08 17:01:43 +08:00
|
|
|
|
mm: nommu: sort mm->mmap list properly
When I was reading nommu code, I found that it handles the vma list/tree
in an unusual way. IIUC, because there can be more than one
identical/overlapped vmas in the list/tree, it sorts the tree more
strictly and does a linear search on the tree. But this isn't applied to
the list (i.e. the list could be constructed in a different order than
the tree so that we can't use the list when finding the first vma in that
order).
Since inserting/sorting a vma in the tree and link is done at the same
time, we can easily construct both of them in the same order. And linear
searching on the tree could be more costly than doing it on the list, it
can be converted to use the list.
Also, after the commit 297c5eee3724 ("mm: make the vma list be doubly
linked") made the list doubly linked, a couple of places in the code
needed to be fixed to construct the list properly.
Patch 1/6 is a preparation. It maintains the list sorted same as the tree
and construct doubly-linked list properly. Patch 2/6 is a simple
optimization for the vma deletion. Patch 3/6 and 4/6 convert tree
traversal to list traversal and the rest are simple fixes and cleanups.
This patch:
@vma added into @mm should be sorted by start addr, end addr and VMA
struct addr in that order because we may get identical VMAs in the @mm.
However this was true only for the rbtree, not for the list.
This patch fixes this by remembering 'rb_prev' during the tree traversal
like find_vma_prepare() does and linking the @vma via __vma_link_list().
After this patch, we can iterate the whole VMAs in correct order simply by
using @mm->mmap list.
[akpm@linux-foundation.org: avoid duplicating __vma_link_list()]
Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Acked-by: Greg Ungerer <gerg@uclinux.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-05-25 08:11:22 +08:00
|
|
|
#include "internal.h"
|
|
|
|
|
2015-02-14 06:36:24 +08:00
|
|
|
static inline int is_kernel_rodata(unsigned long addr)
|
|
|
|
{
|
|
|
|
return addr >= (unsigned long)__start_rodata &&
|
|
|
|
addr < (unsigned long)__end_rodata;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * kfree_const - conditionally free memory
 * @x: pointer to the memory
 *
 * Pointers that is_kernel_rodata() reports as lying in the .rodata section
 * are left untouched; everything else is handed to kfree().
 */
void kfree_const(const void *x)
{
	if (is_kernel_rodata((unsigned long)x))
		return;

	kfree(x);
}
EXPORT_SYMBOL(kfree_const);
|
|
|
|
|
2006-01-08 17:01:43 +08:00
|
|
|
/**
|
|
|
|
* kstrdup - allocate space for and copy an existing string
|
|
|
|
* @s: the string to duplicate
|
|
|
|
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
|
|
|
|
*/
|
|
|
|
char *kstrdup(const char *s, gfp_t gfp)
|
|
|
|
{
|
|
|
|
size_t len;
|
|
|
|
char *buf;
|
|
|
|
|
|
|
|
if (!s)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
len = strlen(s) + 1;
|
2006-10-04 17:15:25 +08:00
|
|
|
buf = kmalloc_track_caller(len, gfp);
|
2006-01-08 17:01:43 +08:00
|
|
|
if (buf)
|
|
|
|
memcpy(buf, s, len);
|
|
|
|
return buf;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kstrdup);
|
2006-03-24 19:18:42 +08:00
|
|
|
|
2015-02-14 06:36:24 +08:00
|
|
|
/**
|
|
|
|
* kstrdup_const - conditionally duplicate an existing const string
|
|
|
|
* @s: the string to duplicate
|
|
|
|
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
|
|
|
|
*
|
|
|
|
* Function returns source string if it is in .rodata section otherwise it
|
|
|
|
* fallbacks to kstrdup.
|
|
|
|
* Strings allocated by kstrdup_const should be freed by kfree_const.
|
|
|
|
*/
|
|
|
|
const char *kstrdup_const(const char *s, gfp_t gfp)
|
|
|
|
{
|
|
|
|
if (is_kernel_rodata((unsigned long)s))
|
|
|
|
return s;
|
|
|
|
|
|
|
|
return kstrdup(s, gfp);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kstrdup_const);
|
|
|
|
|
2007-07-18 09:37:02 +08:00
|
|
|
/**
|
|
|
|
* kstrndup - allocate space for and copy an existing string
|
|
|
|
* @s: the string to duplicate
|
|
|
|
* @max: read at most @max chars from @s
|
|
|
|
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
|
|
|
|
*/
|
|
|
|
char *kstrndup(const char *s, size_t max, gfp_t gfp)
|
|
|
|
{
|
|
|
|
size_t len;
|
|
|
|
char *buf;
|
|
|
|
|
|
|
|
if (!s)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
len = strnlen(s, max);
|
|
|
|
buf = kmalloc_track_caller(len+1, gfp);
|
|
|
|
if (buf) {
|
|
|
|
memcpy(buf, s, len);
|
|
|
|
buf[len] = '\0';
|
|
|
|
}
|
|
|
|
return buf;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kstrndup);
|
|
|
|
|
[PATCH] kmemdup: introduce
One of idiomatic ways to duplicate a region of memory is
dst = kmalloc(len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
memcpy(dst, src, len);
which is neat code except a programmer needs to write size twice. Which
sometimes leads to mistakes. If len passed to kmalloc is smaller than len
passed to memcpy, it's straight overwrite-beyond-end. If len passed to
memcpy is smaller than len passed to kmalloc, it's either a) legit
behaviour ;-), or b) cloned buffer will contain garbage in second half.
Slight trolling of commit lists shows several duplications bugs
done exactly because of diverged lengths:
Linux:
[CRYPTO]: Fix memcpy/memset args.
[PATCH] memcpy/memset fixes
OpenBSD:
kerberosV/src/lib/asn1: der_copy.c:1.4
If programmer is given only one place to play with lengths, I believe, such
mistakes could be avoided.
With kmemdup, the snippet above will be rewritten as:
dst = kmemdup(src, len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
This also leads to smaller code (kzalloc effect). Quick grep shows
200+ places where kmemdup() can be used.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-01 14:27:20 +08:00
|
|
|
/**
|
|
|
|
* kmemdup - duplicate region of memory
|
|
|
|
*
|
|
|
|
* @src: memory region to duplicate
|
|
|
|
* @len: memory region length
|
|
|
|
* @gfp: GFP mask to use
|
|
|
|
*/
|
|
|
|
void *kmemdup(const void *src, size_t len, gfp_t gfp)
|
|
|
|
{
|
|
|
|
void *p;
|
|
|
|
|
2006-10-04 17:15:25 +08:00
|
|
|
p = kmalloc_track_caller(len, gfp);
|
[PATCH] kmemdup: introduce
One of idiomatic ways to duplicate a region of memory is
dst = kmalloc(len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
memcpy(dst, src, len);
which is neat code except a programmer needs to write size twice. Which
sometimes leads to mistakes. If len passed to kmalloc is smaller that len
passed to memcpy, it's straight overwrite-beyond-end. If len passed to
memcpy is smaller than len passed to kmalloc, it's either a) legit
behaviour ;-), or b) cloned buffer will contain garbage in second half.
Slight trolling of commit lists shows several duplications bugs
done exactly because of diverged lenghts:
Linux:
[CRYPTO]: Fix memcpy/memset args.
[PATCH] memcpy/memset fixes
OpenBSD:
kerberosV/src/lib/asn1: der_copy.c:1.4
If programmer is given only one place to play with lengths, I believe, such
mistakes could be avoided.
With kmemdup, the snippet above will be rewritten as:
dst = kmemdup(src, len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
This also leads to smaller code (kzalloc effect). Quick grep shows
200+ places where kmemdup() can be used.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-01 14:27:20 +08:00
|
|
|
if (p)
|
|
|
|
memcpy(p, src, len);
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kmemdup);
|
|
|
|
|
2009-04-01 06:23:16 +08:00
|
|
|
/**
|
|
|
|
* memdup_user - duplicate memory region from user space
|
|
|
|
*
|
|
|
|
* @src: source address in user space
|
|
|
|
* @len: number of bytes to copy
|
|
|
|
*
|
|
|
|
* Returns an ERR_PTR() on failure.
|
|
|
|
*/
|
|
|
|
void *memdup_user(const void __user *src, size_t len)
|
|
|
|
{
|
|
|
|
void *p;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Always use GFP_KERNEL, since copy_from_user() can sleep and
|
|
|
|
* cause pagefault, which makes it pointless to use GFP_NOFS
|
|
|
|
* or GFP_ATOMIC.
|
|
|
|
*/
|
|
|
|
p = kmalloc_track_caller(len, GFP_KERNEL);
|
|
|
|
if (!p)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
if (copy_from_user(p, src, len)) {
|
|
|
|
kfree(p);
|
|
|
|
return ERR_PTR(-EFAULT);
|
|
|
|
}
|
|
|
|
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(memdup_user);
|
|
|
|
|
2006-03-24 19:18:42 +08:00
|
|
|
/*
|
|
|
|
* strndup_user - duplicate an existing string from user space
|
|
|
|
* @s: The string to duplicate
|
|
|
|
* @n: Maximum number of bytes to copy, including the trailing NUL.
|
|
|
|
*/
|
|
|
|
char *strndup_user(const char __user *s, long n)
|
|
|
|
{
|
|
|
|
char *p;
|
|
|
|
long length;
|
|
|
|
|
|
|
|
length = strnlen_user(s, n);
|
|
|
|
|
|
|
|
if (!length)
|
|
|
|
return ERR_PTR(-EFAULT);
|
|
|
|
|
|
|
|
if (length > n)
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
2010-08-10 08:18:26 +08:00
|
|
|
p = memdup_user(s, length);
|
2006-03-24 19:18:42 +08:00
|
|
|
|
2010-08-10 08:18:26 +08:00
|
|
|
if (IS_ERR(p))
|
|
|
|
return p;
|
2006-03-24 19:18:42 +08:00
|
|
|
|
|
|
|
p[length - 1] = '\0';
|
|
|
|
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(strndup_user);
|
2008-07-26 10:44:36 +08:00
|
|
|
|
2015-12-24 13:06:05 +08:00
|
|
|
/**
|
|
|
|
* memdup_user_nul - duplicate memory region from user space and NUL-terminate
|
|
|
|
*
|
|
|
|
* @src: source address in user space
|
|
|
|
* @len: number of bytes to copy
|
|
|
|
*
|
|
|
|
* Returns an ERR_PTR() on failure.
|
|
|
|
*/
|
|
|
|
void *memdup_user_nul(const void __user *src, size_t len)
|
|
|
|
{
|
|
|
|
char *p;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Always use GFP_KERNEL, since copy_from_user() can sleep and
|
|
|
|
* cause pagefault, which makes it pointless to use GFP_NOFS
|
|
|
|
* or GFP_ATOMIC.
|
|
|
|
*/
|
|
|
|
p = kmalloc_track_caller(len + 1, GFP_KERNEL);
|
|
|
|
if (!p)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
if (copy_from_user(p, src, len)) {
|
|
|
|
kfree(p);
|
|
|
|
return ERR_PTR(-EFAULT);
|
|
|
|
}
|
|
|
|
p[len] = '\0';
|
|
|
|
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(memdup_user_nul);
|
|
|
|
|
mm: nommu: sort mm->mmap list properly
When I was reading nommu code, I found that it handles the vma list/tree
in an unusual way. IIUC, because there can be more than one
identical/overlapped vmas in the list/tree, it sorts the tree more
strictly and does a linear search on the tree. But this isn't applied to
the list (i.e. the list could be constructed in a different order than
the tree so that we can't use the list when finding the first vma in that
order).
Since inserting/sorting a vma in the tree and link is done at the same
time, we can easily construct both of them in the same order. And linear
searching on the tree could be more costly than doing it on the list, it
can be converted to use the list.
Also, after the commit 297c5eee3724 ("mm: make the vma list be doubly
linked") made the list doubly linked, a couple of places in the code
needed to be fixed to construct the list properly.
Patch 1/6 is a preparation. It maintains the list sorted same as the tree
and construct doubly-linked list properly. Patch 2/6 is a simple
optimization for the vma deletion. Patch 3/6 and 4/6 convert tree
traversal to list traversal and the rest are simple fixes and cleanups.
This patch:
@vma added into @mm should be sorted by start addr, end addr and VMA
struct addr in that order because we may get identical VMAs in the @mm.
However this was true only for the rbtree, not for the list.
This patch fixes this by remembering 'rb_prev' during the tree traversal
like find_vma_prepare() does and linking the @vma via __vma_link_list().
After this patch, we can iterate the whole VMAs in correct order simply by
using @mm->mmap list.
[akpm@linux-foundation.org: avoid duplicating __vma_link_list()]
Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Acked-by: Greg Ungerer <gerg@uclinux.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-05-25 08:11:22 +08:00
|
|
|
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
|
|
|
|
struct vm_area_struct *prev, struct rb_node *rb_parent)
|
|
|
|
{
|
|
|
|
struct vm_area_struct *next;
|
|
|
|
|
|
|
|
vma->vm_prev = prev;
|
|
|
|
if (prev) {
|
|
|
|
next = prev->vm_next;
|
|
|
|
prev->vm_next = vma;
|
|
|
|
} else {
|
|
|
|
mm->mmap = vma;
|
|
|
|
if (rb_parent)
|
|
|
|
next = rb_entry(rb_parent,
|
|
|
|
struct vm_area_struct, vm_rb);
|
|
|
|
else
|
|
|
|
next = NULL;
|
|
|
|
}
|
|
|
|
vma->vm_next = next;
|
|
|
|
if (next)
|
|
|
|
next->vm_prev = vma;
|
|
|
|
}
|
|
|
|
|
procfs: mark thread stack correctly in proc/<pid>/maps
Stack for a new thread is mapped by userspace code and passed via
sys_clone. This memory is currently seen as anonymous in
/proc/<pid>/maps, which makes it difficult to ascertain which mappings
are being used for thread stacks. This patch uses the individual task
stack pointers to determine which vmas are actually thread stacks.
For a multithreaded program like the following:
#include <pthread.h>
void *thread_main(void *foo)
{
while(1);
}
int main()
{
pthread_t t;
pthread_create(&t, NULL, thread_main, NULL);
pthread_join(t, NULL);
}
proc/PID/maps looks like the following:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
Here, one could guess that 7f8a44492000-7f8a44c92000 is a stack since
the earlier vma that has no permissions (7f8a44e3d000-7f8a4503d000) but
that is not always a reliable way to find out which vma is a thread
stack. Also, /proc/PID/maps and /proc/PID/task/TID/maps has the same
content.
With this patch in place, /proc/PID/task/TID/maps are treated as 'maps
as the task would see it' and hence, only the vma that that task uses as
stack is marked as [stack]. All other 'stack' vmas are marked as
anonymous memory. /proc/PID/maps acts as a thread group level view,
where all thread stack vmas are marked as [stack:TID] where TID is the
process ID of the task that uses that vma as stack, while the process
stack is marked as [stack].
So /proc/PID/maps will look like this:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack:1442]
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
Thus marking all vmas that are used as stacks by the threads in the
thread group along with the process stack. The task level maps will
however look like this:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack]
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
where only the vma that is being used as a stack by *that* task is
marked as [stack].
Analogous changes have been made to /proc/PID/smaps,
/proc/PID/numa_maps, /proc/PID/task/TID/smaps and
/proc/PID/task/TID/numa_maps. Relevant snippets from smaps and
numa_maps:
[siddhesh@localhost ~ ]$ pgrep a.out
1441
[siddhesh@localhost ~ ]$ cat /proc/1441/smaps | grep "\[stack"
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack:1442]
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1442/smaps | grep "\[stack"
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1441/smaps | grep "\[stack"
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/numa_maps | grep "stack"
7f8a44492000 default stack:1442 anon=2 dirty=2 N0=2
7fff6273a000 default stack anon=3 dirty=3 N0=3
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1442/numa_maps | grep "stack"
7f8a44492000 default stack anon=2 dirty=2 N0=2
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1441/numa_maps | grep "stack"
7fff6273a000 default stack anon=3 dirty=3 N0=3
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix build]
Signed-off-by: Siddhesh Poyarekar <siddhesh.poyarekar@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jamie Lokier <jamie@shareable.org>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 07:34:04 +08:00
|
|
|
/* Check if the vma is being used as a stack by this task */
|
2016-02-03 08:57:29 +08:00
|
|
|
int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t)
|
procfs: mark thread stack correctly in proc/<pid>/maps
Stack for a new thread is mapped by userspace code and passed via
sys_clone. This memory is currently seen as anonymous in
/proc/<pid>/maps, which makes it difficult to ascertain which mappings
are being used for thread stacks. This patch uses the individual task
stack pointers to determine which vmas are actually thread stacks.
For a multithreaded program like the following:
#include <pthread.h>
void *thread_main(void *foo)
{
while(1);
}
int main()
{
pthread_t t;
pthread_create(&t, NULL, thread_main, NULL);
pthread_join(t, NULL);
}
proc/PID/maps looks like the following:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
Here, one could guess that 7f8a44492000-7f8a44c92000 is a stack since
the earlier vma that has no permissions (7f8a44e3d000-7f8a4503d000) but
that is not always a reliable way to find out which vma is a thread
stack. Also, /proc/PID/maps and /proc/PID/task/TID/maps has the same
content.
With this patch in place, /proc/PID/task/TID/maps are treated as 'maps
as the task would see it' and hence, only the vma that that task uses as
stack is marked as [stack]. All other 'stack' vmas are marked as
anonymous memory. /proc/PID/maps acts as a thread group level view,
where all thread stack vmas are marked as [stack:TID] where TID is the
process ID of the task that uses that vma as stack, while the process
stack is marked as [stack].
So /proc/PID/maps will look like this:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack:1442]
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
Thus marking all vmas that are used as stacks by the threads in the
thread group along with the process stack. The task level maps will
however like this:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack]
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
where only the vma that is being used as a stack by *that* task is
marked as [stack].
Analogous changes have been made to /proc/PID/smaps,
/proc/PID/numa_maps, /proc/PID/task/TID/smaps and
/proc/PID/task/TID/numa_maps. Relevant snippets from smaps and
numa_maps:
[siddhesh@localhost ~ ]$ pgrep a.out
1441
[siddhesh@localhost ~ ]$ cat /proc/1441/smaps | grep "\[stack"
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack:1442]
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1442/smaps | grep "\[stack"
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1441/smaps | grep "\[stack"
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/numa_maps | grep "stack"
7f8a44492000 default stack:1442 anon=2 dirty=2 N0=2
7fff6273a000 default stack anon=3 dirty=3 N0=3
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1442/numa_maps | grep "stack"
7f8a44492000 default stack anon=2 dirty=2 N0=2
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1441/numa_maps | grep "stack"
7fff6273a000 default stack anon=3 dirty=3 N0=3
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix build]
Signed-off-by: Siddhesh Poyarekar <siddhesh.poyarekar@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jamie Lokier <jamie@shareable.org>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 07:34:04 +08:00
|
|
|
{
|
|
|
|
return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
|
|
|
|
}
|
|
|
|
|
2010-01-16 09:01:35 +08:00
|
|
|
#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
/*
 * Fallback used when CONFIG_MMU is set and HAVE_ARCH_PICK_MMAP_LAYOUT is not
 * defined: mappings start at TASK_UNMAPPED_BASE and are placed by
 * arch_get_unmapped_area().
 */
void arch_pick_mmap_layout(struct mm_struct *mm)
{
	mm->get_unmapped_area = arch_get_unmapped_area;
	mm->mmap_base = TASK_UNMAPPED_BASE;
}
#endif
|
2008-08-13 06:52:52 +08:00
|
|
|
|
2010-08-22 19:08:57 +08:00
|
|
|
/*
|
|
|
|
* Like get_user_pages_fast() except its IRQ-safe in that it won't fall
|
|
|
|
* back to the regular GUP.
|
2011-03-31 09:57:33 +08:00
|
|
|
* If the architecture not support this function, simply return with no
|
2010-08-22 19:08:57 +08:00
|
|
|
* page pinned
|
|
|
|
*/
|
2014-04-08 06:37:26 +08:00
|
|
|
int __weak __get_user_pages_fast(unsigned long start,
|
2010-08-22 19:08:57 +08:00
|
|
|
int nr_pages, int write, struct page **pages)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(__get_user_pages_fast);
|
|
|
|
|
2009-04-14 05:40:05 +08:00
|
|
|
/**
|
|
|
|
* get_user_pages_fast() - pin user pages in memory
|
|
|
|
* @start: starting user address
|
|
|
|
* @nr_pages: number of pages from start to pin
|
|
|
|
* @write: whether pages will be written to
|
|
|
|
* @pages: array that receives pointers to the pages pinned.
|
|
|
|
* Should be at least nr_pages long.
|
|
|
|
*
|
|
|
|
* Returns number of pages pinned. This may be fewer than the number
|
|
|
|
* requested. If nr_pages is 0 or negative, returns 0. If no pages
|
|
|
|
* were pinned, returns -errno.
|
2009-06-17 06:31:39 +08:00
|
|
|
*
|
|
|
|
* get_user_pages_fast provides equivalent functionality to get_user_pages,
|
|
|
|
* operating on current and current->mm, with force=0 and vma=NULL. However
|
|
|
|
* unlike get_user_pages, it must be called without mmap_sem held.
|
|
|
|
*
|
|
|
|
* get_user_pages_fast may take mmap_sem and page table locks, so no
|
|
|
|
* assumptions can be made about lack of locking. get_user_pages_fast is to be
|
|
|
|
* implemented in a way that is advantageous (vs get_user_pages()) when the
|
|
|
|
* user memory area is already faulted in and present in ptes. However if the
|
|
|
|
* pages have to be faulted in, it may turn out to be slightly slower so
|
|
|
|
* callers need to carefully consider what to use. On many architectures,
|
|
|
|
* get_user_pages_fast simply falls back to get_user_pages.
|
2009-04-14 05:40:05 +08:00
|
|
|
*/
|
2014-04-08 06:37:26 +08:00
|
|
|
int __weak get_user_pages_fast(unsigned long start,
|
2008-08-13 06:52:52 +08:00
|
|
|
int nr_pages, int write, struct page **pages)
|
|
|
|
{
|
2016-02-13 05:01:55 +08:00
|
|
|
return get_user_pages_unlocked(start, nr_pages, write, 0, pages);
|
2008-08-13 06:52:52 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(get_user_pages_fast);
|
2009-03-23 21:12:24 +08:00
|
|
|
|
2012-05-31 08:17:35 +08:00
|
|
|
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long prot,
|
|
|
|
unsigned long flag, unsigned long pgoff)
|
|
|
|
{
|
|
|
|
unsigned long ret;
|
|
|
|
struct mm_struct *mm = current->mm;
|
2013-02-23 08:32:47 +08:00
|
|
|
unsigned long populate;
|
2012-05-31 08:17:35 +08:00
|
|
|
|
|
|
|
ret = security_mmap_file(file, prot, flag);
|
|
|
|
if (!ret) {
|
|
|
|
down_write(&mm->mmap_sem);
|
2013-02-23 08:32:37 +08:00
|
|
|
ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
|
|
|
|
&populate);
|
2012-05-31 08:17:35 +08:00
|
|
|
up_write(&mm->mmap_sem);
|
2013-02-23 08:32:47 +08:00
|
|
|
if (populate)
|
|
|
|
mm_populate(ret, populate);
|
2012-05-31 08:17:35 +08:00
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned long vm_mmap(struct file *file, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long prot,
|
|
|
|
unsigned long flag, unsigned long offset)
|
|
|
|
{
|
|
|
|
if (unlikely(offset + PAGE_ALIGN(len) < offset))
|
|
|
|
return -EINVAL;
|
2015-11-06 10:46:46 +08:00
|
|
|
if (unlikely(offset_in_page(offset)))
|
2012-05-31 08:17:35 +08:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(vm_mmap);
|
|
|
|
|
2014-05-07 02:02:53 +08:00
|
|
|
/*
 * Release @addr with the matching deallocator: vfree() when
 * is_vmalloc_addr() recognises the address, kfree() otherwise.
 */
void kvfree(const void *addr)
{
	if (!is_vmalloc_addr(addr)) {
		kfree(addr);
		return;
	}

	vfree(addr);
}
EXPORT_SYMBOL(kvfree);
|
|
|
|
|
2015-04-16 07:14:53 +08:00
|
|
|
static inline void *__page_rmapping(struct page *page)
|
|
|
|
{
|
|
|
|
unsigned long mapping;
|
|
|
|
|
|
|
|
mapping = (unsigned long)page->mapping;
|
|
|
|
mapping &= ~PAGE_MAPPING_FLAGS;
|
|
|
|
|
|
|
|
return (void *)mapping;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Neutral page->mapping pointer to address_space or anon_vma or other.
 * The lookup is done on the compound head of @page.
 */
void *page_rmapping(struct page *page)
{
	return __page_rmapping(compound_head(page));
}
|
|
|
|
|
|
|
|
struct anon_vma *page_anon_vma(struct page *page)
|
|
|
|
{
|
|
|
|
unsigned long mapping;
|
|
|
|
|
|
|
|
page = compound_head(page);
|
|
|
|
mapping = (unsigned long)page->mapping;
|
|
|
|
if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
|
|
|
|
return NULL;
|
|
|
|
return __page_rmapping(page);
|
|
|
|
}
|
|
|
|
|
2013-02-23 08:34:35 +08:00
|
|
|
struct address_space *page_mapping(struct page *page)
|
|
|
|
{
|
2016-01-16 08:52:07 +08:00
|
|
|
struct address_space *mapping;
|
|
|
|
|
|
|
|
page = compound_head(page);
|
2013-02-23 08:34:35 +08:00
|
|
|
|
2014-01-15 09:56:40 +08:00
|
|
|
/* This happens if someone calls flush_dcache_page on slab page */
|
|
|
|
if (unlikely(PageSlab(page)))
|
|
|
|
return NULL;
|
|
|
|
|
2013-02-23 08:34:37 +08:00
|
|
|
if (unlikely(PageSwapCache(page))) {
|
|
|
|
swp_entry_t entry;
|
|
|
|
|
|
|
|
entry.val = page_private(page);
|
2015-04-16 07:14:53 +08:00
|
|
|
return swap_address_space(entry);
|
|
|
|
}
|
|
|
|
|
2016-01-16 08:52:07 +08:00
|
|
|
mapping = page->mapping;
|
|
|
|
if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
|
2015-04-16 07:14:53 +08:00
|
|
|
return NULL;
|
2016-01-16 08:52:07 +08:00
|
|
|
return mapping;
|
2013-02-23 08:34:35 +08:00
|
|
|
}
|
|
|
|
|
2016-01-16 08:54:37 +08:00
|
|
|
/* Slow path of page_mapcount() for compound pages */
|
|
|
|
int __page_mapcount(struct page *page)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = atomic_read(&page->_mapcount) + 1;
|
|
|
|
page = compound_head(page);
|
|
|
|
ret += atomic_read(compound_mapcount_ptr(page)) + 1;
|
|
|
|
if (PageDoubleMap(page))
|
|
|
|
ret--;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(__page_mapcount);
|
|
|
|
|
2016-03-18 05:18:50 +08:00
|
|
|
/* Overcommit policy selector; starts out as OVERCOMMIT_GUESS. */
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
/* Percentage used by vm_commit_limit() when no kbytes value is set. */
int sysctl_overcommit_ratio __read_mostly = 50;
/* Commit limit in kbytes; when non-zero it is used instead of the ratio. */
unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
/* Per-process user reserve, in kbytes: 128MB */
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17;
/* Reserve kept back from callers without admin privileges, in kbytes: 8MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13;
|
|
|
|
|
2014-01-22 07:49:14 +08:00
|
|
|
int overcommit_ratio_handler(struct ctl_table *table, int write,
|
|
|
|
void __user *buffer, size_t *lenp,
|
|
|
|
loff_t *ppos)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = proc_dointvec(table, write, buffer, lenp, ppos);
|
|
|
|
if (ret == 0 && write)
|
|
|
|
sysctl_overcommit_kbytes = 0;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
int overcommit_kbytes_handler(struct ctl_table *table, int write,
|
|
|
|
void __user *buffer, size_t *lenp,
|
|
|
|
loff_t *ppos)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
|
|
|
|
if (ret == 0 && write)
|
|
|
|
sysctl_overcommit_ratio = 0;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-11-13 07:08:31 +08:00
|
|
|
/*
|
|
|
|
* Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
|
|
|
|
*/
|
|
|
|
unsigned long vm_commit_limit(void)
|
|
|
|
{
|
2014-01-22 07:49:14 +08:00
|
|
|
unsigned long allowed;
|
|
|
|
|
|
|
|
if (sysctl_overcommit_kbytes)
|
|
|
|
allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
|
|
|
|
else
|
|
|
|
allowed = ((totalram_pages - hugetlb_total_pages())
|
|
|
|
* sysctl_overcommit_ratio / 100);
|
|
|
|
allowed += total_swap_pages;
|
|
|
|
|
|
|
|
return allowed;
|
2013-11-13 07:08:31 +08:00
|
|
|
}
|
|
|
|
|
2016-03-18 05:18:50 +08:00
|
|
|
/*
|
|
|
|
* Make sure vm_committed_as in one cacheline and not cacheline shared with
|
|
|
|
* other variables. It can be updated by several CPUs frequently.
|
|
|
|
*/
|
|
|
|
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The global memory commitment made in the system can be a metric
|
|
|
|
* that can be used to drive ballooning decisions when Linux is hosted
|
|
|
|
* as a guest. On Hyper-V, the host implements a policy engine for dynamically
|
|
|
|
* balancing memory across competing virtual machines that are hosted.
|
|
|
|
* Several metrics drive this policy engine including the guest reported
|
|
|
|
* memory commitment.
|
|
|
|
*/
|
|
|
|
unsigned long vm_memory_committed(void)
|
|
|
|
{
|
|
|
|
return percpu_counter_read_positive(&vm_committed_as);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(vm_memory_committed);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check that a process has enough memory to allocate a new virtual
|
|
|
|
* mapping. 0 means there is enough memory for the allocation to
|
|
|
|
* succeed and -ENOMEM implies there is not.
|
|
|
|
*
|
|
|
|
* We currently support three overcommit policies, which are set via the
|
|
|
|
* vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
|
|
|
|
*
|
|
|
|
* Strict overcommit modes added 2002 Feb 26 by Alan Cox.
|
|
|
|
* Additional code 2002 Jul 20 by Robert Love.
|
|
|
|
*
|
|
|
|
* cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
|
|
|
|
*
|
|
|
|
* Note this is a helper function intended to be used by LSMs which
|
|
|
|
* wish to use this logic.
|
|
|
|
*/
|
|
|
|
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
|
|
|
|
{
|
|
|
|
long free, allowed, reserve;
|
|
|
|
|
|
|
|
VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
|
|
|
|
-(s64)vm_committed_as_batch * num_online_cpus(),
|
|
|
|
"memory commitment underflow");
|
|
|
|
|
|
|
|
vm_acct_memory(pages);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Sometimes we want to use more memory than we have
|
|
|
|
*/
|
|
|
|
if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
|
|
|
|
free = global_page_state(NR_FREE_PAGES);
|
|
|
|
free += global_page_state(NR_FILE_PAGES);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* shmem pages shouldn't be counted as free in this
|
|
|
|
* case, they can't be purged, only swapped out, and
|
|
|
|
* that won't affect the overall amount of available
|
|
|
|
* memory in the system.
|
|
|
|
*/
|
|
|
|
free -= global_page_state(NR_SHMEM);
|
|
|
|
|
|
|
|
free += get_nr_swap_pages();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Any slabs which are created with the
|
|
|
|
* SLAB_RECLAIM_ACCOUNT flag claim to have contents
|
|
|
|
* which are reclaimable, under pressure. The dentry
|
|
|
|
* cache and most inode caches should fall into this
|
|
|
|
*/
|
|
|
|
free += global_page_state(NR_SLAB_RECLAIMABLE);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Leave reserved pages. The pages are not for anonymous pages.
|
|
|
|
*/
|
|
|
|
if (free <= totalreserve_pages)
|
|
|
|
goto error;
|
|
|
|
else
|
|
|
|
free -= totalreserve_pages;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Reserve some for root
|
|
|
|
*/
|
|
|
|
if (!cap_sys_admin)
|
|
|
|
free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
|
|
|
|
|
|
|
|
if (free > pages)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
|
|
|
allowed = vm_commit_limit();
|
|
|
|
/*
|
|
|
|
* Reserve some for root
|
|
|
|
*/
|
|
|
|
if (!cap_sys_admin)
|
|
|
|
allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Don't let a single process grow so big a user can't recover
|
|
|
|
*/
|
|
|
|
if (mm) {
|
|
|
|
reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
|
|
|
|
allowed -= min_t(long, mm->total_vm / 32, reserve);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
|
|
|
|
return 0;
|
|
|
|
error:
|
|
|
|
vm_unacct_memory(pages);
|
|
|
|
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
2014-02-12 02:11:59 +08:00
|
|
|
/**
|
|
|
|
* get_cmdline() - copy the cmdline value to a buffer.
|
|
|
|
* @task: the task whose cmdline value to copy.
|
|
|
|
* @buffer: the buffer to copy to.
|
|
|
|
* @buflen: the length of the buffer. Larger cmdline values are truncated
|
|
|
|
* to this length.
|
|
|
|
* Returns the size of the cmdline field copied. Note that the copy does
|
|
|
|
* not guarantee an ending NULL byte.
|
|
|
|
*/
|
|
|
|
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
|
|
|
|
{
|
|
|
|
int res = 0;
|
|
|
|
unsigned int len;
|
|
|
|
struct mm_struct *mm = get_task_mm(task);
|
2016-01-21 07:01:05 +08:00
|
|
|
unsigned long arg_start, arg_end, env_start, env_end;
|
2014-02-12 02:11:59 +08:00
|
|
|
if (!mm)
|
|
|
|
goto out;
|
|
|
|
if (!mm->arg_end)
|
|
|
|
goto out_mm; /* Shh! No looking before we're done */
|
|
|
|
|
2016-01-21 07:01:05 +08:00
|
|
|
down_read(&mm->mmap_sem);
|
|
|
|
arg_start = mm->arg_start;
|
|
|
|
arg_end = mm->arg_end;
|
|
|
|
env_start = mm->env_start;
|
|
|
|
env_end = mm->env_end;
|
|
|
|
up_read(&mm->mmap_sem);
|
|
|
|
|
|
|
|
len = arg_end - arg_start;
|
2014-02-12 02:11:59 +08:00
|
|
|
|
|
|
|
if (len > buflen)
|
|
|
|
len = buflen;
|
|
|
|
|
2016-01-21 07:01:05 +08:00
|
|
|
res = access_process_vm(task, arg_start, buffer, len, 0);
|
2014-02-12 02:11:59 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If the nul at the end of args has been overwritten, then
|
|
|
|
* assume application is using setproctitle(3).
|
|
|
|
*/
|
|
|
|
if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
|
|
|
|
len = strnlen(buffer, res);
|
|
|
|
if (len < res) {
|
|
|
|
res = len;
|
|
|
|
} else {
|
2016-01-21 07:01:05 +08:00
|
|
|
len = env_end - env_start;
|
2014-02-12 02:11:59 +08:00
|
|
|
if (len > buflen - res)
|
|
|
|
len = buflen - res;
|
2016-01-21 07:01:05 +08:00
|
|
|
res += access_process_vm(task, env_start,
|
2014-02-12 02:11:59 +08:00
|
|
|
buffer+res, len, 0);
|
|
|
|
res = strnlen(buffer, res);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
out_mm:
|
|
|
|
mmput(mm);
|
|
|
|
out:
|
|
|
|
return res;
|
|
|
|
}
|