2005-04-17 06:20:36 +08:00
|
|
|
#ifndef _LINUX_MM_H
|
|
|
|
#define _LINUX_MM_H
|
|
|
|
|
|
|
|
#include <linux/errno.h>
|
|
|
|
|
|
|
|
#ifdef __KERNEL__
|
|
|
|
|
|
|
|
#include <linux/gfp.h>
|
|
|
|
#include <linux/list.h>
|
|
|
|
#include <linux/mmzone.h>
|
|
|
|
#include <linux/rbtree.h>
|
|
|
|
#include <linux/prio_tree.h>
|
2006-07-03 15:24:33 +08:00
|
|
|
#include <linux/debug_locks.h>
|
2006-09-27 16:50:01 +08:00
|
|
|
#include <linux/mm_types.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
struct mempolicy;
|
|
|
|
struct anon_vma;
|
2007-07-30 06:36:13 +08:00
|
|
|
struct file_ra_state;
|
Detach sched.h from mm.h
First thing mm.h does is including sched.h solely for can_do_mlock() inline
function which has "current" dereference inside. By dealing with can_do_mlock()
mm.h can be detached from sched.h which is good. See below, why.
This patch
a) removes unconditional inclusion of sched.h from mm.h
b) makes can_do_mlock() normal function in mm/mlock.c
c) exports can_do_mlock() to not break compilation
d) adds sched.h inclusions back to files that were getting it indirectly.
e) adds less bloated headers to some files (asm/signal.h, jiffies.h) that were
getting them indirectly
Net result is:
a) mm.h users would get less code to open, read, preprocess, parse, ... if
they don't need sched.h
b) sched.h stops being dependency for significant number of files:
on x86_64 allmodconfig touching sched.h results in recompile of 4083 files,
after patch it's only 3744 (-8.3%).
Cross-compile tested on
all arm defconfigs, all mips defconfigs, all powerpc defconfigs,
alpha alpha-up
arm
i386 i386-up i386-defconfig i386-allnoconfig
ia64 ia64-up
m68k
mips
parisc parisc-up
powerpc powerpc-up
s390 s390-up
sparc sparc-up
sparc64 sparc64-up
um-x86_64
x86_64 x86_64-up x86_64-defconfig x86_64-allnoconfig
as well as my two usual configs.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-21 05:22:52 +08:00
|
|
|
struct user_struct;
|
2007-07-30 06:36:13 +08:00
|
|
|
struct writeback_control;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */
|
|
|
|
extern unsigned long max_mapnr;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
extern unsigned long num_physpages;
|
|
|
|
extern void * high_memory;
|
|
|
|
extern int page_cluster;
|
|
|
|
|
|
|
|
#ifdef CONFIG_SYSCTL
|
|
|
|
extern int sysctl_legacy_va_layout;
|
|
|
|
#else
|
|
|
|
#define sysctl_legacy_va_layout 0
|
|
|
|
#endif
|
|
|
|
|
2007-12-26 12:12:37 +08:00
|
|
|
extern unsigned long mmap_min_addr;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <asm/page.h>
|
|
|
|
#include <asm/pgtable.h>
|
|
|
|
#include <asm/processor.h>
|
|
|
|
|
|
|
|
/* nth_page(page, n): take page's pfn, add n, and convert the result back to a struct page. */
#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
|
|
|
|
|
2008-07-24 12:28:13 +08:00
|
|
|
/* to align the pointer to the (next) page boundary */
|
|
|
|
#define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Linux kernel virtual memory manager primitives.
|
|
|
|
* The idea being to have a "virtual" mm in the same way
|
|
|
|
* we have a virtual fs - giving a cleaner interface to the
|
|
|
|
* mm details, and allowing different kinds of memory mappings
|
|
|
|
* (from shared memory to executable loading to arbitrary
|
|
|
|
* mmap() functions).
|
|
|
|
*/
|
|
|
|
|
2006-12-07 12:32:48 +08:00
|
|
|
extern struct kmem_cache *vm_area_cachep;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * struct vm_list_struct - one link in the per-mm list of VMAs used by uClinux.
 *
 * When CONFIG_MMU is disabled the system maintains a single shared list of
 * VMAs; each mm subscribes to the VMAs it uses through a chain of these
 * entries.
 *
 * @next: following entry in this list.
 * @vma:  the VMA this entry refers to.
 */
struct vm_list_struct {
	struct vm_list_struct	*next;
	struct vm_area_struct	*vma;
};
|
|
|
|
|
|
|
|
#ifndef CONFIG_MMU
|
|
|
|
extern struct rb_root nommu_vma_tree;
|
|
|
|
extern struct rw_semaphore nommu_vma_sem;
|
|
|
|
|
|
|
|
extern unsigned int kobjsize(const void *objp);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
2008-08-16 18:07:21 +08:00
|
|
|
* vm_flags in vm_area_struct, see mm_types.h.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
#define VM_READ 0x00000001 /* currently active flags */
|
|
|
|
#define VM_WRITE 0x00000002
|
|
|
|
#define VM_EXEC 0x00000004
|
|
|
|
#define VM_SHARED 0x00000008
|
|
|
|
|
2005-09-22 00:55:39 +08:00
|
|
|
/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
|
2005-04-17 06:20:36 +08:00
|
|
|
#define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */
|
|
|
|
#define VM_MAYWRITE 0x00000020
|
|
|
|
#define VM_MAYEXEC 0x00000040
|
|
|
|
#define VM_MAYSHARE 0x00000080
|
|
|
|
|
|
|
|
#define VM_GROWSDOWN 0x00000100 /* general info on the segment */
|
|
|
|
#define VM_GROWSUP 0x00000200
|
2005-11-29 06:34:23 +08:00
|
|
|
#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
|
2005-04-17 06:20:36 +08:00
|
|
|
#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */
|
|
|
|
|
|
|
|
#define VM_EXECUTABLE 0x00001000
|
|
|
|
#define VM_LOCKED 0x00002000
|
|
|
|
#define VM_IO 0x00004000 /* Memory mapped I/O or similar */
|
|
|
|
|
|
|
|
/* Used by sys_madvise() */
|
|
|
|
#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */
|
|
|
|
#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */
|
|
|
|
|
|
|
|
#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */
|
|
|
|
#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */
|
[PATCH] unpaged: VM_UNPAGED
Although we tend to associate VM_RESERVED with remap_pfn_range, quite a few
drivers set VM_RESERVED on areas which are then populated by nopage. The
PageReserved removal in 2.6.15-rc1 changed VM_RESERVED not to free pages in
zap_pte_range, without changing those drivers not to set it: so their pages
just leak away.
Let's not change miscellaneous drivers now: introduce VM_UNPAGED at the core,
to flag the special areas where the ptes may have no struct page, or if they
have then it's not to be touched. Replace most instances of VM_RESERVED in
core mm by VM_UNPAGED. Force it on in remap_pfn_range, and the sparc and
sparc64 io_remap_pfn_range.
Revert addition of VM_RESERVED to powerpc vdso, it's not needed there. Is it
needed anywhere? It still governs the mm->reserved_vm statistic, and special
vmas not to be merged, and areas not to be core dumped; but could probably be
eliminated later (the drivers are probably specifying it because in 2.4 it
kept swapout off the vma, but in 2.6 we work from the LRU, which these pages
don't get on).
Use the VM_SHM slot for VM_UNPAGED, and define VM_SHM to 0: it serves no
purpose whatsoever, and should be removed from drivers when we clean up.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: William Irwin <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-11-22 13:32:15 +08:00
|
|
|
#define VM_RESERVED 0x00080000 /* Count as reserved_vm like IO */
|
2005-04-17 06:20:36 +08:00
|
|
|
#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
|
2008-07-24 12:27:28 +08:00
|
|
|
#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
|
2005-04-17 06:20:36 +08:00
|
|
|
#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
|
|
|
|
#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */
|
|
|
|
#define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */
|
2005-12-17 02:21:23 +08:00
|
|
|
#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */
|
2007-01-26 16:56:48 +08:00
|
|
|
#define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */
|
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page.
Andrea Arcangeli identified a subtle race between invalidation of pages from
pagecache with userspace mappings, and do_no_page.
The issue is that invalidation has to shoot down all mappings to the page,
before it can be discarded from the pagecache. Between shooting down ptes to
a particular page, and actually dropping the struct page from the pagecache,
do_no_page from any process might fault on that page and establish a new
mapping to the page just before it gets discarded from the pagecache.
The most common case where such invalidation is used is in file truncation.
This case was catered for by doing a sort of open-coded seqlock between the
file's i_size, and its truncate_count.
Truncation will decrease i_size, then increment truncate_count before
unmapping userspace pages; do_no_page will read truncate_count, then find the
page if it is within i_size, and then check truncate_count under the page
table lock and back out and retry if it had subsequently been changed (ptl
will serialise against unmapping, and ensure a potentially updated
truncate_count is actually visible).
Complexity and documentation issues aside, the locking protocol fails in the
case where we would like to invalidate pagecache inside i_size. do_no_page
can come in anytime and filemap_nopage is not aware of the invalidation in
progress (as it is when it is outside i_size). The end result is that
dangling (->mapping == NULL) pages that appear to be from a particular file
may be mapped into userspace with nonsense data. Valid mappings to the same
place will see a different page.
Andrea implemented two working fixes, one using a real seqlock, another using
a page->flags bit. He also proposed using the page lock in do_no_page, but
that was initially considered too heavyweight. However, it is not a global or
per-file lock, and the page cacheline is modified in do_no_page to increment
_count and _mapcount anyway, so a further modification should not be a large
performance hit. Scalability is not an issue.
This patch implements this latter approach. ->nopage implementations return
with the page locked if it is possible for their underlying file to be
invalidated (in that case, they must set a special vm_flags bit to indicate
so). do_no_page only unlocks the page after setting up the mapping
completely. invalidation is excluded because it holds the page lock during
invalidation of each page (and ensures that the page is not mapped while
holding the lock).
This also allows significant simplifications in do_no_page, because we have
the page locked in the right place in the pagecache from the start.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 16:46:57 +08:00
|
|
|
|
2007-07-19 16:47:03 +08:00
|
|
|
#define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */
|
mm: introduce VM_MIXEDMAP
This series introduces some important infrastructure work. The overall result
is that:
1. We now support XIP backed filesystems using memory that have no
struct page allocated to them. And patches 6 and 7 actually implement
this for s390.
This is pretty important in a number of cases. As far as I understand,
in the case of virtualisation (eg. s390), each guest may mount a
readonly copy of the same filesystem (eg. the distro). Currently,
guests need to allocate struct pages for this image. So if you have
100 guests, you already need to allocate more memory for the struct
pages than the size of the image. I think. (Carsten?)
For other (eg. embedded) systems, you may have a very large non-
volatile filesystem. If you have to have struct pages for this, then
your RAM consumption will go up proportionally to fs size. Even
though it is just a small proportion, the RAM can be much more costly
eg in terms of power, so every KB less that Linux uses makes it more
attractive to a lot of these guys.
2. VM_MIXEDMAP allows us to support mappings where you actually do want
to refcount _some_ pages in the mapping, but not others, and support
COW on arbitrary (non-linear) mappings. Jared needs this for his NVRAM
filesystem in progress. Future iterations of this filesystem will
most likely want to migrate pages between pagecache and XIP backing,
which is where the requirement for mixed (some refcounted, some not)
comes from.
3. pte_special also has a peripheral usage that I need for my lockless
get_user_pages patch. That was shown to speed up "oltp" on db2 by
10% on a 2 socket system, which is kind of significant because they
scrounge for months to try to find 0.1% improvement on these
workloads. I'm hoping we might finally be faster than AIX on
pSeries with this :). My reference to lockless get_user_pages is not
meant to justify this patchset (which doesn't include lockless gup),
but just to show that pte_special is not some s390 specific thing that
should be hidden in arch code or xip code: I definitely want to use it
on at least x86 and powerpc as well.
This patch:
Introduce a new type of mapping, VM_MIXEDMAP. This is unlike VM_PFNMAP in
that it can support COW mappings of arbitrary ranges including ranges without
struct page *and* ranges with a struct page that we actually want to refcount
(PFNMAP can only support COW in those cases where the un-COW-ed translations
are mapped linearly in the virtual address, and can only support non
refcounted ranges).
VM_MIXEDMAP achieves this by refcounting all pfn_valid pages, and not
refcounting !pfn_valid pages (which is not an option for VM_PFNMAP, because it
needs to avoid refcounting pfn_valid pages eg. for /dev/mem mappings).
Signed-off-by: Jared Hulbert <jaredeh@gmail.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 17:12:58 +08:00
|
|
|
#define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */
|
2008-07-07 22:28:52 +08:00
|
|
|
#define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
|
|
|
|
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef CONFIG_STACK_GROWSUP
|
|
|
|
#define VM_STACK_FLAGS (VM_GROWSUP | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
|
|
|
|
#else
|
|
|
|
#define VM_STACK_FLAGS (VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
 * Helpers for the read-hint bits (VM_SEQ_READ, VM_RAND_READ) kept in
 * (v)->vm_flags, where v is a pointer to an object with a vm_flags member.
 *
 * VM_ClearReadHint(v)      - clear both hint bits; evaluates to the new
 *                            vm_flags value.
 * VM_NormalReadHint(v)     - true when neither hint bit is set.
 * VM_SequentialReadHint(v) - non-zero when VM_SEQ_READ is set.
 * VM_RandomReadHint(v)     - non-zero when VM_RAND_READ is set.
 *
 * Every expansion is fully parenthesized so the macros stay correct when
 * they are used as part of a larger expression.
 */
#define VM_READHINTMASK			(VM_SEQ_READ | VM_RAND_READ)
#define VM_ClearReadHint(v)		((v)->vm_flags &= ~VM_READHINTMASK)
#define VM_NormalReadHint(v)		(!((v)->vm_flags & VM_READHINTMASK))
#define VM_SequentialReadHint(v)	((v)->vm_flags & VM_SEQ_READ)
#define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* mapping from the currently active vm_flags protection bits (the
|
|
|
|
* low four bits) to a page protection mask..
|
|
|
|
*/
|
|
|
|
extern pgprot_t protection_map[16];
|
|
|
|
|
2007-07-19 16:47:03 +08:00
|
|
|
#define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */
|
|
|
|
#define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */
|
|
|
|
|
|
|
|
|
2007-07-19 16:46:59 +08:00
|
|
|
/*
|
2007-07-19 16:47:03 +08:00
|
|
|
* vm_fault is filled by the the pagefault handler and passed to the vma's
|
2007-07-19 16:47:05 +08:00
|
|
|
* ->fault function. The vma's ->fault is responsible for returning a bitmask
|
|
|
|
* of VM_FAULT_xxx flags that give details about how the fault was handled.
|
2007-07-19 16:46:59 +08:00
|
|
|
*
|
2007-07-19 16:47:03 +08:00
|
|
|
* pgoff should be used in favour of virtual_address, if possible. If pgoff
|
|
|
|
* is used, one may set VM_CAN_NONLINEAR in the vma->vm_flags to get nonlinear
|
|
|
|
* mapping support.
|
2007-07-19 16:46:59 +08:00
|
|
|
*/
|
2007-07-19 16:47:03 +08:00
|
|
|
struct vm_fault {
|
|
|
|
unsigned int flags; /* FAULT_FLAG_xxx flags */
|
|
|
|
pgoff_t pgoff; /* Logical page offset based on vma */
|
|
|
|
void __user *virtual_address; /* Faulting virtual address */
|
|
|
|
|
|
|
|
struct page *page; /* ->fault handlers should return a
|
2007-07-19 16:47:05 +08:00
|
|
|
* page here, unless VM_FAULT_NOPAGE
|
2007-07-19 16:47:03 +08:00
|
|
|
* is set (which is also implied by
|
2007-07-19 16:47:05 +08:00
|
|
|
* VM_FAULT_ERROR).
|
2007-07-19 16:47:03 +08:00
|
|
|
*/
|
2007-07-19 16:46:59 +08:00
|
|
|
};
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
 * struct vm_operations_struct - the virtual MM operations of a mapping.
 *
 * Callbacks for opening an area, closing and unmapping it (needed to keep
 * files on disk up-to-date etc), plus the functions invoked when a no-page
 * or a wp-page exception occurs.
 */
struct vm_operations_struct {
	void (*open)(struct vm_area_struct *area);
	void (*close)(struct vm_area_struct *area);
	int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);

	/*
	 * Notification that a previously read-only page is about to become
	 * writable.  Returning an error causes a SIGBUS.
	 */
	int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page);

	/*
	 * Called by access_process_vm when get_user_pages() fails; typically
	 * used by special VMAs that can switch between memory and hardware.
	 */
	int (*access)(struct vm_area_struct *vma, unsigned long addr,
		      void *buf, int len, int write);

#ifdef CONFIG_NUMA
	/*
	 * set_policy() must add a reference to any non-NULL @new mempolicy
	 * so that the policy is held upon return.  The caller should pass a
	 * NULL @new to remove a policy and fall back to the surrounding
	 * context -- i.e. do not install a MPOL_DEFAULT policy, nor the task
	 * or system default mempolicy.
	 */
	int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);

	/*
	 * get_policy() must add a reference [mpol_get()] to any policy at
	 * (vma,addr) that is marked as MPOL_SHARED; the shared policy
	 * infrastructure in mm/mempolicy.c does this automatically.
	 * It must NOT add a reference when the policy at (vma,addr) is not
	 * marked as MPOL_SHARED: vma policies are protected by the mmap_sem.
	 * When no [shared/vma] mempolicy exists at the addr, get_policy()
	 * must return NULL -- i.e. do not "fallback" to the task or system
	 * default policy.
	 */
	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
					unsigned long addr);

	int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from,
		       const nodemask_t *to, unsigned long flags);
#endif
};
|
|
|
|
|
|
|
|
struct mmu_gather;
|
|
|
|
struct inode;
|
|
|
|
|
2006-01-08 17:04:36 +08:00
|
|
|
/* page_private(page): evaluates to the `private` member of the struct page pointed to by page. */
#define page_private(page) ((page)->private)
|
|
|
|
/* set_page_private(page, v): stores v in the `private` member of *page; evaluates to the assigned value. */
#define set_page_private(page, v) ((page)->private = (v))
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:40 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* FIXME: take this include out, include page-flags.h in
|
|
|
|
* files which need it (119 of them)
|
|
|
|
*/
|
|
|
|
#include <linux/page-flags.h>
|
|
|
|
|
2006-09-26 14:30:55 +08:00
|
|
|
/*
 * VM_BUG_ON(cond) - debugging assertion for the VM code.
 *
 * Without CONFIG_DEBUG_VM this expands to an empty statement and @cond is
 * not evaluated; with CONFIG_DEBUG_VM it is BUG_ON(cond).
 */
#ifndef CONFIG_DEBUG_VM
#define VM_BUG_ON(cond)		do { } while (0)
#else
#define VM_BUG_ON(cond)		BUG_ON(cond)
#endif
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Methods to modify the page usage count.
|
|
|
|
*
|
|
|
|
* What counts for a page usage:
|
|
|
|
* - cache mapping (page->mapping)
|
|
|
|
* - private data (page->private)
|
|
|
|
* - page mapped in a task's page tables, each mapping
|
|
|
|
* is counted separately
|
|
|
|
*
|
|
|
|
* Also, many kernel routines increase the page count before a critical
|
|
|
|
* routine so they can be sure the page doesn't go away from under them.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
2006-09-26 14:31:35 +08:00
|
|
|
* Drop a ref, return true if the refcount fell to zero (the page has no users)
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2006-03-22 16:08:03 +08:00
|
|
|
static inline int put_page_testzero(struct page *page)
|
|
|
|
{
|
2006-09-26 14:30:55 +08:00
|
|
|
VM_BUG_ON(atomic_read(&page->_count) == 0);
|
2006-03-22 16:08:03 +08:00
|
|
|
return atomic_dec_and_test(&page->_count);
|
2006-03-22 16:08:03 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
2006-03-22 16:08:03 +08:00
|
|
|
* Try to grab a ref unless the page has a refcount of zero, return false if
|
|
|
|
* that is the case.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2006-03-22 16:08:03 +08:00
|
|
|
static inline int get_page_unless_zero(struct page *page)
|
|
|
|
{
|
2008-02-05 14:28:35 +08:00
|
|
|
VM_BUG_ON(PageTail(page));
|
2006-03-22 16:08:03 +08:00
|
|
|
return atomic_inc_not_zero(&page->_count);
|
2006-03-22 16:08:03 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-02-05 14:28:31 +08:00
|
|
|
/* Support for virtually mapped pages */
|
2008-02-05 14:28:32 +08:00
|
|
|
struct page *vmalloc_to_page(const void *addr);
|
|
|
|
unsigned long vmalloc_to_pfn(const void *addr);
|
2008-02-05 14:28:31 +08:00
|
|
|
|
2008-03-12 15:51:31 +08:00
|
|
|
/*
 * is_vmalloc_addr - test whether an address lies in the vmalloc range.
 *
 * With CONFIG_MMU the result is non-zero when @x falls inside
 * [VMALLOC_START, VMALLOC_END).  On nommu, vmalloc/vfree wrap through
 * kmalloc/kfree directly, so no special casing is required and the
 * answer is always 0.
 */
static inline int is_vmalloc_addr(const void *x)
{
#ifdef CONFIG_MMU
	unsigned long addr = (unsigned long)x;

	if (addr < VMALLOC_START)
		return 0;
	return addr < VMALLOC_END;
#else
	return 0;
#endif
}
|
2008-02-05 14:28:34 +08:00
|
|
|
|
2007-05-07 05:49:39 +08:00
|
|
|
static inline struct page *compound_head(struct page *page)
|
|
|
|
{
|
2007-05-07 05:49:40 +08:00
|
|
|
if (unlikely(PageTail(page)))
|
2007-05-07 05:49:39 +08:00
|
|
|
return page->first_page;
|
|
|
|
return page;
|
|
|
|
}
|
|
|
|
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:40 +08:00
|
|
|
static inline int page_count(struct page *page)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2007-05-07 05:49:39 +08:00
|
|
|
return atomic_read(&compound_head(page)->_count);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void get_page(struct page *page)
|
|
|
|
{
|
2007-05-07 05:49:39 +08:00
|
|
|
page = compound_head(page);
|
2006-09-26 14:30:55 +08:00
|
|
|
VM_BUG_ON(atomic_read(&page->_count) == 0);
|
2005-04-17 06:20:36 +08:00
|
|
|
atomic_inc(&page->_count);
|
|
|
|
}
|
|
|
|
|
2007-05-07 05:49:41 +08:00
|
|
|
/*
 * virt_to_head_page - look up the page for address @x via virt_to_page()
 * and return its compound_head().
 */
static inline struct page *virt_to_head_page(const void *x)
{
	return compound_head(virt_to_page(x));
}
|
|
|
|
|
2006-03-22 16:08:40 +08:00
|
|
|
/*
|
|
|
|
* Setup the page count before being freed into the page allocator for
|
|
|
|
* the first time (boot or memory hotplug)
|
|
|
|
*/
|
|
|
|
static inline void init_page_count(struct page *page)
|
|
|
|
{
|
|
|
|
atomic_set(&page->_count, 1);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
void put_page(struct page *page);
|
2006-08-14 14:24:27 +08:00
|
|
|
void put_pages_list(struct list_head *pages);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-03-22 16:08:05 +08:00
|
|
|
void split_page(struct page *page, unsigned int order);
|
|
|
|
|
2006-12-07 12:33:32 +08:00
|
|
|
/*
|
|
|
|
* Compound pages have a destructor function. Provide a
|
|
|
|
* prototype for that function and accessor functions.
|
|
|
|
* These are _only_ valid on the head of a PG_compound page.
|
|
|
|
*/
|
|
|
|
typedef void compound_page_dtor(struct page *);
|
|
|
|
|
|
|
|
static inline void set_compound_page_dtor(struct page *page,
|
|
|
|
compound_page_dtor *dtor)
|
|
|
|
{
|
|
|
|
page[1].lru.next = (void *)dtor;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline compound_page_dtor *get_compound_page_dtor(struct page *page)
|
|
|
|
{
|
|
|
|
return (compound_page_dtor *)page[1].lru.next;
|
|
|
|
}
|
|
|
|
|
2007-05-07 05:49:39 +08:00
|
|
|
static inline int compound_order(struct page *page)
|
|
|
|
{
|
2007-05-07 05:49:40 +08:00
|
|
|
if (!PageHead(page))
|
2007-05-07 05:49:39 +08:00
|
|
|
return 0;
|
|
|
|
return (unsigned long)page[1].lru.prev;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void set_compound_order(struct page *page, unsigned long order)
|
|
|
|
{
|
|
|
|
page[1].lru.prev = (void *)order;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Multiple processes may "see" the same page. E.g. for untouched
|
|
|
|
* mappings of /dev/null, all processes see the same page full of
|
|
|
|
* zeroes, and text pages of executables and shared libraries have
|
|
|
|
* only one copy in memory, at most, normally.
|
|
|
|
*
|
|
|
|
* For the non-reserved pages, page_count(page) denotes a reference count.
|
2005-09-22 00:55:38 +08:00
|
|
|
* page_count() == 0 means the page is free. page->lru is then used for
|
|
|
|
* freelist management in the buddy allocator.
|
2006-09-26 14:31:35 +08:00
|
|
|
* page_count() > 0 means the page has been allocated.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
2006-09-26 14:31:35 +08:00
|
|
|
* Pages are allocated by the slab allocator in order to provide memory
|
|
|
|
* to kmalloc and kmem_cache_alloc. In this case, the management of the
|
|
|
|
* page, and the fields in 'struct page' are the responsibility of mm/slab.c
|
|
|
|
* unless a particular usage is carefully commented. (the responsibility of
|
|
|
|
* freeing the kmalloc memory is the caller's, of course).
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
2006-09-26 14:31:35 +08:00
|
|
|
* A page may be used by anyone else who does a __get_free_page().
|
|
|
|
* In this case, page_count still tracks the references, and should only
|
|
|
|
* be used through the normal accessor functions. The top bits of page->flags
|
|
|
|
* and page->virtual store page management information, but all other fields
|
|
|
|
* are unused and could be used privately, carefully. The management of this
|
|
|
|
* page is the responsibility of the one who allocated it, and those who have
|
|
|
|
* subsequently been given references to it.
|
|
|
|
*
|
|
|
|
* The other pages (we may call them "pagecache pages") are completely
|
2005-04-17 06:20:36 +08:00
|
|
|
* managed by the Linux memory manager: I/O, buffers, swapping etc.
|
|
|
|
* The following discussion applies only to them.
|
|
|
|
*
|
2006-09-26 14:31:35 +08:00
|
|
|
* A pagecache page contains an opaque `private' member, which belongs to the
|
|
|
|
* page's address_space. Usually, this is the address of a circular list of
|
|
|
|
* the page's disk buffers. PG_private must be set to tell the VM to call
|
|
|
|
* into the filesystem to release these pages.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
2006-09-26 14:31:35 +08:00
|
|
|
* A page may belong to an inode's memory mapping. In this case, page->mapping
|
|
|
|
* is the pointer to the inode, and page->index is the file offset of the page,
|
|
|
|
* in units of PAGE_CACHE_SIZE.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
2006-09-26 14:31:35 +08:00
|
|
|
* If pagecache pages are not associated with an inode, they are said to be
|
|
|
|
* anonymous pages. These may become associated with the swapcache, and in that
|
|
|
|
* case PG_swapcache is set, and page->private is an offset into the swapcache.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
2006-09-26 14:31:35 +08:00
|
|
|
* In either case (swapcache or inode backed), the pagecache itself holds one
|
|
|
|
* reference to the page. Setting PG_private should also increment the
|
|
|
|
* refcount. Then each user mapping also has a reference to the page.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
2006-09-26 14:31:35 +08:00
|
|
|
* The pagecache pages are stored in a per-mapping radix tree, which is
|
|
|
|
* rooted at mapping->page_tree, and indexed by offset.
|
|
|
|
* Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space
|
|
|
|
* lists, we instead now tag pages as dirty/writeback in the radix tree.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
2006-09-26 14:31:35 +08:00
|
|
|
* All pagecache pages may be subject to I/O:
|
2005-04-17 06:20:36 +08:00
|
|
|
* - inode pages may need to be read from disk,
|
|
|
|
* - inode pages which have been modified and are MAP_SHARED may need
|
2006-09-26 14:31:35 +08:00
|
|
|
* to be written back to the inode on disk,
|
|
|
|
* - anonymous pages (including MAP_PRIVATE file mappings) which have been
|
|
|
|
* modified may need to be swapped out to swap space and (later) to be read
|
|
|
|
* back into memory.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The zone field is never updated after free_area_init_core()
|
|
|
|
* sets it, so none of the operations on it need to be atomic.
|
|
|
|
*/
|
2005-06-23 15:07:40 +08:00
|
|
|
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 15:07:54 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* page->flags layout:
|
|
|
|
*
|
|
|
|
* There are three possibilities for how page->flags get
|
|
|
|
* laid out. The first is for the normal case, without
|
|
|
|
* sparsemem. The second is for sparsemem when there is
|
|
|
|
* plenty of space for node and section. The last is when
|
|
|
|
* we have run out of space and have to fall back to an
|
|
|
|
* alternate (slower) way of determining the node.
|
|
|
|
*
|
2008-04-28 17:12:43 +08:00
|
|
|
* No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS |
|
|
|
|
* classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS |
|
|
|
|
* classic sparse no space for node: | SECTION | ZONE | ... | FLAGS |
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 15:07:54 +08:00
|
|
|
*/
|
2008-04-28 17:12:43 +08:00
|
|
|
/*
 * Width, in bits, of the section-number field in page->flags.  Only classic
 * sparsemem (sparsemem without vmemmap) keeps the section number in the
 * flags; in every other configuration the field has zero width.
 */
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTIONS_WIDTH		SECTIONS_SHIFT
#else
#define SECTIONS_WIDTH		0
#endif
|
|
|
|
|
|
|
|
/* Width, in bits, of the zone field in page->flags. */
#define ZONES_WIDTH ZONES_SHIFT
|
|
|
|
|
2008-04-28 17:12:48 +08:00
|
|
|
/*
 * Width, in bits, of the node field in page->flags.  The node number is
 * kept in the flags only if it fits, together with the section and zone
 * fields, into the bits left over after NR_PAGEFLAGS; otherwise the field
 * has zero width.  Sparsemem vmemmap builds fail to compile when there is
 * no room for the node field.
 */
#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
#define NODES_WIDTH		NODES_SHIFT
#else
#ifdef CONFIG_SPARSEMEM_VMEMMAP
#error "Vmemmap: No space for nodes field in page flags"
#endif
#define NODES_WIDTH		0
#endif
|
|
|
|
|
|
|
|
/*
 * Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS |
 *
 * Bit offset of each field within page->flags.  The fields are packed
 * downwards from the most significant bit: section first, then node, then
 * zone.  A field of width 0 does not move the offset.
 */
#define SECTIONS_PGOFF		((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
#define NODES_PGOFF		(SECTIONS_PGOFF - NODES_WIDTH)
#define ZONES_PGOFF		(NODES_PGOFF - ZONES_WIDTH)
|
|
|
|
|
|
|
|
/*
 * We are going to use the flags for the page to node mapping if it is in
 * there.  This includes the case where there is no node, so it is implicit.
 *
 * NODE_NOT_IN_PAGE_FLAGS is therefore defined only when a node number is
 * needed (NODES_SHIFT != 0) but no bits were reserved for it
 * (NODES_WIDTH == 0).
 */
#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0)
#define NODE_NOT_IN_PAGE_FLAGS
#endif
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 15:07:54 +08:00
|
|
|
|
|
|
|
/* Fall back to 0 when nothing earlier has defined PFN_SECTION_SHIFT. */
#ifndef PFN_SECTION_SHIFT
#define PFN_SECTION_SHIFT	0
#endif
|
2005-06-23 15:07:40 +08:00
|
|
|
|
|
|
|
/*
 * Define the bit shifts to access each field.  For non-existent
 * fields we define the shift as 0; that plus a 0 mask ensures
 * the compiler will optimise away reference to them.
 */
#define SECTIONS_PGSHIFT	(SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
#define NODES_PGSHIFT		(NODES_PGOFF * (NODES_WIDTH != 0))
#define ZONES_PGSHIFT		(ZONES_PGOFF * (ZONES_WIDTH != 0))
|
2005-06-23 15:07:40 +08:00
|
|
|
|
2006-12-07 12:31:45 +08:00
|
|
|
/*
 * NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator.
 *
 * When the node number is not kept in page->flags
 * (NODE_NOT_IN_PAGE_FLAGS), the section number is combined with the zone
 * instead.  The macro tested here must be spelled exactly like the one
 * that is defined, otherwise the SECTION:ZONE branch is never selected.
 *
 * ZONEID_PGOFF is the lower of the two field offsets, so that one shift
 * brings the combined field down to bit 0.
 */
#ifdef NODE_NOT_IN_PAGE_FLAGS
#define ZONEID_SHIFT		(SECTIONS_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF		((SECTIONS_PGOFF < ZONES_PGOFF)? \
						SECTIONS_PGOFF : ZONES_PGOFF)
#else
#define ZONEID_SHIFT		(NODES_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF		((NODES_PGOFF < ZONES_PGOFF)? \
						NODES_PGOFF : ZONES_PGOFF)
#endif
|
|
|
|
|
2007-02-10 17:43:14 +08:00
|
|
|
/*
 * Shift that brings the zone-ID field of page->flags down to bit 0;
 * evaluates to 0 when ZONEID_SHIFT is 0.
 */
#define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0))
|
2005-06-23 15:07:40 +08:00
|
|
|
|
2008-04-28 17:12:48 +08:00
|
|
|
/*
 * Build-time check: the section, node and zone fields together must fit in
 * the bits of page->flags that are not taken by the NR_PAGEFLAGS flag bits.
 */
#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
#endif
|
|
|
|
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 15:07:54 +08:00
|
|
|
/*
 * Masks selecting each field after page->flags has been shifted right by
 * the matching *_PGSHIFT.  A zero-width field yields a mask of 0.
 */
#define ZONES_MASK		((1UL << ZONES_WIDTH) - 1)
#define NODES_MASK		((1UL << NODES_WIDTH) - 1)
#define SECTIONS_MASK		((1UL << SECTIONS_WIDTH) - 1)
#define ZONEID_MASK		((1UL << ZONEID_SHIFT) - 1)
|
2005-06-23 15:07:40 +08:00
|
|
|
|
2006-09-26 14:31:13 +08:00
|
|
|
static inline enum zone_type page_zonenum(struct page *page)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2005-06-23 15:07:40 +08:00
|
|
|
return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2006-12-07 12:31:45 +08:00
|
|
|
/*
|
|
|
|
* The identification function is only used by the buddy allocator for
|
|
|
|
* determining if two pages could be buddies. We are not really
|
|
|
|
* identifying a zone since we could be using a the section number
|
|
|
|
* id if we have not node id available in page flags.
|
|
|
|
* We guarantee only that it will return the same value for two
|
|
|
|
* combinable pages in a zone.
|
|
|
|
*/
|
2006-06-23 17:03:01 +08:00
|
|
|
static inline int page_zone_id(struct page *page)
|
|
|
|
{
|
2006-12-07 12:31:45 +08:00
|
|
|
return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
|
2005-06-23 15:07:40 +08:00
|
|
|
}
|
|
|
|
|
2006-12-07 12:33:03 +08:00
|
|
|
/*
 * zone_to_nid - node id that a zone belongs to
 * @zone: zone to query; only dereferenced on CONFIG_NUMA builds
 *
 * Returns zone->node when CONFIG_NUMA is enabled and 0 otherwise.
 */
static inline int zone_to_nid(struct zone *zone)
{
#ifdef CONFIG_NUMA
	return zone->node;
#else
	return 0;
#endif
}
|
|
|
|
|
2006-12-07 12:31:45 +08:00
|
|
|
#ifdef NODE_NOT_IN_PAGE_FLAGS
|
2006-12-07 12:33:03 +08:00
|
|
|
extern int page_to_nid(struct page *page);
|
2006-12-07 12:31:45 +08:00
|
|
|
#else
|
2006-12-07 12:33:03 +08:00
|
|
|
static inline int page_to_nid(struct page *page)
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 15:07:54 +08:00
|
|
|
{
|
2006-12-07 12:31:45 +08:00
|
|
|
return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 15:07:54 +08:00
|
|
|
}
|
2006-12-07 12:31:45 +08:00
|
|
|
#endif
|
|
|
|
|
|
|
|
static inline struct zone *page_zone(struct page *page)
|
|
|
|
{
|
|
|
|
return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
|
|
|
|
}
|
|
|
|
|
2008-04-28 17:12:43 +08:00
|
|
|
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 15:07:54 +08:00
|
|
|
static inline unsigned long page_to_section(struct page *page)
|
|
|
|
{
|
|
|
|
return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
|
|
|
|
}
|
2008-04-28 17:12:43 +08:00
|
|
|
#endif
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 15:07:54 +08:00
|
|
|
|
2006-09-26 14:31:13 +08:00
|
|
|
static inline void set_page_zone(struct page *page, enum zone_type zone)
|
2005-06-23 15:07:40 +08:00
|
|
|
{
|
|
|
|
page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
|
|
|
|
page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
|
|
|
|
}
|
2006-09-26 14:31:13 +08:00
|
|
|
|
2005-06-23 15:07:40 +08:00
|
|
|
static inline void set_page_node(struct page *page, unsigned long node)
|
|
|
|
{
|
|
|
|
page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
|
|
|
|
page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2006-12-07 12:31:45 +08:00
|
|
|
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 15:07:54 +08:00
|
|
|
static inline void set_page_section(struct page *page, unsigned long section)
|
|
|
|
{
|
|
|
|
page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
|
|
|
|
page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-09-26 14:31:13 +08:00
|
|
|
static inline void set_page_links(struct page *page, enum zone_type zone,
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 15:07:54 +08:00
|
|
|
unsigned long node, unsigned long pfn)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2005-06-23 15:07:40 +08:00
|
|
|
set_page_zone(page, zone);
|
|
|
|
set_page_node(page, node);
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 15:07:54 +08:00
|
|
|
set_page_section(page, pfn_to_section_nr(pfn));
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2007-11-27 07:47:40 +08:00
|
|
|
/*
 * Sanitise an address hint against mmap_min_addr.
 *
 * With CONFIG_SECURITY the hint is first truncated to a page boundary; a
 * non-zero hint that lies below mmap_min_addr is then replaced by
 * mmap_min_addr rounded up to a page boundary.  A zero hint is left alone.
 * Without CONFIG_SECURITY the hint is returned unchanged.
 */
static inline unsigned long round_hint_to_min(unsigned long hint)
{
#ifdef CONFIG_SECURITY
	hint &= PAGE_MASK;
	if (hint && hint < mmap_min_addr)
		hint = PAGE_ALIGN(mmap_min_addr);
#endif
	return hint;
}
|
|
|
|
|
2006-06-30 16:55:32 +08:00
|
|
|
/*
|
|
|
|
* Some inline functions in vmstat.h depend on page_zone()
|
|
|
|
*/
|
|
|
|
#include <linux/vmstat.h>
|
|
|
|
|
2006-01-15 05:21:30 +08:00
|
|
|
static __always_inline void *lowmem_page_address(struct page *page)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
return __va(page_to_pfn(page) << PAGE_SHIFT);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * page_address() / set_page_address() / page_address_init() come in three
 * flavours, selected below.  CONFIG_HIGHMEM without WANT_PAGE_VIRTUAL
 * selects the out-of-line (HASHED_PAGE_VIRTUAL) variant.
 */
#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL)
#define HASHED_PAGE_VIRTUAL
#endif

/* WANT_PAGE_VIRTUAL: the address is kept in the page's ->virtual member. */
#ifdef WANT_PAGE_VIRTUAL
#define page_address(page)		((page)->virtual)
#define set_page_address(page, address)			\
	do {						\
		(page)->virtual = (address);		\
	} while (0)
#define page_address_init()		do { } while (0)
#endif

/* HASHED_PAGE_VIRTUAL: real functions, defined outside this header. */
#ifdef HASHED_PAGE_VIRTUAL
void *page_address(struct page *page);
void set_page_address(struct page *page, void *virtual);
void page_address_init(void);
#endif

/* Neither: addresses come from lowmem_page_address(); nothing is stored. */
#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
#define page_address(page)		lowmem_page_address(page)
#define set_page_address(page, address)	do { } while (0)
#define page_address_init()		do { } while (0)
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* On an anonymous page mapped into a user virtual memory area,
|
|
|
|
* page->mapping points to its anon_vma, not to a struct address_space;
|
|
|
|
* with the PAGE_MAPPING_ANON bit set to distinguish it.
|
|
|
|
*
|
|
|
|
* Please note that, confusingly, "page_mapping" refers to the inode
|
|
|
|
* address_space which maps the page from disk; whereas "page_mapped"
|
|
|
|
* refers to user virtual address space into which the page is mapped.
|
|
|
|
*/
|
|
|
|
#define PAGE_MAPPING_ANON 1
|
|
|
|
|
|
|
|
extern struct address_space swapper_space;
|
|
|
|
static inline struct address_space *page_mapping(struct page *page)
|
|
|
|
{
|
|
|
|
struct address_space *mapping = page->mapping;
|
|
|
|
|
2007-07-17 19:03:33 +08:00
|
|
|
VM_BUG_ON(PageSlab(page));
|
2008-04-28 17:12:44 +08:00
|
|
|
#ifdef CONFIG_SWAP
|
2005-04-17 06:20:36 +08:00
|
|
|
if (unlikely(PageSwapCache(page)))
|
|
|
|
mapping = &swapper_space;
|
2008-04-28 17:12:44 +08:00
|
|
|
else
|
|
|
|
#endif
|
|
|
|
if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON))
|
2005-04-17 06:20:36 +08:00
|
|
|
mapping = NULL;
|
|
|
|
return mapping;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int PageAnon(struct page *page)
|
|
|
|
{
|
|
|
|
return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return the pagecache index of the passed page. Regular pagecache pages
|
|
|
|
* use ->index whereas swapcache pages use ->private
|
|
|
|
*/
|
|
|
|
static inline pgoff_t page_index(struct page *page)
|
|
|
|
{
|
|
|
|
if (unlikely(PageSwapCache(page)))
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:40 +08:00
|
|
|
return page_private(page);
|
2005-04-17 06:20:36 +08:00
|
|
|
return page->index;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The atomic page->_mapcount, like _count, starts from -1:
|
|
|
|
* so that transitions both from it and to it can be tracked,
|
|
|
|
* using atomic_inc_and_test and atomic_add_negative(-1).
|
|
|
|
*/
|
|
|
|
static inline void reset_page_mapcount(struct page *page)
|
|
|
|
{
|
|
|
|
atomic_set(&(page)->_mapcount, -1);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int page_mapcount(struct page *page)
|
|
|
|
{
|
|
|
|
return atomic_read(&(page)->_mapcount) + 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return true if this page is mapped into pagetables.
|
|
|
|
*/
|
|
|
|
static inline int page_mapped(struct page *page)
|
|
|
|
{
|
|
|
|
return atomic_read(&(page)->_mapcount) >= 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Fault result codes, as returned by handle_mm_fault().  They decide
 * whether the process is delivered SIGBUS or merely has its major/minor
 * fault counters bumped.
 */

/* For backwards compat. Remove me quickly. */
#define VM_FAULT_MINOR	0

#define VM_FAULT_OOM	0x0001
#define VM_FAULT_SIGBUS	0x0002
#define VM_FAULT_MAJOR	0x0004
#define VM_FAULT_WRITE	0x0008	/* Special case for get_user_pages */

#define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, did not return a page */
#define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */

/* Either of the two failure results. */
#define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS)
|
2007-07-19 16:47:03 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)
|
|
|
|
|
|
|
|
extern void show_free_areas(void);
|
|
|
|
|
|
|
|
/*
 * shmem_lock(): without CONFIG_SHMEM this is an inline stub that ignores
 * its arguments and returns 0; with CONFIG_SHMEM only the prototype is
 * given here and the implementation lives elsewhere.
 */
#ifndef CONFIG_SHMEM
static inline int shmem_lock(struct file *file, int lock,
			     struct user_struct *user)
{
	return 0;
}
#else
int shmem_lock(struct file *file, int lock, struct user_struct *user);
#endif
|
|
|
|
struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags);
|
|
|
|
|
|
|
|
int shmem_zero_setup(struct vm_area_struct *);
|
|
|
|
|
2006-01-06 16:11:42 +08:00
|
|
|
#ifndef CONFIG_MMU
|
|
|
|
extern unsigned long shmem_get_unmapped_area(struct file *file,
|
|
|
|
unsigned long addr,
|
|
|
|
unsigned long len,
|
|
|
|
unsigned long pgoff,
|
|
|
|
unsigned long flags);
|
|
|
|
#endif
|
|
|
|
|
Detach sched.h from mm.h
First thing mm.h does is including sched.h solely for can_do_mlock() inline
function which has "current" dereference inside. By dealing with can_do_mlock()
mm.h can be detached from sched.h which is good. See below, why.
This patch
a) removes unconditional inclusion of sched.h from mm.h
b) makes can_do_mlock() normal function in mm/mlock.c
c) exports can_do_mlock() to not break compilation
d) adds sched.h inclusions back to files that were getting it indirectly.
e) adds less bloated headers to some files (asm/signal.h, jiffies.h) that were
getting them indirectly
Net result is:
a) mm.h users would get less code to open, read, preprocess, parse, ... if
they don't need sched.h
b) sched.h stops being dependency for significant number of files:
on x86_64 allmodconfig touching sched.h results in recompile of 4083 files,
after patch it's only 3744 (-8.3%).
Cross-compile tested on
all arm defconfigs, all mips defconfigs, all powerpc defconfigs,
alpha alpha-up
arm
i386 i386-up i386-defconfig i386-allnoconfig
ia64 ia64-up
m68k
mips
parisc parisc-up
powerpc powerpc-up
s390 s390-up
sparc sparc-up
sparc64 sparc64-up
um-x86_64
x86_64 x86_64-up x86_64-defconfig x86_64-allnoconfig
as well as my two usual configs.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-21 05:22:52 +08:00
|
|
|
extern int can_do_mlock(void);
|
2005-04-17 06:20:36 +08:00
|
|
|
extern int user_shm_lock(size_t, struct user_struct *);
|
|
|
|
extern void user_shm_unlock(size_t, struct user_struct *);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Parameter block passed down to zap_pte_range in exceptional cases.
|
|
|
|
*/
|
|
|
|
struct zap_details {
|
|
|
|
struct vm_area_struct *nonlinear_vma; /* Check page->index if set */
|
|
|
|
struct address_space *check_mapping; /* Check page->mapping if set */
|
|
|
|
pgoff_t first_index; /* Lowest page->index to unmap */
|
|
|
|
pgoff_t last_index; /* Highest page->index to unmap */
|
|
|
|
spinlock_t *i_mmap_lock; /* For unmap_mapping_range: */
|
|
|
|
unsigned long truncate_count; /* Compare vm_truncate_count */
|
|
|
|
};
|
|
|
|
|
mm: introduce pte_special pte bit
s390 for one, cannot implement VM_MIXEDMAP with pfn_valid, due to their memory
model (which is more dynamic than most). Instead, they had proposed to
implement it with an additional path through vm_normal_page(), using a bit in
the pte to determine whether or not the page should be refcounted:
vm_normal_page()
{
...
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
#else
if (!pfn_valid(pfn))
return NULL;
#endif
goto out;
}
...
}
This is fine, however if we are allowed to use a bit in the pte to determine
refcountedness, we can use that to _completely_ replace all the vma based
schemes. So instead of adding more cases to the already complex vma-based
scheme, we can have a clearly separate and simple pte-based scheme (and get
slightly better code generation in the process):
vm_normal_page()
{
#ifdef s390
if (!mixedmap_refcount_pte(pte))
return NULL;
return pte_page(pte);
#else
...
#endif
}
And finally, we may rather make this concept usable by any architecture rather
than making it s390 only, so implement a new type of pte state for this.
Unfortunately the old vma based code must stay, because some architectures may
not be able to spare pte bits. This makes vm_normal_page a little bit more
ugly than we would like, but the 2 cases are clearly separate.
So introduce a pte_special pte state, and use it in mm/memory.c. It is
currently a noop for all architectures, so this doesn't actually result in any
compiled code changes to mm/memory.o.
BTW:
I haven't put vm_normal_page() into arch code as-per an earlier suggestion.
The reason is that, regardless of where vm_normal_page is actually
implemented, the *abstraction* is still exactly the same. Also, while it
depends on whether the architecture has pte_special or not, that is the
only two possible cases, and it really isn't an arch specific function --
the role of the arch code should be to provide primitive functions and
accessors with which to build the core code; pte_special does that. We do
not want architectures to know or care about vm_normal_page itself, and
we definitely don't want them being able to invent something new there
out of sight of mm/ code. If we made vm_normal_page an arch function, then
we have to make vm_insert_mixed (next patch) an arch function too. So I
don't think moving it to arch code fundamentally improves any abstractions,
while it does practically make the code more difficult to follow, for both
mm and arch developers, and easier to misuse.
[akpm@linux-foundation.org: build fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 17:13:00 +08:00
|
|
|
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
|
|
|
|
pte_t pte);
|
|
|
|
|
2008-07-30 13:33:53 +08:00
|
|
|
int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
|
|
|
|
unsigned long size);
|
2005-04-20 04:29:15 +08:00
|
|
|
unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned long size, struct zap_details *);
|
2005-10-30 09:16:30 +08:00
|
|
|
unsigned long unmap_vmas(struct mmu_gather **tlb,
|
2005-04-17 06:20:36 +08:00
|
|
|
struct vm_area_struct *start_vma, unsigned long start_addr,
|
|
|
|
unsigned long end_addr, unsigned long *nr_accounted,
|
|
|
|
struct zap_details *);
|
2008-02-05 14:29:01 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* mm_walk - callbacks for walk_page_range
|
|
|
|
* @pgd_entry: if set, called for each non-empty PGD (top-level) entry
|
|
|
|
* @pud_entry: if set, called for each non-empty PUD (2nd-level) entry
|
|
|
|
* @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry
|
|
|
|
* @pte_entry: if set, called for each non-empty PTE (4th-level) entry
|
|
|
|
* @pte_hole: if set, called for each hole at all levels
|
|
|
|
*
|
|
|
|
* (see walk_page_range for more details)
|
|
|
|
*/
|
|
|
|
struct mm_walk {
|
2008-06-13 06:21:47 +08:00
|
|
|
int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, struct mm_walk *);
|
|
|
|
int (*pud_entry)(pud_t *, unsigned long, unsigned long, struct mm_walk *);
|
|
|
|
int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, struct mm_walk *);
|
|
|
|
int (*pte_entry)(pte_t *, unsigned long, unsigned long, struct mm_walk *);
|
|
|
|
int (*pte_hole)(unsigned long, unsigned long, struct mm_walk *);
|
|
|
|
struct mm_struct *mm;
|
|
|
|
void *private;
|
2008-02-05 14:29:01 +08:00
|
|
|
};
|
|
|
|
|
2008-06-13 06:21:47 +08:00
|
|
|
int walk_page_range(unsigned long addr, unsigned long end,
|
|
|
|
struct mm_walk *walk);
|
2008-07-24 12:27:10 +08:00
|
|
|
void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
|
2005-04-20 04:29:16 +08:00
|
|
|
unsigned long end, unsigned long floor, unsigned long ceiling);
|
2005-04-17 06:20:36 +08:00
|
|
|
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
|
|
|
|
struct vm_area_struct *vma);
|
|
|
|
void unmap_mapping_range(struct address_space *mapping,
|
|
|
|
loff_t const holebegin, loff_t const holelen, int even_cows);
|
2008-07-24 12:27:05 +08:00
|
|
|
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
|
|
|
|
void *buf, int len, int write);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
 * Thin wrapper around unmap_mapping_range() that forwards @mapping,
 * @holebegin and @holelen unchanged and always passes 0 for even_cows.
 */
static inline void
unmap_shared_mapping_range(struct address_space *mapping,
			   loff_t const holebegin, loff_t const holelen)
{
	const int even_cows = 0;

	unmap_mapping_range(mapping, holebegin, holelen, even_cows);
}
|
|
|
|
|
|
|
|
extern int vmtruncate(struct inode * inode, loff_t offset);
|
[PATCH] madvise(MADV_REMOVE): remove pages from tmpfs shm backing store
Here is the patch to implement madvise(MADV_REMOVE) - which frees up a
given range of pages & its associated backing store. Current
implementation supports only shmfs/tmpfs and other filesystems return
-ENOSYS.
"Some app allocates large tmpfs files, then when some task quits and some
client disconnect, some memory can be released. However the only way to
release tmpfs-swap is to MADV_REMOVE". - Andrea Arcangeli
Databases want to use this feature to drop a section of their bufferpool
(shared memory segments) - without writing back to disk/swap space.
This feature is also useful for supporting hot-plug memory on UML.
Concerns raised by Andrew Morton:
- "We have no plan for holepunching! If we _do_ have such a plan (or
might in the future) then what would the API look like? I think
sys_holepunch(fd, start, len), so we should start out with that."
- Using madvise is very weird, because people will ask "why do I need to
mmap my file before I can stick a hole in it?"
- None of the other madvise operations call into the filesystem in this
manner. A broad question is: is this capability an MM operation or a
filesystem operation? truncate, for example, is a filesystem operation
which sometimes has MM side-effects. madvise is an mm operation and with
this patch, it gains FS side-effects, only they're really, really
significant ones."
Comments:
- Andrea suggested the fs operation too but then it's more efficient to
have it as a mm operation with fs side effects, because they don't
immediately know fd and physical offset of the range. It's possible to
fixup in userland and to use the fs operation but it's more expensive,
the vmas are already in the kernel and we can use them.
Short term plan & Future Direction:
- We seem to need this interface only for shmfs/tmpfs files in the short
term. We have to add hooks into the filesystem for correctness and
completeness. This is what this patch does.
- In the future, plan is to support both fs and mmap apis also. This
also involves (other) filesystem specific functions to be implemented.
- Current patch doesn't support VM_NONLINEAR - which can be addressed in
the future.
Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Andrea Arcangeli <andrea@suse.de>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-06 16:10:38 +08:00
|
|
|
extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
|
[PATCH] fix get_user_pages bug
Checking pte_dirty instead of pte_write in __follow_page is problematic
for s390, and for copy_one_pte which leaves dirty when clearing write.
So revert __follow_page to check pte_write as before, and make
do_wp_page pass back a special extra VM_FAULT_WRITE bit to say it has
done its full job: once get_user_pages receives this value, it no longer
requires pte_write in __follow_page.
But most callers of handle_mm_fault, in the various architectures, have
switch statements which do not expect this new case. To avoid changing
them all in a hurry, make an inline wrapper function (using the old
name) that masks off the new bit, and use the extended interface with
double underscores.
Yes, we do have a call to do_wp_page from do_swap_page, but no need to
change that: in rare case it's needed, another do_wp_page will follow.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
[ Cleanups by Nick Piggin ]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-08-03 18:24:01 +08:00
|
|
|
|
2006-01-06 16:11:44 +08:00
|
|
|
#ifdef CONFIG_MMU
|
2007-07-19 16:47:05 +08:00
|
|
|
extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
2006-01-06 16:11:44 +08:00
|
|
|
unsigned long address, int write_access);
|
|
|
|
#else
|
|
|
|
static inline int handle_mm_fault(struct mm_struct *mm,
|
|
|
|
struct vm_area_struct *vma, unsigned long address,
|
|
|
|
int write_access)
|
|
|
|
{
|
|
|
|
/* should never happen if there's no MMU */
|
|
|
|
BUG();
|
|
|
|
return VM_FAULT_SIGBUS;
|
|
|
|
}
|
|
|
|
#endif
|
[PATCH] fix get_user_pages bug
Checking pte_dirty instead of pte_write in __follow_page is problematic
for s390, and for copy_one_pte which leaves dirty when clearing write.
So revert __follow_page to check pte_write as before, and make
do_wp_page pass back a special extra VM_FAULT_WRITE bit to say it has
done its full job: once get_user_pages receives this value, it no longer
requires pte_write in __follow_page.
But most callers of handle_mm_fault, in the various architectures, have
switch statements which do not expect this new case. To avoid changing
them all in a hurry, make an inline wrapper function (using the old
name) that masks off the new bit, and use the extended interface with
double underscores.
Yes, we do have a call to do_wp_page from do_swap_page, but no need to
change that: in rare case it's needed, another do_wp_page will follow.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
[ Cleanups by Nick Piggin ]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-08-03 18:24:01 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
extern int make_pages_present(unsigned long addr, unsigned long end);
|
|
|
|
extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
|
|
|
|
|
|
|
|
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
|
|
|
|
int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
|
|
|
|
|
2006-08-30 02:05:54 +08:00
|
|
|
extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
|
|
|
|
extern void do_invalidatepage(struct page *page, unsigned long offset);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
int __set_page_dirty_nobuffers(struct page *page);
|
2007-02-10 17:43:15 +08:00
|
|
|
int __set_page_dirty_no_writeback(struct page *page);
|
2005-04-17 06:20:36 +08:00
|
|
|
int redirty_page_for_writepage(struct writeback_control *wbc,
|
|
|
|
struct page *page);
|
2008-02-14 07:03:15 +08:00
|
|
|
int set_page_dirty(struct page *page);
|
2005-04-17 06:20:36 +08:00
|
|
|
int set_page_dirty_lock(struct page *page);
|
|
|
|
int clear_page_dirty_for_io(struct page *page);
|
|
|
|
|
2007-07-19 16:48:16 +08:00
|
|
|
extern unsigned long move_page_tables(struct vm_area_struct *vma,
|
|
|
|
unsigned long old_addr, struct vm_area_struct *new_vma,
|
|
|
|
unsigned long new_addr, unsigned long len);
|
2005-04-17 06:20:36 +08:00
|
|
|
extern unsigned long do_mremap(unsigned long addr,
|
|
|
|
unsigned long old_len, unsigned long new_len,
|
|
|
|
unsigned long flags, unsigned long new_addr);
|
2007-07-19 16:48:16 +08:00
|
|
|
extern int mprotect_fixup(struct vm_area_struct *vma,
|
|
|
|
struct vm_area_struct **pprev, unsigned long start,
|
|
|
|
unsigned long end, unsigned long newflags);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-07-26 10:45:22 +08:00
|
|
|
/*
|
|
|
|
* get_user_pages_fast provides equivalent functionality to get_user_pages,
|
|
|
|
* operating on current and current->mm (force=0 and doesn't return any vmas).
|
|
|
|
*
|
|
|
|
* get_user_pages_fast may take mmap_sem and page table locks, so no assumptions
|
|
|
|
* can be made about locking. get_user_pages_fast is to be implemented in a
|
|
|
|
* way that is advantageous (vs get_user_pages()) when the user memory area is
|
|
|
|
* already faulted in and present in ptes. However if the pages have to be
|
|
|
|
* faulted in, it may turn out to be slightly slower.
|
|
|
|
*/
|
|
|
|
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
|
|
|
|
struct page **pages);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2007-07-17 19:03:17 +08:00
|
|
|
* A callback you can register to apply pressure to ageable caches.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
2007-07-17 19:03:17 +08:00
|
|
|
* 'shrink' is passed a count 'nr_to_scan' and a 'gfpmask'. It should
|
|
|
|
* look through the least-recently-used 'nr_to_scan' entries and
|
|
|
|
* attempt to free them up. It should return the number of objects
|
|
|
|
* which remain in the cache. If it returns -1, it means it cannot do
|
|
|
|
* any scanning at this time (eg. there is a risk of deadlock).
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
2007-07-17 19:03:17 +08:00
|
|
|
* The 'gfpmask' refers to the allocation we are currently trying to
|
|
|
|
* fulfil.
|
|
|
|
*
|
|
|
|
* Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
|
|
|
|
* querying the cache size, so a fastpath for that case is appropriate.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2007-07-17 19:03:17 +08:00
|
|
|
struct shrinker {
|
|
|
|
int (*shrink)(int nr_to_scan, gfp_t gfp_mask);
|
|
|
|
int seeks; /* seeks to recreate an obj */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-07-17 19:03:17 +08:00
|
|
|
/* These are for internal use */
|
|
|
|
struct list_head list;
|
|
|
|
long nr; /* objs pending delete */
|
|
|
|
};
|
|
|
|
#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
|
|
|
|
extern void register_shrinker(struct shrinker *);
|
|
|
|
extern void unregister_shrinker(struct shrinker *);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-07-30 06:36:13 +08:00
|
|
|
int vma_wants_writenotify(struct vm_area_struct *vma);
|
2006-09-26 14:30:57 +08:00
|
|
|
|
2008-02-14 07:03:15 +08:00
|
|
|
extern pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl);
|
2005-11-30 06:03:14 +08:00
|
|
|
|
2007-05-07 05:49:02 +08:00
|
|
|
#ifdef __PAGETABLE_PUD_FOLDED
|
|
|
|
static inline int __pud_alloc(struct mm_struct *mm, pgd_t *pgd,
|
|
|
|
unsigned long address)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#else
|
2005-10-30 09:16:22 +08:00
|
|
|
int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
|
2007-05-07 05:49:02 +08:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef __PAGETABLE_PMD_FOLDED
|
|
|
|
static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
|
|
|
|
unsigned long address)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#else
|
2005-10-30 09:16:22 +08:00
|
|
|
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
|
2007-05-07 05:49:02 +08:00
|
|
|
#endif
|
|
|
|
|
2005-10-30 09:16:22 +08:00
|
|
|
int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address);
|
|
|
|
int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* The following ifdef needed to get the 4level-fixup.h header to work.
|
|
|
|
* Remove it when 4level-fixup.h has been removed.
|
|
|
|
*/
|
2005-10-30 09:16:22 +08:00
|
|
|
#if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK)
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * Return the pud for @address under @pgd.  When *@pgd is none,
 * __pud_alloc() is called first; a nonzero result from it yields NULL
 * instead.
 */
static inline pud_t *
pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
	if (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))
		return NULL;

	return pud_offset(pgd, address);
}
|
|
|
|
|
|
|
|
/*
 * Return the pmd for @address under @pud.  When *@pud is none,
 * __pmd_alloc() is called first; a nonzero result from it yields NULL
 * instead.
 */
static inline pmd_t *
pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
	if (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))
		return NULL;

	return pmd_offset(pud, address);
}
|
2005-10-30 09:16:22 +08:00
|
|
|
#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
|
|
|
|
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:40 +08:00
|
|
|
#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
/*
 * Split page table locks: a spinlock guarding each pagetable page is tucked
 * into its struct page, at page->ptl.
 * When freeing, reset page->mapping so free_pages_check won't complain.
 *
 * NOTE(review): this comment used to say the lock lives at page->private
 * and is size-checked with BUILD_BUG_ON; neither is visible in the macros
 * below, which use page->ptl directly -- confirm against struct page.
 *
 * __pte_lockptr(page)   - address of the lock embedded in @page.  The
 *                         expansion is fully parenthesized so it can be
 *                         used safely inside any larger expression.
 * pte_lock_init(page)   - initialize that lock with spin_lock_init().
 * pte_lock_deinit(page) - set page->mapping to NULL.
 * pte_lockptr(mm, pmd)  - lock for the pagetable page that *@pmd points at
 *                         (via pmd_page()); @mm is evaluated but unused.
 */
#define __pte_lockptr(page)	(&((page)->ptl))
#define pte_lock_init(_page)	do {					\
	spin_lock_init(__pte_lockptr(_page));				\
} while (0)
#define pte_lock_deinit(page)	((page)->mapping = NULL)
#define pte_lockptr(mm, pmd)	({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
#else	/* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
/*
 * We use mm->page_table_lock to guard all pagetable pages of the mm.
 * Init and deinit are no-ops; @pmd is evaluated but unused.
 */
#define pte_lock_init(page)	do {} while (0)
#define pte_lock_deinit(page)	do {} while (0)
#define pte_lockptr(mm, pmd)	({(void)(pmd); &(mm)->page_table_lock;})
#endif /* NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS */
|
|
|
|
|
2008-02-08 20:22:04 +08:00
|
|
|
static inline void pgtable_page_ctor(struct page *page)
|
|
|
|
{
|
|
|
|
pte_lock_init(page);
|
|
|
|
inc_zone_page_state(page, NR_PAGETABLE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void pgtable_page_dtor(struct page *page)
|
|
|
|
{
|
|
|
|
pte_lock_deinit(page);
|
|
|
|
dec_zone_page_state(page, NR_PAGETABLE);
|
|
|
|
}
|
|
|
|
|
2005-10-30 09:16:23 +08:00
|
|
|
/*
 * pte_offset_map_lock - map the pte for @address and take its lock.
 *
 * Looks up the lock with pte_lockptr(mm, pmd), maps the pte with
 * pte_offset_map(pmd, address), stores the lock pointer through @ptlp,
 * acquires the lock with spin_lock(), and evaluates to the mapped pte_t
 * pointer.  The order of these steps is significant and must be kept.
 */
#define pte_offset_map_lock(mm, pmd, address, ptlp)			\
({									\
	spinlock_t *__ptl = pte_lockptr(mm, pmd);			\
	pte_t *__pte = pte_offset_map(pmd, address);			\
									\
	*(ptlp) = __ptl;						\
	spin_lock(__ptl);						\
	__pte;								\
})
|
|
|
|
|
|
|
|
/*
 * pte_unmap_unlock - release @ptl with spin_unlock(), then pte_unmap() @pte.
 * The lock is dropped before the pte is unmapped.
 */
#define pte_unmap_unlock(pte, ptl)					\
	do {								\
		spin_unlock(ptl);					\
		pte_unmap(pte);						\
	} while (0)
|
|
|
|
|
2005-10-30 09:16:22 +08:00
|
|
|
/*
 * The three helpers below share one shape: when *@pmd is not present the
 * matching __pte_alloc*() routine is called first, and a nonzero result
 * from it makes the whole expression evaluate to NULL.  Otherwise the
 * expression evaluates to the result of the lookup named in each macro.
 */

/* Lookup via pte_offset_map(). */
#define pte_alloc_map(mm, pmd, address)					\
	((unlikely(!pmd_present(*(pmd))) &&				\
	  __pte_alloc(mm, pmd, address)) ?				\
		NULL : pte_offset_map(pmd, address))

/* Lookup via pte_offset_map_lock(); the lock pointer goes out via @ptlp. */
#define pte_alloc_map_lock(mm, pmd, address, ptlp)			\
	((unlikely(!pmd_present(*(pmd))) &&				\
	  __pte_alloc(mm, pmd, address)) ?				\
		NULL : pte_offset_map_lock(mm, pmd, address, ptlp))

/* Uses __pte_alloc_kernel() and looks up via pte_offset_kernel(). */
#define pte_alloc_kernel(pmd, address)					\
	((unlikely(!pmd_present(*(pmd))) &&				\
	  __pte_alloc_kernel(pmd, address)) ?				\
		NULL : pte_offset_kernel(pmd, address))
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
extern void free_area_init(unsigned long * zones_size);
|
2008-07-24 12:27:20 +08:00
|
|
|
extern void free_area_init_node(int nid, unsigned long * zones_size,
|
|
|
|
unsigned long zone_start_pfn, unsigned long *zholes_size);
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
|
|
|
|
/*
|
|
|
|
* With CONFIG_ARCH_POPULATES_NODE_MAP set, an architecture may initialise its
|
|
|
|
* zones, allocate the backing mem_map and account for memory holes in a more
|
|
|
|
* architecture independent manner. This is a substitute for creating the
|
|
|
|
* zone_sizes[] and zholes_size[] arrays and passing them to
|
|
|
|
* free_area_init_node()
|
|
|
|
*
|
|
|
|
* An architecture is expected to register range of page frames backed by
|
|
|
|
* physical memory with add_active_range() before calling
|
|
|
|
* free_area_init_nodes() passing in the PFN each zone ends at. At a basic
|
|
|
|
* usage, an architecture is expected to do something like
|
|
|
|
*
|
|
|
|
* unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
|
|
|
|
* max_highmem_pfn};
|
|
|
|
* for_each_valid_physical_page_range()
|
|
|
|
* add_active_range(node_id, start_pfn, end_pfn)
|
|
|
|
* free_area_init_nodes(max_zone_pfns);
|
|
|
|
*
|
|
|
|
* If the architecture guarantees that there are no holes in the ranges
|
|
|
|
* registered with add_active_range(), free_bootmem_with_active_regions()
|
|
|
|
* will call free_bootmem_node() for each registered physical page range.
|
|
|
|
* Similarly sparse_memory_present_with_active_regions() calls
|
|
|
|
* memory_present() for each range when SPARSEMEM is enabled.
|
|
|
|
*
|
|
|
|
* See mm/page_alloc.c for more information on each function exposed by
|
|
|
|
* CONFIG_ARCH_POPULATES_NODE_MAP
|
|
|
|
*/
|
|
|
|
extern void free_area_init_nodes(unsigned long *max_zone_pfn);
|
|
|
|
extern void add_active_range(unsigned int nid, unsigned long start_pfn,
|
|
|
|
unsigned long end_pfn);
|
2008-06-14 10:08:52 +08:00
|
|
|
extern void remove_active_range(unsigned int nid, unsigned long start_pfn,
|
|
|
|
unsigned long end_pfn);
|
2006-09-27 16:49:59 +08:00
|
|
|
extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn,
|
|
|
|
unsigned long end_pfn);
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
extern void remove_all_active_ranges(void);
|
|
|
|
extern unsigned long absent_pages_in_range(unsigned long start_pfn,
|
|
|
|
unsigned long end_pfn);
|
|
|
|
extern void get_pfn_range_for_nid(unsigned int nid,
|
|
|
|
unsigned long *start_pfn, unsigned long *end_pfn);
|
|
|
|
extern unsigned long find_min_pfn_with_active_regions(void);
|
|
|
|
extern void free_bootmem_with_active_regions(int nid,
|
|
|
|
unsigned long max_low_pfn);
|
2008-06-17 11:10:55 +08:00
|
|
|
typedef int (*work_fn_t)(unsigned long, unsigned long, void *);
|
2008-06-15 09:32:52 +08:00
|
|
|
extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data);
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
extern void sparse_memory_present_with_active_regions(int nid);
|
|
|
|
#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
|
|
|
|
extern int early_pfn_to_nid(unsigned long pfn);
|
|
|
|
#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
|
|
|
|
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
|
2006-09-27 16:49:56 +08:00
|
|
|
extern void set_dma_reserve(unsigned long new_dma_reserve);
|
2007-01-11 15:15:30 +08:00
|
|
|
extern void memmap_init_zone(unsigned long, int, unsigned long,
|
|
|
|
unsigned long, enum memmap_context);
|
2005-10-30 09:16:54 +08:00
|
|
|
extern void setup_per_zone_pages_min(void);
|
2005-04-17 06:20:36 +08:00
|
|
|
extern void mem_init(void);
|
|
|
|
extern void show_mem(void);
|
|
|
|
extern void si_meminfo(struct sysinfo * val);
|
|
|
|
extern void si_meminfo_node(struct sysinfo *val, int nid);
|
2008-05-13 03:21:13 +08:00
|
|
|
extern int after_bootmem;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-06-22 08:14:47 +08:00
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
extern void setup_per_cpu_pageset(void);
|
|
|
|
#else
|
|
|
|
/* Stub used when CONFIG_NUMA is not set: intentionally does nothing. */
static inline void setup_per_cpu_pageset(void)
{
}
|
|
|
|
#endif
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* prio_tree.c */
|
|
|
|
void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old);
|
|
|
|
void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *);
|
|
|
|
void vma_prio_tree_remove(struct vm_area_struct *, struct prio_tree_root *);
|
|
|
|
struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
|
|
|
|
struct prio_tree_iter *iter);
|
|
|
|
|
|
|
|
/*
 * Loop helper: initialise @iter over @root for [@begin, @end] with
 * prio_tree_iter_init(), then repeatedly assign the result of
 * vma_prio_tree_next() to @vma until it yields NULL.
 * @vma is both assigned and evaluated several times, so it must be a
 * plain lvalue.
 */
#define vma_prio_tree_foreach(vma, iter, root, begin, end)		\
	for (prio_tree_iter_init(iter, root, begin, end), vma = NULL;	\
	     (vma = vma_prio_tree_next(vma, iter)) != NULL; )
|
|
|
|
|
|
|
|
static inline void vma_nonlinear_insert(struct vm_area_struct *vma,
|
|
|
|
struct list_head *list)
|
|
|
|
{
|
|
|
|
vma->shared.vm_set.parent = NULL;
|
|
|
|
list_add_tail(&vma->shared.vm_set.list, list);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* mmap.c */
|
2007-08-23 05:01:28 +08:00
|
|
|
extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
|
2005-04-17 06:20:36 +08:00
|
|
|
extern void vma_adjust(struct vm_area_struct *vma, unsigned long start,
|
|
|
|
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert);
|
|
|
|
extern struct vm_area_struct *vma_merge(struct mm_struct *,
|
|
|
|
struct vm_area_struct *prev, unsigned long addr, unsigned long end,
|
|
|
|
unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
|
|
|
|
struct mempolicy *);
|
|
|
|
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
|
|
|
|
extern int split_vma(struct mm_struct *,
|
|
|
|
struct vm_area_struct *, unsigned long addr, int new_below);
|
|
|
|
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
|
|
|
|
extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
|
|
|
|
struct rb_node **, struct rb_node *);
|
2005-10-30 09:15:57 +08:00
|
|
|
extern void unlink_file_vma(struct vm_area_struct *);
|
2005-04-17 06:20:36 +08:00
|
|
|
extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
|
|
|
|
unsigned long addr, unsigned long len, pgoff_t pgoff);
|
|
|
|
extern void exit_mmap(struct mm_struct *);
|
2008-04-29 16:01:36 +08:00
|
|
|
|
2008-07-29 06:46:26 +08:00
|
|
|
extern int mm_take_all_locks(struct mm_struct *mm);
|
|
|
|
extern void mm_drop_all_locks(struct mm_struct *mm);
|
|
|
|
|
2008-04-29 16:01:36 +08:00
|
|
|
#ifdef CONFIG_PROC_FS
|
|
|
|
/* From fs/proc/base.c. Callers must _not_ hold the mm's exe_file_lock. */
|
|
|
|
extern void added_exe_file_vma(struct mm_struct *mm);
|
|
|
|
extern void removed_exe_file_vma(struct mm_struct *mm);
|
|
|
|
#else
|
|
|
|
/* Stub used when CONFIG_PROC_FS is not set: intentionally does nothing. */
static inline void added_exe_file_vma(struct mm_struct *mm)
{
}
|
|
|
|
|
|
|
|
/* Stub used when CONFIG_PROC_FS is not set: intentionally does nothing. */
static inline void removed_exe_file_vma(struct mm_struct *mm)
{
}
|
|
|
|
#endif /* CONFIG_PROC_FS */
|
|
|
|
|
2005-05-01 23:58:35 +08:00
|
|
|
extern int may_expand_vm(struct mm_struct *mm, unsigned long npages);
|
2007-02-09 06:20:41 +08:00
|
|
|
extern int install_special_mapping(struct mm_struct *mm,
|
|
|
|
unsigned long addr, unsigned long len,
|
|
|
|
unsigned long flags, struct page **pages);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
|
|
|
|
|
|
|
|
extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long prot,
|
|
|
|
unsigned long flag, unsigned long pgoff);
|
2007-07-16 14:38:26 +08:00
|
|
|
extern unsigned long mmap_region(struct file *file, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long flags,
|
|
|
|
unsigned int vm_flags, unsigned long pgoff,
|
|
|
|
int accountable);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
static inline unsigned long do_mmap(struct file *file, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long prot,
|
|
|
|
unsigned long flag, unsigned long offset)
|
|
|
|
{
|
|
|
|
unsigned long ret = -EINVAL;
|
|
|
|
if ((offset + PAGE_ALIGN(len)) < offset)
|
|
|
|
goto out;
|
|
|
|
if (!(offset & ~PAGE_MASK))
|
|
|
|
ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
extern int do_munmap(struct mm_struct *, unsigned long, size_t);
|
|
|
|
|
|
|
|
extern unsigned long do_brk(unsigned long, unsigned long);
|
|
|
|
|
|
|
|
/* filemap.c */
|
|
|
|
extern unsigned long page_unuse(struct page *);
|
|
|
|
extern void truncate_inode_pages(struct address_space *, loff_t);
|
2006-01-06 16:10:36 +08:00
|
|
|
extern void truncate_inode_pages_range(struct address_space *,
|
|
|
|
loff_t lstart, loff_t lend);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* generic vm_area_ops exported for stackable file systems */
|
2007-07-19 16:47:03 +08:00
|
|
|
extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* mm/page-writeback.c */
|
|
|
|
int write_one_page(struct page *page, int wait);
|
|
|
|
|
|
|
|
/* readahead.c */
|
|
|
|
#define VM_MAX_READAHEAD 128 /* kbytes */
|
|
|
|
#define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */
|
|
|
|
|
|
|
|
int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
|
2005-11-07 16:59:28 +08:00
|
|
|
pgoff_t offset, unsigned long nr_to_read);
|
2005-04-17 06:20:36 +08:00
|
|
|
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
|
2005-11-07 16:59:28 +08:00
|
|
|
pgoff_t offset, unsigned long nr_to_read);
|
2007-07-19 16:48:08 +08:00
|
|
|
|
|
|
|
void page_cache_sync_readahead(struct address_space *mapping,
|
|
|
|
struct file_ra_state *ra,
|
|
|
|
struct file *filp,
|
|
|
|
pgoff_t offset,
|
|
|
|
unsigned long size);
|
|
|
|
|
|
|
|
void page_cache_async_readahead(struct address_space *mapping,
|
|
|
|
struct file_ra_state *ra,
|
|
|
|
struct file *filp,
|
|
|
|
struct page *pg,
|
|
|
|
pgoff_t offset,
|
|
|
|
unsigned long size);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned long max_sane_readahead(unsigned long nr);
|
|
|
|
|
|
|
|
/* Do stack extension */
|
2005-10-30 09:16:20 +08:00
|
|
|
extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
|
2005-11-19 05:16:42 +08:00
|
|
|
#ifdef CONFIG_IA64
|
2005-10-30 09:16:20 +08:00
|
|
|
extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
|
2005-11-19 05:16:42 +08:00
|
|
|
#endif
|
2007-07-19 16:48:16 +08:00
|
|
|
extern int expand_stack_downwards(struct vm_area_struct *vma,
|
|
|
|
unsigned long address);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
|
|
|
|
extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
|
|
|
|
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
|
|
|
|
struct vm_area_struct **pprev);
|
|
|
|
|
|
|
|
/* Look up the first VMA which intersects the interval start_addr..end_addr-1,
|
|
|
|
NULL if none. Assume start_addr < end_addr. */
|
|
|
|
static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
|
|
|
|
{
|
|
|
|
struct vm_area_struct * vma = find_vma(mm,start_addr);
|
|
|
|
|
|
|
|
if (vma && end_addr <= vma->vm_start)
|
|
|
|
vma = NULL;
|
|
|
|
return vma;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned long vma_pages(struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
|
|
|
|
}
|
|
|
|
|
2006-07-27 04:39:49 +08:00
|
|
|
pgprot_t vm_get_page_prot(unsigned long vm_flags);
|
2005-10-30 09:16:33 +08:00
|
|
|
struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
|
|
|
|
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
|
|
|
|
unsigned long pfn, unsigned long size, pgprot_t);
|
2005-12-01 01:35:19 +08:00
|
|
|
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
|
2007-02-12 16:51:36 +08:00
|
|
|
int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
|
|
|
|
unsigned long pfn);
|
2008-04-28 17:13:01 +08:00
|
|
|
int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
|
|
|
|
unsigned long pfn);
|
2005-10-30 09:16:33 +08:00
|
|
|
|
2005-11-29 06:34:23 +08:00
|
|
|
struct page *follow_page(struct vm_area_struct *, unsigned long address,
|
2005-10-30 09:16:33 +08:00
|
|
|
unsigned int foll_flags);
|
|
|
|
#define FOLL_WRITE 0x01 /* check pte is writable */
|
|
|
|
#define FOLL_TOUCH 0x02 /* mark page accessed */
|
|
|
|
#define FOLL_GET 0x04 /* do get_page on page */
|
|
|
|
#define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-02-08 20:22:04 +08:00
|
|
|
/*
 * Callback type taken by apply_to_page_range(): it is handed a pte
 * pointer, a pgtable_t token, an address and the caller's opaque @data
 * pointer, and returns an int.
 */
typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
			void *data);
|
|
|
|
extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
|
|
|
|
unsigned long size, pte_fn_t fn, void *data);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifdef CONFIG_PROC_FS
|
2005-10-30 09:15:56 +08:00
|
|
|
void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
|
2005-04-17 06:20:36 +08:00
|
|
|
#else
|
2005-10-30 09:15:56 +08:00
|
|
|
/*
 * Stub used when CONFIG_PROC_FS is not set: all arguments are ignored
 * and nothing is done.
 */
static inline void vm_stat_account(struct mm_struct *mm,
				   unsigned long flags, struct file *file,
				   long pages)
{
}
|
|
|
|
#endif /* CONFIG_PROC_FS */
|
|
|
|
|
2008-01-30 20:33:58 +08:00
|
|
|
#ifdef CONFIG_DEBUG_PAGEALLOC
|
|
|
|
extern int debug_pagealloc_enabled;
|
|
|
|
|
|
|
|
extern void kernel_map_pages(struct page *page, int numpages, int enable);
|
|
|
|
|
|
|
|
static inline void enable_debug_pagealloc(void)
|
|
|
|
{
|
|
|
|
debug_pagealloc_enabled = 1;
|
|
|
|
}
|
2008-02-20 08:47:44 +08:00
|
|
|
#ifdef CONFIG_HIBERNATION
|
|
|
|
extern bool kernel_page_present(struct page *page);
|
|
|
|
#endif /* CONFIG_HIBERNATION */
|
2008-01-30 20:33:58 +08:00
|
|
|
#else
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * Stub used when CONFIG_DEBUG_PAGEALLOC is not set: all arguments are
 * ignored and nothing is done.
 */
static inline void kernel_map_pages(struct page *page, int numpages,
				    int enable)
{
}
|
2008-01-30 20:33:58 +08:00
|
|
|
/* Stub used when CONFIG_DEBUG_PAGEALLOC is not set: intentionally does nothing. */
static inline void enable_debug_pagealloc(void)
{
}
|
2008-02-20 08:47:44 +08:00
|
|
|
#ifdef CONFIG_HIBERNATION
|
|
|
|
/*
 * Stub used with CONFIG_HIBERNATION when CONFIG_DEBUG_PAGEALLOC is not
 * set: @page is ignored and the answer is always true.
 */
static inline bool kernel_page_present(struct page *page)
{
	return true;
}
|
|
|
|
#endif /* CONFIG_HIBERNATION */
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|
|
|
|
|
|
|
|
extern struct vm_area_struct *get_gate_vma(struct task_struct *tsk);
|
|
|
|
#ifdef __HAVE_ARCH_GATE_AREA
|
|
|
|
int in_gate_area_no_task(unsigned long addr);
|
|
|
|
int in_gate_area(struct task_struct *task, unsigned long addr);
|
|
|
|
#else
|
|
|
|
int in_gate_area_no_task(unsigned long addr);
|
|
|
|
/*
 * Without __HAVE_ARCH_GATE_AREA the task plays no part in the answer:
 * @task is evaluated and discarded, and the result is that of
 * in_gate_area_no_task(@addr).
 * @task is parenthesized so that the (void) cast applies to the whole
 * argument expression and not just its first operand.
 */
#define in_gate_area(task, addr) ({(void)(task); in_gate_area_no_task(addr);})
|
|
|
|
#endif /* __HAVE_ARCH_GATE_AREA */
|
|
|
|
|
2006-01-08 17:00:39 +08:00
|
|
|
int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *,
|
|
|
|
void __user *, size_t *, loff_t *);
|
2006-03-22 16:08:19 +08:00
|
|
|
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
|
2006-01-08 17:00:39 +08:00
|
|
|
unsigned long lru_pages);
|
|
|
|
|
2006-02-21 10:28:07 +08:00
|
|
|
#ifndef CONFIG_MMU
|
|
|
|
/* Without CONFIG_MMU, randomize_va_space is the constant 0 instead of a variable. */
#define randomize_va_space 0
|
|
|
|
#else
|
2006-02-17 06:41:58 +08:00
|
|
|
extern int randomize_va_space;
|
2006-02-21 10:28:07 +08:00
|
|
|
#endif
|
2006-02-17 06:41:58 +08:00
|
|
|
|
2007-07-27 01:41:13 +08:00
|
|
|
const char * arch_vma_name(struct vm_area_struct *vma);
|
2008-01-30 20:33:18 +08:00
|
|
|
void print_vma_addr(char *prefix, unsigned long rip);
|
[PATCH] vdso: randomize the i386 vDSO by moving it into a vma
Move the i386 VDSO down into a vma and thus randomize it.
Besides the security implications, this feature also helps debuggers, which
can COW a vma-backed VDSO just like a normal DSO and can thus do
single-stepping and other debugging features.
It's good for hypervisors (Xen, VMWare) too, which typically live in the same
high-mapped address space as the VDSO, hence whenever the VDSO is used, they
get lots of guest pagefaults and have to fix such guest accesses up - which
slows things down instead of speeding things up (the primary purpose of the
VDSO).
There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support
for older glibcs that still rely on a prelinked high-mapped VDSO. Newer
distributions (using glibc 2.3.3 or later) can turn this option off. Turning
it off is also recommended for security reasons: attackers cannot use the
predictable high-mapped VDSO page as syscall trampoline anymore.
There is a new vdso=[0|1] boot option as well, and a runtime
/proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned
on/off.
(This version of the VDSO-randomization patch also has working ELF
coredumping, the previous patch crashed in the coredumping code.)
This code is a combined work of the exec-shield VDSO randomization
code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell
started this patch and I completed it.
[akpm@osdl.org: cleanups]
[akpm@osdl.org: compile fix]
[akpm@osdl.org: compile fix 2]
[akpm@osdl.org: compile fix 3]
[akpm@osdl.org: revert MAXMEM change]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 17:53:50 +08:00
|
|
|
|
2007-10-16 16:26:14 +08:00
|
|
|
struct page *sparse_mem_map_populate(unsigned long pnum, int nid);
|
2007-10-16 16:24:14 +08:00
|
|
|
pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
|
|
|
|
pud_t *vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node);
|
|
|
|
pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
|
|
|
|
pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node);
|
2007-10-16 16:24:13 +08:00
|
|
|
void *vmemmap_alloc_block(unsigned long size, int node);
|
|
|
|
void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
|
2007-10-16 16:24:14 +08:00
|
|
|
int vmemmap_populate_basepages(struct page *start_page,
|
|
|
|
unsigned long pages, int node);
|
|
|
|
int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
|
2008-04-12 16:19:24 +08:00
|
|
|
void vmemmap_populate_print_last(void);
|
2007-10-16 16:24:13 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif /* __KERNEL__ */
|
|
|
|
#endif /* _LINUX_MM_H */
|