2005-04-17 06:20:36 +08:00
|
|
|
/*
 * include/linux/writeback.h
 */
|
|
|
|
#ifndef WRITEBACK_H
|
|
|
|
#define WRITEBACK_H
|
|
|
|
|
Detach sched.h from mm.h
First thing mm.h does is including sched.h solely for can_do_mlock() inline
function which has "current" dereference inside. By dealing with can_do_mlock()
mm.h can be detached from sched.h which is good. See below, why.
This patch
a) removes unconditional inclusion of sched.h from mm.h
b) makes can_do_mlock() normal function in mm/mlock.c
c) exports can_do_mlock() to not break compilation
d) adds sched.h inclusions back to files that were getting it indirectly.
e) adds less bloated headers to some files (asm/signal.h, jiffies.h) that were
getting them indirectly
Net result is:
a) mm.h users would get less code to open, read, preprocess, parse, ... if
they don't need sched.h
b) sched.h stops being dependency for significant number of files:
on x86_64 allmodconfig touching sched.h results in recompile of 4083 files,
after patch it's only 3744 (-8.3%).
Cross-compile tested on
all arm defconfigs, all mips defconfigs, all powerpc defconfigs,
alpha alpha-up
arm
i386 i386-up i386-defconfig i386-allnoconfig
ia64 ia64-up
m68k
mips
parisc parisc-up
powerpc powerpc-up
s390 s390-up
sparc sparc-up
sparc64 sparc64-up
um-x86_64
x86_64 x86_64-up x86_64-defconfig x86_64-allnoconfig
as well as my two usual configs.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-21 05:22:52 +08:00
|
|
|
#include <linux/sched.h>
|
2013-05-08 07:19:08 +08:00
|
|
|
#include <linux/workqueue.h>
|
2007-09-21 15:19:54 +08:00
|
|
|
#include <linux/fs.h>
|
2015-05-23 06:23:21 +08:00
|
|
|
#include <linux/flex_proportions.h>
|
2015-05-29 02:50:49 +08:00
|
|
|
#include <linux/backing-dev-defs.h>
|
Detach sched.h from mm.h
First thing mm.h does is including sched.h solely for can_do_mlock() inline
function which has "current" dereference inside. By dealing with can_do_mlock()
mm.h can be detached from sched.h which is good. See below, why.
This patch
a) removes unconditional inclusion of sched.h from mm.h
b) makes can_do_mlock() normal function in mm/mlock.c
c) exports can_do_mlock() to not break compilation
d) adds sched.h inclusions back to files that were getting it indirectly.
e) adds less bloated headers to some files (asm/signal.h, jiffies.h) that were
getting them indirectly
Net result is:
a) mm.h users would get less code to open, read, preprocess, parse, ... if
they don't need sched.h
b) sched.h stops being dependency for significant number of files:
on x86_64 allmodconfig touching sched.h results in recompile of 4083 files,
after patch it's only 3744 (-8.3%).
Cross-compile tested on
all arm defconfigs, all mips defconfigs, all powerpc defconfigs,
alpha alpha-up
arm
i386 i386-up i386-defconfig i386-allnoconfig
ia64 ia64-up
m68k
mips
parisc parisc-up
powerpc powerpc-up
s390 s390-up
sparc sparc-up
sparc64 sparc64-up
um-x86_64
x86_64 x86_64-up x86_64-defconfig x86_64-allnoconfig
as well as my two usual configs.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-21 05:22:52 +08:00
|
|
|
|
2011-04-06 03:21:19 +08:00
|
|
|
/* NOTE(review): per-CPU int counter; presumably accounts dirty-throttle ratelimit leaks — confirm against mm/page-writeback.c */
DECLARE_PER_CPU(int, dirty_throttle_leaks);
|
|
|
|
|
2011-06-20 12:18:42 +08:00
|
|
|
/*
 * The 1/4 region under the global dirty thresh is for smooth dirty throttling:
 *
 *	(thresh - thresh/DIRTY_FULL_SCOPE, thresh)
 *
 * Further beyond, all dirtier tasks will enter a loop waiting (possibly long
 * time) for the dirty pages to drop, unless written enough pages.
 *
 * The global dirty threshold is normally equal to the global dirty limit,
 * except when the system suddenly allocates a lot of anonymous memory and
 * knocks down the global dirty threshold quickly, in which case the global
 * dirty limit will follow down slowly to prevent livelocking all dirtier tasks.
 */
#define DIRTY_SCOPE		8
#define DIRTY_FULL_SCOPE	(DIRTY_SCOPE / 2)
|
2011-06-20 12:18:42 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
struct backing_dev_info;

/*
 * fs/fs-writeback.c
 */
|
|
|
|
/* Data-integrity level requested of a writeback pass. */
enum writeback_sync_modes {
	WB_SYNC_NONE,	/* Don't wait on anything */
	WB_SYNC_ALL,	/* Wait on every mapping */
};
|
|
|
|
|
2011-10-08 11:54:10 +08:00
|
|
|
/*
 * why some writeback work was initiated
 */
enum wb_reason {
	WB_REASON_BACKGROUND,
	WB_REASON_TRY_TO_FREE_PAGES,
	WB_REASON_SYNC,
	WB_REASON_PERIODIC,
	WB_REASON_LAPTOP_TIMER,
	WB_REASON_FREE_MORE_MEM,
	WB_REASON_FS_FREE_SPACE,
	/*
	 * There is no bdi forker thread any more and works are done
	 * by emergency worker, however, this is TPs userland visible
	 * and we'll be exposing exactly the same information,
	 * so it has a mismatch name.
	 */
	WB_REASON_FORKER_THREAD,

	WB_REASON_MAX,
};
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* A control structure which tells the writeback code what to do. These are
|
|
|
|
* always on the stack, and hence need no locking. They are always initialised
|
|
|
|
* in a manner such that unspecified fields are set to zero.
|
|
|
|
*/
|
|
|
|
struct writeback_control {
|
|
|
|
long nr_to_write; /* Write this many pages, and decrement
|
|
|
|
this for each page written */
|
|
|
|
long pages_skipped; /* Pages which were not written */
|
|
|
|
|
|
|
|
/*
|
2012-03-06 07:06:02 +08:00
|
|
|
* For a_ops->writepages(): if start or end are non-zero then this is
|
2005-04-17 06:20:36 +08:00
|
|
|
* a hint that the filesystem need only write out the pages inside that
|
|
|
|
* byterange. The byte at `end' is included in the writeout request.
|
|
|
|
*/
|
[PATCH] writeback: fix range handling
When a writeback_control's `start' and `end' fields are used to
indicate a one-byte-range starting at file offset zero, the required
values of .start=0,.end=0 mean that the ->writepages() implementation
has no way of telling that it is being asked to perform a range
request. Because we're currently overloading (start == 0 && end == 0)
to mean "this is not a write-a-range request".
To make all this sane, the patch changes range of writeback_control.
So caller does: If it is calling ->writepages() to write pages, it
sets range (range_start/end or range_cyclic) always.
And if range_cyclic is true, ->writepages() thinks the range is
cyclic, otherwise it just uses range_start and range_end.
This patch does,
- Add LLONG_MAX, LLONG_MIN, ULLONG_MAX to include/linux/kernel.h
-1 is usually ok for range_end (type is long long). But, if someone did,
range_end += val; range_end is "val - 1"
u64val = range_end >> bits; u64val is "~(0ULL)"
or something, they are wrong. So, this adds LLONG_MAX to avoid nasty
things, and uses LLONG_MAX for range_end.
- All callers of ->writepages() sets range_start/end or range_cyclic.
- Fix updates of ->writeback_index. It seems already bit strange.
If it starts at 0 and ended by check of nr_to_write, this last
index may reduce chance to scan end of file. So, this updates
->writeback_index only if range_cyclic is true or whole-file is
scanned.
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Nathan Scott <nathans@sgi.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: Steven French <sfrench@us.ibm.com>
Cc: "Vladimir V. Saveliev" <vs@namesys.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 17:03:26 +08:00
|
|
|
loff_t range_start;
|
|
|
|
loff_t range_end;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2012-04-25 21:53:05 +08:00
|
|
|
enum writeback_sync_modes sync_mode;
|
|
|
|
|
2005-11-17 07:07:01 +08:00
|
|
|
unsigned for_kupdate:1; /* A kupdate writeback */
|
2009-12-03 20:54:25 +08:00
|
|
|
unsigned for_background:1; /* A background writeback */
|
2010-06-07 00:38:15 +08:00
|
|
|
unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */
|
2005-11-17 07:07:01 +08:00
|
|
|
unsigned for_reclaim:1; /* Invoked from the page allocator */
|
[PATCH] writeback: fix range handling
When a writeback_control's `start' and `end' fields are used to
indicate a one-byte-range starting at file offset zero, the required
values of .start=0,.end=0 mean that the ->writepages() implementation
has no way of telling that it is being asked to perform a range
request. Because we're currently overloading (start == 0 && end == 0)
to mean "this is not a write-a-range request".
To make all this sane, the patch changes range of writeback_control.
So caller does: If it is calling ->writepages() to write pages, it
sets range (range_start/end or range_cyclic) always.
And if range_cyclic is true, ->writepages() thinks the range is
cyclic, otherwise it just uses range_start and range_end.
This patch does,
- Add LLONG_MAX, LLONG_MIN, ULLONG_MAX to include/linux/kernel.h
-1 is usually ok for range_end (type is long long). But, if someone did,
range_end += val; range_end is "val - 1"
u64val = range_end >> bits; u64val is "~(0ULL)"
or something, they are wrong. So, this adds LLONG_MAX to avoid nasty
things, and uses LLONG_MAX for range_end.
- All callers of ->writepages() sets range_start/end or range_cyclic.
- Fix updates of ->writeback_index. It seems already bit strange.
If it starts at 0 and ended by check of nr_to_write, this last
index may reduce chance to scan end of file. So, this updates
->writeback_index only if range_cyclic is true or whole-file is
scanned.
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Nathan Scott <nathans@sgi.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: Steven French <sfrench@us.ibm.com>
Cc: "Vladimir V. Saveliev" <vs@namesys.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 17:03:26 +08:00
|
|
|
unsigned range_cyclic:1; /* range_start is cyclic */
|
2013-07-02 20:38:35 +08:00
|
|
|
unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
|
2015-06-02 22:39:48 +08:00
|
|
|
#ifdef CONFIG_CGROUP_WRITEBACK
|
|
|
|
struct bdi_writeback *wb; /* wb this writeback is issued under */
|
2015-05-29 02:50:51 +08:00
|
|
|
struct inode *inode; /* inode being written out */
|
|
|
|
|
|
|
|
/* foreign inode detection, see wbc_detach_inode() */
|
|
|
|
int wb_id; /* current wb id */
|
|
|
|
int wb_lcand_id; /* last foreign candidate wb id */
|
|
|
|
int wb_tcand_id; /* this foreign candidate wb id */
|
|
|
|
size_t wb_bytes; /* bytes written by current wb */
|
|
|
|
size_t wb_lcand_bytes; /* bytes written by last candidate */
|
|
|
|
size_t wb_tcand_bytes; /* bytes written by this candidate */
|
2015-06-02 22:39:48 +08:00
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
2015-05-23 06:23:21 +08:00
|
|
|
/*
|
|
|
|
* A wb_domain represents a domain that wb's (bdi_writeback's) belong to
|
|
|
|
* and are measured against each other in. There always is one global
|
|
|
|
* domain, global_wb_domain, that every wb in the system is a member of.
|
|
|
|
* This allows measuring the relative bandwidth of each wb to distribute
|
|
|
|
* dirtyable memory accordingly.
|
|
|
|
*/
|
|
|
|
struct wb_domain {
|
2015-05-23 06:23:22 +08:00
|
|
|
spinlock_t lock;
|
|
|
|
|
2015-05-23 06:23:21 +08:00
|
|
|
/*
|
|
|
|
* Scale the writeback cache size proportional to the relative
|
|
|
|
* writeout speed.
|
|
|
|
*
|
|
|
|
* We do this by keeping a floating proportion between BDIs, based
|
|
|
|
* on page writeback completions [end_page_writeback()]. Those
|
|
|
|
* devices that write out pages fastest will get the larger share,
|
|
|
|
* while the slower will get a smaller share.
|
|
|
|
*
|
|
|
|
* We use page writeout completions because we are interested in
|
|
|
|
* getting rid of dirty pages. Having them written out is the
|
|
|
|
* primary goal.
|
|
|
|
*
|
|
|
|
* We introduce a concept of time, a period over which we measure
|
|
|
|
* these events, because demand can/will vary over time. The length
|
|
|
|
* of this period itself is measured in page writeback completions.
|
|
|
|
*/
|
|
|
|
struct fprop_global completions;
|
|
|
|
struct timer_list period_timer; /* timer for aging of completions */
|
|
|
|
unsigned long period_time;
|
2015-05-23 06:23:22 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The dirtyable memory and dirty threshold could be suddenly
|
|
|
|
* knocked down by a large amount (eg. on the startup of KVM in a
|
|
|
|
* swapless system). This may throw the system into deep dirty
|
|
|
|
* exceeded state and throttle heavy/light dirtiers alike. To
|
|
|
|
* retain good responsiveness, maintain global_dirty_limit for
|
|
|
|
* tracking slowly down to the knocked down dirty threshold.
|
|
|
|
*
|
|
|
|
* Both fields are protected by ->lock.
|
|
|
|
*/
|
|
|
|
unsigned long dirty_limit_tstamp;
|
|
|
|
unsigned long dirty_limit;
|
2015-05-23 06:23:21 +08:00
|
|
|
};
|
|
|
|
|
2015-05-23 06:23:34 +08:00
|
|
|
/**
|
|
|
|
* wb_domain_size_changed - memory available to a wb_domain has changed
|
|
|
|
* @dom: wb_domain of interest
|
|
|
|
*
|
|
|
|
* This function should be called when the amount of memory available to
|
|
|
|
* @dom has changed. It resets @dom's dirty limit parameters to prevent
|
|
|
|
* the past values which don't match the current configuration from skewing
|
|
|
|
* dirty throttling. Without this, when memory size of a wb_domain is
|
|
|
|
* greatly reduced, the dirty throttling logic may allow too many pages to
|
|
|
|
* be dirtied leading to consecutive unnecessary OOMs and may get stuck in
|
|
|
|
* that situation.
|
|
|
|
*/
|
|
|
|
static inline void wb_domain_size_changed(struct wb_domain *dom)
|
|
|
|
{
|
|
|
|
spin_lock(&dom->lock);
|
|
|
|
dom->dirty_limit_tstamp = jiffies;
|
|
|
|
dom->dirty_limit = 0;
|
|
|
|
spin_unlock(&dom->lock);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* fs/fs-writeback.c
|
|
|
|
*/
|
2009-09-09 15:08:54 +08:00
|
|
|
struct bdi_writeback;
|
2011-10-08 11:54:10 +08:00
|
|
|
void writeback_inodes_sb(struct super_block *, enum wb_reason reason);
|
|
|
|
void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
|
|
|
|
enum wb_reason reason);
|
2015-05-23 05:14:00 +08:00
|
|
|
bool try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason);
|
|
|
|
bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
|
|
|
|
enum wb_reason reason);
|
2014-02-21 18:19:04 +08:00
|
|
|
void sync_inodes_sb(struct super_block *);
|
2011-10-08 11:54:10 +08:00
|
|
|
void wakeup_flusher_threads(long nr_pages, enum wb_reason reason);
|
2012-05-03 20:48:03 +08:00
|
|
|
void inode_wait_for_writeback(struct inode *inode);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* writeback.h requires fs.h; it, too, is not included from here. */
|
|
|
|
static inline void wait_on_inode(struct inode *inode)
|
|
|
|
{
|
|
|
|
might_sleep();
|
sched: Remove proliferation of wait_on_bit() action functions
The current "wait_on_bit" interface requires an 'action'
function to be provided which does the actual waiting.
There are over 20 such functions, many of them identical.
Most cases can be satisfied by one of just two functions, one
which uses io_schedule() and one which just uses schedule().
So:
Rename wait_on_bit and wait_on_bit_lock to
wait_on_bit_action and wait_on_bit_lock_action
to make it explicit that they need an action function.
Introduce new wait_on_bit{,_lock} and wait_on_bit{,_lock}_io
which are *not* given an action function but implicitly use
a standard one.
The decision to error-out if a signal is pending is now made
based on the 'mode' argument rather than being encoded in the action
function.
All instances of the old wait_on_bit and wait_on_bit_lock which
can use the new version have been changed accordingly and their
action functions have been discarded.
wait_on_bit{_lock} does not return any specific error code in the
event of a signal so the caller must check for non-zero and
interpolate their own error code as appropriate.
The wait_on_bit() call in __fscache_wait_on_invalidate() was
ambiguous as it specified TASK_UNINTERRUPTIBLE but used
fscache_wait_bit_interruptible as an action function.
David Howells confirms this should be uniformly
"uninterruptible"
The main remaining user of wait_on_bit{,_lock}_action is NFS
which needs to use a freezer-aware schedule() call.
A comment in fs/gfs2/glock.c notes that having multiple 'action'
functions is useful as they display differently in the 'wchan'
field of 'ps'. (and /proc/$PID/wchan).
As the new bit_wait{,_io} functions are tagged "__sched", they
will not show up at all, but something higher in the stack. So
the distinction will still be visible, only with different
function names (gds2_glock_wait versus gfs2_glock_dq_wait in the
gfs2/glock.c case).
Since first version of this patch (against 3.15) two new action
functions appeared, on in NFS and one in CIFS. CIFS also now
uses an action function that makes the same freezer aware
schedule call as NFS.
Signed-off-by: NeilBrown <neilb@suse.de>
Acked-by: David Howells <dhowells@redhat.com> (fscache, keys)
Acked-by: Steven Whitehouse <swhiteho@redhat.com> (gfs2)
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steve French <sfrench@samba.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/20140707051603.28027.72349.stgit@notabene.brown
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-07-07 13:16:04 +08:00
|
|
|
wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2007-10-17 14:30:44 +08:00
|
|
|
|
2015-05-29 02:50:49 +08:00
|
|
|
#ifdef CONFIG_CGROUP_WRITEBACK
|
|
|
|
|
2015-06-02 22:39:48 +08:00
|
|
|
#include <linux/cgroup.h>
|
|
|
|
#include <linux/bio.h>
|
|
|
|
|
2015-05-29 02:50:49 +08:00
|
|
|
void __inode_attach_wb(struct inode *inode, struct page *page);
|
2015-06-02 22:39:48 +08:00
|
|
|
void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
|
|
|
|
struct inode *inode)
|
|
|
|
__releases(&inode->i_lock);
|
|
|
|
void wbc_detach_inode(struct writeback_control *wbc);
|
2015-05-29 02:50:51 +08:00
|
|
|
void wbc_account_io(struct writeback_control *wbc, struct page *page,
|
|
|
|
size_t bytes);
|
2016-03-01 07:28:53 +08:00
|
|
|
void cgroup_writeback_umount(void);
|
2015-05-29 02:50:49 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* inode_attach_wb - associate an inode with its wb
|
|
|
|
* @inode: inode of interest
|
|
|
|
* @page: page being dirtied (may be NULL)
|
|
|
|
*
|
|
|
|
* If @inode doesn't have its wb, associate it with the wb matching the
|
|
|
|
* memcg of @page or, if @page is NULL, %current. May be called w/ or w/o
|
|
|
|
* @inode->i_lock.
|
|
|
|
*/
|
|
|
|
static inline void inode_attach_wb(struct inode *inode, struct page *page)
|
|
|
|
{
|
|
|
|
if (!inode->i_wb)
|
|
|
|
__inode_attach_wb(inode, page);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* inode_detach_wb - disassociate an inode from its wb
|
|
|
|
* @inode: inode of interest
|
|
|
|
*
|
|
|
|
* @inode is being freed. Detach from its wb.
|
|
|
|
*/
|
|
|
|
static inline void inode_detach_wb(struct inode *inode)
|
|
|
|
{
|
|
|
|
if (inode->i_wb) {
|
|
|
|
wb_put(inode->i_wb);
|
|
|
|
inode->i_wb = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-06-02 22:39:48 +08:00
|
|
|
/**
|
|
|
|
* wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite
|
|
|
|
* @wbc: writeback_control of interest
|
|
|
|
* @inode: target inode
|
|
|
|
*
|
|
|
|
* This function is to be used by __filemap_fdatawrite_range(), which is an
|
|
|
|
* alternative entry point into writeback code, and first ensures @inode is
|
|
|
|
* associated with a bdi_writeback and attaches it to @wbc.
|
|
|
|
*/
|
|
|
|
static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
|
|
|
|
struct inode *inode)
|
|
|
|
{
|
|
|
|
spin_lock(&inode->i_lock);
|
|
|
|
inode_attach_wb(inode, NULL);
|
|
|
|
wbc_attach_and_unlock_inode(wbc, inode);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* wbc_init_bio - writeback specific initializtion of bio
|
|
|
|
* @wbc: writeback_control for the writeback in progress
|
|
|
|
* @bio: bio to be initialized
|
|
|
|
*
|
|
|
|
* @bio is a part of the writeback in progress controlled by @wbc. Perform
|
|
|
|
* writeback specific initialization. This is used to apply the cgroup
|
|
|
|
* writeback context.
|
|
|
|
*/
|
|
|
|
static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* pageout() path doesn't attach @wbc to the inode being written
|
|
|
|
* out. This is intentional as we don't want the function to block
|
|
|
|
* behind a slow cgroup. Ultimately, we want pageout() to kick off
|
|
|
|
* regular writeback instead of writing things out itself.
|
|
|
|
*/
|
|
|
|
if (wbc->wb)
|
|
|
|
bio_associate_blkcg(bio, wbc->wb->blkcg_css);
|
|
|
|
}
|
|
|
|
|
2015-05-29 02:50:49 +08:00
|
|
|
#else /* CONFIG_CGROUP_WRITEBACK */
|
|
|
|
|
|
|
|
static inline void inode_attach_wb(struct inode *inode, struct page *page)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void inode_detach_wb(struct inode *inode)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2015-06-02 22:39:48 +08:00
|
|
|
static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
|
|
|
|
struct inode *inode)
|
|
|
|
__releases(&inode->i_lock)
|
|
|
|
{
|
|
|
|
spin_unlock(&inode->i_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
|
|
|
|
struct inode *inode)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void wbc_detach_inode(struct writeback_control *wbc)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2015-05-29 02:50:51 +08:00
|
|
|
static inline void wbc_account_io(struct writeback_control *wbc,
|
|
|
|
struct page *page, size_t bytes)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2016-03-01 07:28:53 +08:00
|
|
|
static inline void cgroup_writeback_umount(void)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2015-05-29 02:50:49 +08:00
|
|
|
#endif /* CONFIG_CGROUP_WRITEBACK */
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* mm/page-writeback.c
|
|
|
|
*/
|
2010-05-20 15:18:47 +08:00
|
|
|
#ifdef CONFIG_BLOCK
|
2010-04-06 20:25:14 +08:00
|
|
|
void laptop_io_completion(struct backing_dev_info *info);
|
2005-04-17 06:20:36 +08:00
|
|
|
void laptop_sync_completion(void);
|
2010-04-06 20:25:14 +08:00
|
|
|
void laptop_mode_sync(struct work_struct *work);
|
|
|
|
void laptop_mode_timer_fn(unsigned long data);
|
2010-05-20 15:18:47 +08:00
|
|
|
#else
|
|
|
|
static inline void laptop_sync_completion(void) { }
|
|
|
|
#endif
|
2007-03-01 12:13:21 +08:00
|
|
|
void throttle_vm_writeout(gfp_t gfp_mask);
|
mm: try to distribute dirty pages fairly across zones
The maximum number of dirty pages that exist in the system at any time is
determined by a number of pages considered dirtyable and a user-configured
percentage of those, or an absolute number in bytes.
This number of dirtyable pages is the sum of memory provided by all the
zones in the system minus their lowmem reserves and high watermarks, so
that the system can retain a healthy number of free pages without having
to reclaim dirty pages.
But there is a flaw in that we have a zoned page allocator which does not
care about the global state but rather the state of individual memory
zones. And right now there is nothing that prevents one zone from filling
up with dirty pages while other zones are spared, which frequently leads
to situations where kswapd, in order to restore the watermark of free
pages, does indeed have to write pages from that zone's LRU list. This
can interfere so badly with IO from the flusher threads that major
filesystems (btrfs, xfs, ext4) mostly ignore write requests from reclaim
already, taking away the VM's only possibility to keep such a zone
balanced, aside from hoping the flushers will soon clean pages from that
zone.
Enter per-zone dirty limits. They are to a zone's dirtyable memory what
the global limit is to the global amount of dirtyable memory, and try to
make sure that no single zone receives more than its fair share of the
globally allowed dirty pages in the first place. As the number of pages
considered dirtyable excludes the zones' lowmem reserves and high
watermarks, the maximum number of dirty pages in a zone is such that the
zone can always be balanced without requiring page cleaning.
As this is a placement decision in the page allocator and pages are
dirtied only after the allocation, this patch allows allocators to pass
__GFP_WRITE when they know in advance that the page will be written to and
become dirty soon. The page allocator will then attempt to allocate from
the first zone of the zonelist - which on NUMA is determined by the task's
NUMA memory policy - that has not exceeded its dirty limit.
At first glance, it would appear that the diversion to lower zones can
increase pressure on them, but this is not the case. With a full high
zone, allocations will be diverted to lower zones eventually, so it is
more of a shift in timing of the lower zone allocations. Workloads that
previously could fit their dirty pages completely in the higher zone may
be forced to allocate from lower zones, but the amount of pages that
"spill over" are limited themselves by the lower zones' dirty constraints,
and thus unlikely to become a problem.
For now, the problem of unfair dirty page distribution remains for NUMA
configurations where the zones allowed for allocation are in sum not big
enough to trigger the global dirty limits, wake up the flusher threads and
remedy the situation. Because of this, an allocation that could not
succeed on any of the considered zones is allowed to ignore the dirty
limits before going into direct reclaim or even failing the allocation,
until a future patch changes the global dirty throttling and flusher
thread activation so that they take individual zone states into account.
Test results
15M DMA + 3246M DMA32 + 504 Normal = 3765M memory
40% dirty ratio
16G USB thumb drive
10 runs of dd if=/dev/zero of=disk/zeroes bs=32k count=$((10 << 15))
seconds nr_vmscan_write
(stddev) min| median| max
xfs
vanilla: 549.747( 3.492) 0.000| 0.000| 0.000
patched: 550.996( 3.802) 0.000| 0.000| 0.000
fuse-ntfs
vanilla: 1183.094(53.178) 54349.000| 59341.000| 65163.000
patched: 558.049(17.914) 0.000| 0.000| 43.000
btrfs
vanilla: 573.679(14.015) 156657.000| 460178.000| 606926.000
patched: 563.365(11.368) 0.000| 0.000| 1362.000
ext4
vanilla: 561.197(15.782) 0.000|2725438.000|4143837.000
patched: 568.806(17.496) 0.000| 0.000| 0.000
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-01-11 07:07:49 +08:00
|
|
|
bool zone_dirty_ok(struct zone *zone);
|
2015-05-23 06:23:21 +08:00
|
|
|
int wb_domain_init(struct wb_domain *dom, gfp_t gfp);
|
2015-05-23 06:23:33 +08:00
|
|
|
#ifdef CONFIG_CGROUP_WRITEBACK
|
|
|
|
void wb_domain_exit(struct wb_domain *dom);
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2015-05-23 06:23:22 +08:00
|
|
|
extern struct wb_domain global_wb_domain;
|
2011-03-03 05:54:09 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* These are exported to sysctl. */
extern int dirty_background_ratio;
extern unsigned long dirty_background_bytes;
extern int vm_dirty_ratio;
extern unsigned long vm_dirty_bytes;
extern unsigned int dirty_writeback_interval;
extern unsigned int dirty_expire_interval;
2015-03-18 00:23:32 +08:00
|
|
|
extern unsigned int dirtytime_expire_interval;
|
2008-02-05 14:29:20 +08:00
|
|
|
extern int vm_highmem_is_dirtyable;
|
2005-04-17 06:20:36 +08:00
|
|
|
extern int block_dump;
|
|
|
|
extern int laptop_mode;
|
|
|
|
|
mm: add dirty_background_bytes and dirty_bytes sysctls
This change introduces two new sysctls to /proc/sys/vm:
dirty_background_bytes and dirty_bytes.
dirty_background_bytes is the counterpart to dirty_background_ratio and
dirty_bytes is the counterpart to dirty_ratio.
With growing memory capacities of individual machines, it's no longer
sufficient to specify dirty thresholds as a percentage of the amount of
dirtyable memory over the entire system.
dirty_background_bytes and dirty_bytes specify quantities of memory, in
bytes, that represent the dirty limits for the entire system. If either
of these values is set, its value represents the amount of dirty memory
that is needed to commence either background or direct writeback.
When a `bytes' or `ratio' file is written, its counterpart becomes a
function of the written value. For example, if dirty_bytes is written to
be 8096, 8K of memory is required to commence direct writeback.
dirty_ratio is then functionally equivalent to 8K / the amount of
dirtyable memory:
dirtyable_memory = free pages + mapped pages + file cache
dirty_background_bytes = dirty_background_ratio * dirtyable_memory
-or-
dirty_background_ratio = dirty_background_bytes / dirtyable_memory
AND
dirty_bytes = dirty_ratio * dirtyable_memory
-or-
dirty_ratio = dirty_bytes / dirtyable_memory
Only one of dirty_background_bytes and dirty_background_ratio may be
specified at a time, and only one of dirty_bytes and dirty_ratio may be
specified. When one sysctl is written, the other appears as 0 when read.
The `bytes' files operate on a page size granularity since dirty limits
are compared with ZVC values, which are in page units.
Prior to this change, the minimum dirty_ratio was 5 as implemented by
get_dirty_limits() although /proc/sys/vm/dirty_ratio would show any user
written value between 0 and 100. This restriction is maintained, but
dirty_bytes has a lower limit of only one page.
Also prior to this change, the dirty_background_ratio could not equal or
exceed dirty_ratio. This restriction is maintained in addition to
restricting dirty_background_bytes. If either background threshold equals
or exceeds that of the dirty threshold, it is implicitly set to half the
dirty threshold.
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Andrea Righi <righi.andrea@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:39:31 +08:00
|
|
|
extern int dirty_background_ratio_handler(struct ctl_table *table, int write,
|
2009-09-24 06:57:19 +08:00
|
|
|
void __user *buffer, size_t *lenp,
|
mm: add dirty_background_bytes and dirty_bytes sysctls
This change introduces two new sysctls to /proc/sys/vm:
dirty_background_bytes and dirty_bytes.
dirty_background_bytes is the counterpart to dirty_background_ratio and
dirty_bytes is the counterpart to dirty_ratio.
With growing memory capacities of individual machines, it's no longer
sufficient to specify dirty thresholds as a percentage of the amount of
dirtyable memory over the entire system.
dirty_background_bytes and dirty_bytes specify quantities of memory, in
bytes, that represent the dirty limits for the entire system. If either
of these values is set, its value represents the amount of dirty memory
that is needed to commence either background or direct writeback.
When a `bytes' or `ratio' file is written, its counterpart becomes a
function of the written value. For example, if dirty_bytes is written to
be 8096, 8K of memory is required to commence direct writeback.
dirty_ratio is then functionally equivalent to 8K / the amount of
dirtyable memory:
dirtyable_memory = free pages + mapped pages + file cache
dirty_background_bytes = dirty_background_ratio * dirtyable_memory
-or-
dirty_background_ratio = dirty_background_bytes / dirtyable_memory
AND
dirty_bytes = dirty_ratio * dirtyable_memory
-or-
dirty_ratio = dirty_bytes / dirtyable_memory
Only one of dirty_background_bytes and dirty_background_ratio may be
specified at a time, and only one of dirty_bytes and dirty_ratio may be
specified. When one sysctl is written, the other appears as 0 when read.
The `bytes' files operate on a page size granularity since dirty limits
are compared with ZVC values, which are in page units.
Prior to this change, the minimum dirty_ratio was 5 as implemented by
get_dirty_limits() although /proc/sys/vm/dirty_ratio would show any user
written value between 0 and 100. This restriction is maintained, but
dirty_bytes has a lower limit of only one page.
Also prior to this change, the dirty_background_ratio could not equal or
exceed dirty_ratio. This restriction is maintained in addition to
restricting dirty_background_bytes. If either background threshold equals
or exceeds that of the dirty threshold, it is implicitly set to half the
dirty threshold.
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Andrea Righi <righi.andrea@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:39:31 +08:00
|
|
|
loff_t *ppos);
|
|
|
|
extern int dirty_background_bytes_handler(struct ctl_table *table, int write,
|
2009-09-24 06:57:19 +08:00
|
|
|
void __user *buffer, size_t *lenp,
|
mm: add dirty_background_bytes and dirty_bytes sysctls
This change introduces two new sysctls to /proc/sys/vm:
dirty_background_bytes and dirty_bytes.
dirty_background_bytes is the counterpart to dirty_background_ratio and
dirty_bytes is the counterpart to dirty_ratio.
With growing memory capacities of individual machines, it's no longer
sufficient to specify dirty thresholds as a percentage of the amount of
dirtyable memory over the entire system.
dirty_background_bytes and dirty_bytes specify quantities of memory, in
bytes, that represent the dirty limits for the entire system. If either
of these values is set, its value represents the amount of dirty memory
that is needed to commence either background or direct writeback.
When a `bytes' or `ratio' file is written, its counterpart becomes a
function of the written value. For example, if dirty_bytes is written to
be 8096, 8K of memory is required to commence direct writeback.
dirty_ratio is then functionally equivalent to 8K / the amount of
dirtyable memory:
dirtyable_memory = free pages + mapped pages + file cache
dirty_background_bytes = dirty_background_ratio * dirtyable_memory
-or-
dirty_background_ratio = dirty_background_bytes / dirtyable_memory
AND
dirty_bytes = dirty_ratio * dirtyable_memory
-or-
dirty_ratio = dirty_bytes / dirtyable_memory
Only one of dirty_background_bytes and dirty_background_ratio may be
specified at a time, and only one of dirty_bytes and dirty_ratio may be
specified. When one sysctl is written, the other appears as 0 when read.
The `bytes' files operate on a page size granularity since dirty limits
are compared with ZVC values, which are in page units.
Prior to this change, the minimum dirty_ratio was 5 as implemented by
get_dirty_limits() although /proc/sys/vm/dirty_ratio would show any user
written value between 0 and 100. This restriction is maintained, but
dirty_bytes has a lower limit of only one page.
Also prior to this change, the dirty_background_ratio could not equal or
exceed dirty_ratio. This restriction is maintained in addition to
restricting dirty_background_bytes. If either background threshold equals
or exceeds that of the dirty threshold, it is implicitly set to half the
dirty threshold.
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Andrea Righi <righi.andrea@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:39:31 +08:00
|
|
|
loff_t *ppos);
|
2007-10-17 14:25:50 +08:00
|
|
|
extern int dirty_ratio_handler(struct ctl_table *table, int write,
|
2009-09-24 06:57:19 +08:00
|
|
|
void __user *buffer, size_t *lenp,
|
2007-10-17 14:25:50 +08:00
|
|
|
loff_t *ppos);
|
mm: add dirty_background_bytes and dirty_bytes sysctls
This change introduces two new sysctls to /proc/sys/vm:
dirty_background_bytes and dirty_bytes.
dirty_background_bytes is the counterpart to dirty_background_ratio and
dirty_bytes is the counterpart to dirty_ratio.
With growing memory capacities of individual machines, it's no longer
sufficient to specify dirty thresholds as a percentage of the amount of
dirtyable memory over the entire system.
dirty_background_bytes and dirty_bytes specify quantities of memory, in
bytes, that represent the dirty limits for the entire system. If either
of these values is set, its value represents the amount of dirty memory
that is needed to commence either background or direct writeback.
When a `bytes' or `ratio' file is written, its counterpart becomes a
function of the written value. For example, if dirty_bytes is written to
be 8096, 8K of memory is required to commence direct writeback.
dirty_ratio is then functionally equivalent to 8K / the amount of
dirtyable memory:
dirtyable_memory = free pages + mapped pages + file cache
dirty_background_bytes = dirty_background_ratio * dirtyable_memory
-or-
dirty_background_ratio = dirty_background_bytes / dirtyable_memory
AND
dirty_bytes = dirty_ratio * dirtyable_memory
-or-
dirty_ratio = dirty_bytes / dirtyable_memory
Only one of dirty_background_bytes and dirty_background_ratio may be
specified at a time, and only one of dirty_bytes and dirty_ratio may be
specified. When one sysctl is written, the other appears as 0 when read.
The `bytes' files operate on a page size granularity since dirty limits
are compared with ZVC values, which are in page units.
Prior to this change, the minimum dirty_ratio was 5 as implemented by
get_dirty_limits() although /proc/sys/vm/dirty_ratio would show any user
written value between 0 and 100. This restriction is maintained, but
dirty_bytes has a lower limit of only one page.
Also prior to this change, the dirty_background_ratio could not equal or
exceed dirty_ratio. This restriction is maintained in addition to
restricting dirty_background_bytes. If either background threshold equals
or exceeds that of the dirty threshold, it is implicitly set to half the
dirty threshold.
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Andrea Righi <righi.andrea@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:39:31 +08:00
|
|
|
extern int dirty_bytes_handler(struct ctl_table *table, int write,
|
2009-09-24 06:57:19 +08:00
|
|
|
void __user *buffer, size_t *lenp,
|
mm: add dirty_background_bytes and dirty_bytes sysctls
This change introduces two new sysctls to /proc/sys/vm:
dirty_background_bytes and dirty_bytes.
dirty_background_bytes is the counterpart to dirty_background_ratio and
dirty_bytes is the counterpart to dirty_ratio.
With growing memory capacities of individual machines, it's no longer
sufficient to specify dirty thresholds as a percentage of the amount of
dirtyable memory over the entire system.
dirty_background_bytes and dirty_bytes specify quantities of memory, in
bytes, that represent the dirty limits for the entire system. If either
of these values is set, its value represents the amount of dirty memory
that is needed to commence either background or direct writeback.
When a `bytes' or `ratio' file is written, its counterpart becomes a
function of the written value. For example, if dirty_bytes is written to
be 8096, 8K of memory is required to commence direct writeback.
dirty_ratio is then functionally equivalent to 8K / the amount of
dirtyable memory:
dirtyable_memory = free pages + mapped pages + file cache
dirty_background_bytes = dirty_background_ratio * dirtyable_memory
-or-
dirty_background_ratio = dirty_background_bytes / dirtyable_memory
AND
dirty_bytes = dirty_ratio * dirtyable_memory
-or-
dirty_ratio = dirty_bytes / dirtyable_memory
Only one of dirty_background_bytes and dirty_background_ratio may be
specified at a time, and only one of dirty_bytes and dirty_ratio may be
specified. When one sysctl is written, the other appears as 0 when read.
The `bytes' files operate on a page size granularity since dirty limits
are compared with ZVC values, which are in page units.
Prior to this change, the minimum dirty_ratio was 5 as implemented by
get_dirty_limits() although /proc/sys/vm/dirty_ratio would show any user
written value between 0 and 100. This restriction is maintained, but
dirty_bytes has a lower limit of only one page.
Also prior to this change, the dirty_background_ratio could not equal or
exceed dirty_ratio. This restriction is maintained in addition to
restricting dirty_background_bytes. If either background threshold equals
or exceeds that of the dirty threshold, it is implicitly set to half the
dirty threshold.
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Andrea Righi <righi.andrea@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:39:31 +08:00
|
|
|
loff_t *ppos);
|
2015-03-18 00:23:32 +08:00
|
|
|
int dirtytime_interval_handler(struct ctl_table *table, int write,
|
|
|
|
void __user *buffer, size_t *lenp, loff_t *ppos);
|
2007-10-17 14:25:50 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
struct ctl_table;
|
2009-09-24 06:57:19 +08:00
|
|
|
int dirty_writeback_centisecs_handler(struct ctl_table *, int,
|
2005-04-17 06:20:36 +08:00
|
|
|
void __user *, size_t *, loff_t *);
|
|
|
|
|
2010-08-12 05:17:39 +08:00
|
|
|
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
|
2015-05-23 06:23:19 +08:00
|
|
|
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
|
writeback: move bandwidth related fields from backing_dev_info into bdi_writeback
Currently, a bdi (backing_dev_info) embeds single wb (bdi_writeback)
and the role of the separation is unclear. For cgroup support for
writeback IOs, a bdi will be updated to host multiple wb's where each
wb serves writeback IOs of a different cgroup on the bdi. To achieve
that, a wb should carry all states necessary for servicing writeback
IOs for a cgroup independently.
This patch moves bandwidth related fields from backing_dev_info into
bdi_writeback.
* The moved fields are: bw_time_stamp, dirtied_stamp, written_stamp,
write_bandwidth, avg_write_bandwidth, dirty_ratelimit,
balanced_dirty_ratelimit, completions and dirty_exceeded.
* writeback_chunk_size() and over_bground_thresh() now take @wb
instead of @bdi.
* bdi_writeout_fraction(bdi, ...) -> wb_writeout_fraction(wb, ...)
bdi_dirty_limit(bdi, ...) -> wb_dirty_limit(wb, ...)
bdi_position_ration(bdi, ...) -> wb_position_ratio(wb, ...)
bdi_update_writebandwidth(bdi, ...) -> wb_update_write_bandwidth(wb, ...)
[__]bdi_update_bandwidth(bdi, ...) -> [__]wb_update_bandwidth(wb, ...)
bdi_{max|min}_pause(bdi, ...) -> wb_{max|min}_pause(wb, ...)
bdi_dirty_limits(bdi, ...) -> wb_dirty_limits(wb, ...)
* Init/exits of the relocated fields are moved to bdi_wb_init/exit()
respectively. Note that explicit zeroing is dropped in the process
as wb's are cleared in entirety anyway.
* As there's still only one bdi_writeback per backing_dev_info, all
uses of bdi->stat[] are mechanically replaced with bdi->wb.stat[]
introducing no behavior changes.
v2: Typo in description fixed as suggested by Jan.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
2015-05-23 05:13:28 +08:00
|
|
|
|
2015-05-23 06:23:20 +08:00
|
|
|
void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time);
|
2005-04-17 06:20:36 +08:00
|
|
|
void page_writeback_init(void);
|
2012-12-12 08:00:21 +08:00
|
|
|
void balance_dirty_pages_ratelimited(struct address_space *mapping);
|
2015-05-23 06:23:31 +08:00
|
|
|
bool wb_over_bg_thresh(struct bdi_writeback *wb);
|
2006-03-24 19:18:10 +08:00
|
|
|
|
2007-05-11 13:22:51 +08:00
|
|
|
typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
|
|
|
|
void *data);
|
|
|
|
|
|
|
|
int generic_writepages(struct address_space *mapping,
|
|
|
|
struct writeback_control *wbc);
|
ext4: implement writeback livelock avoidance using page tagging
This is analogous to Jan Kara's commit,
f446daaea9d4a420d16c606f755f3689dcb2d0ce
mm: implement writeback livelock avoidance using page tagging
but since we forked write_cache_pages, we need to reimplement
it there (and in ext4_da_writepages, since range_cyclic handling
was moved to there)
If you start a large buffered IO to a file, and then set
fsync after it, you'll find that fsync does not complete
until the other IO stops.
If you continue re-dirtying the file (say, putting dd
with conv=notrunc in a loop), when fsync finally completes
(after all IO is done), it reports via tracing that
it has written many more pages than the file contains;
in other words it has synced and re-synced pages in
the file multiple times.
This then leads to problems with our writeback_index
update, since it advances it by pages written, and
essentially sets writeback_index off the end of the
file...
With the following patch, we only sync as much as was
dirty at the time of the sync.
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2010-10-28 09:30:13 +08:00
|
|
|
void tag_pages_for_writeback(struct address_space *mapping,
|
|
|
|
pgoff_t start, pgoff_t end);
|
2007-05-11 13:22:51 +08:00
|
|
|
int write_cache_pages(struct address_space *mapping,
|
|
|
|
struct writeback_control *wbc, writepage_t writepage,
|
|
|
|
void *data);
|
2005-04-17 06:20:36 +08:00
|
|
|
int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
|
2006-09-29 17:01:25 +08:00
|
|
|
void writeback_set_ratelimit(void);
|
2010-10-27 05:22:03 +08:00
|
|
|
void tag_pages_for_writeback(struct address_space *mapping,
|
|
|
|
pgoff_t start, pgoff_t end);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-08-09 05:22:00 +08:00
|
|
|
void account_page_redirty(struct page *page);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif /* WRITEBACK_H */
|