2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2006-10-04 05:01:26 +08:00
|
|
|
* include/linux/writeback.h
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
#ifndef WRITEBACK_H
|
|
|
|
#define WRITEBACK_H
|
|
|
|
|
Detach sched.h from mm.h
First thing mm.h does is including sched.h solely for can_do_mlock() inline
function which has "current" dereference inside. By dealing with can_do_mlock()
mm.h can be detached from sched.h which is good. See below, why.
This patch
a) removes unconditional inclusion of sched.h from mm.h
b) makes can_do_mlock() normal function in mm/mlock.c
c) exports can_do_mlock() to not break compilation
d) adds sched.h inclusions back to files that were getting it indirectly.
e) adds less bloated headers to some files (asm/signal.h, jiffies.h) that were
getting them indirectly
Net result is:
a) mm.h users would get less code to open, read, preprocess, parse, ... if
they don't need sched.h
b) sched.h stops being dependency for significant number of files:
on x86_64 allmodconfig touching sched.h results in recompile of 4083 files,
after patch it's only 3744 (-8.3%).
Cross-compile tested on
all arm defconfigs, all mips defconfigs, all powerpc defconfigs,
alpha alpha-up
arm
i386 i386-up i386-defconfig i386-allnoconfig
ia64 ia64-up
m68k
mips
parisc parisc-up
powerpc powerpc-up
s390 s390-up
sparc sparc-up
sparc64 sparc64-up
um-x86_64
x86_64 x86_64-up x86_64-defconfig x86_64-allnoconfig
as well as my two usual configs.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-21 05:22:52 +08:00
|
|
|
#include <linux/sched.h>
|
2007-09-21 15:19:54 +08:00
|
|
|
#include <linux/fs.h>
|
Detach sched.h from mm.h
First thing mm.h does is including sched.h solely for can_do_mlock() inline
function which has "current" dereference inside. By dealing with can_do_mlock()
mm.h can be detached from sched.h which is good. See below, why.
This patch
a) removes unconditional inclusion of sched.h from mm.h
b) makes can_do_mlock() normal function in mm/mlock.c
c) exports can_do_mlock() to not break compilation
d) adds sched.h inclusions back to files that were getting it indirectly.
e) adds less bloated headers to some files (asm/signal.h, jiffies.h) that were
getting them indirectly
Net result is:
a) mm.h users would get less code to open, read, preprocess, parse, ... if
they don't need sched.h
b) sched.h stops being dependency for significant number of files:
on x86_64 allmodconfig touching sched.h results in recompile of 4083 files,
after patch it's only 3744 (-8.3%).
Cross-compile tested on
all arm defconfigs, all mips defconfigs, all powerpc defconfigs,
alpha alpha-up
arm
i386 i386-up i386-defconfig i386-allnoconfig
ia64 ia64-up
m68k
mips
parisc parisc-up
powerpc powerpc-up
s390 s390-up
sparc sparc-up
sparc64 sparc64-up
um-x86_64
x86_64 x86_64-up x86_64-defconfig x86_64-allnoconfig
as well as my two usual configs.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-21 05:22:52 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
struct backing_dev_info;
|
|
|
|
|
2011-03-22 19:23:41 +08:00
|
|
|
extern spinlock_t inode_wb_list_lock;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* fs/fs-writeback.c
|
|
|
|
*/
|
|
|
|
enum writeback_sync_modes {
|
|
|
|
WB_SYNC_NONE, /* Don't wait on anything */
|
|
|
|
WB_SYNC_ALL, /* Wait on every mapping */
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A control structure which tells the writeback code what to do. These are
|
|
|
|
* always on the stack, and hence need no locking. They are always initialised
|
|
|
|
* in a manner such that unspecified fields are set to zero.
|
|
|
|
*/
|
|
|
|
struct writeback_control {
|
|
|
|
enum writeback_sync_modes sync_mode;
|
|
|
|
unsigned long *older_than_this; /* If !NULL, only write back inodes
|
|
|
|
older than this */
|
2010-03-12 06:09:47 +08:00
|
|
|
unsigned long wb_start; /* Time writeback_inodes_wb was
|
|
|
|
called. This is needed to avoid
|
|
|
|
extra jobs and livelock */
|
2005-04-17 06:20:36 +08:00
|
|
|
long nr_to_write; /* Write this many pages, and decrement
|
|
|
|
this for each page written */
|
|
|
|
long pages_skipped; /* Pages which were not written */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For a_ops->writepages(): is start or end are non-zero then this is
|
|
|
|
* a hint that the filesystem need only write out the pages inside that
|
|
|
|
* byterange. The byte at `end' is included in the writeout request.
|
|
|
|
*/
|
[PATCH] writeback: fix range handling
When a writeback_control's `start' and `end' fields are used to
indicate a one-byte-range starting at file offset zero, the required
values of .start=0,.end=0 mean that the ->writepages() implementation
has no way of telling that it is being asked to perform a range
request. Because we're currently overloading (start == 0 && end == 0)
to mean "this is not a write-a-range request".
To make all this sane, the patch changes range of writeback_control.
So caller does: If it is calling ->writepages() to write pages, it
sets range (range_start/end or range_cyclic) always.
And if range_cyclic is true, ->writepages() thinks the range is
cyclic, otherwise it just uses range_start and range_end.
This patch does,
- Add LLONG_MAX, LLONG_MIN, ULLONG_MAX to include/linux/kernel.h
-1 is usually ok for range_end (type is long long). But, if someone did,
range_end += val; range_end is "val - 1"
u64val = range_end >> bits; u64val is "~(0ULL)"
or something, they are wrong. So, this adds LLONG_MAX to avoid nasty
things, and uses LLONG_MAX for range_end.
- All callers of ->writepages() sets range_start/end or range_cyclic.
- Fix updates of ->writeback_index. It seems already bit strange.
If it starts at 0 and ended by check of nr_to_write, this last
index may reduce chance to scan end of file. So, this updates
->writeback_index only if range_cyclic is true or whole-file is
scanned.
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Nathan Scott <nathans@sgi.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: Steven French <sfrench@us.ibm.com>
Cc: "Vladimir V. Saveliev" <vs@namesys.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 17:03:26 +08:00
|
|
|
loff_t range_start;
|
|
|
|
loff_t range_end;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-11-17 07:07:01 +08:00
|
|
|
unsigned nonblocking:1; /* Don't get stuck on request queues */
|
|
|
|
unsigned encountered_congestion:1; /* An output: a queue is full */
|
|
|
|
unsigned for_kupdate:1; /* A kupdate writeback */
|
2009-12-03 20:54:25 +08:00
|
|
|
unsigned for_background:1; /* A background writeback */
|
2005-11-17 07:07:01 +08:00
|
|
|
unsigned for_reclaim:1; /* Invoked from the page allocator */
|
[PATCH] writeback: fix range handling
When a writeback_control's `start' and `end' fields are used to
indicate a one-byte-range starting at file offset zero, the required
values of .start=0,.end=0 mean that the ->writepages() implementation
has no way of telling that it is being asked to perform a range
request. Because we're currently overloading (start == 0 && end == 0)
to mean "this is not a write-a-range request".
To make all this sane, the patch changes range of writeback_control.
So caller does: If it is calling ->writepages() to write pages, it
sets range (range_start/end or range_cyclic) always.
And if range_cyclic is true, ->writepages() thinks the range is
cyclic, otherwise it just uses range_start and range_end.
This patch does,
- Add LLONG_MAX, LLONG_MIN, ULLONG_MAX to include/linux/kernel.h
-1 is usually ok for range_end (type is long long). But, if someone did,
range_end += val; range_end is "val - 1"
u64val = range_end >> bits; u64val is "~(0ULL)"
or something, they are wrong. So, this adds LLONG_MAX to avoid nasty
things, and uses LLONG_MAX for range_end.
- All callers of ->writepages() sets range_start/end or range_cyclic.
- Fix updates of ->writeback_index. It seems already bit strange.
If it starts at 0 and ended by check of nr_to_write, this last
index may reduce chance to scan end of file. So, this updates
->writeback_index only if range_cyclic is true or whole-file is
scanned.
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Nathan Scott <nathans@sgi.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: Steven French <sfrench@us.ibm.com>
Cc: "Vladimir V. Saveliev" <vs@namesys.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 17:03:26 +08:00
|
|
|
unsigned range_cyclic:1; /* range_start is cyclic */
|
writeback: speed up writeback of big dirty files
After making dirty a 100M file, the normal behavior is to start the
writeback for all data after 30s delays. But sometimes the following
happens instead:
- after 30s: ~4M
- after 5s: ~4M
- after 5s: all remaining 92M
Some analyze shows that the internal io dispatch queues goes like this:
s_io s_more_io
-------------------------
1) 100M,1K 0
2) 1K 96M
3) 0 96M
1) initial state with a 100M file and a 1K file
2) 4M written, nr_to_write <= 0, so write more
3) 1K written, nr_to_write > 0, no more writes(BUG)
nr_to_write > 0 in (3) fools the upper layer to think that data have all
been written out. The big dirty file is actually still sitting in
s_more_io. We cannot simply splice s_more_io back to s_io as soon as s_io
becomes empty, and let the loop in generic_sync_sb_inodes() continue: this
may starve newly expired inodes in s_dirty. It is also not an option to
draw inodes from both s_more_io and s_dirty, an let the loop go on: this
might lead to live locks, and might also starve other superblocks in sync
time(well kupdate may still starve some superblocks, that's another bug).
We have to return when a full scan of s_io completes. So nr_to_write > 0
does not necessarily mean that "all data are written". This patch
introduces a flag writeback_control.more_io to indicate that more io should
be done. With it the big dirty file no longer has to wait for the next
kupdate invokation 5s later.
In sync_sb_inodes() we only set more_io on super_blocks we actually
visited. This avoids the interaction between two pdflush deamons.
Also in __sync_single_inode() we don't blindly keep requeuing the io if the
filesystem cannot progress. Failing to do so may lead to 100% iowait.
Tested-by: Mike Snitzer <snitzer@gmail.com>
Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
Cc: Michael Rubin <mrubin@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-05 14:29:36 +08:00
|
|
|
unsigned more_io:1; /* more io to be dispatched */
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* fs/fs-writeback.c
|
|
|
|
*/
|
2009-09-09 15:08:54 +08:00
|
|
|
struct bdi_writeback;
|
2005-04-17 06:20:36 +08:00
|
|
|
int inode_wait(void *);
|
2009-09-16 21:13:54 +08:00
|
|
|
void writeback_inodes_sb(struct super_block *);
|
2010-10-29 23:16:17 +08:00
|
|
|
void writeback_inodes_sb_nr(struct super_block *, unsigned long nr);
|
2009-12-23 20:57:07 +08:00
|
|
|
int writeback_inodes_sb_if_idle(struct super_block *);
|
2010-10-29 23:16:17 +08:00
|
|
|
int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr);
|
2009-09-16 21:13:54 +08:00
|
|
|
void sync_inodes_sb(struct super_block *);
|
2010-06-10 18:07:27 +08:00
|
|
|
void writeback_inodes_wb(struct bdi_writeback *wb,
|
|
|
|
struct writeback_control *wbc);
|
2009-09-09 15:08:54 +08:00
|
|
|
long wb_do_writeback(struct bdi_writeback *wb, int force_wait);
|
|
|
|
void wakeup_flusher_threads(long nr_pages);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* writeback.h requires fs.h; it, too, is not included from here. */
|
|
|
|
static inline void wait_on_inode(struct inode *inode)
|
|
|
|
{
|
|
|
|
might_sleep();
|
2009-12-17 21:25:01 +08:00
|
|
|
wait_on_bit(&inode->i_state, __I_NEW, inode_wait, TASK_UNINTERRUPTIBLE);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2007-10-17 14:30:44 +08:00
|
|
|
static inline void inode_sync_wait(struct inode *inode)
|
|
|
|
{
|
|
|
|
might_sleep();
|
|
|
|
wait_on_bit(&inode->i_state, __I_SYNC, inode_wait,
|
|
|
|
TASK_UNINTERRUPTIBLE);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* mm/page-writeback.c
|
|
|
|
*/
|
2010-05-20 15:18:47 +08:00
|
|
|
#ifdef CONFIG_BLOCK
|
2010-04-06 20:25:14 +08:00
|
|
|
void laptop_io_completion(struct backing_dev_info *info);
|
2005-04-17 06:20:36 +08:00
|
|
|
void laptop_sync_completion(void);
|
2010-04-06 20:25:14 +08:00
|
|
|
void laptop_mode_sync(struct work_struct *work);
|
|
|
|
void laptop_mode_timer_fn(unsigned long data);
|
2010-05-20 15:18:47 +08:00
|
|
|
#else
|
|
|
|
static inline void laptop_sync_completion(void) { }
|
|
|
|
#endif
|
2007-03-01 12:13:21 +08:00
|
|
|
void throttle_vm_writeout(gfp_t gfp_mask);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* These are exported to sysctl. */
|
|
|
|
extern int dirty_background_ratio;
|
mm: add dirty_background_bytes and dirty_bytes sysctls
This change introduces two new sysctls to /proc/sys/vm:
dirty_background_bytes and dirty_bytes.
dirty_background_bytes is the counterpart to dirty_background_ratio and
dirty_bytes is the counterpart to dirty_ratio.
With growing memory capacities of individual machines, it's no longer
sufficient to specify dirty thresholds as a percentage of the amount of
dirtyable memory over the entire system.
dirty_background_bytes and dirty_bytes specify quantities of memory, in
bytes, that represent the dirty limits for the entire system. If either
of these values is set, its value represents the amount of dirty memory
that is needed to commence either background or direct writeback.
When a `bytes' or `ratio' file is written, its counterpart becomes a
function of the written value. For example, if dirty_bytes is written to
be 8096, 8K of memory is required to commence direct writeback.
dirty_ratio is then functionally equivalent to 8K / the amount of
dirtyable memory:
dirtyable_memory = free pages + mapped pages + file cache
dirty_background_bytes = dirty_background_ratio * dirtyable_memory
-or-
dirty_background_ratio = dirty_background_bytes / dirtyable_memory
AND
dirty_bytes = dirty_ratio * dirtyable_memory
-or-
dirty_ratio = dirty_bytes / dirtyable_memory
Only one of dirty_background_bytes and dirty_background_ratio may be
specified at a time, and only one of dirty_bytes and dirty_ratio may be
specified. When one sysctl is written, the other appears as 0 when read.
The `bytes' files operate on a page size granularity since dirty limits
are compared with ZVC values, which are in page units.
Prior to this change, the minimum dirty_ratio was 5 as implemented by
get_dirty_limits() although /proc/sys/vm/dirty_ratio would show any user
written value between 0 and 100. This restriction is maintained, but
dirty_bytes has a lower limit of only one page.
Also prior to this change, the dirty_background_ratio could not equal or
exceed dirty_ratio. This restriction is maintained in addition to
restricting dirty_background_bytes. If either background threshold equals
or exceeds that of the dirty threshold, it is implicitly set to half the
dirty threshold.
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Andrea Righi <righi.andrea@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:39:31 +08:00
|
|
|
extern unsigned long dirty_background_bytes;
|
2005-04-17 06:20:36 +08:00
|
|
|
extern int vm_dirty_ratio;
|
mm: add dirty_background_bytes and dirty_bytes sysctls
This change introduces two new sysctls to /proc/sys/vm:
dirty_background_bytes and dirty_bytes.
dirty_background_bytes is the counterpart to dirty_background_ratio and
dirty_bytes is the counterpart to dirty_ratio.
With growing memory capacities of individual machines, it's no longer
sufficient to specify dirty thresholds as a percentage of the amount of
dirtyable memory over the entire system.
dirty_background_bytes and dirty_bytes specify quantities of memory, in
bytes, that represent the dirty limits for the entire system. If either
of these values is set, its value represents the amount of dirty memory
that is needed to commence either background or direct writeback.
When a `bytes' or `ratio' file is written, its counterpart becomes a
function of the written value. For example, if dirty_bytes is written to
be 8096, 8K of memory is required to commence direct writeback.
dirty_ratio is then functionally equivalent to 8K / the amount of
dirtyable memory:
dirtyable_memory = free pages + mapped pages + file cache
dirty_background_bytes = dirty_background_ratio * dirtyable_memory
-or-
dirty_background_ratio = dirty_background_bytes / dirtyable_memory
AND
dirty_bytes = dirty_ratio * dirtyable_memory
-or-
dirty_ratio = dirty_bytes / dirtyable_memory
Only one of dirty_background_bytes and dirty_background_ratio may be
specified at a time, and only one of dirty_bytes and dirty_ratio may be
specified. When one sysctl is written, the other appears as 0 when read.
The `bytes' files operate on a page size granularity since dirty limits
are compared with ZVC values, which are in page units.
Prior to this change, the minimum dirty_ratio was 5 as implemented by
get_dirty_limits() although /proc/sys/vm/dirty_ratio would show any user
written value between 0 and 100. This restriction is maintained, but
dirty_bytes has a lower limit of only one page.
Also prior to this change, the dirty_background_ratio could not equal or
exceed dirty_ratio. This restriction is maintained in addition to
restricting dirty_background_bytes. If either background threshold equals
or exceeds that of the dirty threshold, it is implicitly set to half the
dirty threshold.
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Andrea Righi <righi.andrea@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:39:31 +08:00
|
|
|
extern unsigned long vm_dirty_bytes;
|
2009-04-01 06:23:18 +08:00
|
|
|
extern unsigned int dirty_writeback_interval;
|
|
|
|
extern unsigned int dirty_expire_interval;
|
2008-02-05 14:29:20 +08:00
|
|
|
extern int vm_highmem_is_dirtyable;
|
2005-04-17 06:20:36 +08:00
|
|
|
extern int block_dump;
|
|
|
|
extern int laptop_mode;
|
|
|
|
|
2008-05-13 03:21:04 +08:00
|
|
|
extern unsigned long determine_dirtyable_memory(void);
|
|
|
|
|
mm: add dirty_background_bytes and dirty_bytes sysctls
This change introduces two new sysctls to /proc/sys/vm:
dirty_background_bytes and dirty_bytes.
dirty_background_bytes is the counterpart to dirty_background_ratio and
dirty_bytes is the counterpart to dirty_ratio.
With growing memory capacities of individual machines, it's no longer
sufficient to specify dirty thresholds as a percentage of the amount of
dirtyable memory over the entire system.
dirty_background_bytes and dirty_bytes specify quantities of memory, in
bytes, that represent the dirty limits for the entire system. If either
of these values is set, its value represents the amount of dirty memory
that is needed to commence either background or direct writeback.
When a `bytes' or `ratio' file is written, its counterpart becomes a
function of the written value. For example, if dirty_bytes is written to
be 8096, 8K of memory is required to commence direct writeback.
dirty_ratio is then functionally equivalent to 8K / the amount of
dirtyable memory:
dirtyable_memory = free pages + mapped pages + file cache
dirty_background_bytes = dirty_background_ratio * dirtyable_memory
-or-
dirty_background_ratio = dirty_background_bytes / dirtyable_memory
AND
dirty_bytes = dirty_ratio * dirtyable_memory
-or-
dirty_ratio = dirty_bytes / dirtyable_memory
Only one of dirty_background_bytes and dirty_background_ratio may be
specified at a time, and only one of dirty_bytes and dirty_ratio may be
specified. When one sysctl is written, the other appears as 0 when read.
The `bytes' files operate on a page size granularity since dirty limits
are compared with ZVC values, which are in page units.
Prior to this change, the minimum dirty_ratio was 5 as implemented by
get_dirty_limits() although /proc/sys/vm/dirty_ratio would show any user
written value between 0 and 100. This restriction is maintained, but
dirty_bytes has a lower limit of only one page.
Also prior to this change, the dirty_background_ratio could not equal or
exceed dirty_ratio. This restriction is maintained in addition to
restricting dirty_background_bytes. If either background threshold equals
or exceeds that of the dirty threshold, it is implicitly set to half the
dirty threshold.
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Andrea Righi <righi.andrea@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:39:31 +08:00
|
|
|
extern int dirty_background_ratio_handler(struct ctl_table *table, int write,
|
2009-09-24 06:57:19 +08:00
|
|
|
void __user *buffer, size_t *lenp,
|
mm: add dirty_background_bytes and dirty_bytes sysctls
This change introduces two new sysctls to /proc/sys/vm:
dirty_background_bytes and dirty_bytes.
dirty_background_bytes is the counterpart to dirty_background_ratio and
dirty_bytes is the counterpart to dirty_ratio.
With growing memory capacities of individual machines, it's no longer
sufficient to specify dirty thresholds as a percentage of the amount of
dirtyable memory over the entire system.
dirty_background_bytes and dirty_bytes specify quantities of memory, in
bytes, that represent the dirty limits for the entire system. If either
of these values is set, its value represents the amount of dirty memory
that is needed to commence either background or direct writeback.
When a `bytes' or `ratio' file is written, its counterpart becomes a
function of the written value. For example, if dirty_bytes is written to
be 8096, 8K of memory is required to commence direct writeback.
dirty_ratio is then functionally equivalent to 8K / the amount of
dirtyable memory:
dirtyable_memory = free pages + mapped pages + file cache
dirty_background_bytes = dirty_background_ratio * dirtyable_memory
-or-
dirty_background_ratio = dirty_background_bytes / dirtyable_memory
AND
dirty_bytes = dirty_ratio * dirtyable_memory
-or-
dirty_ratio = dirty_bytes / dirtyable_memory
Only one of dirty_background_bytes and dirty_background_ratio may be
specified at a time, and only one of dirty_bytes and dirty_ratio may be
specified. When one sysctl is written, the other appears as 0 when read.
The `bytes' files operate on a page size granularity since dirty limits
are compared with ZVC values, which are in page units.
Prior to this change, the minimum dirty_ratio was 5 as implemented by
get_dirty_limits() although /proc/sys/vm/dirty_ratio would show any user
written value between 0 and 100. This restriction is maintained, but
dirty_bytes has a lower limit of only one page.
Also prior to this change, the dirty_background_ratio could not equal or
exceed dirty_ratio. This restriction is maintained in addition to
restricting dirty_background_bytes. If either background threshold equals
or exceeds that of the dirty threshold, it is implicitly set to half the
dirty threshold.
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Andrea Righi <righi.andrea@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:39:31 +08:00
|
|
|
loff_t *ppos);
|
|
|
|
extern int dirty_background_bytes_handler(struct ctl_table *table, int write,
|
2009-09-24 06:57:19 +08:00
|
|
|
void __user *buffer, size_t *lenp,
|
mm: add dirty_background_bytes and dirty_bytes sysctls
This change introduces two new sysctls to /proc/sys/vm:
dirty_background_bytes and dirty_bytes.
dirty_background_bytes is the counterpart to dirty_background_ratio and
dirty_bytes is the counterpart to dirty_ratio.
With growing memory capacities of individual machines, it's no longer
sufficient to specify dirty thresholds as a percentage of the amount of
dirtyable memory over the entire system.
dirty_background_bytes and dirty_bytes specify quantities of memory, in
bytes, that represent the dirty limits for the entire system. If either
of these values is set, its value represents the amount of dirty memory
that is needed to commence either background or direct writeback.
When a `bytes' or `ratio' file is written, its counterpart becomes a
function of the written value. For example, if dirty_bytes is written to
be 8096, 8K of memory is required to commence direct writeback.
dirty_ratio is then functionally equivalent to 8K / the amount of
dirtyable memory:
dirtyable_memory = free pages + mapped pages + file cache
dirty_background_bytes = dirty_background_ratio * dirtyable_memory
-or-
dirty_background_ratio = dirty_background_bytes / dirtyable_memory
AND
dirty_bytes = dirty_ratio * dirtyable_memory
-or-
dirty_ratio = dirty_bytes / dirtyable_memory
Only one of dirty_background_bytes and dirty_background_ratio may be
specified at a time, and only one of dirty_bytes and dirty_ratio may be
specified. When one sysctl is written, the other appears as 0 when read.
The `bytes' files operate on a page size granularity since dirty limits
are compared with ZVC values, which are in page units.
Prior to this change, the minimum dirty_ratio was 5 as implemented by
get_dirty_limits() although /proc/sys/vm/dirty_ratio would show any user
written value between 0 and 100. This restriction is maintained, but
dirty_bytes has a lower limit of only one page.
Also prior to this change, the dirty_background_ratio could not equal or
exceed dirty_ratio. This restriction is maintained in addition to
restricting dirty_background_bytes. If either background threshold equals
or exceeds that of the dirty threshold, it is implicitly set to half the
dirty threshold.
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Andrea Righi <righi.andrea@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:39:31 +08:00
|
|
|
loff_t *ppos);
|
2007-10-17 14:25:50 +08:00
|
|
|
extern int dirty_ratio_handler(struct ctl_table *table, int write,
|
2009-09-24 06:57:19 +08:00
|
|
|
void __user *buffer, size_t *lenp,
|
2007-10-17 14:25:50 +08:00
|
|
|
loff_t *ppos);
|
mm: add dirty_background_bytes and dirty_bytes sysctls
This change introduces two new sysctls to /proc/sys/vm:
dirty_background_bytes and dirty_bytes.
dirty_background_bytes is the counterpart to dirty_background_ratio and
dirty_bytes is the counterpart to dirty_ratio.
With growing memory capacities of individual machines, it's no longer
sufficient to specify dirty thresholds as a percentage of the amount of
dirtyable memory over the entire system.
dirty_background_bytes and dirty_bytes specify quantities of memory, in
bytes, that represent the dirty limits for the entire system. If either
of these values is set, its value represents the amount of dirty memory
that is needed to commence either background or direct writeback.
When a `bytes' or `ratio' file is written, its counterpart becomes a
function of the written value. For example, if dirty_bytes is written to
be 8096, 8K of memory is required to commence direct writeback.
dirty_ratio is then functionally equivalent to 8K / the amount of
dirtyable memory:
dirtyable_memory = free pages + mapped pages + file cache
dirty_background_bytes = dirty_background_ratio * dirtyable_memory
-or-
dirty_background_ratio = dirty_background_bytes / dirtyable_memory
AND
dirty_bytes = dirty_ratio * dirtyable_memory
-or-
dirty_ratio = dirty_bytes / dirtyable_memory
Only one of dirty_background_bytes and dirty_background_ratio may be
specified at a time, and only one of dirty_bytes and dirty_ratio may be
specified. When one sysctl is written, the other appears as 0 when read.
The `bytes' files operate on a page size granularity since dirty limits
are compared with ZVC values, which are in page units.
Prior to this change, the minimum dirty_ratio was 5 as implemented by
get_dirty_limits() although /proc/sys/vm/dirty_ratio would show any user
written value between 0 and 100. This restriction is maintained, but
dirty_bytes has a lower limit of only one page.
Also prior to this change, the dirty_background_ratio could not equal or
exceed dirty_ratio. This restriction is maintained in addition to
restricting dirty_background_bytes. If either background threshold equals
or exceeds that of the dirty threshold, it is implicitly set to half the
dirty threshold.
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Andrea Righi <righi.andrea@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:39:31 +08:00
|
|
|
extern int dirty_bytes_handler(struct ctl_table *table, int write,
|
2009-09-24 06:57:19 +08:00
|
|
|
void __user *buffer, size_t *lenp,
|
mm: add dirty_background_bytes and dirty_bytes sysctls
This change introduces two new sysctls to /proc/sys/vm:
dirty_background_bytes and dirty_bytes.
dirty_background_bytes is the counterpart to dirty_background_ratio and
dirty_bytes is the counterpart to dirty_ratio.
With growing memory capacities of individual machines, it's no longer
sufficient to specify dirty thresholds as a percentage of the amount of
dirtyable memory over the entire system.
dirty_background_bytes and dirty_bytes specify quantities of memory, in
bytes, that represent the dirty limits for the entire system. If either
of these values is set, its value represents the amount of dirty memory
that is needed to commence either background or direct writeback.
When a `bytes' or `ratio' file is written, its counterpart becomes a
function of the written value. For example, if dirty_bytes is written to
be 8096, 8K of memory is required to commence direct writeback.
dirty_ratio is then functionally equivalent to 8K / the amount of
dirtyable memory:
dirtyable_memory = free pages + mapped pages + file cache
dirty_background_bytes = dirty_background_ratio * dirtyable_memory
-or-
dirty_background_ratio = dirty_background_bytes / dirtyable_memory
AND
dirty_bytes = dirty_ratio * dirtyable_memory
-or-
dirty_ratio = dirty_bytes / dirtyable_memory
Only one of dirty_background_bytes and dirty_background_ratio may be
specified at a time, and only one of dirty_bytes and dirty_ratio may be
specified. When one sysctl is written, the other appears as 0 when read.
The `bytes' files operate on a page size granularity since dirty limits
are compared with ZVC values, which are in page units.
Prior to this change, the minimum dirty_ratio was 5 as implemented by
get_dirty_limits() although /proc/sys/vm/dirty_ratio would show any user
written value between 0 and 100. This restriction is maintained, but
dirty_bytes has a lower limit of only one page.
Also prior to this change, the dirty_background_ratio could not equal or
exceed dirty_ratio. This restriction is maintained in addition to
restricting dirty_background_bytes. If either background threshold equals
or exceeds that of the dirty threshold, it is implicitly set to half the
dirty threshold.
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Andrea Righi <righi.andrea@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-07 06:39:31 +08:00
|
|
|
loff_t *ppos);
|
2007-10-17 14:25:50 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
struct ctl_table;
|
2009-09-24 06:57:19 +08:00
|
|
|
int dirty_writeback_centisecs_handler(struct ctl_table *, int,
|
2005-04-17 06:20:36 +08:00
|
|
|
void __user *, size_t *, loff_t *);
|
|
|
|
|
2010-08-12 05:17:39 +08:00
|
|
|
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
|
|
|
|
unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
|
|
|
|
unsigned long dirty);
|
2008-04-30 15:54:32 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
void page_writeback_init(void);
|
2006-03-24 19:18:10 +08:00
|
|
|
void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
|
|
|
|
unsigned long nr_pages_dirtied);
|
|
|
|
|
|
|
|
static inline void
|
|
|
|
balance_dirty_pages_ratelimited(struct address_space *mapping)
|
|
|
|
{
|
|
|
|
balance_dirty_pages_ratelimited_nr(mapping, 1);
|
|
|
|
}
|
|
|
|
|
2007-05-11 13:22:51 +08:00
|
|
|
typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
|
|
|
|
void *data);
|
|
|
|
|
|
|
|
int generic_writepages(struct address_space *mapping,
|
|
|
|
struct writeback_control *wbc);
|
ext4: implement writeback livelock avoidance using page tagging
This is analogous to Jan Kara's commit,
f446daaea9d4a420d16c606f755f3689dcb2d0ce
mm: implement writeback livelock avoidance using page tagging
but since we forked write_cache_pages, we need to reimplement
it there (and in ext4_da_writepages, since range_cyclic handling
was moved to there)
If you start a large buffered IO to a file, and then set
fsync after it, you'll find that fsync does not complete
until the other IO stops.
If you continue re-dirtying the file (say, putting dd
with conv=notrunc in a loop), when fsync finally completes
(after all IO is done), it reports via tracing that
it has written many more pages than the file contains;
in other words it has synced and re-synced pages in
the file multiple times.
This then leads to problems with our writeback_index
update, since it advances it by pages written, and
essentially sets writeback_index off the end of the
file...
With the following patch, we only sync as much as was
dirty at the time of the sync.
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2010-10-28 09:30:13 +08:00
|
|
|
void tag_pages_for_writeback(struct address_space *mapping,
|
|
|
|
pgoff_t start, pgoff_t end);
|
2007-05-11 13:22:51 +08:00
|
|
|
int write_cache_pages(struct address_space *mapping,
|
|
|
|
struct writeback_control *wbc, writepage_t writepage,
|
|
|
|
void *data);
|
2005-04-17 06:20:36 +08:00
|
|
|
int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
|
2007-10-09 00:54:37 +08:00
|
|
|
void set_page_dirty_balance(struct page *page, int page_mkwrite);
|
2006-09-29 17:01:25 +08:00
|
|
|
void writeback_set_ratelimit(void);
|
2010-10-27 05:22:03 +08:00
|
|
|
void tag_pages_for_writeback(struct address_space *mapping,
|
|
|
|
pgoff_t start, pgoff_t end);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* pdflush.c */
|
|
|
|
extern int nr_pdflush_threads; /* Global so it can be exported to sysctl
|
|
|
|
read-only. */
|
|
|
|
|
|
|
|
|
|
|
|
#endif /* WRITEBACK_H */
|