Btrfs: add a priority queue to the async thread helpers

Btrfs is using WRITE_SYNC_PLUG to send down synchronous IOs with a
higher priority.  But, the checksumming helper threads prevent it
from being fully effective.

There are two problems.  First, a big queue of pending checksumming
will delay the synchronous IO behind other lower priority writes.  Second,
the checksumming uses an ordered async work queue.  The ordering makes sure
that IOs are sent to the block layer in the same order they are sent
to the checksumming threads.  Usually this gives us less seeky IO.

But, when we start mixing IO priorities, the lower priority IO can delay
the higher priority IO.

This patch solves both problems by adding a high priority list to the async
helper threads, and a new btrfs_set_work_high_prio(), which is used
to put a new async work item onto the higher priority list.

The ordering is still done on high priority IO, but all of the high
priority bios are ordered separately from the low priority bios.  This
ordering is purely an IO optimization, it is not involved in data
or metadata integrity.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
This commit is contained in:
Chris Mason 2009-04-20 15:50:09 -04:00
parent ffbd517d5a
commit d313d7a31a
5 changed files with 56 additions and 15 deletions

View File

@ -25,6 +25,7 @@
#define WORK_QUEUED_BIT 0 #define WORK_QUEUED_BIT 0
#define WORK_DONE_BIT 1 #define WORK_DONE_BIT 1
#define WORK_ORDER_DONE_BIT 2 #define WORK_ORDER_DONE_BIT 2
#define WORK_HIGH_PRIO_BIT 3
/* /*
* container for the kthread task pointer and the list of pending work * container for the kthread task pointer and the list of pending work
@ -36,6 +37,7 @@ struct btrfs_worker_thread {
/* list of struct btrfs_work that are waiting for service */ /* list of struct btrfs_work that are waiting for service */
struct list_head pending; struct list_head pending;
struct list_head prio_pending;
/* list of worker threads from struct btrfs_workers */ /* list of worker threads from struct btrfs_workers */
struct list_head worker_list; struct list_head worker_list;
@ -103,10 +105,16 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
spin_lock_irqsave(&workers->lock, flags); spin_lock_irqsave(&workers->lock, flags);
while (!list_empty(&workers->order_list)) { while (1) {
work = list_entry(workers->order_list.next, if (!list_empty(&workers->prio_order_list)) {
struct btrfs_work, order_list); work = list_entry(workers->prio_order_list.next,
struct btrfs_work, order_list);
} else if (!list_empty(&workers->order_list)) {
work = list_entry(workers->order_list.next,
struct btrfs_work, order_list);
} else {
break;
}
if (!test_bit(WORK_DONE_BIT, &work->flags)) if (!test_bit(WORK_DONE_BIT, &work->flags))
break; break;
@ -143,8 +151,14 @@ static int worker_loop(void *arg)
do { do {
spin_lock_irq(&worker->lock); spin_lock_irq(&worker->lock);
again_locked: again_locked:
while (!list_empty(&worker->pending)) { while (1) {
cur = worker->pending.next; if (!list_empty(&worker->prio_pending))
cur = worker->prio_pending.next;
else if (!list_empty(&worker->pending))
cur = worker->pending.next;
else
break;
work = list_entry(cur, struct btrfs_work, list); work = list_entry(cur, struct btrfs_work, list);
list_del(&work->list); list_del(&work->list);
clear_bit(WORK_QUEUED_BIT, &work->flags); clear_bit(WORK_QUEUED_BIT, &work->flags);
@ -163,7 +177,6 @@ again_locked:
spin_lock_irq(&worker->lock); spin_lock_irq(&worker->lock);
check_idle_worker(worker); check_idle_worker(worker);
} }
if (freezing(current)) { if (freezing(current)) {
worker->working = 0; worker->working = 0;
@ -178,7 +191,8 @@ again_locked:
* jump_in? * jump_in?
*/ */
smp_mb(); smp_mb();
if (!list_empty(&worker->pending)) if (!list_empty(&worker->pending) ||
!list_empty(&worker->prio_pending))
continue; continue;
/* /*
@ -191,7 +205,8 @@ again_locked:
*/ */
schedule_timeout(1); schedule_timeout(1);
smp_mb(); smp_mb();
if (!list_empty(&worker->pending)) if (!list_empty(&worker->pending) ||
!list_empty(&worker->prio_pending))
continue; continue;
if (kthread_should_stop()) if (kthread_should_stop())
@ -200,7 +215,8 @@ again_locked:
/* still no more work?, sleep for real */ /* still no more work?, sleep for real */
spin_lock_irq(&worker->lock); spin_lock_irq(&worker->lock);
set_current_state(TASK_INTERRUPTIBLE); set_current_state(TASK_INTERRUPTIBLE);
if (!list_empty(&worker->pending)) if (!list_empty(&worker->pending) ||
!list_empty(&worker->prio_pending))
goto again_locked; goto again_locked;
/* /*
@ -248,6 +264,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
INIT_LIST_HEAD(&workers->worker_list); INIT_LIST_HEAD(&workers->worker_list);
INIT_LIST_HEAD(&workers->idle_list); INIT_LIST_HEAD(&workers->idle_list);
INIT_LIST_HEAD(&workers->order_list); INIT_LIST_HEAD(&workers->order_list);
INIT_LIST_HEAD(&workers->prio_order_list);
spin_lock_init(&workers->lock); spin_lock_init(&workers->lock);
workers->max_workers = max; workers->max_workers = max;
workers->idle_thresh = 32; workers->idle_thresh = 32;
@ -273,6 +290,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
} }
INIT_LIST_HEAD(&worker->pending); INIT_LIST_HEAD(&worker->pending);
INIT_LIST_HEAD(&worker->prio_pending);
INIT_LIST_HEAD(&worker->worker_list); INIT_LIST_HEAD(&worker->worker_list);
spin_lock_init(&worker->lock); spin_lock_init(&worker->lock);
atomic_set(&worker->num_pending, 0); atomic_set(&worker->num_pending, 0);
@ -396,7 +414,10 @@ int btrfs_requeue_work(struct btrfs_work *work)
goto out; goto out;
spin_lock_irqsave(&worker->lock, flags); spin_lock_irqsave(&worker->lock, flags);
list_add_tail(&work->list, &worker->pending); if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
list_add_tail(&work->list, &worker->prio_pending);
else
list_add_tail(&work->list, &worker->pending);
atomic_inc(&worker->num_pending); atomic_inc(&worker->num_pending);
/* by definition we're busy, take ourselves off the idle /* by definition we're busy, take ourselves off the idle
@ -422,6 +443,11 @@ out:
return 0; return 0;
} }
void btrfs_set_work_high_prio(struct btrfs_work *work)
{
set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
}
/* /*
* places a struct btrfs_work into the pending queue of one of the kthreads * places a struct btrfs_work into the pending queue of one of the kthreads
*/ */
@ -438,7 +464,12 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
worker = find_worker(workers); worker = find_worker(workers);
if (workers->ordered) { if (workers->ordered) {
spin_lock_irqsave(&workers->lock, flags); spin_lock_irqsave(&workers->lock, flags);
list_add_tail(&work->order_list, &workers->order_list); if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
list_add_tail(&work->order_list,
&workers->prio_order_list);
} else {
list_add_tail(&work->order_list, &workers->order_list);
}
spin_unlock_irqrestore(&workers->lock, flags); spin_unlock_irqrestore(&workers->lock, flags);
} else { } else {
INIT_LIST_HEAD(&work->order_list); INIT_LIST_HEAD(&work->order_list);
@ -446,7 +477,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
spin_lock_irqsave(&worker->lock, flags); spin_lock_irqsave(&worker->lock, flags);
list_add_tail(&work->list, &worker->pending); if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
list_add_tail(&work->list, &worker->prio_pending);
else
list_add_tail(&work->list, &worker->pending);
atomic_inc(&worker->num_pending); atomic_inc(&worker->num_pending);
check_busy_worker(worker); check_busy_worker(worker);

View File

@ -85,6 +85,7 @@ struct btrfs_workers {
* of work items waiting for completion * of work items waiting for completion
*/ */
struct list_head order_list; struct list_head order_list;
struct list_head prio_order_list;
/* lock for finding the next worker thread to queue on */ /* lock for finding the next worker thread to queue on */
spinlock_t lock; spinlock_t lock;
@ -98,4 +99,5 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
int btrfs_stop_workers(struct btrfs_workers *workers); int btrfs_stop_workers(struct btrfs_workers *workers);
void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
int btrfs_requeue_work(struct btrfs_work *work); int btrfs_requeue_work(struct btrfs_work *work);
void btrfs_set_work_high_prio(struct btrfs_work *work);
#endif #endif

View File

@ -579,6 +579,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
async->bio_flags = bio_flags; async->bio_flags = bio_flags;
atomic_inc(&fs_info->nr_async_submits); atomic_inc(&fs_info->nr_async_submits);
if (rw & (1 << BIO_RW_SYNCIO))
btrfs_set_work_high_prio(&async->work);
btrfs_queue_worker(&fs_info->workers, &async->work); btrfs_queue_worker(&fs_info->workers, &async->work);
#if 0 #if 0
int limit = btrfs_async_submit_limit(fs_info); int limit = btrfs_async_submit_limit(fs_info);
@ -656,6 +660,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
mirror_num, 0); mirror_num, 0);
} }
/* /*
* kthread helpers are used to submit writes so that checksumming * kthread helpers are used to submit writes so that checksumming
* can happen in parallel across all CPUs * can happen in parallel across all CPUs

View File

@ -2501,7 +2501,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
}; };
struct writeback_control wbc_writepages = { struct writeback_control wbc_writepages = {
.bdi = wbc->bdi, .bdi = wbc->bdi,
.sync_mode = WB_SYNC_NONE, .sync_mode = wbc->sync_mode,
.older_than_this = NULL, .older_than_this = NULL,
.nr_to_write = 64, .nr_to_write = 64,
.range_start = page_offset(page) + PAGE_CACHE_SIZE, .range_start = page_offset(page) + PAGE_CACHE_SIZE,

View File

@ -1131,7 +1131,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
if (will_write) { if (will_write) {
btrfs_fdatawrite_range(inode->i_mapping, pos, btrfs_fdatawrite_range(inode->i_mapping, pos,
pos + write_bytes - 1, pos + write_bytes - 1,
WB_SYNC_NONE); WB_SYNC_ALL);
} else { } else {
balance_dirty_pages_ratelimited_nr(inode->i_mapping, balance_dirty_pages_ratelimited_nr(inode->i_mapping,
num_pages); num_pages);