Merge branch 'for-3.11/core' of git://git.kernel.dk/linux-block
Pull core block IO updates from Jens Axboe:
 "Here are the core IO block bits for 3.11. It contains:

   - A tweak to the reserved tag logic from Jan, for weirdo devices with
     just 3 free tags. But for those it improves things substantially
     for random writes.

   - Periodic writeback fix from Jan. Marked for stable as well.

   - Fix for a race condition in IO scheduler switching from Jianpeng.

   - The hierarchical blk-cgroup support from Tejun. This is the grunt
     of the series.

   - blk-throttle fix from Vivek.

  Just a note that I'm in the middle of a relocation, whole family is
  flying out tomorrow. Hence I will be AWOL the remainder of this week,
  but back at work again on Monday the 15th.

  CC'ing Tejun, since any potential "surprises" will most likely be from
  the blk-cgroup work. But it's been brewing for a while and sitting in
  my tree and linux-next for a long time, so should be solid."

* 'for-3.11/core' of git://git.kernel.dk/linux-block: (36 commits)
  elevator: Fix a race in elevator switching
  block: Reserve only one queue tag for sync IO if only 3 tags are available
  writeback: Fix periodic writeback after fs mount
  blk-throttle: implement proper hierarchy support
  blk-throttle: implement throtl_grp->has_rules[]
  blk-throttle: Account for child group's start time in parent while bio climbs up
  blk-throttle: add throtl_qnode for dispatch fairness
  blk-throttle: make throtl_pending_timer_fn() ready for hierarchy
  blk-throttle: make tg_dispatch_one_bio() ready for hierarchy
  blk-throttle: make blk_throtl_bio() ready for hierarchy
  blk-throttle: make blk_throtl_drain() ready for hierarchy
  blk-throttle: dispatch from throtl_pending_timer_fn()
  blk-throttle: implement dispatch looping
  blk-throttle: separate out throtl_service_queue->pending_timer from throtl_data->dispatch_work
  blk-throttle: set REQ_THROTTLED from throtl_charge_bio() and gate stats update with it
  blk-throttle: implement sq_to_tg(), sq_to_td() and throtl_log()
  blk-throttle: add throtl_service_queue->parent_sq
  blk-throttle: generalize update_disptime optimization in blk_throtl_bio()
  blk-throttle: dispatch to throtl_data->service_queue.bio_lists[]
  blk-throttle: move bio_lists[] and friends to throtl_service_queue
  ...
commit 36805aaea5

Documentation/cgroups/blkio-controller.txt

@@ -94,11 +94,13 @@ Throttling/Upper Limit policy
 
 Hierarchical Cgroups
 ====================
-- Currently only CFQ supports hierarchical groups. For throttling,
-  cgroup interface does allow creation of hierarchical cgroups and
-  internally it treats them as flat hierarchy.
 
-  If somebody created a hierarchy like as follows.
+Both CFQ and throttling implement hierarchy support; however,
+throttling's hierarchy support is enabled iff "sane_behavior" is
+enabled from cgroup side, which currently is a development option and
+not publicly available.
+
+If somebody created a hierarchy like as follows.
 
 			root
 			/  \
@@ -106,21 +108,20 @@ Hierarchical Cgroups
 			|
 		     test3
 
-  CFQ will handle the hierarchy correctly but and throttling will
-  practically treat all groups at same level. For details on CFQ
-  hierarchy support, refer to Documentation/block/cfq-iosched.txt.
-  Throttling will treat the hierarchy as if it looks like the
-  following.
+CFQ by default and throttling with "sane_behavior" will handle the
+hierarchy correctly. For details on CFQ hierarchy support, refer to
+Documentation/block/cfq-iosched.txt. For throttling, all limits apply
+to the whole subtree while all statistics are local to the IOs
+directly generated by tasks in that cgroup.
+
+Throttling without "sane_behavior" enabled from cgroup side will
+practically treat all groups at same level as if it looks like the
+following.
 
 				pivot
 			     /  /   \  \
 			root  test1 test2  test3
 
-  Nesting cgroups, while allowed, isn't officially supported and blkio
-  genereates warning when cgroups nest. Once throttling implements
-  hierarchy support, hierarchy will be supported and the warning will
-  be removed.
-
 Various user visible config options
 ===================================
 CONFIG_BLK_CGROUP

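Concretely, in the hierarchy above this means a throttling limit configured on test1 also caps IO issued from test3, while test3's statistics only count IO generated directly by tasks in test3.
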
block/blk-cgroup.c

@@ -32,26 +32,6 @@ EXPORT_SYMBOL_GPL(blkcg_root);
 
 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 
-static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
-				      struct request_queue *q, bool update_hint);
-
-/**
- * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
- * @d_blkg: loop cursor pointing to the current descendant
- * @pos_cgrp: used for iteration
- * @p_blkg: target blkg to walk descendants of
- *
- * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU
- * read locked. If called under either blkcg or queue lock, the iteration
- * is guaranteed to include all and only online blkgs. The caller may
- * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
- * subtree.
- */
-#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg)		\
-	cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
-		if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp),	\
-					      (p_blkg)->q, false)))
-
 static bool blkcg_policy_enabled(struct request_queue *q,
 				 const struct blkcg_policy *pol)
 {

@@ -71,18 +51,8 @@ static void blkg_free(struct blkcg_gq *blkg)
 	if (!blkg)
 		return;
 
-	for (i = 0; i < BLKCG_MAX_POLS; i++) {
-		struct blkcg_policy *pol = blkcg_policy[i];
-		struct blkg_policy_data *pd = blkg->pd[i];
-
-		if (!pd)
-			continue;
-
-		if (pol && pol->pd_exit_fn)
-			pol->pd_exit_fn(blkg);
-
-		kfree(pd);
-	}
+	for (i = 0; i < BLKCG_MAX_POLS; i++)
+		kfree(blkg->pd[i]);
 
 	blk_exit_rl(&blkg->rl);
 	kfree(blkg);

@@ -134,10 +104,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 		blkg->pd[i] = pd;
 		pd->blkg = blkg;
 		pd->plid = i;
-
-		/* invoke per-policy init */
-		if (pol->pd_init_fn)
-			pol->pd_init_fn(blkg);
 	}
 
 	return blkg;

@@ -158,8 +124,8 @@ err_free:
  * @q's bypass state. If @update_hint is %true, the caller should be
  * holding @q->queue_lock and lookup hint is updated on success.
  */
-static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
-				      struct request_queue *q, bool update_hint)
+struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
+			       bool update_hint)
 {
 	struct blkcg_gq *blkg;
 

@@ -234,16 +200,25 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 	}
 	blkg = new_blkg;
 
-	/* link parent and insert */
+	/* link parent */
 	if (blkcg_parent(blkcg)) {
 		blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
 		if (WARN_ON_ONCE(!blkg->parent)) {
-			blkg = ERR_PTR(-EINVAL);
+			ret = -EINVAL;
 			goto err_put_css;
 		}
 		blkg_get(blkg->parent);
 	}
 
+	/* invoke per-policy init */
+	for (i = 0; i < BLKCG_MAX_POLS; i++) {
+		struct blkcg_policy *pol = blkcg_policy[i];
+
+		if (blkg->pd[i] && pol->pd_init_fn)
+			pol->pd_init_fn(blkg);
+	}
+
+	/* insert */
 	spin_lock(&blkcg->lock);
 	ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
 	if (likely(!ret)) {

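With this change, per-policy initialization has moved out of blkg_alloc() (see the hunk above) and now runs from blkg_create() only after the parent blkg has been looked up and pinned, so a policy's pd_init_fn() can already see blkg->parent; the hierarchical blk-throttle code relies on that.
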
@@ -394,30 +369,38 @@ static void blkg_destroy_all(struct request_queue *q)
 	q->root_rl.blkg = NULL;
 }
 
-static void blkg_rcu_free(struct rcu_head *rcu_head)
+/*
+ * A group is RCU protected, but having an rcu lock does not mean that one
+ * can access all the fields of blkg and assume these are valid. For
+ * example, don't try to follow throtl_data and request queue links.
+ *
+ * Having a reference to blkg under an rcu allows accesses to only values
+ * local to groups like group stats and group rate limits.
+ */
+void __blkg_release_rcu(struct rcu_head *rcu_head)
 {
-	blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head));
-}
+	struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
+	int i;
 
-void __blkg_release(struct blkcg_gq *blkg)
-{
+	/* tell policies that this one is being freed */
+	for (i = 0; i < BLKCG_MAX_POLS; i++) {
+		struct blkcg_policy *pol = blkcg_policy[i];
+
+		if (blkg->pd[i] && pol->pd_exit_fn)
+			pol->pd_exit_fn(blkg);
+	}
+
 	/* release the blkcg and parent blkg refs this blkg has been holding */
 	css_put(&blkg->blkcg->css);
-	if (blkg->parent)
+	if (blkg->parent) {
+		spin_lock_irq(blkg->q->queue_lock);
 		blkg_put(blkg->parent);
+		spin_unlock_irq(blkg->q->queue_lock);
+	}
 
-	/*
-	 * A group is freed in rcu manner. But having an rcu lock does not
-	 * mean that one can access all the fields of blkg and assume these
-	 * are valid. For example, don't try to follow throtl_data and
-	 * request queue links.
-	 *
-	 * Having a reference to blkg under an rcu allows acess to only
-	 * values local to groups like group stats and group rate limits
-	 */
-	call_rcu(&blkg->rcu_head, blkg_rcu_free);
+	blkg_free(blkg);
 }
-EXPORT_SYMBOL_GPL(__blkg_release);
+EXPORT_SYMBOL_GPL(__blkg_release_rcu);
 
 /*
  * The next function used by blk_queue_for_each_rl(). It's a bit tricky
 
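Note that the per-policy pd_exit_fn() callbacks and the parent blkg_put() now happen from the RCU callback rather than from the final blkg_put(), so a blkg and its policy data remain valid for RCU-protected walkers until the grace period ends; the parent is put under the queue lock because blkg_put() asserts that lock (see the blk-cgroup.h hunk below).
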
@@ -928,14 +911,6 @@ struct cgroup_subsys blkio_subsys = {
 	.subsys_id = blkio_subsys_id,
 	.base_cftypes = blkcg_files,
 	.module = THIS_MODULE,
-
-	/*
-	 * blkio subsystem is utterly broken in terms of hierarchy support.
-	 * It treats all cgroups equally regardless of where they're
-	 * located in the hierarchy - all cgroups are treated as if they're
-	 * right below the root. Fix it and remove the following.
-	 */
-	.broken_hierarchy = true,
 };
 EXPORT_SYMBOL_GPL(blkio_subsys);
 

block/blk-cgroup.h

@@ -266,7 +266,7 @@ static inline void blkg_get(struct blkcg_gq *blkg)
 	blkg->refcnt++;
 }
 
-void __blkg_release(struct blkcg_gq *blkg);
+void __blkg_release_rcu(struct rcu_head *rcu);
 
 /**
  * blkg_put - put a blkg reference

@@ -279,9 +279,43 @@ static inline void blkg_put(struct blkcg_gq *blkg)
 	lockdep_assert_held(blkg->q->queue_lock);
 	WARN_ON_ONCE(blkg->refcnt <= 0);
 	if (!--blkg->refcnt)
-		__blkg_release(blkg);
+		call_rcu(&blkg->rcu_head, __blkg_release_rcu);
 }
 
+struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
+			       bool update_hint);
+
+/**
+ * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_cgrp: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU
+ * read locked. If called under either blkcg or queue lock, the iteration
+ * is guaranteed to include all and only online blkgs. The caller may
+ * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
+ * subtree.
+ */
+#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg)		\
+	cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
+		if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp),	\
+					      (p_blkg)->q, false)))
+
+/**
+ * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_cgrp: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Similar to blkg_for_each_descendant_pre() but performs post-order
+ * traversal instead. Synchronization rules are the same.
+ */
+#define blkg_for_each_descendant_post(d_blkg, pos_cgrp, p_blkg)	\
+	cgroup_for_each_descendant_post((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
+		if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp),	\
+					      (p_blkg)->q, false)))
+
 /**
  * blk_get_rl - get request_list to use
  * @q: request_queue of interest

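The descendant iterators are now available outside blk-cgroup.c, which is what the hierarchical blk-throttle changes build on. A hypothetical usage sketch follows (parent_blkg and visit_blkg() are made-up names for illustration, not part of this series); per the comment above, the walk must run with RCU read-locked, and only group-local data should be touched under just the RCU lock:

	struct blkcg_gq *blkg;
	struct cgroup *pos_cgrp;

	rcu_read_lock();
	blkg_for_each_descendant_pre(blkg, pos_cgrp, parent_blkg) {
		/* blkg is an online descendant of parent_blkg */
		visit_blkg(blkg);	/* hypothetical per-group callback */
	}
	rcu_read_unlock();
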
block/blk-tag.c

@@ -348,9 +348,16 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
 	 */
 	max_depth = bqt->max_depth;
 	if (!rq_is_sync(rq) && max_depth > 1) {
-		max_depth -= 2;
-		if (!max_depth)
+		switch (max_depth) {
+		case 2:
 			max_depth = 1;
+			break;
+		case 3:
+			max_depth = 2;
+			break;
+		default:
+			max_depth -= 2;
+		}
 		if (q->in_flight[BLK_RW_ASYNC] > max_depth)
 			return 1;
 	}

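This is the "reserved tag logic" tweak from the pull message. A minimal sketch of the depth calculation above, written as a standalone helper purely for illustration (async_max_depth() is not a kernel function): with only 3 tags, async requests may now occupy two tags instead of one, which is where the improvement for random writes comes from.

	/* Effective tag depth available to async requests, per the switch above. */
	unsigned int async_max_depth(unsigned int max_depth)
	{
		if (max_depth <= 1)
			return max_depth;	/* nothing to reserve */

		switch (max_depth) {
		case 2:
			return 1;		/* one tag kept for sync IO */
		case 3:
			return 2;		/* old code returned 1 here */
		default:
			return max_depth - 2;	/* two tags kept for sync IO */
		}
	}
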
block/blk-throttle.c: 1070 changed lines; file diff suppressed because it is too large.

block/cfq-iosched.c

@@ -4347,18 +4347,28 @@ static void cfq_exit_queue(struct elevator_queue *e)
 	kfree(cfqd);
 }
 
-static int cfq_init_queue(struct request_queue *q)
+static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
 {
 	struct cfq_data *cfqd;
 	struct blkcg_gq *blkg __maybe_unused;
 	int i, ret;
+	struct elevator_queue *eq;
 
-	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
-	if (!cfqd)
+	eq = elevator_alloc(q, e);
+	if (!eq)
 		return -ENOMEM;
 
+	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
+	if (!cfqd) {
+		kobject_put(&eq->kobj);
+		return -ENOMEM;
+	}
+	eq->elevator_data = cfqd;
+
 	cfqd->queue = q;
-	q->elevator->elevator_data = cfqd;
+	spin_lock_irq(q->queue_lock);
+	q->elevator = eq;
+	spin_unlock_irq(q->queue_lock);
 
 	/* Init root service tree */
 	cfqd->grp_service_tree = CFQ_RB_ROOT;

@@ -4433,6 +4443,7 @@ static int cfq_init_queue(struct request_queue *q)
 
 out_free:
 	kfree(cfqd);
+	kobject_put(&eq->kobj);
 	return ret;
 }
 

block/deadline-iosched.c

@@ -337,13 +337,21 @@ static void deadline_exit_queue(struct elevator_queue *e)
 /*
  * initialize elevator private data (deadline_data).
  */
-static int deadline_init_queue(struct request_queue *q)
+static int deadline_init_queue(struct request_queue *q, struct elevator_type *e)
 {
 	struct deadline_data *dd;
+	struct elevator_queue *eq;
+
+	eq = elevator_alloc(q, e);
+	if (!eq)
+		return -ENOMEM;
 
 	dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node);
-	if (!dd)
+	if (!dd) {
+		kobject_put(&eq->kobj);
 		return -ENOMEM;
+	}
+	eq->elevator_data = dd;
 
 	INIT_LIST_HEAD(&dd->fifo_list[READ]);
 	INIT_LIST_HEAD(&dd->fifo_list[WRITE]);

@@ -355,7 +363,9 @@ static int deadline_init_queue(struct request_queue *q)
 	dd->front_merges = 1;
 	dd->fifo_batch = fifo_batch;
 
-	q->elevator->elevator_data = dd;
+	spin_lock_irq(q->queue_lock);
+	q->elevator = eq;
+	spin_unlock_irq(q->queue_lock);
 	return 0;
 }
 

block/elevator.c

@@ -150,7 +150,7 @@ void __init load_default_elevator_module(void)
 
 static struct kobj_type elv_ktype;
 
-static struct elevator_queue *elevator_alloc(struct request_queue *q,
+struct elevator_queue *elevator_alloc(struct request_queue *q,
 				  struct elevator_type *e)
 {
 	struct elevator_queue *eq;

@@ -170,6 +170,7 @@ err:
 	elevator_put(e);
 	return NULL;
 }
+EXPORT_SYMBOL(elevator_alloc);
 
 static void elevator_release(struct kobject *kobj)
 {

@@ -221,16 +222,7 @@ int elevator_init(struct request_queue *q, char *name)
 		}
 	}
 
-	q->elevator = elevator_alloc(q, e);
-	if (!q->elevator)
-		return -ENOMEM;
-
-	err = e->ops.elevator_init_fn(q);
-	if (err) {
-		kobject_put(&q->elevator->kobj);
-		return err;
-	}
-
+	err = e->ops.elevator_init_fn(q, e);
 	return 0;
 }
 EXPORT_SYMBOL(elevator_init);

@@ -935,17 +927,10 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	spin_unlock_irq(q->queue_lock);
 
 	/* allocate, init and register new elevator */
-	err = -ENOMEM;
-	q->elevator = elevator_alloc(q, new_e);
-	if (!q->elevator)
+	err = new_e->ops.elevator_init_fn(q, new_e);
+	if (err)
 		goto fail_init;
 
-	err = new_e->ops.elevator_init_fn(q);
-	if (err) {
-		kobject_put(&q->elevator->kobj);
-		goto fail_init;
-	}
-
 	if (registered) {
 		err = elv_register_queue(q);
 		if (err)

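The race fixed here: elevator_switch() used to publish a freshly allocated elevator_queue whose ->elevator_data was still NULL and only then call the scheduler's init function, so IO arriving in that window could dereference uninitialized elevator data. Each elevator_init_fn() now allocates the elevator_queue itself via the newly exported elevator_alloc(), fills in ->elevator_data, and only then installs q->elevator under q->queue_lock, as the cfq, deadline and noop hunks show.
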
block/noop-iosched.c

@@ -59,16 +59,27 @@ noop_latter_request(struct request_queue *q, struct request *rq)
 	return list_entry(rq->queuelist.next, struct request, queuelist);
 }
 
-static int noop_init_queue(struct request_queue *q)
+static int noop_init_queue(struct request_queue *q, struct elevator_type *e)
 {
 	struct noop_data *nd;
+	struct elevator_queue *eq;
+
+	eq = elevator_alloc(q, e);
+	if (!eq)
+		return -ENOMEM;
 
 	nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
-	if (!nd)
+	if (!nd) {
+		kobject_put(&eq->kobj);
 		return -ENOMEM;
+	}
+	eq->elevator_data = nd;
 
 	INIT_LIST_HEAD(&nd->queue);
-	q->elevator->elevator_data = nd;
+
+	spin_lock_irq(q->queue_lock);
+	q->elevator = eq;
+	spin_unlock_irq(q->queue_lock);
 	return 0;
 }
 

fs/block_dev.c

@@ -58,17 +58,24 @@ static void bdev_inode_switch_bdi(struct inode *inode,
 			struct backing_dev_info *dst)
 {
 	struct backing_dev_info *old = inode->i_data.backing_dev_info;
+	bool wakeup_bdi = false;
 
 	if (unlikely(dst == old))		/* deadlock avoidance */
 		return;
 	bdi_lock_two(&old->wb, &dst->wb);
 	spin_lock(&inode->i_lock);
 	inode->i_data.backing_dev_info = dst;
-	if (inode->i_state & I_DIRTY)
+	if (inode->i_state & I_DIRTY) {
+		if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb))
+			wakeup_bdi = true;
 		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
+	}
 	spin_unlock(&inode->i_lock);
 	spin_unlock(&old->wb.list_lock);
 	spin_unlock(&dst->wb.list_lock);
+
+	if (wakeup_bdi)
+		bdi_wakeup_thread_delayed(dst);
 }
 
 /* Kill _all_ buffers and pagecache , dirty or not.. */

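This is the "periodic writeback fix" from the pull message: when a dirty inode is switched to a backing_dev_info whose writeback list was empty (for example when a filesystem is mounted on a block device with dirty pagecache), the per-bdi flusher is now woken via bdi_wakeup_thread_delayed() so periodic writeback actually picks the inode up.
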
include/linux/cgroup.h

@@ -278,6 +278,8 @@ enum {
 	 *
 	 * - memcg: use_hierarchy is on by default and the cgroup file for
 	 *   the flag is not created.
+	 *
+	 * - blkcg: blk-throttle becomes properly hierarchical.
 	 */
 	CGRP_ROOT_SANE_BEHAVIOR = (1 << 0),
 

include/linux/elevator.h

@@ -7,6 +7,7 @@
 #ifdef CONFIG_BLOCK
 
 struct io_cq;
+struct elevator_type;
 
 typedef int (elevator_merge_fn) (struct request_queue *, struct request **,
 				 struct bio *);

@@ -35,7 +36,8 @@ typedef void (elevator_put_req_fn) (struct request *);
 typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *);
 typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *);
 
-typedef int (elevator_init_fn) (struct request_queue *);
+typedef int (elevator_init_fn) (struct request_queue *,
+				struct elevator_type *e);
 typedef void (elevator_exit_fn) (struct elevator_queue *);
 
 struct elevator_ops

@@ -155,6 +157,8 @@ extern int elevator_init(struct request_queue *, char *);
 extern void elevator_exit(struct elevator_queue *);
 extern int elevator_change(struct request_queue *, const char *);
 extern bool elv_rq_merge_ok(struct request *, struct bio *);
+extern struct elevator_queue *elevator_alloc(struct request_queue *,
+					struct elevator_type *);
 
 /*
  * Helper functions.