block, bfq: do not idle for lowest-weight queues

In most cases, it is detrimental for throughput to plug I/O dispatch when the in-service bfq_queue becomes temporarily empty (plugging is performed to wait for the possible arrival, soon, of new I/O from the in-service queue). There is however a case where plugging is needed for service guarantees. If a bfq_queue, say Q, has a higher weight than some other active bfq_queue, and is sync, i.e., contains sync I/O, then, to guarantee that Q does receive a higher share of the throughput than other lower-weight queues, it is necessary to plug I/O dispatch when Q remains temporarily empty while being served. For this reason, BFQ performs I/O plugging when some active bfq_queue has a higher weight than some other active bfq_queue. But this is unnecessarily overkill. In fact, if the in-service bfq_queue actually has a weight lower than or equal to the other queues, then the queue *must not* be guaranteed a higher share of the throughput than the other queues. So, not plugging I/O cannot cause any harm to the queue. And can boost throughput. Taking advantage of this fact, this commit does not plug I/O for sync bfq_queues with a weight lower than or equal to the weights of the other queues. Here is an example of the resulting throughput boost with the dbench workload, which is particularly nasty for BFQ. With the dbench test in the Phoronix suite, BFQ reaches its lowest total throughput with 6 clients on a filesystem with journaling, in case the journaling daemon has a higher weight than normal processes. Before this commit, the total throughput was ~80 MB/sec on a PLEXTOR PX-256M5, after this commit it is ~100 MB/sec. Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com> Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name> Signed-off-by: Paolo Valente <paolo.valente@linaro.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-03-12 09:59:28 +01:00 · 2019-03-12 09:59:28 +01:00 · fb53ac6cd0
parent 778c02a236
commit fb53ac6cd0
3 changed files with 118 additions and 94 deletions
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@ -629,12 +629,19 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 }
 /*
- * The following function returns true if every queue must receive the
+ * The following function returns false either if every active queue
- * same share of the throughput (this condition is used when deciding
+ * must receive the same share of the throughput (symmetric scenario),
- * whether idling may be disabled, see the comments in the function
+ * or, as a special case, if bfqq must receive a share of the
- * bfq_better_to_idle()).
+ * throughput lower than or equal to the share that every other active
 * queue must receive.  If bfqq does sync I/O, then these are the only
 * two cases where bfqq happens to be guaranteed its share of the
 * throughput even if I/O dispatching is not plugged when bfqq remains
 * temporarily empty (for more details, see the comments in the
 * function bfq_better_to_idle()). For this reason, the return value
 * of this function is used to check whether I/O-dispatch plugging can
 * be avoided.
 *
- * Such a scenario occurs when:
+ * The above first case (symmetric scenario) occurs when:
 * 1) all active queues have the same weight,
 * 2) all active queues belong to the same I/O-priority class,
 * 3) all active groups at the same level in the groups tree have the same
@ -654,30 +661,36 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 * support or the cgroups interface are not enabled, thus no state
 * needs to be maintained in this case.
 */
-static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
+static bool bfq_asymmetric_scenario(struct bfq_data *bfqd,
 				   struct bfq_queue *bfqq)
 {
 	bool smallest_weight = bfqq &&
 		bfqq->weight_counter &&
 		bfqq->weight_counter ==
 		container_of(
 			rb_first_cached(&bfqd->queue_weights_tree),
 			struct bfq_weight_counter,
 			weights_node);
 	/*
 	 * For queue weights to differ, queue_weights_tree must contain
 	 * at least two nodes.
 	 */
-	bool varied_queue_weights = !RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
+	bool varied_queue_weights = !smallest_weight &&
-		(bfqd->queue_weights_tree.rb_node->rb_left ||
+		!RB_EMPTY_ROOT(&bfqd->queue_weights_tree.rb_root) &&
-		 bfqd->queue_weights_tree.rb_node->rb_right);
+		(bfqd->queue_weights_tree.rb_root.rb_node->rb_left ||
 		 bfqd->queue_weights_tree.rb_root.rb_node->rb_right);
 	bool multiple_classes_busy =
 		(bfqd->busy_queues[0] && bfqd->busy_queues[1]) ||
 		(bfqd->busy_queues[0] && bfqd->busy_queues[2]) ||
 		(bfqd->busy_queues[1] && bfqd->busy_queues[2]);
-	/*
+	return varied_queue_weights || multiple_classes_busy
 	 * For queue weights to differ, queue_weights_tree must contain
 	 * at least two nodes.
 	 */
 	return !(varied_queue_weights || multiple_classes_busy
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 	       || bfqd->num_groups_with_pending_reqs > 0
 #endif
-		);
+		;
 }
 /*
@ -694,10 +707,11 @@ static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
 * should be low too.
 */
 void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-			  struct rb_root *root)
+			  struct rb_root_cached *root)
 {
 	struct bfq_entity *entity = &bfqq->entity;
-	struct rb_node **new = &(root->rb_node), *parent = NULL;
+	struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL;
 	bool leftmost = true;
 	/*
 	 * Do not insert if the queue is already associated with a
@ -726,8 +740,10 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		}
 		if (entity->weight < __counter->weight)
 			new = &((*new)->rb_left);
-		else
+		else {
 			new = &((*new)->rb_right);
 			leftmost = false;
 		}
 	}
 	bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
@ -736,7 +752,7 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	/*
 	 * In the unlucky event of an allocation failure, we just
 	 * exit. This will cause the weight of queue to not be
-	 * considered in bfq_symmetric_scenario, which, in its turn,
+	 * considered in bfq_asymmetric_scenario, which, in its turn,
 	 * causes the scenario to be deemed wrongly symmetric in case
 	 * bfqq's weight would have been the only weight making the
 	 * scenario asymmetric.  On the bright side, no unbalance will
@ -750,7 +766,8 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	bfqq->weight_counter->weight = entity->weight;
 	rb_link_node(&bfqq->weight_counter->weights_node, parent, new);
-	rb_insert_color(&bfqq->weight_counter->weights_node, root);
+	rb_insert_color_cached(&bfqq->weight_counter->weights_node, root,
 				leftmost);
 inc_counter:
 	bfqq->weight_counter->num_active++;
@ -765,7 +782,7 @@ inc_counter:
 */
 void __bfq_weights_tree_remove(struct bfq_data *bfqd,
 			       struct bfq_queue *bfqq,
-			       struct rb_root *root)
+			       struct rb_root_cached *root)
 {
 	if (!bfqq->weight_counter)
 		return;
@ -774,7 +791,7 @@ void __bfq_weights_tree_remove(struct bfq_data *bfqd,
 	if (bfqq->weight_counter->num_active > 0)
 		goto reset_entity_pointer;
-	rb_erase(&bfqq->weight_counter->weights_node, root);
+	rb_erase_cached(&bfqq->weight_counter->weights_node, root);
 	kfree(bfqq->weight_counter);
 reset_entity_pointer:
@ -889,7 +906,7 @@ static unsigned long bfq_serv_to_charge(struct request *rq,
 					struct bfq_queue *bfqq)
 {
 	if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 ||
-	    !bfq_symmetric_scenario(bfqq->bfqd))
+	    bfq_asymmetric_scenario(bfqq->bfqd, bfqq))
 		return blk_rq_sectors(rq);
 	return blk_rq_sectors(rq) * bfq_async_charge_factor;
@ -2543,7 +2560,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)
 	 * queue).
 	 */
 	if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
-	    bfq_symmetric_scenario(bfqd))
+	    !bfq_asymmetric_scenario(bfqd, bfqq))
 		sl = min_t(u64, sl, BFQ_MIN_TT);
 	else if (bfqq->wr_coeff > 1)
 		sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC);
@ -3500,8 +3517,9 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
 }
 /*
- * There is a case where idling must be performed not for
+ * There is a case where idling does not have to be performed for
- * throughput concerns, but to preserve service guarantees.
+ * throughput concerns, but to preserve the throughput share of
 * the process associated with bfqq.
 *
 * To introduce this case, we can note that allowing the drive
 * to enqueue more than one request at a time, and hence
@ -3517,77 +3535,83 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
 * concern about per-process throughput distribution, and
 * makes its decisions only on a per-request basis. Therefore,
 * the service distribution enforced by the drive's internal
- * scheduler is likely to coincide with the desired
+ * scheduler is likely to coincide with the desired throughput
- * device-throughput distribution only in a completely
+ * distribution only in a completely symmetric, or favorably
- * symmetric scenario where:
+ * skewed scenario where:
- * (i)  each of these processes must get the same throughput as
+ * (i-a) each of these processes must get the same throughput as
- *      the others;
+ *	 the others,
- * (ii) the I/O of each process has the same properties, in
+ * (i-b) in case (i-a) does not hold, it holds that the process
- *      terms of locality (sequential or random), direction
+ *       associated with bfqq must receive a lower or equal
- *      (reads or writes), request sizes, greediness
+ *	 throughput than any of the other processes;
- *      (from I/O-bound to sporadic), and so on.
+ * (ii)  the I/O of each process has the same properties, in
- * In fact, in such a scenario, the drive tends to treat
+ *       terms of locality (sequential or random), direction
- * the requests of each of these processes in about the same
+ *       (reads or writes), request sizes, greediness
- * way as the requests of the others, and thus to provide
+ *       (from I/O-bound to sporadic), and so on;
- * each of these processes with about the same throughput
+
- * (which is exactly the desired throughput distribution). In
+ * In fact, in such a scenario, the drive tends to treat the requests
- * contrast, in any asymmetric scenario, device idling is
+ * of each process in about the same way as the requests of the
- * certainly needed to guarantee that bfqq receives its
+ * others, and thus to provide each of these processes with about the
- * assigned fraction of the device throughput (see [1] for
+ * same throughput.  This is exactly the desired throughput
- * details).
+ * distribution if (i-a) holds, or, if (i-b) holds instead, this is an
- * The problem is that idling may significantly reduce
+ * even more convenient distribution for (the process associated with)
- * throughput with certain combinations of types of I/O and
+ * bfqq.
 * devices. An important example is sync random I/O, on flash
 * storage with command queueing. So, unless bfqq falls in the
 * above cases where idling also boosts throughput, it would
 * be important to check conditions (i) and (ii) accurately,
 * so as to avoid idling when not strictly needed for service
 * guarantees.
 *
- * Unfortunately, it is extremely difficult to thoroughly
+ * In contrast, in any asymmetric or unfavorable scenario, device
- * check condition (ii). And, in case there are active groups,
+ * idling (I/O-dispatch plugging) is certainly needed to guarantee
- * it becomes very difficult to check condition (i) too. In
+ * that bfqq receives its assigned fraction of the device throughput
- * fact, if there are active groups, then, for condition (i)
+ * (see [1] for details).
- * to become false, it is enough that an active group contains
+ *
- * more active processes or sub-groups than some other active
+ * The problem is that idling may significantly reduce throughput with
- * group. More precisely, for condition (i) to hold because of
+ * certain combinations of types of I/O and devices. An important
- * such a group, it is not even necessary that the group is
+ * example is sync random I/O on flash storage with command
- * (still) active: it is sufficient that, even if the group
+ * queueing. So, unless bfqq falls in cases where idling also boosts
- * has become inactive, some of its descendant processes still
+ * throughput, it is important to check conditions (i-a), i(-b) and
- * have some request already dispatched but still waiting for
+ * (ii) accurately, so as to avoid idling when not strictly needed for
- * completion. In fact, requests have still to be guaranteed
+ * service guarantees.
- * their share of the throughput even after being
+ *
- * dispatched. In this respect, it is easy to show that, if a
+ * Unfortunately, it is extremely difficult to thoroughly check
- * group frequently becomes inactive while still having
+ * condition (ii). And, in case there are active groups, it becomes
- * in-flight requests, and if, when this happens, the group is
+ * very difficult to check conditions (i-a) and (i-b) too.  In fact,
- * not considered in the calculation of whether the scenario
+ * if there are active groups, then, for conditions (i-a) or (i-b) to
- * is asymmetric, then the group may fail to be guaranteed its
+ * become false 'indirectly', it is enough that an active group
- * fair share of the throughput (basically because idling may
+ * contains more active processes or sub-groups than some other active
- * not be performed for the descendant processes of the group,
+ * group. More precisely, for conditions (i-a) or (i-b) to become
- * but it had to be).  We address this issue with the
+ * false because of such a group, it is not even necessary that the
- * following bi-modal behavior, implemented in the function
+ * group is (still) active: it is sufficient that, even if the group
- * bfq_symmetric_scenario().
+ * has become inactive, some of its descendant processes still have
 * some request already dispatched but still waiting for
 * completion. In fact, requests have still to be guaranteed their
 * share of the throughput even after being dispatched. In this
 * respect, it is easy to show that, if a group frequently becomes
 * inactive while still having in-flight requests, and if, when this
 * happens, the group is not considered in the calculation of whether
 * the scenario is asymmetric, then the group may fail to be
 * guaranteed its fair share of the throughput (basically because
 * idling may not be performed for the descendant processes of the
 * group, but it had to be).  We address this issue with the following
 * bi-modal behavior, implemented in the function
 * bfq_asymmetric_scenario().
 *
 * If there are groups with requests waiting for completion
 * (as commented above, some of these groups may even be
 * already inactive), then the scenario is tagged as
 * asymmetric, conservatively, without checking any of the
- * conditions (i) and (ii). So the device is idled for bfqq.
+ * conditions (i-a), (i-b) or (ii). So the device is idled for bfqq.
 * This behavior matches also the fact that groups are created
 * exactly if controlling I/O is a primary concern (to
 * preserve bandwidth and latency guarantees).
 *
- * On the opposite end, if there are no groups with requests
+ * On the opposite end, if there are no groups with requests waiting
- * waiting for completion, then only condition (i) is actually
+ * for completion, then only conditions (i-a) and (i-b) are actually
- * controlled, i.e., provided that condition (i) holds, idling
+ * controlled, i.e., provided that conditions (i-a) or (i-b) holds,
- * is not performed, regardless of whether condition (ii)
+ * idling is not performed, regardless of whether condition (ii)
- * holds. In other words, only if condition (i) does not hold,
+ * holds.  In other words, only if conditions (i-a) and (i-b) do not
- * then idling is allowed, and the device tends to be
+ * hold, then idling is allowed, and the device tends to be prevented
- * prevented from queueing many requests, possibly of several
+ * from queueing many requests, possibly of several processes. Since
- * processes. Since there are no groups with requests waiting
+ * there are no groups with requests waiting for completion, then, to
- * for completion, then, to control condition (i) it is enough
+ * control conditions (i-a) and (i-b) it is enough to check just
- * to check just whether all the queues with requests waiting
+ * whether all the queues with requests waiting for completion also
- * for completion also have the same weight.
+ * have the same weight.
 *
 * Not checking condition (ii) evidently exposes bfqq to the
 * risk of getting less throughput than its fair share.
@ -3639,7 +3663,7 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
 * compound condition that is checked below for deciding
 * whether the scenario is asymmetric. To explain this
 * compound condition, we need to add that the function
- * bfq_symmetric_scenario checks the weights of only
+ * bfq_asymmetric_scenario checks the weights of only
 * non-weight-raised queues, for efficiency reasons (see
 * comments on bfq_weights_tree_add()). Then the fact that
 * bfqq is weight-raised is checked explicitly here. More
@ -3667,7 +3691,7 @@ static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
 	return (bfqq->wr_coeff > 1 &&
 		bfqd->wr_busy_queues <
 		bfq_tot_busy_queues(bfqd)) ||
-		!bfq_symmetric_scenario(bfqd);
+		bfq_asymmetric_scenario(bfqd, bfqq);
 }
 /*
@ -5505,7 +5529,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 		     HRTIMER_MODE_REL);
 	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
-	bfqd->queue_weights_tree = RB_ROOT;
+	bfqd->queue_weights_tree = RB_ROOT_CACHED;
 	bfqd->num_groups_with_pending_reqs = 0;
 	INIT_LIST_HEAD(&bfqd->active_list);
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@ -450,7 +450,7 @@ struct bfq_data {
 	 * weight-raised @bfq_queue (see the comments to the functions
 	 * bfq_weights_tree_[add|remove] for further details).
 	 */
-	struct rb_root queue_weights_tree;
+	struct rb_root_cached queue_weights_tree;
 	/*
 	 * Number of groups with at least one descendant process that
@ -898,10 +898,10 @@ void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync);
 struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
 void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
 void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-			  struct rb_root *root);
+			  struct rb_root_cached *root);
 void __bfq_weights_tree_remove(struct bfq_data *bfqd,
 			       struct bfq_queue *bfqq,
-			       struct rb_root *root);
+			       struct rb_root_cached *root);
 void bfq_weights_tree_remove(struct bfq_data *bfqd,
 			     struct bfq_queue *bfqq);
 void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@ -737,7 +737,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
 		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
 		unsigned int prev_weight, new_weight;
 		struct bfq_data *bfqd = NULL;
-		struct rb_root *root;
+		struct rb_root_cached *root;
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 		struct bfq_sched_data *sd;
 		struct bfq_group *bfqg;