author     Linus Torvalds <torvalds@linux-foundation.org>  2019-03-08 23:12:17 +0100
committer  Linus Torvalds <torvalds@linux-foundation.org>  2019-03-08 23:12:17 +0100
commit     80201fe175cbf7f3e372f53eba0a881a702ad926 (patch)
tree       8026c68d52763614268a9c3c80759ad386bd5967 /block
parent     Merge tag 'for-5.1/libata-20190301' of git://git.kernel.dk/linux-block (diff)
parent     block: fix updating bio's front segment size (diff)
Merge tag 'for-5.1/block-20190302' of git://git.kernel.dk/linux-block
Pull block layer updates from Jens Axboe:
 "Not a huge amount of changes in this round, the biggest one is that
  we finally have Mings multi-page bvec support merged. Apart from
  that, this pull request contains:

   - Small series that avoids quiescing the queue for sysfs changes
     that match what we currently have (Aleksei)

   - Series of bcache fixes (via Coly)

   - Series of lightnvm fixes (via Mathias)

   - NVMe pull request from Christoph. Nothing major, just SPDX/license
     cleanups, RR mp policy (Hannes), and little fixes (Bart,
     Chaitanya).

   - BFQ series (Paolo)

   - Save blk-mq cpu -> hw queue mapping, removing a pointer
     indirection for the fast path (Jianchao)

   - fops->iopoll() added for async IO polling, this is a feature that
     the upcoming io_uring interface will use (Christoph, me)

   - Partition scan loop fixes (Dongli)

   - mtip32xx conversion from managed resource API (Christoph)

   - cdrom registration race fix (Guenter)

   - MD pull from Song, two minor fixes.

   - Various documentation fixes (Marcos)

   - Multi-page bvec feature. This brings a lot of nice improvements
     with it, like more efficient splitting, larger IOs can be
     supported without growing the bvec table size, and so on. (Ming)

   - Various little fixes to core and drivers"

* tag 'for-5.1/block-20190302' of git://git.kernel.dk/linux-block: (117 commits)
  block: fix updating bio's front segment size
  block: Replace function name in string with __func__
  nbd: propagate genlmsg_reply return code
  floppy: remove set but not used variable 'q'
  null_blk: fix checking for REQ_FUA
  block: fix NULL pointer dereference in register_disk
  fs: fix guard_bio_eod to check for real EOD errors
  blk-mq: use HCTX_TYPE_DEFAULT but not 0 to index blk_mq_tag_set->map
  block: optimize bvec iteration in bvec_iter_advance
  block: introduce mp_bvec_for_each_page() for iterating over page
  block: optimize blk_bio_segment_split for single-page bvec
  block: optimize __blk_segment_map_sg() for single-page bvec
  block: introduce bvec_nth_page()
  iomap: wire up the iopoll method
  block: add bio_set_polled() helper
  block: wire up block device iopoll method
  fs: add an iopoll method to struct file_operations
  loop: set GENHD_FL_NO_PART_SCAN after blkdev_reread_part()
  loop: do not print warn message if partition scan is successful
  block: bounce: make sure that bvec table is updated
  ...
Diffstat (limited to 'block')
-rw-r--r--  block/bfq-iosched.c     | 705
-rw-r--r--  block/bfq-iosched.h     |  11
-rw-r--r--  block/bfq-wf2q.c        |  18
-rw-r--r--  block/bio.c             |  49
-rw-r--r--  block/blk-cgroup.c      |   2
-rw-r--r--  block/blk-merge.c       | 231
-rw-r--r--  block/blk-mq-debugfs.c  |   3
-rw-r--r--  block/blk-mq-sched.c    |   2
-rw-r--r--  block/blk-mq-tag.c      |   2
-rw-r--r--  block/blk-mq.c          |  33
-rw-r--r--  block/blk-mq.h          |  20
-rw-r--r--  block/blk-settings.c    |   9
-rw-r--r--  block/blk-sysfs.c       |  22
-rw-r--r--  block/blk.h             |   2
-rw-r--r--  block/bounce.c          |  10
-rw-r--r--  block/elevator.c        |   5
-rw-r--r--  block/genhd.c           |  18
17 files changed, 658 insertions, 484 deletions
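
The bio.c hunks later in this diff convert every bio_for_each_segment_all() caller to pass an extra struct bvec_iter_all, the most visible side effect of the multi-page bvec support called out in the pull message. A minimal sketch of a converted caller, mirroring those hunks; example_dirty_bio_pages() is a made-up name, not part of the patch:

static void example_dirty_bio_pages(struct bio *bio)
{
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;	/* new iterator state needed for multi-page bvecs */
	int i;

	/* same 4-argument form used throughout the block/bio.c hunks below */
	bio_for_each_segment_all(bvec, bio, i, iter_all)
		set_page_dirty_lock(bvec->bv_page);
}
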
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index cd307767a134..4c592496a16a 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -230,11 +230,16 @@ static struct kmem_cache *bfq_pool;
#define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
/* hw_tag detection: parallel requests threshold and min samples needed. */
-#define BFQ_HW_QUEUE_THRESHOLD 4
+#define BFQ_HW_QUEUE_THRESHOLD 3
#define BFQ_HW_QUEUE_SAMPLES 32
#define BFQQ_SEEK_THR (sector_t)(8 * 100)
#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
+#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \
+ (get_sdist(last_pos, rq) > \
+ BFQQ_SEEK_THR && \
+ (!blk_queue_nonrot(bfqd->queue) || \
+ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT))
#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19)
@@ -624,26 +629,6 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
}
/*
- * Tell whether there are active queues with different weights or
- * active groups.
- */
-static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
-{
- /*
- * For queue weights to differ, queue_weights_tree must contain
- * at least two nodes.
- */
- return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
- (bfqd->queue_weights_tree.rb_node->rb_left ||
- bfqd->queue_weights_tree.rb_node->rb_right)
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- ) ||
- (bfqd->num_groups_with_pending_reqs > 0
-#endif
- );
-}
-
-/*
* The following function returns true if every queue must receive the
* same share of the throughput (this condition is used when deciding
* whether idling may be disabled, see the comments in the function
@@ -651,25 +636,48 @@ static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
*
* Such a scenario occurs when:
* 1) all active queues have the same weight,
- * 2) all active groups at the same level in the groups tree have the same
- * weight,
+ * 2) all active queues belong to the same I/O-priority class,
* 3) all active groups at the same level in the groups tree have the same
+ * weight,
+ * 4) all active groups at the same level in the groups tree have the same
* number of children.
*
* Unfortunately, keeping the necessary state for evaluating exactly
* the last two symmetry sub-conditions above would be quite complex
- * and time consuming. Therefore this function evaluates, instead,
- * only the following stronger two sub-conditions, for which it is
+ * and time consuming. Therefore this function evaluates, instead,
+ * only the following stronger three sub-conditions, for which it is
* much easier to maintain the needed state:
* 1) all active queues have the same weight,
- * 2) there are no active groups.
+ * 2) all active queues belong to the same I/O-priority class,
+ * 3) there are no active groups.
* In particular, the last condition is always true if hierarchical
* support or the cgroups interface are not enabled, thus no state
* needs to be maintained in this case.
*/
static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
{
- return !bfq_varied_queue_weights_or_active_groups(bfqd);
+ /*
+ * For queue weights to differ, queue_weights_tree must contain
+ * at least two nodes.
+ */
+ bool varied_queue_weights = !RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
+ (bfqd->queue_weights_tree.rb_node->rb_left ||
+ bfqd->queue_weights_tree.rb_node->rb_right);
+
+ bool multiple_classes_busy =
+ (bfqd->busy_queues[0] && bfqd->busy_queues[1]) ||
+ (bfqd->busy_queues[0] && bfqd->busy_queues[2]) ||
+ (bfqd->busy_queues[1] && bfqd->busy_queues[2]);
+
+ /*
+ * For queue weights to differ, queue_weights_tree must contain
+ * at least two nodes.
+ */
+ return !(varied_queue_weights || multiple_classes_busy
+#ifdef BFQ_GROUP_IOSCHED_ENABLED
+ || bfqd->num_groups_with_pending_reqs > 0
+#endif
+ );
}
/*
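
A standalone model of the new multiple_classes_busy test above, assuming busy_queues[] is indexed per I/O-priority class (RT, BE, IDLE) as declared in the bfq-iosched.h hunk later in this diff; classes_mixed() is an invented name, not kernel code:

/* Asymmetric as soon as two different priority classes have busy queues. */
static bool classes_mixed(const unsigned int busy[3])
{
	return (busy[0] && busy[1]) ||
	       (busy[0] && busy[2]) ||
	       (busy[1] && busy[2]);
}

For example, {1, 0, 0} (one class busy) keeps the scenario symmetric, while {1, 1, 0} (RT plus BE busy at once) makes it asymmetric, which in turn re-enables idling for service guarantees.
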
@@ -728,15 +736,14 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
/*
* In the unlucky event of an allocation failure, we just
* exit. This will cause the weight of queue to not be
- * considered in bfq_varied_queue_weights_or_active_groups,
- * which, in its turn, causes the scenario to be deemed
- * wrongly symmetric in case bfqq's weight would have been
- * the only weight making the scenario asymmetric. On the
- * bright side, no unbalance will however occur when bfqq
- * becomes inactive again (the invocation of this function
- * is triggered by an activation of queue). In fact,
- * bfq_weights_tree_remove does nothing if
- * !bfqq->weight_counter.
+ * considered in bfq_symmetric_scenario, which, in its turn,
+ * causes the scenario to be deemed wrongly symmetric in case
+ * bfqq's weight would have been the only weight making the
+ * scenario asymmetric. On the bright side, no unbalance will
+ * however occur when bfqq becomes inactive again (the
+ * invocation of this function is triggered by an activation
+ * of queue). In fact, bfq_weights_tree_remove does nothing
+ * if !bfqq->weight_counter.
*/
if (unlikely(!bfqq->weight_counter))
return;
@@ -747,6 +754,7 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
inc_counter:
bfqq->weight_counter->num_active++;
+ bfqq->ref++;
}
/*
@@ -771,6 +779,7 @@ void __bfq_weights_tree_remove(struct bfq_data *bfqd,
reset_entity_pointer:
bfqq->weight_counter = NULL;
+ bfq_put_queue(bfqq);
}
/*
@@ -782,9 +791,6 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
{
struct bfq_entity *entity = bfqq->entity.parent;
- __bfq_weights_tree_remove(bfqd, bfqq,
- &bfqd->queue_weights_tree);
-
for_each_entity(entity) {
struct bfq_sched_data *sd = entity->my_sched_data;
@@ -818,6 +824,15 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
bfqd->num_groups_with_pending_reqs--;
}
}
+
+ /*
+ * Next function is invoked last, because it causes bfqq to be
+ * freed if the following holds: bfqq is not in service and
+ * has no dispatched request. DO NOT use bfqq after the next
+ * function invocation.
+ */
+ __bfq_weights_tree_remove(bfqd, bfqq,
+ &bfqd->queue_weights_tree);
}
/*
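
The reordering above implements the rule stated in the new comment: the call that may drop the last reference goes last, and the object is never touched afterwards. A generic illustration of that pattern, deliberately unrelated to the BFQ types (struct obj, put_obj() and teardown() are invented names):

#include <stdlib.h>

struct obj {
	int refcnt;
	/* ... payload ... */
};

static void put_obj(struct obj *o)
{
	if (--o->refcnt == 0)
		free(o);	/* last reference gone: object is freed */
}

static void teardown(struct obj *o)
{
	/* first, all bookkeeping that still dereferences 'o' ... */

	put_obj(o);		/* may free 'o': must be the last use of 'o' */
}
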
@@ -873,7 +888,8 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
static unsigned long bfq_serv_to_charge(struct request *rq,
struct bfq_queue *bfqq)
{
- if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
+ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 ||
+ !bfq_symmetric_scenario(bfqq->bfqd))
return blk_rq_sectors(rq);
return blk_rq_sectors(rq) * bfq_async_charge_factor;
@@ -907,8 +923,10 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
*/
return;
- new_budget = max_t(unsigned long, bfqq->max_budget,
- bfq_serv_to_charge(next_rq, bfqq));
+ new_budget = max_t(unsigned long,
+ max_t(unsigned long, bfqq->max_budget,
+ bfq_serv_to_charge(next_rq, bfqq)),
+ entity->service);
if (entity->budget != new_budget) {
entity->budget = new_budget;
bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
@@ -1011,7 +1029,8 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
static int bfqq_process_refs(struct bfq_queue *bfqq)
{
- return bfqq->ref - bfqq->allocated - bfqq->entity.on_st;
+ return bfqq->ref - bfqq->allocated - bfqq->entity.on_st -
+ (bfqq->weight_counter != NULL);
}
/* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */
@@ -1380,7 +1399,15 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
{
struct bfq_entity *entity = &bfqq->entity;
- if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
+ /*
+ * In the next compound condition, we check also whether there
+ * is some budget left, because otherwise there is no point in
+ * trying to go on serving bfqq with this same budget: bfqq
+ * would be expired immediately after being selected for
+ * service. This would only cause useless overhead.
+ */
+ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time &&
+ bfq_bfqq_budget_left(bfqq) > 0) {
/*
* We do not clear the flag non_blocking_wait_rq here, as
* the latter is used in bfq_activate_bfqq to signal
@@ -2217,14 +2244,15 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
return NULL;
/* If there is only one backlogged queue, don't search. */
- if (bfqd->busy_queues == 1)
+ if (bfq_tot_busy_queues(bfqd) == 1)
return NULL;
in_service_bfqq = bfqd->in_service_queue;
if (in_service_bfqq && in_service_bfqq != bfqq &&
likely(in_service_bfqq != &bfqd->oom_bfqq) &&
- bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
+ bfq_rq_close_to_sector(io_struct, request,
+ bfqd->in_serv_last_pos) &&
bfqq->entity.parent == in_service_bfqq->entity.parent &&
bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
@@ -2742,7 +2770,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
if ((bfqd->rq_in_driver > 0 ||
now_ns - bfqd->last_completion < BFQ_MIN_TT)
- && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
+ && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq))
bfqd->sequential_samples++;
bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
@@ -2764,6 +2792,8 @@ update_rate_and_reset:
bfq_update_rate_reset(bfqd, rq);
update_last_values:
bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
+ if (RQ_BFQQ(rq) == bfqd->in_service_queue)
+ bfqd->in_serv_last_pos = bfqd->last_position;
bfqd->last_dispatch = now_ns;
}
@@ -3274,16 +3304,32 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
* requests, then the request pattern is isochronous
* (see the comments on the function
* bfq_bfqq_softrt_next_start()). Thus we can compute
- * soft_rt_next_start. If, instead, the queue still
- * has outstanding requests, then we have to wait for
- * the completion of all the outstanding requests to
- * discover whether the request pattern is actually
- * isochronous.
+ * soft_rt_next_start. And we do it, unless bfqq is in
+ * interactive weight raising. We do not do it in the
+ * latter subcase, for the following reason. bfqq may
+ * be conveying the I/O needed to load a soft
+ * real-time application. Such an application will
+ * actually exhibit a soft real-time I/O pattern after
+ * it finally starts doing its job. But, if
+ * soft_rt_next_start is computed here for an
+ * interactive bfqq, and bfqq had received a lot of
+ * service before remaining with no outstanding
+ * request (likely to happen on a fast device), then
+ * soft_rt_next_start would be assigned such a high
+ * value that, for a very long time, bfqq would be
+ * prevented from being possibly considered as soft
+ * real time.
+ *
+ * If, instead, the queue still has outstanding
+ * requests, then we have to wait for the completion
+ * of all the outstanding requests to discover whether
+ * the request pattern is actually isochronous.
*/
- if (bfqq->dispatched == 0)
+ if (bfqq->dispatched == 0 &&
+ bfqq->wr_coeff != bfqd->bfq_wr_coeff)
bfqq->soft_rt_next_start =
bfq_bfqq_softrt_next_start(bfqd, bfqq);
- else {
+ else if (bfqq->dispatched > 0) {
/*
* Schedule an update of soft_rt_next_start to when
* the task may be discovered to be isochronous.
@@ -3376,53 +3422,13 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
bfq_bfqq_budget_timeout(bfqq);
}
-/*
- * For a queue that becomes empty, device idling is allowed only if
- * this function returns true for the queue. As a consequence, since
- * device idling plays a critical role in both throughput boosting and
- * service guarantees, the return value of this function plays a
- * critical role in both these aspects as well.
- *
- * In a nutshell, this function returns true only if idling is
- * beneficial for throughput or, even if detrimental for throughput,
- * idling is however necessary to preserve service guarantees (low
- * latency, desired throughput distribution, ...). In particular, on
- * NCQ-capable devices, this function tries to return false, so as to
- * help keep the drives' internal queues full, whenever this helps the
- * device boost the throughput without causing any service-guarantee
- * issue.
- *
- * In more detail, the return value of this function is obtained by,
- * first, computing a number of boolean variables that take into
- * account throughput and service-guarantee issues, and, then,
- * combining these variables in a logical expression. Most of the
- * issues taken into account are not trivial. We discuss these issues
- * individually while introducing the variables.
- */
-static bool bfq_better_to_idle(struct bfq_queue *bfqq)
+static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
{
- struct bfq_data *bfqd = bfqq->bfqd;
bool rot_without_queueing =
!blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag,
bfqq_sequential_and_IO_bound,
- idling_boosts_thr, idling_boosts_thr_without_issues,
- idling_needed_for_service_guarantees,
- asymmetric_scenario;
-
- if (bfqd->strict_guarantees)
- return true;
-
- /*
- * Idling is performed only if slice_idle > 0. In addition, we
- * do not idle if
- * (a) bfqq is async
- * (b) bfqq is in the idle io prio class: in this case we do
- * not idle because we want to minimize the bandwidth that
- * queues in this class can steal to higher-priority queues
- */
- if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
- bfq_class_idle(bfqq))
- return false;
+ idling_boosts_thr;
bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) &&
bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq);
@@ -3454,8 +3460,7 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
bfqq_sequential_and_IO_bound);
/*
- * The value of the next variable,
- * idling_boosts_thr_without_issues, is equal to that of
+ * The return value of this function is equal to that of
* idling_boosts_thr, unless a special case holds. In this
* special case, described below, idling may cause problems to
* weight-raised queues.
@@ -3472,217 +3477,252 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
* which enqueue several requests in advance, and further
* reorder internally-queued requests.
*
- * For this reason, we force to false the value of
- * idling_boosts_thr_without_issues if there are weight-raised
- * busy queues. In this case, and if bfqq is not weight-raised,
- * this guarantees that the device is not idled for bfqq (if,
- * instead, bfqq is weight-raised, then idling will be
- * guaranteed by another variable, see below). Combined with
- * the timestamping rules of BFQ (see [1] for details), this
- * behavior causes bfqq, and hence any sync non-weight-raised
- * queue, to get a lower number of requests served, and thus
- * to ask for a lower number of requests from the request
- * pool, before the busy weight-raised queues get served
- * again. This often mitigates starvation problems in the
- * presence of heavy write workloads and NCQ, thereby
- * guaranteeing a higher application and system responsiveness
- * in these hostile scenarios.
+ * For this reason, we force to false the return value if
+ * there are weight-raised busy queues. In this case, and if
+ * bfqq is not weight-raised, this guarantees that the device
+ * is not idled for bfqq (if, instead, bfqq is weight-raised,
+ * then idling will be guaranteed by another variable, see
+ * below). Combined with the timestamping rules of BFQ (see
+ * [1] for details), this behavior causes bfqq, and hence any
+ * sync non-weight-raised queue, to get a lower number of
+ * requests served, and thus to ask for a lower number of
+ * requests from the request pool, before the busy
+ * weight-raised queues get served again. This often mitigates
+ * starvation problems in the presence of heavy write
+ * workloads and NCQ, thereby guaranteeing a higher
+ * application and system responsiveness in these hostile
+ * scenarios.
*/
- idling_boosts_thr_without_issues = idling_boosts_thr &&
+ return idling_boosts_thr &&
bfqd->wr_busy_queues == 0;
+}
- /*
- * There is then a case where idling must be performed not
- * for throughput concerns, but to preserve service
- * guarantees.
- *
- * To introduce this case, we can note that allowing the drive
- * to enqueue more than one request at a time, and hence
- * delegating de facto final scheduling decisions to the
- * drive's internal scheduler, entails loss of control on the
- * actual request service order. In particular, the critical
- * situation is when requests from different processes happen
- * to be present, at the same time, in the internal queue(s)
- * of the drive. In such a situation, the drive, by deciding
- * the service order of the internally-queued requests, does
- * determine also the actual throughput distribution among
- * these processes. But the drive typically has no notion or
- * concern about per-process throughput distribution, and
- * makes its decisions only on a per-request basis. Therefore,
- * the service distribution enforced by the drive's internal
- * scheduler is likely to coincide with the desired
- * device-throughput distribution only in a completely
- * symmetric scenario where:
- * (i) each of these processes must get the same throughput as
- * the others;
- * (ii) the I/O of each process has the same properties, in
- * terms of locality (sequential or random), direction
- * (reads or writes), request sizes, greediness
- * (from I/O-bound to sporadic), and so on.
- * In fact, in such a scenario, the drive tends to treat
- * the requests of each of these processes in about the same
- * way as the requests of the others, and thus to provide
- * each of these processes with about the same throughput
- * (which is exactly the desired throughput distribution). In
- * contrast, in any asymmetric scenario, device idling is
- * certainly needed to guarantee that bfqq receives its
- * assigned fraction of the device throughput (see [1] for
- * details).
- * The problem is that idling may significantly reduce
- * throughput with certain combinations of types of I/O and
- * devices. An important example is sync random I/O, on flash
- * storage with command queueing. So, unless bfqq falls in the
- * above cases where idling also boosts throughput, it would
- * be important to check conditions (i) and (ii) accurately,
- * so as to avoid idling when not strictly needed for service
- * guarantees.
- *
- * Unfortunately, it is extremely difficult to thoroughly
- * check condition (ii). And, in case there are active groups,
- * it becomes very difficult to check condition (i) too. In
- * fact, if there are active groups, then, for condition (i)
- * to become false, it is enough that an active group contains
- * more active processes or sub-groups than some other active
- * group. More precisely, for condition (i) to hold because of
- * such a group, it is not even necessary that the group is
- * (still) active: it is sufficient that, even if the group
- * has become inactive, some of its descendant processes still
- * have some request already dispatched but still waiting for
- * completion. In fact, requests have still to be guaranteed
- * their share of the throughput even after being
- * dispatched. In this respect, it is easy to show that, if a
- * group frequently becomes inactive while still having
- * in-flight requests, and if, when this happens, the group is
- * not considered in the calculation of whether the scenario
- * is asymmetric, then the group may fail to be guaranteed its
- * fair share of the throughput (basically because idling may
- * not be performed for the descendant processes of the group,
- * but it had to be). We address this issue with the
- * following bi-modal behavior, implemented in the function
- * bfq_symmetric_scenario().
- *
- * If there are groups with requests waiting for completion
- * (as commented above, some of these groups may even be
- * already inactive), then the scenario is tagged as
- * asymmetric, conservatively, without checking any of the
- * conditions (i) and (ii). So the device is idled for bfqq.
- * This behavior matches also the fact that groups are created
- * exactly if controlling I/O is a primary concern (to
- * preserve bandwidth and latency guarantees).
- *
- * On the opposite end, if there are no groups with requests
- * waiting for completion, then only condition (i) is actually
- * controlled, i.e., provided that condition (i) holds, idling
- * is not performed, regardless of whether condition (ii)
- * holds. In other words, only if condition (i) does not hold,
- * then idling is allowed, and the device tends to be
- * prevented from queueing many requests, possibly of several
- * processes. Since there are no groups with requests waiting
- * for completion, then, to control condition (i) it is enough
- * to check just whether all the queues with requests waiting
- * for completion also have the same weight.
- *
- * Not checking condition (ii) evidently exposes bfqq to the
- * risk of getting less throughput than its fair share.
- * However, for queues with the same weight, a further
- * mechanism, preemption, mitigates or even eliminates this
- * problem. And it does so without consequences on overall
- * throughput. This mechanism and its benefits are explained
- * in the next three paragraphs.
- *
- * Even if a queue, say Q, is expired when it remains idle, Q
- * can still preempt the new in-service queue if the next
- * request of Q arrives soon (see the comments on
- * bfq_bfqq_update_budg_for_activation). If all queues and
- * groups have the same weight, this form of preemption,
- * combined with the hole-recovery heuristic described in the
- * comments on function bfq_bfqq_update_budg_for_activation,
- * are enough to preserve a correct bandwidth distribution in
- * the mid term, even without idling. In fact, even if not
- * idling allows the internal queues of the device to contain
- * many requests, and thus to reorder requests, we can rather
- * safely assume that the internal scheduler still preserves a
- * minimum of mid-term fairness.
- *
- * More precisely, this preemption-based, idleless approach
- * provides fairness in terms of IOPS, and not sectors per
- * second. This can be seen with a simple example. Suppose
- * that there are two queues with the same weight, but that
- * the first queue receives requests of 8 sectors, while the
- * second queue receives requests of 1024 sectors. In
- * addition, suppose that each of the two queues contains at
- * most one request at a time, which implies that each queue
- * always remains idle after it is served. Finally, after
- * remaining idle, each queue receives very quickly a new
- * request. It follows that the two queues are served
- * alternatively, preempting each other if needed. This
- * implies that, although both queues have the same weight,
- * the queue with large requests receives a service that is
- * 1024/8 times as high as the service received by the other
- * queue.
- *
- * The motivation for using preemption instead of idling (for
- * queues with the same weight) is that, by not idling,
- * service guarantees are preserved (completely or at least in
- * part) without minimally sacrificing throughput. And, if
- * there is no active group, then the primary expectation for
- * this device is probably a high throughput.
- *
- * We are now left only with explaining the additional
- * compound condition that is checked below for deciding
- * whether the scenario is asymmetric. To explain this
- * compound condition, we need to add that the function
- * bfq_symmetric_scenario checks the weights of only
- * non-weight-raised queues, for efficiency reasons (see
- * comments on bfq_weights_tree_add()). Then the fact that
- * bfqq is weight-raised is checked explicitly here. More
- * precisely, the compound condition below takes into account
- * also the fact that, even if bfqq is being weight-raised,
- * the scenario is still symmetric if all queues with requests
- * waiting for completion happen to be
- * weight-raised. Actually, we should be even more precise
- * here, and differentiate between interactive weight raising
- * and soft real-time weight raising.
- *
- * As a side note, it is worth considering that the above
- * device-idling countermeasures may however fail in the
- * following unlucky scenario: if idling is (correctly)
- * disabled in a time period during which all symmetry
- * sub-conditions hold, and hence the device is allowed to
- * enqueue many requests, but at some later point in time some
- * sub-condition stops to hold, then it may become impossible
- * to let requests be served in the desired order until all
- * the requests already queued in the device have been served.
- */
- asymmetric_scenario = (bfqq->wr_coeff > 1 &&
- bfqd->wr_busy_queues < bfqd->busy_queues) ||
+/*
+ * There is a case where idling must be performed not for
+ * throughput concerns, but to preserve service guarantees.
+ *
+ * To introduce this case, we can note that allowing the drive
+ * to enqueue more than one request at a time, and hence
+ * delegating de facto final scheduling decisions to the
+ * drive's internal scheduler, entails loss of control on the
+ * actual request service order. In particular, the critical
+ * situation is when requests from different processes happen
+ * to be present, at the same time, in the internal queue(s)
+ * of the drive. In such a situation, the drive, by deciding
+ * the service order of the internally-queued requests, does
+ * determine also the actual throughput distribution among
+ * these processes. But the drive typically has no notion or
+ * concern about per-process throughput distribution, and
+ * makes its decisions only on a per-request basis. Therefore,
+ * the service distribution enforced by the drive's internal
+ * scheduler is likely to coincide with the desired
+ * device-throughput distribution only in a completely
+ * symmetric scenario where:
+ * (i) each of these processes must get the same throughput as
+ * the others;
+ * (ii) the I/O of each process has the same properties, in
+ * terms of locality (sequential or random), direction
+ * (reads or writes), request sizes, greediness
+ * (from I/O-bound to sporadic), and so on.
+ * In fact, in such a scenario, the drive tends to treat
+ * the requests of each of these processes in about the same
+ * way as the requests of the others, and thus to provide
+ * each of these processes with about the same throughput
+ * (which is exactly the desired throughput distribution). In
+ * contrast, in any asymmetric scenario, device idling is
+ * certainly needed to guarantee that bfqq receives its
+ * assigned fraction of the device throughput (see [1] for
+ * details).
+ * The problem is that idling may significantly reduce
+ * throughput with certain combinations of types of I/O and
+ * devices. An important example is sync random I/O, on flash
+ * storage with command queueing. So, unless bfqq falls in the
+ * above cases where idling also boosts throughput, it would
+ * be important to check conditions (i) and (ii) accurately,
+ * so as to avoid idling when not strictly needed for service
+ * guarantees.
+ *
+ * Unfortunately, it is extremely difficult to thoroughly
+ * check condition (ii). And, in case there are active groups,
+ * it becomes very difficult to check condition (i) too. In
+ * fact, if there are active groups, then, for condition (i)
+ * to become false, it is enough that an active group contains
+ * more active processes or sub-groups than some other active
+ * group. More precisely, for condition (i) to hold because of
+ * such a group, it is not even necessary that the group is
+ * (still) active: it is sufficient that, even if the group
+ * has become inactive, some of its descendant processes still
+ * have some request already dispatched but still waiting for
+ * completion. In fact, requests have still to be guaranteed
+ * their share of the throughput even after being
+ * dispatched. In this respect, it is easy to show that, if a
+ * group frequently becomes inactive while still having
+ * in-flight requests, and if, when this happens, the group is
+ * not considered in the calculation of whether the scenario
+ * is asymmetric, then the group may fail to be guaranteed its
+ * fair share of the throughput (basically because idling may
+ * not be performed for the descendant processes of the group,
+ * but it had to be). We address this issue with the
+ * following bi-modal behavior, implemented in the function
+ * bfq_symmetric_scenario().
+ *
+ * If there are groups with requests waiting for completion
+ * (as commented above, some of these groups may even be
+ * already inactive), then the scenario is tagged as
+ * asymmetric, conservatively, without checking any of the
+ * conditions (i) and (ii). So the device is idled for bfqq.
+ * This behavior matches also the fact that groups are created
+ * exactly if controlling I/O is a primary concern (to
+ * preserve bandwidth and latency guarantees).
+ *
+ * On the opposite end, if there are no groups with requests
+ * waiting for completion, then only condition (i) is actually
+ * controlled, i.e., provided that condition (i) holds, idling
+ * is not performed, regardless of whether condition (ii)
+ * holds. In other words, only if condition (i) does not hold,
+ * then idling is allowed, and the device tends to be
+ * prevented from queueing many requests, possibly of several
+ * processes. Since there are no groups with requests waiting
+ * for completion, then, to control condition (i) it is enough
+ * to check just whether all the queues with requests waiting
+ * for completion also have the same weight.
+ *
+ * Not checking condition (ii) evidently exposes bfqq to the
+ * risk of getting less throughput than its fair share.
+ * However, for queues with the same weight, a further
+ * mechanism, preemption, mitigates or even eliminates this
+ * problem. And it does so without consequences on overall
+ * throughput. This mechanism and its benefits are explained
+ * in the next three paragraphs.
+ *
+ * Even if a queue, say Q, is expired when it remains idle, Q
+ * can still preempt the new in-service queue if the next
+ * request of Q arrives soon (see the comments on
+ * bfq_bfqq_update_budg_for_activation). If all queues and
+ * groups have the same weight, this form of preemption,
+ * combined with the hole-recovery heuristic described in the
+ * comments on function bfq_bfqq_update_budg_for_activation,
+ * are enough to preserve a correct bandwidth distribution in
+ * the mid term, even without idling. In fact, even if not
+ * idling allows the internal queues of the device to contain
+ * many requests, and thus to reorder requests, we can rather
+ * safely assume that the internal scheduler still preserves a
+ * minimum of mid-term fairness.
+ *
+ * More precisely, this preemption-based, idleless approach
+ * provides fairness in terms of IOPS, and not sectors per
+ * second. This can be seen with a simple example. Suppose
+ * that there are two queues with the same weight, but that
+ * the first queue receives requests of 8 sectors, while the
+ * second queue receives requests of 1024 sectors. In
+ * addition, suppose that each of the two queues contains at
+ * most one request at a time, which implies that each queue
+ * always remains idle after it is served. Finally, after
+ * remaining idle, each queue receives very quickly a new
+ * request. It follows that the two queues are served
+ * alternatively, preempting each other if needed. This
+ * implies that, although both queues have the same weight,
+ * the queue with large requests receives a service that is
+ * 1024/8 times as high as the service received by the other
+ * queue.
+ *
+ * The motivation for using preemption instead of idling (for
+ * queues with the same weight) is that, by not idling,
+ * service guarantees are preserved (completely or at least in
+ * part) without minimally sacrificing throughput. And, if
+ * there is no active group, then the primary expectation for
+ * this device is probably a high throughput.
+ *
+ * We are now left only with explaining the additional
+ * compound condition that is checked below for deciding
+ * whether the scenario is asymmetric. To explain this
+ * compound condition, we need to add that the function
+ * bfq_symmetric_scenario checks the weights of only
+ * non-weight-raised queues, for efficiency reasons (see
+ * comments on bfq_weights_tree_add()). Then the fact that
+ * bfqq is weight-raised is checked explicitly here. More
+ * precisely, the compound condition below takes into account
+ * also the fact that, even if bfqq is being weight-raised,
+ * the scenario is still symmetric if all queues with requests
+ * waiting for completion happen to be
+ * weight-raised. Actually, we should be even more precise
+ * here, and differentiate between interactive weight raising
+ * and soft real-time weight raising.
+ *
+ * As a side note, it is worth considering that the above
+ * device-idling countermeasures may however fail in the
+ * following unlucky scenario: if idling is (correctly)
+ * disabled in a time period during which all symmetry
+ * sub-conditions hold, and hence the device is allowed to
+ * enqueue many requests, but at some later point in time some
+ * sub-condition stops to hold, then it may become impossible
+ * to let requests be served in the desired order until all
+ * the requests already queued in the device have been served.
+ */
+static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ return (bfqq->wr_coeff > 1 &&
+ bfqd->wr_busy_queues <
+ bfq_tot_busy_queues(bfqd)) ||
!bfq_symmetric_scenario(bfqd);
+}
+
+/*
+ * For a queue that becomes empty, device idling is allowed only if
+ * this function returns true for that queue. As a consequence, since
+ * device idling plays a critical role for both throughput boosting
+ * and service guarantees, the return value of this function plays a
+ * critical role as well.
+ *
+ * In a nutshell, this function returns true only if idling is
+ * beneficial for throughput or, even if detrimental for throughput,
+ * idling is however necessary to preserve service guarantees (low
+ * latency, desired throughput distribution, ...). In particular, on
+ * NCQ-capable devices, this function tries to return false, so as to
+ * help keep the drives' internal queues full, whenever this helps the
+ * device boost the throughput without causing any service-guarantee
+ * issue.
+ *
+ * Most of the issues taken into account to get the return value of
+ * this function are not trivial. We discuss these issues in the two
+ * functions providing the main pieces of information needed by this
+ * function.
+ */
+static bool bfq_better_to_idle(struct bfq_queue *bfqq)
+{
+ struct bfq_data *bfqd = bfqq->bfqd;
+ bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar;
+
+ if (unlikely(bfqd->strict_guarantees))
+ return true;
/*
- * Finally, there is a case where maximizing throughput is the
- * best choice even if it may cause unfairness toward
- * bfqq. Such a case is when bfqq became active in a burst of
- * queue activations. Queues that became active during a large
- * burst benefit only from throughput, as discussed in the
- * comments on bfq_handle_burst. Thus, if bfqq became active
- * in a burst and not idling the device maximizes throughput,
- * then the device must no be idled, because not idling the
- * device provides bfqq and all other queues in the burst with
- * maximum benefit. Combining this and the above case, we can
- * now establish when idling is actually needed to preserve
- * service guarantees.
+ * Idling is performed only if slice_idle > 0. In addition, we
+ * do not idle if
+ * (a) bfqq is async
+ * (b) bfqq is in the idle io prio class: in this case we do
+ * not idle because we want to minimize the bandwidth that
+ * queues in this class can steal to higher-priority queues
*/
- idling_needed_for_service_guarantees =
- asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
+ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
+ bfq_class_idle(bfqq))
+ return false;
+
+ idling_boosts_thr_with_no_issue =
+ idling_boosts_thr_without_issues(bfqd, bfqq);
+
+ idling_needed_for_service_guar =
+ idling_needed_for_service_guarantees(bfqd, bfqq);
/*
- * We have now all the components we need to compute the
+ * We have now the two components we need to compute the
* return value of the function, which is true only if idling
* either boosts the throughput (without issues), or is
* necessary to preserve service guarantees.
*/
- return idling_boosts_thr_without_issues ||
- idling_needed_for_service_guarantees;
+ return idling_boosts_thr_with_no_issue ||
+ idling_needed_for_service_guar;
}
/*
@@ -3934,7 +3974,7 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
* belongs to CLASS_IDLE and other queues are waiting for
* service.
*/
- if (!(bfqd->busy_queues > 1 && bfq_class_idle(bfqq)))
+ if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)))
goto return_rq;
bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
@@ -3952,7 +3992,7 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
* most a call to dispatch for nothing
*/
return !list_empty_careful(&bfqd->dispatch) ||
- bfqd->busy_queues > 0;
+ bfq_tot_busy_queues(bfqd) > 0;
}
static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
@@ -4006,9 +4046,10 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
goto start_rq;
}
- bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
+ bfq_log(bfqd, "dispatch requests: %d busy queues",
+ bfq_tot_busy_queues(bfqd));
- if (bfqd->busy_queues == 0)
+ if (bfq_tot_busy_queues(bfqd) == 0)
goto exit;
/*
@@ -4488,10 +4529,7 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct request *rq)
{
bfqq->seek_history <<= 1;
- bfqq->seek_history |=
- get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
- (!blk_queue_nonrot(bfqd->queue) ||
- blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
+ bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq);
}
static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
@@ -4560,28 +4598,31 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
/*
- * There is just this request queued: if the request
- * is small and the queue is not to be expired, then
- * just exit.
+ * There is just this request queued: if
+ * - the request is small, and
+ * - we are idling to boost throughput, and
+ * - the queue is not to be expired,
+ * then just exit.
*
* In this way, if the device is being idled to wait
* for a new request from the in-service queue, we
* avoid unplugging the device and committing the
- * device to serve just a small request. On the
- * contrary, we wait for the block layer to decide
- * when to unplug the device: hopefully, new requests
- * will be merged to this one quickly, then the device
- * will be unplugged and larger requests will be
- * dispatched.
+ * device to serve just a small request. In contrast
+ * we wait for the block layer to decide when to
+ * unplug the device: hopefully, new requests will be
+ * merged to this one quickly, then the device will be
+ * unplugged and larger requests will be dispatched.
*/
- if (small_req && !budget_timeout)
+ if (small_req && idling_boosts_thr_without_issues(bfqd, bfqq) &&
+ !budget_timeout)
return;
/*
- * A large enough request arrived, or the queue is to
- * be expired: in both cases disk idling is to be
- * stopped, so clear wait_request flag and reset
- * timer.
+ * A large enough request arrived, or idling is being
+ * performed to preserve service guarantees, or
+ * finally the queue is to be expired: in all these
+ * cases disk idling is to be stopped, so clear
+ * wait_request flag and reset timer.
*/
bfq_clear_bfqq_wait_request(bfqq);
hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
@@ -4607,8 +4648,6 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
bool waiting, idle_timer_disabled = false;
if (new_bfqq) {
- if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
- new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
/*
* Release the request's reference to the old bfqq
* and make sure one is taken to the shared queue.
@@ -4751,6 +4790,8 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
static void bfq_update_hw_tag(struct bfq_data *bfqd)
{
+ struct bfq_queue *bfqq = bfqd->in_service_queue;
+
bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
bfqd->rq_in_driver);
@@ -4763,7 +4804,18 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd)
* sum is not exact, as it's not taking into account deactivated
* requests.
*/
- if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
+ if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD)
+ return;
+
+ /*
+ * If active queue hasn't enough requests and can idle, bfq might not
+ * dispatch sufficient requests to hardware. Don't zero hw_tag in this
+ * case
+ */
+ if (bfqq && bfq_bfqq_has_short_ttime(bfqq) &&
+ bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] <
+ BFQ_HW_QUEUE_THRESHOLD &&
+ bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD)
return;
if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
@@ -4834,11 +4886,14 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
* isochronous, and both requisites for this condition to hold
* are now satisfied, then compute soft_rt_next_start (see the
* comments on the function bfq_bfqq_softrt_next_start()). We
- * schedule this delayed check when bfqq expires, if it still
- * has in-flight requests.
+ * do not compute soft_rt_next_start if bfqq is in interactive
+ * weight raising (see the comments in bfq_bfqq_expire() for
+ * an explanation). We schedule this delayed update when bfqq
+ * expires, if it still has in-flight requests.
*/
if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
- RB_EMPTY_ROOT(&bfqq->sort_list))
+ RB_EMPTY_ROOT(&bfqq->sort_list) &&
+ bfqq->wr_coeff != bfqd->bfq_wr_coeff)
bfqq->soft_rt_next_start =
bfq_bfqq_softrt_next_start(bfqd, bfqq);
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 0b02bf302de0..062e1c4787f4 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -501,10 +501,11 @@ struct bfq_data {
unsigned int num_groups_with_pending_reqs;
/*
- * Number of bfq_queues containing requests (including the
- * queue in service, even if it is idling).
+ * Per-class (RT, BE, IDLE) number of bfq_queues containing
+ * requests (including the queue in service, even if it is
+ * idling).
*/
- int busy_queues;
+ unsigned int busy_queues[3];
/* number of weight-raised busy @bfq_queues */
int wr_busy_queues;
/* number of queued requests */
@@ -537,6 +538,9 @@ struct bfq_data {
/* on-disk position of the last served request */
sector_t last_position;
+ /* position of the last served request for the in-service queue */
+ sector_t in_serv_last_pos;
+
/* time of last request completion (ns) */
u64 last_completion;
@@ -974,6 +978,7 @@ extern struct blkcg_policy blkcg_policy_bfq;
struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq);
struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
+unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd);
struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity);
struct bfq_entity *bfq_entity_of(struct rb_node *node);
unsigned short bfq_ioprio_to_weight(int ioprio);
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 72adbbe975d5..63311d1ff1ed 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -44,6 +44,12 @@ static unsigned int bfq_class_idx(struct bfq_entity *entity)
BFQ_DEFAULT_GRP_CLASS - 1;
}
+unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd)
+{
+ return bfqd->busy_queues[0] + bfqd->busy_queues[1] +
+ bfqd->busy_queues[2];
+}
+
static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
bool expiration);
@@ -1513,7 +1519,7 @@ struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
struct bfq_sched_data *sd;
struct bfq_queue *bfqq;
- if (bfqd->busy_queues == 0)
+ if (bfq_tot_busy_queues(bfqd) == 0)
return NULL;
/*
@@ -1665,10 +1671,7 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfq_clear_bfqq_busy(bfqq);
- bfqd->busy_queues--;
-
- if (!bfqq->dispatched)
- bfq_weights_tree_remove(bfqd, bfqq);
+ bfqd->busy_queues[bfqq->ioprio_class - 1]--;
if (bfqq->wr_coeff > 1)
bfqd->wr_busy_queues--;
@@ -1676,6 +1679,9 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqg_stats_update_dequeue(bfqq_group(bfqq));
bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
+
+ if (!bfqq->dispatched)
+ bfq_weights_tree_remove(bfqd, bfqq);
}
/*
@@ -1688,7 +1694,7 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_activate_bfqq(bfqd, bfqq);
bfq_mark_bfqq_busy(bfqq);
- bfqd->busy_queues++;
+ bfqd->busy_queues[bfqq->ioprio_class - 1]++;
if (!bfqq->dispatched)
if (bfqq->wr_coeff == 1)
diff --git a/block/bio.c b/block/bio.c
index 4db1008309ed..83a2dfa417ca 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -753,6 +753,8 @@ EXPORT_SYMBOL(bio_add_pc_page);
* @page: page to add
* @len: length of the data to add
* @off: offset of the data in @page
+ * @same_page: if %true only merge if the new data is in the same physical
+ * page as the last segment of the bio.
*
* Try to add the data at @page + @off to the last bvec of @bio. This is a
* a useful optimisation for file systems with a block size smaller than the
@@ -761,19 +763,25 @@ EXPORT_SYMBOL(bio_add_pc_page);
* Return %true on success or %false on failure.
*/
bool __bio_try_merge_page(struct bio *bio, struct page *page,
- unsigned int len, unsigned int off)
+ unsigned int len, unsigned int off, bool same_page)
{
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return false;
if (bio->bi_vcnt > 0) {
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+ phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) +
+ bv->bv_offset + bv->bv_len - 1;
+ phys_addr_t page_addr = page_to_phys(page);
- if (page == bv->bv_page && off == bv->bv_offset + bv->bv_len) {
- bv->bv_len += len;
- bio->bi_iter.bi_size += len;
- return true;
- }
+ if (vec_end_addr + 1 != page_addr + off)
+ return false;
+ if (same_page && (vec_end_addr & PAGE_MASK) != page_addr)
+ return false;
+
+ bv->bv_len += len;
+ bio->bi_iter.bi_size += len;
+ return true;
}
return false;
}
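
The updated kernel-doc above documents the new @same_page argument, and the bio_add_page() hunk just below passes false to keep the old behaviour. A sketch of a caller that only wants merges within one physical page; example_add_same_page() is hypothetical, while the helpers it calls are the ones shown in this diff:

static int example_add_same_page(struct bio *bio, struct page *page,
				 unsigned int len, unsigned int off)
{
	/* merge only if the data stays in the same physical page as the
	 * bio's last segment, otherwise fall back to adding a new bvec */
	if (__bio_try_merge_page(bio, page, len, off, true))
		return len;
	if (bio_full(bio))
		return 0;
	__bio_add_page(bio, page, len, off);
	return len;
}
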
@@ -819,7 +827,7 @@ EXPORT_SYMBOL_GPL(__bio_add_page);
int bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
{
- if (!__bio_try_merge_page(bio, page, len, offset)) {
+ if (!__bio_try_merge_page(bio, page, len, offset, false)) {
if (bio_full(bio))
return 0;
__bio_add_page(bio, page, len, offset);
@@ -1072,8 +1080,9 @@ static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
{
int i;
struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
ssize_t ret;
ret = copy_page_from_iter(bvec->bv_page,
@@ -1103,8 +1112,9 @@ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter)
{
int i;
struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
ssize_t ret;
ret = copy_page_to_iter(bvec->bv_page,
@@ -1126,8 +1136,9 @@ void bio_free_pages(struct bio *bio)
{
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
__free_page(bvec->bv_page);
}
EXPORT_SYMBOL(bio_free_pages);
@@ -1295,6 +1306,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
struct bio *bio;
int ret;
struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
if (!iov_iter_count(iter))
return ERR_PTR(-EINVAL);
@@ -1368,7 +1380,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
return bio;
out_unmap:
- bio_for_each_segment_all(bvec, bio, j) {
+ bio_for_each_segment_all(bvec, bio, j, iter_all) {
put_page(bvec->bv_page);
}
bio_put(bio);
@@ -1379,11 +1391,12 @@ static void __bio_unmap_user(struct bio *bio)
{
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
/*
* make sure we dirty pages we wrote to
*/
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
if (bio_data_dir(bio) == READ)
set_page_dirty_lock(bvec->bv_page);
@@ -1475,8 +1488,9 @@ static void bio_copy_kern_endio_read(struct bio *bio)
char *p = bio->bi_private;
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
memcpy(p, page_address(bvec->bv_page), bvec->bv_len);
p += bvec->bv_len;
}
@@ -1585,8 +1599,9 @@ void bio_set_pages_dirty(struct bio *bio)
{
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
if (!PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page);
}
@@ -1596,8 +1611,9 @@ static void bio_release_pages(struct bio *bio)
{
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
put_page(bvec->bv_page);
}
@@ -1644,8 +1660,9 @@ void bio_check_pages_dirty(struct bio *bio)
struct bio_vec *bvec;
unsigned long flags;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
goto defer;
}
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2bed5725aa03..77f37ef8ef06 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1269,7 +1269,7 @@ void blkcg_drain_queue(struct request_queue *q)
* blkcg_exit_queue - exit and release blkcg part of request_queue
* @q: request_queue being released
*
- * Called from blk_release_queue(). Responsible for exiting blkcg part.
+ * Called from blk_exit_queue(). Responsible for exiting blkcg part.
*/
void blkcg_exit_queue(struct request_queue *q)
{
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 71e9ac03f621..22467f475ab4 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -161,6 +161,73 @@ static inline unsigned get_max_io_size(struct request_queue *q,
return sectors;
}
+static unsigned get_max_segment_size(struct request_queue *q,
+ unsigned offset)
+{
+ unsigned long mask = queue_segment_boundary(q);
+
+ /* default segment boundary mask means no boundary limit */
+ if (mask == BLK_SEG_BOUNDARY_MASK)
+ return queue_max_segment_size(q);
+
+ return min_t(unsigned long, mask - (mask & offset) + 1,
+ queue_max_segment_size(q));
+}
+
+/*
+ * Split the bvec @bv into segments, and update all kinds of
+ * variables.
+ */
+static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
+ unsigned *nsegs, unsigned *last_seg_size,
+ unsigned *front_seg_size, unsigned *sectors)
+{
+ unsigned len = bv->bv_len;
+ unsigned total_len = 0;
+ unsigned new_nsegs = 0, seg_size = 0;
+
+ /*
+ * Multi-page bvec may be too big to hold in one segment, so the
+ * current bvec has to be splitted as multiple segments.
+ */
+ while (len && new_nsegs + *nsegs < queue_max_segments(q)) {
+ seg_size = get_max_segment_size(q, bv->bv_offset + total_len);
+ seg_size = min(seg_size, len);
+
+ new_nsegs++;
+ total_len += seg_size;
+ len -= seg_size;
+
+ if ((bv->bv_offset + total_len) & queue_virt_boundary(q))
+ break;
+ }
+
+ if (!new_nsegs)
+ return !!len;
+
+ /* update front segment size */
+ if (!*nsegs) {
+ unsigned first_seg_size;
+
+ if (new_nsegs == 1)
+ first_seg_size = get_max_segment_size(q, bv->bv_offset);
+ else
+ first_seg_size = queue_max_segment_size(q);
+
+ if (*front_seg_size < first_seg_size)
+ *front_seg_size = first_seg_size;
+ }
+
+ /* update other varibles */
+ *last_seg_size = seg_size;
+ *nsegs += new_nsegs;
+ if (sectors)
+ *sectors += total_len >> 9;
+
+ /* split in the middle of the bvec if len != 0 */
+ return !!len;
+}
+
static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio *bio,
struct bio_set *bs,
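
A standalone model of the splitting arithmetic in get_max_segment_size()/bvec_split_segs() above, assuming no virt boundary and no segment-count limit; split_model() is a worked example, not kernel code:

#include <stdio.h>

/* Walk a multi-page bvec of 'len' bytes in chunks bounded by the queue's
 * maximum segment size and count how many segments it produces. */
static unsigned int split_model(unsigned int len, unsigned int max_seg)
{
	unsigned int nsegs = 0;

	while (len) {
		unsigned int seg = len < max_seg ? len : max_seg;

		nsegs++;
		len -= seg;
	}
	return nsegs;
}

int main(void)
{
	/* e.g. a 16 KiB bvec with an 8 KiB segment limit splits into 2 segments */
	printf("%u\n", split_model(16 << 10, 8 << 10));
	return 0;
}
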
@@ -174,7 +241,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio *new = NULL;
const unsigned max_sectors = get_max_io_size(q, bio);
- bio_for_each_segment(bv, bio, iter) {
+ bio_for_each_bvec(bv, bio, iter) {
/*
* If the queue doesn't support SG gaps and adding this
* offset would create a gap, disallow it.
@@ -189,8 +256,12 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
*/
if (nsegs < queue_max_segments(q) &&
sectors < max_sectors) {
- nsegs++;
- sectors = max_sectors;
+ /* split in the middle of bvec */
+ bv.bv_len = (max_sectors - sectors) << 9;
+ bvec_split_segs(q, &bv, &nsegs,
+ &seg_size,
+ &front_seg_size,
+ &sectors);
}
goto split;
}
@@ -206,21 +277,28 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
bvprvp = &bvprv;
sectors += bv.bv_len >> 9;
+ if (nsegs == 1 && seg_size > front_seg_size)
+ front_seg_size = seg_size;
+
continue;
}
new_segment:
if (nsegs == queue_max_segments(q))
goto split;
- if (nsegs == 1 && seg_size > front_seg_size)
- front_seg_size = seg_size;
-
- nsegs++;
bvprv = bv;
bvprvp = &bvprv;
- seg_size = bv.bv_len;
- sectors += bv.bv_len >> 9;
+ if (bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
+ nsegs++;
+ seg_size = bv.bv_len;
+ sectors += bv.bv_len >> 9;
+ if (nsegs == 1 && seg_size > front_seg_size)
+ front_seg_size = seg_size;
+ } else if (bvec_split_segs(q, &bv, &nsegs, &seg_size,
+ &front_seg_size, &sectors)) {
+ goto split;
+ }
}
do_split = false;
@@ -233,8 +311,6 @@ split:
bio = new;
}
- if (nsegs == 1 && seg_size > front_seg_size)
- front_seg_size = seg_size;
bio->bi_seg_front_size = front_seg_size;
if (seg_size > bio->bi_seg_back_size)
bio->bi_seg_back_size = seg_size;
@@ -291,18 +367,20 @@ void blk_queue_split(struct request_queue *q, struct bio **bio)
EXPORT_SYMBOL(blk_queue_split);
static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
- struct bio *bio,
- bool no_sg_merge)
+ struct bio *bio)
{
struct bio_vec bv, bvprv = { NULL };
int prev = 0;
unsigned int seg_size, nr_phys_segs;
+ unsigned front_seg_size;
struct bio *fbio, *bbio;
struct bvec_iter iter;
if (!bio)
return 0;
+ front_seg_size = bio->bi_seg_front_size;
+
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
@@ -316,14 +394,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
seg_size = 0;
nr_phys_segs = 0;
for_each_bio(bio) {
- bio_for_each_segment(bv, bio, iter) {
- /*
- * If SG merging is disabled, each bio vector is
- * a segment
- */
- if (no_sg_merge)
- goto new_segment;
-
+ bio_for_each_bvec(bv, bio, iter) {
if (prev) {
if (seg_size + bv.bv_len
> queue_max_segment_size(q))
@@ -333,23 +404,23 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
seg_size += bv.bv_len;
bvprv = bv;
+
+ if (nr_phys_segs == 1 && seg_size >
+ front_seg_size)
+ front_seg_size = seg_size;
+
continue;
}
new_segment:
- if (nr_phys_segs == 1 && seg_size >
- fbio->bi_seg_front_size)
- fbio->bi_seg_front_size = seg_size;
-
- nr_phys_segs++;
bvprv = bv;
prev = 1;
- seg_size = bv.bv_len;
+ bvec_split_segs(q, &bv, &nr_phys_segs, &seg_size,
+ &front_seg_size, NULL);
}
bbio = bio;
}
- if (nr_phys_segs == 1 && seg_size > fbio->bi_seg_front_size)
- fbio->bi_seg_front_size = seg_size;
+ fbio->bi_seg_front_size = front_seg_size;
if (seg_size > bbio->bi_seg_back_size)
bbio->bi_seg_back_size = seg_size;
@@ -358,33 +429,16 @@ new_segment:
void blk_recalc_rq_segments(struct request *rq)
{
- bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE,
- &rq->q->queue_flags);
-
- rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio,
- no_sg_merge);
+ rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio);
}
void blk_recount_segments(struct request_queue *q, struct bio *bio)
{
- unsigned short seg_cnt;
-
- /* estimate segment number by bi_vcnt for non-cloned bio */
- if (bio_flagged(bio, BIO_CLONED))
- seg_cnt = bio_segments(bio);
- else
- seg_cnt = bio->bi_vcnt;
+ struct bio *nxt = bio->bi_next;
- if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) &&
- (seg_cnt < queue_max_segments(q)))
- bio->bi_phys_segments = seg_cnt;
- else {
- struct bio *nxt = bio->bi_next;
-
- bio->bi_next = NULL;
- bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, false);
- bio->bi_next = nxt;
- }
+ bio->bi_next = NULL;
+ bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
+ bio->bi_next = nxt;
bio_set_flag(bio, BIO_SEG_VALID);
}
@@ -407,6 +461,54 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
return biovec_phys_mergeable(q, &end_bv, &nxt_bv);
}
+static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
+ struct scatterlist *sglist)
+{
+ if (!*sg)
+ return sglist;
+
+ /*
+ * If the driver previously mapped a shorter list, we could see a
+ * termination bit prematurely unless it fully inits the sg table
+ * on each mapping. We KNOW that there must be more entries here
+ * or the driver would be buggy, so force clear the termination bit
+ * to avoid doing a full sg_init_table() in drivers for each command.
+ */
+ sg_unmark_end(*sg);
+ return sg_next(*sg);
+}
+
+static unsigned blk_bvec_map_sg(struct request_queue *q,
+ struct bio_vec *bvec, struct scatterlist *sglist,
+ struct scatterlist **sg)
+{
+ unsigned nbytes = bvec->bv_len;
+ unsigned nsegs = 0, total = 0, offset = 0;
+
+ while (nbytes > 0) {
+ unsigned seg_size;
+ struct page *pg;
+ unsigned idx;
+
+ *sg = blk_next_sg(sg, sglist);
+
+ seg_size = get_max_segment_size(q, bvec->bv_offset + total);
+ seg_size = min(nbytes, seg_size);
+
+ offset = (total + bvec->bv_offset) % PAGE_SIZE;
+ idx = (total + bvec->bv_offset) / PAGE_SIZE;
+ pg = bvec_nth_page(bvec->bv_page, idx);
+
+ sg_set_page(*sg, pg, seg_size, offset);
+
+ total += seg_size;
+ nbytes -= seg_size;
+ nsegs++;
+ }
+
+ return nsegs;
+}
+
static inline void
__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
struct scatterlist *sglist, struct bio_vec *bvprv,
@@ -424,25 +526,12 @@ __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
(*sg)->length += nbytes;
} else {
new_segment:
- if (!*sg)
- *sg = sglist;
- else {
- /*
- * If the driver previously mapped a shorter
- * list, we could see a termination bit
- * prematurely unless it fully inits the sg
- * table on each mapping. We KNOW that there
- * must be more entries here or the driver
- * would be buggy, so force clear the
- * termination bit to avoid doing a full
- * sg_init_table() in drivers for each command.
- */
- sg_unmark_end(*sg);
- *sg = sg_next(*sg);
- }
-
- sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
- (*nsegs)++;
+ if (bvec->bv_offset + bvec->bv_len <= PAGE_SIZE) {
+ *sg = blk_next_sg(sg, sglist);
+ sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
+ (*nsegs) += 1;
+ } else
+ (*nsegs) += blk_bvec_map_sg(q, bvec, sglist, sg);
}
*bvprv = *bvec;
}
@@ -464,7 +553,7 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
int nsegs = 0;
for_each_bio(bio)
- bio_for_each_segment(bvec, bio, iter)
+ bio_for_each_bvec(bvec, bio, iter)
__blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg,
&nsegs);
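
For readers following the blk-merge.c changes above, here is a minimal standalone sketch of the arithmetic behind get_max_segment_size() and the bvec_split_segs() loop: each step takes the room left before the next segment boundary, caps it at the maximum segment size, and advances. The queue limits and the sample offset/length below are invented for illustration and are not taken from the patch.

/*
 * Sketch only: SEG_BOUNDARY_MASK and MAX_SEGMENT_SIZE stand in for the
 * per-queue limits normally read from struct queue_limits.
 */
#include <stdio.h>

#define SEG_BOUNDARY_MASK 0xffffUL   /* assumed 64KB segment boundary */
#define MAX_SEGMENT_SIZE  0x10000U   /* assumed 64KB max segment size */

static unsigned max_seg(unsigned long offset)
{
	unsigned long mask = SEG_BOUNDARY_MASK;
	/* bytes left before the next boundary, capped by the size limit */
	unsigned long room = mask - (mask & offset) + 1;

	return room < MAX_SEGMENT_SIZE ? room : MAX_SEGMENT_SIZE;
}

int main(void)
{
	unsigned offset = 0x1000, len = 3 * 0x10000, nsegs = 0, total = 0;

	while (len) {
		unsigned seg = max_seg(offset + total);

		if (seg > len)
			seg = len;
		nsegs++;
		total += seg;
		len -= seg;
	}
	printf("split into %u segments, %u bytes\n", nsegs, total);
	return 0;
}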
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 7921573aebbc..bac34b72b33b 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -128,11 +128,9 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(SAME_FORCE),
QUEUE_FLAG_NAME(DEAD),
QUEUE_FLAG_NAME(INIT_DONE),
- QUEUE_FLAG_NAME(NO_SG_MERGE),
QUEUE_FLAG_NAME(POLL),
QUEUE_FLAG_NAME(WC),
QUEUE_FLAG_NAME(FUA),
- QUEUE_FLAG_NAME(FLUSH_NQ),
QUEUE_FLAG_NAME(DAX),
QUEUE_FLAG_NAME(STATS),
QUEUE_FLAG_NAME(POLL_STATS),
@@ -251,7 +249,6 @@ static const char *const alloc_policy_name[] = {
static const char *const hctx_flag_name[] = {
HCTX_FLAG_NAME(SHOULD_MERGE),
HCTX_FLAG_NAME(TAG_SHARED),
- HCTX_FLAG_NAME(SG_MERGE),
HCTX_FLAG_NAME(BLOCKING),
HCTX_FLAG_NAME(NO_SCHED),
};
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 140933e4a7d1..40905539afed 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -321,7 +321,7 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{
struct elevator_queue *e = q->elevator;
struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx->cpu);
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
bool ret = false;
enum hctx_type type;
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 2089c6c62f44..a4931fc7be8a 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -170,7 +170,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
data->ctx = blk_mq_get_ctx(data->q);
data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
- data->ctx->cpu);
+ data->ctx);
tags = blk_mq_tags_from_data(data);
if (data->flags & BLK_MQ_REQ_RESERVED)
bt = &tags->breserved_tags;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9437a5eb07cf..4e502db8b10c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -364,7 +364,7 @@ static struct request *blk_mq_get_request(struct request_queue *q,
}
if (likely(!data->hctx))
data->hctx = blk_mq_map_queue(q, data->cmd_flags,
- data->ctx->cpu);
+ data->ctx);
if (data->cmd_flags & REQ_NOWAIT)
data->flags |= BLK_MQ_REQ_NOWAIT;
@@ -2069,7 +2069,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
struct blk_mq_tags *tags;
int node;
- node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;
@@ -2125,7 +2125,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
size_t rq_size, left;
int node;
- node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;
@@ -2424,7 +2424,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
* If the cpu isn't present, the cpu is mapped to first hctx.
*/
for_each_possible_cpu(i) {
- hctx_idx = set->map[0].mq_map[i];
+ hctx_idx = set->map[HCTX_TYPE_DEFAULT].mq_map[i];
/* unmapped hw queue can be remapped after CPU topo changed */
if (!set->tags[hctx_idx] &&
!__blk_mq_alloc_rq_map(set, hctx_idx)) {
@@ -2434,16 +2434,19 @@ static void blk_mq_map_swqueue(struct request_queue *q)
* case, remap the current ctx to hctx[0] which
* is guaranteed to always have tags allocated
*/
- set->map[0].mq_map[i] = 0;
+ set->map[HCTX_TYPE_DEFAULT].mq_map[i] = 0;
}
ctx = per_cpu_ptr(q->queue_ctx, i);
for (j = 0; j < set->nr_maps; j++) {
- if (!set->map[j].nr_queues)
+ if (!set->map[j].nr_queues) {
+ ctx->hctxs[j] = blk_mq_map_queue_type(q,
+ HCTX_TYPE_DEFAULT, i);
continue;
+ }
hctx = blk_mq_map_queue_type(q, j, i);
-
+ ctx->hctxs[j] = hctx;
/*
* If the CPU is already set in the mask, then we've
* mapped this one already. This can happen if
@@ -2463,6 +2466,10 @@ static void blk_mq_map_swqueue(struct request_queue *q)
*/
BUG_ON(!hctx->nr_ctx);
}
+
+ for (; j < HCTX_MAX_TYPES; j++)
+ ctx->hctxs[j] = blk_mq_map_queue_type(q,
+ HCTX_TYPE_DEFAULT, i);
}
mutex_unlock(&q->sysfs_lock);
@@ -2734,7 +2741,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
int node;
struct blk_mq_hw_ctx *hctx;
- node = blk_mq_hw_queue_to_node(&set->map[0], i);
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
/*
* If the hw queue has been mapped to another numa node,
* we need to realloc the hctx. If allocation fails, fallback
@@ -2838,9 +2845,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
set->map[HCTX_TYPE_POLL].nr_queues)
blk_queue_flag_set(QUEUE_FLAG_POLL, q);
- if (!(set->flags & BLK_MQ_F_SG_MERGE))
- blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
-
q->sg_reserved_size = INT_MAX;
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
@@ -2968,7 +2972,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
return set->ops->map_queues(set);
} else {
BUG_ON(set->nr_maps > 1);
- return blk_mq_map_queues(&set->map[0]);
+ return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
}
}
@@ -3090,6 +3094,9 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
if (!set)
return -EINVAL;
+ if (q->nr_requests == nr)
+ return 0;
+
blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
@@ -3235,7 +3242,7 @@ fallback:
pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
nr_hw_queues, prev_nr_hw_queues);
set->nr_hw_queues = prev_nr_hw_queues;
- blk_mq_map_queues(&set->map[0]);
+ blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
goto fallback;
}
blk_mq_map_swqueue(q);
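
The blk_mq_map_swqueue() hunk above now fills a per-CPU cache of hardware-queue pointers, one slot per map type, falling back to the default map when a type has no queues. A rough sketch of that fill pattern follows; nr_maps, map_has_queues() and map_queue_type() are stand-in names for the corresponding set->nr_maps, set->map[j].nr_queues and blk_mq_map_queue_type() in the kernel.

/*
 * Sketch of the ctx->hctxs[] fill done in blk_mq_map_swqueue() above.
 * The callbacks are invented for illustration only.
 */
enum hctx_type { HCTX_TYPE_DEFAULT, HCTX_TYPE_READ, HCTX_TYPE_POLL, HCTX_MAX_TYPES };

struct hw_ctx { int idx; };
struct sw_ctx { struct hw_ctx *hctxs[HCTX_MAX_TYPES]; };

static void fill_ctx_hctxs(struct sw_ctx *ctx, unsigned int cpu,
			   unsigned int nr_maps,
			   int (*map_has_queues)(unsigned int type),
			   struct hw_ctx *(*map_queue_type)(unsigned int type,
							    unsigned int cpu))
{
	unsigned int j;

	for (j = 0; j < nr_maps; j++) {
		/* maps without queues fall back to the default map */
		if (!map_has_queues(j)) {
			ctx->hctxs[j] = map_queue_type(HCTX_TYPE_DEFAULT, cpu);
			continue;
		}
		ctx->hctxs[j] = map_queue_type(j, cpu);
	}
	/* unused slots also point at the default hardware queue */
	for (; j < HCTX_MAX_TYPES; j++)
		ctx->hctxs[j] = map_queue_type(HCTX_TYPE_DEFAULT, cpu);
}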
diff --git a/block/blk-mq.h b/block/blk-mq.h
index d0b3dd54ef8d..c11353a3749d 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -23,6 +23,7 @@ struct blk_mq_ctx {
unsigned int cpu;
unsigned short index_hw[HCTX_MAX_TYPES];
+ struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES];
/* incremented at dispatch time */
unsigned long rq_dispatched[2];
@@ -96,26 +97,23 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *
* blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
* @q: request queue
* @flags: request command flags
- * @cpu: CPU
+ * @ctx: software queue context
*/
static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
unsigned int flags,
- unsigned int cpu)
+ struct blk_mq_ctx *ctx)
{
enum hctx_type type = HCTX_TYPE_DEFAULT;
- if ((flags & REQ_HIPRI) &&
- q->tag_set->nr_maps > HCTX_TYPE_POLL &&
- q->tag_set->map[HCTX_TYPE_POLL].nr_queues &&
- test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+ /*
+ * The caller ensures that if REQ_HIPRI is set, polling is enabled.
+ */
+ if (flags & REQ_HIPRI)
type = HCTX_TYPE_POLL;
-
- else if (((flags & REQ_OP_MASK) == REQ_OP_READ) &&
- q->tag_set->nr_maps > HCTX_TYPE_READ &&
- q->tag_set->map[HCTX_TYPE_READ].nr_queues)
+ else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
type = HCTX_TYPE_READ;
- return blk_mq_map_queue_type(q, type, cpu);
+ return ctx->hctxs[type];
}
/*
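
With the cache in place, blk_mq_map_queue() above becomes a flag check plus a direct array lookup. The sketch below mirrors that fast path; the SKETCH_REQ_* flag values are simplified stand-ins, not the real REQ_* encoding.

/*
 * Sketch of the simplified mapping fast path: pick a type from the
 * command flags, then index the per-ctx cache built at map time.
 */
#define SKETCH_REQ_OP_READ  0u
#define SKETCH_REQ_OP_MASK  0xffu
#define SKETCH_REQ_HIPRI    (1u << 8)

enum hctx_type { HCTX_TYPE_DEFAULT, HCTX_TYPE_READ, HCTX_TYPE_POLL, HCTX_MAX_TYPES };

struct hw_ctx { int idx; };
struct sw_ctx { struct hw_ctx *hctxs[HCTX_MAX_TYPES]; };

static struct hw_ctx *map_queue(struct sw_ctx *ctx, unsigned int flags)
{
	enum hctx_type type = HCTX_TYPE_DEFAULT;

	if (flags & SKETCH_REQ_HIPRI)		/* caller guarantees polling is enabled */
		type = HCTX_TYPE_POLL;
	else if ((flags & SKETCH_REQ_OP_MASK) == SKETCH_REQ_OP_READ)
		type = HCTX_TYPE_READ;

	return ctx->hctxs[type];		/* no per-call map recomputation */
}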
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 3e7038e475ee..6375afaedcec 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -799,15 +799,6 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
}
EXPORT_SYMBOL(blk_queue_update_dma_alignment);
-void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
-{
- if (queueable)
- blk_queue_flag_clear(QUEUE_FLAG_FLUSH_NQ, q);
- else
- blk_queue_flag_set(QUEUE_FLAG_FLUSH_NQ, q);
-}
-EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
-
/**
* blk_set_queue_depth - tell the block layer about the device queue depth
* @q: the request queue for the device
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 590d1ef2f961..59685918167e 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -468,6 +468,9 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
else if (val >= 0)
val *= 1000ULL;
+ if (wbt_get_min_lat(q) == val)
+ return count;
+
/*
* Ensure that the queue is idled, in case the latency update
* ends up either enabling or disabling wbt completely. We can't
@@ -817,21 +820,16 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
}
/**
- * __blk_release_queue - release a request queue when it is no longer needed
+ * __blk_release_queue - release a request queue
* @work: pointer to the release_work member of the request queue to be released
*
* Description:
- * blk_release_queue is the counterpart of blk_init_queue(). It should be
- * called when a request queue is being released; typically when a block
- * device is being de-registered. Its primary task it to free the queue
- * itself.
- *
- * Notes:
- * The low level driver must have finished any outstanding requests first
- * via blk_cleanup_queue().
- *
- * Although blk_release_queue() may be called with preemption disabled,
- * __blk_release_queue() may sleep.
+ * This function is called when a block device is being unregistered. The
+ * process of releasing a request queue starts with blk_cleanup_queue, which
+ * sets the appropriate flags and then calls blk_put_queue, which decrements
+ * the reference counter of the request queue. Once the reference counter
+ * of the request queue reaches zero, blk_release_queue is called to release
+ * all allocated resources of the request queue.
*/
static void __blk_release_queue(struct work_struct *work)
{
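
Both queue_wb_lat_store() above and blk_mq_update_nr_requests() in blk-mq.c now bail out early when the written value matches the current one, so an unchanged sysfs write no longer freezes or quiesces the queue. A minimal sketch of the pattern, using hypothetical freeze_queue()/unfreeze_queue() helpers:

/*
 * Sketch of the "skip the expensive path when nothing changes" pattern.
 * struct queue and the helpers are stand-ins invented for illustration.
 */
struct queue { unsigned int nr_requests; };

static void freeze_queue(struct queue *q)   { (void)q; /* drain in-flight I/O */ }
static void unfreeze_queue(struct queue *q) { (void)q; /* resume I/O */ }

static int set_nr_requests(struct queue *q, unsigned int nr)
{
	/* a write that matches the current value is a no-op */
	if (q->nr_requests == nr)
		return 0;

	freeze_queue(q);
	q->nr_requests = nr;
	unfreeze_queue(q);
	return 0;
}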
diff --git a/block/blk.h b/block/blk.h
index 848278c52030..5d636ee41663 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -38,7 +38,7 @@ extern struct ida blk_queue_ida;
static inline struct blk_flush_queue *
blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
{
- return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx->cpu)->fq;
+ return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
}
static inline void __blk_get_queue(struct request_queue *q)
diff --git a/block/bounce.c b/block/bounce.c
index ffb9e9ecfa7e..47eb7e936e22 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -165,11 +165,12 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool)
struct bio_vec *bvec, orig_vec;
int i;
struct bvec_iter orig_iter = bio_orig->bi_iter;
+ struct bvec_iter_all iter_all;
/*
* free up bounce indirect pages used
*/
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
orig_vec = bio_iter_iovec(bio_orig, orig_iter);
if (bvec->bv_page != orig_vec.bv_page) {
dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
@@ -313,7 +314,12 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL :
&bounce_bio_set);
- bio_for_each_segment_all(to, bio, i) {
+ /*
+ * The bvec table can't be updated by bio_for_each_segment_all(),
+ * so retrieve the bvecs from the table directly. This is safe
+ * because the 'bio' only contains single-page bvecs.
+ */
+ for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) {
struct page *page = to->bv_page;
if (page_to_pfn(page) <= q->limits.bounce_pfn)
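
The bounce hunk above walks bio->bi_io_vec by hand because the loop rewrites bvec entries while iterating, which bio_for_each_segment_all() no longer permits once multi-page bvecs exist. A simplified sketch of walking and updating such a table in place; struct bvec and replace_pages() are stand-ins invented for illustration:

/*
 * Sketch: iterate a single-page bvec table directly and swap pages
 * in place, as __blk_queue_bounce() now does.
 */
struct bvec {
	void        *bv_page;	/* stand-in for struct page * */
	unsigned int bv_len;
	unsigned int bv_offset;
};

static void replace_pages(struct bvec *table, unsigned short vcnt,
			  void *(*alloc_bounce_page)(void))
{
	struct bvec *to;
	unsigned short i;

	/* safe to modify entries in place while walking the table */
	for (i = 0, to = table; i < vcnt; to++, i++)
		to->bv_page = alloc_bounce_page();
}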
diff --git a/block/elevator.c b/block/elevator.c
index f05e90d4e695..d6d835a08de6 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -667,8 +667,11 @@ static int __elevator_change(struct request_queue *q, const char *name)
/*
* Special case for mq, turn off scheduling
*/
- if (!strncmp(name, "none", 4))
+ if (!strncmp(name, "none", 4)) {
+ if (!q->elevator)
+ return 0;
return elevator_switch(q, NULL);
+ }
strlcpy(elevator_name, name, sizeof(elevator_name));
e = elevator_get(q, strstrip(elevator_name), true);
diff --git a/block/genhd.c b/block/genhd.c
index 1dd8fd6613b8..703267865f14 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -365,8 +365,8 @@ int register_blkdev(unsigned int major, const char *name)
}
if (index == 0) {
- printk("register_blkdev: failed to get major for %s\n",
- name);
+ printk("%s: failed to get major for %s\n",
+ __func__, name);
ret = -EBUSY;
goto out;
}
@@ -375,8 +375,8 @@ int register_blkdev(unsigned int major, const char *name)
}
if (major >= BLKDEV_MAJOR_MAX) {
- pr_err("register_blkdev: major requested (%u) is greater than the maximum (%u) for %s\n",
- major, BLKDEV_MAJOR_MAX-1, name);
+ pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n",
+ __func__, major, BLKDEV_MAJOR_MAX-1, name);
ret = -EINVAL;
goto out;
@@ -655,10 +655,12 @@ exit:
kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
disk_part_iter_exit(&piter);
- err = sysfs_create_link(&ddev->kobj,
- &disk->queue->backing_dev_info->dev->kobj,
- "bdi");
- WARN_ON(err);
+ if (disk->queue->backing_dev_info->dev) {
+ err = sysfs_create_link(&ddev->kobj,
+ &disk->queue->backing_dev_info->dev->kobj,
+ "bdi");
+ WARN_ON(err);
+ }
}
/**
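
The register_disk() change above only creates the "bdi" sysfs link when the backing_dev_info has a registered device. A small sketch of that guard, with hypothetical create_link() and struct definitions:

/*
 * Sketch: skip the sysfs link when the bdi has no registered device.
 * All names here are illustrative stand-ins.
 */
struct device { int id; };
struct bdi { struct device *dev; };

static int create_link(struct device *dev, const char *name)
{
	(void)dev; (void)name;
	return 0;	/* pretend the link was created */
}

static int link_bdi(struct bdi *bdi)
{
	/* a bdi that was never registered has no device to link to */
	if (!bdi->dev)
		return 0;

	return create_link(bdi->dev, "bdi");	/* the kernel WARN_ON()s on failure */
}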