101 files changed, 1945 insertions(+), 1394 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 184193bcb262..caf36105a1c7 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1857,8 +1857,10 @@ following two functions. wbc_init_bio(@wbc, @bio) Should be called for each bio carrying writeback data and - associates the bio with the inode's owner cgroup. Can be - called anytime between bio allocation and submission. + associates the bio with the inode's owner cgroup and the + corresponding request queue. This must be called after + a queue (device) has been associated with the bio and + before submission. wbc_account_io(@wbc, @page, @bytes) Should be called for each data segment being written out. @@ -1877,7 +1879,7 @@ the configuration, the bio may be executed at a lower priority and if the writeback session is holding shared resources, e.g. a journal entry, may lead to priority inversion. There is no one easy solution for the problem. Filesystems can try to work around specific problem -cases by skipping wbc_init_bio() or using bio_associate_blkcg() +cases by skipping wbc_init_bio() or using bio_associate_create_blkg() directly. diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h index 2cfbc531f63b..6b51826ab3d1 100644 --- a/arch/arm/include/asm/io.h +++ b/arch/arm/include/asm/io.h @@ -28,7 +28,6 @@ #include <asm/byteorder.h> #include <asm/memory.h> #include <asm-generic/pci_iomap.h> -#include <xen/xen.h> /* * ISA I/O bus memory addresses are 1:1 with the physical address. @@ -459,20 +458,6 @@ extern void pci_iounmap(struct pci_dev *dev, void __iomem *addr); #include <asm-generic/io.h> -/* - * can the hardware map this into one segment or not, given no other - * constraints. - */ -#define BIOVEC_MERGEABLE(vec1, vec2) \ - ((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) - -struct bio_vec; -extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, - const struct bio_vec *vec2); -#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ - (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \ - (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2))) - #ifdef CONFIG_MMU #define ARCH_HAS_VALID_PHYS_ADDR_RANGE extern int valid_phys_addr_range(phys_addr_t addr, size_t size); diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 35b2e50f17fb..9f8b915af3a7 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -31,8 +31,6 @@ #include <asm/alternative.h> #include <asm/cpufeature.h> -#include <xen/xen.h> - /* * Generic IO read/write. These perform native-endian accesses. 
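For reference, a minimal sketch of the calling order that the cgroup-v2.rst hunk above now documents for wbc_init_bio(): the bio must already have its device (and hence request queue) set before the call, and the call must happen before submission. This is illustrative code, not part of this patch; example_writeback_submit() and the single-page layout are assumptions.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>

/* illustrative sketch -- not from this patch set */
static void example_writeback_submit(struct writeback_control *wbc,
				     struct block_device *bdev,
				     struct page *page, sector_t sector)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	bio_set_dev(bio, bdev);			/* device/queue first ... */
	wbc_init_bio(wbc, bio);			/* ... then cgroup association */
	bio->bi_iter.bi_sector = sector;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	wbc_account_io(wbc, page, PAGE_SIZE);	/* account each data segment */
	bio_set_op_attrs(bio, REQ_OP_WRITE, wbc_to_write_flags(wbc));
	submit_bio(bio);
}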
*/ @@ -205,12 +203,5 @@ extern int valid_mmap_phys_addr_range(unsigned long pfn, size_t size); extern int devmem_is_allowed(unsigned long pfn); -struct bio_vec; -extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, - const struct bio_vec *vec2); -#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ - (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \ - (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2))) - #endif /* __KERNEL__ */ #endif /* __ASM_IO_H */ diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index e9110b9b8bcd..38049357d6d3 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -73,7 +73,7 @@ static blk_qc_t nfhd_make_request(struct request_queue *queue, struct bio *bio) len = bvec.bv_len; len >>= 9; nfhd_read_write(dev->id, 0, dir, sec >> shift, len >> shift, - bvec_to_phys(&bvec)); + page_to_phys(bvec.bv_page) + bvec.bv_offset); sec += len; } bio_endio(bio); diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 83c470364dfb..6ee4c56032f7 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -891,7 +891,7 @@ static int ubd_disk_register(int major, u64 size, int unit, disk->private_data = &ubd_devs[unit]; disk->queue = ubd_devs[unit].queue; - device_add_disk(parent, disk); + device_add_disk(parent, disk, NULL); *disk_out = disk; return 0; diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 6de64840dd22..9a92a3ac2ac5 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -369,18 +369,6 @@ extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size); extern bool is_early_ioremap_ptep(pte_t *ptep); -#ifdef CONFIG_XEN -#include <xen/xen.h> -struct bio_vec; - -extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, - const struct bio_vec *vec2); - -#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ - (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \ - (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2))) -#endif /* CONFIG_XEN */ - #define IO_SPACE_LIMIT 0xffff #include <asm-generic/io.h> diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h index d383140e1dc8..068d9b067c83 100644 --- a/arch/x86/include/asm/xen/events.h +++ b/arch/x86/include/asm/xen/events.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_XEN_EVENTS_H #define _ASM_X86_XEN_EVENTS_H +#include <xen/xen.h> + enum ipi_vector { XEN_RESCHEDULE_VECTOR, XEN_CALL_FUNCTION_VECTOR, diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 2eeddd814653..0ca46e03b830 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -5,6 +5,7 @@ #include <linux/kexec.h> #include <linux/slab.h> +#include <xen/xen.h> #include <xen/features.h> #include <xen/page.h> #include <xen/interface/memory.h> diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index c85d1a88f476..2a9025343534 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -11,6 +11,7 @@ #include <asm/xen/interface.h> #include <asm/xen/hypercall.h> +#include <xen/xen.h> #include <xen/interface/memory.h> #include <xen/interface/hvm/start_info.h> diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index 33a783c77d96..b99585034dd2 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -23,6 +23,7 @@ #include <linux/io.h> #include <linux/export.h> +#include <xen/xen.h> #include <xen/platform_pci.h> #include "xen-ops.h" diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index 95997e6c0696..0972184f3f19 
100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -3,6 +3,7 @@ #include <linux/interrupt.h> #include <asm/xen/hypercall.h> +#include <xen/xen.h> #include <xen/page.h> #include <xen/interface/xen.h> #include <xen/interface/vcpu.h> diff --git a/block/Kconfig b/block/Kconfig index 1f2469a0123c..85263e7bded6 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -228,4 +228,7 @@ config BLK_MQ_RDMA depends on BLOCK && INFINIBAND default y +config BLK_PM + def_bool BLOCK && PM + source block/Kconfig.iosched diff --git a/block/Makefile b/block/Makefile index 572b33f32c07..27eac600474f 100644 --- a/block/Makefile +++ b/block/Makefile @@ -37,3 +37,4 @@ obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o +obj-$(CONFIG_BLK_PM) += blk-pm.o diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 9fe5952d117d..d9a7916ff0ab 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -642,7 +642,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) uint64_t serial_nr; rcu_read_lock(); - serial_nr = bio_blkcg(bio)->css.serial_nr; + serial_nr = __bio_blkcg(bio)->css.serial_nr; /* * Check whether blkcg has changed. The condition may trigger @@ -651,7 +651,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) goto out; - bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); + bfqg = __bfq_bic_change_cgroup(bfqd, bic, __bio_blkcg(bio)); /* * Update blkg_path for bfq_log_* functions. We cache this * path, and update it here, for the following diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 653100fb719e..1a1b80dfd69d 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3182,6 +3182,13 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); } +static bool bfq_bfqq_injectable(struct bfq_queue *bfqq) +{ + return BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && + blk_queue_nonrot(bfqq->bfqd->queue) && + bfqq->bfqd->hw_tag; +} + /** * bfq_bfqq_expire - expire a queue. * @bfqd: device owning the queue. @@ -3291,6 +3298,8 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, if (ref == 1) /* bfqq is gone, no more actions on it */ return; + bfqq->injected_service = 0; + /* mark bfqq as waiting a request only if a bic still points to it */ if (!bfq_bfqq_busy(bfqq) && reason != BFQQE_BUDGET_TIMEOUT && @@ -3571,7 +3580,12 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq) * whether bfqq is being weight-raised, because * bfq_symmetric_scenario() does not take into account also * weight-raised queues (see comments on - * bfq_weights_tree_add()). + * bfq_weights_tree_add()). In particular, if bfqq is being + * weight-raised, it is important to idle only if there are + * other, non-weight-raised queues that may steal throughput + * to bfqq. Actually, we should be even more precise, and + * differentiate between interactive weight raising and + * soft real-time weight raising. * * As a side note, it is worth considering that the above * device-idling countermeasures may however fail in the @@ -3583,7 +3597,8 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq) * to let requests be served in the desired order until all * the requests already queued in the device have been served. 
*/ - asymmetric_scenario = bfqq->wr_coeff > 1 || + asymmetric_scenario = (bfqq->wr_coeff > 1 && + bfqd->wr_busy_queues < bfqd->busy_queues) || !bfq_symmetric_scenario(bfqd); /* @@ -3629,6 +3644,30 @@ static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq); } +static struct bfq_queue *bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq; + + /* + * A linear search; but, with a high probability, very few + * steps are needed to find a candidate queue, i.e., a queue + * with enough budget left for its next request. In fact: + * - BFQ dynamically updates the budget of every queue so as + * to accommodate the expected backlog of the queue; + * - if a queue gets all its requests dispatched as injected + * service, then the queue is removed from the active list + * (and re-added only if it gets new requests, but with + * enough budget for its new backlog). + */ + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) + if (!RB_EMPTY_ROOT(&bfqq->sort_list) && + bfq_serv_to_charge(bfqq->next_rq, bfqq) <= + bfq_bfqq_budget_left(bfqq)) + return bfqq; + + return NULL; +} + /* * Select a queue for service. If we have a current queue in service, * check whether to continue servicing it, or retrieve and set a new one. @@ -3710,10 +3749,19 @@ check_queue: * No requests pending. However, if the in-service queue is idling * for a new request, or has requests waiting for a completion and * may idle after their completion, then keep it anyway. + * + * Yet, to boost throughput, inject service from other queues if + * possible. */ if (bfq_bfqq_wait_request(bfqq) || (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { - bfqq = NULL; + if (bfq_bfqq_injectable(bfqq) && + bfqq->injected_service * bfqq->inject_coeff < + bfqq->entity.service * 10) + bfqq = bfq_choose_bfqq_for_injection(bfqd); + else + bfqq = NULL; + goto keep_queue; } @@ -3803,6 +3851,14 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, bfq_dispatch_remove(bfqd->queue, rq); + if (bfqq != bfqd->in_service_queue) { + if (likely(bfqd->in_service_queue)) + bfqd->in_service_queue->injected_service += + bfq_serv_to_charge(rq, bfqq); + + goto return_rq; + } + /* * If weight raising has to terminate for bfqq, then next * function causes an immediate update of bfqq's weight, @@ -3821,13 +3877,12 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, * belongs to CLASS_IDLE and other queues are waiting for * service. */ - if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) - goto expire; - - return rq; + if (!(bfqd->busy_queues > 1 && bfq_class_idle(bfqq))) + goto return_rq; -expire: bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); + +return_rq: return rq; } @@ -4232,6 +4287,13 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_mark_bfqq_has_short_ttime(bfqq); bfq_mark_bfqq_sync(bfqq); bfq_mark_bfqq_just_created(bfqq); + /* + * Aggressively inject a lot of service: up to 90%. + * This coefficient remains constant during bfqq life, + * but this behavior might be changed, after enough + * testing and tuning. 
+ */ + bfqq->inject_coeff = 1; } else bfq_clear_bfqq_sync(bfqq); @@ -4297,7 +4359,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, rcu_read_lock(); - bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); + bfqg = bfq_find_set_group(bfqd, __bio_blkcg(bio)); if (!bfqg) { bfqq = &bfqd->oom_bfqq; goto out; diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index a8a2e5aca4d4..37d627afdc2e 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -351,6 +351,32 @@ struct bfq_queue { unsigned long split_time; /* time of last split */ unsigned long first_IO_time; /* time of first I/O for this queue */ + + /* max service rate measured so far */ + u32 max_service_rate; + /* + * Ratio between the service received by bfqq while it is in + * service, and the cumulative service (of requests of other + * queues) that may be injected while bfqq is empty but still + * in service. To increase precision, the coefficient is + * measured in tenths of unit. Here are some example of (1) + * ratios, (2) resulting percentages of service injected + * w.r.t. to the total service dispatched while bfqq is in + * service, and (3) corresponding values of the coefficient: + * 1 (50%) -> 10 + * 2 (33%) -> 20 + * 10 (9%) -> 100 + * 9.9 (9%) -> 99 + * 1.5 (40%) -> 15 + * 0.5 (66%) -> 5 + * 0.1 (90%) -> 1 + * + * So, if the coefficient is lower than 10, then + * injected service is more than bfqq service. + */ + unsigned int inject_coeff; + /* amount of service injected in current service slot */ + unsigned int injected_service; }; /** diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index ae52bff43ce4..ff7c2d470bb8 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -1181,10 +1181,17 @@ bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree) st = bfq_entity_service_tree(entity); is_in_service = entity == sd->in_service_entity; - if (is_in_service) { - bfq_calc_finish(entity, entity->service); + bfq_calc_finish(entity, entity->service); + + if (is_in_service) sd->in_service_entity = NULL; - } + else + /* + * Non in-service entity: nobody will take care of + * resetting its service counter on expiration. Do it + * now. + */ + entity->service = 0; if (entity->tree == &st->active) bfq_active_extract(st, entity); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 67b5fb861a51..290af497997b 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -306,6 +306,8 @@ bool bio_integrity_prep(struct bio *bio) if (bio_data_dir(bio) == WRITE) { bio_integrity_process(bio, &bio->bi_iter, bi->profile->generate_fn); + } else { + bip->bio_iter = bio->bi_iter; } return true; @@ -331,20 +333,14 @@ static void bio_integrity_verify_fn(struct work_struct *work) container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); - struct bvec_iter iter = bio->bi_iter; /* * At the moment verify is called bio's iterator was advanced * during split and completion, we need to rewind iterator to * it's original position. 
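A quick reading aid for the BFQ injection hunks above: with the service-based limit used in the queue-selection hunk and inject_coeff set to 1 at queue init, injection stops once the injected service reaches ten times the in-service queue's own service, i.e. roughly 90% of the service dispatched in the slot may be injected. A hedged restatement of that check (illustrative helper, not patch code):

#include <linux/types.h>

/* illustrative sketch: true while more injection is allowed in the slot */
static bool example_bfq_may_inject(u64 injected_service, u64 own_service,
				   unsigned int inject_coeff)
{
	/* e.g. inject_coeff == 1: allow while injected < 10 * own (~90%) */
	return injected_service * inject_coeff < own_service * 10;
}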
*/ - if (bio_rewind_iter(bio, &iter, iter.bi_done)) { - bio->bi_status = bio_integrity_process(bio, &iter, - bi->profile->verify_fn); - } else { - bio->bi_status = BLK_STS_IOERR; - } - + bio->bi_status = bio_integrity_process(bio, &bip->bio_iter, + bi->profile->verify_fn); bio_integrity_free(bio); bio_endio(bio); } diff --git a/block/bio.c b/block/bio.c index 0093bed81c0e..17a8b0aa7050 100644 --- a/block/bio.c +++ b/block/bio.c @@ -609,7 +609,9 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) bio->bi_iter = bio_src->bi_iter; bio->bi_io_vec = bio_src->bi_io_vec; - bio_clone_blkcg_association(bio, bio_src); + bio_clone_blkg_association(bio, bio_src); + + blkcg_bio_issue_init(bio); } EXPORT_SYMBOL(__bio_clone_fast); @@ -729,7 +731,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page } /* If we may be able to merge these biovecs, force a recount */ - if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) + if (bio->bi_vcnt > 1 && biovec_phys_mergeable(q, bvec - 1, bvec)) bio_clear_flag(bio, BIO_SEG_VALID); done: @@ -827,6 +829,8 @@ int bio_add_page(struct bio *bio, struct page *page, } EXPORT_SYMBOL(bio_add_page); +#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) + /** * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio * @bio: bio to add pages to @@ -839,38 +843,35 @@ EXPORT_SYMBOL(bio_add_page); */ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { - unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt, idx; + unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; + unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; + ssize_t size, left; + unsigned len, i; size_t offset; - ssize_t size; + + /* + * Move page array up in the allocated memory for the bio vecs as far as + * possible so that we can start filling biovecs from the beginning + * without overwriting the temporary page array. + */ + BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); + pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); if (unlikely(size <= 0)) return size ? size : -EFAULT; - idx = nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE; - /* - * Deep magic below: We need to walk the pinned pages backwards - * because we are abusing the space allocated for the bio_vecs - * for the page array. Because the bio_vecs are larger than the - * page pointers by definition this will always work. But it also - * means we can't use bio_add_page, so any changes to it's semantics - * need to be reflected here as well. 
- */ - bio->bi_iter.bi_size += size; - bio->bi_vcnt += nr_pages; + for (left = size, i = 0; left > 0; left -= len, i++) { + struct page *page = pages[i]; - while (idx--) { - bv[idx].bv_page = pages[idx]; - bv[idx].bv_len = PAGE_SIZE; - bv[idx].bv_offset = 0; + len = min_t(size_t, PAGE_SIZE - offset, left); + if (WARN_ON_ONCE(bio_add_page(bio, page, len, offset) != len)) + return -EINVAL; + offset = 0; } - bv[0].bv_offset += offset; - bv[0].bv_len -= offset; - bv[nr_pages - 1].bv_len -= nr_pages * PAGE_SIZE - offset - size; - iov_iter_advance(iter, size); return 0; } @@ -1807,7 +1808,6 @@ struct bio *bio_split(struct bio *bio, int sectors, bio_integrity_trim(split); bio_advance(bio, split->bi_iter.bi_size); - bio->bi_iter.bi_done = 0; if (bio_flagged(bio, BIO_TRACE_COMPLETION)) bio_set_flag(split, BIO_TRACE_COMPLETION); @@ -1956,69 +1956,131 @@ EXPORT_SYMBOL(bioset_init_from_src); #ifdef CONFIG_BLK_CGROUP -#ifdef CONFIG_MEMCG /** - * bio_associate_blkcg_from_page - associate a bio with the page's blkcg + * bio_associate_blkg - associate a bio with the a blkg * @bio: target bio - * @page: the page to lookup the blkcg from + * @blkg: the blkg to associate + * + * This tries to associate @bio with the specified blkg. Association failure + * is handled by walking up the blkg tree. Therefore, the blkg associated can + * be anything between @blkg and the root_blkg. This situation only happens + * when a cgroup is dying and then the remaining bios will spill to the closest + * alive blkg. * - * Associate @bio with the blkcg from @page's owning memcg. This works like - * every other associate function wrt references. + * A reference will be taken on the @blkg and will be released when @bio is + * freed. */ -int bio_associate_blkcg_from_page(struct bio *bio, struct page *page) +int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) { - struct cgroup_subsys_state *blkcg_css; - - if (unlikely(bio->bi_css)) + if (unlikely(bio->bi_blkg)) return -EBUSY; - if (!page->mem_cgroup) - return 0; - blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup, - &io_cgrp_subsys); - bio->bi_css = blkcg_css; + bio->bi_blkg = blkg_tryget_closest(blkg); return 0; } -#endif /* CONFIG_MEMCG */ /** - * bio_associate_blkcg - associate a bio with the specified blkcg - * @bio: target bio - * @blkcg_css: css of the blkcg to associate + * __bio_associate_blkg_from_css - internal blkg association function * - * Associate @bio with the blkcg specified by @blkcg_css. Block layer will - * treat @bio as if it were issued by a task which belongs to the blkcg. + * This in the core association function that all association paths rely on. + * A blkg reference is taken which is released upon freeing of the bio. + */ +static int __bio_associate_blkg_from_css(struct bio *bio, + struct cgroup_subsys_state *css) +{ + struct request_queue *q = bio->bi_disk->queue; + struct blkcg_gq *blkg; + int ret; + + rcu_read_lock(); + + if (!css || !css->parent) + blkg = q->root_blkg; + else + blkg = blkg_lookup_create(css_to_blkcg(css), q); + + ret = bio_associate_blkg(bio, blkg); + + rcu_read_unlock(); + return ret; +} + +/** + * bio_associate_blkg_from_css - associate a bio with a specified css + * @bio: target bio + * @css: target css * - * This function takes an extra reference of @blkcg_css which will be put - * when @bio is released. The caller must own @bio and is responsible for - * synchronizing calls to this function. + * Associate @bio with the blkg found by combining the css's blkg and the + * request_queue of the @bio. 
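The __bio_iov_iter_get_pages() rewrite above (block/bio.c) keeps reusing the free bio_vec slots as the temporary page array, but now parks that array at the tail of the free area so bio_add_page() can fill bvecs from the front without overwriting it. A hedged standalone illustration of the pointer arithmetic (the EXAMPLE_* names are made up):

#include <linux/bio.h>
#include <linux/build_bug.h>

/* illustrative sketch -- not the kernel code itself */
#define EXAMPLE_PAGE_PTRS_PER_BVEC \
	(sizeof(struct bio_vec) / sizeof(struct page *))

static struct page **example_scratch_pages(struct bio *bio)
{
	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
	struct page **pages = (struct page **)bv;

	/* a bio_vec holds at least two page pointers' worth of space */
	BUILD_BUG_ON(EXAMPLE_PAGE_PTRS_PER_BVEC < 2);

	/* park the page array as far right as possible in the free bvecs */
	return pages + entries_left * (EXAMPLE_PAGE_PTRS_PER_BVEC - 1);
}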
This falls back to the queue's root_blkg if + * the association fails with the css. */ -int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) +int bio_associate_blkg_from_css(struct bio *bio, + struct cgroup_subsys_state *css) { - if (unlikely(bio->bi_css)) + if (unlikely(bio->bi_blkg)) return -EBUSY; - css_get(blkcg_css); - bio->bi_css = blkcg_css; - return 0; + return __bio_associate_blkg_from_css(bio, css); } -EXPORT_SYMBOL_GPL(bio_associate_blkcg); +EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); +#ifdef CONFIG_MEMCG /** - * bio_associate_blkg - associate a bio with the specified blkg + * bio_associate_blkg_from_page - associate a bio with the page's blkg * @bio: target bio - * @blkg: the blkg to associate + * @page: the page to lookup the blkcg from * - * Associate @bio with the blkg specified by @blkg. This is the queue specific - * blkcg information associated with the @bio, a reference will be taken on the - * @blkg and will be freed when the bio is freed. + * Associate @bio with the blkg from @page's owning memcg and the respective + * request_queue. If cgroup_e_css returns NULL, fall back to the queue's + * root_blkg. + * + * Note: this must be called after bio has an associated device. */ -int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) +int bio_associate_blkg_from_page(struct bio *bio, struct page *page) { + struct cgroup_subsys_state *css; + int ret; + if (unlikely(bio->bi_blkg)) return -EBUSY; - if (!blkg_try_get(blkg)) - return -ENODEV; - bio->bi_blkg = blkg; - return 0; + if (!page->mem_cgroup) + return 0; + + rcu_read_lock(); + + css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); + + ret = __bio_associate_blkg_from_css(bio, css); + + rcu_read_unlock(); + return ret; +} +#endif /* CONFIG_MEMCG */ + +/** + * bio_associate_create_blkg - associate a bio with a blkg from q + * @q: request_queue where bio is going + * @bio: target bio + * + * Associate @bio with the blkg found from the bio's css and the request_queue. + * If one is not found, bio_lookup_blkg creates the blkg. This falls back to + * the queue's root_blkg if association fails. 
+ */ +int bio_associate_create_blkg(struct request_queue *q, struct bio *bio) +{ + struct cgroup_subsys_state *css; + int ret = 0; + + /* someone has already associated this bio with a blkg */ + if (bio->bi_blkg) + return ret; + + rcu_read_lock(); + + css = blkcg_css(); + + ret = __bio_associate_blkg_from_css(bio, css); + + rcu_read_unlock(); + return ret; } /** @@ -2031,10 +2093,6 @@ void bio_disassociate_task(struct bio *bio) put_io_context(bio->bi_ioc); bio->bi_ioc = NULL; } - if (bio->bi_css) { - css_put(bio->bi_css); - bio->bi_css = NULL; - } if (bio->bi_blkg) { blkg_put(bio->bi_blkg); bio->bi_blkg = NULL; @@ -2042,16 +2100,16 @@ void bio_disassociate_task(struct bio *bio) } /** - * bio_clone_blkcg_association - clone blkcg association from src to dst bio + * bio_clone_blkg_association - clone blkg association from src to dst bio * @dst: destination bio * @src: source bio */ -void bio_clone_blkcg_association(struct bio *dst, struct bio *src) +void bio_clone_blkg_association(struct bio *dst, struct bio *src) { - if (src->bi_css) - WARN_ON(bio_associate_blkcg(dst, src->bi_css)); + if (src->bi_blkg) + bio_associate_blkg(dst, src->bi_blkg); } -EXPORT_SYMBOL_GPL(bio_clone_blkcg_association); +EXPORT_SYMBOL_GPL(bio_clone_blkg_association); #endif /* CONFIG_BLK_CGROUP */ static void __init biovec_init_slabs(void) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c630e02836a8..992da5592c6e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -84,6 +84,37 @@ static void blkg_free(struct blkcg_gq *blkg) kfree(blkg); } +static void __blkg_release(struct rcu_head *rcu) +{ + struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); + + percpu_ref_exit(&blkg->refcnt); + + /* release the blkcg and parent blkg refs this blkg has been holding */ + css_put(&blkg->blkcg->css); + if (blkg->parent) + blkg_put(blkg->parent); + + wb_congested_put(blkg->wb_congested); + + blkg_free(blkg); +} + +/* + * A group is RCU protected, but having an rcu lock does not mean that one + * can access all the fields of blkg and assume these are valid. For + * example, don't try to follow throtl_data and request queue links. + * + * Having a reference to blkg under an rcu allows accesses to only values + * local to groups like group stats and group rate limits. 
+ */ +static void blkg_release(struct percpu_ref *ref) +{ + struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt); + + call_rcu(&blkg->rcu_head, __blkg_release); +} + /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with @@ -110,7 +141,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; - atomic_set(&blkg->refcnt, 1); /* root blkg uses @q->root_rl, init rl only for !root blkgs */ if (blkcg != &blkcg_root) { @@ -217,6 +247,11 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg_get(blkg->parent); } + ret = percpu_ref_init(&blkg->refcnt, blkg_release, 0, + GFP_NOWAIT | __GFP_NOWARN); + if (ret) + goto err_cancel_ref; + /* invoke per-policy init */ for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -249,6 +284,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg_put(blkg); return ERR_PTR(ret); +err_cancel_ref: + percpu_ref_exit(&blkg->refcnt); err_put_congested: wb_congested_put(wb_congested); err_put_css: @@ -259,7 +296,7 @@ err_free_blkg: } /** - * blkg_lookup_create - lookup blkg, try to create one if not there + * __blkg_lookup_create - lookup blkg, try to create one if not there * @blkcg: blkcg of interest * @q: request_queue of interest * @@ -268,12 +305,11 @@ err_free_blkg: * that all non-root blkg's have access to the parent blkg. This function * should be called under RCU read lock and @q->queue_lock. * - * Returns pointer to the looked up or created blkg on success, ERR_PTR() - * value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not - * dead and bypassing, returns ERR_PTR(-EBUSY). + * Returns the blkg or the closest blkg if blkg_create fails as it walks + * down from root. */ -struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) +struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q) { struct blkcg_gq *blkg; @@ -285,7 +321,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, * we shouldn't allow anything to go through for a bypassing queue. */ if (unlikely(blk_queue_bypass(q))) - return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY); + return q->root_blkg; blkg = __blkg_lookup(blkcg, q, true); if (blkg) @@ -293,23 +329,58 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, /* * Create blkgs walking down from blkcg_root to @blkcg, so that all - * non-root blkgs have access to their parents. + * non-root blkgs have access to their parents. Returns the closest + * blkg to the intended blkg should blkg_create() fail. */ while (true) { struct blkcg *pos = blkcg; struct blkcg *parent = blkcg_parent(blkcg); - - while (parent && !__blkg_lookup(parent, q, false)) { + struct blkcg_gq *ret_blkg = q->root_blkg; + + while (parent) { + blkg = __blkg_lookup(parent, q, false); + if (blkg) { + /* remember closest blkg */ + ret_blkg = blkg; + break; + } pos = parent; parent = blkcg_parent(parent); } blkg = blkg_create(pos, q, NULL); - if (pos == blkcg || IS_ERR(blkg)) + if (IS_ERR(blkg)) + return ret_blkg; + if (pos == blkcg) return blkg; } } +/** + * blkg_lookup_create - find or create a blkg + * @blkcg: target block cgroup + * @q: target request_queue + * + * This looks up or creates the blkg representing the unique pair + * of the blkcg and the request_queue. 
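The blk-cgroup.c hunks above move blkg reference counting from an atomic_t to a percpu_ref whose release callback defers the final free past an RCU grace period. A generic sketch of that pattern (struct obj and the helper names are illustrative, not the blkg code):

#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

/* illustrative sketch of the percpu_ref + call_rcu lifetime pattern */
struct obj {
	struct percpu_ref refcnt;
	struct rcu_head rcu_head;
};

static void obj_free_rcu(struct rcu_head *rcu)
{
	struct obj *o = container_of(rcu, struct obj, rcu_head);

	percpu_ref_exit(&o->refcnt);	/* release the per-cpu counters */
	kfree(o);
}

/* called once the last reference is dropped */
static void obj_release(struct percpu_ref *ref)
{
	struct obj *o = container_of(ref, struct obj, refcnt);

	call_rcu(&o->rcu_head, obj_free_rcu);	/* wait out RCU readers first */
}

/*
 * Creation pairs with percpu_ref_init(&o->refcnt, obj_release, 0, GFP_KERNEL);
 * teardown uses percpu_ref_kill(&o->refcnt), after which obj_release() runs
 * as soon as all outstanding tryget/put pairs have drained.
 */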
+ */ +struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q) +{ + struct blkcg_gq *blkg = blkg_lookup(blkcg, q); + unsigned long flags; + + if (unlikely(!blkg)) { + spin_lock_irqsave(q->queue_lock, flags); + + blkg = __blkg_lookup_create(blkcg, q); + + spin_unlock_irqrestore(q->queue_lock, flags); + } + + return blkg; +} + static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; @@ -353,7 +424,7 @@ static void blkg_destroy(struct blkcg_gq *blkg) * Put the reference taken at the time of creation so that when all * queues are gone, group can be destroyed. */ - blkg_put(blkg); + percpu_ref_kill(&blkg->refcnt); } /** @@ -381,29 +452,6 @@ static void blkg_destroy_all(struct request_queue *q) } /* - * A group is RCU protected, but having an rcu lock does not mean that one - * can access all the fields of blkg and assume these are valid. For - * example, don't try to follow throtl_data and request queue links. - * - * Having a reference to blkg under an rcu allows accesses to only values - * local to groups like group stats and group rate limits. - */ -void __blkg_release_rcu(struct rcu_head *rcu_head) -{ - struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head); - - /* release the blkcg and parent blkg refs this blkg has been holding */ - css_put(&blkg->blkcg->css); - if (blkg->parent) - blkg_put(blkg->parent); - - wb_congested_put(blkg->wb_congested); - - blkg_free(blkg); -} -EXPORT_SYMBOL_GPL(__blkg_release_rcu); - -/* * The next function used by blk_queue_for_each_rl(). It's a bit tricky * because the root blkg uses @q->root_rl instead of its own rl. */ @@ -1748,8 +1796,7 @@ void blkcg_maybe_throttle_current(void) blkg = blkg_lookup(blkcg, q); if (!blkg) goto out; - blkg = blkg_try_get(blkg); - if (!blkg) + if (!blkg_tryget(blkg)) goto out; rcu_read_unlock(); diff --git a/block/blk-core.c b/block/blk-core.c index cff0a60ee200..f12d2b65e5a5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -42,6 +42,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-sched.h" +#include "blk-pm.h" #include "blk-rq-qos.h" #ifdef CONFIG_DEBUG_FS @@ -421,24 +422,25 @@ void blk_sync_queue(struct request_queue *q) EXPORT_SYMBOL(blk_sync_queue); /** - * blk_set_preempt_only - set QUEUE_FLAG_PREEMPT_ONLY + * blk_set_pm_only - increment pm_only counter * @q: request queue pointer - * - * Returns the previous value of the PREEMPT_ONLY flag - 0 if the flag was not - * set and 1 if the flag was already set. 
*/ -int blk_set_preempt_only(struct request_queue *q) +void blk_set_pm_only(struct request_queue *q) { - return blk_queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q); + atomic_inc(&q->pm_only); } -EXPORT_SYMBOL_GPL(blk_set_preempt_only); +EXPORT_SYMBOL_GPL(blk_set_pm_only); -void blk_clear_preempt_only(struct request_queue *q) +void blk_clear_pm_only(struct request_queue *q) { - blk_queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q); - wake_up_all(&q->mq_freeze_wq); + int pm_only; + + pm_only = atomic_dec_return(&q->pm_only); + WARN_ON_ONCE(pm_only < 0); + if (pm_only == 0) + wake_up_all(&q->mq_freeze_wq); } -EXPORT_SYMBOL_GPL(blk_clear_preempt_only); +EXPORT_SYMBOL_GPL(blk_clear_pm_only); /** * __blk_run_queue_uncond - run a queue whether or not it has been stopped @@ -917,7 +919,7 @@ EXPORT_SYMBOL(blk_alloc_queue); */ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) { - const bool preempt = flags & BLK_MQ_REQ_PREEMPT; + const bool pm = flags & BLK_MQ_REQ_PREEMPT; while (true) { bool success = false; @@ -925,11 +927,11 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) rcu_read_lock(); if (percpu_ref_tryget_live(&q->q_usage_counter)) { /* - * The code that sets the PREEMPT_ONLY flag is - * responsible for ensuring that that flag is globally - * visible before the queue is unfrozen. + * The code that increments the pm_only counter is + * responsible for ensuring that that counter is + * globally visible before the queue is unfrozen. */ - if (preempt || !blk_queue_preempt_only(q)) { + if (pm || !blk_queue_pm_only(q)) { success = true; } else { percpu_ref_put(&q->q_usage_counter); @@ -954,7 +956,8 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) wait_event(q->mq_freeze_wq, (atomic_read(&q->mq_freeze_depth) == 0 && - (preempt || !blk_queue_preempt_only(q))) || + (pm || (blk_pm_request_resume(q), + !blk_queue_pm_only(q)))) || blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; @@ -1726,16 +1729,6 @@ void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part) } EXPORT_SYMBOL_GPL(part_round_stats); -#ifdef CONFIG_PM -static void blk_pm_put_request(struct request *rq) -{ - if (rq->q->dev && !(rq->rq_flags & RQF_PM) && !--rq->q->nr_pending) - pm_runtime_mark_last_busy(rq->q->dev); -} -#else -static inline void blk_pm_put_request(struct request *rq) {} -#endif - void __blk_put_request(struct request_queue *q, struct request *req) { req_flags_t rq_flags = req->rq_flags; @@ -1752,6 +1745,7 @@ void __blk_put_request(struct request_queue *q, struct request *req) blk_req_zone_write_unlock(req); blk_pm_put_request(req); + blk_pm_mark_last_busy(req); elv_completed_request(q, req); @@ -2750,30 +2744,6 @@ void blk_account_io_done(struct request *req, u64 now) } } -#ifdef CONFIG_PM -/* - * Don't process normal requests when queue is suspended - * or in the process of suspending/resuming - */ -static bool blk_pm_allow_request(struct request *rq) -{ - switch (rq->q->rpm_status) { - case RPM_RESUMING: - case RPM_SUSPENDING: - return rq->rq_flags & RQF_PM; - case RPM_SUSPENDED: - return false; - default: - return true; - } -} -#else -static bool blk_pm_allow_request(struct request *rq) -{ - return true; -} -#endif - void blk_account_io_start(struct request *rq, bool new_io) { struct hd_struct *part; @@ -2819,11 +2789,14 @@ static struct request *elv_next_request(struct request_queue *q) while (1) { list_for_each_entry(rq, &q->queue_head, queuelist) { - if (blk_pm_allow_request(rq)) - return rq; - - if (rq->rq_flags & 
RQF_SOFTBARRIER) - break; +#ifdef CONFIG_PM + /* + * If a request gets queued in state RPM_SUSPENDED + * then that's a kernel bug. + */ + WARN_ON_ONCE(q->rpm_status == RPM_SUSPENDED); +#endif + return rq; } /* @@ -3755,191 +3728,6 @@ void blk_finish_plug(struct blk_plug *plug) } EXPORT_SYMBOL(blk_finish_plug); -#ifdef CONFIG_PM -/** - * blk_pm_runtime_init - Block layer runtime PM initialization routine - * @q: the queue of the device - * @dev: the device the queue belongs to - * - * Description: - * Initialize runtime-PM-related fields for @q and start auto suspend for - * @dev. Drivers that want to take advantage of request-based runtime PM - * should call this function after @dev has been initialized, and its - * request queue @q has been allocated, and runtime PM for it can not happen - * yet(either due to disabled/forbidden or its usage_count > 0). In most - * cases, driver should call this function before any I/O has taken place. - * - * This function takes care of setting up using auto suspend for the device, - * the autosuspend delay is set to -1 to make runtime suspend impossible - * until an updated value is either set by user or by driver. Drivers do - * not need to touch other autosuspend settings. - * - * The block layer runtime PM is request based, so only works for drivers - * that use request as their IO unit instead of those directly use bio's. - */ -void blk_pm_runtime_init(struct request_queue *q, struct device *dev) -{ - /* Don't enable runtime PM for blk-mq until it is ready */ - if (q->mq_ops) { - pm_runtime_disable(dev); - return; - } - - q->dev = dev; - q->rpm_status = RPM_ACTIVE; - pm_runtime_set_autosuspend_delay(q->dev, -1); - pm_runtime_use_autosuspend(q->dev); -} -EXPORT_SYMBOL(blk_pm_runtime_init); - -/** - * blk_pre_runtime_suspend - Pre runtime suspend check - * @q: the queue of the device - * - * Description: - * This function will check if runtime suspend is allowed for the device - * by examining if there are any requests pending in the queue. If there - * are requests pending, the device can not be runtime suspended; otherwise, - * the queue's status will be updated to SUSPENDING and the driver can - * proceed to suspend the device. - * - * For the not allowed case, we mark last busy for the device so that - * runtime PM core will try to autosuspend it some time later. - * - * This function should be called near the start of the device's - * runtime_suspend callback. - * - * Return: - * 0 - OK to runtime suspend the device - * -EBUSY - Device should not be runtime suspended - */ -int blk_pre_runtime_suspend(struct request_queue *q) -{ - int ret = 0; - - if (!q->dev) - return ret; - - spin_lock_irq(q->queue_lock); - if (q->nr_pending) { - ret = -EBUSY; - pm_runtime_mark_last_busy(q->dev); - } else { - q->rpm_status = RPM_SUSPENDING; - } - spin_unlock_irq(q->queue_lock); - return ret; -} -EXPORT_SYMBOL(blk_pre_runtime_suspend); - -/** - * blk_post_runtime_suspend - Post runtime suspend processing - * @q: the queue of the device - * @err: return value of the device's runtime_suspend function - * - * Description: - * Update the queue's runtime status according to the return value of the - * device's runtime suspend function and mark last busy for the device so - * that PM core will try to auto suspend the device at a later time. - * - * This function should be called near the end of the device's - * runtime_suspend callback. 
- */ -void blk_post_runtime_suspend(struct request_queue *q, int err) -{ - if (!q->dev) - return; - - spin_lock_irq(q->queue_lock); - if (!err) { - q->rpm_status = RPM_SUSPENDED; - } else { - q->rpm_status = RPM_ACTIVE; - pm_runtime_mark_last_busy(q->dev); - } - spin_unlock_irq(q->queue_lock); -} -EXPORT_SYMBOL(blk_post_runtime_suspend); - -/** - * blk_pre_runtime_resume - Pre runtime resume processing - * @q: the queue of the device - * - * Description: - * Update the queue's runtime status to RESUMING in preparation for the - * runtime resume of the device. - * - * This function should be called near the start of the device's - * runtime_resume callback. - */ -void blk_pre_runtime_resume(struct request_queue *q) -{ - if (!q->dev) - return; - - spin_lock_irq(q->queue_lock); - q->rpm_status = RPM_RESUMING; - spin_unlock_irq(q->queue_lock); -} -EXPORT_SYMBOL(blk_pre_runtime_resume); - -/** - * blk_post_runtime_resume - Post runtime resume processing - * @q: the queue of the device - * @err: return value of the device's runtime_resume function - * - * Description: - * Update the queue's runtime status according to the return value of the - * device's runtime_resume function. If it is successfully resumed, process - * the requests that are queued into the device's queue when it is resuming - * and then mark last busy and initiate autosuspend for it. - * - * This function should be called near the end of the device's - * runtime_resume callback. - */ -void blk_post_runtime_resume(struct request_queue *q, int err) -{ - if (!q->dev) - return; - - spin_lock_irq(q->queue_lock); - if (!err) { - q->rpm_status = RPM_ACTIVE; - __blk_run_queue(q); - pm_runtime_mark_last_busy(q->dev); - pm_request_autosuspend(q->dev); - } else { - q->rpm_status = RPM_SUSPENDED; - } - spin_unlock_irq(q->queue_lock); -} -EXPORT_SYMBOL(blk_post_runtime_resume); - -/** - * blk_set_runtime_active - Force runtime status of the queue to be active - * @q: the queue of the device - * - * If the device is left runtime suspended during system suspend the resume - * hook typically resumes the device and corrects runtime status - * accordingly. However, that does not affect the queue runtime PM status - * which is still "suspended". This prevents processing requests from the - * queue. - * - * This function can be used in driver's resume hook to correct queue - * runtime PM status and re-enable peeking requests from the queue. It - * should be called before first request is added to the queue. 
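The blk-core.c hunk above removes the runtime-PM helpers; they move to the new block/blk-pm.c added by this series (see the BLK_PM Kconfig and Makefile hunks earlier in this diff). For orientation, a hedged sketch of how a legacy request-based driver consumes them from its runtime PM callbacks (the mydev_* names and drvdata layout are assumptions; after this series the declarations live in include/linux/blk-pm.h rather than blkdev.h):

#include <linux/blkdev.h>
#include <linux/device.h>

/* illustrative driver sketch -- not part of this patch */
static int mydev_runtime_suspend(struct device *dev)
{
	struct request_queue *q = dev_get_drvdata(dev);
	int err;

	err = blk_pre_runtime_suspend(q);	/* -EBUSY if requests are pending */
	if (err)
		return err;
	err = 0;				/* ... power down the hardware ... */
	blk_post_runtime_suspend(q, err);	/* RPM_SUSPENDED, or roll back on error */
	return err;
}

static int mydev_runtime_resume(struct device *dev)
{
	struct request_queue *q = dev_get_drvdata(dev);
	int err;

	blk_pre_runtime_resume(q);		/* mark the queue RPM_RESUMING */
	err = 0;				/* ... power the hardware back up ... */
	blk_post_runtime_resume(q, err);	/* restart queued requests on success */
	return err;
}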
- */ -void blk_set_runtime_active(struct request_queue *q) -{ - spin_lock_irq(q->queue_lock); - q->rpm_status = RPM_ACTIVE; - pm_runtime_mark_last_busy(q->dev); - pm_request_autosuspend(q->dev); - spin_unlock_irq(q->queue_lock); -} -EXPORT_SYMBOL(blk_set_runtime_active); -#endif - int __init blk_dev_init(void) { BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS)); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 6121611e1316..d1ab089e0919 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -49,12 +49,8 @@ int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio) bio_for_each_integrity_vec(iv, bio, iter) { if (prev) { - if (!BIOVEC_PHYS_MERGEABLE(&ivprv, &iv)) + if (!biovec_phys_mergeable(q, &ivprv, &iv)) goto new_segment; - - if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv)) - goto new_segment; - if (seg_size + iv.bv_len > queue_max_segment_size(q)) goto new_segment; @@ -95,12 +91,8 @@ int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio, bio_for_each_integrity_vec(iv, bio, iter) { if (prev) { - if (!BIOVEC_PHYS_MERGEABLE(&ivprv, &iv)) + if (!biovec_phys_mergeable(q, &ivprv, &iv)) goto new_segment; - - if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv)) - goto new_segment; - if (sg->length + iv.bv_len > queue_max_segment_size(q)) goto new_segment; diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 19923f8a029d..35c48d7b8f78 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -115,9 +115,22 @@ struct child_latency_info { atomic_t scale_cookie; }; +struct percentile_stats { + u64 total; + u64 missed; +}; + +struct latency_stat { + union { + struct percentile_stats ps; + struct blk_rq_stat rqs; + }; +}; + struct iolatency_grp { struct blkg_policy_data pd; - struct blk_rq_stat __percpu *stats; + struct latency_stat __percpu *stats; + struct latency_stat cur_stat; struct blk_iolatency *blkiolat; struct rq_depth rq_depth; struct rq_wait rq_wait; @@ -132,6 +145,7 @@ struct iolatency_grp { /* Our current number of IO's for the last summation. 
*/ u64 nr_samples; + bool ssd; struct child_latency_info child_lat; }; @@ -172,6 +186,80 @@ static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat) return pd_to_blkg(&iolat->pd); } +static inline void latency_stat_init(struct iolatency_grp *iolat, + struct latency_stat *stat) +{ + if (iolat->ssd) { + stat->ps.total = 0; + stat->ps.missed = 0; + } else + blk_rq_stat_init(&stat->rqs); +} + +static inline void latency_stat_sum(struct iolatency_grp *iolat, + struct latency_stat *sum, + struct latency_stat *stat) +{ + if (iolat->ssd) { + sum->ps.total += stat->ps.total; + sum->ps.missed += stat->ps.missed; + } else + blk_rq_stat_sum(&sum->rqs, &stat->rqs); +} + +static inline void latency_stat_record_time(struct iolatency_grp *iolat, + u64 req_time) +{ + struct latency_stat *stat = get_cpu_ptr(iolat->stats); + if (iolat->ssd) { + if (req_time >= iolat->min_lat_nsec) + stat->ps.missed++; + stat->ps.total++; + } else + blk_rq_stat_add(&stat->rqs, req_time); + put_cpu_ptr(stat); +} + +static inline bool latency_sum_ok(struct iolatency_grp *iolat, + struct latency_stat *stat) +{ + if (iolat->ssd) { + u64 thresh = div64_u64(stat->ps.total, 10); + thresh = max(thresh, 1ULL); + return stat->ps.missed < thresh; + } + return stat->rqs.mean <= iolat->min_lat_nsec; +} + +static inline u64 latency_stat_samples(struct iolatency_grp *iolat, + struct latency_stat *stat) +{ + if (iolat->ssd) + return stat->ps.total; + return stat->rqs.nr_samples; +} + +static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat, + struct latency_stat *stat) +{ + int exp_idx; + + if (iolat->ssd) + return; + + /* + * CALC_LOAD takes in a number stored in fixed point representation. + * Because we are using this for IO time in ns, the values stored + * are significantly larger than the FIXED_1 denominator (2048). + * Therefore, rounding errors in the calculation are negligible and + * can be ignored. + */ + exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, + div64_u64(iolat->cur_win_nsec, + BLKIOLATENCY_EXP_BUCKET_SIZE)); + CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat->rqs.mean); +} + static inline bool iolatency_may_queue(struct iolatency_grp *iolat, wait_queue_entry_t *wait, bool first_block) @@ -255,7 +343,7 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat, struct child_latency_info *lat_info, bool up) { - unsigned long qd = blk_queue_depth(blkiolat->rqos.q); + unsigned long qd = blkiolat->rqos.q->nr_requests; unsigned long scale = scale_amount(qd, up); unsigned long old = atomic_read(&lat_info->scale_cookie); unsigned long max_scale = qd << 1; @@ -295,10 +383,9 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat, */ static void scale_change(struct iolatency_grp *iolat, bool up) { - unsigned long qd = blk_queue_depth(iolat->blkiolat->rqos.q); + unsigned long qd = iolat->blkiolat->rqos.q->nr_requests; unsigned long scale = scale_amount(qd, up); unsigned long old = iolat->rq_depth.max_depth; - bool changed = false; if (old > qd) old = qd; @@ -308,15 +395,13 @@ static void scale_change(struct iolatency_grp *iolat, bool up) return; if (old < qd) { - changed = true; old += scale; old = min(old, qd); iolat->rq_depth.max_depth = old; wake_up_all(&iolat->rq_wait.wait); } - } else if (old > 1) { + } else { old >>= 1; - changed = true; iolat->rq_depth.max_depth = max(old, 1UL); } } @@ -369,7 +454,7 @@ static void check_scale_change(struct iolatency_grp *iolat) * scale down event. 
*/ samples_thresh = lat_info->nr_samples * 5; - samples_thresh = div64_u64(samples_thresh, 100); + samples_thresh = max(1ULL, div64_u64(samples_thresh, 100)); if (iolat->nr_samples <= samples_thresh) return; } @@ -395,34 +480,12 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock) { struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); - struct blkcg *blkcg; - struct blkcg_gq *blkg; - struct request_queue *q = rqos->q; + struct blkcg_gq *blkg = bio->bi_blkg; bool issue_as_root = bio_issue_as_root_blkg(bio); if (!blk_iolatency_enabled(blkiolat)) return; - rcu_read_lock(); - blkcg = bio_blkcg(bio); - bio_associate_blkcg(bio, &blkcg->css); - blkg = blkg_lookup(blkcg, q); - if (unlikely(!blkg)) { - if (!lock) - spin_lock_irq(q->queue_lock); - blkg = blkg_lookup_create(blkcg, q); - if (IS_ERR(blkg)) - blkg = NULL; - if (!lock) - spin_unlock_irq(q->queue_lock); - } - if (!blkg) - goto out; - - bio_issue_init(&bio->bi_issue, bio_sectors(bio)); - bio_associate_blkg(bio, blkg); -out: - rcu_read_unlock(); while (blkg && blkg->parent) { struct iolatency_grp *iolat = blkg_to_lat(blkg); if (!iolat) { @@ -443,7 +506,6 @@ static void iolatency_record_time(struct iolatency_grp *iolat, struct bio_issue *issue, u64 now, bool issue_as_root) { - struct blk_rq_stat *rq_stat; u64 start = bio_issue_time(issue); u64 req_time; @@ -469,9 +531,7 @@ static void iolatency_record_time(struct iolatency_grp *iolat, return; } - rq_stat = get_cpu_ptr(iolat->stats); - blk_rq_stat_add(rq_stat, req_time); - put_cpu_ptr(rq_stat); + latency_stat_record_time(iolat, req_time); } #define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC) @@ -482,17 +542,17 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now) struct blkcg_gq *blkg = lat_to_blkg(iolat); struct iolatency_grp *parent; struct child_latency_info *lat_info; - struct blk_rq_stat stat; + struct latency_stat stat; unsigned long flags; - int cpu, exp_idx; + int cpu; - blk_rq_stat_init(&stat); + latency_stat_init(iolat, &stat); preempt_disable(); for_each_online_cpu(cpu) { - struct blk_rq_stat *s; + struct latency_stat *s; s = per_cpu_ptr(iolat->stats, cpu); - blk_rq_stat_sum(&stat, s); - blk_rq_stat_init(s); + latency_stat_sum(iolat, &stat, s); + latency_stat_init(iolat, s); } preempt_enable(); @@ -502,41 +562,36 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now) lat_info = &parent->child_lat; - /* - * CALC_LOAD takes in a number stored in fixed point representation. - * Because we are using this for IO time in ns, the values stored - * are significantly larger than the FIXED_1 denominator (2048). - * Therefore, rounding errors in the calculation are negligible and - * can be ignored. - */ - exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, - div64_u64(iolat->cur_win_nsec, - BLKIOLATENCY_EXP_BUCKET_SIZE)); - CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat.mean); + iolat_update_total_lat_avg(iolat, &stat); /* Everything is ok and we don't need to adjust the scale. */ - if (stat.mean <= iolat->min_lat_nsec && + if (latency_sum_ok(iolat, &stat) && atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE) return; /* Somebody beat us to the punch, just bail. 
*/ spin_lock_irqsave(&lat_info->lock, flags); + + latency_stat_sum(iolat, &iolat->cur_stat, &stat); lat_info->nr_samples -= iolat->nr_samples; - lat_info->nr_samples += stat.nr_samples; - iolat->nr_samples = stat.nr_samples; + lat_info->nr_samples += latency_stat_samples(iolat, &iolat->cur_stat); + iolat->nr_samples = latency_stat_samples(iolat, &iolat->cur_stat); if ((lat_info->last_scale_event >= now || - now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME) && - lat_info->scale_lat <= iolat->min_lat_nsec) + now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME)) goto out; - if (stat.mean <= iolat->min_lat_nsec && - stat.nr_samples >= BLKIOLATENCY_MIN_GOOD_SAMPLES) { + if (latency_sum_ok(iolat, &iolat->cur_stat) && + latency_sum_ok(iolat, &stat)) { + if (latency_stat_samples(iolat, &iolat->cur_stat) < + BLKIOLATENCY_MIN_GOOD_SAMPLES) + goto out; if (lat_info->scale_grp == iolat) { lat_info->last_scale_event = now; scale_cookie_change(iolat->blkiolat, lat_info, true); } - } else if (stat.mean > iolat->min_lat_nsec) { + } else if (lat_info->scale_lat == 0 || + lat_info->scale_lat >= iolat->min_lat_nsec) { lat_info->last_scale_event = now; if (!lat_info->scale_grp || lat_info->scale_lat > iolat->min_lat_nsec) { @@ -545,6 +600,7 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now) } scale_cookie_change(iolat->blkiolat, lat_info, false); } + latency_stat_init(iolat, &iolat->cur_stat); out: spin_unlock_irqrestore(&lat_info->lock, flags); } @@ -650,7 +706,7 @@ static void blkiolatency_timer_fn(struct timer_list *t) * We could be exiting, don't access the pd unless we have a * ref on the blkg. */ - if (!blkg_try_get(blkg)) + if (!blkg_tryget(blkg)) continue; iolat = blkg_to_lat(blkg); @@ -761,7 +817,6 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, { struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkcg_gq *blkg; - struct blk_iolatency *blkiolat; struct blkg_conf_ctx ctx; struct iolatency_grp *iolat; char *p, *tok; @@ -774,7 +829,6 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, return ret; iolat = blkg_to_lat(ctx.blkg); - blkiolat = iolat->blkiolat; p = ctx.body; ret = -EINVAL; @@ -835,13 +889,43 @@ static int iolatency_print_limit(struct seq_file *sf, void *v) return 0; } +static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf, + size_t size) +{ + struct latency_stat stat; + int cpu; + + latency_stat_init(iolat, &stat); + preempt_disable(); + for_each_online_cpu(cpu) { + struct latency_stat *s; + s = per_cpu_ptr(iolat->stats, cpu); + latency_stat_sum(iolat, &stat, s); + } + preempt_enable(); + + if (iolat->rq_depth.max_depth == UINT_MAX) + return scnprintf(buf, size, " missed=%llu total=%llu depth=max", + (unsigned long long)stat.ps.missed, + (unsigned long long)stat.ps.total); + return scnprintf(buf, size, " missed=%llu total=%llu depth=%u", + (unsigned long long)stat.ps.missed, + (unsigned long long)stat.ps.total, + iolat->rq_depth.max_depth); +} + static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) { struct iolatency_grp *iolat = pd_to_lat(pd); - unsigned long long avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC); - unsigned long long cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC); + unsigned long long avg_lat; + unsigned long long cur_win; + + if (iolat->ssd) + return iolatency_ssd_stat(iolat, buf, size); + avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC); + cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC); 
if (iolat->rq_depth.max_depth == UINT_MAX) return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu", avg_lat, cur_win); @@ -858,8 +942,8 @@ static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node) iolat = kzalloc_node(sizeof(*iolat), gfp, node); if (!iolat) return NULL; - iolat->stats = __alloc_percpu_gfp(sizeof(struct blk_rq_stat), - __alignof__(struct blk_rq_stat), gfp); + iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat), + __alignof__(struct latency_stat), gfp); if (!iolat->stats) { kfree(iolat); return NULL; @@ -876,15 +960,21 @@ static void iolatency_pd_init(struct blkg_policy_data *pd) u64 now = ktime_to_ns(ktime_get()); int cpu; + if (blk_queue_nonrot(blkg->q)) + iolat->ssd = true; + else + iolat->ssd = false; + for_each_possible_cpu(cpu) { - struct blk_rq_stat *stat; + struct latency_stat *stat; stat = per_cpu_ptr(iolat->stats, cpu); - blk_rq_stat_init(stat); + latency_stat_init(iolat, stat); } + latency_stat_init(iolat, &iolat->cur_stat); rq_wait_init(&iolat->rq_wait); spin_lock_init(&iolat->child_lat.lock); - iolat->rq_depth.queue_depth = blk_queue_depth(blkg->q); + iolat->rq_depth.queue_depth = blkg->q->nr_requests; iolat->rq_depth.max_depth = UINT_MAX; iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth; iolat->blkiolat = blkiolat; diff --git a/block/blk-merge.c b/block/blk-merge.c index aaec38cc37b8..42a46744c11b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -12,6 +12,69 @@ #include "blk.h" +/* + * Check if the two bvecs from two bios can be merged to one segment. If yes, + * no need to check gap between the two bios since the 1st bio and the 1st bvec + * in the 2nd bio can be handled in one segment. + */ +static inline bool bios_segs_mergeable(struct request_queue *q, + struct bio *prev, struct bio_vec *prev_last_bv, + struct bio_vec *next_first_bv) +{ + if (!biovec_phys_mergeable(q, prev_last_bv, next_first_bv)) + return false; + if (prev->bi_seg_back_size + next_first_bv->bv_len > + queue_max_segment_size(q)) + return false; + return true; +} + +static inline bool bio_will_gap(struct request_queue *q, + struct request *prev_rq, struct bio *prev, struct bio *next) +{ + struct bio_vec pb, nb; + + if (!bio_has_data(prev) || !queue_virt_boundary(q)) + return false; + + /* + * Don't merge if the 1st bio starts with non-zero offset, otherwise it + * is quite difficult to respect the sg gap limit. We work hard to + * merge a huge number of small single bios in case of mkfs. 
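Several hunks in this diff (block/bio.c, blk-integrity.c, and blk-merge.c above) replace the BIOVEC_PHYS_MERGEABLE/BIOVEC_SEG_BOUNDARY macro pair with a single biovec_phys_mergeable() helper; the helper itself lands in block/blk.h and is not shown in this excerpt. Roughly, it folds both checks into one predicate, along the lines of the hedged reconstruction below (the example_* name is made up, and the Xen special case is only noted in a comment):

#include <linux/blkdev.h>
#include <linux/io.h>

/* illustrative reconstruction -- not the verbatim block/blk.h helper */
static inline bool example_biovec_phys_mergeable(struct request_queue *q,
		struct bio_vec *vec1, struct bio_vec *vec2)
{
	unsigned long mask = queue_segment_boundary(q);
	phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset;
	phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset;

	/* the two vectors must be physically contiguous ... */
	if (addr1 + vec1->bv_len != addr2)
		return false;
	/* (the real helper also consults xen_biovec_phys_mergeable() on Xen) */

	/* ... and the merged segment must not cross the segment boundary mask */
	if ((addr1 | mask) != ((addr2 + vec2->bv_len - 1) | mask))
		return false;
	return true;
}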
+ */ + if (prev_rq) + bio_get_first_bvec(prev_rq->bio, &pb); + else + bio_get_first_bvec(prev, &pb); + if (pb.bv_offset) + return true; + + /* + * We don't need to worry about the situation that the merged segment + * ends in unaligned virt boundary: + * + * - if 'pb' ends aligned, the merged segment ends aligned + * - if 'pb' ends unaligned, the next bio must include + * one single bvec of 'nb', otherwise the 'nb' can't + * merge with 'pb' + */ + bio_get_last_bvec(prev, &pb); + bio_get_first_bvec(next, &nb); + if (bios_segs_mergeable(q, prev, &pb, &nb)) + return false; + return __bvec_gap_to_prev(q, &pb, nb.bv_offset); +} + +static inline bool req_gap_back_merge(struct request *req, struct bio *bio) +{ + return bio_will_gap(req->q, req, req->biotail, bio); +} + +static inline bool req_gap_front_merge(struct request *req, struct bio *bio) +{ + return bio_will_gap(req->q, NULL, bio, req->bio); +} + static struct bio *blk_bio_discard_split(struct request_queue *q, struct bio *bio, struct bio_set *bs, @@ -134,9 +197,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, if (bvprvp && blk_queue_cluster(q)) { if (seg_size + bv.bv_len > queue_max_segment_size(q)) goto new_segment; - if (!BIOVEC_PHYS_MERGEABLE(bvprvp, &bv)) - goto new_segment; - if (!BIOVEC_SEG_BOUNDARY(q, bvprvp, &bv)) + if (!biovec_phys_mergeable(q, bvprvp, &bv)) goto new_segment; seg_size += bv.bv_len; @@ -267,9 +328,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, if (seg_size + bv.bv_len > queue_max_segment_size(q)) goto new_segment; - if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv)) - goto new_segment; - if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv)) + if (!biovec_phys_mergeable(q, &bvprv, &bv)) goto new_segment; seg_size += bv.bv_len; @@ -349,17 +408,7 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, bio_get_last_bvec(bio, &end_bv); bio_get_first_bvec(nxt, &nxt_bv); - if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv)) - return 0; - - /* - * bio and nxt are contiguous in memory; check if the queue allows - * these two to be merged into one - */ - if (BIOVEC_SEG_BOUNDARY(q, &end_bv, &nxt_bv)) - return 1; - - return 0; + return biovec_phys_mergeable(q, &end_bv, &nxt_bv); } static inline void @@ -373,10 +422,7 @@ __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec, if (*sg && *cluster) { if ((*sg)->length + nbytes > queue_max_segment_size(q)) goto new_segment; - - if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) - goto new_segment; - if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) + if (!biovec_phys_mergeable(q, bvprv, bvec)) goto new_segment; (*sg)->length += nbytes; diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index cb1e6cf7ac48..a5ea86835fcb 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -102,6 +102,14 @@ static int blk_flags_show(struct seq_file *m, const unsigned long flags, return 0; } +static int queue_pm_only_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + + seq_printf(m, "%d\n", atomic_read(&q->pm_only)); + return 0; +} + #define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(QUEUED), @@ -132,7 +140,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(REGISTERED), QUEUE_FLAG_NAME(SCSI_PASSTHROUGH), QUEUE_FLAG_NAME(QUIESCED), - QUEUE_FLAG_NAME(PREEMPT_ONLY), }; #undef QUEUE_FLAG_NAME @@ -209,6 +216,7 @@ static ssize_t queue_write_hint_store(void *data, const char __user *buf, static const struct 
blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { { "poll_stat", 0400, queue_poll_stat_show }, { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, + { "pm_only", 0600, queue_pm_only_show, NULL }, { "state", 0600, queue_state_show, queue_state_write }, { "write_hints", 0600, queue_write_hint_show, queue_write_hint_store }, { "zone_wlock", 0400, queue_zone_wlock_show, NULL }, diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 4e028ee42430..8a9544203173 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -49,12 +49,12 @@ blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, return true; } -static inline void blk_mq_sched_completed_request(struct request *rq) +static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) { struct elevator_queue *e = rq->q->elevator; if (e && e->type->ops.mq.completed_request) - e->type->ops.mq.completed_request(rq); + e->type->ops.mq.completed_request(rq, now); } static inline void blk_mq_sched_started_request(struct request *rq) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 41317c50a446..cfda95b85d34 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -232,13 +232,26 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) /* * We can hit rq == NULL here, because the tagging functions - * test and set the bit before assining ->rqs[]. + * test and set the bit before assigning ->rqs[]. */ if (rq && rq->q == hctx->queue) iter_data->fn(hctx, rq, iter_data->data, reserved); return true; } +/** + * bt_for_each - iterate over the requests associated with a hardware queue + * @hctx: Hardware queue to examine. + * @bt: sbitmap to examine. This is either the breserved_tags member + * or the bitmap_tags member of struct blk_mq_tags. + * @fn: Pointer to the function that will be called for each request + * associated with @hctx that has been assigned a driver tag. + * @fn will be called as follows: @fn(@hctx, rq, @data, @reserved) + * where rq is a pointer to a request. + * @data: Will be passed as third argument to @fn. + * @reserved: Indicates whether @bt is the breserved_tags member or the + * bitmap_tags member of struct blk_mq_tags. + */ static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, busy_iter_fn *fn, void *data, bool reserved) { @@ -280,6 +293,18 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) return true; } +/** + * bt_tags_for_each - iterate over the requests in a tag map + * @tags: Tag map to iterate over. + * @bt: sbitmap to examine. This is either the breserved_tags member + * or the bitmap_tags member of struct blk_mq_tags. + * @fn: Pointer to the function that will be called for each started + * request. @fn will be called as follows: @fn(rq, @data, + * @reserved) where rq is a pointer to a request. + * @data: Will be passed as second argument to @fn. + * @reserved: Indicates whether @bt is the breserved_tags member or the + * bitmap_tags member of struct blk_mq_tags. + */ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt, busy_tag_iter_fn *fn, void *data, bool reserved) { @@ -294,6 +319,15 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt, sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data); } +/** + * blk_mq_all_tag_busy_iter - iterate over all started requests in a tag map + * @tags: Tag map to iterate over. + * @fn: Pointer to the function that will be called for each started + * request. 
@fn will be called as follows: @fn(rq, @priv, + * reserved) where rq is a pointer to a request. 'reserved' + * indicates whether or not @rq is a reserved request. + * @priv: Will be passed as second argument to @fn. + */ static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv) { @@ -302,6 +336,15 @@ static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, false); } +/** + * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set + * @tagset: Tag set to iterate over. + * @fn: Pointer to the function that will be called for each started + * request. @fn will be called as follows: @fn(rq, @priv, + * reserved) where rq is a pointer to a request. 'reserved' + * indicates whether or not @rq is a reserved request. + * @priv: Will be passed as second argument to @fn. + */ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv) { @@ -314,6 +357,20 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); +/** + * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag + * @q: Request queue to examine. + * @fn: Pointer to the function that will be called for each request + * on @q. @fn will be called as follows: @fn(hctx, rq, @priv, + * reserved) where rq is a pointer to a request and hctx points + * to the hardware queue associated with the request. 'reserved' + * indicates whether or not @rq is a reserved request. + * @priv: Will be passed as third argument to @fn. + * + * Note: if @q->tag_set is shared with other request queues then @fn will be + * called for all requests on all queues that share that tag set and not only + * for requests associated with @q. + */ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv) { @@ -321,9 +378,11 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, int i; /* - * __blk_mq_update_nr_hw_queues will update the nr_hw_queues and - * queue_hw_ctx after freeze the queue, so we use q_usage_counter - * to avoid race with it. + * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx + * while the queue is frozen. So we can use q_usage_counter to avoid + * racing with it. __blk_mq_update_nr_hw_queues() uses + * synchronize_rcu() to ensure this function left the critical section + * below. 
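The kernel-doc above is the contract a driver-side caller relies on. As a hedged sketch, a driver could count its started requests through blk_mq_tagset_busy_iter() roughly as follows; the nvmeish_* names and the counting use case are hypothetical, only the iterator and callback signatures come from the code above.

#include <linux/blk-mq.h>

/* Called once per started request in the tag set. */
static void nvmeish_count_started(struct request *rq, void *priv, bool reserved)
{
	unsigned int *count = priv;

	(*count)++;
}

static unsigned int nvmeish_busy_requests(struct blk_mq_tag_set *set)
{
	unsigned int count = 0;

	blk_mq_tagset_busy_iter(set, nvmeish_count_started, &count);
	return count;
}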
*/ if (!percpu_ref_tryget(&q->q_usage_counter)) return; @@ -332,7 +391,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, struct blk_mq_tags *tags = hctx->tags; /* - * If not software queues are currently mapped to this + * If no software queues are currently mapped to this * hardware queue, there's nothing to check */ if (!blk_mq_hw_queue_mapped(hctx)) diff --git a/block/blk-mq.c b/block/blk-mq.c index e3c39ea8e17b..1dc157c85a83 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -33,6 +33,7 @@ #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-tag.h" +#include "blk-pm.h" #include "blk-stat.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" @@ -198,7 +199,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q) freeze_depth = atomic_dec_return(&q->mq_freeze_depth); WARN_ON_ONCE(freeze_depth < 0); if (!freeze_depth) { - percpu_ref_reinit(&q->q_usage_counter); + percpu_ref_resurrect(&q->q_usage_counter); wake_up_all(&q->mq_freeze_wq); } } @@ -475,6 +476,7 @@ static void __blk_mq_free_request(struct request *rq) struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); const int sched_tag = rq->internal_tag; + blk_pm_mark_last_busy(rq); if (rq->tag != -1) blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); if (sched_tag != -1) @@ -526,6 +528,9 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error) blk_stat_add(rq, now); } + if (rq->internal_tag != -1) + blk_mq_sched_completed_request(rq, now); + blk_account_io_done(rq, now); if (rq->end_io) { @@ -562,8 +567,6 @@ static void __blk_mq_complete_request(struct request *rq) if (!blk_mq_mark_complete(rq)) return; - if (rq->internal_tag != -1) - blk_mq_sched_completed_request(rq); if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { rq->q->softirq_done_fn(rq); diff --git a/block/blk-pm.c b/block/blk-pm.c new file mode 100644 index 000000000000..f8fdae01bea2 --- /dev/null +++ b/block/blk-pm.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/blk-mq.h> +#include <linux/blk-pm.h> +#include <linux/blkdev.h> +#include <linux/pm_runtime.h> +#include "blk-mq.h" +#include "blk-mq-tag.h" + +/** + * blk_pm_runtime_init - Block layer runtime PM initialization routine + * @q: the queue of the device + * @dev: the device the queue belongs to + * + * Description: + * Initialize runtime-PM-related fields for @q and start auto suspend for + * @dev. Drivers that want to take advantage of request-based runtime PM + * should call this function after @dev has been initialized, and its + * request queue @q has been allocated, and runtime PM for it can not happen + * yet(either due to disabled/forbidden or its usage_count > 0). In most + * cases, driver should call this function before any I/O has taken place. + * + * This function takes care of setting up using auto suspend for the device, + * the autosuspend delay is set to -1 to make runtime suspend impossible + * until an updated value is either set by user or by driver. Drivers do + * not need to touch other autosuspend settings. + * + * The block layer runtime PM is request based, so only works for drivers + * that use request as their IO unit instead of those directly use bio's. 
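As the description above spells out, blk_pm_runtime_init() should run once the device and its request queue are fully set up and before any I/O, and it deliberately leaves the autosuspend delay at -1. A minimal probe-path sketch under those assumptions; struct my_dev and the 5-second delay are hypothetical, not part of this patch.

#include <linux/blk-pm.h>
#include <linux/blkdev.h>
#include <linux/pm_runtime.h>

struct my_dev {
	struct device dev;
	struct request_queue *queue;
};

static void my_enable_runtime_pm(struct my_dev *mdev)
{
	/* Arm block-layer runtime PM; the delay stays at -1 (suspend
	 * effectively disabled) until a real value is chosen. */
	blk_pm_runtime_init(mdev->queue, &mdev->dev);

	pm_runtime_set_autosuspend_delay(&mdev->dev, 5000);
	pm_runtime_allow(&mdev->dev);
}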
+ */ +void blk_pm_runtime_init(struct request_queue *q, struct device *dev) +{ + q->dev = dev; + q->rpm_status = RPM_ACTIVE; + pm_runtime_set_autosuspend_delay(q->dev, -1); + pm_runtime_use_autosuspend(q->dev); +} +EXPORT_SYMBOL(blk_pm_runtime_init); + +/** + * blk_pre_runtime_suspend - Pre runtime suspend check + * @q: the queue of the device + * + * Description: + * This function will check if runtime suspend is allowed for the device + * by examining if there are any requests pending in the queue. If there + * are requests pending, the device can not be runtime suspended; otherwise, + * the queue's status will be updated to SUSPENDING and the driver can + * proceed to suspend the device. + * + * For the not allowed case, we mark last busy for the device so that + * runtime PM core will try to autosuspend it some time later. + * + * This function should be called near the start of the device's + * runtime_suspend callback. + * + * Return: + * 0 - OK to runtime suspend the device + * -EBUSY - Device should not be runtime suspended + */ +int blk_pre_runtime_suspend(struct request_queue *q) +{ + int ret = 0; + + if (!q->dev) + return ret; + + WARN_ON_ONCE(q->rpm_status != RPM_ACTIVE); + + /* + * Increase the pm_only counter before checking whether any + * non-PM blk_queue_enter() calls are in progress to avoid that any + * new non-PM blk_queue_enter() calls succeed before the pm_only + * counter is decreased again. + */ + blk_set_pm_only(q); + ret = -EBUSY; + /* Switch q_usage_counter from per-cpu to atomic mode. */ + blk_freeze_queue_start(q); + /* + * Wait until atomic mode has been reached. Since that + * involves calling call_rcu(), it is guaranteed that later + * blk_queue_enter() calls see the pm-only state. See also + * http://lwn.net/Articles/573497/. + */ + percpu_ref_switch_to_atomic_sync(&q->q_usage_counter); + if (percpu_ref_is_zero(&q->q_usage_counter)) + ret = 0; + /* Switch q_usage_counter back to per-cpu mode. */ + blk_mq_unfreeze_queue(q); + + spin_lock_irq(q->queue_lock); + if (ret < 0) + pm_runtime_mark_last_busy(q->dev); + else + q->rpm_status = RPM_SUSPENDING; + spin_unlock_irq(q->queue_lock); + + if (ret) + blk_clear_pm_only(q); + + return ret; +} +EXPORT_SYMBOL(blk_pre_runtime_suspend); + +/** + * blk_post_runtime_suspend - Post runtime suspend processing + * @q: the queue of the device + * @err: return value of the device's runtime_suspend function + * + * Description: + * Update the queue's runtime status according to the return value of the + * device's runtime suspend function and mark last busy for the device so + * that PM core will try to auto suspend the device at a later time. + * + * This function should be called near the end of the device's + * runtime_suspend callback. + */ +void blk_post_runtime_suspend(struct request_queue *q, int err) +{ + if (!q->dev) + return; + + spin_lock_irq(q->queue_lock); + if (!err) { + q->rpm_status = RPM_SUSPENDED; + } else { + q->rpm_status = RPM_ACTIVE; + pm_runtime_mark_last_busy(q->dev); + } + spin_unlock_irq(q->queue_lock); + + if (err) + blk_clear_pm_only(q); +} +EXPORT_SYMBOL(blk_post_runtime_suspend); + +/** + * blk_pre_runtime_resume - Pre runtime resume processing + * @q: the queue of the device + * + * Description: + * Update the queue's runtime status to RESUMING in preparation for the + * runtime resume of the device. + * + * This function should be called near the start of the device's + * runtime_resume callback. 
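Taken together, the helpers above slot into a driver's runtime PM callbacks in a fixed order: the pre hook first, then the hardware operation, then the post hook with its result (blk_post_runtime_resume() is documented just below). A hedged sketch, reusing the hypothetical my_dev from the probe sketch and assuming my_hw_suspend()/my_hw_resume() driver routines:

/* Prototypes for the hypothetical hardware routines used in this sketch. */
int my_hw_suspend(struct my_dev *mdev);
int my_hw_resume(struct my_dev *mdev);

static int my_runtime_suspend(struct device *dev)
{
	struct my_dev *mdev = dev_get_drvdata(dev);
	int err;

	err = blk_pre_runtime_suspend(mdev->queue);
	if (err)
		return err;	/* -EBUSY: requests pending, stay active */

	err = my_hw_suspend(mdev);
	blk_post_runtime_suspend(mdev->queue, err);
	return err;
}

static int my_runtime_resume(struct device *dev)
{
	struct my_dev *mdev = dev_get_drvdata(dev);
	int err;

	blk_pre_runtime_resume(mdev->queue);
	err = my_hw_resume(mdev);
	blk_post_runtime_resume(mdev->queue, err);
	return err;
}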
+ */ +void blk_pre_runtime_resume(struct request_queue *q) +{ + if (!q->dev) + return; + + spin_lock_irq(q->queue_lock); + q->rpm_status = RPM_RESUMING; + spin_unlock_irq(q->queue_lock); +} +EXPORT_SYMBOL(blk_pre_runtime_resume); + +/** + * blk_post_runtime_resume - Post runtime resume processing + * @q: the queue of the device + * @err: return value of the device's runtime_resume function + * + * Description: + * Update the queue's runtime status according to the return value of the + * device's runtime_resume function. If it is successfully resumed, process + * the requests that are queued into the device's queue when it is resuming + * and then mark last busy and initiate autosuspend for it. + * + * This function should be called near the end of the device's + * runtime_resume callback. + */ +void blk_post_runtime_resume(struct request_queue *q, int err) +{ + if (!q->dev) + return; + + spin_lock_irq(q->queue_lock); + if (!err) { + q->rpm_status = RPM_ACTIVE; + pm_runtime_mark_last_busy(q->dev); + pm_request_autosuspend(q->dev); + } else { + q->rpm_status = RPM_SUSPENDED; + } + spin_unlock_irq(q->queue_lock); + + if (!err) + blk_clear_pm_only(q); +} +EXPORT_SYMBOL(blk_post_runtime_resume); + +/** + * blk_set_runtime_active - Force runtime status of the queue to be active + * @q: the queue of the device + * + * If the device is left runtime suspended during system suspend the resume + * hook typically resumes the device and corrects runtime status + * accordingly. However, that does not affect the queue runtime PM status + * which is still "suspended". This prevents processing requests from the + * queue. + * + * This function can be used in driver's resume hook to correct queue + * runtime PM status and re-enable peeking requests from the queue. It + * should be called before first request is added to the queue. 
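blk_set_runtime_active() covers exactly the case described above: the device stayed runtime suspended across system suspend and the driver's system resume hook powers it back up directly. A hedged sketch of such a hook, again with the hypothetical my_dev and my_hw_resume(); the pm_runtime_disable()/set_active()/enable() sequence is the usual way to re-sync the PM core's view and is an assumption here, not something this patch mandates.

static int my_system_resume(struct device *dev)
{
	struct my_dev *mdev = dev_get_drvdata(dev);
	int err;

	err = my_hw_resume(mdev);
	if (err)
		return err;

	/* Re-sync the PM core's idea of the device's power state... */
	pm_runtime_disable(dev);
	pm_runtime_set_active(dev);
	pm_runtime_enable(dev);

	/* ...then let the queue hand out requests and re-arm autosuspend. */
	blk_set_runtime_active(mdev->queue);
	return 0;
}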
+ */ +void blk_set_runtime_active(struct request_queue *q) +{ + spin_lock_irq(q->queue_lock); + q->rpm_status = RPM_ACTIVE; + pm_runtime_mark_last_busy(q->dev); + pm_request_autosuspend(q->dev); + spin_unlock_irq(q->queue_lock); +} +EXPORT_SYMBOL(blk_set_runtime_active); diff --git a/block/blk-pm.h b/block/blk-pm.h new file mode 100644 index 000000000000..a8564ea72a41 --- /dev/null +++ b/block/blk-pm.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _BLOCK_BLK_PM_H_ +#define _BLOCK_BLK_PM_H_ + +#include <linux/pm_runtime.h> + +#ifdef CONFIG_PM +static inline void blk_pm_request_resume(struct request_queue *q) +{ + if (q->dev && (q->rpm_status == RPM_SUSPENDED || + q->rpm_status == RPM_SUSPENDING)) + pm_request_resume(q->dev); +} + +static inline void blk_pm_mark_last_busy(struct request *rq) +{ + if (rq->q->dev && !(rq->rq_flags & RQF_PM)) + pm_runtime_mark_last_busy(rq->q->dev); +} + +static inline void blk_pm_requeue_request(struct request *rq) +{ + lockdep_assert_held(rq->q->queue_lock); + + if (rq->q->dev && !(rq->rq_flags & RQF_PM)) + rq->q->nr_pending--; +} + +static inline void blk_pm_add_request(struct request_queue *q, + struct request *rq) +{ + lockdep_assert_held(q->queue_lock); + + if (q->dev && !(rq->rq_flags & RQF_PM)) + q->nr_pending++; +} + +static inline void blk_pm_put_request(struct request *rq) +{ + lockdep_assert_held(rq->q->queue_lock); + + if (rq->q->dev && !(rq->rq_flags & RQF_PM)) + --rq->q->nr_pending; +} +#else +static inline void blk_pm_request_resume(struct request_queue *q) +{ +} + +static inline void blk_pm_mark_last_busy(struct request *rq) +{ +} + +static inline void blk_pm_requeue_request(struct request *rq) +{ +} + +static inline void blk_pm_add_request(struct request_queue *q, + struct request *rq) +{ +} + +static inline void blk_pm_put_request(struct request *rq) +{ +} +#endif + +#endif /* _BLOCK_BLK_PM_H_ */ diff --git a/block/blk-stat.c b/block/blk-stat.c index 7587b1c3caaf..90561af85a62 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -190,6 +190,7 @@ void blk_stat_enable_accounting(struct request_queue *q) blk_queue_flag_set(QUEUE_FLAG_STATS, q); spin_unlock(&q->stats->lock); } +EXPORT_SYMBOL_GPL(blk_stat_enable_accounting); struct blk_queue_stats *blk_alloc_queue_stats(void) { diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 01d0620a4e4a..4bda70e8db48 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -84,8 +84,7 @@ struct throtl_service_queue { * RB tree of active children throtl_grp's, which are sorted by * their ->disptime. 
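The blk-throttle hunks that follow drop the hand-rolled first_pending pointer in favour of rb_root_cached, which tracks the leftmost node inside the tree itself. A self-contained sketch of that pattern, with a hypothetical item type, shows the insert and first-entry lookup shape the conversion ends up with:

#include <linux/rbtree.h>

struct item {
	struct rb_node node;
	unsigned long key;
};

static void item_insert(struct rb_root_cached *root, struct item *it)
{
	struct rb_node **link = &root->rb_root.rb_node, *parent = NULL;
	bool leftmost = true;

	while (*link) {
		struct item *cur = rb_entry(*link, struct item, node);

		parent = *link;
		if (it->key < cur->key) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = false;
		}
	}
	rb_link_node(&it->node, parent, link);
	rb_insert_color_cached(&it->node, root, leftmost);
}

static struct item *item_first(struct rb_root_cached *root)
{
	/* O(1): the cached leftmost node replaces the old first_pending. */
	struct rb_node *n = rb_first_cached(root);

	return n ? rb_entry(n, struct item, node) : NULL;
}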
*/ - struct rb_root pending_tree; /* RB tree of active tgs */ - struct rb_node *first_pending; /* first node in the tree */ + struct rb_root_cached pending_tree; /* RB tree of active tgs */ unsigned int nr_pending; /* # queued in the tree */ unsigned long first_pending_disptime; /* disptime of the first tg */ struct timer_list pending_timer; /* fires on first_pending_disptime */ @@ -475,7 +474,7 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq) { INIT_LIST_HEAD(&sq->queued[0]); INIT_LIST_HEAD(&sq->queued[1]); - sq->pending_tree = RB_ROOT; + sq->pending_tree = RB_ROOT_CACHED; timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0); } @@ -616,31 +615,23 @@ static void throtl_pd_free(struct blkg_policy_data *pd) static struct throtl_grp * throtl_rb_first(struct throtl_service_queue *parent_sq) { + struct rb_node *n; /* Service tree is empty */ if (!parent_sq->nr_pending) return NULL; - if (!parent_sq->first_pending) - parent_sq->first_pending = rb_first(&parent_sq->pending_tree); - - if (parent_sq->first_pending) - return rb_entry_tg(parent_sq->first_pending); - - return NULL; -} - -static void rb_erase_init(struct rb_node *n, struct rb_root *root) -{ - rb_erase(n, root); - RB_CLEAR_NODE(n); + n = rb_first_cached(&parent_sq->pending_tree); + WARN_ON_ONCE(!n); + if (!n) + return NULL; + return rb_entry_tg(n); } static void throtl_rb_erase(struct rb_node *n, struct throtl_service_queue *parent_sq) { - if (parent_sq->first_pending == n) - parent_sq->first_pending = NULL; - rb_erase_init(n, &parent_sq->pending_tree); + rb_erase_cached(n, &parent_sq->pending_tree); + RB_CLEAR_NODE(n); --parent_sq->nr_pending; } @@ -658,11 +649,11 @@ static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) static void tg_service_queue_add(struct throtl_grp *tg) { struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; - struct rb_node **node = &parent_sq->pending_tree.rb_node; + struct rb_node **node = &parent_sq->pending_tree.rb_root.rb_node; struct rb_node *parent = NULL; struct throtl_grp *__tg; unsigned long key = tg->disptime; - int left = 1; + bool leftmost = true; while (*node != NULL) { parent = *node; @@ -672,15 +663,13 @@ static void tg_service_queue_add(struct throtl_grp *tg) node = &parent->rb_left; else { node = &parent->rb_right; - left = 0; + leftmost = false; } } - if (left) - parent_sq->first_pending = &tg->rb_node; - rb_link_node(&tg->rb_node, parent, node); - rb_insert_color(&tg->rb_node, &parent_sq->pending_tree); + rb_insert_color_cached(&tg->rb_node, &parent_sq->pending_tree, + leftmost); } static void __throtl_enqueue_tg(struct throtl_grp *tg) @@ -2126,21 +2115,11 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) } #endif -static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) -{ -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - /* fallback to root_blkg if we fail to get a blkg ref */ - if (bio->bi_css && (bio_associate_blkg(bio, tg_to_blkg(tg)) == -ENODEV)) - bio_associate_blkg(bio, bio->bi_disk->queue->root_blkg); - bio_issue_init(&bio->bi_issue, bio_sectors(bio)); -#endif -} - bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, struct bio *bio) { struct throtl_qnode *qn = NULL; - struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg); + struct throtl_grp *tg = blkg_to_tg(blkg); struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); bool throttled = false; @@ -2159,7 +2138,6 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, if 
(unlikely(blk_queue_bypass(q))) goto out_unlock; - blk_throtl_assoc_bio(tg, bio); blk_throtl_update_idletime(tg); sq = &tg->service_queue; diff --git a/block/blk.h b/block/blk.h index 9db4e389582c..58c030f727e9 100644 --- a/block/blk.h +++ b/block/blk.h @@ -4,6 +4,7 @@ #include <linux/idr.h> #include <linux/blk-mq.h> +#include <xen/xen.h> #include "blk-mq.h" /* Amount of time in which a process may batch requests */ @@ -149,6 +150,41 @@ static inline void blk_queue_enter_live(struct request_queue *q) percpu_ref_get(&q->q_usage_counter); } +static inline bool biovec_phys_mergeable(struct request_queue *q, + struct bio_vec *vec1, struct bio_vec *vec2) +{ + unsigned long mask = queue_segment_boundary(q); + phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset; + phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset; + + if (addr1 + vec1->bv_len != addr2) + return false; + if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2)) + return false; + if ((addr1 | mask) != ((addr2 + vec2->bv_len - 1) | mask)) + return false; + return true; +} + +static inline bool __bvec_gap_to_prev(struct request_queue *q, + struct bio_vec *bprv, unsigned int offset) +{ + return offset || + ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q)); +} + +/* + * Check if adding a bio_vec after bprv with offset would create a gap in + * the SG list. Most drivers don't care about this, but some do. + */ +static inline bool bvec_gap_to_prev(struct request_queue *q, + struct bio_vec *bprv, unsigned int offset) +{ + if (!queue_virt_boundary(q)) + return false; + return __bvec_gap_to_prev(q, bprv, offset); +} + #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_flush_integrity(void); bool __bio_integrity_endio(struct bio *); @@ -158,7 +194,38 @@ static inline bool bio_integrity_endio(struct bio *bio) return __bio_integrity_endio(bio); return true; } -#else + +static inline bool integrity_req_gap_back_merge(struct request *req, + struct bio *next) +{ + struct bio_integrity_payload *bip = bio_integrity(req->bio); + struct bio_integrity_payload *bip_next = bio_integrity(next); + + return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1], + bip_next->bip_vec[0].bv_offset); +} + +static inline bool integrity_req_gap_front_merge(struct request *req, + struct bio *bio) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct bio_integrity_payload *bip_next = bio_integrity(req->bio); + + return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1], + bip_next->bip_vec[0].bv_offset); +} +#else /* CONFIG_BLK_DEV_INTEGRITY */ +static inline bool integrity_req_gap_back_merge(struct request *req, + struct bio *next) +{ + return false; +} +static inline bool integrity_req_gap_front_merge(struct request *req, + struct bio *bio) +{ + return false; +} + static inline void blk_flush_integrity(void) { } @@ -166,7 +233,7 @@ static inline bool bio_integrity_endio(struct bio *bio) { return true; } -#endif +#endif /* CONFIG_BLK_DEV_INTEGRITY */ void blk_timeout_work(struct work_struct *work); unsigned long blk_rq_timeout(unsigned long timeout); diff --git a/block/bounce.c b/block/bounce.c index bc63b3a2d18c..b30071ac4ec6 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -257,7 +257,9 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, } } - bio_clone_blkcg_association(bio, bio_src); + bio_clone_blkg_association(bio, bio_src); + + blkcg_bio_issue_init(bio); return bio; } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 2eb87444b157..d219e9a1af65 100644 --- 
a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3753,7 +3753,7 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) uint64_t serial_nr; rcu_read_lock(); - serial_nr = bio_blkcg(bio)->css.serial_nr; + serial_nr = __bio_blkcg(bio)->css.serial_nr; rcu_read_unlock(); /* @@ -3818,7 +3818,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, struct cfq_group *cfqg; rcu_read_lock(); - cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio)); + cfqg = cfq_lookup_cfqg(cfqd, __bio_blkcg(bio)); if (!cfqg) { cfqq = &cfqd->oom_cfqq; goto out; diff --git a/block/elevator.c b/block/elevator.c index fae58b2f906f..8fdcd64ae12e 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -41,6 +41,7 @@ #include "blk.h" #include "blk-mq-sched.h" +#include "blk-pm.h" #include "blk-wbt.h" static DEFINE_SPINLOCK(elv_list_lock); @@ -557,27 +558,6 @@ void elv_bio_merged(struct request_queue *q, struct request *rq, e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio); } -#ifdef CONFIG_PM -static void blk_pm_requeue_request(struct request *rq) -{ - if (rq->q->dev && !(rq->rq_flags & RQF_PM)) - rq->q->nr_pending--; -} - -static void blk_pm_add_request(struct request_queue *q, struct request *rq) -{ - if (q->dev && !(rq->rq_flags & RQF_PM) && q->nr_pending++ == 0 && - (q->rpm_status == RPM_SUSPENDED || q->rpm_status == RPM_SUSPENDING)) - pm_request_resume(q->dev); -} -#else -static inline void blk_pm_requeue_request(struct request *rq) {} -static inline void blk_pm_add_request(struct request_queue *q, - struct request *rq) -{ -} -#endif - void elv_requeue_request(struct request_queue *q, struct request *rq) { /* diff --git a/block/genhd.c b/block/genhd.c index be5bab20b2ab..cff6bdf27226 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -567,7 +567,8 @@ static int exact_lock(dev_t devt, void *data) return 0; } -static void register_disk(struct device *parent, struct gendisk *disk) +static void register_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups) { struct device *ddev = disk_to_dev(disk); struct block_device *bdev; @@ -582,6 +583,10 @@ static void register_disk(struct device *parent, struct gendisk *disk) /* delay uevents, until we scanned partition table */ dev_set_uevent_suppress(ddev, 1); + if (groups) { + WARN_ON(ddev->groups); + ddev->groups = groups; + } if (device_add(ddev)) return; if (!sysfs_deprecated) { @@ -647,6 +652,7 @@ exit: * __device_add_disk - add disk information to kernel list * @parent: parent device for the disk * @disk: per-device partitioning information + * @groups: Additional per-device sysfs groups * @register_queue: register the queue if set to true * * This function registers the partitioning information in @disk @@ -655,6 +661,7 @@ exit: * FIXME: error handling */ static void __device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups, bool register_queue) { dev_t devt; @@ -698,7 +705,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, blk_register_region(disk_devt(disk), disk->minors, NULL, exact_match, exact_lock, disk); } - register_disk(parent, disk); + register_disk(parent, disk, groups); if (register_queue) blk_register_queue(disk); @@ -712,15 +719,17 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, blk_integrity_add(disk); } -void device_add_disk(struct device *parent, struct gendisk *disk) +void device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups) + { - 
__device_add_disk(parent, disk, true); + __device_add_disk(parent, disk, groups, true); } EXPORT_SYMBOL(device_add_disk); void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) { - __device_add_disk(parent, disk, false); + __device_add_disk(parent, disk, NULL, false); } EXPORT_SYMBOL(device_add_disk_no_queue_reg); diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index a1660bafc912..eccac01a10b6 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -29,19 +29,30 @@ #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" #include "blk-mq-tag.h" -#include "blk-stat.h" -/* Scheduling domains. */ +#define CREATE_TRACE_POINTS +#include <trace/events/kyber.h> + +/* + * Scheduling domains: the device is divided into multiple domains based on the + * request type. + */ enum { KYBER_READ, - KYBER_SYNC_WRITE, - KYBER_OTHER, /* Async writes, discard, etc. */ + KYBER_WRITE, + KYBER_DISCARD, + KYBER_OTHER, KYBER_NUM_DOMAINS, }; -enum { - KYBER_MIN_DEPTH = 256, +static const char *kyber_domain_names[] = { + [KYBER_READ] = "READ", + [KYBER_WRITE] = "WRITE", + [KYBER_DISCARD] = "DISCARD", + [KYBER_OTHER] = "OTHER", +}; +enum { /* * In order to prevent starvation of synchronous requests by a flood of * asynchronous requests, we reserve 25% of requests for synchronous @@ -51,25 +62,87 @@ enum { }; /* - * Initial device-wide depths for each scheduling domain. + * Maximum device-wide depth for each scheduling domain. * - * Even for fast devices with lots of tags like NVMe, you can saturate - * the device with only a fraction of the maximum possible queue depth. - * So, we cap these to a reasonable value. + * Even for fast devices with lots of tags like NVMe, you can saturate the + * device with only a fraction of the maximum possible queue depth. So, we cap + * these to a reasonable value. */ static const unsigned int kyber_depth[] = { [KYBER_READ] = 256, - [KYBER_SYNC_WRITE] = 128, - [KYBER_OTHER] = 64, + [KYBER_WRITE] = 128, + [KYBER_DISCARD] = 64, + [KYBER_OTHER] = 16, }; /* - * Scheduling domain batch sizes. We favor reads. + * Default latency targets for each scheduling domain. + */ +static const u64 kyber_latency_targets[] = { + [KYBER_READ] = 2ULL * NSEC_PER_MSEC, + [KYBER_WRITE] = 10ULL * NSEC_PER_MSEC, + [KYBER_DISCARD] = 5ULL * NSEC_PER_SEC, +}; + +/* + * Batch size (number of requests we'll dispatch in a row) for each scheduling + * domain. */ static const unsigned int kyber_batch_size[] = { [KYBER_READ] = 16, - [KYBER_SYNC_WRITE] = 8, - [KYBER_OTHER] = 8, + [KYBER_WRITE] = 8, + [KYBER_DISCARD] = 1, + [KYBER_OTHER] = 1, +}; + +/* + * Requests latencies are recorded in a histogram with buckets defined relative + * to the target latency: + * + * <= 1/4 * target latency + * <= 1/2 * target latency + * <= 3/4 * target latency + * <= target latency + * <= 1 1/4 * target latency + * <= 1 1/2 * target latency + * <= 1 3/4 * target latency + * > 1 3/4 * target latency + */ +enum { + /* + * The width of the latency histogram buckets is + * 1 / (1 << KYBER_LATENCY_SHIFT) * target latency. + */ + KYBER_LATENCY_SHIFT = 2, + /* + * The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency, + * thus, "good". + */ + KYBER_GOOD_BUCKETS = 1 << KYBER_LATENCY_SHIFT, + /* There are also (1 << KYBER_LATENCY_SHIFT) "bad" buckets. */ + KYBER_LATENCY_BUCKETS = 2 << KYBER_LATENCY_SHIFT, +}; + +/* + * We measure both the total latency and the I/O latency (i.e., latency after + * submitting to the device). 
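With KYBER_LATENCY_SHIFT = 2 the bucket width is a quarter of the target, so for the default 2 ms read target the eight buckets top out at 0.5, 1.0, 1.5, 2.0, 2.5, 3.0 and 3.5 ms, and anything slower lands in the last bucket. A throwaway check of that arithmetic, with the values assumed from the defaults above:

#include <stdio.h>

int main(void)
{
	unsigned long long target_ns = 2000000ULL;	/* 2 ms read target */
	int shift = 2, b;

	for (b = 1; b < (2 << shift); b++)
		printf("bucket %d upper bound: %llu ns\n",
		       b - 1, b * (target_ns >> shift));
	printf("bucket %d: above %llu ns\n",
	       (2 << shift) - 1, 7 * (target_ns >> shift));
	return 0;
}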
+ */ +enum { + KYBER_TOTAL_LATENCY, + KYBER_IO_LATENCY, +}; + +static const char *kyber_latency_type_names[] = { + [KYBER_TOTAL_LATENCY] = "total", + [KYBER_IO_LATENCY] = "I/O", +}; + +/* + * Per-cpu latency histograms: total latency and I/O latency for each scheduling + * domain except for KYBER_OTHER. + */ +struct kyber_cpu_latency { + atomic_t buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS]; }; /* @@ -88,12 +161,9 @@ struct kyber_ctx_queue { struct kyber_queue_data { struct request_queue *q; - struct blk_stat_callback *cb; - /* - * The device is divided into multiple scheduling domains based on the - * request type. Each domain has a fixed number of in-flight requests of - * that type device-wide, limited by these tokens. + * Each scheduling domain has a limited number of in-flight requests + * device-wide, limited by these tokens. */ struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS]; @@ -103,8 +173,19 @@ struct kyber_queue_data { */ unsigned int async_depth; + struct kyber_cpu_latency __percpu *cpu_latency; + + /* Timer for stats aggregation and adjusting domain tokens. */ + struct timer_list timer; + + unsigned int latency_buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS]; + + unsigned long latency_timeout[KYBER_OTHER]; + + int domain_p99[KYBER_OTHER]; + /* Target latencies in nanoseconds. */ - u64 read_lat_nsec, write_lat_nsec; + u64 latency_targets[KYBER_OTHER]; }; struct kyber_hctx_data { @@ -124,233 +205,219 @@ static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags, static unsigned int kyber_sched_domain(unsigned int op) { - if ((op & REQ_OP_MASK) == REQ_OP_READ) + switch (op & REQ_OP_MASK) { + case REQ_OP_READ: return KYBER_READ; - else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op)) - return KYBER_SYNC_WRITE; - else + case REQ_OP_WRITE: + return KYBER_WRITE; + case REQ_OP_DISCARD: + return KYBER_DISCARD; + default: return KYBER_OTHER; + } } -enum { - NONE = 0, - GOOD = 1, - GREAT = 2, - BAD = -1, - AWFUL = -2, -}; - -#define IS_GOOD(status) ((status) > 0) -#define IS_BAD(status) ((status) < 0) - -static int kyber_lat_status(struct blk_stat_callback *cb, - unsigned int sched_domain, u64 target) +static void flush_latency_buckets(struct kyber_queue_data *kqd, + struct kyber_cpu_latency *cpu_latency, + unsigned int sched_domain, unsigned int type) { - u64 latency; - - if (!cb->stat[sched_domain].nr_samples) - return NONE; + unsigned int *buckets = kqd->latency_buckets[sched_domain][type]; + atomic_t *cpu_buckets = cpu_latency->buckets[sched_domain][type]; + unsigned int bucket; - latency = cb->stat[sched_domain].mean; - if (latency >= 2 * target) - return AWFUL; - else if (latency > target) - return BAD; - else if (latency <= target / 2) - return GREAT; - else /* (latency <= target) */ - return GOOD; + for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++) + buckets[bucket] += atomic_xchg(&cpu_buckets[bucket], 0); } /* - * Adjust the read or synchronous write depth given the status of reads and - * writes. The goal is that the latencies of the two domains are fair (i.e., if - * one is good, then the other is good). + * Calculate the histogram bucket with the given percentile rank, or -1 if there + * aren't enough samples yet. 
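flush_latency_buckets() above drains each CPU's atomic counters into the aggregate histogram with atomic_xchg(), so samples recorded while the timer runs are neither lost nor double counted. A hedged, stripped-down sketch of the same pattern with a hypothetical my_cpu_hist type:

#include <linux/atomic.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>

#define MY_BUCKETS 8

struct my_cpu_hist {
	atomic_t buckets[MY_BUCKETS];
};

/* Drain every CPU's counters into one histogram; atomic_xchg() returns the
 * old value and resets the per-CPU slot in a single step. */
static void my_flush(struct my_cpu_hist __percpu *pcpu,
		     unsigned int out[MY_BUCKETS])
{
	unsigned int bucket;
	int cpu;

	for_each_online_cpu(cpu) {
		struct my_cpu_hist *h = per_cpu_ptr(pcpu, cpu);

		for (bucket = 0; bucket < MY_BUCKETS; bucket++)
			out[bucket] += atomic_xchg(&h->buckets[bucket], 0);
	}
}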
*/ -static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd, - unsigned int sched_domain, int this_status, - int other_status) +static int calculate_percentile(struct kyber_queue_data *kqd, + unsigned int sched_domain, unsigned int type, + unsigned int percentile) { - unsigned int orig_depth, depth; + unsigned int *buckets = kqd->latency_buckets[sched_domain][type]; + unsigned int bucket, samples = 0, percentile_samples; + + for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++) + samples += buckets[bucket]; + + if (!samples) + return -1; /* - * If this domain had no samples, or reads and writes are both good or - * both bad, don't adjust the depth. + * We do the calculation once we have 500 samples or one second passes + * since the first sample was recorded, whichever comes first. */ - if (this_status == NONE || - (IS_GOOD(this_status) && IS_GOOD(other_status)) || - (IS_BAD(this_status) && IS_BAD(other_status))) - return; - - orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth; + if (!kqd->latency_timeout[sched_domain]) + kqd->latency_timeout[sched_domain] = max(jiffies + HZ, 1UL); + if (samples < 500 && + time_is_after_jiffies(kqd->latency_timeout[sched_domain])) { + return -1; + } + kqd->latency_timeout[sched_domain] = 0; - if (other_status == NONE) { - depth++; - } else { - switch (this_status) { - case GOOD: - if (other_status == AWFUL) - depth -= max(depth / 4, 1U); - else - depth -= max(depth / 8, 1U); - break; - case GREAT: - if (other_status == AWFUL) - depth /= 2; - else - depth -= max(depth / 4, 1U); + percentile_samples = DIV_ROUND_UP(samples * percentile, 100); + for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) { + if (buckets[bucket] >= percentile_samples) break; - case BAD: - depth++; - break; - case AWFUL: - if (other_status == GREAT) - depth += 2; - else - depth++; - break; - } + percentile_samples -= buckets[bucket]; } + memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type])); - depth = clamp(depth, 1U, kyber_depth[sched_domain]); - if (depth != orig_depth) - sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth); + trace_kyber_latency(kqd->q, kyber_domain_names[sched_domain], + kyber_latency_type_names[type], percentile, + bucket + 1, 1 << KYBER_LATENCY_SHIFT, samples); + + return bucket; } -/* - * Adjust the depth of other requests given the status of reads and synchronous - * writes. As long as either domain is doing fine, we don't throttle, but if - * both domains are doing badly, we throttle heavily. 
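calculate_percentile() walks the aggregated buckets until the cumulative count reaches the requested rank. A user-space model of that walk, with made-up counts (the 500-sample / one-second gate is left out), reproduces the arithmetic:

#include <stdio.h>

#define NBUCKETS 8

/* Return the first bucket whose cumulative count reaches the requested
 * percentile, or -1 if there are no samples, mirroring the walk above. */
static int percentile_bucket(const unsigned int buckets[NBUCKETS],
			     unsigned int percentile)
{
	unsigned int samples = 0, need, b;

	for (b = 0; b < NBUCKETS; b++)
		samples += buckets[b];
	if (!samples)
		return -1;

	need = (samples * percentile + 99) / 100;	/* DIV_ROUND_UP */
	for (b = 0; b < NBUCKETS - 1; b++) {
		if (buckets[b] >= need)
			break;
		need -= buckets[b];
	}
	return b;
}

int main(void)
{
	unsigned int hist[NBUCKETS] = { 400, 50, 30, 10, 5, 3, 1, 1 };

	printf("p90 bucket: %d\n", percentile_bucket(hist, 90)); /* 1 */
	printf("p99 bucket: %d\n", percentile_bucket(hist, 99)); /* 4 */
	return 0;
}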
- */ -static void kyber_adjust_other_depth(struct kyber_queue_data *kqd, - int read_status, int write_status, - bool have_samples) -{ - unsigned int orig_depth, depth; - int status; - - orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth; - - if (read_status == NONE && write_status == NONE) { - depth += 2; - } else if (have_samples) { - if (read_status == NONE) - status = write_status; - else if (write_status == NONE) - status = read_status; - else - status = max(read_status, write_status); - switch (status) { - case GREAT: - depth += 2; - break; - case GOOD: - depth++; - break; - case BAD: - depth -= max(depth / 4, 1U); - break; - case AWFUL: - depth /= 2; - break; - } +static void kyber_resize_domain(struct kyber_queue_data *kqd, + unsigned int sched_domain, unsigned int depth) +{ + depth = clamp(depth, 1U, kyber_depth[sched_domain]); + if (depth != kqd->domain_tokens[sched_domain].sb.depth) { + sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth); + trace_kyber_adjust(kqd->q, kyber_domain_names[sched_domain], + depth); } - - depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]); - if (depth != orig_depth) - sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth); } -/* - * Apply heuristics for limiting queue depths based on gathered latency - * statistics. - */ -static void kyber_stat_timer_fn(struct blk_stat_callback *cb) +static void kyber_timer_fn(struct timer_list *t) { - struct kyber_queue_data *kqd = cb->data; - int read_status, write_status; + struct kyber_queue_data *kqd = from_timer(kqd, t, timer); + unsigned int sched_domain; + int cpu; + bool bad = false; + + /* Sum all of the per-cpu latency histograms. */ + for_each_online_cpu(cpu) { + struct kyber_cpu_latency *cpu_latency; + + cpu_latency = per_cpu_ptr(kqd->cpu_latency, cpu); + for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) { + flush_latency_buckets(kqd, cpu_latency, sched_domain, + KYBER_TOTAL_LATENCY); + flush_latency_buckets(kqd, cpu_latency, sched_domain, + KYBER_IO_LATENCY); + } + } - read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec); - write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec); + /* + * Check if any domains have a high I/O latency, which might indicate + * congestion in the device. Note that we use the p90; we don't want to + * be too sensitive to outliers here. + */ + for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) { + int p90; - kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status); - kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status); - kyber_adjust_other_depth(kqd, read_status, write_status, - cb->stat[KYBER_OTHER].nr_samples != 0); + p90 = calculate_percentile(kqd, sched_domain, KYBER_IO_LATENCY, + 90); + if (p90 >= KYBER_GOOD_BUCKETS) + bad = true; + } /* - * Continue monitoring latencies if we aren't hitting the targets or - * we're still throttling other requests. + * Adjust the scheduling domain depths. If we determined that there was + * congestion, we throttle all domains with good latencies. Either way, + * we ease up on throttling domains with bad latencies. 
*/ - if (!blk_stat_is_active(kqd->cb) && - ((IS_BAD(read_status) || IS_BAD(write_status) || - kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER]))) - blk_stat_activate_msecs(kqd->cb, 100); + for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) { + unsigned int orig_depth, depth; + int p99; + + p99 = calculate_percentile(kqd, sched_domain, + KYBER_TOTAL_LATENCY, 99); + /* + * This is kind of subtle: different domains will not + * necessarily have enough samples to calculate the latency + * percentiles during the same window, so we have to remember + * the p99 for the next time we observe congestion; once we do, + * we don't want to throttle again until we get more data, so we + * reset it to -1. + */ + if (bad) { + if (p99 < 0) + p99 = kqd->domain_p99[sched_domain]; + kqd->domain_p99[sched_domain] = -1; + } else if (p99 >= 0) { + kqd->domain_p99[sched_domain] = p99; + } + if (p99 < 0) + continue; + + /* + * If this domain has bad latency, throttle less. Otherwise, + * throttle more iff we determined that there is congestion. + * + * The new depth is scaled linearly with the p99 latency vs the + * latency target. E.g., if the p99 is 3/4 of the target, then + * we throttle down to 3/4 of the current depth, and if the p99 + * is 2x the target, then we double the depth. + */ + if (bad || p99 >= KYBER_GOOD_BUCKETS) { + orig_depth = kqd->domain_tokens[sched_domain].sb.depth; + depth = (orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT; + kyber_resize_domain(kqd, sched_domain, depth); + } + } } -static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd) +static unsigned int kyber_sched_tags_shift(struct request_queue *q) { /* * All of the hardware queues have the same depth, so we can just grab * the shift of the first one. */ - return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift; -} - -static int kyber_bucket_fn(const struct request *rq) -{ - return kyber_sched_domain(rq->cmd_flags); + return q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift; } static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) { struct kyber_queue_data *kqd; - unsigned int max_tokens; unsigned int shift; int ret = -ENOMEM; int i; - kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node); + kqd = kzalloc_node(sizeof(*kqd), GFP_KERNEL, q->node); if (!kqd) goto err; + kqd->q = q; - kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn, - KYBER_NUM_DOMAINS, kqd); - if (!kqd->cb) + kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency, + GFP_KERNEL | __GFP_ZERO); + if (!kqd->cpu_latency) goto err_kqd; - /* - * The maximum number of tokens for any scheduling domain is at least - * the queue depth of a single hardware queue. If the hardware doesn't - * have many tags, still provide a reasonable number. 
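The scaling comment above reads back directly from the shift: with KYBER_LATENCY_SHIFT = 2, the new depth is orig_depth * (p99_bucket + 1) / 4, which kyber_resize_domain() then clamps to [1, kyber_depth[domain]]. A quick arithmetic check with an assumed current depth of 128:

#include <stdio.h>

static unsigned int scale_depth(unsigned int orig_depth, int p99_bucket)
{
	/* KYBER_LATENCY_SHIFT == 2: bucket + 1 is the p99 in target/4 units. */
	return (orig_depth * (p99_bucket + 1)) >> 2;
}

int main(void)
{
	printf("%u\n", scale_depth(128, 2));	/* p99 ~ 3/4 target -> 96  */
	printf("%u\n", scale_depth(128, 3));	/* p99 ~ target     -> 128 */
	printf("%u\n", scale_depth(128, 7));	/* p99 > 7/4 target -> 256 */
	return 0;
}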
- */ - max_tokens = max_t(unsigned int, q->tag_set->queue_depth, - KYBER_MIN_DEPTH); + timer_setup(&kqd->timer, kyber_timer_fn, 0); + for (i = 0; i < KYBER_NUM_DOMAINS; i++) { WARN_ON(!kyber_depth[i]); WARN_ON(!kyber_batch_size[i]); ret = sbitmap_queue_init_node(&kqd->domain_tokens[i], - max_tokens, -1, false, GFP_KERNEL, - q->node); + kyber_depth[i], -1, false, + GFP_KERNEL, q->node); if (ret) { while (--i >= 0) sbitmap_queue_free(&kqd->domain_tokens[i]); - goto err_cb; + goto err_buckets; } - sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]); } - shift = kyber_sched_tags_shift(kqd); - kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; + for (i = 0; i < KYBER_OTHER; i++) { + kqd->domain_p99[i] = -1; + kqd->latency_targets[i] = kyber_latency_targets[i]; + } - kqd->read_lat_nsec = 2000000ULL; - kqd->write_lat_nsec = 10000000ULL; + shift = kyber_sched_tags_shift(q); + kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; return kqd; -err_cb: - blk_stat_free_callback(kqd->cb); +err_buckets: + free_percpu(kqd->cpu_latency); err_kqd: kfree(kqd); err: @@ -372,25 +439,24 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_type *e) return PTR_ERR(kqd); } + blk_stat_enable_accounting(q); + eq->elevator_data = kqd; q->elevator = eq; - blk_stat_add_callback(q, kqd->cb); - return 0; } static void kyber_exit_sched(struct elevator_queue *e) { struct kyber_queue_data *kqd = e->elevator_data; - struct request_queue *q = kqd->q; int i; - blk_stat_remove_callback(q, kqd->cb); + del_timer_sync(&kqd->timer); for (i = 0; i < KYBER_NUM_DOMAINS; i++) sbitmap_queue_free(&kqd->domain_tokens[i]); - blk_stat_free_callback(kqd->cb); + free_percpu(kqd->cpu_latency); kfree(kqd); } @@ -558,41 +624,44 @@ static void kyber_finish_request(struct request *rq) rq_clear_domain_token(kqd, rq); } -static void kyber_completed_request(struct request *rq) +static void add_latency_sample(struct kyber_cpu_latency *cpu_latency, + unsigned int sched_domain, unsigned int type, + u64 target, u64 latency) { - struct request_queue *q = rq->q; - struct kyber_queue_data *kqd = q->elevator->elevator_data; - unsigned int sched_domain; - u64 now, latency, target; + unsigned int bucket; + u64 divisor; - /* - * Check if this request met our latency goal. If not, quickly gather - * some statistics and start throttling. - */ - sched_domain = kyber_sched_domain(rq->cmd_flags); - switch (sched_domain) { - case KYBER_READ: - target = kqd->read_lat_nsec; - break; - case KYBER_SYNC_WRITE: - target = kqd->write_lat_nsec; - break; - default: - return; + if (latency > 0) { + divisor = max_t(u64, target >> KYBER_LATENCY_SHIFT, 1); + bucket = min_t(unsigned int, div64_u64(latency - 1, divisor), + KYBER_LATENCY_BUCKETS - 1); + } else { + bucket = 0; } - /* If we are already monitoring latencies, don't check again. 
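add_latency_sample() above maps a completion latency to a bucket by dividing by a quarter of the target. A user-space check of that math, assuming the default 2 ms read target:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC	1000000ULL
#define SHIFT		2
#define NBUCKETS	(2 << SHIFT)

static unsigned int latency_bucket(uint64_t target, uint64_t latency)
{
	uint64_t divisor = (target >> SHIFT) ? (target >> SHIFT) : 1;
	uint64_t b;

	if (!latency)
		return 0;
	b = (latency - 1) / divisor;
	return b < NBUCKETS - 1 ? (unsigned int)b : NBUCKETS - 1;
}

int main(void)
{
	uint64_t target = 2 * NSEC_PER_MSEC;

	printf("%u\n", latency_bucket(target, 1300000)); /* 2: <= 3/4 target */
	printf("%u\n", latency_bucket(target, 2000000)); /* 3: <= target     */
	printf("%u\n", latency_bucket(target, 4200000)); /* 7: clamped "bad" */
	return 0;
}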
*/ - if (blk_stat_is_active(kqd->cb)) - return; + atomic_inc(&cpu_latency->buckets[sched_domain][type][bucket]); +} - now = ktime_get_ns(); - if (now < rq->io_start_time_ns) +static void kyber_completed_request(struct request *rq, u64 now) +{ + struct kyber_queue_data *kqd = rq->q->elevator->elevator_data; + struct kyber_cpu_latency *cpu_latency; + unsigned int sched_domain; + u64 target; + + sched_domain = kyber_sched_domain(rq->cmd_flags); + if (sched_domain == KYBER_OTHER) return; - latency = now - rq->io_start_time_ns; + cpu_latency = get_cpu_ptr(kqd->cpu_latency); + target = kqd->latency_targets[sched_domain]; + add_latency_sample(cpu_latency, sched_domain, KYBER_TOTAL_LATENCY, + target, now - rq->start_time_ns); + add_latency_sample(cpu_latency, sched_domain, KYBER_IO_LATENCY, target, + now - rq->io_start_time_ns); + put_cpu_ptr(kqd->cpu_latency); - if (latency > target) - blk_stat_activate_msecs(kqd->cb, 10); + timer_reduce(&kqd->timer, jiffies + HZ / 10); } struct flush_kcq_data { @@ -713,6 +782,9 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd, rq_set_domain_token(rq, nr); list_del_init(&rq->queuelist); return rq; + } else { + trace_kyber_throttled(kqd->q, + kyber_domain_names[khd->cur_domain]); } } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) { nr = kyber_get_domain_token(kqd, khd, hctx); @@ -723,6 +795,9 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd, rq_set_domain_token(rq, nr); list_del_init(&rq->queuelist); return rq; + } else { + trace_kyber_throttled(kqd->q, + kyber_domain_names[khd->cur_domain]); } } @@ -790,17 +865,17 @@ static bool kyber_has_work(struct blk_mq_hw_ctx *hctx) return false; } -#define KYBER_LAT_SHOW_STORE(op) \ -static ssize_t kyber_##op##_lat_show(struct elevator_queue *e, \ - char *page) \ +#define KYBER_LAT_SHOW_STORE(domain, name) \ +static ssize_t kyber_##name##_lat_show(struct elevator_queue *e, \ + char *page) \ { \ struct kyber_queue_data *kqd = e->elevator_data; \ \ - return sprintf(page, "%llu\n", kqd->op##_lat_nsec); \ + return sprintf(page, "%llu\n", kqd->latency_targets[domain]); \ } \ \ -static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \ - const char *page, size_t count) \ +static ssize_t kyber_##name##_lat_store(struct elevator_queue *e, \ + const char *page, size_t count) \ { \ struct kyber_queue_data *kqd = e->elevator_data; \ unsigned long long nsec; \ @@ -810,12 +885,12 @@ static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \ if (ret) \ return ret; \ \ - kqd->op##_lat_nsec = nsec; \ + kqd->latency_targets[domain] = nsec; \ \ return count; \ } -KYBER_LAT_SHOW_STORE(read); -KYBER_LAT_SHOW_STORE(write); +KYBER_LAT_SHOW_STORE(KYBER_READ, read); +KYBER_LAT_SHOW_STORE(KYBER_WRITE, write); #undef KYBER_LAT_SHOW_STORE #define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store) @@ -882,7 +957,8 @@ static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \ return 0; \ } KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read) -KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write) +KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_WRITE, write) +KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard) KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other) #undef KYBER_DEBUGFS_DOMAIN_ATTRS @@ -900,20 +976,7 @@ static int kyber_cur_domain_show(void *data, struct seq_file *m) struct blk_mq_hw_ctx *hctx = data; struct kyber_hctx_data *khd = hctx->sched_data; - switch (khd->cur_domain) { - case KYBER_READ: - seq_puts(m, "READ\n"); - break; - case KYBER_SYNC_WRITE: - 
seq_puts(m, "SYNC_WRITE\n"); - break; - case KYBER_OTHER: - seq_puts(m, "OTHER\n"); - break; - default: - seq_printf(m, "%u\n", khd->cur_domain); - break; - } + seq_printf(m, "%s\n", kyber_domain_names[khd->cur_domain]); return 0; } @@ -930,7 +993,8 @@ static int kyber_batching_show(void *data, struct seq_file *m) {#name "_tokens", 0400, kyber_##name##_tokens_show} static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = { KYBER_QUEUE_DOMAIN_ATTRS(read), - KYBER_QUEUE_DOMAIN_ATTRS(sync_write), + KYBER_QUEUE_DOMAIN_ATTRS(write), + KYBER_QUEUE_DOMAIN_ATTRS(discard), KYBER_QUEUE_DOMAIN_ATTRS(other), {"async_depth", 0400, kyber_async_depth_show}, {}, @@ -942,7 +1006,8 @@ static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = { {#name "_waiting", 0400, kyber_##name##_waiting_show} static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = { KYBER_HCTX_DOMAIN_ATTRS(read), - KYBER_HCTX_DOMAIN_ATTRS(sync_write), + KYBER_HCTX_DOMAIN_ATTRS(write), + KYBER_HCTX_DOMAIN_ATTRS(discard), KYBER_HCTX_DOMAIN_ATTRS(other), {"cur_domain", 0400, kyber_cur_domain_show}, {"batching", 0400, kyber_batching_show}, diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index c0ebda1283cc..015c68017a1c 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -201,7 +201,6 @@ int aoeblk_init(void); void aoeblk_exit(void); void aoeblk_gdalloc(void *); void aoedisk_rm_debugfs(struct aoedev *d); -void aoedisk_rm_sysfs(struct aoedev *d); int aoechr_init(void); void aoechr_exit(void); diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 429ebb84b592..ff770e7d9e52 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -177,10 +177,15 @@ static struct attribute *aoe_attrs[] = { NULL, }; -static const struct attribute_group attr_group = { +static const struct attribute_group aoe_attr_group = { .attrs = aoe_attrs, }; +static const struct attribute_group *aoe_attr_groups[] = { + &aoe_attr_group, + NULL, +}; + static const struct file_operations aoe_debugfs_fops = { .open = aoe_debugfs_open, .read = seq_read, @@ -220,17 +225,6 @@ aoedisk_rm_debugfs(struct aoedev *d) } static int -aoedisk_add_sysfs(struct aoedev *d) -{ - return sysfs_create_group(&disk_to_dev(d->gd)->kobj, &attr_group); -} -void -aoedisk_rm_sysfs(struct aoedev *d) -{ - sysfs_remove_group(&disk_to_dev(d->gd)->kobj, &attr_group); -} - -static int aoeblk_open(struct block_device *bdev, fmode_t mode) { struct aoedev *d = bdev->bd_disk->private_data; @@ -417,8 +411,7 @@ aoeblk_gdalloc(void *vp) spin_unlock_irqrestore(&d->lock, flags); - add_disk(gd); - aoedisk_add_sysfs(d); + device_add_disk(NULL, gd, aoe_attr_groups); aoedisk_add_debugfs(d); spin_lock_irqsave(&d->lock, flags); diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index 41060e9cedf2..f29a140cdbc1 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -275,7 +275,6 @@ freedev(struct aoedev *d) del_timer_sync(&d->timer); if (d->gd) { aoedisk_rm_debugfs(d); - aoedisk_rm_sysfs(d); del_gendisk(d->gd); put_disk(d->gd); blk_cleanup_queue(d->blkq); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e35a234b0a8f..8a4c1328e6f9 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -724,10 +724,10 @@ struct drbd_connection { struct list_head transfer_log; /* all requests not yet fully processed */ struct crypto_shash *cram_hmac_tfm; - struct crypto_ahash *integrity_tfm; /* checksums we compute, updates protected by 
connection->data->mutex */ - struct crypto_ahash *peer_integrity_tfm; /* checksums we verify, only accessed from receiver thread */ - struct crypto_ahash *csums_tfm; - struct crypto_ahash *verify_tfm; + struct crypto_shash *integrity_tfm; /* checksums we compute, updates protected by connection->data->mutex */ + struct crypto_shash *peer_integrity_tfm; /* checksums we verify, only accessed from receiver thread */ + struct crypto_shash *csums_tfm; + struct crypto_shash *verify_tfm; void *int_dig_in; void *int_dig_vv; @@ -1531,8 +1531,9 @@ static inline void ov_out_of_sync_print(struct drbd_device *device) } -extern void drbd_csum_bio(struct crypto_ahash *, struct bio *, void *); -extern void drbd_csum_ee(struct crypto_ahash *, struct drbd_peer_request *, void *); +extern void drbd_csum_bio(struct crypto_shash *, struct bio *, void *); +extern void drbd_csum_ee(struct crypto_shash *, struct drbd_peer_request *, + void *); /* worker callbacks */ extern int w_e_end_data_req(struct drbd_work *, int); extern int w_e_end_rsdata_req(struct drbd_work *, int); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index ef8212a4b73e..15a9ffce9012 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1377,7 +1377,7 @@ void drbd_send_ack_dp(struct drbd_peer_device *peer_device, enum drbd_packet cmd struct p_data *dp, int data_size) { if (peer_device->connection->peer_integrity_tfm) - data_size -= crypto_ahash_digestsize(peer_device->connection->peer_integrity_tfm); + data_size -= crypto_shash_digestsize(peer_device->connection->peer_integrity_tfm); _drbd_send_ack(peer_device, cmd, dp->sector, cpu_to_be32(data_size), dp->block_id); } @@ -1690,7 +1690,7 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request * sock = &peer_device->connection->data; p = drbd_prepare_command(peer_device, sock); digest_size = peer_device->connection->integrity_tfm ? - crypto_ahash_digestsize(peer_device->connection->integrity_tfm) : 0; + crypto_shash_digestsize(peer_device->connection->integrity_tfm) : 0; if (!p) return -EIO; @@ -1796,7 +1796,7 @@ int drbd_send_block(struct drbd_peer_device *peer_device, enum drbd_packet cmd, p = drbd_prepare_command(peer_device, sock); digest_size = peer_device->connection->integrity_tfm ? 
- crypto_ahash_digestsize(peer_device->connection->integrity_tfm) : 0; + crypto_shash_digestsize(peer_device->connection->integrity_tfm) : 0; if (!p) return -EIO; @@ -2557,11 +2557,11 @@ void conn_free_crypto(struct drbd_connection *connection) { drbd_free_sock(connection); - crypto_free_ahash(connection->csums_tfm); - crypto_free_ahash(connection->verify_tfm); + crypto_free_shash(connection->csums_tfm); + crypto_free_shash(connection->verify_tfm); crypto_free_shash(connection->cram_hmac_tfm); - crypto_free_ahash(connection->integrity_tfm); - crypto_free_ahash(connection->peer_integrity_tfm); + crypto_free_shash(connection->integrity_tfm); + crypto_free_shash(connection->peer_integrity_tfm); kfree(connection->int_dig_in); kfree(connection->int_dig_vv); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index b4f02768ba47..d15703b1ffe8 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -2303,10 +2303,10 @@ check_net_options(struct drbd_connection *connection, struct net_conf *new_net_c } struct crypto { - struct crypto_ahash *verify_tfm; - struct crypto_ahash *csums_tfm; + struct crypto_shash *verify_tfm; + struct crypto_shash *csums_tfm; struct crypto_shash *cram_hmac_tfm; - struct crypto_ahash *integrity_tfm; + struct crypto_shash *integrity_tfm; }; static int @@ -2324,36 +2324,21 @@ alloc_shash(struct crypto_shash **tfm, char *tfm_name, int err_alg) return NO_ERROR; } -static int -alloc_ahash(struct crypto_ahash **tfm, char *tfm_name, int err_alg) -{ - if (!tfm_name[0]) - return NO_ERROR; - - *tfm = crypto_alloc_ahash(tfm_name, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(*tfm)) { - *tfm = NULL; - return err_alg; - } - - return NO_ERROR; -} - static enum drbd_ret_code alloc_crypto(struct crypto *crypto, struct net_conf *new_net_conf) { char hmac_name[CRYPTO_MAX_ALG_NAME]; enum drbd_ret_code rv; - rv = alloc_ahash(&crypto->csums_tfm, new_net_conf->csums_alg, + rv = alloc_shash(&crypto->csums_tfm, new_net_conf->csums_alg, ERR_CSUMS_ALG); if (rv != NO_ERROR) return rv; - rv = alloc_ahash(&crypto->verify_tfm, new_net_conf->verify_alg, + rv = alloc_shash(&crypto->verify_tfm, new_net_conf->verify_alg, ERR_VERIFY_ALG); if (rv != NO_ERROR) return rv; - rv = alloc_ahash(&crypto->integrity_tfm, new_net_conf->integrity_alg, + rv = alloc_shash(&crypto->integrity_tfm, new_net_conf->integrity_alg, ERR_INTEGRITY_ALG); if (rv != NO_ERROR) return rv; @@ -2371,9 +2356,9 @@ alloc_crypto(struct crypto *crypto, struct net_conf *new_net_conf) static void free_crypto(struct crypto *crypto) { crypto_free_shash(crypto->cram_hmac_tfm); - crypto_free_ahash(crypto->integrity_tfm); - crypto_free_ahash(crypto->csums_tfm); - crypto_free_ahash(crypto->verify_tfm); + crypto_free_shash(crypto->integrity_tfm); + crypto_free_shash(crypto->csums_tfm); + crypto_free_shash(crypto->verify_tfm); } int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) @@ -2450,17 +2435,17 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) rcu_assign_pointer(connection->net_conf, new_net_conf); if (!rsr) { - crypto_free_ahash(connection->csums_tfm); + crypto_free_shash(connection->csums_tfm); connection->csums_tfm = crypto.csums_tfm; crypto.csums_tfm = NULL; } if (!ovr) { - crypto_free_ahash(connection->verify_tfm); + crypto_free_shash(connection->verify_tfm); connection->verify_tfm = crypto.verify_tfm; crypto.verify_tfm = NULL; } - crypto_free_ahash(connection->integrity_tfm); + crypto_free_shash(connection->integrity_tfm); connection->integrity_tfm = crypto.integrity_tfm; 
if (connection->cstate >= C_WF_REPORT_PARAMS && connection->agreed_pro_version >= 100) /* Do this without trying to take connection->data.mutex again. */ diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 75f6b47169e6..fc67fd853375 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1732,7 +1732,7 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf } /* quick wrapper in case payload size != request_size (write same) */ -static void drbd_csum_ee_size(struct crypto_ahash *h, +static void drbd_csum_ee_size(struct crypto_shash *h, struct drbd_peer_request *r, void *d, unsigned int payload_size) { @@ -1769,7 +1769,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, digest_size = 0; if (!trim && peer_device->connection->peer_integrity_tfm) { - digest_size = crypto_ahash_digestsize(peer_device->connection->peer_integrity_tfm); + digest_size = crypto_shash_digestsize(peer_device->connection->peer_integrity_tfm); /* * FIXME: Receive the incoming digest into the receive buffer * here, together with its struct p_data? @@ -1905,7 +1905,7 @@ static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_req digest_size = 0; if (peer_device->connection->peer_integrity_tfm) { - digest_size = crypto_ahash_digestsize(peer_device->connection->peer_integrity_tfm); + digest_size = crypto_shash_digestsize(peer_device->connection->peer_integrity_tfm); err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size); if (err) return err; @@ -3542,7 +3542,7 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in int p_proto, p_discard_my_data, p_two_primaries, cf; struct net_conf *nc, *old_net_conf, *new_net_conf = NULL; char integrity_alg[SHARED_SECRET_MAX] = ""; - struct crypto_ahash *peer_integrity_tfm = NULL; + struct crypto_shash *peer_integrity_tfm = NULL; void *int_dig_in = NULL, *int_dig_vv = NULL; p_proto = be32_to_cpu(p->protocol); @@ -3623,7 +3623,7 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in * change. 
*/ - peer_integrity_tfm = crypto_alloc_ahash(integrity_alg, 0, CRYPTO_ALG_ASYNC); + peer_integrity_tfm = crypto_alloc_shash(integrity_alg, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(peer_integrity_tfm)) { peer_integrity_tfm = NULL; drbd_err(connection, "peer data-integrity-alg %s not supported\n", @@ -3631,7 +3631,7 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in goto disconnect; } - hash_size = crypto_ahash_digestsize(peer_integrity_tfm); + hash_size = crypto_shash_digestsize(peer_integrity_tfm); int_dig_in = kmalloc(hash_size, GFP_KERNEL); int_dig_vv = kmalloc(hash_size, GFP_KERNEL); if (!(int_dig_in && int_dig_vv)) { @@ -3661,7 +3661,7 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in mutex_unlock(&connection->resource->conf_update); mutex_unlock(&connection->data.mutex); - crypto_free_ahash(connection->peer_integrity_tfm); + crypto_free_shash(connection->peer_integrity_tfm); kfree(connection->int_dig_in); kfree(connection->int_dig_vv); connection->peer_integrity_tfm = peer_integrity_tfm; @@ -3679,7 +3679,7 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in disconnect_rcu_unlock: rcu_read_unlock(); disconnect: - crypto_free_ahash(peer_integrity_tfm); + crypto_free_shash(peer_integrity_tfm); kfree(int_dig_in); kfree(int_dig_vv); conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); @@ -3691,15 +3691,16 @@ disconnect: * return: NULL (alg name was "") * ERR_PTR(error) if something goes wrong * or the crypto hash ptr, if it worked out ok. */ -static struct crypto_ahash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device, +static struct crypto_shash *drbd_crypto_alloc_digest_safe( + const struct drbd_device *device, const char *alg, const char *name) { - struct crypto_ahash *tfm; + struct crypto_shash *tfm; if (!alg[0]) return NULL; - tfm = crypto_alloc_ahash(alg, 0, CRYPTO_ALG_ASYNC); + tfm = crypto_alloc_shash(alg, 0, 0); if (IS_ERR(tfm)) { drbd_err(device, "Can not allocate \"%s\" as %s (reason: %ld)\n", alg, name, PTR_ERR(tfm)); @@ -3752,8 +3753,8 @@ static int receive_SyncParam(struct drbd_connection *connection, struct packet_i struct drbd_device *device; struct p_rs_param_95 *p; unsigned int header_size, data_size, exp_max_sz; - struct crypto_ahash *verify_tfm = NULL; - struct crypto_ahash *csums_tfm = NULL; + struct crypto_shash *verify_tfm = NULL; + struct crypto_shash *csums_tfm = NULL; struct net_conf *old_net_conf, *new_net_conf = NULL; struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL; const int apv = connection->agreed_pro_version; @@ -3900,14 +3901,14 @@ static int receive_SyncParam(struct drbd_connection *connection, struct packet_i if (verify_tfm) { strcpy(new_net_conf->verify_alg, p->verify_alg); new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1; - crypto_free_ahash(peer_device->connection->verify_tfm); + crypto_free_shash(peer_device->connection->verify_tfm); peer_device->connection->verify_tfm = verify_tfm; drbd_info(device, "using verify-alg: \"%s\"\n", p->verify_alg); } if (csums_tfm) { strcpy(new_net_conf->csums_alg, p->csums_alg); new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1; - crypto_free_ahash(peer_device->connection->csums_tfm); + crypto_free_shash(peer_device->connection->csums_tfm); peer_device->connection->csums_tfm = csums_tfm; drbd_info(device, "using csums-alg: \"%s\"\n", p->csums_alg); } @@ -3951,9 +3952,9 @@ disconnect: mutex_unlock(&connection->resource->conf_update); /* just for completeness: actually not 
needed, * as this is not reached if csums_tfm was ok. */ - crypto_free_ahash(csums_tfm); + crypto_free_shash(csums_tfm); /* but free the verify_tfm again, if csums_tfm did not work out */ - crypto_free_ahash(verify_tfm); + crypto_free_shash(verify_tfm); conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); return -EIO; } diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index b8f77e83d456..c7ef48efe871 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -295,60 +295,61 @@ void drbd_request_endio(struct bio *bio) complete_master_bio(device, &m); } -void drbd_csum_ee(struct crypto_ahash *tfm, struct drbd_peer_request *peer_req, void *digest) +void drbd_csum_ee(struct crypto_shash *tfm, struct drbd_peer_request *peer_req, void *digest) { - AHASH_REQUEST_ON_STACK(req, tfm); - struct scatterlist sg; + SHASH_DESC_ON_STACK(desc, tfm); struct page *page = peer_req->pages; struct page *tmp; unsigned len; + void *src; - ahash_request_set_tfm(req, tfm); - ahash_request_set_callback(req, 0, NULL, NULL); + desc->tfm = tfm; + desc->flags = 0; - sg_init_table(&sg, 1); - crypto_ahash_init(req); + crypto_shash_init(desc); + src = kmap_atomic(page); while ((tmp = page_chain_next(page))) { /* all but the last page will be fully used */ - sg_set_page(&sg, page, PAGE_SIZE, 0); - ahash_request_set_crypt(req, &sg, NULL, sg.length); - crypto_ahash_update(req); + crypto_shash_update(desc, src, PAGE_SIZE); + kunmap_atomic(src); page = tmp; + src = kmap_atomic(page); } /* and now the last, possibly only partially used page */ len = peer_req->i.size & (PAGE_SIZE - 1); - sg_set_page(&sg, page, len ?: PAGE_SIZE, 0); - ahash_request_set_crypt(req, &sg, digest, sg.length); - crypto_ahash_finup(req); - ahash_request_zero(req); + crypto_shash_update(desc, src, len ?: PAGE_SIZE); + kunmap_atomic(src); + + crypto_shash_final(desc, digest); + shash_desc_zero(desc); } -void drbd_csum_bio(struct crypto_ahash *tfm, struct bio *bio, void *digest) +void drbd_csum_bio(struct crypto_shash *tfm, struct bio *bio, void *digest) { - AHASH_REQUEST_ON_STACK(req, tfm); - struct scatterlist sg; + SHASH_DESC_ON_STACK(desc, tfm); struct bio_vec bvec; struct bvec_iter iter; - ahash_request_set_tfm(req, tfm); - ahash_request_set_callback(req, 0, NULL, NULL); + desc->tfm = tfm; + desc->flags = 0; - sg_init_table(&sg, 1); - crypto_ahash_init(req); + crypto_shash_init(desc); bio_for_each_segment(bvec, bio, iter) { - sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); - ahash_request_set_crypt(req, &sg, NULL, sg.length); - crypto_ahash_update(req); + u8 *src; + + src = kmap_atomic(bvec.bv_page); + crypto_shash_update(desc, src + bvec.bv_offset, bvec.bv_len); + kunmap_atomic(src); + /* REQ_OP_WRITE_SAME has only one segment, * checksum the payload only once. 
*/ if (bio_op(bio) == REQ_OP_WRITE_SAME) break; } - ahash_request_set_crypt(req, NULL, digest, 0); - crypto_ahash_final(req); - ahash_request_zero(req); + crypto_shash_final(desc, digest); + shash_desc_zero(desc); } /* MAYBE merge common code with w_e_end_ov_req */ @@ -367,7 +368,7 @@ static int w_e_send_csum(struct drbd_work *w, int cancel) if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0)) goto out; - digest_size = crypto_ahash_digestsize(peer_device->connection->csums_tfm); + digest_size = crypto_shash_digestsize(peer_device->connection->csums_tfm); digest = kmalloc(digest_size, GFP_NOIO); if (digest) { sector_t sector = peer_req->i.sector; @@ -1205,7 +1206,7 @@ int w_e_end_csum_rs_req(struct drbd_work *w, int cancel) * a real fix would be much more involved, * introducing more locking mechanisms */ if (peer_device->connection->csums_tfm) { - digest_size = crypto_ahash_digestsize(peer_device->connection->csums_tfm); + digest_size = crypto_shash_digestsize(peer_device->connection->csums_tfm); D_ASSERT(device, digest_size == di->digest_size); digest = kmalloc(digest_size, GFP_NOIO); } @@ -1255,7 +1256,7 @@ int w_e_end_ov_req(struct drbd_work *w, int cancel) if (unlikely(cancel)) goto out; - digest_size = crypto_ahash_digestsize(peer_device->connection->verify_tfm); + digest_size = crypto_shash_digestsize(peer_device->connection->verify_tfm); digest = kmalloc(digest_size, GFP_NOIO); if (!digest) { err = 1; /* terminate the connection in case the allocation failed */ @@ -1327,7 +1328,7 @@ int w_e_end_ov_reply(struct drbd_work *w, int cancel) di = peer_req->digest; if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { - digest_size = crypto_ahash_digestsize(peer_device->connection->verify_tfm); + digest_size = crypto_shash_digestsize(peer_device->connection->verify_tfm); digest = kmalloc(digest_size, GFP_NOIO); if (digest) { drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest); diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index f2b6f4da1034..c494a771c06f 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4679,7 +4679,7 @@ static int __init do_floppy_init(void) /* to be cleaned up... 
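The drbd_worker.c hunks above replace the ahash request and scatterlist plumbing with the synchronous shash API: drbd_csum_ee() and drbd_csum_bio() now kmap each page and feed the linear buffer to crypto_shash_update(). A rough, self-contained sketch of that init/update/final pattern follows; the function name, the chunk size and the "crc32c" algorithm are illustrative only and not taken from the patch.

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/kernel.h>

/* Illustrative sketch: stream a flat buffer through a shash transform
 * in fixed-size chunks, mirroring the converted DRBD helpers. */
static int example_shash_digest(const u8 *data, unsigned int len, u8 *out)
{
	struct crypto_shash *tfm;
	int err;

	tfm = crypto_alloc_shash("crc32c", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	{
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		desc->flags = 0;	/* as in the converted helpers */

		err = crypto_shash_init(desc);
		while (!err && len) {
			unsigned int chunk = min_t(unsigned int, len, 4096);

			err = crypto_shash_update(desc, data, chunk);
			data += chunk;
			len -= chunk;
		}
		if (!err)
			err = crypto_shash_final(desc, out);
		shash_desc_zero(desc);
	}

	crypto_free_shash(tfm);
	return err;
}

Because shash operates on virtually contiguous buffers, the scatterlist setup needed by the old ahash code disappears; that is why the DRBD helpers above switch to kmap_atomic()/kunmap_atomic() around each page.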
*/ disks[drive]->private_data = (void *)(long)drive; disks[drive]->flags |= GENHD_FL_REMOVABLE; - device_add_disk(&floppy_device[drive].dev, disks[drive]); + device_add_disk(&floppy_device[drive].dev, disks[drive], NULL); } return 0; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index ea9debf59b22..abad6d15f956 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -77,6 +77,7 @@ #include <linux/falloc.h> #include <linux/uio.h> #include <linux/ioprio.h> +#include <linux/blk-cgroup.h> #include "loop.h" @@ -1760,8 +1761,8 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, /* always use the first bio's css */ #ifdef CONFIG_BLK_CGROUP - if (cmd->use_aio && rq->bio && rq->bio->bi_css) { - cmd->css = rq->bio->bi_css; + if (cmd->use_aio && rq->bio && rq->bio->bi_blkg) { + cmd->css = &bio_blkcg(rq->bio)->css; css_get(cmd->css); } else #endif diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index d0666f5ce003..1d7d48d8a205 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3861,7 +3861,7 @@ skip_create_disk: set_capacity(dd->disk, capacity); /* Enable the block device and add it to /dev */ - device_add_disk(&dd->pdev->dev, dd->disk); + device_add_disk(&dd->pdev->dev, dd->disk, NULL); dd->bdev = bdget_disk(dd->disk, 0); /* diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index afe1508d82c6..29a4419e8ba3 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -500,7 +500,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) gendisk->disk_name, priv->model, priv->raw_capacity >> 11, get_capacity(gendisk) >> 11); - device_add_disk(&dev->sbd.core, gendisk); + device_add_disk(&dev->sbd.core, gendisk, NULL); return 0; fail_cleanup_queue: diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 1e3d5de9d838..c0c50816a10b 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -769,7 +769,7 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) dev_info(&dev->core, "%s: Using %lu MiB of GPU memory\n", gendisk->disk_name, get_capacity(gendisk) >> 11); - device_add_disk(&dev->core, gendisk); + device_add_disk(&dev->core, gendisk, NULL); return 0; fail_cleanup_queue: diff --git a/drivers/block/rsxx/cregs.c b/drivers/block/rsxx/cregs.c index c148e83e4ed7..d9a8758682c9 100644 --- a/drivers/block/rsxx/cregs.c +++ b/drivers/block/rsxx/cregs.c @@ -276,7 +276,7 @@ static void creg_cmd_done(struct work_struct *work) st = -EIO; } - if ((cmd->op == CREG_OP_READ)) { + if (cmd->op == CREG_OP_READ) { unsigned int cnt8 = ioread32(card->regmap + CREG_CNT); /* Paranoid Sanity Checks */ diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index 1a92f9e65937..3894aa0f350b 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -226,7 +226,7 @@ int rsxx_attach_dev(struct rsxx_cardinfo *card) set_capacity(card->gendisk, card->size8 >> 9); else set_capacity(card->gendisk, 0); - device_add_disk(CARD_TO_DEV(card), card->gendisk); + device_add_disk(CARD_TO_DEV(card), card->gendisk, NULL); card->bdev_attached = 1; } diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 87b9e7fbf062..a85c9a622c41 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -3104,7 +3104,7 @@ static int skd_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo) static int skd_bdev_attach(struct device *parent, struct skd_device *skdev) { dev_dbg(&skdev->pdev->dev, "add_disk\n"); - device_add_disk(parent, 
skdev->disk); + device_add_disk(parent, skdev->disk, NULL); return 0; } diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 5ca56bfae63c..09409edce384 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -850,7 +850,7 @@ static int probe_disk(struct vdc_port *port) port->vdisk_size, (port->vdisk_size >> (20 - 9)), port->vio.ver.major, port->vio.ver.minor); - device_add_disk(&port->vio.vdev->dev, g); + device_add_disk(&port->vio.vdev->dev, g, NULL); return 0; } diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 5c7fb8cc4149..9094ca60949c 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -421,7 +421,7 @@ static void process_page(unsigned long data) struct cardinfo *card = (struct cardinfo *)data; unsigned int dma_status = card->dma_status; - spin_lock_bh(&card->lock); + spin_lock(&card->lock); if (card->Active < 0) goto out_unlock; page = &card->mm_pages[card->Active]; @@ -496,7 +496,7 @@ static void process_page(unsigned long data) mm_start_io(card); } out_unlock: - spin_unlock_bh(&card->lock); + spin_unlock(&card->lock); while (return_bio) { struct bio *bio = return_bio; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 23752dc99b00..086c6bb12baa 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -351,8 +351,8 @@ static int minor_to_index(int minor) return minor >> PART_BITS; } -static ssize_t virtblk_serial_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t serial_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); int err; @@ -371,7 +371,7 @@ static ssize_t virtblk_serial_show(struct device *dev, return err; } -static DEVICE_ATTR(serial, 0444, virtblk_serial_show, NULL); +static DEVICE_ATTR_RO(serial); /* The queue's logical block size must be set before calling this */ static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize) @@ -545,8 +545,8 @@ static const char *const virtblk_cache_types[] = { }; static ssize_t -virtblk_cache_type_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) +cache_type_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { struct gendisk *disk = dev_to_disk(dev); struct virtio_blk *vblk = disk->private_data; @@ -564,8 +564,7 @@ virtblk_cache_type_store(struct device *dev, struct device_attribute *attr, } static ssize_t -virtblk_cache_type_show(struct device *dev, struct device_attribute *attr, - char *buf) +cache_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); struct virtio_blk *vblk = disk->private_data; @@ -575,12 +574,38 @@ virtblk_cache_type_show(struct device *dev, struct device_attribute *attr, return snprintf(buf, 40, "%s\n", virtblk_cache_types[writeback]); } -static const struct device_attribute dev_attr_cache_type_ro = - __ATTR(cache_type, 0444, - virtblk_cache_type_show, NULL); -static const struct device_attribute dev_attr_cache_type_rw = - __ATTR(cache_type, 0644, - virtblk_cache_type_show, virtblk_cache_type_store); +static DEVICE_ATTR_RW(cache_type); + +static struct attribute *virtblk_attrs[] = { + &dev_attr_serial.attr, + &dev_attr_cache_type.attr, + NULL, +}; + +static umode_t virtblk_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct gendisk *disk = dev_to_disk(dev); + struct virtio_blk *vblk = 
disk->private_data; + struct virtio_device *vdev = vblk->vdev; + + if (a == &dev_attr_cache_type.attr && + !virtio_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE)) + return S_IRUGO; + + return a->mode; +} + +static const struct attribute_group virtblk_attr_group = { + .attrs = virtblk_attrs, + .is_visible = virtblk_attrs_are_visible, +}; + +static const struct attribute_group *virtblk_attr_groups[] = { + &virtblk_attr_group, + NULL, +}; static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, unsigned int numa_node) @@ -780,24 +805,9 @@ static int virtblk_probe(struct virtio_device *vdev) virtblk_update_capacity(vblk, false); virtio_device_ready(vdev); - device_add_disk(&vdev->dev, vblk->disk); - err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial); - if (err) - goto out_del_disk; - - if (virtio_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE)) - err = device_create_file(disk_to_dev(vblk->disk), - &dev_attr_cache_type_rw); - else - err = device_create_file(disk_to_dev(vblk->disk), - &dev_attr_cache_type_ro); - if (err) - goto out_del_disk; + device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups); return 0; -out_del_disk: - del_gendisk(vblk->disk); - blk_cleanup_queue(vblk->disk->queue); out_free_tags: blk_mq_free_tag_set(&vblk->tag_set); out_put_disk: diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 429d20131c7e..9eea83ae01c6 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2420,7 +2420,7 @@ static void blkfront_connect(struct blkfront_info *info) for (i = 0; i < info->nr_rings; i++) kick_pending_request_queues(&info->rinfo[i]); - device_add_disk(&info->xbdev->dev, info->gd); + device_add_disk(&info->xbdev->dev, info->gd, NULL); info->is_ready = 1; return; diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index a1d6b5597c17..4879595200e1 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1636,6 +1636,11 @@ static const struct attribute_group zram_disk_attr_group = { .attrs = zram_disk_attrs, }; +static const struct attribute_group *zram_disk_attr_groups[] = { + &zram_disk_attr_group, + NULL, +}; + /* * Allocate and initialize new zram device. the function returns * '>= 0' device_id upon success, and negative value otherwise. @@ -1716,24 +1721,14 @@ static int zram_add(void) zram->disk->queue->backing_dev_info->capabilities |= (BDI_CAP_STABLE_WRITES | BDI_CAP_SYNCHRONOUS_IO); - add_disk(zram->disk); - - ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj, - &zram_disk_attr_group); - if (ret < 0) { - pr_err("Error creating sysfs group for device %d\n", - device_id); - goto out_free_disk; - } + device_add_disk(NULL, zram->disk, zram_disk_attr_groups); + strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); zram_debugfs_register(zram); pr_info("Added device: %s\n", zram->disk->disk_name); return device_id; -out_free_disk: - del_gendisk(zram->disk); - put_disk(zram->disk); out_free_queue: blk_cleanup_queue(queue); out_free_idr: @@ -1762,15 +1757,6 @@ static int zram_remove(struct zram *zram) mutex_unlock(&bdev->bd_mutex); zram_debugfs_unregister(zram); - /* - * Remove sysfs first, so no one will perform a disksize - * store while we destroy the devices. This also helps during - * hot_remove -- zram_reset_device() is the last holder of - * ->init_lock, no later/concurrent disksize_store() or any - * other sysfs handlers are possible. 
- */ - sysfs_remove_group(&disk_to_dev(zram->disk)->kobj, - &zram_disk_attr_group); /* Make sure all the pending I/O are finished */ fsync_bdev(bdev); diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 44a7a255ef74..f9b59d41813f 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -1784,7 +1784,7 @@ static int ide_cd_probe(ide_drive_t *drive) ide_cd_read_toc(drive); g->fops = &idecd_ops; g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; - device_add_disk(&drive->gendev, g); + device_add_disk(&drive->gendev, g, NULL); return 0; out_free_disk: diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c index e823394ed543..04e008e8f6f9 100644 --- a/drivers/ide/ide-gd.c +++ b/drivers/ide/ide-gd.c @@ -416,7 +416,7 @@ static int ide_gd_probe(ide_drive_t *drive) if (drive->dev_flags & IDE_DFLAG_REMOVABLE) g->flags = GENHD_FL_REMOVABLE; g->fops = &ide_gd_ops; - device_add_disk(&drive->gendev, g); + device_add_disk(&drive->gendev, g, NULL); return 0; out_free_disk: diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ac1cffd2a09b..f3fb5bb8c82a 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -542,7 +542,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) !discard_bio) continue; bio_chain(discard_bio, bio); - bio_clone_blkcg_association(discard_bio, bio); + bio_clone_blkg_association(discard_bio, bio); if (mddev->gendisk) trace_block_bio_remap(bdev_get_queue(rdev->bdev), discard_bio, disk_devt(mddev->gendisk), diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index 716fc8ed31d3..8a02f11076f9 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -2146,7 +2146,7 @@ static int msb_init_disk(struct memstick_dev *card) set_disk_ro(msb->disk, 1); msb_start(card); - device_add_disk(&card->dev, msb->disk); + device_add_disk(&card->dev, msb->disk, NULL); dbg("Disk added"); return 0; diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index 5ee932631fae..0cd30dcb6801 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -1236,7 +1236,7 @@ static int mspro_block_init_disk(struct memstick_dev *card) set_capacity(msb->disk, capacity); dev_dbg(&card->dev, "capacity set %ld\n", capacity); - device_add_disk(&card->dev, msb->disk); + device_add_disk(&card->dev, msb->disk, NULL); msb->active = 1; return 0; diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index a0b9102c4c6e..de8e1a8be690 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -2698,7 +2698,7 @@ static int mmc_add_disk(struct mmc_blk_data *md) int ret; struct mmc_card *card = md->queue.card; - device_add_disk(md->parent, md->disk); + device_add_disk(md->parent, md->disk, NULL); md->force_ro.show = force_ro_show; md->force_ro.store = force_ro_store; sysfs_attr_init(&md->force_ro.attr); diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 29c0bfd74e8a..6a41dfa3c36b 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -447,7 +447,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) if (new->readonly) set_disk_ro(gd, 1); - device_add_disk(&new->mtd->dev, gd); + device_add_disk(&new->mtd->dev, gd, NULL); if (new->disk_attributes) { ret = sysfs_create_group(&disk_to_dev(gd)->kobj, diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 62e9cb167aad..db45c6bbb7bb 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -290,7 +290,7 @@ static int 
nsblk_attach_disk(struct nd_namespace_blk *nsblk) } set_capacity(disk, available_disk_size >> SECTOR_SHIFT); - device_add_disk(dev, disk); + device_add_disk(dev, disk, NULL); revalidate_disk(disk); return 0; } diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 0360c015f658..b123b0dcf274 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1556,7 +1556,7 @@ static int btt_blk_init(struct btt *btt) } } set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9); - device_add_disk(&btt->nd_btt->dev, btt->btt_disk); + device_add_disk(&btt->nd_btt->dev, btt->btt_disk, NULL); btt->nd_btt->size = btt->nlba * (u64)btt->sector_size; revalidate_disk(btt->btt_disk); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 6071e2942053..a75d10c23d80 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -474,7 +474,7 @@ static int pmem_attach_disk(struct device *dev, gendev = disk_to_dev(disk); gendev->groups = pmem_attribute_groups; - device_add_disk(dev, disk); + device_add_disk(dev, disk, NULL); if (devm_add_action_or_reset(dev, pmem_release_disk, pmem)) return -ENOMEM; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dd8ec1dd9219..e0a9e1c5b30e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2734,6 +2734,14 @@ const struct attribute_group nvme_ns_id_attr_group = { .is_visible = nvme_ns_id_attrs_are_visible, }; +const struct attribute_group *nvme_ns_id_attr_groups[] = { + &nvme_ns_id_attr_group, +#ifdef CONFIG_NVM + &nvme_nvm_attr_group, +#endif + NULL, +}; + #define nvme_show_str_function(field) \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ @@ -3099,14 +3107,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) nvme_get_ctrl(ctrl); - device_add_disk(ctrl->device, ns->disk); - if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, - &nvme_ns_id_attr_group)) - pr_warn("%s: failed to create sysfs group for identification\n", - ns->disk->disk_name); - if (ns->ndev && nvme_nvm_register_sysfs(ns)) - pr_warn("%s: failed to register lightnvm sysfs group for identification\n", - ns->disk->disk_name); + device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups); nvme_mpath_add_disk(ns, id); nvme_fault_inject_init(ns); @@ -3132,10 +3133,6 @@ static void nvme_ns_remove(struct nvme_ns *ns) nvme_fault_inject_fini(ns); if (ns->disk && ns->disk->flags & GENHD_FL_UP) { - sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, - &nvme_ns_id_attr_group); - if (ns->ndev) - nvme_nvm_unregister_sysfs(ns); del_gendisk(ns->disk); blk_cleanup_queue(ns->queue); if (blk_get_integrity(ns->disk)) diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 6fe5923c95d4..1e4f97538838 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -1190,10 +1190,29 @@ static NVM_DEV_ATTR_12_RO(multiplane_modes); static NVM_DEV_ATTR_12_RO(media_capabilities); static NVM_DEV_ATTR_12_RO(max_phys_secs); -static struct attribute *nvm_dev_attrs_12[] = { +/* 2.0 values */ +static NVM_DEV_ATTR_20_RO(groups); +static NVM_DEV_ATTR_20_RO(punits); +static NVM_DEV_ATTR_20_RO(chunks); +static NVM_DEV_ATTR_20_RO(clba); +static NVM_DEV_ATTR_20_RO(ws_min); +static NVM_DEV_ATTR_20_RO(ws_opt); +static NVM_DEV_ATTR_20_RO(maxoc); +static NVM_DEV_ATTR_20_RO(maxocpu); +static NVM_DEV_ATTR_20_RO(mw_cunits); +static NVM_DEV_ATTR_20_RO(write_typ); +static NVM_DEV_ATTR_20_RO(write_max); +static NVM_DEV_ATTR_20_RO(reset_typ); +static NVM_DEV_ATTR_20_RO(reset_max); + 
+static struct attribute *nvm_dev_attrs[] = { + /* version agnostic attrs */ &dev_attr_version.attr, &dev_attr_capabilities.attr, + &dev_attr_read_typ.attr, + &dev_attr_read_max.attr, + /* 1.2 attrs */ &dev_attr_vendor_opcode.attr, &dev_attr_device_mode.attr, &dev_attr_media_manager.attr, @@ -1208,8 +1227,6 @@ static struct attribute *nvm_dev_attrs_12[] = { &dev_attr_page_size.attr, &dev_attr_hw_sector_size.attr, &dev_attr_oob_sector_size.attr, - &dev_attr_read_typ.attr, - &dev_attr_read_max.attr, &dev_attr_prog_typ.attr, &dev_attr_prog_max.attr, &dev_attr_erase_typ.attr, @@ -1218,33 +1235,7 @@ static struct attribute *nvm_dev_attrs_12[] = { &dev_attr_media_capabilities.attr, &dev_attr_max_phys_secs.attr, - NULL, -}; - -static const struct attribute_group nvm_dev_attr_group_12 = { - .name = "lightnvm", - .attrs = nvm_dev_attrs_12, -}; - -/* 2.0 values */ -static NVM_DEV_ATTR_20_RO(groups); -static NVM_DEV_ATTR_20_RO(punits); -static NVM_DEV_ATTR_20_RO(chunks); -static NVM_DEV_ATTR_20_RO(clba); -static NVM_DEV_ATTR_20_RO(ws_min); -static NVM_DEV_ATTR_20_RO(ws_opt); -static NVM_DEV_ATTR_20_RO(maxoc); -static NVM_DEV_ATTR_20_RO(maxocpu); -static NVM_DEV_ATTR_20_RO(mw_cunits); -static NVM_DEV_ATTR_20_RO(write_typ); -static NVM_DEV_ATTR_20_RO(write_max); -static NVM_DEV_ATTR_20_RO(reset_typ); -static NVM_DEV_ATTR_20_RO(reset_max); - -static struct attribute *nvm_dev_attrs_20[] = { - &dev_attr_version.attr, - &dev_attr_capabilities.attr, - + /* 2.0 attrs */ &dev_attr_groups.attr, &dev_attr_punits.attr, &dev_attr_chunks.attr, @@ -1255,8 +1246,6 @@ static struct attribute *nvm_dev_attrs_20[] = { &dev_attr_maxocpu.attr, &dev_attr_mw_cunits.attr, - &dev_attr_read_typ.attr, - &dev_attr_read_max.attr, &dev_attr_write_typ.attr, &dev_attr_write_max.attr, &dev_attr_reset_typ.attr, @@ -1265,44 +1254,38 @@ static struct attribute *nvm_dev_attrs_20[] = { NULL, }; -static const struct attribute_group nvm_dev_attr_group_20 = { - .name = "lightnvm", - .attrs = nvm_dev_attrs_20, -}; - -int nvme_nvm_register_sysfs(struct nvme_ns *ns) +static umode_t nvm_dev_attrs_visible(struct kobject *kobj, + struct attribute *attr, int index) { + struct device *dev = container_of(kobj, struct device, kobj); + struct gendisk *disk = dev_to_disk(dev); + struct nvme_ns *ns = disk->private_data; struct nvm_dev *ndev = ns->ndev; - struct nvm_geo *geo = &ndev->geo; + struct device_attribute *dev_attr = + container_of(attr, typeof(*dev_attr), attr); if (!ndev) - return -EINVAL; - - switch (geo->major_ver_id) { - case 1: - return sysfs_create_group(&disk_to_dev(ns->disk)->kobj, - &nvm_dev_attr_group_12); - case 2: - return sysfs_create_group(&disk_to_dev(ns->disk)->kobj, - &nvm_dev_attr_group_20); - } - - return -EINVAL; -} + return 0; -void nvme_nvm_unregister_sysfs(struct nvme_ns *ns) -{ - struct nvm_dev *ndev = ns->ndev; - struct nvm_geo *geo = &ndev->geo; + if (dev_attr->show == nvm_dev_attr_show) + return attr->mode; - switch (geo->major_ver_id) { + switch (ndev->geo.major_ver_id) { case 1: - sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, - &nvm_dev_attr_group_12); + if (dev_attr->show == nvm_dev_attr_show_12) + return attr->mode; break; case 2: - sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, - &nvm_dev_attr_group_20); + if (dev_attr->show == nvm_dev_attr_show_20) + return attr->mode; break; } + + return 0; } + +const struct attribute_group nvme_nvm_attr_group = { + .name = "lightnvm", + .attrs = nvm_dev_attrs, + .is_visible = nvm_dev_attrs_visible, +}; diff --git a/drivers/nvme/host/multipath.c 
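The virtio_blk, zram, nvme core and lightnvm hunks above all follow the same recipe: gather the formerly hand-registered sysfs files into an attribute_group, gate the optional ones through an .is_visible callback, and pass the NULL-terminated group array to the extended device_add_disk(), so the matching sysfs_create_group()/sysfs_remove_group() calls and their error unwinding can be deleted. A hedged, self-contained sketch of the recipe; every example_* identifier is hypothetical.

#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/stat.h>
#include <linux/sysfs.h>

/* DEVICE_ATTR_RO(example) expects a show helper literally named
 * example_show(); the virtio_blk hunks rename their callbacks for the
 * same reason. */
static ssize_t example_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "example\n");
}
static DEVICE_ATTR_RO(example);

static struct attribute *example_attrs[] = {
	&dev_attr_example.attr,
	NULL,
};

/* Decide per attribute whether (and with which mode) it shows up,
 * instead of conditionally registering it at probe time. */
static umode_t example_attrs_visible(struct kobject *kobj,
				     struct attribute *attr, int n)
{
	return attr->mode;	/* return 0 to hide, S_IRUGO to degrade */
}

static const struct attribute_group example_attr_group = {
	.attrs = example_attrs,
	.is_visible = example_attrs_visible,
};

static const struct attribute_group *example_attr_groups[] = {
	&example_attr_group,
	NULL,
};

/* Registration then collapses to a single call with no teardown pair:
 *	device_add_disk(parent, disk, example_attr_groups);
 */

Creating the groups as part of disk registration also avoids the window in which userspace could see the disk without its attributes, and the removed out_del_disk unwinding in virtio_blk, for example, was only needed because the sysfs files used to be added in a second step that could fail.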
b/drivers/nvme/host/multipath.c index 9fe3fff818b8..bfbc6d5b1d93 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -282,13 +282,9 @@ static void nvme_mpath_set_live(struct nvme_ns *ns) if (!head->disk) return; - if (!(head->disk->flags & GENHD_FL_UP)) { - device_add_disk(&head->subsys->dev, head->disk); - if (sysfs_create_group(&disk_to_dev(head->disk)->kobj, - &nvme_ns_id_attr_group)) - dev_warn(&head->subsys->dev, - "failed to create id group.\n"); - } + if (!(head->disk->flags & GENHD_FL_UP)) + device_add_disk(&head->subsys->dev, head->disk, + nvme_ns_id_attr_groups); kblockd_schedule_work(&ns->head->requeue_work); } @@ -494,11 +490,8 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) { if (!head->disk) return; - if (head->disk->flags & GENHD_FL_UP) { - sysfs_remove_group(&disk_to_dev(head->disk)->kobj, - &nvme_ns_id_attr_group); + if (head->disk->flags & GENHD_FL_UP) del_gendisk(head->disk); - } blk_set_queue_dying(head->disk->queue); /* make sure all pending bios are cleaned up */ kblockd_schedule_work(&head->requeue_work); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index bb4a2003c097..2503f8fd54da 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -459,7 +459,7 @@ int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, void *log, size_t size, u64 offset); -extern const struct attribute_group nvme_ns_id_attr_group; +extern const struct attribute_group *nvme_ns_id_attr_groups[]; extern const struct block_device_operations nvme_ns_head_ops; #ifdef CONFIG_NVME_MULTIPATH @@ -551,8 +551,7 @@ static inline void nvme_mpath_stop(struct nvme_ctrl *ctrl) void nvme_nvm_update_nvm_info(struct nvme_ns *ns); int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); void nvme_nvm_unregister(struct nvme_ns *ns); -int nvme_nvm_register_sysfs(struct nvme_ns *ns); -void nvme_nvm_unregister_sysfs(struct nvme_ns *ns); +extern const struct attribute_group nvme_nvm_attr_group; int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg); #else static inline void nvme_nvm_update_nvm_info(struct nvme_ns *ns) {}; @@ -563,11 +562,6 @@ static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, } static inline void nvme_nvm_unregister(struct nvme_ns *ns) {}; -static inline int nvme_nvm_register_sysfs(struct nvme_ns *ns) -{ - return 0; -} -static inline void nvme_nvm_unregister_sysfs(struct nvme_ns *ns) {}; static inline int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg) { diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 7036a6c6f86f..5542d9eadfe0 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -76,7 +76,7 @@ int dasd_gendisk_alloc(struct dasd_block *block) gdp->queue = block->request_queue; block->gdp = gdp; set_capacity(block->gdp, 0); - device_add_disk(&base->cdev->dev, block->gdp); + device_add_disk(&base->cdev->dev, block->gdp, NULL); return 0; } diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 23e526cda5c1..4e8aedd50cb0 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -685,7 +685,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char } get_device(&dev_info->dev); - device_add_disk(&dev_info->dev, dev_info->gd); + device_add_disk(&dev_info->dev, dev_info->gd, NULL); switch (dev_info->segment_type) { case SEG_TYPE_SR: diff --git 
a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index 98f66b7b6794..e01889394c84 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -500,7 +500,7 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) /* 512 byte sectors */ set_capacity(bdev->gendisk, scmdev->size >> 9); - device_add_disk(&scmdev->dev, bdev->gendisk); + device_add_disk(&scmdev->dev, bdev->gendisk, NULL); return 0; out_queue: diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index eb97d2dd3651..62348412ed1b 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -3046,11 +3046,14 @@ scsi_device_quiesce(struct scsi_device *sdev) */ WARN_ON_ONCE(sdev->quiesced_by && sdev->quiesced_by != current); - blk_set_preempt_only(q); + if (sdev->quiesced_by == current) + return 0; + + blk_set_pm_only(q); blk_mq_freeze_queue(q); /* - * Ensure that the effect of blk_set_preempt_only() will be visible + * Ensure that the effect of blk_set_pm_only() will be visible * for percpu_ref_tryget() callers that occur after the queue * unfreeze even if the queue was already frozen before this function * was called. See also https://lwn.net/Articles/573497/. @@ -3063,7 +3066,7 @@ scsi_device_quiesce(struct scsi_device *sdev) if (err == 0) sdev->quiesced_by = current; else - blk_clear_preempt_only(q); + blk_clear_pm_only(q); mutex_unlock(&sdev->state_mutex); return err; @@ -3088,7 +3091,7 @@ void scsi_device_resume(struct scsi_device *sdev) mutex_lock(&sdev->state_mutex); WARN_ON_ONCE(!sdev->quiesced_by); sdev->quiesced_by = NULL; - blk_clear_preempt_only(sdev->request_queue); + blk_clear_pm_only(sdev->request_queue); if (sdev->sdev_state == SDEV_QUIESCE) scsi_device_set_state(sdev, SDEV_RUNNING); mutex_unlock(&sdev->state_mutex); diff --git a/drivers/scsi/scsi_pm.c b/drivers/scsi/scsi_pm.c index b44c1bb687a2..a2b4179bfdf7 100644 --- a/drivers/scsi/scsi_pm.c +++ b/drivers/scsi/scsi_pm.c @@ -8,6 +8,7 @@ #include <linux/pm_runtime.h> #include <linux/export.h> #include <linux/async.h> +#include <linux/blk-pm.h> #include <scsi/scsi.h> #include <scsi/scsi_device.h> diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 4a57ffecc7e6..b762d0fd773c 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -45,6 +45,7 @@ #include <linux/init.h> #include <linux/blkdev.h> #include <linux/blkpg.h> +#include <linux/blk-pm.h> #include <linux/delay.h> #include <linux/mutex.h> #include <linux/string_helpers.h> @@ -3275,7 +3276,7 @@ static void sd_probe_async(void *data, async_cookie_t cookie) } blk_pm_runtime_init(sdp->request_queue, dev); - device_add_disk(dev, gd); + device_add_disk(dev, gd, NULL); if (sdkp->capacity) sd_dif_config_host(sdkp); diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index d0389b20574d..54dd70ae9731 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -43,6 +43,7 @@ #include <linux/interrupt.h> #include <linux/init.h> #include <linux/blkdev.h> +#include <linux/blk-pm.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/pm_runtime.h> @@ -758,7 +759,7 @@ static int sr_probe(struct device *dev) dev_set_drvdata(dev, cd); disk->flags |= GENHD_FL_REMOVABLE; - device_add_disk(&sdev->sdev_gendev, disk); + device_add_disk(&sdev->sdev_gendev, disk, NULL); sdev_printk(KERN_DEBUG, sdev, "Attached scsi CD-ROM %s\n", cd->cdi.name); diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c index 55ed80c3a17c..f3fbb700f569 100644 --- a/drivers/xen/biomerge.c +++ b/drivers/xen/biomerge.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 
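The scsi_lib.c hunk above is the consumer side of the PREEMPT_ONLY to pm_only conversion: pm_only is a counter rather than a queue flag, so scsi_device_quiesce() now returns early when the current context has already quiesced the device instead of raising the count a second time. A rough sketch of how the counter-based interface is meant to be used; the helper names are made up, and the freeze/unfreeze pairing mirrors the SCSI path above.

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/*
 * Hypothetical quiesce helpers. Calls must stay balanced: while the
 * counter is non-zero, only RQF_PM and RQF_PREEMPT requests are
 * processed (see the blkdev.h hunk later in this series).
 */
static void example_quiesce(struct request_queue *q)
{
	blk_set_pm_only(q);
	/* drain submitters that entered the queue before the counter rose */
	blk_mq_freeze_queue(q);
	blk_mq_unfreeze_queue(q);
}

static void example_resume(struct request_queue *q)
{
	blk_clear_pm_only(q);	/* exactly one clear per earlier set */
}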
#include <linux/bio.h> -#include <linux/io.h> #include <linux/export.h> +#include <xen/xen.h> #include <xen/page.h> bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, @@ -20,4 +20,3 @@ bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, return false; #endif } -EXPORT_SYMBOL(xen_biovec_phys_mergeable); diff --git a/drivers/xen/xen-acpi-pad.c b/drivers/xen/xen-acpi-pad.c index 23d1808fe027..e25ab76b9c99 100644 --- a/drivers/xen/xen-acpi-pad.c +++ b/drivers/xen/xen-acpi-pad.c @@ -19,6 +19,7 @@ #include <linux/kernel.h> #include <linux/types.h> #include <linux/acpi.h> +#include <xen/xen.h> #include <xen/interface/version.h> #include <xen/xen-ops.h> #include <asm/xen/hypercall.h> diff --git a/fs/buffer.c b/fs/buffer.c index 6f1ae3ac9789..109f55196866 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3060,11 +3060,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, */ bio = bio_alloc(GFP_NOIO, 1); - if (wbc) { - wbc_init_bio(wbc, bio); - wbc_account_io(wbc, bh->b_page, bh->b_size); - } - bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_write_hint = write_hint; @@ -3084,6 +3079,11 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, op_flags |= REQ_PRIO; bio_set_op_attrs(bio, op, op_flags); + if (wbc) { + wbc_init_bio(wbc, bio); + wbc_account_io(wbc, bh->b_page, bh->b_size); + } + submit_bio(bio); return 0; } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index db7590178dfc..2aa62d58d8dd 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -374,13 +374,13 @@ static int io_submit_init_bio(struct ext4_io_submit *io, bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); if (!bio) return -ENOMEM; - wbc_init_bio(io->io_wbc, bio); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); io->io_bio = bio; io->io_next_block = bh->b_blocknr; + wbc_init_bio(io->io_wbc, bio); return 0; } diff --git a/include/linux/bio.h b/include/linux/bio.h index 51371740d2a8..f447b0ebb288 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -21,12 +21,8 @@ #include <linux/highmem.h> #include <linux/mempool.h> #include <linux/ioprio.h> -#include <linux/bug.h> #ifdef CONFIG_BLOCK - -#include <asm/io.h> - /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */ #include <linux/blk_types.h> @@ -133,32 +129,6 @@ static inline bool bio_full(struct bio *bio) } /* - * will die - */ -#define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) - -/* - * merge helpers etc - */ - -/* Default implementation of BIOVEC_PHYS_MERGEABLE */ -#define __BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ - ((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) - -/* - * allow arch override, for eg virtualized architectures (put in asm/io.h) - */ -#ifndef BIOVEC_PHYS_MERGEABLE -#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ - __BIOVEC_PHYS_MERGEABLE(vec1, vec2) -#endif - -#define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \ - (((addr1) | (mask)) == (((addr2) - 1) | (mask))) -#define BIOVEC_SEG_BOUNDARY(q, b1, b2) \ - __BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, queue_segment_boundary((q))) - -/* * drivers should _never_ use the all version - the bio may have been split * before it got to the driver and the driver won't own all of it */ @@ -170,27 +140,11 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, { iter->bi_sector += bytes >> 9; - 
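The fs/buffer.c and fs/ext4/page-io.c hunks above both move wbc_init_bio() so that it runs only after bio_set_dev(), i.e. once the bio is bound to its device and queue. A minimal sketch of the resulting ordering; everything except the bio and writeback helpers is hypothetical, and error handling and completion callbacks are omitted.

#include <linux/bio.h>
#include <linux/writeback.h>

/* Illustrative submission path: bio_set_dev() first, wbc_init_bio()
 * afterwards, wbc_account_io() per data segment, then submit. */
static void example_submit_wb_page(struct writeback_control *wbc,
				   struct block_device *bdev,
				   struct page *page, sector_t sector)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	bio->bi_iter.bi_sector = sector;
	bio_set_dev(bio, bdev);			/* queue is known from here on */
	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC);

	wbc_init_bio(wbc, bio);			/* must follow bio_set_dev() */
	wbc_account_io(wbc, page, PAGE_SIZE);

	submit_bio(bio);
}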
if (bio_no_advance_iter(bio)) { + if (bio_no_advance_iter(bio)) iter->bi_size -= bytes; - iter->bi_done += bytes; - } else { + else bvec_iter_advance(bio->bi_io_vec, iter, bytes); /* TODO: It is reasonable to complete bio with error here. */ - } -} - -static inline bool bio_rewind_iter(struct bio *bio, struct bvec_iter *iter, - unsigned int bytes) -{ - iter->bi_sector -= bytes >> 9; - - if (bio_no_advance_iter(bio)) { - iter->bi_size += bytes; - iter->bi_done -= bytes; - return true; - } - - return bvec_iter_rewind(bio->bi_io_vec, iter, bytes); } #define __bio_for_each_segment(bvl, bio, iter, start) \ @@ -353,6 +307,8 @@ struct bio_integrity_payload { unsigned short bip_max_vcnt; /* integrity bio_vec slots */ unsigned short bip_flags; /* control flags */ + struct bvec_iter bio_iter; /* for rewinding parent bio */ + struct work_struct bip_work; /* I/O completion */ struct bio_vec *bip_vec; @@ -547,23 +503,28 @@ do { \ disk_devt((bio)->bi_disk) #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -int bio_associate_blkcg_from_page(struct bio *bio, struct page *page); +int bio_associate_blkg_from_page(struct bio *bio, struct page *page); #else -static inline int bio_associate_blkcg_from_page(struct bio *bio, - struct page *page) { return 0; } +static inline int bio_associate_blkg_from_page(struct bio *bio, + struct page *page) { return 0; } #endif #ifdef CONFIG_BLK_CGROUP -int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg); +int bio_associate_blkg_from_css(struct bio *bio, + struct cgroup_subsys_state *css); +int bio_associate_create_blkg(struct request_queue *q, struct bio *bio); void bio_disassociate_task(struct bio *bio); -void bio_clone_blkcg_association(struct bio *dst, struct bio *src); +void bio_clone_blkg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ -static inline int bio_associate_blkcg(struct bio *bio, - struct cgroup_subsys_state *blkcg_css) { return 0; } +static inline int bio_associate_blkg_from_css(struct bio *bio, + struct cgroup_subsys_state *css) +{ return 0; } +static inline int bio_associate_create_blkg(struct request_queue *q, + struct bio *bio) { return 0; } static inline void bio_disassociate_task(struct bio *bio) { } -static inline void bio_clone_blkcg_association(struct bio *dst, - struct bio *src) { } +static inline void bio_clone_blkg_association(struct bio *dst, + struct bio *src) { } #endif /* CONFIG_BLK_CGROUP */ #ifdef CONFIG_HIGHMEM diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 6d766a19f2bb..b7fd08013de2 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -126,7 +126,7 @@ struct blkcg_gq { struct request_list rl; /* reference count */ - atomic_t refcnt; + struct percpu_ref refcnt; /* is this blkg online? 
protected by both blkcg and q locks */ bool online; @@ -184,6 +184,8 @@ extern struct cgroup_subsys_state * const blkcg_root_css; struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, struct request_queue *q, bool update_hint); +struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q); struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct request_queue *q); int blkcg_init_queue(struct request_queue *q); @@ -230,22 +232,59 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, char *input, struct blkg_conf_ctx *ctx); void blkg_conf_finish(struct blkg_conf_ctx *ctx); +/** + * blkcg_css - find the current css + * + * Find the css associated with either the kthread or the current task. + * This may return a dying css, so it is up to the caller to use tryget logic + * to confirm it is alive and well. + */ +static inline struct cgroup_subsys_state *blkcg_css(void) +{ + struct cgroup_subsys_state *css; + + css = kthread_blkcg(); + if (css) + return css; + return task_css(current, io_cgrp_id); +} static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) { return css ? container_of(css, struct blkcg, css) : NULL; } -static inline struct blkcg *bio_blkcg(struct bio *bio) +/** + * __bio_blkcg - internal version of bio_blkcg for bfq and cfq + * + * DO NOT USE. + * There is a flaw using this version of the function. In particular, this was + * used in a broken paradigm where association was called on the given css. It + * is possible though that the returned css from task_css() is in the process + * of dying due to migration of the current task. So it is improper to assume + * *_get() is going to succeed. Both BFQ and CFQ rely on this logic and will + * take additional work to handle more gracefully. + */ +static inline struct blkcg *__bio_blkcg(struct bio *bio) { - struct cgroup_subsys_state *css; + if (bio && bio->bi_blkg) + return bio->bi_blkg->blkcg; + return css_to_blkcg(blkcg_css()); +} - if (bio && bio->bi_css) - return css_to_blkcg(bio->bi_css); - css = kthread_blkcg(); - if (css) - return css_to_blkcg(css); - return css_to_blkcg(task_css(current, io_cgrp_id)); +/** + * bio_blkcg - grab the blkcg associated with a bio + * @bio: target bio + * + * This returns the blkcg associated with a bio, NULL if not associated. + * Callers are expected to either handle NULL or know association has been + * done prior to calling this. + */ +static inline struct blkcg *bio_blkcg(struct bio *bio) +{ + if (bio && bio->bi_blkg) + return bio->bi_blkg->blkcg; + return NULL; } static inline bool blk_cgroup_congested(void) @@ -451,26 +490,35 @@ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) */ static inline void blkg_get(struct blkcg_gq *blkg) { - WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); - atomic_inc(&blkg->refcnt); + percpu_ref_get(&blkg->refcnt); } /** - * blkg_try_get - try and get a blkg reference + * blkg_tryget - try and get a blkg reference * @blkg: blkg to get * * This is for use when doing an RCU lookup of the blkg. We may be in the midst * of freeing this blkg, so we can only use it if the refcnt is not zero. 
*/ -static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg) +static inline bool blkg_tryget(struct blkcg_gq *blkg) { - if (atomic_inc_not_zero(&blkg->refcnt)) - return blkg; - return NULL; + return percpu_ref_tryget(&blkg->refcnt); } +/** + * blkg_tryget_closest - try and get a blkg ref on the closet blkg + * @blkg: blkg to get + * + * This walks up the blkg tree to find the closest non-dying blkg and returns + * the blkg that it did association with as it may not be the passed in blkg. + */ +static inline struct blkcg_gq *blkg_tryget_closest(struct blkcg_gq *blkg) +{ + while (!percpu_ref_tryget(&blkg->refcnt)) + blkg = blkg->parent; -void __blkg_release_rcu(struct rcu_head *rcu); + return blkg; +} /** * blkg_put - put a blkg reference @@ -478,9 +526,7 @@ void __blkg_release_rcu(struct rcu_head *rcu); */ static inline void blkg_put(struct blkcg_gq *blkg) { - WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); - if (atomic_dec_and_test(&blkg->refcnt)) - call_rcu(&blkg->rcu_head, __blkg_release_rcu); + percpu_ref_put(&blkg->refcnt); } /** @@ -533,25 +579,36 @@ static inline struct request_list *blk_get_rl(struct request_queue *q, rcu_read_lock(); - blkcg = bio_blkcg(bio); + if (bio && bio->bi_blkg) { + blkcg = bio->bi_blkg->blkcg; + if (blkcg == &blkcg_root) + goto rl_use_root; + + blkg_get(bio->bi_blkg); + rcu_read_unlock(); + return &bio->bi_blkg->rl; + } - /* bypass blkg lookup and use @q->root_rl directly for root */ + blkcg = css_to_blkcg(blkcg_css()); if (blkcg == &blkcg_root) - goto root_rl; + goto rl_use_root; - /* - * Try to use blkg->rl. blkg lookup may fail under memory pressure - * or if either the blkcg or queue is going away. Fall back to - * root_rl in such cases. - */ blkg = blkg_lookup(blkcg, q); if (unlikely(!blkg)) - goto root_rl; + blkg = __blkg_lookup_create(blkcg, q); + + if (!blkg_tryget(blkg)) + goto rl_use_root; - blkg_get(blkg); rcu_read_unlock(); return &blkg->rl; -root_rl: + + /* + * Each blkg has its own request_list, however, the root blkcg + * uses the request_queue's root_rl. This is to avoid most + * overhead for the root blkcg. 
+ */ +rl_use_root: rcu_read_unlock(); return &q->root_rl; } @@ -797,32 +854,26 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg struct bio *bio) { return false; } #endif + +static inline void blkcg_bio_issue_init(struct bio *bio) +{ + bio_issue_init(&bio->bi_issue, bio_sectors(bio)); +} + static inline bool blkcg_bio_issue_check(struct request_queue *q, struct bio *bio) { - struct blkcg *blkcg; struct blkcg_gq *blkg; bool throtl = false; rcu_read_lock(); - blkcg = bio_blkcg(bio); - - /* associate blkcg if bio hasn't attached one */ - bio_associate_blkcg(bio, &blkcg->css); - blkg = blkg_lookup(blkcg, q); - if (unlikely(!blkg)) { - spin_lock_irq(q->queue_lock); - blkg = blkg_lookup_create(blkcg, q); - if (IS_ERR(blkg)) - blkg = NULL; - spin_unlock_irq(q->queue_lock); - } + bio_associate_create_blkg(q, bio); + blkg = bio->bi_blkg; throtl = blk_throtl_bio(q, blkg, bio); if (!throtl) { - blkg = blkg ?: q->root_blkg; /* * If the bio is flagged with BIO_QUEUE_ENTERED it means this * is a split bio and we would have already accounted for the @@ -834,6 +885,8 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1); } + blkcg_bio_issue_init(bio); + rcu_read_unlock(); return !throtl; } @@ -930,6 +983,7 @@ static inline int blkcg_activate_policy(struct request_queue *q, static inline void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol) { } +static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; } static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, @@ -945,6 +999,7 @@ static inline void blk_put_rl(struct request_list *rl) { } static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } +static inline void blkcg_bio_issue_init(struct bio *bio) { } static inline bool blkcg_bio_issue_check(struct request_queue *q, struct bio *bio) { return true; } diff --git a/include/linux/blk-pm.h b/include/linux/blk-pm.h new file mode 100644 index 000000000000..b80c65aba249 --- /dev/null +++ b/include/linux/blk-pm.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _BLK_PM_H_ +#define _BLK_PM_H_ + +struct device; +struct request_queue; + +/* + * block layer runtime pm functions + */ +#ifdef CONFIG_PM +extern void blk_pm_runtime_init(struct request_queue *q, struct device *dev); +extern int blk_pre_runtime_suspend(struct request_queue *q); +extern void blk_post_runtime_suspend(struct request_queue *q, int err); +extern void blk_pre_runtime_resume(struct request_queue *q); +extern void blk_post_runtime_resume(struct request_queue *q, int err); +extern void blk_set_runtime_active(struct request_queue *q); +#else +static inline void blk_pm_runtime_init(struct request_queue *q, + struct device *dev) {} +#endif + +#endif /* _BLK_PM_H_ */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index f6dfb30737d8..9578c7ab1eb6 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -178,7 +178,6 @@ struct bio { * release. Read comment on top of bio_associate_current(). 
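The blk-cgroup.h changes above switch the blkg reference count to a percpu_ref and have blkcg_bio_issue_check() associate the bio with its blkg up front via bio_associate_create_blkg(). A hedged sketch of what using the new helpers looks like; the function is hypothetical, assumes CONFIG_BLK_CGROUP, and only exercises the interfaces added in the hunks above.

#include <linux/bio.h>
#include <linux/blk-cgroup.h>

/*
 * Hypothetical consumer: associate the bio at issue time, then take a
 * short-lived reference on its blkg. blkg_tryget() can fail once the
 * percpu_ref starts dying, so the result has to be checked (or
 * blkg_tryget_closest() used to fall back to a live ancestor).
 */
static void example_use_bio_blkg(struct request_queue *q, struct bio *bio)
{
	struct blkcg_gq *blkg;

	rcu_read_lock();
	bio_associate_create_blkg(q, bio);	/* sets bio->bi_blkg */

	blkg = bio->bi_blkg;
	if (blkg && blkg_tryget(blkg)) {
		/* blkg stays usable here even while the cgroup is removed */
		blkg_put(blkg);			/* percpu_ref_put() underneath */
	}
	rcu_read_unlock();
}

With bi_css gone from struct bio (see the blk_types.h hunk below), bio_blkcg() only reports an association that was made explicitly, which is why callers such as the loop driver hunk earlier now test bi_blkg before dereferencing the result.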
*/ struct io_context *bi_ioc; - struct cgroup_subsys_state *bi_css; struct blkcg_gq *bi_blkg; struct bio_issue bi_issue; #endif diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6980014357d4..dee46c20701b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -504,6 +504,12 @@ struct request_queue { * various queue flags, see QUEUE_* below */ unsigned long queue_flags; + /* + * Number of contexts that have called blk_set_pm_only(). If this + * counter is above zero then only RQF_PM and RQF_PREEMPT requests are + * processed. + */ + atomic_t pm_only; /* * ida allocated id for this queue. Used to index queues from @@ -698,7 +704,6 @@ struct request_queue { #define QUEUE_FLAG_REGISTERED 26 /* queue has been registered to a disk */ #define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */ #define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ -#define QUEUE_FLAG_PREEMPT_ONLY 29 /* only process REQ_PREEMPT requests */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ @@ -736,12 +741,11 @@ bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q); ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ REQ_FAILFAST_DRIVER)) #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) -#define blk_queue_preempt_only(q) \ - test_bit(QUEUE_FLAG_PREEMPT_ONLY, &(q)->queue_flags) +#define blk_queue_pm_only(q) atomic_read(&(q)->pm_only) #define blk_queue_fua(q) test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags) -extern int blk_set_preempt_only(struct request_queue *q); -extern void blk_clear_preempt_only(struct request_queue *q); +extern void blk_set_pm_only(struct request_queue *q); +extern void blk_clear_pm_only(struct request_queue *q); static inline int queue_in_flight(struct request_queue *q) { @@ -1281,29 +1285,6 @@ extern void blk_put_queue(struct request_queue *); extern void blk_set_queue_dying(struct request_queue *); /* - * block layer runtime pm functions - */ -#ifdef CONFIG_PM -extern void blk_pm_runtime_init(struct request_queue *q, struct device *dev); -extern int blk_pre_runtime_suspend(struct request_queue *q); -extern void blk_post_runtime_suspend(struct request_queue *q, int err); -extern void blk_pre_runtime_resume(struct request_queue *q); -extern void blk_post_runtime_resume(struct request_queue *q, int err); -extern void blk_set_runtime_active(struct request_queue *q); -#else -static inline void blk_pm_runtime_init(struct request_queue *q, - struct device *dev) {} -static inline int blk_pre_runtime_suspend(struct request_queue *q) -{ - return -ENOSYS; -} -static inline void blk_post_runtime_suspend(struct request_queue *q, int err) {} -static inline void blk_pre_runtime_resume(struct request_queue *q) {} -static inline void blk_post_runtime_resume(struct request_queue *q, int err) {} -static inline void blk_set_runtime_active(struct request_queue *q) {} -#endif - -/* * blk_plug permits building a queue of related requests by holding the I/O * fragments for a short period. This allows merging of sequential requests * into single larger request. 
As the requests are moved from a per-task list to @@ -1676,94 +1657,6 @@ static inline void put_dev_sector(Sector p) put_page(p.v); } -static inline bool __bvec_gap_to_prev(struct request_queue *q, - struct bio_vec *bprv, unsigned int offset) -{ - return offset || - ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q)); -} - -/* - * Check if adding a bio_vec after bprv with offset would create a gap in - * the SG list. Most drivers don't care about this, but some do. - */ -static inline bool bvec_gap_to_prev(struct request_queue *q, - struct bio_vec *bprv, unsigned int offset) -{ - if (!queue_virt_boundary(q)) - return false; - return __bvec_gap_to_prev(q, bprv, offset); -} - -/* - * Check if the two bvecs from two bios can be merged to one segment. - * If yes, no need to check gap between the two bios since the 1st bio - * and the 1st bvec in the 2nd bio can be handled in one segment. - */ -static inline bool bios_segs_mergeable(struct request_queue *q, - struct bio *prev, struct bio_vec *prev_last_bv, - struct bio_vec *next_first_bv) -{ - if (!BIOVEC_PHYS_MERGEABLE(prev_last_bv, next_first_bv)) - return false; - if (!BIOVEC_SEG_BOUNDARY(q, prev_last_bv, next_first_bv)) - return false; - if (prev->bi_seg_back_size + next_first_bv->bv_len > - queue_max_segment_size(q)) - return false; - return true; -} - -static inline bool bio_will_gap(struct request_queue *q, - struct request *prev_rq, - struct bio *prev, - struct bio *next) -{ - if (bio_has_data(prev) && queue_virt_boundary(q)) { - struct bio_vec pb, nb; - - /* - * don't merge if the 1st bio starts with non-zero - * offset, otherwise it is quite difficult to respect - * sg gap limit. We work hard to merge a huge number of small - * single bios in case of mkfs. - */ - if (prev_rq) - bio_get_first_bvec(prev_rq->bio, &pb); - else - bio_get_first_bvec(prev, &pb); - if (pb.bv_offset) - return true; - - /* - * We don't need to worry about the situation that the - * merged segment ends in unaligned virt boundary: - * - * - if 'pb' ends aligned, the merged segment ends aligned - * - if 'pb' ends unaligned, the next bio must include - * one single bvec of 'nb', otherwise the 'nb' can't - * merge with 'pb' - */ - bio_get_last_bvec(prev, &pb); - bio_get_first_bvec(next, &nb); - - if (!bios_segs_mergeable(q, prev, &pb, &nb)) - return __bvec_gap_to_prev(q, &pb, nb.bv_offset); - } - - return false; -} - -static inline bool req_gap_back_merge(struct request *req, struct bio *bio) -{ - return bio_will_gap(req->q, req, req->biotail, bio); -} - -static inline bool req_gap_front_merge(struct request *req, struct bio *bio) -{ - return bio_will_gap(req->q, NULL, bio, req->bio); -} - int kblockd_schedule_work(struct work_struct *work); int kblockd_schedule_work_on(int cpu, struct work_struct *work); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); @@ -1843,26 +1736,6 @@ queue_max_integrity_segments(struct request_queue *q) return q->limits.max_integrity_segments; } -static inline bool integrity_req_gap_back_merge(struct request *req, - struct bio *next) -{ - struct bio_integrity_payload *bip = bio_integrity(req->bio); - struct bio_integrity_payload *bip_next = bio_integrity(next); - - return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1], - bip_next->bip_vec[0].bv_offset); -} - -static inline bool integrity_req_gap_front_merge(struct request *req, - struct bio *bio) -{ - struct bio_integrity_payload *bip = bio_integrity(bio); - struct bio_integrity_payload *bip_next = bio_integrity(req->bio); - - 
return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1], - bip_next->bip_vec[0].bv_offset); -} - /** * bio_integrity_intervals - Return number of integrity intervals for a bio * @bi: blk_integrity profile for device @@ -1947,17 +1820,6 @@ static inline bool blk_integrity_merge_bio(struct request_queue *rq, return true; } -static inline bool integrity_req_gap_back_merge(struct request *req, - struct bio *next) -{ - return false; -} -static inline bool integrity_req_gap_front_merge(struct request *req, - struct bio *bio) -{ - return false; -} - static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, unsigned int sectors) { diff --git a/include/linux/bvec.h b/include/linux/bvec.h index fe7a22dd133b..02c73c6aa805 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -40,8 +40,6 @@ struct bvec_iter { unsigned int bi_idx; /* current index into bvl_vec */ - unsigned int bi_done; /* number of bytes completed */ - unsigned int bi_bvec_done; /* number of bytes completed in current bvec */ }; @@ -85,7 +83,6 @@ static inline bool bvec_iter_advance(const struct bio_vec *bv, bytes -= len; iter->bi_size -= len; iter->bi_bvec_done += len; - iter->bi_done += len; if (iter->bi_bvec_done == __bvec_iter_bvec(bv, *iter)->bv_len) { iter->bi_bvec_done = 0; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 32c553556bbd..b8bcbdeb2eac 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -93,6 +93,8 @@ extern struct css_set init_css_set; bool css_has_online_children(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); +struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup, + struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, diff --git a/include/linux/elevator.h b/include/linux/elevator.h index a02deea30185..015bb59c0331 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -111,7 +111,7 @@ struct elevator_mq_ops { void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool); struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); bool (*has_work)(struct blk_mq_hw_ctx *); - void (*completed_request)(struct request *); + void (*completed_request)(struct request *, u64); void (*started_request)(struct request *); void (*requeue_request)(struct request *); struct request *(*former_request)(struct request_queue *, struct request *); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 25c08c6c7f99..70fc838e6773 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -402,10 +402,11 @@ static inline void free_part_info(struct hd_struct *part) extern void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part); /* block/genhd.c */ -extern void device_add_disk(struct device *parent, struct gendisk *disk); +extern void device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups); static inline void add_disk(struct gendisk *disk) { - device_add_disk(NULL, disk); + device_add_disk(NULL, disk, NULL); } extern void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk); static inline void add_disk_no_queue_reg(struct gendisk *disk) diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 009cdf3d65b6..b297cd1cd4f1 100644 --- a/include/linux/percpu-refcount.h +++ 
b/include/linux/percpu-refcount.h @@ -108,6 +108,7 @@ void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref); void percpu_ref_switch_to_percpu(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill); +void percpu_ref_resurrect(struct percpu_ref *ref); void percpu_ref_reinit(struct percpu_ref *ref); /** diff --git a/include/linux/writeback.h b/include/linux/writeback.h index fdfd04e348f6..738a0c24874f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -246,7 +246,8 @@ static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, * * @bio is a part of the writeback in progress controlled by @wbc. Perform * writeback specific initialization. This is used to apply the cgroup - * writeback context. + * writeback context. Must be called after the bio has been associated with + * a device. */ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) { @@ -257,7 +258,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) * regular writeback instead of writing things out itself. */ if (wbc->wb) - bio_associate_blkcg(bio, wbc->wb->blkcg_css); + bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css); } #else /* CONFIG_CGROUP_WRITEBACK */ diff --git a/include/trace/events/kyber.h b/include/trace/events/kyber.h new file mode 100644 index 000000000000..a9834c37ac40 --- /dev/null +++ b/include/trace/events/kyber.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kyber + +#if !defined(_TRACE_KYBER_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KYBER_H + +#include <linux/blkdev.h> +#include <linux/tracepoint.h> + +#define DOMAIN_LEN 16 +#define LATENCY_TYPE_LEN 8 + +TRACE_EVENT(kyber_latency, + + TP_PROTO(struct request_queue *q, const char *domain, const char *type, + unsigned int percentile, unsigned int numerator, + unsigned int denominator, unsigned int samples), + + TP_ARGS(q, domain, type, percentile, numerator, denominator, samples), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __array( char, domain, DOMAIN_LEN ) + __array( char, type, LATENCY_TYPE_LEN ) + __field( u8, percentile ) + __field( u8, numerator ) + __field( u8, denominator ) + __field( unsigned int, samples ) + ), + + TP_fast_assign( + __entry->dev = disk_devt(dev_to_disk(kobj_to_dev(q->kobj.parent))); + strlcpy(__entry->domain, domain, DOMAIN_LEN); + strlcpy(__entry->type, type, DOMAIN_LEN); + __entry->percentile = percentile; + __entry->numerator = numerator; + __entry->denominator = denominator; + __entry->samples = samples; + ), + + TP_printk("%d,%d %s %s p%u %u/%u samples=%u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->domain, + __entry->type, __entry->percentile, __entry->numerator, + __entry->denominator, __entry->samples) +); + +TRACE_EVENT(kyber_adjust, + + TP_PROTO(struct request_queue *q, const char *domain, + unsigned int depth), + + TP_ARGS(q, domain, depth), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __array( char, domain, DOMAIN_LEN ) + __field( unsigned int, depth ) + ), + + TP_fast_assign( + __entry->dev = disk_devt(dev_to_disk(kobj_to_dev(q->kobj.parent))); + strlcpy(__entry->domain, domain, DOMAIN_LEN); + __entry->depth = depth; + ), + + TP_printk("%d,%d %s %u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->domain, + __entry->depth) +); + +TRACE_EVENT(kyber_throttled, + + TP_PROTO(struct request_queue *q, const char *domain), + + TP_ARGS(q, domain), + + TP_STRUCT__entry( + __field( 
dev_t, dev ) + __array( char, domain, DOMAIN_LEN ) + ), + + TP_fast_assign( + __entry->dev = disk_devt(dev_to_disk(kobj_to_dev(q->kobj.parent))); + strlcpy(__entry->domain, domain, DOMAIN_LEN); + ), + + TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->domain) +); + +#define _TRACE_KYBER_H +#endif /* _TRACE_KYBER_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/include/xen/xen.h b/include/xen/xen.h index 1e1d9bd0bd37..d7a2678da77f 100644 --- a/include/xen/xen.h +++ b/include/xen/xen.h @@ -39,4 +39,8 @@ extern uint32_t xen_start_flags; #define xen_initial_domain() (0) #endif /* CONFIG_XEN_DOM0 */ +struct bio_vec; +bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, + const struct bio_vec *vec2); + #endif /* _XEN_XEN_H */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index aae10baf1902..48fb22e49467 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -492,7 +492,7 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, } /** - * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem + * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * @@ -501,8 +501,8 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, * enabled. If @ss is associated with the hierarchy @cgrp is on, this * function is guaranteed to return non-NULL css. */ -static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) +static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, + struct cgroup_subsys *ss) { lockdep_assert_held(&cgroup_mutex); @@ -523,6 +523,35 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, } /** + * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest + * + * Find and get the effective css of @cgrp for @ss. The effective css is + * defined as the matching css of the nearest ancestor including self which + * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, + * the root css is returned, so this function always returns a valid css. + * + * The returned css is not guaranteed to be online, and therefore it is the + * callers responsiblity to tryget a reference for it. + */ +struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + + do { + css = cgroup_css(cgrp, ss); + + if (css) + return css; + cgrp = cgroup_parent(cgrp); + } while (cgrp); + + return init_css_set.subsys[ss->id]; +} + +/** * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest @@ -604,10 +633,11 @@ EXPORT_SYMBOL_GPL(of_css); * * Should be called under cgroup_[tree_]mutex. */ -#define for_each_e_css(css, ssid, cgrp) \ - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ - if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ - ; \ +#define for_each_e_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = cgroup_e_css_by_mask(cgrp, \ + cgroup_subsys[(ssid)]))) \ + ; \ else /** @@ -1006,7 +1036,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, * @ss is in this hierarchy, so we want the * effective css from @cgrp. 
*/ - template[i] = cgroup_e_css(cgrp, ss); } else { /* * @ss is not in this hierarchy, so we don't want @@ -3019,7 +3049,7 @@ static int cgroup_apply_control(struct cgroup *cgrp) return ret; /* - * At this point, cgroup_e_css() results reflect the new csses + * At this point, cgroup_e_css_by_mask() results reflect the new csses * making the following cgroup_update_dfl_csses() properly update * css associations of all tasks in the subtree. */ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2868d85f1fb1..fac0ddf8a8e2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -764,9 +764,9 @@ blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return NULL; - if (!bio->bi_css) + if (!bio->bi_blkg) return NULL; - return cgroup_get_kernfs_id(bio->bi_css->cgroup); + return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup); } #else static union kernfs_node_id * diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 9f96fa7bc000..de10b8c0bff6 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -356,11 +356,35 @@ EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); */ void percpu_ref_reinit(struct percpu_ref *ref) { + WARN_ON_ONCE(!percpu_ref_is_zero(ref)); + + percpu_ref_resurrect(ref); +} +EXPORT_SYMBOL_GPL(percpu_ref_reinit); + +/** + * percpu_ref_resurrect - modify a percpu refcount from dead to live + * @ref: percpu_ref to resurrect + * + * Modify @ref so that it's in the same state as before percpu_ref_kill() was + * called. @ref must be dead but must not yet have exited. + * + * If @ref->release() frees @ref then the caller is responsible for + * guaranteeing that @ref->release() does not get called while this + * function is in progress. + * + * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while + * this function is in progress. + */ +void percpu_ref_resurrect(struct percpu_ref *ref) +{ + unsigned long __percpu *percpu_count; unsigned long flags; spin_lock_irqsave(&percpu_ref_switch_lock, flags); - WARN_ON_ONCE(!percpu_ref_is_zero(ref)); + WARN_ON_ONCE(!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)); + WARN_ON_ONCE(__ref_is_percpu(ref, &percpu_count)); ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD; percpu_ref_get(ref); @@ -368,4 +392,4 @@ void percpu_ref_reinit(struct percpu_ref *ref) spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } -EXPORT_SYMBOL_GPL(percpu_ref_reinit); +EXPORT_SYMBOL_GPL(percpu_ref_resurrect); diff --git a/mm/page_io.c b/mm/page_io.c index aafd19ec1db4..573d3663d846 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -339,7 +339,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, goto out; } bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc); - bio_associate_blkcg_from_page(bio, page); + bio_associate_blkg_from_page(bio, page); count_swpout_vm_event(page); set_page_writeback(page); unlock_page(page);
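
For reference, a minimal usage sketch of the new percpu_ref_resurrect() interface follows. The frozen_thing type and its freeze/unfreeze helpers are hypothetical (not part of this series) and only illustrate the semantics described in the hunk above: unlike percpu_ref_reinit(), which now warns unless the count has already dropped to zero, percpu_ref_resurrect() only requires that the ref has been killed, so it can be flipped back to live while users still hold references.

/*
 * Illustrative only -- frozen_thing and its helpers are assumptions, not
 * code from this series.  The ref is assumed to have been set up earlier
 * with percpu_ref_init(&t->ref, frozen_thing_release, 0, GFP_KERNEL).
 */
#include <linux/kernel.h>
#include <linux/percpu-refcount.h>

struct frozen_thing {
	struct percpu_ref ref;
};

static void frozen_thing_release(struct percpu_ref *ref)
{
	struct frozen_thing *t = container_of(ref, struct frozen_thing, ref);

	pr_debug("frozen_thing %p: last reference dropped\n", t);
}

/* Stop new users: switch to atomic mode and mark the ref dead. */
static void frozen_thing_freeze(struct frozen_thing *t)
{
	percpu_ref_kill(&t->ref);
}

/*
 * Allow new users again.  percpu_ref_resurrect() only checks that the ref
 * is dead (and still in atomic mode); it does not require the count to
 * have reached zero, so in-flight users may still hold references while
 * the ref is brought back to life.
 */
static void frozen_thing_unfreeze(struct frozen_thing *t)
{
	percpu_ref_resurrect(&t->ref);
}

Callers that do drain to zero before re-enabling can keep using percpu_ref_reinit(), which is now implemented on top of percpu_ref_resurrect() plus the zero-count warning.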