diff options
Diffstat (limited to 'block')
58 files changed, 1603 insertions, 1241 deletions
diff --git a/block/Makefile b/block/Makefile index 1a43750f4b01..206b96e9387f 100644 --- a/block/Makefile +++ b/block/Makefile @@ -8,8 +8,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ - genhd.o partition-generic.o ioprio.o \ - badblocks.o partitions/ blk-rq-qos.o + genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 09b69a3ed490..68882b9b8f11 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -610,12 +610,13 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, */ entity = &bfqg->entity; for_each_entity(entity) { - bfqg = container_of(entity, struct bfq_group, entity); - if (bfqg != bfqd->root_group) { - parent = bfqg_parent(bfqg); + struct bfq_group *curr_bfqg = container_of(entity, + struct bfq_group, entity); + if (curr_bfqg != bfqd->root_group) { + parent = bfqg_parent(curr_bfqg); if (!parent) parent = bfqd->root_group; - bfq_group_set_parent(bfqg, parent); + bfq_group_set_parent(curr_bfqg, parent); } } @@ -641,6 +642,12 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, { struct bfq_entity *entity = &bfqq->entity; + /* + * Get extra reference to prevent bfqq from being freed in + * next possible expire or deactivate. + */ + bfqq->ref++; + /* If bfqq is empty, then bfq_bfqq_expire also invokes * bfq_del_bfqq_busy, thereby removing bfqq and its entity * from data structures related to current group. Otherwise we @@ -651,12 +658,6 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_bfqq_expire(bfqd, bfqd->in_service_queue, false, BFQQE_PREEMPTED); - /* - * get extra reference to prevent bfqq from being freed in - * next possible deactivate - */ - bfqq->ref++; - if (bfq_bfqq_busy(bfqq)) bfq_deactivate_bfqq(bfqd, bfqq, false, false); else if (entity->on_st_or_in_serv) @@ -676,7 +677,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (!bfqd->in_service_queue && !bfqd->rq_in_driver) bfq_schedule_dispatch(bfqd); - /* release extra ref taken above */ + /* release extra ref taken above, bfqq may happen to be freed now */ bfq_put_queue(bfqq); } @@ -713,10 +714,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, if (entity->sched_data != &bfqg->sched_data) { bic_set_bfqq(bic, NULL, 0); - bfq_log_bfqq(bfqd, async_bfqq, - "bic_change_group: %p %d", - async_bfqq, async_bfqq->ref); - bfq_put_queue(async_bfqq); + bfq_release_process_ref(bfqd, async_bfqq); } } @@ -817,39 +815,53 @@ static void bfq_flush_idle_tree(struct bfq_service_tree *st) /** * bfq_reparent_leaf_entity - move leaf entity to the root_group. * @bfqd: the device data structure with the root group. - * @entity: the entity to move. + * @entity: the entity to move, if entity is a leaf; or the parent entity + * of an active leaf entity to move, if entity is not a leaf. */ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, - struct bfq_entity *entity) + struct bfq_entity *entity, + int ioprio_class) { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct bfq_queue *bfqq; + struct bfq_entity *child_entity = entity; + while (child_entity->my_sched_data) { /* leaf not reached yet */ + struct bfq_sched_data *child_sd = child_entity->my_sched_data; + struct bfq_service_tree *child_st = child_sd->service_tree + + ioprio_class; + struct rb_root *child_active = &child_st->active; + + child_entity = bfq_entity_of(rb_first(child_active)); + + if (!child_entity) + child_entity = child_sd->in_service_entity; + } + + bfqq = bfq_entity_to_bfqq(child_entity); bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); } /** - * bfq_reparent_active_entities - move to the root group all active - * entities. + * bfq_reparent_active_queues - move to the root group all active queues. * @bfqd: the device data structure with the root group. * @bfqg: the group to move from. - * @st: the service tree with the entities. + * @st: the service tree to start the search from. */ -static void bfq_reparent_active_entities(struct bfq_data *bfqd, - struct bfq_group *bfqg, - struct bfq_service_tree *st) +static void bfq_reparent_active_queues(struct bfq_data *bfqd, + struct bfq_group *bfqg, + struct bfq_service_tree *st, + int ioprio_class) { struct rb_root *active = &st->active; - struct bfq_entity *entity = NULL; - - if (!RB_EMPTY_ROOT(&st->active)) - entity = bfq_entity_of(rb_first(active)); + struct bfq_entity *entity; - for (; entity ; entity = bfq_entity_of(rb_first(active))) - bfq_reparent_leaf_entity(bfqd, entity); + while ((entity = bfq_entity_of(rb_first(active)))) + bfq_reparent_leaf_entity(bfqd, entity, ioprio_class); if (bfqg->sched_data.in_service_entity) bfq_reparent_leaf_entity(bfqd, - bfqg->sched_data.in_service_entity); + bfqg->sched_data.in_service_entity, + ioprio_class); } /** @@ -882,13 +894,6 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) st = bfqg->sched_data.service_tree + i; /* - * The idle tree may still contain bfq_queues belonging - * to exited task because they never migrated to a different - * cgroup from the one being destroyed now. - */ - bfq_flush_idle_tree(st); - - /* * It may happen that some queues are still active * (busy) upon group destruction (if the corresponding * processes have been forced to terminate). We move @@ -900,7 +905,20 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) * There is no need to put the sync queues, as the * scheduler has taken no reference. */ - bfq_reparent_active_entities(bfqd, bfqg, st); + bfq_reparent_active_queues(bfqd, bfqg, st, i); + + /* + * The idle tree may still contain bfq_queues + * belonging to exited task because they never + * migrated to a different cgroup from the one being + * destroyed now. In addition, even + * bfq_reparent_active_queues() may happen to add some + * entities to the idle tree. It happens if, in some + * of the calls to bfq_bfqq_move() performed by + * bfq_reparent_active_queues(), the queue to move is + * empty and gets expired. + */ + bfq_flush_idle_tree(st); } __bfq_deactivate_entity(entity, false); diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 8c436abfaf14..78ba57efd16b 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2716,8 +2716,6 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) } } - -static void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) { /* @@ -6215,20 +6213,28 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) return bfqq; } -static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) +static void +bfq_idle_slice_timer_body(struct bfq_data *bfqd, struct bfq_queue *bfqq) { - struct bfq_data *bfqd = bfqq->bfqd; enum bfqq_expiration reason; unsigned long flags; spin_lock_irqsave(&bfqd->lock, flags); - bfq_clear_bfqq_wait_request(bfqq); + /* + * Considering that bfqq may be in race, we should firstly check + * whether bfqq is in service before doing something on it. If + * the bfqq in race is not in service, it has already been expired + * through __bfq_bfqq_expire func and its wait_request flags has + * been cleared in __bfq_bfqd_reset_in_service func. + */ if (bfqq != bfqd->in_service_queue) { spin_unlock_irqrestore(&bfqd->lock, flags); return; } + bfq_clear_bfqq_wait_request(bfqq); + if (bfq_bfqq_budget_timeout(bfqq)) /* * Also here the queue can be safely expired @@ -6273,7 +6279,7 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) * early. */ if (bfqq) - bfq_idle_slice_timer_body(bfqq); + bfq_idle_slice_timer_body(bfqd, bfqq); return HRTIMER_NORESTART; } diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index d1233af9c684..cd224aaf9f52 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -955,6 +955,7 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool compensate, enum bfqq_expiration reason); void bfq_put_queue(struct bfq_queue *bfqq); void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); +void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq); void bfq_schedule_dispatch(struct bfq_data *bfqd); void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); diff --git a/block/bio.c b/block/bio.c index 94d697217887..21cbaa6a1c20 100644 --- a/block/bio.c +++ b/block/bio.c @@ -17,6 +17,7 @@ #include <linux/cgroup.h> #include <linux/blk-cgroup.h> #include <linux/highmem.h> +#include <linux/sched/sysctl.h> #include <trace/events/block.h> #include "blk.h" @@ -588,6 +589,49 @@ void bio_truncate(struct bio *bio, unsigned new_size) } /** + * guard_bio_eod - truncate a BIO to fit the block device + * @bio: bio to truncate + * + * This allows us to do IO even on the odd last sectors of a device, even if the + * block size is some multiple of the physical sector size. + * + * We'll just truncate the bio to the size of the device, and clear the end of + * the buffer head manually. Truly out-of-range accesses will turn into actual + * I/O errors, this only handles the "we need to be able to do I/O at the final + * sector" case. + */ +void guard_bio_eod(struct bio *bio) +{ + sector_t maxsector; + struct hd_struct *part; + + rcu_read_lock(); + part = __disk_get_part(bio->bi_disk, bio->bi_partno); + if (part) + maxsector = part_nr_sects_read(part); + else + maxsector = get_capacity(bio->bi_disk); + rcu_read_unlock(); + + if (!maxsector) + return; + + /* + * If the *whole* IO is past the end of the device, + * let it through, and the IO layer will turn it into + * an EIO. + */ + if (unlikely(bio->bi_iter.bi_sector >= maxsector)) + return; + + maxsector -= bio->bi_iter.bi_sector; + if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) + return; + + bio_truncate(bio, maxsector << 9); +} + +/** * bio_put - release a reference to a bio * @bio: bio to release reference to * @@ -679,6 +723,12 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) } EXPORT_SYMBOL(bio_clone_fast); +const char *bio_devname(struct bio *bio, char *buf) +{ + return disk_name(bio->bi_disk, bio->bi_partno, buf); +} +EXPORT_SYMBOL(bio_devname); + static inline bool page_is_mergeable(const struct bio_vec *bv, struct page *page, unsigned int len, unsigned int off, bool *same_page) @@ -730,7 +780,7 @@ static bool bio_try_merge_pc_page(struct request_queue *q, struct bio *bio, * * This should only be used by passthrough bios. */ -static int __bio_add_pc_page(struct request_queue *q, struct bio *bio, +int __bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, bool *same_page) { @@ -1019,12 +1069,21 @@ static void submit_bio_wait_endio(struct bio *bio) int submit_bio_wait(struct bio *bio) { DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map); + unsigned long hang_check; bio->bi_private = &done; bio->bi_end_io = submit_bio_wait_endio; bio->bi_opf |= REQ_SYNC; submit_bio(bio); - wait_for_completion_io(&done); + + /* Prevent hang_check timer from firing at us during very long I/O */ + hang_check = sysctl_hung_task_timeout_secs; + if (hang_check) + while (!wait_for_completion_io_timeout(&done, + hang_check * (HZ/2))) + ; + else + wait_for_completion_io(&done); return blk_status_to_errno(bio->bi_status); } @@ -1135,90 +1194,6 @@ void bio_list_copy_data(struct bio *dst, struct bio *src) } EXPORT_SYMBOL(bio_list_copy_data); -struct bio_map_data { - int is_our_pages; - struct iov_iter iter; - struct iovec iov[]; -}; - -static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data, - gfp_t gfp_mask) -{ - struct bio_map_data *bmd; - if (data->nr_segs > UIO_MAXIOV) - return NULL; - - bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask); - if (!bmd) - return NULL; - memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs); - bmd->iter = *data; - bmd->iter.iov = bmd->iov; - return bmd; -} - -/** - * bio_copy_from_iter - copy all pages from iov_iter to bio - * @bio: The &struct bio which describes the I/O as destination - * @iter: iov_iter as source - * - * Copy all pages from iov_iter to bio. - * Returns 0 on success, or error on failure. - */ -static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter) -{ - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bvec, bio, iter_all) { - ssize_t ret; - - ret = copy_page_from_iter(bvec->bv_page, - bvec->bv_offset, - bvec->bv_len, - iter); - - if (!iov_iter_count(iter)) - break; - - if (ret < bvec->bv_len) - return -EFAULT; - } - - return 0; -} - -/** - * bio_copy_to_iter - copy all pages from bio to iov_iter - * @bio: The &struct bio which describes the I/O as source - * @iter: iov_iter as destination - * - * Copy all pages from bio to iov_iter. - * Returns 0 on success, or error on failure. - */ -static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) -{ - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bvec, bio, iter_all) { - ssize_t ret; - - ret = copy_page_to_iter(bvec->bv_page, - bvec->bv_offset, - bvec->bv_len, - &iter); - - if (!iov_iter_count(&iter)) - break; - - if (ret < bvec->bv_len) - return -EFAULT; - } - - return 0; -} - void bio_free_pages(struct bio *bio) { struct bio_vec *bvec; @@ -1229,430 +1204,6 @@ void bio_free_pages(struct bio *bio) } EXPORT_SYMBOL(bio_free_pages); -/** - * bio_uncopy_user - finish previously mapped bio - * @bio: bio being terminated - * - * Free pages allocated from bio_copy_user_iov() and write back data - * to user space in case of a read. - */ -int bio_uncopy_user(struct bio *bio) -{ - struct bio_map_data *bmd = bio->bi_private; - int ret = 0; - - if (!bio_flagged(bio, BIO_NULL_MAPPED)) { - /* - * if we're in a workqueue, the request is orphaned, so - * don't copy into a random user address space, just free - * and return -EINTR so user space doesn't expect any data. - */ - if (!current->mm) - ret = -EINTR; - else if (bio_data_dir(bio) == READ) - ret = bio_copy_to_iter(bio, bmd->iter); - if (bmd->is_our_pages) - bio_free_pages(bio); - } - kfree(bmd); - bio_put(bio); - return ret; -} - -/** - * bio_copy_user_iov - copy user data to bio - * @q: destination block queue - * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @iter: iovec iterator - * @gfp_mask: memory allocation flags - * - * Prepares and returns a bio for indirect user io, bouncing data - * to/from kernel pages as necessary. Must be paired with - * call bio_uncopy_user() on io completion. - */ -struct bio *bio_copy_user_iov(struct request_queue *q, - struct rq_map_data *map_data, - struct iov_iter *iter, - gfp_t gfp_mask) -{ - struct bio_map_data *bmd; - struct page *page; - struct bio *bio; - int i = 0, ret; - int nr_pages; - unsigned int len = iter->count; - unsigned int offset = map_data ? offset_in_page(map_data->offset) : 0; - - bmd = bio_alloc_map_data(iter, gfp_mask); - if (!bmd) - return ERR_PTR(-ENOMEM); - - /* - * We need to do a deep copy of the iov_iter including the iovecs. - * The caller provided iov might point to an on-stack or otherwise - * shortlived one. - */ - bmd->is_our_pages = map_data ? 0 : 1; - - nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); - if (nr_pages > BIO_MAX_PAGES) - nr_pages = BIO_MAX_PAGES; - - ret = -ENOMEM; - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - goto out_bmd; - - ret = 0; - - if (map_data) { - nr_pages = 1 << map_data->page_order; - i = map_data->offset / PAGE_SIZE; - } - while (len) { - unsigned int bytes = PAGE_SIZE; - - bytes -= offset; - - if (bytes > len) - bytes = len; - - if (map_data) { - if (i == map_data->nr_entries * nr_pages) { - ret = -ENOMEM; - break; - } - - page = map_data->pages[i / nr_pages]; - page += (i % nr_pages); - - i++; - } else { - page = alloc_page(q->bounce_gfp | gfp_mask); - if (!page) { - ret = -ENOMEM; - break; - } - } - - if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { - if (!map_data) - __free_page(page); - break; - } - - len -= bytes; - offset = 0; - } - - if (ret) - goto cleanup; - - if (map_data) - map_data->offset += bio->bi_iter.bi_size; - - /* - * success - */ - if ((iov_iter_rw(iter) == WRITE && (!map_data || !map_data->null_mapped)) || - (map_data && map_data->from_user)) { - ret = bio_copy_from_iter(bio, iter); - if (ret) - goto cleanup; - } else { - if (bmd->is_our_pages) - zero_fill_bio(bio); - iov_iter_advance(iter, bio->bi_iter.bi_size); - } - - bio->bi_private = bmd; - if (map_data && map_data->null_mapped) - bio_set_flag(bio, BIO_NULL_MAPPED); - return bio; -cleanup: - if (!map_data) - bio_free_pages(bio); - bio_put(bio); -out_bmd: - kfree(bmd); - return ERR_PTR(ret); -} - -/** - * bio_map_user_iov - map user iovec into bio - * @q: the struct request_queue for the bio - * @iter: iovec iterator - * @gfp_mask: memory allocation flags - * - * Map the user space address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_user_iov(struct request_queue *q, - struct iov_iter *iter, - gfp_t gfp_mask) -{ - int j; - struct bio *bio; - int ret; - - if (!iov_iter_count(iter)) - return ERR_PTR(-EINVAL); - - bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES)); - if (!bio) - return ERR_PTR(-ENOMEM); - - while (iov_iter_count(iter)) { - struct page **pages; - ssize_t bytes; - size_t offs, added = 0; - int npages; - - bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs); - if (unlikely(bytes <= 0)) { - ret = bytes ? bytes : -EFAULT; - goto out_unmap; - } - - npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE); - - if (unlikely(offs & queue_dma_alignment(q))) { - ret = -EINVAL; - j = 0; - } else { - for (j = 0; j < npages; j++) { - struct page *page = pages[j]; - unsigned int n = PAGE_SIZE - offs; - bool same_page = false; - - if (n > bytes) - n = bytes; - - if (!__bio_add_pc_page(q, bio, page, n, offs, - &same_page)) { - if (same_page) - put_page(page); - break; - } - - added += n; - bytes -= n; - offs = 0; - } - iov_iter_advance(iter, added); - } - /* - * release the pages we didn't map into the bio, if any - */ - while (j < npages) - put_page(pages[j++]); - kvfree(pages); - /* couldn't stuff something into bio? */ - if (bytes) - break; - } - - bio_set_flag(bio, BIO_USER_MAPPED); - - /* - * subtle -- if bio_map_user_iov() ended up bouncing a bio, - * it would normally disappear when its bi_end_io is run. - * however, we need it for the unmap, so grab an extra - * reference to it - */ - bio_get(bio); - return bio; - - out_unmap: - bio_release_pages(bio, false); - bio_put(bio); - return ERR_PTR(ret); -} - -/** - * bio_unmap_user - unmap a bio - * @bio: the bio being unmapped - * - * Unmap a bio previously mapped by bio_map_user_iov(). Must be called from - * process context. - * - * bio_unmap_user() may sleep. - */ -void bio_unmap_user(struct bio *bio) -{ - bio_release_pages(bio, bio_data_dir(bio) == READ); - bio_put(bio); - bio_put(bio); -} - -static void bio_invalidate_vmalloc_pages(struct bio *bio) -{ -#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE - if (bio->bi_private && !op_is_write(bio_op(bio))) { - unsigned long i, len = 0; - - for (i = 0; i < bio->bi_vcnt; i++) - len += bio->bi_io_vec[i].bv_len; - invalidate_kernel_vmap_range(bio->bi_private, len); - } -#endif -} - -static void bio_map_kern_endio(struct bio *bio) -{ - bio_invalidate_vmalloc_pages(bio); - bio_put(bio); -} - -/** - * bio_map_kern - map kernel address into bio - * @q: the struct request_queue for the bio - * @data: pointer to buffer to map - * @len: length in bytes - * @gfp_mask: allocation flags for bio allocation - * - * Map the kernel address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, - gfp_t gfp_mask) -{ - unsigned long kaddr = (unsigned long)data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - const int nr_pages = end - start; - bool is_vmalloc = is_vmalloc_addr(data); - struct page *page; - int offset, i; - struct bio *bio; - - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - return ERR_PTR(-ENOMEM); - - if (is_vmalloc) { - flush_kernel_vmap_range(data, len); - bio->bi_private = data; - } - - offset = offset_in_page(kaddr); - for (i = 0; i < nr_pages; i++) { - unsigned int bytes = PAGE_SIZE - offset; - - if (len <= 0) - break; - - if (bytes > len) - bytes = len; - - if (!is_vmalloc) - page = virt_to_page(data); - else - page = vmalloc_to_page(data); - if (bio_add_pc_page(q, bio, page, bytes, - offset) < bytes) { - /* we don't support partial mappings */ - bio_put(bio); - return ERR_PTR(-EINVAL); - } - - data += bytes; - len -= bytes; - offset = 0; - } - - bio->bi_end_io = bio_map_kern_endio; - return bio; -} - -static void bio_copy_kern_endio(struct bio *bio) -{ - bio_free_pages(bio); - bio_put(bio); -} - -static void bio_copy_kern_endio_read(struct bio *bio) -{ - char *p = bio->bi_private; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bvec, bio, iter_all) { - memcpy(p, page_address(bvec->bv_page), bvec->bv_len); - p += bvec->bv_len; - } - - bio_copy_kern_endio(bio); -} - -/** - * bio_copy_kern - copy kernel address into bio - * @q: the struct request_queue for the bio - * @data: pointer to buffer to copy - * @len: length in bytes - * @gfp_mask: allocation flags for bio and page allocation - * @reading: data direction is READ - * - * copy the kernel address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, - gfp_t gfp_mask, int reading) -{ - unsigned long kaddr = (unsigned long)data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - struct bio *bio; - void *p = data; - int nr_pages = 0; - - /* - * Overflow, abort - */ - if (end < start) - return ERR_PTR(-EINVAL); - - nr_pages = end - start; - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - return ERR_PTR(-ENOMEM); - - while (len) { - struct page *page; - unsigned int bytes = PAGE_SIZE; - - if (bytes > len) - bytes = len; - - page = alloc_page(q->bounce_gfp | gfp_mask); - if (!page) - goto cleanup; - - if (!reading) - memcpy(page_address(page), p, bytes); - - if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) - break; - - len -= bytes; - p += bytes; - } - - if (reading) { - bio->bi_end_io = bio_copy_kern_endio_read; - bio->bi_private = data; - } else { - bio->bi_end_io = bio_copy_kern_endio; - } - - return bio; - -cleanup: - bio_free_pages(bio); - bio_put(bio); - return ERR_PTR(-ENOMEM); -} - /* * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions * for performing direct-IO in BIOs. @@ -1752,14 +1303,14 @@ defer: schedule_work(&bio_dirty_work); } -void update_io_ticks(struct hd_struct *part, unsigned long now) +void update_io_ticks(struct hd_struct *part, unsigned long now, bool end) { unsigned long stamp; again: stamp = READ_ONCE(part->stamp); if (unlikely(stamp != now)) { if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) { - __part_stat_add(part, io_ticks, 1); + __part_stat_add(part, io_ticks, end ? now - stamp : 1); } } if (part->partno) { @@ -1775,7 +1326,7 @@ void generic_start_io_acct(struct request_queue *q, int op, part_stat_lock(); - update_io_ticks(part, jiffies); + update_io_ticks(part, jiffies, false); part_stat_inc(part, ios[sgrp]); part_stat_add(part, sectors[sgrp], sectors); part_inc_in_flight(q, part, op_is_write(op)); @@ -1793,9 +1344,8 @@ void generic_end_io_acct(struct request_queue *q, int req_op, part_stat_lock(); - update_io_ticks(part, now); + update_io_ticks(part, now, true); part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); - part_stat_add(part, time_in_queue, duration); part_dec_in_flight(q, part, op_is_write(req_op)); part_stat_unlock(); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index a229b94d5390..c15a26096038 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1010,7 +1010,7 @@ unlock: * blkcg_init_queue - initialize blkcg part of request queue * @q: request_queue to initialize * - * Called from blk_alloc_queue_node(). Responsible for initializing blkcg + * Called from __blk_alloc_queue(). Responsible for initializing blkcg * part of new request_queue @q. * * RETURNS: diff --git a/block/blk-core.c b/block/blk-core.c index 089e890ab208..7e4a1da0715e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -346,7 +346,6 @@ void blk_cleanup_queue(struct request_queue *q) blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); - blk_queue_flag_set(QUEUE_FLAG_DYING, q); /* * Drain all requests queued before DYING marking. Set DEAD flag to @@ -389,12 +388,6 @@ void blk_cleanup_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_cleanup_queue); -struct request_queue *blk_alloc_queue(gfp_t gfp_mask) -{ - return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE); -} -EXPORT_SYMBOL(blk_alloc_queue); - /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer @@ -471,24 +464,19 @@ static void blk_timeout_work(struct work_struct *work) { } -/** - * blk_alloc_queue_node - allocate a request queue - * @gfp_mask: memory allocation flags - * @node_id: NUMA node to allocate memory from - */ -struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) +struct request_queue *__blk_alloc_queue(int node_id) { struct request_queue *q; int ret; q = kmem_cache_alloc_node(blk_requestq_cachep, - gfp_mask | __GFP_ZERO, node_id); + GFP_KERNEL | __GFP_ZERO, node_id); if (!q) return NULL; q->last_merge = NULL; - q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); + q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); if (q->id < 0) goto fail_q; @@ -496,7 +484,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (ret) goto fail_id; - q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id); + q->backing_dev_info = bdi_alloc_node(GFP_KERNEL, node_id); if (!q->backing_dev_info) goto fail_split; @@ -542,6 +530,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (blkcg_init_queue(q)) goto fail_ref; + blk_queue_dma_alignment(q, 511); + blk_set_default_limits(&q->limits); + return q; fail_ref: @@ -558,7 +549,22 @@ fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; } -EXPORT_SYMBOL(blk_alloc_queue_node); + +struct request_queue *blk_alloc_queue(make_request_fn make_request, int node_id) +{ + struct request_queue *q; + + if (WARN_ON_ONCE(!make_request)) + return NULL; + + q = __blk_alloc_queue(node_id); + if (!q) + return NULL; + q->make_request_fn = make_request; + q->nr_requests = BLKDEV_MAX_RQ; + return q; +} +EXPORT_SYMBOL(blk_alloc_queue); bool blk_get_queue(struct request_queue *q) { @@ -1121,10 +1127,9 @@ blk_qc_t direct_make_request(struct bio *bio) if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) { if (nowait && !blk_queue_dying(q)) - bio->bi_status = BLK_STS_AGAIN; + bio_wouldblock_error(bio); else - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); + bio_io_error(bio); return BLK_QC_T_NONE; } @@ -1203,7 +1208,7 @@ EXPORT_SYMBOL(submit_bio); /** * blk_cloned_rq_check_limits - Helper function to check a cloned request - * for new the queue limits + * for the new queue limits * @q: the queue * @rq: the request being checked * @@ -1339,10 +1344,9 @@ void blk_account_io_done(struct request *req, u64 now) part_stat_lock(); part = req->part; - update_io_ticks(part, jiffies); + update_io_ticks(part, jiffies, true); part_stat_inc(part, ios[sgrp]); part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); - part_stat_add(part, time_in_queue, nsecs_to_jiffies64(now - req->start_time_ns)); part_dec_in_flight(req->q, part, rq_data_dir(req)); hd_struct_put(part); @@ -1381,7 +1385,7 @@ void blk_account_io_start(struct request *rq, bool new_io) rq->part = part; } - update_io_ticks(part, jiffies); + update_io_ticks(part, jiffies, false); part_stat_unlock(); } @@ -1583,23 +1587,6 @@ void blk_rq_unprep_clone(struct request *rq) } EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); -/* - * Copy attributes of the original request to the clone request. - * The actual data parts (e.g. ->cmd, ->sense) are not copied. - */ -static void __blk_rq_prep_clone(struct request *dst, struct request *src) -{ - dst->__sector = blk_rq_pos(src); - dst->__data_len = blk_rq_bytes(src); - if (src->rq_flags & RQF_SPECIAL_PAYLOAD) { - dst->rq_flags |= RQF_SPECIAL_PAYLOAD; - dst->special_vec = src->special_vec; - } - dst->nr_phys_segments = src->nr_phys_segments; - dst->ioprio = src->ioprio; - dst->extra_len = src->extra_len; -} - /** * blk_rq_prep_clone - Helper function to setup clone request * @rq: the request to be setup @@ -1612,8 +1599,6 @@ static void __blk_rq_prep_clone(struct request *dst, struct request *src) * * Description: * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. - * The actual data parts of @rq_src (e.g. ->cmd, ->sense) - * are not copied, and copying such parts is the caller's responsibility. * Also, pages which the original bios are pointing to are not copied * and the cloned bios just point same pages. * So cloned bios must be completed before original bios, which means @@ -1644,7 +1629,16 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, rq->bio = rq->biotail = bio; } - __blk_rq_prep_clone(rq, rq_src); + /* Copy attributes of the original request to the clone request. */ + rq->__sector = blk_rq_pos(rq_src); + rq->__data_len = blk_rq_bytes(rq_src); + if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) { + rq->rq_flags |= RQF_SPECIAL_PAYLOAD; + rq->special_vec = rq_src->special_vec; + } + rq->nr_phys_segments = rq_src->nr_phys_segments; + rq->ioprio = rq_src->ioprio; + rq->extra_len = rq_src->extra_len; return 0; @@ -1663,12 +1657,6 @@ int kblockd_schedule_work(struct work_struct *work) } EXPORT_SYMBOL(kblockd_schedule_work); -int kblockd_schedule_work_on(int cpu, struct work_struct *work) -{ - return queue_work_on(cpu, kblockd_workqueue, work); -} -EXPORT_SYMBOL(kblockd_schedule_work_on); - int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { diff --git a/block/blk-flush.c b/block/blk-flush.c index 3f977c517960..c7f396e3d5e2 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -160,9 +160,6 @@ static void blk_account_io_flush(struct request *rq) * * CONTEXT: * spin_lock_irq(fq->mq_flush_lock) - * - * RETURNS: - * %true if requests were added to the dispatch queue, %false otherwise. */ static void blk_flush_complete_seq(struct request *rq, struct blk_flush_queue *fq, @@ -412,7 +409,7 @@ void blk_insert_flush(struct request *rq) */ if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { - blk_mq_request_bypass_insert(rq, false); + blk_mq_request_bypass_insert(rq, false, false); return; } @@ -457,15 +454,6 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, if (!q) return -ENXIO; - /* - * some block devices may not have their queue correctly set up here - * (e.g. loop device without a backing file) and so issuing a flush - * here will panic. Ensure there is a request function before issuing - * the flush. - */ - if (!q->make_request_fn) - return -ENXIO; - bio = bio_alloc(gfp_mask, 0); bio_set_dev(bio, bdev); bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; @@ -485,8 +473,8 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, } EXPORT_SYMBOL(blkdev_issue_flush); -struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, - int node, int cmd_size, gfp_t flags) +struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, + gfp_t flags) { struct blk_flush_queue *fq; int rq_sz = sizeof(struct request); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 5ed59ac6ae58..9df50fb507ca 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -84,6 +84,7 @@ static void ioc_destroy_icq(struct io_cq *icq) * making it impossible to determine icq_cache. Record it in @icq. */ icq->__rcu_icq_cache = et->icq_cache; + icq->flags |= ICQ_DESTROYED; call_rcu(&icq->__rcu_head, icq_free_icq_rcu); } @@ -212,15 +213,21 @@ static void __ioc_clear_queue(struct list_head *icq_list) { unsigned long flags; + rcu_read_lock(); while (!list_empty(icq_list)) { struct io_cq *icq = list_entry(icq_list->next, struct io_cq, q_node); struct io_context *ioc = icq->ioc; spin_lock_irqsave(&ioc->lock, flags); + if (icq->flags & ICQ_DESTROYED) { + spin_unlock_irqrestore(&ioc->lock, flags); + continue; + } ioc_destroy_icq(icq); spin_unlock_irqrestore(&ioc->lock, flags); } + rcu_read_unlock(); } /** diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 27ca68621137..db35ee682294 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -46,9 +46,6 @@ * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate * device-specific coefficients. * - * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate - * device-specific coefficients. - * * 2. Control Strategy * * The device virtual time (vtime) is used as the primary control metric. @@ -1318,7 +1315,7 @@ static bool iocg_is_idle(struct ioc_gq *iocg) return false; /* is something in flight? */ - if (atomic64_read(&iocg->done_vtime) < atomic64_read(&iocg->vtime)) + if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime)) return false; return true; diff --git a/block/blk-map.c b/block/blk-map.c index b0790268ed9d..b72c361911a4 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -11,6 +11,514 @@ #include "blk.h" +struct bio_map_data { + int is_our_pages; + struct iov_iter iter; + struct iovec iov[]; +}; + +static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data, + gfp_t gfp_mask) +{ + struct bio_map_data *bmd; + + if (data->nr_segs > UIO_MAXIOV) + return NULL; + + bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask); + if (!bmd) + return NULL; + memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs); + bmd->iter = *data; + bmd->iter.iov = bmd->iov; + return bmd; +} + +/** + * bio_copy_from_iter - copy all pages from iov_iter to bio + * @bio: The &struct bio which describes the I/O as destination + * @iter: iov_iter as source + * + * Copy all pages from iov_iter to bio. + * Returns 0 on success, or error on failure. + */ +static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter) +{ + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + ssize_t ret; + + ret = copy_page_from_iter(bvec->bv_page, + bvec->bv_offset, + bvec->bv_len, + iter); + + if (!iov_iter_count(iter)) + break; + + if (ret < bvec->bv_len) + return -EFAULT; + } + + return 0; +} + +/** + * bio_copy_to_iter - copy all pages from bio to iov_iter + * @bio: The &struct bio which describes the I/O as source + * @iter: iov_iter as destination + * + * Copy all pages from bio to iov_iter. + * Returns 0 on success, or error on failure. + */ +static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) +{ + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + ssize_t ret; + + ret = copy_page_to_iter(bvec->bv_page, + bvec->bv_offset, + bvec->bv_len, + &iter); + + if (!iov_iter_count(&iter)) + break; + + if (ret < bvec->bv_len) + return -EFAULT; + } + + return 0; +} + +/** + * bio_uncopy_user - finish previously mapped bio + * @bio: bio being terminated + * + * Free pages allocated from bio_copy_user_iov() and write back data + * to user space in case of a read. + */ +static int bio_uncopy_user(struct bio *bio) +{ + struct bio_map_data *bmd = bio->bi_private; + int ret = 0; + + if (!bio_flagged(bio, BIO_NULL_MAPPED)) { + /* + * if we're in a workqueue, the request is orphaned, so + * don't copy into a random user address space, just free + * and return -EINTR so user space doesn't expect any data. + */ + if (!current->mm) + ret = -EINTR; + else if (bio_data_dir(bio) == READ) + ret = bio_copy_to_iter(bio, bmd->iter); + if (bmd->is_our_pages) + bio_free_pages(bio); + } + kfree(bmd); + bio_put(bio); + return ret; +} + +/** + * bio_copy_user_iov - copy user data to bio + * @q: destination block queue + * @map_data: pointer to the rq_map_data holding pages (if necessary) + * @iter: iovec iterator + * @gfp_mask: memory allocation flags + * + * Prepares and returns a bio for indirect user io, bouncing data + * to/from kernel pages as necessary. Must be paired with + * call bio_uncopy_user() on io completion. + */ +static struct bio *bio_copy_user_iov(struct request_queue *q, + struct rq_map_data *map_data, struct iov_iter *iter, + gfp_t gfp_mask) +{ + struct bio_map_data *bmd; + struct page *page; + struct bio *bio; + int i = 0, ret; + int nr_pages; + unsigned int len = iter->count; + unsigned int offset = map_data ? offset_in_page(map_data->offset) : 0; + + bmd = bio_alloc_map_data(iter, gfp_mask); + if (!bmd) + return ERR_PTR(-ENOMEM); + + /* + * We need to do a deep copy of the iov_iter including the iovecs. + * The caller provided iov might point to an on-stack or otherwise + * shortlived one. + */ + bmd->is_our_pages = map_data ? 0 : 1; + + nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); + if (nr_pages > BIO_MAX_PAGES) + nr_pages = BIO_MAX_PAGES; + + ret = -ENOMEM; + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + goto out_bmd; + + ret = 0; + + if (map_data) { + nr_pages = 1 << map_data->page_order; + i = map_data->offset / PAGE_SIZE; + } + while (len) { + unsigned int bytes = PAGE_SIZE; + + bytes -= offset; + + if (bytes > len) + bytes = len; + + if (map_data) { + if (i == map_data->nr_entries * nr_pages) { + ret = -ENOMEM; + break; + } + + page = map_data->pages[i / nr_pages]; + page += (i % nr_pages); + + i++; + } else { + page = alloc_page(q->bounce_gfp | gfp_mask); + if (!page) { + ret = -ENOMEM; + break; + } + } + + if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { + if (!map_data) + __free_page(page); + break; + } + + len -= bytes; + offset = 0; + } + + if (ret) + goto cleanup; + + if (map_data) + map_data->offset += bio->bi_iter.bi_size; + + /* + * success + */ + if ((iov_iter_rw(iter) == WRITE && + (!map_data || !map_data->null_mapped)) || + (map_data && map_data->from_user)) { + ret = bio_copy_from_iter(bio, iter); + if (ret) + goto cleanup; + } else { + if (bmd->is_our_pages) + zero_fill_bio(bio); + iov_iter_advance(iter, bio->bi_iter.bi_size); + } + + bio->bi_private = bmd; + if (map_data && map_data->null_mapped) + bio_set_flag(bio, BIO_NULL_MAPPED); + return bio; +cleanup: + if (!map_data) + bio_free_pages(bio); + bio_put(bio); +out_bmd: + kfree(bmd); + return ERR_PTR(ret); +} + +/** + * bio_map_user_iov - map user iovec into bio + * @q: the struct request_queue for the bio + * @iter: iovec iterator + * @gfp_mask: memory allocation flags + * + * Map the user space address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +static struct bio *bio_map_user_iov(struct request_queue *q, + struct iov_iter *iter, gfp_t gfp_mask) +{ + int j; + struct bio *bio; + int ret; + + if (!iov_iter_count(iter)) + return ERR_PTR(-EINVAL); + + bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES)); + if (!bio) + return ERR_PTR(-ENOMEM); + + while (iov_iter_count(iter)) { + struct page **pages; + ssize_t bytes; + size_t offs, added = 0; + int npages; + + bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs); + if (unlikely(bytes <= 0)) { + ret = bytes ? bytes : -EFAULT; + goto out_unmap; + } + + npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE); + + if (unlikely(offs & queue_dma_alignment(q))) { + ret = -EINVAL; + j = 0; + } else { + for (j = 0; j < npages; j++) { + struct page *page = pages[j]; + unsigned int n = PAGE_SIZE - offs; + bool same_page = false; + + if (n > bytes) + n = bytes; + + if (!__bio_add_pc_page(q, bio, page, n, offs, + &same_page)) { + if (same_page) + put_page(page); + break; + } + + added += n; + bytes -= n; + offs = 0; + } + iov_iter_advance(iter, added); + } + /* + * release the pages we didn't map into the bio, if any + */ + while (j < npages) + put_page(pages[j++]); + kvfree(pages); + /* couldn't stuff something into bio? */ + if (bytes) + break; + } + + bio_set_flag(bio, BIO_USER_MAPPED); + + /* + * subtle -- if bio_map_user_iov() ended up bouncing a bio, + * it would normally disappear when its bi_end_io is run. + * however, we need it for the unmap, so grab an extra + * reference to it + */ + bio_get(bio); + return bio; + + out_unmap: + bio_release_pages(bio, false); + bio_put(bio); + return ERR_PTR(ret); +} + +/** + * bio_unmap_user - unmap a bio + * @bio: the bio being unmapped + * + * Unmap a bio previously mapped by bio_map_user_iov(). Must be called from + * process context. + * + * bio_unmap_user() may sleep. + */ +static void bio_unmap_user(struct bio *bio) +{ + bio_release_pages(bio, bio_data_dir(bio) == READ); + bio_put(bio); + bio_put(bio); +} + +static void bio_invalidate_vmalloc_pages(struct bio *bio) +{ +#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE + if (bio->bi_private && !op_is_write(bio_op(bio))) { + unsigned long i, len = 0; + + for (i = 0; i < bio->bi_vcnt; i++) + len += bio->bi_io_vec[i].bv_len; + invalidate_kernel_vmap_range(bio->bi_private, len); + } +#endif +} + +static void bio_map_kern_endio(struct bio *bio) +{ + bio_invalidate_vmalloc_pages(bio); + bio_put(bio); +} + +/** + * bio_map_kern - map kernel address into bio + * @q: the struct request_queue for the bio + * @data: pointer to buffer to map + * @len: length in bytes + * @gfp_mask: allocation flags for bio allocation + * + * Map the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +static struct bio *bio_map_kern(struct request_queue *q, void *data, + unsigned int len, gfp_t gfp_mask) +{ + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + const int nr_pages = end - start; + bool is_vmalloc = is_vmalloc_addr(data); + struct page *page; + int offset, i; + struct bio *bio; + + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); + + if (is_vmalloc) { + flush_kernel_vmap_range(data, len); + bio->bi_private = data; + } + + offset = offset_in_page(kaddr); + for (i = 0; i < nr_pages; i++) { + unsigned int bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + if (!is_vmalloc) + page = virt_to_page(data); + else + page = vmalloc_to_page(data); + if (bio_add_pc_page(q, bio, page, bytes, + offset) < bytes) { + /* we don't support partial mappings */ + bio_put(bio); + return ERR_PTR(-EINVAL); + } + + data += bytes; + len -= bytes; + offset = 0; + } + + bio->bi_end_io = bio_map_kern_endio; + return bio; +} + +static void bio_copy_kern_endio(struct bio *bio) +{ + bio_free_pages(bio); + bio_put(bio); +} + +static void bio_copy_kern_endio_read(struct bio *bio) +{ + char *p = bio->bi_private; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + memcpy(p, page_address(bvec->bv_page), bvec->bv_len); + p += bvec->bv_len; + } + + bio_copy_kern_endio(bio); +} + +/** + * bio_copy_kern - copy kernel address into bio + * @q: the struct request_queue for the bio + * @data: pointer to buffer to copy + * @len: length in bytes + * @gfp_mask: allocation flags for bio and page allocation + * @reading: data direction is READ + * + * copy the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +static struct bio *bio_copy_kern(struct request_queue *q, void *data, + unsigned int len, gfp_t gfp_mask, int reading) +{ + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + struct bio *bio; + void *p = data; + int nr_pages = 0; + + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + + nr_pages = end - start; + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); + + while (len) { + struct page *page; + unsigned int bytes = PAGE_SIZE; + + if (bytes > len) + bytes = len; + + page = alloc_page(q->bounce_gfp | gfp_mask); + if (!page) + goto cleanup; + + if (!reading) + memcpy(page_address(page), p, bytes); + + if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) + break; + + len -= bytes; + p += bytes; + } + + if (reading) { + bio->bi_end_io = bio_copy_kern_endio_read; + bio->bi_private = data; + } else { + bio->bi_end_io = bio_copy_kern_endio; + } + + return bio; + +cleanup: + bio_free_pages(bio); + bio_put(bio); + return ERR_PTR(-ENOMEM); +} + /* * Append a bio to a passthrough request. Only works if the bio can be merged * into the request based on the driver constraints. diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index ca22afd47b3d..74cedea56034 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -361,13 +361,19 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, bool has_sched, struct request *rq) { - /* dispatch flush rq directly */ - if (rq->rq_flags & RQF_FLUSH_SEQ) { - spin_lock(&hctx->lock); - list_add(&rq->queuelist, &hctx->dispatch); - spin_unlock(&hctx->lock); + /* + * dispatch flush and passthrough rq directly + * + * passthrough request has to be added to hctx->dispatch directly. + * For some reason, device may be in one situation which can't + * handle FS request, so STS_RESOURCE is always returned and the + * FS request will be added to hctx->dispatch. However passthrough + * request may be required at that time for fixing the problem. If + * passthrough request is added to scheduler queue, there isn't any + * chance to dispatch it given we prioritize requests in hctx->dispatch. + */ + if ((rq->rq_flags & RQF_FLUSH_SEQ) || blk_rq_is_passthrough(rq)) return true; - } if (has_sched) rq->rq_flags |= RQF_SORTED; @@ -391,8 +397,32 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, WARN_ON(e && (rq->tag != -1)); - if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) + if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) { + /* + * Firstly normal IO request is inserted to scheduler queue or + * sw queue, meantime we add flush request to dispatch queue( + * hctx->dispatch) directly and there is at most one in-flight + * flush request for each hw queue, so it doesn't matter to add + * flush request to tail or front of the dispatch queue. + * + * Secondly in case of NCQ, flush request belongs to non-NCQ + * command, and queueing it will fail when there is any + * in-flight normal IO request(NCQ command). When adding flush + * rq to the front of hctx->dispatch, it is easier to introduce + * extra time to flush rq's latency because of S_SCHED_RESTART + * compared with adding to the tail of dispatch queue, then + * chance of flush merge is increased, and less flush requests + * will be issued to controller. It is observed that ~10% time + * is saved in blktests block/004 on disk attached to AHCI/NCQ + * drive when adding flush rq to the front of hctx->dispatch. + * + * Simply queue flush rq to the front of hctx->dispatch so that + * intensive flush workloads can benefit in case of NCQ HW. + */ + at_head = (rq->rq_flags & RQF_FLUSH_SEQ) ? true : at_head; + blk_mq_request_bypass_insert(rq, at_head, false); goto run; + } if (e && e->type->ops.insert_requests) { LIST_HEAD(list); diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index fbacde454718..586c9d6e904a 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -183,8 +183,8 @@ found_tag: return tag + tag_offset; } -void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags, - struct blk_mq_ctx *ctx, unsigned int tag) +void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, + unsigned int tag) { if (!blk_mq_tag_is_reserved(tags, tag)) { const int real_tag = tag - tags->nr_reserved_tags; diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 15bc74acb57e..2b8321efb682 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -26,8 +26,8 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int r extern void blk_mq_free_tags(struct blk_mq_tags *tags); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); -extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags, - struct blk_mq_ctx *ctx, unsigned int tag); +extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, + unsigned int tag); extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags **tags, unsigned int depth, bool can_grow); diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c index 488341628256..7b8a42c35102 100644 --- a/block/blk-mq-virtio.c +++ b/block/blk-mq-virtio.c @@ -16,7 +16,7 @@ * @first_vec: first interrupt vectors to use for queues (usually 0) * * This function assumes the virtio device @vdev has at least as many available - * interrupt vetors as @set has queues. It will then queuery the vector + * interrupt vectors as @set has queues. It will then query the vector * corresponding to each queue for it's affinity mask and built queue mapping * that maps a queue to the CPUs that have irq affinity for the corresponding * vector. diff --git a/block/blk-mq.c b/block/blk-mq.c index a12b1763508d..f6291ceedee4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -477,9 +477,9 @@ static void __blk_mq_free_request(struct request *rq) blk_pm_mark_last_busy(rq); rq->mq_hctx = NULL; if (rq->tag != -1) - blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); + blk_mq_put_tag(hctx->tags, ctx, rq->tag); if (sched_tag != -1) - blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag); + blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag); blk_mq_sched_restart(hctx); blk_queue_exit(q); } @@ -735,7 +735,7 @@ static void blk_mq_requeue_work(struct work_struct *work) * merge. */ if (rq->rq_flags & RQF_DONTPREP) - blk_mq_request_bypass_insert(rq, false); + blk_mq_request_bypass_insert(rq, false, false); else blk_mq_sched_insert_request(rq, true, false, false); } @@ -1178,6 +1178,23 @@ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) #define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ +static void blk_mq_handle_dev_resource(struct request *rq, + struct list_head *list) +{ + struct request *next = + list_first_entry_or_null(list, struct request, queuelist); + + /* + * If an I/O scheduler has been configured and we got a driver tag for + * the next request already, free it. + */ + if (next) + blk_mq_put_driver_tag(next); + + list_add(&rq->queuelist, list); + __blk_mq_requeue_request(rq); +} + /* * Returns true if we did some work AND can potentially do more. */ @@ -1245,17 +1262,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, ret = q->mq_ops->queue_rq(hctx, &bd); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { - /* - * If an I/O scheduler has been configured and we got a - * driver tag for the next request already, free it - * again. - */ - if (!list_empty(list)) { - nxt = list_first_entry(list, struct request, queuelist); - blk_mq_put_driver_tag(nxt); - } - list_add(&rq->queuelist, list); - __blk_mq_requeue_request(rq); + blk_mq_handle_dev_resource(rq, list); break; } @@ -1286,7 +1293,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, q->mq_ops->commit_rqs(hctx); spin_lock(&hctx->lock); - list_splice_init(list, &hctx->dispatch); + list_splice_tail_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); /* @@ -1677,12 +1684,16 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, * Should only be used carefully, when the caller knows we want to * bypass a potential IO scheduler on the target device. */ -void blk_mq_request_bypass_insert(struct request *rq, bool run_queue) +void blk_mq_request_bypass_insert(struct request *rq, bool at_head, + bool run_queue) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; spin_lock(&hctx->lock); - list_add_tail(&rq->queuelist, &hctx->dispatch); + if (at_head) + list_add(&rq->queuelist, &hctx->dispatch); + else + list_add_tail(&rq->queuelist, &hctx->dispatch); spin_unlock(&hctx->lock); if (run_queue) @@ -1849,7 +1860,7 @@ insert: if (bypass_insert) return BLK_STS_RESOURCE; - blk_mq_request_bypass_insert(rq, run_queue); + blk_mq_request_bypass_insert(rq, false, run_queue); return BLK_STS_OK; } @@ -1876,7 +1887,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) - blk_mq_request_bypass_insert(rq, true); + blk_mq_request_bypass_insert(rq, false, true); else if (ret != BLK_STS_OK) blk_mq_end_request(rq, ret); @@ -1910,7 +1921,7 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, if (ret != BLK_STS_OK) { if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { - blk_mq_request_bypass_insert(rq, + blk_mq_request_bypass_insert(rq, false, list_empty(list)); break; } @@ -2405,8 +2416,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); INIT_LIST_HEAD(&hctx->dispatch_wait.entry); - hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size, - gfp); + hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp); if (!hctx->fq) goto free_bitmap; @@ -2714,13 +2724,15 @@ void blk_mq_release(struct request_queue *q) blk_mq_sysfs_deinit(q); } -struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) +struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, + void *queuedata) { struct request_queue *uninit_q, *q; - uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); + uninit_q = __blk_alloc_queue(set->numa_node); if (!uninit_q) return ERR_PTR(-ENOMEM); + uninit_q->queuedata = queuedata; /* * Initialize the queue without an elevator. device_add_disk() will do @@ -2732,6 +2744,12 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) return q; } +EXPORT_SYMBOL_GPL(blk_mq_init_queue_data); + +struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) +{ + return blk_mq_init_queue_data(set, NULL); +} EXPORT_SYMBOL(blk_mq_init_queue); /* @@ -2820,7 +2838,6 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, memcpy(new_hctxs, hctxs, q->nr_hw_queues * sizeof(*hctxs)); q->queue_hw_ctx = new_hctxs; - q->nr_hw_queues = set->nr_hw_queues; kfree(hctxs); hctxs = new_hctxs; } @@ -2922,11 +2939,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); - blk_queue_make_request(q, blk_mq_make_request); - - /* - * Do this after blk_queue_make_request() overrides it... - */ + q->make_request_fn = blk_mq_make_request; q->nr_requests = set->queue_depth; /* @@ -3019,6 +3032,14 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) { + /* + * blk_mq_map_queues() and multiple .map_queues() implementations + * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the + * number of hardware queues. + */ + if (set->nr_maps == 1) + set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues; + if (set->ops->map_queues && !is_kdump_kernel()) { int i; @@ -3398,7 +3419,6 @@ static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb) } static unsigned long blk_mq_poll_nsecs(struct request_queue *q, - struct blk_mq_hw_ctx *hctx, struct request *rq) { unsigned long ret = 0; @@ -3431,7 +3451,6 @@ static unsigned long blk_mq_poll_nsecs(struct request_queue *q, } static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, - struct blk_mq_hw_ctx *hctx, struct request *rq) { struct hrtimer_sleeper hs; @@ -3451,7 +3470,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, if (q->poll_nsec > 0) nsecs = q->poll_nsec; else - nsecs = blk_mq_poll_nsecs(q, hctx, rq); + nsecs = blk_mq_poll_nsecs(q, rq); if (!nsecs) return false; @@ -3506,7 +3525,7 @@ static bool blk_mq_poll_hybrid(struct request_queue *q, return false; } - return blk_mq_poll_hybrid_sleep(q, hctx, rq); + return blk_mq_poll_hybrid_sleep(q, rq); } /** diff --git a/block/blk-mq.h b/block/blk-mq.h index eaaca8fc1c28..10bfdfb494fa 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -66,7 +66,8 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, */ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head); -void blk_mq_request_bypass_insert(struct request *rq, bool run_queue); +void blk_mq_request_bypass_insert(struct request *rq, bool at_head, + bool run_queue); void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list); @@ -199,7 +200,7 @@ static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx) static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq) { - blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag); + blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag); rq->tag = -1; if (rq->rq_flags & RQF_MQ_INFLIGHT) { diff --git a/block/blk-settings.c b/block/blk-settings.c index c8eda2e7b91e..14397b4c4b53 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -87,42 +87,6 @@ void blk_set_stacking_limits(struct queue_limits *lim) EXPORT_SYMBOL(blk_set_stacking_limits); /** - * blk_queue_make_request - define an alternate make_request function for a device - * @q: the request queue for the device to be affected - * @mfn: the alternate make_request function - * - * Description: - * The normal way for &struct bios to be passed to a device - * driver is for them to be collected into requests on a request - * queue, and then to allow the device driver to select requests - * off that queue when it is ready. This works well for many block - * devices. However some block devices (typically virtual devices - * such as md or lvm) do not benefit from the processing on the - * request queue, and are served best by having the requests passed - * directly to them. This can be achieved by providing a function - * to blk_queue_make_request(). - * - * Caveat: - * The driver that does this *must* be able to deal appropriately - * with buffers in "highmemory". This can be accomplished by either calling - * kmap_atomic() to get a temporary kernel mapping, or by calling - * blk_queue_bounce() to create a buffer in normal memory. - **/ -void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) -{ - /* - * set defaults - */ - q->nr_requests = BLKDEV_MAX_RQ; - - q->make_request_fn = mfn; - blk_queue_dma_alignment(q, 511); - - blk_set_default_limits(&q->limits); -} -EXPORT_SYMBOL(blk_queue_make_request); - -/** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device * @max_addr: the maximum address the device can handle @@ -664,6 +628,9 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n", top, bottom); } + + t->backing_dev_info->io_pages = + t->limits.max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(disk_stack_limits); diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 05741c6f618b..f87956e0dcaf 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -20,6 +20,38 @@ #include "blk.h" +#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name +static const char *const zone_cond_name[] = { + ZONE_COND_NAME(NOT_WP), + ZONE_COND_NAME(EMPTY), + ZONE_COND_NAME(IMP_OPEN), + ZONE_COND_NAME(EXP_OPEN), + ZONE_COND_NAME(CLOSED), + ZONE_COND_NAME(READONLY), + ZONE_COND_NAME(FULL), + ZONE_COND_NAME(OFFLINE), +}; +#undef ZONE_COND_NAME + +/** + * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX. + * @zone_cond: BLK_ZONE_COND_XXX. + * + * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX + * into string format. Useful in the debugging and tracing zone conditions. For + * invalid BLK_ZONE_COND_XXX it returns string "UNKNOWN". + */ +const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) +{ + static const char *zone_cond_str = "UNKNOWN"; + + if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond]) + zone_cond_str = zone_cond_name[zone_cond]; + + return zone_cond_str; +} +EXPORT_SYMBOL_GPL(blk_zone_cond_str); + static inline sector_t blk_zone_start(struct request_queue *q, sector_t sector) { @@ -173,7 +205,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, if (!op_is_zone_mgmt(op)) return -EOPNOTSUPP; - if (!nr_sectors || end_sector > capacity) + if (end_sector <= sector || end_sector > capacity) /* Out of range */ return -EINVAL; diff --git a/block/blk.h b/block/blk.h index 0b8884353f6b..0a94ec68af32 100644 --- a/block/blk.h +++ b/block/blk.h @@ -4,6 +4,7 @@ #include <linux/idr.h> #include <linux/blk-mq.h> +#include <linux/part_stat.h> #include <xen/xen.h> #include "blk-mq.h" #include "blk-mq-sched.h" @@ -55,8 +56,8 @@ is_flush_rq(struct request *req, struct blk_mq_hw_ctx *hctx) return hctx->fq->flush_rq == req; } -struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, - int node, int cmd_size, gfp_t flags); +struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, + gfp_t flags); void blk_free_flush_queue(struct blk_flush_queue *q); void blk_freeze_queue(struct request_queue *q); @@ -149,6 +150,9 @@ static inline bool integrity_req_gap_front_merge(struct request *req, return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1], bip_next->bip_vec[0].bv_offset); } + +void blk_integrity_add(struct gendisk *); +void blk_integrity_del(struct gendisk *); #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline bool integrity_req_gap_back_merge(struct request *req, struct bio *next) @@ -171,6 +175,12 @@ static inline bool bio_integrity_endio(struct bio *bio) static inline void bio_integrity_free(struct bio *bio) { } +static inline void blk_integrity_add(struct gendisk *disk) +{ +} +static inline void blk_integrity_del(struct gendisk *disk) +{ +} #endif /* CONFIG_BLK_DEV_INTEGRITY */ unsigned long blk_rq_timeout(unsigned long timeout); @@ -214,6 +224,17 @@ static inline void elevator_exit(struct request_queue *q, struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); +ssize_t part_size_show(struct device *dev, struct device_attribute *attr, + char *buf); +ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, + char *buf); +ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, + char *buf); +ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, + char *buf); +ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count); + #ifdef CONFIG_FAIL_IO_TIMEOUT int blk_should_fake_timeout(struct request_queue *); ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); @@ -354,4 +375,117 @@ void blk_queue_free_zone_bitmaps(struct request_queue *q); static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {} #endif +void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, + int rw); +void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, + int rw); +void update_io_ticks(struct hd_struct *part, unsigned long now, bool end); +struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); + +int blk_alloc_devt(struct hd_struct *part, dev_t *devt); +void blk_free_devt(dev_t devt); +void blk_invalidate_devt(dev_t devt); +char *disk_name(struct gendisk *hd, int partno, char *buf); +#define ADDPART_FLAG_NONE 0 +#define ADDPART_FLAG_RAID 1 +#define ADDPART_FLAG_WHOLEDISK 2 +struct hd_struct *__must_check add_partition(struct gendisk *disk, int partno, + sector_t start, sector_t len, int flags, + struct partition_meta_info *info); +void __delete_partition(struct percpu_ref *ref); +void delete_partition(struct gendisk *disk, int partno); +int disk_expand_part_tbl(struct gendisk *disk, int target); + +static inline int hd_ref_init(struct hd_struct *part) +{ + if (percpu_ref_init(&part->ref, __delete_partition, 0, + GFP_KERNEL)) + return -ENOMEM; + return 0; +} + +static inline void hd_struct_get(struct hd_struct *part) +{ + percpu_ref_get(&part->ref); +} + +static inline int hd_struct_try_get(struct hd_struct *part) +{ + return percpu_ref_tryget_live(&part->ref); +} + +static inline void hd_struct_put(struct hd_struct *part) +{ + percpu_ref_put(&part->ref); +} + +static inline void hd_struct_kill(struct hd_struct *part) +{ + percpu_ref_kill(&part->ref); +} + +static inline void hd_free_part(struct hd_struct *part) +{ + free_part_stats(part); + kfree(part->info); + percpu_ref_exit(&part->ref); +} + +/* + * Any access of part->nr_sects which is not protected by partition + * bd_mutex or gendisk bdev bd_mutex, should be done using this + * accessor function. + * + * Code written along the lines of i_size_read() and i_size_write(). + * CONFIG_PREEMPTION case optimizes the case of UP kernel with preemption + * on. + */ +static inline sector_t part_nr_sects_read(struct hd_struct *part) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + sector_t nr_sects; + unsigned seq; + do { + seq = read_seqcount_begin(&part->nr_sects_seq); + nr_sects = part->nr_sects; + } while (read_seqcount_retry(&part->nr_sects_seq, seq)); + return nr_sects; +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + sector_t nr_sects; + + preempt_disable(); + nr_sects = part->nr_sects; + preempt_enable(); + return nr_sects; +#else + return part->nr_sects; +#endif +} + +/* + * Should be called with mutex lock held (typically bd_mutex) of partition + * to provide mutual exlusion among writers otherwise seqcount might be + * left in wrong state leaving the readers spinning infinitely. + */ +static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_begin(&part->nr_sects_seq); + part->nr_sects = size; + write_seqcount_end(&part->nr_sects_seq); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + part->nr_sects = size; + preempt_enable(); +#else + part->nr_sects = size; +#endif +} + +struct request_queue *__blk_alloc_queue(int node_id); + +int __bio_add_pc_page(struct request_queue *q, struct bio *bio, + struct page *page, unsigned int len, unsigned int offset, + bool *same_page); + #endif /* BLK_INTERNAL_H */ diff --git a/block/genhd.c b/block/genhd.c index ff6268970ddc..06b642b23a07 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -4,6 +4,7 @@ */ #include <linux/module.h> +#include <linux/ctype.h> #include <linux/fs.h> #include <linux/genhd.h> #include <linux/kdev_t.h> @@ -26,7 +27,7 @@ #include "blk.h" static DEFINE_MUTEX(block_class_lock); -struct kobject *block_depr; +static struct kobject *block_depr; /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) @@ -46,6 +47,78 @@ static void disk_add_events(struct gendisk *disk); static void disk_del_events(struct gendisk *disk); static void disk_release_events(struct gendisk *disk); +/* + * Set disk capacity and notify if the size is not currently + * zero and will not be set to zero + */ +void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, + bool revalidate) +{ + sector_t capacity = get_capacity(disk); + + set_capacity(disk, size); + + if (revalidate) + revalidate_disk(disk); + + if (capacity != size && capacity != 0 && size != 0) { + char *envp[] = { "RESIZE=1", NULL }; + + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); + } +} + +EXPORT_SYMBOL_GPL(set_capacity_revalidate_and_notify); + +/* + * Format the device name of the indicated disk into the supplied buffer and + * return a pointer to that same buffer for convenience. + */ +char *disk_name(struct gendisk *hd, int partno, char *buf) +{ + if (!partno) + snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); + else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) + snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno); + else + snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno); + + return buf; +} + +const char *bdevname(struct block_device *bdev, char *buf) +{ + return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf); +} +EXPORT_SYMBOL(bdevname); + +#ifdef CONFIG_SMP +static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) +{ + int cpu; + + memset(stat, 0, sizeof(struct disk_stats)); + for_each_possible_cpu(cpu) { + struct disk_stats *ptr = per_cpu_ptr(part->dkstats, cpu); + int group; + + for (group = 0; group < NR_STAT_GROUPS; group++) { + stat->nsecs[group] += ptr->nsecs[group]; + stat->sectors[group] += ptr->sectors[group]; + stat->ios[group] += ptr->ios[group]; + stat->merges[group] += ptr->merges[group]; + } + + stat->io_ticks += ptr->io_ticks; + } +} +#else /* CONFIG_SMP */ +static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) +{ + memcpy(stat, &part->dkstats, sizeof(struct disk_stats)); +} +#endif /* CONFIG_SMP */ + void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) { if (queue_is_mq(q)) @@ -66,7 +139,8 @@ void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw) part_stat_local_dec(&part_to_disk(part)->part0, in_flight[rw]); } -unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part) +static unsigned int part_in_flight(struct request_queue *q, + struct hd_struct *part) { int cpu; unsigned int inflight; @@ -86,8 +160,8 @@ unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part) return inflight; } -void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]) +static void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]) { int cpu; @@ -143,7 +217,6 @@ struct hd_struct *disk_get_part(struct gendisk *disk, int partno) return part; } -EXPORT_SYMBOL_GPL(disk_get_part); /** * disk_part_iter_init - initialize partition iterator @@ -299,7 +372,42 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) } return &disk->part0; } -EXPORT_SYMBOL_GPL(disk_map_sector_rcu); + +/** + * disk_has_partitions + * @disk: gendisk of interest + * + * Walk through the partition table and check if valid partition exists. + * + * CONTEXT: + * Don't care. + * + * RETURNS: + * True if the gendisk has at least one valid non-zero size partition. + * Otherwise false. + */ +bool disk_has_partitions(struct gendisk *disk) +{ + struct disk_part_tbl *ptbl; + int i; + bool ret = false; + + rcu_read_lock(); + ptbl = rcu_dereference(disk->part_tbl); + + /* Iterate partitions skipping the whole device at index 0 */ + for (i = 1; i < ptbl->len; i++) { + if (rcu_dereference(ptbl->part[i])) { + ret = true; + break; + } + } + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL_GPL(disk_has_partitions); /* * Can be deleted altogether. Later. @@ -908,7 +1016,6 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) } return disk; } -EXPORT_SYMBOL(get_gendisk); /** * bdget_disk - do bdget() by gendisk and partition number @@ -1154,6 +1261,67 @@ static ssize_t disk_ro_show(struct device *dev, return sprintf(buf, "%d\n", get_disk_ro(disk) ? 1 : 0); } +ssize_t part_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%llu\n", + (unsigned long long)part_nr_sects_read(p)); +} + +ssize_t part_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct request_queue *q = part_to_disk(p)->queue; + struct disk_stats stat; + unsigned int inflight; + + part_stat_read_all(p, &stat); + inflight = part_in_flight(q, p); + + return sprintf(buf, + "%8lu %8lu %8llu %8u " + "%8lu %8lu %8llu %8u " + "%8u %8u %8u " + "%8lu %8lu %8llu %8u " + "%8lu %8u" + "\n", + stat.ios[STAT_READ], + stat.merges[STAT_READ], + (unsigned long long)stat.sectors[STAT_READ], + (unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC), + stat.ios[STAT_WRITE], + stat.merges[STAT_WRITE], + (unsigned long long)stat.sectors[STAT_WRITE], + (unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC), + inflight, + jiffies_to_msecs(stat.io_ticks), + (unsigned int)div_u64(stat.nsecs[STAT_READ] + + stat.nsecs[STAT_WRITE] + + stat.nsecs[STAT_DISCARD] + + stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC), + stat.ios[STAT_DISCARD], + stat.merges[STAT_DISCARD], + (unsigned long long)stat.sectors[STAT_DISCARD], + (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC), + stat.ios[STAT_FLUSH], + (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC)); +} + +ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct request_queue *q = part_to_disk(p)->queue; + unsigned int inflight[2]; + + part_in_flight_rw(q, p, inflight); + return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); +} + static ssize_t disk_capability_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1192,10 +1360,33 @@ static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); + #ifdef CONFIG_FAIL_MAKE_REQUEST +ssize_t part_fail_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%d\n", p->make_it_fail); +} + +ssize_t part_fail_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct hd_struct *p = dev_to_part(dev); + int i; + + if (count > 0 && sscanf(buf, "%d", &i) > 0) + p->make_it_fail = (i == 0) ? 0 : 1; + + return count; +} + static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); -#endif +#endif /* CONFIG_FAIL_MAKE_REQUEST */ + #ifdef CONFIG_FAIL_IO_TIMEOUT static struct device_attribute dev_attr_fail_timeout = __ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store); @@ -1342,8 +1533,8 @@ static char *block_devnode(struct device *dev, umode_t *mode, { struct gendisk *disk = dev_to_disk(dev); - if (disk->devnode) - return disk->devnode(disk, mode); + if (disk->fops->devnode) + return disk->fops->devnode(disk, mode); return NULL; } @@ -1369,6 +1560,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) struct hd_struct *hd; char buf[BDEVNAME_SIZE]; unsigned int inflight; + struct disk_stats stat; /* if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) @@ -1380,7 +1572,9 @@ static int diskstats_show(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { + part_stat_read_all(hd, &stat); inflight = part_in_flight(gp->queue, hd); + seq_printf(seqf, "%4d %7d %s " "%lu %lu %lu %u " "%lu %lu %lu %u " @@ -1390,23 +1584,31 @@ static int diskstats_show(struct seq_file *seqf, void *v) "\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), disk_name(gp, hd->partno, buf), - part_stat_read(hd, ios[STAT_READ]), - part_stat_read(hd, merges[STAT_READ]), - part_stat_read(hd, sectors[STAT_READ]), - (unsigned int)part_stat_read_msecs(hd, STAT_READ), - part_stat_read(hd, ios[STAT_WRITE]), - part_stat_read(hd, merges[STAT_WRITE]), - part_stat_read(hd, sectors[STAT_WRITE]), - (unsigned int)part_stat_read_msecs(hd, STAT_WRITE), + stat.ios[STAT_READ], + stat.merges[STAT_READ], + stat.sectors[STAT_READ], + (unsigned int)div_u64(stat.nsecs[STAT_READ], + NSEC_PER_MSEC), + stat.ios[STAT_WRITE], + stat.merges[STAT_WRITE], + stat.sectors[STAT_WRITE], + (unsigned int)div_u64(stat.nsecs[STAT_WRITE], + NSEC_PER_MSEC), inflight, - jiffies_to_msecs(part_stat_read(hd, io_ticks)), - jiffies_to_msecs(part_stat_read(hd, time_in_queue)), - part_stat_read(hd, ios[STAT_DISCARD]), - part_stat_read(hd, merges[STAT_DISCARD]), - part_stat_read(hd, sectors[STAT_DISCARD]), - (unsigned int)part_stat_read_msecs(hd, STAT_DISCARD), - part_stat_read(hd, ios[STAT_FLUSH]), - (unsigned int)part_stat_read_msecs(hd, STAT_FLUSH) + jiffies_to_msecs(stat.io_ticks), + (unsigned int)div_u64(stat.nsecs[STAT_READ] + + stat.nsecs[STAT_WRITE] + + stat.nsecs[STAT_DISCARD] + + stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC), + stat.ios[STAT_DISCARD], + stat.merges[STAT_DISCARD], + stat.sectors[STAT_DISCARD], + (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], + NSEC_PER_MSEC), + stat.ios[STAT_FLUSH], + (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC) ); } disk_part_iter_exit(&piter); @@ -1463,7 +1665,6 @@ dev_t blk_lookup_devt(const char *name, int partno) class_dev_iter_exit(&iter); return devt; } -EXPORT_SYMBOL(blk_lookup_devt); struct gendisk *__alloc_disk_node(int minors, int node_id) { diff --git a/block/ioctl.c b/block/ioctl.c index 127194b9f9bd..6e827de1a4c4 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -11,6 +11,7 @@ #include <linux/blktrace_api.h> #include <linux/pr.h> #include <linux/uaccess.h> +#include "blk.h" static int blkpg_do_ioctl(struct block_device *bdev, struct blkpg_partition __user *upart, int op) diff --git a/block/opal_proto.h b/block/opal_proto.h index 325cbba2465f..b486b3ec7dc4 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -36,6 +36,7 @@ enum opal_response_token { #define DTAERROR_NO_METHOD_STATUS 0x89 #define GENERIC_HOST_SESSION_NUM 0x41 +#define FIRST_TPER_SESSION_NUM 4096 #define TPER_SYNC_SUPPORTED 0x01 #define MBR_ENABLED_MASK 0x10 diff --git a/block/partitions/Makefile b/block/partitions/Makefile index 2f276b677c81..a7f05cdb02a8 100644 --- a/block/partitions/Makefile +++ b/block/partitions/Makefile @@ -3,8 +3,7 @@ # Makefile for the linux kernel. # -obj-$(CONFIG_BLOCK) := check.o - +obj-$(CONFIG_BLOCK) += core.o obj-$(CONFIG_ACORN_PARTITION) += acorn.o obj-$(CONFIG_AMIGA_PARTITION) += amiga.o obj-$(CONFIG_ATARI_PARTITION) += atari.o diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c index 7587700fad4a..c64c57b958bf 100644 --- a/block/partitions/acorn.c +++ b/block/partitions/acorn.c @@ -11,7 +11,6 @@ #include <linux/adfs_fs.h> #include "check.h" -#include "acorn.h" /* * Partition types. (Oh for reusability) diff --git a/block/partitions/acorn.h b/block/partitions/acorn.h deleted file mode 100644 index 67b06601ca4c..000000000000 --- a/block/partitions/acorn.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/fs/partitions/acorn.h - * - * Copyright (C) 1996-2001 Russell King. - * - * I _hate_ this partitioning mess - why can't we have one defined - * format, and everyone stick to it? - */ - -int adfspart_check_CUMANA(struct parsed_partitions *state); -int adfspart_check_ADFS(struct parsed_partitions *state); -int adfspart_check_ICS(struct parsed_partitions *state); -int adfspart_check_POWERTEC(struct parsed_partitions *state); -int adfspart_check_EESOX(struct parsed_partitions *state); diff --git a/block/partitions/aix.c b/block/partitions/aix.c index 903f3ed175d0..c7b4fd1a4a97 100644 --- a/block/partitions/aix.c +++ b/block/partitions/aix.c @@ -6,7 +6,6 @@ */ #include "check.h" -#include "aix.h" struct lvm_rec { char lvm_id[4]; /* "_LVM" */ diff --git a/block/partitions/aix.h b/block/partitions/aix.h deleted file mode 100644 index b4449f0b9f2b..000000000000 --- a/block/partitions/aix.h +++ /dev/null @@ -1,2 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -extern int aix_partition(struct parsed_partitions *state); diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c index 560936617d9c..9526491d9aed 100644 --- a/block/partitions/amiga.c +++ b/block/partitions/amiga.c @@ -14,7 +14,6 @@ #include <linux/affs_hardblocks.h> #include "check.h" -#include "amiga.h" static __inline__ u32 checksum_block(__be32 *m, int size) @@ -42,9 +41,8 @@ int amiga_partition(struct parsed_partitions *state) goto rdb_done; data = read_part_sector(state, blk, §); if (!data) { - if (warn_no_part) - pr_err("Dev %s: unable to read RDB block %d\n", - bdevname(state->bdev, b), blk); + pr_err("Dev %s: unable to read RDB block %d\n", + bdevname(state->bdev, b), blk); res = -1; goto rdb_done; } @@ -85,9 +83,8 @@ int amiga_partition(struct parsed_partitions *state) blk *= blksize; /* Read in terms partition table understands */ data = read_part_sector(state, blk, §); if (!data) { - if (warn_no_part) - pr_err("Dev %s: unable to read partition block %d\n", - bdevname(state->bdev, b), blk); + pr_err("Dev %s: unable to read partition block %d\n", + bdevname(state->bdev, b), blk); res = -1; goto rdb_done; } diff --git a/block/partitions/amiga.h b/block/partitions/amiga.h deleted file mode 100644 index 7e63f4d9d969..000000000000 --- a/block/partitions/amiga.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * fs/partitions/amiga.h - */ - -int amiga_partition(struct parsed_partitions *state); - diff --git a/block/partitions/atari.h b/block/partitions/atari.h index 01c2b9457394..678202442fd3 100644 --- a/block/partitions/atari.h +++ b/block/partitions/atari.h @@ -34,4 +34,3 @@ struct rootsector u16 checksum; /* checksum for bootable disks */ } __packed; -int atari_partition(struct parsed_partitions *state); diff --git a/block/partitions/check.c b/block/partitions/check.c deleted file mode 100644 index ffe408fead0c..000000000000 --- a/block/partitions/check.c +++ /dev/null @@ -1,198 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * fs/partitions/check.c - * - * Code extracted from drivers/block/genhd.c - * Copyright (C) 1991-1998 Linus Torvalds - * Re-organised Feb 1998 Russell King - * - * We now have independent partition support from the - * block drivers, which allows all the partition code to - * be grouped in one location, and it to be mostly self - * contained. - * - * Added needed MAJORS for new pairs, {hdi,hdj}, {hdk,hdl} - */ - -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/ctype.h> -#include <linux/genhd.h> - -#include "check.h" - -#include "acorn.h" -#include "amiga.h" -#include "atari.h" -#include "ldm.h" -#include "mac.h" -#include "msdos.h" -#include "osf.h" -#include "sgi.h" -#include "sun.h" -#include "ibm.h" -#include "ultrix.h" -#include "efi.h" -#include "karma.h" -#include "sysv68.h" -#include "cmdline.h" - -int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ - -static int (*check_part[])(struct parsed_partitions *) = { - /* - * Probe partition formats with tables at disk address 0 - * that also have an ADFS boot block at 0xdc0. - */ -#ifdef CONFIG_ACORN_PARTITION_ICS - adfspart_check_ICS, -#endif -#ifdef CONFIG_ACORN_PARTITION_POWERTEC - adfspart_check_POWERTEC, -#endif -#ifdef CONFIG_ACORN_PARTITION_EESOX - adfspart_check_EESOX, -#endif - - /* - * Now move on to formats that only have partition info at - * disk address 0xdc0. Since these may also have stale - * PC/BIOS partition tables, they need to come before - * the msdos entry. - */ -#ifdef CONFIG_ACORN_PARTITION_CUMANA - adfspart_check_CUMANA, -#endif -#ifdef CONFIG_ACORN_PARTITION_ADFS - adfspart_check_ADFS, -#endif - -#ifdef CONFIG_CMDLINE_PARTITION - cmdline_partition, -#endif -#ifdef CONFIG_EFI_PARTITION - efi_partition, /* this must come before msdos */ -#endif -#ifdef CONFIG_SGI_PARTITION - sgi_partition, -#endif -#ifdef CONFIG_LDM_PARTITION - ldm_partition, /* this must come before msdos */ -#endif -#ifdef CONFIG_MSDOS_PARTITION - msdos_partition, -#endif -#ifdef CONFIG_OSF_PARTITION - osf_partition, -#endif -#ifdef CONFIG_SUN_PARTITION - sun_partition, -#endif -#ifdef CONFIG_AMIGA_PARTITION - amiga_partition, -#endif -#ifdef CONFIG_ATARI_PARTITION - atari_partition, -#endif -#ifdef CONFIG_MAC_PARTITION - mac_partition, -#endif -#ifdef CONFIG_ULTRIX_PARTITION - ultrix_partition, -#endif -#ifdef CONFIG_IBM_PARTITION - ibm_partition, -#endif -#ifdef CONFIG_KARMA_PARTITION - karma_partition, -#endif -#ifdef CONFIG_SYSV68_PARTITION - sysv68_partition, -#endif - NULL -}; - -static struct parsed_partitions *allocate_partitions(struct gendisk *hd) -{ - struct parsed_partitions *state; - int nr; - - state = kzalloc(sizeof(*state), GFP_KERNEL); - if (!state) - return NULL; - - nr = disk_max_parts(hd); - state->parts = vzalloc(array_size(nr, sizeof(state->parts[0]))); - if (!state->parts) { - kfree(state); - return NULL; - } - - state->limit = nr; - - return state; -} - -void free_partitions(struct parsed_partitions *state) -{ - vfree(state->parts); - kfree(state); -} - -struct parsed_partitions * -check_partition(struct gendisk *hd, struct block_device *bdev) -{ - struct parsed_partitions *state; - int i, res, err; - - state = allocate_partitions(hd); - if (!state) - return NULL; - state->pp_buf = (char *)__get_free_page(GFP_KERNEL); - if (!state->pp_buf) { - free_partitions(state); - return NULL; - } - state->pp_buf[0] = '\0'; - - state->bdev = bdev; - disk_name(hd, 0, state->name); - snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); - if (isdigit(state->name[strlen(state->name)-1])) - sprintf(state->name, "p"); - - i = res = err = 0; - while (!res && check_part[i]) { - memset(state->parts, 0, state->limit * sizeof(state->parts[0])); - res = check_part[i++](state); - if (res < 0) { - /* We have hit an I/O error which we don't report now. - * But record it, and let the others do their job. - */ - err = res; - res = 0; - } - - } - if (res > 0) { - printk(KERN_INFO "%s", state->pp_buf); - - free_page((unsigned long)state->pp_buf); - return state; - } - if (state->access_beyond_eod) - err = -ENOSPC; - if (err) - /* The partition is unrecognized. So report I/O errors if there were any */ - res = err; - if (res) { - if (warn_no_part) - strlcat(state->pp_buf, - " unable to read partition table\n", PAGE_SIZE); - printk(KERN_INFO "%s", state->pp_buf); - } - - free_page((unsigned long)state->pp_buf); - free_partitions(state); - return ERR_PTR(res); -} diff --git a/block/partitions/check.h b/block/partitions/check.h index 6042f769471a..c577e9ee67f0 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -2,6 +2,7 @@ #include <linux/pagemap.h> #include <linux/blkdev.h> #include <linux/genhd.h> +#include "../blk.h" /* * add_gd_partition adds a partitions details to the devices partition @@ -23,19 +24,14 @@ struct parsed_partitions { char *pp_buf; }; -void free_partitions(struct parsed_partitions *state); +typedef struct { + struct page *v; +} Sector; -struct parsed_partitions * -check_partition(struct gendisk *, struct block_device *); - -static inline void *read_part_sector(struct parsed_partitions *state, - sector_t n, Sector *p) +void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p); +static inline void put_dev_sector(Sector p) { - if (n >= get_capacity(state->bdev->bd_disk)) { - state->access_beyond_eod = true; - return NULL; - } - return read_dev_sector(state->bdev, n, p); + put_page(p.v); } static inline void @@ -51,5 +47,24 @@ put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) } } -extern int warn_no_part; - +/* detection routines go here in alphabetical order: */ +int adfspart_check_ADFS(struct parsed_partitions *state); +int adfspart_check_CUMANA(struct parsed_partitions *state); +int adfspart_check_EESOX(struct parsed_partitions *state); +int adfspart_check_ICS(struct parsed_partitions *state); +int adfspart_check_POWERTEC(struct parsed_partitions *state); +int aix_partition(struct parsed_partitions *state); +int amiga_partition(struct parsed_partitions *state); +int atari_partition(struct parsed_partitions *state); +int cmdline_partition(struct parsed_partitions *state); +int efi_partition(struct parsed_partitions *state); +int ibm_partition(struct parsed_partitions *); +int karma_partition(struct parsed_partitions *state); +int ldm_partition(struct parsed_partitions *state); +int mac_partition(struct parsed_partitions *state); +int msdos_partition(struct parsed_partitions *state); +int osf_partition(struct parsed_partitions *state); +int sgi_partition(struct parsed_partitions *state); +int sun_partition(struct parsed_partitions *state); +int sysv68_partition(struct parsed_partitions *state); +int ultrix_partition(struct parsed_partitions *state); diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index f1edd5452249..8f545c36cde4 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -18,7 +18,6 @@ #include <linux/cmdline-parser.h> #include "check.h" -#include "cmdline.h" static char *cmdline; static struct cmdline_parts *bdev_parts; diff --git a/block/partitions/cmdline.h b/block/partitions/cmdline.h deleted file mode 100644 index e64a31636a1f..000000000000 --- a/block/partitions/cmdline.h +++ /dev/null @@ -1,3 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -int cmdline_partition(struct parsed_partitions *state); diff --git a/block/partition-generic.c b/block/partitions/core.c index 564fae77711d..b79c4513629b 100644 --- a/block/partition-generic.c +++ b/block/partitions/core.c @@ -1,75 +1,176 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Code extracted from drivers/block/genhd.c - * Copyright (C) 1991-1998 Linus Torvalds - * Re-organised Feb 1998 Russell King - * - * We now have independent partition support from the - * block drivers, which allows all the partition code to - * be grouped in one location, and it to be mostly self - * contained. + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King */ - -#include <linux/init.h> -#include <linux/module.h> #include <linux/fs.h> #include <linux/slab.h> -#include <linux/kmod.h> #include <linux/ctype.h> #include <linux/genhd.h> +#include <linux/vmalloc.h> #include <linux/blktrace_api.h> +#include <linux/raid/detect.h> +#include "check.h" -#include "partitions/check.h" +static int (*check_part[])(struct parsed_partitions *) = { + /* + * Probe partition formats with tables at disk address 0 + * that also have an ADFS boot block at 0xdc0. + */ +#ifdef CONFIG_ACORN_PARTITION_ICS + adfspart_check_ICS, +#endif +#ifdef CONFIG_ACORN_PARTITION_POWERTEC + adfspart_check_POWERTEC, +#endif +#ifdef CONFIG_ACORN_PARTITION_EESOX + adfspart_check_EESOX, +#endif -#ifdef CONFIG_BLK_DEV_MD -extern void md_autodetect_dev(dev_t dev); + /* + * Now move on to formats that only have partition info at + * disk address 0xdc0. Since these may also have stale + * PC/BIOS partition tables, they need to come before + * the msdos entry. + */ +#ifdef CONFIG_ACORN_PARTITION_CUMANA + adfspart_check_CUMANA, +#endif +#ifdef CONFIG_ACORN_PARTITION_ADFS + adfspart_check_ADFS, #endif - -/* - * disk_name() is used by partition check code and the genhd driver. - * It formats the devicename of the indicated disk into - * the supplied buffer (of size at least 32), and returns - * a pointer to that same buffer (for convenience). - */ -char *disk_name(struct gendisk *hd, int partno, char *buf) +#ifdef CONFIG_CMDLINE_PARTITION + cmdline_partition, +#endif +#ifdef CONFIG_EFI_PARTITION + efi_partition, /* this must come before msdos */ +#endif +#ifdef CONFIG_SGI_PARTITION + sgi_partition, +#endif +#ifdef CONFIG_LDM_PARTITION + ldm_partition, /* this must come before msdos */ +#endif +#ifdef CONFIG_MSDOS_PARTITION + msdos_partition, +#endif +#ifdef CONFIG_OSF_PARTITION + osf_partition, +#endif +#ifdef CONFIG_SUN_PARTITION + sun_partition, +#endif +#ifdef CONFIG_AMIGA_PARTITION + amiga_partition, +#endif +#ifdef CONFIG_ATARI_PARTITION + atari_partition, +#endif +#ifdef CONFIG_MAC_PARTITION + mac_partition, +#endif +#ifdef CONFIG_ULTRIX_PARTITION + ultrix_partition, +#endif +#ifdef CONFIG_IBM_PARTITION + ibm_partition, +#endif +#ifdef CONFIG_KARMA_PARTITION + karma_partition, +#endif +#ifdef CONFIG_SYSV68_PARTITION + sysv68_partition, +#endif + NULL +}; + +static struct parsed_partitions *allocate_partitions(struct gendisk *hd) { - if (!partno) - snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); - else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) - snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno); - else - snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno); + struct parsed_partitions *state; + int nr; - return buf; -} + state = kzalloc(sizeof(*state), GFP_KERNEL); + if (!state) + return NULL; -const char *bdevname(struct block_device *bdev, char *buf) -{ - return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf); -} + nr = disk_max_parts(hd); + state->parts = vzalloc(array_size(nr, sizeof(state->parts[0]))); + if (!state->parts) { + kfree(state); + return NULL; + } -EXPORT_SYMBOL(bdevname); + state->limit = nr; -const char *bio_devname(struct bio *bio, char *buf) -{ - return disk_name(bio->bi_disk, bio->bi_partno, buf); + return state; } -EXPORT_SYMBOL(bio_devname); -/* - * There's very little reason to use this, you should really - * have a struct block_device just about everywhere and use - * bdevname() instead. - */ -const char *__bdevname(dev_t dev, char *buffer) +static void free_partitions(struct parsed_partitions *state) { - scnprintf(buffer, BDEVNAME_SIZE, "unknown-block(%u,%u)", - MAJOR(dev), MINOR(dev)); - return buffer; + vfree(state->parts); + kfree(state); } -EXPORT_SYMBOL(__bdevname); +static struct parsed_partitions *check_partition(struct gendisk *hd, + struct block_device *bdev) +{ + struct parsed_partitions *state; + int i, res, err; + + state = allocate_partitions(hd); + if (!state) + return NULL; + state->pp_buf = (char *)__get_free_page(GFP_KERNEL); + if (!state->pp_buf) { + free_partitions(state); + return NULL; + } + state->pp_buf[0] = '\0'; + + state->bdev = bdev; + disk_name(hd, 0, state->name); + snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); + if (isdigit(state->name[strlen(state->name)-1])) + sprintf(state->name, "p"); + + i = res = err = 0; + while (!res && check_part[i]) { + memset(state->parts, 0, state->limit * sizeof(state->parts[0])); + res = check_part[i++](state); + if (res < 0) { + /* + * We have hit an I/O error which we don't report now. + * But record it, and let the others do their job. + */ + err = res; + res = 0; + } + + } + if (res > 0) { + printk(KERN_INFO "%s", state->pp_buf); + + free_page((unsigned long)state->pp_buf); + return state; + } + if (state->access_beyond_eod) + err = -ENOSPC; + /* + * The partition is unrecognized. So report I/O errors if there were any + */ + if (err) + res = err; + if (res) { + strlcat(state->pp_buf, + " unable to read partition table\n", PAGE_SIZE); + printk(KERN_INFO "%s", state->pp_buf); + } + + free_page((unsigned long)state->pp_buf); + free_partitions(state); + return ERR_PTR(res); +} static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -87,13 +188,6 @@ static ssize_t part_start_show(struct device *dev, return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); } -ssize_t part_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%llu\n",(unsigned long long)part_nr_sects_read(p)); -} - static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -115,74 +209,6 @@ static ssize_t part_discard_alignment_show(struct device *dev, return sprintf(buf, "%u\n", p->discard_alignment); } -ssize_t part_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; - unsigned int inflight; - - inflight = part_in_flight(q, p); - return sprintf(buf, - "%8lu %8lu %8llu %8u " - "%8lu %8lu %8llu %8u " - "%8u %8u %8u " - "%8lu %8lu %8llu %8u " - "%8lu %8u" - "\n", - part_stat_read(p, ios[STAT_READ]), - part_stat_read(p, merges[STAT_READ]), - (unsigned long long)part_stat_read(p, sectors[STAT_READ]), - (unsigned int)part_stat_read_msecs(p, STAT_READ), - part_stat_read(p, ios[STAT_WRITE]), - part_stat_read(p, merges[STAT_WRITE]), - (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]), - (unsigned int)part_stat_read_msecs(p, STAT_WRITE), - inflight, - jiffies_to_msecs(part_stat_read(p, io_ticks)), - jiffies_to_msecs(part_stat_read(p, time_in_queue)), - part_stat_read(p, ios[STAT_DISCARD]), - part_stat_read(p, merges[STAT_DISCARD]), - (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]), - (unsigned int)part_stat_read_msecs(p, STAT_DISCARD), - part_stat_read(p, ios[STAT_FLUSH]), - (unsigned int)part_stat_read_msecs(p, STAT_FLUSH)); -} - -ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; - unsigned int inflight[2]; - - part_in_flight_rw(q, p, inflight); - return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); -} - -#ifdef CONFIG_FAIL_MAKE_REQUEST -ssize_t part_fail_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%d\n", p->make_it_fail); -} - -ssize_t part_fail_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct hd_struct *p = dev_to_part(dev); - int i; - - if (count > 0 && sscanf(buf, "%d", &i) > 0) - p->make_it_fail = (i == 0) ? 0 : 1; - - return count; -} -#endif - static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); static DEVICE_ATTR(start, 0444, part_start_show, NULL); static DEVICE_ATTR(size, 0444, part_size_show, NULL); @@ -369,7 +395,9 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, p->policy = get_disk_ro(disk); if (info) { - struct partition_meta_info *pinfo = alloc_part_info(disk); + struct partition_meta_info *pinfo; + + pinfo = kzalloc_node(sizeof(*pinfo), GFP_KERNEL, disk->node_id); if (!pinfo) { err = -ENOMEM; goto out_free_stats; @@ -428,7 +456,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, return p; out_free_info: - free_part_info(p); + kfree(p->info); out_free_stats: free_part_stats(p); out_free: @@ -525,10 +553,10 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, return true; } -#ifdef CONFIG_BLK_DEV_MD - if (state->parts[p].flags & ADDPART_FLAG_RAID) + if (IS_BUILTIN(CONFIG_BLK_DEV_MD) && + (state->parts[p].flags & ADDPART_FLAG_RAID)) md_autodetect_dev(part_to_dev(part)->devt); -#endif + return true; } @@ -602,22 +630,29 @@ out_free_state: return ret; } -unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) +void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) { - struct address_space *mapping = bdev->bd_inode->i_mapping; + struct address_space *mapping = state->bdev->bd_inode->i_mapping; struct page *page; - page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_SHIFT-9)), NULL); - if (!IS_ERR(page)) { - if (PageError(page)) - goto fail; - p->v = page; - return (unsigned char *)page_address(page) + ((n & ((1 << (PAGE_SHIFT - 9)) - 1)) << 9); -fail: - put_page(page); + if (n >= get_capacity(state->bdev->bd_disk)) { + state->access_beyond_eod = true; + return NULL; } + + page = read_mapping_page(mapping, + (pgoff_t)(n >> (PAGE_SHIFT - 9)), NULL); + if (IS_ERR(page)) + goto out; + if (PageError(page)) + goto out_put_page; + + p->v = page; + return (unsigned char *)page_address(page) + + ((n & ((1 << (PAGE_SHIFT - 9)) - 1)) << SECTOR_SHIFT); +out_put_page: + put_page(page); +out: p->v = NULL; return NULL; } - -EXPORT_SYMBOL(read_dev_sector); diff --git a/block/partitions/efi.c b/block/partitions/efi.c index db2fef7dfc47..b64bfdd4326c 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -657,6 +657,31 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, } /** + * utf16_le_to_7bit(): Naively converts a UTF-16LE string to 7-bit ASCII characters + * @in: input UTF-16LE string + * @size: size of the input string + * @out: output string ptr, should be capable to store @size+1 characters + * + * Description: Converts @size UTF16-LE symbols from @in string to 7-bit + * ASCII characters and stores them to @out. Adds trailing zero to @out array. + */ +static void utf16_le_to_7bit(const __le16 *in, unsigned int size, u8 *out) +{ + unsigned int i = 0; + + out[size] = 0; + + while (i < size) { + u8 c = le16_to_cpu(in[i]) & 0xff; + + if (c && !isprint(c)) + c = '!'; + out[i] = c; + i++; + } +} + +/** * efi_partition(struct parsed_partitions *state) * @state: disk parsed partitions * @@ -692,7 +717,6 @@ int efi_partition(struct parsed_partitions *state) for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { struct partition_meta_info *info; - unsigned label_count = 0; unsigned label_max; u64 start = le64_to_cpu(ptes[i].starting_lba); u64 size = le64_to_cpu(ptes[i].ending_lba) - @@ -713,14 +737,7 @@ int efi_partition(struct parsed_partitions *state) /* Naively convert UTF16-LE to 7 bits. */ label_max = min(ARRAY_SIZE(info->volname) - 1, ARRAY_SIZE(ptes[i].partition_name)); - info->volname[label_max] = 0; - while (label_count < label_max) { - u8 c = ptes[i].partition_name[label_count] & 0xff; - if (c && !isprint(c)) - c = '!'; - info->volname[label_count] = c; - label_count++; - } + utf16_le_to_7bit(ptes[i].partition_name, label_max, info->volname); state->parts[i + 1].has_info = true; } kfree(ptes); diff --git a/block/partitions/efi.h b/block/partitions/efi.h index 3e8576157575..8cc2b88d0aa8 100644 --- a/block/partitions/efi.h +++ b/block/partitions/efi.h @@ -88,7 +88,7 @@ typedef struct _gpt_entry { __le64 starting_lba; __le64 ending_lba; gpt_entry_attributes attributes; - efi_char16_t partition_name[72 / sizeof (efi_char16_t)]; + __le16 partition_name[72/sizeof(__le16)]; } __packed gpt_entry; typedef struct _gpt_mbr_record { @@ -113,7 +113,4 @@ typedef struct _legacy_mbr { __le16 signature; } __packed legacy_mbr; -/* Functions */ -extern int efi_partition(struct parsed_partitions *state); - #endif diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index a5d480f807f3..073faa6a69b8 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -15,7 +15,6 @@ #include <asm/vtoc.h> #include "check.h" -#include "ibm.h" union label_t { diff --git a/block/partitions/ibm.h b/block/partitions/ibm.h deleted file mode 100644 index 8bf13febb2b6..000000000000 --- a/block/partitions/ibm.h +++ /dev/null @@ -1,2 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -int ibm_partition(struct parsed_partitions *); diff --git a/block/partitions/karma.c b/block/partitions/karma.c index 59812d705c3d..4d93512f4bd4 100644 --- a/block/partitions/karma.c +++ b/block/partitions/karma.c @@ -8,9 +8,10 @@ */ #include "check.h" -#include "karma.h" #include <linux/compiler.h> +#define KARMA_LABEL_MAGIC 0xAB56 + int karma_partition(struct parsed_partitions *state) { int i; diff --git a/block/partitions/karma.h b/block/partitions/karma.h deleted file mode 100644 index 48e074d417fb..000000000000 --- a/block/partitions/karma.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * fs/partitions/karma.h - */ - -#define KARMA_LABEL_MAGIC 0xAB56 - -int karma_partition(struct parsed_partitions *state); - diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index a2d97ee1908c..6fdfcb40c537 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c @@ -14,10 +14,10 @@ #include <linux/stringify.h> #include <linux/kernel.h> #include <linux/uuid.h> +#include <linux/msdos_partition.h> #include "ldm.h" #include "check.h" -#include "msdos.h" /* * ldm_debug/info/error/crit - Output an error message @@ -493,7 +493,7 @@ static bool ldm_validate_partition_table(struct parsed_partitions *state) { Sector sect; u8 *data; - struct partition *p; + struct msdos_partition *p; int i; bool result = false; @@ -508,7 +508,7 @@ static bool ldm_validate_partition_table(struct parsed_partitions *state) if (*(__le16*) (data + 0x01FE) != cpu_to_le16 (MSDOS_LABEL_MAGIC)) goto out; - p = (struct partition*)(data + 0x01BE); + p = (struct msdos_partition *)(data + 0x01BE); for (i = 0; i < 4; i++, p++) if (SYS_IND (p) == LDM_PARTITION) { result = true; diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h index 1ca63e97bccc..841580af7f9b 100644 --- a/block/partitions/ldm.h +++ b/block/partitions/ldm.h @@ -193,7 +193,5 @@ struct ldmdb { /* Cache of the database */ struct list_head v_part; }; -int ldm_partition(struct parsed_partitions *state); - #endif /* _FS_PT_LDM_H_ */ diff --git a/block/partitions/mac.h b/block/partitions/mac.h index 453ed2964804..0e41c9da7532 100644 --- a/block/partitions/mac.h +++ b/block/partitions/mac.h @@ -42,4 +42,3 @@ struct mac_driver_desc { /* ... more stuff */ }; -int mac_partition(struct parsed_partitions *state); diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 82c44f7df911..8f2fcc080264 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -18,13 +18,18 @@ * Check partition table on IDE disks for common CHS translations * * Re-organised Feb 1998 Russell King + * + * BSD disklabel support by Yossi Gottlieb <yogo@math.tau.ac.il> + * updated by Marc Espie <Marc.Espie@openbsd.org> + * + * Unixware slices support by Andrzej Krzysztofowicz <ankry@mif.pg.gda.pl> + * and Krzysztof G. Baranowski <kgb@knm.org.pl> */ #include <linux/msdos_fs.h> +#include <linux/msdos_partition.h> #include "check.h" -#include "msdos.h" #include "efi.h" -#include "aix.h" /* * Many architectures don't like unaligned accesses, while @@ -35,17 +40,17 @@ #define SYS_IND(p) get_unaligned(&p->sys_ind) -static inline sector_t nr_sects(struct partition *p) +static inline sector_t nr_sects(struct msdos_partition *p) { return (sector_t)get_unaligned_le32(&p->nr_sects); } -static inline sector_t start_sect(struct partition *p) +static inline sector_t start_sect(struct msdos_partition *p) { return (sector_t)get_unaligned_le32(&p->start_sect); } -static inline int is_extended_partition(struct partition *p) +static inline int is_extended_partition(struct msdos_partition *p) { return (SYS_IND(p) == DOS_EXTENDED_PARTITION || SYS_IND(p) == WIN98_EXTENDED_PARTITION || @@ -68,7 +73,7 @@ msdos_magic_present(unsigned char *p) #define AIX_LABEL_MAGIC4 0xC1 static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) { - struct partition *pt = (struct partition *) (p + 0x1be); + struct msdos_partition *pt = (struct msdos_partition *) (p + 0x1be); Sector sect; unsigned char *d; int slot, ret = 0; @@ -78,13 +83,19 @@ static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) p[2] == AIX_LABEL_MAGIC3 && p[3] == AIX_LABEL_MAGIC4)) return 0; - /* Assume the partition table is valid if Linux partitions exists */ + + /* + * Assume the partition table is valid if Linux partitions exists. + * Note that old Solaris/x86 partitions use the same indicator as + * Linux swap partitions, so we consider that a Linux partition as + * well. + */ for (slot = 1; slot <= 4; slot++, pt++) { - if (pt->sys_ind == LINUX_SWAP_PARTITION || - pt->sys_ind == LINUX_RAID_PARTITION || - pt->sys_ind == LINUX_DATA_PARTITION || - pt->sys_ind == LINUX_LVM_PARTITION || - is_extended_partition(pt)) + if (pt->sys_ind == SOLARIS_X86_PARTITION || + pt->sys_ind == LINUX_RAID_PARTITION || + pt->sys_ind == LINUX_DATA_PARTITION || + pt->sys_ind == LINUX_LVM_PARTITION || + is_extended_partition(pt)) return 0; } d = read_part_sector(state, 7, §); @@ -122,7 +133,7 @@ static void parse_extended(struct parsed_partitions *state, sector_t first_sector, sector_t first_size, u32 disksig) { - struct partition *p; + struct msdos_partition *p; Sector sect; unsigned char *data; sector_t this_sector, this_size; @@ -146,7 +157,7 @@ static void parse_extended(struct parsed_partitions *state, if (!msdos_magic_present(data + 510)) goto done; - p = (struct partition *) (data + 0x1be); + p = (struct msdos_partition *) (data + 0x1be); /* * Usually, the first entry is the real data partition, @@ -210,6 +221,30 @@ done: put_dev_sector(sect); } +#define SOLARIS_X86_NUMSLICE 16 +#define SOLARIS_X86_VTOC_SANE (0x600DDEEEUL) + +struct solaris_x86_slice { + __le16 s_tag; /* ID tag of partition */ + __le16 s_flag; /* permission flags */ + __le32 s_start; /* start sector no of partition */ + __le32 s_size; /* # of blocks in partition */ +}; + +struct solaris_x86_vtoc { + unsigned int v_bootinfo[3]; /* info needed by mboot */ + __le32 v_sanity; /* to verify vtoc sanity */ + __le32 v_version; /* layout version */ + char v_volume[8]; /* volume name */ + __le16 v_sectorsz; /* sector size in bytes */ + __le16 v_nparts; /* number of partitions */ + unsigned int v_reserved[10]; /* free space */ + struct solaris_x86_slice + v_slice[SOLARIS_X86_NUMSLICE]; /* slice headers */ + unsigned int timestamp[SOLARIS_X86_NUMSLICE]; /* timestamp */ + char v_asciilabel[128]; /* for compatibility */ +}; + /* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also indicates linux swap. Be careful before believing this is Solaris. */ @@ -265,6 +300,54 @@ static void parse_solaris_x86(struct parsed_partitions *state, #endif } +/* check against BSD src/sys/sys/disklabel.h for consistency */ +#define BSD_DISKMAGIC (0x82564557UL) /* The disk magic number */ +#define BSD_MAXPARTITIONS 16 +#define OPENBSD_MAXPARTITIONS 16 +#define BSD_FS_UNUSED 0 /* disklabel unused partition entry ID */ +struct bsd_disklabel { + __le32 d_magic; /* the magic number */ + __s16 d_type; /* drive type */ + __s16 d_subtype; /* controller/d_type specific */ + char d_typename[16]; /* type name, e.g. "eagle" */ + char d_packname[16]; /* pack identifier */ + __u32 d_secsize; /* # of bytes per sector */ + __u32 d_nsectors; /* # of data sectors per track */ + __u32 d_ntracks; /* # of tracks per cylinder */ + __u32 d_ncylinders; /* # of data cylinders per unit */ + __u32 d_secpercyl; /* # of data sectors per cylinder */ + __u32 d_secperunit; /* # of data sectors per unit */ + __u16 d_sparespertrack; /* # of spare sectors per track */ + __u16 d_sparespercyl; /* # of spare sectors per cylinder */ + __u32 d_acylinders; /* # of alt. cylinders per unit */ + __u16 d_rpm; /* rotational speed */ + __u16 d_interleave; /* hardware sector interleave */ + __u16 d_trackskew; /* sector 0 skew, per track */ + __u16 d_cylskew; /* sector 0 skew, per cylinder */ + __u32 d_headswitch; /* head switch time, usec */ + __u32 d_trkseek; /* track-to-track seek, usec */ + __u32 d_flags; /* generic flags */ +#define NDDATA 5 + __u32 d_drivedata[NDDATA]; /* drive-type specific information */ +#define NSPARE 5 + __u32 d_spare[NSPARE]; /* reserved for future use */ + __le32 d_magic2; /* the magic number (again) */ + __le16 d_checksum; /* xor of data incl. partitions */ + + /* filesystem and partition information: */ + __le16 d_npartitions; /* number of partitions in following */ + __le32 d_bbsize; /* size of boot area at sn0, bytes */ + __le32 d_sbsize; /* max size of fs superblock, bytes */ + struct bsd_partition { /* the partition table */ + __le32 p_size; /* number of sectors in partition */ + __le32 p_offset; /* starting sector */ + __le32 p_fsize; /* filesystem basic fragment size */ + __u8 p_fstype; /* filesystem type, see below */ + __u8 p_frag; /* filesystem fragments per block */ + __le16 p_cpg; /* filesystem cylinders per group */ + } d_partitions[BSD_MAXPARTITIONS]; /* actually may be more */ +}; + #if defined(CONFIG_BSD_DISKLABEL) /* * Create devices for BSD partitions listed in a disklabel, under a @@ -349,6 +432,51 @@ static void parse_openbsd(struct parsed_partitions *state, #endif } +#define UNIXWARE_DISKMAGIC (0xCA5E600DUL) /* The disk magic number */ +#define UNIXWARE_DISKMAGIC2 (0x600DDEEEUL) /* The slice table magic nr */ +#define UNIXWARE_NUMSLICE 16 +#define UNIXWARE_FS_UNUSED 0 /* Unused slice entry ID */ + +struct unixware_slice { + __le16 s_label; /* label */ + __le16 s_flags; /* permission flags */ + __le32 start_sect; /* starting sector */ + __le32 nr_sects; /* number of sectors in slice */ +}; + +struct unixware_disklabel { + __le32 d_type; /* drive type */ + __le32 d_magic; /* the magic number */ + __le32 d_version; /* version number */ + char d_serial[12]; /* serial number of the device */ + __le32 d_ncylinders; /* # of data cylinders per device */ + __le32 d_ntracks; /* # of tracks per cylinder */ + __le32 d_nsectors; /* # of data sectors per track */ + __le32 d_secsize; /* # of bytes per sector */ + __le32 d_part_start; /* # of first sector of this partition*/ + __le32 d_unknown1[12]; /* ? */ + __le32 d_alt_tbl; /* byte offset of alternate table */ + __le32 d_alt_len; /* byte length of alternate table */ + __le32 d_phys_cyl; /* # of physical cylinders per device */ + __le32 d_phys_trk; /* # of physical tracks per cylinder */ + __le32 d_phys_sec; /* # of physical sectors per track */ + __le32 d_phys_bytes; /* # of physical bytes per sector */ + __le32 d_unknown2; /* ? */ + __le32 d_unknown3; /* ? */ + __le32 d_pad[8]; /* pad */ + + struct unixware_vtoc { + __le32 v_magic; /* the magic number */ + __le32 v_version; /* version number */ + char v_name[8]; /* volume name */ + __le16 v_nslices; /* # of slices */ + __le16 v_unknown1; /* ? */ + __le32 v_reserved[10]; /* reserved */ + struct unixware_slice + v_slice[UNIXWARE_NUMSLICE]; /* slice headers */ + } vtoc; +}; /* 408 */ + /* * Create devices for Unixware partitions listed in a disklabel, under a * dos-like partition. See parse_extended() for more information. @@ -392,6 +520,8 @@ static void parse_unixware(struct parsed_partitions *state, #endif } +#define MINIX_NR_SUBPARTITIONS 4 + /* * Minix 2.0.0/2.0.2 subpartition support. * Anand Krishnamurthy <anandk@wiproge.med.ge.com> @@ -403,14 +533,14 @@ static void parse_minix(struct parsed_partitions *state, #ifdef CONFIG_MINIX_SUBPARTITION Sector sect; unsigned char *data; - struct partition *p; + struct msdos_partition *p; int i; data = read_part_sector(state, offset, §); if (!data) return; - p = (struct partition *)(data + 0x1be); + p = (struct msdos_partition *)(data + 0x1be); /* The first sector of a Minix partition can have either * a secondary MBR describing its subpartitions, or @@ -454,7 +584,7 @@ int msdos_partition(struct parsed_partitions *state) sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; Sector sect; unsigned char *data; - struct partition *p; + struct msdos_partition *p; struct fat_boot_sector *fb; int slot; u32 disksig; @@ -488,7 +618,7 @@ int msdos_partition(struct parsed_partitions *state) * partition table. Reject this in case the boot indicator * is not 0 or 0x80. */ - p = (struct partition *) (data + 0x1be); + p = (struct msdos_partition *) (data + 0x1be); for (slot = 1; slot <= 4; slot++, p++) { if (p->boot_ind != 0 && p->boot_ind != 0x80) { /* @@ -510,7 +640,7 @@ int msdos_partition(struct parsed_partitions *state) } #ifdef CONFIG_EFI_PARTITION - p = (struct partition *) (data + 0x1be); + p = (struct msdos_partition *) (data + 0x1be); for (slot = 1 ; slot <= 4 ; slot++, p++) { /* If this is an EFI GPT disk, msdos should ignore it. */ if (SYS_IND(p) == EFI_PMBR_OSTYPE_EFI_GPT) { @@ -519,7 +649,7 @@ int msdos_partition(struct parsed_partitions *state) } } #endif - p = (struct partition *) (data + 0x1be); + p = (struct msdos_partition *) (data + 0x1be); disksig = le32_to_cpup((__le32 *)(data + 0x1b8)); @@ -566,7 +696,7 @@ int msdos_partition(struct parsed_partitions *state) strlcat(state->pp_buf, "\n", PAGE_SIZE); /* second pass - output for each on a separate line */ - p = (struct partition *) (0x1be + data); + p = (struct msdos_partition *) (0x1be + data); for (slot = 1 ; slot <= 4 ; slot++, p++) { unsigned char id = SYS_IND(p); int n; diff --git a/block/partitions/msdos.h b/block/partitions/msdos.h deleted file mode 100644 index fcacfc486092..000000000000 --- a/block/partitions/msdos.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * fs/partitions/msdos.h - */ - -#define MSDOS_LABEL_MAGIC 0xAA55 - -int msdos_partition(struct parsed_partitions *state); - diff --git a/block/partitions/osf.c b/block/partitions/osf.c index 4b873973d6c0..84560d0765ed 100644 --- a/block/partitions/osf.c +++ b/block/partitions/osf.c @@ -9,9 +9,9 @@ */ #include "check.h" -#include "osf.h" #define MAX_OSF_PARTITIONS 18 +#define DISKLABELMAGIC (0x82564557UL) int osf_partition(struct parsed_partitions *state) { diff --git a/block/partitions/osf.h b/block/partitions/osf.h deleted file mode 100644 index 4d8088e7ea8c..000000000000 --- a/block/partitions/osf.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * fs/partitions/osf.h - */ - -#define DISKLABELMAGIC (0x82564557UL) - -int osf_partition(struct parsed_partitions *state); diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c index d7b421c6e530..4273f1bb0515 100644 --- a/block/partitions/sgi.c +++ b/block/partitions/sgi.c @@ -6,7 +6,12 @@ */ #include "check.h" -#include "sgi.h" + +#define SGI_LABEL_MAGIC 0x0be5a941 + +enum { + LINUX_RAID_PARTITION = 0xfd, /* autodetect RAID partition */ +}; struct sgi_disklabel { __be32 magic_mushroom; /* Big fat spliff... */ diff --git a/block/partitions/sgi.h b/block/partitions/sgi.h deleted file mode 100644 index a5b77c3987cf..000000000000 --- a/block/partitions/sgi.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * fs/partitions/sgi.h - */ - -extern int sgi_partition(struct parsed_partitions *state); - -#define SGI_LABEL_MAGIC 0x0be5a941 - diff --git a/block/partitions/sun.c b/block/partitions/sun.c index 90f36724e796..47dc53eccf77 100644 --- a/block/partitions/sun.c +++ b/block/partitions/sun.c @@ -9,7 +9,14 @@ */ #include "check.h" -#include "sun.h" + +#define SUN_LABEL_MAGIC 0xDABE +#define SUN_VTOC_SANITY 0x600DDEEE + +enum { + SUN_WHOLE_DISK = 5, + LINUX_RAID_PARTITION = 0xfd, /* autodetect RAID partition */ +}; int sun_partition(struct parsed_partitions *state) { diff --git a/block/partitions/sun.h b/block/partitions/sun.h deleted file mode 100644 index ae1b9eed3fd7..000000000000 --- a/block/partitions/sun.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * fs/partitions/sun.h - */ - -#define SUN_LABEL_MAGIC 0xDABE -#define SUN_VTOC_SANITY 0x600DDEEE - -int sun_partition(struct parsed_partitions *state); diff --git a/block/partitions/sysv68.c b/block/partitions/sysv68.c index 92e810826b01..6f6257fd4eb4 100644 --- a/block/partitions/sysv68.c +++ b/block/partitions/sysv68.c @@ -6,7 +6,6 @@ */ #include "check.h" -#include "sysv68.h" /* * Volume ID structure: on first 256-bytes sector of disk diff --git a/block/partitions/sysv68.h b/block/partitions/sysv68.h deleted file mode 100644 index 4fb6b8ec78ae..000000000000 --- a/block/partitions/sysv68.h +++ /dev/null @@ -1,2 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -extern int sysv68_partition(struct parsed_partitions *state); diff --git a/block/partitions/ultrix.c b/block/partitions/ultrix.c index ecd0d7346c3d..4aaa81043ca0 100644 --- a/block/partitions/ultrix.c +++ b/block/partitions/ultrix.c @@ -8,7 +8,6 @@ */ #include "check.h" -#include "ultrix.h" int ultrix_partition(struct parsed_partitions *state) { diff --git a/block/partitions/ultrix.h b/block/partitions/ultrix.h deleted file mode 100644 index 9f676cead222..000000000000 --- a/block/partitions/ultrix.h +++ /dev/null @@ -1,6 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * fs/partitions/ultrix.h - */ - -int ultrix_partition(struct parsed_partitions *state); diff --git a/block/sed-opal.c b/block/sed-opal.c index 880cc57a5f6b..daafadbb88ca 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1056,7 +1056,7 @@ static int start_opal_session_cont(struct opal_dev *dev) hsn = response_get_u64(&dev->parsed, 4); tsn = response_get_u64(&dev->parsed, 5); - if (hsn == 0 && tsn == 0) { + if (hsn != GENERIC_HOST_SESSION_NUM || tsn < FIRST_TPER_SESSION_NUM) { pr_debug("Couldn't authenticate session\n"); return -EPERM; } |