From afdc1a780ef84a54b613dae6f971407748aab61c Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 11 Apr 2008 12:56:51 +0200 Subject: block: add bio_copy_user_iov support to blk_rq_map_user_iov With this patch, blk_rq_map_user_iov uses bio_copy_user_iov when a low level driver needs padding or a buffer in sg_iovec isn't aligned. That is, it uses temporary kernel buffers instead of mapping user pages directly. When a LLD needs padding, later blk_rq_map_sg needs to extend the last entry of a scatter list. bio_copy_user_iov guarantees that there is enough space for padding by using temporary kernel buffers instead of user pages. blk_rq_map_user_iov needs buffers in sg_iovec to be aligned. The comment in blk_rq_map_user_iov indicates that drivers/scsi/sg.c also needs buffers in sg_iovec to be aligned. Actually, drivers/scsi/sg.c works with unaligned buffers in sg_iovec (it always uses temporary kernel buffers). Signed-off-by: FUJITA Tomonori Cc: Tejun Heo Cc: Mike Christie Cc: James Bottomley Signed-off-by: Jens Axboe --- block/blk-map.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index c07d9c8317f4..ab43533ba641 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -5,6 +5,7 @@ #include #include #include +#include /* for struct sg_iovec */ #include "blk.h" @@ -194,15 +195,26 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct sg_iovec *iov, int iov_count, unsigned int len) { struct bio *bio; + int i, read = rq_data_dir(rq) == READ; + int unaligned = 0; if (!iov || iov_count <= 0) return -EINVAL; - /* we don't allow misaligned data like bio_map_user() does. If the - * user is using sg, they're expected to know the alignment constraints - * and respect them accordingly */ - bio = bio_map_user_iov(q, NULL, iov, iov_count, - rq_data_dir(rq) == READ); + for (i = 0; i < iov_count; i++) { + unsigned long uaddr = (unsigned long)iov[i].iov_base; + + if (uaddr & queue_dma_alignment(q)) { + unaligned = 1; + break; + } + } + + if (unaligned || (q->dma_pad_mask & len)) + bio = bio_copy_user_iov(q, iov, iov_count, read); + else + bio = bio_map_user_iov(q, NULL, iov, iov_count, read); + if (IS_ERR(bio)) return PTR_ERR(bio); -- cgit v1.2.3 From f18573abcc57844a7c3c12699d40eead8728cd8a Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 11 Apr 2008 12:56:52 +0200 Subject: block: move the padding adjustment to blk_rq_map_sg blk_rq_map_user adjusts bi_size of the last bio. It breaks the rule that req->data_len (the true data length) is equal to sum(bio). It broke the scsi command completion code. commit e97a294ef6938512b655b1abf17656cf2b26f709 was introduced to fix the above issue. However, the partial completion code doesn't work with it. The commit is also a layer violation (scsi mid-layer should not know about the block layer's padding). This patch moves the padding adjustment to blk_rq_map_sg (suggested by James). The padding works like the drain buffer. This patch breaks the rule that req->data_len is equal to sum(sg), however, the drain buffer already broke it. So this patch just restores the rule that req->data_len is equal to sub(bio) without breaking anything new. Now when a low level driver needs padding, blk_rq_map_user and blk_rq_map_user_iov guarantee there's enough room for padding. blk_rq_map_sg can safely extend the last entry of a scatter list. blk_rq_map_sg must extend the last entry of a scatter list only for a request that got through bio_copy_user_iov. This patches introduces new REQ_COPY_USER flag. Signed-off-by: FUJITA Tomonori Cc: Tejun Heo Cc: Mike Christie Cc: James Bottomley Signed-off-by: Jens Axboe --- block/blk-map.c | 24 +++++------------------- block/blk-merge.c | 9 +++++++++ drivers/scsi/scsi.c | 2 +- include/linux/blkdev.h | 2 ++ 4 files changed, 17 insertions(+), 20 deletions(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index ab43533ba641..3c942bd6422a 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -141,25 +141,8 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq, ubuf += ret; } - /* - * __blk_rq_map_user() copies the buffers if starting address - * or length isn't aligned to dma_pad_mask. As the copied - * buffer is always page aligned, we know that there's enough - * room for padding. Extend the last bio and update - * rq->data_len accordingly. - * - * On unmap, bio_uncopy_user() will use unmodified - * bio_map_data pointed to by bio->bi_private. - */ - if (len & q->dma_pad_mask) { - unsigned int pad_len = (q->dma_pad_mask & ~len) + 1; - struct bio *tail = rq->biotail; - - tail->bi_io_vec[tail->bi_vcnt - 1].bv_len += pad_len; - tail->bi_size += pad_len; - - rq->extra_len += pad_len; - } + if (!bio_flagged(bio, BIO_USER_MAPPED)) + rq->cmd_flags |= REQ_COPY_USER; rq->buffer = rq->data = NULL; return 0; @@ -224,6 +207,9 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, return -EINVAL; } + if (!bio_flagged(bio, BIO_USER_MAPPED)) + rq->cmd_flags |= REQ_COPY_USER; + bio_get(bio); blk_rq_bio_prep(q, rq, bio); rq->buffer = rq->data = NULL; diff --git a/block/blk-merge.c b/block/blk-merge.c index 0f58616bcd7f..b5c5c4a9e3f0 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -220,6 +220,15 @@ new_segment: bvprv = bvec; } /* segments in rq */ + + if (unlikely(rq->cmd_flags & REQ_COPY_USER) && + (rq->data_len & q->dma_pad_mask)) { + unsigned int pad_len = (q->dma_pad_mask & ~rq->data_len) + 1; + + sg->length += pad_len; + rq->extra_len += pad_len; + } + if (q->dma_drain_size && q->dma_drain_needed(rq)) { if (rq->cmd_flags & REQ_RW) memset(q->dma_drain_buffer, 0, q->dma_drain_size); diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index f6980bd9d8f9..12d69d7c8577 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -852,7 +852,7 @@ void scsi_finish_command(struct scsi_cmnd *cmd) "Notifying upper driver of completion " "(result %x)\n", cmd->result)); - good_bytes = scsi_bufflen(cmd) + cmd->request->extra_len; + good_bytes = scsi_bufflen(cmd); if (cmd->request->cmd_type != REQ_TYPE_BLOCK_PC) { drv = scsi_cmd_to_driver(cmd); if (drv->done) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6f79d40dd3c0..b3a58adc4352 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -112,6 +112,7 @@ enum rq_flag_bits { __REQ_RW_SYNC, /* request is sync (O_DIRECT) */ __REQ_ALLOCED, /* request came from our alloc pool */ __REQ_RW_META, /* metadata io request */ + __REQ_COPY_USER, /* contains copies of user pages */ __REQ_NR_BITS, /* stops here */ }; @@ -133,6 +134,7 @@ enum rq_flag_bits { #define REQ_RW_SYNC (1 << __REQ_RW_SYNC) #define REQ_ALLOCED (1 << __REQ_ALLOCED) #define REQ_RW_META (1 << __REQ_RW_META) +#define REQ_COPY_USER (1 << __REQ_COPY_USER) #define BLK_MAX_CDB 16 -- cgit v1.2.3 From ee86418d39f28dd10d27c9d7906d8c26c1293e69 Mon Sep 17 00:00:00 2001 From: Nick Andrew Date: Mon, 21 Apr 2008 09:51:04 +0200 Subject: Kconfig: clean up block/Kconfig help descriptions Modify the help descriptions of block/Kconfig for clarity, accuracy and consistency. Refactor the BLOCK description a bit. The wording "This permits ... to be removed" isn't quite right; the block layer is removed when the option is disabled, whereas most descriptions talk about what happens when the option is enabled. Reformat the list of what is affected by disabling the block layer. Add more examples of large block devices to LBD and strive for technical accuracy; block devices of size _exactly_ 2TB require CONFIG_LBD, not only "bigger than 2TB". Also try to say (perhaps not very clearly) that the config option is only needed when you want to have individual block devices of size >= 2TB, for example if you had 3 x 1TB disks in your computer you'd have a total storage size of 3TB but you wouldn't need the option unless you want to aggregate those disks into a RAID or LVM. Improve terminology and grammar on BLK_DEV_IO_TRACE. I also added the boilerplate "If unsure, say N" to most options. Precisely say "2TB and larger" for LSF. Indent the help text for BLK_DEV_BSG by 2 spaces in accordance with the standard. Signed-off-by: Nick Andrew Cc: "Randy.Dunlap" Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- block/Kconfig | 65 ++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 23 deletions(-) (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index 7db9a411649d..3e97f2bc446f 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -5,14 +5,18 @@ menuconfig BLOCK bool "Enable the block layer" if EMBEDDED default y help - This permits the block layer to be removed from the kernel if it's not - needed (on some embedded devices for example). If this option is - disabled, then blockdev files will become unusable and some - filesystems (such as ext3) will become unavailable. + Provide block layer support for the kernel. - This option will also disable SCSI character devices and USB storage - since they make use of various block layer definitions and - facilities. + Disable this option to remove the block layer support from the + kernel. This may be useful for embedded devices. + + If this option is disabled: + + - block device files will become unusable + - some filesystems (such as ext3) will become unavailable. + + Also, SCSI character devices and USB storage will be disabled since + they make use of various block layer definitions and facilities. Say Y here unless you know you really don't want to mount disks and suchlike. @@ -23,9 +27,20 @@ config LBD bool "Support for Large Block Devices" depends on !64BIT help - Say Y here if you want to attach large (bigger than 2TB) discs to - your machine, or if you want to have a raid or loopback device - bigger than 2TB. Otherwise say N. + Enable block devices of size 2TB and larger. + + This option is required to support the full capacity of large + (2TB+) block devices, including RAID, disk, Network Block Device, + Logical Volume Manager (LVM) and loopback. + + For example, RAID devices are frequently bigger than the capacity + of the largest individual hard drive. + + This option is not required if you have individual disk drives + which total 2TB+ and you are not aggregating the capacity into + a large block device (e.g. using RAID or LVM). + + If unsure, say N. config BLK_DEV_IO_TRACE bool "Support for tracing block io actions" @@ -33,19 +48,21 @@ config BLK_DEV_IO_TRACE select RELAY select DEBUG_FS help - Say Y here, if you want to be able to trace the block layer actions + Say Y here if you want to be able to trace the block layer actions on a given queue. Tracing allows you to see any traffic happening - on a block device queue. For more information (and the user space - support tools needed), fetch the blktrace app from: + on a block device queue. For more information (and the userspace + support tools needed), fetch the blktrace tools from: git://git.kernel.dk/blktrace.git + If unsure, say N. + config LSF bool "Support for Large Single Files" depends on !64BIT help - Say Y here if you want to be able to handle very large files (bigger - than 2TB), otherwise say N. + Say Y here if you want to be able to handle very large files (2TB + and larger), otherwise say N. If unsure, say Y. @@ -53,14 +70,16 @@ config BLK_DEV_BSG bool "Block layer SG support v4 (EXPERIMENTAL)" depends on EXPERIMENTAL ---help--- - Saying Y here will enable generic SG (SCSI generic) v4 support - for any block device. - - Unlike SG v3 (aka block/scsi_ioctl.c drivers/scsi/sg.c), SG v4 - can handle complicated SCSI commands: tagged variable length cdbs - with bidirectional data transfers and generic request/response - protocols (e.g. Task Management Functions and SMP in Serial - Attached SCSI). + Saying Y here will enable generic SG (SCSI generic) v4 support + for any block device. + + Unlike SG v3 (aka block/scsi_ioctl.c drivers/scsi/sg.c), SG v4 + can handle complicated SCSI commands: tagged variable length cdbs + with bidirectional data transfers and generic request/response + protocols (e.g. Task Management Functions and SMP in Serial + Attached SCSI). + + If unsure, say N. endif # BLOCK -- cgit v1.2.3 From fb199746303a6bfd6121834ec9e810471185c530 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 21 Apr 2008 09:51:06 +0200 Subject: block: fix blk_register_queue() return value blk_register_queue() returns -ENXIO when queue->request_fn is NULL. But there are some block drivers that call blk_register_queue() via add_disk() with queue->request_fn == NULL. (For example, brd, loop) Although no one checks return value of blk_register_queue(), this patch makes it return 0 instead of -ENXIO when queue->request_fn is NULL, Also this patch adds warning when blk_register_queue() and blk_unregister_queue() are called with queue == NULL rather than ignore invalid usage silently. Signed-off-by: Akinobu Mita Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 54d0db116153..fc41d83be22b 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -276,9 +276,12 @@ int blk_register_queue(struct gendisk *disk) struct request_queue *q = disk->queue; - if (!q || !q->request_fn) + if (WARN_ON(!q)) return -ENXIO; + if (!q->request_fn) + return 0; + ret = kobject_add(&q->kobj, kobject_get(&disk->dev.kobj), "%s", "queue"); if (ret < 0) @@ -300,7 +303,10 @@ void blk_unregister_queue(struct gendisk *disk) { struct request_queue *q = disk->queue; - if (q && q->request_fn) { + if (WARN_ON(!q)) + return; + + if (q->request_fn) { elv_unregister_queue(q); kobject_uevent(&q->kobj, KOBJ_REMOVE); -- cgit v1.2.3