From 8ba32fde2c5be52865b2fd7e5e3752a46971fabe Mon Sep 17 00:00:00 2001 From: Kevin Corry Date: Thu, 16 Mar 2006 23:04:03 -0800 Subject: [PATCH] dm stripe: Fix bounds The dm-stripe target currently does not enforce that the size of a stripe device be a multiple of the chunk-size. Under certain conditions, this can lead to I/O requests going off the end of an underlying device. This test-case shows one example. echo "0 100 linear /dev/hdb1 0" | dmsetup create linear0 echo "0 100 linear /dev/hdb1 100" | dmsetup create linear1 echo "0 200 striped 2 32 /dev/mapper/linear0 0 /dev/mapper/linear1 0" | \ dmsetup create stripe0 dd if=/dev/zero of=/dev/mapper/stripe0 bs=1k This will produce the output: dd: writing '/dev/mapper/stripe0': Input/output error 97+0 records in 96+0 records out And in the kernel log will be: attempt to access beyond end of device dm-0: rw=0, want=104, limit=100 The patch will check that the table size is a multiple of the stripe chunk-size when the table is created, which will prevent the above striped device from being created. This should not affect tools like LVM or EVMS, since in all the cases I can think of, striped devices are always created with the sizes being a multiple of the chunk-size. The size of a stripe device must be a multiple of its chunk-size. (akpm: that typecast is quite gratuitous) Signed-off-by: Kevin Corry Signed-off-by: Alasdair G Kergon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/dm-stripe.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index ab89278a56bf..697aacafb02a 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -103,9 +103,15 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -EINVAL; } + if (((uint32_t)ti->len) & (chunk_size - 1)) { + ti->error = "dm-stripe: Target length not divisible by " + "chunk size"; + return -EINVAL; + } + width = ti->len; if (sector_div(width, stripes)) { - ti->error = "dm-stripe: Target length not divisable by " + ti->error = "dm-stripe: Target length not divisible by " "number of stripes"; return -EINVAL; } -- cgit v1.2.3 From 1312f40e11c57edb5c3250f1b782cef8e3efea82 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Mar 2006 11:02:03 -0500 Subject: [PATCH] regularize blk_cleanup_queue() use Signed-off-by: Al Viro --- drivers/block/loop.c | 4 ++-- drivers/block/pktcdvd.c | 4 ++-- drivers/block/umem.c | 2 +- drivers/md/dm.c | 4 ++-- drivers/md/md.c | 5 ++++- drivers/s390/block/dcssblk.c | 8 ++++---- 6 files changed, 15 insertions(+), 12 deletions(-) (limited to 'drivers/md') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 5f6d1a5cce11..0010704739e3 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1307,7 +1307,7 @@ static int __init loop_init(void) out_mem4: while (i--) - blk_put_queue(loop_dev[i].lo_queue); + blk_cleanup_queue(loop_dev[i].lo_queue); devfs_remove("loop"); i = max_loop; out_mem3: @@ -1328,7 +1328,7 @@ static void loop_exit(void) for (i = 0; i < max_loop; i++) { del_gendisk(disks[i]); - blk_put_queue(loop_dev[i].lo_queue); + blk_cleanup_queue(loop_dev[i].lo_queue); put_disk(disks[i]); } devfs_remove("loop"); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index bc9b2bcd7dba..476a5b553f34 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2514,7 +2514,7 @@ static int pkt_setup_dev(struct pkt_ctrl_command *ctrl_cmd) return 0; out_new_dev: - 
blk_put_queue(disk->queue); + blk_cleanup_queue(disk->queue); out_mem2: put_disk(disk); out_mem: @@ -2555,7 +2555,7 @@ static int pkt_remove_dev(struct pkt_ctrl_command *ctrl_cmd) DPRINTK("pktcdvd: writer %s unmapped\n", pd->name); del_gendisk(pd->disk); - blk_put_queue(pd->disk->queue); + blk_cleanup_queue(pd->disk->queue); put_disk(pd->disk); pkt_devs[idx] = NULL; diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 4ada1268b40d..c16e66b9c7a7 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -1131,7 +1131,7 @@ static void mm_pci_remove(struct pci_dev *dev) pci_free_consistent(card->dev, PAGE_SIZE*2, card->mm_pages[1].desc, card->mm_pages[1].page_dma); - blk_put_queue(card->queue); + blk_cleanup_queue(card->queue); } static const struct pci_device_id mm_pci_ids[] = { { diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 745ca1f67b14..88d60202b9db 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -840,7 +840,7 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent) bad3: mempool_destroy(md->io_pool); bad2: - blk_put_queue(md->queue); + blk_cleanup_queue(md->queue); free_minor(minor); bad1: kfree(md); @@ -860,7 +860,7 @@ static void free_dev(struct mapped_device *md) del_gendisk(md->disk); free_minor(minor); put_disk(md->disk); - blk_put_queue(md->queue); + blk_cleanup_queue(md->queue); kfree(md); } diff --git a/drivers/md/md.c b/drivers/md/md.c index d05e3125d298..5ed2228745cb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -213,8 +213,11 @@ static void mddev_put(mddev_t *mddev) return; if (!mddev->raid_disks && list_empty(&mddev->disks)) { list_del(&mddev->all_mddevs); - blk_put_queue(mddev->queue); + /* that blocks */ + blk_cleanup_queue(mddev->queue); + /* that also blocks */ kobject_unregister(&mddev->kobj); + /* result blows... 
*/ } spin_unlock(&all_mddevs_lock); } diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 2e727f49ad19..44133250da2e 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -273,7 +273,7 @@ removeseg: list_del(&dev_info->lh); del_gendisk(dev_info->gd); - blk_put_queue(dev_info->dcssblk_queue); + blk_cleanup_queue(dev_info->dcssblk_queue); dev_info->gd->queue = NULL; put_disk(dev_info->gd); device_unregister(dev); @@ -491,7 +491,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char unregister_dev: PRINT_ERR("device_create_file() failed!\n"); list_del(&dev_info->lh); - blk_put_queue(dev_info->dcssblk_queue); + blk_cleanup_queue(dev_info->dcssblk_queue); dev_info->gd->queue = NULL; put_disk(dev_info->gd); device_unregister(&dev_info->dev); @@ -505,7 +505,7 @@ list_del: unload_seg: segment_unload(local_buf); dealloc_gendisk: - blk_put_queue(dev_info->dcssblk_queue); + blk_cleanup_queue(dev_info->dcssblk_queue); dev_info->gd->queue = NULL; put_disk(dev_info->gd); free_dev_info: @@ -562,7 +562,7 @@ dcssblk_remove_store(struct device *dev, struct device_attribute *attr, const ch list_del(&dev_info->lh); del_gendisk(dev_info->gd); - blk_put_queue(dev_info->dcssblk_queue); + blk_cleanup_queue(dev_info->dcssblk_queue); dev_info->gd->queue = NULL; put_disk(dev_info->gd); device_unregister(&dev_info->dev); -- cgit v1.2.3 From d2044a94e80b61f68ee7456f82d9b443e9ff6ac7 Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Wed, 22 Mar 2006 00:07:42 -0800 Subject: [PATCH] dm: bio split bvec fix The code that handles bios that span table target boundaries by breaking them up into smaller bios will not split an individual struct bio_vec into more than two pieces. Sometimes more than that are required. This patch adds a loop to break the second piece up into as many pieces as are necessary. Cc: "Abhishek Gupta" Cc: Dan Smith Signed-off-by: Alasdair G Kergon Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/dm.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 88d60202b9db..26b08ee425c7 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -533,30 +533,35 @@ static void __clone_and_map(struct clone_info *ci) } else { /* - * Create two copy bios to deal with io that has - * been split across a target. + * Handle a bvec that must be split between two or more targets. 
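[Editor's note: the hunk that follows is hard to read once flowed into
plain text. Reassembled from the diff itself, the replacement code in
__clone_and_map() is the loop below: it repeatedly clips the bvec
against the current target's limit, looking up the next target and
allocating a fresh target_io each time a boundary is crossed, until
nothing remains.]

	struct bio_vec *bv = bio->bi_io_vec + ci->idx;
	sector_t remaining = to_sector(bv->bv_len);
	unsigned int offset = 0;

	do {
		if (offset) {
			/* crossed into the next target */
			ti = dm_table_find_target(ci->map, ci->sector);
			max = max_io_len(ci->md, ci->sector, ti);

			tio = alloc_tio(ci->md);
			tio->io = ci->io;
			tio->ti = ti;
			memset(&tio->info, 0, sizeof(tio->info));
		}

		len = min(remaining, max);

		clone = split_bvec(bio, ci->sector, ci->idx,
				   bv->bv_offset + offset, len);

		__map_bio(ti, clone, tio);

		ci->sector += len;
		ci->sector_count -= len;
		offset += to_bytes(len);
	} while (remaining -= len);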
*/ struct bio_vec *bv = bio->bi_io_vec + ci->idx; + sector_t remaining = to_sector(bv->bv_len); + unsigned int offset = 0; - clone = split_bvec(bio, ci->sector, ci->idx, - bv->bv_offset, max); - __map_bio(ti, clone, tio); + do { + if (offset) { + ti = dm_table_find_target(ci->map, ci->sector); + max = max_io_len(ci->md, ci->sector, ti); - ci->sector += max; - ci->sector_count -= max; - ti = dm_table_find_target(ci->map, ci->sector); - - len = to_sector(bv->bv_len) - max; - clone = split_bvec(bio, ci->sector, ci->idx, - bv->bv_offset + to_bytes(max), len); - tio = alloc_tio(ci->md); - tio->io = ci->io; - tio->ti = ti; - memset(&tio->info, 0, sizeof(tio->info)); - __map_bio(ti, clone, tio); + tio = alloc_tio(ci->md); + tio->io = ci->io; + tio->ti = ti; + memset(&tio->info, 0, sizeof(tio->info)); + } + + len = min(remaining, max); + + clone = split_bvec(bio, ci->sector, ci->idx, + bv->bv_offset + offset, len); + + __map_bio(ti, clone, tio); + + ci->sector += len; + ci->sector_count -= len; + offset += to_bytes(len); + } while (remaining -= len); - ci->sector += len; - ci->sector_count -= len; ci->idx++; } } -- cgit v1.2.3 From 2056a782f8e7e65fd4bfd027506b4ce1c5e9ccd4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 23 Mar 2006 20:00:26 +0100 Subject: [PATCH] Block queue IO tracing support (blktrace) as of 2006-03-23 Signed-off-by: Jens Axboe --- block/Kconfig | 12 + block/Makefile | 2 + block/blktrace.c | 538 +++++++++++++++++++++++++++++++++++++++++++ block/elevator.c | 4 + block/ioctl.c | 6 + block/ll_rw_blk.c | 44 +++- drivers/block/cciss.c | 2 + drivers/md/dm.c | 13 +- fs/bio.c | 4 + fs/compat_ioctl.c | 1 + include/linux/blkdev.h | 3 + include/linux/blktrace_api.h | 277 ++++++++++++++++++++++ include/linux/compat_ioctl.h | 4 + include/linux/fs.h | 4 + include/linux/sched.h | 1 + kernel/fork.c | 1 + mm/highmem.c | 3 + 17 files changed, 916 insertions(+), 3 deletions(-) create mode 100644 block/blktrace.c create mode 100644 include/linux/blktrace_api.h (limited to 'drivers/md') diff --git a/block/Kconfig b/block/Kconfig index 377f6dd20e17..96783645092d 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -11,4 +11,16 @@ config LBD your machine, or if you want to have a raid or loopback device bigger than 2TB. Otherwise say N. +config BLK_DEV_IO_TRACE + bool "Support for tracing block io actions" + select RELAY + select DEBUG_FS + help + Say Y here, if you want to be able to trace the block layer actions + on a given queue. Tracing allows you to see any traffic happening + on a block device queue. For more information (and the user space + support tools needed), fetch the blktrace app from: + + git://brick.kernel.dk/data/git/blktrace.git + source block/Kconfig.iosched diff --git a/block/Makefile b/block/Makefile index 7e4f93e2b44e..c05de0e0037f 100644 --- a/block/Makefile +++ b/block/Makefile @@ -8,3 +8,5 @@ obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_AS) += as-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o + +obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o diff --git a/block/blktrace.c b/block/blktrace.c new file mode 100644 index 000000000000..36f3a172275f --- /dev/null +++ b/block/blktrace.c @@ -0,0 +1,538 @@ +/* + * Copyright (C) 2006 Jens Axboe + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, }; +static unsigned int blktrace_seq __read_mostly = 1; + +/* + * Send out a notify for this process, if we haven't done so since a trace + * started + */ +static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) +{ + struct blk_io_trace *t; + + t = relay_reserve(bt->rchan, sizeof(*t) + sizeof(tsk->comm)); + if (t) { + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; + t->device = bt->dev; + t->action = BLK_TC_ACT(BLK_TC_NOTIFY); + t->pid = tsk->pid; + t->cpu = smp_processor_id(); + t->pdu_len = sizeof(tsk->comm); + memcpy((void *) t + sizeof(*t), tsk->comm, t->pdu_len); + tsk->btrace_seq = blktrace_seq; + } +} + +static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, + pid_t pid) +{ + if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) + return 1; + if (sector < bt->start_lba || sector > bt->end_lba) + return 1; + if (bt->pid && pid != bt->pid) + return 1; + + return 0; +} + +/* + * Data direction bit lookup + */ +static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; + +/* + * Bio action bits of interest + */ +static u32 bio_act[3] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC) }; + +/* + * More could be added as needed, taking care to increment the decrementer + * to get correct indexing + */ +#define trace_barrier_bit(rw) \ + (((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0)) +#define trace_sync_bit(rw) \ + (((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1)) + +/* + * The worker for the various blk_add_trace*() types. Fills out a + * blk_io_trace structure and places it in a per-cpu subbuffer. + */ +void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, + int rw, u32 what, int error, int pdu_len, void *pdu_data) +{ + struct task_struct *tsk = current; + struct blk_io_trace *t; + unsigned long flags; + unsigned long *sequence; + pid_t pid; + int cpu; + + if (unlikely(bt->trace_state != Blktrace_running)) + return; + + what |= ddir_act[rw & WRITE]; + what |= bio_act[trace_barrier_bit(rw)]; + what |= bio_act[trace_sync_bit(rw)]; + + pid = tsk->pid; + if (unlikely(act_log_check(bt, what, sector, pid))) + return; + + /* + * A word about the locking here - we disable interrupts to reserve + * some space in the relay per-cpu buffer, to prevent an irq + * from coming in and stepping on our toes. Once reserved, it's + * enough to get preemption disabled to prevent read of this data + * before we are through filling it. 
get_cpu()/put_cpu() does this + * for us + */ + local_irq_save(flags); + + if (unlikely(tsk->btrace_seq != blktrace_seq)) + trace_note_tsk(bt, tsk); + + t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); + if (t) { + cpu = smp_processor_id(); + sequence = per_cpu_ptr(bt->sequence, cpu); + + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; + t->sequence = ++(*sequence); + t->time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu); + t->sector = sector; + t->bytes = bytes; + t->action = what; + t->pid = pid; + t->device = bt->dev; + t->cpu = cpu; + t->error = error; + t->pdu_len = pdu_len; + + if (pdu_len) + memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); + } + + local_irq_restore(flags); +} + +EXPORT_SYMBOL_GPL(__blk_add_trace); + +static struct dentry *blk_tree_root; +static struct mutex blk_tree_mutex; +static unsigned int root_users; + +static inline void blk_remove_root(void) +{ + if (blk_tree_root) { + debugfs_remove(blk_tree_root); + blk_tree_root = NULL; + } +} + +static void blk_remove_tree(struct dentry *dir) +{ + mutex_lock(&blk_tree_mutex); + debugfs_remove(dir); + if (--root_users == 0) + blk_remove_root(); + mutex_unlock(&blk_tree_mutex); +} + +static struct dentry *blk_create_tree(const char *blk_name) +{ + struct dentry *dir = NULL; + + mutex_lock(&blk_tree_mutex); + + if (!blk_tree_root) { + blk_tree_root = debugfs_create_dir("block", NULL); + if (!blk_tree_root) + goto err; + } + + dir = debugfs_create_dir(blk_name, blk_tree_root); + if (dir) + root_users++; + else + blk_remove_root(); + +err: + mutex_unlock(&blk_tree_mutex); + return dir; +} + +static void blk_trace_cleanup(struct blk_trace *bt) +{ + relay_close(bt->rchan); + debugfs_remove(bt->dropped_file); + blk_remove_tree(bt->dir); + free_percpu(bt->sequence); + kfree(bt); +} + +static int blk_trace_remove(request_queue_t *q) +{ + struct blk_trace *bt; + + bt = xchg(&q->blk_trace, NULL); + if (!bt) + return -EINVAL; + + if (bt->trace_state == Blktrace_setup || + bt->trace_state == Blktrace_stopped) + blk_trace_cleanup(bt); + + return 0; +} + +static int blk_dropped_open(struct inode *inode, struct file *filp) +{ + filp->private_data = inode->u.generic_ip; + + return 0; +} + +static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + struct blk_trace *bt = filp->private_data; + char buf[16]; + + snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); + + return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); +} + +static struct file_operations blk_dropped_fops = { + .owner = THIS_MODULE, + .open = blk_dropped_open, + .read = blk_dropped_read, +}; + +/* + * Keep track of how many times we encountered a full subbuffer, to aid + * the user space app in telling how many lost events there were. 
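[Editor's note: the dropped-event counter maintained by the callback
below surfaces as a read-only "dropped" file in debugfs, created in
blk_trace_setup() further down. A minimal userspace check, assuming
debugfs is mounted at /sys/kernel/debug and the traced disk is named
"hda" (both assumptions, not part of the patch):]

	#include <stdio.h>

	int main(void)
	{
		unsigned int dropped;
		FILE *f = fopen("/sys/kernel/debug/block/hda/dropped", "r");

		if (!f)
			return 1;
		/* blk_dropped_read() formats the counter as "%u\n" */
		if (fscanf(f, "%u", &dropped) == 1)
			printf("events lost to full subbuffers: %u\n", dropped);
		fclose(f);
		return 0;
	}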
+ */ +static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, + void *prev_subbuf, size_t prev_padding) +{ + struct blk_trace *bt; + + if (!relay_buf_full(buf)) + return 1; + + bt = buf->chan->private_data; + atomic_inc(&bt->dropped); + return 0; +} + +static int blk_remove_buf_file_callback(struct dentry *dentry) +{ + debugfs_remove(dentry); + return 0; +} + +static struct dentry *blk_create_buf_file_callback(const char *filename, + struct dentry *parent, + int mode, + struct rchan_buf *buf, + int *is_global) +{ + return debugfs_create_file(filename, mode, parent, buf, + &relay_file_operations); +} + +static struct rchan_callbacks blk_relay_callbacks = { + .subbuf_start = blk_subbuf_start_callback, + .create_buf_file = blk_create_buf_file_callback, + .remove_buf_file = blk_remove_buf_file_callback, +}; + +/* + * Setup everything required to start tracing + */ +static int blk_trace_setup(request_queue_t *q, struct block_device *bdev, + char __user *arg) +{ + struct blk_user_trace_setup buts; + struct blk_trace *old_bt, *bt = NULL; + struct dentry *dir = NULL; + char b[BDEVNAME_SIZE]; + int ret, i; + + if (copy_from_user(&buts, arg, sizeof(buts))) + return -EFAULT; + + if (!buts.buf_size || !buts.buf_nr) + return -EINVAL; + + strcpy(buts.name, bdevname(bdev, b)); + + /* + * some device names have larger paths - convert the slashes + * to underscores for this to work as expected + */ + for (i = 0; i < strlen(buts.name); i++) + if (buts.name[i] == '/') + buts.name[i] = '_'; + + if (copy_to_user(arg, &buts, sizeof(buts))) + return -EFAULT; + + ret = -ENOMEM; + bt = kzalloc(sizeof(*bt), GFP_KERNEL); + if (!bt) + goto err; + + bt->sequence = alloc_percpu(unsigned long); + if (!bt->sequence) + goto err; + + ret = -ENOENT; + dir = blk_create_tree(buts.name); + if (!dir) + goto err; + + bt->dir = dir; + bt->dev = bdev->bd_dev; + atomic_set(&bt->dropped, 0); + + ret = -EIO; + bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops); + if (!bt->dropped_file) + goto err; + + bt->rchan = relay_open("trace", dir, buts.buf_size, buts.buf_nr, &blk_relay_callbacks); + if (!bt->rchan) + goto err; + bt->rchan->private_data = bt; + + bt->act_mask = buts.act_mask; + if (!bt->act_mask) + bt->act_mask = (u16) -1; + + bt->start_lba = buts.start_lba; + bt->end_lba = buts.end_lba; + if (!bt->end_lba) + bt->end_lba = -1ULL; + + bt->pid = buts.pid; + bt->trace_state = Blktrace_setup; + + ret = -EBUSY; + old_bt = xchg(&q->blk_trace, bt); + if (old_bt) { + (void) xchg(&q->blk_trace, old_bt); + goto err; + } + + return 0; +err: + if (dir) + blk_remove_tree(dir); + if (bt) { + if (bt->dropped_file) + debugfs_remove(bt->dropped_file); + if (bt->sequence) + free_percpu(bt->sequence); + if (bt->rchan) + relay_close(bt->rchan); + kfree(bt); + } + return ret; +} + +static int blk_trace_startstop(request_queue_t *q, int start) +{ + struct blk_trace *bt; + int ret; + + if ((bt = q->blk_trace) == NULL) + return -EINVAL; + + /* + * For starting a trace, we can transition from a setup or stopped + * trace. 
For stopping a trace, the state must be running + */ + ret = -EINVAL; + if (start) { + if (bt->trace_state == Blktrace_setup || + bt->trace_state == Blktrace_stopped) { + blktrace_seq++; + smp_mb(); + bt->trace_state = Blktrace_running; + ret = 0; + } + } else { + if (bt->trace_state == Blktrace_running) { + bt->trace_state = Blktrace_stopped; + relay_flush(bt->rchan); + ret = 0; + } + } + + return ret; +} + +/** + * blk_trace_ioctl: - handle the ioctls associated with tracing + * @bdev: the block device + * @cmd: the ioctl cmd + * @arg: the argument data, if any + * + **/ +int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) +{ + request_queue_t *q; + int ret, start = 0; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + mutex_lock(&bdev->bd_mutex); + + switch (cmd) { + case BLKTRACESETUP: + ret = blk_trace_setup(q, bdev, arg); + break; + case BLKTRACESTART: + start = 1; + case BLKTRACESTOP: + ret = blk_trace_startstop(q, start); + break; + case BLKTRACETEARDOWN: + ret = blk_trace_remove(q); + break; + default: + ret = -ENOTTY; + break; + } + + mutex_unlock(&bdev->bd_mutex); + return ret; +} + +/** + * blk_trace_shutdown: - stop and cleanup trace structures + * @q: the request queue associated with the device + * + **/ +void blk_trace_shutdown(request_queue_t *q) +{ + blk_trace_startstop(q, 0); + blk_trace_remove(q); +} + +/* + * Average offset over two calls to sched_clock() with a gettimeofday() + * in the middle + */ +static void blk_check_time(unsigned long long *t) +{ + unsigned long long a, b; + struct timeval tv; + + a = sched_clock(); + do_gettimeofday(&tv); + b = sched_clock(); + + *t = tv.tv_sec * 1000000000 + tv.tv_usec * 1000; + *t -= (a + b) / 2; +} + +static void blk_trace_check_cpu_time(void *data) +{ + unsigned long long *t; + int cpu = get_cpu(); + + t = &per_cpu(blk_trace_cpu_offset, cpu); + + /* + * Just call it twice, hopefully the second call will be cache hot + * and a little more precise + */ + blk_check_time(t); + blk_check_time(t); + + put_cpu(); +} + +/* + * Call blk_trace_check_cpu_time() on each CPU to calibrate our inter-CPU + * timings + */ +static void blk_trace_calibrate_offsets(void) +{ + unsigned long flags; + + smp_call_function(blk_trace_check_cpu_time, NULL, 1, 1); + local_irq_save(flags); + blk_trace_check_cpu_time(NULL); + local_irq_restore(flags); +} + +static void blk_trace_set_ht_offsets(void) +{ +#if defined(CONFIG_SCHED_SMT) + int cpu, i; + + /* + * now make sure HT siblings have the same time offset + */ + preempt_disable(); + for_each_online_cpu(cpu) { + unsigned long long *cpu_off, *sibling_off; + + for_each_cpu_mask(i, cpu_sibling_map[cpu]) { + if (i == cpu) + continue; + + cpu_off = &per_cpu(blk_trace_cpu_offset, cpu); + sibling_off = &per_cpu(blk_trace_cpu_offset, i); + *sibling_off = *cpu_off; + } + } + preempt_enable(); +#endif +} + +static __init int blk_trace_init(void) +{ + mutex_init(&blk_tree_mutex); + blk_trace_calibrate_offsets(); + blk_trace_set_ht_offsets(); + + return 0; +} + +module_init(blk_trace_init); + diff --git a/block/elevator.c b/block/elevator.c index db3d0d8296a0..5e558c4689a4 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -33,6 +33,7 @@ #include #include #include +#include #include @@ -333,6 +334,8 @@ void elv_insert(request_queue_t *q, struct request *rq, int where) struct list_head *pos; unsigned ordseq; + blk_add_trace_rq(q, rq, BLK_TA_INSERT); + rq->q = q; switch (where) { @@ -499,6 +502,7 @@ struct request *elv_next_request(request_queue_t *q) * not be passed by new 
incoming requests */ rq->flags |= REQ_STARTED; + blk_add_trace_rq(q, rq, BLK_TA_ISSUE); } if (!q->boundary_rq || q->boundary_rq == rq) { diff --git a/block/ioctl.c b/block/ioctl.c index 35fdb7dc6512..9cfa2e1ecb24 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -5,6 +5,7 @@ #include #include #include +#include #include static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg) @@ -189,6 +190,11 @@ static int blkdev_locked_ioctl(struct file *file, struct block_device *bdev, return put_ulong(arg, bdev->bd_inode->i_size >> 9); case BLKGETSIZE64: return put_u64(arg, bdev->bd_inode->i_size); + case BLKTRACESTART: + case BLKTRACESTOP: + case BLKTRACESETUP: + case BLKTRACETEARDOWN: + return blk_trace_ioctl(bdev, cmd, (char __user *) arg); } return -ENOIOCTLCMD; } diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index 6c793b196aa9..062067fa7ead 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -28,6 +28,7 @@ #include #include #include +#include /* * for max sense size @@ -1556,8 +1557,10 @@ void blk_plug_device(request_queue_t *q) if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) return; - if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) + if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) { mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); + blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG); + } } EXPORT_SYMBOL(blk_plug_device); @@ -1621,14 +1624,21 @@ static void blk_backing_dev_unplug(struct backing_dev_info *bdi, /* * devices don't necessarily have an ->unplug_fn defined */ - if (q->unplug_fn) + if (q->unplug_fn) { + blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, + q->rq.count[READ] + q->rq.count[WRITE]); + q->unplug_fn(q); + } } static void blk_unplug_work(void *data) { request_queue_t *q = data; + blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, + q->rq.count[READ] + q->rq.count[WRITE]); + q->unplug_fn(q); } @@ -1636,6 +1646,9 @@ static void blk_unplug_timeout(unsigned long data) { request_queue_t *q = (request_queue_t *)data; + blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL, + q->rq.count[READ] + q->rq.count[WRITE]); + kblockd_schedule_work(&q->unplug_work); } @@ -1753,6 +1766,9 @@ static void blk_release_queue(struct kobject *kobj) if (q->queue_tags) __blk_queue_free_tags(q); + if (q->blk_trace) + blk_trace_shutdown(q); + kmem_cache_free(requestq_cachep, q); } @@ -2129,6 +2145,8 @@ rq_starved: rq_init(q, rq); rq->rl = rl; + + blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ); out: return rq; } @@ -2157,6 +2175,8 @@ static struct request *get_request_wait(request_queue_t *q, int rw, if (!rq) { struct io_context *ioc; + blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ); + __generic_unplug_device(q); spin_unlock_irq(q->queue_lock); io_schedule(); @@ -2210,6 +2230,8 @@ EXPORT_SYMBOL(blk_get_request); */ void blk_requeue_request(request_queue_t *q, struct request *rq) { + blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); + if (blk_rq_tagged(rq)) blk_queue_end_tag(q, rq); @@ -2844,6 +2866,8 @@ static int __make_request(request_queue_t *q, struct bio *bio) if (!q->back_merge_fn(q, req, bio)) break; + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); + req->biotail->bi_next = bio; req->biotail = bio; req->nr_sectors = req->hard_nr_sectors += nr_sectors; @@ -2859,6 +2883,8 @@ static int __make_request(request_queue_t *q, struct bio *bio) if (!q->front_merge_fn(q, req, bio)) break; + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); + bio->bi_next = req->bio; req->bio = bio; @@ -2976,6 +3002,7 @@ void generic_make_request(struct bio *bio) 
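[Editor's note: with the blk_trace_ioctl() dispatch wired into
block/ioctl.c above, userspace drives tracing roughly as sketched
here. The buffer sizes are illustrative, and act_mask and pid left at
zero mean "trace everything", per blk_trace_setup(). This sketch is
illustrative only, not the blktrace tool itself:]

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>		/* BLKTRACE* ioctl numbers */
	#include <linux/blktrace_api.h>	/* struct blk_user_trace_setup */

	int start_tracing(const char *blkdev)
	{
		struct blk_user_trace_setup buts;
		int fd = open(blkdev, O_RDONLY);

		if (fd < 0)
			return -1;

		memset(&buts, 0, sizeof(buts));
		buts.buf_size = 512 * 1024;	/* relay sub-buffer size */
		buts.buf_nr = 4;		/* number of sub-buffers */

		if (ioctl(fd, BLKTRACESETUP, &buts) < 0 ||
		    ioctl(fd, BLKTRACESTART) < 0) {
			close(fd);
			return -1;
		}
		/* per-cpu data now appears in debugfs under
		 * block/<buts.name>/trace<cpu> via relay */
		return fd;
	}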
request_queue_t *q; sector_t maxsector; int ret, nr_sectors = bio_sectors(bio); + dev_t old_dev; might_sleep(); /* Test device or partition size, when known. */ @@ -3002,6 +3029,8 @@ void generic_make_request(struct bio *bio) * NOTE: we don't repeat the blk_size check for each new device. * Stacking drivers are expected to know what they are doing. */ + maxsector = -1; + old_dev = 0; do { char b[BDEVNAME_SIZE]; @@ -3034,6 +3063,15 @@ end_io: */ blk_partition_remap(bio); + if (maxsector != -1) + blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, + maxsector); + + blk_add_trace_bio(q, bio, BLK_TA_QUEUE); + + maxsector = bio->bi_sector; + old_dev = bio->bi_bdev->bd_dev; + ret = q->make_request_fn(q, bio); } while (ret); } @@ -3153,6 +3191,8 @@ static int __end_that_request_first(struct request *req, int uptodate, int total_bytes, bio_nbytes, error, next_idx = 0; struct bio *bio; + blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE); + /* * extend uptodate bool to allow < 0 value to be direct io error */ diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index e29b8926f80e..1f2890989b56 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -2331,6 +2332,7 @@ static inline void complete_command( ctlr_info_t *h, CommandList_struct *cmd, cmd->rq->completion_data = cmd; cmd->rq->errors = status; + blk_add_trace_rq(cmd->rq->q, cmd->rq, BLK_TA_COMPLETE); blk_complete_request(cmd->rq); } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 26b08ee425c7..8c82373f7ff3 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -17,6 +17,7 @@ #include #include #include +#include static const char *_name = DM_NAME; @@ -334,6 +335,8 @@ static void dec_pending(struct dm_io *io, int error) /* nudge anyone waiting on suspend queue */ wake_up(&io->md->wait); + blk_add_trace_bio(io->md->queue, io->bio, BLK_TA_COMPLETE); + bio_endio(io->bio, io->bio->bi_size, io->error); free_io(io->md, io); } @@ -392,6 +395,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, struct target_io *tio) { int r; + sector_t sector; /* * Sanity checks. @@ -407,10 +411,17 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, * this io. 
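[Editor's note: reassembled from the hunk below, the traced mapping
path in __map_bio() reads as follows. clone->bi_sector has to be saved
before the target's map function runs, because a successful map
rewrites it; the remap event then records the old device and sector
alongside the new ones.]

	sector = clone->bi_sector;
	r = ti->type->map(ti, clone, &tio->info);
	if (r > 0) {
		/* the bio has been remapped so dispatch it */
		blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
				    tio->io->bio->bi_bdev->bd_dev, sector,
				    clone->bi_sector);
		generic_make_request(clone);
	}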
*/ atomic_inc(&tio->io->io_count); + sector = clone->bi_sector; r = ti->type->map(ti, clone, &tio->info); - if (r > 0) + if (r > 0) { /* the bio has been remapped so dispatch it */ + + blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, + tio->io->bio->bi_bdev->bd_dev, sector, + clone->bi_sector); + generic_make_request(clone); + } else if (r < 0) { /* error the io and bail out */ diff --git a/fs/bio.c b/fs/bio.c index 8f1d2e815c96..0a8c59cb68f5 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -25,6 +25,7 @@ #include #include #include +#include #include /* for struct sg_iovec */ #define BIO_POOL_SIZE 256 @@ -1095,6 +1096,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors) if (!bp) return bp; + blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi, + bi->bi_sector + first_sectors); + BUG_ON(bi->bi_vcnt != 1); BUG_ON(bi->bi_idx != 0); atomic_set(&bp->cnt, 3); diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index c666769a875d..7c031f00fd79 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -72,6 +72,7 @@ #include #include #include +#include #include /* siocdevprivate_ioctl */ #include diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 56bb6a4e15f3..c179966f1a2f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -22,6 +22,7 @@ typedef struct request_queue request_queue_t; struct elevator_queue; typedef struct elevator_queue elevator_t; struct request_pm_state; +struct blk_trace; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ @@ -416,6 +417,8 @@ struct request_queue unsigned int sg_reserved_size; int node; + struct blk_trace *blk_trace; + /* * reserved for flush operations */ diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h new file mode 100644 index 000000000000..b34d3e73d5ea --- /dev/null +++ b/include/linux/blktrace_api.h @@ -0,0 +1,277 @@ +#ifndef BLKTRACE_H +#define BLKTRACE_H + +#include +#include +#include + +/* + * Trace categories + */ +enum blktrace_cat { + BLK_TC_READ = 1 << 0, /* reads */ + BLK_TC_WRITE = 1 << 1, /* writes */ + BLK_TC_BARRIER = 1 << 2, /* barrier */ + BLK_TC_SYNC = 1 << 3, /* barrier */ + BLK_TC_QUEUE = 1 << 4, /* queueing/merging */ + BLK_TC_REQUEUE = 1 << 5, /* requeueing */ + BLK_TC_ISSUE = 1 << 6, /* issue */ + BLK_TC_COMPLETE = 1 << 7, /* completions */ + BLK_TC_FS = 1 << 8, /* fs requests */ + BLK_TC_PC = 1 << 9, /* pc requests */ + BLK_TC_NOTIFY = 1 << 10, /* special message */ + + BLK_TC_END = 1 << 15, /* only 16-bits, reminder */ +}; + +#define BLK_TC_SHIFT (16) +#define BLK_TC_ACT(act) ((act) << BLK_TC_SHIFT) + +/* + * Basic trace actions + */ +enum blktrace_act { + __BLK_TA_QUEUE = 1, /* queued */ + __BLK_TA_BACKMERGE, /* back merged to existing rq */ + __BLK_TA_FRONTMERGE, /* front merge to existing rq */ + __BLK_TA_GETRQ, /* allocated new request */ + __BLK_TA_SLEEPRQ, /* sleeping on rq allocation */ + __BLK_TA_REQUEUE, /* request requeued */ + __BLK_TA_ISSUE, /* sent to driver */ + __BLK_TA_COMPLETE, /* completed by driver */ + __BLK_TA_PLUG, /* queue was plugged */ + __BLK_TA_UNPLUG_IO, /* queue was unplugged by io */ + __BLK_TA_UNPLUG_TIMER, /* queue was unplugged by timer */ + __BLK_TA_INSERT, /* insert request */ + __BLK_TA_SPLIT, /* bio was split */ + __BLK_TA_BOUNCE, /* bio was bounced */ + __BLK_TA_REMAP, /* bio was remapped */ +}; + +/* + * Trace actions in full. 
Additionally, read or write is masked + */ +#define BLK_TA_QUEUE (__BLK_TA_QUEUE | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_BACKMERGE (__BLK_TA_BACKMERGE | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_FRONTMERGE (__BLK_TA_FRONTMERGE | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_GETRQ (__BLK_TA_GETRQ | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_SLEEPRQ (__BLK_TA_SLEEPRQ | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_REQUEUE (__BLK_TA_REQUEUE | BLK_TC_ACT(BLK_TC_REQUEUE)) +#define BLK_TA_ISSUE (__BLK_TA_ISSUE | BLK_TC_ACT(BLK_TC_ISSUE)) +#define BLK_TA_COMPLETE (__BLK_TA_COMPLETE| BLK_TC_ACT(BLK_TC_COMPLETE)) +#define BLK_TA_PLUG (__BLK_TA_PLUG | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_UNPLUG_IO (__BLK_TA_UNPLUG_IO | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_UNPLUG_TIMER (__BLK_TA_UNPLUG_TIMER | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_INSERT (__BLK_TA_INSERT | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_SPLIT (__BLK_TA_SPLIT) +#define BLK_TA_BOUNCE (__BLK_TA_BOUNCE) +#define BLK_TA_REMAP (__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE)) + +#define BLK_IO_TRACE_MAGIC 0x65617400 +#define BLK_IO_TRACE_VERSION 0x07 + +/* + * The trace itself + */ +struct blk_io_trace { + u32 magic; /* MAGIC << 8 | version */ + u32 sequence; /* event number */ + u64 time; /* in microseconds */ + u64 sector; /* disk offset */ + u32 bytes; /* transfer length */ + u32 action; /* what happened */ + u32 pid; /* who did it */ + u32 device; /* device number */ + u32 cpu; /* on what cpu did it happen */ + u16 error; /* completion error */ + u16 pdu_len; /* length of data after this trace */ +}; + +/* + * The remap event + */ +struct blk_io_trace_remap { + u32 device; + u32 __pad; + u64 sector; +}; + +enum { + Blktrace_setup = 1, + Blktrace_running, + Blktrace_stopped, +}; + +struct blk_trace { + int trace_state; + struct rchan *rchan; + unsigned long *sequence; + u16 act_mask; + u64 start_lba; + u64 end_lba; + u32 pid; + u32 dev; + struct dentry *dir; + struct dentry *dropped_file; + atomic_t dropped; +}; + +/* + * User setup structure passed with BLKTRACESTART + */ +struct blk_user_trace_setup { + char name[BDEVNAME_SIZE]; /* output */ + u16 act_mask; /* input */ + u32 buf_size; /* input */ + u32 buf_nr; /* input */ + u64 start_lba; + u64 end_lba; + u32 pid; +}; + +#if defined(CONFIG_BLK_DEV_IO_TRACE) +extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *); +extern void blk_trace_shutdown(request_queue_t *); +extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *); + +/** + * blk_add_trace_rq - Add a trace for a request oriented action + * @q: queue the io is for + * @rq: the source request + * @what: the action + * + * Description: + * Records an action against a request. Will log the bio offset + size. + * + **/ +static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq, + u32 what) +{ + struct blk_trace *bt = q->blk_trace; + int rw = rq->flags & 0x07; + + if (likely(!bt)) + return; + + if (blk_pc_request(rq)) { + what |= BLK_TC_ACT(BLK_TC_PC); + __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd); + } else { + what |= BLK_TC_ACT(BLK_TC_FS); + __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL); + } +} + +/** + * blk_add_trace_bio - Add a trace for a bio oriented action + * @q: queue the io is for + * @bio: the source bio + * @what: the action + * + * Description: + * Records an action against a bio. Will log the bio offset + size. 
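[Editor's note: every helper in this header opens with the same guard,
so a queue that is not being traced pays only one predicted-false
pointer test per hook:]

	struct blk_trace *bt = q->blk_trace;

	if (likely(!bt))
		return;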
+ * + **/ +static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio, + u32 what) +{ + struct blk_trace *bt = q->blk_trace; + + if (likely(!bt)) + return; + + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL); +} + +/** + * blk_add_trace_generic - Add a trace for a generic action + * @q: queue the io is for + * @bio: the source bio + * @rw: the data direction + * @what: the action + * + * Description: + * Records a simple trace + * + **/ +static inline void blk_add_trace_generic(struct request_queue *q, + struct bio *bio, int rw, u32 what) +{ + struct blk_trace *bt = q->blk_trace; + + if (likely(!bt)) + return; + + if (bio) + blk_add_trace_bio(q, bio, what); + else + __blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL); +} + +/** + * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload + * @q: queue the io is for + * @what: the action + * @bio: the source bio + * @pdu: the integer payload + * + * Description: + * Adds a trace with some integer payload. This might be an unplug + * option given as the action, with the depth at unplug time given + * as the payload + * + **/ +static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what, + struct bio *bio, unsigned int pdu) +{ + struct blk_trace *bt = q->blk_trace; + u64 rpdu = cpu_to_be64(pdu); + + if (likely(!bt)) + return; + + if (bio) + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu); + else + __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); +} + +/** + * blk_add_trace_remap - Add a trace for a remap operation + * @q: queue the io is for + * @bio: the source bio + * @dev: target device + * @from: source sector + * @to: target sector + * + * Description: + * Device mapper or raid target sometimes need to split a bio because + * it spans a stripe (or similar). Add a trace for that action. 
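[Editor's note: the remap payload is emitted big-endian
(cpu_to_be32()/cpu_to_be64() in the function body below), so consumers
must byte-swap it. A minimal userspace decode, assuming glibc's
<endian.h>; the struct simply mirrors blk_io_trace_remap from this
header:]

	#include <endian.h>
	#include <stdint.h>

	struct remap_pdu {
		uint32_t device;
		uint32_t __pad;
		uint64_t sector;
	};

	static void decode_remap(const struct remap_pdu *r,
				 uint32_t *dev, uint64_t *to)
	{
		*dev = be32toh(r->device);	/* target device number */
		*to = be64toh(r->sector);	/* target sector */
	}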
+ * + **/ +static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio, + dev_t dev, sector_t from, sector_t to) +{ + struct blk_trace *bt = q->blk_trace; + struct blk_io_trace_remap r; + + if (likely(!bt)) + return; + + r.device = cpu_to_be32(dev); + r.sector = cpu_to_be64(to); + + __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); +} + +#else /* !CONFIG_BLK_DEV_IO_TRACE */ +#define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) +#define blk_trace_shutdown(q) do { } while (0) +#define blk_add_trace_rq(q, rq, what) do { } while (0) +#define blk_add_trace_bio(q, rq, what) do { } while (0) +#define blk_add_trace_generic(q, rq, rw, what) do { } while (0) +#define blk_add_trace_pdu_int(q, what, bio, pdu) do { } while (0) +#define blk_add_trace_remap(q, bio, dev, f, t) do {} while (0) +#endif /* CONFIG_BLK_DEV_IO_TRACE */ + +#endif diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h index ae7dfb790df3..efb518f16bb3 100644 --- a/include/linux/compat_ioctl.h +++ b/include/linux/compat_ioctl.h @@ -97,6 +97,10 @@ COMPATIBLE_IOCTL(BLKRRPART) COMPATIBLE_IOCTL(BLKFLSBUF) COMPATIBLE_IOCTL(BLKSECTSET) COMPATIBLE_IOCTL(BLKSSZGET) +COMPATIBLE_IOCTL(BLKTRACESTART) +COMPATIBLE_IOCTL(BLKTRACESTOP) +COMPATIBLE_IOCTL(BLKTRACESETUP) +COMPATIBLE_IOCTL(BLKTRACETEARDOWN) ULONG_IOCTL(BLKRASET) ULONG_IOCTL(BLKFRASET) /* RAID */ diff --git a/include/linux/fs.h b/include/linux/fs.h index f9c9dea636d0..9b34a1b03455 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -197,6 +197,10 @@ extern int dir_notify_enable; #define BLKBSZGET _IOR(0x12,112,size_t) #define BLKBSZSET _IOW(0x12,113,size_t) #define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */ +#define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup) +#define BLKTRACESTART _IO(0x12,116) +#define BLKTRACESTOP _IO(0x12,117) +#define BLKTRACETEARDOWN _IO(0x12,118) #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 62e6314382f0..e60a91d5b369 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -706,6 +706,7 @@ struct task_struct { prio_array_t *array; unsigned short ioprio; + unsigned int btrace_seq; unsigned long sleep_avg; unsigned long long timestamp, last_ran; diff --git a/kernel/fork.c b/kernel/fork.c index c79ae0b19a49..c21bae8c93b9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -181,6 +181,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); atomic_set(&tsk->fs_excl, 0); + tsk->btrace_seq = 0; return tsk; } diff --git a/mm/highmem.c b/mm/highmem.c index ce2e7e8bbfa7..d0ea1eec6a9a 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -26,6 +26,7 @@ #include #include #include +#include #include static mempool_t *page_pool, *isa_page_pool; @@ -483,6 +484,8 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig) pool = isa_page_pool; } + blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); + /* * slow path */ -- cgit v1.2.3 From 5daf2cf19a7d03843d8d2d1b4aaf5260479cc5a7 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Fri, 24 Mar 2006 18:35:26 +0100 Subject: BUG_ON() Conversion in md/bitmap.c this changes if() BUG(); constructs to BUG_ON() which is cleaner and can better optimized away Signed-off-by: Eric Sesterhenn Signed-off-by: Adrian Bunk --- 
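[Editor's note: the conversion applied by this and the following
BUG_ON() patches is mechanical; using the bitmap.c hunk below as the
example:]

	/* before */
	if ((*bmc & COUNTER_MAX) == COUNTER_MAX) BUG();

	/* after */
	BUG_ON((*bmc & COUNTER_MAX) == COUNTER_MAX);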
drivers/md/bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index eae4473eadde..979e8cae5d83 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1309,7 +1309,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect case 1: *bmc = 2; } - if ((*bmc & COUNTER_MAX) == COUNTER_MAX) BUG(); + BUG_ON((*bmc & COUNTER_MAX) == COUNTER_MAX); (*bmc)++; spin_unlock_irq(&bitmap->lock); -- cgit v1.2.3 From 4401d138993c652dc7463207fa3afde71b7ae509 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Fri, 24 Mar 2006 18:36:27 +0100 Subject: BUG_ON() Conversion in md/dm-hw-handler.c this changes if() BUG(); constructs to BUG_ON() which is cleaner and can better optimized away Signed-off-by: Eric Sesterhenn Signed-off-by: Adrian Bunk --- drivers/md/dm-hw-handler.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-hw-handler.c b/drivers/md/dm-hw-handler.c index 4cc0010e0156..baafaaba4d4b 100644 --- a/drivers/md/dm-hw-handler.c +++ b/drivers/md/dm-hw-handler.c @@ -83,8 +83,7 @@ void dm_put_hw_handler(struct hw_handler_type *hwht) if (--hwhi->use == 0) module_put(hwhi->hwht.module); - if (hwhi->use < 0) - BUG(); + BUG_ON(hwhi->use < 0); out: up_read(&_hwh_lock); -- cgit v1.2.3 From 7e3176555003a45318010d9820eb5ad1abb596bf Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sat, 25 Mar 2006 03:07:51 -0800 Subject: [PATCH] md/bitmap.c:bitmap_mask_state(): fix inconsequent NULL checking We dereference bitmap both one line above and one line below this check rendering this check quite useless. Spotted by the Coverity checker. Signed-off-by: Adrian Bunk Cc: Alasdair G Kergon Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index eae4473eadde..670eb01a5a23 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -556,7 +556,7 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, unsigned long flags; spin_lock_irqsave(&bitmap->lock, flags); - if (!bitmap || !bitmap->sb_page) { /* can't set the state */ + if (!bitmap->sb_page) { /* can't set the state */ spin_unlock_irqrestore(&bitmap->lock, flags); return; } -- cgit v1.2.3 From c163c7293eb68bf6c0c824d122a2192b9f129193 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Sun, 26 Mar 2006 18:21:58 +0200 Subject: BUG_ON() Conversion in md/dm-path-selector.c this changes if() BUG(); constructs to BUG_ON() which is cleaner and can better optimized away Signed-off-by: Eric Sesterhenn Signed-off-by: Adrian Bunk --- drivers/md/dm-path-selector.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-path-selector.c b/drivers/md/dm-path-selector.c index a28c1c2b4ef5..f10a0c89b3f4 100644 --- a/drivers/md/dm-path-selector.c +++ b/drivers/md/dm-path-selector.c @@ -86,8 +86,7 @@ void dm_put_path_selector(struct path_selector_type *pst) if (--psi->use == 0) module_put(psi->pst.module); - if (psi->use < 0) - BUG(); + BUG_ON(psi->use < 0); out: up_read(&_ps_lock); -- cgit v1.2.3 From 547bc92649345af6014578a64b27cc5787617935 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Sun, 26 Mar 2006 18:22:50 +0200 Subject: BUG_ON() Conversion in md/dm-table.c this changes if() BUG(); constructs to BUG_ON() which is cleaner and can better 
optimized away Signed-off-by: Eric Sesterhenn Signed-off-by: Adrian Bunk --- drivers/md/dm-table.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 9b1e2f5ca630..907b08ddb783 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -352,8 +352,7 @@ static int open_dev(struct dm_dev *d, dev_t dev) int r; - if (d->bdev) - BUG(); + BUG_ON(d->bdev); bdev = open_by_devnum(dev, d->mode); if (IS_ERR(bdev)) @@ -427,8 +426,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, struct dm_dev *dd; unsigned int major, minor; - if (!t) - BUG(); + BUG_ON(!t); if (sscanf(path, "%u:%u", &major, &minor) == 2) { /* Extract the major/minor numbers */ -- cgit v1.2.3 From 14cc3e2b633bb64063698980974df4535368e98f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 26 Mar 2006 01:37:14 -0800 Subject: [PATCH] sem2mutex: misc static one-file mutexes Semaphore to mutex conversion. The conversion was generated via scripts, and the result was validated automatically via a script as well. Signed-off-by: Ingo Molnar Cc: Dave Jones Cc: Paul Mackerras Cc: Ralf Baechle Cc: Jens Axboe Cc: Neil Brown Acked-by: Alasdair G Kergon Cc: Greg KH Cc: Dominik Brodowski Cc: Adam Belay Cc: Martin Schwidefsky Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm26/kernel/traps.c | 12 +++--- arch/i386/kernel/cpu/cpufreq/powernow-k8.c | 9 ++-- arch/i386/kernel/cpu/mtrr/main.c | 13 +++--- arch/i386/kernel/microcode.c | 7 ++-- arch/mips/lasat/sysctl.c | 63 ++++++++++++++-------------- arch/powerpc/mm/imalloc.c | 18 ++++---- arch/powerpc/platforms/cell/spu_base.c | 22 +++++----- arch/powerpc/platforms/powermac/cpufreq_64.c | 7 ++-- drivers/macintosh/smu.c | 9 ++-- drivers/md/kcopyd.c | 16 +++---- drivers/net/wan/dscc4.c | 7 ++-- drivers/parport/share.c | 19 +++++---- drivers/pci/hotplug/rpadlpar_core.c | 12 +++--- drivers/pci/hotplug/sgi_hotplug.c | 19 +++++---- drivers/pnp/isapnp/core.c | 7 ++-- drivers/s390/char/raw3270.c | 39 ++++++++--------- drivers/telephony/phonedev.c | 21 +++++----- net/ipv4/ipvs/ip_vs_ctl.c | 11 ++--- net/ipv4/netfilter/ipt_hashlimit.c | 9 ++-- net/netlink/genetlink.c | 9 ++-- 20 files changed, 172 insertions(+), 157 deletions(-) (limited to 'drivers/md') diff --git a/arch/arm26/kernel/traps.c b/arch/arm26/kernel/traps.c index 5847ea5d7747..a79de041b50e 100644 --- a/arch/arm26/kernel/traps.c +++ b/arch/arm26/kernel/traps.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include "ptrace.h" @@ -207,19 +207,19 @@ void die_if_kernel(const char *str, struct pt_regs *regs, int err) die(str, regs, err); } -static DECLARE_MUTEX(undef_sem); +static DEFINE_MUTEX(undef_mutex); static int (*undef_hook)(struct pt_regs *); int request_undef_hook(int (*fn)(struct pt_regs *)) { int ret = -EBUSY; - down(&undef_sem); + mutex_lock(&undef_mutex); if (undef_hook == NULL) { undef_hook = fn; ret = 0; } - up(&undef_sem); + mutex_unlock(&undef_mutex); return ret; } @@ -228,12 +228,12 @@ int release_undef_hook(int (*fn)(struct pt_regs *)) { int ret = -EINVAL; - down(&undef_sem); + mutex_lock(&undef_mutex); if (undef_hook == fn) { undef_hook = NULL; ret = 0; } - up(&undef_sem); + mutex_unlock(&undef_mutex); return ret; } diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c index e5bc06480ff9..1e70823e1cb5 100644 --- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c +++ 
b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c @@ -40,6 +40,7 @@ #ifdef CONFIG_X86_POWERNOW_K8_ACPI #include +#include #include #endif @@ -49,7 +50,7 @@ #include "powernow-k8.h" /* serialize freq changes */ -static DECLARE_MUTEX(fidvid_sem); +static DEFINE_MUTEX(fidvid_mutex); static struct powernow_k8_data *powernow_data[NR_CPUS]; @@ -943,17 +944,17 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi if (cpufreq_frequency_table_target(pol, data->powernow_table, targfreq, relation, &newstate)) goto err_out; - down(&fidvid_sem); + mutex_lock(&fidvid_mutex); powernow_k8_acpi_pst_values(data, newstate); if (transition_frequency(data, newstate)) { printk(KERN_ERR PFX "transition frequency failed\n"); ret = 1; - up(&fidvid_sem); + mutex_unlock(&fidvid_mutex); goto err_out; } - up(&fidvid_sem); + mutex_unlock(&fidvid_mutex); pol->cur = find_khz_freq_from_fid(data->currfid); ret = 0; diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index 3b4618bed70d..fff90bda4733 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -36,6 +36,7 @@ #include #include #include +#include #include @@ -47,7 +48,7 @@ u32 num_var_ranges = 0; unsigned int *usage_table; -static DECLARE_MUTEX(mtrr_sem); +static DEFINE_MUTEX(mtrr_mutex); u32 size_or_mask, size_and_mask; @@ -333,7 +334,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, /* No CPU hotplug when we change MTRR entries */ lock_cpu_hotplug(); /* Search for existing MTRR */ - down(&mtrr_sem); + mutex_lock(&mtrr_mutex); for (i = 0; i < num_var_ranges; ++i) { mtrr_if->get(i, &lbase, &lsize, <ype); if (base >= lbase + lsize) @@ -371,7 +372,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, printk(KERN_INFO "mtrr: no more MTRRs available\n"); error = i; out: - up(&mtrr_sem); + mutex_unlock(&mtrr_mutex); unlock_cpu_hotplug(); return error; } @@ -464,7 +465,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) max = num_var_ranges; /* No CPU hotplug when we change MTRR entries */ lock_cpu_hotplug(); - down(&mtrr_sem); + mutex_lock(&mtrr_mutex); if (reg < 0) { /* Search for existing MTRR */ for (i = 0; i < max; ++i) { @@ -503,7 +504,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) set_mtrr(reg, 0, 0, 0); error = reg; out: - up(&mtrr_sem); + mutex_unlock(&mtrr_mutex); unlock_cpu_hotplug(); return error; } @@ -685,7 +686,7 @@ void mtrr_ap_init(void) if (!mtrr_if || !use_intel()) return; /* - * Ideally we should hold mtrr_sem here to avoid mtrr entries changed, + * Ideally we should hold mtrr_mutex here to avoid mtrr entries changed, * but this routine will be called in cpu boot time, holding the lock * breaks it. 
This routine is called in two cases: 1.very earily time * of software resume, when there absolutely isn't mtrr entry changes; diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c index 55bc365b8753..dd780a00553f 100644 --- a/arch/i386/kernel/microcode.c +++ b/arch/i386/kernel/microcode.c @@ -81,6 +81,7 @@ #include #include #include +#include #include #include @@ -114,7 +115,7 @@ MODULE_LICENSE("GPL"); static DEFINE_SPINLOCK(microcode_update_lock); /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ -static DECLARE_MUTEX(microcode_sem); +static DEFINE_MUTEX(microcode_mutex); static void __user *user_buffer; /* user area microcode data buffer */ static unsigned int user_buffer_size; /* it's size */ @@ -444,7 +445,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_ return -EINVAL; } - down(µcode_sem); + mutex_lock(µcode_mutex); user_buffer = (void __user *) buf; user_buffer_size = (int) len; @@ -453,7 +454,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_ if (!ret) ret = (ssize_t)len; - up(µcode_sem); + mutex_unlock(µcode_mutex); return ret; } diff --git a/arch/mips/lasat/sysctl.c b/arch/mips/lasat/sysctl.c index 8ff43a1c1e99..e3d5aaa90f0d 100644 --- a/arch/mips/lasat/sysctl.c +++ b/arch/mips/lasat/sysctl.c @@ -30,12 +30,13 @@ #include #include #include +#include #include #include "sysctl.h" #include "ds1603.h" -static DECLARE_MUTEX(lasat_info_sem); +static DEFINE_MUTEX(lasat_info_mutex); /* Strategy function to write EEPROM after changing string entry */ int sysctl_lasatstring(ctl_table *table, int *name, int nlen, @@ -43,17 +44,17 @@ int sysctl_lasatstring(ctl_table *table, int *name, int nlen, void *newval, size_t newlen, void **context) { int r; - down(&lasat_info_sem); + mutex_lock(&lasat_info_mutex); r = sysctl_string(table, name, nlen, oldval, oldlenp, newval, newlen, context); if (r < 0) { - up(&lasat_info_sem); + mutex_unlock(&lasat_info_mutex); return r; } if (newval && newlen) { lasat_write_eeprom_info(); } - up(&lasat_info_sem); + mutex_unlock(&lasat_info_mutex); return 1; } @@ -63,14 +64,14 @@ int proc_dolasatstring(ctl_table *table, int write, struct file *filp, void *buffer, size_t *lenp, loff_t *ppos) { int r; - down(&lasat_info_sem); + mutex_lock(&lasat_info_mutex); r = proc_dostring(table, write, filp, buffer, lenp, ppos); if ( (!write) || r) { - up(&lasat_info_sem); + mutex_unlock(&lasat_info_mutex); return r; } lasat_write_eeprom_info(); - up(&lasat_info_sem); + mutex_unlock(&lasat_info_mutex); return 0; } @@ -79,14 +80,14 @@ int proc_dolasatint(ctl_table *table, int write, struct file *filp, void *buffer, size_t *lenp, loff_t *ppos) { int r; - down(&lasat_info_sem); + mutex_lock(&lasat_info_mutex); r = proc_dointvec(table, write, filp, buffer, lenp, ppos); if ( (!write) || r) { - up(&lasat_info_sem); + mutex_unlock(&lasat_info_mutex); return r; } lasat_write_eeprom_info(); - up(&lasat_info_sem); + mutex_unlock(&lasat_info_mutex); return 0; } @@ -98,7 +99,7 @@ int proc_dolasatrtc(ctl_table *table, int write, struct file *filp, void *buffer, size_t *lenp, loff_t *ppos) { int r; - down(&lasat_info_sem); + mutex_lock(&lasat_info_mutex); if (!write) { rtctmp = ds1603_read(); /* check for time < 0 and set to 0 */ @@ -107,11 +108,11 @@ int proc_dolasatrtc(ctl_table *table, int write, struct file *filp, } r = proc_dointvec(table, write, filp, buffer, lenp, ppos); if ( (!write) || r) { - up(&lasat_info_sem); + mutex_unlock(&lasat_info_mutex); return r; } ds1603_set(rtctmp); - 
up(&lasat_info_sem); + mutex_unlock(&lasat_info_mutex); return 0; } #endif @@ -122,16 +123,16 @@ int sysctl_lasat_intvec(ctl_table *table, int *name, int nlen, void *newval, size_t newlen, void **context) { int r; - down(&lasat_info_sem); + mutex_lock(&lasat_info_mutex); r = sysctl_intvec(table, name, nlen, oldval, oldlenp, newval, newlen, context); if (r < 0) { - up(&lasat_info_sem); + mutex_unlock(&lasat_info_mutex); return r; } if (newval && newlen) { lasat_write_eeprom_info(); } - up(&lasat_info_sem); + mutex_unlock(&lasat_info_mutex); return 1; } @@ -142,19 +143,19 @@ int sysctl_lasat_rtc(ctl_table *table, int *name, int nlen, void *newval, size_t newlen, void **context) { int r; - down(&lasat_info_sem); + mutex_lock(&lasat_info_mutex); rtctmp = ds1603_read(); if (rtctmp < 0) rtctmp = 0; r = sysctl_intvec(table, name, nlen, oldval, oldlenp, newval, newlen, context); if (r < 0) { - up(&lasat_info_sem); + mutex_unlock(&lasat_info_mutex); return r; } if (newval && newlen) { ds1603_set(rtctmp); } - up(&lasat_info_sem); + mutex_unlock(&lasat_info_mutex); return 1; } #endif @@ -192,13 +193,13 @@ int proc_lasat_ip(ctl_table *table, int write, struct file *filp, return 0; } - down(&lasat_info_sem); + mutex_lock(&lasat_info_mutex); if (write) { len = 0; p = buffer; while (len < *lenp) { if(get_user(c, p++)) { - up(&lasat_info_sem); + mutex_unlock(&lasat_info_mutex); return -EFAULT; } if (c == 0 || c == '\n') @@ -209,7 +210,7 @@ int proc_lasat_ip(ctl_table *table, int write, struct file *filp, len = sizeof(proc_lasat_ipbuf) - 1; if (copy_from_user(proc_lasat_ipbuf, buffer, len)) { - up(&lasat_info_sem); + mutex_unlock(&lasat_info_mutex); return -EFAULT; } proc_lasat_ipbuf[len] = 0; @@ -230,12 +231,12 @@ int proc_lasat_ip(ctl_table *table, int write, struct file *filp, len = *lenp; if (len) if(copy_to_user(buffer, proc_lasat_ipbuf, len)) { - up(&lasat_info_sem); + mutex_unlock(&lasat_info_mutex); return -EFAULT; } if (len < *lenp) { if(put_user('\n', ((char *) buffer) + len)) { - up(&lasat_info_sem); + mutex_unlock(&lasat_info_mutex); return -EFAULT; } len++; @@ -244,7 +245,7 @@ int proc_lasat_ip(ctl_table *table, int write, struct file *filp, *ppos += len; } update_bcastaddr(); - up(&lasat_info_sem); + mutex_unlock(&lasat_info_mutex); return 0; } #endif /* defined(CONFIG_INET) */ @@ -256,10 +257,10 @@ static int sysctl_lasat_eeprom_value(ctl_table *table, int *name, int nlen, { int r; - down(&lasat_info_sem); + mutex_lock(&lasat_info_mutex); r = sysctl_intvec(table, name, nlen, oldval, oldlenp, newval, newlen, context); if (r < 0) { - up(&lasat_info_sem); + mutex_unlock(&lasat_info_mutex); return r; } @@ -271,7 +272,7 @@ static int sysctl_lasat_eeprom_value(ctl_table *table, int *name, int nlen, lasat_write_eeprom_info(); lasat_init_board_info(); } - up(&lasat_info_sem); + mutex_unlock(&lasat_info_mutex); return 0; } @@ -280,10 +281,10 @@ int proc_lasat_eeprom_value(ctl_table *table, int write, struct file *filp, void *buffer, size_t *lenp, loff_t *ppos) { int r; - down(&lasat_info_sem); + mutex_lock(&lasat_info_mutex); r = proc_dointvec(table, write, filp, buffer, lenp, ppos); if ( (!write) || r) { - up(&lasat_info_sem); + mutex_unlock(&lasat_info_mutex); return r; } if (filp && filp->f_dentry) @@ -294,7 +295,7 @@ int proc_lasat_eeprom_value(ctl_table *table, int write, struct file *filp, lasat_board_info.li_eeprom_info.debugaccess = lasat_board_info.li_debugaccess; } lasat_write_eeprom_info(); - up(&lasat_info_sem); + mutex_unlock(&lasat_info_mutex); return 0; } diff --git 
a/arch/powerpc/mm/imalloc.c b/arch/powerpc/mm/imalloc.c index 8b0c132bc163..add8c1a9af68 100644 --- a/arch/powerpc/mm/imalloc.c +++ b/arch/powerpc/mm/imalloc.c @@ -13,12 +13,12 @@ #include #include #include -#include +#include #include #include "mmu_decl.h" -static DECLARE_MUTEX(imlist_sem); +static DEFINE_MUTEX(imlist_mutex); struct vm_struct * imlist = NULL; static int get_free_im_addr(unsigned long size, unsigned long *im_addr) @@ -257,7 +257,7 @@ struct vm_struct * im_get_free_area(unsigned long size) struct vm_struct *area; unsigned long addr; - down(&imlist_sem); + mutex_lock(&imlist_mutex); if (get_free_im_addr(size, &addr)) { printk(KERN_ERR "%s() cannot obtain addr for size 0x%lx\n", __FUNCTION__, size); @@ -272,7 +272,7 @@ struct vm_struct * im_get_free_area(unsigned long size) __FUNCTION__, addr, size); } next_im_done: - up(&imlist_sem); + mutex_unlock(&imlist_mutex); return area; } @@ -281,9 +281,9 @@ struct vm_struct * im_get_area(unsigned long v_addr, unsigned long size, { struct vm_struct *area; - down(&imlist_sem); + mutex_lock(&imlist_mutex); area = __im_get_area(v_addr, size, criteria); - up(&imlist_sem); + mutex_unlock(&imlist_mutex); return area; } @@ -297,17 +297,17 @@ void im_free(void * addr) printk(KERN_ERR "Trying to %s bad address (%p)\n", __FUNCTION__, addr); return; } - down(&imlist_sem); + mutex_lock(&imlist_mutex); for (p = &imlist ; (tmp = *p) ; p = &tmp->next) { if (tmp->addr == addr) { *p = tmp->next; unmap_vm_area(tmp); kfree(tmp); - up(&imlist_sem); + mutex_unlock(&imlist_mutex); return; } } - up(&imlist_sem); + mutex_unlock(&imlist_mutex); printk(KERN_ERR "Trying to %s nonexistent area (%p)\n", __FUNCTION__, addr); } diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index d75ae03df686..a8fa1eeeb174 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -32,7 +32,7 @@ #include #include -#include +#include #include #include @@ -342,7 +342,7 @@ spu_free_irqs(struct spu *spu) } static LIST_HEAD(spu_list); -static DECLARE_MUTEX(spu_mutex); +static DEFINE_MUTEX(spu_mutex); static void spu_init_channels(struct spu *spu) { @@ -382,7 +382,7 @@ struct spu *spu_alloc(void) { struct spu *spu; - down(&spu_mutex); + mutex_lock(&spu_mutex); if (!list_empty(&spu_list)) { spu = list_entry(spu_list.next, struct spu, list); list_del_init(&spu->list); @@ -391,7 +391,7 @@ struct spu *spu_alloc(void) pr_debug("No SPU left\n"); spu = NULL; } - up(&spu_mutex); + mutex_unlock(&spu_mutex); if (spu) spu_init_channels(spu); @@ -402,9 +402,9 @@ EXPORT_SYMBOL_GPL(spu_alloc); void spu_free(struct spu *spu) { - down(&spu_mutex); + mutex_lock(&spu_mutex); list_add_tail(&spu->list, &spu_list); - up(&spu_mutex); + mutex_unlock(&spu_mutex); } EXPORT_SYMBOL_GPL(spu_free); @@ -633,14 +633,14 @@ static int __init create_spu(struct device_node *spe) spu->wbox_callback = NULL; spu->stop_callback = NULL; - down(&spu_mutex); + mutex_lock(&spu_mutex); spu->number = number++; ret = spu_request_irqs(spu); if (ret) goto out_unmap; list_add(&spu->list, &spu_list); - up(&spu_mutex); + mutex_unlock(&spu_mutex); pr_debug(KERN_DEBUG "Using SPE %s %02x %p %p %p %p %d\n", spu->name, spu->isrc, spu->local_store, @@ -648,7 +648,7 @@ static int __init create_spu(struct device_node *spe) goto out; out_unmap: - up(&spu_mutex); + mutex_unlock(&spu_mutex); spu_unmap(spu); out_free: kfree(spu); @@ -668,10 +668,10 @@ static void destroy_spu(struct spu *spu) static void cleanup_spu_base(void) { struct spu *spu, *tmp; - 
down(&spu_mutex); + mutex_lock(&spu_mutex); list_for_each_entry_safe(spu, tmp, &spu_list, list) destroy_spu(spu); - up(&spu_mutex); + mutex_unlock(&spu_mutex); } module_exit(cleanup_spu_base); diff --git a/arch/powerpc/platforms/powermac/cpufreq_64.c b/arch/powerpc/platforms/powermac/cpufreq_64.c index a415e8d2f7af..b57e465a1b71 100644 --- a/arch/powerpc/platforms/powermac/cpufreq_64.c +++ b/arch/powerpc/platforms/powermac/cpufreq_64.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -90,7 +91,7 @@ static void (*g5_switch_volt)(int speed_mode); static int (*g5_switch_freq)(int speed_mode); static int (*g5_query_freq)(void); -static DECLARE_MUTEX(g5_switch_mutex); +static DEFINE_MUTEX(g5_switch_mutex); static struct smu_sdbp_fvt *g5_fvt_table; /* table of op. points */ @@ -327,7 +328,7 @@ static int g5_cpufreq_target(struct cpufreq_policy *policy, if (g5_pmode_cur == newstate) return 0; - down(&g5_switch_mutex); + mutex_lock(&g5_switch_mutex); freqs.old = g5_cpu_freqs[g5_pmode_cur].frequency; freqs.new = g5_cpu_freqs[newstate].frequency; @@ -337,7 +338,7 @@ static int g5_cpufreq_target(struct cpufreq_policy *policy, rc = g5_switch_freq(newstate); cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - up(&g5_switch_mutex); + mutex_unlock(&g5_switch_mutex); return rc; } diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c index 4eb05d7143d8..f4516ca7aa3a 100644 --- a/drivers/macintosh/smu.c +++ b/drivers/macintosh/smu.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -92,7 +93,7 @@ struct smu_device { * for now, just hard code that */ static struct smu_device *smu; -static DECLARE_MUTEX(smu_part_access); +static DEFINE_MUTEX(smu_part_access); static void smu_i2c_retry(unsigned long data); @@ -976,11 +977,11 @@ struct smu_sdbp_header *__smu_get_sdb_partition(int id, unsigned int *size, if (interruptible) { int rc; - rc = down_interruptible(&smu_part_access); + rc = mutex_lock_interruptible(&smu_part_access); if (rc) return ERR_PTR(rc); } else - down(&smu_part_access); + mutex_lock(&smu_part_access); part = (struct smu_sdbp_header *)get_property(smu->of_node, pname, size); @@ -990,7 +991,7 @@ struct smu_sdbp_header *__smu_get_sdb_partition(int id, unsigned int *size, if (part != NULL && size) *size = part->len << 2; } - up(&smu_part_access); + mutex_unlock(&smu_part_access); return part; } diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c index 8b3515f394a6..0d54e8b7d9de 100644 --- a/drivers/md/kcopyd.c +++ b/drivers/md/kcopyd.c @@ -590,51 +590,51 @@ static void client_del(struct kcopyd_client *kc) up(&_client_lock); } -static DECLARE_MUTEX(kcopyd_init_lock); +static DEFINE_MUTEX(kcopyd_init_lock); static int kcopyd_clients = 0; static int kcopyd_init(void) { int r; - down(&kcopyd_init_lock); + mutex_lock(&kcopyd_init_lock); if (kcopyd_clients) { /* Already initialized. 
*/ kcopyd_clients++; - up(&kcopyd_init_lock); + mutex_unlock(&kcopyd_init_lock); return 0; } r = jobs_init(); if (r) { - up(&kcopyd_init_lock); + mutex_unlock(&kcopyd_init_lock); return r; } _kcopyd_wq = create_singlethread_workqueue("kcopyd"); if (!_kcopyd_wq) { jobs_exit(); - up(&kcopyd_init_lock); + mutex_unlock(&kcopyd_init_lock); return -ENOMEM; } kcopyd_clients++; INIT_WORK(&_kcopyd_work, do_work, NULL); - up(&kcopyd_init_lock); + mutex_unlock(&kcopyd_init_lock); return 0; } static void kcopyd_exit(void) { - down(&kcopyd_init_lock); + mutex_lock(&kcopyd_init_lock); kcopyd_clients--; if (!kcopyd_clients) { jobs_exit(); destroy_workqueue(_kcopyd_wq); _kcopyd_wq = NULL; } - up(&kcopyd_init_lock); + mutex_unlock(&kcopyd_init_lock); } int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result) diff --git a/drivers/net/wan/dscc4.c b/drivers/net/wan/dscc4.c index 1ff5de076d21..4505540e3c59 100644 --- a/drivers/net/wan/dscc4.c +++ b/drivers/net/wan/dscc4.c @@ -105,6 +105,7 @@ #include #include #include +#include /* Version */ static const char version[] = "$Id: dscc4.c,v 1.173 2003/09/20 23:55:34 romieu Exp $ for Linux\n"; @@ -112,7 +113,7 @@ static int debug; static int quartz; #ifdef CONFIG_DSCC4_PCI_RST -static DECLARE_MUTEX(dscc4_sem); +static DEFINE_MUTEX(dscc4_mutex); static u32 dscc4_pci_config_store[16]; #endif @@ -1018,7 +1019,7 @@ static void dscc4_pci_reset(struct pci_dev *pdev, void __iomem *ioaddr) { int i; - down(&dscc4_sem); + mutex_lock(&dscc4_mutex); for (i = 0; i < 16; i++) pci_read_config_dword(pdev, i << 2, dscc4_pci_config_store + i); @@ -1039,7 +1040,7 @@ static void dscc4_pci_reset(struct pci_dev *pdev, void __iomem *ioaddr) for (i = 0; i < 16; i++) pci_write_config_dword(pdev, i << 2, dscc4_pci_config_store[i]); - up(&dscc4_sem); + mutex_unlock(&dscc4_mutex); } #else #define dscc4_pci_reset(pdev,ioaddr) do {} while (0) diff --git a/drivers/parport/share.c b/drivers/parport/share.c index ea62bed6bc83..bbbfd79adbaf 100644 --- a/drivers/parport/share.c +++ b/drivers/parport/share.c @@ -32,6 +32,7 @@ #include #include +#include #include #undef PARPORT_PARANOID @@ -50,7 +51,7 @@ static DEFINE_SPINLOCK(full_list_lock); static LIST_HEAD(drivers); -static DECLARE_MUTEX(registration_lock); +static DEFINE_MUTEX(registration_lock); /* What you can do to a port that's gone away.. 
*/ static void dead_write_lines (struct parport *p, unsigned char b){} @@ -158,11 +159,11 @@ int parport_register_driver (struct parport_driver *drv) if (list_empty(&portlist)) get_lowlevel_driver (); - down(&registration_lock); + mutex_lock(&registration_lock); list_for_each_entry(port, &portlist, list) drv->attach(port); list_add(&drv->list, &drivers); - up(&registration_lock); + mutex_unlock(&registration_lock); return 0; } @@ -188,11 +189,11 @@ void parport_unregister_driver (struct parport_driver *drv) { struct parport *port; - down(&registration_lock); + mutex_lock(&registration_lock); list_del_init(&drv->list); list_for_each_entry(port, &portlist, list) drv->detach(port); - up(&registration_lock); + mutex_unlock(&registration_lock); } static void free_port (struct parport *port) @@ -366,7 +367,7 @@ void parport_announce_port (struct parport *port) #endif parport_proc_register(port); - down(&registration_lock); + mutex_lock(&registration_lock); spin_lock_irq(&parportlist_lock); list_add_tail(&port->list, &portlist); for (i = 1; i < 3; i++) { @@ -383,7 +384,7 @@ void parport_announce_port (struct parport *port) if (slave) attach_driver_chain(slave); } - up(&registration_lock); + mutex_unlock(&registration_lock); } /** @@ -409,7 +410,7 @@ void parport_remove_port(struct parport *port) { int i; - down(&registration_lock); + mutex_lock(&registration_lock); /* Spread the word. */ detach_driver_chain (port); @@ -436,7 +437,7 @@ void parport_remove_port(struct parport *port) } spin_unlock(&parportlist_lock); - up(&registration_lock); + mutex_unlock(&registration_lock); parport_proc_unregister(port); diff --git a/drivers/pci/hotplug/rpadlpar_core.c b/drivers/pci/hotplug/rpadlpar_core.c index 3eefe2cec72d..46825fee3ae4 100644 --- a/drivers/pci/hotplug/rpadlpar_core.c +++ b/drivers/pci/hotplug/rpadlpar_core.c @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include @@ -27,7 +27,7 @@ #include "rpaphp.h" #include "rpadlpar.h" -static DECLARE_MUTEX(rpadlpar_sem); +static DEFINE_MUTEX(rpadlpar_mutex); #define DLPAR_MODULE_NAME "rpadlpar_io" @@ -300,7 +300,7 @@ int dlpar_add_slot(char *drc_name) int node_type; int rc = -EIO; - if (down_interruptible(&rpadlpar_sem)) + if (mutex_lock_interruptible(&rpadlpar_mutex)) return -ERESTARTSYS; /* Find newly added node */ @@ -324,7 +324,7 @@ int dlpar_add_slot(char *drc_name) printk(KERN_INFO "%s: slot %s added\n", DLPAR_MODULE_NAME, drc_name); exit: - up(&rpadlpar_sem); + mutex_unlock(&rpadlpar_mutex); return rc; } @@ -417,7 +417,7 @@ int dlpar_remove_slot(char *drc_name) int node_type; int rc = 0; - if (down_interruptible(&rpadlpar_sem)) + if (mutex_lock_interruptible(&rpadlpar_mutex)) return -ERESTARTSYS; dn = find_dlpar_node(drc_name, &node_type); @@ -439,7 +439,7 @@ int dlpar_remove_slot(char *drc_name) } printk(KERN_INFO "%s: slot %s removed\n", DLPAR_MODULE_NAME, drc_name); exit: - up(&rpadlpar_sem); + mutex_unlock(&rpadlpar_mutex); return rc; } diff --git a/drivers/pci/hotplug/sgi_hotplug.c b/drivers/pci/hotplug/sgi_hotplug.c index c402da8e78ae..8cb9abde736b 100644 --- a/drivers/pci/hotplug/sgi_hotplug.c +++ b/drivers/pci/hotplug/sgi_hotplug.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -81,7 +82,7 @@ static struct hotplug_slot_ops sn_hotplug_slot_ops = { .get_power_status = get_power_status, }; -static DECLARE_MUTEX(sn_hotplug_sem); +static DEFINE_MUTEX(sn_hotplug_mutex); static ssize_t path_show (struct hotplug_slot *bss_hotplug_slot, char *buf) @@ -346,7 +347,7 @@ static int enable_slot(struct hotplug_slot *bss_hotplug_slot) int rc; /* Serialize
the Linux PCI infrastructure */ - down(&sn_hotplug_sem); + mutex_lock(&sn_hotplug_mutex); /* * Power-on and initialize the slot in the SN @@ -354,7 +355,7 @@ static int enable_slot(struct hotplug_slot *bss_hotplug_slot) */ rc = sn_slot_enable(bss_hotplug_slot, slot->device_num); if (rc) { - up(&sn_hotplug_sem); + mutex_unlock(&sn_hotplug_mutex); return rc; } @@ -362,7 +363,7 @@ static int enable_slot(struct hotplug_slot *bss_hotplug_slot) PCI_DEVFN(slot->device_num + 1, 0)); if (!num_funcs) { dev_dbg(slot->pci_bus->self, "no device in slot\n"); - up(&sn_hotplug_sem); + mutex_unlock(&sn_hotplug_mutex); return -ENODEV; } @@ -402,7 +403,7 @@ static int enable_slot(struct hotplug_slot *bss_hotplug_slot) if (new_ppb) pci_bus_add_devices(new_bus); - up(&sn_hotplug_sem); + mutex_unlock(&sn_hotplug_mutex); if (rc == 0) dev_dbg(slot->pci_bus->self, @@ -422,7 +423,7 @@ static int disable_slot(struct hotplug_slot *bss_hotplug_slot) int rc; /* Acquire update access to the bus */ - down(&sn_hotplug_sem); + mutex_lock(&sn_hotplug_mutex); /* is it okay to bring this slot down? */ rc = sn_slot_disable(bss_hotplug_slot, slot->device_num, @@ -450,7 +451,7 @@ static int disable_slot(struct hotplug_slot *bss_hotplug_slot) PCI_REQ_SLOT_DISABLE); leaving: /* Release the bus lock */ - up(&sn_hotplug_sem); + mutex_unlock(&sn_hotplug_mutex); return rc; } @@ -462,9 +463,9 @@ static inline int get_power_status(struct hotplug_slot *bss_hotplug_slot, struct pcibus_info *pcibus_info; pcibus_info = SN_PCIBUS_BUSSOFT_INFO(slot->pci_bus); - down(&sn_hotplug_sem); + mutex_lock(&sn_hotplug_mutex); *value = pcibus_info->pbi_enabled_devices & (1 << slot->device_num); - up(&sn_hotplug_sem); + mutex_unlock(&sn_hotplug_mutex); return 0; } diff --git a/drivers/pnp/isapnp/core.c b/drivers/pnp/isapnp/core.c index b1b4b683cbdd..ac7c2bb6c69e 100644 --- a/drivers/pnp/isapnp/core.c +++ b/drivers/pnp/isapnp/core.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #if 0 @@ -92,7 +93,7 @@ MODULE_LICENSE("GPL"); #define _LTAG_FIXEDMEM32RANGE 0x86 static unsigned char isapnp_checksum_value; -static DECLARE_MUTEX(isapnp_cfg_mutex); +static DEFINE_MUTEX(isapnp_cfg_mutex); static int isapnp_detected; static int isapnp_csn_count; @@ -903,7 +904,7 @@ int isapnp_cfg_begin(int csn, int logdev) { if (csn < 1 || csn > isapnp_csn_count || logdev > 10) return -EINVAL; - down(&isapnp_cfg_mutex); + mutex_lock(&isapnp_cfg_mutex); isapnp_wait(); isapnp_key(); isapnp_wake(csn); @@ -929,7 +930,7 @@ int isapnp_cfg_begin(int csn, int logdev) int isapnp_cfg_end(void) { isapnp_wait(); - up(&isapnp_cfg_mutex); + mutex_unlock(&isapnp_cfg_mutex); return 0; } diff --git a/drivers/s390/char/raw3270.c b/drivers/s390/char/raw3270.c index bd06607a5dcc..eecb2afad5c2 100644 --- a/drivers/s390/char/raw3270.c +++ b/drivers/s390/char/raw3270.c @@ -28,6 +28,7 @@ #include #include #include +#include struct class *class3270; @@ -59,7 +60,7 @@ struct raw3270 { #define RAW3270_FLAGS_CONSOLE 8 /* Device is the console. */ /* Semaphore to protect global data of raw3270 (devices, views, etc). */ -static DECLARE_MUTEX(raw3270_sem); +static DEFINE_MUTEX(raw3270_mutex); /* List of 3270 devices. */ static struct list_head raw3270_devices = LIST_HEAD_INIT(raw3270_devices); @@ -815,7 +816,7 @@ raw3270_setup_device(struct ccw_device *cdev, struct raw3270 *rp, char *ascebc) * number for it. Note: there is no device with minor 0, * see special case for fs3270.c:fs3270_open(). */ - down(&raw3270_sem); + mutex_lock(&raw3270_mutex); /* Keep the list sorted. 
*/ minor = RAW3270_FIRSTMINOR; rp->minor = -1; @@ -832,7 +833,7 @@ raw3270_setup_device(struct ccw_device *cdev, struct raw3270 *rp, char *ascebc) rp->minor = minor; list_add_tail(&rp->list, &raw3270_devices); } - up(&raw3270_sem); + mutex_unlock(&raw3270_mutex); /* No free minor number? Then give up. */ if (rp->minor == -1) return -EUSERS; @@ -1003,7 +1004,7 @@ raw3270_add_view(struct raw3270_view *view, struct raw3270_fn *fn, int minor) if (minor <= 0) return -ENODEV; - down(&raw3270_sem); + mutex_lock(&raw3270_mutex); rc = -ENODEV; list_for_each_entry(rp, &raw3270_devices, list) { if (rp->minor != minor) @@ -1024,7 +1025,7 @@ raw3270_add_view(struct raw3270_view *view, struct raw3270_fn *fn, int minor) spin_unlock_irqrestore(get_ccwdev_lock(rp->cdev), flags); break; } - up(&raw3270_sem); + mutex_unlock(&raw3270_mutex); return rc; } @@ -1038,7 +1039,7 @@ raw3270_find_view(struct raw3270_fn *fn, int minor) struct raw3270_view *view, *tmp; unsigned long flags; - down(&raw3270_sem); + mutex_lock(&raw3270_mutex); view = ERR_PTR(-ENODEV); list_for_each_entry(rp, &raw3270_devices, list) { if (rp->minor != minor) @@ -1057,7 +1058,7 @@ raw3270_find_view(struct raw3270_fn *fn, int minor) spin_unlock_irqrestore(get_ccwdev_lock(rp->cdev), flags); break; } - up(&raw3270_sem); + mutex_unlock(&raw3270_mutex); return view; } @@ -1104,7 +1105,7 @@ raw3270_delete_device(struct raw3270 *rp) struct ccw_device *cdev; /* Remove from device chain. */ - down(&raw3270_sem); + mutex_lock(&raw3270_mutex); if (rp->clttydev) class_device_destroy(class3270, MKDEV(IBM_TTY3270_MAJOR, rp->minor)); @@ -1112,7 +1113,7 @@ raw3270_delete_device(struct raw3270 *rp) class_device_destroy(class3270, MKDEV(IBM_FS3270_MAJOR, rp->minor)); list_del_init(&rp->list); - up(&raw3270_sem); + mutex_unlock(&raw3270_mutex); /* Disconnect from ccw_device. */ cdev = rp->cdev; @@ -1208,13 +1209,13 @@ int raw3270_register_notifier(void (*notifier)(int, int)) if (!np) return -ENOMEM; np->notifier = notifier; - down(&raw3270_sem); + mutex_lock(&raw3270_mutex); list_add_tail(&np->list, &raw3270_notifier); list_for_each_entry(rp, &raw3270_devices, list) { get_device(&rp->cdev->dev); notifier(rp->minor, 1); } - up(&raw3270_sem); + mutex_unlock(&raw3270_mutex); return 0; } @@ -1222,14 +1223,14 @@ void raw3270_unregister_notifier(void (*notifier)(int, int)) { struct raw3270_notifier *np; - down(&raw3270_sem); + mutex_lock(&raw3270_mutex); list_for_each_entry(np, &raw3270_notifier, list) if (np->notifier == notifier) { list_del(&np->list); kfree(np); break; } - up(&raw3270_sem); + mutex_unlock(&raw3270_mutex); } /* @@ -1256,10 +1257,10 @@ raw3270_set_online (struct ccw_device *cdev) goto failure; raw3270_create_attributes(rp); set_bit(RAW3270_FLAGS_READY, &rp->flags); - down(&raw3270_sem); + mutex_lock(&raw3270_mutex); list_for_each_entry(np, &raw3270_notifier, list) np->notifier(rp->minor, 1); - up(&raw3270_sem); + mutex_unlock(&raw3270_mutex); return 0; failure: @@ -1307,10 +1308,10 @@ raw3270_remove (struct ccw_device *cdev) } spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); - down(&raw3270_sem); + mutex_lock(&raw3270_mutex); list_for_each_entry(np, &raw3270_notifier, list) np->notifier(rp->minor, 0); - up(&raw3270_sem); + mutex_unlock(&raw3270_mutex); /* Reset 3270 device. */ raw3270_reset_device(rp); @@ -1370,13 +1371,13 @@ raw3270_init(void) rc = ccw_driver_register(&raw3270_ccw_driver); if (rc == 0) { /* Create attributes for early (= console) device. 
*/ - down(&raw3270_sem); + mutex_lock(&raw3270_mutex); class3270 = class_create(THIS_MODULE, "3270"); list_for_each_entry(rp, &raw3270_devices, list) { get_device(&rp->cdev->dev); raw3270_create_attributes(rp); } - up(&raw3270_sem); + mutex_unlock(&raw3270_mutex); } return rc; } diff --git a/drivers/telephony/phonedev.c b/drivers/telephony/phonedev.c index 3c987f49f6b4..7a6db1c5c8c5 100644 --- a/drivers/telephony/phonedev.c +++ b/drivers/telephony/phonedev.c @@ -29,6 +29,7 @@ #include #include #include +#include #define PHONE_NUM_DEVICES 256 @@ -37,7 +38,7 @@ */ static struct phone_device *phone_device[PHONE_NUM_DEVICES]; -static DECLARE_MUTEX(phone_lock); +static DEFINE_MUTEX(phone_lock); /* * Open a phone device. @@ -53,14 +54,14 @@ static int phone_open(struct inode *inode, struct file *file) if (minor >= PHONE_NUM_DEVICES) return -ENODEV; - down(&phone_lock); + mutex_lock(&phone_lock); p = phone_device[minor]; if (p) new_fops = fops_get(p->f_op); if (!new_fops) { - up(&phone_lock); + mutex_unlock(&phone_lock); request_module("char-major-%d-%d", PHONE_MAJOR, minor); - down(&phone_lock); + mutex_lock(&phone_lock); p = phone_device[minor]; if (p == NULL || (new_fops = fops_get(p->f_op)) == NULL) { @@ -78,7 +79,7 @@ static int phone_open(struct inode *inode, struct file *file) } fops_put(old_fops); end: - up(&phone_lock); + mutex_unlock(&phone_lock); return err; } @@ -100,18 +101,18 @@ int phone_register_device(struct phone_device *p, int unit) end = unit + 1; /* enter the loop at least one time */ } - down(&phone_lock); + mutex_lock(&phone_lock); for (i = base; i < end; i++) { if (phone_device[i] == NULL) { phone_device[i] = p; p->minor = i; devfs_mk_cdev(MKDEV(PHONE_MAJOR,i), S_IFCHR|S_IRUSR|S_IWUSR, "phone/%d", i); - up(&phone_lock); + mutex_unlock(&phone_lock); return 0; } } - up(&phone_lock); + mutex_unlock(&phone_lock); return -ENFILE; } @@ -121,12 +122,12 @@ int phone_register_device(struct phone_device *p, int unit) void phone_unregister_device(struct phone_device *pfd) { - down(&phone_lock); + mutex_lock(&phone_lock); if (phone_device[pfd->minor] != pfd) panic("phone: bad unregister"); devfs_remove("phone/%d", pfd->minor); phone_device[pfd->minor] = NULL; - up(&phone_lock); + mutex_unlock(&phone_lock); } diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 7f0288b25fa1..f28ec6882162 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -34,6 +34,7 @@ #include #include +#include #include #include @@ -44,7 +45,7 @@ #include /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. 
*/ -static DECLARE_MUTEX(__ip_vs_mutex); +static DEFINE_MUTEX(__ip_vs_mutex); /* lock for service table */ static DEFINE_RWLOCK(__ip_vs_svc_lock); @@ -1950,7 +1951,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) /* increase the module use count */ ip_vs_use_count_inc(); - if (down_interruptible(&__ip_vs_mutex)) { + if (mutex_lock_interruptible(&__ip_vs_mutex)) { ret = -ERESTARTSYS; goto out_dec; } @@ -2041,7 +2042,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) ip_vs_service_put(svc); out_unlock: - up(&__ip_vs_mutex); + mutex_unlock(&__ip_vs_mutex); out_dec: /* decrease the module use count */ ip_vs_use_count_dec(); @@ -2211,7 +2212,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0) return -EFAULT; - if (down_interruptible(&__ip_vs_mutex)) + if (mutex_lock_interruptible(&__ip_vs_mutex)) return -ERESTARTSYS; switch (cmd) { @@ -2330,7 +2331,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) } out: - up(&__ip_vs_mutex); + mutex_unlock(&__ip_vs_mutex); return ret; } diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c index dc1521c5aa81..ba5e23505e88 100644 --- a/net/ipv4/netfilter/ipt_hashlimit.c +++ b/net/ipv4/netfilter/ipt_hashlimit.c @@ -40,6 +40,7 @@ /* FIXME: this is just for IP_NF_ASSERRT */ #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte "); @@ -92,7 +93,7 @@ struct ipt_hashlimit_htable { }; static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */ -static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */ +static DEFINE_MUTEX(hlimit_mutex); /* additional checkentry protection */ static HLIST_HEAD(hashlimit_htables); static kmem_cache_t *hashlimit_cachep __read_mostly; @@ -542,13 +543,13 @@ hashlimit_checkentry(const char *tablename, * call vmalloc, and that can sleep. And we cannot just re-search * the list of htable's in htable_create(), since then we would * create duplicate proc files. -HW */ - down(&hlimit_mutex); + mutex_lock(&hlimit_mutex); r->hinfo = htable_find_get(r->name); if (!r->hinfo && (htable_create(r) != 0)) { - up(&hlimit_mutex); + mutex_unlock(&hlimit_mutex); return 0; } - up(&hlimit_mutex); + mutex_unlock(&hlimit_mutex); /* Ugly hack: For SMP, we only want to use one set */ r->u.master = r; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 43e72419c868..f329b72578f5 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -13,26 +13,27 @@ #include #include #include +#include #include #include struct sock *genl_sock = NULL; -static DECLARE_MUTEX(genl_sem); /* serialization of message processing */ +static DEFINE_MUTEX(genl_mutex); /* serialization of message processing */ static void genl_lock(void) { - down(&genl_sem); + mutex_lock(&genl_mutex); } static int genl_trylock(void) { - return down_trylock(&genl_sem); + return !mutex_trylock(&genl_mutex); } static void genl_unlock(void) { - up(&genl_sem); + mutex_unlock(&genl_mutex); if (genl_sock && genl_sock->sk_receive_queue.qlen) genl_sock->sk_data_ready(genl_sock, 0); -- cgit v1.2.3 From a19b27ce3847c3a5d4ea6b6c91b6f7154759af23 Mon Sep 17 00:00:00 2001 From: Matthew Dobson Date: Sun, 26 Mar 2006 01:37:45 -0800 Subject: [PATCH] mempool: use common mempool page allocator Convert two mempool users that currently use their own mempool-backed page allocators to use the generic mempool page allocator. 
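To illustrate the conversion, a minimal sketch of the pattern being removed and the call that replaces it (names such as my_page_alloc and MIN_PAGES are illustrative, not taken from the patch):

	/* Before: each user supplied its own page-backed callbacks. */
	static void *my_page_alloc(gfp_t gfp_mask, void *data)
	{
		return alloc_page(gfp_mask);
	}

	static void my_page_free(void *page, void *data)
	{
		__free_page(page);
	}

	pool = mempool_create(MIN_PAGES, my_page_alloc, my_page_free, NULL);

	/* After: the generic helper does the same job; the second
	 * argument is the page allocation order (0 = single pages). */
	pool = mempool_create_page_pool(MIN_PAGES, 0);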
Also included are 2 trivial whitespace fixes. Signed-off-by: Matthew Dobson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/dm-crypt.c | 17 +---------------- mm/highmem.c | 23 +++++++---------------- 2 files changed, 8 insertions(+), 32 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index e7a650f9ca07..d88b8eda3903 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -93,20 +93,6 @@ struct crypt_config { static kmem_cache_t *_crypt_io_pool; -/* - * Mempool alloc and free functions for the page - */ -static void *mempool_alloc_page(gfp_t gfp_mask, void *data) -{ - return alloc_page(gfp_mask); -} - -static void mempool_free_page(void *page, void *data) -{ - __free_page(page); -} - - /* * Different IV generation algorithms: * @@ -637,8 +623,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad3; } - cc->page_pool = mempool_create(MIN_POOL_PAGES, mempool_alloc_page, - mempool_free_page, NULL); + cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); if (!cc->page_pool) { ti->error = PFX "Cannot allocate page mempool"; goto bad4; diff --git a/mm/highmem.c b/mm/highmem.c index d0ea1eec6a9a..55885f64af40 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -31,14 +31,9 @@ static mempool_t *page_pool, *isa_page_pool; -static void *page_pool_alloc_isa(gfp_t gfp_mask, void *data) +static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) { - return alloc_page(gfp_mask | GFP_DMA); -} - -static void page_pool_free(void *page, void *data) -{ - __free_page(page); + return mempool_alloc_pages(gfp_mask | GFP_DMA, data); } /* @@ -51,11 +46,6 @@ static void page_pool_free(void *page, void *data) */ #ifdef CONFIG_HIGHMEM -static void *page_pool_alloc(gfp_t gfp_mask, void *data) -{ - return alloc_page(gfp_mask); -} - static int pkmap_count[LAST_PKMAP]; static unsigned int last_pkmap_nr; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); @@ -229,7 +219,7 @@ static __init int init_emergency_pool(void) if (!i.totalhigh) return 0; - page_pool = mempool_create(POOL_SIZE, page_pool_alloc, page_pool_free, NULL); + page_pool = mempool_create_page_pool(POOL_SIZE, 0); if (!page_pool) BUG(); printk("highmem bounce pool size: %d pages\n", POOL_SIZE); @@ -272,7 +262,8 @@ int init_emergency_isa_pool(void) if (isa_page_pool) return 0; - isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc_isa, page_pool_free, NULL); + isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa, + mempool_free_pages, (void *) 0); if (!isa_page_pool) BUG(); @@ -337,7 +328,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) bio_put(bio); } -static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done,int err) +static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err) { if (bio->bi_size) return 1; @@ -384,7 +375,7 @@ static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int } static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, - mempool_t *pool) + mempool_t *pool) { struct page *page; struct bio *bio = NULL; -- cgit v1.2.3 From 0eaae62abaa1ad1f231932b6cdd9fb1b91df6651 Mon Sep 17 00:00:00 2001 From: Matthew Dobson Date: Sun, 26 Mar 2006 01:37:47 -0800 Subject: [PATCH] mempool: use common mempool kmalloc allocator This patch changes several mempool users, all of which are basically just wrappers around kmalloc(), to use the common mempool_kmalloc/kfree, rather than their own wrapper 
function, removing a bunch of duplicated code. Signed-off-by: Matthew Dobson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/pktcdvd.c | 26 +++---------------- drivers/md/bitmap.c | 14 ++--------- drivers/md/dm-io.c | 13 ++-------- drivers/md/dm-raid1.c | 14 ++--------- drivers/s390/scsi/zfcp_aux.c | 60 +++++++++++++------------------------------- drivers/scsi/lpfc/lpfc_mem.c | 22 +++------------- fs/bio.c | 14 ++--------- 7 files changed, 34 insertions(+), 129 deletions(-) (limited to 'drivers/md') diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 1d261f985f31..a04f60693c39 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -230,16 +230,6 @@ static int pkt_grow_pktlist(struct pktcdvd_device *pd, int nr_packets) return 1; } -static void *pkt_rb_alloc(gfp_t gfp_mask, void *data) -{ - return kmalloc(sizeof(struct pkt_rb_node), gfp_mask); -} - -static void pkt_rb_free(void *ptr, void *data) -{ - kfree(ptr); -} - static inline struct pkt_rb_node *pkt_rbtree_next(struct pkt_rb_node *node) { struct rb_node *n = rb_next(&node->rb_node); @@ -2073,16 +2063,6 @@ static int pkt_close(struct inode *inode, struct file *file) } -static void *psd_pool_alloc(gfp_t gfp_mask, void *data) -{ - return kmalloc(sizeof(struct packet_stacked_data), gfp_mask); -} - -static void psd_pool_free(void *ptr, void *data) -{ - kfree(ptr); -} - static int pkt_end_io_read_cloned(struct bio *bio, unsigned int bytes_done, int err) { struct packet_stacked_data *psd = bio->bi_private; @@ -2475,7 +2455,8 @@ static int pkt_setup_dev(struct pkt_ctrl_command *ctrl_cmd) if (!pd) return ret; - pd->rb_pool = mempool_create(PKT_RB_POOL_SIZE, pkt_rb_alloc, pkt_rb_free, NULL); + pd->rb_pool = mempool_create_kmalloc_pool(PKT_RB_POOL_SIZE, + sizeof(struct pkt_rb_node)); if (!pd->rb_pool) goto out_mem; @@ -2639,7 +2620,8 @@ static int __init pkt_init(void) { int ret; - psd_pool = mempool_create(PSD_POOL_SIZE, psd_pool_alloc, psd_pool_free, NULL); + psd_pool = mempool_create_kmalloc_pool(PSD_POOL_SIZE, + sizeof(struct packet_stacked_data)); if (!psd_pool) return -ENOMEM; diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index e1c18aa1d712..f8ffaee20ff8 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -89,16 +89,6 @@ int bitmap_active(struct bitmap *bitmap) } #define WRITE_POOL_SIZE 256 -/* mempool for queueing pending writes on the bitmap file */ -static void *write_pool_alloc(gfp_t gfp_flags, void *data) -{ - return kmalloc(sizeof(struct page_list), gfp_flags); -} - -static void write_pool_free(void *ptr, void *data) -{ - kfree(ptr); -} /* * just a placeholder - calls kmalloc for bitmap pages @@ -1564,8 +1554,8 @@ int bitmap_create(mddev_t *mddev) spin_lock_init(&bitmap->write_lock); INIT_LIST_HEAD(&bitmap->complete_pages); init_waitqueue_head(&bitmap->write_wait); - bitmap->write_pool = mempool_create(WRITE_POOL_SIZE, write_pool_alloc, - write_pool_free, NULL); + bitmap->write_pool = mempool_create_kmalloc_pool(WRITE_POOL_SIZE, + sizeof(struct page_list)); err = -ENOMEM; if (!bitmap->write_pool) goto error; diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 4809b209fbb1..da663d2ff552 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -32,16 +32,6 @@ struct io { static unsigned _num_ios; static mempool_t *_io_pool; -static void *alloc_io(gfp_t gfp_mask, void *pool_data) -{ - return kmalloc(sizeof(struct io), gfp_mask); -} - -static void free_io(void *element, void *pool_data) -{ - kfree(element); -} - static unsigned int 
pages_to_ios(unsigned int pages) { return 4 * pages; /* too many ? */ @@ -65,7 +55,8 @@ static int resize_pool(unsigned int new_ios) } else { /* create new pool */ - _io_pool = mempool_create(new_ios, alloc_io, free_io, NULL); + _io_pool = mempool_create_kmalloc_pool(new_ios, + sizeof(struct io)); if (!_io_pool) return -ENOMEM; diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 6cfa8d435d55..4e90f231fbfb 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -122,16 +122,6 @@ static inline sector_t region_to_sector(struct region_hash *rh, region_t region) /* FIXME move this */ static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw); -static void *region_alloc(gfp_t gfp_mask, void *pool_data) -{ - return kmalloc(sizeof(struct region), gfp_mask); -} - -static void region_free(void *element, void *pool_data) -{ - kfree(element); -} - #define MIN_REGIONS 64 #define MAX_RECOVERY 1 static int rh_init(struct region_hash *rh, struct mirror_set *ms, @@ -173,8 +163,8 @@ static int rh_init(struct region_hash *rh, struct mirror_set *ms, INIT_LIST_HEAD(&rh->quiesced_regions); INIT_LIST_HEAD(&rh->recovered_regions); - rh->region_pool = mempool_create(MIN_REGIONS, region_alloc, - region_free, NULL); + rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, + sizeof(struct region)); if (!rh->region_pool) { vfree(rh->buckets); rh->buckets = NULL; diff --git a/drivers/s390/scsi/zfcp_aux.c b/drivers/s390/scsi/zfcp_aux.c index 95b92f317b6f..395cfc6a344f 100644 --- a/drivers/s390/scsi/zfcp_aux.c +++ b/drivers/s390/scsi/zfcp_aux.c @@ -829,18 +829,6 @@ zfcp_unit_dequeue(struct zfcp_unit *unit) device_unregister(&unit->sysfs_device); } -static void * -zfcp_mempool_alloc(gfp_t gfp_mask, void *size) -{ - return kmalloc((size_t) size, gfp_mask); -} - -static void -zfcp_mempool_free(void *element, void *size) -{ - kfree(element); -} - /* * Allocates a combined QTCB/fsf_req buffer for erp actions and fcp/SCSI * commands. 
@@ -853,51 +841,39 @@ static int zfcp_allocate_low_mem_buffers(struct zfcp_adapter *adapter) { adapter->pool.fsf_req_erp = - mempool_create(ZFCP_POOL_FSF_REQ_ERP_NR, - zfcp_mempool_alloc, zfcp_mempool_free, (void *) - sizeof(struct zfcp_fsf_req_pool_element)); - - if (NULL == adapter->pool.fsf_req_erp) + mempool_create_kmalloc_pool(ZFCP_POOL_FSF_REQ_ERP_NR, + sizeof(struct zfcp_fsf_req_pool_element)); + if (!adapter->pool.fsf_req_erp) return -ENOMEM; adapter->pool.fsf_req_scsi = - mempool_create(ZFCP_POOL_FSF_REQ_SCSI_NR, - zfcp_mempool_alloc, zfcp_mempool_free, (void *) - sizeof(struct zfcp_fsf_req_pool_element)); - - if (NULL == adapter->pool.fsf_req_scsi) + mempool_create_kmalloc_pool(ZFCP_POOL_FSF_REQ_SCSI_NR, + sizeof(struct zfcp_fsf_req_pool_element)); + if (!adapter->pool.fsf_req_scsi) return -ENOMEM; adapter->pool.fsf_req_abort = - mempool_create(ZFCP_POOL_FSF_REQ_ABORT_NR, - zfcp_mempool_alloc, zfcp_mempool_free, (void *) - sizeof(struct zfcp_fsf_req_pool_element)); - - if (NULL == adapter->pool.fsf_req_abort) + mempool_create_kmalloc_pool(ZFCP_POOL_FSF_REQ_ABORT_NR, + sizeof(struct zfcp_fsf_req_pool_element)); + if (!adapter->pool.fsf_req_abort) return -ENOMEM; adapter->pool.fsf_req_status_read = - mempool_create(ZFCP_POOL_STATUS_READ_NR, - zfcp_mempool_alloc, zfcp_mempool_free, - (void *) sizeof(struct zfcp_fsf_req)); - - if (NULL == adapter->pool.fsf_req_status_read) + mempool_create_kmalloc_pool(ZFCP_POOL_STATUS_READ_NR, + sizeof(struct zfcp_fsf_req)); + if (!adapter->pool.fsf_req_status_read) return -ENOMEM; adapter->pool.data_status_read = - mempool_create(ZFCP_POOL_STATUS_READ_NR, - zfcp_mempool_alloc, zfcp_mempool_free, - (void *) sizeof(struct fsf_status_read_buffer)); - - if (NULL == adapter->pool.data_status_read) + mempool_create_kmalloc_pool(ZFCP_POOL_STATUS_READ_NR, + sizeof(struct fsf_status_read_buffer)); + if (!adapter->pool.data_status_read) return -ENOMEM; adapter->pool.data_gid_pn = - mempool_create(ZFCP_POOL_DATA_GID_PN_NR, - zfcp_mempool_alloc, zfcp_mempool_free, (void *) - sizeof(struct zfcp_gid_pn_data)); - - if (NULL == adapter->pool.data_gid_pn) + mempool_create_kmalloc_pool(ZFCP_POOL_DATA_GID_PN_NR, + sizeof(struct zfcp_gid_pn_data)); + if (!adapter->pool.data_gid_pn) return -ENOMEM; return 0; diff --git a/drivers/scsi/lpfc/lpfc_mem.c b/drivers/scsi/lpfc/lpfc_mem.c index 352df47bcaca..07017658ac56 100644 --- a/drivers/scsi/lpfc/lpfc_mem.c +++ b/drivers/scsi/lpfc/lpfc_mem.c @@ -38,18 +38,6 @@ #define LPFC_MBUF_POOL_SIZE 64 /* max elements in MBUF safety pool */ #define LPFC_MEM_POOL_SIZE 64 /* max elem in non-DMA safety pool */ -static void * -lpfc_pool_kmalloc(gfp_t gfp_flags, void *data) -{ - return kmalloc((unsigned long)data, gfp_flags); -} - -static void -lpfc_pool_kfree(void *obj, void *data) -{ - kfree(obj); -} - int lpfc_mem_alloc(struct lpfc_hba * phba) { @@ -79,15 +67,13 @@ lpfc_mem_alloc(struct lpfc_hba * phba) pool->current_count++; } - phba->mbox_mem_pool = mempool_create(LPFC_MEM_POOL_SIZE, - lpfc_pool_kmalloc, lpfc_pool_kfree, - (void *)(unsigned long)sizeof(LPFC_MBOXQ_t)); + phba->mbox_mem_pool = mempool_create_kmalloc_pool(LPFC_MEM_POOL_SIZE, + sizeof(LPFC_MBOXQ_t)); if (!phba->mbox_mem_pool) goto fail_free_mbuf_pool; - phba->nlp_mem_pool = mempool_create(LPFC_MEM_POOL_SIZE, - lpfc_pool_kmalloc, lpfc_pool_kfree, - (void *)(unsigned long)sizeof(struct lpfc_nodelist)); + phba->nlp_mem_pool = mempool_create_kmalloc_pool(LPFC_MEM_POOL_SIZE, + sizeof(struct lpfc_nodelist)); if (!phba->nlp_mem_pool) goto fail_free_mbox_pool; diff --git 
a/fs/bio.c b/fs/bio.c index 49db9286a3b4..377046d82945 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1125,16 +1125,6 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors) return bp; } -static void *bio_pair_alloc(gfp_t gfp_flags, void *data) -{ - return kmalloc(sizeof(struct bio_pair), gfp_flags); -} - -static void bio_pair_free(void *bp, void *data) -{ - kfree(bp); -} - /* * create memory pools for biovec's in a bio_set. @@ -1254,8 +1244,8 @@ static int __init init_bio(void) if (!fs_bio_set) panic("bio: can't allocate bios\n"); - bio_split_pool = mempool_create(BIO_SPLIT_ENTRIES, - bio_pair_alloc, bio_pair_free, NULL); + bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES, + sizeof(struct bio_pair)); if (!bio_split_pool) panic("bio: can't create split pool\n"); -- cgit v1.2.3 From 26b6e051bc36b968a0b5ac96903b99e050d20b87 Mon Sep 17 00:00:00 2001 From: Matthew Dobson Date: Sun, 26 Mar 2006 01:37:48 -0800 Subject: [PATCH] mempool: use common mempool kzalloc allocator This patch changes a mempool user, which is basically just a wrapper around kzalloc(), to use the common mempool_kmalloc/kfree, rather than its own wrapper function, removing duplicated code. Signed-off-by: Matthew Dobson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/multipath.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 96f7af4ae400..1cc9de44ce86 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -35,18 +35,6 @@ #define NR_RESERVED_BUFS 32 -static void *mp_pool_alloc(gfp_t gfp_flags, void *data) -{ - struct multipath_bh *mpb; - mpb = kzalloc(sizeof(*mpb), gfp_flags); - return mpb; -} - -static void mp_pool_free(void *mpb, void *data) -{ - kfree(mpb); -} - static int multipath_map (multipath_conf_t *conf) { int i, disks = conf->raid_disks; @@ -494,9 +482,8 @@ static int multipath_run (mddev_t *mddev) } mddev->degraded = conf->raid_disks = conf->working_disks; - conf->pool = mempool_create(NR_RESERVED_BUFS, - mp_pool_alloc, mp_pool_free, - NULL); + conf->pool = mempool_create_kzalloc_pool(NR_RESERVED_BUFS, + sizeof(struct multipath_bh)); if (conf->pool == NULL) { printk(KERN_ERR "multipath: couldn't allocate memory for %s\n", -- cgit v1.2.3 From 93d2341c750cda0df48a6cc67b35fe25f1ec47df Mon Sep 17 00:00:00 2001 From: Matthew Dobson Date: Sun, 26 Mar 2006 01:37:50 -0800 Subject: [PATCH] mempool: use mempool_create_slab_pool() Modify well over a dozen mempool users to call mempool_create_slab_pool() rather than calling mempool_create() with extra arguments, saving about 30 lines of code and increasing readability. 
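The conversion is mechanical; a minimal sketch of one instance (MIN_IOS and io_cache are illustrative names):

	/* Before: mempool_create() with the stock slab callbacks. */
	pool = mempool_create(MIN_IOS, mempool_alloc_slab,
			      mempool_free_slab, io_cache);

	/* After: the wrapper supplies those callbacks itself, so only
	 * the backing kmem_cache needs to be passed. */
	pool = mempool_create_slab_pool(MIN_IOS, io_cache);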
Signed-off-by: Matthew Dobson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/cfq-iosched.c | 2 +- drivers/block/aoe/aoeblk.c | 4 +--- drivers/md/dm-crypt.c | 3 +-- drivers/md/dm-mpath.c | 3 +-- drivers/md/dm-snap.c | 3 +-- drivers/md/dm.c | 6 ++---- drivers/md/kcopyd.c | 3 +-- drivers/message/i2o/i2o_block.c | 7 +++---- drivers/scsi/iscsi_tcp.c | 4 ++-- drivers/scsi/qla2xxx/qla_os.c | 3 +-- drivers/scsi/scsi_lib.c | 5 ++--- fs/bio.c | 7 ++----- fs/cifs/cifsfs.c | 18 ++++++------------ fs/jfs/jfs_metapage.c | 4 ++-- fs/nfs/read.c | 6 ++---- fs/nfs/write.c | 12 ++++-------- fs/xfs/linux-2.6/xfs_super.c | 5 ++--- include/linux/i2o.h | 4 +--- net/sunrpc/sched.c | 12 ++++-------- 19 files changed, 39 insertions(+), 72 deletions(-) (limited to 'drivers/md') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index c4a0d5d8d7f0..bde40a6ae665 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2191,7 +2191,7 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e) if (!cfqd->cfq_hash) goto out_cfqhash; - cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool); + cfqd->crq_pool = mempool_create_slab_pool(BLKDEV_MIN_RQ, crq_pool); if (!cfqd->crq_pool) goto out_crqpool; diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 32fea55fac48..393b86a3dbf8 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -211,9 +211,7 @@ aoeblk_gdalloc(void *vp) return; } - d->bufpool = mempool_create(MIN_BUFS, - mempool_alloc_slab, mempool_free_slab, - buf_pool_cache); + d->bufpool = mempool_create_slab_pool(MIN_BUFS, buf_pool_cache); if (d->bufpool == NULL) { printk(KERN_ERR "aoe: aoeblk_gdalloc: cannot allocate bufpool " "for %ld.%ld\n", d->aoemajor, d->aoeminor); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index d88b8eda3903..259e86f26549 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -616,8 +616,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } } - cc->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab, - mempool_free_slab, _crypt_io_pool); + cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool); if (!cc->io_pool) { ti->error = PFX "Cannot allocate crypt io mempool"; goto bad3; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index f72a82fb9434..1816f30678ed 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -179,8 +179,7 @@ static struct multipath *alloc_multipath(void) m->queue_io = 1; INIT_WORK(&m->process_queued_ios, process_queued_ios, m); INIT_WORK(&m->trigger_event, trigger_event, m); - m->mpio_pool = mempool_create(MIN_IOS, mempool_alloc_slab, - mempool_free_slab, _mpio_cache); + m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); if (!m->mpio_pool) { kfree(m); return NULL; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index f3759dd7828e..7401540086df 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1174,8 +1174,7 @@ static int __init dm_snapshot_init(void) goto bad4; } - pending_pool = mempool_create(128, mempool_alloc_slab, - mempool_free_slab, pending_cache); + pending_pool = mempool_create_slab_pool(128, pending_cache); if (!pending_pool) { DMERR("Couldn't create pending pool."); r = -ENOMEM; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8c82373f7ff3..a64798ef481e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -823,13 +823,11 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent) md->queue->unplug_fn = 
dm_unplug_all; md->queue->issue_flush_fn = dm_flush_all; - md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab, - mempool_free_slab, _io_cache); + md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); if (!md->io_pool) goto bad2; - md->tio_pool = mempool_create(MIN_IOS, mempool_alloc_slab, - mempool_free_slab, _tio_cache); + md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache); if (!md->tio_pool) goto bad3; diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c index 0d54e8b7d9de..9dcb2c8a3853 100644 --- a/drivers/md/kcopyd.c +++ b/drivers/md/kcopyd.c @@ -227,8 +227,7 @@ static int jobs_init(void) if (!_job_cache) return -ENOMEM; - _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab, - mempool_free_slab, _job_cache); + _job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache); if (!_job_pool) { kmem_cache_destroy(_job_cache); return -ENOMEM; diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c index b09fb6307153..7d4c5497785b 100644 --- a/drivers/message/i2o/i2o_block.c +++ b/drivers/message/i2o/i2o_block.c @@ -1179,10 +1179,9 @@ static int __init i2o_block_init(void) goto exit; } - i2o_blk_req_pool.pool = mempool_create(I2O_BLOCK_REQ_MEMPOOL_SIZE, - mempool_alloc_slab, - mempool_free_slab, - i2o_blk_req_pool.slab); + i2o_blk_req_pool.pool = + mempool_create_slab_pool(I2O_BLOCK_REQ_MEMPOOL_SIZE, + i2o_blk_req_pool.slab); if (!i2o_blk_req_pool.pool) { osm_err("can't init request mempool\n"); rc = -ENOMEM; diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index 7b82ff090d42..2068b66822b7 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c @@ -3200,8 +3200,8 @@ iscsi_r2tpool_alloc(struct iscsi_session *session) * Data-Out PDU's within R2T-sequence can be quite big; * using mempool */ - ctask->datapool = mempool_create(ISCSI_DTASK_DEFAULT_MAX, - mempool_alloc_slab, mempool_free_slab, taskcache); + ctask->datapool = mempool_create_slab_pool(ISCSI_DTASK_DEFAULT_MAX, + taskcache); if (ctask->datapool == NULL) { kfifo_free(ctask->r2tqueue); iscsi_pool_free(&ctask->r2tpool, (void**)ctask->r2ts); diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 029bbf461bb2..017729c59a49 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -2154,8 +2154,7 @@ qla2x00_allocate_sp_pool(scsi_qla_host_t *ha) int rval; rval = QLA_SUCCESS; - ha->srb_mempool = mempool_create(SRB_MIN_REQ, mempool_alloc_slab, - mempool_free_slab, srb_cachep); + ha->srb_mempool = mempool_create_slab_pool(SRB_MIN_REQ, srb_cachep); if (ha->srb_mempool == NULL) { qla_printk(KERN_INFO, ha, "Unable to allocate SRB mempool.\n"); rval = QLA_FUNCTION_FAILED; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index ede158d08d9d..8f010a314a3d 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1787,9 +1787,8 @@ int __init scsi_init_queue(void) sgp->name); } - sgp->pool = mempool_create(SG_MEMPOOL_SIZE, - mempool_alloc_slab, mempool_free_slab, - sgp->slab); + sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE, + sgp->slab); if (!sgp->pool) { printk(KERN_ERR "SCSI: can't init sg mempool %s\n", sgp->name); diff --git a/fs/bio.c b/fs/bio.c index 377046d82945..eb8fbc53f2cd 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1141,8 +1141,7 @@ static int biovec_create_pools(struct bio_set *bs, int pool_entries, int scale) if (i >= scale) pool_entries >>= 1; - *bvp = mempool_create(pool_entries, mempool_alloc_slab, - mempool_free_slab, bp->slab); + *bvp = mempool_create_slab_pool(pool_entries, 
bp->slab); if (!*bvp) return -ENOMEM; } @@ -1179,9 +1178,7 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size, int scale) if (!bs) return NULL; - bs->bio_pool = mempool_create(bio_pool_size, mempool_alloc_slab, - mempool_free_slab, bio_slab); - + bs->bio_pool = mempool_create_slab_pool(bio_pool_size, bio_slab); if (!bs->bio_pool) goto bad; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 221b3334b737..6b99b51d6694 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -738,10 +738,8 @@ cifs_init_request_bufs(void) cERROR(1,("cifs_min_rcv set to maximum (64)")); } - cifs_req_poolp = mempool_create(cifs_min_rcv, - mempool_alloc_slab, - mempool_free_slab, - cifs_req_cachep); + cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv, + cifs_req_cachep); if(cifs_req_poolp == NULL) { kmem_cache_destroy(cifs_req_cachep); @@ -771,10 +769,8 @@ cifs_init_request_bufs(void) cFYI(1,("cifs_min_small set to maximum (256)")); } - cifs_sm_req_poolp = mempool_create(cifs_min_small, - mempool_alloc_slab, - mempool_free_slab, - cifs_sm_req_cachep); + cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small, + cifs_sm_req_cachep); if(cifs_sm_req_poolp == NULL) { mempool_destroy(cifs_req_poolp); @@ -808,10 +804,8 @@ cifs_init_mids(void) if (cifs_mid_cachep == NULL) return -ENOMEM; - cifs_mid_poolp = mempool_create(3 /* a reasonable min simultan opers */, - mempool_alloc_slab, - mempool_free_slab, - cifs_mid_cachep); + /* 3 is a reasonable minimum number of simultaneous operations */ + cifs_mid_poolp = mempool_create_slab_pool(3, cifs_mid_cachep); if(cifs_mid_poolp == NULL) { kmem_cache_destroy(cifs_mid_cachep); return -ENOMEM; diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 8508043849f3..f28696f235c4 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -220,8 +220,8 @@ int __init metapage_init(void) if (metapage_cache == NULL) return -ENOMEM; - metapage_mempool = mempool_create(METAPOOL_MIN_PAGES, mempool_alloc_slab, - mempool_free_slab, metapage_cache); + metapage_mempool = mempool_create_slab_pool(METAPOOL_MIN_PAGES, + metapage_cache); if (metapage_mempool == NULL) { kmem_cache_destroy(metapage_cache); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 3961524fd4ab..624ca7146b6b 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -663,10 +663,8 @@ int nfs_init_readpagecache(void) if (nfs_rdata_cachep == NULL) return -ENOMEM; - nfs_rdata_mempool = mempool_create(MIN_POOL_READ, - mempool_alloc_slab, - mempool_free_slab, - nfs_rdata_cachep); + nfs_rdata_mempool = mempool_create_slab_pool(MIN_POOL_READ, + nfs_rdata_cachep); if (nfs_rdata_mempool == NULL) return -ENOMEM; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3f5225404c97..4cfada2cc09f 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1521,17 +1521,13 @@ int nfs_init_writepagecache(void) if (nfs_wdata_cachep == NULL) return -ENOMEM; - nfs_wdata_mempool = mempool_create(MIN_POOL_WRITE, - mempool_alloc_slab, - mempool_free_slab, - nfs_wdata_cachep); + nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE, + nfs_wdata_cachep); if (nfs_wdata_mempool == NULL) return -ENOMEM; - nfs_commit_mempool = mempool_create(MIN_POOL_COMMIT, - mempool_alloc_slab, - mempool_free_slab, - nfs_wdata_cachep); + nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT, + nfs_wdata_cachep); if (nfs_commit_mempool == NULL) return -ENOMEM; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 8355faf8ffde..1884300417e3 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ 
b/fs/xfs/linux-2.6/xfs_super.c @@ -375,9 +375,8 @@ xfs_init_zones(void) if (!xfs_ioend_zone) goto out_destroy_vnode_zone; - xfs_ioend_pool = mempool_create(4 * MAX_BUF_PER_PAGE, - mempool_alloc_slab, mempool_free_slab, - xfs_ioend_zone); + xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE, + xfs_ioend_zone); if (!xfs_ioend_pool) goto out_free_ioend_zone; return 0; diff --git a/include/linux/i2o.h b/include/linux/i2o.h index 5a9d8c599171..dd7d627bf66f 100644 --- a/include/linux/i2o.h +++ b/include/linux/i2o.h @@ -950,9 +950,7 @@ static inline int i2o_pool_alloc(struct i2o_pool *pool, const char *name, if (!pool->slab) goto free_name; - pool->mempool = - mempool_create(min_nr, mempool_alloc_slab, mempool_free_slab, - pool->slab); + pool->mempool = mempool_create_slab_pool(min_nr, pool->slab); if (!pool->mempool) goto free_slab; diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index b9969b91a9f7..5c3eee768504 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -1167,16 +1167,12 @@ rpc_init_mempool(void) NULL, NULL); if (!rpc_buffer_slabp) goto err_nomem; - rpc_task_mempool = mempool_create(RPC_TASK_POOLSIZE, - mempool_alloc_slab, - mempool_free_slab, - rpc_task_slabp); + rpc_task_mempool = mempool_create_slab_pool(RPC_TASK_POOLSIZE, + rpc_task_slabp); if (!rpc_task_mempool) goto err_nomem; - rpc_buffer_mempool = mempool_create(RPC_BUFFER_POOLSIZE, - mempool_alloc_slab, - mempool_free_slab, - rpc_buffer_slabp); + rpc_buffer_mempool = mempool_create_slab_pool(RPC_BUFFER_POOLSIZE, + rpc_buffer_slabp); if (!rpc_buffer_mempool) goto err_nomem; return 0; -- cgit v1.2.3 From eccf081799be8d83852f183838bf26e1ca099db4 Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Mon, 27 Mar 2006 01:17:42 -0800 Subject: [PATCH] device-mapper snapshot: fix origin_write pending_exception submission Say you have several snapshots of the same origin and then you issue a write to some place in the origin for the first time. Before the device-mapper snapshot target lets the write go through to the underlying device, it needs to make a copy of the data that is about to be overwritten. Each snapshot is independent, so it makes one copy for each snapshot. __origin_write() loops through each snapshot and checks to see whether a copy is needed for that snapshot. (A copy is only needed the first time that data changes.) If a copy is needed, the code allocates a 'pending_exception' structure holding the details. It links these together for all the snapshots, then works its way through this list and submits the copying requests to the kcopyd thread by calling start_copy(). When each request is completed, the original pending_exception structure gets freed in pending_complete(). If you're very unlucky, this structure can get freed *before* the submission process has finished walking the list. This patch: 1) Creates a new temporary list pe_queue to hold the pending exception structures; 2) Does all the bookkeeping up-front, then walks through the new list safely and calls start_copy() for each pending_exception that needed it; 3) Avoids attempting to add pe->siblings to the list if it's already connected. [NB This does not fix all the races in this code. More patches will follow.] 
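In outline, the reworked __origin_write() separates bookkeeping from submission; a simplified sketch of the two phases (per-snapshot locking and exception lookup elided):

	LIST_HEAD(pe_queue);	/* local to __origin_write() */

	/* Phase 1: under each snapshot's lock, decide which
	 * pending_exceptions still need their copy started and
	 * queue them on the private list. */
	list_for_each_entry(snap, snapshots, list) {
		/* ... look up or allocate pe for this snapshot ... */
		if (!pe->started) {
			pe->started = 1;
			list_add_tail(&pe->list, &pe_queue);
		}
	}

	/* Phase 2: with the list complete, submit the copies.  Each
	 * entry is handed to kcopyd only after the full list has been
	 * built, so a completing exception can no longer free
	 * structures the walker still needs. */
	list_for_each_entry_safe(pe, next_pe, &pe_queue, list)
		start_copy(pe);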
Signed-off-by: Alasdair G Kergon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/dm-snap.c | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 7401540086df..874f145431d8 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -48,6 +48,11 @@ struct pending_exception { struct bio_list origin_bios; struct bio_list snapshot_bios; + /* + * Short-term queue of pending exceptions prior to submission. + */ + struct list_head list; + /* * Other pending_exceptions that are processing this * chunk. When this list is empty, we know we can @@ -930,8 +935,9 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) int r = 1, first = 1; struct dm_snapshot *snap; struct exception *e; - struct pending_exception *pe, *last = NULL; + struct pending_exception *pe, *next_pe, *last = NULL; chunk_t chunk; + LIST_HEAD(pe_queue); /* Do all the snapshots on this origin */ list_for_each_entry (snap, snapshots, list) { @@ -965,12 +971,19 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) snap->valid = 0; } else { - if (last) + if (first) { + bio_list_add(&pe->origin_bios, bio); + r = 0; + first = 0; + } + if (last && list_empty(&pe->siblings)) list_merge(&pe->siblings, &last->siblings); - + if (!pe->started) { + pe->started = 1; + list_add_tail(&pe->list, &pe_queue); + } last = pe; - r = 0; } } @@ -980,24 +993,8 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) /* * Now that we have a complete pe list we can start the copying. */ - if (last) { - pe = last; - do { - down_write(&pe->snap->lock); - if (first) - bio_list_add(&pe->origin_bios, bio); - if (!pe->started) { - pe->started = 1; - up_write(&pe->snap->lock); - start_copy(pe); - } else - up_write(&pe->snap->lock); - first = 0; - pe = list_entry(pe->siblings.next, - struct pending_exception, siblings); - - } while (pe != last); - } + list_for_each_entry_safe(pe, next_pe, &pe_queue, list) + start_copy(pe); return r; } -- cgit v1.2.3 From b4b610f684d13bf8691feeae5d4d7a8bd1f1033e Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Mon, 27 Mar 2006 01:17:44 -0800 Subject: [PATCH] device-mapper snapshot: replace sibling list The siblings "list" is used unsafely at the moment. Firstly, only the element on the list being changed gets locked (via the snapshot lock), not the next and previous elements which have pointers that are also being changed. Secondly, if you have two or more snapshots and write to the same chunk a second time before every snapshot has finished making its private copy of the data, if you're unlucky, _origin_write() could attempt its list_merge() and dereference a 'last' pointer to a pending_exception structure that has just been freed. Analysis reveals that the list is actually only there for reference counting. If 5 pending_exceptions are needed in origin_write, then the 5 are joined together into a 5-element list - without a separate list head because there's nowhere suitable to store it. As the pending_exceptions complete, they are removed from the list one-by-one and any contents of origin_bios get moved across to one of the remaining pending_exceptions on the list. Whichever one is last is detected because list_empty() is then true and the origin_bios get submitted. 
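In code, that implicit reference count is the heart of the old __flush_bios(), removed below:

	/* Old scheme: the siblings list is the reference count.  The
	 * last pe to complete (list now empty) releases the origin
	 * bios; otherwise its contents move to a surviving sibling. */
	if (list_empty(&pe->siblings))
		return bio_list_get(&pe->origin_bios);

	sibling = list_entry(pe->siblings.next,
			     struct pending_exception, siblings);
	list_del(&pe->siblings);
	bio_list_merge(&sibling->origin_bios, &pe->origin_bios);

	return NULL;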
The fix proposed here uses an alternative reference counting mechanism by choosing one of the pending_exceptions as primary and maintaining an atomic counter there. Signed-off-by: Alasdair G Kergon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/dm-snap.c | 122 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 80 insertions(+), 42 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 874f145431d8..475514bda9d0 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -54,11 +54,21 @@ struct pending_exception { struct list_head list; /* - * Other pending_exceptions that are processing this - * chunk. When this list is empty, we know we can - * complete the origins. + * The primary pending_exception is the one that holds + * the sibling_count and the list of origin_bios for a + * group of pending_exceptions. It is always last to get freed. + * These fields get set up when writing to the origin. */ - struct list_head siblings; + struct pending_exception *primary_pe; + + /* + * Number of pending_exceptions processing this chunk. + * When this drops to zero we must complete the origin bios. + * If incrementing or decrementing this, hold pe->snap->lock for + * the sibling concerned and not pe->primary_pe->snap->lock unless + * they are the same. + */ + atomic_t sibling_count; /* Pointer back to snapshot context */ struct dm_snapshot *snap; @@ -593,20 +603,15 @@ static void error_bios(struct bio *bio) static struct bio *__flush_bios(struct pending_exception *pe) { - struct pending_exception *sibling; - - if (list_empty(&pe->siblings)) - return bio_list_get(&pe->origin_bios); - - sibling = list_entry(pe->siblings.next, - struct pending_exception, siblings); - - list_del(&pe->siblings); - - /* This is fine as long as kcopyd is single-threaded. If kcopyd - * becomes multi-threaded, we'll need some locking here. + /* + * If this pe is involved in a write to the origin and + * it is the last sibling to complete then release + * the bios for the original write to the origin. */ - bio_list_merge(&sibling->origin_bios, &pe->origin_bios); + + if (pe->primary_pe && + atomic_dec_and_test(&pe->primary_pe->sibling_count)) + return bio_list_get(&pe->primary_pe->origin_bios); return NULL; } @@ -614,6 +619,7 @@ static struct bio *__flush_bios(struct pending_exception *pe) static void pending_complete(struct pending_exception *pe, int success) { struct exception *e; + struct pending_exception *primary_pe; struct dm_snapshot *s = pe->snap; struct bio *flush = NULL; @@ -662,7 +668,20 @@ static void pending_complete(struct pending_exception *pe, int success) } out: - free_pending_exception(pe); + primary_pe = pe->primary_pe; + + /* + * Free the pe if it's not linked to an origin write or if + * it's not itself a primary pe. + */ + if (!primary_pe || primary_pe != pe) + free_pending_exception(pe); + + /* + * Free the primary pe if nothing references it. 
+ */ + if (primary_pe && !atomic_read(&primary_pe->sibling_count)) + free_pending_exception(primary_pe); if (flush) flush_bios(flush); @@ -757,7 +776,8 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio) pe->e.old_chunk = chunk; bio_list_init(&pe->origin_bios); bio_list_init(&pe->snapshot_bios); - INIT_LIST_HEAD(&pe->siblings); + pe->primary_pe = NULL; + atomic_set(&pe->sibling_count, 1); pe->snap = s; pe->started = 0; @@ -916,26 +936,12 @@ static int snapshot_status(struct dm_target *ti, status_type_t type, /*----------------------------------------------------------------- * Origin methods *---------------------------------------------------------------*/ -static void list_merge(struct list_head *l1, struct list_head *l2) -{ - struct list_head *l1_n, *l2_p; - - l1_n = l1->next; - l2_p = l2->prev; - - l1->next = l2; - l2->prev = l1; - - l2_p->next = l1_n; - l1_n->prev = l2_p; -} - static int __origin_write(struct list_head *snapshots, struct bio *bio) { - int r = 1, first = 1; + int r = 1, first = 0; struct dm_snapshot *snap; struct exception *e; - struct pending_exception *pe, *next_pe, *last = NULL; + struct pending_exception *pe, *next_pe, *primary_pe = NULL; chunk_t chunk; LIST_HEAD(pe_queue); @@ -962,6 +968,9 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) * Check exception table to see if block * is already remapped in this snapshot * and trigger an exception if not. + * + * sibling_count is initialised to 1 so pending_complete() + * won't destroy the primary_pe while we're inside this loop. */ e = lookup_exception(&snap->complete, chunk); if (!e) { @@ -971,31 +980,60 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) snap->valid = 0; } else { - if (first) { - bio_list_add(&pe->origin_bios, bio); + if (!primary_pe) { + /* + * Either every pe here has same + * primary_pe or none has one yet. + */ + if (pe->primary_pe) + primary_pe = pe->primary_pe; + else { + primary_pe = pe; + first = 1; + } + + bio_list_add(&primary_pe->origin_bios, + bio); r = 0; - first = 0; } - if (last && list_empty(&pe->siblings)) - list_merge(&pe->siblings, - &last->siblings); + if (!pe->primary_pe) { + atomic_inc(&primary_pe->sibling_count); + pe->primary_pe = primary_pe; + } if (!pe->started) { pe->started = 1; list_add_tail(&pe->list, &pe_queue); } - last = pe; } } up_write(&snap->lock); } + if (!primary_pe) + goto out; + + /* + * If this is the first time we're processing this chunk and + * sibling_count is now 1 it means all the pending exceptions + * got completed while we were in the loop above, so it falls to + * us here to remove the primary_pe and submit any origin_bios. + */ + + if (first && atomic_dec_and_test(&primary_pe->sibling_count)) { + flush_bios(bio_list_get(&primary_pe->origin_bios)); + free_pending_exception(primary_pe); + /* If we got here, pe_queue is necessarily empty. */ + goto out; + } + /* * Now that we have a complete pe list we can start the copying. */ list_for_each_entry_safe(pe, next_pe, &pe_queue, list) start_copy(pe); + out: return r; } -- cgit v1.2.3 From 76df1c651b66bdf07d60b3d60789feb5f58d73e3 Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Mon, 27 Mar 2006 01:17:45 -0800 Subject: [PATCH] device-mapper snapshot: fix invalidation When a snapshot becomes invalid, s->valid is set to 0. In this state, a snapshot can no longer be accessed. When s->lock is acquired, before doing anything else, s->valid must be checked to ensure the snapshot remains valid. 
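The discipline described here reduces to a validity flag that every code path re-checks immediately after taking the lock. A minimal userspace model with a pthreads rwlock follows (illustrative names only; -1 stands in for -EIO). Build with something like cc snap.c -lpthread.

#include <pthread.h>
#include <stdio.h>

/* Toy snapshot: a validity flag guarded by an rwlock. */
struct snap {
    pthread_rwlock_t lock;
    int valid;
};

static int snap_write(struct snap *s)
{
    int r = 0;

    pthread_rwlock_wrlock(&s->lock);
    if (!s->valid) {
        /* first thing after taking the lock: re-check validity */
        r = -1;
        goto out;
    }
    /* ... safe to modify snapshot state here ... */
out:
    pthread_rwlock_unlock(&s->lock);
    return r;
}

int main(void)
{
    struct snap s = { .valid = 1 };

    pthread_rwlock_init(&s.lock, NULL);
    printf("write while valid: %d\n", snap_write(&s));
    s.valid = 0;    /* invalidation; done under the lock in real code */
    printf("write after invalidation: %d\n", snap_write(&s));
    return 0;
}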
This patch eliminates some races (that may cause panics) by adding some missing checks. At the same time, some unnecessary levels of indentation are removed and snapshot invalidation is moved into a single function that always generates a device-mapper event. Signed-off-by: Alasdair G Kergon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/dm-snap.c | 295 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 174 insertions(+), 121 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 475514bda9d0..14bd1a1815b1 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -392,6 +392,8 @@ static void read_snapshot_metadata(struct dm_snapshot *s) down_write(&s->lock); s->valid = 0; up_write(&s->lock); + + dm_table_event(s->table); } } @@ -601,6 +603,11 @@ static void error_bios(struct bio *bio) } } +static inline void error_snapshot_bios(struct pending_exception *pe) +{ + error_bios(bio_list_get(&pe->snapshot_bios)); +} + static struct bio *__flush_bios(struct pending_exception *pe) { /* @@ -616,6 +623,28 @@ static struct bio *__flush_bios(struct pending_exception *pe) return NULL; } +static void __invalidate_snapshot(struct dm_snapshot *s, + struct pending_exception *pe, int err) +{ + if (!s->valid) + return; + + if (err == -EIO) + DMERR("Invalidating snapshot: Error reading/writing."); + else if (err == -ENOMEM) + DMERR("Invalidating snapshot: Unable to allocate exception."); + + if (pe) + remove_exception(&pe->e); + + if (s->store.drop_snapshot) + s->store.drop_snapshot(&s->store); + + s->valid = 0; + + dm_table_event(s->table); +} + static void pending_complete(struct pending_exception *pe, int success) { struct exception *e; @@ -623,50 +652,53 @@ static void pending_complete(struct pending_exception *pe, int success) struct dm_snapshot *s = pe->snap; struct bio *flush = NULL; - if (success) { - e = alloc_exception(); - if (!e) { - DMWARN("Unable to allocate exception."); - down_write(&s->lock); - s->store.drop_snapshot(&s->store); - s->valid = 0; - flush = __flush_bios(pe); - up_write(&s->lock); - - error_bios(bio_list_get(&pe->snapshot_bios)); - goto out; - } - *e = pe->e; - - /* - * Add a proper exception, and remove the - * in-flight exception from the list. - */ + if (!success) { + /* Read/write error - snapshot is unusable */ down_write(&s->lock); - insert_exception(&s->complete, e); - remove_exception(&pe->e); + __invalidate_snapshot(s, pe, -EIO); flush = __flush_bios(pe); - - /* Submit any pending write bios */ up_write(&s->lock); - flush_bios(bio_list_get(&pe->snapshot_bios)); - } else { - /* Read/write error - snapshot is unusable */ + error_snapshot_bios(pe); + goto out; + } + + e = alloc_exception(); + if (!e) { down_write(&s->lock); - if (s->valid) - DMERR("Error reading/writing snapshot"); - s->store.drop_snapshot(&s->store); - s->valid = 0; - remove_exception(&pe->e); + __invalidate_snapshot(s, pe, -ENOMEM); flush = __flush_bios(pe); up_write(&s->lock); - error_bios(bio_list_get(&pe->snapshot_bios)); + error_snapshot_bios(pe); + goto out; + } + *e = pe->e; - dm_table_event(s->table); + /* + * Add a proper exception, and remove the + * in-flight exception from the list. 
+ */ + down_write(&s->lock); + if (!s->valid) { + flush = __flush_bios(pe); + up_write(&s->lock); + + free_exception(e); + + error_snapshot_bios(pe); + goto out; } + insert_exception(&s->complete, e); + remove_exception(&pe->e); + flush = __flush_bios(pe); + + up_write(&s->lock); + + /* Submit any pending write bios */ + flush_bios(bio_list_get(&pe->snapshot_bios)); + out: primary_pe = pe->primary_pe; @@ -758,39 +790,45 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio) if (e) { /* cast the exception to a pending exception */ pe = container_of(e, struct pending_exception, e); + goto out; + } - } else { - /* - * Create a new pending exception, we don't want - * to hold the lock while we do this. - */ - up_write(&s->lock); - pe = alloc_pending_exception(); - down_write(&s->lock); + /* + * Create a new pending exception, we don't want + * to hold the lock while we do this. + */ + up_write(&s->lock); + pe = alloc_pending_exception(); + down_write(&s->lock); - e = lookup_exception(&s->pending, chunk); - if (e) { - free_pending_exception(pe); - pe = container_of(e, struct pending_exception, e); - } else { - pe->e.old_chunk = chunk; - bio_list_init(&pe->origin_bios); - bio_list_init(&pe->snapshot_bios); - pe->primary_pe = NULL; - atomic_set(&pe->sibling_count, 1); - pe->snap = s; - pe->started = 0; - - if (s->store.prepare_exception(&s->store, &pe->e)) { - free_pending_exception(pe); - s->valid = 0; - return NULL; - } + if (!s->valid) { + free_pending_exception(pe); + return NULL; + } - insert_exception(&s->pending, &pe->e); - } + e = lookup_exception(&s->pending, chunk); + if (e) { + free_pending_exception(pe); + pe = container_of(e, struct pending_exception, e); + goto out; } + pe->e.old_chunk = chunk; + bio_list_init(&pe->origin_bios); + bio_list_init(&pe->snapshot_bios); + pe->primary_pe = NULL; + atomic_set(&pe->sibling_count, 1); + pe->snap = s; + pe->started = 0; + + if (s->store.prepare_exception(&s->store, &pe->e)) { + free_pending_exception(pe); + return NULL; + } + + insert_exception(&s->pending, &pe->e); + + out: return pe; } @@ -807,13 +845,15 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, { struct exception *e; struct dm_snapshot *s = (struct dm_snapshot *) ti->private; + int copy_needed = 0; int r = 1; chunk_t chunk; - struct pending_exception *pe; + struct pending_exception *pe = NULL; chunk = sector_to_chunk(s, bio->bi_sector); /* Full snapshots are not usable */ + /* To get here the table must be live so s->active is always set. 
*/ if (!s->valid) return -EIO; @@ -831,36 +871,41 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, * to copy an exception */ down_write(&s->lock); + if (!s->valid) { + r = -EIO; + goto out_unlock; + } + /* If the block is already remapped - use that, else remap it */ e = lookup_exception(&s->complete, chunk); if (e) { remap_exception(s, e, bio); - up_write(&s->lock); - - } else { - pe = __find_pending_exception(s, bio); - - if (!pe) { - if (s->store.drop_snapshot) - s->store.drop_snapshot(&s->store); - s->valid = 0; - r = -EIO; - up_write(&s->lock); - } else { - remap_exception(s, &pe->e, bio); - bio_list_add(&pe->snapshot_bios, bio); - - if (!pe->started) { - /* this is protected by snap->lock */ - pe->started = 1; - up_write(&s->lock); - start_copy(pe); - } else - up_write(&s->lock); - r = 0; - } + goto out_unlock; + } + + pe = __find_pending_exception(s, bio); + if (!pe) { + __invalidate_snapshot(s, pe, -ENOMEM); + r = -EIO; + goto out_unlock; } + remap_exception(s, &pe->e, bio); + bio_list_add(&pe->snapshot_bios, bio); + + if (!pe->started) { + /* this is protected by snap->lock */ + pe->started = 1; + copy_needed = 1; + } + + r = 0; + + out_unlock: + up_write(&s->lock); + + if (copy_needed) + start_copy(pe); } else { /* * FIXME: this read path scares me because we @@ -872,6 +917,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, /* Do reads */ down_read(&s->lock); + if (!s->valid) { + up_read(&s->lock); + return -EIO; + } + /* See if it it has been remapped */ e = lookup_exception(&s->complete, chunk); if (e) @@ -948,15 +998,15 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) /* Do all the snapshots on this origin */ list_for_each_entry (snap, snapshots, list) { + down_write(&snap->lock); + /* Only deal with valid and active snapshots */ if (!snap->valid || !snap->active) - continue; + goto next_snapshot; /* Nothing to do if writing beyond end of snapshot */ if (bio->bi_sector >= dm_table_get_size(snap->table)) - continue; - - down_write(&snap->lock); + goto next_snapshot; /* * Remember, different snapshots can have @@ -973,40 +1023,43 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) * won't destroy the primary_pe while we're inside this loop. */ e = lookup_exception(&snap->complete, chunk); - if (!e) { - pe = __find_pending_exception(snap, bio); - if (!pe) { - snap->store.drop_snapshot(&snap->store); - snap->valid = 0; - - } else { - if (!primary_pe) { - /* - * Either every pe here has same - * primary_pe or none has one yet. - */ - if (pe->primary_pe) - primary_pe = pe->primary_pe; - else { - primary_pe = pe; - first = 1; - } - - bio_list_add(&primary_pe->origin_bios, - bio); - r = 0; - } - if (!pe->primary_pe) { - atomic_inc(&primary_pe->sibling_count); - pe->primary_pe = primary_pe; - } - if (!pe->started) { - pe->started = 1; - list_add_tail(&pe->list, &pe_queue); - } + if (e) + goto next_snapshot; + + pe = __find_pending_exception(snap, bio); + if (!pe) { + __invalidate_snapshot(snap, pe, ENOMEM); + goto next_snapshot; + } + + if (!primary_pe) { + /* + * Either every pe here has same + * primary_pe or none has one yet. 
+ */ + if (pe->primary_pe) + primary_pe = pe->primary_pe; + else { + primary_pe = pe; + first = 1; } + + bio_list_add(&primary_pe->origin_bios, bio); + + r = 0; + } + + if (!pe->primary_pe) { + atomic_inc(&primary_pe->sibling_count); + pe->primary_pe = primary_pe; + } + + if (!pe->started) { + pe->started = 1; + list_add_tail(&pe->list, &pe_queue); } + next_snapshot: up_write(&snap->lock); } -- cgit v1.2.3 From 930d332a23682202c07df0276dd665a57755b37d Mon Sep 17 00:00:00 2001 From: Jun'ichi Nomura Date: Mon, 27 Mar 2006 01:17:47 -0800 Subject: [PATCH] drivers/md/dm-raid1.c: Fix inconsistent mirroring after interrupted recovery dm-mirror has a potential data corruption problem: while the on-disk log shows that all disk contents are in-sync, the actual contents of the disks are not synchronized. This problem occurs if the initial recovery (synching) is interrupted and resumed. This patch fixes the problem. Background: rh_dec() changes the region state from RH_NOSYNC (out-of-sync) to RH_CLEAN (in-sync), which results in the corresponding bit of clean_bits being set. This is harmful if an on-disk log is used and the map is removed/suspended before the initial sync is completed. clean_bits is written down to the on-disk log at map removal and, upon resume, it is read and copied to sync_bits. Since the recovery process refers to sync_bits to find a region to be recovered, a region whose state was changed from RH_NOSYNC to RH_CLEAN is no longer recovered. If you haven't applied dm-raid1-read-balancing.patch, proposed in dm-devel some time ago, the contents of the mirrored disk just corrupt silently. If you have, balanced reads may return bogus data from out-of-sync disks. Description: Keep the RH_NOSYNC state unchanged when I/O on the region completes. The region will be changed to RH_RECOVERING when recovery starts on it and reclaimed when the recovery completes, so the region hash entry is not leaked. Alasdair said: I've analysed the relevant part of the state machine and I believe that the patch is correct. (Further work on this code is still needed - this patch has the side-effect of holding onto memory unnecessarily for long periods of time under certain workloads - but better that than corrupting data.)
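A rough model of the rh_dec() decision this patch changes is sketched below; the state names mirror the driver's, but the code is purely illustrative.

#include <stdio.h>

enum region_state { RH_CLEAN, RH_DIRTY, RH_NOSYNC, RH_RECOVERING };

static const char *on_last_pending_io(enum region_state *state)
{
    switch (*state) {
    case RH_RECOVERING:
        return "add to quiesced_regions";
    case RH_DIRTY:
        *state = RH_CLEAN;    /* was in sync: safe to mark clean */
        return "add to clean_regions";
    case RH_NOSYNC:
        /* fixed behaviour: stay out-of-sync, so the on-disk log
         * can never claim this region is in sync */
        return "keep off the clean list";
    default:
        return "no action";
    }
}

int main(void)
{
    enum region_state st = RH_NOSYNC;

    printf("RH_NOSYNC -> %s\n", on_last_pending_io(&st));
    st = RH_DIRTY;
    printf("RH_DIRTY -> %s\n", on_last_pending_io(&st));
    return 0;
}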
Signed-off-by: Jun'ichi Nomura Acked-by: Alasdair G Kergon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/dm-raid1.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 4e90f231fbfb..3d90f1b37e08 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -402,9 +402,21 @@ static void rh_dec(struct region_hash *rh, region_t region) spin_lock_irqsave(&rh->region_lock, flags); if (atomic_dec_and_test(®->pending)) { + /* + * There is no pending I/O for this region. + * We can move the region to corresponding list for next action. + * At this point, the region is not yet connected to any list. + * + * If the state is RH_NOSYNC, the region should be kept off + * from clean list. + * The hash entry for RH_NOSYNC will remain in memory + * until the region is recovered or the map is reloaded. + */ + + /* do nothing for RH_NOSYNC */ if (reg->state == RH_RECOVERING) { list_add_tail(®->list, &rh->quiesced_regions); - } else { + } else if (reg->state == RH_DIRTY) { reg->state = RH_CLEAN; list_add(®->list, &rh->clean_regions); } -- cgit v1.2.3 From 4ee218cd67b385759993a6c840ea45f0ee0a8b30 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 27 Mar 2006 01:17:48 -0800 Subject: [PATCH] dm: remove SECTOR_FORMAT We don't know what type sector_t has. Sometimes it's unsigned long, sometimes it's unsigned long long. For example on ppc64 it's unsigned long with CONFIG_LBD=n and on x86_64 it's unsigned long long with CONFIG_LBD=n. The way to handle all of this is to always use unsigned long long and to always typecast the sector_t when printing it. Acked-by: Alasdair G Kergon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/dm-crypt.c | 11 +++++++---- drivers/md/dm-linear.c | 8 +++++--- drivers/md/dm-raid1.c | 15 ++++++++------- drivers/md/dm-snap.c | 11 ++++++----- drivers/md/dm-stripe.c | 11 ++++++----- drivers/md/dm.h | 10 ---------- 6 files changed, 32 insertions(+), 34 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 259e86f26549..61a590bb6241 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -518,6 +518,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) char *ivopts; unsigned int crypto_flags; unsigned int key_size; + unsigned long long tmpll; if (argc != 5) { ti->error = PFX "Not enough arguments"; @@ -633,15 +634,17 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad5; } - if (sscanf(argv[2], SECTOR_FORMAT, &cc->iv_offset) != 1) { + if (sscanf(argv[2], "%llu", &tmpll) != 1) { ti->error = PFX "Invalid iv_offset sector"; goto bad5; } + cc->iv_offset = tmpll; - if (sscanf(argv[4], SECTOR_FORMAT, &cc->start) != 1) { + if (sscanf(argv[4], "%llu", &tmpll) != 1) { ti->error = PFX "Invalid device sector"; goto bad5; } + cc->start = tmpll; if (dm_get_device(ti, argv[3], cc->start, ti->len, dm_table_get_mode(ti->table), &cc->dev)) { @@ -885,8 +888,8 @@ static int crypt_status(struct dm_target *ti, status_type_t type, result[sz++] = '-'; } - DMEMIT(" " SECTOR_FORMAT " %s " SECTOR_FORMAT, - cc->iv_offset, cc->dev->name, cc->start); + DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset, + cc->dev->name, (unsigned long long)cc->start); break; } return 0; diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 6a2cd5dc8a63..daf586c0898d 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ 
-26,6 +26,7 @@ struct linear_c { static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct linear_c *lc; + unsigned long long tmp; if (argc != 2) { ti->error = "dm-linear: Invalid argument count"; @@ -38,10 +39,11 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -ENOMEM; } - if (sscanf(argv[1], SECTOR_FORMAT, &lc->start) != 1) { + if (sscanf(argv[1], "%llu", &tmp) != 1) { ti->error = "dm-linear: Invalid device sector"; goto bad; } + lc->start = tmp; if (dm_get_device(ti, argv[0], lc->start, ti->len, dm_table_get_mode(ti->table), &lc->dev)) { @@ -87,8 +89,8 @@ static int linear_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: - snprintf(result, maxlen, "%s " SECTOR_FORMAT, lc->dev->name, - lc->start); + snprintf(result, maxlen, "%s %llu", lc->dev->name, + (unsigned long long)lc->start); break; } return 0; diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 3d90f1b37e08..d12cf3e5e076 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -934,9 +934,9 @@ static inline int _check_region_size(struct dm_target *ti, uint32_t size) static int get_mirror(struct mirror_set *ms, struct dm_target *ti, unsigned int mirror, char **argv) { - sector_t offset; + unsigned long long offset; - if (sscanf(argv[1], SECTOR_FORMAT, &offset) != 1) { + if (sscanf(argv[1], "%llu", &offset) != 1) { ti->error = "dm-mirror: Invalid offset"; return -EINVAL; } @@ -1203,16 +1203,17 @@ static int mirror_status(struct dm_target *ti, status_type_t type, for (m = 0; m < ms->nr_mirrors; m++) DMEMIT("%s ", ms->mirror[m].dev->name); - DMEMIT(SECTOR_FORMAT "/" SECTOR_FORMAT, - ms->rh.log->type->get_sync_count(ms->rh.log), - ms->nr_regions); + DMEMIT("%llu/%llu", + (unsigned long long)ms->rh.log->type-> + get_sync_count(ms->rh.log), + (unsigned long long)ms->nr_regions); break; case STATUSTYPE_TABLE: DMEMIT("%d ", ms->nr_mirrors); for (m = 0; m < ms->nr_mirrors; m++) - DMEMIT("%s " SECTOR_FORMAT " ", - ms->mirror[m].dev->name, ms->mirror[m].offset); + DMEMIT("%s %llu ", ms->mirror[m].dev->name, + (unsigned long long)ms->mirror[m].offset); } return 0; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 14bd1a1815b1..a5765f9fbe02 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -959,9 +959,9 @@ static int snapshot_status(struct dm_target *ti, status_type_t type, snap->store.fraction_full(&snap->store, &numerator, &denominator); - snprintf(result, maxlen, - SECTOR_FORMAT "/" SECTOR_FORMAT, - numerator, denominator); + snprintf(result, maxlen, "%llu/%llu", + (unsigned long long)numerator, + (unsigned long long)denominator); } else snprintf(result, maxlen, "Unknown"); @@ -974,9 +974,10 @@ static int snapshot_status(struct dm_target *ti, status_type_t type, * to make private copies if the output is to * make sense. 
*/ - snprintf(result, maxlen, "%s %s %c " SECTOR_FORMAT, + snprintf(result, maxlen, "%s %s %c %llu", snap->origin->name, snap->cow->name, - snap->type, snap->chunk_size); + snap->type, + (unsigned long long)snap->chunk_size); break; } diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 697aacafb02a..204f796ee9e3 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -49,9 +49,9 @@ static inline struct stripe_c *alloc_context(unsigned int stripes) static int get_stripe(struct dm_target *ti, struct stripe_c *sc, unsigned int stripe, char **argv) { - sector_t start; + unsigned long long start; - if (sscanf(argv[1], SECTOR_FORMAT, &start) != 1) + if (sscanf(argv[1], "%llu", &start) != 1) return -EINVAL; if (dm_get_device(ti, argv[0], start, sc->stripe_width, @@ -201,10 +201,11 @@ static int stripe_status(struct dm_target *ti, break; case STATUSTYPE_TABLE: - DMEMIT("%d " SECTOR_FORMAT, sc->stripes, sc->chunk_mask + 1); + DMEMIT("%d %llu", sc->stripes, + (unsigned long long)sc->chunk_mask + 1); for (i = 0; i < sc->stripes; i++) - DMEMIT(" %s " SECTOR_FORMAT, sc->stripe[i].dev->name, - sc->stripe[i].physical_start); + DMEMIT(" %s %llu", sc->stripe[i].dev->name, + (unsigned long long)sc->stripe[i].physical_start); break; } return 0; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 4eaf075da217..0ff11d6f8158 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -23,16 +23,6 @@ #define DMEMIT(x...) sz += ((sz >= maxlen) ? \ 0 : scnprintf(result + sz, maxlen - sz, x)) -/* - * FIXME: I think this should be with the definition of sector_t - * in types.h. - */ -#ifdef CONFIG_LBD -#define SECTOR_FORMAT "%llu" -#else -#define SECTOR_FORMAT "%lu" -#endif - #define SECTOR_SHIFT 9 /* -- cgit v1.2.3 From 969429b504ae866d3f8b1cafd68a2c099e305093 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:17:49 -0800 Subject: [PATCH] dm: make sure QUEUE_FLAG_CLUSTER is set properly This flag should be set for a virtual device iff it is set for all underlying devices. 
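Because the flag is stored inverted (no_cluster, with 0 the default), the combining rule in the diff below is a plain OR across all devices. A small illustrative model, not the kernel structures:

#include <stdio.h>

struct io_restrictions {
    unsigned char no_cluster;    /* set if any device lacks clustering */
};

static void combine(struct io_restrictions *lhs,
                    const struct io_restrictions *rhs)
{
    lhs->no_cluster |= rhs->no_cluster;
}

int main(void)
{
    struct io_restrictions table = { 0 };
    int dev_has_cluster[3] = { 1, 0, 1 };    /* one member lacks the flag */

    for (int i = 0; i < 3; i++) {
        struct io_restrictions dev = { !dev_has_cluster[i] };
        combine(&table, &dev);
    }
    printf("QUEUE_FLAG_CLUSTER on the dm device: %s\n",
           table.no_cluster ? "clear" : "set");
    return 0;
}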
Signed-off-by: Neil Brown Acked-by: Alasdair G Kergon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/dm-table.c | 9 +++++++++ include/linux/device-mapper.h | 1 + 2 files changed, 10 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 907b08ddb783..9558a4acec12 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -97,6 +97,8 @@ static void combine_restrictions_low(struct io_restrictions *lhs, lhs->seg_boundary_mask = min_not_zero(lhs->seg_boundary_mask, rhs->seg_boundary_mask); + + lhs->no_cluster |= rhs->no_cluster; } /* @@ -523,6 +525,8 @@ int dm_get_device(struct dm_target *ti, const char *path, sector_t start, rs->seg_boundary_mask = min_not_zero(rs->seg_boundary_mask, q->seg_boundary_mask); + + rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); } return r; @@ -832,6 +836,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) q->hardsect_size = t->limits.hardsect_size; q->max_segment_size = t->limits.max_segment_size; q->seg_boundary_mask = t->limits.seg_boundary_mask; + if (t->limits.no_cluster) + q->queue_flags &= ~(1 << QUEUE_FLAG_CLUSTER); + else + q->queue_flags |= (1 << QUEUE_FLAG_CLUSTER); + } unsigned int dm_table_get_num_targets(struct dm_table *t) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 51e0e95a421a..aee10b2ea4c6 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -97,6 +97,7 @@ struct io_restrictions { unsigned short hardsect_size; unsigned int max_segment_size; unsigned long seg_boundary_mask; + unsigned char no_cluster; /* inverted so that 0 is default */ }; struct dm_target { -- cgit v1.2.3 From 138728dc96529f20dfe970c470e51885a60e329f Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Mon, 27 Mar 2006 01:17:50 -0800 Subject: [PATCH] dm snapshot: fix kcopyd destructor Before removing a snapshot, wait for the completion of any kcopyd jobs using it. Do this by maintaining a count (nr_jobs) of how many outstanding jobs each kcopyd_client has. The snapshot destructor first unregisters the snapshot so that no new kcopyd jobs (created by writes to the origin) will reference that particular snapshot. kcopyd_client_destroy() is now run next to wait for the completion of any outstanding jobs before the snapshot exception structures (that those jobs reference) are freed. Signed-off-by: Alasdair G Kergon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/dm-snap.c | 6 +++++- drivers/md/kcopyd.c | 17 ++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index a5765f9fbe02..08312b46463a 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -559,8 +559,12 @@ static void snapshot_dtr(struct dm_target *ti) { struct dm_snapshot *s = (struct dm_snapshot *) ti->private; + /* Prevent further origin writes from using this snapshot. */ + /* After this returns there can be no new kcopyd jobs. 
*/ unregister_snapshot(s); + kcopyd_client_destroy(s->kcopyd_client); + exit_exception_table(&s->pending, pending_cache); exit_exception_table(&s->complete, exception_cache); @@ -569,7 +573,7 @@ static void snapshot_dtr(struct dm_target *ti) dm_put_device(ti, s->origin); dm_put_device(ti, s->cow); - kcopyd_client_destroy(s->kcopyd_client); + kfree(s); } diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c index 9dcb2c8a3853..ed71f3f94620 100644 --- a/drivers/md/kcopyd.c +++ b/drivers/md/kcopyd.c @@ -44,6 +44,9 @@ struct kcopyd_client { struct page_list *pages; unsigned int nr_pages; unsigned int nr_free_pages; + + wait_queue_head_t destroyq; + atomic_t nr_jobs; }; static struct page_list *alloc_pl(void) @@ -292,10 +295,15 @@ static int run_complete_job(struct kcopyd_job *job) int read_err = job->read_err; unsigned int write_err = job->write_err; kcopyd_notify_fn fn = job->fn; + struct kcopyd_client *kc = job->kc; - kcopyd_put_pages(job->kc, job->pages); + kcopyd_put_pages(kc, job->pages); mempool_free(job, _job_pool); fn(read_err, write_err, context); + + if (atomic_dec_and_test(&kc->nr_jobs)) + wake_up(&kc->destroyq); + return 0; } @@ -430,6 +438,7 @@ static void do_work(void *ignored) */ static void dispatch_job(struct kcopyd_job *job) { + atomic_inc(&job->kc->nr_jobs); push(&_pages_jobs, job); wake(); } @@ -669,6 +678,9 @@ int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result) return r; } + init_waitqueue_head(&kc->destroyq); + atomic_set(&kc->nr_jobs, 0); + client_add(kc); *result = kc; return 0; @@ -676,6 +688,9 @@ int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result) void kcopyd_client_destroy(struct kcopyd_client *kc) { + /* Wait for completion of all jobs submitted by this client. */ + wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs)); + dm_io_put(kc->nr_pages); client_free_pages(kc); client_del(kc); -- cgit v1.2.3 From 1ecac7fd74f2e5fb06a7719ecba55fb5778a9a47 Mon Sep 17 00:00:00 2001 From: Jun'ichi Nomura Date: Mon, 27 Mar 2006 01:17:51 -0800 Subject: [PATCH] dm flush queue EINTR If dm_suspend() is cancelled, bios already added to the deferred list need to be submitted. Otherwise they remain 'in limbo' until there's a dm_resume(). Signed-off-by: Jun'ichi Nomura Signed-off-by: Alasdair G Kergon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/dm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a64798ef481e..48be69f3bcfa 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1107,6 +1107,7 @@ int dm_suspend(struct mapped_device *md, int do_lockfs) { struct dm_table *map = NULL; DECLARE_WAITQUEUE(wait, current); + struct bio *def; int r = -EINVAL; down(&md->suspend_lock); @@ -1166,9 +1167,11 @@ int dm_suspend(struct mapped_device *md, int do_lockfs) /* were we interrupted ? */ r = -EINTR; if (atomic_read(&md->pending)) { + clear_bit(DMF_BLOCK_IO, &md->flags); + def = bio_list_get(&md->deferred); + __flush_deferred_io(md, def); up_write(&md->io_lock); unlock_fs(md); - clear_bit(DMF_BLOCK_IO, &md->flags); goto out; } up_write(&md->io_lock); -- cgit v1.2.3 From 7e51f257e87297a5b6fe6d136a8ef67206aaf3a8 Mon Sep 17 00:00:00 2001 From: Mike Anderson Date: Mon, 27 Mar 2006 01:17:52 -0800 Subject: [PATCH] dm: store md name The patch stores a printable device number in struct mapped_device for use in warning messages and with a proposed netlink interface. 
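A userspace analogue of what gets stored, a "major:minor" string rendered into a fixed 16-byte buffer, might look like the following; the kernel side uses format_dev_t(), as the diff below shows, and the device numbers here are made up.

#include <stdio.h>
#include <sys/sysmacros.h>
#include <sys/types.h>

int main(void)
{
    char name[16];
    dev_t dev = makedev(253, 4);    /* a made-up dm device number */

    snprintf(name, sizeof(name), "%u:%u", major(dev), minor(dev));
    printf("stored name: %s\n", name);    /* prints "stored name: 253:4" */
    return 0;
}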
Signed-off-by: Mike Anderson Signed-off-by: Alasdair G Kergon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/dm.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 48be69f3bcfa..f140d499602a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -69,6 +69,7 @@ struct mapped_device { request_queue_t *queue; struct gendisk *disk; + char name[16]; void *interface_ptr; @@ -842,6 +843,7 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent) md->disk->private_data = md; sprintf(md->disk->disk_name, "dm-%d", minor); add_disk(md->disk); + format_dev_t(md->name, MKDEV(_major, minor)); atomic_set(&md->pending, 0); init_waitqueue_head(&md->wait); -- cgit v1.2.3 From 9ade92a9a5b0a3a10efa6551b8c67a9277bf0438 Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Mon, 27 Mar 2006 01:17:53 -0800 Subject: [PATCH] dm: tidy mdptr Change dm_get_mdptr() to take a struct mapped_device instead of dev_t. Signed-off-by: Alasdair G Kergon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/dm-ioctl.c | 16 +++++++++++++--- drivers/md/dm.c | 10 ++-------- drivers/md/dm.h | 2 +- 3 files changed, 16 insertions(+), 12 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 442e2be6052e..0693b6f54b7d 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -600,12 +600,22 @@ static int dev_create(struct dm_ioctl *param, size_t param_size) */ static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) { + struct mapped_device *md; + void *mdptr = NULL; + if (*param->uuid) return __get_uuid_cell(param->uuid); - else if (*param->name) + + if (*param->name) return __get_name_cell(param->name); - else - return dm_get_mdptr(huge_decode_dev(param->dev)); + + md = dm_get_md(huge_decode_dev(param->dev)); + if (md) { + mdptr = dm_get_mdptr(md); + dm_put(md); + } + + return mdptr; } static struct mapped_device *find_device(struct dm_ioctl *param) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f140d499602a..3d121cbc2fde 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -990,15 +990,9 @@ struct mapped_device *dm_get_md(dev_t dev) return md; } -void *dm_get_mdptr(dev_t dev) +void *dm_get_mdptr(struct mapped_device *md) { - struct mapped_device *md; - void *mdptr = NULL; - - md = dm_find_md(dev); - if (md) - mdptr = md->interface_ptr; - return mdptr; + return md->interface_ptr; } void dm_set_mdptr(struct mapped_device *md, void *ptr) diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 0ff11d6f8158..17ffa8d671a7 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -47,7 +47,7 @@ struct mapped_device; int dm_create(struct mapped_device **md); int dm_create_with_minor(unsigned int minor, struct mapped_device **md); void dm_set_mdptr(struct mapped_device *md, void *ptr); -void *dm_get_mdptr(dev_t dev); +void *dm_get_mdptr(struct mapped_device *md); struct mapped_device *dm_get_md(dev_t dev); /* -- cgit v1.2.3 From 1134e5ae79bab61c05657ca35a6297cf87202e35 Mon Sep 17 00:00:00 2001 From: Mike Anderson Date: Mon, 27 Mar 2006 01:17:54 -0800 Subject: [PATCH] dm table: store md Store an up-pointer to the owning struct mapped_device in every table when it is created. Access it with: struct mapped_device *dm_table_get_md(struct dm_table *t) Tables linked to md must be destroyed before the md itself. 
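The accompanying rule, that handing out the up-pointer takes a reference (dm_table_get_md() calls dm_get() before returning, leaving the caller to drop it), can be modelled in a few lines of userspace C. Hypothetical names; a bare counter stands in for the kernel's reference counting.

#include <stdio.h>

struct mapped_device { int holders; };
struct dm_table { struct mapped_device *md; };

static struct mapped_device *table_get_md(struct dm_table *t)
{
    t->md->holders++;             /* the dm_get() in the real code */
    return t->md;
}

static void put_md(struct mapped_device *md)
{
    if (--md->holders == 0)
        printf("last holder gone: the device can now be freed\n");
}

int main(void)
{
    struct mapped_device md = { .holders = 1 };    /* creator's reference */
    struct dm_table t = { .md = &md };

    struct mapped_device *m = table_get_md(&t);
    /* ... use m, knowing md cannot vanish underneath us ... */
    put_md(m);      /* caller balances the reference it was given */
    put_md(&md);    /* creator drops its own reference last */
    return 0;
}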
Signed-off-by: Mike Anderson Signed-off-by: Alasdair G Kergon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/dm-ioctl.c | 32 +++++++++++++++++++++----------- drivers/md/dm-table.c | 13 ++++++++++++- drivers/md/dm.c | 6 +++--- drivers/md/dm.h | 4 +++- 4 files changed, 39 insertions(+), 16 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 0693b6f54b7d..65826bdac00c 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -244,9 +244,9 @@ static void __hash_remove(struct hash_cell *hc) dm_table_put(table); } - dm_put(hc->md); if (hc->new_map) dm_table_put(hc->new_map); + dm_put(hc->md); free_cell(hc); } @@ -985,33 +985,43 @@ static int table_load(struct dm_ioctl *param, size_t param_size) int r; struct hash_cell *hc; struct dm_table *t; + struct mapped_device *md; + + md = find_device(param); + if (!md) + return -ENXIO; - r = dm_table_create(&t, get_mode(param), param->target_count); + r = dm_table_create(&t, get_mode(param), param->target_count, md); if (r) - return r; + goto out; r = populate_table(t, param, param_size); if (r) { dm_table_put(t); - return r; + goto out; } down_write(&_hash_lock); - hc = __find_device_hash_cell(param); - if (!hc) { - DMWARN("device doesn't appear to be in the dev hash table."); - up_write(&_hash_lock); + hc = dm_get_mdptr(md); + if (!hc || hc->md != md) { + DMWARN("device has been removed from the dev hash table."); dm_table_put(t); - return -ENXIO; + up_write(&_hash_lock); + r = -ENXIO; + goto out; } if (hc->new_map) dm_table_put(hc->new_map); hc->new_map = t; + up_write(&_hash_lock); + param->flags |= DM_INACTIVE_PRESENT_FLAG; + r = __dev_status(md, param); + +out: + dm_put(md); - r = __dev_status(hc->md, param); - up_write(&_hash_lock); return r; } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 9558a4acec12..84f4594afd3c 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -22,6 +22,7 @@ #define CHILDREN_PER_NODE (KEYS_PER_NODE + 1) struct dm_table { + struct mapped_device *md; atomic_t holders; /* btree table */ @@ -206,7 +207,8 @@ static int alloc_targets(struct dm_table *t, unsigned int num) return 0; } -int dm_table_create(struct dm_table **result, int mode, unsigned num_targets) +int dm_table_create(struct dm_table **result, int mode, + unsigned num_targets, struct mapped_device *md) { struct dm_table *t = kmalloc(sizeof(*t), GFP_KERNEL); @@ -229,6 +231,7 @@ int dm_table_create(struct dm_table **result, int mode, unsigned num_targets) } t->mode = mode; + t->md = md; *result = t; return 0; } @@ -952,12 +955,20 @@ int dm_table_flush_all(struct dm_table *t) return ret; } +struct mapped_device *dm_table_get_md(struct dm_table *t) +{ + dm_get(t->md); + + return t->md; +} + EXPORT_SYMBOL(dm_vcalloc); EXPORT_SYMBOL(dm_get_device); EXPORT_SYMBOL(dm_put_device); EXPORT_SYMBOL(dm_table_event); EXPORT_SYMBOL(dm_table_get_size); EXPORT_SYMBOL(dm_table_get_mode); +EXPORT_SYMBOL(dm_table_get_md); EXPORT_SYMBOL(dm_table_put); EXPORT_SYMBOL(dm_table_get); EXPORT_SYMBOL(dm_table_unplug_all); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 3d121cbc2fde..b99df48cffed 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1007,18 +1007,18 @@ void dm_get(struct mapped_device *md) void dm_put(struct mapped_device *md) { - struct dm_table *map = dm_get_table(md); + struct dm_table *map; if (atomic_dec_and_test(&md->holders)) { + map = dm_get_table(md); if (!dm_suspended(md)) { dm_table_presuspend_targets(map); dm_table_postsuspend_targets(map); } 
__unbind(md); + dm_table_put(map); free_dev(md); } - - dm_table_put(map); } /* diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 17ffa8d671a7..79e800051a10 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -89,7 +89,8 @@ int dm_suspended(struct mapped_device *md); * Functions for manipulating a table. Tables are also reference * counted. *---------------------------------------------------------------*/ -int dm_table_create(struct dm_table **result, int mode, unsigned num_targets); +int dm_table_create(struct dm_table **result, int mode, + unsigned num_targets, struct mapped_device *md); void dm_table_get(struct dm_table *t); void dm_table_put(struct dm_table *t); @@ -107,6 +108,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q); unsigned int dm_table_get_num_targets(struct dm_table *t); struct list_head *dm_table_get_devices(struct dm_table *t); int dm_table_get_mode(struct dm_table *t); +struct mapped_device *dm_table_get_md(struct dm_table *t); void dm_table_presuspend_targets(struct dm_table *t); void dm_table_postsuspend_targets(struct dm_table *t); void dm_table_resume_targets(struct dm_table *t); -- cgit v1.2.3 From 3ac51e741a46af7a20f55e79d3e3aeaa93c6c544 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 27 Mar 2006 01:17:54 -0800 Subject: [PATCH] dm store geometry Allow drive geometry to be stored with a new DM_DEV_SET_GEOMETRY ioctl. Device-mapper will now respond to HDIO_GETGEO. If the geometry information is not available, zero will be returned for all of the parameters. Signed-off-by: Darrick J. Wong Signed-off-by: Alasdair G Kergon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/dm-ioctl.c | 52 +++++++++++++++++++++++++++++++++++++++++++- drivers/md/dm.c | 46 +++++++++++++++++++++++++++++++++++++++ drivers/md/dm.h | 7 ++++++ include/linux/compat_ioctl.h | 2 ++ include/linux/dm-ioctl.h | 17 +++++++++++++-- 5 files changed, 121 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 65826bdac00c..8edd6435414d 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -700,6 +701,54 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size) return dm_hash_rename(param->name, new_name); } +static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) +{ + int r = -EINVAL, x; + struct mapped_device *md; + struct hd_geometry geometry; + unsigned long indata[4]; + char *geostr = (char *) param + param->data_start; + + md = find_device(param); + if (!md) + return -ENXIO; + + if (geostr < (char *) (param + 1) || + invalid_str(geostr, (void *) param + param_size)) { + DMWARN("Invalid geometry supplied."); + goto out; + } + + x = sscanf(geostr, "%lu %lu %lu %lu", indata, + indata + 1, indata + 2, indata + 3); + + if (x != 4) { + DMWARN("Unable to interpret geometry settings."); + goto out; + } + + if (indata[0] > 65535 || indata[1] > 255 || + indata[2] > 255 || indata[3] > ULONG_MAX) { + DMWARN("Geometry exceeds range limits."); + goto out; + } + + geometry.cylinders = indata[0]; + geometry.heads = indata[1]; + geometry.sectors = indata[2]; + geometry.start = indata[3]; + + r = dm_set_geometry(md, &geometry); + if (!r) + r = __dev_status(md, param); + + param->data_size = 0; + +out: + dm_put(md); + return r; +} + static int do_suspend(struct dm_ioctl *param) { int r = 0; @@ -1234,7 +1283,8 @@ static ioctl_fn lookup_ioctl(unsigned int cmd) {DM_LIST_VERSIONS_CMD, 
list_versions}, - {DM_TARGET_MSG_CMD, target_message} + {DM_TARGET_MSG_CMD, target_message}, + {DM_DEV_SET_GEOMETRY_CMD, dev_set_geometry} }; return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b99df48cffed..973e63d530ae 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -17,6 +17,7 @@ #include #include #include +#include #include static const char *_name = DM_NAME; @@ -102,6 +103,9 @@ struct mapped_device { */ struct super_block *frozen_sb; struct block_device *suspended_bdev; + + /* forced geometry settings */ + struct hd_geometry geometry; }; #define MIN_IOS 256 @@ -227,6 +231,13 @@ static int dm_blk_close(struct inode *inode, struct file *file) return 0; } +static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct mapped_device *md = bdev->bd_disk->private_data; + + return dm_get_geometry(md, geo); +} + static inline struct dm_io *alloc_io(struct mapped_device *md) { return mempool_alloc(md->io_pool, GFP_NOIO); @@ -313,6 +324,33 @@ struct dm_table *dm_get_table(struct mapped_device *md) return t; } +/* + * Get the geometry associated with a dm device + */ +int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) +{ + *geo = md->geometry; + + return 0; +} + +/* + * Set the geometry of a device. + */ +int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) +{ + sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; + + if (geo->start > sz) { + DMWARN("Start sector is beyond the geometry limits."); + return -EINVAL; + } + + md->geometry = *geo; + + return 0; +} + /*----------------------------------------------------------------- * CRUD START: * A more elegant soln is in the works that uses the queue @@ -906,6 +944,13 @@ static int __bind(struct mapped_device *md, struct dm_table *t) sector_t size; size = dm_table_get_size(t); + + /* + * Wipe any geometry if the size of the table changed. + */ + if (size != get_capacity(md->disk)) + memset(&md->geometry, 0, sizeof(md->geometry)); + __set_size(md, size); if (size == 0) return 0; @@ -1261,6 +1306,7 @@ int dm_suspended(struct mapped_device *md) static struct block_device_operations dm_blk_dops = { .open = dm_blk_open, .release = dm_blk_close, + .getgeo = dm_blk_getgeo, .owner = THIS_MODULE }; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 79e800051a10..fd90bc8f9e45 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -14,6 +14,7 @@ #include #include #include +#include #define DM_NAME "device-mapper" #define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x) @@ -85,6 +86,12 @@ int dm_wait_event(struct mapped_device *md, int event_nr); struct gendisk *dm_disk(struct mapped_device *md); int dm_suspended(struct mapped_device *md); +/* + * Geometry functions. + */ +int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo); +int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo); + /*----------------------------------------------------------------- * Functions for manipulating a table. Tables are also reference * counted. 
diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h index efb518f16bb3..89ab677cb993 100644 --- a/include/linux/compat_ioctl.h +++ b/include/linux/compat_ioctl.h @@ -140,6 +140,7 @@ COMPATIBLE_IOCTL(DM_TABLE_DEPS_32) COMPATIBLE_IOCTL(DM_TABLE_STATUS_32) COMPATIBLE_IOCTL(DM_LIST_VERSIONS_32) COMPATIBLE_IOCTL(DM_TARGET_MSG_32) +COMPATIBLE_IOCTL(DM_DEV_SET_GEOMETRY_32) COMPATIBLE_IOCTL(DM_VERSION) COMPATIBLE_IOCTL(DM_REMOVE_ALL) COMPATIBLE_IOCTL(DM_LIST_DEVICES) @@ -155,6 +156,7 @@ COMPATIBLE_IOCTL(DM_TABLE_DEPS) COMPATIBLE_IOCTL(DM_TABLE_STATUS) COMPATIBLE_IOCTL(DM_LIST_VERSIONS) COMPATIBLE_IOCTL(DM_TARGET_MSG) +COMPATIBLE_IOCTL(DM_DEV_SET_GEOMETRY) /* Big K */ COMPATIBLE_IOCTL(PIO_FONT) COMPATIBLE_IOCTL(GIO_FONT) diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index fa75ba0d635e..c67c6786612a 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -80,6 +80,16 @@ * * DM_TARGET_MSG: * Pass a message string to the target at a specific offset of a device. + * + * DM_DEV_SET_GEOMETRY: + * Set the geometry of a device by passing in a string in this format: + * + * "cylinders heads sectors_per_track start_sector" + * + * Beware that CHS geometry is nearly obsolete and only provided + * for compatibility with dm devices that can be booted by a PC + * BIOS. See struct hd_geometry for range limits. Also note that + * the geometry is erased if the device size changes. */ /* @@ -218,6 +228,7 @@ enum { /* Added later */ DM_LIST_VERSIONS_CMD, DM_TARGET_MSG_CMD, + DM_DEV_SET_GEOMETRY_CMD }; /* @@ -247,6 +258,7 @@ typedef char ioctl_struct[308]; #define DM_TABLE_STATUS_32 _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, ioctl_struct) #define DM_LIST_VERSIONS_32 _IOWR(DM_IOCTL, DM_LIST_VERSIONS_CMD, ioctl_struct) #define DM_TARGET_MSG_32 _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, ioctl_struct) +#define DM_DEV_SET_GEOMETRY_32 _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, ioctl_struct) #endif #define DM_IOCTL 0xfd @@ -270,11 +282,12 @@ typedef char ioctl_struct[308]; #define DM_LIST_VERSIONS _IOWR(DM_IOCTL, DM_LIST_VERSIONS_CMD, struct dm_ioctl) #define DM_TARGET_MSG _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, struct dm_ioctl) +#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 5 +#define DM_VERSION_MINOR 6 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2005-10-04)" +#define DM_VERSION_EXTRA "-ioctl (2006-02-17)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.2.3 From 5463c7904c952aa6b6804dd902c72a5332fa5221 Mon Sep 17 00:00:00 2001 From: Jun'ichi Nomura Date: Mon, 27 Mar 2006 01:17:58 -0800 Subject: [PATCH] dm/md dependency tree in sysfs: md to use bd_claim_by_disk Use bd_claim_by_disk. 
Following symlinks are created if md0 is built from sda and sdb /sys/block/md0/slaves/sda --> /sys/block/sda /sys/block/md0/slaves/sdb --> /sys/block/sdb /sys/block/sda/holders/md0 --> /sys/block/md0 /sys/block/sdb/holders/md0 --> /sys/block/md0 Signed-off-by: Jun'ichi Nomura Cc: Alasdair G Kergon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 5ed2228745cb..bde3e9682250 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1301,6 +1301,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) else ko = &rdev->bdev->bd_disk->kobj; sysfs_create_link(&rdev->kobj, ko, "block"); + bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk); return 0; } @@ -1311,6 +1312,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) MD_BUG(); return; } + bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); list_del_init(&rdev->same_set); printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); rdev->mddev = NULL; -- cgit v1.2.3 From f165921df46a977e3561f1bd9f13a348441486d1 Mon Sep 17 00:00:00 2001 From: Jun'ichi Nomura Date: Mon, 27 Mar 2006 01:17:59 -0800 Subject: [PATCH] dm/md dependency tree in sysfs: dm to use bd_claim_by_disk Use bd_claim_by_disk. Following symlinks are created if dm-0 maps to sda: /sys/block/dm-0/slaves/sda --> /sys/block/sda /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 Signed-off-by: Jun'ichi Nomura Cc: Alasdair G Kergon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/dm-table.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 84f4594afd3c..76610a6ac01c 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -350,7 +350,7 @@ static struct dm_dev *find_device(struct list_head *l, dev_t dev) /* * Open a device so we can use it as a map destination. */ -static int open_dev(struct dm_dev *d, dev_t dev) +static int open_dev(struct dm_dev *d, dev_t dev, struct mapped_device *md) { static char *_claim_ptr = "I belong to device-mapper"; struct block_device *bdev; @@ -362,7 +362,7 @@ static int open_dev(struct dm_dev *d, dev_t dev) bdev = open_by_devnum(dev, d->mode); if (IS_ERR(bdev)) return PTR_ERR(bdev); - r = bd_claim(bdev, _claim_ptr); + r = bd_claim_by_disk(bdev, _claim_ptr, dm_disk(md)); if (r) blkdev_put(bdev); else @@ -373,12 +373,12 @@ static int open_dev(struct dm_dev *d, dev_t dev) /* * Close a device that we've been using. */ -static void close_dev(struct dm_dev *d) +static void close_dev(struct dm_dev *d, struct mapped_device *md) { if (!d->bdev) return; - bd_release(d->bdev); + bd_release_from_disk(d->bdev, dm_disk(md)); blkdev_put(d->bdev); d->bdev = NULL; } @@ -399,7 +399,7 @@ static int check_device_area(struct dm_dev *dd, sector_t start, sector_t len) * careful to leave things as they were if we fail to reopen the * device. 
*/ -static int upgrade_mode(struct dm_dev *dd, int new_mode) +static int upgrade_mode(struct dm_dev *dd, int new_mode, struct mapped_device *md) { int r; struct dm_dev dd_copy; @@ -409,9 +409,9 @@ static int upgrade_mode(struct dm_dev *dd, int new_mode) dd->mode |= new_mode; dd->bdev = NULL; - r = open_dev(dd, dev); + r = open_dev(dd, dev, md); if (!r) - close_dev(&dd_copy); + close_dev(&dd_copy, md); else *dd = dd_copy; @@ -453,7 +453,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, dd->mode = mode; dd->bdev = NULL; - if ((r = open_dev(dd, dev))) { + if ((r = open_dev(dd, dev, t->md))) { kfree(dd); return r; } @@ -464,7 +464,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, list_add(&dd->list, &t->devices); } else if (dd->mode != (mode | dd->mode)) { - r = upgrade_mode(dd, mode); + r = upgrade_mode(dd, mode, t->md); if (r) return r; } @@ -541,7 +541,7 @@ int dm_get_device(struct dm_target *ti, const char *path, sector_t start, void dm_put_device(struct dm_target *ti, struct dm_dev *dd) { if (atomic_dec_and_test(&dd->count)) { - close_dev(dd); + close_dev(dd, ti->table->md); list_del(&dd->list); kfree(dd); } -- cgit v1.2.3 From a22c96c737a9cefbe8d6e991c0032ad6db825a67 Mon Sep 17 00:00:00 2001 From: Kevin Corry Date: Mon, 27 Mar 2006 01:18:01 -0800 Subject: [PATCH] dm: remove unnecessary typecast Signed-off-by: Kevin Corry Cc: Alasdair G Kergon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/dm-stripe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 204f796ee9e3..08328a8f5a3c 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -103,7 +103,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -EINVAL; } - if (((uint32_t)ti->len) & (chunk_size - 1)) { + if (ti->len & (chunk_size - 1)) { ti->error = "dm-stripe: Target length not divisible by " "chunk size"; return -EINVAL; -- cgit v1.2.3 From 89e5c8b5b85d6d46e8a28cdfa076313ae691d35c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:02 -0800 Subject: [PATCH] md: Make sure QUEUE_FLAG_CLUSTER is set properly for md. This flag should be set for a virtual device iff it is set for all underlying devices. 
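On the md side the same rule runs in the opposite direction: the queue starts with the flag set, and blk_queue_stack_limits() clears it as soon as one member queue lacks it, as the diff below shows. A small userspace model of those bit operations (illustrative only):

#include <stdio.h>

#define QUEUE_FLAG_CLUSTER 0    /* illustrative bit number */

static int test_bit(int nr, const unsigned long *w) { return (*w >> nr) & 1UL; }
static void set_bit(int nr, unsigned long *w)       { *w |= 1UL << nr; }
static void clear_bit(int nr, unsigned long *w)     { *w &= ~(1UL << nr); }

int main(void)
{
    unsigned long md_flags = 0, member_flags[2] = { 0, 0 };

    set_bit(QUEUE_FLAG_CLUSTER, &md_flags);          /* optimistic default */
    set_bit(QUEUE_FLAG_CLUSTER, &member_flags[0]);   /* only one member has it */

    /* stacking the limits clears the flag at the first member without it */
    for (int i = 0; i < 2; i++)
        if (!test_bit(QUEUE_FLAG_CLUSTER, &member_flags[i]))
            clear_bit(QUEUE_FLAG_CLUSTER, &md_flags);

    printf("md QUEUE_FLAG_CLUSTER: %d\n",
           test_bit(QUEUE_FLAG_CLUSTER, &md_flags));    /* prints 0 */
    return 0;
}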
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/ll_rw_blk.c | 2 ++ drivers/md/md.c | 1 + 2 files changed, 3 insertions(+) (limited to 'drivers/md') diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index 062067fa7ead..a2e333ad0b64 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -785,6 +785,8 @@ void blk_queue_stack_limits(request_queue_t *t, request_queue_t *b) t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments); t->max_segment_size = min(t->max_segment_size,b->max_segment_size); t->hardsect_size = max(t->hardsect_size,b->hardsect_size); + if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) + clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags); } EXPORT_SYMBOL(blk_queue_stack_limits); diff --git a/drivers/md/md.c b/drivers/md/md.c index bde3e9682250..3254ff1a5cc9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -266,6 +266,7 @@ static mddev_t * mddev_find(dev_t unit) kfree(new); return NULL; } + set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags); blk_queue_make_request(new->queue, md_fail_request); -- cgit v1.2.3 From c5a10f62c5c496c49db749af103b991873b7e2dc Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:03 -0800 Subject: [PATCH] md: Add '4' to the list of levels for which bitmaps are supported I really should make this a function of the personality.... Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 3254ff1a5cc9..875bced88e36 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -765,7 +765,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && mddev->bitmap_file == NULL) { - if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6 + if (mddev->level != 1 && mddev->level != 4 + && mddev->level != 5 && mddev->level != 6 && mddev->level != 10) { /* FIXME use a better test */ printk(KERN_WARNING "md: bitmaps not supported for this level.\n"); -- cgit v1.2.3 From 1be7892fffb45f6017494a88ff68fe84c6de26b4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:03 -0800 Subject: [PATCH] md: Fix the 'failed' count for version-0 superblocks We are counting failed devices twice: once for the device that has failed, and once for the hole that has been left in the array. Remove the former so 'failed' matches 'missing'. Storing these counts in the superblock is a bit silly anyway.... Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 875bced88e36..686314f070a5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -895,10 +895,9 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) d->raid_disk = rdev2->raid_disk; else d->raid_disk = rdev2->desc_nr; /* compatibility */ - if (test_bit(Faulty, &rdev2->flags)) { + if (test_bit(Faulty, &rdev2->flags)) d->state = (1<<MD_DISK_FAULTY); - failed++; - } else if (test_bit(In_sync, &rdev2->flags)) { + else if (test_bit(In_sync, &rdev2->flags)) { d->state = (1<<MD_DISK_ACTIVE); d->state |= (1<<MD_DISK_SYNC); active++; -- cgit v1.2.3 From Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:04 -0800 Subject: [PATCH] md: Update status_resync to handle LARGE devices status_resync, used by /proc/mdstat to report the status of a resync, assumes that device sizes will always fit into an 'unsigned long'. This is no longer the case...
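The replacement arithmetic in the diff below avoids the overflow by scaling before dividing: shift both values right until the divisor fits in 32 bits (sector_div()'s requirement), then compute tenths of a percent. A userspace model of the same computation, with stdint types standing in for sector_t:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t max_blocks = 3ULL << 40;    /* a very large device */
    uint64_t resync = max_blocks / 3;    /* one third done */
    int scale = 10;                      /* (x>>10)*1000 cannot overflow x */

    while (max_blocks / 2 > (1ULL << (scale + 32)))
        scale++;                         /* until max_blocks>>scale fits u32 */

    uint64_t res = (resync >> scale) * 1000;
    unsigned per_milli = (unsigned)(res / (uint32_t)((max_blocks >> scale) + 1));

    printf("progress: %u.%u%%\n", per_milli / 10, per_milli % 10);  /* 33.3% */
    return 0;
}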
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 686314f070a5..a3ecaf8ed30a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4044,7 +4044,10 @@ static void status_unused(struct seq_file *seq) static void status_resync(struct seq_file *seq, mddev_t * mddev) { - unsigned long max_blocks, resync, res, dt, db, rt; + sector_t max_blocks, resync, res; + unsigned long dt, db, rt; + int scale; + unsigned int per_milli; resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; @@ -4060,9 +4063,22 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) MD_BUG(); return; } - res = (resync/1024)*1000/(max_blocks/1024 + 1); + /* Pick 'scale' such that (resync>>scale)*1000 will fit + * in a sector_t, and (max_blocks>>scale) will fit in a + * u32, as those are the requirements for sector_div. + * Thus 'scale' must be at least 10 + */ + scale = 10; + if (sizeof(sector_t) > sizeof(unsigned long)) { + while ( max_blocks/2 > (1ULL<<(scale+32))) + scale++; + } + res = (resync>>scale)*1000; + sector_div(res, (u32)((max_blocks>>scale)+1)); + + per_milli = res; { - int i, x = res/50, y = 20-x; + int i, x = per_milli/50, y = 20-x; seq_printf(seq, "["); for (i = 0; i < x; i++) seq_printf(seq, "="); @@ -4071,10 +4087,12 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) seq_printf(seq, "."); seq_printf(seq, "] "); } - seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)", + seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? "resync" : "recovery"), - res/10, res % 10, resync, max_blocks); + per_milli/10, per_milli % 10, + (unsigned long long) resync, + (unsigned long long) max_blocks); /* * We do not want to overflow, so the order of operands and @@ -4088,7 +4106,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) dt = ((jiffies - mddev->resync_mark) / HZ); if (!dt) dt++; db = resync - (mddev->resync_mark_cnt/2); - rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; + rt = (dt * ((unsigned long)(max_blocks-resync) / (db/100+1)))/100; seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); -- cgit v1.2.3 From b55e6bfcd23cb2f7249095050c649f7aea813f9f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:06 -0800 Subject: [PATCH] md: Split disks array out of raid5 conf structure so it is easier to grow The remainder of this batch implements raid5 reshaping. Currently the only shape change that is supported is adding a device, but it is envisioned that changing the chunksize and layout will also be supported, as well as changing the level (e.g. 1->5, 5->6). The reshape process naturally has to move all of the data in the array, and so should be used with caution. It is believed to work, and some testing does support this, but wider testing would be great for increasing my confidence. You will need a version of mdadm newer than 2.3.1 to make use of raid5 growth. This is because mdadm needs to take a copy of a 'critical section' at the start of the array in case there is a crash at an awkward moment. On restart, mdadm will restore the critical section and allow reshape to continue. I hope to release a 2.4-pre by early next week - it still needs a little more polishing.
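As a back-of-the-envelope illustration of why that critical section sits at the very start of the array, consider the boundary arithmetic (ours, though it mirrors the here_new/here_old check a later patch in this series adds to run()): while the stripe being written in the new geometry has not yet passed the stripe still to be read in the old geometry, live data would be overwritten.

    #include <stdio.h>

    int main(void)
    {
            unsigned long chunk = 128;           /* 64K chunks, in sectors */
            int old_disks = 4, new_disks = 5;    /* growing 4 -> 5 drives */
            unsigned long pos, here_new, here_old;
            int i;

            for (i = 0; i < 6; i++) {
                    pos = i * chunk * (new_disks - 1);  /* sectors reshaped */
                    here_new = pos / (chunk * (new_disks - 1)); /* writing */
                    here_old = pos / (chunk * (old_disks - 1)); /* reading */
                    printf("write stripe %lu, oldest unread stripe %lu%s\n",
                           here_new, here_old,
                           here_new >= here_old ? "  <- overlap: critical" : "");
            }
            return 0;
    }

Only the first few stripes overlap; once here_old pulls ahead, an interrupted reshape no longer destroys data that has not yet been read.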
This patch: Previously the array of disk information was included in the raid5 'conf' structure which was allocated to an appropriate size. This makes it awkward to change the size of that array. So we split it off into a separate kmalloced array which will require a little extra indexing, but is much easier to grow. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 10 +++++++--- drivers/md/raid6main.c | 10 +++++++--- include/linux/raid/raid5.h | 2 +- 3 files changed, 15 insertions(+), 7 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2dba305daf3c..03f31379cebb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1822,11 +1822,13 @@ static int run(mddev_t *mddev) return -EIO; } - mddev->private = kzalloc(sizeof (raid5_conf_t) - + mddev->raid_disks * sizeof(struct disk_info), - GFP_KERNEL); + mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL); if ((conf = mddev->private) == NULL) goto abort; + conf->disks = kzalloc(mddev->raid_disks * sizeof(struct disk_info), + GFP_KERNEL); + if (!conf->disks) + goto abort; conf->mddev = mddev; @@ -1966,6 +1968,7 @@ static int run(mddev_t *mddev) abort: if (conf) { print_raid5_conf(conf); + kfree(conf->disks); kfree(conf->stripe_hashtbl); kfree(conf); } @@ -1986,6 +1989,7 @@ static int stop(mddev_t *mddev) kfree(conf->stripe_hashtbl); blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); + kfree(conf->disks); kfree(conf); mddev->private = NULL; return 0; diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index cd477ebf2ee4..c7632f6cc487 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -2006,11 +2006,14 @@ static int run(mddev_t *mddev) return -EIO; } - mddev->private = kzalloc(sizeof (raid6_conf_t) - + mddev->raid_disks * sizeof(struct disk_info), - GFP_KERNEL); + mddev->private = kzalloc(sizeof (raid6_conf_t), GFP_KERNEL); if ((conf = mddev->private) == NULL) goto abort; + conf->disks = kzalloc(mddev->raid_disks * sizeof(struct disk_info), + GFP_KERNEL); + if (!conf->disks) + goto abort; + conf->mddev = mddev; if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) @@ -2158,6 +2161,7 @@ abort: print_raid6_conf(conf); safe_put_page(conf->spare_page); kfree(conf->stripe_hashtbl); + kfree(conf->disks); kfree(conf); } mddev->private = NULL; diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 394da8207b34..94dbdd406f12 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -240,7 +240,7 @@ struct raid5_private_data { * waiting for 25% to be free */ spinlock_t device_lock; - struct disk_info disks[0]; + struct disk_info *disks; }; typedef struct raid5_private_data raid5_conf_t; -- cgit v1.2.3 From ad01c9e3752f4ba4f3d99c89b7370fa4983a25b5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:07 -0800 Subject: [PATCH] md: Allow stripes to be expanded in preparation for expanding an array Before a RAID-5 can be expanded, we need to be able to expand the stripe-cache data structure. This requires allocating new stripes in a new kmem_cache. If this succeeds, we copy cache pages over and release the old stripes and kmem_cache. We then allocate new pages. If that fails, we leave the stripe cache at its new size. It isn't worth the effort to shrink it back again. Unfortunately this means we need two kmem_cache names as, for a short period of time, we have two kmem_caches.
So they are raid5/%s and raid5/%s-alt Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 2 +- drivers/md/raid5.c | 131 +++++++++++++++++++++++++++++++++++++++++++-- drivers/md/raid6main.c | 4 +- include/linux/raid/raid5.h | 9 +++- 4 files changed, 137 insertions(+), 9 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index a3ecaf8ed30a..c7b7656f9aa5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2775,7 +2775,6 @@ static void autorun_array(mddev_t *mddev) */ static void autorun_devices(int part) { - struct list_head candidates; struct list_head *tmp; mdk_rdev_t *rdev0, *rdev; mddev_t *mddev; @@ -2784,6 +2783,7 @@ static void autorun_devices(int part) printk(KERN_INFO "md: autorun ...\n"); while (!list_empty(&pending_raid_disks)) { dev_t dev; + LIST_HEAD(candidates); rdev0 = list_entry(pending_raid_disks.next, mdk_rdev_t, same_set); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 03f31379cebb..6c20b44509d8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -313,20 +313,143 @@ static int grow_stripes(raid5_conf_t *conf, int num) kmem_cache_t *sc; int devs = conf->raid_disks; - sprintf(conf->cache_name, "raid5/%s", mdname(conf->mddev)); - - sc = kmem_cache_create(conf->cache_name, + sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev)); + sprintf(conf->cache_name[1], "raid5/%s-alt", mdname(conf->mddev)); + conf->active_name = 0; + sc = kmem_cache_create(conf->cache_name[conf->active_name], sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 0, 0, NULL, NULL); if (!sc) return 1; conf->slab_cache = sc; + conf->pool_size = devs; while (num--) { if (!grow_one_stripe(conf)) return 1; } return 0; } +static int resize_stripes(raid5_conf_t *conf, int newsize) +{ + /* Make all the stripes able to hold 'newsize' devices. + * New slots in each stripe get 'page' set to a new page. + * + * This happens in stages: + * 1/ create a new kmem_cache and allocate the required number of + * stripe_heads. + * 2/ gather all the old stripe_heads and transfer the pages across + * to the new stripe_heads. This will have the side effect of + * freezing the array as once all stripe_heads have been collected, + * no IO will be possible. Old stripe heads are freed once their + * pages have been transferred over, and the old kmem_cache is + * freed when all stripes are done. + * 3/ reallocate conf->disks to be suitably bigger. If this fails, + * we simply return a failure status - no need to clean anything up. + * 4/ allocate new pages for the new slots in the new stripe_heads. + * If this fails, we don't bother trying to shrink the + * stripe_heads down again, we just leave them as they are. + * As each stripe_head is processed the new one is released into + * active service. + * + * Once step 2 is started, we cannot afford to wait for a write, + * so we use GFP_NOIO allocations.
+ */ + struct stripe_head *osh, *nsh; + LIST_HEAD(newstripes); + struct disk_info *ndisks; + int err = 0; + kmem_cache_t *sc; + int i; + + if (newsize <= conf->pool_size) + return 0; /* never bother to shrink */ + + /* Step 1 */ + sc = kmem_cache_create(conf->cache_name[1-conf->active_name], + sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), + 0, 0, NULL, NULL); + if (!sc) + return -ENOMEM; + + for (i = conf->max_nr_stripes; i; i--) { + nsh = kmem_cache_alloc(sc, GFP_KERNEL); + if (!nsh) + break; + + memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev)); + + nsh->raid_conf = conf; + spin_lock_init(&nsh->lock); + + list_add(&nsh->lru, &newstripes); + } + if (i) { + /* didn't get enough, give up */ + while (!list_empty(&newstripes)) { + nsh = list_entry(newstripes.next, struct stripe_head, lru); + list_del(&nsh->lru); + kmem_cache_free(sc, nsh); + } + kmem_cache_destroy(sc); + return -ENOMEM; + } + /* Step 2 - Must use GFP_NOIO now. + * OK, we have enough stripes, start collecting inactive + * stripes and copying them over + */ + list_for_each_entry(nsh, &newstripes, lru) { + spin_lock_irq(&conf->device_lock); + wait_event_lock_irq(conf->wait_for_stripe, + !list_empty(&conf->inactive_list), + conf->device_lock, + unplug_slaves(conf->mddev); + ); + osh = get_free_stripe(conf); + spin_unlock_irq(&conf->device_lock); + atomic_set(&nsh->count, 1); + for(i=0; i<conf->pool_size; i++) + nsh->dev[i].page = osh->dev[i].page; + for( ; i<newsize; i++) + nsh->dev[i].page = NULL; + kmem_cache_free(conf->slab_cache, osh); + } + kmem_cache_destroy(conf->slab_cache); + + /* Step 3. + * At this point, we are holding all the stripes so the array + * is completely stalled, so now is a good time to resize + * conf->disks. + */ + ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); + if (ndisks) { + for (i=0; i<conf->raid_disks; i++) + ndisks[i] = conf->disks[i]; + kfree(conf->disks); + conf->disks = ndisks; + } else + err = -ENOMEM; + + /* Step 4, return new stripes to service */ + while(!list_empty(&newstripes)) { + nsh = list_entry(newstripes.next, struct stripe_head, lru); + list_del_init(&nsh->lru); + for (i=conf->raid_disks; i < newsize; i++) + if (nsh->dev[i].page == NULL) { + struct page *p = alloc_page(GFP_NOIO); + nsh->dev[i].page = p; + if (!p) + err = -ENOMEM; + } + release_stripe(nsh); + } + /* critical section passed, GFP_NOIO no longer needed */ + + conf->slab_cache = sc; + conf->active_name = 1-conf->active_name; + conf->pool_size = newsize; + return err; +} + static int drop_one_stripe(raid5_conf_t *conf) { @@ -339,7 +462,7 @@ static int drop_one_stripe(raid5_conf_t *conf) return 0; if (atomic_read(&sh->count)) BUG(); - shrink_buffers(sh, conf->raid_disks); + shrink_buffers(sh, conf->pool_size); kmem_cache_free(conf->slab_cache, sh); atomic_dec(&conf->active_stripes); return 1; diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index c7632f6cc487..6df4930fddec 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -331,9 +331,9 @@ static int grow_stripes(raid6_conf_t *conf, int num) kmem_cache_t *sc; int devs = conf->raid_disks; - sprintf(conf->cache_name, "raid6/%s", mdname(conf->mddev)); + sprintf(conf->cache_name[0], "raid6/%s", mdname(conf->mddev)); - sc = kmem_cache_create(conf->cache_name, + sc = kmem_cache_create(conf->cache_name[0], sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 0, 0, NULL, NULL); if (!sc) diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 94dbdd406f12..b7b2653af7bb 100644 --- a/include/linux/raid/raid5.h +++
b/include/linux/raid/raid5.h @@ -216,7 +216,11 @@ struct raid5_private_data { struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ atomic_t preread_active_stripes; /* stripes with scheduled io */ - char cache_name[20]; + /* unfortunately we need two cache names as we temporarily have + * two caches. + */ + int active_name; + char cache_name[2][20]; kmem_cache_t *slab_cache; /* for allocating stripes */ int seq_flush, seq_write; @@ -238,7 +242,8 @@ struct raid5_private_data { wait_queue_head_t wait_for_overlap; int inactive_blocked; /* release of inactive stripes blocked, * waiting for 25% to be free - */ + */ + int pool_size; /* number of disks in stripeheads in pool */ spinlock_t device_lock; struct disk_info *disks; }; -- cgit v1.2.3 From 7ecaa1e6a1ad69862e9980b6c777e11f26c4782d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:08 -0800 Subject: [PATCH] md: Infrastructure to allow normal IO to continue while array is expanding We need to allow different stripes to have different effective sizes, and use the appropriate size. Also, when a stripe is being expanded, we must block any IO attempts until the stripe is stable again. Key elements in this change are: - each stripe_head gets a 'disks' field which is part of the key, thus there can sometimes be two stripe heads of the same area of the array, but covering different numbers of devices. One of these will be marked STRIPE_EXPANDING and so won't accept new requests. - conf->expand_progress tracks how the expansion is progressing and is used to determine whether the target part of the array has been expanded yet or not. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 88 ++++++++++++++++++++++++++++---------------- include/linux/raid/raid5.h | 6 ++++ 2 files changed, 64 insertions(+), 30 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6c20b44509d8..7a6df515b008 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -178,10 +178,10 @@ static int grow_buffers(struct stripe_head *sh, int num) static void raid5_build_block (struct stripe_head *sh, int i); -static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx) +static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks) { raid5_conf_t *conf = sh->raid_conf; - int disks = conf->raid_disks, i; + int i; if (atomic_read(&sh->count) != 0) BUG(); @@ -198,7 +198,9 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx) sh->pd_idx = pd_idx; sh->state = 0; - for (i=disks; i--; ) { + sh->disks = disks; + + for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (dev->toread || dev->towrite || dev->written || @@ -215,7 +217,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx) insert_hash(conf, sh); } -static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector) +static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks) { struct stripe_head *sh; struct hlist_node *hn; @@ -223,7 +225,7 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector) CHECK_DEVLOCK(); PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector); hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) - if (sh->sector == sector) + if (sh->sector == sector && sh->disks == disks) return sh; PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector); return NULL; @@ -232,8
+234,8 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector) static void unplug_slaves(mddev_t *mddev); static void raid5_unplug_device(request_queue_t *q); -static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, - int pd_idx, int noblock) +static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks, + int pd_idx, int noblock) { struct stripe_head *sh; @@ -245,7 +247,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector wait_event_lock_irq(conf->wait_for_stripe, conf->quiesce == 0, conf->device_lock, /* nothing */); - sh = __find_stripe(conf, sector); + sh = __find_stripe(conf, sector, disks); if (!sh) { if (!conf->inactive_blocked) sh = get_free_stripe(conf); @@ -263,7 +265,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector ); conf->inactive_blocked = 0; } else - init_stripe(sh, sector, pd_idx); + init_stripe(sh, sector, pd_idx, disks); } else { if (atomic_read(&sh->count)) { if (!list_empty(&sh->lru)) @@ -300,6 +302,7 @@ static int grow_one_stripe(raid5_conf_t *conf) kmem_cache_free(conf->slab_cache, sh); return 0; } + sh->disks = conf->raid_disks; /* we just created an active stripe so... */ atomic_set(&sh->count, 1); atomic_inc(&conf->active_stripes); @@ -483,7 +486,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done, { struct stripe_head *sh = bi->bi_private; raid5_conf_t *conf = sh->raid_conf; - int disks = conf->raid_disks, i; + int disks = sh->disks, i; int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); if (bi->bi_size) @@ -581,7 +584,7 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done, { struct stripe_head *sh = bi->bi_private; raid5_conf_t *conf = sh->raid_conf; - int disks = conf->raid_disks, i; + int disks = sh->disks, i; unsigned long flags; int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); @@ -735,7 +738,7 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, static sector_t compute_blocknr(struct stripe_head *sh, int i) { raid5_conf_t *conf = sh->raid_conf; - int raid_disks = conf->raid_disks, data_disks = raid_disks - 1; + int raid_disks = sh->disks, data_disks = raid_disks - 1; sector_t new_sector = sh->sector, check; int sectors_per_chunk = conf->chunk_size >> 9; sector_t stripe; @@ -836,8 +839,7 @@ static void copy_data(int frombio, struct bio *bio, static void compute_block(struct stripe_head *sh, int dd_idx) { - raid5_conf_t *conf = sh->raid_conf; - int i, count, disks = conf->raid_disks; + int i, count, disks = sh->disks; void *ptr[MAX_XOR_BLOCKS], *p; PRINTK("compute_block, stripe %llu, idx %d\n", @@ -867,7 +869,7 @@ static void compute_block(struct stripe_head *sh, int dd_idx) static void compute_parity(struct stripe_head *sh, int method) { raid5_conf_t *conf = sh->raid_conf; - int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count; + int i, pd_idx = sh->pd_idx, disks = sh->disks, count; void *ptr[MAX_XOR_BLOCKS]; struct bio *chosen; @@ -1055,7 +1057,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in static void handle_stripe(struct stripe_head *sh) { raid5_conf_t *conf = sh->raid_conf; - int disks = conf->raid_disks; + int disks = sh->disks; struct bio *return_bi= NULL; struct bio *bi; int i; @@ -1649,12 +1651,10 @@ static inline void raid5_plug_device(raid5_conf_t *conf) spin_unlock_irq(&conf->device_lock); } -static int make_request (request_queue_t *q, struct bio * bi) +static int 
make_request(request_queue_t *q, struct bio * bi) { mddev_t *mddev = q->queuedata; raid5_conf_t *conf = mddev_to_conf(mddev); - const unsigned int raid_disks = conf->raid_disks; - const unsigned int data_disks = raid_disks - 1; unsigned int dd_idx, pd_idx; sector_t new_sector; sector_t logical_sector, last_sector; @@ -1678,20 +1678,48 @@ static int make_request (request_queue_t *q, struct bio * bi) for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { DEFINE_WAIT(w); + int disks; - new_sector = raid5_compute_sector(logical_sector, - raid_disks, data_disks, &dd_idx, &pd_idx, conf); - + retry: + if (likely(conf->expand_progress == MaxSector)) + disks = conf->raid_disks; + else { + spin_lock_irq(&conf->device_lock); + disks = conf->raid_disks; + if (logical_sector >= conf->expand_progress) + disks = conf->previous_raid_disks; + spin_unlock_irq(&conf->device_lock); + } + new_sector = raid5_compute_sector(logical_sector, disks, disks - 1, + &dd_idx, &pd_idx, conf); PRINTK("raid5: make_request, sector %llu logical %llu\n", (unsigned long long)new_sector, (unsigned long long)logical_sector); - retry: prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); - sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK)); + sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK)); if (sh) { - if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { - /* Add failed due to overlap. Flush everything + if (unlikely(conf->expand_progress != MaxSector)) { + /* expansion might have moved on while waiting for a + * stripe, so we must do the range check again. + */ + int must_retry = 0; + spin_lock_irq(&conf->device_lock); + if (logical_sector < conf->expand_progress && + disks == conf->previous_raid_disks) + /* mismatch, need to try again */ + must_retry = 1; + spin_unlock_irq(&conf->device_lock); + if (must_retry) { + release_stripe(sh); + goto retry; + } + } + + if (test_bit(STRIPE_EXPANDING, &sh->state) || + !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { + /* Stripe is busy expanding or + * add failed due to overlap.
Flush everything * and wait a while */ raid5_unplug_device(mddev->queue); @@ -1703,7 +1731,6 @@ static int make_request (request_queue_t *q, struct bio * bi) raid5_plug_device(conf); handle_stripe(sh); release_stripe(sh); - } else { /* cannot get stripe for read-ahead, just give-up */ clear_bit(BIO_UPTODATE, &bi->bi_flags); @@ -1779,9 +1806,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i first_sector = raid5_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf); - sh = get_active_stripe(conf, sector_nr, pd_idx, 1); + sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1); if (sh == NULL) { - sh = get_active_stripe(conf, sector_nr, pd_idx, 0); + sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0); /* make sure we don't swamp the stripe cache if someone else * is trying to get access */ @@ -1998,6 +2025,7 @@ static int run(mddev_t *mddev) conf->level = mddev->level; conf->algorithm = mddev->layout; conf->max_nr_stripes = NR_STRIPES; + conf->expand_progress = MaxSector; /* device size must be a multiple of chunk size */ mddev->size &= ~(mddev->chunk_size/1024 -1); @@ -2128,7 +2156,7 @@ static void print_sh (struct stripe_head *sh) printk("sh %llu, count %d.\n", (unsigned long long)sh->sector, atomic_read(&sh->count)); printk("sh %llu, ", (unsigned long long)sh->sector); - for (i = 0; i < sh->raid_conf->raid_disks; i++) { + for (i = 0; i < sh->disks; i++) { printk("(cache%d: %p %ld) ", i, sh->dev[i].page, sh->dev[i].flags); } diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index b7b2653af7bb..6fa274aea2a0 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -135,6 +135,7 @@ struct stripe_head { atomic_t count; /* nr of active thread/requests */ spinlock_t lock; int bm_seq; /* sequence number for bitmap flushes */ + int disks; /* disks in stripe */ struct r5dev { struct bio req; struct bio_vec vec; @@ -174,6 +175,7 @@ struct stripe_head { #define STRIPE_DELAYED 6 #define STRIPE_DEGRADED 7 #define STRIPE_BIT_DELAY 8 +#define STRIPE_EXPANDING 9 /* * Plugging: @@ -211,6 +213,10 @@ struct raid5_private_data { int raid_disks, working_disks, failed_disks; int max_nr_stripes; + /* used during an expand */ + sector_t expand_progress; /* MaxSector when no expand happening */ + int previous_raid_disks; + struct list_head handle_list; /* stripes needing handling */ struct list_head delayed_list; /* stripes that have plugged requests */ struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ -- cgit v1.2.3 From ccfcc3c10b2a5cb8fd3c918199a4ff904fc6fb3e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:09 -0800 Subject: [PATCH] md: Core of raid5 resize process This patch provides the core of the resize/expand process. sync_request notices if a 'reshape' is happening and acts accordingly. It allocates new stripe_heads for the next chunk-wide-stripe in the target geometry, marking them STRIPE_EXPANDING. Then it finds which stripe heads in the old geometry can provide data needed by these and marks them STRIPE_EXPAND_SOURCE. This causes handle_stripe to read all blocks on those stripes. Once all blocks on a STRIPE_EXPAND_SOURCE stripe_head are read, any that are needed are copied into the corresponding STRIPE_EXPANDING stripe_head. Once a STRIPE_EXPANDING stripe_head is full, it is marked STRIPE_EXPAND_READY and then is written out and released.
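A condensed user-space model of that copy-completion rule may help (toy_-prefixed names are ours; in the kernel this is the R5_Expanded and STRIPE_EXPAND_READY bookkeeping inside handle_stripe()):

    #include <stdio.h>
    #include <string.h>

    #define TOY_DISKS  5
    #define TOY_PD_IDX 4                      /* parity slot, never copied */

    struct toy_stripe {
            int expanded[TOY_DISKS];          /* like R5_Expanded per device */
            int expand_ready;                 /* like STRIPE_EXPAND_READY */
    };

    /* A source stripe delivers one block into the destination stripe; once
     * every data slot has been filled, the stripe can be written out. */
    static void toy_deliver(struct toy_stripe *sh, int dd_idx)
    {
            int j;

            sh->expanded[dd_idx] = 1;
            for (j = 0; j < TOY_DISKS; j++)
                    if (j != TOY_PD_IDX && !sh->expanded[j])
                            return;           /* still waiting on a block */
            sh->expand_ready = 1;
    }

    int main(void)
    {
            struct toy_stripe sh;
            int i;

            memset(&sh, 0, sizeof(sh));
            for (i = 0; i < TOY_DISKS; i++) {
                    if (i == TOY_PD_IDX)
                            continue;
                    toy_deliver(&sh, i);
                    printf("block %d copied, ready=%d\n", i, sh.expand_ready);
            }
            return 0;
    }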
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 14 +++- drivers/md/raid5.c | 185 +++++++++++++++++++++++++++++++++++++++------ include/linux/raid/md_k.h | 4 + include/linux/raid/raid5.h | 4 +- 4 files changed, 181 insertions(+), 26 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index c7b7656f9aa5..8e65986bc63f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2165,7 +2165,9 @@ action_show(mddev_t *mddev, char *page) char *type = "idle"; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) { - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + type = "reshape"; + else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) type = "resync"; else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) @@ -4088,8 +4090,10 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) seq_printf(seq, "] "); } seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", + (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? + "reshape" : (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? - "resync" : "recovery"), + "resync" : "recovery")), per_milli/10, per_milli % 10, (unsigned long long) resync, (unsigned long long) max_blocks); @@ -4543,7 +4547,9 @@ static void md_do_sync(mddev_t *mddev) */ max_sectors = mddev->resync_max_sectors; mddev->resync_mismatches = 0; - } else + } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + max_sectors = mddev->size << 1; + else /* recovery follows the physical size of devices */ max_sectors = mddev->size << 1; @@ -4679,6 +4685,8 @@ static void md_do_sync(mddev_t *mddev) mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && + test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && + !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && mddev->curr_resync > 2 && mddev->curr_resync >= mddev->recovery_cp) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7a6df515b008..56cba8d3e398 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -93,11 +93,11 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) md_wakeup_thread(conf->mddev->thread); } - list_add_tail(&sh->lru, &conf->inactive_list); atomic_dec(&conf->active_stripes); - if (!conf->inactive_blocked || - atomic_read(&conf->active_stripes) < (conf->max_nr_stripes*3/4)) + if (!test_bit(STRIPE_EXPANDING, &sh->state)) { + list_add_tail(&sh->lru, &conf->inactive_list); wake_up(&conf->wait_for_stripe); + } } } } @@ -273,9 +273,8 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector } else { if (!test_bit(STRIPE_HANDLE, &sh->state)) atomic_inc(&conf->active_stripes); - if (list_empty(&sh->lru)) - BUG(); - list_del_init(&sh->lru); + if (!list_empty(&sh->lru)) + list_del_init(&sh->lru); } } } while (sh == NULL); @@ -1035,6 +1034,18 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in return 0; } +static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) +{ + int sectors_per_chunk = conf->chunk_size >> 9; + sector_t x = stripe; + int pd_idx, dd_idx; + int chunk_offset = sector_div(x, sectors_per_chunk); + stripe = x; + raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk + + chunk_offset, disks, 
disks-1, &dd_idx, &pd_idx, conf); + return pd_idx; +} + /* * handle_stripe - do things to a stripe. @@ -1061,7 +1072,7 @@ static void handle_stripe(struct stripe_head *sh) struct bio *return_bi= NULL; struct bio *bi; int i; - int syncing; + int syncing, expanding, expanded; int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; int non_overwrite = 0; int failed_num=0; @@ -1076,6 +1087,8 @@ static void handle_stripe(struct stripe_head *sh) clear_bit(STRIPE_DELAYED, &sh->state); syncing = test_bit(STRIPE_SYNCING, &sh->state); + expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); + expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); /* Now to look around and see what can be done */ rcu_read_lock(); @@ -1268,13 +1281,14 @@ static void handle_stripe(struct stripe_head *sh) * parity, or to satisfy requests * or to load a block that is being partially written. */ - if (to_read || non_overwrite || (syncing && (uptodate < disks))) { + if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) { for (i=disks; i--;) { dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || syncing || + expanding || (failed && (sh->dev[failed_num].toread || (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags)))) ) @@ -1464,13 +1478,76 @@ static void handle_stripe(struct stripe_head *sh) set_bit(R5_Wantwrite, &dev->flags); set_bit(R5_ReWrite, &dev->flags); set_bit(R5_LOCKED, &dev->flags); + locked++; } else { /* let's read it back */ set_bit(R5_Wantread, &dev->flags); set_bit(R5_LOCKED, &dev->flags); + locked++; } } + if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { + /* Need to write out all blocks after computing parity */ + sh->disks = conf->raid_disks; + sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); + compute_parity(sh, RECONSTRUCT_WRITE); + for (i= conf->raid_disks; i--;) { + set_bit(R5_LOCKED, &sh->dev[i].flags); + locked++; + set_bit(R5_Wantwrite, &sh->dev[i].flags); + } + clear_bit(STRIPE_EXPANDING, &sh->state); + } else if (expanded) { + clear_bit(STRIPE_EXPAND_READY, &sh->state); + wake_up(&conf->wait_for_overlap); + md_done_sync(conf->mddev, STRIPE_SECTORS, 1); + } + + if (expanding && locked == 0) { + /* We have read all the blocks in this stripe and now we need to + * copy some of them into a target stripe for expand. + */ + clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); + for (i=0; i< sh->disks; i++) + if (i != sh->pd_idx) { + int dd_idx, pd_idx, j; + struct stripe_head *sh2; + + sector_t bn = compute_blocknr(sh, i); + sector_t s = raid5_compute_sector(bn, conf->raid_disks, + conf->raid_disks-1, + &dd_idx, &pd_idx, conf); + sh2 = get_active_stripe(conf, s, conf->raid_disks, pd_idx, 1); + if (sh2 == NULL) + /* so far only the early blocks of this stripe + * have been requested. 
When later blocks + * get requested, we will try again + */ + continue; + if(!test_bit(STRIPE_EXPANDING, &sh2->state) || + test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { + /* must have already done this block */ + release_stripe(sh2); + continue; + } + memcpy(page_address(sh2->dev[dd_idx].page), + page_address(sh->dev[i].page), + STRIPE_SIZE); + set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); + set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); + for (j=0; j<conf->raid_disks; j++) + if (j != sh2->pd_idx && + !test_bit(R5_Expanded, &sh2->dev[j].flags)) + break; + if (j == conf->raid_disks) { + set_bit(STRIPE_EXPAND_READY, &sh2->state); + set_bit(STRIPE_HANDLE, &sh2->state); + } + release_stripe(sh2); + } + } + spin_unlock(&sh->lock); while ((bi=return_bi)) { @@ -1509,7 +1586,7 @@ static void handle_stripe(struct stripe_head *sh) rcu_read_unlock(); if (rdev) { - if (syncing) + if (syncing || expanding || expanded) md_sync_acct(rdev->bdev, STRIPE_SECTORS); bi->bi_bdev = rdev->bdev; @@ -1757,12 +1834,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i { raid5_conf_t *conf = (raid5_conf_t *) mddev->private; struct stripe_head *sh; - int sectors_per_chunk = conf->chunk_size >> 9; - sector_t x; - unsigned long stripe; - int chunk_offset; - int dd_idx, pd_idx; - sector_t first_sector; + int pd_idx; + sector_t first_sector, last_sector; int raid_disks = conf->raid_disks; int data_disks = raid_disks-1; sector_t max_sector = mddev->size << 1; @@ -1781,6 +1854,80 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return 0; } + + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { + /* reshaping is quite different to recovery/resync so it is + * handled quite separately ... here. + * + * On each call to sync_request, we gather one chunk worth of + * destination stripes and flag them as expanding. + * Then we find all the source stripes and request reads. + * As the reads complete, handle_stripe will copy the data + * into the destination stripe and release that stripe. + */ + int i; + int dd_idx; + for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { + int j; + int skipped = 0; + pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks); + sh = get_active_stripe(conf, sector_nr+i, + conf->raid_disks, pd_idx, 0); + set_bit(STRIPE_EXPANDING, &sh->state); + /* If any of this stripe is beyond the end of the old + * array, then we need to zero those blocks + */ + for (j=sh->disks; j--;) { + sector_t s; + if (j == sh->pd_idx) + continue; + s = compute_blocknr(sh, j); + if (s < (mddev->array_size<<1)) { + skipped = 1; + continue; + } + memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); + set_bit(R5_Expanded, &sh->dev[j].flags); + set_bit(R5_UPTODATE, &sh->dev[j].flags); + } + if (!skipped) { + set_bit(STRIPE_EXPAND_READY, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + } + release_stripe(sh); + } + spin_lock_irq(&conf->device_lock); + conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1); + spin_unlock_irq(&conf->device_lock); + /* Ok, those stripes are ready. We can start scheduling + * reads on the source stripes. + * The source stripes are determined by mapping the first and last + * block on the destination stripes.
+ */ + raid_disks = conf->previous_raid_disks; + data_disks = raid_disks - 1; + first_sector = + raid5_compute_sector(sector_nr*(conf->raid_disks-1), + raid_disks, data_disks, + &dd_idx, &pd_idx, conf); + last_sector = + raid5_compute_sector((sector_nr+conf->chunk_size/512) + *(conf->raid_disks-1) -1, + raid_disks, data_disks, + &dd_idx, &pd_idx, conf); + if (last_sector >= (mddev->size<<1)) + last_sector = (mddev->size<<1)-1; + while (first_sector <= last_sector) { + pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks); + sh = get_active_stripe(conf, first_sector, + conf->previous_raid_disks, pd_idx, 0); + set_bit(STRIPE_EXPAND_SOURCE, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); + first_sector += STRIPE_SECTORS; + } + return conf->chunk_size>>9; + } /* if there is 1 or more failed drives and we are trying * to resync, then assert that we are finished, because there is * nothing we can do. @@ -1799,13 +1946,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ } - x = sector_nr; - chunk_offset = sector_div(x, sectors_per_chunk); - stripe = x; - BUG_ON(x != stripe); - - first_sector = raid5_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk - + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf); + pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks); sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1); if (sh == NULL) { sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0); diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 617b9506c760..4e26ef2cacca 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -157,6 +157,9 @@ struct mddev_s * DONE: thread is done and is waiting to be reaped * REQUEST: user-space has requested a sync (used with SYNC) * CHECK: user-space request for check-only, no repair + * RESHAPE: A reshape is happening + * + * If neither SYNC nor RESHAPE is set, then it is a recovery. */ #define MD_RECOVERY_RUNNING 0 #define MD_RECOVERY_SYNC 1 @@ -166,6 +169,7 @@ struct mddev_s #define MD_RECOVERY_NEEDED 5 #define MD_RECOVERY_REQUESTED 6 #define MD_RECOVERY_CHECK 7 +#define MD_RECOVERY_RESHAPE 8 unsigned long recovery; int in_sync; /* know to not need resync */ diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 6fa274aea2a0..55c738d50508 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -157,6 +157,7 @@ struct stripe_head { #define R5_ReadError 8 /* seen a read error here recently */ #define R5_ReWrite 9 /* have tried to over-write the readerror */ +#define R5_Expanded 10 /* This block now has post-expand data */ /* * Write method */ @@ -176,7 +177,8 @@ struct stripe_head { #define STRIPE_DEGRADED 7 #define STRIPE_BIT_DELAY 8 #define STRIPE_EXPANDING 9 - +#define STRIPE_EXPAND_SOURCE 10 +#define STRIPE_EXPAND_READY 11 /* * Plugging: * -- cgit v1.2.3 From 292695531ae4019bb15deedc121b218d1908b648 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:10 -0800 Subject: [PATCH] md: Final stages of raid5 expand code This patch adds raid5_reshape and end_reshape which will start and finish the reshape process. raid5_reshape is only enabled if CONFIG_MD_RAID5_RESHAPE is set, to discourage accidental use. Read the 'help' for the CONFIG_MD_RAID5_RESHAPE entry, and make sure that you have backups, just in case.
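As a rough guide to the stripe-cache precondition that raid5_reshape() below enforces, here is a user-space sketch (our names; we assume STRIPE_SIZE equals a 4K PAGE_SIZE) of the "about four chunks worth of stripe_heads" rule:

    #include <stdio.h>

    #define TOY_STRIPE_SIZE 4096              /* assumes 4K PAGE_SIZE */

    /* Reshape works through one chunk-wide stripe at a time and wants
     * roughly four chunks worth of stripe_heads for sensible progress. */
    static int toy_enough_stripes(int chunk_size, int max_nr_stripes)
    {
            return (chunk_size / TOY_STRIPE_SIZE) * 4 <= max_nr_stripes;
    }

    int main(void)
    {
            printf("256K chunks, 256 stripes: %s\n",
                   toy_enough_stripes(256 * 1024, 256) ? "ok" : "grow cache first");
            printf("512K chunks, 256 stripes: %s\n",
                   toy_enough_stripes(512 * 1024, 256) ? "ok" : "grow cache first");
            return 0;
    }

With the default 256 stripe_heads this agrees with the 256K chunk-size limit quoted in the check's comment; larger chunks need the stripe cache grown first.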
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/Kconfig | 26 ++++++++++ drivers/md/md.c | 6 ++- drivers/md/raid5.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++- include/linux/raid/md.h | 3 +- 4 files changed, 154 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index ac43f98062fd..fd2aae150ccc 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -127,6 +127,32 @@ config MD_RAID5 If unsure, say Y. +config MD_RAID5_RESHAPE + bool "Support adding drives to a raid-5 array (experimental)" + depends on MD_RAID5 && EXPERIMENTAL + ---help--- + A RAID-5 set can be expanded by adding extra drives. This + requires "restriping" the array which means (almost) every + block must be written to a different place. + + This option allows such restriping to be done while the array + is online. However it is still EXPERIMENTAL code. It should + work, but please be sure that you have backups. + + You will need a version of mdadm newer than 2.3.1. During the + early stage of reshape there is a critical section where live data + is being over-written. A crash during this time needs extra care + for recovery. The newer mdadm takes a copy of the data in the + critical section and will restore it, if necessary, after a crash. + + The mdadm usage is e.g. + mdadm --grow /dev/md1 --raid-disks=6 + to grow '/dev/md1' to having 6 disks. + + Note: The array can only be expanded, not contracted. + There should be enough spares already present to make the new + array workable. + config MD_RAID6 tristate "RAID-6 mode" depends on BLK_DEV_MD diff --git a/drivers/md/md.c b/drivers/md/md.c index 8e65986bc63f..d169bc964676 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -158,11 +158,12 @@ static int start_readonly; */ static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); static atomic_t md_event_count; -static void md_new_event(mddev_t *mddev) +void md_new_event(mddev_t *mddev) { atomic_inc(&md_event_count); wake_up(&md_event_waiters); } +EXPORT_SYMBOL_GPL(md_new_event); /* * Enables to iterate over all existing md arrays @@ -4467,7 +4468,7 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait); #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) -static void md_do_sync(mddev_t *mddev) +void md_do_sync(mddev_t *mddev) { mddev_t *mddev2; unsigned int currspeed = 0, @@ -4704,6 +4705,7 @@ static void md_do_sync(mddev_t *mddev) set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); } +EXPORT_SYMBOL_GPL(md_do_sync); /* diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 56cba8d3e398..b29135acb1d9 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -331,6 +331,8 @@ static int grow_stripes(raid5_conf_t *conf, int num) } return 0; } + +#ifdef CONFIG_MD_RAID5_RESHAPE static int resize_stripes(raid5_conf_t *conf, int newsize) { /* Make all the stripes able to hold 'newsize' devices. 
@@ -451,7 +453,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) conf->pool_size = newsize; return err; } - +#endif static int drop_one_stripe(raid5_conf_t *conf) { @@ -1034,6 +1036,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in return 0; } +static void end_reshape(raid5_conf_t *conf); + static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) { int sectors_per_chunk = conf->chunk_size >> 9; @@ -1844,6 +1848,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (sector_nr >= max_sector) { /* just being told to finish up .. nothing much to do */ unplug_slaves(mddev); + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { + end_reshape(conf); + return 0; + } if (mddev->curr_resync < max_sector) /* aborted */ bitmap_end_sync(mddev->bitmap, mddev->curr_resync, @@ -2464,6 +2472,116 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) return 0; } +#ifdef CONFIG_MD_RAID5_RESHAPE +static int raid5_reshape(mddev_t *mddev, int raid_disks) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + int err; + mdk_rdev_t *rdev; + struct list_head *rtmp; + int spares = 0; + int added_devices = 0; + + if (mddev->degraded || + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return -EBUSY; + if (conf->raid_disks > raid_disks) + return -EINVAL; /* Cannot shrink array yet */ + if (conf->raid_disks == raid_disks) + return 0; /* nothing to do */ + + /* Can only proceed if there are plenty of stripe_heads. + * We need a minimum of one full stripe, and for sensible progress + * it is best to have about 4 times that. + * If we require 4 times, then the default 256 4K stripe_heads will + * allow for chunk sizes up to 256K, which is probably OK. + * If the chunk size is greater, user-space should request more + * stripe_heads first. + */ + if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { + printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", + (mddev->chunk_size / STRIPE_SIZE)*4); + return -ENOSPC; + } + + ITERATE_RDEV(mddev, rdev, rtmp) + if (rdev->raid_disk < 0 && + !test_bit(Faulty, &rdev->flags)) + spares++; + if (conf->raid_disks + spares < raid_disks-1) + /* Not enough devices even to make a degraded array + * of that size + */ + return -EINVAL; + + err = resize_stripes(conf, raid_disks); + if (err) + return err; + + spin_lock_irq(&conf->device_lock); + conf->previous_raid_disks = conf->raid_disks; + mddev->raid_disks = conf->raid_disks = raid_disks; + conf->expand_progress = 0; + spin_unlock_irq(&conf->device_lock); + + /* Add some new drives, as many as will fit. + * We know there are enough to make the newly sized array work.
+ */ + ITERATE_RDEV(mddev, rdev, rtmp) + if (rdev->raid_disk < 0 && + !test_bit(Faulty, &rdev->flags)) { + if (raid5_add_disk(mddev, rdev)) { + char nm[20]; + set_bit(In_sync, &rdev->flags); + conf->working_disks++; + added_devices++; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); + } else + break; + } + + mddev->degraded = (raid_disks - conf->previous_raid_disks) - added_devices; + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + mddev->sync_thread = md_register_thread(md_do_sync, mddev, + "%s_reshape"); + if (!mddev->sync_thread) { + mddev->recovery = 0; + spin_lock_irq(&conf->device_lock); + mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; + conf->expand_progress = MaxSector; + spin_unlock_irq(&conf->device_lock); + return -EAGAIN; + } + md_wakeup_thread(mddev->sync_thread); + md_new_event(mddev); + return 0; +} +#endif + +static void end_reshape(raid5_conf_t *conf) +{ + struct block_device *bdev; + + conf->mddev->array_size = conf->mddev->size * (conf->mddev->raid_disks-1); + set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); + conf->mddev->changed = 1; + + bdev = bdget_disk(conf->mddev->gendisk, 0); + if (bdev) { + mutex_lock(&bdev->bd_inode->i_mutex); + i_size_write(bdev->bd_inode, conf->mddev->array_size << 10); + mutex_unlock(&bdev->bd_inode->i_mutex); + bdput(bdev); + } + spin_lock_irq(&conf->device_lock); + conf->expand_progress = MaxSector; + spin_unlock_irq(&conf->device_lock); +} + static void raid5_quiesce(mddev_t *mddev, int state) { raid5_conf_t *conf = mddev_to_conf(mddev); @@ -2502,6 +2620,9 @@ static struct mdk_personality raid5_personality = .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, +#ifdef CONFIG_MD_RAID5_RESHAPE + .reshape = raid5_reshape, +#endif .quiesce = raid5_quiesce, }; diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index b6e0bcad84e1..9c77cde5a795 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -92,7 +92,8 @@ extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, extern void md_super_wait(mddev_t *mddev); extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, struct page *page, int rw); - +extern void md_do_sync(mddev_t *mddev); +extern void md_new_event(mddev_t *mddev); #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } -- cgit v1.2.3 From f67055780caac6a99f43834795c43acf99eba6a6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:11 -0800 Subject: [PATCH] md: Checkpoint and allow restart of raid5 reshape We allow the superblock to record an 'old' and a 'new' geometry, and a position recording how far any conversion has progressed. The geometry allows for changing chunksize, layout and level as well as number of devices. When using a version-0.90 superblock, we convert the version to 0.91 while the conversion is happening so that an old kernel will refuse to assemble the array. For version-1, we use a feature bit for the same effect. When starting an array we check for an incomplete reshape and restart the reshape process if needed. If the reshape stopped at an awkward time (like when updating the first stripe) we refuse to assemble the array, and let user-space worry about it.
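A toy reader for that version gate may make the compatibility story concrete (toy_accept_sb() is our own sketch; the real checks live in super_90_load() in the diff below). Version 0.91 marks an in-progress reshape, so a kernel without reshape support must refuse to assemble rather than run over a half-restriped array:

    #include <stdio.h>

    static int toy_accept_sb(int major, int minor, int knows_reshape)
    {
            if (major != 0)
                    return 0;                 /* not a v0.90-family superblock */
            if (minor == 90)
                    return 1;                 /* ordinary array */
            if (minor == 91)
                    return knows_reshape;     /* mid-reshape: new kernels only */
            return 0;
    }

    int main(void)
    {
            printf("old kernel, v0.91: %s\n",
                   toy_accept_sb(0, 91, 0) ? "assemble" : "refuse");
            printf("new kernel, v0.91: %s\n",
                   toy_accept_sb(0, 91, 1) ? "assemble" : "refuse");
            return 0;
    }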
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 69 +++++++++++++++++++++- drivers/md/raid1.c | 5 ++ drivers/md/raid5.c | 140 ++++++++++++++++++++++++++++++++++++++------- include/linux/raid/md.h | 2 + include/linux/raid/md_k.h | 8 +++ include/linux/raid/md_p.h | 32 ++++++++++- include/linux/raid/raid5.h | 1 + 7 files changed, 231 insertions(+), 26 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index d169bc964676..b9dfdfccdb78 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -662,7 +662,8 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version } if (sb->major_version != 0 || - sb->minor_version != 90) { + sb->minor_version < 90 || + sb->minor_version > 91) { printk(KERN_WARNING "Bad version number %d.%d on %s\n", sb->major_version, sb->minor_version, b); @@ -747,6 +748,20 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->bitmap_offset = 0; mddev->default_bitmap_offset = MD_SB_BYTES >> 9; + if (mddev->minor_version >= 91) { + mddev->reshape_position = sb->reshape_position; + mddev->delta_disks = sb->delta_disks; + mddev->new_level = sb->new_level; + mddev->new_layout = sb->new_layout; + mddev->new_chunk = sb->new_chunk; + } else { + mddev->reshape_position = MaxSector; + mddev->delta_disks = 0; + mddev->new_level = mddev->level; + mddev->new_layout = mddev->layout; + mddev->new_chunk = mddev->chunk_size; + } + if (sb->state & (1<<MD_SB_CLEAN)) mddev->recovery_cp = MaxSector; else { @@ -841,7 +856,6 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->md_magic = MD_SB_MAGIC; sb->major_version = mddev->major_version; - sb->minor_version = mddev->minor_version; sb->patch_version = mddev->patch_version; sb->gvalid_words = 0; /* ignored */ memcpy(&sb->set_uuid0, mddev->uuid+0, 4); @@ -860,6 +874,17 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->events_hi = (mddev->events>>32); sb->events_lo = (u32)mddev->events; + if (mddev->reshape_position == MaxSector) + sb->minor_version = 90; + else { + sb->minor_version = 91; + sb->reshape_position = mddev->reshape_position; + sb->new_level = mddev->new_level; + sb->delta_disks = mddev->delta_disks; + sb->new_layout = mddev->new_layout; + sb->new_chunk = mddev->new_chunk; + } + mddev->minor_version = sb->minor_version; if (mddev->in_sync) { sb->recovery_cp = mddev->recovery_cp; @@ -1104,6 +1129,20 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) } mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); } + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { + mddev->reshape_position = le64_to_cpu(sb->reshape_position); + mddev->delta_disks = le32_to_cpu(sb->delta_disks); + mddev->new_level = le32_to_cpu(sb->new_level); + mddev->new_layout = le32_to_cpu(sb->new_layout); + mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9; + } else { + mddev->reshape_position = MaxSector; + mddev->delta_disks = 0; + mddev->new_level = mddev->level; + mddev->new_layout = mddev->layout; + mddev->new_chunk = mddev->chunk_size; + } + } else if (mddev->pers == NULL) { /* Insist on a good event counter while assembling */ __u64 ev1 = le64_to_cpu(sb->events); @@ -1175,6 +1214,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } + if (mddev->reshape_position != MaxSector) { + sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); + sb->reshape_position =
cpu_to_le64(mddev->reshape_position); + sb->new_layout = cpu_to_le32(mddev->new_layout); + sb->delta_disks = cpu_to_le32(mddev->delta_disks); + sb->new_level = cpu_to_le32(mddev->new_level); + sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); + } max_dev = 0; ITERATE_RDEV(mddev,rdev2,tmp) @@ -1497,7 +1544,7 @@ static void sync_sbs(mddev_t * mddev) } } -static void md_update_sb(mddev_t * mddev) +void md_update_sb(mddev_t * mddev) { int err; struct list_head *tmp; @@ -1574,6 +1621,7 @@ repeat: wake_up(&mddev->sb_wait); } +EXPORT_SYMBOL_GPL(md_update_sb); /* words written to sysfs files may, or may not, be \n terminated. * We want to accept either case. For this we use cmd_match. @@ -2545,6 +2593,14 @@ static int do_md_run(mddev_t * mddev) mddev->level = pers->level; strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + if (mddev->reshape_position != MaxSector && + pers->reshape == NULL) { + /* This personality cannot handle reshaping... */ + mddev->pers = NULL; + module_put(pers->owner); + return -EINVAL; + } + mddev->recovery = 0; mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ mddev->barriers_work = 1; @@ -3433,11 +3489,18 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->default_bitmap_offset = MD_SB_BYTES >> 9; mddev->bitmap_offset = 0; + mddev->reshape_position = MaxSector; + /* * Generate a 128 bit UUID */ get_random_bytes(mddev->uuid, 16); + mddev->new_level = mddev->level; + mddev->new_chunk = mddev->chunk_size; + mddev->new_layout = mddev->layout; + mddev->delta_disks = 0; + return 0; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 5d88329e3c7a..b65b8cfbdf30 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1789,6 +1789,11 @@ static int run(mddev_t *mddev) mdname(mddev), mddev->level); goto out; } + if (mddev->reshape_position != MaxSector) { + printk("raid1: %s: reshape_position set but not supported\n", + mdname(mddev)); + goto out; + } /* * copy the already verified devices into our private RAID1 * bookkeeping area. [whatever we allocate in run(), diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b29135acb1d9..20ae32d67e21 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -22,6 +22,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/highmem.h> +#include <linux/kthread.h> #include <linux/bitops.h> #include <asm/atomic.h> @@ -1504,6 +1505,7 @@ static void handle_stripe(struct stripe_head *sh) clear_bit(STRIPE_EXPANDING, &sh->state); } else if (expanded) { clear_bit(STRIPE_EXPAND_READY, &sh->state); + atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_overlap); md_done_sync(conf->mddev, STRIPE_SECTORS, 1); } @@ -1875,6 +1877,26 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i */ int i; int dd_idx; + + if (sector_nr == 0 && + conf->expand_progress != 0) { + /* restarting in the middle, skip the initial sectors */ + sector_nr = conf->expand_progress; + sector_div(sector_nr, conf->raid_disks-1); + *skipped = 1; + return sector_nr; + } + + /* Cannot proceed until we've updated the superblock...
*/ + wait_event(conf->wait_for_overlap, + atomic_read(&conf->reshape_stripes)==0); + mddev->reshape_position = conf->expand_progress; + + mddev->sb_dirty = 1; + md_wakeup_thread(mddev->thread); + wait_event(mddev->sb_wait, mddev->sb_dirty == 0 || + kthread_should_stop()); + for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { int j; int skipped = 0; @@ -1882,6 +1904,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i sh = get_active_stripe(conf, sector_nr+i, conf->raid_disks, pd_idx, 0); set_bit(STRIPE_EXPANDING, &sh->state); + atomic_inc(&conf->reshape_stripes); /* If any of this stripe is beyond the end of the old * array, then we need to zero those blocks */ @@ -2121,10 +2144,61 @@ static int run(mddev_t *mddev) return -EIO; } + if (mddev->reshape_position != MaxSector) { + /* Check that we can continue the reshape. + * Currently only disks can change, it must + * increase, and we must be past the point where + * a stripe over-writes itself + */ + sector_t here_new, here_old; + int old_disks; + + if (mddev->new_level != mddev->level || + mddev->new_layout != mddev->layout || + mddev->new_chunk != mddev->chunk_size) { + printk(KERN_ERR "raid5: %s: unsupported reshape required - aborting.\n", + mdname(mddev)); + return -EINVAL; + } + if (mddev->delta_disks <= 0) { + printk(KERN_ERR "raid5: %s: unsupported reshape (reduce disks) required - aborting.\n", + mdname(mddev)); + return -EINVAL; + } + old_disks = mddev->raid_disks - mddev->delta_disks; + /* reshape_position must be on a new-stripe boundary, and one + * further up in new geometry must map after here in old geometry. + */ + here_new = mddev->reshape_position; + if (sector_div(here_new, (mddev->chunk_size>>9)*(mddev->raid_disks-1))) { + printk(KERN_ERR "raid5: reshape_position not on a stripe boundary\n"); + return -EINVAL; + } + /* here_new is the stripe we will write to */ + here_old = mddev->reshape_position; + sector_div(here_old, (mddev->chunk_size>>9)*(old_disks-1)); + /* here_old is the first stripe that we might need to read from */ + if (here_new >= here_old) { + /* Reading from the same stripe as writing to - bad */ + printk(KERN_ERR "raid5: reshape_position too early for auto-recovery - aborting.\n"); + return -EINVAL; + } + printk(KERN_INFO "raid5: reshape will continue\n"); + /* OK, we should be able to continue; */ + } + + mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL); if ((conf = mddev->private) == NULL) goto abort; - conf->disks = kzalloc(mddev->raid_disks * sizeof(struct disk_info), + if (mddev->reshape_position == MaxSector) { + conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks; + } else { + conf->raid_disks = mddev->raid_disks; + conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; + } + + conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info), GFP_KERNEL); if (!conf->disks) goto abort; @@ -2148,7 +2222,7 @@ static int run(mddev_t *mddev) ITERATE_RDEV(mddev,rdev,tmp) { raid_disk = rdev->raid_disk; - if (raid_disk >= mddev->raid_disks + if (raid_disk >= conf->raid_disks || raid_disk < 0) continue; disk = conf->disks + raid_disk; @@ -2164,7 +2238,6 @@ static int run(mddev_t *mddev) } } - conf->raid_disks = mddev->raid_disks; /* * 0 for a fully functional array, 1 for a degraded array. 
*/ @@ -2174,7 +2247,7 @@ static int run(mddev_t *mddev) conf->level = mddev->level; conf->algorithm = mddev->layout; conf->max_nr_stripes = NR_STRIPES; - conf->expand_progress = MaxSector; + conf->expand_progress = mddev->reshape_position; /* device size must be a multiple of chunk size */ mddev->size &= ~(mddev->chunk_size/1024 -1); @@ -2247,6 +2320,20 @@ static int run(mddev_t *mddev) print_raid5_conf(conf); + if (conf->expand_progress != MaxSector) { + printk("...ok start reshape thread\n"); + atomic_set(&conf->reshape_stripes, 0); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + mddev->sync_thread = md_register_thread(md_do_sync, mddev, + "%s_reshape"); + /* FIXME if md_register_thread fails?? */ + md_wakeup_thread(mddev->sync_thread); + + } + /* read-ahead size must cover two whole stripes, which is * 2 * (n-1) * chunksize where 'n' is the number of raid devices */ @@ -2262,8 +2349,8 @@ static int run(mddev_t *mddev) mddev->queue->unplug_fn = raid5_unplug_device; mddev->queue->issue_flush_fn = raid5_issue_flush; + mddev->array_size = mddev->size * (conf->previous_raid_disks - 1); - mddev->array_size = mddev->size * (mddev->raid_disks - 1); return 0; abort: if (conf) { @@ -2436,7 +2523,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) /* * find the disk ... */ - for (disk=0; disk < mddev->raid_disks; disk++) + for (disk=0; disk < conf->raid_disks; disk++) if ((p=conf->disks + disk)->rdev == NULL) { clear_bit(In_sync, &rdev->flags); rdev->raid_disk = disk; @@ -2518,9 +2605,10 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks) if (err) return err; + atomic_set(&conf->reshape_stripes, 0); spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; - mddev->raid_disks = conf->raid_disks = raid_disks; + conf->raid_disks = raid_disks; conf->expand_progress = 0; spin_unlock_irq(&conf->device_lock); @@ -2542,6 +2630,14 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks) } mddev->degraded = (raid_disks - conf->previous_raid_disks) - added_devices; + mddev->new_chunk = mddev->chunk_size; + mddev->new_layout = mddev->layout; + mddev->new_level = mddev->level; + mddev->raid_disks = raid_disks; + mddev->delta_disks = raid_disks - conf->previous_raid_disks; + mddev->reshape_position = 0; + mddev->sb_dirty = 1; + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); @@ -2552,6 +2648,7 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks) mddev->recovery = 0; spin_lock_irq(&conf->device_lock); mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; + mddev->delta_disks = 0; conf->expand_progress = MaxSector; spin_unlock_irq(&conf->device_lock); return -EAGAIN; @@ -2566,20 +2663,23 @@ static void end_reshape(raid5_conf_t *conf) { struct block_device *bdev; - conf->mddev->array_size = conf->mddev->size * (conf->mddev->raid_disks-1); - set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); - conf->mddev->changed = 1; - - bdev = bdget_disk(conf->mddev->gendisk, 0); - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, conf->mddev->array_size << 10); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); + if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { + conf->mddev->array_size = conf->mddev->size * (conf->raid_disks-1); + 
set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); + conf->mddev->changed = 1; + + bdev = bdget_disk(conf->mddev->gendisk, 0); + if (bdev) { + mutex_lock(&bdev->bd_inode->i_mutex); + i_size_write(bdev->bd_inode, conf->mddev->array_size << 10); + mutex_unlock(&bdev->bd_inode->i_mutex); + bdput(bdev); + } + spin_lock_irq(&conf->device_lock); + conf->expand_progress = MaxSector; + spin_unlock_irq(&conf->device_lock); + conf->mddev->reshape_position = MaxSector; } - spin_lock_irq(&conf->device_lock); - conf->expand_progress = MaxSector; - spin_unlock_irq(&conf->device_lock); } static void raid5_quiesce(mddev_t *mddev, int state) diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index 9c77cde5a795..66b44e5e0d6e 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -95,6 +95,8 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, extern void md_do_sync(mddev_t *mddev); extern void md_new_event(mddev_t *mddev); +extern void md_update_sb(mddev_t * mddev); + #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } #endif diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 4e26ef2cacca..1a6f9f2f6282 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -132,6 +132,14 @@ struct mddev_s char uuid[16]; + /* If the array is being reshaped, we need to record the + * new shape and an indication of where we are up to. + * This is written to the superblock. + * If reshape_position is MaxSector, then no reshape is happening (yet). + */ + sector_t reshape_position; + int delta_disks, new_level, new_layout, new_chunk; + struct mdk_thread_s *thread; /* management thread */ struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ sector_t curr_resync; /* blocks scheduled */ diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index c100fa5d4bfa..774e1acfb8c4 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -102,6 +102,18 @@ typedef struct mdp_device_descriptor_s { #define MD_SB_ERRORS 1 #define MD_SB_BITMAP_PRESENT 8 /* bitmap may be present nearby */ + +/* + * Notes: + * - if an array is being reshaped (restriped) in order to change + * the number of active devices in the array, 'raid_disks' will be + * the larger of the old and new numbers. 'delta_disks' will + * be the "new - old". So if +ve, raid_disks is the new value, and + * "raid_disks-delta_disks" is the old. If -ve, raid_disks is the + * old value and "raid_disks+delta_disks" is the new (smaller) value. + */ + + typedef struct mdp_superblock_s { /* * Constant generic information @@ -146,7 +158,13 @@ typedef struct mdp_superblock_s { __u32 cp_events_hi; /* 10 high-order of checkpoint update count */ #endif __u32 recovery_cp; /* 11 recovery checkpoint sector count */ - __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 12]; + /* These are only valid for minor_version > 90 */ + __u64 reshape_position; /* 12,13 next address in array-space for reshape */ + __u32 new_level; /* 14 new level we are reshaping to */ + __u32 delta_disks; /* 15 change in number of raid_disks */ + __u32 new_layout; /* 16 new layout */ + __u32 new_chunk; /* 17 new chunk size (bytes) */ + __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 18]; /* * Personality information @@ -207,7 +225,14 @@ struct mdp_superblock_1 { * NOTE: signed, so bitmap can be before superblock * only meaningful if feature_map[0] is set. 
*/ - __u8 pad1[128-100]; /* set to 0 when written */ + + /* These are only valid with feature bit '4' */ + __u64 reshape_position; /* next address in array-space for reshape */ + __u32 new_level; /* new level we are reshaping to */ + __u32 delta_disks; /* change in number of raid_disks */ + __u32 new_layout; /* new layout */ + __u32 new_chunk; /* new chunk size (bytes) */ + __u8 pad1[128-124]; /* set to 0 when written */ /* constant this-device information - 64 bytes */ __u64 data_offset; /* sector start of data, often 0 */ @@ -240,8 +265,9 @@ struct mdp_superblock_1 { /* feature_map bits */ #define MD_FEATURE_BITMAP_OFFSET 1 +#define MD_FEATURE_RESHAPE_ACTIVE 4 -#define MD_FEATURE_ALL 1 +#define MD_FEATURE_ALL 5 #endif diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 55c738d50508..abcdf0d0658a 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -224,6 +224,7 @@ struct raid5_private_data { struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ atomic_t preread_active_stripes; /* stripes with scheduled io */ + atomic_t reshape_stripes; /* stripes with pending writes for reshape */ /* unfortunately we need two cache names as we temporarily have * two caches. */ -- cgit v1.2.3 From b578d55fdd80140f657130abd85aebeb345755fb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:12 -0800 Subject: [PATCH] md: Only checkpoint expansion progress occasionally Instead of checkpointing at each stripe, only checkpoint when a new write would overwrite uncheckpointed data. Block any write to the uncheckpointed area. Arbitrarily checkpoint at least every 3Meg. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 53 ++++++++++++++++++++++++++++++++++++---------- include/linux/raid/raid5.h | 3 +++ 2 files changed, 45 insertions(+), 11 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 20ae32d67e21..089a32604305 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1762,8 +1762,9 @@ static int make_request(request_queue_t *q, struct bio * bi) for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { DEFINE_WAIT(w); int disks; - + retry: + prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); if (likely(conf->expand_progress == MaxSector)) disks = conf->raid_disks; else { @@ -1771,6 +1772,13 @@ static int make_request(request_queue_t *q, struct bio * bi) disks = conf->raid_disks; if (logical_sector >= conf->expand_progress) disks = conf->previous_raid_disks; + else { + if (logical_sector >= conf->expand_lo) { + spin_unlock_irq(&conf->device_lock); + schedule(); + goto retry; + } + } spin_unlock_irq(&conf->device_lock); } new_sector = raid5_compute_sector(logical_sector, disks, disks - 1, @@ -1779,7 +1787,6 @@ static int make_request(request_queue_t *q, struct bio * bi) (unsigned long long)new_sector, (unsigned long long)logical_sector); - prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK)); if (sh) { if (unlikely(conf->expand_progress != MaxSector)) { @@ -1877,6 +1884,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i */ int i; int dd_idx; + sector_t writepos, safepos, gap; if (sector_nr == 0 && conf->expand_progress != 0) { @@ -1887,15 +1895,36 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return sector_nr; } - /* Cannot 
proceed until we've updated the superblock... */ - wait_event(conf->wait_for_overlap, - atomic_read(&conf->reshape_stripes)==0); - mddev->reshape_position = conf->expand_progress; - - mddev->sb_dirty = 1; - md_wakeup_thread(mddev->thread); - wait_event(mddev->sb_wait, mddev->sb_dirty == 0 || - kthread_should_stop()); + /* we update the metadata when there is more than 3Meg + * in the block range (that is rather arbitrary, should + * probably be time based) or when the data about to be + * copied would over-write the source of the data at + * the front of the range. + * i.e. one new_stripe forward from expand_progress new_maps + * to after where expand_lo old_maps to + */ + writepos = conf->expand_progress + + conf->chunk_size/512*(conf->raid_disks-1); + sector_div(writepos, conf->raid_disks-1); + safepos = conf->expand_lo; + sector_div(safepos, conf->previous_raid_disks-1); + gap = conf->expand_progress - conf->expand_lo; + + if (writepos >= safepos || + gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) { + /* Cannot proceed until we've updated the superblock... */ + wait_event(conf->wait_for_overlap, + atomic_read(&conf->reshape_stripes)==0); + mddev->reshape_position = conf->expand_progress; + mddev->sb_dirty = 1; + md_wakeup_thread(mddev->thread); + wait_event(mddev->sb_wait, mddev->sb_dirty == 0 || + kthread_should_stop()); + spin_lock_irq(&conf->device_lock); + conf->expand_lo = mddev->reshape_position; + spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_for_overlap); + } for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { int j; @@ -2322,6 +2351,7 @@ static int run(mddev_t *mddev) if (conf->expand_progress != MaxSector) { printk("...ok start reshape thread\n"); + conf->expand_lo = conf->expand_progress; atomic_set(&conf->reshape_stripes, 0); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); @@ -2610,6 +2640,7 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks) conf->previous_raid_disks = conf->raid_disks; conf->raid_disks = raid_disks; conf->expand_progress = 0; + conf->expand_lo = 0; spin_unlock_irq(&conf->device_lock); /* Add some new drives, as many as will fit. diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index abcdf0d0658a..914af667044f 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -217,6 +217,9 @@ struct raid5_private_data { /* used during an expand */ sector_t expand_progress; /* MaxSector when no expand happening */ + sector_t expand_lo; /* from here up to expand_progress is out-of-bounds + * as we haven't flushed the metadata yet + */ int previous_raid_disks; struct list_head handle_list; /* stripes needing handling */ -- cgit v1.2.3 From 63c70c4f3a30e77e6f445bd16eff7934a031ebd3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:13 -0800 Subject: [PATCH] md: Split reshape handler in check_reshape and start_reshape check_reshape checks validity and does things that can be done instantly - like adding devices to raid1. start_reshape initiates a restriping process to convert the whole array. 
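(ed: a minimal sketch of how the split interface is driven, pieced together from the hunks below; the flow is simplified and error handling is trimmed, so treat it as illustration rather than the exact call chain:)

	/* Geometry-change request (ioctl path): record the requested delta,
	 * then let the personality validate it and do any "instant" work. */
	static int update_raid_disks(mddev_t *mddev, int raid_disks)
	{
		if (mddev->pers->check_reshape == NULL)
			return -EINVAL;
		if (mddev->sync_thread || mddev->reshape_position != MaxSector)
			return -EBUSY;
		mddev->delta_disks = raid_disks - mddev->raid_disks;
		return mddev->pers->check_reshape(mddev);
	}

	/* The slow restriping is started separately, e.g. from the "reshape"
	 * sync_action handler added later in this series:
	 *	err = mddev->pers->start_reshape(mddev);
	 */
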
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 10 ++++---- drivers/md/raid1.c | 19 ++++++++++++--- drivers/md/raid5.c | 60 +++++++++++++++++++++++++++-------------------- include/linux/raid/md_k.h | 3 ++- 4 files changed, 58 insertions(+), 34 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index b9dfdfccdb78..1a637676a930 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2594,7 +2594,7 @@ static int do_md_run(mddev_t * mddev) strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); if (mddev->reshape_position != MaxSector && - pers->reshape == NULL) { + pers->start_reshape == NULL) { /* This personality cannot handle reshaping... */ mddev->pers = NULL; module_put(pers->owner); @@ -3556,14 +3556,16 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks) { int rv; /* change the number of raid disks */ - if (mddev->pers->reshape == NULL) + if (mddev->pers->check_reshape == NULL) return -EINVAL; if (raid_disks <= 0 || raid_disks >= mddev->max_disks) return -EINVAL; - if (mddev->sync_thread) + if (mddev->sync_thread || mddev->reshape_position != MaxSector) return -EBUSY; - rv = mddev->pers->reshape(mddev, raid_disks); + mddev->delta_disks = raid_disks - mddev->raid_disks; + + rv = mddev->pers->check_reshape(mddev); return rv; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index b65b8cfbdf30..04418e10d1dc 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1976,7 +1976,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) return 0; } -static int raid1_reshape(mddev_t *mddev, int raid_disks) +static int raid1_reshape(mddev_t *mddev) { /* We need to: * 1/ resize the r1bio_pool @@ -1993,10 +1993,22 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks) struct pool_info *newpoolinfo; mirror_info_t *newmirrors; conf_t *conf = mddev_to_conf(mddev); - int cnt; + int cnt, raid_disks; int d, d2; + /* Cannot change chunk_size, layout, or level */ + if (mddev->chunk_size != mddev->new_chunk || + mddev->layout != mddev->new_layout || + mddev->level != mddev->new_level) { + mddev->new_chunk = mddev->chunk_size; + mddev->new_layout = mddev->layout; + mddev->new_level = mddev->level; + return -EINVAL; + } + + raid_disks = mddev->raid_disks + mddev->delta_disks; + if (raid_disks < conf->raid_disks) { cnt=0; for (d= 0; d < conf->raid_disks; d++) @@ -2043,6 +2055,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks) mddev->degraded += (raid_disks - conf->raid_disks); conf->raid_disks = mddev->raid_disks = raid_disks; + mddev->delta_disks = 0; conf->last_used = 0; /* just make sure it is in-range */ lower_barrier(conf); @@ -2084,7 +2097,7 @@ static struct mdk_personality raid1_personality = .spare_active = raid1_spare_active, .sync_request = sync_request, .resize = raid1_resize, - .reshape = raid1_reshape, + .check_reshape = raid1_reshape, .quiesce = raid1_quiesce, }; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 089a32604305..355dafb98aac 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2590,21 +2590,15 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) } #ifdef CONFIG_MD_RAID5_RESHAPE -static int raid5_reshape(mddev_t *mddev, int raid_disks) +static int raid5_check_reshape(mddev_t *mddev) { raid5_conf_t *conf = mddev_to_conf(mddev); int err; - mdk_rdev_t *rdev; - struct list_head *rtmp; - int spares = 0; - int added_devices = 0; - if (mddev->degraded || - test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - return -EBUSY; - if 
(conf->raid_disks > raid_disks) - return -EINVAL; /* Cannot shrink array yet */ - if (conf->raid_disks == raid_disks) + if (mddev->delta_disks < 0 || + mddev->new_level != mddev->level) + return -EINVAL; /* Cannot shrink array or change level yet */ + if (mddev->delta_disks == 0) return 0; /* nothing to do */ /* Can only proceed if there are plenty of stripe_heads. @@ -2615,30 +2609,48 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks) * If the chunk size is greater, user-space should request more * stripe_heads first. */ - if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { + if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes || + (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", (mddev->chunk_size / STRIPE_SIZE)*4); return -ENOSPC; } + err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks); + if (err) + return err; + + /* looks like we might be able to manage this */ + return 0; +} + +static int raid5_start_reshape(mddev_t *mddev) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + mdk_rdev_t *rdev; + struct list_head *rtmp; + int spares = 0; + int added_devices = 0; + + if (mddev->degraded || + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return -EBUSY; + ITERATE_RDEV(mddev, rdev, rtmp) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) spares++; - if (conf->raid_disks + spares < raid_disks-1) + + if (spares < mddev->delta_disks-1) /* Not enough devices even to make a degraded array * of that size */ return -EINVAL; - err = resize_stripes(conf, raid_disks); - if (err) - return err; - atomic_set(&conf->reshape_stripes, 0); spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; - conf->raid_disks = raid_disks; + conf->raid_disks += mddev->delta_disks; conf->expand_progress = 0; conf->expand_lo = 0; spin_unlock_irq(&conf->device_lock); @@ -2660,12 +2672,8 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks) break; } - mddev->degraded = (raid_disks - conf->previous_raid_disks) - added_devices; - mddev->new_chunk = mddev->chunk_size; - mddev->new_layout = mddev->layout; - mddev->new_level = mddev->level; - mddev->raid_disks = raid_disks; - mddev->delta_disks = raid_disks - conf->previous_raid_disks; + mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices; + mddev->raid_disks = conf->raid_disks; mddev->reshape_position = 0; mddev->sb_dirty = 1; @@ -2679,7 +2687,6 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks) mddev->recovery = 0; spin_lock_irq(&conf->device_lock); mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; - mddev->delta_disks = 0; conf->expand_progress = MaxSector; spin_unlock_irq(&conf->device_lock); return -EAGAIN; @@ -2752,7 +2759,8 @@ static struct mdk_personality raid5_personality = .sync_request = sync_request, .resize = raid5_resize, #ifdef CONFIG_MD_RAID5_RESHAPE - .reshape = raid5_reshape, + .check_reshape = raid5_check_reshape, + .start_reshape = raid5_start_reshape, #endif .quiesce = raid5_quiesce, }; diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 1a6f9f2f6282..002ee631fabb 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -261,7 +261,8 @@ struct mdk_personality int (*spare_active) (mddev_t *mddev); sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster); int (*resize) (mddev_t *mddev, sector_t sectors); - int (*reshape) (mddev_t *mddev, int raid_disks); + 
int (*check_reshape) (mddev_t *mddev); + int (*start_reshape) (mddev_t *mddev); int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); /* quiesce moves between quiescence states * 0 - fully active -- cgit v1.2.3 From 16484bf59634e25d1299761e5ed8bacf22bc6368 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:13 -0800 Subject: [PATCH] md: Make 'reshape' a possible sync_action action This allows reshape to be triggered via sysfs (which is the only way to start it). Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 1a637676a930..a79dd33d343d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2247,7 +2247,14 @@ action_store(mddev_t *mddev, const char *page, size_t len) return -EBUSY; else if (cmd_match(page, "resync") || cmd_match(page, "recover")) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - else { + else if (cmd_match(page, "reshape")) { + int err; + if (mddev->pers->start_reshape == NULL) + return -EINVAL; + err = mddev->pers->start_reshape(mddev); + if (err) + return err; + } else { if (cmd_match(page, "check")) set_bit(MD_RECOVERY_CHECK, &mddev->recovery); else if (cmd_match(page, "repair")) -- cgit v1.2.3 From e464eafdb4400c6d6576ba3840d8bd40340f8a96 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:14 -0800 Subject: [PATCH] md: Support suspending of IO to regions of an md array This allows user-space to access data safely. This is needed for raid5 reshape as user-space needs to take a backup of the first few stripes before allowing reshape to commence. It will also be useful in cluster-aware raid1 configurations so that all cluster members can leave a section of the array untouched while a resync/recovery happens. A 'start' and 'end' of the suspended range are written to two sysfs attributes. Note that only one range can be suspended at a time. 
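(ed: an illustrative user-space sequence against these attributes; the array name md0, the sector values, and the md_set() helper are all hypothetical, only the suspend_lo/suspend_hi attribute names come from the patch below:)

	#include <stdio.h>

	/* Hypothetical helper: write one sector count to an md sysfs attribute. */
	static int md_set(const char *attr, unsigned long long sectors)
	{
		char path[128];
		FILE *f;

		snprintf(path, sizeof(path), "/sys/block/md0/md/%s", attr);
		f = fopen(path, "w");
		if (!f)
			return -1;
		fprintf(f, "%llu\n", sectors);
		return fclose(f);
	}

	int main(void)
	{
		md_set("suspend_hi", 16384);	/* writes to [0, 16384) now block */
		md_set("suspend_lo", 8192);	/* narrow the window to [8192, 16384) */
		/* ... user-space can now safely copy that region, e.g. the
		 * first stripes before a raid5 reshape ... */
		md_set("suspend_lo", 16384);	/* empty the range: IO resumes */
		return 0;
	}
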
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/md/raid5.c | 14 +++++++++++ include/linux/raid/md_k.h | 4 ++++ 3 files changed, 77 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index a79dd33d343d..92fd0104fa04 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2365,6 +2365,63 @@ sync_completed_show(mddev_t *mddev, char *page) static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); +static ssize_t +suspend_lo_show(mddev_t *mddev, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); +} + +static ssize_t +suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) +{ + char *e; + unsigned long long new = simple_strtoull(buf, &e, 10); + + if (mddev->pers->quiesce == NULL) + return -EINVAL; + if (buf == e || (*e && *e != '\n')) + return -EINVAL; + if (new >= mddev->suspend_hi || + (new > mddev->suspend_lo && new < mddev->suspend_hi)) { + mddev->suspend_lo = new; + mddev->pers->quiesce(mddev, 2); + return len; + } else + return -EINVAL; +} +static struct md_sysfs_entry md_suspend_lo = +__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); + + +static ssize_t +suspend_hi_show(mddev_t *mddev, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); +} + +static ssize_t +suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) +{ + char *e; + unsigned long long new = simple_strtoull(buf, &e, 10); + + if (mddev->pers->quiesce == NULL) + return -EINVAL; + if (buf == e || (*e && *e != '\n')) + return -EINVAL; + if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || + (new > mddev->suspend_lo && new > mddev->suspend_hi)) { + mddev->suspend_hi = new; + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + return len; + } else + return -EINVAL; +} +static struct md_sysfs_entry md_suspend_hi = +__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); + + static struct attribute *md_default_attrs[] = { &md_level.attr, &md_raid_disks.attr, @@ -2382,6 +2439,8 @@ static struct attribute *md_redundancy_attrs[] = { &md_sync_max.attr, &md_sync_speed.attr, &md_sync_completed.attr, + &md_suspend_lo.attr, + &md_suspend_hi.attr, NULL, }; static struct attribute_group md_redundancy_group = { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 355dafb98aac..bb16ac231a40 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1805,6 +1805,15 @@ static int make_request(request_queue_t *q, struct bio * bi) goto retry; } } + /* FIXME what if we get a false positive because these + * are being updated. 
+ */ + if (logical_sector >= mddev->suspend_lo && + logical_sector < mddev->suspend_hi) { + release_stripe(sh); + schedule(); + goto retry; + } if (test_bit(STRIPE_EXPANDING, &sh->state) || !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { @@ -2725,6 +2734,10 @@ static void raid5_quiesce(mddev_t *mddev, int state) raid5_conf_t *conf = mddev_to_conf(mddev); switch(state) { + case 2: /* resume for a suspend */ + wake_up(&conf->wait_for_overlap); + break; + case 1: /* stop all writes */ spin_lock_irq(&conf->device_lock); conf->quiesce = 1; @@ -2738,6 +2751,7 @@ static void raid5_quiesce(mddev_t *mddev, int state) spin_lock_irq(&conf->device_lock); conf->quiesce = 0; wake_up(&conf->wait_for_stripe); + wake_up(&conf->wait_for_overlap); spin_unlock_irq(&conf->device_lock); break; } diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 002ee631fabb..c0d3097846a7 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -151,6 +151,10 @@ struct mddev_s sector_t resync_mismatches; /* count of sectors where * parity/replica mismatch found */ + + /* allow user-space to request suspension of IO to regions of the array */ + sector_t suspend_lo; + sector_t suspend_hi; /* if zero, use the system-wide default */ int sync_speed_min; int sync_speed_max; -- cgit v1.2.3 From df8e7f7639bab3b2cc536a1d30d5593d65251778 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:15 -0800 Subject: [PATCH] md: Improve comments about locking situation in raid5 make_request Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index bb16ac231a40..8eded08411f3 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1768,6 +1768,14 @@ static int make_request(request_queue_t *q, struct bio * bi) if (likely(conf->expand_progress == MaxSector)) disks = conf->raid_disks; else { + /* spinlock is needed as expand_progress may be + * 64bit on a 32bit platform, and so it might be + * possible to see a half-updated value + * Of course expand_progress could change after + * the lock is dropped, so once we get a reference + * to the stripe that we think it is, we will have + * to check again. + */ spin_lock_irq(&conf->device_lock); disks = conf->raid_disks; if (logical_sector >= conf->expand_progress) @@ -1791,7 +1799,12 @@ static int make_request(request_queue_t *q, struct bio * bi) if (sh) { if (unlikely(conf->expand_progress != MaxSector)) { /* expansion might have moved on while waiting for a - * stripe, so we much do the range check again. + * stripe, so we must do the range check again. + * Expansion could still move past after this + * test, but as we are holding a reference to + * 'sh', we know that if that happens, + * STRIPE_EXPANDING will get set and the expansion + * won't proceed until we finish with the stripe. */ int must_retry = 0; spin_lock_irq(&conf->device_lock); -- cgit v1.2.3 From b3b46be38aef5c46a4e144f1bcb8d0cc6bf3ff97 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:16 -0800 Subject: [PATCH] md: Remove some stray semi-colons after functions called in macro.. wait_event_lock_irq puts a ';' after its usage of the 4th arg, so we don't need to. 
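(ed: the point, sketched with a condensed stand-in for the macro, not the kernel's exact definition; the real wait_event_lock_irq also manages the waitqueue, but the relevant detail is simply that it supplies the ';' itself:)

	#define my_wait_event_lock_irq(wq, condition, lock, cmd)	\
	do {								\
		while (!(condition)) {					\
			spin_unlock_irq(&(lock));			\
			cmd;		/* <-- semicolon supplied here */ \
			schedule();					\
			spin_lock_irq(&(lock));				\
		}							\
	} while (0)

so passing "unplug_slaves(conf->mddev);" as cmd expands to a doubled ";;", which the patch below tidies away.
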
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8eded08411f3..59c8772ee4ee 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -262,7 +262,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector < (conf->max_nr_stripes *3/4) || !conf->inactive_blocked), conf->device_lock, - unplug_slaves(conf->mddev); + unplug_slaves(conf->mddev) ); conf->inactive_blocked = 0; } else @@ -407,7 +407,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) wait_event_lock_irq(conf->wait_for_stripe, !list_empty(&conf->inactive_list), conf->device_lock, - unplug_slaves(conf->mddev); + unplug_slaves(conf->mddev) ); osh = get_free_stripe(conf); spin_unlock_irq(&conf->device_lock); -- cgit v1.2.3 From f6344757a92e5466bf8c23a74ee8f972ff35cb05 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:17 -0800 Subject: [PATCH] md: Remove bi_end_io call out from under a spinlock raid5 overloads bi_phys_segments to count the number of blocks that the request was broken into so that it knows when the bio is completely handled. Accessing this must always be done under a spinlock. In one case we also call bi_end_io under that spinlock, which probably isn't ideal as bi_end_io could be expensive (even though it isn't allowed to sleep). So we reduce the range of the spinlock to just accessing bi_phys_segments. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 59c8772ee4ee..dae740adaf65 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1743,6 +1743,7 @@ static int make_request(request_queue_t *q, struct bio * bi) sector_t logical_sector, last_sector; struct stripe_head *sh; const int rw = bio_data_dir(bi); + int remaining; if (unlikely(bio_barrier(bi))) { bio_endio(bi, bi->bi_size, -EOPNOTSUPP); @@ -1852,7 +1853,9 @@ static int make_request(request_queue_t *q, struct bio * bi) } spin_lock_irq(&conf->device_lock); - if (--bi->bi_phys_segments == 0) { + remaining = --bi->bi_phys_segments; + spin_unlock_irq(&conf->device_lock); + if (remaining == 0) { int bytes = bi->bi_size; if ( bio_data_dir(bi) == WRITE ) @@ -1860,7 +1863,6 @@ static int make_request(request_queue_t *q, struct bio * bi) bi->bi_size = 0; bi->bi_end_io(bi, bytes, 0); } - spin_unlock_irq(&conf->device_lock); return 0; } -- cgit v1.2.3 From 8ddeeae51f2f197b4fafcba117ee8191b49d843e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:18 -0800 Subject: [PATCH] md: Fix md grow/size code to correctly find the maximum available space An md array can be asked to change the amount of each device that it is using, and in particular can be asked to use the maximum available space. This currently only works if the first device is not larger than the rest: as 'size' gets changed along the way, 'fit' becomes wrong. So check whether a 'fit' is required early, and don't corrupt it. 
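(ed: the bug in miniature as a self-contained user-space toy, not md code; the device sizes are invented:)

	#include <stdio.h>

	int main(void)
	{
		unsigned long long avail[] = { 500, 300, 400 }; /* per-device space */
		unsigned long long size = 0;	/* 0 means "fit to maximum" */
		int fit = (size == 0);		/* the fix: computed once, up front */
		int i;

		for (i = 0; i < 3; i++)
			if (fit && (size == 0 || size > avail[i]))
				size = avail[i];	/* shrink to the smallest device */

		/* prints 300; with 'fit' recomputed inside the loop it would stick
		 * at 500 after the first (largest) device set it, which is exactly
		 * "only works if the first device is not larger than the rest" */
		printf("common size: %llu\n", size);
		return 0;
	}
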
Signed-off-by: Doug Ledford Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 92fd0104fa04..147efcb1e8ca 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3575,6 +3575,7 @@ static int update_size(mddev_t *mddev, unsigned long size) mdk_rdev_t * rdev; int rv; struct list_head *tmp; + int fit = (size == 0); if (mddev->pers->resize == NULL) return -EINVAL; @@ -3592,7 +3593,6 @@ static int update_size(mddev_t *mddev, unsigned long size) return -EBUSY; ITERATE_RDEV(mddev,rdev,tmp) { sector_t avail; - int fit = (size == 0); if (rdev->sb_offset > rdev->data_offset) avail = (rdev->sb_offset*2) - rdev->data_offset; else -- cgit v1.2.3 From 2f889129de148b0ba2e1fbc9e9d33a4ef4c5f2cb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:19 -0800 Subject: [PATCH] md: Restore 'remaining' count when retrying a write operation When retrying a write due to barrier failure, we don't reset 'remaining', so it goes negative and never hits 0 again. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid1.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 04418e10d1dc..3cb0872a845d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1401,6 +1401,9 @@ static void raid1d(mddev_t *mddev) int i; clear_bit(R1BIO_BarrierRetry, &r1_bio->state); clear_bit(R1BIO_Barrier, &r1_bio->state); + for (i=0; i < conf->raid_disks; i++) + if (r1_bio->bios[i]) + atomic_inc(&r1_bio->remaining); for (i=0; i < conf->raid_disks; i++) if (r1_bio->bios[i]) { struct bio_vec *bvec; -- cgit v1.2.3 From 48c9c27b8bcd2a328a06151e2d5c1170db0b701b Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 27 Mar 2006 01:18:20 -0800 Subject: [PATCH] sem2mutex: drivers/md Semaphore to mutex conversion. The conversion was generated via scripts, and the result was validated automatically via a script as well. 
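(ed: the mechanical mapping, shown on a toy lock; DECLARE_MUTEX, despite its name, declared a semaphore. A kernel-context sketch, not compilable on its own:)

	#include <linux/mutex.h>

	static DEFINE_MUTEX(example_lock);	/* was: static DECLARE_MUTEX(example_lock); */

	static void example(void)
	{
		mutex_lock(&example_lock);	/* was: down(&example_lock) */
		/* ... critical section ... */
		mutex_unlock(&example_lock);	/* was: up(&example_lock) */
	}

Note that down_trylock() and mutex_trylock() have inverted return conventions, which is why the reconfig_sem conversion later in this series also has to flip its callers.
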
Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/dm-table.c | 11 ++++++----- drivers/md/dm.c | 19 ++++++++++--------- drivers/md/kcopyd.c | 11 ++++++----- drivers/md/md.c | 11 ++++++----- 4 files changed, 28 insertions(+), 24 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 76610a6ac01c..8f56a54cf0ce 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #define MAX_DEPTH 16 @@ -770,14 +771,14 @@ int dm_table_complete(struct dm_table *t) return r; } -static DECLARE_MUTEX(_event_lock); +static DEFINE_MUTEX(_event_lock); void dm_table_event_callback(struct dm_table *t, void (*fn)(void *), void *context) { - down(&_event_lock); + mutex_lock(&_event_lock); t->event_fn = fn; t->event_context = context; - up(&_event_lock); + mutex_unlock(&_event_lock); } void dm_table_event(struct dm_table *t) @@ -788,10 +789,10 @@ void dm_table_event(struct dm_table *t) */ BUG_ON(in_interrupt()); - down(&_event_lock); + mutex_lock(&_event_lock); if (t->event_fn) t->event_fn(t->event_context); - up(&_event_lock); + mutex_unlock(&_event_lock); } sector_t dm_table_get_size(struct dm_table *t) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 973e63d530ae..4d710b7a133b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -743,14 +744,14 @@ static int dm_any_congested(void *congested_data, int bdi_bits) /*----------------------------------------------------------------- * An IDR is used to keep track of allocated minor numbers. *---------------------------------------------------------------*/ -static DECLARE_MUTEX(_minor_lock); +static DEFINE_MUTEX(_minor_lock); static DEFINE_IDR(_minor_idr); static void free_minor(unsigned int minor) { - down(&_minor_lock); + mutex_lock(&_minor_lock); idr_remove(&_minor_idr, minor); - up(&_minor_lock); + mutex_unlock(&_minor_lock); } /* @@ -763,7 +764,7 @@ static int specific_minor(struct mapped_device *md, unsigned int minor) if (minor >= (1 << MINORBITS)) return -EINVAL; - down(&_minor_lock); + mutex_lock(&_minor_lock); if (idr_find(&_minor_idr, minor)) { r = -EBUSY; @@ -788,7 +789,7 @@ static int specific_minor(struct mapped_device *md, unsigned int minor) } out: - up(&_minor_lock); + mutex_unlock(&_minor_lock); return r; } @@ -797,7 +798,7 @@ static int next_free_minor(struct mapped_device *md, unsigned int *minor) int r; unsigned int m; - down(&_minor_lock); + mutex_lock(&_minor_lock); r = idr_pre_get(&_minor_idr, GFP_KERNEL); if (!r) { @@ -819,7 +820,7 @@ static int next_free_minor(struct mapped_device *md, unsigned int *minor) *minor = m; out: - up(&_minor_lock); + mutex_unlock(&_minor_lock); return r; } @@ -1014,13 +1015,13 @@ static struct mapped_device *dm_find_md(dev_t dev) if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) return NULL; - down(&_minor_lock); + mutex_lock(&_minor_lock); md = idr_find(&_minor_idr, minor); if (!md || (dm_disk(md)->first_minor != minor)) md = NULL; - up(&_minor_lock); + mutex_unlock(&_minor_lock); return md; } diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c index ed71f3f94620..72480a48d88b 100644 --- a/drivers/md/kcopyd.c +++ b/drivers/md/kcopyd.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "kcopyd.h" @@ -581,21 +582,21 @@ int kcopyd_cancel(struct kcopyd_job *job, int block) 
/*----------------------------------------------------------------- * Unit setup *---------------------------------------------------------------*/ -static DECLARE_MUTEX(_client_lock); +static DEFINE_MUTEX(_client_lock); static LIST_HEAD(_clients); static void client_add(struct kcopyd_client *kc) { - down(&_client_lock); + mutex_lock(&_client_lock); list_add(&kc->list, &_clients); - up(&_client_lock); + mutex_unlock(&_client_lock); } static void client_del(struct kcopyd_client *kc) { - down(&_client_lock); + mutex_lock(&_client_lock); list_del(&kc->list); - up(&_client_lock); + mutex_unlock(&_client_lock); } static DEFINE_MUTEX(kcopyd_init_lock); diff --git a/drivers/md/md.c b/drivers/md/md.c index 147efcb1e8ca..c9c9c096ad80 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -43,6 +43,7 @@ #include /* for invalidate_bdev */ #include #include +#include #include @@ -2500,7 +2501,7 @@ int mdp_major = 0; static struct kobject *md_probe(dev_t dev, int *part, void *data) { - static DECLARE_MUTEX(disks_sem); + static DEFINE_MUTEX(disks_mutex); mddev_t *mddev = mddev_find(dev); struct gendisk *disk; int partitioned = (MAJOR(dev) != MD_MAJOR); @@ -2510,15 +2511,15 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) if (!mddev) return NULL; - down(&disks_sem); + mutex_lock(&disks_mutex); if (mddev->gendisk) { - up(&disks_sem); + mutex_unlock(&disks_mutex); mddev_put(mddev); return NULL; } disk = alloc_disk(1 << shift); if (!disk) { - up(&disks_sem); + mutex_unlock(&disks_mutex); mddev_put(mddev); return NULL; } @@ -2536,7 +2537,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) disk->queue = mddev->queue; add_disk(disk); mddev->gendisk = disk; - up(&disks_sem); + mutex_unlock(&disks_mutex); mddev->kobj.parent = &disk->kobj; mddev->kobj.k_name = NULL; snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md"); -- cgit v1.2.3 From df5b89b323b922f56650b4b4d7c41899b937cf19 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:20 -0800 Subject: [PATCH] md: Convert reconfig_sem to reconfig_mutex ... 
being careful that mutex_trylock is inverted wrt down_trylock Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 14 +++++++------- include/linux/raid/md_k.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index c9c9c096ad80..039e071c1007 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -255,7 +255,7 @@ static mddev_t * mddev_find(dev_t unit) else new->md_minor = MINOR(unit) >> MdpMinorShift; - init_MUTEX(&new->reconfig_sem); + mutex_init(&new->reconfig_mutex); INIT_LIST_HEAD(&new->disks); INIT_LIST_HEAD(&new->all_mddevs); init_timer(&new->safemode_timer); @@ -277,22 +277,22 @@ static mddev_t * mddev_find(dev_t unit) static inline int mddev_lock(mddev_t * mddev) { - return down_interruptible(&mddev->reconfig_sem); + return mutex_lock_interruptible(&mddev->reconfig_mutex); } static inline void mddev_lock_uninterruptible(mddev_t * mddev) { - down(&mddev->reconfig_sem); + mutex_lock(&mddev->reconfig_mutex); } static inline int mddev_trylock(mddev_t * mddev) { - return down_trylock(&mddev->reconfig_sem); + return mutex_trylock(&mddev->reconfig_mutex); } static inline void mddev_unlock(mddev_t * mddev) { - up(&mddev->reconfig_sem); + mutex_unlock(&mddev->reconfig_mutex); md_wakeup_thread(mddev->thread); } @@ -4893,7 +4893,7 @@ void md_check_recovery(mddev_t *mddev) )) return; - if (mddev_trylock(mddev)==0) { + if (mddev_trylock(mddev)) { int spares =0; spin_lock_irq(&mddev->write_lock); @@ -5029,7 +5029,7 @@ static int md_notify_reboot(struct notifier_block *this, printk(KERN_INFO "md: stopping all md devices.\n"); ITERATE_MDDEV(mddev,tmp) - if (mddev_trylock(mddev)==0) + if (mddev_trylock(mddev)) do_md_stop (mddev, 1); /* * certain more exotic SCSI devices are known to be diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index c0d3097846a7..e2df61f5b09a 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -185,7 +185,7 @@ struct mddev_s unsigned long recovery; int in_sync; /* know to not need resync */ - struct semaphore reconfig_sem; + struct mutex reconfig_mutex; atomic_t active; int changed; /* true if we might need to reread partition info */ -- cgit v1.2.3 From 6b1117d50543e2484bee037ec2589fe52492a7e7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 31 Mar 2006 02:31:57 -0800 Subject: [PATCH] md: Don't clear bits in bitmap when writing to one device fails during recovery Currently a device failure during recovery leaves bits set in the bitmap. This normally isn't a problem as the offending device will be rejected because of errors. However if device re-adding is being used with non-persistent bitmaps, this can be a problem. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid1.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3cb0872a845d..9b374c91db66 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1135,8 +1135,19 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) mirror = i; break; } - if (!uptodate) + if (!uptodate) { + int sync_blocks = 0; + sector_t s = r1_bio->sector; + long sectors_to_go = r1_bio->sectors; + /* make sure these bits don't get cleared. 
*/ + do { + bitmap_end_sync(mddev->bitmap, r1_bio->sector, + &sync_blocks, 1); + s += sync_blocks; + sectors_to_go -= sync_blocks; + } while (sectors_to_go > 0); md_error(mddev, conf->mirrors[mirror].rdev); + } update_head_pos(mirror, r1_bio); -- cgit v1.2.3 From 926ce2d8a7d446c720faec9d8c5105eeb04bcf7a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 31 Mar 2006 02:32:02 -0800 Subject: [PATCH] md: Remove some code that can sleep from under a spinlock And remove the comments that were added in place of a fix, too. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 039e071c1007..1ed5152db450 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -215,13 +215,11 @@ static void mddev_put(mddev_t *mddev) return; if (!mddev->raid_disks && list_empty(&mddev->disks)) { list_del(&mddev->all_mddevs); - /* that blocks */ + spin_unlock(&all_mddevs_lock); blk_cleanup_queue(mddev->queue); - /* that also blocks */ kobject_unregister(&mddev->kobj); - /* result blows... */ - } - spin_unlock(&all_mddevs_lock); + } else + spin_unlock(&all_mddevs_lock); } static mddev_t * mddev_find(dev_t unit) -- cgit v1.2.3 From ec350a7fc16af45e143085594a6c9a18d7a0cba3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 31 Mar 2006 02:32:06 -0800 Subject: [PATCH] md: Raid-6 did not create sysfs entries for stripe cache Signed-off-by: Brad Campbell Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid6main.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index 6df4930fddec..ab64b37e4996 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -2151,6 +2151,8 @@ static int run(mddev_t *mddev) } /* Ok, everything is just fine now */ + sysfs_create_group(&mddev->kobj, &raid6_attrs_group); + mddev->array_size = mddev->size * (mddev->raid_disks - 2); mddev->queue->unplug_fn = raid6_unplug_device; -- cgit v1.2.3
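(ed: the rule behind the mddev_put() fix above, demonstrated as a user-space pthread toy; none of this is md code, it just shows "decide and unlink under the spinlock, call anything that may block only after dropping it":)

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	static pthread_spinlock_t table_lock;

	struct obj { int unused; };

	/* Stands in for blk_cleanup_queue()/kobject_unregister(): may block. */
	static void blocking_teardown(struct obj *o)
	{
		usleep(1000);
		free(o);
	}

	static void put_obj(struct obj *o)
	{
		pthread_spin_lock(&table_lock);
		if (o->unused) {
			/* list unlinking would happen here, under the lock */
			pthread_spin_unlock(&table_lock);
			blocking_teardown(o);	/* safe: lock already dropped */
		} else
			pthread_spin_unlock(&table_lock);
	}

	int main(void)
	{
		struct obj *o;

		pthread_spin_init(&table_lock, PTHREAD_PROCESS_PRIVATE);
		o = malloc(sizeof(*o));
		o->unused = 1;
		put_obj(o);
		puts("done");
		return 0;
	}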