summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig26
-rw-r--r--drivers/md/bitmap.c18
-rw-r--r--drivers/md/dm-crypt.c31
-rw-r--r--drivers/md/dm-hw-handler.c3
-rw-r--r--drivers/md/dm-io.c13
-rw-r--r--drivers/md/dm-ioctl.c100
-rw-r--r--drivers/md/dm-linear.c8
-rw-r--r--drivers/md/dm-mpath.c3
-rw-r--r--drivers/md/dm-path-selector.c3
-rw-r--r--drivers/md/dm-raid1.c43
-rw-r--r--drivers/md/dm-snap.c412
-rw-r--r--drivers/md/dm-stripe.c19
-rw-r--r--drivers/md/dm-table.c59
-rw-r--r--drivers/md/dm.c154
-rw-r--r--drivers/md/dm.h23
-rw-r--r--drivers/md/kcopyd.c47
-rw-r--r--drivers/md/md.c242
-rw-r--r--drivers/md/multipath.c17
-rw-r--r--drivers/md/raid1.c40
-rw-r--r--drivers/md/raid5.c719
-rw-r--r--drivers/md/raid6main.c16
21 files changed, 1514 insertions, 482 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index ac43f98062fd..fd2aae150ccc 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -127,6 +127,32 @@ config MD_RAID5
If unsure, say Y.
+config MD_RAID5_RESHAPE
+ bool "Support adding drives to a raid-5 array (experimental)"
+ depends on MD_RAID5 && EXPERIMENTAL
+ ---help---
+ A RAID-5 set can be expanded by adding extra drives. This
+ requires "restriping" the array which means (almost) every
+ block must be written to a different place.
+
+ This option allows such restriping to be done while the array
+ is online. However it is still EXPERIMENTAL code. It should
+ work, but please be sure that you have backups.
+
+ You will need a version of mdadm newer than 2.3.1. During the
+ early stage of reshape there is a critical section where live data
+ is being over-written. A crash during this time needs extra care
+ for recovery. The newer mdadm takes a copy of the data in the
+ critical section and will restore it, if necessary, after a crash.
+
+ The mdadm usage is e.g.
+ mdadm --grow /dev/md1 --raid-disks=6
+ to grow '/dev/md1' to having 6 disks.
+
+ Note: The array can only be expanded, not contracted.
+ There should be enough spares already present to make the new
+ array workable.
+
config MD_RAID6
tristate "RAID-6 mode"
depends on BLK_DEV_MD
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index eae4473eadde..f8ffaee20ff8 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -89,16 +89,6 @@ int bitmap_active(struct bitmap *bitmap)
}
#define WRITE_POOL_SIZE 256
-/* mempool for queueing pending writes on the bitmap file */
-static void *write_pool_alloc(gfp_t gfp_flags, void *data)
-{
- return kmalloc(sizeof(struct page_list), gfp_flags);
-}
-
-static void write_pool_free(void *ptr, void *data)
-{
- kfree(ptr);
-}
/*
* just a placeholder - calls kmalloc for bitmap pages
@@ -556,7 +546,7 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
unsigned long flags;
spin_lock_irqsave(&bitmap->lock, flags);
- if (!bitmap || !bitmap->sb_page) { /* can't set the state */
+ if (!bitmap->sb_page) { /* can't set the state */
spin_unlock_irqrestore(&bitmap->lock, flags);
return;
}
@@ -1309,7 +1299,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
case 1:
*bmc = 2;
}
- if ((*bmc & COUNTER_MAX) == COUNTER_MAX) BUG();
+ BUG_ON((*bmc & COUNTER_MAX) == COUNTER_MAX);
(*bmc)++;
spin_unlock_irq(&bitmap->lock);
@@ -1564,8 +1554,8 @@ int bitmap_create(mddev_t *mddev)
spin_lock_init(&bitmap->write_lock);
INIT_LIST_HEAD(&bitmap->complete_pages);
init_waitqueue_head(&bitmap->write_wait);
- bitmap->write_pool = mempool_create(WRITE_POOL_SIZE, write_pool_alloc,
- write_pool_free, NULL);
+ bitmap->write_pool = mempool_create_kmalloc_pool(WRITE_POOL_SIZE,
+ sizeof(struct page_list));
err = -ENOMEM;
if (!bitmap->write_pool)
goto error;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index e7a650f9ca07..61a590bb6241 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -94,20 +94,6 @@ struct crypt_config {
static kmem_cache_t *_crypt_io_pool;
/*
- * Mempool alloc and free functions for the page
- */
-static void *mempool_alloc_page(gfp_t gfp_mask, void *data)
-{
- return alloc_page(gfp_mask);
-}
-
-static void mempool_free_page(void *page, void *data)
-{
- __free_page(page);
-}
-
-
-/*
* Different IV generation algorithms:
*
* plain: the initial vector is the 32-bit low-endian version of the sector
@@ -532,6 +518,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
char *ivopts;
unsigned int crypto_flags;
unsigned int key_size;
+ unsigned long long tmpll;
if (argc != 5) {
ti->error = PFX "Not enough arguments";
@@ -630,15 +617,13 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
}
- cc->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
- mempool_free_slab, _crypt_io_pool);
+ cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool);
if (!cc->io_pool) {
ti->error = PFX "Cannot allocate crypt io mempool";
goto bad3;
}
- cc->page_pool = mempool_create(MIN_POOL_PAGES, mempool_alloc_page,
- mempool_free_page, NULL);
+ cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
if (!cc->page_pool) {
ti->error = PFX "Cannot allocate page mempool";
goto bad4;
@@ -649,15 +634,17 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad5;
}
- if (sscanf(argv[2], SECTOR_FORMAT, &cc->iv_offset) != 1) {
+ if (sscanf(argv[2], "%llu", &tmpll) != 1) {
ti->error = PFX "Invalid iv_offset sector";
goto bad5;
}
+ cc->iv_offset = tmpll;
- if (sscanf(argv[4], SECTOR_FORMAT, &cc->start) != 1) {
+ if (sscanf(argv[4], "%llu", &tmpll) != 1) {
ti->error = PFX "Invalid device sector";
goto bad5;
}
+ cc->start = tmpll;
if (dm_get_device(ti, argv[3], cc->start, ti->len,
dm_table_get_mode(ti->table), &cc->dev)) {
@@ -901,8 +888,8 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
result[sz++] = '-';
}
- DMEMIT(" " SECTOR_FORMAT " %s " SECTOR_FORMAT,
- cc->iv_offset, cc->dev->name, cc->start);
+ DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset,
+ cc->dev->name, (unsigned long long)cc->start);
break;
}
return 0;
diff --git a/drivers/md/dm-hw-handler.c b/drivers/md/dm-hw-handler.c
index 4cc0010e0156..baafaaba4d4b 100644
--- a/drivers/md/dm-hw-handler.c
+++ b/drivers/md/dm-hw-handler.c
@@ -83,8 +83,7 @@ void dm_put_hw_handler(struct hw_handler_type *hwht)
if (--hwhi->use == 0)
module_put(hwhi->hwht.module);
- if (hwhi->use < 0)
- BUG();
+ BUG_ON(hwhi->use < 0);
out:
up_read(&_hwh_lock);
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 4809b209fbb1..da663d2ff552 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -32,16 +32,6 @@ struct io {
static unsigned _num_ios;
static mempool_t *_io_pool;
-static void *alloc_io(gfp_t gfp_mask, void *pool_data)
-{
- return kmalloc(sizeof(struct io), gfp_mask);
-}
-
-static void free_io(void *element, void *pool_data)
-{
- kfree(element);
-}
-
static unsigned int pages_to_ios(unsigned int pages)
{
return 4 * pages; /* too many ? */
@@ -65,7 +55,8 @@ static int resize_pool(unsigned int new_ios)
} else {
/* create new pool */
- _io_pool = mempool_create(new_ios, alloc_io, free_io, NULL);
+ _io_pool = mempool_create_kmalloc_pool(new_ios,
+ sizeof(struct io));
if (!_io_pool)
return -ENOMEM;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 442e2be6052e..8edd6435414d 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -15,6 +15,7 @@
#include <linux/slab.h>
#include <linux/devfs_fs_kernel.h>
#include <linux/dm-ioctl.h>
+#include <linux/hdreg.h>
#include <asm/uaccess.h>
@@ -244,9 +245,9 @@ static void __hash_remove(struct hash_cell *hc)
dm_table_put(table);
}
- dm_put(hc->md);
if (hc->new_map)
dm_table_put(hc->new_map);
+ dm_put(hc->md);
free_cell(hc);
}
@@ -600,12 +601,22 @@ static int dev_create(struct dm_ioctl *param, size_t param_size)
*/
static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
{
+ struct mapped_device *md;
+ void *mdptr = NULL;
+
if (*param->uuid)
return __get_uuid_cell(param->uuid);
- else if (*param->name)
+
+ if (*param->name)
return __get_name_cell(param->name);
- else
- return dm_get_mdptr(huge_decode_dev(param->dev));
+
+ md = dm_get_md(huge_decode_dev(param->dev));
+ if (md) {
+ mdptr = dm_get_mdptr(md);
+ dm_put(md);
+ }
+
+ return mdptr;
}
static struct mapped_device *find_device(struct dm_ioctl *param)
@@ -690,6 +701,54 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
return dm_hash_rename(param->name, new_name);
}
+static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
+{
+ int r = -EINVAL, x;
+ struct mapped_device *md;
+ struct hd_geometry geometry;
+ unsigned long indata[4];
+ char *geostr = (char *) param + param->data_start;
+
+ md = find_device(param);
+ if (!md)
+ return -ENXIO;
+
+ if (geostr < (char *) (param + 1) ||
+ invalid_str(geostr, (void *) param + param_size)) {
+ DMWARN("Invalid geometry supplied.");
+ goto out;
+ }
+
+ x = sscanf(geostr, "%lu %lu %lu %lu", indata,
+ indata + 1, indata + 2, indata + 3);
+
+ if (x != 4) {
+ DMWARN("Unable to interpret geometry settings.");
+ goto out;
+ }
+
+ if (indata[0] > 65535 || indata[1] > 255 ||
+ indata[2] > 255 || indata[3] > ULONG_MAX) {
+ DMWARN("Geometry exceeds range limits.");
+ goto out;
+ }
+
+ geometry.cylinders = indata[0];
+ geometry.heads = indata[1];
+ geometry.sectors = indata[2];
+ geometry.start = indata[3];
+
+ r = dm_set_geometry(md, &geometry);
+ if (!r)
+ r = __dev_status(md, param);
+
+ param->data_size = 0;
+
+out:
+ dm_put(md);
+ return r;
+}
+
static int do_suspend(struct dm_ioctl *param)
{
int r = 0;
@@ -975,33 +1034,43 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
int r;
struct hash_cell *hc;
struct dm_table *t;
+ struct mapped_device *md;
- r = dm_table_create(&t, get_mode(param), param->target_count);
+ md = find_device(param);
+ if (!md)
+ return -ENXIO;
+
+ r = dm_table_create(&t, get_mode(param), param->target_count, md);
if (r)
- return r;
+ goto out;
r = populate_table(t, param, param_size);
if (r) {
dm_table_put(t);
- return r;
+ goto out;
}
down_write(&_hash_lock);
- hc = __find_device_hash_cell(param);
- if (!hc) {
- DMWARN("device doesn't appear to be in the dev hash table.");
- up_write(&_hash_lock);
+ hc = dm_get_mdptr(md);
+ if (!hc || hc->md != md) {
+ DMWARN("device has been removed from the dev hash table.");
dm_table_put(t);
- return -ENXIO;
+ up_write(&_hash_lock);
+ r = -ENXIO;
+ goto out;
}
if (hc->new_map)
dm_table_put(hc->new_map);
hc->new_map = t;
+ up_write(&_hash_lock);
+
param->flags |= DM_INACTIVE_PRESENT_FLAG;
+ r = __dev_status(md, param);
+
+out:
+ dm_put(md);
- r = __dev_status(hc->md, param);
- up_write(&_hash_lock);
return r;
}
@@ -1214,7 +1283,8 @@ static ioctl_fn lookup_ioctl(unsigned int cmd)
{DM_LIST_VERSIONS_CMD, list_versions},
- {DM_TARGET_MSG_CMD, target_message}
+ {DM_TARGET_MSG_CMD, target_message},
+ {DM_DEV_SET_GEOMETRY_CMD, dev_set_geometry}
};
return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn;
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 6a2cd5dc8a63..daf586c0898d 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -26,6 +26,7 @@ struct linear_c {
static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct linear_c *lc;
+ unsigned long long tmp;
if (argc != 2) {
ti->error = "dm-linear: Invalid argument count";
@@ -38,10 +39,11 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
return -ENOMEM;
}
- if (sscanf(argv[1], SECTOR_FORMAT, &lc->start) != 1) {
+ if (sscanf(argv[1], "%llu", &tmp) != 1) {
ti->error = "dm-linear: Invalid device sector";
goto bad;
}
+ lc->start = tmp;
if (dm_get_device(ti, argv[0], lc->start, ti->len,
dm_table_get_mode(ti->table), &lc->dev)) {
@@ -87,8 +89,8 @@ static int linear_status(struct dm_target *ti, status_type_t type,
break;
case STATUSTYPE_TABLE:
- snprintf(result, maxlen, "%s " SECTOR_FORMAT, lc->dev->name,
- lc->start);
+ snprintf(result, maxlen, "%s %llu", lc->dev->name,
+ (unsigned long long)lc->start);
break;
}
return 0;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index f72a82fb9434..1816f30678ed 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -179,8 +179,7 @@ static struct multipath *alloc_multipath(void)
m->queue_io = 1;
INIT_WORK(&m->process_queued_ios, process_queued_ios, m);
INIT_WORK(&m->trigger_event, trigger_event, m);
- m->mpio_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
- mempool_free_slab, _mpio_cache);
+ m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
if (!m->mpio_pool) {
kfree(m);
return NULL;
diff --git a/drivers/md/dm-path-selector.c b/drivers/md/dm-path-selector.c
index a28c1c2b4ef5..f10a0c89b3f4 100644
--- a/drivers/md/dm-path-selector.c
+++ b/drivers/md/dm-path-selector.c
@@ -86,8 +86,7 @@ void dm_put_path_selector(struct path_selector_type *pst)
if (--psi->use == 0)
module_put(psi->pst.module);
- if (psi->use < 0)
- BUG();
+ BUG_ON(psi->use < 0);
out:
up_read(&_ps_lock);
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 6cfa8d435d55..d12cf3e5e076 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -122,16 +122,6 @@ static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
/* FIXME move this */
static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw);
-static void *region_alloc(gfp_t gfp_mask, void *pool_data)
-{
- return kmalloc(sizeof(struct region), gfp_mask);
-}
-
-static void region_free(void *element, void *pool_data)
-{
- kfree(element);
-}
-
#define MIN_REGIONS 64
#define MAX_RECOVERY 1
static int rh_init(struct region_hash *rh, struct mirror_set *ms,
@@ -173,8 +163,8 @@ static int rh_init(struct region_hash *rh, struct mirror_set *ms,
INIT_LIST_HEAD(&rh->quiesced_regions);
INIT_LIST_HEAD(&rh->recovered_regions);
- rh->region_pool = mempool_create(MIN_REGIONS, region_alloc,
- region_free, NULL);
+ rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
+ sizeof(struct region));
if (!rh->region_pool) {
vfree(rh->buckets);
rh->buckets = NULL;
@@ -412,9 +402,21 @@ static void rh_dec(struct region_hash *rh, region_t region)
spin_lock_irqsave(&rh->region_lock, flags);
if (atomic_dec_and_test(&reg->pending)) {
+ /*
+ * There is no pending I/O for this region.
+ * We can move the region to corresponding list for next action.
+ * At this point, the region is not yet connected to any list.
+ *
+ * If the state is RH_NOSYNC, the region should be kept off
+ * from clean list.
+ * The hash entry for RH_NOSYNC will remain in memory
+ * until the region is recovered or the map is reloaded.
+ */
+
+ /* do nothing for RH_NOSYNC */
if (reg->state == RH_RECOVERING) {
list_add_tail(&reg->list, &rh->quiesced_regions);
- } else {
+ } else if (reg->state == RH_DIRTY) {
reg->state = RH_CLEAN;
list_add(&reg->list, &rh->clean_regions);
}
@@ -932,9 +934,9 @@ static inline int _check_region_size(struct dm_target *ti, uint32_t size)
static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
unsigned int mirror, char **argv)
{
- sector_t offset;
+ unsigned long long offset;
- if (sscanf(argv[1], SECTOR_FORMAT, &offset) != 1) {
+ if (sscanf(argv[1], "%llu", &offset) != 1) {
ti->error = "dm-mirror: Invalid offset";
return -EINVAL;
}
@@ -1201,16 +1203,17 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
for (m = 0; m < ms->nr_mirrors; m++)
DMEMIT("%s ", ms->mirror[m].dev->name);
- DMEMIT(SECTOR_FORMAT "/" SECTOR_FORMAT,
- ms->rh.log->type->get_sync_count(ms->rh.log),
- ms->nr_regions);
+ DMEMIT("%llu/%llu",
+ (unsigned long long)ms->rh.log->type->
+ get_sync_count(ms->rh.log),
+ (unsigned long long)ms->nr_regions);
break;
case STATUSTYPE_TABLE:
DMEMIT("%d ", ms->nr_mirrors);
for (m = 0; m < ms->nr_mirrors; m++)
- DMEMIT("%s " SECTOR_FORMAT " ",
- ms->mirror[m].dev->name, ms->mirror[m].offset);
+ DMEMIT("%s %llu ", ms->mirror[m].dev->name,
+ (unsigned long long)ms->mirror[m].offset);
}
return 0;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index f3759dd7828e..08312b46463a 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -49,11 +49,26 @@ struct pending_exception {
struct bio_list snapshot_bios;
/*
- * Other pending_exceptions that are processing this
- * chunk. When this list is empty, we know we can
- * complete the origins.
+ * Short-term queue of pending exceptions prior to submission.
*/
- struct list_head siblings;
+ struct list_head list;
+
+ /*
+ * The primary pending_exception is the one that holds
+ * the sibling_count and the list of origin_bios for a
+ * group of pending_exceptions. It is always last to get freed.
+ * These fields get set up when writing to the origin.
+ */
+ struct pending_exception *primary_pe;
+
+ /*
+ * Number of pending_exceptions processing this chunk.
+ * When this drops to zero we must complete the origin bios.
+ * If incrementing or decrementing this, hold pe->snap->lock for
+ * the sibling concerned and not pe->primary_pe->snap->lock unless
+ * they are the same.
+ */
+ atomic_t sibling_count;
/* Pointer back to snapshot context */
struct dm_snapshot *snap;
@@ -377,6 +392,8 @@ static void read_snapshot_metadata(struct dm_snapshot *s)
down_write(&s->lock);
s->valid = 0;
up_write(&s->lock);
+
+ dm_table_event(s->table);
}
}
@@ -542,8 +559,12 @@ static void snapshot_dtr(struct dm_target *ti)
{
struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
+ /* Prevent further origin writes from using this snapshot. */
+ /* After this returns there can be no new kcopyd jobs. */
unregister_snapshot(s);
+ kcopyd_client_destroy(s->kcopyd_client);
+
exit_exception_table(&s->pending, pending_cache);
exit_exception_table(&s->complete, exception_cache);
@@ -552,7 +573,7 @@ static void snapshot_dtr(struct dm_target *ti)
dm_put_device(ti, s->origin);
dm_put_device(ti, s->cow);
- kcopyd_client_destroy(s->kcopyd_client);
+
kfree(s);
}
@@ -586,78 +607,117 @@ static void error_bios(struct bio *bio)
}
}
+static inline void error_snapshot_bios(struct pending_exception *pe)
+{
+ error_bios(bio_list_get(&pe->snapshot_bios));
+}
+
static struct bio *__flush_bios(struct pending_exception *pe)
{
- struct pending_exception *sibling;
+ /*
+ * If this pe is involved in a write to the origin and
+ * it is the last sibling to complete then release
+ * the bios for the original write to the origin.
+ */
+
+ if (pe->primary_pe &&
+ atomic_dec_and_test(&pe->primary_pe->sibling_count))
+ return bio_list_get(&pe->primary_pe->origin_bios);
+
+ return NULL;
+}
+
+static void __invalidate_snapshot(struct dm_snapshot *s,
+ struct pending_exception *pe, int err)
+{
+ if (!s->valid)
+ return;
- if (list_empty(&pe->siblings))
- return bio_list_get(&pe->origin_bios);
+ if (err == -EIO)
+ DMERR("Invalidating snapshot: Error reading/writing.");
+ else if (err == -ENOMEM)
+ DMERR("Invalidating snapshot: Unable to allocate exception.");
- sibling = list_entry(pe->siblings.next,
- struct pending_exception, siblings);
+ if (pe)
+ remove_exception(&pe->e);
- list_del(&pe->siblings);
+ if (s->store.drop_snapshot)
+ s->store.drop_snapshot(&s->store);
- /* This is fine as long as kcopyd is single-threaded. If kcopyd
- * becomes multi-threaded, we'll need some locking here.
- */
- bio_list_merge(&sibling->origin_bios, &pe->origin_bios);
+ s->valid = 0;
- return NULL;
+ dm_table_event(s->table);
}
static void pending_complete(struct pending_exception *pe, int success)
{
struct exception *e;
+ struct pending_exception *primary_pe;
struct dm_snapshot *s = pe->snap;
struct bio *flush = NULL;
- if (success) {
- e = alloc_exception();
- if (!e) {
- DMWARN("Unable to allocate exception.");
- down_write(&s->lock);
- s->store.drop_snapshot(&s->store);
- s->valid = 0;
- flush = __flush_bios(pe);
- up_write(&s->lock);
-
- error_bios(bio_list_get(&pe->snapshot_bios));
- goto out;
- }
- *e = pe->e;
-
- /*
- * Add a proper exception, and remove the
- * in-flight exception from the list.
- */
+ if (!success) {
+ /* Read/write error - snapshot is unusable */
down_write(&s->lock);
- insert_exception(&s->complete, e);
- remove_exception(&pe->e);
+ __invalidate_snapshot(s, pe, -EIO);
flush = __flush_bios(pe);
-
- /* Submit any pending write bios */
up_write(&s->lock);
- flush_bios(bio_list_get(&pe->snapshot_bios));
- } else {
- /* Read/write error - snapshot is unusable */
+ error_snapshot_bios(pe);
+ goto out;
+ }
+
+ e = alloc_exception();
+ if (!e) {
down_write(&s->lock);
- if (s->valid)
- DMERR("Error reading/writing snapshot");
- s->store.drop_snapshot(&s->store);
- s->valid = 0;
- remove_exception(&pe->e);
+ __invalidate_snapshot(s, pe, -ENOMEM);
flush = __flush_bios(pe);
up_write(&s->lock);
- error_bios(bio_list_get(&pe->snapshot_bios));
+ error_snapshot_bios(pe);
+ goto out;
+ }
+ *e = pe->e;
- dm_table_event(s->table);
+ /*
+ * Add a proper exception, and remove the
+ * in-flight exception from the list.
+ */
+ down_write(&s->lock);
+ if (!s->valid) {
+ flush = __flush_bios(pe);
+ up_write(&s->lock);
+
+ free_exception(e);
+
+ error_snapshot_bios(pe);
+ goto out;
}
+ insert_exception(&s->complete, e);
+ remove_exception(&pe->e);
+ flush = __flush_bios(pe);
+
+ up_write(&s->lock);
+
+ /* Submit any pending write bios */
+ flush_bios(bio_list_get(&pe->snapshot_bios));
+
out:
- free_pending_exception(pe);
+ primary_pe = pe->primary_pe;
+
+ /*
+ * Free the pe if it's not linked to an origin write or if
+ * it's not itself a primary pe.
+ */
+ if (!primary_pe || primary_pe != pe)
+ free_pending_exception(pe);
+
+ /*
+ * Free the primary pe if nothing references it.
+ */
+ if (primary_pe && !atomic_read(&primary_pe->sibling_count))
+ free_pending_exception(primary_pe);
if (flush)
flush_bios(flush);
@@ -734,38 +794,45 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio)
if (e) {
/* cast the exception to a pending exception */
pe = container_of(e, struct pending_exception, e);
+ goto out;
+ }
- } else {
- /*
- * Create a new pending exception, we don't want
- * to hold the lock while we do this.
- */
- up_write(&s->lock);
- pe = alloc_pending_exception();
- down_write(&s->lock);
+ /*
+ * Create a new pending exception, we don't want
+ * to hold the lock while we do this.
+ */
+ up_write(&s->lock);
+ pe = alloc_pending_exception();
+ down_write(&s->lock);
- e = lookup_exception(&s->pending, chunk);
- if (e) {
- free_pending_exception(pe);
- pe = container_of(e, struct pending_exception, e);
- } else {
- pe->e.old_chunk = chunk;
- bio_list_init(&pe->origin_bios);
- bio_list_init(&pe->snapshot_bios);
- INIT_LIST_HEAD(&pe->siblings);
- pe->snap = s;
- pe->started = 0;
-
- if (s->store.prepare_exception(&s->store, &pe->e)) {
- free_pending_exception(pe);
- s->valid = 0;
- return NULL;
- }
+ if (!s->valid) {
+ free_pending_exception(pe);
+ return NULL;
+ }
- insert_exception(&s->pending, &pe->e);
- }
+ e = lookup_exception(&s->pending, chunk);
+ if (e) {
+ free_pending_exception(pe);
+ pe = container_of(e, struct pending_exception, e);
+ goto out;
+ }
+
+ pe->e.old_chunk = chunk;
+ bio_list_init(&pe->origin_bios);
+ bio_list_init(&pe->snapshot_bios);
+ pe->primary_pe = NULL;
+ atomic_set(&pe->sibling_count, 1);
+ pe->snap = s;
+ pe->started = 0;
+
+ if (s->store.prepare_exception(&s->store, &pe->e)) {
+ free_pending_exception(pe);
+ return NULL;
}
+ insert_exception(&s->pending, &pe->e);
+
+ out:
return pe;
}
@@ -782,13 +849,15 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
{
struct exception *e;
struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
+ int copy_needed = 0;
int r = 1;
chunk_t chunk;
- struct pending_exception *pe;
+ struct pending_exception *pe = NULL;
chunk = sector_to_chunk(s, bio->bi_sector);
/* Full snapshots are not usable */
+ /* To get here the table must be live so s->active is always set. */
if (!s->valid)
return -EIO;
@@ -806,36 +875,41 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
* to copy an exception */
down_write(&s->lock);
+ if (!s->valid) {
+ r = -EIO;
+ goto out_unlock;
+ }
+
/* If the block is already remapped - use that, else remap it */
e = lookup_exception(&s->complete, chunk);
if (e) {
remap_exception(s, e, bio);
- up_write(&s->lock);
-
- } else {
- pe = __find_pending_exception(s, bio);
-
- if (!pe) {
- if (s->store.drop_snapshot)
- s->store.drop_snapshot(&s->store);
- s->valid = 0;
- r = -EIO;
- up_write(&s->lock);
- } else {
- remap_exception(s, &pe->e, bio);
- bio_list_add(&pe->snapshot_bios, bio);
-
- if (!pe->started) {
- /* this is protected by snap->lock */
- pe->started = 1;
- up_write(&s->lock);
- start_copy(pe);
- } else
- up_write(&s->lock);
- r = 0;
- }
+ goto out_unlock;
+ }
+
+ pe = __find_pending_exception(s, bio);
+ if (!pe) {
+ __invalidate_snapshot(s, pe, -ENOMEM);
+ r = -EIO;
+ goto out_unlock;
+ }
+
+ remap_exception(s, &pe->e, bio);
+ bio_list_add(&pe->snapshot_bios, bio);
+
+ if (!pe->started) {
+ /* this is protected by snap->lock */
+ pe->started = 1;
+ copy_needed = 1;
}
+ r = 0;
+
+ out_unlock:
+ up_write(&s->lock);
+
+ if (copy_needed)
+ start_copy(pe);
} else {
/*
* FIXME: this read path scares me because we
@@ -847,6 +921,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
/* Do reads */
down_read(&s->lock);
+ if (!s->valid) {
+ up_read(&s->lock);
+ return -EIO;
+ }
+
/* See if it it has been remapped */
e = lookup_exception(&s->complete, chunk);
if (e)
@@ -884,9 +963,9 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
snap->store.fraction_full(&snap->store,
&numerator,
&denominator);
- snprintf(result, maxlen,
- SECTOR_FORMAT "/" SECTOR_FORMAT,
- numerator, denominator);
+ snprintf(result, maxlen, "%llu/%llu",
+ (unsigned long long)numerator,
+ (unsigned long long)denominator);
}
else
snprintf(result, maxlen, "Unknown");
@@ -899,9 +978,10 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
* to make private copies if the output is to
* make sense.
*/
- snprintf(result, maxlen, "%s %s %c " SECTOR_FORMAT,
+ snprintf(result, maxlen, "%s %s %c %llu",
snap->origin->name, snap->cow->name,
- snap->type, snap->chunk_size);
+ snap->type,
+ (unsigned long long)snap->chunk_size);
break;
}
@@ -911,40 +991,27 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
/*-----------------------------------------------------------------
* Origin methods
*---------------------------------------------------------------*/
-static void list_merge(struct list_head *l1, struct list_head *l2)
-{
- struct list_head *l1_n, *l2_p;
-
- l1_n = l1->next;
- l2_p = l2->prev;
-
- l1->next = l2;
- l2->prev = l1;
-
- l2_p->next = l1_n;
- l1_n->prev = l2_p;
-}
-
static int __origin_write(struct list_head *snapshots, struct bio *bio)
{
- int r = 1, first = 1;
+ int r = 1, first = 0;
struct dm_snapshot *snap;
struct exception *e;
- struct pending_exception *pe, *last = NULL;
+ struct pending_exception *pe, *next_pe, *primary_pe = NULL;
chunk_t chunk;
+ LIST_HEAD(pe_queue);
/* Do all the snapshots on this origin */
list_for_each_entry (snap, snapshots, list) {
+ down_write(&snap->lock);
+
/* Only deal with valid and active snapshots */
if (!snap->valid || !snap->active)
- continue;
+ goto next_snapshot;
/* Nothing to do if writing beyond end of snapshot */
if (bio->bi_sector >= dm_table_get_size(snap->table))
- continue;
-
- down_write(&snap->lock);
+ goto next_snapshot;
/*
* Remember, different snapshots can have
@@ -956,49 +1023,75 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
* Check exception table to see if block
* is already remapped in this snapshot
* and trigger an exception if not.
+ *
+ * sibling_count is initialised to 1 so pending_complete()
+ * won't destroy the primary_pe while we're inside this loop.
*/
e = lookup_exception(&snap->complete, chunk);
- if (!e) {
- pe = __find_pending_exception(snap, bio);
- if (!pe) {
- snap->store.drop_snapshot(&snap->store);
- snap->valid = 0;
-
- } else {
- if (last)
- list_merge(&pe->siblings,
- &last->siblings);
-
- last = pe;
- r = 0;
+ if (e)
+ goto next_snapshot;
+
+ pe = __find_pending_exception(snap, bio);
+ if (!pe) {
+ __invalidate_snapshot(snap, pe, ENOMEM);
+ goto next_snapshot;
+ }
+
+ if (!primary_pe) {
+ /*
+ * Either every pe here has same
+ * primary_pe or none has one yet.
+ */
+ if (pe->primary_pe)
+ primary_pe = pe->primary_pe;
+ else {
+ primary_pe = pe;
+ first = 1;
}
+
+ bio_list_add(&primary_pe->origin_bios, bio);
+
+ r = 0;
+ }
+
+ if (!pe->primary_pe) {
+ atomic_inc(&primary_pe->sibling_count);
+ pe->primary_pe = primary_pe;
+ }
+
+ if (!pe->started) {
+ pe->started = 1;
+ list_add_tail(&pe->list, &pe_queue);
}
+ next_snapshot:
up_write(&snap->lock);
}
+ if (!primary_pe)
+ goto out;
+
/*
- * Now that we have a complete pe list we can start the copying.
+ * If this is the first time we're processing this chunk and
+ * sibling_count is now 1 it means all the pending exceptions
+ * got completed while we were in the loop above, so it falls to
+ * us here to remove the primary_pe and submit any origin_bios.
*/
- if (last) {
- pe = last;
- do {
- down_write(&pe->snap->lock);
- if (first)
- bio_list_add(&pe->origin_bios, bio);
- if (!pe->started) {
- pe->started = 1;
- up_write(&pe->snap->lock);
- start_copy(pe);
- } else
- up_write(&pe->snap->lock);
- first = 0;
- pe = list_entry(pe->siblings.next,
- struct pending_exception, siblings);
-
- } while (pe != last);
+
+ if (first && atomic_dec_and_test(&primary_pe->sibling_count)) {
+ flush_bios(bio_list_get(&primary_pe->origin_bios));
+ free_pending_exception(primary_pe);
+ /* If we got here, pe_queue is necessarily empty. */
+ goto out;
}
+ /*
+ * Now that we have a complete pe list we can start the copying.
+ */
+ list_for_each_entry_safe(pe, next_pe, &pe_queue, list)
+ start_copy(pe);
+
+ out:
return r;
}
@@ -1174,8 +1267,7 @@ static int __init dm_snapshot_init(void)
goto bad4;
}
- pending_pool = mempool_create(128, mempool_alloc_slab,
- mempool_free_slab, pending_cache);
+ pending_pool = mempool_create_slab_pool(128, pending_cache);
if (!pending_pool) {
DMERR("Couldn't create pending pool.");
r = -ENOMEM;
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index ab89278a56bf..08328a8f5a3c 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -49,9 +49,9 @@ static inline struct stripe_c *alloc_context(unsigned int stripes)
static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
unsigned int stripe, char **argv)
{
- sector_t start;
+ unsigned long long start;
- if (sscanf(argv[1], SECTOR_FORMAT, &start) != 1)
+ if (sscanf(argv[1], "%llu", &start) != 1)
return -EINVAL;
if (dm_get_device(ti, argv[0], start, sc->stripe_width,
@@ -103,9 +103,15 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
return -EINVAL;
}
+ if (ti->len & (chunk_size - 1)) {
+ ti->error = "dm-stripe: Target length not divisible by "
+ "chunk size";
+ return -EINVAL;
+ }
+
width = ti->len;
if (sector_div(width, stripes)) {
- ti->error = "dm-stripe: Target length not divisable by "
+ ti->error = "dm-stripe: Target length not divisible by "
"number of stripes";
return -EINVAL;
}
@@ -195,10 +201,11 @@ static int stripe_status(struct dm_target *ti,
break;
case STATUSTYPE_TABLE:
- DMEMIT("%d " SECTOR_FORMAT, sc->stripes, sc->chunk_mask + 1);
+ DMEMIT("%d %llu", sc->stripes,
+ (unsigned long long)sc->chunk_mask + 1);
for (i = 0; i < sc->stripes; i++)
- DMEMIT(" %s " SECTOR_FORMAT, sc->stripe[i].dev->name,
- sc->stripe[i].physical_start);
+ DMEMIT(" %s %llu", sc->stripe[i].dev->name,
+ (unsigned long long)sc->stripe[i].physical_start);
break;
}
return 0;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 9b1e2f5ca630..8f56a54cf0ce 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -14,6 +14,7 @@
#include <linux/ctype.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
+#include <linux/mutex.h>
#include <asm/atomic.h>
#define MAX_DEPTH 16
@@ -22,6 +23,7 @@
#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
struct dm_table {
+ struct mapped_device *md;
atomic_t holders;
/* btree table */
@@ -97,6 +99,8 @@ static void combine_restrictions_low(struct io_restrictions *lhs,
lhs->seg_boundary_mask =
min_not_zero(lhs->seg_boundary_mask, rhs->seg_boundary_mask);
+
+ lhs->no_cluster |= rhs->no_cluster;
}
/*
@@ -204,7 +208,8 @@ static int alloc_targets(struct dm_table *t, unsigned int num)
return 0;
}
-int dm_table_create(struct dm_table **result, int mode, unsigned num_targets)
+int dm_table_create(struct dm_table **result, int mode,
+ unsigned num_targets, struct mapped_device *md)
{
struct dm_table *t = kmalloc(sizeof(*t), GFP_KERNEL);
@@ -227,6 +232,7 @@ int dm_table_create(struct dm_table **result, int mode, unsigned num_targets)
}
t->mode = mode;
+ t->md = md;
*result = t;
return 0;
}
@@ -345,20 +351,19 @@ static struct dm_dev *find_device(struct list_head *l, dev_t dev)
/*
* Open a device so we can use it as a map destination.
*/
-static int open_dev(struct dm_dev *d, dev_t dev)
+static int open_dev(struct dm_dev *d, dev_t dev, struct mapped_device *md)
{
static char *_claim_ptr = "I belong to device-mapper";
struct block_device *bdev;
int r;
- if (d->bdev)
- BUG();
+ BUG_ON(d->bdev);
bdev = open_by_devnum(dev, d->mode);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
- r = bd_claim(bdev, _claim_ptr);
+ r = bd_claim_by_disk(bdev, _claim_ptr, dm_disk(md));
if (r)
blkdev_put(bdev);
else
@@ -369,12 +374,12 @@ static int open_dev(struct dm_dev *d, dev_t dev)
/*
* Close a device that we've been using.
*/
-static void close_dev(struct dm_dev *d)
+static void close_dev(struct dm_dev *d, struct mapped_device *md)
{
if (!d->bdev)
return;
- bd_release(d->bdev);
+ bd_release_from_disk(d->bdev, dm_disk(md));
blkdev_put(d->bdev);
d->bdev = NULL;
}
@@ -395,7 +400,7 @@ static int check_device_area(struct dm_dev *dd, sector_t start, sector_t len)
* careful to leave things as they were if we fail to reopen the
* device.
*/
-static int upgrade_mode(struct dm_dev *dd, int new_mode)
+static int upgrade_mode(struct dm_dev *dd, int new_mode, struct mapped_device *md)
{
int r;
struct dm_dev dd_copy;
@@ -405,9 +410,9 @@ static int upgrade_mode(struct dm_dev *dd, int new_mode)
dd->mode |= new_mode;
dd->bdev = NULL;
- r = open_dev(dd, dev);
+ r = open_dev(dd, dev, md);
if (!r)
- close_dev(&dd_copy);
+ close_dev(&dd_copy, md);
else
*dd = dd_copy;
@@ -427,8 +432,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
struct dm_dev *dd;
unsigned int major, minor;
- if (!t)
- BUG();
+ BUG_ON(!t);
if (sscanf(path, "%u:%u", &major, &minor) == 2) {
/* Extract the major/minor numbers */
@@ -450,7 +454,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
dd->mode = mode;
dd->bdev = NULL;
- if ((r = open_dev(dd, dev))) {
+ if ((r = open_dev(dd, dev, t->md))) {
kfree(dd);
return r;
}
@@ -461,7 +465,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
list_add(&dd->list, &t->devices);
} else if (dd->mode != (mode | dd->mode)) {
- r = upgrade_mode(dd, mode);
+ r = upgrade_mode(dd, mode, t->md);
if (r)
return r;
}
@@ -525,6 +529,8 @@ int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
rs->seg_boundary_mask =
min_not_zero(rs->seg_boundary_mask,
q->seg_boundary_mask);
+
+ rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
}
return r;
@@ -536,7 +542,7 @@ int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
void dm_put_device(struct dm_target *ti, struct dm_dev *dd)
{
if (atomic_dec_and_test(&dd->count)) {
- close_dev(dd);
+ close_dev(dd, ti->table->md);
list_del(&dd->list);
kfree(dd);
}
@@ -765,14 +771,14 @@ int dm_table_complete(struct dm_table *t)
return r;
}
-static DECLARE_MUTEX(_event_lock);
+static DEFINE_MUTEX(_event_lock);
void dm_table_event_callback(struct dm_table *t,
void (*fn)(void *), void *context)
{
- down(&_event_lock);
+ mutex_lock(&_event_lock);
t->event_fn = fn;
t->event_context = context;
- up(&_event_lock);
+ mutex_unlock(&_event_lock);
}
void dm_table_event(struct dm_table *t)
@@ -783,10 +789,10 @@ void dm_table_event(struct dm_table *t)
*/
BUG_ON(in_interrupt());
- down(&_event_lock);
+ mutex_lock(&_event_lock);
if (t->event_fn)
t->event_fn(t->event_context);
- up(&_event_lock);
+ mutex_unlock(&_event_lock);
}
sector_t dm_table_get_size(struct dm_table *t)
@@ -834,6 +840,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
q->hardsect_size = t->limits.hardsect_size;
q->max_segment_size = t->limits.max_segment_size;
q->seg_boundary_mask = t->limits.seg_boundary_mask;
+ if (t->limits.no_cluster)
+ q->queue_flags &= ~(1 << QUEUE_FLAG_CLUSTER);
+ else
+ q->queue_flags |= (1 << QUEUE_FLAG_CLUSTER);
+
}
unsigned int dm_table_get_num_targets(struct dm_table *t)
@@ -945,12 +956,20 @@ int dm_table_flush_all(struct dm_table *t)
return ret;
}
+struct mapped_device *dm_table_get_md(struct dm_table *t)
+{
+ dm_get(t->md);
+
+ return t->md;
+}
+
EXPORT_SYMBOL(dm_vcalloc);
EXPORT_SYMBOL(dm_get_device);
EXPORT_SYMBOL(dm_put_device);
EXPORT_SYMBOL(dm_table_event);
EXPORT_SYMBOL(dm_table_get_size);
EXPORT_SYMBOL(dm_table_get_mode);
+EXPORT_SYMBOL(dm_table_get_md);
EXPORT_SYMBOL(dm_table_put);
EXPORT_SYMBOL(dm_table_get);
EXPORT_SYMBOL(dm_table_unplug_all);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 745ca1f67b14..4d710b7a133b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -10,6 +10,7 @@
#include <linux/init.h>
#include <linux/module.h>
+#include <linux/mutex.h>
#include <linux/moduleparam.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
@@ -17,6 +18,8 @@
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
+#include <linux/hdreg.h>
+#include <linux/blktrace_api.h>
static const char *_name = DM_NAME;
@@ -68,6 +71,7 @@ struct mapped_device {
request_queue_t *queue;
struct gendisk *disk;
+ char name[16];
void *interface_ptr;
@@ -100,6 +104,9 @@ struct mapped_device {
*/
struct super_block *frozen_sb;
struct block_device *suspended_bdev;
+
+ /* forced geometry settings */
+ struct hd_geometry geometry;
};
#define MIN_IOS 256
@@ -225,6 +232,13 @@ static int dm_blk_close(struct inode *inode, struct file *file)
return 0;
}
+static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ struct mapped_device *md = bdev->bd_disk->private_data;
+
+ return dm_get_geometry(md, geo);
+}
+
static inline struct dm_io *alloc_io(struct mapped_device *md)
{
return mempool_alloc(md->io_pool, GFP_NOIO);
@@ -311,6 +325,33 @@ struct dm_table *dm_get_table(struct mapped_device *md)
return t;
}
+/*
+ * Get the geometry associated with a dm device
+ */
+int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
+{
+ *geo = md->geometry;
+
+ return 0;
+}
+
+/*
+ * Set the geometry of a device.
+ */
+int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
+{
+ sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
+
+ if (geo->start > sz) {
+ DMWARN("Start sector is beyond the geometry limits.");
+ return -EINVAL;
+ }
+
+ md->geometry = *geo;
+
+ return 0;
+}
+
/*-----------------------------------------------------------------
* CRUD START:
* A more elegant soln is in the works that uses the queue
@@ -334,6 +375,8 @@ static void dec_pending(struct dm_io *io, int error)
/* nudge anyone waiting on suspend queue */
wake_up(&io->md->wait);
+ blk_add_trace_bio(io->md->queue, io->bio, BLK_TA_COMPLETE);
+
bio_endio(io->bio, io->bio->bi_size, io->error);
free_io(io->md, io);
}
@@ -392,6 +435,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
struct target_io *tio)
{
int r;
+ sector_t sector;
/*
* Sanity checks.
@@ -407,10 +451,17 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
* this io.
*/
atomic_inc(&tio->io->io_count);
+ sector = clone->bi_sector;
r = ti->type->map(ti, clone, &tio->info);
- if (r > 0)
+ if (r > 0) {
/* the bio has been remapped so dispatch it */
+
+ blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
+ tio->io->bio->bi_bdev->bd_dev, sector,
+ clone->bi_sector);
+
generic_make_request(clone);
+ }
else if (r < 0) {
/* error the io and bail out */
@@ -533,30 +584,35 @@ static void __clone_and_map(struct clone_info *ci)
} else {
/*
- * Create two copy bios to deal with io that has
- * been split across a target.
+ * Handle a bvec that must be split between two or more targets.
*/
struct bio_vec *bv = bio->bi_io_vec + ci->idx;
+ sector_t remaining = to_sector(bv->bv_len);
+ unsigned int offset = 0;
- clone = split_bvec(bio, ci->sector, ci->idx,
- bv->bv_offset, max);
- __map_bio(ti, clone, tio);
+ do {
+ if (offset) {
+ ti = dm_table_find_target(ci->map, ci->sector);
+ max = max_io_len(ci->md, ci->sector, ti);
- ci->sector += max;
- ci->sector_count -= max;
- ti = dm_table_find_target(ci->map, ci->sector);
-
- len = to_sector(bv->bv_len) - max;
- clone = split_bvec(bio, ci->sector, ci->idx,
- bv->bv_offset + to_bytes(max), len);
- tio = alloc_tio(ci->md);
- tio->io = ci->io;
- tio->ti = ti;
- memset(&tio->info, 0, sizeof(tio->info));
- __map_bio(ti, clone, tio);
+ tio = alloc_tio(ci->md);
+ tio->io = ci->io;
+ tio->ti = ti;
+ memset(&tio->info, 0, sizeof(tio->info));
+ }
+
+ len = min(remaining, max);
+
+ clone = split_bvec(bio, ci->sector, ci->idx,
+ bv->bv_offset + offset, len);
+
+ __map_bio(ti, clone, tio);
+
+ ci->sector += len;
+ ci->sector_count -= len;
+ offset += to_bytes(len);
+ } while (remaining -= len);
- ci->sector += len;
- ci->sector_count -= len;
ci->idx++;
}
}
@@ -688,14 +744,14 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
/*-----------------------------------------------------------------
* An IDR is used to keep track of allocated minor numbers.
*---------------------------------------------------------------*/
-static DECLARE_MUTEX(_minor_lock);
+static DEFINE_MUTEX(_minor_lock);
static DEFINE_IDR(_minor_idr);
static void free_minor(unsigned int minor)
{
- down(&_minor_lock);
+ mutex_lock(&_minor_lock);
idr_remove(&_minor_idr, minor);
- up(&_minor_lock);
+ mutex_unlock(&_minor_lock);
}
/*
@@ -708,7 +764,7 @@ static int specific_minor(struct mapped_device *md, unsigned int minor)
if (minor >= (1 << MINORBITS))
return -EINVAL;
- down(&_minor_lock);
+ mutex_lock(&_minor_lock);
if (idr_find(&_minor_idr, minor)) {
r = -EBUSY;
@@ -733,7 +789,7 @@ static int specific_minor(struct mapped_device *md, unsigned int minor)
}
out:
- up(&_minor_lock);
+ mutex_unlock(&_minor_lock);
return r;
}
@@ -742,7 +798,7 @@ static int next_free_minor(struct mapped_device *md, unsigned int *minor)
int r;
unsigned int m;
- down(&_minor_lock);
+ mutex_lock(&_minor_lock);
r = idr_pre_get(&_minor_idr, GFP_KERNEL);
if (!r) {
@@ -764,7 +820,7 @@ static int next_free_minor(struct mapped_device *md, unsigned int *minor)
*minor = m;
out:
- up(&_minor_lock);
+ mutex_unlock(&_minor_lock);
return r;
}
@@ -807,13 +863,11 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
md->queue->unplug_fn = dm_unplug_all;
md->queue->issue_flush_fn = dm_flush_all;
- md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
- mempool_free_slab, _io_cache);
+ md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
if (!md->io_pool)
goto bad2;
- md->tio_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
- mempool_free_slab, _tio_cache);
+ md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
if (!md->tio_pool)
goto bad3;
@@ -828,6 +882,7 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
md->disk->private_data = md;
sprintf(md->disk->disk_name, "dm-%d", minor);
add_disk(md->disk);
+ format_dev_t(md->name, MKDEV(_major, minor));
atomic_set(&md->pending, 0);
init_waitqueue_head(&md->wait);
@@ -840,7 +895,7 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
bad3:
mempool_destroy(md->io_pool);
bad2:
- blk_put_queue(md->queue);
+ blk_cleanup_queue(md->queue);
free_minor(minor);
bad1:
kfree(md);
@@ -860,7 +915,7 @@ static void free_dev(struct mapped_device *md)
del_gendisk(md->disk);
free_minor(minor);
put_disk(md->disk);
- blk_put_queue(md->queue);
+ blk_cleanup_queue(md->queue);
kfree(md);
}
@@ -890,6 +945,13 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
sector_t size;
size = dm_table_get_size(t);
+
+ /*
+ * Wipe any geometry if the size of the table changed.
+ */
+ if (size != get_capacity(md->disk))
+ memset(&md->geometry, 0, sizeof(md->geometry));
+
__set_size(md, size);
if (size == 0)
return 0;
@@ -953,13 +1015,13 @@ static struct mapped_device *dm_find_md(dev_t dev)
if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
return NULL;
- down(&_minor_lock);
+ mutex_lock(&_minor_lock);
md = idr_find(&_minor_idr, minor);
if (!md || (dm_disk(md)->first_minor != minor))
md = NULL;
- up(&_minor_lock);
+ mutex_unlock(&_minor_lock);
return md;
}
@@ -974,15 +1036,9 @@ struct mapped_device *dm_get_md(dev_t dev)
return md;
}
-void *dm_get_mdptr(dev_t dev)
+void *dm_get_mdptr(struct mapped_device *md)
{
- struct mapped_device *md;
- void *mdptr = NULL;
-
- md = dm_find_md(dev);
- if (md)
- mdptr = md->interface_ptr;
- return mdptr;
+ return md->interface_ptr;
}
void dm_set_mdptr(struct mapped_device *md, void *ptr)
@@ -997,18 +1053,18 @@ void dm_get(struct mapped_device *md)
void dm_put(struct mapped_device *md)
{
- struct dm_table *map = dm_get_table(md);
+ struct dm_table *map;
if (atomic_dec_and_test(&md->holders)) {
+ map = dm_get_table(md);
if (!dm_suspended(md)) {
dm_table_presuspend_targets(map);
dm_table_postsuspend_targets(map);
}
__unbind(md);
+ dm_table_put(map);
free_dev(md);
}
-
- dm_table_put(map);
}
/*
@@ -1093,6 +1149,7 @@ int dm_suspend(struct mapped_device *md, int do_lockfs)
{
struct dm_table *map = NULL;
DECLARE_WAITQUEUE(wait, current);
+ struct bio *def;
int r = -EINVAL;
down(&md->suspend_lock);
@@ -1152,9 +1209,11 @@ int dm_suspend(struct mapped_device *md, int do_lockfs)
/* were we interrupted ? */
r = -EINTR;
if (atomic_read(&md->pending)) {
+ clear_bit(DMF_BLOCK_IO, &md->flags);
+ def = bio_list_get(&md->deferred);
+ __flush_deferred_io(md, def);
up_write(&md->io_lock);
unlock_fs(md);
- clear_bit(DMF_BLOCK_IO, &md->flags);
goto out;
}
up_write(&md->io_lock);
@@ -1248,6 +1307,7 @@ int dm_suspended(struct mapped_device *md)
static struct block_device_operations dm_blk_dops = {
.open = dm_blk_open,
.release = dm_blk_close,
+ .getgeo = dm_blk_getgeo,
.owner = THIS_MODULE
};
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 4eaf075da217..fd90bc8f9e45 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -14,6 +14,7 @@
#include <linux/device-mapper.h>
#include <linux/list.h>
#include <linux/blkdev.h>
+#include <linux/hdreg.h>
#define DM_NAME "device-mapper"
#define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
@@ -23,16 +24,6 @@
#define DMEMIT(x...) sz += ((sz >= maxlen) ? \
0 : scnprintf(result + sz, maxlen - sz, x))
-/*
- * FIXME: I think this should be with the definition of sector_t
- * in types.h.
- */
-#ifdef CONFIG_LBD
-#define SECTOR_FORMAT "%llu"
-#else
-#define SECTOR_FORMAT "%lu"
-#endif
-
#define SECTOR_SHIFT 9
/*
@@ -57,7 +48,7 @@ struct mapped_device;
int dm_create(struct mapped_device **md);
int dm_create_with_minor(unsigned int minor, struct mapped_device **md);
void dm_set_mdptr(struct mapped_device *md, void *ptr);
-void *dm_get_mdptr(dev_t dev);
+void *dm_get_mdptr(struct mapped_device *md);
struct mapped_device *dm_get_md(dev_t dev);
/*
@@ -95,11 +86,18 @@ int dm_wait_event(struct mapped_device *md, int event_nr);
struct gendisk *dm_disk(struct mapped_device *md);
int dm_suspended(struct mapped_device *md);
+/*
+ * Geometry functions.
+ */
+int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo);
+int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo);
+
/*-----------------------------------------------------------------
* Functions for manipulating a table. Tables are also reference
* counted.
*---------------------------------------------------------------*/
-int dm_table_create(struct dm_table **result, int mode, unsigned num_targets);
+int dm_table_create(struct dm_table **result, int mode,
+ unsigned num_targets, struct mapped_device *md);
void dm_table_get(struct dm_table *t);
void dm_table_put(struct dm_table *t);
@@ -117,6 +115,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q);
unsigned int dm_table_get_num_targets(struct dm_table *t);
struct list_head *dm_table_get_devices(struct dm_table *t);
int dm_table_get_mode(struct dm_table *t);
+struct mapped_device *dm_table_get_md(struct dm_table *t);
void dm_table_presuspend_targets(struct dm_table *t);
void dm_table_postsuspend_targets(struct dm_table *t);
void dm_table_resume_targets(struct dm_table *t);
diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c
index 8b3515f394a6..72480a48d88b 100644
--- a/drivers/md/kcopyd.c
+++ b/drivers/md/kcopyd.c
@@ -22,6 +22,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
+#include <linux/mutex.h>
#include "kcopyd.h"
@@ -44,6 +45,9 @@ struct kcopyd_client {
struct page_list *pages;
unsigned int nr_pages;
unsigned int nr_free_pages;
+
+ wait_queue_head_t destroyq;
+ atomic_t nr_jobs;
};
static struct page_list *alloc_pl(void)
@@ -227,8 +231,7 @@ static int jobs_init(void)
if (!_job_cache)
return -ENOMEM;
- _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
- mempool_free_slab, _job_cache);
+ _job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
if (!_job_pool) {
kmem_cache_destroy(_job_cache);
return -ENOMEM;
@@ -293,10 +296,15 @@ static int run_complete_job(struct kcopyd_job *job)
int read_err = job->read_err;
unsigned int write_err = job->write_err;
kcopyd_notify_fn fn = job->fn;
+ struct kcopyd_client *kc = job->kc;
- kcopyd_put_pages(job->kc, job->pages);
+ kcopyd_put_pages(kc, job->pages);
mempool_free(job, _job_pool);
fn(read_err, write_err, context);
+
+ if (atomic_dec_and_test(&kc->nr_jobs))
+ wake_up(&kc->destroyq);
+
return 0;
}
@@ -431,6 +439,7 @@ static void do_work(void *ignored)
*/
static void dispatch_job(struct kcopyd_job *job)
{
+ atomic_inc(&job->kc->nr_jobs);
push(&_pages_jobs, job);
wake();
}
@@ -573,68 +582,68 @@ int kcopyd_cancel(struct kcopyd_job *job, int block)
/*-----------------------------------------------------------------
* Unit setup
*---------------------------------------------------------------*/
-static DECLARE_MUTEX(_client_lock);
+static DEFINE_MUTEX(_client_lock);
static LIST_HEAD(_clients);
static void client_add(struct kcopyd_client *kc)
{
- down(&_client_lock);
+ mutex_lock(&_client_lock);
list_add(&kc->list, &_clients);
- up(&_client_lock);
+ mutex_unlock(&_client_lock);
}
static void client_del(struct kcopyd_client *kc)
{
- down(&_client_lock);
+ mutex_lock(&_client_lock);
list_del(&kc->list);
- up(&_client_lock);
+ mutex_unlock(&_client_lock);
}
-static DECLARE_MUTEX(kcopyd_init_lock);
+static DEFINE_MUTEX(kcopyd_init_lock);
static int kcopyd_clients = 0;
static int kcopyd_init(void)
{
int r;
- down(&kcopyd_init_lock);
+ mutex_lock(&kcopyd_init_lock);
if (kcopyd_clients) {
/* Already initialized. */
kcopyd_clients++;
- up(&kcopyd_init_lock);
+ mutex_unlock(&kcopyd_init_lock);
return 0;
}
r = jobs_init();
if (r) {
- up(&kcopyd_init_lock);
+ mutex_unlock(&kcopyd_init_lock);
return r;
}
_kcopyd_wq = create_singlethread_workqueue("kcopyd");
if (!_kcopyd_wq) {
jobs_exit();
- up(&kcopyd_init_lock);
+ mutex_unlock(&kcopyd_init_lock);
return -ENOMEM;
}
kcopyd_clients++;
INIT_WORK(&_kcopyd_work, do_work, NULL);
- up(&kcopyd_init_lock);
+ mutex_unlock(&kcopyd_init_lock);
return 0;
}
static void kcopyd_exit(void)
{
- down(&kcopyd_init_lock);
+ mutex_lock(&kcopyd_init_lock);
kcopyd_clients--;
if (!kcopyd_clients) {
jobs_exit();
destroy_workqueue(_kcopyd_wq);
_kcopyd_wq = NULL;
}
- up(&kcopyd_init_lock);
+ mutex_unlock(&kcopyd_init_lock);
}
int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result)
@@ -670,6 +679,9 @@ int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result)
return r;
}
+ init_waitqueue_head(&kc->destroyq);
+ atomic_set(&kc->nr_jobs, 0);
+
client_add(kc);
*result = kc;
return 0;
@@ -677,6 +689,9 @@ int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result)
void kcopyd_client_destroy(struct kcopyd_client *kc)
{
+ /* Wait for completion of all jobs submitted by this client. */
+ wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));
+
dm_io_put(kc->nr_pages);
client_free_pages(kc);
client_del(kc);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d05e3125d298..1ed5152db450 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -43,6 +43,7 @@
#include <linux/buffer_head.h> /* for invalidate_bdev */
#include <linux/suspend.h>
#include <linux/poll.h>
+#include <linux/mutex.h>
#include <linux/init.h>
@@ -158,11 +159,12 @@ static int start_readonly;
*/
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
-static void md_new_event(mddev_t *mddev)
+void md_new_event(mddev_t *mddev)
{
atomic_inc(&md_event_count);
wake_up(&md_event_waiters);
}
+EXPORT_SYMBOL_GPL(md_new_event);
/*
* Enables to iterate over all existing md arrays
@@ -213,10 +215,11 @@ static void mddev_put(mddev_t *mddev)
return;
if (!mddev->raid_disks && list_empty(&mddev->disks)) {
list_del(&mddev->all_mddevs);
- blk_put_queue(mddev->queue);
+ spin_unlock(&all_mddevs_lock);
+ blk_cleanup_queue(mddev->queue);
kobject_unregister(&mddev->kobj);
- }
- spin_unlock(&all_mddevs_lock);
+ } else
+ spin_unlock(&all_mddevs_lock);
}
static mddev_t * mddev_find(dev_t unit)
@@ -250,7 +253,7 @@ static mddev_t * mddev_find(dev_t unit)
else
new->md_minor = MINOR(unit) >> MdpMinorShift;
- init_MUTEX(&new->reconfig_sem);
+ mutex_init(&new->reconfig_mutex);
INIT_LIST_HEAD(&new->disks);
INIT_LIST_HEAD(&new->all_mddevs);
init_timer(&new->safemode_timer);
@@ -263,6 +266,7 @@ static mddev_t * mddev_find(dev_t unit)
kfree(new);
return NULL;
}
+ set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags);
blk_queue_make_request(new->queue, md_fail_request);
@@ -271,22 +275,22 @@ static mddev_t * mddev_find(dev_t unit)
static inline int mddev_lock(mddev_t * mddev)
{
- return down_interruptible(&mddev->reconfig_sem);
+ return mutex_lock_interruptible(&mddev->reconfig_mutex);
}
static inline void mddev_lock_uninterruptible(mddev_t * mddev)
{
- down(&mddev->reconfig_sem);
+ mutex_lock(&mddev->reconfig_mutex);
}
static inline int mddev_trylock(mddev_t * mddev)
{
- return down_trylock(&mddev->reconfig_sem);
+ return mutex_trylock(&mddev->reconfig_mutex);
}
static inline void mddev_unlock(mddev_t * mddev)
{
- up(&mddev->reconfig_sem);
+ mutex_unlock(&mddev->reconfig_mutex);
md_wakeup_thread(mddev->thread);
}
@@ -657,7 +661,8 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
}
if (sb->major_version != 0 ||
- sb->minor_version != 90) {
+ sb->minor_version < 90 ||
+ sb->minor_version > 91) {
printk(KERN_WARNING "Bad version number %d.%d on %s\n",
sb->major_version, sb->minor_version,
b);
@@ -742,6 +747,20 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
mddev->bitmap_offset = 0;
mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
+ if (mddev->minor_version >= 91) {
+ mddev->reshape_position = sb->reshape_position;
+ mddev->delta_disks = sb->delta_disks;
+ mddev->new_level = sb->new_level;
+ mddev->new_layout = sb->new_layout;
+ mddev->new_chunk = sb->new_chunk;
+ } else {
+ mddev->reshape_position = MaxSector;
+ mddev->delta_disks = 0;
+ mddev->new_level = mddev->level;
+ mddev->new_layout = mddev->layout;
+ mddev->new_chunk = mddev->chunk_size;
+ }
+
if (sb->state & (1<<MD_SB_CLEAN))
mddev->recovery_cp = MaxSector;
else {
@@ -761,7 +780,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
mddev->bitmap_file == NULL) {
- if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
+ if (mddev->level != 1 && mddev->level != 4
+ && mddev->level != 5 && mddev->level != 6
&& mddev->level != 10) {
/* FIXME use a better test */
printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
@@ -835,7 +855,6 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
sb->md_magic = MD_SB_MAGIC;
sb->major_version = mddev->major_version;
- sb->minor_version = mddev->minor_version;
sb->patch_version = mddev->patch_version;
sb->gvalid_words = 0; /* ignored */
memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
@@ -854,6 +873,17 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
sb->events_hi = (mddev->events>>32);
sb->events_lo = (u32)mddev->events;
+ if (mddev->reshape_position == MaxSector)
+ sb->minor_version = 90;
+ else {
+ sb->minor_version = 91;
+ sb->reshape_position = mddev->reshape_position;
+ sb->new_level = mddev->new_level;
+ sb->delta_disks = mddev->delta_disks;
+ sb->new_layout = mddev->new_layout;
+ sb->new_chunk = mddev->new_chunk;
+ }
+ mddev->minor_version = sb->minor_version;
if (mddev->in_sync)
{
sb->recovery_cp = mddev->recovery_cp;
@@ -890,10 +920,9 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
d->raid_disk = rdev2->raid_disk;
else
d->raid_disk = rdev2->desc_nr; /* compatibility */
- if (test_bit(Faulty, &rdev2->flags)) {
+ if (test_bit(Faulty, &rdev2->flags))
d->state = (1<<MD_DISK_FAULTY);
- failed++;
- } else if (test_bit(In_sync, &rdev2->flags)) {
+ else if (test_bit(In_sync, &rdev2->flags)) {
d->state = (1<<MD_DISK_ACTIVE);
d->state |= (1<<MD_DISK_SYNC);
active++;
@@ -1099,6 +1128,20 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
}
mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
}
+ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
+ mddev->reshape_position = le64_to_cpu(sb->reshape_position);
+ mddev->delta_disks = le32_to_cpu(sb->delta_disks);
+ mddev->new_level = le32_to_cpu(sb->new_level);
+ mddev->new_layout = le32_to_cpu(sb->new_layout);
+ mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
+ } else {
+ mddev->reshape_position = MaxSector;
+ mddev->delta_disks = 0;
+ mddev->new_level = mddev->level;
+ mddev->new_layout = mddev->layout;
+ mddev->new_chunk = mddev->chunk_size;
+ }
+
} else if (mddev->pers == NULL) {
/* Insist of good event counter while assembling */
__u64 ev1 = le64_to_cpu(sb->events);
@@ -1170,6 +1213,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
}
+ if (mddev->reshape_position != MaxSector) {
+ sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
+ sb->reshape_position = cpu_to_le64(mddev->reshape_position);
+ sb->new_layout = cpu_to_le32(mddev->new_layout);
+ sb->delta_disks = cpu_to_le32(mddev->delta_disks);
+ sb->new_level = cpu_to_le32(mddev->new_level);
+ sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9);
+ }
max_dev = 0;
ITERATE_RDEV(mddev,rdev2,tmp)
@@ -1298,6 +1349,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
else
ko = &rdev->bdev->bd_disk->kobj;
sysfs_create_link(&rdev->kobj, ko, "block");
+ bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk);
return 0;
}
@@ -1308,6 +1360,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
MD_BUG();
return;
}
+ bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
list_del_init(&rdev->same_set);
printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
rdev->mddev = NULL;
@@ -1490,7 +1543,7 @@ static void sync_sbs(mddev_t * mddev)
}
}
-static void md_update_sb(mddev_t * mddev)
+void md_update_sb(mddev_t * mddev)
{
int err;
struct list_head *tmp;
@@ -1567,6 +1620,7 @@ repeat:
wake_up(&mddev->sb_wait);
}
+EXPORT_SYMBOL_GPL(md_update_sb);
/* words written to sysfs files may, or my not, be \n terminated.
* We want to accept with case. For this we use cmd_match.
@@ -2159,7 +2213,9 @@ action_show(mddev_t *mddev, char *page)
char *type = "idle";
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) {
- if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ type = "reshape";
+ else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
type = "resync";
else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
@@ -2190,7 +2246,14 @@ action_store(mddev_t *mddev, const char *page, size_t len)
return -EBUSY;
else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- else {
+ else if (cmd_match(page, "reshape")) {
+ int err;
+ if (mddev->pers->start_reshape == NULL)
+ return -EINVAL;
+ err = mddev->pers->start_reshape(mddev);
+ if (err)
+ return err;
+ } else {
if (cmd_match(page, "check"))
set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
else if (cmd_match(page, "repair"))
@@ -2301,6 +2364,63 @@ sync_completed_show(mddev_t *mddev, char *page)
static struct md_sysfs_entry
md_sync_completed = __ATTR_RO(sync_completed);
+static ssize_t
+suspend_lo_show(mddev_t *mddev, char *page)
+{
+ return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
+}
+
+static ssize_t
+suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
+{
+ char *e;
+ unsigned long long new = simple_strtoull(buf, &e, 10);
+
+ if (mddev->pers->quiesce == NULL)
+ return -EINVAL;
+ if (buf == e || (*e && *e != '\n'))
+ return -EINVAL;
+ if (new >= mddev->suspend_hi ||
+ (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
+ mddev->suspend_lo = new;
+ mddev->pers->quiesce(mddev, 2);
+ return len;
+ } else
+ return -EINVAL;
+}
+static struct md_sysfs_entry md_suspend_lo =
+__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
+
+
+static ssize_t
+suspend_hi_show(mddev_t *mddev, char *page)
+{
+ return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
+}
+
+static ssize_t
+suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
+{
+ char *e;
+ unsigned long long new = simple_strtoull(buf, &e, 10);
+
+ if (mddev->pers->quiesce == NULL)
+ return -EINVAL;
+ if (buf == e || (*e && *e != '\n'))
+ return -EINVAL;
+ if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
+ (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
+ mddev->suspend_hi = new;
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
+ return len;
+ } else
+ return -EINVAL;
+}
+static struct md_sysfs_entry md_suspend_hi =
+__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
+
+
static struct attribute *md_default_attrs[] = {
&md_level.attr,
&md_raid_disks.attr,
@@ -2318,6 +2438,8 @@ static struct attribute *md_redundancy_attrs[] = {
&md_sync_max.attr,
&md_sync_speed.attr,
&md_sync_completed.attr,
+ &md_suspend_lo.attr,
+ &md_suspend_hi.attr,
NULL,
};
static struct attribute_group md_redundancy_group = {
@@ -2377,7 +2499,7 @@ int mdp_major = 0;
static struct kobject *md_probe(dev_t dev, int *part, void *data)
{
- static DECLARE_MUTEX(disks_sem);
+ static DEFINE_MUTEX(disks_mutex);
mddev_t *mddev = mddev_find(dev);
struct gendisk *disk;
int partitioned = (MAJOR(dev) != MD_MAJOR);
@@ -2387,15 +2509,15 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
if (!mddev)
return NULL;
- down(&disks_sem);
+ mutex_lock(&disks_mutex);
if (mddev->gendisk) {
- up(&disks_sem);
+ mutex_unlock(&disks_mutex);
mddev_put(mddev);
return NULL;
}
disk = alloc_disk(1 << shift);
if (!disk) {
- up(&disks_sem);
+ mutex_unlock(&disks_mutex);
mddev_put(mddev);
return NULL;
}
@@ -2413,7 +2535,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
disk->queue = mddev->queue;
add_disk(disk);
mddev->gendisk = disk;
- up(&disks_sem);
+ mutex_unlock(&disks_mutex);
mddev->kobj.parent = &disk->kobj;
mddev->kobj.k_name = NULL;
snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md");
@@ -2536,6 +2658,14 @@ static int do_md_run(mddev_t * mddev)
mddev->level = pers->level;
strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+ if (mddev->reshape_position != MaxSector &&
+ pers->start_reshape == NULL) {
+ /* This personality cannot handle reshaping... */
+ mddev->pers = NULL;
+ module_put(pers->owner);
+ return -EINVAL;
+ }
+
mddev->recovery = 0;
mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
mddev->barriers_work = 1;
@@ -2769,7 +2899,6 @@ static void autorun_array(mddev_t *mddev)
*/
static void autorun_devices(int part)
{
- struct list_head candidates;
struct list_head *tmp;
mdk_rdev_t *rdev0, *rdev;
mddev_t *mddev;
@@ -2778,6 +2907,7 @@ static void autorun_devices(int part)
printk(KERN_INFO "md: autorun ...\n");
while (!list_empty(&pending_raid_disks)) {
dev_t dev;
+ LIST_HEAD(candidates);
rdev0 = list_entry(pending_raid_disks.next,
mdk_rdev_t, same_set);
@@ -3424,11 +3554,18 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
mddev->bitmap_offset = 0;
+ mddev->reshape_position = MaxSector;
+
/*
* Generate a 128 bit UUID
*/
get_random_bytes(mddev->uuid, 16);
+ mddev->new_level = mddev->level;
+ mddev->new_chunk = mddev->chunk_size;
+ mddev->new_layout = mddev->layout;
+ mddev->delta_disks = 0;
+
return 0;
}
@@ -3437,6 +3574,7 @@ static int update_size(mddev_t *mddev, unsigned long size)
mdk_rdev_t * rdev;
int rv;
struct list_head *tmp;
+ int fit = (size == 0);
if (mddev->pers->resize == NULL)
return -EINVAL;
@@ -3454,7 +3592,6 @@ static int update_size(mddev_t *mddev, unsigned long size)
return -EBUSY;
ITERATE_RDEV(mddev,rdev,tmp) {
sector_t avail;
- int fit = (size == 0);
if (rdev->sb_offset > rdev->data_offset)
avail = (rdev->sb_offset*2) - rdev->data_offset;
else
@@ -3484,14 +3621,16 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks)
{
int rv;
/* change the number of raid disks */
- if (mddev->pers->reshape == NULL)
+ if (mddev->pers->check_reshape == NULL)
return -EINVAL;
if (raid_disks <= 0 ||
raid_disks >= mddev->max_disks)
return -EINVAL;
- if (mddev->sync_thread)
+ if (mddev->sync_thread || mddev->reshape_position != MaxSector)
return -EBUSY;
- rv = mddev->pers->reshape(mddev, raid_disks);
+ mddev->delta_disks = raid_disks - mddev->raid_disks;
+
+ rv = mddev->pers->check_reshape(mddev);
return rv;
}
@@ -4038,7 +4177,10 @@ static void status_unused(struct seq_file *seq)
static void status_resync(struct seq_file *seq, mddev_t * mddev)
{
- unsigned long max_blocks, resync, res, dt, db, rt;
+ sector_t max_blocks, resync, res;
+ unsigned long dt, db, rt;
+ int scale;
+ unsigned int per_milli;
resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
@@ -4054,9 +4196,22 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
MD_BUG();
return;
}
- res = (resync/1024)*1000/(max_blocks/1024 + 1);
+ /* Pick 'scale' such that (resync>>scale)*1000 will fit
+ * in a sector_t, and (max_blocks>>scale) will fit in a
+ * u32, as those are the requirements for sector_div.
+ * Thus 'scale' must be at least 10
+ */
+ scale = 10;
+ if (sizeof(sector_t) > sizeof(unsigned long)) {
+ while ( max_blocks/2 > (1ULL<<(scale+32)))
+ scale++;
+ }
+ res = (resync>>scale)*1000;
+ sector_div(res, (u32)((max_blocks>>scale)+1));
+
+ per_milli = res;
{
- int i, x = res/50, y = 20-x;
+ int i, x = per_milli/50, y = 20-x;
seq_printf(seq, "[");
for (i = 0; i < x; i++)
seq_printf(seq, "=");
@@ -4065,10 +4220,14 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
seq_printf(seq, ".");
seq_printf(seq, "] ");
}
- seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)",
+ seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
+ (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
+ "reshape" :
(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
- "resync" : "recovery"),
- res/10, res % 10, resync, max_blocks);
+ "resync" : "recovery")),
+ per_milli/10, per_milli % 10,
+ (unsigned long long) resync,
+ (unsigned long long) max_blocks);
/*
* We do not want to overflow, so the order of operands and
@@ -4082,7 +4241,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
dt = ((jiffies - mddev->resync_mark) / HZ);
if (!dt) dt++;
db = resync - (mddev->resync_mark_cnt/2);
- rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
+ rt = (dt * ((unsigned long)(max_blocks-resync) / (db/100+1)))/100;
seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
@@ -4439,7 +4598,7 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
#define SYNC_MARKS 10
#define SYNC_MARK_STEP (3*HZ)
-static void md_do_sync(mddev_t *mddev)
+void md_do_sync(mddev_t *mddev)
{
mddev_t *mddev2;
unsigned int currspeed = 0,
@@ -4519,7 +4678,9 @@ static void md_do_sync(mddev_t *mddev)
*/
max_sectors = mddev->resync_max_sectors;
mddev->resync_mismatches = 0;
- } else
+ } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ max_sectors = mddev->size << 1;
+ else
/* recovery follows the physical size of devices */
max_sectors = mddev->size << 1;
@@ -4655,6 +4816,8 @@ static void md_do_sync(mddev_t *mddev)
mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
+ test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
mddev->curr_resync > 2 &&
mddev->curr_resync >= mddev->recovery_cp) {
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
@@ -4672,6 +4835,7 @@ static void md_do_sync(mddev_t *mddev)
set_bit(MD_RECOVERY_DONE, &mddev->recovery);
md_wakeup_thread(mddev->thread);
}
+EXPORT_SYMBOL_GPL(md_do_sync);
/*
@@ -4727,7 +4891,7 @@ void md_check_recovery(mddev_t *mddev)
))
return;
- if (mddev_trylock(mddev)==0) {
+ if (mddev_trylock(mddev)) {
int spares =0;
spin_lock_irq(&mddev->write_lock);
@@ -4863,7 +5027,7 @@ static int md_notify_reboot(struct notifier_block *this,
printk(KERN_INFO "md: stopping all md devices.\n");
ITERATE_MDDEV(mddev,tmp)
- if (mddev_trylock(mddev)==0)
+ if (mddev_trylock(mddev))
do_md_stop (mddev, 1);
/*
* certain more exotic SCSI devices are known to be
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 96f7af4ae400..1cc9de44ce86 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -35,18 +35,6 @@
#define NR_RESERVED_BUFS 32
-static void *mp_pool_alloc(gfp_t gfp_flags, void *data)
-{
- struct multipath_bh *mpb;
- mpb = kzalloc(sizeof(*mpb), gfp_flags);
- return mpb;
-}
-
-static void mp_pool_free(void *mpb, void *data)
-{
- kfree(mpb);
-}
-
static int multipath_map (multipath_conf_t *conf)
{
int i, disks = conf->raid_disks;
@@ -494,9 +482,8 @@ static int multipath_run (mddev_t *mddev)
}
mddev->degraded = conf->raid_disks = conf->working_disks;
- conf->pool = mempool_create(NR_RESERVED_BUFS,
- mp_pool_alloc, mp_pool_free,
- NULL);
+ conf->pool = mempool_create_kzalloc_pool(NR_RESERVED_BUFS,
+ sizeof(struct multipath_bh));
if (conf->pool == NULL) {
printk(KERN_ERR
"multipath: couldn't allocate memory for %s\n",
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 5d88329e3c7a..9b374c91db66 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1135,8 +1135,19 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
mirror = i;
break;
}
- if (!uptodate)
+ if (!uptodate) {
+ int sync_blocks = 0;
+ sector_t s = r1_bio->sector;
+ long sectors_to_go = r1_bio->sectors;
+ /* make sure these bits doesn't get cleared. */
+ do {
+ bitmap_end_sync(mddev->bitmap, r1_bio->sector,
+ &sync_blocks, 1);
+ s += sync_blocks;
+ sectors_to_go -= sync_blocks;
+ } while (sectors_to_go > 0);
md_error(mddev, conf->mirrors[mirror].rdev);
+ }
update_head_pos(mirror, r1_bio);
@@ -1402,6 +1413,9 @@ static void raid1d(mddev_t *mddev)
clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
clear_bit(R1BIO_Barrier, &r1_bio->state);
for (i=0; i < conf->raid_disks; i++)
+ if (r1_bio->bios[i])
+ atomic_inc(&r1_bio->remaining);
+ for (i=0; i < conf->raid_disks; i++)
if (r1_bio->bios[i]) {
struct bio_vec *bvec;
int j;
@@ -1789,6 +1803,11 @@ static int run(mddev_t *mddev)
mdname(mddev), mddev->level);
goto out;
}
+ if (mddev->reshape_position != MaxSector) {
+ printk("raid1: %s: reshape_position set but not supported\n",
+ mdname(mddev));
+ goto out;
+ }
/*
* copy the already verified devices into our private RAID1
* bookkeeping area. [whatever we allocate in run(),
@@ -1971,7 +1990,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
return 0;
}
-static int raid1_reshape(mddev_t *mddev, int raid_disks)
+static int raid1_reshape(mddev_t *mddev)
{
/* We need to:
* 1/ resize the r1bio_pool
@@ -1988,10 +2007,22 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
struct pool_info *newpoolinfo;
mirror_info_t *newmirrors;
conf_t *conf = mddev_to_conf(mddev);
- int cnt;
+ int cnt, raid_disks;
int d, d2;
+ /* Cannot change chunk_size, layout, or level */
+ if (mddev->chunk_size != mddev->new_chunk ||
+ mddev->layout != mddev->new_layout ||
+ mddev->level != mddev->new_level) {
+ mddev->new_chunk = mddev->chunk_size;
+ mddev->new_layout = mddev->layout;
+ mddev->new_level = mddev->level;
+ return -EINVAL;
+ }
+
+ raid_disks = mddev->raid_disks + mddev->delta_disks;
+
if (raid_disks < conf->raid_disks) {
cnt=0;
for (d= 0; d < conf->raid_disks; d++)
@@ -2038,6 +2069,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
mddev->degraded += (raid_disks - conf->raid_disks);
conf->raid_disks = mddev->raid_disks = raid_disks;
+ mddev->delta_disks = 0;
conf->last_used = 0; /* just make sure it is in-range */
lower_barrier(conf);
@@ -2079,7 +2111,7 @@ static struct mdk_personality raid1_personality =
.spare_active = raid1_spare_active,
.sync_request = sync_request,
.resize = raid1_resize,
- .reshape = raid1_reshape,
+ .check_reshape = raid1_reshape,
.quiesce = raid1_quiesce,
};
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2dba305daf3c..dae740adaf65 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -22,6 +22,7 @@
#include <linux/raid/raid5.h>
#include <linux/highmem.h>
#include <linux/bitops.h>
+#include <linux/kthread.h>
#include <asm/atomic.h>
#include <linux/raid/bitmap.h>
@@ -93,11 +94,11 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
md_wakeup_thread(conf->mddev->thread);
}
- list_add_tail(&sh->lru, &conf->inactive_list);
atomic_dec(&conf->active_stripes);
- if (!conf->inactive_blocked ||
- atomic_read(&conf->active_stripes) < (conf->max_nr_stripes*3/4))
+ if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
+ list_add_tail(&sh->lru, &conf->inactive_list);
wake_up(&conf->wait_for_stripe);
+ }
}
}
}
@@ -178,10 +179,10 @@ static int grow_buffers(struct stripe_head *sh, int num)
static void raid5_build_block (struct stripe_head *sh, int i);
-static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx)
+static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks)
{
raid5_conf_t *conf = sh->raid_conf;
- int disks = conf->raid_disks, i;
+ int i;
if (atomic_read(&sh->count) != 0)
BUG();
@@ -198,7 +199,9 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx)
sh->pd_idx = pd_idx;
sh->state = 0;
- for (i=disks; i--; ) {
+ sh->disks = disks;
+
+ for (i = sh->disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (dev->toread || dev->towrite || dev->written ||
@@ -215,7 +218,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx)
insert_hash(conf, sh);
}
-static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector)
+static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks)
{
struct stripe_head *sh;
struct hlist_node *hn;
@@ -223,7 +226,7 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector)
CHECK_DEVLOCK();
PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
- if (sh->sector == sector)
+ if (sh->sector == sector && sh->disks == disks)
return sh;
PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
return NULL;
@@ -232,8 +235,8 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector)
static void unplug_slaves(mddev_t *mddev);
static void raid5_unplug_device(request_queue_t *q);
-static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector,
- int pd_idx, int noblock)
+static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks,
+ int pd_idx, int noblock)
{
struct stripe_head *sh;
@@ -245,7 +248,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
wait_event_lock_irq(conf->wait_for_stripe,
conf->quiesce == 0,
conf->device_lock, /* nothing */);
- sh = __find_stripe(conf, sector);
+ sh = __find_stripe(conf, sector, disks);
if (!sh) {
if (!conf->inactive_blocked)
sh = get_free_stripe(conf);
@@ -259,11 +262,11 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
< (conf->max_nr_stripes *3/4)
|| !conf->inactive_blocked),
conf->device_lock,
- unplug_slaves(conf->mddev);
+ unplug_slaves(conf->mddev)
);
conf->inactive_blocked = 0;
} else
- init_stripe(sh, sector, pd_idx);
+ init_stripe(sh, sector, pd_idx, disks);
} else {
if (atomic_read(&sh->count)) {
if (!list_empty(&sh->lru))
@@ -271,9 +274,8 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
} else {
if (!test_bit(STRIPE_HANDLE, &sh->state))
atomic_inc(&conf->active_stripes);
- if (list_empty(&sh->lru))
- BUG();
- list_del_init(&sh->lru);
+ if (!list_empty(&sh->lru))
+ list_del_init(&sh->lru);
}
}
} while (sh == NULL);
@@ -300,6 +302,7 @@ static int grow_one_stripe(raid5_conf_t *conf)
kmem_cache_free(conf->slab_cache, sh);
return 0;
}
+ sh->disks = conf->raid_disks;
/* we just created an active stripe so... */
atomic_set(&sh->count, 1);
atomic_inc(&conf->active_stripes);
@@ -313,14 +316,16 @@ static int grow_stripes(raid5_conf_t *conf, int num)
kmem_cache_t *sc;
int devs = conf->raid_disks;
- sprintf(conf->cache_name, "raid5/%s", mdname(conf->mddev));
-
- sc = kmem_cache_create(conf->cache_name,
+ sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev));
+ sprintf(conf->cache_name[1], "raid5/%s-alt", mdname(conf->mddev));
+ conf->active_name = 0;
+ sc = kmem_cache_create(conf->cache_name[conf->active_name],
sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
0, 0, NULL, NULL);
if (!sc)
return 1;
conf->slab_cache = sc;
+ conf->pool_size = devs;
while (num--) {
if (!grow_one_stripe(conf))
return 1;
@@ -328,6 +333,129 @@ static int grow_stripes(raid5_conf_t *conf, int num)
return 0;
}
+#ifdef CONFIG_MD_RAID5_RESHAPE
+static int resize_stripes(raid5_conf_t *conf, int newsize)
+{
+ /* Make all the stripes able to hold 'newsize' devices.
+ * New slots in each stripe get 'page' set to a new page.
+ *
+ * This happens in stages:
+ * 1/ create a new kmem_cache and allocate the required number of
+ * stripe_heads.
+ * 2/ gather all the old stripe_heads and tranfer the pages across
+ * to the new stripe_heads. This will have the side effect of
+ * freezing the array as once all stripe_heads have been collected,
+ * no IO will be possible. Old stripe heads are freed once their
+ * pages have been transferred over, and the old kmem_cache is
+ * freed when all stripes are done.
+ * 3/ reallocate conf->disks to be suitable bigger. If this fails,
+ * we simple return a failre status - no need to clean anything up.
+ * 4/ allocate new pages for the new slots in the new stripe_heads.
+ * If this fails, we don't bother trying the shrink the
+ * stripe_heads down again, we just leave them as they are.
+ * As each stripe_head is processed the new one is released into
+ * active service.
+ *
+ * Once step2 is started, we cannot afford to wait for a write,
+ * so we use GFP_NOIO allocations.
+ */
+ struct stripe_head *osh, *nsh;
+ LIST_HEAD(newstripes);
+ struct disk_info *ndisks;
+ int err = 0;
+ kmem_cache_t *sc;
+ int i;
+
+ if (newsize <= conf->pool_size)
+ return 0; /* never bother to shrink */
+
+ /* Step 1 */
+ sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
+ sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
+ 0, 0, NULL, NULL);
+ if (!sc)
+ return -ENOMEM;
+
+ for (i = conf->max_nr_stripes; i; i--) {
+ nsh = kmem_cache_alloc(sc, GFP_KERNEL);
+ if (!nsh)
+ break;
+
+ memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));
+
+ nsh->raid_conf = conf;
+ spin_lock_init(&nsh->lock);
+
+ list_add(&nsh->lru, &newstripes);
+ }
+ if (i) {
+ /* didn't get enough, give up */
+ while (!list_empty(&newstripes)) {
+ nsh = list_entry(newstripes.next, struct stripe_head, lru);
+ list_del(&nsh->lru);
+ kmem_cache_free(sc, nsh);
+ }
+ kmem_cache_destroy(sc);
+ return -ENOMEM;
+ }
+ /* Step 2 - Must use GFP_NOIO now.
+ * OK, we have enough stripes, start collecting inactive
+ * stripes and copying them over
+ */
+ list_for_each_entry(nsh, &newstripes, lru) {
+ spin_lock_irq(&conf->device_lock);
+ wait_event_lock_irq(conf->wait_for_stripe,
+ !list_empty(&conf->inactive_list),
+ conf->device_lock,
+ unplug_slaves(conf->mddev)
+ );
+ osh = get_free_stripe(conf);
+ spin_unlock_irq(&conf->device_lock);
+ atomic_set(&nsh->count, 1);
+ for(i=0; i<conf->pool_size; i++)
+ nsh->dev[i].page = osh->dev[i].page;
+ for( ; i<newsize; i++)
+ nsh->dev[i].page = NULL;
+ kmem_cache_free(conf->slab_cache, osh);
+ }
+ kmem_cache_destroy(conf->slab_cache);
+
+ /* Step 3.
+ * At this point, we are holding all the stripes so the array
+ * is completely stalled, so now is a good time to resize
+ * conf->disks.
+ */
+ ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
+ if (ndisks) {
+ for (i=0; i<conf->raid_disks; i++)
+ ndisks[i] = conf->disks[i];
+ kfree(conf->disks);
+ conf->disks = ndisks;
+ } else
+ err = -ENOMEM;
+
+ /* Step 4, return new stripes to service */
+ while(!list_empty(&newstripes)) {
+ nsh = list_entry(newstripes.next, struct stripe_head, lru);
+ list_del_init(&nsh->lru);
+ for (i=conf->raid_disks; i < newsize; i++)
+ if (nsh->dev[i].page == NULL) {
+ struct page *p = alloc_page(GFP_NOIO);
+ nsh->dev[i].page = p;
+ if (!p)
+ err = -ENOMEM;
+ }
+ release_stripe(nsh);
+ }
+ /* critical section pass, GFP_NOIO no longer needed */
+
+ conf->slab_cache = sc;
+ conf->active_name = 1-conf->active_name;
+ conf->pool_size = newsize;
+ return err;
+}
+#endif
+
static int drop_one_stripe(raid5_conf_t *conf)
{
struct stripe_head *sh;
@@ -339,7 +467,7 @@ static int drop_one_stripe(raid5_conf_t *conf)
return 0;
if (atomic_read(&sh->count))
BUG();
- shrink_buffers(sh, conf->raid_disks);
+ shrink_buffers(sh, conf->pool_size);
kmem_cache_free(conf->slab_cache, sh);
atomic_dec(&conf->active_stripes);
return 1;
@@ -360,7 +488,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
{
struct stripe_head *sh = bi->bi_private;
raid5_conf_t *conf = sh->raid_conf;
- int disks = conf->raid_disks, i;
+ int disks = sh->disks, i;
int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
if (bi->bi_size)
@@ -458,7 +586,7 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
{
struct stripe_head *sh = bi->bi_private;
raid5_conf_t *conf = sh->raid_conf;
- int disks = conf->raid_disks, i;
+ int disks = sh->disks, i;
unsigned long flags;
int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -612,7 +740,7 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
static sector_t compute_blocknr(struct stripe_head *sh, int i)
{
raid5_conf_t *conf = sh->raid_conf;
- int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
+ int raid_disks = sh->disks, data_disks = raid_disks - 1;
sector_t new_sector = sh->sector, check;
int sectors_per_chunk = conf->chunk_size >> 9;
sector_t stripe;
@@ -713,8 +841,7 @@ static void copy_data(int frombio, struct bio *bio,
static void compute_block(struct stripe_head *sh, int dd_idx)
{
- raid5_conf_t *conf = sh->raid_conf;
- int i, count, disks = conf->raid_disks;
+ int i, count, disks = sh->disks;
void *ptr[MAX_XOR_BLOCKS], *p;
PRINTK("compute_block, stripe %llu, idx %d\n",
@@ -744,7 +871,7 @@ static void compute_block(struct stripe_head *sh, int dd_idx)
static void compute_parity(struct stripe_head *sh, int method)
{
raid5_conf_t *conf = sh->raid_conf;
- int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
+ int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
void *ptr[MAX_XOR_BLOCKS];
struct bio *chosen;
@@ -910,6 +1037,20 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
return 0;
}
+static void end_reshape(raid5_conf_t *conf);
+
+static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
+{
+ int sectors_per_chunk = conf->chunk_size >> 9;
+ sector_t x = stripe;
+ int pd_idx, dd_idx;
+ int chunk_offset = sector_div(x, sectors_per_chunk);
+ stripe = x;
+ raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk
+ + chunk_offset, disks, disks-1, &dd_idx, &pd_idx, conf);
+ return pd_idx;
+}
+
/*
* handle_stripe - do things to a stripe.
@@ -932,11 +1073,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
static void handle_stripe(struct stripe_head *sh)
{
raid5_conf_t *conf = sh->raid_conf;
- int disks = conf->raid_disks;
+ int disks = sh->disks;
struct bio *return_bi= NULL;
struct bio *bi;
int i;
- int syncing;
+ int syncing, expanding, expanded;
int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
int non_overwrite = 0;
int failed_num=0;
@@ -951,6 +1092,8 @@ static void handle_stripe(struct stripe_head *sh)
clear_bit(STRIPE_DELAYED, &sh->state);
syncing = test_bit(STRIPE_SYNCING, &sh->state);
+ expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+ expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
/* Now to look around and see what can be done */
rcu_read_lock();
@@ -1143,13 +1286,14 @@ static void handle_stripe(struct stripe_head *sh)
* parity, or to satisfy requests
* or to load a block that is being partially written.
*/
- if (to_read || non_overwrite || (syncing && (uptodate < disks))) {
+ if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) {
for (i=disks; i--;) {
dev = &sh->dev[i];
if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
(dev->toread ||
(dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
syncing ||
+ expanding ||
(failed && (sh->dev[failed_num].toread ||
(sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags))))
)
@@ -1339,13 +1483,77 @@ static void handle_stripe(struct stripe_head *sh)
set_bit(R5_Wantwrite, &dev->flags);
set_bit(R5_ReWrite, &dev->flags);
set_bit(R5_LOCKED, &dev->flags);
+ locked++;
} else {
/* let's read it back */
set_bit(R5_Wantread, &dev->flags);
set_bit(R5_LOCKED, &dev->flags);
+ locked++;
}
}
+ if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+ /* Need to write out all blocks after computing parity */
+ sh->disks = conf->raid_disks;
+ sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks);
+ compute_parity(sh, RECONSTRUCT_WRITE);
+ for (i= conf->raid_disks; i--;) {
+ set_bit(R5_LOCKED, &sh->dev[i].flags);
+ locked++;
+ set_bit(R5_Wantwrite, &sh->dev[i].flags);
+ }
+ clear_bit(STRIPE_EXPANDING, &sh->state);
+ } else if (expanded) {
+ clear_bit(STRIPE_EXPAND_READY, &sh->state);
+ atomic_dec(&conf->reshape_stripes);
+ wake_up(&conf->wait_for_overlap);
+ md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
+ }
+
+ if (expanding && locked == 0) {
+ /* We have read all the blocks in this stripe and now we need to
+ * copy some of them into a target stripe for expand.
+ */
+ clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+ for (i=0; i< sh->disks; i++)
+ if (i != sh->pd_idx) {
+ int dd_idx, pd_idx, j;
+ struct stripe_head *sh2;
+
+ sector_t bn = compute_blocknr(sh, i);
+ sector_t s = raid5_compute_sector(bn, conf->raid_disks,
+ conf->raid_disks-1,
+ &dd_idx, &pd_idx, conf);
+ sh2 = get_active_stripe(conf, s, conf->raid_disks, pd_idx, 1);
+ if (sh2 == NULL)
+ /* so far only the early blocks of this stripe
+ * have been requested. When later blocks
+ * get requested, we will try again
+ */
+ continue;
+ if(!test_bit(STRIPE_EXPANDING, &sh2->state) ||
+ test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
+ /* must have already done this block */
+ release_stripe(sh2);
+ continue;
+ }
+ memcpy(page_address(sh2->dev[dd_idx].page),
+ page_address(sh->dev[i].page),
+ STRIPE_SIZE);
+ set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
+ set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
+ for (j=0; j<conf->raid_disks; j++)
+ if (j != sh2->pd_idx &&
+ !test_bit(R5_Expanded, &sh2->dev[j].flags))
+ break;
+ if (j == conf->raid_disks) {
+ set_bit(STRIPE_EXPAND_READY, &sh2->state);
+ set_bit(STRIPE_HANDLE, &sh2->state);
+ }
+ release_stripe(sh2);
+ }
+ }
+
spin_unlock(&sh->lock);
while ((bi=return_bi)) {
@@ -1384,7 +1592,7 @@ static void handle_stripe(struct stripe_head *sh)
rcu_read_unlock();
if (rdev) {
- if (syncing)
+ if (syncing || expanding || expanded)
md_sync_acct(rdev->bdev, STRIPE_SECTORS);
bi->bi_bdev = rdev->bdev;
@@ -1526,17 +1734,16 @@ static inline void raid5_plug_device(raid5_conf_t *conf)
spin_unlock_irq(&conf->device_lock);
}
-static int make_request (request_queue_t *q, struct bio * bi)
+static int make_request(request_queue_t *q, struct bio * bi)
{
mddev_t *mddev = q->queuedata;
raid5_conf_t *conf = mddev_to_conf(mddev);
- const unsigned int raid_disks = conf->raid_disks;
- const unsigned int data_disks = raid_disks - 1;
unsigned int dd_idx, pd_idx;
sector_t new_sector;
sector_t logical_sector, last_sector;
struct stripe_head *sh;
const int rw = bio_data_dir(bi);
+ int remaining;
if (unlikely(bio_barrier(bi))) {
bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
@@ -1555,20 +1762,77 @@ static int make_request (request_queue_t *q, struct bio * bi)
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
DEFINE_WAIT(w);
-
- new_sector = raid5_compute_sector(logical_sector,
- raid_disks, data_disks, &dd_idx, &pd_idx, conf);
+ int disks;
+ retry:
+ prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
+ if (likely(conf->expand_progress == MaxSector))
+ disks = conf->raid_disks;
+ else {
+ /* spinlock is needed as expand_progress may be
+ * 64bit on a 32bit platform, and so it might be
+ * possible to see a half-updated value
+ * Ofcourse expand_progress could change after
+ * the lock is dropped, so once we get a reference
+ * to the stripe that we think it is, we will have
+ * to check again.
+ */
+ spin_lock_irq(&conf->device_lock);
+ disks = conf->raid_disks;
+ if (logical_sector >= conf->expand_progress)
+ disks = conf->previous_raid_disks;
+ else {
+ if (logical_sector >= conf->expand_lo) {
+ spin_unlock_irq(&conf->device_lock);
+ schedule();
+ goto retry;
+ }
+ }
+ spin_unlock_irq(&conf->device_lock);
+ }
+ new_sector = raid5_compute_sector(logical_sector, disks, disks - 1,
+ &dd_idx, &pd_idx, conf);
PRINTK("raid5: make_request, sector %llu logical %llu\n",
(unsigned long long)new_sector,
(unsigned long long)logical_sector);
- retry:
- prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
- sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
+ sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK));
if (sh) {
- if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
- /* Add failed due to overlap. Flush everything
+ if (unlikely(conf->expand_progress != MaxSector)) {
+ /* expansion might have moved on while waiting for a
+ * stripe, so we must do the range check again.
+ * Expansion could still move past after this
+ * test, but as we are holding a reference to
+ * 'sh', we know that if that happens,
+ * STRIPE_EXPANDING will get set and the expansion
+ * won't proceed until we finish with the stripe.
+ */
+ int must_retry = 0;
+ spin_lock_irq(&conf->device_lock);
+ if (logical_sector < conf->expand_progress &&
+ disks == conf->previous_raid_disks)
+ /* mismatch, need to try again */
+ must_retry = 1;
+ spin_unlock_irq(&conf->device_lock);
+ if (must_retry) {
+ release_stripe(sh);
+ goto retry;
+ }
+ }
+ /* FIXME what if we get a false positive because these
+ * are being updated.
+ */
+ if (logical_sector >= mddev->suspend_lo &&
+ logical_sector < mddev->suspend_hi) {
+ release_stripe(sh);
+ schedule();
+ goto retry;
+ }
+
+ if (test_bit(STRIPE_EXPANDING, &sh->state) ||
+ !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
+ /* Stripe is busy expanding or
+ * add failed due to overlap. Flush everything
* and wait a while
*/
raid5_unplug_device(mddev->queue);
@@ -1580,7 +1844,6 @@ static int make_request (request_queue_t *q, struct bio * bi)
raid5_plug_device(conf);
handle_stripe(sh);
release_stripe(sh);
-
} else {
/* cannot get stripe for read-ahead, just give-up */
clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1590,7 +1853,9 @@ static int make_request (request_queue_t *q, struct bio * bi)
}
spin_lock_irq(&conf->device_lock);
- if (--bi->bi_phys_segments == 0) {
+ remaining = --bi->bi_phys_segments;
+ spin_unlock_irq(&conf->device_lock);
+ if (remaining == 0) {
int bytes = bi->bi_size;
if ( bio_data_dir(bi) == WRITE )
@@ -1598,7 +1863,6 @@ static int make_request (request_queue_t *q, struct bio * bi)
bi->bi_size = 0;
bi->bi_end_io(bi, bytes, 0);
}
- spin_unlock_irq(&conf->device_lock);
return 0;
}
@@ -1607,12 +1871,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
{
raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
struct stripe_head *sh;
- int sectors_per_chunk = conf->chunk_size >> 9;
- sector_t x;
- unsigned long stripe;
- int chunk_offset;
- int dd_idx, pd_idx;
- sector_t first_sector;
+ int pd_idx;
+ sector_t first_sector, last_sector;
int raid_disks = conf->raid_disks;
int data_disks = raid_disks-1;
sector_t max_sector = mddev->size << 1;
@@ -1621,6 +1881,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
if (sector_nr >= max_sector) {
/* just being told to finish up .. nothing much to do */
unplug_slaves(mddev);
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
+ end_reshape(conf);
+ return 0;
+ }
if (mddev->curr_resync < max_sector) /* aborted */
bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
@@ -1631,6 +1895,123 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
return 0;
}
+
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
+ /* reshaping is quite different to recovery/resync so it is
+ * handled quite separately ... here.
+ *
+ * On each call to sync_request, we gather one chunk worth of
+ * destination stripes and flag them as expanding.
+ * Then we find all the source stripes and request reads.
+ * As the reads complete, handle_stripe will copy the data
+ * into the destination stripe and release that stripe.
+ */
+ int i;
+ int dd_idx;
+ sector_t writepos, safepos, gap;
+
+ if (sector_nr == 0 &&
+ conf->expand_progress != 0) {
+ /* restarting in the middle, skip the initial sectors */
+ sector_nr = conf->expand_progress;
+ sector_div(sector_nr, conf->raid_disks-1);
+ *skipped = 1;
+ return sector_nr;
+ }
+
+ /* we update the metadata when there is more than 3Meg
+ * in the block range (that is rather arbitrary, should
+ * probably be time based) or when the data about to be
+ * copied would over-write the source of the data at
+ * the front of the range.
+ * i.e. one new_stripe forward from expand_progress new_maps
+ * to after where expand_lo old_maps to
+ */
+ writepos = conf->expand_progress +
+ conf->chunk_size/512*(conf->raid_disks-1);
+ sector_div(writepos, conf->raid_disks-1);
+ safepos = conf->expand_lo;
+ sector_div(safepos, conf->previous_raid_disks-1);
+ gap = conf->expand_progress - conf->expand_lo;
+
+ if (writepos >= safepos ||
+ gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) {
+ /* Cannot proceed until we've updated the superblock... */
+ wait_event(conf->wait_for_overlap,
+ atomic_read(&conf->reshape_stripes)==0);
+ mddev->reshape_position = conf->expand_progress;
+ mddev->sb_dirty = 1;
+ md_wakeup_thread(mddev->thread);
+ wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
+ kthread_should_stop());
+ spin_lock_irq(&conf->device_lock);
+ conf->expand_lo = mddev->reshape_position;
+ spin_unlock_irq(&conf->device_lock);
+ wake_up(&conf->wait_for_overlap);
+ }
+
+ for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
+ int j;
+ int skipped = 0;
+ pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
+ sh = get_active_stripe(conf, sector_nr+i,
+ conf->raid_disks, pd_idx, 0);
+ set_bit(STRIPE_EXPANDING, &sh->state);
+ atomic_inc(&conf->reshape_stripes);
+ /* If any of this stripe is beyond the end of the old
+ * array, then we need to zero those blocks
+ */
+ for (j=sh->disks; j--;) {
+ sector_t s;
+ if (j == sh->pd_idx)
+ continue;
+ s = compute_blocknr(sh, j);
+ if (s < (mddev->array_size<<1)) {
+ skipped = 1;
+ continue;
+ }
+ memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
+ set_bit(R5_Expanded, &sh->dev[j].flags);
+ set_bit(R5_UPTODATE, &sh->dev[j].flags);
+ }
+ if (!skipped) {
+ set_bit(STRIPE_EXPAND_READY, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ }
+ release_stripe(sh);
+ }
+ spin_lock_irq(&conf->device_lock);
+ conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1);
+ spin_unlock_irq(&conf->device_lock);
+ /* Ok, those stripe are ready. We can start scheduling
+ * reads on the source stripes.
+ * The source stripes are determined by mapping the first and last
+ * block on the destination stripes.
+ */
+ raid_disks = conf->previous_raid_disks;
+ data_disks = raid_disks - 1;
+ first_sector =
+ raid5_compute_sector(sector_nr*(conf->raid_disks-1),
+ raid_disks, data_disks,
+ &dd_idx, &pd_idx, conf);
+ last_sector =
+ raid5_compute_sector((sector_nr+conf->chunk_size/512)
+ *(conf->raid_disks-1) -1,
+ raid_disks, data_disks,
+ &dd_idx, &pd_idx, conf);
+ if (last_sector >= (mddev->size<<1))
+ last_sector = (mddev->size<<1)-1;
+ while (first_sector <= last_sector) {
+ pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks);
+ sh = get_active_stripe(conf, first_sector,
+ conf->previous_raid_disks, pd_idx, 0);
+ set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ release_stripe(sh);
+ first_sector += STRIPE_SECTORS;
+ }
+ return conf->chunk_size>>9;
+ }
/* if there is 1 or more failed drives and we are trying
* to resync, then assert that we are finished, because there is
* nothing we can do.
@@ -1649,16 +2030,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
}
- x = sector_nr;
- chunk_offset = sector_div(x, sectors_per_chunk);
- stripe = x;
- BUG_ON(x != stripe);
-
- first_sector = raid5_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk
- + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
- sh = get_active_stripe(conf, sector_nr, pd_idx, 1);
+ pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks);
+ sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1);
if (sh == NULL) {
- sh = get_active_stripe(conf, sector_nr, pd_idx, 0);
+ sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0);
/* make sure we don't swamp the stripe cache if someone else
* is trying to get access
*/
@@ -1822,11 +2197,64 @@ static int run(mddev_t *mddev)
return -EIO;
}
- mddev->private = kzalloc(sizeof (raid5_conf_t)
- + mddev->raid_disks * sizeof(struct disk_info),
- GFP_KERNEL);
+ if (mddev->reshape_position != MaxSector) {
+ /* Check that we can continue the reshape.
+ * Currently only disks can change, it must
+ * increase, and we must be past the point where
+ * a stripe over-writes itself
+ */
+ sector_t here_new, here_old;
+ int old_disks;
+
+ if (mddev->new_level != mddev->level ||
+ mddev->new_layout != mddev->layout ||
+ mddev->new_chunk != mddev->chunk_size) {
+ printk(KERN_ERR "raid5: %s: unsupported reshape required - aborting.\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+ if (mddev->delta_disks <= 0) {
+ printk(KERN_ERR "raid5: %s: unsupported reshape (reduce disks) required - aborting.\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+ old_disks = mddev->raid_disks - mddev->delta_disks;
+ /* reshape_position must be on a new-stripe boundary, and one
+ * further up in new geometry must map after here in old geometry.
+ */
+ here_new = mddev->reshape_position;
+ if (sector_div(here_new, (mddev->chunk_size>>9)*(mddev->raid_disks-1))) {
+ printk(KERN_ERR "raid5: reshape_position not on a stripe boundary\n");
+ return -EINVAL;
+ }
+ /* here_new is the stripe we will write to */
+ here_old = mddev->reshape_position;
+ sector_div(here_old, (mddev->chunk_size>>9)*(old_disks-1));
+ /* here_old is the first stripe that we might need to read from */
+ if (here_new >= here_old) {
+ /* Reading from the same stripe as writing to - bad */
+ printk(KERN_ERR "raid5: reshape_position too early for auto-recovery - aborting.\n");
+ return -EINVAL;
+ }
+ printk(KERN_INFO "raid5: reshape will continue\n");
+ /* OK, we should be able to continue; */
+ }
+
+
+ mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL);
if ((conf = mddev->private) == NULL)
goto abort;
+ if (mddev->reshape_position == MaxSector) {
+ conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks;
+ } else {
+ conf->raid_disks = mddev->raid_disks;
+ conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
+ }
+
+ conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info),
+ GFP_KERNEL);
+ if (!conf->disks)
+ goto abort;
conf->mddev = mddev;
@@ -1847,7 +2275,7 @@ static int run(mddev_t *mddev)
ITERATE_RDEV(mddev,rdev,tmp) {
raid_disk = rdev->raid_disk;
- if (raid_disk >= mddev->raid_disks
+ if (raid_disk >= conf->raid_disks
|| raid_disk < 0)
continue;
disk = conf->disks + raid_disk;
@@ -1863,7 +2291,6 @@ static int run(mddev_t *mddev)
}
}
- conf->raid_disks = mddev->raid_disks;
/*
* 0 for a fully functional array, 1 for a degraded array.
*/
@@ -1873,6 +2300,7 @@ static int run(mddev_t *mddev)
conf->level = mddev->level;
conf->algorithm = mddev->layout;
conf->max_nr_stripes = NR_STRIPES;
+ conf->expand_progress = mddev->reshape_position;
/* device size must be a multiple of chunk size */
mddev->size &= ~(mddev->chunk_size/1024 -1);
@@ -1945,6 +2373,21 @@ static int run(mddev_t *mddev)
print_raid5_conf(conf);
+ if (conf->expand_progress != MaxSector) {
+ printk("...ok start reshape thread\n");
+ conf->expand_lo = conf->expand_progress;
+ atomic_set(&conf->reshape_stripes, 0);
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ mddev->sync_thread = md_register_thread(md_do_sync, mddev,
+ "%s_reshape");
+ /* FIXME if md_register_thread fails?? */
+ md_wakeup_thread(mddev->sync_thread);
+
+ }
+
/* read-ahead size must cover two whole stripes, which is
* 2 * (n-1) * chunksize where 'n' is the number of raid devices
*/
@@ -1960,12 +2403,13 @@ static int run(mddev_t *mddev)
mddev->queue->unplug_fn = raid5_unplug_device;
mddev->queue->issue_flush_fn = raid5_issue_flush;
+ mddev->array_size = mddev->size * (conf->previous_raid_disks - 1);
- mddev->array_size = mddev->size * (mddev->raid_disks - 1);
return 0;
abort:
if (conf) {
print_raid5_conf(conf);
+ kfree(conf->disks);
kfree(conf->stripe_hashtbl);
kfree(conf);
}
@@ -1986,6 +2430,7 @@ static int stop(mddev_t *mddev)
kfree(conf->stripe_hashtbl);
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
+ kfree(conf->disks);
kfree(conf);
mddev->private = NULL;
return 0;
@@ -2001,7 +2446,7 @@ static void print_sh (struct stripe_head *sh)
printk("sh %llu, count %d.\n",
(unsigned long long)sh->sector, atomic_read(&sh->count));
printk("sh %llu, ", (unsigned long long)sh->sector);
- for (i = 0; i < sh->raid_conf->raid_disks; i++) {
+ for (i = 0; i < sh->disks; i++) {
printk("(cache%d: %p %ld) ",
i, sh->dev[i].page, sh->dev[i].flags);
}
@@ -2132,7 +2577,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
/*
* find the disk ...
*/
- for (disk=0; disk < mddev->raid_disks; disk++)
+ for (disk=0; disk < conf->raid_disks; disk++)
if ((p=conf->disks + disk)->rdev == NULL) {
clear_bit(In_sync, &rdev->flags);
rdev->raid_disk = disk;
@@ -2168,11 +2613,146 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
return 0;
}
+#ifdef CONFIG_MD_RAID5_RESHAPE
+static int raid5_check_reshape(mddev_t *mddev)
+{
+ raid5_conf_t *conf = mddev_to_conf(mddev);
+ int err;
+
+ if (mddev->delta_disks < 0 ||
+ mddev->new_level != mddev->level)
+ return -EINVAL; /* Cannot shrink array or change level yet */
+ if (mddev->delta_disks == 0)
+ return 0; /* nothing to do */
+
+ /* Can only proceed if there are plenty of stripe_heads.
+ * We need a minimum of one full stripe,, and for sensible progress
+ * it is best to have about 4 times that.
+ * If we require 4 times, then the default 256 4K stripe_heads will
+ * allow for chunk sizes up to 256K, which is probably OK.
+ * If the chunk size is greater, user-space should request more
+ * stripe_heads first.
+ */
+ if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes ||
+ (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
+ printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n",
+ (mddev->chunk_size / STRIPE_SIZE)*4);
+ return -ENOSPC;
+ }
+
+ err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
+ if (err)
+ return err;
+
+ /* looks like we might be able to manage this */
+ return 0;
+}
+
+static int raid5_start_reshape(mddev_t *mddev)
+{
+ raid5_conf_t *conf = mddev_to_conf(mddev);
+ mdk_rdev_t *rdev;
+ struct list_head *rtmp;
+ int spares = 0;
+ int added_devices = 0;
+
+ if (mddev->degraded ||
+ test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ return -EBUSY;
+
+ ITERATE_RDEV(mddev, rdev, rtmp)
+ if (rdev->raid_disk < 0 &&
+ !test_bit(Faulty, &rdev->flags))
+ spares++;
+
+ if (spares < mddev->delta_disks-1)
+ /* Not enough devices even to make a degraded array
+ * of that size
+ */
+ return -EINVAL;
+
+ atomic_set(&conf->reshape_stripes, 0);
+ spin_lock_irq(&conf->device_lock);
+ conf->previous_raid_disks = conf->raid_disks;
+ conf->raid_disks += mddev->delta_disks;
+ conf->expand_progress = 0;
+ conf->expand_lo = 0;
+ spin_unlock_irq(&conf->device_lock);
+
+ /* Add some new drives, as many as will fit.
+ * We know there are enough to make the newly sized array work.
+ */
+ ITERATE_RDEV(mddev, rdev, rtmp)
+ if (rdev->raid_disk < 0 &&
+ !test_bit(Faulty, &rdev->flags)) {
+ if (raid5_add_disk(mddev, rdev)) {
+ char nm[20];
+ set_bit(In_sync, &rdev->flags);
+ conf->working_disks++;
+ added_devices++;
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+ } else
+ break;
+ }
+
+ mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices;
+ mddev->raid_disks = conf->raid_disks;
+ mddev->reshape_position = 0;
+ mddev->sb_dirty = 1;
+
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ mddev->sync_thread = md_register_thread(md_do_sync, mddev,
+ "%s_reshape");
+ if (!mddev->sync_thread) {
+ mddev->recovery = 0;
+ spin_lock_irq(&conf->device_lock);
+ mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
+ conf->expand_progress = MaxSector;
+ spin_unlock_irq(&conf->device_lock);
+ return -EAGAIN;
+ }
+ md_wakeup_thread(mddev->sync_thread);
+ md_new_event(mddev);
+ return 0;
+}
+#endif
+
+static void end_reshape(raid5_conf_t *conf)
+{
+ struct block_device *bdev;
+
+ if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
+ conf->mddev->array_size = conf->mddev->size * (conf->raid_disks-1);
+ set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
+ conf->mddev->changed = 1;
+
+ bdev = bdget_disk(conf->mddev->gendisk, 0);
+ if (bdev) {
+ mutex_lock(&bdev->bd_inode->i_mutex);
+ i_size_write(bdev->bd_inode, conf->mddev->array_size << 10);
+ mutex_unlock(&bdev->bd_inode->i_mutex);
+ bdput(bdev);
+ }
+ spin_lock_irq(&conf->device_lock);
+ conf->expand_progress = MaxSector;
+ spin_unlock_irq(&conf->device_lock);
+ conf->mddev->reshape_position = MaxSector;
+ }
+}
+
static void raid5_quiesce(mddev_t *mddev, int state)
{
raid5_conf_t *conf = mddev_to_conf(mddev);
switch(state) {
+ case 2: /* resume for a suspend */
+ wake_up(&conf->wait_for_overlap);
+ break;
+
case 1: /* stop all writes */
spin_lock_irq(&conf->device_lock);
conf->quiesce = 1;
@@ -2186,6 +2766,7 @@ static void raid5_quiesce(mddev_t *mddev, int state)
spin_lock_irq(&conf->device_lock);
conf->quiesce = 0;
wake_up(&conf->wait_for_stripe);
+ wake_up(&conf->wait_for_overlap);
spin_unlock_irq(&conf->device_lock);
break;
}
@@ -2206,6 +2787,10 @@ static struct mdk_personality raid5_personality =
.spare_active = raid5_spare_active,
.sync_request = sync_request,
.resize = raid5_resize,
+#ifdef CONFIG_MD_RAID5_RESHAPE
+ .check_reshape = raid5_check_reshape,
+ .start_reshape = raid5_start_reshape,
+#endif
.quiesce = raid5_quiesce,
};
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index cd477ebf2ee4..ab64b37e4996 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -331,9 +331,9 @@ static int grow_stripes(raid6_conf_t *conf, int num)
kmem_cache_t *sc;
int devs = conf->raid_disks;
- sprintf(conf->cache_name, "raid6/%s", mdname(conf->mddev));
+ sprintf(conf->cache_name[0], "raid6/%s", mdname(conf->mddev));
- sc = kmem_cache_create(conf->cache_name,
+ sc = kmem_cache_create(conf->cache_name[0],
sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
0, 0, NULL, NULL);
if (!sc)
@@ -2006,11 +2006,14 @@ static int run(mddev_t *mddev)
return -EIO;
}
- mddev->private = kzalloc(sizeof (raid6_conf_t)
- + mddev->raid_disks * sizeof(struct disk_info),
- GFP_KERNEL);
+ mddev->private = kzalloc(sizeof (raid6_conf_t), GFP_KERNEL);
if ((conf = mddev->private) == NULL)
goto abort;
+ conf->disks = kzalloc(mddev->raid_disks * sizeof(struct disk_info),
+ GFP_KERNEL);
+ if (!conf->disks)
+ goto abort;
+
conf->mddev = mddev;
if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
@@ -2148,6 +2151,8 @@ static int run(mddev_t *mddev)
}
/* Ok, everything is just fine now */
+ sysfs_create_group(&mddev->kobj, &raid6_attrs_group);
+
mddev->array_size = mddev->size * (mddev->raid_disks - 2);
mddev->queue->unplug_fn = raid6_unplug_device;
@@ -2158,6 +2163,7 @@ abort:
print_raid6_conf(conf);
safe_put_page(conf->spare_page);
kfree(conf->stripe_hashtbl);
+ kfree(conf->disks);
kfree(conf);
}
mddev->private = NULL;