summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig13
-rw-r--r--drivers/md/bitmap.c49
-rw-r--r--drivers/md/dm-bio-list.h107
-rw-r--r--drivers/md/dm-bio-record.h26
-rw-r--r--drivers/md/dm-crypt.c49
-rw-r--r--drivers/md/dm-delay.c2
-rw-r--r--drivers/md/dm-exception-store.c252
-rw-r--r--drivers/md/dm-exception-store.h58
-rw-r--r--drivers/md/dm-io.c7
-rw-r--r--drivers/md/dm-ioctl.c28
-rw-r--r--drivers/md/dm-kcopyd.c23
-rw-r--r--drivers/md/dm-linear.c1
-rw-r--r--drivers/md/dm-log.c75
-rw-r--r--drivers/md/dm-mpath.c1
-rw-r--r--drivers/md/dm-path-selector.c21
-rw-r--r--drivers/md/dm-raid1.c51
-rw-r--r--drivers/md/dm-region-hash.c1
-rw-r--r--drivers/md/dm-snap-persistent.c153
-rw-r--r--drivers/md/dm-snap-transient.c86
-rw-r--r--drivers/md/dm-snap.c385
-rw-r--r--drivers/md/dm-snap.h105
-rw-r--r--drivers/md/dm-table.c85
-rw-r--r--drivers/md/dm-target.c104
-rw-r--r--drivers/md/dm.c312
-rw-r--r--drivers/md/dm.h3
-rw-r--r--drivers/md/md.c130
-rw-r--r--drivers/md/md.h21
-rw-r--r--drivers/md/raid1.c10
-rw-r--r--drivers/md/raid10.c13
-rw-r--r--drivers/md/raid5.c1302
-rw-r--r--drivers/md/raid5.h8
31 files changed, 2009 insertions, 1472 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 41b3ae25b813..09c0c6e49ab5 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -124,6 +124,8 @@ config MD_RAID456
select MD_RAID6_PQ
select ASYNC_MEMCPY
select ASYNC_XOR
+ select ASYNC_PQ
+ select ASYNC_RAID6_RECOV
---help---
A RAID-5 set of N drives with a capacity of C MB per drive provides
the capacity of C * (N - 1) MB, and protects against a failure
@@ -152,6 +154,17 @@ config MD_RAID456
If unsure, say Y.
+config MULTICORE_RAID456
+ bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)"
+ depends on MD_RAID456
+ depends on SMP
+ depends on EXPERIMENTAL
+ ---help---
+ Enable the raid456 module to dispatch per-stripe raid operations to a
+ thread pool.
+
+ If unsure, say N.
+
config MD_RAID6_PQ
tristate
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index f8a9f7ab2cb8..56df1cee8fb3 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -986,6 +986,9 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
oldindex = index;
oldpage = page;
+ bitmap->filemap[bitmap->file_pages++] = page;
+ bitmap->last_page_size = count;
+
if (outofdate) {
/*
* if bitmap is out of date, dirty the
@@ -998,15 +1001,9 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
write_page(bitmap, page, 1);
ret = -EIO;
- if (bitmap->flags & BITMAP_WRITE_ERROR) {
- /* release, page not in filemap yet */
- put_page(page);
+ if (bitmap->flags & BITMAP_WRITE_ERROR)
goto err;
- }
}
-
- bitmap->filemap[bitmap->file_pages++] = page;
- bitmap->last_page_size = count;
}
paddr = kmap_atomic(page, KM_USER0);
if (bitmap->flags & BITMAP_HOSTENDIAN)
@@ -1016,9 +1013,11 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
kunmap_atomic(paddr, KM_USER0);
if (b) {
/* if the disk bit is set, set the memory bit */
- bitmap_set_memory_bits(bitmap, i << CHUNK_BLOCK_SHIFT(bitmap),
- ((i+1) << (CHUNK_BLOCK_SHIFT(bitmap)) >= start)
- );
+ int needed = ((sector_t)(i+1) << (CHUNK_BLOCK_SHIFT(bitmap))
+ >= start);
+ bitmap_set_memory_bits(bitmap,
+ (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap),
+ needed);
bit_cnt++;
set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
}
@@ -1098,14 +1097,12 @@ void bitmap_daemon_work(struct bitmap *bitmap)
}
bitmap->allclean = 1;
+ spin_lock_irqsave(&bitmap->lock, flags);
for (j = 0; j < bitmap->chunks; j++) {
bitmap_counter_t *bmc;
- spin_lock_irqsave(&bitmap->lock, flags);
- if (!bitmap->filemap) {
+ if (!bitmap->filemap)
/* error or shutdown */
- spin_unlock_irqrestore(&bitmap->lock, flags);
break;
- }
page = filemap_get_page(bitmap, j);
@@ -1122,6 +1119,8 @@ void bitmap_daemon_work(struct bitmap *bitmap)
write_page(bitmap, page, 0);
bitmap->allclean = 0;
}
+ spin_lock_irqsave(&bitmap->lock, flags);
+ j |= (PAGE_BITS - 1);
continue;
}
@@ -1154,8 +1153,9 @@ void bitmap_daemon_work(struct bitmap *bitmap)
spin_lock_irqsave(&bitmap->lock, flags);
clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
}
- bmc = bitmap_get_counter(bitmap, j << CHUNK_BLOCK_SHIFT(bitmap),
- &blocks, 0);
+ bmc = bitmap_get_counter(bitmap,
+ (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
+ &blocks, 0);
if (bmc) {
/*
if (j < 100) printk("bitmap: j=%lu, *bmc = 0x%x\n", j, *bmc);
@@ -1169,7 +1169,8 @@ void bitmap_daemon_work(struct bitmap *bitmap)
} else if (*bmc == 1) {
/* we can clear the bit */
*bmc = 0;
- bitmap_count_page(bitmap, j << CHUNK_BLOCK_SHIFT(bitmap),
+ bitmap_count_page(bitmap,
+ (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
-1);
/* clear the bit */
@@ -1180,9 +1181,10 @@ void bitmap_daemon_work(struct bitmap *bitmap)
ext2_clear_bit(file_page_offset(j), paddr);
kunmap_atomic(paddr, KM_USER0);
}
- }
- spin_unlock_irqrestore(&bitmap->lock, flags);
+ } else
+ j |= PAGE_COUNTER_MASK;
}
+ spin_unlock_irqrestore(&bitmap->lock, flags);
/* now sync the final page */
if (lastpage != NULL) {
@@ -1479,6 +1481,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
s += blocks;
}
bitmap->last_end_sync = jiffies;
+ sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed");
}
static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
@@ -1513,7 +1516,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
unsigned long chunk;
for (chunk = s; chunk <= e; chunk++) {
- sector_t sec = chunk << CHUNK_BLOCK_SHIFT(bitmap);
+ sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap);
bitmap_set_memory_bits(bitmap, sec, 1);
bitmap_file_set_bit(bitmap, sec);
}
@@ -1589,7 +1592,7 @@ void bitmap_destroy(mddev_t *mddev)
int bitmap_create(mddev_t *mddev)
{
struct bitmap *bitmap;
- unsigned long blocks = mddev->resync_max_sectors;
+ sector_t blocks = mddev->resync_max_sectors;
unsigned long chunks;
unsigned long pages;
struct file *file = mddev->bitmap_file;
@@ -1631,8 +1634,8 @@ int bitmap_create(mddev_t *mddev)
bitmap->chunkshift = ffz(~bitmap->chunksize);
/* now that chunksize and chunkshift are set, we can use these macros */
- chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) /
- CHUNK_BLOCK_RATIO(bitmap);
+ chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >>
+ CHUNK_BLOCK_SHIFT(bitmap);
pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO;
BUG_ON(!pages);
diff --git a/drivers/md/dm-bio-list.h b/drivers/md/dm-bio-list.h
deleted file mode 100644
index d4509be0fe67..000000000000
--- a/drivers/md/dm-bio-list.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (C) 2004 Red Hat UK Ltd.
- *
- * This file is released under the GPL.
- */
-
-#ifndef DM_BIO_LIST_H
-#define DM_BIO_LIST_H
-
-#include <linux/bio.h>
-
-#ifdef CONFIG_BLOCK
-
-struct bio_list {
- struct bio *head;
- struct bio *tail;
-};
-
-static inline int bio_list_empty(const struct bio_list *bl)
-{
- return bl->head == NULL;
-}
-
-static inline void bio_list_init(struct bio_list *bl)
-{
- bl->head = bl->tail = NULL;
-}
-
-#define bio_list_for_each(bio, bl) \
- for (bio = (bl)->head; bio; bio = bio->bi_next)
-
-static inline unsigned bio_list_size(const struct bio_list *bl)
-{
- unsigned sz = 0;
- struct bio *bio;
-
- bio_list_for_each(bio, bl)
- sz++;
-
- return sz;
-}
-
-static inline void bio_list_add(struct bio_list *bl, struct bio *bio)
-{
- bio->bi_next = NULL;
-
- if (bl->tail)
- bl->tail->bi_next = bio;
- else
- bl->head = bio;
-
- bl->tail = bio;
-}
-
-static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2)
-{
- if (!bl2->head)
- return;
-
- if (bl->tail)
- bl->tail->bi_next = bl2->head;
- else
- bl->head = bl2->head;
-
- bl->tail = bl2->tail;
-}
-
-static inline void bio_list_merge_head(struct bio_list *bl,
- struct bio_list *bl2)
-{
- if (!bl2->head)
- return;
-
- if (bl->head)
- bl2->tail->bi_next = bl->head;
- else
- bl->tail = bl2->tail;
-
- bl->head = bl2->head;
-}
-
-static inline struct bio *bio_list_pop(struct bio_list *bl)
-{
- struct bio *bio = bl->head;
-
- if (bio) {
- bl->head = bl->head->bi_next;
- if (!bl->head)
- bl->tail = NULL;
-
- bio->bi_next = NULL;
- }
-
- return bio;
-}
-
-static inline struct bio *bio_list_get(struct bio_list *bl)
-{
- struct bio *bio = bl->head;
-
- bl->head = bl->tail = NULL;
-
- return bio;
-}
-
-#endif /* CONFIG_BLOCK */
-#endif
diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h
index d3ec217847d6..3a8cfa2645c7 100644
--- a/drivers/md/dm-bio-record.h
+++ b/drivers/md/dm-bio-record.h
@@ -16,30 +16,56 @@
* functions in this file help the target record and restore the
* original bio state.
*/
+
+struct dm_bio_vec_details {
+#if PAGE_SIZE < 65536
+ __u16 bv_len;
+ __u16 bv_offset;
+#else
+ unsigned bv_len;
+ unsigned bv_offset;
+#endif
+};
+
struct dm_bio_details {
sector_t bi_sector;
struct block_device *bi_bdev;
unsigned int bi_size;
unsigned short bi_idx;
unsigned long bi_flags;
+ struct dm_bio_vec_details bi_io_vec[BIO_MAX_PAGES];
};
static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio)
{
+ unsigned i;
+
bd->bi_sector = bio->bi_sector;
bd->bi_bdev = bio->bi_bdev;
bd->bi_size = bio->bi_size;
bd->bi_idx = bio->bi_idx;
bd->bi_flags = bio->bi_flags;
+
+ for (i = 0; i < bio->bi_vcnt; i++) {
+ bd->bi_io_vec[i].bv_len = bio->bi_io_vec[i].bv_len;
+ bd->bi_io_vec[i].bv_offset = bio->bi_io_vec[i].bv_offset;
+ }
}
static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio)
{
+ unsigned i;
+
bio->bi_sector = bd->bi_sector;
bio->bi_bdev = bd->bi_bdev;
bio->bi_size = bd->bi_size;
bio->bi_idx = bd->bi_idx;
bio->bi_flags = bd->bi_flags;
+
+ for (i = 0; i < bio->bi_vcnt; i++) {
+ bio->bi_io_vec[i].bv_len = bd->bi_io_vec[i].bv_len;
+ bio->bi_io_vec[i].bv_offset = bd->bi_io_vec[i].bv_offset;
+ }
}
#endif
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 35bda49796fb..53394e863c74 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -60,6 +60,7 @@ struct dm_crypt_io {
};
struct dm_crypt_request {
+ struct convert_context *ctx;
struct scatterlist sg_in;
struct scatterlist sg_out;
};
@@ -335,6 +336,18 @@ static void crypt_convert_init(struct crypt_config *cc,
init_completion(&ctx->restart);
}
+static struct dm_crypt_request *dmreq_of_req(struct crypt_config *cc,
+ struct ablkcipher_request *req)
+{
+ return (struct dm_crypt_request *)((char *)req + cc->dmreq_start);
+}
+
+static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc,
+ struct dm_crypt_request *dmreq)
+{
+ return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start);
+}
+
static int crypt_convert_block(struct crypt_config *cc,
struct convert_context *ctx,
struct ablkcipher_request *req)
@@ -345,10 +358,11 @@ static int crypt_convert_block(struct crypt_config *cc,
u8 *iv;
int r = 0;
- dmreq = (struct dm_crypt_request *)((char *)req + cc->dmreq_start);
+ dmreq = dmreq_of_req(cc, req);
iv = (u8 *)ALIGN((unsigned long)(dmreq + 1),
crypto_ablkcipher_alignmask(cc->tfm) + 1);
+ dmreq->ctx = ctx;
sg_init_table(&dmreq->sg_in, 1);
sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT,
bv_in->bv_offset + ctx->offset_in);
@@ -395,8 +409,9 @@ static void crypt_alloc_req(struct crypt_config *cc,
cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
ablkcipher_request_set_tfm(cc->req, cc->tfm);
ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG |
- CRYPTO_TFM_REQ_MAY_SLEEP,
- kcryptd_async_done, ctx);
+ CRYPTO_TFM_REQ_MAY_SLEEP,
+ kcryptd_async_done,
+ dmreq_of_req(cc, cc->req));
}
/*
@@ -553,19 +568,22 @@ static void crypt_inc_pending(struct dm_crypt_io *io)
static void crypt_dec_pending(struct dm_crypt_io *io)
{
struct crypt_config *cc = io->target->private;
+ struct bio *base_bio = io->base_bio;
+ struct dm_crypt_io *base_io = io->base_io;
+ int error = io->error;
if (!atomic_dec_and_test(&io->pending))
return;
- if (likely(!io->base_io))
- bio_endio(io->base_bio, io->error);
+ mempool_free(io, cc->io_pool);
+
+ if (likely(!base_io))
+ bio_endio(base_bio, error);
else {
- if (io->error && !io->base_io->error)
- io->base_io->error = io->error;
- crypt_dec_pending(io->base_io);
+ if (error && !base_io->error)
+ base_io->error = error;
+ crypt_dec_pending(base_io);
}
-
- mempool_free(io, cc->io_pool);
}
/*
@@ -821,7 +839,8 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
static void kcryptd_async_done(struct crypto_async_request *async_req,
int error)
{
- struct convert_context *ctx = async_req->data;
+ struct dm_crypt_request *dmreq = async_req->data;
+ struct convert_context *ctx = dmreq->ctx;
struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
struct crypt_config *cc = io->target->private;
@@ -830,7 +849,7 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
return;
}
- mempool_free(ablkcipher_request_cast(async_req), cc->req_pool);
+ mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
if (!atomic_dec_and_test(&ctx->pending))
return;
@@ -1137,8 +1156,7 @@ bad_ivmode:
crypto_free_ablkcipher(tfm);
bad_cipher:
/* Must zero key material before freeing */
- memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8));
- kfree(cc);
+ kzfree(cc);
return -EINVAL;
}
@@ -1164,8 +1182,7 @@ static void crypt_dtr(struct dm_target *ti)
dm_put_device(ti, cc->dev);
/* Must zero key material before freeing */
- memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8));
- kfree(cc);
+ kzfree(cc);
}
static int crypt_map(struct dm_target *ti, struct bio *bio,
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 59ee1b015d2d..559dbb52bc85 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -15,8 +15,6 @@
#include <linux/device-mapper.h>
-#include "dm-bio-list.h"
-
#define DM_MSG_PREFIX "delay"
struct delay_c {
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index dccbfb0e010f..a2e26c242141 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -7,6 +7,7 @@
#include "dm-exception-store.h"
+#include <linux/ctype.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
@@ -14,6 +15,257 @@
#define DM_MSG_PREFIX "snapshot exception stores"
+static LIST_HEAD(_exception_store_types);
+static DEFINE_SPINLOCK(_lock);
+
+static struct dm_exception_store_type *__find_exception_store_type(const char *name)
+{
+ struct dm_exception_store_type *type;
+
+ list_for_each_entry(type, &_exception_store_types, list)
+ if (!strcmp(name, type->name))
+ return type;
+
+ return NULL;
+}
+
+static struct dm_exception_store_type *_get_exception_store_type(const char *name)
+{
+ struct dm_exception_store_type *type;
+
+ spin_lock(&_lock);
+
+ type = __find_exception_store_type(name);
+
+ if (type && !try_module_get(type->module))
+ type = NULL;
+
+ spin_unlock(&_lock);
+
+ return type;
+}
+
+/*
+ * get_type
+ * @type_name
+ *
+ * Attempt to retrieve the dm_exception_store_type by name. If not already
+ * available, attempt to load the appropriate module.
+ *
+ * Exstore modules are named "dm-exstore-" followed by the 'type_name'.
+ * Modules may contain multiple types.
+ * This function will first try the module "dm-exstore-<type_name>",
+ * then truncate 'type_name' on the last '-' and try again.
+ *
+ * For example, if type_name was "clustered-shared", it would search
+ * 'dm-exstore-clustered-shared' then 'dm-exstore-clustered'.
+ *
+ * 'dm-exception-store-<type_name>' is too long of a name in my
+ * opinion, which is why I've chosen to have the files
+ * containing exception store implementations be 'dm-exstore-<type_name>'.
+ * If you want your module to be autoloaded, you will follow this
+ * naming convention.
+ *
+ * Returns: dm_exception_store_type* on success, NULL on failure
+ */
+static struct dm_exception_store_type *get_type(const char *type_name)
+{
+ char *p, *type_name_dup;
+ struct dm_exception_store_type *type;
+
+ type = _get_exception_store_type(type_name);
+ if (type)
+ return type;
+
+ type_name_dup = kstrdup(type_name, GFP_KERNEL);
+ if (!type_name_dup) {
+ DMERR("No memory left to attempt load for \"%s\"", type_name);
+ return NULL;
+ }
+
+ while (request_module("dm-exstore-%s", type_name_dup) ||
+ !(type = _get_exception_store_type(type_name))) {
+ p = strrchr(type_name_dup, '-');
+ if (!p)
+ break;
+ p[0] = '\0';
+ }
+
+ if (!type)
+ DMWARN("Module for exstore type \"%s\" not found.", type_name);
+
+ kfree(type_name_dup);
+
+ return type;
+}
+
+static void put_type(struct dm_exception_store_type *type)
+{
+ spin_lock(&_lock);
+ module_put(type->module);
+ spin_unlock(&_lock);
+}
+
+int dm_exception_store_type_register(struct dm_exception_store_type *type)
+{
+ int r = 0;
+
+ spin_lock(&_lock);
+ if (!__find_exception_store_type(type->name))
+ list_add(&type->list, &_exception_store_types);
+ else
+ r = -EEXIST;
+ spin_unlock(&_lock);
+
+ return r;
+}
+EXPORT_SYMBOL(dm_exception_store_type_register);
+
+int dm_exception_store_type_unregister(struct dm_exception_store_type *type)
+{
+ spin_lock(&_lock);
+
+ if (!__find_exception_store_type(type->name)) {
+ spin_unlock(&_lock);
+ return -EINVAL;
+ }
+
+ list_del(&type->list);
+
+ spin_unlock(&_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(dm_exception_store_type_unregister);
+
+/*
+ * Round a number up to the nearest 'size' boundary. size must
+ * be a power of 2.
+ */
+static ulong round_up(ulong n, ulong size)
+{
+ size--;
+ return (n + size) & ~size;
+}
+
+static int set_chunk_size(struct dm_exception_store *store,
+ const char *chunk_size_arg, char **error)
+{
+ unsigned long chunk_size_ulong;
+ char *value;
+
+ chunk_size_ulong = simple_strtoul(chunk_size_arg, &value, 10);
+ if (*chunk_size_arg == '\0' || *value != '\0') {
+ *error = "Invalid chunk size";
+ return -EINVAL;
+ }
+
+ if (!chunk_size_ulong) {
+ store->chunk_size = store->chunk_mask = store->chunk_shift = 0;
+ return 0;
+ }
+
+ /*
+ * Chunk size must be multiple of page size. Silently
+ * round up if it's not.
+ */
+ chunk_size_ulong = round_up(chunk_size_ulong, PAGE_SIZE >> 9);
+
+ /* Check chunk_size is a power of 2 */
+ if (!is_power_of_2(chunk_size_ulong)) {
+ *error = "Chunk size is not a power of 2";
+ return -EINVAL;
+ }
+
+ /* Validate the chunk size against the device block size */
+ if (chunk_size_ulong % (bdev_hardsect_size(store->cow->bdev) >> 9)) {
+ *error = "Chunk size is not a multiple of device blocksize";
+ return -EINVAL;
+ }
+
+ store->chunk_size = chunk_size_ulong;
+ store->chunk_mask = chunk_size_ulong - 1;
+ store->chunk_shift = ffs(chunk_size_ulong) - 1;
+
+ return 0;
+}
+
+int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
+ unsigned *args_used,
+ struct dm_exception_store **store)
+{
+ int r = 0;
+ struct dm_exception_store_type *type;
+ struct dm_exception_store *tmp_store;
+ char persistent;
+
+ if (argc < 3) {
+ ti->error = "Insufficient exception store arguments";
+ return -EINVAL;
+ }
+
+ tmp_store = kmalloc(sizeof(*tmp_store), GFP_KERNEL);
+ if (!tmp_store) {
+ ti->error = "Exception store allocation failed";
+ return -ENOMEM;
+ }
+
+ persistent = toupper(*argv[1]);
+ if (persistent != 'P' && persistent != 'N') {
+ ti->error = "Persistent flag is not P or N";
+ return -EINVAL;
+ }
+
+ type = get_type(argv[1]);
+ if (!type) {
+ ti->error = "Exception store type not recognised";
+ r = -EINVAL;
+ goto bad_type;
+ }
+
+ tmp_store->type = type;
+ tmp_store->ti = ti;
+
+ r = dm_get_device(ti, argv[0], 0, 0,
+ FMODE_READ | FMODE_WRITE, &tmp_store->cow);
+ if (r) {
+ ti->error = "Cannot get COW device";
+ goto bad_cow;
+ }
+
+ r = set_chunk_size(tmp_store, argv[2], &ti->error);
+ if (r)
+ goto bad_cow;
+
+ r = type->ctr(tmp_store, 0, NULL);
+ if (r) {
+ ti->error = "Exception store type constructor failed";
+ goto bad_ctr;
+ }
+
+ *args_used = 3;
+ *store = tmp_store;
+ return 0;
+
+bad_ctr:
+ dm_put_device(ti, tmp_store->cow);
+bad_cow:
+ put_type(type);
+bad_type:
+ kfree(tmp_store);
+ return r;
+}
+EXPORT_SYMBOL(dm_exception_store_create);
+
+void dm_exception_store_destroy(struct dm_exception_store *store)
+{
+ store->type->dtr(store);
+ dm_put_device(store->ti, store->cow);
+ put_type(store->type);
+ kfree(store);
+}
+EXPORT_SYMBOL(dm_exception_store_destroy);
+
int dm_exception_store_init(void)
{
int r;
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index bb9f33d5daa2..0a2e6e7f67b3 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -37,11 +37,18 @@ struct dm_snap_exception {
* Abstraction to handle the meta/layout of exception stores (the
* COW device).
*/
-struct dm_exception_store {
+struct dm_exception_store;
+struct dm_exception_store_type {
+ const char *name;
+ struct module *module;
+
+ int (*ctr) (struct dm_exception_store *store,
+ unsigned argc, char **argv);
+
/*
* Destroys this object when you've finished with it.
*/
- void (*destroy) (struct dm_exception_store *store);
+ void (*dtr) (struct dm_exception_store *store);
/*
* The target shouldn't read the COW device until this is
@@ -72,8 +79,9 @@ struct dm_exception_store {
*/
void (*drop_snapshot) (struct dm_exception_store *store);
- int (*status) (struct dm_exception_store *store, status_type_t status,
- char *result, unsigned int maxlen);
+ unsigned (*status) (struct dm_exception_store *store,
+ status_type_t status, char *result,
+ unsigned maxlen);
/*
* Return how full the snapshot is.
@@ -82,7 +90,21 @@ struct dm_exception_store {
sector_t *numerator,
sector_t *denominator);
- struct dm_snapshot *snap;
+ /* For internal device-mapper use only. */
+ struct list_head list;
+};
+
+struct dm_exception_store {
+ struct dm_exception_store_type *type;
+ struct dm_target *ti;
+
+ struct dm_dev *cow;
+
+ /* Size of data blocks saved - must be a power of 2 */
+ chunk_t chunk_size;
+ chunk_t chunk_mask;
+ chunk_t chunk_shift;
+
void *context;
};
@@ -129,6 +151,28 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
# endif
+/*
+ * Return the number of sectors in the device.
+ */
+static inline sector_t get_dev_size(struct block_device *bdev)
+{
+ return bdev->bd_inode->i_size >> SECTOR_SHIFT;
+}
+
+static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
+ sector_t sector)
+{
+ return (sector & ~store->chunk_mask) >> store->chunk_shift;
+}
+
+int dm_exception_store_type_register(struct dm_exception_store_type *type);
+int dm_exception_store_type_unregister(struct dm_exception_store_type *type);
+
+int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
+ unsigned *args_used,
+ struct dm_exception_store **store);
+void dm_exception_store_destroy(struct dm_exception_store *store);
+
int dm_exception_store_init(void);
void dm_exception_store_exit(void);
@@ -141,8 +185,4 @@ void dm_persistent_snapshot_exit(void);
int dm_transient_snapshot_init(void);
void dm_transient_snapshot_exit(void);
-int dm_create_persistent(struct dm_exception_store *store);
-
-int dm_create_transient(struct dm_exception_store *store);
-
#endif /* _LINUX_DM_EXCEPTION_STORE */
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index f14813be4eff..e73aabd61cd7 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -292,6 +292,8 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
(PAGE_SIZE >> SECTOR_SHIFT));
num_bvecs = 1 + min_t(int, bio_get_nr_vecs(where->bdev),
num_bvecs);
+ if (unlikely(num_bvecs > BIO_MAX_PAGES))
+ num_bvecs = BIO_MAX_PAGES;
bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
bio->bi_sector = where->sector + (where->count - remaining);
bio->bi_bdev = where->bdev;
@@ -368,16 +370,13 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
while (1) {
set_current_state(TASK_UNINTERRUPTIBLE);
- if (!atomic_read(&io.count) || signal_pending(current))
+ if (!atomic_read(&io.count))
break;
io_schedule();
}
set_current_state(TASK_RUNNING);
- if (atomic_read(&io.count))
- return -EINTR;
-
if (error_bits)
*error_bits = io.error_bits;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 54d0588fc1f6..823ceba6efa8 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -704,7 +704,8 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
char *new_name = (char *) param + param->data_start;
if (new_name < param->data ||
- invalid_str(new_name, (void *) param + param_size)) {
+ invalid_str(new_name, (void *) param + param_size) ||
+ strlen(new_name) > DM_NAME_LEN - 1) {
DMWARN("Invalid new logical volume name supplied.");
return -EINVAL;
}
@@ -1046,6 +1047,19 @@ static int populate_table(struct dm_table *table,
return dm_table_complete(table);
}
+static int table_prealloc_integrity(struct dm_table *t,
+ struct mapped_device *md)
+{
+ struct list_head *devices = dm_table_get_devices(t);
+ struct dm_dev_internal *dd;
+
+ list_for_each_entry(dd, devices, list)
+ if (bdev_get_integrity(dd->dm_dev.bdev))
+ return blk_integrity_register(dm_disk(md), NULL);
+
+ return 0;
+}
+
static int table_load(struct dm_ioctl *param, size_t param_size)
{
int r;
@@ -1063,7 +1077,15 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
r = populate_table(t, param, param_size);
if (r) {
- dm_table_put(t);
+ dm_table_destroy(t);
+ goto out;
+ }
+
+ r = table_prealloc_integrity(t, md);
+ if (r) {
+ DMERR("%s: could not register integrity profile.",
+ dm_device_name(md));
+ dm_table_destroy(t);
goto out;
}
@@ -1071,7 +1093,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
hc = dm_get_mdptr(md);
if (!hc || hc->md != md) {
DMWARN("device has been removed from the dev hash table.");
- dm_table_put(t);
+ dm_table_destroy(t);
up_write(&_hash_lock);
r = -ENXIO;
goto out;
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 0a225da21272..3e3fc06cb861 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -297,7 +297,8 @@ static int run_complete_job(struct kcopyd_job *job)
dm_kcopyd_notify_fn fn = job->fn;
struct dm_kcopyd_client *kc = job->kc;
- kcopyd_put_pages(kc, job->pages);
+ if (job->pages)
+ kcopyd_put_pages(kc, job->pages);
mempool_free(job, kc->job_pool);
fn(read_err, write_err, context);
@@ -461,6 +462,7 @@ static void segment_complete(int read_err, unsigned long write_err,
sector_t progress = 0;
sector_t count = 0;
struct kcopyd_job *job = (struct kcopyd_job *) context;
+ struct dm_kcopyd_client *kc = job->kc;
mutex_lock(&job->lock);
@@ -490,7 +492,7 @@ static void segment_complete(int read_err, unsigned long write_err,
if (count) {
int i;
- struct kcopyd_job *sub_job = mempool_alloc(job->kc->job_pool,
+ struct kcopyd_job *sub_job = mempool_alloc(kc->job_pool,
GFP_NOIO);
*sub_job = *job;
@@ -509,13 +511,16 @@ static void segment_complete(int read_err, unsigned long write_err,
} else if (atomic_dec_and_test(&job->sub_jobs)) {
/*
- * To avoid a race we must keep the job around
- * until after the notify function has completed.
- * Otherwise the client may try and stop the job
- * after we've completed.
+ * Queue the completion callback to the kcopyd thread.
+ *
+ * Some callers assume that all the completions are called
+ * from a single thread and don't race with each other.
+ *
+ * We must not call the callback directly here because this
+ * code may not be executing in the thread.
*/
- job->fn(read_err, write_err, job->context);
- mempool_free(job, job->kc->job_pool);
+ push(&kc->complete_jobs, job);
+ wake(kc);
}
}
@@ -528,6 +533,8 @@ static void split_job(struct kcopyd_job *job)
{
int i;
+ atomic_inc(&job->kc->nr_jobs);
+
atomic_set(&job->sub_jobs, SPLIT_COUNT);
for (i = 0; i < SPLIT_COUNT; i++)
segment_complete(0, 0u, job);
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index bfa107f59d96..79fb53e51c70 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -142,7 +142,6 @@ static struct target_type linear_target = {
.status = linear_status,
.ioctl = linear_ioctl,
.merge = linear_merge,
- .features = DM_TARGET_SUPPORTS_BARRIERS,
};
int __init dm_linear_init(void)
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 737961f275c1..be233bc4d917 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -16,40 +16,29 @@
#define DM_MSG_PREFIX "dirty region log"
-struct dm_dirty_log_internal {
- struct dm_dirty_log_type *type;
-
- struct list_head list;
- long use;
-};
-
static LIST_HEAD(_log_types);
static DEFINE_SPINLOCK(_lock);
-static struct dm_dirty_log_internal *__find_dirty_log_type(const char *name)
+static struct dm_dirty_log_type *__find_dirty_log_type(const char *name)
{
- struct dm_dirty_log_internal *log_type;
+ struct dm_dirty_log_type *log_type;
list_for_each_entry(log_type, &_log_types, list)
- if (!strcmp(name, log_type->type->name))
+ if (!strcmp(name, log_type->name))
return log_type;
return NULL;
}
-static struct dm_dirty_log_internal *_get_dirty_log_type(const char *name)
+static struct dm_dirty_log_type *_get_dirty_log_type(const char *name)
{
- struct dm_dirty_log_internal *log_type;
+ struct dm_dirty_log_type *log_type;
spin_lock(&_lock);
log_type = __find_dirty_log_type(name);
- if (log_type) {
- if (!log_type->use && !try_module_get(log_type->type->module))
- log_type = NULL;
- else
- log_type->use++;
- }
+ if (log_type && !try_module_get(log_type->module))
+ log_type = NULL;
spin_unlock(&_lock);
@@ -76,14 +65,14 @@ static struct dm_dirty_log_internal *_get_dirty_log_type(const char *name)
static struct dm_dirty_log_type *get_type(const char *type_name)
{
char *p, *type_name_dup;
- struct dm_dirty_log_internal *log_type;
+ struct dm_dirty_log_type *log_type;
if (!type_name)
return NULL;
log_type = _get_dirty_log_type(type_name);
if (log_type)
- return log_type->type;
+ return log_type;
type_name_dup = kstrdup(type_name, GFP_KERNEL);
if (!type_name_dup) {
@@ -105,56 +94,33 @@ static struct dm_dirty_log_type *get_type(const char *type_name)
kfree(type_name_dup);
- return log_type ? log_type->type : NULL;
+ return log_type;
}
static void put_type(struct dm_dirty_log_type *type)
{
- struct dm_dirty_log_internal *log_type;
-
if (!type)
return;
spin_lock(&_lock);
- log_type = __find_dirty_log_type(type->name);
- if (!log_type)
+ if (!__find_dirty_log_type(type->name))
goto out;
- if (!--log_type->use)
- module_put(type->module);
-
- BUG_ON(log_type->use < 0);
+ module_put(type->module);
out:
spin_unlock(&_lock);
}
-static struct dm_dirty_log_internal *_alloc_dirty_log_type(struct dm_dirty_log_type *type)
-{
- struct dm_dirty_log_internal *log_type = kzalloc(sizeof(*log_type),
- GFP_KERNEL);
-
- if (log_type)
- log_type->type = type;
-
- return log_type;
-}
-
int dm_dirty_log_type_register(struct dm_dirty_log_type *type)
{
- struct dm_dirty_log_internal *log_type = _alloc_dirty_log_type(type);
int r = 0;
- if (!log_type)
- return -ENOMEM;
-
spin_lock(&_lock);
if (!__find_dirty_log_type(type->name))
- list_add(&log_type->list, &_log_types);
- else {
- kfree(log_type);
+ list_add(&type->list, &_log_types);
+ else
r = -EEXIST;
- }
spin_unlock(&_lock);
return r;
@@ -163,25 +129,16 @@ EXPORT_SYMBOL(dm_dirty_log_type_register);
int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type)
{
- struct dm_dirty_log_internal *log_type;
-
spin_lock(&_lock);
- log_type = __find_dirty_log_type(type->name);
- if (!log_type) {
+ if (!__find_dirty_log_type(type->name)) {
spin_unlock(&_lock);
return -EINVAL;
}
- if (log_type->use) {
- spin_unlock(&_lock);
- return -ETXTBSY;
- }
-
- list_del(&log_type->list);
+ list_del(&type->list);
spin_unlock(&_lock);
- kfree(log_type);
return 0;
}
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 095f77bf9681..6a386ab4f7eb 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -8,7 +8,6 @@
#include <linux/device-mapper.h>
#include "dm-path-selector.h"
-#include "dm-bio-list.h"
#include "dm-bio-record.h"
#include "dm-uevent.h"
diff --git a/drivers/md/dm-path-selector.c b/drivers/md/dm-path-selector.c
index 96ea226155b1..42c04f04a0c4 100644
--- a/drivers/md/dm-path-selector.c
+++ b/drivers/md/dm-path-selector.c
@@ -17,9 +17,7 @@
struct ps_internal {
struct path_selector_type pst;
-
struct list_head list;
- long use;
};
#define pst_to_psi(__pst) container_of((__pst), struct ps_internal, pst)
@@ -45,12 +43,8 @@ static struct ps_internal *get_path_selector(const char *name)
down_read(&_ps_lock);
psi = __find_path_selector_type(name);
- if (psi) {
- if ((psi->use == 0) && !try_module_get(psi->pst.module))
- psi = NULL;
- else
- psi->use++;
- }
+ if (psi && !try_module_get(psi->pst.module))
+ psi = NULL;
up_read(&_ps_lock);
return psi;
@@ -84,11 +78,7 @@ void dm_put_path_selector(struct path_selector_type *pst)
if (!psi)
goto out;
- if (--psi->use == 0)
- module_put(psi->pst.module);
-
- BUG_ON(psi->use < 0);
-
+ module_put(psi->pst.module);
out:
up_read(&_ps_lock);
}
@@ -136,11 +126,6 @@ int dm_unregister_path_selector(struct path_selector_type *pst)
return -EINVAL;
}
- if (psi->use) {
- up_write(&_ps_lock);
- return -ETXTBSY;
- }
-
list_del(&psi->list);
up_write(&_ps_lock);
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 4d6bc101962e..076fbb4e967a 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -5,7 +5,6 @@
* This file is released under the GPL.
*/
-#include "dm-bio-list.h"
#include "dm-bio-record.h"
#include <linux/init.h>
@@ -145,6 +144,8 @@ struct dm_raid1_read_record {
struct dm_bio_details details;
};
+static struct kmem_cache *_dm_raid1_read_record_cache;
+
/*
* Every mirror should look like this one.
*/
@@ -586,6 +587,9 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
int state;
struct bio *bio;
struct bio_list sync, nosync, recover, *this_list = NULL;
+ struct bio_list requeue;
+ struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
+ region_t region;
if (!writes->head)
return;
@@ -596,10 +600,18 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
bio_list_init(&sync);
bio_list_init(&nosync);
bio_list_init(&recover);
+ bio_list_init(&requeue);
while ((bio = bio_list_pop(writes))) {
- state = dm_rh_get_state(ms->rh,
- dm_rh_bio_to_region(ms->rh, bio), 1);
+ region = dm_rh_bio_to_region(ms->rh, bio);
+
+ if (log->type->is_remote_recovering &&
+ log->type->is_remote_recovering(log, region)) {
+ bio_list_add(&requeue, bio);
+ continue;
+ }
+
+ state = dm_rh_get_state(ms->rh, region, 1);
switch (state) {
case DM_RH_CLEAN:
case DM_RH_DIRTY:
@@ -619,6 +631,16 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
}
/*
+ * Add bios that are delayed due to remote recovery
+ * back on to the write queue
+ */
+ if (unlikely(requeue.head)) {
+ spin_lock_irq(&ms->lock);
+ bio_list_merge(&ms->writes, &requeue);
+ spin_unlock_irq(&ms->lock);
+ }
+
+ /*
* Increment the pending counts for any regions that will
* be written to (writes to recover regions are going to
* be delayed).
@@ -764,9 +786,9 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
atomic_set(&ms->suspend, 0);
atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
- len = sizeof(struct dm_raid1_read_record);
- ms->read_record_pool = mempool_create_kmalloc_pool(MIN_READ_RECORDS,
- len);
+ ms->read_record_pool = mempool_create_slab_pool(MIN_READ_RECORDS,
+ _dm_raid1_read_record_cache);
+
if (!ms->read_record_pool) {
ti->error = "Error creating mirror read_record_pool";
kfree(ms);
@@ -1279,16 +1301,31 @@ static int __init dm_mirror_init(void)
{
int r;
+ _dm_raid1_read_record_cache = KMEM_CACHE(dm_raid1_read_record, 0);
+ if (!_dm_raid1_read_record_cache) {
+ DMERR("Can't allocate dm_raid1_read_record cache");
+ r = -ENOMEM;
+ goto bad_cache;
+ }
+
r = dm_register_target(&mirror_target);
- if (r < 0)
+ if (r < 0) {
DMERR("Failed to register mirror target");
+ goto bad_target;
+ }
+
+ return 0;
+bad_target:
+ kmem_cache_destroy(_dm_raid1_read_record_cache);
+bad_cache:
return r;
}
static void __exit dm_mirror_exit(void)
{
dm_unregister_target(&mirror_target);
+ kmem_cache_destroy(_dm_raid1_read_record_cache);
}
/* Module hooks */
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 59f8d9df9e1a..7b899be0b087 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -14,7 +14,6 @@
#include <linux/vmalloc.h>
#include "dm.h"
-#include "dm-bio-list.h"
#define DM_MSG_PREFIX "region hash"
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 936b34e0959f..e75c6dd76a9a 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -6,7 +6,6 @@
*/
#include "dm-exception-store.h"
-#include "dm-snap.h"
#include <linux/mm.h>
#include <linux/pagemap.h>
@@ -89,7 +88,7 @@ struct commit_callback {
* The top level structure for a persistent exception store.
*/
struct pstore {
- struct dm_snapshot *snap; /* up pointer to my snapshot */
+ struct dm_exception_store *store;
int version;
int valid;
uint32_t exceptions_per_area;
@@ -141,7 +140,7 @@ static int alloc_area(struct pstore *ps)
int r = -ENOMEM;
size_t len;
- len = ps->snap->chunk_size << SECTOR_SHIFT;
+ len = ps->store->chunk_size << SECTOR_SHIFT;
/*
* Allocate the chunk_size block of memory that will hold
@@ -163,9 +162,12 @@ static int alloc_area(struct pstore *ps)
static void free_area(struct pstore *ps)
{
- vfree(ps->area);
+ if (ps->area)
+ vfree(ps->area);
ps->area = NULL;
- vfree(ps->zero_area);
+
+ if (ps->zero_area)
+ vfree(ps->zero_area);
ps->zero_area = NULL;
}
@@ -189,9 +191,9 @@ static void do_metadata(struct work_struct *work)
static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata)
{
struct dm_io_region where = {
- .bdev = ps->snap->cow->bdev,
- .sector = ps->snap->chunk_size * chunk,
- .count = ps->snap->chunk_size,
+ .bdev = ps->store->cow->bdev,
+ .sector = ps->store->chunk_size * chunk,
+ .count = ps->store->chunk_size,
};
struct dm_io_request io_req = {
.bi_rw = rw,
@@ -247,15 +249,15 @@ static int area_io(struct pstore *ps, int rw)
static void zero_memory_area(struct pstore *ps)
{
- memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
+ memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT);
}
static int zero_disk_area(struct pstore *ps, chunk_t area)
{
struct dm_io_region where = {
- .bdev = ps->snap->cow->bdev,
- .sector = ps->snap->chunk_size * area_location(ps, area),
- .count = ps->snap->chunk_size,
+ .bdev = ps->store->cow->bdev,
+ .sector = ps->store->chunk_size * area_location(ps, area),
+ .count = ps->store->chunk_size,
};
struct dm_io_request io_req = {
.bi_rw = WRITE,
@@ -278,15 +280,15 @@ static int read_header(struct pstore *ps, int *new_snapshot)
/*
* Use default chunk size (or hardsect_size, if larger) if none supplied
*/
- if (!ps->snap->chunk_size) {
- ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
- bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
- ps->snap->chunk_mask = ps->snap->chunk_size - 1;
- ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
+ if (!ps->store->chunk_size) {
+ ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
+ bdev_hardsect_size(ps->store->cow->bdev) >> 9);
+ ps->store->chunk_mask = ps->store->chunk_size - 1;
+ ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1;
chunk_size_supplied = 0;
}
- ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap->
+ ps->io_client = dm_io_client_create(sectors_to_pages(ps->store->
chunk_size));
if (IS_ERR(ps->io_client))
return PTR_ERR(ps->io_client);
@@ -317,22 +319,22 @@ static int read_header(struct pstore *ps, int *new_snapshot)
ps->version = le32_to_cpu(dh->version);
chunk_size = le32_to_cpu(dh->chunk_size);
- if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
+ if (!chunk_size_supplied || ps->store->chunk_size == chunk_size)
return 0;
DMWARN("chunk size %llu in device metadata overrides "
"table chunk size of %llu.",
(unsigned long long)chunk_size,
- (unsigned long long)ps->snap->chunk_size);
+ (unsigned long long)ps->store->chunk_size);
/* We had a bogus chunk_size. Fix stuff up. */
free_area(ps);
- ps->snap->chunk_size = chunk_size;
- ps->snap->chunk_mask = chunk_size - 1;
- ps->snap->chunk_shift = ffs(chunk_size) - 1;
+ ps->store->chunk_size = chunk_size;
+ ps->store->chunk_mask = chunk_size - 1;
+ ps->store->chunk_shift = ffs(chunk_size) - 1;
- r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
+ r = dm_io_client_resize(sectors_to_pages(ps->store->chunk_size),
ps->io_client);
if (r)
return r;
@@ -349,13 +351,13 @@ static int write_header(struct pstore *ps)
{
struct disk_header *dh;
- memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
+ memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT);
dh = (struct disk_header *) ps->area;
dh->magic = cpu_to_le32(SNAP_MAGIC);
dh->valid = cpu_to_le32(ps->valid);
dh->version = cpu_to_le32(ps->version);
- dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);
+ dh->chunk_size = cpu_to_le32(ps->store->chunk_size);
return chunk_io(ps, 0, WRITE, 1);
}
@@ -474,18 +476,25 @@ static struct pstore *get_info(struct dm_exception_store *store)
static void persistent_fraction_full(struct dm_exception_store *store,
sector_t *numerator, sector_t *denominator)
{
- *numerator = get_info(store)->next_free * store->snap->chunk_size;
- *denominator = get_dev_size(store->snap->cow->bdev);
+ *numerator = get_info(store)->next_free * store->chunk_size;
+ *denominator = get_dev_size(store->cow->bdev);
}
-static void persistent_destroy(struct dm_exception_store *store)
+static void persistent_dtr(struct dm_exception_store *store)
{
struct pstore *ps = get_info(store);
destroy_workqueue(ps->metadata_wq);
- dm_io_client_destroy(ps->io_client);
- vfree(ps->callbacks);
+
+ /* Created in read_header */
+ if (ps->io_client)
+ dm_io_client_destroy(ps->io_client);
free_area(ps);
+
+ /* Allocated in persistent_read_metadata */
+ if (ps->callbacks)
+ vfree(ps->callbacks);
+
kfree(ps);
}
@@ -507,7 +516,7 @@ static int persistent_read_metadata(struct dm_exception_store *store,
/*
* Now we know correct chunk_size, complete the initialisation.
*/
- ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
+ ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) /
sizeof(struct disk_exception);
ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
sizeof(*ps->callbacks));
@@ -564,10 +573,10 @@ static int persistent_prepare_exception(struct dm_exception_store *store,
struct pstore *ps = get_info(store);
uint32_t stride;
chunk_t next_free;
- sector_t size = get_dev_size(store->snap->cow->bdev);
+ sector_t size = get_dev_size(store->cow->bdev);
/* Is there enough room ? */
- if (size < ((ps->next_free + 1) * store->snap->chunk_size))
+ if (size < ((ps->next_free + 1) * store->chunk_size))
return -ENOSPC;
e->new_chunk = ps->next_free;
@@ -656,16 +665,17 @@ static void persistent_drop_snapshot(struct dm_exception_store *store)
DMWARN("write header failed");
}
-int dm_create_persistent(struct dm_exception_store *store)
+static int persistent_ctr(struct dm_exception_store *store,
+ unsigned argc, char **argv)
{
struct pstore *ps;
/* allocate the pstore */
- ps = kmalloc(sizeof(*ps), GFP_KERNEL);
+ ps = kzalloc(sizeof(*ps), GFP_KERNEL);
if (!ps)
return -ENOMEM;
- ps->snap = store->snap;
+ ps->store = store;
ps->valid = 1;
ps->version = SNAPSHOT_DISK_VERSION;
ps->area = NULL;
@@ -683,22 +693,77 @@ int dm_create_persistent(struct dm_exception_store *store)
return -ENOMEM;
}
- store->destroy = persistent_destroy;
- store->read_metadata = persistent_read_metadata;
- store->prepare_exception = persistent_prepare_exception;
- store->commit_exception = persistent_commit_exception;
- store->drop_snapshot = persistent_drop_snapshot;
- store->fraction_full = persistent_fraction_full;
store->context = ps;
return 0;
}
+static unsigned persistent_status(struct dm_exception_store *store,
+ status_type_t status, char *result,
+ unsigned maxlen)
+{
+ unsigned sz = 0;
+
+ switch (status) {
+ case STATUSTYPE_INFO:
+ break;
+ case STATUSTYPE_TABLE:
+ DMEMIT(" %s P %llu", store->cow->name,
+ (unsigned long long)store->chunk_size);
+ }
+
+ return sz;
+}
+
+static struct dm_exception_store_type _persistent_type = {
+ .name = "persistent",
+ .module = THIS_MODULE,
+ .ctr = persistent_ctr,
+ .dtr = persistent_dtr,
+ .read_metadata = persistent_read_metadata,
+ .prepare_exception = persistent_prepare_exception,
+ .commit_exception = persistent_commit_exception,
+ .drop_snapshot = persistent_drop_snapshot,
+ .fraction_full = persistent_fraction_full,
+ .status = persistent_status,
+};
+
+static struct dm_exception_store_type _persistent_compat_type = {
+ .name = "P",
+ .module = THIS_MODULE,
+ .ctr = persistent_ctr,
+ .dtr = persistent_dtr,
+ .read_metadata = persistent_read_metadata,
+ .prepare_exception = persistent_prepare_exception,
+ .commit_exception = persistent_commit_exception,
+ .drop_snapshot = persistent_drop_snapshot,
+ .fraction_full = persistent_fraction_full,
+ .status = persistent_status,
+};
+
int dm_persistent_snapshot_init(void)
{
- return 0;
+ int r;
+
+ r = dm_exception_store_type_register(&_persistent_type);
+ if (r) {
+ DMERR("Unable to register persistent exception store type");
+ return r;
+ }
+
+ r = dm_exception_store_type_register(&_persistent_compat_type);
+ if (r) {
+ DMERR("Unable to register old-style persistent exception "
+ "store type");
+ dm_exception_store_type_unregister(&_persistent_type);
+ return r;
+ }
+
+ return r;
}
void dm_persistent_snapshot_exit(void)
{
+ dm_exception_store_type_unregister(&_persistent_type);
+ dm_exception_store_type_unregister(&_persistent_compat_type);
}
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c
index 7f6e2e6dcb0d..cde5aa558e6d 100644
--- a/drivers/md/dm-snap-transient.c
+++ b/drivers/md/dm-snap-transient.c
@@ -6,7 +6,6 @@
*/
#include "dm-exception-store.h"
-#include "dm-snap.h"
#include <linux/mm.h>
#include <linux/pagemap.h>
@@ -23,7 +22,7 @@ struct transient_c {
sector_t next_free;
};
-static void transient_destroy(struct dm_exception_store *store)
+static void transient_dtr(struct dm_exception_store *store)
{
kfree(store->context);
}
@@ -39,14 +38,14 @@ static int transient_read_metadata(struct dm_exception_store *store,
static int transient_prepare_exception(struct dm_exception_store *store,
struct dm_snap_exception *e)
{
- struct transient_c *tc = (struct transient_c *) store->context;
- sector_t size = get_dev_size(store->snap->cow->bdev);
+ struct transient_c *tc = store->context;
+ sector_t size = get_dev_size(store->cow->bdev);
- if (size < (tc->next_free + store->snap->chunk_size))
+ if (size < (tc->next_free + store->chunk_size))
return -1;
- e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
- tc->next_free += store->snap->chunk_size;
+ e->new_chunk = sector_to_chunk(store, tc->next_free);
+ tc->next_free += store->chunk_size;
return 0;
}
@@ -64,20 +63,14 @@ static void transient_fraction_full(struct dm_exception_store *store,
sector_t *numerator, sector_t *denominator)
{
*numerator = ((struct transient_c *) store->context)->next_free;
- *denominator = get_dev_size(store->snap->cow->bdev);
+ *denominator = get_dev_size(store->cow->bdev);
}
-int dm_create_transient(struct dm_exception_store *store)
+static int transient_ctr(struct dm_exception_store *store,
+ unsigned argc, char **argv)
{
struct transient_c *tc;
- store->destroy = transient_destroy;
- store->read_metadata = transient_read_metadata;
- store->prepare_exception = transient_prepare_exception;
- store->commit_exception = transient_commit_exception;
- store->drop_snapshot = NULL;
- store->fraction_full = transient_fraction_full;
-
tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
if (!tc)
return -ENOMEM;
@@ -88,11 +81,70 @@ int dm_create_transient(struct dm_exception_store *store)
return 0;
}
+static unsigned transient_status(struct dm_exception_store *store,
+ status_type_t status, char *result,
+ unsigned maxlen)
+{
+ unsigned sz = 0;
+
+ switch (status) {
+ case STATUSTYPE_INFO:
+ break;
+ case STATUSTYPE_TABLE:
+ DMEMIT(" %s N %llu", store->cow->name,
+ (unsigned long long)store->chunk_size);
+ }
+
+ return sz;
+}
+
+static struct dm_exception_store_type _transient_type = {
+ .name = "transient",
+ .module = THIS_MODULE,
+ .ctr = transient_ctr,
+ .dtr = transient_dtr,
+ .read_metadata = transient_read_metadata,
+ .prepare_exception = transient_prepare_exception,
+ .commit_exception = transient_commit_exception,
+ .fraction_full = transient_fraction_full,
+ .status = transient_status,
+};
+
+static struct dm_exception_store_type _transient_compat_type = {
+ .name = "N",
+ .module = THIS_MODULE,
+ .ctr = transient_ctr,
+ .dtr = transient_dtr,
+ .read_metadata = transient_read_metadata,
+ .prepare_exception = transient_prepare_exception,
+ .commit_exception = transient_commit_exception,
+ .fraction_full = transient_fraction_full,
+ .status = transient_status,
+};
+
int dm_transient_snapshot_init(void)
{
- return 0;
+ int r;
+
+ r = dm_exception_store_type_register(&_transient_type);
+ if (r) {
+ DMWARN("Unable to register transient exception store type");
+ return r;
+ }
+
+ r = dm_exception_store_type_register(&_transient_compat_type);
+ if (r) {
+ DMWARN("Unable to register old-style transient "
+ "exception store type");
+ dm_exception_store_type_unregister(&_transient_type);
+ return r;
+ }
+
+ return r;
}
void dm_transient_snapshot_exit(void)
{
+ dm_exception_store_type_unregister(&_transient_type);
+ dm_exception_store_type_unregister(&_transient_compat_type);
}
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 65ff82ff124e..d73f17fc7778 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -7,7 +7,6 @@
*/
#include <linux/blkdev.h>
-#include <linux/ctype.h>
#include <linux/device-mapper.h>
#include <linux/delay.h>
#include <linux/fs.h>
@@ -20,10 +19,9 @@
#include <linux/vmalloc.h>
#include <linux/log2.h>
#include <linux/dm-kcopyd.h>
+#include <linux/workqueue.h>
#include "dm-exception-store.h"
-#include "dm-snap.h"
-#include "dm-bio-list.h"
#define DM_MSG_PREFIX "snapshots"
@@ -47,9 +45,76 @@
*/
#define MIN_IOS 256
+#define DM_TRACKED_CHUNK_HASH_SIZE 16
+#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \
+ (DM_TRACKED_CHUNK_HASH_SIZE - 1))
+
+struct exception_table {
+ uint32_t hash_mask;
+ unsigned hash_shift;
+ struct list_head *table;
+};
+
+struct dm_snapshot {
+ struct rw_semaphore lock;
+
+ struct dm_dev *origin;
+
+ /* List of snapshots per Origin */
+ struct list_head list;
+
+ /* You can't use a snapshot if this is 0 (e.g. if full) */
+ int valid;
+
+ /* Origin writes don't trigger exceptions until this is set */
+ int active;
+
+ mempool_t *pending_pool;
+
+ atomic_t pending_exceptions_count;
+
+ struct exception_table pending;
+ struct exception_table complete;
+
+ /*
+ * pe_lock protects all pending_exception operations and access
+ * as well as the snapshot_bios list.
+ */
+ spinlock_t pe_lock;
+
+ /* The on disk metadata handler */
+ struct dm_exception_store *store;
+
+ struct dm_kcopyd_client *kcopyd_client;
+
+ /* Queue of snapshot writes for ksnapd to flush */
+ struct bio_list queued_bios;
+ struct work_struct queued_bios_work;
+
+ /* Chunks with outstanding reads */
+ mempool_t *tracked_chunk_pool;
+ spinlock_t tracked_chunk_lock;
+ struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
+};
+
static struct workqueue_struct *ksnapd;
static void flush_queued_bios(struct work_struct *work);
+static sector_t chunk_to_sector(struct dm_exception_store *store,
+ chunk_t chunk)
+{
+ return chunk << store->chunk_shift;
+}
+
+static int bdev_equal(struct block_device *lhs, struct block_device *rhs)
+{
+ /*
+ * There is only ever one instance of a particular block
+ * device so we can compare pointers safely.
+ */
+ return lhs == rhs;
+}
+
struct dm_snap_pending_exception {
struct dm_snap_exception e;
@@ -476,11 +541,11 @@ static int init_hash_tables(struct dm_snapshot *s)
* Calculate based on the size of the original volume or
* the COW volume...
*/
- cow_dev_size = get_dev_size(s->cow->bdev);
+ cow_dev_size = get_dev_size(s->store->cow->bdev);
origin_dev_size = get_dev_size(s->origin->bdev);
max_buckets = calc_max_buckets();
- hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift;
+ hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift;
hash_size = min(hash_size, max_buckets);
hash_size = rounddown_pow_of_two(hash_size);
@@ -505,58 +570,6 @@ static int init_hash_tables(struct dm_snapshot *s)
}
/*
- * Round a number up to the nearest 'size' boundary. size must
- * be a power of 2.
- */
-static ulong round_up(ulong n, ulong size)
-{
- size--;
- return (n + size) & ~size;
-}
-
-static int set_chunk_size(struct dm_snapshot *s, const char *chunk_size_arg,
- char **error)
-{
- unsigned long chunk_size;
- char *value;
-
- chunk_size = simple_strtoul(chunk_size_arg, &value, 10);
- if (*chunk_size_arg == '\0' || *value != '\0') {
- *error = "Invalid chunk size";
- return -EINVAL;
- }
-
- if (!chunk_size) {
- s->chunk_size = s->chunk_mask = s->chunk_shift = 0;
- return 0;
- }
-
- /*
- * Chunk size must be multiple of page size. Silently
- * round up if it's not.
- */
- chunk_size = round_up(chunk_size, PAGE_SIZE >> 9);
-
- /* Check chunk_size is a power of 2 */
- if (!is_power_of_2(chunk_size)) {
- *error = "Chunk size is not a power of 2";
- return -EINVAL;
- }
-
- /* Validate the chunk size against the device block size */
- if (chunk_size % (bdev_hardsect_size(s->cow->bdev) >> 9)) {
- *error = "Chunk size is not a multiple of device blocksize";
- return -EINVAL;
- }
-
- s->chunk_size = chunk_size;
- s->chunk_mask = chunk_size - 1;
- s->chunk_shift = ffs(chunk_size) - 1;
-
- return 0;
-}
-
-/*
* Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
*/
static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
@@ -564,91 +577,68 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
struct dm_snapshot *s;
int i;
int r = -EINVAL;
- char persistent;
char *origin_path;
- char *cow_path;
+ struct dm_exception_store *store;
+ unsigned args_used;
if (argc != 4) {
ti->error = "requires exactly 4 arguments";
r = -EINVAL;
- goto bad1;
+ goto bad_args;
}
origin_path = argv[0];
- cow_path = argv[1];
- persistent = toupper(*argv[2]);
+ argv++;
+ argc--;
- if (persistent != 'P' && persistent != 'N') {
- ti->error = "Persistent flag is not P or N";
+ r = dm_exception_store_create(ti, argc, argv, &args_used, &store);
+ if (r) {
+ ti->error = "Couldn't create exception store";
r = -EINVAL;
- goto bad1;
+ goto bad_args;
}
+ argv += args_used;
+ argc -= args_used;
+
s = kmalloc(sizeof(*s), GFP_KERNEL);
- if (s == NULL) {
+ if (!s) {
ti->error = "Cannot allocate snapshot context private "
"structure";
r = -ENOMEM;
- goto bad1;
+ goto bad_snap;
}
r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
if (r) {
ti->error = "Cannot get origin device";
- goto bad2;
- }
-
- r = dm_get_device(ti, cow_path, 0, 0,
- FMODE_READ | FMODE_WRITE, &s->cow);
- if (r) {
- dm_put_device(ti, s->origin);
- ti->error = "Cannot get COW device";
- goto bad2;
+ goto bad_origin;
}
- r = set_chunk_size(s, argv[3], &ti->error);
- if (r)
- goto bad3;
-
- s->type = persistent;
-
+ s->store = store;
s->valid = 1;
s->active = 0;
atomic_set(&s->pending_exceptions_count, 0);
init_rwsem(&s->lock);
spin_lock_init(&s->pe_lock);
- s->ti = ti;
/* Allocate hash table for COW data */
if (init_hash_tables(s)) {
ti->error = "Unable to allocate hash table space";
r = -ENOMEM;
- goto bad3;
- }
-
- s->store.snap = s;
-
- if (persistent == 'P')
- r = dm_create_persistent(&s->store);
- else
- r = dm_create_transient(&s->store);
-
- if (r) {
- ti->error = "Couldn't create exception store";
- r = -EINVAL;
- goto bad4;
+ goto bad_hash_tables;
}
r = dm_kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
if (r) {
ti->error = "Could not create kcopyd client";
- goto bad5;
+ goto bad_kcopyd;
}
s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache);
if (!s->pending_pool) {
ti->error = "Could not allocate mempool for pending exceptions";
- goto bad6;
+ goto bad_pending_pool;
}
s->tracked_chunk_pool = mempool_create_slab_pool(MIN_IOS,
@@ -665,7 +655,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
spin_lock_init(&s->tracked_chunk_lock);
/* Metadata must only be loaded into one table at once */
- r = s->store.read_metadata(&s->store, dm_add_exception, (void *)s);
+ r = s->store->type->read_metadata(s->store, dm_add_exception,
+ (void *)s);
if (r < 0) {
ti->error = "Failed to read snapshot metadata";
goto bad_load_and_register;
@@ -686,34 +677,33 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
ti->private = s;
- ti->split_io = s->chunk_size;
+ ti->split_io = s->store->chunk_size;
return 0;
- bad_load_and_register:
+bad_load_and_register:
mempool_destroy(s->tracked_chunk_pool);
- bad_tracked_chunk_pool:
+bad_tracked_chunk_pool:
mempool_destroy(s->pending_pool);
- bad6:
+bad_pending_pool:
dm_kcopyd_client_destroy(s->kcopyd_client);
- bad5:
- s->store.destroy(&s->store);
-
- bad4:
+bad_kcopyd:
exit_exception_table(&s->pending, pending_cache);
exit_exception_table(&s->complete, exception_cache);
- bad3:
- dm_put_device(ti, s->cow);
+bad_hash_tables:
dm_put_device(ti, s->origin);
- bad2:
+bad_origin:
kfree(s);
- bad1:
+bad_snap:
+ dm_exception_store_destroy(store);
+
+bad_args:
return r;
}
@@ -724,8 +714,6 @@ static void __free_exceptions(struct dm_snapshot *s)
exit_exception_table(&s->pending, pending_cache);
exit_exception_table(&s->complete, exception_cache);
-
- s->store.destroy(&s->store);
}
static void snapshot_dtr(struct dm_target *ti)
@@ -761,7 +749,8 @@ static void snapshot_dtr(struct dm_target *ti)
mempool_destroy(s->pending_pool);
dm_put_device(ti, s->origin);
- dm_put_device(ti, s->cow);
+
+ dm_exception_store_destroy(s->store);
kfree(s);
}
@@ -820,12 +809,12 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err)
else if (err == -ENOMEM)
DMERR("Invalidating snapshot: Unable to allocate exception.");
- if (s->store.drop_snapshot)
- s->store.drop_snapshot(&s->store);
+ if (s->store->type->drop_snapshot)
+ s->store->type->drop_snapshot(s->store);
s->valid = 0;
- dm_table_event(s->ti->table);
+ dm_table_event(s->store->ti->table);
}
static void get_pending_exception(struct dm_snap_pending_exception *pe)
@@ -943,8 +932,8 @@ static void copy_callback(int read_err, unsigned long write_err, void *context)
else
/* Update the metadata if we are persistent */
- s->store.commit_exception(&s->store, &pe->e, commit_callback,
- pe);
+ s->store->type->commit_exception(s->store, &pe->e,
+ commit_callback, pe);
}
/*
@@ -960,11 +949,11 @@ static void start_copy(struct dm_snap_pending_exception *pe)
dev_size = get_dev_size(bdev);
src.bdev = bdev;
- src.sector = chunk_to_sector(s, pe->e.old_chunk);
- src.count = min(s->chunk_size, dev_size - src.sector);
+ src.sector = chunk_to_sector(s->store, pe->e.old_chunk);
+ src.count = min(s->store->chunk_size, dev_size - src.sector);
- dest.bdev = s->cow->bdev;
- dest.sector = chunk_to_sector(s, pe->e.new_chunk);
+ dest.bdev = s->store->cow->bdev;
+ dest.sector = chunk_to_sector(s->store, pe->e.new_chunk);
dest.count = src.count;
/* Hand over to kcopyd */
@@ -972,6 +961,17 @@ static void start_copy(struct dm_snap_pending_exception *pe)
&src, 1, &dest, 0, copy_callback, pe);
}
+static struct dm_snap_pending_exception *
+__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
+{
+ struct dm_snap_exception *e = lookup_exception(&s->pending, chunk);
+
+ if (!e)
+ return NULL;
+
+ return container_of(e, struct dm_snap_pending_exception, e);
+}
+
/*
* Looks to see if this snapshot already has a pending exception
* for this chunk, otherwise it allocates a new one and inserts
@@ -981,40 +981,15 @@ static void start_copy(struct dm_snap_pending_exception *pe)
* this.
*/
static struct dm_snap_pending_exception *
-__find_pending_exception(struct dm_snapshot *s, struct bio *bio)
+__find_pending_exception(struct dm_snapshot *s,
+ struct dm_snap_pending_exception *pe, chunk_t chunk)
{
- struct dm_snap_exception *e;
- struct dm_snap_pending_exception *pe;
- chunk_t chunk = sector_to_chunk(s, bio->bi_sector);
+ struct dm_snap_pending_exception *pe2;
- /*
- * Is there a pending exception for this already ?
- */
- e = lookup_exception(&s->pending, chunk);
- if (e) {
- /* cast the exception to a pending exception */
- pe = container_of(e, struct dm_snap_pending_exception, e);
- goto out;
- }
-
- /*
- * Create a new pending exception, we don't want
- * to hold the lock while we do this.
- */
- up_write(&s->lock);
- pe = alloc_pending_exception(s);
- down_write(&s->lock);
-
- if (!s->valid) {
- free_pending_exception(pe);
- return NULL;
- }
-
- e = lookup_exception(&s->pending, chunk);
- if (e) {
+ pe2 = __lookup_pending_exception(s, chunk);
+ if (pe2) {
free_pending_exception(pe);
- pe = container_of(e, struct dm_snap_pending_exception, e);
- goto out;
+ return pe2;
}
pe->e.old_chunk = chunk;
@@ -1024,7 +999,7 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio)
atomic_set(&pe->ref_count, 0);
pe->started = 0;
- if (s->store.prepare_exception(&s->store, &pe->e)) {
+ if (s->store->type->prepare_exception(s->store, &pe->e)) {
free_pending_exception(pe);
return NULL;
}
@@ -1032,17 +1007,18 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio)
get_pending_exception(pe);
insert_exception(&s->pending, &pe->e);
- out:
return pe;
}
static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e,
struct bio *bio, chunk_t chunk)
{
- bio->bi_bdev = s->cow->bdev;
- bio->bi_sector = chunk_to_sector(s, dm_chunk_number(e->new_chunk) +
- (chunk - e->old_chunk)) +
- (bio->bi_sector & s->chunk_mask);
+ bio->bi_bdev = s->store->cow->bdev;
+ bio->bi_sector = chunk_to_sector(s->store,
+ dm_chunk_number(e->new_chunk) +
+ (chunk - e->old_chunk)) +
+ (bio->bi_sector &
+ s->store->chunk_mask);
}
static int snapshot_map(struct dm_target *ti, struct bio *bio,
@@ -1054,7 +1030,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
chunk_t chunk;
struct dm_snap_pending_exception *pe = NULL;
- chunk = sector_to_chunk(s, bio->bi_sector);
+ chunk = sector_to_chunk(s->store, bio->bi_sector);
/* Full snapshots are not usable */
/* To get here the table must be live so s->active is always set. */
@@ -1083,11 +1059,31 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
* writeable.
*/
if (bio_rw(bio) == WRITE) {
- pe = __find_pending_exception(s, bio);
+ pe = __lookup_pending_exception(s, chunk);
if (!pe) {
- __invalidate_snapshot(s, -ENOMEM);
- r = -EIO;
- goto out_unlock;
+ up_write(&s->lock);
+ pe = alloc_pending_exception(s);
+ down_write(&s->lock);
+
+ if (!s->valid) {
+ free_pending_exception(pe);
+ r = -EIO;
+ goto out_unlock;
+ }
+
+ e = lookup_exception(&s->complete, chunk);
+ if (e) {
+ free_pending_exception(pe);
+ remap_exception(s, e, bio, chunk);
+ goto out_unlock;
+ }
+
+ pe = __find_pending_exception(s, pe, chunk);
+ if (!pe) {
+ __invalidate_snapshot(s, -ENOMEM);
+ r = -EIO;
+ goto out_unlock;
+ }
}
remap_exception(s, &pe->e, bio, chunk);
@@ -1137,24 +1133,25 @@ static void snapshot_resume(struct dm_target *ti)
static int snapshot_status(struct dm_target *ti, status_type_t type,
char *result, unsigned int maxlen)
{
+ unsigned sz = 0;
struct dm_snapshot *snap = ti->private;
switch (type) {
case STATUSTYPE_INFO:
if (!snap->valid)
- snprintf(result, maxlen, "Invalid");
+ DMEMIT("Invalid");
else {
- if (snap->store.fraction_full) {
+ if (snap->store->type->fraction_full) {
sector_t numerator, denominator;
- snap->store.fraction_full(&snap->store,
- &numerator,
- &denominator);
- snprintf(result, maxlen, "%llu/%llu",
- (unsigned long long)numerator,
- (unsigned long long)denominator);
+ snap->store->type->fraction_full(snap->store,
+ &numerator,
+ &denominator);
+ DMEMIT("%llu/%llu",
+ (unsigned long long)numerator,
+ (unsigned long long)denominator);
}
else
- snprintf(result, maxlen, "Unknown");
+ DMEMIT("Unknown");
}
break;
@@ -1164,10 +1161,9 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
* to make private copies if the output is to
* make sense.
*/
- snprintf(result, maxlen, "%s %s %c %llu",
- snap->origin->name, snap->cow->name,
- snap->type,
- (unsigned long long)snap->chunk_size);
+ DMEMIT("%s", snap->origin->name);
+ snap->store->type->status(snap->store, type, result + sz,
+ maxlen - sz);
break;
}
@@ -1196,14 +1192,14 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
goto next_snapshot;
/* Nothing to do if writing beyond end of snapshot */
- if (bio->bi_sector >= dm_table_get_size(snap->ti->table))
+ if (bio->bi_sector >= dm_table_get_size(snap->store->ti->table))
goto next_snapshot;
/*
* Remember, different snapshots can have
* different chunk sizes.
*/
- chunk = sector_to_chunk(snap, bio->bi_sector);
+ chunk = sector_to_chunk(snap->store, bio->bi_sector);
/*
* Check exception table to see if block
@@ -1217,10 +1213,28 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
if (e)
goto next_snapshot;
- pe = __find_pending_exception(snap, bio);
+ pe = __lookup_pending_exception(snap, chunk);
if (!pe) {
- __invalidate_snapshot(snap, -ENOMEM);
- goto next_snapshot;
+ up_write(&snap->lock);
+ pe = alloc_pending_exception(snap);
+ down_write(&snap->lock);
+
+ if (!snap->valid) {
+ free_pending_exception(pe);
+ goto next_snapshot;
+ }
+
+ e = lookup_exception(&snap->complete, chunk);
+ if (e) {
+ free_pending_exception(pe);
+ goto next_snapshot;
+ }
+
+ pe = __find_pending_exception(snap, pe, chunk);
+ if (!pe) {
+ __invalidate_snapshot(snap, -ENOMEM);
+ goto next_snapshot;
+ }
}
if (!primary_pe) {
@@ -1360,7 +1374,8 @@ static void origin_resume(struct dm_target *ti)
o = __lookup_origin(dev->bdev);
if (o)
list_for_each_entry (snap, &o->snapshots, list)
- chunk_size = min_not_zero(chunk_size, snap->chunk_size);
+ chunk_size = min_not_zero(chunk_size,
+ snap->store->chunk_size);
up_read(&_origins_lock);
ti->split_io = chunk_size;
diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h
deleted file mode 100644
index d9e62b43cf85..000000000000
--- a/drivers/md/dm-snap.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
- *
- * This file is released under the GPL.
- */
-
-#ifndef DM_SNAPSHOT_H
-#define DM_SNAPSHOT_H
-
-#include <linux/device-mapper.h>
-#include "dm-exception-store.h"
-#include "dm-bio-list.h"
-#include <linux/blkdev.h>
-#include <linux/workqueue.h>
-
-struct exception_table {
- uint32_t hash_mask;
- unsigned hash_shift;
- struct list_head *table;
-};
-
-#define DM_TRACKED_CHUNK_HASH_SIZE 16
-#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \
- (DM_TRACKED_CHUNK_HASH_SIZE - 1))
-
-struct dm_snapshot {
- struct rw_semaphore lock;
- struct dm_target *ti;
-
- struct dm_dev *origin;
- struct dm_dev *cow;
-
- /* List of snapshots per Origin */
- struct list_head list;
-
- /* Size of data blocks saved - must be a power of 2 */
- chunk_t chunk_size;
- chunk_t chunk_mask;
- chunk_t chunk_shift;
-
- /* You can't use a snapshot if this is 0 (e.g. if full) */
- int valid;
-
- /* Origin writes don't trigger exceptions until this is set */
- int active;
-
- /* Used for display of table */
- char type;
-
- mempool_t *pending_pool;
-
- atomic_t pending_exceptions_count;
-
- struct exception_table pending;
- struct exception_table complete;
-
- /*
- * pe_lock protects all pending_exception operations and access
- * as well as the snapshot_bios list.
- */
- spinlock_t pe_lock;
-
- /* The on disk metadata handler */
- struct dm_exception_store store;
-
- struct dm_kcopyd_client *kcopyd_client;
-
- /* Queue of snapshot writes for ksnapd to flush */
- struct bio_list queued_bios;
- struct work_struct queued_bios_work;
-
- /* Chunks with outstanding reads */
- mempool_t *tracked_chunk_pool;
- spinlock_t tracked_chunk_lock;
- struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
-};
-
-/*
- * Return the number of sectors in the device.
- */
-static inline sector_t get_dev_size(struct block_device *bdev)
-{
- return bdev->bd_inode->i_size >> SECTOR_SHIFT;
-}
-
-static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector)
-{
- return (sector & ~s->chunk_mask) >> s->chunk_shift;
-}
-
-static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk)
-{
- return chunk << s->chunk_shift;
-}
-
-static inline int bdev_equal(struct block_device *lhs, struct block_device *rhs)
-{
- /*
- * There is only ever one instance of a particular block
- * device so we can compare pointers safely.
- */
- return lhs == rhs;
-}
-
-#endif
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 2fd66c30f7f8..429b50b975d5 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -52,8 +52,6 @@ struct dm_table {
sector_t *highs;
struct dm_target *targets;
- unsigned barriers_supported:1;
-
/*
* Indicates the rw permissions for the new logical
* device. This should be a combination of FMODE_READ
@@ -243,7 +241,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
INIT_LIST_HEAD(&t->devices);
atomic_set(&t->holders, 0);
- t->barriers_supported = 1;
if (!num_targets)
num_targets = KEYS_PER_NODE;
@@ -399,28 +396,30 @@ static int check_device_area(struct dm_dev_internal *dd, sector_t start,
}
/*
- * This upgrades the mode on an already open dm_dev. Being
+ * This upgrades the mode on an already open dm_dev, being
* careful to leave things as they were if we fail to reopen the
- * device.
+ * device and not to touch the existing bdev field in case
+ * it is accessed concurrently inside dm_table_any_congested().
*/
static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
struct mapped_device *md)
{
int r;
- struct dm_dev_internal dd_copy;
- dev_t dev = dd->dm_dev.bdev->bd_dev;
+ struct dm_dev_internal dd_new, dd_old;
+
+ dd_new = dd_old = *dd;
+
+ dd_new.dm_dev.mode |= new_mode;
+ dd_new.dm_dev.bdev = NULL;
- dd_copy = *dd;
+ r = open_dev(&dd_new, dd->dm_dev.bdev->bd_dev, md);
+ if (r)
+ return r;
dd->dm_dev.mode |= new_mode;
- dd->dm_dev.bdev = NULL;
- r = open_dev(dd, dev, md);
- if (!r)
- close_dev(&dd_copy, md);
- else
- *dd = dd_copy;
+ close_dev(&dd_old, md);
- return r;
+ return 0;
}
/*
@@ -749,10 +748,6 @@ int dm_table_add_target(struct dm_table *t, const char *type,
/* FIXME: the plan is to combine high here and then have
* the merge fn apply the target level restrictions. */
combine_restrictions_low(&t->limits, &tgt->limits);
-
- if (!(tgt->type->features & DM_TARGET_SUPPORTS_BARRIERS))
- t->barriers_supported = 0;
-
return 0;
bad:
@@ -797,12 +792,6 @@ int dm_table_complete(struct dm_table *t)
check_for_valid_limits(&t->limits);
- /*
- * We only support barriers if there is exactly one underlying device.
- */
- if (!list_is_singular(&t->devices))
- t->barriers_supported = 0;
-
/* how many indexes will the btree have ? */
leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
@@ -877,6 +866,45 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
return &t->targets[(KEYS_PER_NODE * n) + k];
}
+/*
+ * Set the integrity profile for this device if all devices used have
+ * matching profiles.
+ */
+static void dm_table_set_integrity(struct dm_table *t)
+{
+ struct list_head *devices = dm_table_get_devices(t);
+ struct dm_dev_internal *prev = NULL, *dd = NULL;
+
+ if (!blk_get_integrity(dm_disk(t->md)))
+ return;
+
+ list_for_each_entry(dd, devices, list) {
+ if (prev &&
+ blk_integrity_compare(prev->dm_dev.bdev->bd_disk,
+ dd->dm_dev.bdev->bd_disk) < 0) {
+ DMWARN("%s: integrity not set: %s and %s mismatch",
+ dm_device_name(t->md),
+ prev->dm_dev.bdev->bd_disk->disk_name,
+ dd->dm_dev.bdev->bd_disk->disk_name);
+ goto no_integrity;
+ }
+ prev = dd;
+ }
+
+ if (!prev || !bdev_get_integrity(prev->dm_dev.bdev))
+ goto no_integrity;
+
+ blk_integrity_register(dm_disk(t->md),
+ bdev_get_integrity(prev->dm_dev.bdev));
+
+ return;
+
+no_integrity:
+ blk_integrity_register(dm_disk(t->md), NULL);
+
+ return;
+}
+
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
{
/*
@@ -897,6 +925,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
else
queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q);
+ dm_table_set_integrity(t);
}
unsigned int dm_table_get_num_targets(struct dm_table *t)
@@ -1017,12 +1046,6 @@ struct mapped_device *dm_table_get_md(struct dm_table *t)
return t->md;
}
-int dm_table_barrier_ok(struct dm_table *t)
-{
- return t->barriers_supported;
-}
-EXPORT_SYMBOL(dm_table_barrier_ok);
-
EXPORT_SYMBOL(dm_vcalloc);
EXPORT_SYMBOL(dm_get_device);
EXPORT_SYMBOL(dm_put_device);
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 7decf10006e4..04feccf2a997 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -14,45 +14,34 @@
#define DM_MSG_PREFIX "target"
-struct tt_internal {
- struct target_type tt;
-
- struct list_head list;
- long use;
-};
-
static LIST_HEAD(_targets);
static DECLARE_RWSEM(_lock);
#define DM_MOD_NAME_SIZE 32
-static inline struct tt_internal *__find_target_type(const char *name)
+static inline struct target_type *__find_target_type(const char *name)
{
- struct tt_internal *ti;
+ struct target_type *tt;
- list_for_each_entry (ti, &_targets, list)
- if (!strcmp(name, ti->tt.name))
- return ti;
+ list_for_each_entry(tt, &_targets, list)
+ if (!strcmp(name, tt->name))
+ return tt;
return NULL;
}
-static struct tt_internal *get_target_type(const char *name)
+static struct target_type *get_target_type(const char *name)
{
- struct tt_internal *ti;
+ struct target_type *tt;
down_read(&_lock);
- ti = __find_target_type(name);
- if (ti) {
- if ((ti->use == 0) && !try_module_get(ti->tt.module))
- ti = NULL;
- else
- ti->use++;
- }
+ tt = __find_target_type(name);
+ if (tt && !try_module_get(tt->module))
+ tt = NULL;
up_read(&_lock);
- return ti;
+ return tt;
}
static void load_module(const char *name)
@@ -62,92 +51,59 @@ static void load_module(const char *name)
struct target_type *dm_get_target_type(const char *name)
{
- struct tt_internal *ti = get_target_type(name);
+ struct target_type *tt = get_target_type(name);
- if (!ti) {
+ if (!tt) {
load_module(name);
- ti = get_target_type(name);
+ tt = get_target_type(name);
}
- return ti ? &ti->tt : NULL;
+ return tt;
}
-void dm_put_target_type(struct target_type *t)
+void dm_put_target_type(struct target_type *tt)
{
- struct tt_internal *ti = (struct tt_internal *) t;
-
down_read(&_lock);
- if (--ti->use == 0)
- module_put(ti->tt.module);
-
- BUG_ON(ti->use < 0);
+ module_put(tt->module);
up_read(&_lock);
-
- return;
-}
-
-static struct tt_internal *alloc_target(struct target_type *t)
-{
- struct tt_internal *ti = kzalloc(sizeof(*ti), GFP_KERNEL);
-
- if (ti)
- ti->tt = *t;
-
- return ti;
}
-
int dm_target_iterate(void (*iter_func)(struct target_type *tt,
void *param), void *param)
{
- struct tt_internal *ti;
+ struct target_type *tt;
down_read(&_lock);
- list_for_each_entry (ti, &_targets, list)
- iter_func(&ti->tt, param);
+ list_for_each_entry(tt, &_targets, list)
+ iter_func(tt, param);
up_read(&_lock);
return 0;
}
-int dm_register_target(struct target_type *t)
+int dm_register_target(struct target_type *tt)
{
int rv = 0;
- struct tt_internal *ti = alloc_target(t);
-
- if (!ti)
- return -ENOMEM;
down_write(&_lock);
- if (__find_target_type(t->name))
+ if (__find_target_type(tt->name))
rv = -EEXIST;
else
- list_add(&ti->list, &_targets);
+ list_add(&tt->list, &_targets);
up_write(&_lock);
- if (rv)
- kfree(ti);
return rv;
}
-void dm_unregister_target(struct target_type *t)
+void dm_unregister_target(struct target_type *tt)
{
- struct tt_internal *ti;
-
down_write(&_lock);
- if (!(ti = __find_target_type(t->name))) {
- DMCRIT("Unregistering unrecognised target: %s", t->name);
- BUG();
- }
-
- if (ti->use) {
- DMCRIT("Attempt to unregister target still in use: %s",
- t->name);
+ if (!__find_target_type(tt->name)) {
+ DMCRIT("Unregistering unrecognised target: %s", tt->name);
BUG();
}
- list_del(&ti->list);
- kfree(ti);
+ list_del(&tt->list);
up_write(&_lock);
}
@@ -156,17 +112,17 @@ void dm_unregister_target(struct target_type *t)
* io-err: always fails an io, useful for bringing
* up LVs that have holes in them.
*/
-static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args)
+static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args)
{
return 0;
}
-static void io_err_dtr(struct dm_target *ti)
+static void io_err_dtr(struct dm_target *tt)
{
/* empty */
}
-static int io_err_map(struct dm_target *ti, struct bio *bio,
+static int io_err_map(struct dm_target *tt, struct bio *bio,
union map_info *map_context)
{
return -EIO;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 51ba1db4b3e7..424f7b048c30 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -6,7 +6,6 @@
*/
#include "dm.h"
-#include "dm-bio-list.h"
#include "dm-uevent.h"
#include <linux/init.h>
@@ -89,29 +88,20 @@ union map_info *dm_get_mapinfo(struct bio *bio)
/*
* Bits for the md->flags field.
*/
-#define DMF_BLOCK_IO 0
+#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
+#define DMF_QUEUE_IO_TO_THREAD 6
/*
* Work processed by per-device workqueue.
*/
-struct dm_wq_req {
- enum {
- DM_WQ_FLUSH_DEFERRED,
- } type;
- struct work_struct work;
- struct mapped_device *md;
- void *context;
-};
-
struct mapped_device {
struct rw_semaphore io_lock;
struct mutex suspend_lock;
- spinlock_t pushback_lock;
rwlock_t map_lock;
atomic_t holders;
atomic_t open_count;
@@ -129,8 +119,14 @@ struct mapped_device {
*/
atomic_t pending;
wait_queue_head_t wait;
+ struct work_struct work;
struct bio_list deferred;
- struct bio_list pushback;
+ spinlock_t deferred_lock;
+
+ /*
+ * An error from the barrier request currently being processed.
+ */
+ int barrier_error;
/*
* Processing queue (flush/barriers)
@@ -433,6 +429,10 @@ static void end_io_acct(struct dm_io *io)
part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
part_stat_unlock();
+ /*
+ * After this is decremented the bio must not be touched if it is
+ * a barrier.
+ */
dm_disk(md)->part0.in_flight = pending =
atomic_dec_return(&md->pending);
@@ -444,19 +444,18 @@ static void end_io_acct(struct dm_io *io)
/*
* Add the bio to the list of deferred io.
*/
-static int queue_io(struct mapped_device *md, struct bio *bio)
+static void queue_io(struct mapped_device *md, struct bio *bio)
{
down_write(&md->io_lock);
- if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
- up_write(&md->io_lock);
- return 1;
- }
-
+ spin_lock_irq(&md->deferred_lock);
bio_list_add(&md->deferred, bio);
+ spin_unlock_irq(&md->deferred_lock);
+
+ if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
+ queue_work(md->wq, &md->work);
up_write(&md->io_lock);
- return 0; /* deferred successfully */
}
/*
@@ -525,36 +524,50 @@ static int __noflush_suspending(struct mapped_device *md)
static void dec_pending(struct dm_io *io, int error)
{
unsigned long flags;
+ int io_error;
+ struct bio *bio;
+ struct mapped_device *md = io->md;
/* Push-back supersedes any I/O errors */
- if (error && !(io->error > 0 && __noflush_suspending(io->md)))
+ if (error && !(io->error > 0 && __noflush_suspending(md)))
io->error = error;
if (atomic_dec_and_test(&io->io_count)) {
if (io->error == DM_ENDIO_REQUEUE) {
/*
* Target requested pushing back the I/O.
- * This must be handled before the sleeper on
- * suspend queue merges the pushback list.
*/
- spin_lock_irqsave(&io->md->pushback_lock, flags);
- if (__noflush_suspending(io->md))
- bio_list_add(&io->md->pushback, io->bio);
+ spin_lock_irqsave(&md->deferred_lock, flags);
+ if (__noflush_suspending(md))
+ bio_list_add_head(&md->deferred, io->bio);
else
/* noflush suspend was interrupted. */
io->error = -EIO;
- spin_unlock_irqrestore(&io->md->pushback_lock, flags);
+ spin_unlock_irqrestore(&md->deferred_lock, flags);
}
- end_io_acct(io);
+ io_error = io->error;
+ bio = io->bio;
- if (io->error != DM_ENDIO_REQUEUE) {
- trace_block_bio_complete(io->md->queue, io->bio);
+ if (bio_barrier(bio)) {
+ /*
+ * There can be just one barrier request so we use
+ * a per-device variable for error reporting.
+ * Note that you can't touch the bio after end_io_acct
+ */
+ md->barrier_error = io_error;
+ end_io_acct(io);
+ } else {
+ end_io_acct(io);
- bio_endio(io->bio, io->error);
+ if (io_error != DM_ENDIO_REQUEUE) {
+ trace_block_bio_complete(md->queue, bio);
+
+ bio_endio(bio, io_error);
+ }
}
- free_io(io->md, io);
+ free_io(md, io);
}
}
@@ -562,6 +575,7 @@ static void clone_endio(struct bio *bio, int error)
{
int r = 0;
struct dm_target_io *tio = bio->bi_private;
+ struct dm_io *io = tio->io;
struct mapped_device *md = tio->io->md;
dm_endio_fn endio = tio->ti->type->end_io;
@@ -585,15 +599,14 @@ static void clone_endio(struct bio *bio, int error)
}
}
- dec_pending(tio->io, error);
-
/*
* Store md for cleanup instead of tio which is about to get freed.
*/
bio->bi_private = md->bs;
- bio_put(bio);
free_tio(md, tio);
+ bio_put(bio);
+ dec_pending(io, error);
}
static sector_t max_io_len(struct mapped_device *md,
@@ -696,13 +709,19 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
clone->bi_sector = sector;
clone->bi_bdev = bio->bi_bdev;
- clone->bi_rw = bio->bi_rw;
+ clone->bi_rw = bio->bi_rw & ~(1 << BIO_RW_BARRIER);
clone->bi_vcnt = 1;
clone->bi_size = to_bytes(len);
clone->bi_io_vec->bv_offset = offset;
clone->bi_io_vec->bv_len = clone->bi_size;
clone->bi_flags |= 1 << BIO_CLONED;
+ if (bio_integrity(bio)) {
+ bio_integrity_clone(clone, bio, GFP_NOIO);
+ bio_integrity_trim(clone,
+ bio_sector_offset(bio, idx, offset), len);
+ }
+
return clone;
}
@@ -717,6 +736,7 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
__bio_clone(clone, bio);
+ clone->bi_rw &= ~(1 << BIO_RW_BARRIER);
clone->bi_destructor = dm_bio_destructor;
clone->bi_sector = sector;
clone->bi_idx = idx;
@@ -724,6 +744,14 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
clone->bi_size = to_bytes(len);
clone->bi_flags &= ~(1 << BIO_SEG_VALID);
+ if (bio_integrity(bio)) {
+ bio_integrity_clone(clone, bio, GFP_NOIO);
+
+ if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
+ bio_integrity_trim(clone,
+ bio_sector_offset(bio, idx, 0), len);
+ }
+
return clone;
}
@@ -828,21 +856,22 @@ static int __clone_and_map(struct clone_info *ci)
}
/*
- * Split the bio into several clones.
+ * Split the bio into several clones and submit it to targets.
*/
-static int __split_bio(struct mapped_device *md, struct bio *bio)
+static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
{
struct clone_info ci;
int error = 0;
ci.map = dm_get_table(md);
- if (unlikely(!ci.map))
- return -EIO;
- if (unlikely(bio_barrier(bio) && !dm_table_barrier_ok(ci.map))) {
- dm_table_put(ci.map);
- bio_endio(bio, -EOPNOTSUPP);
- return 0;
+ if (unlikely(!ci.map)) {
+ if (!bio_barrier(bio))
+ bio_io_error(bio);
+ else
+ md->barrier_error = -EIO;
+ return;
}
+
ci.md = md;
ci.bio = bio;
ci.io = alloc_io(md);
@@ -861,8 +890,6 @@ static int __split_bio(struct mapped_device *md, struct bio *bio)
/* drop the extra reference count */
dec_pending(ci.io, error);
dm_table_put(ci.map);
-
- return 0;
}
/*-----------------------------------------------------------------
* CRUD END
@@ -921,7 +948,6 @@ out:
*/
static int dm_request(struct request_queue *q, struct bio *bio)
{
- int r = -EIO;
int rw = bio_data_dir(bio);
struct mapped_device *md = q->queuedata;
int cpu;
@@ -934,32 +960,26 @@ static int dm_request(struct request_queue *q, struct bio *bio)
part_stat_unlock();
/*
- * If we're suspended we have to queue
- * this io for later.
+ * If we're suspended or the thread is processing barriers
+ * we have to queue this io for later.
*/
- while (test_bit(DMF_BLOCK_IO, &md->flags)) {
+ if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
+ unlikely(bio_barrier(bio))) {
up_read(&md->io_lock);
- if (bio_rw(bio) != READA)
- r = queue_io(md, bio);
+ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
+ bio_rw(bio) == READA) {
+ bio_io_error(bio);
+ return 0;
+ }
- if (r <= 0)
- goto out_req;
+ queue_io(md, bio);
- /*
- * We're in a while loop, because someone could suspend
- * before we get to the following read lock.
- */
- down_read(&md->io_lock);
+ return 0;
}
- r = __split_bio(md, bio);
+ __split_and_process_bio(md, bio);
up_read(&md->io_lock);
-
-out_req:
- if (r < 0)
- bio_io_error(bio);
-
return 0;
}
@@ -980,7 +1000,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
struct mapped_device *md = congested_data;
struct dm_table *map;
- if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
+ if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
map = dm_get_table(md);
if (map) {
r = dm_table_any_congested(map, bdi_bits);
@@ -1068,6 +1088,8 @@ out:
static struct block_device_operations dm_blk_dops;
+static void dm_wq_work(struct work_struct *work);
+
/*
* Allocate and initialise a blank device with a given minor.
*/
@@ -1095,7 +1117,7 @@ static struct mapped_device *alloc_dev(int minor)
init_rwsem(&md->io_lock);
mutex_init(&md->suspend_lock);
- spin_lock_init(&md->pushback_lock);
+ spin_lock_init(&md->deferred_lock);
rwlock_init(&md->map_lock);
atomic_set(&md->holders, 1);
atomic_set(&md->open_count, 0);
@@ -1112,6 +1134,7 @@ static struct mapped_device *alloc_dev(int minor)
md->queue->backing_dev_info.congested_fn = dm_any_congested;
md->queue->backing_dev_info.congested_data = md;
blk_queue_make_request(md->queue, dm_request);
+ blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL);
blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
md->queue->unplug_fn = dm_unplug_all;
blk_queue_merge_bvec(md->queue, dm_merge_bvec);
@@ -1134,6 +1157,7 @@ static struct mapped_device *alloc_dev(int minor)
atomic_set(&md->pending, 0);
init_waitqueue_head(&md->wait);
+ INIT_WORK(&md->work, dm_wq_work);
init_waitqueue_head(&md->eventq);
md->disk->major = _major;
@@ -1191,6 +1215,7 @@ static void free_dev(struct mapped_device *md)
mempool_destroy(md->tio_pool);
mempool_destroy(md->io_pool);
bioset_free(md->bs);
+ blk_integrity_unregister(md->disk);
del_gendisk(md->disk);
free_minor(minor);
@@ -1373,18 +1398,24 @@ void dm_put(struct mapped_device *md)
}
EXPORT_SYMBOL_GPL(dm_put);
-static int dm_wait_for_completion(struct mapped_device *md)
+static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
{
int r = 0;
+ DECLARE_WAITQUEUE(wait, current);
+
+ dm_unplug_all(md->queue);
+
+ add_wait_queue(&md->wait, &wait);
while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(interruptible);
smp_mb();
if (!atomic_read(&md->pending))
break;
- if (signal_pending(current)) {
+ if (interruptible == TASK_INTERRUPTIBLE &&
+ signal_pending(current)) {
r = -EINTR;
break;
}
@@ -1393,68 +1424,80 @@ static int dm_wait_for_completion(struct mapped_device *md)
}
set_current_state(TASK_RUNNING);
+ remove_wait_queue(&md->wait, &wait);
+
return r;
}
-/*
- * Process the deferred bios
- */
-static void __flush_deferred_io(struct mapped_device *md)
+static int dm_flush(struct mapped_device *md)
{
- struct bio *c;
+ dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
+ return 0;
+}
+
+static void process_barrier(struct mapped_device *md, struct bio *bio)
+{
+ int error = dm_flush(md);
- while ((c = bio_list_pop(&md->deferred))) {
- if (__split_bio(md, c))
- bio_io_error(c);
+ if (unlikely(error)) {
+ bio_endio(bio, error);
+ return;
+ }
+ if (bio_empty_barrier(bio)) {
+ bio_endio(bio, 0);
+ return;
}
- clear_bit(DMF_BLOCK_IO, &md->flags);
-}
+ __split_and_process_bio(md, bio);
-static void __merge_pushback_list(struct mapped_device *md)
-{
- unsigned long flags;
+ error = dm_flush(md);
+
+ if (!error && md->barrier_error)
+ error = md->barrier_error;
- spin_lock_irqsave(&md->pushback_lock, flags);
- clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
- bio_list_merge_head(&md->deferred, &md->pushback);
- bio_list_init(&md->pushback);
- spin_unlock_irqrestore(&md->pushback_lock, flags);
+ if (md->barrier_error != DM_ENDIO_REQUEUE)
+ bio_endio(bio, error);
}
+/*
+ * Process the deferred bios
+ */
static void dm_wq_work(struct work_struct *work)
{
- struct dm_wq_req *req = container_of(work, struct dm_wq_req, work);
- struct mapped_device *md = req->md;
+ struct mapped_device *md = container_of(work, struct mapped_device,
+ work);
+ struct bio *c;
down_write(&md->io_lock);
- switch (req->type) {
- case DM_WQ_FLUSH_DEFERRED:
- __flush_deferred_io(md);
- break;
- default:
- DMERR("dm_wq_work: unrecognised work type %d", req->type);
- BUG();
+
+ while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
+ spin_lock_irq(&md->deferred_lock);
+ c = bio_list_pop(&md->deferred);
+ spin_unlock_irq(&md->deferred_lock);
+
+ if (!c) {
+ clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
+ break;
+ }
+
+ up_write(&md->io_lock);
+
+ if (bio_barrier(c))
+ process_barrier(md, c);
+ else
+ __split_and_process_bio(md, c);
+
+ down_write(&md->io_lock);
}
+
up_write(&md->io_lock);
}
-static void dm_wq_queue(struct mapped_device *md, int type, void *context,
- struct dm_wq_req *req)
+static void dm_queue_flush(struct mapped_device *md)
{
- req->type = type;
- req->md = md;
- req->context = context;
- INIT_WORK(&req->work, dm_wq_work);
- queue_work(md->wq, &req->work);
-}
-
-static void dm_queue_flush(struct mapped_device *md, int type, void *context)
-{
- struct dm_wq_req req;
-
- dm_wq_queue(md, type, context, &req);
- flush_workqueue(md->wq);
+ clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
+ smp_mb__after_clear_bit();
+ queue_work(md->wq, &md->work);
}
/*
@@ -1528,7 +1571,6 @@ static void unlock_fs(struct mapped_device *md)
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
struct dm_table *map = NULL;
- DECLARE_WAITQUEUE(wait, current);
int r = 0;
int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
@@ -1573,38 +1615,54 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
}
/*
- * First we set the BLOCK_IO flag so no more ios will be mapped.
+ * Here we must make sure that no processes are submitting requests
+ * to target drivers i.e. no one may be executing
+ * __split_and_process_bio. This is called from dm_request and
+ * dm_wq_work.
+ *
+ * To get all processes out of __split_and_process_bio in dm_request,
+ * we take the write lock. To prevent any process from reentering
+ * __split_and_process_bio from dm_request, we set
+ * DMF_QUEUE_IO_TO_THREAD.
+ *
+ * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
+ * and call flush_workqueue(md->wq). flush_workqueue will wait until
+ * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
+ * further calls to __split_and_process_bio from dm_wq_work.
*/
down_write(&md->io_lock);
- set_bit(DMF_BLOCK_IO, &md->flags);
-
- add_wait_queue(&md->wait, &wait);
+ set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
+ set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
up_write(&md->io_lock);
- /* unplug */
- if (map)
- dm_table_unplug_all(map);
+ flush_workqueue(md->wq);
/*
- * Wait for the already-mapped ios to complete.
+ * At this point no more requests are entering target request routines.
+ * We call dm_wait_for_completion to wait for all existing requests
+ * to finish.
*/
- r = dm_wait_for_completion(md);
+ r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
down_write(&md->io_lock);
- remove_wait_queue(&md->wait, &wait);
-
if (noflush)
- __merge_pushback_list(md);
+ clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
up_write(&md->io_lock);
/* were we interrupted ? */
if (r < 0) {
- dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
+ dm_queue_flush(md);
unlock_fs(md);
goto out; /* pushback list is already flushed, so skip flush */
}
+ /*
+ * If dm_wait_for_completion returned 0, the device is completely
+ * quiescent now. There is no request-processing activity. All new
+ * requests are being added to md->deferred list.
+ */
+
dm_table_postsuspend_targets(map);
set_bit(DMF_SUSPENDED, &md->flags);
@@ -1639,7 +1697,7 @@ int dm_resume(struct mapped_device *md)
if (r)
goto out;
- dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
+ dm_queue_flush(md);
unlock_fs(md);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 20194e000c5a..a31506d93e91 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -52,7 +52,6 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits);
* To check the return value from dm_table_find_target().
*/
#define dm_target_is_valid(t) ((t)->table)
-int dm_table_barrier_ok(struct dm_table *t);
/*-----------------------------------------------------------------
* A registry of target types.
@@ -60,7 +59,7 @@ int dm_table_barrier_ok(struct dm_table *t);
int dm_target_init(void);
void dm_target_exit(void);
struct target_type *dm_get_target_type(const char *name);
-void dm_put_target_type(struct target_type *t);
+void dm_put_target_type(struct target_type *tt);
int dm_target_iterate(void (*iter_func)(struct target_type *tt,
void *param), void *param);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ed5727c089a9..641b211fe3fe 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1375,6 +1375,9 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
sb->raid_disks = cpu_to_le32(mddev->raid_disks);
sb->size = cpu_to_le64(mddev->dev_sectors);
+ sb->chunksize = cpu_to_le32(mddev->chunk_size >> 9);
+ sb->level = cpu_to_le32(mddev->level);
+ sb->layout = cpu_to_le32(mddev->layout);
if (mddev->bitmap && mddev->bitmap_file == NULL) {
sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
@@ -2017,6 +2020,8 @@ repeat:
clear_bit(MD_CHANGE_PENDING, &mddev->flags);
spin_unlock_irq(&mddev->write_lock);
wake_up(&mddev->sb_wait);
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
@@ -2086,6 +2091,7 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
* -writemostly - clears write_mostly
* blocked - sets the Blocked flag
* -blocked - clears the Blocked flag
+ * insync - sets Insync providing device isn't active
*/
int err = -EINVAL;
if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
@@ -2118,6 +2124,9 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
md_wakeup_thread(rdev->mddev->thread);
err = 0;
+ } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
+ set_bit(In_sync, &rdev->flags);
+ err = 0;
}
if (!err && rdev->sysfs_state)
sysfs_notify_dirent(rdev->sysfs_state);
@@ -2190,7 +2199,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
} else if (rdev->mddev->pers) {
mdk_rdev_t *rdev2;
/* Activating a spare .. or possibly reactivating
- * if we every get bitmaps working here.
+ * if we ever get bitmaps working here.
*/
if (rdev->raid_disk != -1)
@@ -3060,11 +3069,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
} else
err = -EBUSY;
spin_unlock_irq(&mddev->write_lock);
- } else {
- mddev->ro = 0;
- mddev->recovery_cp = MaxSector;
- err = do_md_run(mddev);
- }
+ } else
+ err = -EINVAL;
break;
case active:
if (mddev->pers) {
@@ -3300,7 +3306,9 @@ static ssize_t
action_show(mddev_t *mddev, char *page)
{
char *type = "idle";
- if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+ if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
+ type = "frozen";
+ else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
(!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
type = "reshape";
@@ -3323,7 +3331,12 @@ action_store(mddev_t *mddev, const char *page, size_t len)
if (!mddev->pers || !mddev->pers->sync_request)
return -EINVAL;
- if (cmd_match(page, "idle")) {
+ if (cmd_match(page, "frozen"))
+ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ else
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+
+ if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_unregister_thread(mddev->sync_thread);
@@ -3482,12 +3495,15 @@ sync_completed_show(mddev_t *mddev, char *page)
{
unsigned long max_sectors, resync;
+ if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ return sprintf(page, "none\n");
+
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
max_sectors = mddev->resync_max_sectors;
else
max_sectors = mddev->dev_sectors;
- resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
+ resync = mddev->curr_resync_completed;
return sprintf(page, "%lu / %lu\n", resync, max_sectors);
}
@@ -3674,7 +3690,7 @@ array_size_store(mddev_t *mddev, const char *buf, size_t len)
if (strict_blocks_to_sectors(buf, &sectors) < 0)
return -EINVAL;
if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
- return -EINVAL;
+ return -E2BIG;
mddev->external_size = 1;
}
@@ -4288,6 +4304,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
{
int err = 0;
struct gendisk *disk = mddev->gendisk;
+ mdk_rdev_t *rdev;
if (atomic_read(&mddev->openers) > is_open) {
printk("md: %s still in use.\n",mdname(mddev));
@@ -4330,6 +4347,13 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
/* tell userspace to handle 'inactive' */
sysfs_notify_dirent(mddev->sysfs_state);
+ list_for_each_entry(rdev, &mddev->disks, same_set)
+ if (rdev->raid_disk >= 0) {
+ char nm[20];
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ sysfs_remove_link(&mddev->kobj, nm);
+ }
+
set_capacity(disk, 0);
mddev->changed = 1;
@@ -4350,7 +4374,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
* Free resources if final stop
*/
if (mode == 0) {
- mdk_rdev_t *rdev;
printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
@@ -4362,13 +4385,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
}
mddev->bitmap_offset = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set)
- if (rdev->raid_disk >= 0) {
- char nm[20];
- sprintf(nm, "rd%d", rdev->raid_disk);
- sysfs_remove_link(&mddev->kobj, nm);
- }
-
/* make sure all md_delayed_delete calls have finished */
flush_scheduled_work();
@@ -5551,7 +5567,7 @@ static struct block_device_operations md_fops =
.owner = THIS_MODULE,
.open = md_open,
.release = md_release,
- .locked_ioctl = md_ioctl,
+ .ioctl = md_ioctl,
.getgeo = md_getgeo,
.media_changed = md_media_changed,
.revalidate_disk= md_revalidate,
@@ -5696,37 +5712,38 @@ static void status_unused(struct seq_file *seq)
static void status_resync(struct seq_file *seq, mddev_t * mddev)
{
- sector_t max_blocks, resync, res;
- unsigned long dt, db, rt;
+ sector_t max_sectors, resync, res;
+ unsigned long dt, db;
+ sector_t rt;
int scale;
unsigned int per_milli;
- resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
+ resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
- max_blocks = mddev->resync_max_sectors >> 1;
+ max_sectors = mddev->resync_max_sectors;
else
- max_blocks = mddev->dev_sectors / 2;
+ max_sectors = mddev->dev_sectors;
/*
* Should not happen.
*/
- if (!max_blocks) {
+ if (!max_sectors) {
MD_BUG();
return;
}
/* Pick 'scale' such that (resync>>scale)*1000 will fit
- * in a sector_t, and (max_blocks>>scale) will fit in a
+ * in a sector_t, and (max_sectors>>scale) will fit in a
* u32, as those are the requirements for sector_div.
* Thus 'scale' must be at least 10
*/
scale = 10;
if (sizeof(sector_t) > sizeof(unsigned long)) {
- while ( max_blocks/2 > (1ULL<<(scale+32)))
+ while ( max_sectors/2 > (1ULL<<(scale+32)))
scale++;
}
res = (resync>>scale)*1000;
- sector_div(res, (u32)((max_blocks>>scale)+1));
+ sector_div(res, (u32)((max_sectors>>scale)+1));
per_milli = res;
{
@@ -5747,25 +5764,35 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
"resync" : "recovery"))),
per_milli/10, per_milli % 10,
- (unsigned long long) resync,
- (unsigned long long) max_blocks);
+ (unsigned long long) resync/2,
+ (unsigned long long) max_sectors/2);
/*
- * We do not want to overflow, so the order of operands and
- * the * 100 / 100 trick are important. We do a +1 to be
- * safe against division by zero. We only estimate anyway.
- *
* dt: time from mark until now
* db: blocks written from mark until now
* rt: remaining time
+ *
+ * rt is a sector_t, so could be 32bit or 64bit.
+ * So we divide before multiply in case it is 32bit and close
+ * to the limit.
+ * We scale the divisor (db) by 32 to avoid loosing precision
+ * near the end of resync when the number of remaining sectors
+ * is close to 'db'.
+ * We then divide rt by 32 after multiplying by db to compensate.
+ * The '+1' avoids division by zero if db is very small.
*/
dt = ((jiffies - mddev->resync_mark) / HZ);
if (!dt) dt++;
db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
- mddev->resync_mark_cnt;
- rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100;
- seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
+ rt = max_sectors - resync; /* number of remaining sectors */
+ sector_div(rt, db/32+1);
+ rt *= dt;
+ rt >>= 5;
+
+ seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
+ ((unsigned long)rt % 60)/6);
seq_printf(seq, " speed=%ldK/sec", db/2/dt);
}
@@ -5956,7 +5983,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
return 0;
}
-static struct seq_operations md_seq_ops = {
+static const struct seq_operations md_seq_ops = {
.start = md_seq_start,
.next = md_seq_next,
.stop = md_seq_stop,
@@ -6334,18 +6361,14 @@ void md_do_sync(mddev_t *mddev)
sector_t sectors;
skipped = 0;
- if (j >= mddev->resync_max) {
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
- wait_event(mddev->recovery_wait,
- mddev->resync_max > j
- || kthread_should_stop());
- }
- if (kthread_should_stop())
- goto interrupted;
- if (mddev->curr_resync > mddev->curr_resync_completed &&
- (mddev->curr_resync - mddev->curr_resync_completed)
- > (max_sectors >> 4)) {
+ if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ ((mddev->curr_resync > mddev->curr_resync_completed &&
+ (mddev->curr_resync - mddev->curr_resync_completed)
+ > (max_sectors >> 4)) ||
+ (j - mddev->curr_resync_completed)*2
+ >= mddev->resync_max - mddev->curr_resync_completed
+ )) {
/* time to update curr_resync_completed */
blk_unplug(mddev->queue);
wait_event(mddev->recovery_wait,
@@ -6353,7 +6376,17 @@ void md_do_sync(mddev_t *mddev)
mddev->curr_resync_completed =
mddev->curr_resync;
set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+ sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
+
+ if (j >= mddev->resync_max)
+ wait_event(mddev->recovery_wait,
+ mddev->resync_max > j
+ || kthread_should_stop());
+
+ if (kthread_should_stop())
+ goto interrupted;
+
sectors = mddev->pers->sync_request(mddev, j, &skipped,
currspeed < speed_min(mddev));
if (sectors == 0) {
@@ -6461,6 +6494,7 @@ void md_do_sync(mddev_t *mddev)
skip:
mddev->curr_resync = 0;
+ mddev->curr_resync_completed = 0;
mddev->resync_min = 0;
mddev->resync_max = MaxSector;
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
diff --git a/drivers/md/md.h b/drivers/md/md.h
index e9b7f54c24d6..8227ab909d44 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -12,10 +12,17 @@
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-#ifndef _MD_K_H
-#define _MD_K_H
-
-#ifdef CONFIG_BLOCK
+#ifndef _MD_MD_H
+#define _MD_MD_H
+
+#include <linux/blkdev.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
#define MaxSector (~(sector_t)0)
@@ -408,10 +415,6 @@ static inline void safe_put_page(struct page *p)
if (p) put_page(p);
}
-#endif /* CONFIG_BLOCK */
-#endif
-
-
extern int register_md_personality(struct mdk_personality *p);
extern int unregister_md_personality(struct mdk_personality *p);
extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
@@ -434,3 +437,5 @@ extern void md_new_event(mddev_t *mddev);
extern int md_allow_write(mddev_t *mddev);
extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
+
+#endif /* _MD_MD_H */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b4f4badc0068..36df9109cde1 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -35,7 +35,6 @@
#include <linux/blkdev.h>
#include <linux/seq_file.h>
#include "md.h"
-#include "dm-bio-list.h"
#include "raid1.h"
#include "bitmap.h"
@@ -123,6 +122,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
goto out_free_pages;
bio->bi_io_vec[i].bv_page = page;
+ bio->bi_vcnt = i+1;
}
}
/* If not user-requests, copy the page pointers to all bios */
@@ -138,9 +138,9 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
return r1_bio;
out_free_pages:
- for (i=0; i < RESYNC_PAGES ; i++)
- for (j=0 ; j < pi->raid_disks; j++)
- safe_put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
+ for (j=0 ; j < pi->raid_disks; j++)
+ for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++)
+ put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
j = -1;
out_free_bio:
while ( ++j < pi->raid_disks )
@@ -585,7 +585,7 @@ static int raid1_congested(void *data, int bits)
/* Note the '|| 1' - when read_balance prefers
* non-congested targets, it can be removed
*/
- if ((bits & (1<<BDI_write_congested)) || 1)
+ if ((bits & (1<<BDI_async_congested)) || 1)
ret |= bdi_congested(&q->backing_dev_info, bits);
else
ret &= bdi_congested(&q->backing_dev_info, bits);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e293d92641ac..499620afb44b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -22,7 +22,6 @@
#include <linux/blkdev.h>
#include <linux/seq_file.h>
#include "md.h"
-#include "dm-bio-list.h"
#include "raid10.h"
#include "bitmap.h"
@@ -1810,17 +1809,17 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
r10_bio->sector = sect;
raid10_find_phys(conf, r10_bio);
- /* Need to check if this section will still be
+
+ /* Need to check if the array will still be
* degraded
*/
- for (j=0; j<conf->copies;j++) {
- int d = r10_bio->devs[j].devnum;
- if (conf->mirrors[d].rdev == NULL ||
- test_bit(Faulty, &conf->mirrors[d].rdev->flags)) {
+ for (j=0; j<conf->raid_disks; j++)
+ if (conf->mirrors[j].rdev == NULL ||
+ test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
still_degraded = 1;
break;
}
- }
+
must_sync = bitmap_start_sync(mddev->bitmap, sect,
&sync_blocks, still_degraded);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1f2a266f3cf7..54ef8d75541d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -47,6 +47,7 @@
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
+#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include "md.h"
@@ -363,7 +364,7 @@ static void raid5_unplug_device(struct request_queue *q);
static struct stripe_head *
get_active_stripe(raid5_conf_t *conf, sector_t sector,
- int previous, int noblock)
+ int previous, int noblock, int noquiesce)
{
struct stripe_head *sh;
@@ -373,7 +374,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector,
do {
wait_event_lock_irq(conf->wait_for_stripe,
- conf->quiesce == 0,
+ conf->quiesce == 0 || noquiesce,
conf->device_lock, /* nothing */);
sh = __find_stripe(conf, sector, conf->generation - previous);
if (!sh) {
@@ -501,13 +502,17 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
int i;
int page_offset;
struct async_submit_ctl submit;
+ enum async_tx_flags flags = 0;
if (bio->bi_sector >= sector)
page_offset = (signed)(bio->bi_sector - sector) * 512;
else
page_offset = (signed)(sector - bio->bi_sector) * -512;
- init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
+ if (frombio)
+ flags |= ASYNC_TX_FENCE;
+ init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
+
bio_for_each_segment(bvl, bio, i) {
int len = bio_iovec_idx(bio, i)->bv_len;
int clen;
@@ -623,18 +628,30 @@ static void ops_run_biofill(struct stripe_head *sh)
async_trigger_callback(&submit);
}
-static void ops_complete_compute5(void *stripe_head_ref)
+static void mark_target_uptodate(struct stripe_head *sh, int target)
{
- struct stripe_head *sh = stripe_head_ref;
- int target = sh->ops.target;
- struct r5dev *tgt = &sh->dev[target];
+ struct r5dev *tgt;
- pr_debug("%s: stripe %llu\n", __func__,
- (unsigned long long)sh->sector);
+ if (target < 0)
+ return;
+ tgt = &sh->dev[target];
set_bit(R5_UPTODATE, &tgt->flags);
BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
clear_bit(R5_Wantcompute, &tgt->flags);
+}
+
+static void ops_complete_compute(void *stripe_head_ref)
+{
+ struct stripe_head *sh = stripe_head_ref;
+
+ pr_debug("%s: stripe %llu\n", __func__,
+ (unsigned long long)sh->sector);
+
+ /* mark the computed target(s) as uptodate */
+ mark_target_uptodate(sh, sh->ops.target);
+ mark_target_uptodate(sh, sh->ops.target2);
+
clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
if (sh->check_state == check_state_compute_run)
sh->check_state = check_state_compute_result;
@@ -672,8 +689,8 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
atomic_inc(&sh->count);
- init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
- ops_complete_compute5, sh, to_addr_conv(sh, percpu));
+ init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
+ ops_complete_compute, sh, to_addr_conv(sh, percpu));
if (unlikely(count == 1))
tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
else
@@ -682,6 +699,202 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
return tx;
}
+/* set_syndrome_sources - populate source buffers for gen_syndrome
+ * @srcs - (struct page *) array of size sh->disks
+ * @sh - stripe_head to parse
+ *
+ * Populates srcs in proper layout order for the stripe and returns the
+ * 'count' of sources to be used in a call to async_gen_syndrome. The P
+ * destination buffer is recorded in srcs[count] and the Q destination
+ * is recorded in srcs[count+1]].
+ */
+static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
+{
+ int disks = sh->disks;
+ int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
+ int d0_idx = raid6_d0(sh);
+ int count;
+ int i;
+
+ for (i = 0; i < disks; i++)
+ srcs[i] = (void *)raid6_empty_zero_page;
+
+ count = 0;
+ i = d0_idx;
+ do {
+ int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+
+ srcs[slot] = sh->dev[i].page;
+ i = raid6_next_disk(i, disks);
+ } while (i != d0_idx);
+ BUG_ON(count != syndrome_disks);
+
+ return count;
+}
+
+static struct dma_async_tx_descriptor *
+ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
+{
+ int disks = sh->disks;
+ struct page **blocks = percpu->scribble;
+ int target;
+ int qd_idx = sh->qd_idx;
+ struct dma_async_tx_descriptor *tx;
+ struct async_submit_ctl submit;
+ struct r5dev *tgt;
+ struct page *dest;
+ int i;
+ int count;
+
+ if (sh->ops.target < 0)
+ target = sh->ops.target2;
+ else if (sh->ops.target2 < 0)
+ target = sh->ops.target;
+ else
+ /* we should only have one valid target */
+ BUG();
+ BUG_ON(target < 0);
+ pr_debug("%s: stripe %llu block: %d\n",
+ __func__, (unsigned long long)sh->sector, target);
+
+ tgt = &sh->dev[target];
+ BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
+ dest = tgt->page;
+
+ atomic_inc(&sh->count);
+
+ if (target == qd_idx) {
+ count = set_syndrome_sources(blocks, sh);
+ blocks[count] = NULL; /* regenerating p is not necessary */
+ BUG_ON(blocks[count+1] != dest); /* q should already be set */
+ init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
+ ops_complete_compute, sh,
+ to_addr_conv(sh, percpu));
+ tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+ } else {
+ /* Compute any data- or p-drive using XOR */
+ count = 0;
+ for (i = disks; i-- ; ) {
+ if (i == target || i == qd_idx)
+ continue;
+ blocks[count++] = sh->dev[i].page;
+ }
+
+ init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
+ NULL, ops_complete_compute, sh,
+ to_addr_conv(sh, percpu));
+ tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
+ }
+
+ return tx;
+}
+
+static struct dma_async_tx_descriptor *
+ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
+{
+ int i, count, disks = sh->disks;
+ int syndrome_disks = sh->ddf_layout ? disks : disks-2;
+ int d0_idx = raid6_d0(sh);
+ int faila = -1, failb = -1;
+ int target = sh->ops.target;
+ int target2 = sh->ops.target2;
+ struct r5dev *tgt = &sh->dev[target];
+ struct r5dev *tgt2 = &sh->dev[target2];
+ struct dma_async_tx_descriptor *tx;
+ struct page **blocks = percpu->scribble;
+ struct async_submit_ctl submit;
+
+ pr_debug("%s: stripe %llu block1: %d block2: %d\n",
+ __func__, (unsigned long long)sh->sector, target, target2);
+ BUG_ON(target < 0 || target2 < 0);
+ BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
+ BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
+
+ /* we need to open-code set_syndrome_sources to handle to the
+ * slot number conversion for 'faila' and 'failb'
+ */
+ for (i = 0; i < disks ; i++)
+ blocks[i] = (void *)raid6_empty_zero_page;
+ count = 0;
+ i = d0_idx;
+ do {
+ int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+
+ blocks[slot] = sh->dev[i].page;
+
+ if (i == target)
+ faila = slot;
+ if (i == target2)
+ failb = slot;
+ i = raid6_next_disk(i, disks);
+ } while (i != d0_idx);
+ BUG_ON(count != syndrome_disks);
+
+ BUG_ON(faila == failb);
+ if (failb < faila)
+ swap(faila, failb);
+ pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
+ __func__, (unsigned long long)sh->sector, faila, failb);
+
+ atomic_inc(&sh->count);
+
+ if (failb == syndrome_disks+1) {
+ /* Q disk is one of the missing disks */
+ if (faila == syndrome_disks) {
+ /* Missing P+Q, just recompute */
+ init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
+ ops_complete_compute, sh,
+ to_addr_conv(sh, percpu));
+ return async_gen_syndrome(blocks, 0, count+2,
+ STRIPE_SIZE, &submit);
+ } else {
+ struct page *dest;
+ int data_target;
+ int qd_idx = sh->qd_idx;
+
+ /* Missing D+Q: recompute D from P, then recompute Q */
+ if (target == qd_idx)
+ data_target = target2;
+ else
+ data_target = target;
+
+ count = 0;
+ for (i = disks; i-- ; ) {
+ if (i == data_target || i == qd_idx)
+ continue;
+ blocks[count++] = sh->dev[i].page;
+ }
+ dest = sh->dev[data_target].page;
+ init_async_submit(&submit,
+ ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
+ NULL, NULL, NULL,
+ to_addr_conv(sh, percpu));
+ tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
+ &submit);
+
+ count = set_syndrome_sources(blocks, sh);
+ init_async_submit(&submit, ASYNC_TX_FENCE, tx,
+ ops_complete_compute, sh,
+ to_addr_conv(sh, percpu));
+ return async_gen_syndrome(blocks, 0, count+2,
+ STRIPE_SIZE, &submit);
+ }
+ }
+
+ init_async_submit(&submit, ASYNC_TX_FENCE, NULL, ops_complete_compute,
+ sh, to_addr_conv(sh, percpu));
+ if (failb == syndrome_disks) {
+ /* We're missing D+P. */
+ return async_raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE,
+ faila, blocks, &submit);
+ } else {
+ /* We're missing D+D. */
+ return async_raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE,
+ faila, failb, blocks, &submit);
+ }
+}
+
+
static void ops_complete_prexor(void *stripe_head_ref)
{
struct stripe_head *sh = stripe_head_ref;
@@ -712,7 +925,7 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
xor_srcs[count++] = dev->page;
}
- init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST, tx,
+ init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
ops_complete_prexor, sh, to_addr_conv(sh, percpu));
tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
@@ -754,17 +967,21 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
return tx;
}
-static void ops_complete_postxor(void *stripe_head_ref)
+static void ops_complete_reconstruct(void *stripe_head_ref)
{
struct stripe_head *sh = stripe_head_ref;
- int disks = sh->disks, i, pd_idx = sh->pd_idx;
+ int disks = sh->disks;
+ int pd_idx = sh->pd_idx;
+ int qd_idx = sh->qd_idx;
+ int i;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
- if (dev->written || i == pd_idx)
+
+ if (dev->written || i == pd_idx || i == qd_idx)
set_bit(R5_UPTODATE, &dev->flags);
}
@@ -782,8 +999,8 @@ static void ops_complete_postxor(void *stripe_head_ref)
}
static void
-ops_run_postxor(struct stripe_head *sh, struct raid5_percpu *percpu,
- struct dma_async_tx_descriptor *tx)
+ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
+ struct dma_async_tx_descriptor *tx)
{
int disks = sh->disks;
struct page **xor_srcs = percpu->scribble;
@@ -826,7 +1043,7 @@ ops_run_postxor(struct stripe_head *sh, struct raid5_percpu *percpu,
atomic_inc(&sh->count);
- init_async_submit(&submit, flags, tx, ops_complete_postxor, sh,
+ init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
to_addr_conv(sh, percpu));
if (unlikely(count == 1))
tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
@@ -834,6 +1051,25 @@ ops_run_postxor(struct stripe_head *sh, struct raid5_percpu *percpu,
tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
}
+static void
+ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
+ struct dma_async_tx_descriptor *tx)
+{
+ struct async_submit_ctl submit;
+ struct page **blocks = percpu->scribble;
+ int count;
+
+ pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
+
+ count = set_syndrome_sources(blocks, sh);
+
+ atomic_inc(&sh->count);
+
+ init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
+ sh, to_addr_conv(sh, percpu));
+ async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+}
+
static void ops_complete_check(void *stripe_head_ref)
{
struct stripe_head *sh = stripe_head_ref;
@@ -846,23 +1082,28 @@ static void ops_complete_check(void *stripe_head_ref)
release_stripe(sh);
}
-static void ops_run_check(struct stripe_head *sh, struct raid5_percpu *percpu)
+static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
{
int disks = sh->disks;
+ int pd_idx = sh->pd_idx;
+ int qd_idx = sh->qd_idx;
+ struct page *xor_dest;
struct page **xor_srcs = percpu->scribble;
struct dma_async_tx_descriptor *tx;
struct async_submit_ctl submit;
-
- int count = 0, pd_idx = sh->pd_idx, i;
- struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
+ int count;
+ int i;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
+ count = 0;
+ xor_dest = sh->dev[pd_idx].page;
+ xor_srcs[count++] = xor_dest;
for (i = disks; i--; ) {
- struct r5dev *dev = &sh->dev[i];
- if (i != pd_idx)
- xor_srcs[count++] = dev->page;
+ if (i == pd_idx || i == qd_idx)
+ continue;
+ xor_srcs[count++] = sh->dev[i].page;
}
init_async_submit(&submit, 0, NULL, NULL, NULL,
@@ -875,11 +1116,32 @@ static void ops_run_check(struct stripe_head *sh, struct raid5_percpu *percpu)
tx = async_trigger_callback(&submit);
}
-static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
+static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
+{
+ struct page **srcs = percpu->scribble;
+ struct async_submit_ctl submit;
+ int count;
+
+ pr_debug("%s: stripe %llu checkp: %d\n", __func__,
+ (unsigned long long)sh->sector, checkp);
+
+ count = set_syndrome_sources(srcs, sh);
+ if (!checkp)
+ srcs[count] = NULL;
+
+ atomic_inc(&sh->count);
+ init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
+ sh, to_addr_conv(sh, percpu));
+ async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
+ &sh->ops.zero_sum_result, percpu->spare_page, &submit);
+}
+
+static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
{
int overlap_clear = 0, i, disks = sh->disks;
struct dma_async_tx_descriptor *tx = NULL;
raid5_conf_t *conf = sh->raid_conf;
+ int level = conf->level;
struct raid5_percpu *percpu;
unsigned long cpu;
@@ -891,9 +1153,16 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
}
if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
- tx = ops_run_compute5(sh, percpu);
- /* terminate the chain if postxor is not set to be run */
- if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request))
+ if (level < 6)
+ tx = ops_run_compute5(sh, percpu);
+ else {
+ if (sh->ops.target2 < 0 || sh->ops.target < 0)
+ tx = ops_run_compute6_1(sh, percpu);
+ else
+ tx = ops_run_compute6_2(sh, percpu);
+ }
+ /* terminate the chain if reconstruct is not set to be run */
+ if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
async_tx_ack(tx);
}
@@ -905,11 +1174,23 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
overlap_clear++;
}
- if (test_bit(STRIPE_OP_POSTXOR, &ops_request))
- ops_run_postxor(sh, percpu, tx);
+ if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
+ if (level < 6)
+ ops_run_reconstruct5(sh, percpu, tx);
+ else
+ ops_run_reconstruct6(sh, percpu, tx);
+ }
- if (test_bit(STRIPE_OP_CHECK, &ops_request))
- ops_run_check(sh, percpu);
+ if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
+ if (sh->check_state == check_state_run)
+ ops_run_check_p(sh, percpu);
+ else if (sh->check_state == check_state_run_q)
+ ops_run_check_pq(sh, percpu, 0);
+ else if (sh->check_state == check_state_run_pq)
+ ops_run_check_pq(sh, percpu, 1);
+ else
+ BUG();
+ }
if (overlap_clear)
for (i = disks; i--; ) {
@@ -1656,258 +1937,13 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
}
-
-/*
- * Copy data between a page in the stripe cache, and one or more bion
- * The page could align with the middle of the bio, or there could be
- * several bion, each with several bio_vecs, which cover part of the page
- * Multiple bion are linked together on bi_next. There may be extras
- * at the end of this list. We ignore them.
- */
-static void copy_data(int frombio, struct bio *bio,
- struct page *page,
- sector_t sector)
-{
- char *pa = page_address(page);
- struct bio_vec *bvl;
- int i;
- int page_offset;
-
- if (bio->bi_sector >= sector)
- page_offset = (signed)(bio->bi_sector - sector) * 512;
- else
- page_offset = (signed)(sector - bio->bi_sector) * -512;
- bio_for_each_segment(bvl, bio, i) {
- int len = bio_iovec_idx(bio,i)->bv_len;
- int clen;
- int b_offset = 0;
-
- if (page_offset < 0) {
- b_offset = -page_offset;
- page_offset += b_offset;
- len -= b_offset;
- }
-
- if (len > 0 && page_offset + len > STRIPE_SIZE)
- clen = STRIPE_SIZE - page_offset;
- else clen = len;
-
- if (clen > 0) {
- char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
- if (frombio)
- memcpy(pa+page_offset, ba+b_offset, clen);
- else
- memcpy(ba+b_offset, pa+page_offset, clen);
- __bio_kunmap_atomic(ba, KM_USER0);
- }
- if (clen < len) /* hit end of page */
- break;
- page_offset += len;
- }
-}
-
-#define check_xor() do { \
- if (count == MAX_XOR_BLOCKS) { \
- xor_blocks(count, STRIPE_SIZE, dest, ptr);\
- count = 0; \
- } \
- } while(0)
-
-static void compute_parity6(struct stripe_head *sh, int method)
-{
- raid5_conf_t *conf = sh->raid_conf;
- int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
- int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
- struct bio *chosen;
- /**** FIX THIS: This could be very bad if disks is close to 256 ****/
- void *ptrs[syndrome_disks+2];
-
- pd_idx = sh->pd_idx;
- qd_idx = sh->qd_idx;
- d0_idx = raid6_d0(sh);
-
- pr_debug("compute_parity, stripe %llu, method %d\n",
- (unsigned long long)sh->sector, method);
-
- switch(method) {
- case READ_MODIFY_WRITE:
- BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */
- case RECONSTRUCT_WRITE:
- for (i= disks; i-- ;)
- if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
- chosen = sh->dev[i].towrite;
- sh->dev[i].towrite = NULL;
-
- if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
- wake_up(&conf->wait_for_overlap);
-
- BUG_ON(sh->dev[i].written);
- sh->dev[i].written = chosen;
- }
- break;
- case CHECK_PARITY:
- BUG(); /* Not implemented yet */
- }
-
- for (i = disks; i--;)
- if (sh->dev[i].written) {
- sector_t sector = sh->dev[i].sector;
- struct bio *wbi = sh->dev[i].written;
- while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
- copy_data(1, wbi, sh->dev[i].page, sector);
- wbi = r5_next_bio(wbi, sector);
- }
-
- set_bit(R5_LOCKED, &sh->dev[i].flags);
- set_bit(R5_UPTODATE, &sh->dev[i].flags);
- }
-
- /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/
-
- for (i = 0; i < disks; i++)
- ptrs[i] = (void *)raid6_empty_zero_page;
-
- count = 0;
- i = d0_idx;
- do {
- int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
-
- ptrs[slot] = page_address(sh->dev[i].page);
- if (slot < syndrome_disks &&
- !test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
- printk(KERN_ERR "block %d/%d not uptodate "
- "on parity calc\n", i, count);
- BUG();
- }
-
- i = raid6_next_disk(i, disks);
- } while (i != d0_idx);
- BUG_ON(count != syndrome_disks);
-
- raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs);
-
- switch(method) {
- case RECONSTRUCT_WRITE:
- set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
- set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
- set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
- set_bit(R5_LOCKED, &sh->dev[qd_idx].flags);
- break;
- case UPDATE_PARITY:
- set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
- set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
- break;
- }
-}
-
-
-/* Compute one missing block */
-static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
-{
- int i, count, disks = sh->disks;
- void *ptr[MAX_XOR_BLOCKS], *dest, *p;
- int qd_idx = sh->qd_idx;
-
- pr_debug("compute_block_1, stripe %llu, idx %d\n",
- (unsigned long long)sh->sector, dd_idx);
-
- if ( dd_idx == qd_idx ) {
- /* We're actually computing the Q drive */
- compute_parity6(sh, UPDATE_PARITY);
- } else {
- dest = page_address(sh->dev[dd_idx].page);
- if (!nozero) memset(dest, 0, STRIPE_SIZE);
- count = 0;
- for (i = disks ; i--; ) {
- if (i == dd_idx || i == qd_idx)
- continue;
- p = page_address(sh->dev[i].page);
- if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
- ptr[count++] = p;
- else
- printk("compute_block() %d, stripe %llu, %d"
- " not present\n", dd_idx,
- (unsigned long long)sh->sector, i);
-
- check_xor();
- }
- if (count)
- xor_blocks(count, STRIPE_SIZE, dest, ptr);
- if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
- else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
- }
-}
-
-/* Compute two missing blocks */
-static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
-{
- int i, count, disks = sh->disks;
- int syndrome_disks = sh->ddf_layout ? disks : disks-2;
- int d0_idx = raid6_d0(sh);
- int faila = -1, failb = -1;
- /**** FIX THIS: This could be very bad if disks is close to 256 ****/
- void *ptrs[syndrome_disks+2];
-
- for (i = 0; i < disks ; i++)
- ptrs[i] = (void *)raid6_empty_zero_page;
- count = 0;
- i = d0_idx;
- do {
- int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
-
- ptrs[slot] = page_address(sh->dev[i].page);
-
- if (i == dd_idx1)
- faila = slot;
- if (i == dd_idx2)
- failb = slot;
- i = raid6_next_disk(i, disks);
- } while (i != d0_idx);
- BUG_ON(count != syndrome_disks);
-
- BUG_ON(faila == failb);
- if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
-
- pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
- (unsigned long long)sh->sector, dd_idx1, dd_idx2,
- faila, failb);
-
- if (failb == syndrome_disks+1) {
- /* Q disk is one of the missing disks */
- if (faila == syndrome_disks) {
- /* Missing P+Q, just recompute */
- compute_parity6(sh, UPDATE_PARITY);
- return;
- } else {
- /* We're missing D+Q; recompute D from P */
- compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ?
- dd_idx2 : dd_idx1),
- 0);
- compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
- return;
- }
- }
-
- /* We're missing D+P or D+D; */
- if (failb == syndrome_disks) {
- /* We're missing D+P. */
- raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs);
- } else {
- /* We're missing D+D. */
- raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb,
- ptrs);
- }
-
- /* Both the above update both missing blocks */
- set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
- set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
-}
-
static void
-schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
+schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
int rcw, int expand)
{
int i, pd_idx = sh->pd_idx, disks = sh->disks;
+ raid5_conf_t *conf = sh->raid_conf;
+ int level = conf->level;
if (rcw) {
/* if we are not expanding this is a proper write request, and
@@ -1920,7 +1956,7 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
} else
sh->reconstruct_state = reconstruct_state_run;
- set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
+ set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
@@ -1933,17 +1969,18 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
s->locked++;
}
}
- if (s->locked + 1 == disks)
+ if (s->locked + conf->max_degraded == disks)
if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
- atomic_inc(&sh->raid_conf->pending_full_writes);
+ atomic_inc(&conf->pending_full_writes);
} else {
+ BUG_ON(level == 6);
BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
sh->reconstruct_state = reconstruct_state_prexor_drain_run;
set_bit(STRIPE_OP_PREXOR, &s->ops_request);
set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
- set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
+ set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
@@ -1961,13 +1998,22 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
}
}
- /* keep the parity disk locked while asynchronous operations
+ /* keep the parity disk(s) locked while asynchronous operations
* are in flight
*/
set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
s->locked++;
+ if (level == 6) {
+ int qd_idx = sh->qd_idx;
+ struct r5dev *dev = &sh->dev[qd_idx];
+
+ set_bit(R5_LOCKED, &dev->flags);
+ clear_bit(R5_UPTODATE, &dev->flags);
+ s->locked++;
+ }
+
pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
__func__, (unsigned long long)sh->sector,
s->locked, s->ops_request);
@@ -2048,13 +2094,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
static void end_reshape(raid5_conf_t *conf);
-static int page_is_zero(struct page *p)
-{
- char *a = page_address(p);
- return ((*(u32*)a) == 0 &&
- memcmp(a, a+4, STRIPE_SIZE-4)==0);
-}
-
static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
struct stripe_head *sh)
{
@@ -2195,9 +2234,10 @@ static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
set_bit(R5_Wantcompute, &dev->flags);
sh->ops.target = disk_idx;
+ sh->ops.target2 = -1;
s->req_compute = 1;
/* Careful: from this point on 'uptodate' is in the eye
- * of raid5_run_ops which services 'compute' operations
+ * of raid_run_ops which services 'compute' operations
* before writes. R5_Wantcompute flags a block that will
* be R5_UPTODATE by the time it is needed for a
* subsequent operation.
@@ -2236,61 +2276,104 @@ static void handle_stripe_fill5(struct stripe_head *sh,
set_bit(STRIPE_HANDLE, &sh->state);
}
-static void handle_stripe_fill6(struct stripe_head *sh,
- struct stripe_head_state *s, struct r6_state *r6s,
- int disks)
+/* fetch_block6 - checks the given member device to see if its data needs
+ * to be read or computed to satisfy a request.
+ *
+ * Returns 1 when no more member devices need to be checked, otherwise returns
+ * 0 to tell the loop in handle_stripe_fill6 to continue
+ */
+static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
+ struct r6_state *r6s, int disk_idx, int disks)
{
- int i;
- for (i = disks; i--; ) {
- struct r5dev *dev = &sh->dev[i];
- if (!test_bit(R5_LOCKED, &dev->flags) &&
- !test_bit(R5_UPTODATE, &dev->flags) &&
- (dev->toread || (dev->towrite &&
- !test_bit(R5_OVERWRITE, &dev->flags)) ||
- s->syncing || s->expanding ||
- (s->failed >= 1 &&
- (sh->dev[r6s->failed_num[0]].toread ||
- s->to_write)) ||
- (s->failed >= 2 &&
- (sh->dev[r6s->failed_num[1]].toread ||
- s->to_write)))) {
- /* we would like to get this block, possibly
- * by computing it, but we might not be able to
+ struct r5dev *dev = &sh->dev[disk_idx];
+ struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]],
+ &sh->dev[r6s->failed_num[1]] };
+
+ if (!test_bit(R5_LOCKED, &dev->flags) &&
+ !test_bit(R5_UPTODATE, &dev->flags) &&
+ (dev->toread ||
+ (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
+ s->syncing || s->expanding ||
+ (s->failed >= 1 &&
+ (fdev[0]->toread || s->to_write)) ||
+ (s->failed >= 2 &&
+ (fdev[1]->toread || s->to_write)))) {
+ /* we would like to get this block, possibly by computing it,
+ * otherwise read it if the backing disk is insync
+ */
+ BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
+ BUG_ON(test_bit(R5_Wantread, &dev->flags));
+ if ((s->uptodate == disks - 1) &&
+ (s->failed && (disk_idx == r6s->failed_num[0] ||
+ disk_idx == r6s->failed_num[1]))) {
+ /* have disk failed, and we're requested to fetch it;
+ * do compute it
*/
- if ((s->uptodate == disks - 1) &&
- (s->failed && (i == r6s->failed_num[0] ||
- i == r6s->failed_num[1]))) {
- pr_debug("Computing stripe %llu block %d\n",
- (unsigned long long)sh->sector, i);
- compute_block_1(sh, i, 0);
- s->uptodate++;
- } else if ( s->uptodate == disks-2 && s->failed >= 2 ) {
- /* Computing 2-failure is *very* expensive; only
- * do it if failed >= 2
- */
- int other;
- for (other = disks; other--; ) {
- if (other == i)
- continue;
- if (!test_bit(R5_UPTODATE,
- &sh->dev[other].flags))
- break;
- }
- BUG_ON(other < 0);
- pr_debug("Computing stripe %llu blocks %d,%d\n",
- (unsigned long long)sh->sector,
- i, other);
- compute_block_2(sh, i, other);
- s->uptodate += 2;
- } else if (test_bit(R5_Insync, &dev->flags)) {
- set_bit(R5_LOCKED, &dev->flags);
- set_bit(R5_Wantread, &dev->flags);
- s->locked++;
- pr_debug("Reading block %d (sync=%d)\n",
- i, s->syncing);
+ pr_debug("Computing stripe %llu block %d\n",
+ (unsigned long long)sh->sector, disk_idx);
+ set_bit(STRIPE_COMPUTE_RUN, &sh->state);
+ set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
+ set_bit(R5_Wantcompute, &dev->flags);
+ sh->ops.target = disk_idx;
+ sh->ops.target2 = -1; /* no 2nd target */
+ s->req_compute = 1;
+ s->uptodate++;
+ return 1;
+ } else if (s->uptodate == disks-2 && s->failed >= 2) {
+ /* Computing 2-failure is *very* expensive; only
+ * do it if failed >= 2
+ */
+ int other;
+ for (other = disks; other--; ) {
+ if (other == disk_idx)
+ continue;
+ if (!test_bit(R5_UPTODATE,
+ &sh->dev[other].flags))
+ break;
}
+ BUG_ON(other < 0);
+ pr_debug("Computing stripe %llu blocks %d,%d\n",
+ (unsigned long long)sh->sector,
+ disk_idx, other);
+ set_bit(STRIPE_COMPUTE_RUN, &sh->state);
+ set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
+ set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
+ set_bit(R5_Wantcompute, &sh->dev[other].flags);
+ sh->ops.target = disk_idx;
+ sh->ops.target2 = other;
+ s->uptodate += 2;
+ s->req_compute = 1;
+ return 1;
+ } else if (test_bit(R5_Insync, &dev->flags)) {
+ set_bit(R5_LOCKED, &dev->flags);
+ set_bit(R5_Wantread, &dev->flags);
+ s->locked++;
+ pr_debug("Reading block %d (sync=%d)\n",
+ disk_idx, s->syncing);
}
}
+
+ return 0;
+}
+
+/**
+ * handle_stripe_fill6 - read or compute data to satisfy pending requests.
+ */
+static void handle_stripe_fill6(struct stripe_head *sh,
+ struct stripe_head_state *s, struct r6_state *r6s,
+ int disks)
+{
+ int i;
+
+ /* look for blocks to read/compute, skip this if a compute
+ * is already in flight, or if the stripe contents are in the
+ * midst of changing due to a write
+ */
+ if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
+ !sh->reconstruct_state)
+ for (i = disks; i--; )
+ if (fetch_block6(sh, s, r6s, i, disks))
+ break;
set_bit(STRIPE_HANDLE, &sh->state);
}
@@ -2424,114 +2507,61 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
*/
/* since handle_stripe can be called at any time we need to handle the
* case where a compute block operation has been submitted and then a
- * subsequent call wants to start a write request. raid5_run_ops only
- * handles the case where compute block and postxor are requested
+ * subsequent call wants to start a write request. raid_run_ops only
+ * handles the case where compute block and reconstruct are requested
* simultaneously. If this is not the case then new writes need to be
* held off until the compute completes.
*/
if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
(s->locked == 0 && (rcw == 0 || rmw == 0) &&
!test_bit(STRIPE_BIT_DELAY, &sh->state)))
- schedule_reconstruction5(sh, s, rcw == 0, 0);
+ schedule_reconstruction(sh, s, rcw == 0, 0);
}
static void handle_stripe_dirtying6(raid5_conf_t *conf,
struct stripe_head *sh, struct stripe_head_state *s,
struct r6_state *r6s, int disks)
{
- int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i;
+ int rcw = 0, pd_idx = sh->pd_idx, i;
int qd_idx = sh->qd_idx;
+
+ set_bit(STRIPE_HANDLE, &sh->state);
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
- /* Would I have to read this buffer for reconstruct_write */
- if (!test_bit(R5_OVERWRITE, &dev->flags)
- && i != pd_idx && i != qd_idx
- && (!test_bit(R5_LOCKED, &dev->flags)
- ) &&
- !test_bit(R5_UPTODATE, &dev->flags)) {
- if (test_bit(R5_Insync, &dev->flags)) rcw++;
- else {
- pr_debug("raid6: must_compute: "
- "disk %d flags=%#lx\n", i, dev->flags);
- must_compute++;
+ /* check if we haven't enough data */
+ if (!test_bit(R5_OVERWRITE, &dev->flags) &&
+ i != pd_idx && i != qd_idx &&
+ !test_bit(R5_LOCKED, &dev->flags) &&
+ !(test_bit(R5_UPTODATE, &dev->flags) ||
+ test_bit(R5_Wantcompute, &dev->flags))) {
+ rcw++;
+ if (!test_bit(R5_Insync, &dev->flags))
+ continue; /* it's a failed drive */
+
+ if (
+ test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+ pr_debug("Read_old stripe %llu "
+ "block %d for Reconstruct\n",
+ (unsigned long long)sh->sector, i);
+ set_bit(R5_LOCKED, &dev->flags);
+ set_bit(R5_Wantread, &dev->flags);
+ s->locked++;
+ } else {
+ pr_debug("Request delayed stripe %llu "
+ "block %d for Reconstruct\n",
+ (unsigned long long)sh->sector, i);
+ set_bit(STRIPE_DELAYED, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
}
}
}
- pr_debug("for sector %llu, rcw=%d, must_compute=%d\n",
- (unsigned long long)sh->sector, rcw, must_compute);
- set_bit(STRIPE_HANDLE, &sh->state);
-
- if (rcw > 0)
- /* want reconstruct write, but need to get some data */
- for (i = disks; i--; ) {
- struct r5dev *dev = &sh->dev[i];
- if (!test_bit(R5_OVERWRITE, &dev->flags)
- && !(s->failed == 0 && (i == pd_idx || i == qd_idx))
- && !test_bit(R5_LOCKED, &dev->flags) &&
- !test_bit(R5_UPTODATE, &dev->flags) &&
- test_bit(R5_Insync, &dev->flags)) {
- if (
- test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
- pr_debug("Read_old stripe %llu "
- "block %d for Reconstruct\n",
- (unsigned long long)sh->sector, i);
- set_bit(R5_LOCKED, &dev->flags);
- set_bit(R5_Wantread, &dev->flags);
- s->locked++;
- } else {
- pr_debug("Request delayed stripe %llu "
- "block %d for Reconstruct\n",
- (unsigned long long)sh->sector, i);
- set_bit(STRIPE_DELAYED, &sh->state);
- set_bit(STRIPE_HANDLE, &sh->state);
- }
- }
- }
/* now if nothing is locked, and if we have enough data, we can start a
* write request
*/
- if (s->locked == 0 && rcw == 0 &&
+ if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
+ s->locked == 0 && rcw == 0 &&
!test_bit(STRIPE_BIT_DELAY, &sh->state)) {
- if (must_compute > 0) {
- /* We have failed blocks and need to compute them */
- switch (s->failed) {
- case 0:
- BUG();
- case 1:
- compute_block_1(sh, r6s->failed_num[0], 0);
- break;
- case 2:
- compute_block_2(sh, r6s->failed_num[0],
- r6s->failed_num[1]);
- break;
- default: /* This request should have been failed? */
- BUG();
- }
- }
-
- pr_debug("Computing parity for stripe %llu\n",
- (unsigned long long)sh->sector);
- compute_parity6(sh, RECONSTRUCT_WRITE);
- /* now every locked buffer is ready to be written */
- for (i = disks; i--; )
- if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
- pr_debug("Writing stripe %llu block %d\n",
- (unsigned long long)sh->sector, i);
- s->locked++;
- set_bit(R5_Wantwrite, &sh->dev[i].flags);
- }
- if (s->locked == disks)
- if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
- atomic_inc(&conf->pending_full_writes);
- /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
- set_bit(STRIPE_INSYNC, &sh->state);
-
- if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
- atomic_dec(&conf->preread_active_stripes);
- if (atomic_read(&conf->preread_active_stripes) <
- IO_THRESHOLD)
- md_wakeup_thread(conf->mddev->thread);
- }
+ schedule_reconstruction(sh, s, 1, 0);
}
}
@@ -2607,6 +2637,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
set_bit(R5_Wantcompute,
&sh->dev[sh->pd_idx].flags);
sh->ops.target = sh->pd_idx;
+ sh->ops.target2 = -1;
s->uptodate++;
}
}
@@ -2626,91 +2657,163 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
struct stripe_head_state *s,
struct r6_state *r6s, int disks)
{
- int update_p = 0, update_q = 0;
- struct r5dev *dev;
int pd_idx = sh->pd_idx;
int qd_idx = sh->qd_idx;
- unsigned long cpu;
- struct page *tmp_page;
+ struct r5dev *dev;
set_bit(STRIPE_HANDLE, &sh->state);
BUG_ON(s->failed > 2);
- BUG_ON(s->uptodate < disks);
+
/* Want to check and possibly repair P and Q.
* However there could be one 'failed' device, in which
* case we can only check one of them, possibly using the
* other to generate missing data
*/
- cpu = get_cpu();
- tmp_page = per_cpu_ptr(conf->percpu, cpu)->spare_page;
- if (s->failed == r6s->q_failed) {
- /* The only possible failed device holds 'Q', so it
- * makes sense to check P (If anything else were failed,
- * we would have used P to recreate it).
- */
- compute_block_1(sh, pd_idx, 1);
- if (!page_is_zero(sh->dev[pd_idx].page)) {
- compute_block_1(sh, pd_idx, 0);
- update_p = 1;
+
+ switch (sh->check_state) {
+ case check_state_idle:
+ /* start a new check operation if there are < 2 failures */
+ if (s->failed == r6s->q_failed) {
+ /* The only possible failed device holds Q, so it
+ * makes sense to check P (If anything else were failed,
+ * we would have used P to recreate it).
+ */
+ sh->check_state = check_state_run;
}
- }
- if (!r6s->q_failed && s->failed < 2) {
- /* q is not failed, and we didn't use it to generate
- * anything, so it makes sense to check it
- */
- memcpy(page_address(tmp_page),
- page_address(sh->dev[qd_idx].page),
- STRIPE_SIZE);
- compute_parity6(sh, UPDATE_PARITY);
- if (memcmp(page_address(tmp_page),
- page_address(sh->dev[qd_idx].page),
- STRIPE_SIZE) != 0) {
- clear_bit(STRIPE_INSYNC, &sh->state);
- update_q = 1;
+ if (!r6s->q_failed && s->failed < 2) {
+ /* Q is not failed, and we didn't use it to generate
+ * anything, so it makes sense to check it
+ */
+ if (sh->check_state == check_state_run)
+ sh->check_state = check_state_run_pq;
+ else
+ sh->check_state = check_state_run_q;
}
- }
- put_cpu();
- if (update_p || update_q) {
- conf->mddev->resync_mismatches += STRIPE_SECTORS;
- if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
- /* don't try to repair!! */
- update_p = update_q = 0;
- }
+ /* discard potentially stale zero_sum_result */
+ sh->ops.zero_sum_result = 0;
- /* now write out any block on a failed drive,
- * or P or Q if they need it
- */
+ if (sh->check_state == check_state_run) {
+ /* async_xor_zero_sum destroys the contents of P */
+ clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+ s->uptodate--;
+ }
+ if (sh->check_state >= check_state_run &&
+ sh->check_state <= check_state_run_pq) {
+ /* async_syndrome_zero_sum preserves P and Q, so
+ * no need to mark them !uptodate here
+ */
+ set_bit(STRIPE_OP_CHECK, &s->ops_request);
+ break;
+ }
- if (s->failed == 2) {
- dev = &sh->dev[r6s->failed_num[1]];
- s->locked++;
- set_bit(R5_LOCKED, &dev->flags);
- set_bit(R5_Wantwrite, &dev->flags);
- }
- if (s->failed >= 1) {
- dev = &sh->dev[r6s->failed_num[0]];
- s->locked++;
- set_bit(R5_LOCKED, &dev->flags);
- set_bit(R5_Wantwrite, &dev->flags);
- }
+ /* we have 2-disk failure */
+ BUG_ON(s->failed != 2);
+ /* fall through */
+ case check_state_compute_result:
+ sh->check_state = check_state_idle;
- if (update_p) {
- dev = &sh->dev[pd_idx];
- s->locked++;
- set_bit(R5_LOCKED, &dev->flags);
- set_bit(R5_Wantwrite, &dev->flags);
- }
- if (update_q) {
- dev = &sh->dev[qd_idx];
- s->locked++;
- set_bit(R5_LOCKED, &dev->flags);
- set_bit(R5_Wantwrite, &dev->flags);
- }
- clear_bit(STRIPE_DEGRADED, &sh->state);
+ /* check that a write has not made the stripe insync */
+ if (test_bit(STRIPE_INSYNC, &sh->state))
+ break;
- set_bit(STRIPE_INSYNC, &sh->state);
+ /* now write out any block on a failed drive,
+ * or P or Q if they were recomputed
+ */
+ BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
+ if (s->failed == 2) {
+ dev = &sh->dev[r6s->failed_num[1]];
+ s->locked++;
+ set_bit(R5_LOCKED, &dev->flags);
+ set_bit(R5_Wantwrite, &dev->flags);
+ }
+ if (s->failed >= 1) {
+ dev = &sh->dev[r6s->failed_num[0]];
+ s->locked++;
+ set_bit(R5_LOCKED, &dev->flags);
+ set_bit(R5_Wantwrite, &dev->flags);
+ }
+ if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
+ dev = &sh->dev[pd_idx];
+ s->locked++;
+ set_bit(R5_LOCKED, &dev->flags);
+ set_bit(R5_Wantwrite, &dev->flags);
+ }
+ if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
+ dev = &sh->dev[qd_idx];
+ s->locked++;
+ set_bit(R5_LOCKED, &dev->flags);
+ set_bit(R5_Wantwrite, &dev->flags);
+ }
+ clear_bit(STRIPE_DEGRADED, &sh->state);
+
+ set_bit(STRIPE_INSYNC, &sh->state);
+ break;
+ case check_state_run:
+ case check_state_run_q:
+ case check_state_run_pq:
+ break; /* we will be called again upon completion */
+ case check_state_check_result:
+ sh->check_state = check_state_idle;
+
+ /* handle a successful check operation, if parity is correct
+ * we are done. Otherwise update the mismatch count and repair
+ * parity if !MD_RECOVERY_CHECK
+ */
+ if (sh->ops.zero_sum_result == 0) {
+ /* both parities are correct */
+ if (!s->failed)
+ set_bit(STRIPE_INSYNC, &sh->state);
+ else {
+ /* in contrast to the raid5 case we can validate
+ * parity, but still have a failure to write
+ * back
+ */
+ sh->check_state = check_state_compute_result;
+ /* Returning at this point means that we may go
+ * off and bring p and/or q uptodate again so
+ * we make sure to check zero_sum_result again
+ * to verify if p or q need writeback
+ */
+ }
+ } else {
+ conf->mddev->resync_mismatches += STRIPE_SECTORS;
+ if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+ /* don't try to repair!! */
+ set_bit(STRIPE_INSYNC, &sh->state);
+ else {
+ int *target = &sh->ops.target;
+
+ sh->ops.target = -1;
+ sh->ops.target2 = -1;
+ sh->check_state = check_state_compute_run;
+ set_bit(STRIPE_COMPUTE_RUN, &sh->state);
+ set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
+ if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
+ set_bit(R5_Wantcompute,
+ &sh->dev[pd_idx].flags);
+ *target = pd_idx;
+ target = &sh->ops.target2;
+ s->uptodate++;
+ }
+ if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
+ set_bit(R5_Wantcompute,
+ &sh->dev[qd_idx].flags);
+ *target = qd_idx;
+ s->uptodate++;
+ }
+ }
+ }
+ break;
+ case check_state_compute_run:
+ break;
+ default:
+ printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
+ __func__, sh->check_state,
+ (unsigned long long) sh->sector);
+ BUG();
+ }
}
static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
@@ -2732,7 +2835,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
sector_t bn = compute_blocknr(sh, i, 1);
sector_t s = raid5_compute_sector(conf, bn, 0,
&dd_idx, NULL);
- sh2 = get_active_stripe(conf, s, 0, 1);
+ sh2 = get_active_stripe(conf, s, 0, 1, 1);
if (sh2 == NULL)
/* so far only the early blocks of this stripe
* have been requested. When later blocks
@@ -3006,7 +3109,7 @@ static bool handle_stripe5(struct stripe_head *sh)
/* Finish reconstruct operations initiated by the expansion process */
if (sh->reconstruct_state == reconstruct_state_result) {
struct stripe_head *sh2
- = get_active_stripe(conf, sh->sector, 1, 1);
+ = get_active_stripe(conf, sh->sector, 1, 1, 1);
if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
/* sh cannot be written until sh2 has been read.
* so arrange for sh to be delayed a little
@@ -3036,7 +3139,7 @@ static bool handle_stripe5(struct stripe_head *sh)
/* Need to write out all blocks after computing parity */
sh->disks = conf->raid_disks;
stripe_set_idx(sh->sector, conf, 0, sh);
- schedule_reconstruction5(sh, &s, 1, 1);
+ schedule_reconstruction(sh, &s, 1, 1);
} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
clear_bit(STRIPE_EXPAND_READY, &sh->state);
atomic_dec(&conf->reshape_stripes);
@@ -3056,7 +3159,7 @@ static bool handle_stripe5(struct stripe_head *sh)
md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
if (s.ops_request)
- raid5_run_ops(sh, s.ops_request);
+ raid_run_ops(sh, s.ops_request);
ops_run_io(sh, &s);
@@ -3077,9 +3180,10 @@ static bool handle_stripe6(struct stripe_head *sh)
mdk_rdev_t *blocked_rdev = NULL;
pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
- "pd_idx=%d, qd_idx=%d\n",
+ "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
(unsigned long long)sh->sector, sh->state,
- atomic_read(&sh->count), pd_idx, qd_idx);
+ atomic_read(&sh->count), pd_idx, qd_idx,
+ sh->check_state, sh->reconstruct_state);
memset(&s, 0, sizeof(s));
spin_lock(&sh->lock);
@@ -3099,35 +3203,24 @@ static bool handle_stripe6(struct stripe_head *sh)
pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
i, dev->flags, dev->toread, dev->towrite, dev->written);
- /* maybe we can reply to a read */
- if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
- struct bio *rbi, *rbi2;
- pr_debug("Return read for disc %d\n", i);
- spin_lock_irq(&conf->device_lock);
- rbi = dev->toread;
- dev->toread = NULL;
- if (test_and_clear_bit(R5_Overlap, &dev->flags))
- wake_up(&conf->wait_for_overlap);
- spin_unlock_irq(&conf->device_lock);
- while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
- copy_data(0, rbi, dev->page, dev->sector);
- rbi2 = r5_next_bio(rbi, dev->sector);
- spin_lock_irq(&conf->device_lock);
- if (!raid5_dec_bi_phys_segments(rbi)) {
- rbi->bi_next = return_bi;
- return_bi = rbi;
- }
- spin_unlock_irq(&conf->device_lock);
- rbi = rbi2;
- }
- }
+ /* maybe we can reply to a read
+ *
+ * new wantfill requests are only permitted while
+ * ops_complete_biofill is guaranteed to be inactive
+ */
+ if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
+ !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
+ set_bit(R5_Wantfill, &dev->flags);
/* now count some things */
if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
+ if (test_bit(R5_Wantcompute, &dev->flags))
+ BUG_ON(++s.compute > 2);
-
- if (dev->toread)
+ if (test_bit(R5_Wantfill, &dev->flags)) {
+ s.to_fill++;
+ } else if (dev->toread)
s.to_read++;
if (dev->towrite) {
s.to_write++;
@@ -3168,6 +3261,11 @@ static bool handle_stripe6(struct stripe_head *sh)
blocked_rdev = NULL;
}
+ if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
+ set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
+ set_bit(STRIPE_BIOFILL_RUN, &sh->state);
+ }
+
pr_debug("locked=%d uptodate=%d to_read=%d"
" to_write=%d failed=%d failed_num=%d,%d\n",
s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
@@ -3208,18 +3306,61 @@ static bool handle_stripe6(struct stripe_head *sh)
* or to load a block that is being partially written.
*/
if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
- (s.syncing && (s.uptodate < disks)) || s.expanding)
+ (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
handle_stripe_fill6(sh, &s, &r6s, disks);
- /* now to consider writing and what else, if anything should be read */
- if (s.to_write)
+ /* Now we check to see if any write operations have recently
+ * completed
+ */
+ if (sh->reconstruct_state == reconstruct_state_drain_result) {
+ int qd_idx = sh->qd_idx;
+
+ sh->reconstruct_state = reconstruct_state_idle;
+ /* All the 'written' buffers and the parity blocks are ready to
+ * be written back to disk
+ */
+ BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
+ BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags));
+ for (i = disks; i--; ) {
+ dev = &sh->dev[i];
+ if (test_bit(R5_LOCKED, &dev->flags) &&
+ (i == sh->pd_idx || i == qd_idx ||
+ dev->written)) {
+ pr_debug("Writing block %d\n", i);
+ BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
+ set_bit(R5_Wantwrite, &dev->flags);
+ if (!test_bit(R5_Insync, &dev->flags) ||
+ ((i == sh->pd_idx || i == qd_idx) &&
+ s.failed == 0))
+ set_bit(STRIPE_INSYNC, &sh->state);
+ }
+ }
+ if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+ atomic_dec(&conf->preread_active_stripes);
+ if (atomic_read(&conf->preread_active_stripes) <
+ IO_THRESHOLD)
+ md_wakeup_thread(conf->mddev->thread);
+ }
+ }
+
+ /* Now to consider new write requests and what else, if anything
+ * should be read. We do not handle new writes when:
+ * 1/ A 'write' operation (copy+gen_syndrome) is already in flight.
+ * 2/ A 'check' operation is in flight, as it may clobber the parity
+ * block.
+ */
+ if (s.to_write && !sh->reconstruct_state && !sh->check_state)
handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
/* maybe we need to check and possibly fix the parity for this stripe
* Any reads will already have been scheduled, so we just see if enough
- * data is available
+ * data is available. The parity check is held off while parity
+ * dependent operations are in flight.
*/
- if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state))
+ if (sh->check_state ||
+ (s.syncing && s.locked == 0 &&
+ !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
+ !test_bit(STRIPE_INSYNC, &sh->state)))
handle_parity_checks6(conf, sh, &s, &r6s, disks);
if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
@@ -3241,17 +3382,31 @@ static bool handle_stripe6(struct stripe_head *sh)
set_bit(R5_Wantwrite, &dev->flags);
set_bit(R5_ReWrite, &dev->flags);
set_bit(R5_LOCKED, &dev->flags);
+ s.locked++;
} else {
/* let's read it back */
set_bit(R5_Wantread, &dev->flags);
set_bit(R5_LOCKED, &dev->flags);
+ s.locked++;
}
}
}
- if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+ /* Finish reconstruct operations initiated by the expansion process */
+ if (sh->reconstruct_state == reconstruct_state_result) {
+ sh->reconstruct_state = reconstruct_state_idle;
+ clear_bit(STRIPE_EXPANDING, &sh->state);
+ for (i = conf->raid_disks; i--; ) {
+ set_bit(R5_Wantwrite, &sh->dev[i].flags);
+ set_bit(R5_LOCKED, &sh->dev[i].flags);
+ s.locked++;
+ }
+ }
+
+ if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
+ !sh->reconstruct_state) {
struct stripe_head *sh2
- = get_active_stripe(conf, sh->sector, 1, 1);
+ = get_active_stripe(conf, sh->sector, 1, 1, 1);
if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
/* sh cannot be written until sh2 has been read.
* so arrange for sh to be delayed a little
@@ -3270,14 +3425,8 @@ static bool handle_stripe6(struct stripe_head *sh)
/* Need to write out all blocks after computing P&Q */
sh->disks = conf->raid_disks;
stripe_set_idx(sh->sector, conf, 0, sh);
- compute_parity6(sh, RECONSTRUCT_WRITE);
- for (i = conf->raid_disks ; i-- ; ) {
- set_bit(R5_LOCKED, &sh->dev[i].flags);
- s.locked++;
- set_bit(R5_Wantwrite, &sh->dev[i].flags);
- }
- clear_bit(STRIPE_EXPANDING, &sh->state);
- } else if (s.expanded) {
+ schedule_reconstruction(sh, &s, 1, 1);
+ } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
clear_bit(STRIPE_EXPAND_READY, &sh->state);
atomic_dec(&conf->reshape_stripes);
wake_up(&conf->wait_for_overlap);
@@ -3295,6 +3444,9 @@ static bool handle_stripe6(struct stripe_head *sh)
if (unlikely(blocked_rdev))
md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
+ if (s.ops_request)
+ raid_run_ops(sh, s.ops_request);
+
ops_run_io(sh, &s);
return_io(return_bi);
@@ -3348,7 +3500,7 @@ static void unplug_slaves(mddev_t *mddev)
int i;
rcu_read_lock();
- for (i=0; i<mddev->raid_disks; i++) {
+ for (i = 0; i < conf->raid_disks; i++) {
mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
@@ -3735,7 +3887,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
(unsigned long long)logical_sector);
sh = get_active_stripe(conf, new_sector, previous,
- (bi->bi_rw&RWA_MASK));
+ (bi->bi_rw&RWA_MASK), 0);
if (sh) {
if (unlikely(previous)) {
/* expansion might have moved on while waiting for a
@@ -3871,13 +4023,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
safepos = conf->reshape_safe;
sector_div(safepos, data_disks);
if (mddev->delta_disks < 0) {
- writepos -= reshape_sectors;
+ writepos -= min_t(sector_t, reshape_sectors, writepos);
readpos += reshape_sectors;
safepos += reshape_sectors;
} else {
writepos += reshape_sectors;
- readpos -= reshape_sectors;
- safepos -= reshape_sectors;
+ readpos -= min_t(sector_t, reshape_sectors, readpos);
+ safepos -= min_t(sector_t, reshape_sectors, safepos);
}
/* 'writepos' is the most advanced device address we might write.
@@ -3905,6 +4057,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
wait_event(conf->wait_for_overlap,
atomic_read(&conf->reshape_stripes)==0);
mddev->reshape_position = conf->reshape_progress;
+ mddev->curr_resync_completed = mddev->curr_resync;
conf->reshape_checkpoint = jiffies;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
md_wakeup_thread(mddev->thread);
@@ -3914,6 +4067,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
+ sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
if (mddev->delta_disks < 0) {
@@ -3931,7 +4085,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
int j;
int skipped = 0;
- sh = get_active_stripe(conf, stripe_addr+i, 0, 0);
+ sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
set_bit(STRIPE_EXPANDING, &sh->state);
atomic_inc(&conf->reshape_stripes);
/* If any of this stripe is beyond the end of the old
@@ -3974,13 +4128,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
raid5_compute_sector(conf, stripe_addr*(new_data_disks),
1, &dd_idx, NULL);
last_sector =
- raid5_compute_sector(conf, ((stripe_addr+conf->chunk_size/512)
+ raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
*(new_data_disks) - 1),
1, &dd_idx, NULL);
if (last_sector >= mddev->dev_sectors)
last_sector = mddev->dev_sectors - 1;
while (first_sector <= last_sector) {
- sh = get_active_stripe(conf, first_sector, 1, 0);
+ sh = get_active_stripe(conf, first_sector, 1, 0, 1);
set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
@@ -3998,11 +4152,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
* then we need to write out the superblock.
*/
sector_nr += reshape_sectors;
- if (sector_nr >= mddev->resync_max) {
+ if ((sector_nr - mddev->curr_resync_completed) * 2
+ >= mddev->resync_max - mddev->curr_resync_completed) {
/* Cannot proceed until we've updated the superblock... */
wait_event(conf->wait_for_overlap,
atomic_read(&conf->reshape_stripes) == 0);
mddev->reshape_position = conf->reshape_progress;
+ mddev->curr_resync_completed = mddev->curr_resync;
conf->reshape_checkpoint = jiffies;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
md_wakeup_thread(mddev->thread);
@@ -4013,6 +4169,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
+ sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
return reshape_sectors;
}
@@ -4077,9 +4234,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
bitmap_cond_end_sync(mddev->bitmap, sector_nr);
- sh = get_active_stripe(conf, sector_nr, 0, 1);
+ sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
if (sh == NULL) {
- sh = get_active_stripe(conf, sector_nr, 0, 0);
+ sh = get_active_stripe(conf, sector_nr, 0, 0, 0);
/* make sure we don't swamp the stripe cache if someone else
* is trying to get access
*/
@@ -4089,7 +4246,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
* We don't need to check the 'failed' flag as when that gets set,
* recovery aborts.
*/
- for (i=0; i<mddev->raid_disks; i++)
+ for (i = 0; i < conf->raid_disks; i++)
if (conf->disks[i].rdev == NULL)
still_degraded = 1;
@@ -4141,7 +4298,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
/* already done this stripe */
continue;
- sh = get_active_stripe(conf, sector, 0, 1);
+ sh = get_active_stripe(conf, sector, 0, 1, 0);
if (!sh) {
/* failed to get a stripe - must wait */
@@ -4172,6 +4329,36 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
return handled;
}
+#ifdef CONFIG_MULTICORE_RAID456
+static void __process_stripe(void *param, async_cookie_t cookie)
+{
+ struct stripe_head *sh = param;
+
+ handle_stripe(sh);
+ release_stripe(sh);
+}
+
+static void process_stripe(struct stripe_head *sh, struct list_head *domain)
+{
+ async_schedule_domain(__process_stripe, sh, domain);
+}
+
+static void synchronize_stripe_processing(struct list_head *domain)
+{
+ async_synchronize_full_domain(domain);
+}
+#else
+static void process_stripe(struct stripe_head *sh, struct list_head *domain)
+{
+ handle_stripe(sh);
+ release_stripe(sh);
+ cond_resched();
+}
+
+static void synchronize_stripe_processing(struct list_head *domain)
+{
+}
+#endif
/*
@@ -4186,6 +4373,7 @@ static void raid5d(mddev_t *mddev)
struct stripe_head *sh;
raid5_conf_t *conf = mddev_to_conf(mddev);
int handled;
+ LIST_HEAD(raid_domain);
pr_debug("+++ raid5d active\n");
@@ -4222,8 +4410,7 @@ static void raid5d(mddev_t *mddev)
spin_unlock_irq(&conf->device_lock);
handled++;
- handle_stripe(sh);
- release_stripe(sh);
+ process_stripe(sh, &raid_domain);
spin_lock_irq(&conf->device_lock);
}
@@ -4231,6 +4418,7 @@ static void raid5d(mddev_t *mddev)
spin_unlock_irq(&conf->device_lock);
+ synchronize_stripe_processing(&raid_domain);
async_tx_issue_pending_all();
unplug_slaves(mddev);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 75f2c6c4cf90..116d0b44b2a9 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -176,7 +176,9 @@
*/
enum check_states {
check_state_idle = 0,
- check_state_run, /* parity check */
+ check_state_run, /* xor parity check */
+ check_state_run_q, /* q-parity check */
+ check_state_run_pq, /* pq dual parity check */
check_state_check_result,
check_state_compute_run, /* parity repair */
check_state_compute_result,
@@ -216,7 +218,7 @@ struct stripe_head {
* @target - STRIPE_OP_COMPUTE_BLK target
*/
struct stripe_operations {
- int target;
+ int target, target2;
enum sum_check_flags zero_sum_result;
} ops;
struct r5dev {
@@ -299,7 +301,7 @@ struct r6_state {
#define STRIPE_OP_COMPUTE_BLK 1
#define STRIPE_OP_PREXOR 2
#define STRIPE_OP_BIODRAIN 3
-#define STRIPE_OP_POSTXOR 4
+#define STRIPE_OP_RECONSTRUCT 4
#define STRIPE_OP_CHECK 5
/*