diff options
Diffstat (limited to 'drivers/md')
31 files changed, 2009 insertions, 1472 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 41b3ae25b813..09c0c6e49ab5 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -124,6 +124,8 @@ config MD_RAID456 select MD_RAID6_PQ select ASYNC_MEMCPY select ASYNC_XOR + select ASYNC_PQ + select ASYNC_RAID6_RECOV ---help--- A RAID-5 set of N drives with a capacity of C MB per drive provides the capacity of C * (N - 1) MB, and protects against a failure @@ -152,6 +154,17 @@ config MD_RAID456 If unsure, say Y. +config MULTICORE_RAID456 + bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)" + depends on MD_RAID456 + depends on SMP + depends on EXPERIMENTAL + ---help--- + Enable the raid456 module to dispatch per-stripe raid operations to a + thread pool. + + If unsure, say N. + config MD_RAID6_PQ tristate diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index f8a9f7ab2cb8..56df1cee8fb3 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -986,6 +986,9 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) oldindex = index; oldpage = page; + bitmap->filemap[bitmap->file_pages++] = page; + bitmap->last_page_size = count; + if (outofdate) { /* * if bitmap is out of date, dirty the @@ -998,15 +1001,9 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) write_page(bitmap, page, 1); ret = -EIO; - if (bitmap->flags & BITMAP_WRITE_ERROR) { - /* release, page not in filemap yet */ - put_page(page); + if (bitmap->flags & BITMAP_WRITE_ERROR) goto err; - } } - - bitmap->filemap[bitmap->file_pages++] = page; - bitmap->last_page_size = count; } paddr = kmap_atomic(page, KM_USER0); if (bitmap->flags & BITMAP_HOSTENDIAN) @@ -1016,9 +1013,11 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) kunmap_atomic(paddr, KM_USER0); if (b) { /* if the disk bit is set, set the memory bit */ - bitmap_set_memory_bits(bitmap, i << CHUNK_BLOCK_SHIFT(bitmap), - ((i+1) << (CHUNK_BLOCK_SHIFT(bitmap)) >= start) - ); + int needed = ((sector_t)(i+1) << (CHUNK_BLOCK_SHIFT(bitmap)) + >= start); + bitmap_set_memory_bits(bitmap, + (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap), + needed); bit_cnt++; set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); } @@ -1098,14 +1097,12 @@ void bitmap_daemon_work(struct bitmap *bitmap) } bitmap->allclean = 1; + spin_lock_irqsave(&bitmap->lock, flags); for (j = 0; j < bitmap->chunks; j++) { bitmap_counter_t *bmc; - spin_lock_irqsave(&bitmap->lock, flags); - if (!bitmap->filemap) { + if (!bitmap->filemap) /* error or shutdown */ - spin_unlock_irqrestore(&bitmap->lock, flags); break; - } page = filemap_get_page(bitmap, j); @@ -1122,6 +1119,8 @@ void bitmap_daemon_work(struct bitmap *bitmap) write_page(bitmap, page, 0); bitmap->allclean = 0; } + spin_lock_irqsave(&bitmap->lock, flags); + j |= (PAGE_BITS - 1); continue; } @@ -1154,8 +1153,9 @@ void bitmap_daemon_work(struct bitmap *bitmap) spin_lock_irqsave(&bitmap->lock, flags); clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); } - bmc = bitmap_get_counter(bitmap, j << CHUNK_BLOCK_SHIFT(bitmap), - &blocks, 0); + bmc = bitmap_get_counter(bitmap, + (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), + &blocks, 0); if (bmc) { /* if (j < 100) printk("bitmap: j=%lu, *bmc = 0x%x\n", j, *bmc); @@ -1169,7 +1169,8 @@ void bitmap_daemon_work(struct bitmap *bitmap) } else if (*bmc == 1) { /* we can clear the bit */ *bmc = 0; - bitmap_count_page(bitmap, j << CHUNK_BLOCK_SHIFT(bitmap), + bitmap_count_page(bitmap, + (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), -1); /* clear the bit */ @@ -1180,9 +1181,10 @@ void bitmap_daemon_work(struct bitmap *bitmap) ext2_clear_bit(file_page_offset(j), paddr); kunmap_atomic(paddr, KM_USER0); } - } - spin_unlock_irqrestore(&bitmap->lock, flags); + } else + j |= PAGE_COUNTER_MASK; } + spin_unlock_irqrestore(&bitmap->lock, flags); /* now sync the final page */ if (lastpage != NULL) { @@ -1479,6 +1481,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) s += blocks; } bitmap->last_end_sync = jiffies; + sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed"); } static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) @@ -1513,7 +1516,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) unsigned long chunk; for (chunk = s; chunk <= e; chunk++) { - sector_t sec = chunk << CHUNK_BLOCK_SHIFT(bitmap); + sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap); bitmap_set_memory_bits(bitmap, sec, 1); bitmap_file_set_bit(bitmap, sec); } @@ -1589,7 +1592,7 @@ void bitmap_destroy(mddev_t *mddev) int bitmap_create(mddev_t *mddev) { struct bitmap *bitmap; - unsigned long blocks = mddev->resync_max_sectors; + sector_t blocks = mddev->resync_max_sectors; unsigned long chunks; unsigned long pages; struct file *file = mddev->bitmap_file; @@ -1631,8 +1634,8 @@ int bitmap_create(mddev_t *mddev) bitmap->chunkshift = ffz(~bitmap->chunksize); /* now that chunksize and chunkshift are set, we can use these macros */ - chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) / - CHUNK_BLOCK_RATIO(bitmap); + chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >> + CHUNK_BLOCK_SHIFT(bitmap); pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO; BUG_ON(!pages); diff --git a/drivers/md/dm-bio-list.h b/drivers/md/dm-bio-list.h deleted file mode 100644 index d4509be0fe67..000000000000 --- a/drivers/md/dm-bio-list.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (C) 2004 Red Hat UK Ltd. - * - * This file is released under the GPL. - */ - -#ifndef DM_BIO_LIST_H -#define DM_BIO_LIST_H - -#include <linux/bio.h> - -#ifdef CONFIG_BLOCK - -struct bio_list { - struct bio *head; - struct bio *tail; -}; - -static inline int bio_list_empty(const struct bio_list *bl) -{ - return bl->head == NULL; -} - -static inline void bio_list_init(struct bio_list *bl) -{ - bl->head = bl->tail = NULL; -} - -#define bio_list_for_each(bio, bl) \ - for (bio = (bl)->head; bio; bio = bio->bi_next) - -static inline unsigned bio_list_size(const struct bio_list *bl) -{ - unsigned sz = 0; - struct bio *bio; - - bio_list_for_each(bio, bl) - sz++; - - return sz; -} - -static inline void bio_list_add(struct bio_list *bl, struct bio *bio) -{ - bio->bi_next = NULL; - - if (bl->tail) - bl->tail->bi_next = bio; - else - bl->head = bio; - - bl->tail = bio; -} - -static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2) -{ - if (!bl2->head) - return; - - if (bl->tail) - bl->tail->bi_next = bl2->head; - else - bl->head = bl2->head; - - bl->tail = bl2->tail; -} - -static inline void bio_list_merge_head(struct bio_list *bl, - struct bio_list *bl2) -{ - if (!bl2->head) - return; - - if (bl->head) - bl2->tail->bi_next = bl->head; - else - bl->tail = bl2->tail; - - bl->head = bl2->head; -} - -static inline struct bio *bio_list_pop(struct bio_list *bl) -{ - struct bio *bio = bl->head; - - if (bio) { - bl->head = bl->head->bi_next; - if (!bl->head) - bl->tail = NULL; - - bio->bi_next = NULL; - } - - return bio; -} - -static inline struct bio *bio_list_get(struct bio_list *bl) -{ - struct bio *bio = bl->head; - - bl->head = bl->tail = NULL; - - return bio; -} - -#endif /* CONFIG_BLOCK */ -#endif diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h index d3ec217847d6..3a8cfa2645c7 100644 --- a/drivers/md/dm-bio-record.h +++ b/drivers/md/dm-bio-record.h @@ -16,30 +16,56 @@ * functions in this file help the target record and restore the * original bio state. */ + +struct dm_bio_vec_details { +#if PAGE_SIZE < 65536 + __u16 bv_len; + __u16 bv_offset; +#else + unsigned bv_len; + unsigned bv_offset; +#endif +}; + struct dm_bio_details { sector_t bi_sector; struct block_device *bi_bdev; unsigned int bi_size; unsigned short bi_idx; unsigned long bi_flags; + struct dm_bio_vec_details bi_io_vec[BIO_MAX_PAGES]; }; static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) { + unsigned i; + bd->bi_sector = bio->bi_sector; bd->bi_bdev = bio->bi_bdev; bd->bi_size = bio->bi_size; bd->bi_idx = bio->bi_idx; bd->bi_flags = bio->bi_flags; + + for (i = 0; i < bio->bi_vcnt; i++) { + bd->bi_io_vec[i].bv_len = bio->bi_io_vec[i].bv_len; + bd->bi_io_vec[i].bv_offset = bio->bi_io_vec[i].bv_offset; + } } static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) { + unsigned i; + bio->bi_sector = bd->bi_sector; bio->bi_bdev = bd->bi_bdev; bio->bi_size = bd->bi_size; bio->bi_idx = bd->bi_idx; bio->bi_flags = bd->bi_flags; + + for (i = 0; i < bio->bi_vcnt; i++) { + bio->bi_io_vec[i].bv_len = bd->bi_io_vec[i].bv_len; + bio->bi_io_vec[i].bv_offset = bd->bi_io_vec[i].bv_offset; + } } #endif diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 35bda49796fb..53394e863c74 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -60,6 +60,7 @@ struct dm_crypt_io { }; struct dm_crypt_request { + struct convert_context *ctx; struct scatterlist sg_in; struct scatterlist sg_out; }; @@ -335,6 +336,18 @@ static void crypt_convert_init(struct crypt_config *cc, init_completion(&ctx->restart); } +static struct dm_crypt_request *dmreq_of_req(struct crypt_config *cc, + struct ablkcipher_request *req) +{ + return (struct dm_crypt_request *)((char *)req + cc->dmreq_start); +} + +static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); +} + static int crypt_convert_block(struct crypt_config *cc, struct convert_context *ctx, struct ablkcipher_request *req) @@ -345,10 +358,11 @@ static int crypt_convert_block(struct crypt_config *cc, u8 *iv; int r = 0; - dmreq = (struct dm_crypt_request *)((char *)req + cc->dmreq_start); + dmreq = dmreq_of_req(cc, req); iv = (u8 *)ALIGN((unsigned long)(dmreq + 1), crypto_ablkcipher_alignmask(cc->tfm) + 1); + dmreq->ctx = ctx; sg_init_table(&dmreq->sg_in, 1); sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, bv_in->bv_offset + ctx->offset_in); @@ -395,8 +409,9 @@ static void crypt_alloc_req(struct crypt_config *cc, cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); ablkcipher_request_set_tfm(cc->req, cc->tfm); ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG | - CRYPTO_TFM_REQ_MAY_SLEEP, - kcryptd_async_done, ctx); + CRYPTO_TFM_REQ_MAY_SLEEP, + kcryptd_async_done, + dmreq_of_req(cc, cc->req)); } /* @@ -553,19 +568,22 @@ static void crypt_inc_pending(struct dm_crypt_io *io) static void crypt_dec_pending(struct dm_crypt_io *io) { struct crypt_config *cc = io->target->private; + struct bio *base_bio = io->base_bio; + struct dm_crypt_io *base_io = io->base_io; + int error = io->error; if (!atomic_dec_and_test(&io->pending)) return; - if (likely(!io->base_io)) - bio_endio(io->base_bio, io->error); + mempool_free(io, cc->io_pool); + + if (likely(!base_io)) + bio_endio(base_bio, error); else { - if (io->error && !io->base_io->error) - io->base_io->error = io->error; - crypt_dec_pending(io->base_io); + if (error && !base_io->error) + base_io->error = error; + crypt_dec_pending(base_io); } - - mempool_free(io, cc->io_pool); } /* @@ -821,7 +839,8 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) static void kcryptd_async_done(struct crypto_async_request *async_req, int error) { - struct convert_context *ctx = async_req->data; + struct dm_crypt_request *dmreq = async_req->data; + struct convert_context *ctx = dmreq->ctx; struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); struct crypt_config *cc = io->target->private; @@ -830,7 +849,7 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, return; } - mempool_free(ablkcipher_request_cast(async_req), cc->req_pool); + mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); if (!atomic_dec_and_test(&ctx->pending)) return; @@ -1137,8 +1156,7 @@ bad_ivmode: crypto_free_ablkcipher(tfm); bad_cipher: /* Must zero key material before freeing */ - memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); - kfree(cc); + kzfree(cc); return -EINVAL; } @@ -1164,8 +1182,7 @@ static void crypt_dtr(struct dm_target *ti) dm_put_device(ti, cc->dev); /* Must zero key material before freeing */ - memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); - kfree(cc); + kzfree(cc); } static int crypt_map(struct dm_target *ti, struct bio *bio, diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 59ee1b015d2d..559dbb52bc85 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -15,8 +15,6 @@ #include <linux/device-mapper.h> -#include "dm-bio-list.h" - #define DM_MSG_PREFIX "delay" struct delay_c { diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index dccbfb0e010f..a2e26c242141 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -7,6 +7,7 @@ #include "dm-exception-store.h" +#include <linux/ctype.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/vmalloc.h> @@ -14,6 +15,257 @@ #define DM_MSG_PREFIX "snapshot exception stores" +static LIST_HEAD(_exception_store_types); +static DEFINE_SPINLOCK(_lock); + +static struct dm_exception_store_type *__find_exception_store_type(const char *name) +{ + struct dm_exception_store_type *type; + + list_for_each_entry(type, &_exception_store_types, list) + if (!strcmp(name, type->name)) + return type; + + return NULL; +} + +static struct dm_exception_store_type *_get_exception_store_type(const char *name) +{ + struct dm_exception_store_type *type; + + spin_lock(&_lock); + + type = __find_exception_store_type(name); + + if (type && !try_module_get(type->module)) + type = NULL; + + spin_unlock(&_lock); + + return type; +} + +/* + * get_type + * @type_name + * + * Attempt to retrieve the dm_exception_store_type by name. If not already + * available, attempt to load the appropriate module. + * + * Exstore modules are named "dm-exstore-" followed by the 'type_name'. + * Modules may contain multiple types. + * This function will first try the module "dm-exstore-<type_name>", + * then truncate 'type_name' on the last '-' and try again. + * + * For example, if type_name was "clustered-shared", it would search + * 'dm-exstore-clustered-shared' then 'dm-exstore-clustered'. + * + * 'dm-exception-store-<type_name>' is too long of a name in my + * opinion, which is why I've chosen to have the files + * containing exception store implementations be 'dm-exstore-<type_name>'. + * If you want your module to be autoloaded, you will follow this + * naming convention. + * + * Returns: dm_exception_store_type* on success, NULL on failure + */ +static struct dm_exception_store_type *get_type(const char *type_name) +{ + char *p, *type_name_dup; + struct dm_exception_store_type *type; + + type = _get_exception_store_type(type_name); + if (type) + return type; + + type_name_dup = kstrdup(type_name, GFP_KERNEL); + if (!type_name_dup) { + DMERR("No memory left to attempt load for \"%s\"", type_name); + return NULL; + } + + while (request_module("dm-exstore-%s", type_name_dup) || + !(type = _get_exception_store_type(type_name))) { + p = strrchr(type_name_dup, '-'); + if (!p) + break; + p[0] = '\0'; + } + + if (!type) + DMWARN("Module for exstore type \"%s\" not found.", type_name); + + kfree(type_name_dup); + + return type; +} + +static void put_type(struct dm_exception_store_type *type) +{ + spin_lock(&_lock); + module_put(type->module); + spin_unlock(&_lock); +} + +int dm_exception_store_type_register(struct dm_exception_store_type *type) +{ + int r = 0; + + spin_lock(&_lock); + if (!__find_exception_store_type(type->name)) + list_add(&type->list, &_exception_store_types); + else + r = -EEXIST; + spin_unlock(&_lock); + + return r; +} +EXPORT_SYMBOL(dm_exception_store_type_register); + +int dm_exception_store_type_unregister(struct dm_exception_store_type *type) +{ + spin_lock(&_lock); + + if (!__find_exception_store_type(type->name)) { + spin_unlock(&_lock); + return -EINVAL; + } + + list_del(&type->list); + + spin_unlock(&_lock); + + return 0; +} +EXPORT_SYMBOL(dm_exception_store_type_unregister); + +/* + * Round a number up to the nearest 'size' boundary. size must + * be a power of 2. + */ +static ulong round_up(ulong n, ulong size) +{ + size--; + return (n + size) & ~size; +} + +static int set_chunk_size(struct dm_exception_store *store, + const char *chunk_size_arg, char **error) +{ + unsigned long chunk_size_ulong; + char *value; + + chunk_size_ulong = simple_strtoul(chunk_size_arg, &value, 10); + if (*chunk_size_arg == '\0' || *value != '\0') { + *error = "Invalid chunk size"; + return -EINVAL; + } + + if (!chunk_size_ulong) { + store->chunk_size = store->chunk_mask = store->chunk_shift = 0; + return 0; + } + + /* + * Chunk size must be multiple of page size. Silently + * round up if it's not. + */ + chunk_size_ulong = round_up(chunk_size_ulong, PAGE_SIZE >> 9); + + /* Check chunk_size is a power of 2 */ + if (!is_power_of_2(chunk_size_ulong)) { + *error = "Chunk size is not a power of 2"; + return -EINVAL; + } + + /* Validate the chunk size against the device block size */ + if (chunk_size_ulong % (bdev_hardsect_size(store->cow->bdev) >> 9)) { + *error = "Chunk size is not a multiple of device blocksize"; + return -EINVAL; + } + + store->chunk_size = chunk_size_ulong; + store->chunk_mask = chunk_size_ulong - 1; + store->chunk_shift = ffs(chunk_size_ulong) - 1; + + return 0; +} + +int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, + unsigned *args_used, + struct dm_exception_store **store) +{ + int r = 0; + struct dm_exception_store_type *type; + struct dm_exception_store *tmp_store; + char persistent; + + if (argc < 3) { + ti->error = "Insufficient exception store arguments"; + return -EINVAL; + } + + tmp_store = kmalloc(sizeof(*tmp_store), GFP_KERNEL); + if (!tmp_store) { + ti->error = "Exception store allocation failed"; + return -ENOMEM; + } + + persistent = toupper(*argv[1]); + if (persistent != 'P' && persistent != 'N') { + ti->error = "Persistent flag is not P or N"; + return -EINVAL; + } + + type = get_type(argv[1]); + if (!type) { + ti->error = "Exception store type not recognised"; + r = -EINVAL; + goto bad_type; + } + + tmp_store->type = type; + tmp_store->ti = ti; + + r = dm_get_device(ti, argv[0], 0, 0, + FMODE_READ | FMODE_WRITE, &tmp_store->cow); + if (r) { + ti->error = "Cannot get COW device"; + goto bad_cow; + } + + r = set_chunk_size(tmp_store, argv[2], &ti->error); + if (r) + goto bad_cow; + + r = type->ctr(tmp_store, 0, NULL); + if (r) { + ti->error = "Exception store type constructor failed"; + goto bad_ctr; + } + + *args_used = 3; + *store = tmp_store; + return 0; + +bad_ctr: + dm_put_device(ti, tmp_store->cow); +bad_cow: + put_type(type); +bad_type: + kfree(tmp_store); + return r; +} +EXPORT_SYMBOL(dm_exception_store_create); + +void dm_exception_store_destroy(struct dm_exception_store *store) +{ + store->type->dtr(store); + dm_put_device(store->ti, store->cow); + put_type(store->type); + kfree(store); +} +EXPORT_SYMBOL(dm_exception_store_destroy); + int dm_exception_store_init(void) { int r; diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index bb9f33d5daa2..0a2e6e7f67b3 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -37,11 +37,18 @@ struct dm_snap_exception { * Abstraction to handle the meta/layout of exception stores (the * COW device). */ -struct dm_exception_store { +struct dm_exception_store; +struct dm_exception_store_type { + const char *name; + struct module *module; + + int (*ctr) (struct dm_exception_store *store, + unsigned argc, char **argv); + /* * Destroys this object when you've finished with it. */ - void (*destroy) (struct dm_exception_store *store); + void (*dtr) (struct dm_exception_store *store); /* * The target shouldn't read the COW device until this is @@ -72,8 +79,9 @@ struct dm_exception_store { */ void (*drop_snapshot) (struct dm_exception_store *store); - int (*status) (struct dm_exception_store *store, status_type_t status, - char *result, unsigned int maxlen); + unsigned (*status) (struct dm_exception_store *store, + status_type_t status, char *result, + unsigned maxlen); /* * Return how full the snapshot is. @@ -82,7 +90,21 @@ struct dm_exception_store { sector_t *numerator, sector_t *denominator); - struct dm_snapshot *snap; + /* For internal device-mapper use only. */ + struct list_head list; +}; + +struct dm_exception_store { + struct dm_exception_store_type *type; + struct dm_target *ti; + + struct dm_dev *cow; + + /* Size of data blocks saved - must be a power of 2 */ + chunk_t chunk_size; + chunk_t chunk_mask; + chunk_t chunk_shift; + void *context; }; @@ -129,6 +151,28 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) # endif +/* + * Return the number of sectors in the device. + */ +static inline sector_t get_dev_size(struct block_device *bdev) +{ + return bdev->bd_inode->i_size >> SECTOR_SHIFT; +} + +static inline chunk_t sector_to_chunk(struct dm_exception_store *store, + sector_t sector) +{ + return (sector & ~store->chunk_mask) >> store->chunk_shift; +} + +int dm_exception_store_type_register(struct dm_exception_store_type *type); +int dm_exception_store_type_unregister(struct dm_exception_store_type *type); + +int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, + unsigned *args_used, + struct dm_exception_store **store); +void dm_exception_store_destroy(struct dm_exception_store *store); + int dm_exception_store_init(void); void dm_exception_store_exit(void); @@ -141,8 +185,4 @@ void dm_persistent_snapshot_exit(void); int dm_transient_snapshot_init(void); void dm_transient_snapshot_exit(void); -int dm_create_persistent(struct dm_exception_store *store); - -int dm_create_transient(struct dm_exception_store *store); - #endif /* _LINUX_DM_EXCEPTION_STORE */ diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index f14813be4eff..e73aabd61cd7 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -292,6 +292,8 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, (PAGE_SIZE >> SECTOR_SHIFT)); num_bvecs = 1 + min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs); + if (unlikely(num_bvecs > BIO_MAX_PAGES)) + num_bvecs = BIO_MAX_PAGES; bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); bio->bi_sector = where->sector + (where->count - remaining); bio->bi_bdev = where->bdev; @@ -368,16 +370,13 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, while (1) { set_current_state(TASK_UNINTERRUPTIBLE); - if (!atomic_read(&io.count) || signal_pending(current)) + if (!atomic_read(&io.count)) break; io_schedule(); } set_current_state(TASK_RUNNING); - if (atomic_read(&io.count)) - return -EINTR; - if (error_bits) *error_bits = io.error_bits; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 54d0588fc1f6..823ceba6efa8 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -704,7 +704,8 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size) char *new_name = (char *) param + param->data_start; if (new_name < param->data || - invalid_str(new_name, (void *) param + param_size)) { + invalid_str(new_name, (void *) param + param_size) || + strlen(new_name) > DM_NAME_LEN - 1) { DMWARN("Invalid new logical volume name supplied."); return -EINVAL; } @@ -1046,6 +1047,19 @@ static int populate_table(struct dm_table *table, return dm_table_complete(table); } +static int table_prealloc_integrity(struct dm_table *t, + struct mapped_device *md) +{ + struct list_head *devices = dm_table_get_devices(t); + struct dm_dev_internal *dd; + + list_for_each_entry(dd, devices, list) + if (bdev_get_integrity(dd->dm_dev.bdev)) + return blk_integrity_register(dm_disk(md), NULL); + + return 0; +} + static int table_load(struct dm_ioctl *param, size_t param_size) { int r; @@ -1063,7 +1077,15 @@ static int table_load(struct dm_ioctl *param, size_t param_size) r = populate_table(t, param, param_size); if (r) { - dm_table_put(t); + dm_table_destroy(t); + goto out; + } + + r = table_prealloc_integrity(t, md); + if (r) { + DMERR("%s: could not register integrity profile.", + dm_device_name(md)); + dm_table_destroy(t); goto out; } @@ -1071,7 +1093,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size) hc = dm_get_mdptr(md); if (!hc || hc->md != md) { DMWARN("device has been removed from the dev hash table."); - dm_table_put(t); + dm_table_destroy(t); up_write(&_hash_lock); r = -ENXIO; goto out; diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 0a225da21272..3e3fc06cb861 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -297,7 +297,8 @@ static int run_complete_job(struct kcopyd_job *job) dm_kcopyd_notify_fn fn = job->fn; struct dm_kcopyd_client *kc = job->kc; - kcopyd_put_pages(kc, job->pages); + if (job->pages) + kcopyd_put_pages(kc, job->pages); mempool_free(job, kc->job_pool); fn(read_err, write_err, context); @@ -461,6 +462,7 @@ static void segment_complete(int read_err, unsigned long write_err, sector_t progress = 0; sector_t count = 0; struct kcopyd_job *job = (struct kcopyd_job *) context; + struct dm_kcopyd_client *kc = job->kc; mutex_lock(&job->lock); @@ -490,7 +492,7 @@ static void segment_complete(int read_err, unsigned long write_err, if (count) { int i; - struct kcopyd_job *sub_job = mempool_alloc(job->kc->job_pool, + struct kcopyd_job *sub_job = mempool_alloc(kc->job_pool, GFP_NOIO); *sub_job = *job; @@ -509,13 +511,16 @@ static void segment_complete(int read_err, unsigned long write_err, } else if (atomic_dec_and_test(&job->sub_jobs)) { /* - * To avoid a race we must keep the job around - * until after the notify function has completed. - * Otherwise the client may try and stop the job - * after we've completed. + * Queue the completion callback to the kcopyd thread. + * + * Some callers assume that all the completions are called + * from a single thread and don't race with each other. + * + * We must not call the callback directly here because this + * code may not be executing in the thread. */ - job->fn(read_err, write_err, job->context); - mempool_free(job, job->kc->job_pool); + push(&kc->complete_jobs, job); + wake(kc); } } @@ -528,6 +533,8 @@ static void split_job(struct kcopyd_job *job) { int i; + atomic_inc(&job->kc->nr_jobs); + atomic_set(&job->sub_jobs, SPLIT_COUNT); for (i = 0; i < SPLIT_COUNT; i++) segment_complete(0, 0u, job); diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index bfa107f59d96..79fb53e51c70 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -142,7 +142,6 @@ static struct target_type linear_target = { .status = linear_status, .ioctl = linear_ioctl, .merge = linear_merge, - .features = DM_TARGET_SUPPORTS_BARRIERS, }; int __init dm_linear_init(void) diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 737961f275c1..be233bc4d917 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -16,40 +16,29 @@ #define DM_MSG_PREFIX "dirty region log" -struct dm_dirty_log_internal { - struct dm_dirty_log_type *type; - - struct list_head list; - long use; -}; - static LIST_HEAD(_log_types); static DEFINE_SPINLOCK(_lock); -static struct dm_dirty_log_internal *__find_dirty_log_type(const char *name) +static struct dm_dirty_log_type *__find_dirty_log_type(const char *name) { - struct dm_dirty_log_internal *log_type; + struct dm_dirty_log_type *log_type; list_for_each_entry(log_type, &_log_types, list) - if (!strcmp(name, log_type->type->name)) + if (!strcmp(name, log_type->name)) return log_type; return NULL; } -static struct dm_dirty_log_internal *_get_dirty_log_type(const char *name) +static struct dm_dirty_log_type *_get_dirty_log_type(const char *name) { - struct dm_dirty_log_internal *log_type; + struct dm_dirty_log_type *log_type; spin_lock(&_lock); log_type = __find_dirty_log_type(name); - if (log_type) { - if (!log_type->use && !try_module_get(log_type->type->module)) - log_type = NULL; - else - log_type->use++; - } + if (log_type && !try_module_get(log_type->module)) + log_type = NULL; spin_unlock(&_lock); @@ -76,14 +65,14 @@ static struct dm_dirty_log_internal *_get_dirty_log_type(const char *name) static struct dm_dirty_log_type *get_type(const char *type_name) { char *p, *type_name_dup; - struct dm_dirty_log_internal *log_type; + struct dm_dirty_log_type *log_type; if (!type_name) return NULL; log_type = _get_dirty_log_type(type_name); if (log_type) - return log_type->type; + return log_type; type_name_dup = kstrdup(type_name, GFP_KERNEL); if (!type_name_dup) { @@ -105,56 +94,33 @@ static struct dm_dirty_log_type *get_type(const char *type_name) kfree(type_name_dup); - return log_type ? log_type->type : NULL; + return log_type; } static void put_type(struct dm_dirty_log_type *type) { - struct dm_dirty_log_internal *log_type; - if (!type) return; spin_lock(&_lock); - log_type = __find_dirty_log_type(type->name); - if (!log_type) + if (!__find_dirty_log_type(type->name)) goto out; - if (!--log_type->use) - module_put(type->module); - - BUG_ON(log_type->use < 0); + module_put(type->module); out: spin_unlock(&_lock); } -static struct dm_dirty_log_internal *_alloc_dirty_log_type(struct dm_dirty_log_type *type) -{ - struct dm_dirty_log_internal *log_type = kzalloc(sizeof(*log_type), - GFP_KERNEL); - - if (log_type) - log_type->type = type; - - return log_type; -} - int dm_dirty_log_type_register(struct dm_dirty_log_type *type) { - struct dm_dirty_log_internal *log_type = _alloc_dirty_log_type(type); int r = 0; - if (!log_type) - return -ENOMEM; - spin_lock(&_lock); if (!__find_dirty_log_type(type->name)) - list_add(&log_type->list, &_log_types); - else { - kfree(log_type); + list_add(&type->list, &_log_types); + else r = -EEXIST; - } spin_unlock(&_lock); return r; @@ -163,25 +129,16 @@ EXPORT_SYMBOL(dm_dirty_log_type_register); int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type) { - struct dm_dirty_log_internal *log_type; - spin_lock(&_lock); - log_type = __find_dirty_log_type(type->name); - if (!log_type) { + if (!__find_dirty_log_type(type->name)) { spin_unlock(&_lock); return -EINVAL; } - if (log_type->use) { - spin_unlock(&_lock); - return -ETXTBSY; - } - - list_del(&log_type->list); + list_del(&type->list); spin_unlock(&_lock); - kfree(log_type); return 0; } diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 095f77bf9681..6a386ab4f7eb 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -8,7 +8,6 @@ #include <linux/device-mapper.h> #include "dm-path-selector.h" -#include "dm-bio-list.h" #include "dm-bio-record.h" #include "dm-uevent.h" diff --git a/drivers/md/dm-path-selector.c b/drivers/md/dm-path-selector.c index 96ea226155b1..42c04f04a0c4 100644 --- a/drivers/md/dm-path-selector.c +++ b/drivers/md/dm-path-selector.c @@ -17,9 +17,7 @@ struct ps_internal { struct path_selector_type pst; - struct list_head list; - long use; }; #define pst_to_psi(__pst) container_of((__pst), struct ps_internal, pst) @@ -45,12 +43,8 @@ static struct ps_internal *get_path_selector(const char *name) down_read(&_ps_lock); psi = __find_path_selector_type(name); - if (psi) { - if ((psi->use == 0) && !try_module_get(psi->pst.module)) - psi = NULL; - else - psi->use++; - } + if (psi && !try_module_get(psi->pst.module)) + psi = NULL; up_read(&_ps_lock); return psi; @@ -84,11 +78,7 @@ void dm_put_path_selector(struct path_selector_type *pst) if (!psi) goto out; - if (--psi->use == 0) - module_put(psi->pst.module); - - BUG_ON(psi->use < 0); - + module_put(psi->pst.module); out: up_read(&_ps_lock); } @@ -136,11 +126,6 @@ int dm_unregister_path_selector(struct path_selector_type *pst) return -EINVAL; } - if (psi->use) { - up_write(&_ps_lock); - return -ETXTBSY; - } - list_del(&psi->list); up_write(&_ps_lock); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 4d6bc101962e..076fbb4e967a 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -5,7 +5,6 @@ * This file is released under the GPL. */ -#include "dm-bio-list.h" #include "dm-bio-record.h" #include <linux/init.h> @@ -145,6 +144,8 @@ struct dm_raid1_read_record { struct dm_bio_details details; }; +static struct kmem_cache *_dm_raid1_read_record_cache; + /* * Every mirror should look like this one. */ @@ -586,6 +587,9 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) int state; struct bio *bio; struct bio_list sync, nosync, recover, *this_list = NULL; + struct bio_list requeue; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + region_t region; if (!writes->head) return; @@ -596,10 +600,18 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) bio_list_init(&sync); bio_list_init(&nosync); bio_list_init(&recover); + bio_list_init(&requeue); while ((bio = bio_list_pop(writes))) { - state = dm_rh_get_state(ms->rh, - dm_rh_bio_to_region(ms->rh, bio), 1); + region = dm_rh_bio_to_region(ms->rh, bio); + + if (log->type->is_remote_recovering && + log->type->is_remote_recovering(log, region)) { + bio_list_add(&requeue, bio); + continue; + } + + state = dm_rh_get_state(ms->rh, region, 1); switch (state) { case DM_RH_CLEAN: case DM_RH_DIRTY: @@ -619,6 +631,16 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) } /* + * Add bios that are delayed due to remote recovery + * back on to the write queue + */ + if (unlikely(requeue.head)) { + spin_lock_irq(&ms->lock); + bio_list_merge(&ms->writes, &requeue); + spin_unlock_irq(&ms->lock); + } + + /* * Increment the pending counts for any regions that will * be written to (writes to recover regions are going to * be delayed). @@ -764,9 +786,9 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, atomic_set(&ms->suspend, 0); atomic_set(&ms->default_mirror, DEFAULT_MIRROR); - len = sizeof(struct dm_raid1_read_record); - ms->read_record_pool = mempool_create_kmalloc_pool(MIN_READ_RECORDS, - len); + ms->read_record_pool = mempool_create_slab_pool(MIN_READ_RECORDS, + _dm_raid1_read_record_cache); + if (!ms->read_record_pool) { ti->error = "Error creating mirror read_record_pool"; kfree(ms); @@ -1279,16 +1301,31 @@ static int __init dm_mirror_init(void) { int r; + _dm_raid1_read_record_cache = KMEM_CACHE(dm_raid1_read_record, 0); + if (!_dm_raid1_read_record_cache) { + DMERR("Can't allocate dm_raid1_read_record cache"); + r = -ENOMEM; + goto bad_cache; + } + r = dm_register_target(&mirror_target); - if (r < 0) + if (r < 0) { DMERR("Failed to register mirror target"); + goto bad_target; + } + + return 0; +bad_target: + kmem_cache_destroy(_dm_raid1_read_record_cache); +bad_cache: return r; } static void __exit dm_mirror_exit(void) { dm_unregister_target(&mirror_target); + kmem_cache_destroy(_dm_raid1_read_record_cache); } /* Module hooks */ diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 59f8d9df9e1a..7b899be0b087 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -14,7 +14,6 @@ #include <linux/vmalloc.h> #include "dm.h" -#include "dm-bio-list.h" #define DM_MSG_PREFIX "region hash" diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 936b34e0959f..e75c6dd76a9a 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -6,7 +6,6 @@ */ #include "dm-exception-store.h" -#include "dm-snap.h" #include <linux/mm.h> #include <linux/pagemap.h> @@ -89,7 +88,7 @@ struct commit_callback { * The top level structure for a persistent exception store. */ struct pstore { - struct dm_snapshot *snap; /* up pointer to my snapshot */ + struct dm_exception_store *store; int version; int valid; uint32_t exceptions_per_area; @@ -141,7 +140,7 @@ static int alloc_area(struct pstore *ps) int r = -ENOMEM; size_t len; - len = ps->snap->chunk_size << SECTOR_SHIFT; + len = ps->store->chunk_size << SECTOR_SHIFT; /* * Allocate the chunk_size block of memory that will hold @@ -163,9 +162,12 @@ static int alloc_area(struct pstore *ps) static void free_area(struct pstore *ps) { - vfree(ps->area); + if (ps->area) + vfree(ps->area); ps->area = NULL; - vfree(ps->zero_area); + + if (ps->zero_area) + vfree(ps->zero_area); ps->zero_area = NULL; } @@ -189,9 +191,9 @@ static void do_metadata(struct work_struct *work) static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata) { struct dm_io_region where = { - .bdev = ps->snap->cow->bdev, - .sector = ps->snap->chunk_size * chunk, - .count = ps->snap->chunk_size, + .bdev = ps->store->cow->bdev, + .sector = ps->store->chunk_size * chunk, + .count = ps->store->chunk_size, }; struct dm_io_request io_req = { .bi_rw = rw, @@ -247,15 +249,15 @@ static int area_io(struct pstore *ps, int rw) static void zero_memory_area(struct pstore *ps) { - memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT); + memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT); } static int zero_disk_area(struct pstore *ps, chunk_t area) { struct dm_io_region where = { - .bdev = ps->snap->cow->bdev, - .sector = ps->snap->chunk_size * area_location(ps, area), - .count = ps->snap->chunk_size, + .bdev = ps->store->cow->bdev, + .sector = ps->store->chunk_size * area_location(ps, area), + .count = ps->store->chunk_size, }; struct dm_io_request io_req = { .bi_rw = WRITE, @@ -278,15 +280,15 @@ static int read_header(struct pstore *ps, int *new_snapshot) /* * Use default chunk size (or hardsect_size, if larger) if none supplied */ - if (!ps->snap->chunk_size) { - ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, - bdev_hardsect_size(ps->snap->cow->bdev) >> 9); - ps->snap->chunk_mask = ps->snap->chunk_size - 1; - ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1; + if (!ps->store->chunk_size) { + ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, + bdev_hardsect_size(ps->store->cow->bdev) >> 9); + ps->store->chunk_mask = ps->store->chunk_size - 1; + ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; chunk_size_supplied = 0; } - ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap-> + ps->io_client = dm_io_client_create(sectors_to_pages(ps->store-> chunk_size)); if (IS_ERR(ps->io_client)) return PTR_ERR(ps->io_client); @@ -317,22 +319,22 @@ static int read_header(struct pstore *ps, int *new_snapshot) ps->version = le32_to_cpu(dh->version); chunk_size = le32_to_cpu(dh->chunk_size); - if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size) + if (!chunk_size_supplied || ps->store->chunk_size == chunk_size) return 0; DMWARN("chunk size %llu in device metadata overrides " "table chunk size of %llu.", (unsigned long long)chunk_size, - (unsigned long long)ps->snap->chunk_size); + (unsigned long long)ps->store->chunk_size); /* We had a bogus chunk_size. Fix stuff up. */ free_area(ps); - ps->snap->chunk_size = chunk_size; - ps->snap->chunk_mask = chunk_size - 1; - ps->snap->chunk_shift = ffs(chunk_size) - 1; + ps->store->chunk_size = chunk_size; + ps->store->chunk_mask = chunk_size - 1; + ps->store->chunk_shift = ffs(chunk_size) - 1; - r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size), + r = dm_io_client_resize(sectors_to_pages(ps->store->chunk_size), ps->io_client); if (r) return r; @@ -349,13 +351,13 @@ static int write_header(struct pstore *ps) { struct disk_header *dh; - memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT); + memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT); dh = (struct disk_header *) ps->area; dh->magic = cpu_to_le32(SNAP_MAGIC); dh->valid = cpu_to_le32(ps->valid); dh->version = cpu_to_le32(ps->version); - dh->chunk_size = cpu_to_le32(ps->snap->chunk_size); + dh->chunk_size = cpu_to_le32(ps->store->chunk_size); return chunk_io(ps, 0, WRITE, 1); } @@ -474,18 +476,25 @@ static struct pstore *get_info(struct dm_exception_store *store) static void persistent_fraction_full(struct dm_exception_store *store, sector_t *numerator, sector_t *denominator) { - *numerator = get_info(store)->next_free * store->snap->chunk_size; - *denominator = get_dev_size(store->snap->cow->bdev); + *numerator = get_info(store)->next_free * store->chunk_size; + *denominator = get_dev_size(store->cow->bdev); } -static void persistent_destroy(struct dm_exception_store *store) +static void persistent_dtr(struct dm_exception_store *store) { struct pstore *ps = get_info(store); destroy_workqueue(ps->metadata_wq); - dm_io_client_destroy(ps->io_client); - vfree(ps->callbacks); + + /* Created in read_header */ + if (ps->io_client) + dm_io_client_destroy(ps->io_client); free_area(ps); + + /* Allocated in persistent_read_metadata */ + if (ps->callbacks) + vfree(ps->callbacks); + kfree(ps); } @@ -507,7 +516,7 @@ static int persistent_read_metadata(struct dm_exception_store *store, /* * Now we know correct chunk_size, complete the initialisation. */ - ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) / + ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) / sizeof(struct disk_exception); ps->callbacks = dm_vcalloc(ps->exceptions_per_area, sizeof(*ps->callbacks)); @@ -564,10 +573,10 @@ static int persistent_prepare_exception(struct dm_exception_store *store, struct pstore *ps = get_info(store); uint32_t stride; chunk_t next_free; - sector_t size = get_dev_size(store->snap->cow->bdev); + sector_t size = get_dev_size(store->cow->bdev); /* Is there enough room ? */ - if (size < ((ps->next_free + 1) * store->snap->chunk_size)) + if (size < ((ps->next_free + 1) * store->chunk_size)) return -ENOSPC; e->new_chunk = ps->next_free; @@ -656,16 +665,17 @@ static void persistent_drop_snapshot(struct dm_exception_store *store) DMWARN("write header failed"); } -int dm_create_persistent(struct dm_exception_store *store) +static int persistent_ctr(struct dm_exception_store *store, + unsigned argc, char **argv) { struct pstore *ps; /* allocate the pstore */ - ps = kmalloc(sizeof(*ps), GFP_KERNEL); + ps = kzalloc(sizeof(*ps), GFP_KERNEL); if (!ps) return -ENOMEM; - ps->snap = store->snap; + ps->store = store; ps->valid = 1; ps->version = SNAPSHOT_DISK_VERSION; ps->area = NULL; @@ -683,22 +693,77 @@ int dm_create_persistent(struct dm_exception_store *store) return -ENOMEM; } - store->destroy = persistent_destroy; - store->read_metadata = persistent_read_metadata; - store->prepare_exception = persistent_prepare_exception; - store->commit_exception = persistent_commit_exception; - store->drop_snapshot = persistent_drop_snapshot; - store->fraction_full = persistent_fraction_full; store->context = ps; return 0; } +static unsigned persistent_status(struct dm_exception_store *store, + status_type_t status, char *result, + unsigned maxlen) +{ + unsigned sz = 0; + + switch (status) { + case STATUSTYPE_INFO: + break; + case STATUSTYPE_TABLE: + DMEMIT(" %s P %llu", store->cow->name, + (unsigned long long)store->chunk_size); + } + + return sz; +} + +static struct dm_exception_store_type _persistent_type = { + .name = "persistent", + .module = THIS_MODULE, + .ctr = persistent_ctr, + .dtr = persistent_dtr, + .read_metadata = persistent_read_metadata, + .prepare_exception = persistent_prepare_exception, + .commit_exception = persistent_commit_exception, + .drop_snapshot = persistent_drop_snapshot, + .fraction_full = persistent_fraction_full, + .status = persistent_status, +}; + +static struct dm_exception_store_type _persistent_compat_type = { + .name = "P", + .module = THIS_MODULE, + .ctr = persistent_ctr, + .dtr = persistent_dtr, + .read_metadata = persistent_read_metadata, + .prepare_exception = persistent_prepare_exception, + .commit_exception = persistent_commit_exception, + .drop_snapshot = persistent_drop_snapshot, + .fraction_full = persistent_fraction_full, + .status = persistent_status, +}; + int dm_persistent_snapshot_init(void) { - return 0; + int r; + + r = dm_exception_store_type_register(&_persistent_type); + if (r) { + DMERR("Unable to register persistent exception store type"); + return r; + } + + r = dm_exception_store_type_register(&_persistent_compat_type); + if (r) { + DMERR("Unable to register old-style persistent exception " + "store type"); + dm_exception_store_type_unregister(&_persistent_type); + return r; + } + + return r; } void dm_persistent_snapshot_exit(void) { + dm_exception_store_type_unregister(&_persistent_type); + dm_exception_store_type_unregister(&_persistent_compat_type); } diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c index 7f6e2e6dcb0d..cde5aa558e6d 100644 --- a/drivers/md/dm-snap-transient.c +++ b/drivers/md/dm-snap-transient.c @@ -6,7 +6,6 @@ */ #include "dm-exception-store.h" -#include "dm-snap.h" #include <linux/mm.h> #include <linux/pagemap.h> @@ -23,7 +22,7 @@ struct transient_c { sector_t next_free; }; -static void transient_destroy(struct dm_exception_store *store) +static void transient_dtr(struct dm_exception_store *store) { kfree(store->context); } @@ -39,14 +38,14 @@ static int transient_read_metadata(struct dm_exception_store *store, static int transient_prepare_exception(struct dm_exception_store *store, struct dm_snap_exception *e) { - struct transient_c *tc = (struct transient_c *) store->context; - sector_t size = get_dev_size(store->snap->cow->bdev); + struct transient_c *tc = store->context; + sector_t size = get_dev_size(store->cow->bdev); - if (size < (tc->next_free + store->snap->chunk_size)) + if (size < (tc->next_free + store->chunk_size)) return -1; - e->new_chunk = sector_to_chunk(store->snap, tc->next_free); - tc->next_free += store->snap->chunk_size; + e->new_chunk = sector_to_chunk(store, tc->next_free); + tc->next_free += store->chunk_size; return 0; } @@ -64,20 +63,14 @@ static void transient_fraction_full(struct dm_exception_store *store, sector_t *numerator, sector_t *denominator) { *numerator = ((struct transient_c *) store->context)->next_free; - *denominator = get_dev_size(store->snap->cow->bdev); + *denominator = get_dev_size(store->cow->bdev); } -int dm_create_transient(struct dm_exception_store *store) +static int transient_ctr(struct dm_exception_store *store, + unsigned argc, char **argv) { struct transient_c *tc; - store->destroy = transient_destroy; - store->read_metadata = transient_read_metadata; - store->prepare_exception = transient_prepare_exception; - store->commit_exception = transient_commit_exception; - store->drop_snapshot = NULL; - store->fraction_full = transient_fraction_full; - tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL); if (!tc) return -ENOMEM; @@ -88,11 +81,70 @@ int dm_create_transient(struct dm_exception_store *store) return 0; } +static unsigned transient_status(struct dm_exception_store *store, + status_type_t status, char *result, + unsigned maxlen) +{ + unsigned sz = 0; + + switch (status) { + case STATUSTYPE_INFO: + break; + case STATUSTYPE_TABLE: + DMEMIT(" %s N %llu", store->cow->name, + (unsigned long long)store->chunk_size); + } + + return sz; +} + +static struct dm_exception_store_type _transient_type = { + .name = "transient", + .module = THIS_MODULE, + .ctr = transient_ctr, + .dtr = transient_dtr, + .read_metadata = transient_read_metadata, + .prepare_exception = transient_prepare_exception, + .commit_exception = transient_commit_exception, + .fraction_full = transient_fraction_full, + .status = transient_status, +}; + +static struct dm_exception_store_type _transient_compat_type = { + .name = "N", + .module = THIS_MODULE, + .ctr = transient_ctr, + .dtr = transient_dtr, + .read_metadata = transient_read_metadata, + .prepare_exception = transient_prepare_exception, + .commit_exception = transient_commit_exception, + .fraction_full = transient_fraction_full, + .status = transient_status, +}; + int dm_transient_snapshot_init(void) { - return 0; + int r; + + r = dm_exception_store_type_register(&_transient_type); + if (r) { + DMWARN("Unable to register transient exception store type"); + return r; + } + + r = dm_exception_store_type_register(&_transient_compat_type); + if (r) { + DMWARN("Unable to register old-style transient " + "exception store type"); + dm_exception_store_type_unregister(&_transient_type); + return r; + } + + return r; } void dm_transient_snapshot_exit(void) { + dm_exception_store_type_unregister(&_transient_type); + dm_exception_store_type_unregister(&_transient_compat_type); } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 65ff82ff124e..d73f17fc7778 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -7,7 +7,6 @@ */ #include <linux/blkdev.h> -#include <linux/ctype.h> #include <linux/device-mapper.h> #include <linux/delay.h> #include <linux/fs.h> @@ -20,10 +19,9 @@ #include <linux/vmalloc.h> #include <linux/log2.h> #include <linux/dm-kcopyd.h> +#include <linux/workqueue.h> #include "dm-exception-store.h" -#include "dm-snap.h" -#include "dm-bio-list.h" #define DM_MSG_PREFIX "snapshots" @@ -47,9 +45,76 @@ */ #define MIN_IOS 256 +#define DM_TRACKED_CHUNK_HASH_SIZE 16 +#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ + (DM_TRACKED_CHUNK_HASH_SIZE - 1)) + +struct exception_table { + uint32_t hash_mask; + unsigned hash_shift; + struct list_head *table; +}; + +struct dm_snapshot { + struct rw_semaphore lock; + + struct dm_dev *origin; + + /* List of snapshots per Origin */ + struct list_head list; + + /* You can't use a snapshot if this is 0 (e.g. if full) */ + int valid; + + /* Origin writes don't trigger exceptions until this is set */ + int active; + + mempool_t *pending_pool; + + atomic_t pending_exceptions_count; + + struct exception_table pending; + struct exception_table complete; + + /* + * pe_lock protects all pending_exception operations and access + * as well as the snapshot_bios list. + */ + spinlock_t pe_lock; + + /* The on disk metadata handler */ + struct dm_exception_store *store; + + struct dm_kcopyd_client *kcopyd_client; + + /* Queue of snapshot writes for ksnapd to flush */ + struct bio_list queued_bios; + struct work_struct queued_bios_work; + + /* Chunks with outstanding reads */ + mempool_t *tracked_chunk_pool; + spinlock_t tracked_chunk_lock; + struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; +}; + static struct workqueue_struct *ksnapd; static void flush_queued_bios(struct work_struct *work); +static sector_t chunk_to_sector(struct dm_exception_store *store, + chunk_t chunk) +{ + return chunk << store->chunk_shift; +} + +static int bdev_equal(struct block_device *lhs, struct block_device *rhs) +{ + /* + * There is only ever one instance of a particular block + * device so we can compare pointers safely. + */ + return lhs == rhs; +} + struct dm_snap_pending_exception { struct dm_snap_exception e; @@ -476,11 +541,11 @@ static int init_hash_tables(struct dm_snapshot *s) * Calculate based on the size of the original volume or * the COW volume... */ - cow_dev_size = get_dev_size(s->cow->bdev); + cow_dev_size = get_dev_size(s->store->cow->bdev); origin_dev_size = get_dev_size(s->origin->bdev); max_buckets = calc_max_buckets(); - hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift; + hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift; hash_size = min(hash_size, max_buckets); hash_size = rounddown_pow_of_two(hash_size); @@ -505,58 +570,6 @@ static int init_hash_tables(struct dm_snapshot *s) } /* - * Round a number up to the nearest 'size' boundary. size must - * be a power of 2. - */ -static ulong round_up(ulong n, ulong size) -{ - size--; - return (n + size) & ~size; -} - -static int set_chunk_size(struct dm_snapshot *s, const char *chunk_size_arg, - char **error) -{ - unsigned long chunk_size; - char *value; - - chunk_size = simple_strtoul(chunk_size_arg, &value, 10); - if (*chunk_size_arg == '\0' || *value != '\0') { - *error = "Invalid chunk size"; - return -EINVAL; - } - - if (!chunk_size) { - s->chunk_size = s->chunk_mask = s->chunk_shift = 0; - return 0; - } - - /* - * Chunk size must be multiple of page size. Silently - * round up if it's not. - */ - chunk_size = round_up(chunk_size, PAGE_SIZE >> 9); - - /* Check chunk_size is a power of 2 */ - if (!is_power_of_2(chunk_size)) { - *error = "Chunk size is not a power of 2"; - return -EINVAL; - } - - /* Validate the chunk size against the device block size */ - if (chunk_size % (bdev_hardsect_size(s->cow->bdev) >> 9)) { - *error = "Chunk size is not a multiple of device blocksize"; - return -EINVAL; - } - - s->chunk_size = chunk_size; - s->chunk_mask = chunk_size - 1; - s->chunk_shift = ffs(chunk_size) - 1; - - return 0; -} - -/* * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> */ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) @@ -564,91 +577,68 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) struct dm_snapshot *s; int i; int r = -EINVAL; - char persistent; char *origin_path; - char *cow_path; + struct dm_exception_store *store; + unsigned args_used; if (argc != 4) { ti->error = "requires exactly 4 arguments"; r = -EINVAL; - goto bad1; + goto bad_args; } origin_path = argv[0]; - cow_path = argv[1]; - persistent = toupper(*argv[2]); + argv++; + argc--; - if (persistent != 'P' && persistent != 'N') { - ti->error = "Persistent flag is not P or N"; + r = dm_exception_store_create(ti, argc, argv, &args_used, &store); + if (r) { + ti->error = "Couldn't create exception store"; r = -EINVAL; - goto bad1; + goto bad_args; } + argv += args_used; + argc -= args_used; + s = kmalloc(sizeof(*s), GFP_KERNEL); - if (s == NULL) { + if (!s) { ti->error = "Cannot allocate snapshot context private " "structure"; r = -ENOMEM; - goto bad1; + goto bad_snap; } r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin); if (r) { ti->error = "Cannot get origin device"; - goto bad2; - } - - r = dm_get_device(ti, cow_path, 0, 0, - FMODE_READ | FMODE_WRITE, &s->cow); - if (r) { - dm_put_device(ti, s->origin); - ti->error = "Cannot get COW device"; - goto bad2; + goto bad_origin; } - r = set_chunk_size(s, argv[3], &ti->error); - if (r) - goto bad3; - - s->type = persistent; - + s->store = store; s->valid = 1; s->active = 0; atomic_set(&s->pending_exceptions_count, 0); init_rwsem(&s->lock); spin_lock_init(&s->pe_lock); - s->ti = ti; /* Allocate hash table for COW data */ if (init_hash_tables(s)) { ti->error = "Unable to allocate hash table space"; r = -ENOMEM; - goto bad3; - } - - s->store.snap = s; - - if (persistent == 'P') - r = dm_create_persistent(&s->store); - else - r = dm_create_transient(&s->store); - - if (r) { - ti->error = "Couldn't create exception store"; - r = -EINVAL; - goto bad4; + goto bad_hash_tables; } r = dm_kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client); if (r) { ti->error = "Could not create kcopyd client"; - goto bad5; + goto bad_kcopyd; } s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache); if (!s->pending_pool) { ti->error = "Could not allocate mempool for pending exceptions"; - goto bad6; + goto bad_pending_pool; } s->tracked_chunk_pool = mempool_create_slab_pool(MIN_IOS, @@ -665,7 +655,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) spin_lock_init(&s->tracked_chunk_lock); /* Metadata must only be loaded into one table at once */ - r = s->store.read_metadata(&s->store, dm_add_exception, (void *)s); + r = s->store->type->read_metadata(s->store, dm_add_exception, + (void *)s); if (r < 0) { ti->error = "Failed to read snapshot metadata"; goto bad_load_and_register; @@ -686,34 +677,33 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ti->private = s; - ti->split_io = s->chunk_size; + ti->split_io = s->store->chunk_size; return 0; - bad_load_and_register: +bad_load_and_register: mempool_destroy(s->tracked_chunk_pool); - bad_tracked_chunk_pool: +bad_tracked_chunk_pool: mempool_destroy(s->pending_pool); - bad6: +bad_pending_pool: dm_kcopyd_client_destroy(s->kcopyd_client); - bad5: - s->store.destroy(&s->store); - - bad4: +bad_kcopyd: exit_exception_table(&s->pending, pending_cache); exit_exception_table(&s->complete, exception_cache); - bad3: - dm_put_device(ti, s->cow); +bad_hash_tables: dm_put_device(ti, s->origin); - bad2: +bad_origin: kfree(s); - bad1: +bad_snap: + dm_exception_store_destroy(store); + +bad_args: return r; } @@ -724,8 +714,6 @@ static void __free_exceptions(struct dm_snapshot *s) exit_exception_table(&s->pending, pending_cache); exit_exception_table(&s->complete, exception_cache); - - s->store.destroy(&s->store); } static void snapshot_dtr(struct dm_target *ti) @@ -761,7 +749,8 @@ static void snapshot_dtr(struct dm_target *ti) mempool_destroy(s->pending_pool); dm_put_device(ti, s->origin); - dm_put_device(ti, s->cow); + + dm_exception_store_destroy(s->store); kfree(s); } @@ -820,12 +809,12 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err) else if (err == -ENOMEM) DMERR("Invalidating snapshot: Unable to allocate exception."); - if (s->store.drop_snapshot) - s->store.drop_snapshot(&s->store); + if (s->store->type->drop_snapshot) + s->store->type->drop_snapshot(s->store); s->valid = 0; - dm_table_event(s->ti->table); + dm_table_event(s->store->ti->table); } static void get_pending_exception(struct dm_snap_pending_exception *pe) @@ -943,8 +932,8 @@ static void copy_callback(int read_err, unsigned long write_err, void *context) else /* Update the metadata if we are persistent */ - s->store.commit_exception(&s->store, &pe->e, commit_callback, - pe); + s->store->type->commit_exception(s->store, &pe->e, + commit_callback, pe); } /* @@ -960,11 +949,11 @@ static void start_copy(struct dm_snap_pending_exception *pe) dev_size = get_dev_size(bdev); src.bdev = bdev; - src.sector = chunk_to_sector(s, pe->e.old_chunk); - src.count = min(s->chunk_size, dev_size - src.sector); + src.sector = chunk_to_sector(s->store, pe->e.old_chunk); + src.count = min(s->store->chunk_size, dev_size - src.sector); - dest.bdev = s->cow->bdev; - dest.sector = chunk_to_sector(s, pe->e.new_chunk); + dest.bdev = s->store->cow->bdev; + dest.sector = chunk_to_sector(s->store, pe->e.new_chunk); dest.count = src.count; /* Hand over to kcopyd */ @@ -972,6 +961,17 @@ static void start_copy(struct dm_snap_pending_exception *pe) &src, 1, &dest, 0, copy_callback, pe); } +static struct dm_snap_pending_exception * +__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) +{ + struct dm_snap_exception *e = lookup_exception(&s->pending, chunk); + + if (!e) + return NULL; + + return container_of(e, struct dm_snap_pending_exception, e); +} + /* * Looks to see if this snapshot already has a pending exception * for this chunk, otherwise it allocates a new one and inserts @@ -981,40 +981,15 @@ static void start_copy(struct dm_snap_pending_exception *pe) * this. */ static struct dm_snap_pending_exception * -__find_pending_exception(struct dm_snapshot *s, struct bio *bio) +__find_pending_exception(struct dm_snapshot *s, + struct dm_snap_pending_exception *pe, chunk_t chunk) { - struct dm_snap_exception *e; - struct dm_snap_pending_exception *pe; - chunk_t chunk = sector_to_chunk(s, bio->bi_sector); + struct dm_snap_pending_exception *pe2; - /* - * Is there a pending exception for this already ? - */ - e = lookup_exception(&s->pending, chunk); - if (e) { - /* cast the exception to a pending exception */ - pe = container_of(e, struct dm_snap_pending_exception, e); - goto out; - } - - /* - * Create a new pending exception, we don't want - * to hold the lock while we do this. - */ - up_write(&s->lock); - pe = alloc_pending_exception(s); - down_write(&s->lock); - - if (!s->valid) { - free_pending_exception(pe); - return NULL; - } - - e = lookup_exception(&s->pending, chunk); - if (e) { + pe2 = __lookup_pending_exception(s, chunk); + if (pe2) { free_pending_exception(pe); - pe = container_of(e, struct dm_snap_pending_exception, e); - goto out; + return pe2; } pe->e.old_chunk = chunk; @@ -1024,7 +999,7 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio) atomic_set(&pe->ref_count, 0); pe->started = 0; - if (s->store.prepare_exception(&s->store, &pe->e)) { + if (s->store->type->prepare_exception(s->store, &pe->e)) { free_pending_exception(pe); return NULL; } @@ -1032,17 +1007,18 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio) get_pending_exception(pe); insert_exception(&s->pending, &pe->e); - out: return pe; } static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e, struct bio *bio, chunk_t chunk) { - bio->bi_bdev = s->cow->bdev; - bio->bi_sector = chunk_to_sector(s, dm_chunk_number(e->new_chunk) + - (chunk - e->old_chunk)) + - (bio->bi_sector & s->chunk_mask); + bio->bi_bdev = s->store->cow->bdev; + bio->bi_sector = chunk_to_sector(s->store, + dm_chunk_number(e->new_chunk) + + (chunk - e->old_chunk)) + + (bio->bi_sector & + s->store->chunk_mask); } static int snapshot_map(struct dm_target *ti, struct bio *bio, @@ -1054,7 +1030,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, chunk_t chunk; struct dm_snap_pending_exception *pe = NULL; - chunk = sector_to_chunk(s, bio->bi_sector); + chunk = sector_to_chunk(s->store, bio->bi_sector); /* Full snapshots are not usable */ /* To get here the table must be live so s->active is always set. */ @@ -1083,11 +1059,31 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, * writeable. */ if (bio_rw(bio) == WRITE) { - pe = __find_pending_exception(s, bio); + pe = __lookup_pending_exception(s, chunk); if (!pe) { - __invalidate_snapshot(s, -ENOMEM); - r = -EIO; - goto out_unlock; + up_write(&s->lock); + pe = alloc_pending_exception(s); + down_write(&s->lock); + + if (!s->valid) { + free_pending_exception(pe); + r = -EIO; + goto out_unlock; + } + + e = lookup_exception(&s->complete, chunk); + if (e) { + free_pending_exception(pe); + remap_exception(s, e, bio, chunk); + goto out_unlock; + } + + pe = __find_pending_exception(s, pe, chunk); + if (!pe) { + __invalidate_snapshot(s, -ENOMEM); + r = -EIO; + goto out_unlock; + } } remap_exception(s, &pe->e, bio, chunk); @@ -1137,24 +1133,25 @@ static void snapshot_resume(struct dm_target *ti) static int snapshot_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) { + unsigned sz = 0; struct dm_snapshot *snap = ti->private; switch (type) { case STATUSTYPE_INFO: if (!snap->valid) - snprintf(result, maxlen, "Invalid"); + DMEMIT("Invalid"); else { - if (snap->store.fraction_full) { + if (snap->store->type->fraction_full) { sector_t numerator, denominator; - snap->store.fraction_full(&snap->store, - &numerator, - &denominator); - snprintf(result, maxlen, "%llu/%llu", - (unsigned long long)numerator, - (unsigned long long)denominator); + snap->store->type->fraction_full(snap->store, + &numerator, + &denominator); + DMEMIT("%llu/%llu", + (unsigned long long)numerator, + (unsigned long long)denominator); } else - snprintf(result, maxlen, "Unknown"); + DMEMIT("Unknown"); } break; @@ -1164,10 +1161,9 @@ static int snapshot_status(struct dm_target *ti, status_type_t type, * to make private copies if the output is to * make sense. */ - snprintf(result, maxlen, "%s %s %c %llu", - snap->origin->name, snap->cow->name, - snap->type, - (unsigned long long)snap->chunk_size); + DMEMIT("%s", snap->origin->name); + snap->store->type->status(snap->store, type, result + sz, + maxlen - sz); break; } @@ -1196,14 +1192,14 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) goto next_snapshot; /* Nothing to do if writing beyond end of snapshot */ - if (bio->bi_sector >= dm_table_get_size(snap->ti->table)) + if (bio->bi_sector >= dm_table_get_size(snap->store->ti->table)) goto next_snapshot; /* * Remember, different snapshots can have * different chunk sizes. */ - chunk = sector_to_chunk(snap, bio->bi_sector); + chunk = sector_to_chunk(snap->store, bio->bi_sector); /* * Check exception table to see if block @@ -1217,10 +1213,28 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) if (e) goto next_snapshot; - pe = __find_pending_exception(snap, bio); + pe = __lookup_pending_exception(snap, chunk); if (!pe) { - __invalidate_snapshot(snap, -ENOMEM); - goto next_snapshot; + up_write(&snap->lock); + pe = alloc_pending_exception(snap); + down_write(&snap->lock); + + if (!snap->valid) { + free_pending_exception(pe); + goto next_snapshot; + } + + e = lookup_exception(&snap->complete, chunk); + if (e) { + free_pending_exception(pe); + goto next_snapshot; + } + + pe = __find_pending_exception(snap, pe, chunk); + if (!pe) { + __invalidate_snapshot(snap, -ENOMEM); + goto next_snapshot; + } } if (!primary_pe) { @@ -1360,7 +1374,8 @@ static void origin_resume(struct dm_target *ti) o = __lookup_origin(dev->bdev); if (o) list_for_each_entry (snap, &o->snapshots, list) - chunk_size = min_not_zero(chunk_size, snap->chunk_size); + chunk_size = min_not_zero(chunk_size, + snap->store->chunk_size); up_read(&_origins_lock); ti->split_io = chunk_size; diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h deleted file mode 100644 index d9e62b43cf85..000000000000 --- a/drivers/md/dm-snap.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (C) 2001-2002 Sistina Software (UK) Limited. - * - * This file is released under the GPL. - */ - -#ifndef DM_SNAPSHOT_H -#define DM_SNAPSHOT_H - -#include <linux/device-mapper.h> -#include "dm-exception-store.h" -#include "dm-bio-list.h" -#include <linux/blkdev.h> -#include <linux/workqueue.h> - -struct exception_table { - uint32_t hash_mask; - unsigned hash_shift; - struct list_head *table; -}; - -#define DM_TRACKED_CHUNK_HASH_SIZE 16 -#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ - (DM_TRACKED_CHUNK_HASH_SIZE - 1)) - -struct dm_snapshot { - struct rw_semaphore lock; - struct dm_target *ti; - - struct dm_dev *origin; - struct dm_dev *cow; - - /* List of snapshots per Origin */ - struct list_head list; - - /* Size of data blocks saved - must be a power of 2 */ - chunk_t chunk_size; - chunk_t chunk_mask; - chunk_t chunk_shift; - - /* You can't use a snapshot if this is 0 (e.g. if full) */ - int valid; - - /* Origin writes don't trigger exceptions until this is set */ - int active; - - /* Used for display of table */ - char type; - - mempool_t *pending_pool; - - atomic_t pending_exceptions_count; - - struct exception_table pending; - struct exception_table complete; - - /* - * pe_lock protects all pending_exception operations and access - * as well as the snapshot_bios list. - */ - spinlock_t pe_lock; - - /* The on disk metadata handler */ - struct dm_exception_store store; - - struct dm_kcopyd_client *kcopyd_client; - - /* Queue of snapshot writes for ksnapd to flush */ - struct bio_list queued_bios; - struct work_struct queued_bios_work; - - /* Chunks with outstanding reads */ - mempool_t *tracked_chunk_pool; - spinlock_t tracked_chunk_lock; - struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; -}; - -/* - * Return the number of sectors in the device. - */ -static inline sector_t get_dev_size(struct block_device *bdev) -{ - return bdev->bd_inode->i_size >> SECTOR_SHIFT; -} - -static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector) -{ - return (sector & ~s->chunk_mask) >> s->chunk_shift; -} - -static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk) -{ - return chunk << s->chunk_shift; -} - -static inline int bdev_equal(struct block_device *lhs, struct block_device *rhs) -{ - /* - * There is only ever one instance of a particular block - * device so we can compare pointers safely. - */ - return lhs == rhs; -} - -#endif diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 2fd66c30f7f8..429b50b975d5 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -52,8 +52,6 @@ struct dm_table { sector_t *highs; struct dm_target *targets; - unsigned barriers_supported:1; - /* * Indicates the rw permissions for the new logical * device. This should be a combination of FMODE_READ @@ -243,7 +241,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode, INIT_LIST_HEAD(&t->devices); atomic_set(&t->holders, 0); - t->barriers_supported = 1; if (!num_targets) num_targets = KEYS_PER_NODE; @@ -399,28 +396,30 @@ static int check_device_area(struct dm_dev_internal *dd, sector_t start, } /* - * This upgrades the mode on an already open dm_dev. Being + * This upgrades the mode on an already open dm_dev, being * careful to leave things as they were if we fail to reopen the - * device. + * device and not to touch the existing bdev field in case + * it is accessed concurrently inside dm_table_any_congested(). */ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, struct mapped_device *md) { int r; - struct dm_dev_internal dd_copy; - dev_t dev = dd->dm_dev.bdev->bd_dev; + struct dm_dev_internal dd_new, dd_old; + + dd_new = dd_old = *dd; + + dd_new.dm_dev.mode |= new_mode; + dd_new.dm_dev.bdev = NULL; - dd_copy = *dd; + r = open_dev(&dd_new, dd->dm_dev.bdev->bd_dev, md); + if (r) + return r; dd->dm_dev.mode |= new_mode; - dd->dm_dev.bdev = NULL; - r = open_dev(dd, dev, md); - if (!r) - close_dev(&dd_copy, md); - else - *dd = dd_copy; + close_dev(&dd_old, md); - return r; + return 0; } /* @@ -749,10 +748,6 @@ int dm_table_add_target(struct dm_table *t, const char *type, /* FIXME: the plan is to combine high here and then have * the merge fn apply the target level restrictions. */ combine_restrictions_low(&t->limits, &tgt->limits); - - if (!(tgt->type->features & DM_TARGET_SUPPORTS_BARRIERS)) - t->barriers_supported = 0; - return 0; bad: @@ -797,12 +792,6 @@ int dm_table_complete(struct dm_table *t) check_for_valid_limits(&t->limits); - /* - * We only support barriers if there is exactly one underlying device. - */ - if (!list_is_singular(&t->devices)) - t->barriers_supported = 0; - /* how many indexes will the btree have ? */ leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); @@ -877,6 +866,45 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) return &t->targets[(KEYS_PER_NODE * n) + k]; } +/* + * Set the integrity profile for this device if all devices used have + * matching profiles. + */ +static void dm_table_set_integrity(struct dm_table *t) +{ + struct list_head *devices = dm_table_get_devices(t); + struct dm_dev_internal *prev = NULL, *dd = NULL; + + if (!blk_get_integrity(dm_disk(t->md))) + return; + + list_for_each_entry(dd, devices, list) { + if (prev && + blk_integrity_compare(prev->dm_dev.bdev->bd_disk, + dd->dm_dev.bdev->bd_disk) < 0) { + DMWARN("%s: integrity not set: %s and %s mismatch", + dm_device_name(t->md), + prev->dm_dev.bdev->bd_disk->disk_name, + dd->dm_dev.bdev->bd_disk->disk_name); + goto no_integrity; + } + prev = dd; + } + + if (!prev || !bdev_get_integrity(prev->dm_dev.bdev)) + goto no_integrity; + + blk_integrity_register(dm_disk(t->md), + bdev_get_integrity(prev->dm_dev.bdev)); + + return; + +no_integrity: + blk_integrity_register(dm_disk(t->md), NULL); + + return; +} + void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) { /* @@ -897,6 +925,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) else queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); + dm_table_set_integrity(t); } unsigned int dm_table_get_num_targets(struct dm_table *t) @@ -1017,12 +1046,6 @@ struct mapped_device *dm_table_get_md(struct dm_table *t) return t->md; } -int dm_table_barrier_ok(struct dm_table *t) -{ - return t->barriers_supported; -} -EXPORT_SYMBOL(dm_table_barrier_ok); - EXPORT_SYMBOL(dm_vcalloc); EXPORT_SYMBOL(dm_get_device); EXPORT_SYMBOL(dm_put_device); diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 7decf10006e4..04feccf2a997 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -14,45 +14,34 @@ #define DM_MSG_PREFIX "target" -struct tt_internal { - struct target_type tt; - - struct list_head list; - long use; -}; - static LIST_HEAD(_targets); static DECLARE_RWSEM(_lock); #define DM_MOD_NAME_SIZE 32 -static inline struct tt_internal *__find_target_type(const char *name) +static inline struct target_type *__find_target_type(const char *name) { - struct tt_internal *ti; + struct target_type *tt; - list_for_each_entry (ti, &_targets, list) - if (!strcmp(name, ti->tt.name)) - return ti; + list_for_each_entry(tt, &_targets, list) + if (!strcmp(name, tt->name)) + return tt; return NULL; } -static struct tt_internal *get_target_type(const char *name) +static struct target_type *get_target_type(const char *name) { - struct tt_internal *ti; + struct target_type *tt; down_read(&_lock); - ti = __find_target_type(name); - if (ti) { - if ((ti->use == 0) && !try_module_get(ti->tt.module)) - ti = NULL; - else - ti->use++; - } + tt = __find_target_type(name); + if (tt && !try_module_get(tt->module)) + tt = NULL; up_read(&_lock); - return ti; + return tt; } static void load_module(const char *name) @@ -62,92 +51,59 @@ static void load_module(const char *name) struct target_type *dm_get_target_type(const char *name) { - struct tt_internal *ti = get_target_type(name); + struct target_type *tt = get_target_type(name); - if (!ti) { + if (!tt) { load_module(name); - ti = get_target_type(name); + tt = get_target_type(name); } - return ti ? &ti->tt : NULL; + return tt; } -void dm_put_target_type(struct target_type *t) +void dm_put_target_type(struct target_type *tt) { - struct tt_internal *ti = (struct tt_internal *) t; - down_read(&_lock); - if (--ti->use == 0) - module_put(ti->tt.module); - - BUG_ON(ti->use < 0); + module_put(tt->module); up_read(&_lock); - - return; -} - -static struct tt_internal *alloc_target(struct target_type *t) -{ - struct tt_internal *ti = kzalloc(sizeof(*ti), GFP_KERNEL); - - if (ti) - ti->tt = *t; - - return ti; } - int dm_target_iterate(void (*iter_func)(struct target_type *tt, void *param), void *param) { - struct tt_internal *ti; + struct target_type *tt; down_read(&_lock); - list_for_each_entry (ti, &_targets, list) - iter_func(&ti->tt, param); + list_for_each_entry(tt, &_targets, list) + iter_func(tt, param); up_read(&_lock); return 0; } -int dm_register_target(struct target_type *t) +int dm_register_target(struct target_type *tt) { int rv = 0; - struct tt_internal *ti = alloc_target(t); - - if (!ti) - return -ENOMEM; down_write(&_lock); - if (__find_target_type(t->name)) + if (__find_target_type(tt->name)) rv = -EEXIST; else - list_add(&ti->list, &_targets); + list_add(&tt->list, &_targets); up_write(&_lock); - if (rv) - kfree(ti); return rv; } -void dm_unregister_target(struct target_type *t) +void dm_unregister_target(struct target_type *tt) { - struct tt_internal *ti; - down_write(&_lock); - if (!(ti = __find_target_type(t->name))) { - DMCRIT("Unregistering unrecognised target: %s", t->name); - BUG(); - } - - if (ti->use) { - DMCRIT("Attempt to unregister target still in use: %s", - t->name); + if (!__find_target_type(tt->name)) { + DMCRIT("Unregistering unrecognised target: %s", tt->name); BUG(); } - list_del(&ti->list); - kfree(ti); + list_del(&tt->list); up_write(&_lock); } @@ -156,17 +112,17 @@ void dm_unregister_target(struct target_type *t) * io-err: always fails an io, useful for bringing * up LVs that have holes in them. */ -static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args) +static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args) { return 0; } -static void io_err_dtr(struct dm_target *ti) +static void io_err_dtr(struct dm_target *tt) { /* empty */ } -static int io_err_map(struct dm_target *ti, struct bio *bio, +static int io_err_map(struct dm_target *tt, struct bio *bio, union map_info *map_context) { return -EIO; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 51ba1db4b3e7..424f7b048c30 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -6,7 +6,6 @@ */ #include "dm.h" -#include "dm-bio-list.h" #include "dm-uevent.h" #include <linux/init.h> @@ -89,29 +88,20 @@ union map_info *dm_get_mapinfo(struct bio *bio) /* * Bits for the md->flags field. */ -#define DMF_BLOCK_IO 0 +#define DMF_BLOCK_IO_FOR_SUSPEND 0 #define DMF_SUSPENDED 1 #define DMF_FROZEN 2 #define DMF_FREEING 3 #define DMF_DELETING 4 #define DMF_NOFLUSH_SUSPENDING 5 +#define DMF_QUEUE_IO_TO_THREAD 6 /* * Work processed by per-device workqueue. */ -struct dm_wq_req { - enum { - DM_WQ_FLUSH_DEFERRED, - } type; - struct work_struct work; - struct mapped_device *md; - void *context; -}; - struct mapped_device { struct rw_semaphore io_lock; struct mutex suspend_lock; - spinlock_t pushback_lock; rwlock_t map_lock; atomic_t holders; atomic_t open_count; @@ -129,8 +119,14 @@ struct mapped_device { */ atomic_t pending; wait_queue_head_t wait; + struct work_struct work; struct bio_list deferred; - struct bio_list pushback; + spinlock_t deferred_lock; + + /* + * An error from the barrier request currently being processed. + */ + int barrier_error; /* * Processing queue (flush/barriers) @@ -433,6 +429,10 @@ static void end_io_acct(struct dm_io *io) part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration); part_stat_unlock(); + /* + * After this is decremented the bio must not be touched if it is + * a barrier. + */ dm_disk(md)->part0.in_flight = pending = atomic_dec_return(&md->pending); @@ -444,19 +444,18 @@ static void end_io_acct(struct dm_io *io) /* * Add the bio to the list of deferred io. */ -static int queue_io(struct mapped_device *md, struct bio *bio) +static void queue_io(struct mapped_device *md, struct bio *bio) { down_write(&md->io_lock); - if (!test_bit(DMF_BLOCK_IO, &md->flags)) { - up_write(&md->io_lock); - return 1; - } - + spin_lock_irq(&md->deferred_lock); bio_list_add(&md->deferred, bio); + spin_unlock_irq(&md->deferred_lock); + + if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) + queue_work(md->wq, &md->work); up_write(&md->io_lock); - return 0; /* deferred successfully */ } /* @@ -525,36 +524,50 @@ static int __noflush_suspending(struct mapped_device *md) static void dec_pending(struct dm_io *io, int error) { unsigned long flags; + int io_error; + struct bio *bio; + struct mapped_device *md = io->md; /* Push-back supersedes any I/O errors */ - if (error && !(io->error > 0 && __noflush_suspending(io->md))) + if (error && !(io->error > 0 && __noflush_suspending(md))) io->error = error; if (atomic_dec_and_test(&io->io_count)) { if (io->error == DM_ENDIO_REQUEUE) { /* * Target requested pushing back the I/O. - * This must be handled before the sleeper on - * suspend queue merges the pushback list. */ - spin_lock_irqsave(&io->md->pushback_lock, flags); - if (__noflush_suspending(io->md)) - bio_list_add(&io->md->pushback, io->bio); + spin_lock_irqsave(&md->deferred_lock, flags); + if (__noflush_suspending(md)) + bio_list_add_head(&md->deferred, io->bio); else /* noflush suspend was interrupted. */ io->error = -EIO; - spin_unlock_irqrestore(&io->md->pushback_lock, flags); + spin_unlock_irqrestore(&md->deferred_lock, flags); } - end_io_acct(io); + io_error = io->error; + bio = io->bio; - if (io->error != DM_ENDIO_REQUEUE) { - trace_block_bio_complete(io->md->queue, io->bio); + if (bio_barrier(bio)) { + /* + * There can be just one barrier request so we use + * a per-device variable for error reporting. + * Note that you can't touch the bio after end_io_acct + */ + md->barrier_error = io_error; + end_io_acct(io); + } else { + end_io_acct(io); - bio_endio(io->bio, io->error); + if (io_error != DM_ENDIO_REQUEUE) { + trace_block_bio_complete(md->queue, bio); + + bio_endio(bio, io_error); + } } - free_io(io->md, io); + free_io(md, io); } } @@ -562,6 +575,7 @@ static void clone_endio(struct bio *bio, int error) { int r = 0; struct dm_target_io *tio = bio->bi_private; + struct dm_io *io = tio->io; struct mapped_device *md = tio->io->md; dm_endio_fn endio = tio->ti->type->end_io; @@ -585,15 +599,14 @@ static void clone_endio(struct bio *bio, int error) } } - dec_pending(tio->io, error); - /* * Store md for cleanup instead of tio which is about to get freed. */ bio->bi_private = md->bs; - bio_put(bio); free_tio(md, tio); + bio_put(bio); + dec_pending(io, error); } static sector_t max_io_len(struct mapped_device *md, @@ -696,13 +709,19 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector, clone->bi_sector = sector; clone->bi_bdev = bio->bi_bdev; - clone->bi_rw = bio->bi_rw; + clone->bi_rw = bio->bi_rw & ~(1 << BIO_RW_BARRIER); clone->bi_vcnt = 1; clone->bi_size = to_bytes(len); clone->bi_io_vec->bv_offset = offset; clone->bi_io_vec->bv_len = clone->bi_size; clone->bi_flags |= 1 << BIO_CLONED; + if (bio_integrity(bio)) { + bio_integrity_clone(clone, bio, GFP_NOIO); + bio_integrity_trim(clone, + bio_sector_offset(bio, idx, offset), len); + } + return clone; } @@ -717,6 +736,7 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector, clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); __bio_clone(clone, bio); + clone->bi_rw &= ~(1 << BIO_RW_BARRIER); clone->bi_destructor = dm_bio_destructor; clone->bi_sector = sector; clone->bi_idx = idx; @@ -724,6 +744,14 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector, clone->bi_size = to_bytes(len); clone->bi_flags &= ~(1 << BIO_SEG_VALID); + if (bio_integrity(bio)) { + bio_integrity_clone(clone, bio, GFP_NOIO); + + if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) + bio_integrity_trim(clone, + bio_sector_offset(bio, idx, 0), len); + } + return clone; } @@ -828,21 +856,22 @@ static int __clone_and_map(struct clone_info *ci) } /* - * Split the bio into several clones. + * Split the bio into several clones and submit it to targets. */ -static int __split_bio(struct mapped_device *md, struct bio *bio) +static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) { struct clone_info ci; int error = 0; ci.map = dm_get_table(md); - if (unlikely(!ci.map)) - return -EIO; - if (unlikely(bio_barrier(bio) && !dm_table_barrier_ok(ci.map))) { - dm_table_put(ci.map); - bio_endio(bio, -EOPNOTSUPP); - return 0; + if (unlikely(!ci.map)) { + if (!bio_barrier(bio)) + bio_io_error(bio); + else + md->barrier_error = -EIO; + return; } + ci.md = md; ci.bio = bio; ci.io = alloc_io(md); @@ -861,8 +890,6 @@ static int __split_bio(struct mapped_device *md, struct bio *bio) /* drop the extra reference count */ dec_pending(ci.io, error); dm_table_put(ci.map); - - return 0; } /*----------------------------------------------------------------- * CRUD END @@ -921,7 +948,6 @@ out: */ static int dm_request(struct request_queue *q, struct bio *bio) { - int r = -EIO; int rw = bio_data_dir(bio); struct mapped_device *md = q->queuedata; int cpu; @@ -934,32 +960,26 @@ static int dm_request(struct request_queue *q, struct bio *bio) part_stat_unlock(); /* - * If we're suspended we have to queue - * this io for later. + * If we're suspended or the thread is processing barriers + * we have to queue this io for later. */ - while (test_bit(DMF_BLOCK_IO, &md->flags)) { + if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) || + unlikely(bio_barrier(bio))) { up_read(&md->io_lock); - if (bio_rw(bio) != READA) - r = queue_io(md, bio); + if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && + bio_rw(bio) == READA) { + bio_io_error(bio); + return 0; + } - if (r <= 0) - goto out_req; + queue_io(md, bio); - /* - * We're in a while loop, because someone could suspend - * before we get to the following read lock. - */ - down_read(&md->io_lock); + return 0; } - r = __split_bio(md, bio); + __split_and_process_bio(md, bio); up_read(&md->io_lock); - -out_req: - if (r < 0) - bio_io_error(bio); - return 0; } @@ -980,7 +1000,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits) struct mapped_device *md = congested_data; struct dm_table *map; - if (!test_bit(DMF_BLOCK_IO, &md->flags)) { + if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { map = dm_get_table(md); if (map) { r = dm_table_any_congested(map, bdi_bits); @@ -1068,6 +1088,8 @@ out: static struct block_device_operations dm_blk_dops; +static void dm_wq_work(struct work_struct *work); + /* * Allocate and initialise a blank device with a given minor. */ @@ -1095,7 +1117,7 @@ static struct mapped_device *alloc_dev(int minor) init_rwsem(&md->io_lock); mutex_init(&md->suspend_lock); - spin_lock_init(&md->pushback_lock); + spin_lock_init(&md->deferred_lock); rwlock_init(&md->map_lock); atomic_set(&md->holders, 1); atomic_set(&md->open_count, 0); @@ -1112,6 +1134,7 @@ static struct mapped_device *alloc_dev(int minor) md->queue->backing_dev_info.congested_fn = dm_any_congested; md->queue->backing_dev_info.congested_data = md; blk_queue_make_request(md->queue, dm_request); + blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL); blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); md->queue->unplug_fn = dm_unplug_all; blk_queue_merge_bvec(md->queue, dm_merge_bvec); @@ -1134,6 +1157,7 @@ static struct mapped_device *alloc_dev(int minor) atomic_set(&md->pending, 0); init_waitqueue_head(&md->wait); + INIT_WORK(&md->work, dm_wq_work); init_waitqueue_head(&md->eventq); md->disk->major = _major; @@ -1191,6 +1215,7 @@ static void free_dev(struct mapped_device *md) mempool_destroy(md->tio_pool); mempool_destroy(md->io_pool); bioset_free(md->bs); + blk_integrity_unregister(md->disk); del_gendisk(md->disk); free_minor(minor); @@ -1373,18 +1398,24 @@ void dm_put(struct mapped_device *md) } EXPORT_SYMBOL_GPL(dm_put); -static int dm_wait_for_completion(struct mapped_device *md) +static int dm_wait_for_completion(struct mapped_device *md, int interruptible) { int r = 0; + DECLARE_WAITQUEUE(wait, current); + + dm_unplug_all(md->queue); + + add_wait_queue(&md->wait, &wait); while (1) { - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(interruptible); smp_mb(); if (!atomic_read(&md->pending)) break; - if (signal_pending(current)) { + if (interruptible == TASK_INTERRUPTIBLE && + signal_pending(current)) { r = -EINTR; break; } @@ -1393,68 +1424,80 @@ static int dm_wait_for_completion(struct mapped_device *md) } set_current_state(TASK_RUNNING); + remove_wait_queue(&md->wait, &wait); + return r; } -/* - * Process the deferred bios - */ -static void __flush_deferred_io(struct mapped_device *md) +static int dm_flush(struct mapped_device *md) { - struct bio *c; + dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); + return 0; +} + +static void process_barrier(struct mapped_device *md, struct bio *bio) +{ + int error = dm_flush(md); - while ((c = bio_list_pop(&md->deferred))) { - if (__split_bio(md, c)) - bio_io_error(c); + if (unlikely(error)) { + bio_endio(bio, error); + return; + } + if (bio_empty_barrier(bio)) { + bio_endio(bio, 0); + return; } - clear_bit(DMF_BLOCK_IO, &md->flags); -} + __split_and_process_bio(md, bio); -static void __merge_pushback_list(struct mapped_device *md) -{ - unsigned long flags; + error = dm_flush(md); + + if (!error && md->barrier_error) + error = md->barrier_error; - spin_lock_irqsave(&md->pushback_lock, flags); - clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); - bio_list_merge_head(&md->deferred, &md->pushback); - bio_list_init(&md->pushback); - spin_unlock_irqrestore(&md->pushback_lock, flags); + if (md->barrier_error != DM_ENDIO_REQUEUE) + bio_endio(bio, error); } +/* + * Process the deferred bios + */ static void dm_wq_work(struct work_struct *work) { - struct dm_wq_req *req = container_of(work, struct dm_wq_req, work); - struct mapped_device *md = req->md; + struct mapped_device *md = container_of(work, struct mapped_device, + work); + struct bio *c; down_write(&md->io_lock); - switch (req->type) { - case DM_WQ_FLUSH_DEFERRED: - __flush_deferred_io(md); - break; - default: - DMERR("dm_wq_work: unrecognised work type %d", req->type); - BUG(); + + while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { + spin_lock_irq(&md->deferred_lock); + c = bio_list_pop(&md->deferred); + spin_unlock_irq(&md->deferred_lock); + + if (!c) { + clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); + break; + } + + up_write(&md->io_lock); + + if (bio_barrier(c)) + process_barrier(md, c); + else + __split_and_process_bio(md, c); + + down_write(&md->io_lock); } + up_write(&md->io_lock); } -static void dm_wq_queue(struct mapped_device *md, int type, void *context, - struct dm_wq_req *req) +static void dm_queue_flush(struct mapped_device *md) { - req->type = type; - req->md = md; - req->context = context; - INIT_WORK(&req->work, dm_wq_work); - queue_work(md->wq, &req->work); -} - -static void dm_queue_flush(struct mapped_device *md, int type, void *context) -{ - struct dm_wq_req req; - - dm_wq_queue(md, type, context, &req); - flush_workqueue(md->wq); + clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); + smp_mb__after_clear_bit(); + queue_work(md->wq, &md->work); } /* @@ -1528,7 +1571,6 @@ static void unlock_fs(struct mapped_device *md) int dm_suspend(struct mapped_device *md, unsigned suspend_flags) { struct dm_table *map = NULL; - DECLARE_WAITQUEUE(wait, current); int r = 0; int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; @@ -1573,38 +1615,54 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) } /* - * First we set the BLOCK_IO flag so no more ios will be mapped. + * Here we must make sure that no processes are submitting requests + * to target drivers i.e. no one may be executing + * __split_and_process_bio. This is called from dm_request and + * dm_wq_work. + * + * To get all processes out of __split_and_process_bio in dm_request, + * we take the write lock. To prevent any process from reentering + * __split_and_process_bio from dm_request, we set + * DMF_QUEUE_IO_TO_THREAD. + * + * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND + * and call flush_workqueue(md->wq). flush_workqueue will wait until + * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any + * further calls to __split_and_process_bio from dm_wq_work. */ down_write(&md->io_lock); - set_bit(DMF_BLOCK_IO, &md->flags); - - add_wait_queue(&md->wait, &wait); + set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); + set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); up_write(&md->io_lock); - /* unplug */ - if (map) - dm_table_unplug_all(map); + flush_workqueue(md->wq); /* - * Wait for the already-mapped ios to complete. + * At this point no more requests are entering target request routines. + * We call dm_wait_for_completion to wait for all existing requests + * to finish. */ - r = dm_wait_for_completion(md); + r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE); down_write(&md->io_lock); - remove_wait_queue(&md->wait, &wait); - if (noflush) - __merge_pushback_list(md); + clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); up_write(&md->io_lock); /* were we interrupted ? */ if (r < 0) { - dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL); + dm_queue_flush(md); unlock_fs(md); goto out; /* pushback list is already flushed, so skip flush */ } + /* + * If dm_wait_for_completion returned 0, the device is completely + * quiescent now. There is no request-processing activity. All new + * requests are being added to md->deferred list. + */ + dm_table_postsuspend_targets(map); set_bit(DMF_SUSPENDED, &md->flags); @@ -1639,7 +1697,7 @@ int dm_resume(struct mapped_device *md) if (r) goto out; - dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL); + dm_queue_flush(md); unlock_fs(md); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 20194e000c5a..a31506d93e91 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -52,7 +52,6 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits); * To check the return value from dm_table_find_target(). */ #define dm_target_is_valid(t) ((t)->table) -int dm_table_barrier_ok(struct dm_table *t); /*----------------------------------------------------------------- * A registry of target types. @@ -60,7 +59,7 @@ int dm_table_barrier_ok(struct dm_table *t); int dm_target_init(void); void dm_target_exit(void); struct target_type *dm_get_target_type(const char *name); -void dm_put_target_type(struct target_type *t); +void dm_put_target_type(struct target_type *tt); int dm_target_iterate(void (*iter_func)(struct target_type *tt, void *param), void *param); diff --git a/drivers/md/md.c b/drivers/md/md.c index ed5727c089a9..641b211fe3fe 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1375,6 +1375,9 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->raid_disks = cpu_to_le32(mddev->raid_disks); sb->size = cpu_to_le64(mddev->dev_sectors); + sb->chunksize = cpu_to_le32(mddev->chunk_size >> 9); + sb->level = cpu_to_le32(mddev->level); + sb->layout = cpu_to_le32(mddev->layout); if (mddev->bitmap && mddev->bitmap_file == NULL) { sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); @@ -2017,6 +2020,8 @@ repeat: clear_bit(MD_CHANGE_PENDING, &mddev->flags); spin_unlock_irq(&mddev->write_lock); wake_up(&mddev->sb_wait); + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } @@ -2086,6 +2091,7 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) * -writemostly - clears write_mostly * blocked - sets the Blocked flag * -blocked - clears the Blocked flag + * insync - sets Insync providing device isn't active */ int err = -EINVAL; if (cmd_match(buf, "faulty") && rdev->mddev->pers) { @@ -2118,6 +2124,9 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) md_wakeup_thread(rdev->mddev->thread); err = 0; + } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { + set_bit(In_sync, &rdev->flags); + err = 0; } if (!err && rdev->sysfs_state) sysfs_notify_dirent(rdev->sysfs_state); @@ -2190,7 +2199,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) } else if (rdev->mddev->pers) { mdk_rdev_t *rdev2; /* Activating a spare .. or possibly reactivating - * if we every get bitmaps working here. + * if we ever get bitmaps working here. */ if (rdev->raid_disk != -1) @@ -3060,11 +3069,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) } else err = -EBUSY; spin_unlock_irq(&mddev->write_lock); - } else { - mddev->ro = 0; - mddev->recovery_cp = MaxSector; - err = do_md_run(mddev); - } + } else + err = -EINVAL; break; case active: if (mddev->pers) { @@ -3300,7 +3306,9 @@ static ssize_t action_show(mddev_t *mddev, char *page) { char *type = "idle"; - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + type = "frozen"; + else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) type = "reshape"; @@ -3323,7 +3331,12 @@ action_store(mddev_t *mddev, const char *page, size_t len) if (!mddev->pers || !mddev->pers->sync_request) return -EINVAL; - if (cmd_match(page, "idle")) { + if (cmd_match(page, "frozen")) + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + else + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + + if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_unregister_thread(mddev->sync_thread); @@ -3482,12 +3495,15 @@ sync_completed_show(mddev_t *mddev, char *page) { unsigned long max_sectors, resync; + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return sprintf(page, "none\n"); + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) max_sectors = mddev->resync_max_sectors; else max_sectors = mddev->dev_sectors; - resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); + resync = mddev->curr_resync_completed; return sprintf(page, "%lu / %lu\n", resync, max_sectors); } @@ -3674,7 +3690,7 @@ array_size_store(mddev_t *mddev, const char *buf, size_t len) if (strict_blocks_to_sectors(buf, §ors) < 0) return -EINVAL; if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) - return -EINVAL; + return -E2BIG; mddev->external_size = 1; } @@ -4288,6 +4304,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) { int err = 0; struct gendisk *disk = mddev->gendisk; + mdk_rdev_t *rdev; if (atomic_read(&mddev->openers) > is_open) { printk("md: %s still in use.\n",mdname(mddev)); @@ -4330,6 +4347,13 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) /* tell userspace to handle 'inactive' */ sysfs_notify_dirent(mddev->sysfs_state); + list_for_each_entry(rdev, &mddev->disks, same_set) + if (rdev->raid_disk >= 0) { + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + } + set_capacity(disk, 0); mddev->changed = 1; @@ -4350,7 +4374,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) * Free resources if final stop */ if (mode == 0) { - mdk_rdev_t *rdev; printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); @@ -4362,13 +4385,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) } mddev->bitmap_offset = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk >= 0) { - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); - } - /* make sure all md_delayed_delete calls have finished */ flush_scheduled_work(); @@ -5551,7 +5567,7 @@ static struct block_device_operations md_fops = .owner = THIS_MODULE, .open = md_open, .release = md_release, - .locked_ioctl = md_ioctl, + .ioctl = md_ioctl, .getgeo = md_getgeo, .media_changed = md_media_changed, .revalidate_disk= md_revalidate, @@ -5696,37 +5712,38 @@ static void status_unused(struct seq_file *seq) static void status_resync(struct seq_file *seq, mddev_t * mddev) { - sector_t max_blocks, resync, res; - unsigned long dt, db, rt; + sector_t max_sectors, resync, res; + unsigned long dt, db; + sector_t rt; int scale; unsigned int per_milli; - resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; + resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - max_blocks = mddev->resync_max_sectors >> 1; + max_sectors = mddev->resync_max_sectors; else - max_blocks = mddev->dev_sectors / 2; + max_sectors = mddev->dev_sectors; /* * Should not happen. */ - if (!max_blocks) { + if (!max_sectors) { MD_BUG(); return; } /* Pick 'scale' such that (resync>>scale)*1000 will fit - * in a sector_t, and (max_blocks>>scale) will fit in a + * in a sector_t, and (max_sectors>>scale) will fit in a * u32, as those are the requirements for sector_div. * Thus 'scale' must be at least 10 */ scale = 10; if (sizeof(sector_t) > sizeof(unsigned long)) { - while ( max_blocks/2 > (1ULL<<(scale+32))) + while ( max_sectors/2 > (1ULL<<(scale+32))) scale++; } res = (resync>>scale)*1000; - sector_div(res, (u32)((max_blocks>>scale)+1)); + sector_div(res, (u32)((max_sectors>>scale)+1)); per_milli = res; { @@ -5747,25 +5764,35 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? "resync" : "recovery"))), per_milli/10, per_milli % 10, - (unsigned long long) resync, - (unsigned long long) max_blocks); + (unsigned long long) resync/2, + (unsigned long long) max_sectors/2); /* - * We do not want to overflow, so the order of operands and - * the * 100 / 100 trick are important. We do a +1 to be - * safe against division by zero. We only estimate anyway. - * * dt: time from mark until now * db: blocks written from mark until now * rt: remaining time + * + * rt is a sector_t, so could be 32bit or 64bit. + * So we divide before multiply in case it is 32bit and close + * to the limit. + * We scale the divisor (db) by 32 to avoid loosing precision + * near the end of resync when the number of remaining sectors + * is close to 'db'. + * We then divide rt by 32 after multiplying by db to compensate. + * The '+1' avoids division by zero if db is very small. */ dt = ((jiffies - mddev->resync_mark) / HZ); if (!dt) dt++; db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) - mddev->resync_mark_cnt; - rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100; - seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); + rt = max_sectors - resync; /* number of remaining sectors */ + sector_div(rt, db/32+1); + rt *= dt; + rt >>= 5; + + seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, + ((unsigned long)rt % 60)/6); seq_printf(seq, " speed=%ldK/sec", db/2/dt); } @@ -5956,7 +5983,7 @@ static int md_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations md_seq_ops = { +static const struct seq_operations md_seq_ops = { .start = md_seq_start, .next = md_seq_next, .stop = md_seq_stop, @@ -6334,18 +6361,14 @@ void md_do_sync(mddev_t *mddev) sector_t sectors; skipped = 0; - if (j >= mddev->resync_max) { - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); - wait_event(mddev->recovery_wait, - mddev->resync_max > j - || kthread_should_stop()); - } - if (kthread_should_stop()) - goto interrupted; - if (mddev->curr_resync > mddev->curr_resync_completed && - (mddev->curr_resync - mddev->curr_resync_completed) - > (max_sectors >> 4)) { + if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + ((mddev->curr_resync > mddev->curr_resync_completed && + (mddev->curr_resync - mddev->curr_resync_completed) + > (max_sectors >> 4)) || + (j - mddev->curr_resync_completed)*2 + >= mddev->resync_max - mddev->curr_resync_completed + )) { /* time to update curr_resync_completed */ blk_unplug(mddev->queue); wait_event(mddev->recovery_wait, @@ -6353,7 +6376,17 @@ void md_do_sync(mddev_t *mddev) mddev->curr_resync_completed = mddev->curr_resync; set_bit(MD_CHANGE_CLEAN, &mddev->flags); + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } + + if (j >= mddev->resync_max) + wait_event(mddev->recovery_wait, + mddev->resync_max > j + || kthread_should_stop()); + + if (kthread_should_stop()) + goto interrupted; + sectors = mddev->pers->sync_request(mddev, j, &skipped, currspeed < speed_min(mddev)); if (sectors == 0) { @@ -6461,6 +6494,7 @@ void md_do_sync(mddev_t *mddev) skip: mddev->curr_resync = 0; + mddev->curr_resync_completed = 0; mddev->resync_min = 0; mddev->resync_max = MaxSector; sysfs_notify(&mddev->kobj, NULL, "sync_completed"); diff --git a/drivers/md/md.h b/drivers/md/md.h index e9b7f54c24d6..8227ab909d44 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -12,10 +12,17 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#ifndef _MD_K_H -#define _MD_K_H - -#ifdef CONFIG_BLOCK +#ifndef _MD_MD_H +#define _MD_MD_H + +#include <linux/blkdev.h> +#include <linux/kobject.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/timer.h> +#include <linux/wait.h> +#include <linux/workqueue.h> #define MaxSector (~(sector_t)0) @@ -408,10 +415,6 @@ static inline void safe_put_page(struct page *p) if (p) put_page(p); } -#endif /* CONFIG_BLOCK */ -#endif - - extern int register_md_personality(struct mdk_personality *p); extern int unregister_md_personality(struct mdk_personality *p); extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), @@ -434,3 +437,5 @@ extern void md_new_event(mddev_t *mddev); extern int md_allow_write(mddev_t *mddev); extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); + +#endif /* _MD_MD_H */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index b4f4badc0068..36df9109cde1 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -35,7 +35,6 @@ #include <linux/blkdev.h> #include <linux/seq_file.h> #include "md.h" -#include "dm-bio-list.h" #include "raid1.h" #include "bitmap.h" @@ -123,6 +122,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) goto out_free_pages; bio->bi_io_vec[i].bv_page = page; + bio->bi_vcnt = i+1; } } /* If not user-requests, copy the page pointers to all bios */ @@ -138,9 +138,9 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) return r1_bio; out_free_pages: - for (i=0; i < RESYNC_PAGES ; i++) - for (j=0 ; j < pi->raid_disks; j++) - safe_put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page); + for (j=0 ; j < pi->raid_disks; j++) + for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++) + put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page); j = -1; out_free_bio: while ( ++j < pi->raid_disks ) @@ -585,7 +585,7 @@ static int raid1_congested(void *data, int bits) /* Note the '|| 1' - when read_balance prefers * non-congested targets, it can be removed */ - if ((bits & (1<<BDI_write_congested)) || 1) + if ((bits & (1<<BDI_async_congested)) || 1) ret |= bdi_congested(&q->backing_dev_info, bits); else ret &= bdi_congested(&q->backing_dev_info, bits); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index e293d92641ac..499620afb44b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -22,7 +22,6 @@ #include <linux/blkdev.h> #include <linux/seq_file.h> #include "md.h" -#include "dm-bio-list.h" #include "raid10.h" #include "bitmap.h" @@ -1810,17 +1809,17 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i r10_bio->sector = sect; raid10_find_phys(conf, r10_bio); - /* Need to check if this section will still be + + /* Need to check if the array will still be * degraded */ - for (j=0; j<conf->copies;j++) { - int d = r10_bio->devs[j].devnum; - if (conf->mirrors[d].rdev == NULL || - test_bit(Faulty, &conf->mirrors[d].rdev->flags)) { + for (j=0; j<conf->raid_disks; j++) + if (conf->mirrors[j].rdev == NULL || + test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { still_degraded = 1; break; } - } + must_sync = bitmap_start_sync(mddev->bitmap, sect, &sync_blocks, still_degraded); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1f2a266f3cf7..54ef8d75541d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -47,6 +47,7 @@ #include <linux/kthread.h> #include <linux/raid/pq.h> #include <linux/async_tx.h> +#include <linux/async.h> #include <linux/seq_file.h> #include <linux/cpu.h> #include "md.h" @@ -363,7 +364,7 @@ static void raid5_unplug_device(struct request_queue *q); static struct stripe_head * get_active_stripe(raid5_conf_t *conf, sector_t sector, - int previous, int noblock) + int previous, int noblock, int noquiesce) { struct stripe_head *sh; @@ -373,7 +374,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector, do { wait_event_lock_irq(conf->wait_for_stripe, - conf->quiesce == 0, + conf->quiesce == 0 || noquiesce, conf->device_lock, /* nothing */); sh = __find_stripe(conf, sector, conf->generation - previous); if (!sh) { @@ -501,13 +502,17 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, int i; int page_offset; struct async_submit_ctl submit; + enum async_tx_flags flags = 0; if (bio->bi_sector >= sector) page_offset = (signed)(bio->bi_sector - sector) * 512; else page_offset = (signed)(sector - bio->bi_sector) * -512; - init_async_submit(&submit, 0, tx, NULL, NULL, NULL); + if (frombio) + flags |= ASYNC_TX_FENCE; + init_async_submit(&submit, flags, tx, NULL, NULL, NULL); + bio_for_each_segment(bvl, bio, i) { int len = bio_iovec_idx(bio, i)->bv_len; int clen; @@ -623,18 +628,30 @@ static void ops_run_biofill(struct stripe_head *sh) async_trigger_callback(&submit); } -static void ops_complete_compute5(void *stripe_head_ref) +static void mark_target_uptodate(struct stripe_head *sh, int target) { - struct stripe_head *sh = stripe_head_ref; - int target = sh->ops.target; - struct r5dev *tgt = &sh->dev[target]; + struct r5dev *tgt; - pr_debug("%s: stripe %llu\n", __func__, - (unsigned long long)sh->sector); + if (target < 0) + return; + tgt = &sh->dev[target]; set_bit(R5_UPTODATE, &tgt->flags); BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); clear_bit(R5_Wantcompute, &tgt->flags); +} + +static void ops_complete_compute(void *stripe_head_ref) +{ + struct stripe_head *sh = stripe_head_ref; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + /* mark the computed target(s) as uptodate */ + mark_target_uptodate(sh, sh->ops.target); + mark_target_uptodate(sh, sh->ops.target2); + clear_bit(STRIPE_COMPUTE_RUN, &sh->state); if (sh->check_state == check_state_compute_run) sh->check_state = check_state_compute_result; @@ -672,8 +689,8 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) atomic_inc(&sh->count); - init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL, - ops_complete_compute5, sh, to_addr_conv(sh, percpu)); + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, + ops_complete_compute, sh, to_addr_conv(sh, percpu)); if (unlikely(count == 1)) tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); else @@ -682,6 +699,202 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) return tx; } +/* set_syndrome_sources - populate source buffers for gen_syndrome + * @srcs - (struct page *) array of size sh->disks + * @sh - stripe_head to parse + * + * Populates srcs in proper layout order for the stripe and returns the + * 'count' of sources to be used in a call to async_gen_syndrome. The P + * destination buffer is recorded in srcs[count] and the Q destination + * is recorded in srcs[count+1]]. + */ +static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) +{ + int disks = sh->disks; + int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); + int d0_idx = raid6_d0(sh); + int count; + int i; + + for (i = 0; i < disks; i++) + srcs[i] = (void *)raid6_empty_zero_page; + + count = 0; + i = d0_idx; + do { + int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + + srcs[slot] = sh->dev[i].page; + i = raid6_next_disk(i, disks); + } while (i != d0_idx); + BUG_ON(count != syndrome_disks); + + return count; +} + +static struct dma_async_tx_descriptor * +ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) +{ + int disks = sh->disks; + struct page **blocks = percpu->scribble; + int target; + int qd_idx = sh->qd_idx; + struct dma_async_tx_descriptor *tx; + struct async_submit_ctl submit; + struct r5dev *tgt; + struct page *dest; + int i; + int count; + + if (sh->ops.target < 0) + target = sh->ops.target2; + else if (sh->ops.target2 < 0) + target = sh->ops.target; + else + /* we should only have one valid target */ + BUG(); + BUG_ON(target < 0); + pr_debug("%s: stripe %llu block: %d\n", + __func__, (unsigned long long)sh->sector, target); + + tgt = &sh->dev[target]; + BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); + dest = tgt->page; + + atomic_inc(&sh->count); + + if (target == qd_idx) { + count = set_syndrome_sources(blocks, sh); + blocks[count] = NULL; /* regenerating p is not necessary */ + BUG_ON(blocks[count+1] != dest); /* q should already be set */ + init_async_submit(&submit, ASYNC_TX_FENCE, NULL, + ops_complete_compute, sh, + to_addr_conv(sh, percpu)); + tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); + } else { + /* Compute any data- or p-drive using XOR */ + count = 0; + for (i = disks; i-- ; ) { + if (i == target || i == qd_idx) + continue; + blocks[count++] = sh->dev[i].page; + } + + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, + NULL, ops_complete_compute, sh, + to_addr_conv(sh, percpu)); + tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); + } + + return tx; +} + +static struct dma_async_tx_descriptor * +ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) +{ + int i, count, disks = sh->disks; + int syndrome_disks = sh->ddf_layout ? disks : disks-2; + int d0_idx = raid6_d0(sh); + int faila = -1, failb = -1; + int target = sh->ops.target; + int target2 = sh->ops.target2; + struct r5dev *tgt = &sh->dev[target]; + struct r5dev *tgt2 = &sh->dev[target2]; + struct dma_async_tx_descriptor *tx; + struct page **blocks = percpu->scribble; + struct async_submit_ctl submit; + + pr_debug("%s: stripe %llu block1: %d block2: %d\n", + __func__, (unsigned long long)sh->sector, target, target2); + BUG_ON(target < 0 || target2 < 0); + BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); + BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); + + /* we need to open-code set_syndrome_sources to handle to the + * slot number conversion for 'faila' and 'failb' + */ + for (i = 0; i < disks ; i++) + blocks[i] = (void *)raid6_empty_zero_page; + count = 0; + i = d0_idx; + do { + int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + + blocks[slot] = sh->dev[i].page; + + if (i == target) + faila = slot; + if (i == target2) + failb = slot; + i = raid6_next_disk(i, disks); + } while (i != d0_idx); + BUG_ON(count != syndrome_disks); + + BUG_ON(faila == failb); + if (failb < faila) + swap(faila, failb); + pr_debug("%s: stripe: %llu faila: %d failb: %d\n", + __func__, (unsigned long long)sh->sector, faila, failb); + + atomic_inc(&sh->count); + + if (failb == syndrome_disks+1) { + /* Q disk is one of the missing disks */ + if (faila == syndrome_disks) { + /* Missing P+Q, just recompute */ + init_async_submit(&submit, ASYNC_TX_FENCE, NULL, + ops_complete_compute, sh, + to_addr_conv(sh, percpu)); + return async_gen_syndrome(blocks, 0, count+2, + STRIPE_SIZE, &submit); + } else { + struct page *dest; + int data_target; + int qd_idx = sh->qd_idx; + + /* Missing D+Q: recompute D from P, then recompute Q */ + if (target == qd_idx) + data_target = target2; + else + data_target = target; + + count = 0; + for (i = disks; i-- ; ) { + if (i == data_target || i == qd_idx) + continue; + blocks[count++] = sh->dev[i].page; + } + dest = sh->dev[data_target].page; + init_async_submit(&submit, + ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, + NULL, NULL, NULL, + to_addr_conv(sh, percpu)); + tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, + &submit); + + count = set_syndrome_sources(blocks, sh); + init_async_submit(&submit, ASYNC_TX_FENCE, tx, + ops_complete_compute, sh, + to_addr_conv(sh, percpu)); + return async_gen_syndrome(blocks, 0, count+2, + STRIPE_SIZE, &submit); + } + } + + init_async_submit(&submit, ASYNC_TX_FENCE, NULL, ops_complete_compute, + sh, to_addr_conv(sh, percpu)); + if (failb == syndrome_disks) { + /* We're missing D+P. */ + return async_raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, + faila, blocks, &submit); + } else { + /* We're missing D+D. */ + return async_raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, + faila, failb, blocks, &submit); + } +} + + static void ops_complete_prexor(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; @@ -712,7 +925,7 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, xor_srcs[count++] = dev->page; } - init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST, tx, + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, ops_complete_prexor, sh, to_addr_conv(sh, percpu)); tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); @@ -754,17 +967,21 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) return tx; } -static void ops_complete_postxor(void *stripe_head_ref) +static void ops_complete_reconstruct(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - int disks = sh->disks, i, pd_idx = sh->pd_idx; + int disks = sh->disks; + int pd_idx = sh->pd_idx; + int qd_idx = sh->qd_idx; + int i; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (dev->written || i == pd_idx) + + if (dev->written || i == pd_idx || i == qd_idx) set_bit(R5_UPTODATE, &dev->flags); } @@ -782,8 +999,8 @@ static void ops_complete_postxor(void *stripe_head_ref) } static void -ops_run_postxor(struct stripe_head *sh, struct raid5_percpu *percpu, - struct dma_async_tx_descriptor *tx) +ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) { int disks = sh->disks; struct page **xor_srcs = percpu->scribble; @@ -826,7 +1043,7 @@ ops_run_postxor(struct stripe_head *sh, struct raid5_percpu *percpu, atomic_inc(&sh->count); - init_async_submit(&submit, flags, tx, ops_complete_postxor, sh, + init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, to_addr_conv(sh, percpu)); if (unlikely(count == 1)) tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); @@ -834,6 +1051,25 @@ ops_run_postxor(struct stripe_head *sh, struct raid5_percpu *percpu, tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); } +static void +ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) +{ + struct async_submit_ctl submit; + struct page **blocks = percpu->scribble; + int count; + + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + + count = set_syndrome_sources(blocks, sh); + + atomic_inc(&sh->count); + + init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, + sh, to_addr_conv(sh, percpu)); + async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); +} + static void ops_complete_check(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; @@ -846,23 +1082,28 @@ static void ops_complete_check(void *stripe_head_ref) release_stripe(sh); } -static void ops_run_check(struct stripe_head *sh, struct raid5_percpu *percpu) +static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) { int disks = sh->disks; + int pd_idx = sh->pd_idx; + int qd_idx = sh->qd_idx; + struct page *xor_dest; struct page **xor_srcs = percpu->scribble; struct dma_async_tx_descriptor *tx; struct async_submit_ctl submit; - - int count = 0, pd_idx = sh->pd_idx, i; - struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; + int count; + int i; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + count = 0; + xor_dest = sh->dev[pd_idx].page; + xor_srcs[count++] = xor_dest; for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - if (i != pd_idx) - xor_srcs[count++] = dev->page; + if (i == pd_idx || i == qd_idx) + continue; + xor_srcs[count++] = sh->dev[i].page; } init_async_submit(&submit, 0, NULL, NULL, NULL, @@ -875,11 +1116,32 @@ static void ops_run_check(struct stripe_head *sh, struct raid5_percpu *percpu) tx = async_trigger_callback(&submit); } -static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) +static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) +{ + struct page **srcs = percpu->scribble; + struct async_submit_ctl submit; + int count; + + pr_debug("%s: stripe %llu checkp: %d\n", __func__, + (unsigned long long)sh->sector, checkp); + + count = set_syndrome_sources(srcs, sh); + if (!checkp) + srcs[count] = NULL; + + atomic_inc(&sh->count); + init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, + sh, to_addr_conv(sh, percpu)); + async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, + &sh->ops.zero_sum_result, percpu->spare_page, &submit); +} + +static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) { int overlap_clear = 0, i, disks = sh->disks; struct dma_async_tx_descriptor *tx = NULL; raid5_conf_t *conf = sh->raid_conf; + int level = conf->level; struct raid5_percpu *percpu; unsigned long cpu; @@ -891,9 +1153,16 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) } if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { - tx = ops_run_compute5(sh, percpu); - /* terminate the chain if postxor is not set to be run */ - if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) + if (level < 6) + tx = ops_run_compute5(sh, percpu); + else { + if (sh->ops.target2 < 0 || sh->ops.target < 0) + tx = ops_run_compute6_1(sh, percpu); + else + tx = ops_run_compute6_2(sh, percpu); + } + /* terminate the chain if reconstruct is not set to be run */ + if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) async_tx_ack(tx); } @@ -905,11 +1174,23 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) overlap_clear++; } - if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) - ops_run_postxor(sh, percpu, tx); + if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { + if (level < 6) + ops_run_reconstruct5(sh, percpu, tx); + else + ops_run_reconstruct6(sh, percpu, tx); + } - if (test_bit(STRIPE_OP_CHECK, &ops_request)) - ops_run_check(sh, percpu); + if (test_bit(STRIPE_OP_CHECK, &ops_request)) { + if (sh->check_state == check_state_run) + ops_run_check_p(sh, percpu); + else if (sh->check_state == check_state_run_q) + ops_run_check_pq(sh, percpu, 0); + else if (sh->check_state == check_state_run_pq) + ops_run_check_pq(sh, percpu, 1); + else + BUG(); + } if (overlap_clear) for (i = disks; i--; ) { @@ -1656,258 +1937,13 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) } - -/* - * Copy data between a page in the stripe cache, and one or more bion - * The page could align with the middle of the bio, or there could be - * several bion, each with several bio_vecs, which cover part of the page - * Multiple bion are linked together on bi_next. There may be extras - * at the end of this list. We ignore them. - */ -static void copy_data(int frombio, struct bio *bio, - struct page *page, - sector_t sector) -{ - char *pa = page_address(page); - struct bio_vec *bvl; - int i; - int page_offset; - - if (bio->bi_sector >= sector) - page_offset = (signed)(bio->bi_sector - sector) * 512; - else - page_offset = (signed)(sector - bio->bi_sector) * -512; - bio_for_each_segment(bvl, bio, i) { - int len = bio_iovec_idx(bio,i)->bv_len; - int clen; - int b_offset = 0; - - if (page_offset < 0) { - b_offset = -page_offset; - page_offset += b_offset; - len -= b_offset; - } - - if (len > 0 && page_offset + len > STRIPE_SIZE) - clen = STRIPE_SIZE - page_offset; - else clen = len; - - if (clen > 0) { - char *ba = __bio_kmap_atomic(bio, i, KM_USER0); - if (frombio) - memcpy(pa+page_offset, ba+b_offset, clen); - else - memcpy(ba+b_offset, pa+page_offset, clen); - __bio_kunmap_atomic(ba, KM_USER0); - } - if (clen < len) /* hit end of page */ - break; - page_offset += len; - } -} - -#define check_xor() do { \ - if (count == MAX_XOR_BLOCKS) { \ - xor_blocks(count, STRIPE_SIZE, dest, ptr);\ - count = 0; \ - } \ - } while(0) - -static void compute_parity6(struct stripe_head *sh, int method) -{ - raid5_conf_t *conf = sh->raid_conf; - int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count; - int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); - struct bio *chosen; - /**** FIX THIS: This could be very bad if disks is close to 256 ****/ - void *ptrs[syndrome_disks+2]; - - pd_idx = sh->pd_idx; - qd_idx = sh->qd_idx; - d0_idx = raid6_d0(sh); - - pr_debug("compute_parity, stripe %llu, method %d\n", - (unsigned long long)sh->sector, method); - - switch(method) { - case READ_MODIFY_WRITE: - BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */ - case RECONSTRUCT_WRITE: - for (i= disks; i-- ;) - if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) { - chosen = sh->dev[i].towrite; - sh->dev[i].towrite = NULL; - - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); - - BUG_ON(sh->dev[i].written); - sh->dev[i].written = chosen; - } - break; - case CHECK_PARITY: - BUG(); /* Not implemented yet */ - } - - for (i = disks; i--;) - if (sh->dev[i].written) { - sector_t sector = sh->dev[i].sector; - struct bio *wbi = sh->dev[i].written; - while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { - copy_data(1, wbi, sh->dev[i].page, sector); - wbi = r5_next_bio(wbi, sector); - } - - set_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(R5_UPTODATE, &sh->dev[i].flags); - } - - /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/ - - for (i = 0; i < disks; i++) - ptrs[i] = (void *)raid6_empty_zero_page; - - count = 0; - i = d0_idx; - do { - int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); - - ptrs[slot] = page_address(sh->dev[i].page); - if (slot < syndrome_disks && - !test_bit(R5_UPTODATE, &sh->dev[i].flags)) { - printk(KERN_ERR "block %d/%d not uptodate " - "on parity calc\n", i, count); - BUG(); - } - - i = raid6_next_disk(i, disks); - } while (i != d0_idx); - BUG_ON(count != syndrome_disks); - - raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs); - - switch(method) { - case RECONSTRUCT_WRITE: - set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); - set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); - set_bit(R5_LOCKED, &sh->dev[qd_idx].flags); - break; - case UPDATE_PARITY: - set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); - break; - } -} - - -/* Compute one missing block */ -static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) -{ - int i, count, disks = sh->disks; - void *ptr[MAX_XOR_BLOCKS], *dest, *p; - int qd_idx = sh->qd_idx; - - pr_debug("compute_block_1, stripe %llu, idx %d\n", - (unsigned long long)sh->sector, dd_idx); - - if ( dd_idx == qd_idx ) { - /* We're actually computing the Q drive */ - compute_parity6(sh, UPDATE_PARITY); - } else { - dest = page_address(sh->dev[dd_idx].page); - if (!nozero) memset(dest, 0, STRIPE_SIZE); - count = 0; - for (i = disks ; i--; ) { - if (i == dd_idx || i == qd_idx) - continue; - p = page_address(sh->dev[i].page); - if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) - ptr[count++] = p; - else - printk("compute_block() %d, stripe %llu, %d" - " not present\n", dd_idx, - (unsigned long long)sh->sector, i); - - check_xor(); - } - if (count) - xor_blocks(count, STRIPE_SIZE, dest, ptr); - if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); - else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); - } -} - -/* Compute two missing blocks */ -static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) -{ - int i, count, disks = sh->disks; - int syndrome_disks = sh->ddf_layout ? disks : disks-2; - int d0_idx = raid6_d0(sh); - int faila = -1, failb = -1; - /**** FIX THIS: This could be very bad if disks is close to 256 ****/ - void *ptrs[syndrome_disks+2]; - - for (i = 0; i < disks ; i++) - ptrs[i] = (void *)raid6_empty_zero_page; - count = 0; - i = d0_idx; - do { - int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); - - ptrs[slot] = page_address(sh->dev[i].page); - - if (i == dd_idx1) - faila = slot; - if (i == dd_idx2) - failb = slot; - i = raid6_next_disk(i, disks); - } while (i != d0_idx); - BUG_ON(count != syndrome_disks); - - BUG_ON(faila == failb); - if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } - - pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", - (unsigned long long)sh->sector, dd_idx1, dd_idx2, - faila, failb); - - if (failb == syndrome_disks+1) { - /* Q disk is one of the missing disks */ - if (faila == syndrome_disks) { - /* Missing P+Q, just recompute */ - compute_parity6(sh, UPDATE_PARITY); - return; - } else { - /* We're missing D+Q; recompute D from P */ - compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ? - dd_idx2 : dd_idx1), - 0); - compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */ - return; - } - } - - /* We're missing D+P or D+D; */ - if (failb == syndrome_disks) { - /* We're missing D+P. */ - raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs); - } else { - /* We're missing D+D. */ - raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb, - ptrs); - } - - /* Both the above update both missing blocks */ - set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); - set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); -} - static void -schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, +schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, int rcw, int expand) { int i, pd_idx = sh->pd_idx, disks = sh->disks; + raid5_conf_t *conf = sh->raid_conf; + int level = conf->level; if (rcw) { /* if we are not expanding this is a proper write request, and @@ -1920,7 +1956,7 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, } else sh->reconstruct_state = reconstruct_state_run; - set_bit(STRIPE_OP_POSTXOR, &s->ops_request); + set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -1933,17 +1969,18 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, s->locked++; } } - if (s->locked + 1 == disks) + if (s->locked + conf->max_degraded == disks) if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) - atomic_inc(&sh->raid_conf->pending_full_writes); + atomic_inc(&conf->pending_full_writes); } else { + BUG_ON(level == 6); BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); sh->reconstruct_state = reconstruct_state_prexor_drain_run; set_bit(STRIPE_OP_PREXOR, &s->ops_request); set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); - set_bit(STRIPE_OP_POSTXOR, &s->ops_request); + set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -1961,13 +1998,22 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, } } - /* keep the parity disk locked while asynchronous operations + /* keep the parity disk(s) locked while asynchronous operations * are in flight */ set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); s->locked++; + if (level == 6) { + int qd_idx = sh->qd_idx; + struct r5dev *dev = &sh->dev[qd_idx]; + + set_bit(R5_LOCKED, &dev->flags); + clear_bit(R5_UPTODATE, &dev->flags); + s->locked++; + } + pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", __func__, (unsigned long long)sh->sector, s->locked, s->ops_request); @@ -2048,13 +2094,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in static void end_reshape(raid5_conf_t *conf); -static int page_is_zero(struct page *p) -{ - char *a = page_address(p); - return ((*(u32*)a) == 0 && - memcmp(a, a+4, STRIPE_SIZE-4)==0); -} - static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, struct stripe_head *sh) { @@ -2195,9 +2234,10 @@ static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); set_bit(R5_Wantcompute, &dev->flags); sh->ops.target = disk_idx; + sh->ops.target2 = -1; s->req_compute = 1; /* Careful: from this point on 'uptodate' is in the eye - * of raid5_run_ops which services 'compute' operations + * of raid_run_ops which services 'compute' operations * before writes. R5_Wantcompute flags a block that will * be R5_UPTODATE by the time it is needed for a * subsequent operation. @@ -2236,61 +2276,104 @@ static void handle_stripe_fill5(struct stripe_head *sh, set_bit(STRIPE_HANDLE, &sh->state); } -static void handle_stripe_fill6(struct stripe_head *sh, - struct stripe_head_state *s, struct r6_state *r6s, - int disks) +/* fetch_block6 - checks the given member device to see if its data needs + * to be read or computed to satisfy a request. + * + * Returns 1 when no more member devices need to be checked, otherwise returns + * 0 to tell the loop in handle_stripe_fill6 to continue + */ +static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, + struct r6_state *r6s, int disk_idx, int disks) { - int i; - for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - if (!test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && - (dev->toread || (dev->towrite && - !test_bit(R5_OVERWRITE, &dev->flags)) || - s->syncing || s->expanding || - (s->failed >= 1 && - (sh->dev[r6s->failed_num[0]].toread || - s->to_write)) || - (s->failed >= 2 && - (sh->dev[r6s->failed_num[1]].toread || - s->to_write)))) { - /* we would like to get this block, possibly - * by computing it, but we might not be able to + struct r5dev *dev = &sh->dev[disk_idx]; + struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]], + &sh->dev[r6s->failed_num[1]] }; + + if (!test_bit(R5_LOCKED, &dev->flags) && + !test_bit(R5_UPTODATE, &dev->flags) && + (dev->toread || + (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || + s->syncing || s->expanding || + (s->failed >= 1 && + (fdev[0]->toread || s->to_write)) || + (s->failed >= 2 && + (fdev[1]->toread || s->to_write)))) { + /* we would like to get this block, possibly by computing it, + * otherwise read it if the backing disk is insync + */ + BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); + BUG_ON(test_bit(R5_Wantread, &dev->flags)); + if ((s->uptodate == disks - 1) && + (s->failed && (disk_idx == r6s->failed_num[0] || + disk_idx == r6s->failed_num[1]))) { + /* have disk failed, and we're requested to fetch it; + * do compute it */ - if ((s->uptodate == disks - 1) && - (s->failed && (i == r6s->failed_num[0] || - i == r6s->failed_num[1]))) { - pr_debug("Computing stripe %llu block %d\n", - (unsigned long long)sh->sector, i); - compute_block_1(sh, i, 0); - s->uptodate++; - } else if ( s->uptodate == disks-2 && s->failed >= 2 ) { - /* Computing 2-failure is *very* expensive; only - * do it if failed >= 2 - */ - int other; - for (other = disks; other--; ) { - if (other == i) - continue; - if (!test_bit(R5_UPTODATE, - &sh->dev[other].flags)) - break; - } - BUG_ON(other < 0); - pr_debug("Computing stripe %llu blocks %d,%d\n", - (unsigned long long)sh->sector, - i, other); - compute_block_2(sh, i, other); - s->uptodate += 2; - } else if (test_bit(R5_Insync, &dev->flags)) { - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - s->locked++; - pr_debug("Reading block %d (sync=%d)\n", - i, s->syncing); + pr_debug("Computing stripe %llu block %d\n", + (unsigned long long)sh->sector, disk_idx); + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + set_bit(R5_Wantcompute, &dev->flags); + sh->ops.target = disk_idx; + sh->ops.target2 = -1; /* no 2nd target */ + s->req_compute = 1; + s->uptodate++; + return 1; + } else if (s->uptodate == disks-2 && s->failed >= 2) { + /* Computing 2-failure is *very* expensive; only + * do it if failed >= 2 + */ + int other; + for (other = disks; other--; ) { + if (other == disk_idx) + continue; + if (!test_bit(R5_UPTODATE, + &sh->dev[other].flags)) + break; } + BUG_ON(other < 0); + pr_debug("Computing stripe %llu blocks %d,%d\n", + (unsigned long long)sh->sector, + disk_idx, other); + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); + set_bit(R5_Wantcompute, &sh->dev[other].flags); + sh->ops.target = disk_idx; + sh->ops.target2 = other; + s->uptodate += 2; + s->req_compute = 1; + return 1; + } else if (test_bit(R5_Insync, &dev->flags)) { + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + s->locked++; + pr_debug("Reading block %d (sync=%d)\n", + disk_idx, s->syncing); } } + + return 0; +} + +/** + * handle_stripe_fill6 - read or compute data to satisfy pending requests. + */ +static void handle_stripe_fill6(struct stripe_head *sh, + struct stripe_head_state *s, struct r6_state *r6s, + int disks) +{ + int i; + + /* look for blocks to read/compute, skip this if a compute + * is already in flight, or if the stripe contents are in the + * midst of changing due to a write + */ + if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && + !sh->reconstruct_state) + for (i = disks; i--; ) + if (fetch_block6(sh, s, r6s, i, disks)) + break; set_bit(STRIPE_HANDLE, &sh->state); } @@ -2424,114 +2507,61 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, */ /* since handle_stripe can be called at any time we need to handle the * case where a compute block operation has been submitted and then a - * subsequent call wants to start a write request. raid5_run_ops only - * handles the case where compute block and postxor are requested + * subsequent call wants to start a write request. raid_run_ops only + * handles the case where compute block and reconstruct are requested * simultaneously. If this is not the case then new writes need to be * held off until the compute completes. */ if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && (s->locked == 0 && (rcw == 0 || rmw == 0) && !test_bit(STRIPE_BIT_DELAY, &sh->state))) - schedule_reconstruction5(sh, s, rcw == 0, 0); + schedule_reconstruction(sh, s, rcw == 0, 0); } static void handle_stripe_dirtying6(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, struct r6_state *r6s, int disks) { - int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; + int rcw = 0, pd_idx = sh->pd_idx, i; int qd_idx = sh->qd_idx; + + set_bit(STRIPE_HANDLE, &sh->state); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - /* Would I have to read this buffer for reconstruct_write */ - if (!test_bit(R5_OVERWRITE, &dev->flags) - && i != pd_idx && i != qd_idx - && (!test_bit(R5_LOCKED, &dev->flags) - ) && - !test_bit(R5_UPTODATE, &dev->flags)) { - if (test_bit(R5_Insync, &dev->flags)) rcw++; - else { - pr_debug("raid6: must_compute: " - "disk %d flags=%#lx\n", i, dev->flags); - must_compute++; + /* check if we haven't enough data */ + if (!test_bit(R5_OVERWRITE, &dev->flags) && + i != pd_idx && i != qd_idx && + !test_bit(R5_LOCKED, &dev->flags) && + !(test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags))) { + rcw++; + if (!test_bit(R5_Insync, &dev->flags)) + continue; /* it's a failed drive */ + + if ( + test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + pr_debug("Read_old stripe %llu " + "block %d for Reconstruct\n", + (unsigned long long)sh->sector, i); + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + s->locked++; + } else { + pr_debug("Request delayed stripe %llu " + "block %d for Reconstruct\n", + (unsigned long long)sh->sector, i); + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); } } } - pr_debug("for sector %llu, rcw=%d, must_compute=%d\n", - (unsigned long long)sh->sector, rcw, must_compute); - set_bit(STRIPE_HANDLE, &sh->state); - - if (rcw > 0) - /* want reconstruct write, but need to get some data */ - for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - if (!test_bit(R5_OVERWRITE, &dev->flags) - && !(s->failed == 0 && (i == pd_idx || i == qd_idx)) - && !test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && - test_bit(R5_Insync, &dev->flags)) { - if ( - test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - pr_debug("Read_old stripe %llu " - "block %d for Reconstruct\n", - (unsigned long long)sh->sector, i); - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - s->locked++; - } else { - pr_debug("Request delayed stripe %llu " - "block %d for Reconstruct\n", - (unsigned long long)sh->sector, i); - set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - } - } - } /* now if nothing is locked, and if we have enough data, we can start a * write request */ - if (s->locked == 0 && rcw == 0 && + if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && + s->locked == 0 && rcw == 0 && !test_bit(STRIPE_BIT_DELAY, &sh->state)) { - if (must_compute > 0) { - /* We have failed blocks and need to compute them */ - switch (s->failed) { - case 0: - BUG(); - case 1: - compute_block_1(sh, r6s->failed_num[0], 0); - break; - case 2: - compute_block_2(sh, r6s->failed_num[0], - r6s->failed_num[1]); - break; - default: /* This request should have been failed? */ - BUG(); - } - } - - pr_debug("Computing parity for stripe %llu\n", - (unsigned long long)sh->sector); - compute_parity6(sh, RECONSTRUCT_WRITE); - /* now every locked buffer is ready to be written */ - for (i = disks; i--; ) - if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { - pr_debug("Writing stripe %llu block %d\n", - (unsigned long long)sh->sector, i); - s->locked++; - set_bit(R5_Wantwrite, &sh->dev[i].flags); - } - if (s->locked == disks) - if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) - atomic_inc(&conf->pending_full_writes); - /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ - set_bit(STRIPE_INSYNC, &sh->state); - - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - atomic_dec(&conf->preread_active_stripes); - if (atomic_read(&conf->preread_active_stripes) < - IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - } + schedule_reconstruction(sh, s, 1, 0); } } @@ -2607,6 +2637,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, set_bit(R5_Wantcompute, &sh->dev[sh->pd_idx].flags); sh->ops.target = sh->pd_idx; + sh->ops.target2 = -1; s->uptodate++; } } @@ -2626,91 +2657,163 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, struct r6_state *r6s, int disks) { - int update_p = 0, update_q = 0; - struct r5dev *dev; int pd_idx = sh->pd_idx; int qd_idx = sh->qd_idx; - unsigned long cpu; - struct page *tmp_page; + struct r5dev *dev; set_bit(STRIPE_HANDLE, &sh->state); BUG_ON(s->failed > 2); - BUG_ON(s->uptodate < disks); + /* Want to check and possibly repair P and Q. * However there could be one 'failed' device, in which * case we can only check one of them, possibly using the * other to generate missing data */ - cpu = get_cpu(); - tmp_page = per_cpu_ptr(conf->percpu, cpu)->spare_page; - if (s->failed == r6s->q_failed) { - /* The only possible failed device holds 'Q', so it - * makes sense to check P (If anything else were failed, - * we would have used P to recreate it). - */ - compute_block_1(sh, pd_idx, 1); - if (!page_is_zero(sh->dev[pd_idx].page)) { - compute_block_1(sh, pd_idx, 0); - update_p = 1; + + switch (sh->check_state) { + case check_state_idle: + /* start a new check operation if there are < 2 failures */ + if (s->failed == r6s->q_failed) { + /* The only possible failed device holds Q, so it + * makes sense to check P (If anything else were failed, + * we would have used P to recreate it). + */ + sh->check_state = check_state_run; } - } - if (!r6s->q_failed && s->failed < 2) { - /* q is not failed, and we didn't use it to generate - * anything, so it makes sense to check it - */ - memcpy(page_address(tmp_page), - page_address(sh->dev[qd_idx].page), - STRIPE_SIZE); - compute_parity6(sh, UPDATE_PARITY); - if (memcmp(page_address(tmp_page), - page_address(sh->dev[qd_idx].page), - STRIPE_SIZE) != 0) { - clear_bit(STRIPE_INSYNC, &sh->state); - update_q = 1; + if (!r6s->q_failed && s->failed < 2) { + /* Q is not failed, and we didn't use it to generate + * anything, so it makes sense to check it + */ + if (sh->check_state == check_state_run) + sh->check_state = check_state_run_pq; + else + sh->check_state = check_state_run_q; } - } - put_cpu(); - if (update_p || update_q) { - conf->mddev->resync_mismatches += STRIPE_SECTORS; - if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) - /* don't try to repair!! */ - update_p = update_q = 0; - } + /* discard potentially stale zero_sum_result */ + sh->ops.zero_sum_result = 0; - /* now write out any block on a failed drive, - * or P or Q if they need it - */ + if (sh->check_state == check_state_run) { + /* async_xor_zero_sum destroys the contents of P */ + clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); + s->uptodate--; + } + if (sh->check_state >= check_state_run && + sh->check_state <= check_state_run_pq) { + /* async_syndrome_zero_sum preserves P and Q, so + * no need to mark them !uptodate here + */ + set_bit(STRIPE_OP_CHECK, &s->ops_request); + break; + } - if (s->failed == 2) { - dev = &sh->dev[r6s->failed_num[1]]; - s->locked++; - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantwrite, &dev->flags); - } - if (s->failed >= 1) { - dev = &sh->dev[r6s->failed_num[0]]; - s->locked++; - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantwrite, &dev->flags); - } + /* we have 2-disk failure */ + BUG_ON(s->failed != 2); + /* fall through */ + case check_state_compute_result: + sh->check_state = check_state_idle; - if (update_p) { - dev = &sh->dev[pd_idx]; - s->locked++; - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantwrite, &dev->flags); - } - if (update_q) { - dev = &sh->dev[qd_idx]; - s->locked++; - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantwrite, &dev->flags); - } - clear_bit(STRIPE_DEGRADED, &sh->state); + /* check that a write has not made the stripe insync */ + if (test_bit(STRIPE_INSYNC, &sh->state)) + break; - set_bit(STRIPE_INSYNC, &sh->state); + /* now write out any block on a failed drive, + * or P or Q if they were recomputed + */ + BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ + if (s->failed == 2) { + dev = &sh->dev[r6s->failed_num[1]]; + s->locked++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + } + if (s->failed >= 1) { + dev = &sh->dev[r6s->failed_num[0]]; + s->locked++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + } + if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { + dev = &sh->dev[pd_idx]; + s->locked++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + } + if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { + dev = &sh->dev[qd_idx]; + s->locked++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + } + clear_bit(STRIPE_DEGRADED, &sh->state); + + set_bit(STRIPE_INSYNC, &sh->state); + break; + case check_state_run: + case check_state_run_q: + case check_state_run_pq: + break; /* we will be called again upon completion */ + case check_state_check_result: + sh->check_state = check_state_idle; + + /* handle a successful check operation, if parity is correct + * we are done. Otherwise update the mismatch count and repair + * parity if !MD_RECOVERY_CHECK + */ + if (sh->ops.zero_sum_result == 0) { + /* both parities are correct */ + if (!s->failed) + set_bit(STRIPE_INSYNC, &sh->state); + else { + /* in contrast to the raid5 case we can validate + * parity, but still have a failure to write + * back + */ + sh->check_state = check_state_compute_result; + /* Returning at this point means that we may go + * off and bring p and/or q uptodate again so + * we make sure to check zero_sum_result again + * to verify if p or q need writeback + */ + } + } else { + conf->mddev->resync_mismatches += STRIPE_SECTORS; + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + int *target = &sh->ops.target; + + sh->ops.target = -1; + sh->ops.target2 = -1; + sh->check_state = check_state_compute_run; + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { + set_bit(R5_Wantcompute, + &sh->dev[pd_idx].flags); + *target = pd_idx; + target = &sh->ops.target2; + s->uptodate++; + } + if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { + set_bit(R5_Wantcompute, + &sh->dev[qd_idx].flags); + *target = qd_idx; + s->uptodate++; + } + } + } + break; + case check_state_compute_run: + break; + default: + printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", + __func__, sh->check_state, + (unsigned long long) sh->sector); + BUG(); + } } static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, @@ -2732,7 +2835,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, sector_t bn = compute_blocknr(sh, i, 1); sector_t s = raid5_compute_sector(conf, bn, 0, &dd_idx, NULL); - sh2 = get_active_stripe(conf, s, 0, 1); + sh2 = get_active_stripe(conf, s, 0, 1, 1); if (sh2 == NULL) /* so far only the early blocks of this stripe * have been requested. When later blocks @@ -3006,7 +3109,7 @@ static bool handle_stripe5(struct stripe_head *sh) /* Finish reconstruct operations initiated by the expansion process */ if (sh->reconstruct_state == reconstruct_state_result) { struct stripe_head *sh2 - = get_active_stripe(conf, sh->sector, 1, 1); + = get_active_stripe(conf, sh->sector, 1, 1, 1); if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { /* sh cannot be written until sh2 has been read. * so arrange for sh to be delayed a little @@ -3036,7 +3139,7 @@ static bool handle_stripe5(struct stripe_head *sh) /* Need to write out all blocks after computing parity */ sh->disks = conf->raid_disks; stripe_set_idx(sh->sector, conf, 0, sh); - schedule_reconstruction5(sh, &s, 1, 1); + schedule_reconstruction(sh, &s, 1, 1); } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); @@ -3056,7 +3159,7 @@ static bool handle_stripe5(struct stripe_head *sh) md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); if (s.ops_request) - raid5_run_ops(sh, s.ops_request); + raid_run_ops(sh, s.ops_request); ops_run_io(sh, &s); @@ -3077,9 +3180,10 @@ static bool handle_stripe6(struct stripe_head *sh) mdk_rdev_t *blocked_rdev = NULL; pr_debug("handling stripe %llu, state=%#lx cnt=%d, " - "pd_idx=%d, qd_idx=%d\n", + "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, - atomic_read(&sh->count), pd_idx, qd_idx); + atomic_read(&sh->count), pd_idx, qd_idx, + sh->check_state, sh->reconstruct_state); memset(&s, 0, sizeof(s)); spin_lock(&sh->lock); @@ -3099,35 +3203,24 @@ static bool handle_stripe6(struct stripe_head *sh) pr_debug("check %d: state 0x%lx read %p write %p written %p\n", i, dev->flags, dev->toread, dev->towrite, dev->written); - /* maybe we can reply to a read */ - if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { - struct bio *rbi, *rbi2; - pr_debug("Return read for disc %d\n", i); - spin_lock_irq(&conf->device_lock); - rbi = dev->toread; - dev->toread = NULL; - if (test_and_clear_bit(R5_Overlap, &dev->flags)) - wake_up(&conf->wait_for_overlap); - spin_unlock_irq(&conf->device_lock); - while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { - copy_data(0, rbi, dev->page, dev->sector); - rbi2 = r5_next_bio(rbi, dev->sector); - spin_lock_irq(&conf->device_lock); - if (!raid5_dec_bi_phys_segments(rbi)) { - rbi->bi_next = return_bi; - return_bi = rbi; - } - spin_unlock_irq(&conf->device_lock); - rbi = rbi2; - } - } + /* maybe we can reply to a read + * + * new wantfill requests are only permitted while + * ops_complete_biofill is guaranteed to be inactive + */ + if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && + !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) + set_bit(R5_Wantfill, &dev->flags); /* now count some things */ if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; + if (test_bit(R5_Wantcompute, &dev->flags)) + BUG_ON(++s.compute > 2); - - if (dev->toread) + if (test_bit(R5_Wantfill, &dev->flags)) { + s.to_fill++; + } else if (dev->toread) s.to_read++; if (dev->towrite) { s.to_write++; @@ -3168,6 +3261,11 @@ static bool handle_stripe6(struct stripe_head *sh) blocked_rdev = NULL; } + if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { + set_bit(STRIPE_OP_BIOFILL, &s.ops_request); + set_bit(STRIPE_BIOFILL_RUN, &sh->state); + } + pr_debug("locked=%d uptodate=%d to_read=%d" " to_write=%d failed=%d failed_num=%d,%d\n", s.locked, s.uptodate, s.to_read, s.to_write, s.failed, @@ -3208,18 +3306,61 @@ static bool handle_stripe6(struct stripe_head *sh) * or to load a block that is being partially written. */ if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || - (s.syncing && (s.uptodate < disks)) || s.expanding) + (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) handle_stripe_fill6(sh, &s, &r6s, disks); - /* now to consider writing and what else, if anything should be read */ - if (s.to_write) + /* Now we check to see if any write operations have recently + * completed + */ + if (sh->reconstruct_state == reconstruct_state_drain_result) { + int qd_idx = sh->qd_idx; + + sh->reconstruct_state = reconstruct_state_idle; + /* All the 'written' buffers and the parity blocks are ready to + * be written back to disk + */ + BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); + BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags)); + for (i = disks; i--; ) { + dev = &sh->dev[i]; + if (test_bit(R5_LOCKED, &dev->flags) && + (i == sh->pd_idx || i == qd_idx || + dev->written)) { + pr_debug("Writing block %d\n", i); + BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); + set_bit(R5_Wantwrite, &dev->flags); + if (!test_bit(R5_Insync, &dev->flags) || + ((i == sh->pd_idx || i == qd_idx) && + s.failed == 0)) + set_bit(STRIPE_INSYNC, &sh->state); + } + } + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + atomic_dec(&conf->preread_active_stripes); + if (atomic_read(&conf->preread_active_stripes) < + IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + } + } + + /* Now to consider new write requests and what else, if anything + * should be read. We do not handle new writes when: + * 1/ A 'write' operation (copy+gen_syndrome) is already in flight. + * 2/ A 'check' operation is in flight, as it may clobber the parity + * block. + */ + if (s.to_write && !sh->reconstruct_state && !sh->check_state) handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); /* maybe we need to check and possibly fix the parity for this stripe * Any reads will already have been scheduled, so we just see if enough - * data is available + * data is available. The parity check is held off while parity + * dependent operations are in flight. */ - if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) + if (sh->check_state || + (s.syncing && s.locked == 0 && + !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && + !test_bit(STRIPE_INSYNC, &sh->state))) handle_parity_checks6(conf, sh, &s, &r6s, disks); if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { @@ -3241,17 +3382,31 @@ static bool handle_stripe6(struct stripe_head *sh) set_bit(R5_Wantwrite, &dev->flags); set_bit(R5_ReWrite, &dev->flags); set_bit(R5_LOCKED, &dev->flags); + s.locked++; } else { /* let's read it back */ set_bit(R5_Wantread, &dev->flags); set_bit(R5_LOCKED, &dev->flags); + s.locked++; } } } - if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { + /* Finish reconstruct operations initiated by the expansion process */ + if (sh->reconstruct_state == reconstruct_state_result) { + sh->reconstruct_state = reconstruct_state_idle; + clear_bit(STRIPE_EXPANDING, &sh->state); + for (i = conf->raid_disks; i--; ) { + set_bit(R5_Wantwrite, &sh->dev[i].flags); + set_bit(R5_LOCKED, &sh->dev[i].flags); + s.locked++; + } + } + + if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && + !sh->reconstruct_state) { struct stripe_head *sh2 - = get_active_stripe(conf, sh->sector, 1, 1); + = get_active_stripe(conf, sh->sector, 1, 1, 1); if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { /* sh cannot be written until sh2 has been read. * so arrange for sh to be delayed a little @@ -3270,14 +3425,8 @@ static bool handle_stripe6(struct stripe_head *sh) /* Need to write out all blocks after computing P&Q */ sh->disks = conf->raid_disks; stripe_set_idx(sh->sector, conf, 0, sh); - compute_parity6(sh, RECONSTRUCT_WRITE); - for (i = conf->raid_disks ; i-- ; ) { - set_bit(R5_LOCKED, &sh->dev[i].flags); - s.locked++; - set_bit(R5_Wantwrite, &sh->dev[i].flags); - } - clear_bit(STRIPE_EXPANDING, &sh->state); - } else if (s.expanded) { + schedule_reconstruction(sh, &s, 1, 1); + } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_overlap); @@ -3295,6 +3444,9 @@ static bool handle_stripe6(struct stripe_head *sh) if (unlikely(blocked_rdev)) md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); + if (s.ops_request) + raid_run_ops(sh, s.ops_request); + ops_run_io(sh, &s); return_io(return_bi); @@ -3348,7 +3500,7 @@ static void unplug_slaves(mddev_t *mddev) int i; rcu_read_lock(); - for (i=0; i<mddev->raid_disks; i++) { + for (i = 0; i < conf->raid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { struct request_queue *r_queue = bdev_get_queue(rdev->bdev); @@ -3735,7 +3887,7 @@ static int make_request(struct request_queue *q, struct bio * bi) (unsigned long long)logical_sector); sh = get_active_stripe(conf, new_sector, previous, - (bi->bi_rw&RWA_MASK)); + (bi->bi_rw&RWA_MASK), 0); if (sh) { if (unlikely(previous)) { /* expansion might have moved on while waiting for a @@ -3871,13 +4023,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped safepos = conf->reshape_safe; sector_div(safepos, data_disks); if (mddev->delta_disks < 0) { - writepos -= reshape_sectors; + writepos -= min_t(sector_t, reshape_sectors, writepos); readpos += reshape_sectors; safepos += reshape_sectors; } else { writepos += reshape_sectors; - readpos -= reshape_sectors; - safepos -= reshape_sectors; + readpos -= min_t(sector_t, reshape_sectors, readpos); + safepos -= min_t(sector_t, reshape_sectors, safepos); } /* 'writepos' is the most advanced device address we might write. @@ -3905,6 +4057,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes)==0); mddev->reshape_position = conf->reshape_progress; + mddev->curr_resync_completed = mddev->curr_resync; conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); @@ -3914,6 +4067,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } if (mddev->delta_disks < 0) { @@ -3931,7 +4085,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { int j; int skipped = 0; - sh = get_active_stripe(conf, stripe_addr+i, 0, 0); + sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); set_bit(STRIPE_EXPANDING, &sh->state); atomic_inc(&conf->reshape_stripes); /* If any of this stripe is beyond the end of the old @@ -3974,13 +4128,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped raid5_compute_sector(conf, stripe_addr*(new_data_disks), 1, &dd_idx, NULL); last_sector = - raid5_compute_sector(conf, ((stripe_addr+conf->chunk_size/512) + raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) *(new_data_disks) - 1), 1, &dd_idx, NULL); if (last_sector >= mddev->dev_sectors) last_sector = mddev->dev_sectors - 1; while (first_sector <= last_sector) { - sh = get_active_stripe(conf, first_sector, 1, 0); + sh = get_active_stripe(conf, first_sector, 1, 0, 1); set_bit(STRIPE_EXPAND_SOURCE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); @@ -3998,11 +4152,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped * then we need to write out the superblock. */ sector_nr += reshape_sectors; - if (sector_nr >= mddev->resync_max) { + if ((sector_nr - mddev->curr_resync_completed) * 2 + >= mddev->resync_max - mddev->curr_resync_completed) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes) == 0); mddev->reshape_position = conf->reshape_progress; + mddev->curr_resync_completed = mddev->curr_resync; conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); @@ -4013,6 +4169,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } return reshape_sectors; } @@ -4077,9 +4234,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski bitmap_cond_end_sync(mddev->bitmap, sector_nr); - sh = get_active_stripe(conf, sector_nr, 0, 1); + sh = get_active_stripe(conf, sector_nr, 0, 1, 0); if (sh == NULL) { - sh = get_active_stripe(conf, sector_nr, 0, 0); + sh = get_active_stripe(conf, sector_nr, 0, 0, 0); /* make sure we don't swamp the stripe cache if someone else * is trying to get access */ @@ -4089,7 +4246,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski * We don't need to check the 'failed' flag as when that gets set, * recovery aborts. */ - for (i=0; i<mddev->raid_disks; i++) + for (i = 0; i < conf->raid_disks; i++) if (conf->disks[i].rdev == NULL) still_degraded = 1; @@ -4141,7 +4298,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) /* already done this stripe */ continue; - sh = get_active_stripe(conf, sector, 0, 1); + sh = get_active_stripe(conf, sector, 0, 1, 0); if (!sh) { /* failed to get a stripe - must wait */ @@ -4172,6 +4329,36 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) return handled; } +#ifdef CONFIG_MULTICORE_RAID456 +static void __process_stripe(void *param, async_cookie_t cookie) +{ + struct stripe_head *sh = param; + + handle_stripe(sh); + release_stripe(sh); +} + +static void process_stripe(struct stripe_head *sh, struct list_head *domain) +{ + async_schedule_domain(__process_stripe, sh, domain); +} + +static void synchronize_stripe_processing(struct list_head *domain) +{ + async_synchronize_full_domain(domain); +} +#else +static void process_stripe(struct stripe_head *sh, struct list_head *domain) +{ + handle_stripe(sh); + release_stripe(sh); + cond_resched(); +} + +static void synchronize_stripe_processing(struct list_head *domain) +{ +} +#endif /* @@ -4186,6 +4373,7 @@ static void raid5d(mddev_t *mddev) struct stripe_head *sh; raid5_conf_t *conf = mddev_to_conf(mddev); int handled; + LIST_HEAD(raid_domain); pr_debug("+++ raid5d active\n"); @@ -4222,8 +4410,7 @@ static void raid5d(mddev_t *mddev) spin_unlock_irq(&conf->device_lock); handled++; - handle_stripe(sh); - release_stripe(sh); + process_stripe(sh, &raid_domain); spin_lock_irq(&conf->device_lock); } @@ -4231,6 +4418,7 @@ static void raid5d(mddev_t *mddev) spin_unlock_irq(&conf->device_lock); + synchronize_stripe_processing(&raid_domain); async_tx_issue_pending_all(); unplug_slaves(mddev); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 75f2c6c4cf90..116d0b44b2a9 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -176,7 +176,9 @@ */ enum check_states { check_state_idle = 0, - check_state_run, /* parity check */ + check_state_run, /* xor parity check */ + check_state_run_q, /* q-parity check */ + check_state_run_pq, /* pq dual parity check */ check_state_check_result, check_state_compute_run, /* parity repair */ check_state_compute_result, @@ -216,7 +218,7 @@ struct stripe_head { * @target - STRIPE_OP_COMPUTE_BLK target */ struct stripe_operations { - int target; + int target, target2; enum sum_check_flags zero_sum_result; } ops; struct r5dev { @@ -299,7 +301,7 @@ struct r6_state { #define STRIPE_OP_COMPUTE_BLK 1 #define STRIPE_OP_PREXOR 2 #define STRIPE_OP_BIODRAIN 3 -#define STRIPE_OP_POSTXOR 4 +#define STRIPE_OP_RECONSTRUCT 4 #define STRIPE_OP_CHECK 5 /* |