diff options
Diffstat (limited to 'drivers/md')
41 files changed, 2424 insertions, 807 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index faa4741df6d3..10f122a3a856 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -277,8 +277,8 @@ config DM_MIRROR needed for live data migration tools such as 'pvmove'. config DM_RAID - tristate "RAID 1/4/5/6 target (EXPERIMENTAL)" - depends on BLK_DEV_DM && EXPERIMENTAL + tristate "RAID 1/4/5/6 target" + depends on BLK_DEV_DM select MD_RAID1 select MD_RAID456 select BLK_DEV_MD @@ -359,8 +359,8 @@ config DM_DELAY If unsure, say N. config DM_UEVENT - bool "DM uevents (EXPERIMENTAL)" - depends on BLK_DEV_DM && EXPERIMENTAL + bool "DM uevents" + depends on BLK_DEV_DM ---help--- Generate udev events for DM events. @@ -370,4 +370,24 @@ config DM_FLAKEY ---help--- A target that intermittently fails I/O for debugging purposes. +config DM_VERITY + tristate "Verity target support (EXPERIMENTAL)" + depends on BLK_DEV_DM && EXPERIMENTAL + select CRYPTO + select CRYPTO_HASH + select DM_BUFIO + ---help--- + This device-mapper target creates a read-only device that + transparently validates the data on one underlying device against + a pre-generated tree of cryptographic checksums stored on a second + device. + + You'll need to activate the digests you're going to use in the + cryptoapi configuration. + + To compile this code as a module, choose M here: the module will + be called dm-verity. + + If unsure, say N. + endif # MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 046860c7a166..8b2e0dffe82e 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -42,6 +42,7 @@ obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o obj-$(CONFIG_DM_ZERO) += dm-zero.o obj-$(CONFIG_DM_RAID) += dm-raid.o obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o +obj-$(CONFIG_DM_VERITY) += dm-verity.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index cdf36b1e9aa6..3d0dfa7a89a2 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -26,6 +26,7 @@ #include <linux/file.h> #include <linux/mount.h> #include <linux/buffer_head.h> +#include <linux/seq_file.h> #include "md.h" #include "bitmap.h" @@ -35,31 +36,6 @@ static inline char *bmname(struct bitmap *bitmap) } /* - * just a placeholder - calls kmalloc for bitmap pages - */ -static unsigned char *bitmap_alloc_page(struct bitmap *bitmap) -{ - unsigned char *page; - - page = kzalloc(PAGE_SIZE, GFP_NOIO); - if (!page) - printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap)); - else - pr_debug("%s: bitmap_alloc_page: allocated page at %p\n", - bmname(bitmap), page); - return page; -} - -/* - * for now just a placeholder -- just calls kfree for bitmap pages - */ -static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page) -{ - pr_debug("%s: bitmap_free_page: free page %p\n", bmname(bitmap), page); - kfree(page); -} - -/* * check a page and, if necessary, allocate it (or hijack it if the alloc fails) * * 1) check to see if this page is allocated, if it's not then try to alloc @@ -96,7 +72,7 @@ __acquires(bitmap->lock) /* this page has not been allocated yet */ spin_unlock_irq(&bitmap->lock); - mappage = bitmap_alloc_page(bitmap); + mappage = kzalloc(PAGE_SIZE, GFP_NOIO); spin_lock_irq(&bitmap->lock); if (mappage == NULL) { @@ -109,7 +85,7 @@ __acquires(bitmap->lock) } else if (bitmap->bp[page].map || bitmap->bp[page].hijacked) { /* somebody beat us to getting the page */ - bitmap_free_page(bitmap, mappage); + kfree(mappage); return 0; } else { @@ -141,7 +117,7 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) ptr = bitmap->bp[page].map; bitmap->bp[page].map = NULL; bitmap->missing_pages++; - bitmap_free_page(bitmap, ptr); + kfree(ptr); } } @@ -171,7 +147,7 @@ static struct page *read_sb_page(struct mddev *mddev, loff_t offset, did_alloc = 1; } - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (! test_bit(In_sync, &rdev->flags) || test_bit(Faulty, &rdev->flags)) continue; @@ -445,19 +421,14 @@ out: void bitmap_update_sb(struct bitmap *bitmap) { bitmap_super_t *sb; - unsigned long flags; if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ return; if (bitmap->mddev->bitmap_info.external) return; - spin_lock_irqsave(&bitmap->lock, flags); - if (!bitmap->sb_page) { /* no superblock */ - spin_unlock_irqrestore(&bitmap->lock, flags); + if (!bitmap->sb_page) /* no superblock */ return; - } - spin_unlock_irqrestore(&bitmap->lock, flags); - sb = kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page); sb->events = cpu_to_le64(bitmap->mddev->events); if (bitmap->mddev->events < bitmap->events_cleared) /* rocking back to read-only */ @@ -467,7 +438,7 @@ void bitmap_update_sb(struct bitmap *bitmap) /* Just in case these have been changed via sysfs: */ sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); - kunmap_atomic(sb, KM_USER0); + kunmap_atomic(sb); write_page(bitmap, bitmap->sb_page, 1); } @@ -478,7 +449,7 @@ void bitmap_print_sb(struct bitmap *bitmap) if (!bitmap || !bitmap->sb_page) return; - sb = kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page); printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); @@ -497,7 +468,7 @@ void bitmap_print_sb(struct bitmap *bitmap) printk(KERN_DEBUG " sync size: %llu KB\n", (unsigned long long)le64_to_cpu(sb->sync_size)/2); printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind)); - kunmap_atomic(sb, KM_USER0); + kunmap_atomic(sb); } /* @@ -525,7 +496,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) } bitmap->sb_page->index = 0; - sb = kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page); sb->magic = cpu_to_le32(BITMAP_MAGIC); sb->version = cpu_to_le32(BITMAP_MAJOR_HI); @@ -533,7 +504,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) chunksize = bitmap->mddev->bitmap_info.chunksize; BUG_ON(!chunksize); if (!is_power_of_2(chunksize)) { - kunmap_atomic(sb, KM_USER0); + kunmap_atomic(sb); printk(KERN_ERR "bitmap chunksize not a power of 2\n"); return -EINVAL; } @@ -571,7 +542,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) bitmap->flags |= BITMAP_HOSTENDIAN; sb->version = cpu_to_le32(BITMAP_MAJOR_HOSTENDIAN); - kunmap_atomic(sb, KM_USER0); + kunmap_atomic(sb); return 0; } @@ -603,7 +574,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) return err; } - sb = kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page); chunksize = le32_to_cpu(sb->chunksize); daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; @@ -632,26 +603,28 @@ static int bitmap_read_sb(struct bitmap *bitmap) /* keep the array size field of the bitmap superblock up to date */ sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); - if (!bitmap->mddev->persistent) - goto success; - - /* - * if we have a persistent array superblock, compare the - * bitmap's UUID and event counter to the mddev's - */ - if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { - printk(KERN_INFO "%s: bitmap superblock UUID mismatch\n", - bmname(bitmap)); - goto out; - } - events = le64_to_cpu(sb->events); - if (events < bitmap->mddev->events) { - printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) " - "-- forcing full recovery\n", bmname(bitmap), events, - (unsigned long long) bitmap->mddev->events); - sb->state |= cpu_to_le32(BITMAP_STALE); + if (bitmap->mddev->persistent) { + /* + * We have a persistent array superblock, so compare the + * bitmap's UUID and event counter to the mddev's + */ + if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { + printk(KERN_INFO + "%s: bitmap superblock UUID mismatch\n", + bmname(bitmap)); + goto out; + } + events = le64_to_cpu(sb->events); + if (events < bitmap->mddev->events) { + printk(KERN_INFO + "%s: bitmap file is out of date (%llu < %llu) " + "-- forcing full recovery\n", + bmname(bitmap), events, + (unsigned long long) bitmap->mddev->events); + sb->state |= cpu_to_le32(BITMAP_STALE); + } } -success: + /* assign fields using values from superblock */ bitmap->mddev->bitmap_info.chunksize = chunksize; bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; @@ -664,7 +637,7 @@ success: bitmap->events_cleared = bitmap->mddev->events; err = 0; out: - kunmap_atomic(sb, KM_USER0); + kunmap_atomic(sb); if (err) bitmap_print_sb(bitmap); return err; @@ -680,16 +653,11 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, enum bitmap_mask_op op) { bitmap_super_t *sb; - unsigned long flags; int old; - spin_lock_irqsave(&bitmap->lock, flags); - if (!bitmap->sb_page) { /* can't set the state */ - spin_unlock_irqrestore(&bitmap->lock, flags); + if (!bitmap->sb_page) /* can't set the state */ return 0; - } - spin_unlock_irqrestore(&bitmap->lock, flags); - sb = kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page); old = le32_to_cpu(sb->state) & bits; switch (op) { case MASK_SET: @@ -703,7 +671,7 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, default: BUG(); } - kunmap_atomic(sb, KM_USER0); + kunmap_atomic(sb); return old; } @@ -870,7 +838,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) unsigned long bit; struct page *page; void *kaddr; - unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap); + unsigned long chunk = block >> bitmap->chunkshift; if (!bitmap->filemap) return; @@ -881,12 +849,12 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) bit = file_page_offset(bitmap, chunk); /* set the bit */ - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); if (bitmap->flags & BITMAP_HOSTENDIAN) set_bit(bit, kaddr); else __set_bit_le(bit, kaddr); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); pr_debug("set file bit %lu page %lu\n", bit, page->index); /* record page number so it gets flushed to disk when unplug occurs */ set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); @@ -1050,10 +1018,10 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) * if bitmap is out of date, dirty the * whole page and write it out */ - paddr = kmap_atomic(page, KM_USER0); + paddr = kmap_atomic(page); memset(paddr + offset, 0xff, PAGE_SIZE - offset); - kunmap_atomic(paddr, KM_USER0); + kunmap_atomic(paddr); write_page(bitmap, page, 1); ret = -EIO; @@ -1061,18 +1029,18 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) goto err; } } - paddr = kmap_atomic(page, KM_USER0); + paddr = kmap_atomic(page); if (bitmap->flags & BITMAP_HOSTENDIAN) b = test_bit(bit, paddr); else b = test_bit_le(bit, paddr); - kunmap_atomic(paddr, KM_USER0); + kunmap_atomic(paddr); if (b) { /* if the disk bit is set, set the memory bit */ - int needed = ((sector_t)(i+1) << (CHUNK_BLOCK_SHIFT(bitmap)) + int needed = ((sector_t)(i+1) << bitmap->chunkshift >= start); bitmap_set_memory_bits(bitmap, - (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap), + (sector_t)i << bitmap->chunkshift, needed); bit_cnt++; } @@ -1116,7 +1084,7 @@ void bitmap_write_all(struct bitmap *bitmap) static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) { - sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap); + sector_t chunk = offset >> bitmap->chunkshift; unsigned long page = chunk >> PAGE_COUNTER_SHIFT; bitmap->bp[page].count += inc; bitmap_checkfree(bitmap, page); @@ -1209,10 +1177,10 @@ void bitmap_daemon_work(struct mddev *mddev) mddev->bitmap_info.external == 0) { bitmap_super_t *sb; bitmap->need_sync = 0; - sb = kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page); sb->events_cleared = cpu_to_le64(bitmap->events_cleared); - kunmap_atomic(sb, KM_USER0); + kunmap_atomic(sb); write_page(bitmap, bitmap->sb_page, 1); } spin_lock_irqsave(&bitmap->lock, flags); @@ -1222,7 +1190,7 @@ void bitmap_daemon_work(struct mddev *mddev) bitmap->allclean = 0; } bmc = bitmap_get_counter(bitmap, - (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), + (sector_t)j << bitmap->chunkshift, &blocks, 0); if (!bmc) j |= PAGE_COUNTER_MASK; @@ -1231,11 +1199,11 @@ void bitmap_daemon_work(struct mddev *mddev) /* we can clear the bit */ *bmc = 0; bitmap_count_page(bitmap, - (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), + (sector_t)j << bitmap->chunkshift, -1); /* clear the bit */ - paddr = kmap_atomic(page, KM_USER0); + paddr = kmap_atomic(page); if (bitmap->flags & BITMAP_HOSTENDIAN) clear_bit(file_page_offset(bitmap, j), paddr); @@ -1244,7 +1212,7 @@ void bitmap_daemon_work(struct mddev *mddev) file_page_offset(bitmap, j), paddr); - kunmap_atomic(paddr, KM_USER0); + kunmap_atomic(paddr); } else if (*bmc <= 2) { *bmc = 1; /* maybe clear the bit next time */ set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); @@ -1285,7 +1253,7 @@ __acquires(bitmap->lock) * The lock must have been taken with interrupts enabled. * If !create, we don't release the lock. */ - sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap); + sector_t chunk = offset >> bitmap->chunkshift; unsigned long page = chunk >> PAGE_COUNTER_SHIFT; unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; sector_t csize; @@ -1295,10 +1263,10 @@ __acquires(bitmap->lock) if (bitmap->bp[page].hijacked || bitmap->bp[page].map == NULL) - csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) + + csize = ((sector_t)1) << (bitmap->chunkshift + PAGE_COUNTER_SHIFT - 1); else - csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap)); + csize = ((sector_t)1) << bitmap->chunkshift; *blocks = csize - (offset & (csize - 1)); if (err < 0) @@ -1424,7 +1392,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto set_page_attr(bitmap, filemap_get_page( bitmap, - offset >> CHUNK_BLOCK_SHIFT(bitmap)), + offset >> bitmap->chunkshift), BITMAP_PAGE_PENDING); bitmap->allclean = 0; } @@ -1512,7 +1480,7 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i else { if (*bmc <= 2) { set_page_attr(bitmap, - filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)), + filemap_get_page(bitmap, offset >> bitmap->chunkshift), BITMAP_PAGE_PENDING); bitmap->allclean = 0; } @@ -1559,7 +1527,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) bitmap->mddev->curr_resync_completed = sector; set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); - sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); + sector &= ~((1ULL << bitmap->chunkshift) - 1); s = 0; while (s < sector && s < bitmap->mddev->resync_max_sectors) { bitmap_end_sync(bitmap, s, &blocks, 0); @@ -1589,7 +1557,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n struct page *page; *bmc = 2 | (needed ? NEEDED_MASK : 0); bitmap_count_page(bitmap, offset, 1); - page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)); + page = filemap_get_page(bitmap, offset >> bitmap->chunkshift); set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); bitmap->allclean = 0; } @@ -1602,7 +1570,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) unsigned long chunk; for (chunk = s; chunk <= e; chunk++) { - sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap); + sector_t sec = (sector_t)chunk << bitmap->chunkshift; bitmap_set_memory_bits(bitmap, sec, 1); spin_lock_irq(&bitmap->lock); bitmap_file_set_bit(bitmap, sec); @@ -1759,11 +1727,12 @@ int bitmap_create(struct mddev *mddev) goto error; bitmap->daemon_lastrun = jiffies; - bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize); + bitmap->chunkshift = (ffz(~mddev->bitmap_info.chunksize) + - BITMAP_BLOCK_SHIFT); /* now that chunksize and chunkshift are set, we can use these macros */ - chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >> - CHUNK_BLOCK_SHIFT(bitmap); + chunks = (blocks + bitmap->chunkshift - 1) >> + bitmap->chunkshift; pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO; BUG_ON(!pages); @@ -1836,6 +1805,33 @@ out: } EXPORT_SYMBOL_GPL(bitmap_load); +void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) +{ + unsigned long chunk_kb; + unsigned long flags; + + if (!bitmap) + return; + + spin_lock_irqsave(&bitmap->lock, flags); + chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10; + seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " + "%lu%s chunk", + bitmap->pages - bitmap->missing_pages, + bitmap->pages, + (bitmap->pages - bitmap->missing_pages) + << (PAGE_SHIFT - 10), + chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize, + chunk_kb ? "KB" : "B"); + if (bitmap->file) { + seq_printf(seq, ", file: "); + seq_path(seq, &bitmap->file->f_path, " \t\n"); + } + + seq_printf(seq, "\n"); + spin_unlock_irqrestore(&bitmap->lock, flags); +} + static ssize_t location_show(struct mddev *mddev, char *page) { @@ -1904,6 +1900,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->pers) { mddev->pers->quiesce(mddev, 1); rv = bitmap_create(mddev); + if (!rv) + rv = bitmap_load(mddev); if (rv) { bitmap_destroy(mddev); mddev->bitmap_info.offset = 0; diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index a15436dd9b3e..55ca5aec84e4 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -13,8 +13,6 @@ #define BITMAP_MAJOR_HI 4 #define BITMAP_MAJOR_HOSTENDIAN 3 -#define BITMAP_MINOR 39 - /* * in-memory bitmap: * @@ -101,21 +99,10 @@ typedef __u16 bitmap_counter_t; /* same, except a mask value for more efficient bitops */ #define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) -#define BITMAP_BLOCK_SIZE 512 #define BITMAP_BLOCK_SHIFT 9 /* how many blocks per chunk? (this is variable) */ #define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT) -#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) -#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) - -/* when hijacked, the counters and bits represent even larger "chunks" */ -/* there will be 1024 chunks represented by each counter in the page pointers */ -#define PAGEPTR_BLOCK_RATIO(bitmap) \ - (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1) -#define PAGEPTR_BLOCK_SHIFT(bitmap) \ - (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) -#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) #endif @@ -181,12 +168,6 @@ struct bitmap_page { unsigned int count:31; }; -/* keep track of bitmap file pages that have pending writes on them */ -struct page_list { - struct list_head list; - struct page *page; -}; - /* the main bitmap structure - one per mddev */ struct bitmap { struct bitmap_page *bp; @@ -196,7 +177,7 @@ struct bitmap { struct mddev *mddev; /* the md device that the bitmap is for */ /* bitmap chunksize -- how much data does each bit represent? */ - unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ + unsigned long chunkshift; /* chunksize = 2^(chunkshift+9) (for bitops) */ unsigned long chunks; /* total number of data chunks for the array */ __u64 events_cleared; @@ -245,6 +226,7 @@ void bitmap_destroy(struct mddev *mddev); void bitmap_print_sb(struct bitmap *bitmap); void bitmap_update_sb(struct bitmap *bitmap); +void bitmap_status(struct seq_file *seq, struct bitmap *bitmap); int bitmap_setallbits(struct bitmap *bitmap); void bitmap_write_all(struct bitmap *bitmap); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 0a6806f80ab5..cc06a1e52423 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -12,7 +12,6 @@ #include <linux/dm-io.h> #include <linux/slab.h> #include <linux/vmalloc.h> -#include <linux/version.h> #include <linux/shrinker.h> #include <linux/module.h> @@ -579,7 +578,7 @@ static void write_endio(struct bio *bio, int error) struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); b->write_error = error; - if (error) { + if (unlikely(error)) { struct dm_bufio_client *c = b->c; (void)cmpxchg(&c->async_write_error, 0, error); } @@ -698,13 +697,20 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c) dm_bufio_lock(c); } +enum new_flag { + NF_FRESH = 0, + NF_READ = 1, + NF_GET = 2, + NF_PREFETCH = 3 +}; + /* * Allocate a new buffer. If the allocation is not possible, wait until * some other thread frees a buffer. * * May drop the lock and regain it. */ -static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c) +static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf) { struct dm_buffer *b; @@ -727,6 +733,9 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client return b; } + if (nf == NF_PREFETCH) + return NULL; + if (!list_empty(&c->reserved_buffers)) { b = list_entry(c->reserved_buffers.next, struct dm_buffer, lru_list); @@ -744,9 +753,12 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client } } -static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c) +static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf) { - struct dm_buffer *b = __alloc_buffer_wait_no_callback(c); + struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf); + + if (!b) + return NULL; if (c->alloc_callback) c->alloc_callback(b); @@ -866,32 +878,23 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) * Getting a buffer *--------------------------------------------------------------*/ -enum new_flag { - NF_FRESH = 0, - NF_READ = 1, - NF_GET = 2 -}; - static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, - enum new_flag nf, struct dm_buffer **bp, - int *need_submit) + enum new_flag nf, int *need_submit) { struct dm_buffer *b, *new_b = NULL; *need_submit = 0; b = __find(c, block); - if (b) { - b->hold_count++; - __relink_lru(b, test_bit(B_DIRTY, &b->state) || - test_bit(B_WRITING, &b->state)); - return b; - } + if (b) + goto found_buffer; if (nf == NF_GET) return NULL; - new_b = __alloc_buffer_wait(c); + new_b = __alloc_buffer_wait(c, nf); + if (!new_b) + return NULL; /* * We've had a period where the mutex was unlocked, so need to @@ -900,10 +903,7 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, b = __find(c, block); if (b) { __free_buffer_wake(new_b); - b->hold_count++; - __relink_lru(b, test_bit(B_DIRTY, &b->state) || - test_bit(B_WRITING, &b->state)); - return b; + goto found_buffer; } __check_watermark(c); @@ -923,6 +923,24 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, *need_submit = 1; return b; + +found_buffer: + if (nf == NF_PREFETCH) + return NULL; + /* + * Note: it is essential that we don't wait for the buffer to be + * read if dm_bufio_get function is used. Both dm_bufio_get and + * dm_bufio_prefetch can be used in the driver request routine. + * If the user called both dm_bufio_prefetch and dm_bufio_get on + * the same buffer, it would deadlock if we waited. + */ + if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state))) + return NULL; + + b->hold_count++; + __relink_lru(b, test_bit(B_DIRTY, &b->state) || + test_bit(B_WRITING, &b->state)); + return b; } /* @@ -957,10 +975,10 @@ static void *new_read(struct dm_bufio_client *c, sector_t block, struct dm_buffer *b; dm_bufio_lock(c); - b = __bufio_new(c, block, nf, bp, &need_submit); + b = __bufio_new(c, block, nf, &need_submit); dm_bufio_unlock(c); - if (!b || IS_ERR(b)) + if (!b) return b; if (need_submit) @@ -1006,13 +1024,47 @@ void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, } EXPORT_SYMBOL_GPL(dm_bufio_new); +void dm_bufio_prefetch(struct dm_bufio_client *c, + sector_t block, unsigned n_blocks) +{ + struct blk_plug plug; + + blk_start_plug(&plug); + dm_bufio_lock(c); + + for (; n_blocks--; block++) { + int need_submit; + struct dm_buffer *b; + b = __bufio_new(c, block, NF_PREFETCH, &need_submit); + if (unlikely(b != NULL)) { + dm_bufio_unlock(c); + + if (need_submit) + submit_io(b, READ, b->block, read_endio); + dm_bufio_release(b); + + dm_bufio_cond_resched(); + + if (!n_blocks) + goto flush_plug; + dm_bufio_lock(c); + } + + } + + dm_bufio_unlock(c); + +flush_plug: + blk_finish_plug(&plug); +} +EXPORT_SYMBOL_GPL(dm_bufio_prefetch); + void dm_bufio_release(struct dm_buffer *b) { struct dm_bufio_client *c = b->c; dm_bufio_lock(c); - BUG_ON(test_bit(B_READING, &b->state)); BUG_ON(!b->hold_count); b->hold_count--; @@ -1025,6 +1077,7 @@ void dm_bufio_release(struct dm_buffer *b) * invalid buffer. */ if ((b->read_error || b->write_error) && + !test_bit(B_READING, &b->state) && !test_bit(B_WRITING, &b->state) && !test_bit(B_DIRTY, &b->state)) { __unlink_buffer(b); @@ -1042,6 +1095,8 @@ void dm_bufio_mark_buffer_dirty(struct dm_buffer *b) dm_bufio_lock(c); + BUG_ON(test_bit(B_READING, &b->state)); + if (!test_and_set_bit(B_DIRTY, &b->state)) __relink_lru(b, LIST_DIRTY); diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h index 5c4c3a04e381..b142946a9e32 100644 --- a/drivers/md/dm-bufio.h +++ b/drivers/md/dm-bufio.h @@ -63,6 +63,14 @@ void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, struct dm_buffer **bp); /* + * Prefetch the specified blocks to the cache. + * The function starts to read the blocks and returns without waiting for + * I/O to finish. + */ +void dm_bufio_prefetch(struct dm_bufio_client *c, + sector_t block, unsigned n_blocks); + +/* * Release a reference obtained with dm_bufio_{read,get,new}. The data * pointer and dm_buffer pointer is no longer valid after this call. */ diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 8c2a000cf3f5..3f06df59fd82 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -176,7 +176,6 @@ struct crypt_config { #define MIN_IOS 16 #define MIN_POOL_PAGES 32 -#define MIN_BIO_PAGES 8 static struct kmem_cache *_crypt_io_pool; @@ -590,9 +589,9 @@ static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, int r = 0; if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { - src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0); + src = kmap_atomic(sg_page(&dmreq->sg_in)); r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset); - kunmap_atomic(src, KM_USER0); + kunmap_atomic(src); } else memset(iv, 0, cc->iv_size); @@ -608,14 +607,14 @@ static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) return 0; - dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0); + dst = kmap_atomic(sg_page(&dmreq->sg_out)); r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset); /* Tweak the first block of plaintext sector */ if (!r) crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size); - kunmap_atomic(dst, KM_USER0); + kunmap_atomic(dst); return r; } @@ -848,12 +847,11 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size, } /* - * if additional pages cannot be allocated without waiting, - * return a partially allocated bio, the caller will then try - * to allocate additional bios while submitting this partial bio + * If additional pages cannot be allocated without waiting, + * return a partially-allocated bio. The caller will then try + * to allocate more bios while submitting this partial bio. */ - if (i == (MIN_BIO_PAGES - 1)) - gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT; + gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT; len = (size > PAGE_SIZE) ? PAGE_SIZE : size; @@ -1046,16 +1044,14 @@ static void kcryptd_queue_io(struct dm_crypt_io *io) queue_work(cc->io_queue, &io->work); } -static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, - int error, int async) +static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) { struct bio *clone = io->ctx.bio_out; struct crypt_config *cc = io->target->private; - if (unlikely(error < 0)) { + if (unlikely(io->error < 0)) { crypt_free_buffer_pages(cc, clone); bio_put(clone); - io->error = -EIO; crypt_dec_pending(io); return; } @@ -1106,12 +1102,16 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) sector += bio_sectors(clone); crypt_inc_pending(io); + r = crypt_convert(cc, &io->ctx); + if (r < 0) + io->error = -EIO; + crypt_finished = atomic_dec_and_test(&io->ctx.pending); /* Encryption was already finished, submit io now */ if (crypt_finished) { - kcryptd_crypt_write_io_submit(io, r, 0); + kcryptd_crypt_write_io_submit(io, 0); /* * If there was an error, do not try next fragments. @@ -1162,11 +1162,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) crypt_dec_pending(io); } -static void kcryptd_crypt_read_done(struct dm_crypt_io *io, int error) +static void kcryptd_crypt_read_done(struct dm_crypt_io *io) { - if (unlikely(error < 0)) - io->error = -EIO; - crypt_dec_pending(io); } @@ -1181,9 +1178,11 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) io->sector); r = crypt_convert(cc, &io->ctx); + if (r < 0) + io->error = -EIO; if (atomic_dec_and_test(&io->ctx.pending)) - kcryptd_crypt_read_done(io, r); + kcryptd_crypt_read_done(io); crypt_dec_pending(io); } @@ -1204,15 +1203,18 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); + if (error < 0) + io->error = -EIO; + mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); if (!atomic_dec_and_test(&ctx->pending)) return; if (bio_data_dir(io->base_bio) == READ) - kcryptd_crypt_read_done(io, error); + kcryptd_crypt_read_done(io); else - kcryptd_crypt_write_io_submit(io, error, 1); + kcryptd_crypt_write_io_submit(io, 1); } static void kcryptd_crypt(struct work_struct *work) @@ -1413,6 +1415,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; char *cipher_api = NULL; int cpu, ret = -EINVAL; + char dummy; /* Convert to crypto api definition? */ if (strchr(cipher_in, '(')) { @@ -1434,7 +1437,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, if (!keycount) cc->tfms_count = 1; - else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 || + else if (sscanf(keycount, "%u%c", &cc->tfms_count, &dummy) != 1 || !is_power_of_2(cc->tfms_count)) { ti->error = "Bad cipher key count specification"; return -EINVAL; @@ -1579,6 +1582,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) int ret; struct dm_arg_set as; const char *opt_string; + char dummy; static struct dm_arg _args[] = { {0, 1, "Invalid number of feature args"}, @@ -1636,7 +1640,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ret = -EINVAL; - if (sscanf(argv[2], "%llu", &tmpll) != 1) { + if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) { ti->error = "Invalid iv_offset sector"; goto bad; } @@ -1647,7 +1651,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - if (sscanf(argv[4], "%llu", &tmpll) != 1) { + if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) { ti->error = "Invalid device sector"; goto bad; } diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index f18375dcedd9..2dc22dddb2ae 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -131,6 +131,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct delay_c *dc; unsigned long long tmpll; + char dummy; if (argc != 3 && argc != 6) { ti->error = "requires exactly 3 or 6 arguments"; @@ -145,13 +146,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) dc->reads = dc->writes = 0; - if (sscanf(argv[1], "%llu", &tmpll) != 1) { + if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) { ti->error = "Invalid device sector"; goto bad; } dc->start_read = tmpll; - if (sscanf(argv[2], "%u", &dc->read_delay) != 1) { + if (sscanf(argv[2], "%u%c", &dc->read_delay, &dummy) != 1) { ti->error = "Invalid delay"; goto bad; } @@ -166,13 +167,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (argc == 3) goto out; - if (sscanf(argv[4], "%llu", &tmpll) != 1) { + if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) { ti->error = "Invalid write device sector"; goto bad_dev_read; } dc->start_write = tmpll; - if (sscanf(argv[5], "%u", &dc->write_delay) != 1) { + if (sscanf(argv[5], "%u%c", &dc->write_delay, &dummy) != 1) { ti->error = "Invalid write delay"; goto bad_dev_read; } diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index 042e71996569..aa70f7d43a1a 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -283,7 +283,7 @@ int dm_exception_store_init(void) return 0; persistent_fail: - dm_persistent_snapshot_exit(); + dm_transient_snapshot_exit(); transient_fail: return r; } diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 9fb18c147825..ac49c01f1a44 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -160,6 +160,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) unsigned long long tmpll; struct dm_arg_set as; const char *devname; + char dummy; as.argc = argc; as.argv = argv; @@ -178,7 +179,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) devname = dm_shift_arg(&as); - if (sscanf(dm_shift_arg(&as), "%llu", &tmpll) != 1) { + if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1) { ti->error = "Invalid device sector"; goto bad; } @@ -323,7 +324,7 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, * Corrupt successful READs while in down state. * If flags were specified, only corrupt those that match. */ - if (!error && bio_submitted_while_down && + if (fc->corrupt_bio_byte && !error && bio_submitted_while_down && (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) && all_corrupt_bio_flags_match(bio, fc)) corrupt_bio_data(bio, fc); diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index ad2eba40e319..ea5dd289fe2a 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -296,6 +296,8 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, unsigned offset; unsigned num_bvecs; sector_t remaining = where->count; + struct request_queue *q = bdev_get_queue(where->bdev); + sector_t discard_sectors; /* * where->count may be zero if rw holds a flush and we need to @@ -305,9 +307,12 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, /* * Allocate a suitably sized-bio. */ - num_bvecs = dm_sector_div_up(remaining, - (PAGE_SIZE >> SECTOR_SHIFT)); - num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs); + if (rw & REQ_DISCARD) + num_bvecs = 1; + else + num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), + dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT))); + bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); bio->bi_sector = where->sector + (where->count - remaining); bio->bi_bdev = where->bdev; @@ -315,10 +320,14 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, bio->bi_destructor = dm_bio_destructor; store_io_and_region_in_bio(bio, io, region); - /* - * Try and add as many pages as possible. - */ - while (remaining) { + if (rw & REQ_DISCARD) { + discard_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining); + bio->bi_size = discard_sectors << SECTOR_SHIFT; + remaining -= discard_sectors; + } else while (remaining) { + /* + * Try and add as many pages as possible. + */ dp->get_page(dp, &page, &len, &offset); len = min(len, to_bytes(remaining)); if (!bio_add_page(bio, page, len, offset)) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 31c2dc25886d..a1a3e6df17b8 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -880,6 +880,7 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) struct hd_geometry geometry; unsigned long indata[4]; char *geostr = (char *) param + param->data_start; + char dummy; md = find_device(param); if (!md) @@ -891,8 +892,8 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) goto out; } - x = sscanf(geostr, "%lu %lu %lu %lu", indata, - indata + 1, indata + 2, indata + 3); + x = sscanf(geostr, "%lu %lu %lu %lu%c", indata, + indata + 1, indata + 2, indata + 3, &dummy); if (x != 4) { DMWARN("Unable to interpret geometry settings."); @@ -1437,7 +1438,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size) if (!argc) { DMWARN("Empty message received."); - goto out; + goto out_argv; } table = dm_get_live_table(md); diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 9728839f844a..3639eeab6042 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -29,6 +29,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct linear_c *lc; unsigned long long tmp; + char dummy; if (argc != 2) { ti->error = "Invalid argument count"; @@ -41,7 +42,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -ENOMEM; } - if (sscanf(argv[1], "%llu", &tmp) != 1) { + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) { ti->error = "dm-linear: Invalid device sector"; goto bad; } diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 3b52bb72bd1f..65ebaebf502b 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -369,6 +369,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, unsigned int region_count; size_t bitset_size, buf_size; int r; + char dummy; if (argc < 1 || argc > 2) { DMWARN("wrong number of arguments to dirty region log"); @@ -387,7 +388,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, } } - if (sscanf(argv[0], "%u", ®ion_size) != 1 || + if (sscanf(argv[0], "%u%c", ®ion_size, &dummy) != 1 || !_check_region_size(ti, region_size)) { DMWARN("invalid region size %s", argv[0]); return -EINVAL; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 801d92d237cf..922a3385eead 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -226,6 +226,27 @@ static void free_multipath(struct multipath *m) kfree(m); } +static int set_mapinfo(struct multipath *m, union map_info *info) +{ + struct dm_mpath_io *mpio; + + mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); + if (!mpio) + return -ENOMEM; + + memset(mpio, 0, sizeof(*mpio)); + info->ptr = mpio; + + return 0; +} + +static void clear_mapinfo(struct multipath *m, union map_info *info) +{ + struct dm_mpath_io *mpio = info->ptr; + + info->ptr = NULL; + mempool_free(mpio, m->mpio_pool); +} /*----------------------------------------------- * Path selection @@ -341,13 +362,14 @@ static int __must_push_back(struct multipath *m) } static int map_io(struct multipath *m, struct request *clone, - struct dm_mpath_io *mpio, unsigned was_queued) + union map_info *map_context, unsigned was_queued) { int r = DM_MAPIO_REMAPPED; size_t nr_bytes = blk_rq_bytes(clone); unsigned long flags; struct pgpath *pgpath; struct block_device *bdev; + struct dm_mpath_io *mpio = map_context->ptr; spin_lock_irqsave(&m->lock, flags); @@ -423,7 +445,6 @@ static void dispatch_queued_ios(struct multipath *m) { int r; unsigned long flags; - struct dm_mpath_io *mpio; union map_info *info; struct request *clone, *n; LIST_HEAD(cl); @@ -436,16 +457,15 @@ static void dispatch_queued_ios(struct multipath *m) list_del_init(&clone->queuelist); info = dm_get_rq_mapinfo(clone); - mpio = info->ptr; - r = map_io(m, clone, mpio, 1); + r = map_io(m, clone, info, 1); if (r < 0) { - mempool_free(mpio, m->mpio_pool); + clear_mapinfo(m, info); dm_kill_unmapped_request(clone, r); } else if (r == DM_MAPIO_REMAPPED) dm_dispatch_request(clone); else if (r == DM_MAPIO_REQUEUE) { - mempool_free(mpio, m->mpio_pool); + clear_mapinfo(m, info); dm_requeue_unmapped_request(clone); } } @@ -908,20 +928,16 @@ static int multipath_map(struct dm_target *ti, struct request *clone, union map_info *map_context) { int r; - struct dm_mpath_io *mpio; struct multipath *m = (struct multipath *) ti->private; - mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); - if (!mpio) + if (set_mapinfo(m, map_context) < 0) /* ENOMEM, requeue */ return DM_MAPIO_REQUEUE; - memset(mpio, 0, sizeof(*mpio)); - map_context->ptr = mpio; clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; - r = map_io(m, clone, mpio, 0); + r = map_io(m, clone, map_context, 0); if (r < 0 || r == DM_MAPIO_REQUEUE) - mempool_free(mpio, m->mpio_pool); + clear_mapinfo(m, map_context); return r; } @@ -1054,8 +1070,9 @@ static int switch_pg_num(struct multipath *m, const char *pgstr) struct priority_group *pg; unsigned pgnum; unsigned long flags; + char dummy; - if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || + if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || (pgnum > m->nr_priority_groups)) { DMWARN("invalid PG number supplied to switch_pg_num"); return -EINVAL; @@ -1085,8 +1102,9 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed) { struct priority_group *pg; unsigned pgnum; + char dummy; - if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || + if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || (pgnum > m->nr_priority_groups)) { DMWARN("invalid PG number supplied to bypass_pg"); return -EINVAL; @@ -1261,13 +1279,15 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, struct path_selector *ps; int r; + BUG_ON(!mpio); + r = do_end_io(m, clone, error, mpio); if (pgpath) { ps = &pgpath->pg->ps; if (ps->type->end_io) ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); } - mempool_free(mpio, m->mpio_pool); + clear_mapinfo(m, map_context); return r; } diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c index 03a837aa5ce6..3941fae0de9f 100644 --- a/drivers/md/dm-queue-length.c +++ b/drivers/md/dm-queue-length.c @@ -112,6 +112,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path, struct selector *s = ps->context; struct path_info *pi; unsigned repeat_count = QL_MIN_IO; + char dummy; /* * Arguments: [<repeat_count>] @@ -123,7 +124,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path, return -EINVAL; } - if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { + if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { *error = "queue-length ps: invalid repeat count"; return -EINVAL; } diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 86cb7e5d83d5..b0ba52459ed7 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -604,7 +604,9 @@ static int read_disk_sb(struct md_rdev *rdev, int size) return 0; if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { - DMERR("Failed to read device superblock"); + DMERR("Failed to read superblock of device at position %d", + rdev->raid_disk); + set_bit(Faulty, &rdev->flags); return -EINVAL; } @@ -615,14 +617,14 @@ static int read_disk_sb(struct md_rdev *rdev, int size) static void super_sync(struct mddev *mddev, struct md_rdev *rdev) { - struct md_rdev *r, *t; + struct md_rdev *r; uint64_t failed_devices; struct dm_raid_superblock *sb; sb = page_address(rdev->sb_page); failed_devices = le64_to_cpu(sb->failed_devices); - rdev_for_each(r, t, mddev) + rdev_for_each(r, mddev) if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags)) failed_devices |= (1ULL << r->raid_disk); @@ -668,7 +670,14 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) return ret; sb = page_address(rdev->sb_page); - if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) { + + /* + * Two cases that we want to write new superblocks and rebuild: + * 1) New device (no matching magic number) + * 2) Device specified for rebuild (!In_sync w/ offset == 0) + */ + if ((sb->magic != cpu_to_le32(DM_RAID_MAGIC)) || + (!test_bit(In_sync, &rdev->flags) && !rdev->recovery_offset)) { super_sync(rdev->mddev, rdev); set_bit(FirstUse, &rdev->flags); @@ -700,7 +709,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) struct dm_raid_superblock *sb; uint32_t new_devs = 0; uint32_t rebuilds = 0; - struct md_rdev *r, *t; + struct md_rdev *r; struct dm_raid_superblock *sb2; sb = page_address(rdev->sb_page); @@ -743,13 +752,10 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) * case the In_sync bit will /not/ be set and * recovery_cp must be MaxSector. */ - rdev_for_each(r, t, mddev) { + rdev_for_each(r, mddev) { if (!test_bit(In_sync, &r->flags)) { - if (!test_bit(FirstUse, &r->flags)) - DMERR("Superblock area of " - "rebuild device %d should have been " - "cleared.", r->raid_disk); - set_bit(FirstUse, &r->flags); + DMINFO("Device %d specified for rebuild: " + "Clearing superblock", r->raid_disk); rebuilds++; } else if (test_bit(FirstUse, &r->flags)) new_devs++; @@ -778,7 +784,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) * Now we set the Faulty bit for those devices that are * recorded in the superblock as failed. */ - rdev_for_each(r, t, mddev) { + rdev_for_each(r, mddev) { if (!r->sb_page) continue; sb2 = page_address(r->sb_page); @@ -851,11 +857,27 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev) static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) { int ret; - struct md_rdev *rdev, *freshest, *tmp; + unsigned redundancy = 0; + struct raid_dev *dev; + struct md_rdev *rdev, *freshest; struct mddev *mddev = &rs->md; + switch (rs->raid_type->level) { + case 1: + redundancy = rs->md.raid_disks - 1; + break; + case 4: + case 5: + case 6: + redundancy = rs->raid_type->parity_devs; + break; + default: + ti->error = "Unknown RAID type"; + return -EINVAL; + } + freshest = NULL; - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each(rdev, mddev) { if (!rdev->meta_bdev) continue; @@ -868,6 +890,37 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) case 0: break; default: + dev = container_of(rdev, struct raid_dev, rdev); + if (redundancy--) { + if (dev->meta_dev) + dm_put_device(ti, dev->meta_dev); + + dev->meta_dev = NULL; + rdev->meta_bdev = NULL; + + if (rdev->sb_page) + put_page(rdev->sb_page); + + rdev->sb_page = NULL; + + rdev->sb_loaded = 0; + + /* + * We might be able to salvage the data device + * even though the meta device has failed. For + * now, we behave as though '- -' had been + * set for this device in the table. + */ + if (dev->data_dev) + dm_put_device(ti, dev->data_dev); + + dev->data_dev = NULL; + rdev->bdev = NULL; + + list_del(&rdev->same_set); + + continue; + } ti->error = "Failed to load superblock"; return ret; } @@ -884,7 +937,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) if (super_validate(mddev, freshest)) return -EINVAL; - rdev_for_each(rdev, tmp, mddev) + rdev_for_each(rdev, mddev) if ((rdev != freshest) && super_validate(mddev, rdev)) return -EINVAL; @@ -971,6 +1024,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) INIT_WORK(&rs->md.event_work, do_table_event); ti->private = rs; + ti->num_flush_requests = 1; mutex_lock(&rs->md.reconfig_mutex); ret = md_run(&rs->md); @@ -1209,7 +1263,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 1, 0}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 9bfd057be686..d039de8322f0 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -924,8 +924,9 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti, unsigned int mirror, char **argv) { unsigned long long offset; + char dummy; - if (sscanf(argv[1], "%llu", &offset) != 1) { + if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1) { ti->error = "Invalid offset"; return -EINVAL; } @@ -953,13 +954,14 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, { unsigned param_count; struct dm_dirty_log *dl; + char dummy; if (argc < 2) { ti->error = "Insufficient mirror log arguments"; return NULL; } - if (sscanf(argv[1], "%u", ¶m_count) != 1) { + if (sscanf(argv[1], "%u%c", ¶m_count, &dummy) != 1) { ti->error = "Invalid mirror log argument count"; return NULL; } @@ -986,13 +988,14 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv, { unsigned num_features; struct dm_target *ti = ms->ti; + char dummy; *args_used = 0; if (!argc) return 0; - if (sscanf(argv[0], "%u", &num_features) != 1) { + if (sscanf(argv[0], "%u%c", &num_features, &dummy) != 1) { ti->error = "Invalid number of features"; return -EINVAL; } @@ -1036,6 +1039,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) unsigned int nr_mirrors, m, args_used; struct mirror_set *ms; struct dm_dirty_log *dl; + char dummy; dl = create_dirty_log(ti, argc, argv, &args_used); if (!dl) @@ -1044,7 +1048,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) argv += args_used; argc -= args_used; - if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 || + if (!argc || sscanf(argv[0], "%u%c", &nr_mirrors, &dummy) != 1 || nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) { ti->error = "Invalid number of mirrors"; dm_dirty_log_destroy(dl); diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index 27f1d423b76c..6ab1192cdd5f 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c @@ -114,6 +114,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path, struct selector *s = (struct selector *) ps->context; struct path_info *pi; unsigned repeat_count = RR_MIN_IO; + char dummy; if (argc > 1) { *error = "round-robin ps: incorrect number of arguments"; @@ -121,7 +122,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path, } /* First path argument is number of I/Os before switching path */ - if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { + if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { *error = "round-robin ps: invalid repeat count"; return -EINVAL; } diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c index 59883bd78214..9df8f6bd6418 100644 --- a/drivers/md/dm-service-time.c +++ b/drivers/md/dm-service-time.c @@ -110,6 +110,7 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path, struct path_info *pi; unsigned repeat_count = ST_MIN_IO; unsigned relative_throughput = 1; + char dummy; /* * Arguments: [<repeat_count> [<relative_throughput>]] @@ -128,13 +129,13 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path, return -EINVAL; } - if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) { + if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { *error = "service-time ps: invalid repeat count"; return -EINVAL; } if ((argc == 2) && - (sscanf(argv[1], "%u", &relative_throughput) != 1 || + (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 || relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { *error = "service-time ps: invalid relative_throughput value"; return -EINVAL; diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 3d80cf0c152d..35c94ff24ad5 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -75,8 +75,9 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc, unsigned int stripe, char **argv) { unsigned long long start; + char dummy; - if (sscanf(argv[1], "%llu", &start) != 1) + if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1) return -EINVAL; if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 63cc54289aff..2e227fbf1622 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -268,8 +268,7 @@ void dm_table_destroy(struct dm_table *t) vfree(t->highs); /* free the device list */ - if (t->devices.next != &t->devices) - free_devices(&t->devices); + free_devices(&t->devices); dm_free_md_mempools(t->mempools); @@ -464,10 +463,11 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, struct dm_dev_internal *dd; unsigned int major, minor; struct dm_table *t = ti->table; + char dummy; BUG_ON(!t); - if (sscanf(path, "%u:%u", &major, &minor) == 2) { + if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) { /* Extract the major/minor numbers */ dev = MKDEV(major, minor); if (MAJOR(dev) != major || MINOR(dev) != minor) @@ -842,9 +842,10 @@ static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, unsigned *value, char **error, unsigned grouped) { const char *arg_str = dm_shift_arg(arg_set); + char dummy; if (!arg_str || - (sscanf(arg_str, "%u", value) != 1) || + (sscanf(arg_str, "%u%c", value, &dummy) != 1) || (*value < arg->min) || (*value > arg->max) || (grouped && arg_set->argc < *value)) { diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 59c4f0446ffa..737d38865b69 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -385,6 +385,7 @@ static int init_pmd(struct dm_pool_metadata *pmd, data_sm = dm_sm_disk_create(tm, nr_blocks); if (IS_ERR(data_sm)) { DMERR("sm_disk_create failed"); + dm_tm_unlock(tm, sblock); r = PTR_ERR(data_sm); goto bad; } @@ -613,7 +614,7 @@ static int __commit_transaction(struct dm_pool_metadata *pmd) if (r < 0) goto out; - r = dm_sm_root_size(pmd->metadata_sm, &data_len); + r = dm_sm_root_size(pmd->data_sm, &data_len); if (r < 0) goto out; @@ -712,6 +713,9 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, if (r) goto bad; + if (bdev_size > THIN_METADATA_MAX_SECTORS) + bdev_size = THIN_METADATA_MAX_SECTORS; + disk_super = dm_block_data(sblock); disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC); disk_super->version = cpu_to_le32(THIN_VERSION); @@ -789,6 +793,11 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd) return 0; } +/* + * __open_device: Returns @td corresponding to device with id @dev, + * creating it if @create is set and incrementing @td->open_count. + * On failure, @td is undefined. + */ static int __open_device(struct dm_pool_metadata *pmd, dm_thin_id dev, int create, struct dm_thin_device **td) @@ -799,10 +808,16 @@ static int __open_device(struct dm_pool_metadata *pmd, struct disk_device_details details_le; /* - * Check the device isn't already open. + * If the device is already open, return it. */ list_for_each_entry(td2, &pmd->thin_devices, list) if (td2->id == dev) { + /* + * May not create an already-open device. + */ + if (create) + return -EEXIST; + td2->open_count++; *td = td2; return 0; @@ -817,6 +832,9 @@ static int __open_device(struct dm_pool_metadata *pmd, if (r != -ENODATA || !create) return r; + /* + * Create new device. + */ changed = 1; details_le.mapped_blocks = 0; details_le.transaction_id = cpu_to_le64(pmd->trans_id); @@ -882,12 +900,10 @@ static int __create_thin(struct dm_pool_metadata *pmd, r = __open_device(pmd, dev, 1, &td); if (r) { - __close_device(td); dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); dm_btree_del(&pmd->bl_info, dev_root); return r; } - td->changed = 1; __close_device(td); return r; @@ -967,14 +983,14 @@ static int __create_snap(struct dm_pool_metadata *pmd, goto bad; r = __set_snapshot_details(pmd, td, origin, pmd->time); + __close_device(td); + if (r) goto bad; - __close_device(td); return 0; bad: - __close_device(td); dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); dm_btree_remove(&pmd->details_info, pmd->details_root, &key, &pmd->details_root); @@ -1211,6 +1227,8 @@ static int __remove(struct dm_thin_device *td, dm_block_t block) if (r) return r; + td->mapped_blocks--; + td->changed = 1; pmd->need_commit = 1; return 0; diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index 859c16896877..ed4725e67c96 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -11,6 +11,19 @@ #define THIN_METADATA_BLOCK_SIZE 4096 +/* + * The metadata device is currently limited in size. + * + * We have one block of index, which can hold 255 index entries. Each + * index entry contains allocation info about 16k metadata blocks. + */ +#define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) + +/* + * A metadata device larger than 16GB triggers a warning. + */ +#define THIN_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT)) + /*----------------------------------------------------------------*/ struct dm_pool_metadata; diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index c3087575fef0..213ae32a0fc4 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -23,6 +23,7 @@ #define DEFERRED_SET_SIZE 64 #define MAPPING_POOL_SIZE 1024 #define PRISON_CELLS 1024 +#define COMMIT_PERIOD HZ /* * The block size of the device holding pool data must be @@ -32,16 +33,6 @@ #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) /* - * The metadata device is currently limited in size. The limitation is - * checked lower down in dm-space-map-metadata, but we also check it here - * so we can fail early. - * - * We have one block of index, which can hold 255 index entries. Each - * index entry contains allocation info about 16k metadata blocks. - */ -#define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) - -/* * Device id is restricted to 24 bits. */ #define MAX_DEV_ID ((1 << 24) - 1) @@ -72,7 +63,7 @@ * missed out if the io covers the block. (schedule_copy). * * iv) insert the new mapping into the origin's btree - * (process_prepared_mappings). This act of inserting breaks some + * (process_prepared_mapping). This act of inserting breaks some * sharing of btree nodes between the two devices. Breaking sharing only * effects the btree of that specific device. Btrees for the other * devices that share the block never change. The btree for the origin @@ -124,7 +115,7 @@ struct cell { struct hlist_node list; struct bio_prison *prison; struct cell_key key; - unsigned count; + struct bio *holder; struct bio_list bios; }; @@ -220,54 +211,59 @@ static struct cell *__search_bucket(struct hlist_head *bucket, * This may block if a new cell needs allocating. You must ensure that * cells will be unlocked even if the calling thread is blocked. * - * Returns the number of entries in the cell prior to the new addition - * or < 0 on failure. + * Returns 1 if the cell was already held, 0 if @inmate is the new holder. */ static int bio_detain(struct bio_prison *prison, struct cell_key *key, struct bio *inmate, struct cell **ref) { - int r; + int r = 1; unsigned long flags; uint32_t hash = hash_key(prison, key); - struct cell *uninitialized_var(cell), *cell2 = NULL; + struct cell *cell, *cell2; BUG_ON(hash > prison->nr_buckets); spin_lock_irqsave(&prison->lock, flags); + cell = __search_bucket(prison->cells + hash, key); + if (cell) { + bio_list_add(&cell->bios, inmate); + goto out; + } - if (!cell) { - /* - * Allocate a new cell - */ - spin_unlock_irqrestore(&prison->lock, flags); - cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); - spin_lock_irqsave(&prison->lock, flags); + /* + * Allocate a new cell + */ + spin_unlock_irqrestore(&prison->lock, flags); + cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); + spin_lock_irqsave(&prison->lock, flags); - /* - * We've been unlocked, so we have to double check that - * nobody else has inserted this cell in the meantime. - */ - cell = __search_bucket(prison->cells + hash, key); + /* + * We've been unlocked, so we have to double check that + * nobody else has inserted this cell in the meantime. + */ + cell = __search_bucket(prison->cells + hash, key); + if (cell) { + mempool_free(cell2, prison->cell_pool); + bio_list_add(&cell->bios, inmate); + goto out; + } - if (!cell) { - cell = cell2; - cell2 = NULL; + /* + * Use new cell. + */ + cell = cell2; - cell->prison = prison; - memcpy(&cell->key, key, sizeof(cell->key)); - cell->count = 0; - bio_list_init(&cell->bios); - hlist_add_head(&cell->list, prison->cells + hash); - } - } + cell->prison = prison; + memcpy(&cell->key, key, sizeof(cell->key)); + cell->holder = inmate; + bio_list_init(&cell->bios); + hlist_add_head(&cell->list, prison->cells + hash); - r = cell->count++; - bio_list_add(&cell->bios, inmate); - spin_unlock_irqrestore(&prison->lock, flags); + r = 0; - if (cell2) - mempool_free(cell2, prison->cell_pool); +out: + spin_unlock_irqrestore(&prison->lock, flags); *ref = cell; @@ -283,8 +279,8 @@ static void __cell_release(struct cell *cell, struct bio_list *inmates) hlist_del(&cell->list); - if (inmates) - bio_list_merge(inmates, &cell->bios); + bio_list_add(inmates, cell->holder); + bio_list_merge(inmates, &cell->bios); mempool_free(cell, prison->cell_pool); } @@ -305,22 +301,44 @@ static void cell_release(struct cell *cell, struct bio_list *bios) * bio may be in the cell. This function releases the cell, and also does * a sanity check. */ +static void __cell_release_singleton(struct cell *cell, struct bio *bio) +{ + hlist_del(&cell->list); + BUG_ON(cell->holder != bio); + BUG_ON(!bio_list_empty(&cell->bios)); +} + static void cell_release_singleton(struct cell *cell, struct bio *bio) { - struct bio_prison *prison = cell->prison; - struct bio_list bios; - struct bio *b; unsigned long flags; - - bio_list_init(&bios); + struct bio_prison *prison = cell->prison; spin_lock_irqsave(&prison->lock, flags); - __cell_release(cell, &bios); + __cell_release_singleton(cell, bio); spin_unlock_irqrestore(&prison->lock, flags); +} + +/* + * Sometimes we don't want the holder, just the additional bios. + */ +static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates) +{ + struct bio_prison *prison = cell->prison; + + hlist_del(&cell->list); + bio_list_merge(inmates, &cell->bios); - b = bio_list_pop(&bios); - BUG_ON(b != bio); - BUG_ON(!bio_list_empty(&bios)); + mempool_free(cell, prison->cell_pool); +} + +static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates) +{ + unsigned long flags; + struct bio_prison *prison = cell->prison; + + spin_lock_irqsave(&prison->lock, flags); + __cell_release_no_holder(cell, inmates); + spin_unlock_irqrestore(&prison->lock, flags); } static void cell_error(struct cell *cell) @@ -471,6 +489,13 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, * devices. */ struct new_mapping; + +struct pool_features { + unsigned zero_new_blocks:1; + unsigned discard_enabled:1; + unsigned discard_passdown:1; +}; + struct pool { struct list_head list; struct dm_target *ti; /* Only set if a pool target is bound */ @@ -484,7 +509,7 @@ struct pool { dm_block_t offset_mask; dm_block_t low_water_blocks; - unsigned zero_new_blocks:1; + struct pool_features pf; unsigned low_water_triggered:1; /* A dm event has been sent */ unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ @@ -493,17 +518,21 @@ struct pool { struct workqueue_struct *wq; struct work_struct worker; + struct delayed_work waker; unsigned ref_count; + unsigned long last_commit_jiffies; spinlock_t lock; struct bio_list deferred_bios; struct bio_list deferred_flush_bios; struct list_head prepared_mappings; + struct list_head prepared_discards; struct bio_list retry_on_resume_list; - struct deferred_set ds; /* FIXME: move to thin_c */ + struct deferred_set shared_read_ds; + struct deferred_set all_io_ds; struct new_mapping *next_mapping; mempool_t *mapping_pool; @@ -521,7 +550,7 @@ struct pool_c { struct dm_target_callbacks callbacks; dm_block_t low_water_blocks; - unsigned zero_new_blocks:1; + struct pool_features pf; }; /* @@ -529,6 +558,7 @@ struct pool_c { */ struct thin_c { struct dm_dev *pool_dev; + struct dm_dev *origin_dev; dm_thin_id dev_id; struct pool *pool; @@ -597,6 +627,13 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev /*----------------------------------------------------------------*/ +struct endio_hook { + struct thin_c *tc; + struct deferred_entry *shared_read_entry; + struct deferred_entry *all_io_entry; + struct new_mapping *overwrite_mapping; +}; + static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) { struct bio *bio; @@ -607,7 +644,8 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) bio_list_init(master); while ((bio = bio_list_pop(&bios))) { - if (dm_get_mapinfo(bio)->ptr == tc) + struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + if (h->tc == tc) bio_endio(bio, DM_ENDIO_REQUEUE); else bio_list_add(master, bio); @@ -646,14 +684,16 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) (bio->bi_sector & pool->offset_mask); } -static void remap_and_issue(struct thin_c *tc, struct bio *bio, - dm_block_t block) +static void remap_to_origin(struct thin_c *tc, struct bio *bio) +{ + bio->bi_bdev = tc->origin_dev->bdev; +} + +static void issue(struct thin_c *tc, struct bio *bio) { struct pool *pool = tc->pool; unsigned long flags; - remap(tc, bio, block); - /* * Batch together any FUA/FLUSH bios we find and then issue * a single commit for them in process_deferred_bios(). @@ -666,6 +706,19 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio, generic_make_request(bio); } +static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) +{ + remap_to_origin(tc, bio); + issue(tc, bio); +} + +static void remap_and_issue(struct thin_c *tc, struct bio *bio, + dm_block_t block) +{ + remap(tc, bio, block); + issue(tc, bio); +} + /* * wake_worker() is used when new work is queued and when pool_resume is * ready to continue deferred IO processing. @@ -680,21 +733,17 @@ static void wake_worker(struct pool *pool) /* * Bio endio functions. */ -struct endio_hook { - struct thin_c *tc; - bio_end_io_t *saved_bi_end_io; - struct deferred_entry *entry; -}; - struct new_mapping { struct list_head list; - int prepared; + unsigned quiesced:1; + unsigned prepared:1; + unsigned pass_discard:1; struct thin_c *tc; dm_block_t virt_block; dm_block_t data_block; - struct cell *cell; + struct cell *cell, *cell2; int err; /* @@ -711,7 +760,7 @@ static void __maybe_add_mapping(struct new_mapping *m) { struct pool *pool = m->tc->pool; - if (list_empty(&m->list) && m->prepared) { + if (m->quiesced && m->prepared) { list_add(&m->list, &pool->prepared_mappings); wake_worker(pool); } @@ -734,7 +783,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context) static void overwrite_endio(struct bio *bio, int err) { unsigned long flags; - struct new_mapping *m = dm_get_mapinfo(bio)->ptr; + struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + struct new_mapping *m = h->overwrite_mapping; struct pool *pool = m->tc->pool; m->err = err; @@ -745,31 +795,6 @@ static void overwrite_endio(struct bio *bio, int err) spin_unlock_irqrestore(&pool->lock, flags); } -static void shared_read_endio(struct bio *bio, int err) -{ - struct list_head mappings; - struct new_mapping *m, *tmp; - struct endio_hook *h = dm_get_mapinfo(bio)->ptr; - unsigned long flags; - struct pool *pool = h->tc->pool; - - bio->bi_end_io = h->saved_bi_end_io; - bio_endio(bio, err); - - INIT_LIST_HEAD(&mappings); - ds_dec(h->entry, &mappings); - - spin_lock_irqsave(&pool->lock, flags); - list_for_each_entry_safe(m, tmp, &mappings, list) { - list_del(&m->list); - INIT_LIST_HEAD(&m->list); - __maybe_add_mapping(m); - } - spin_unlock_irqrestore(&pool->lock, flags); - - mempool_free(h, pool->endio_hook_pool); -} - /*----------------------------------------------------------------*/ /* @@ -800,21 +825,16 @@ static void cell_defer(struct thin_c *tc, struct cell *cell, * Same as cell_defer above, except it omits one particular detainee, * a write bio that covers the block and has already been processed. */ -static void cell_defer_except(struct thin_c *tc, struct cell *cell, - struct bio *exception) +static void cell_defer_except(struct thin_c *tc, struct cell *cell) { struct bio_list bios; - struct bio *bio; struct pool *pool = tc->pool; unsigned long flags; bio_list_init(&bios); - cell_release(cell, &bios); spin_lock_irqsave(&pool->lock, flags); - while ((bio = bio_list_pop(&bios))) - if (bio != exception) - bio_list_add(&pool->deferred_bios, bio); + cell_release_no_holder(cell, &pool->deferred_bios); spin_unlock_irqrestore(&pool->lock, flags); wake_worker(pool); @@ -854,7 +874,7 @@ static void process_prepared_mapping(struct new_mapping *m) * the bios in the cell. */ if (bio) { - cell_defer_except(tc, m->cell, bio); + cell_defer_except(tc, m->cell); bio_endio(bio, 0); } else cell_defer(tc, m->cell, m->data_block); @@ -863,7 +883,30 @@ static void process_prepared_mapping(struct new_mapping *m) mempool_free(m, tc->pool->mapping_pool); } -static void process_prepared_mappings(struct pool *pool) +static void process_prepared_discard(struct new_mapping *m) +{ + int r; + struct thin_c *tc = m->tc; + + r = dm_thin_remove_block(tc->td, m->virt_block); + if (r) + DMERR("dm_thin_remove_block() failed"); + + /* + * Pass the discard down to the underlying device? + */ + if (m->pass_discard) + remap_and_issue(tc, m->bio, m->data_block); + else + bio_endio(m->bio, 0); + + cell_defer_except(tc, m->cell); + cell_defer_except(tc, m->cell2); + mempool_free(m, tc->pool->mapping_pool); +} + +static void process_prepared(struct pool *pool, struct list_head *head, + void (*fn)(struct new_mapping *)) { unsigned long flags; struct list_head maps; @@ -871,21 +914,27 @@ static void process_prepared_mappings(struct pool *pool) INIT_LIST_HEAD(&maps); spin_lock_irqsave(&pool->lock, flags); - list_splice_init(&pool->prepared_mappings, &maps); + list_splice_init(head, &maps); spin_unlock_irqrestore(&pool->lock, flags); list_for_each_entry_safe(m, tmp, &maps, list) - process_prepared_mapping(m); + fn(m); } /* * Deferred bio jobs. */ -static int io_overwrites_block(struct pool *pool, struct bio *bio) +static int io_overlaps_block(struct pool *pool, struct bio *bio) { - return ((bio_data_dir(bio) == WRITE) && - !(bio->bi_sector & pool->offset_mask)) && + return !(bio->bi_sector & pool->offset_mask) && (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); + +} + +static int io_overwrites_block(struct pool *pool, struct bio *bio) +{ + return (bio_data_dir(bio) == WRITE) && + io_overlaps_block(pool, bio); } static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, @@ -917,7 +966,8 @@ static struct new_mapping *get_next_mapping(struct pool *pool) } static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, - dm_block_t data_origin, dm_block_t data_dest, + struct dm_dev *origin, dm_block_t data_origin, + dm_block_t data_dest, struct cell *cell, struct bio *bio) { int r; @@ -925,6 +975,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, struct new_mapping *m = get_next_mapping(pool); INIT_LIST_HEAD(&m->list); + m->quiesced = 0; m->prepared = 0; m->tc = tc; m->virt_block = virt_block; @@ -933,7 +984,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, m->err = 0; m->bio = NULL; - ds_add_work(&pool->ds, &m->list); + if (!ds_add_work(&pool->shared_read_ds, &m->list)) + m->quiesced = 1; /* * IO to pool_dev remaps to the pool target's data_dev. @@ -942,14 +994,15 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, * bio immediately. Otherwise we use kcopyd to clone the data first. */ if (io_overwrites_block(pool, bio)) { + struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + h->overwrite_mapping = m; m->bio = bio; save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); - dm_get_mapinfo(bio)->ptr = m; remap_and_issue(tc, bio, data_dest); } else { struct dm_io_region from, to; - from.bdev = tc->pool_dev->bdev; + from.bdev = origin->bdev; from.sector = data_origin * pool->sectors_per_block; from.count = pool->sectors_per_block; @@ -967,6 +1020,22 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, } } +static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, + dm_block_t data_origin, dm_block_t data_dest, + struct cell *cell, struct bio *bio) +{ + schedule_copy(tc, virt_block, tc->pool_dev, + data_origin, data_dest, cell, bio); +} + +static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, + dm_block_t data_dest, + struct cell *cell, struct bio *bio) +{ + schedule_copy(tc, virt_block, tc->origin_dev, + virt_block, data_dest, cell, bio); +} + static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, dm_block_t data_block, struct cell *cell, struct bio *bio) @@ -975,6 +1044,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, struct new_mapping *m = get_next_mapping(pool); INIT_LIST_HEAD(&m->list); + m->quiesced = 1; m->prepared = 0; m->tc = tc; m->virt_block = virt_block; @@ -988,13 +1058,14 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, * zeroing pre-existing data, we can issue the bio immediately. * Otherwise we use kcopyd to zero the data first. */ - if (!pool->zero_new_blocks) + if (!pool->pf.zero_new_blocks) process_prepared_mapping(m); else if (io_overwrites_block(pool, bio)) { + struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + h->overwrite_mapping = m; m->bio = bio; save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); - dm_get_mapinfo(bio)->ptr = m; remap_and_issue(tc, bio, data_block); } else { @@ -1081,7 +1152,8 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) */ static void retry_on_resume(struct bio *bio) { - struct thin_c *tc = dm_get_mapinfo(bio)->ptr; + struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + struct thin_c *tc = h->tc; struct pool *pool = tc->pool; unsigned long flags; @@ -1102,6 +1174,86 @@ static void no_space(struct cell *cell) retry_on_resume(bio); } +static void process_discard(struct thin_c *tc, struct bio *bio) +{ + int r; + struct pool *pool = tc->pool; + struct cell *cell, *cell2; + struct cell_key key, key2; + dm_block_t block = get_bio_block(tc, bio); + struct dm_thin_lookup_result lookup_result; + struct new_mapping *m; + + build_virtual_key(tc->td, block, &key); + if (bio_detain(tc->pool->prison, &key, bio, &cell)) + return; + + r = dm_thin_find_block(tc->td, block, 1, &lookup_result); + switch (r) { + case 0: + /* + * Check nobody is fiddling with this pool block. This can + * happen if someone's in the process of breaking sharing + * on this block. + */ + build_data_key(tc->td, lookup_result.block, &key2); + if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) { + cell_release_singleton(cell, bio); + break; + } + + if (io_overlaps_block(pool, bio)) { + /* + * IO may still be going to the destination block. We must + * quiesce before we can do the removal. + */ + m = get_next_mapping(pool); + m->tc = tc; + m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown; + m->virt_block = block; + m->data_block = lookup_result.block; + m->cell = cell; + m->cell2 = cell2; + m->err = 0; + m->bio = bio; + + if (!ds_add_work(&pool->all_io_ds, &m->list)) { + list_add(&m->list, &pool->prepared_discards); + wake_worker(pool); + } + } else { + /* + * This path is hit if people are ignoring + * limits->discard_granularity. It ignores any + * part of the discard that is in a subsequent + * block. + */ + sector_t offset = bio->bi_sector - (block << pool->block_shift); + unsigned remaining = (pool->sectors_per_block - offset) << 9; + bio->bi_size = min(bio->bi_size, remaining); + + cell_release_singleton(cell, bio); + cell_release_singleton(cell2, bio); + remap_and_issue(tc, bio, lookup_result.block); + } + break; + + case -ENODATA: + /* + * It isn't provisioned, just forget it. + */ + cell_release_singleton(cell, bio); + bio_endio(bio, 0); + break; + + default: + DMERR("discard: find block unexpectedly returned %d", r); + cell_release_singleton(cell, bio); + bio_io_error(bio); + break; + } +} + static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, struct cell_key *key, struct dm_thin_lookup_result *lookup_result, @@ -1113,8 +1265,8 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, r = alloc_data_block(tc, &data_block); switch (r) { case 0: - schedule_copy(tc, block, lookup_result->block, - data_block, cell, bio); + schedule_internal_copy(tc, block, lookup_result->block, + data_block, cell, bio); break; case -ENOSPC: @@ -1147,13 +1299,9 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio, if (bio_data_dir(bio) == WRITE) break_sharing(tc, bio, block, &key, lookup_result, cell); else { - struct endio_hook *h; - h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); + struct endio_hook *h = dm_get_mapinfo(bio)->ptr; - h->tc = tc; - h->entry = ds_inc(&pool->ds); - save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio); - dm_get_mapinfo(bio)->ptr = h; + h->shared_read_entry = ds_inc(&pool->shared_read_ds); cell_release_singleton(cell, bio); remap_and_issue(tc, bio, lookup_result->block); @@ -1188,7 +1336,10 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block r = alloc_data_block(tc, &data_block); switch (r) { case 0: - schedule_zero(tc, block, data_block, cell, bio); + if (tc->origin_dev) + schedule_external_copy(tc, block, data_block, cell, bio); + else + schedule_zero(tc, block, data_block, cell, bio); break; case -ENOSPC: @@ -1239,16 +1390,27 @@ static void process_bio(struct thin_c *tc, struct bio *bio) break; case -ENODATA: - provision_block(tc, bio, block, cell); + if (bio_data_dir(bio) == READ && tc->origin_dev) { + cell_release_singleton(cell, bio); + remap_to_origin_and_issue(tc, bio); + } else + provision_block(tc, bio, block, cell); break; default: DMERR("dm_thin_find_block() failed, error = %d", r); + cell_release_singleton(cell, bio); bio_io_error(bio); break; } } +static int need_commit_due_to_time(struct pool *pool) +{ + return jiffies < pool->last_commit_jiffies || + jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; +} + static void process_deferred_bios(struct pool *pool) { unsigned long flags; @@ -1264,7 +1426,9 @@ static void process_deferred_bios(struct pool *pool) spin_unlock_irqrestore(&pool->lock, flags); while ((bio = bio_list_pop(&bios))) { - struct thin_c *tc = dm_get_mapinfo(bio)->ptr; + struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + struct thin_c *tc = h->tc; + /* * If we've got no free new_mapping structs, and processing * this bio might require one, we pause until there are some @@ -1277,7 +1441,11 @@ static void process_deferred_bios(struct pool *pool) break; } - process_bio(tc, bio); + + if (bio->bi_rw & REQ_DISCARD) + process_discard(tc, bio); + else + process_bio(tc, bio); } /* @@ -1290,7 +1458,7 @@ static void process_deferred_bios(struct pool *pool) bio_list_init(&pool->deferred_flush_bios); spin_unlock_irqrestore(&pool->lock, flags); - if (bio_list_empty(&bios)) + if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) return; r = dm_pool_commit_metadata(pool->pmd); @@ -1301,6 +1469,7 @@ static void process_deferred_bios(struct pool *pool) bio_io_error(bio); return; } + pool->last_commit_jiffies = jiffies; while ((bio = bio_list_pop(&bios))) generic_make_request(bio); @@ -1310,10 +1479,22 @@ static void do_worker(struct work_struct *ws) { struct pool *pool = container_of(ws, struct pool, worker); - process_prepared_mappings(pool); + process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping); + process_prepared(pool, &pool->prepared_discards, process_prepared_discard); process_deferred_bios(pool); } +/* + * We want to commit periodically so that not too much + * unwritten data builds up. + */ +static void do_waker(struct work_struct *ws) +{ + struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker); + wake_worker(pool); + queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); +} + /*----------------------------------------------------------------*/ /* @@ -1335,6 +1516,19 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio) wake_worker(pool); } +static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio) +{ + struct pool *pool = tc->pool; + struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); + + h->tc = tc; + h->shared_read_entry = NULL; + h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds); + h->overwrite_mapping = NULL; + + return h; +} + /* * Non-blocking function called from the thin target's map function. */ @@ -1347,12 +1541,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio, struct dm_thin_device *td = tc->td; struct dm_thin_lookup_result result; - /* - * Save the thin context for easy access from the deferred bio later. - */ - map_context->ptr = tc; - - if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { + map_context->ptr = thin_hook_bio(tc, bio); + if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { thin_defer_bio(tc, bio); return DM_MAPIO_SUBMITTED; } @@ -1434,7 +1624,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti) pool->ti = ti; pool->low_water_blocks = pt->low_water_blocks; - pool->zero_new_blocks = pt->zero_new_blocks; + pool->pf = pt->pf; return 0; } @@ -1448,6 +1638,14 @@ static void unbind_control_target(struct pool *pool, struct dm_target *ti) /*---------------------------------------------------------------- * Pool creation *--------------------------------------------------------------*/ +/* Initialize pool features. */ +static void pool_features_init(struct pool_features *pf) +{ + pf->zero_new_blocks = 1; + pf->discard_enabled = 1; + pf->discard_passdown = 1; +} + static void __pool_destroy(struct pool *pool) { __pool_table_remove(pool); @@ -1495,7 +1693,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, pool->block_shift = ffs(block_size) - 1; pool->offset_mask = block_size - 1; pool->low_water_blocks = 0; - pool->zero_new_blocks = 1; + pool_features_init(&pool->pf); pool->prison = prison_create(PRISON_CELLS); if (!pool->prison) { *error = "Error creating pool's bio prison"; @@ -1523,14 +1721,17 @@ static struct pool *pool_create(struct mapped_device *pool_md, } INIT_WORK(&pool->worker, do_worker); + INIT_DELAYED_WORK(&pool->waker, do_waker); spin_lock_init(&pool->lock); bio_list_init(&pool->deferred_bios); bio_list_init(&pool->deferred_flush_bios); INIT_LIST_HEAD(&pool->prepared_mappings); + INIT_LIST_HEAD(&pool->prepared_discards); pool->low_water_triggered = 0; pool->no_free_space = 0; bio_list_init(&pool->retry_on_resume_list); - ds_init(&pool->ds); + ds_init(&pool->shared_read_ds); + ds_init(&pool->all_io_ds); pool->next_mapping = NULL; pool->mapping_pool = @@ -1549,6 +1750,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, goto bad_endio_hook_pool; } pool->ref_count = 1; + pool->last_commit_jiffies = jiffies; pool->pool_md = pool_md; pool->md_dev = metadata_dev; __pool_table_insert(pool); @@ -1588,7 +1790,8 @@ static void __pool_dec(struct pool *pool) static struct pool *__pool_find(struct mapped_device *pool_md, struct block_device *metadata_dev, - unsigned long block_size, char **error) + unsigned long block_size, char **error, + int *created) { struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); @@ -1604,8 +1807,10 @@ static struct pool *__pool_find(struct mapped_device *pool_md, return ERR_PTR(-EINVAL); __pool_inc(pool); - } else + } else { pool = pool_create(pool_md, metadata_dev, block_size, error); + *created = 1; + } } return pool; @@ -1629,10 +1834,6 @@ static void pool_dtr(struct dm_target *ti) mutex_unlock(&dm_thin_pool_table.mutex); } -struct pool_features { - unsigned zero_new_blocks:1; -}; - static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, struct dm_target *ti) { @@ -1641,7 +1842,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, const char *arg_name; static struct dm_arg _args[] = { - {0, 1, "Invalid number of pool feature arguments"}, + {0, 3, "Invalid number of pool feature arguments"}, }; /* @@ -1661,6 +1862,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, if (!strcasecmp(arg_name, "skip_block_zeroing")) { pf->zero_new_blocks = 0; continue; + } else if (!strcasecmp(arg_name, "ignore_discard")) { + pf->discard_enabled = 0; + continue; + } else if (!strcasecmp(arg_name, "no_discard_passdown")) { + pf->discard_passdown = 0; + continue; } ti->error = "Unrecognised pool feature requested"; @@ -1678,10 +1885,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, * * Optional feature arguments are: * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. + * ignore_discard: disable discard + * no_discard_passdown: don't pass discards down to the data device */ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) { - int r; + int r, pool_created = 0; struct pool_c *pt; struct pool *pool; struct pool_features pf; @@ -1691,6 +1900,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) dm_block_t low_water_blocks; struct dm_dev *metadata_dev; sector_t metadata_dev_size; + char b[BDEVNAME_SIZE]; /* * FIXME Remove validation from scope of lock. @@ -1712,11 +1922,9 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) } metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; - if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) { - ti->error = "Metadata device is too large"; - r = -EINVAL; - goto out_metadata; - } + if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) + DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", + bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); if (r) { @@ -1742,8 +1950,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) /* * Set default pool features. */ - memset(&pf, 0, sizeof(pf)); - pf.zero_new_blocks = 1; + pool_features_init(&pf); dm_consume_args(&as, 4); r = parse_pool_features(&as, &pf, ti); @@ -1757,20 +1964,58 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) } pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, - block_size, &ti->error); + block_size, &ti->error, &pool_created); if (IS_ERR(pool)) { r = PTR_ERR(pool); goto out_free_pt; } + /* + * 'pool_created' reflects whether this is the first table load. + * Top level discard support is not allowed to be changed after + * initial load. This would require a pool reload to trigger thin + * device changes. + */ + if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) { + ti->error = "Discard support cannot be disabled once enabled"; + r = -EINVAL; + goto out_flags_changed; + } + + /* + * If discard_passdown was enabled verify that the data device + * supports discards. Disable discard_passdown if not; otherwise + * -EOPNOTSUPP will be returned. + */ + if (pf.discard_passdown) { + struct request_queue *q = bdev_get_queue(data_dev->bdev); + if (!q || !blk_queue_discard(q)) { + DMWARN("Discard unsupported by data device: Disabling discard passdown."); + pf.discard_passdown = 0; + } + } + pt->pool = pool; pt->ti = ti; pt->metadata_dev = metadata_dev; pt->data_dev = data_dev; pt->low_water_blocks = low_water_blocks; - pt->zero_new_blocks = pf.zero_new_blocks; + pt->pf = pf; ti->num_flush_requests = 1; - ti->num_discard_requests = 0; + /* + * Only need to enable discards if the pool should pass + * them down to the data device. The thin device's discard + * processing will cause mappings to be removed from the btree. + */ + if (pf.discard_enabled && pf.discard_passdown) { + ti->num_discard_requests = 1; + /* + * Setting 'discards_supported' circumvents the normal + * stacking of discard limits (this keeps the pool and + * thin devices' discard limits consistent). + */ + ti->discards_supported = 1; + } ti->private = pt; pt->callbacks.congested_fn = pool_is_congested; @@ -1780,6 +2025,8 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) return 0; +out_flags_changed: + __pool_dec(pool); out_free_pt: kfree(pt); out: @@ -1878,7 +2125,7 @@ static void pool_resume(struct dm_target *ti) __requeue_bios(pool); spin_unlock_irqrestore(&pool->lock, flags); - wake_worker(pool); + do_waker(&pool->waker.work); } static void pool_postsuspend(struct dm_target *ti) @@ -1887,6 +2134,7 @@ static void pool_postsuspend(struct dm_target *ti) struct pool_c *pt = ti->private; struct pool *pool = pt->pool; + cancel_delayed_work(&pool->waker); flush_workqueue(pool->wq); r = dm_pool_commit_metadata(pool->pmd); @@ -2067,7 +2315,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv) static int pool_status(struct dm_target *ti, status_type_t type, char *result, unsigned maxlen) { - int r; + int r, count; unsigned sz = 0; uint64_t transaction_id; dm_block_t nr_free_blocks_data; @@ -2130,10 +2378,19 @@ static int pool_status(struct dm_target *ti, status_type_t type, (unsigned long)pool->sectors_per_block, (unsigned long long)pt->low_water_blocks); - DMEMIT("%u ", !pool->zero_new_blocks); + count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled + + !pool->pf.discard_passdown; + DMEMIT("%u ", count); - if (!pool->zero_new_blocks) + if (!pool->pf.zero_new_blocks) DMEMIT("skip_block_zeroing "); + + if (!pool->pf.discard_enabled) + DMEMIT("ignore_discard "); + + if (!pool->pf.discard_passdown) + DMEMIT("no_discard_passdown "); + break; } @@ -2162,6 +2419,21 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm, return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } +static void set_discard_limits(struct pool *pool, struct queue_limits *limits) +{ + /* + * FIXME: these limits may be incompatible with the pool's data device + */ + limits->max_discard_sectors = pool->sectors_per_block; + + /* + * This is just a hint, and not enforced. We have to cope with + * bios that overlap 2 blocks. + */ + limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; + limits->discard_zeroes_data = pool->pf.zero_new_blocks; +} + static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct pool_c *pt = ti->private; @@ -2169,13 +2441,15 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_min(limits, 0); blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); + if (pool->pf.discard_enabled) + set_discard_limits(pool, limits); } static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -2202,6 +2476,8 @@ static void thin_dtr(struct dm_target *ti) __pool_dec(tc->pool); dm_pool_close_thin_device(tc->td); dm_put_device(ti, tc->pool_dev); + if (tc->origin_dev) + dm_put_device(ti, tc->origin_dev); kfree(tc); mutex_unlock(&dm_thin_pool_table.mutex); @@ -2210,21 +2486,25 @@ static void thin_dtr(struct dm_target *ti) /* * Thin target parameters: * - * <pool_dev> <dev_id> + * <pool_dev> <dev_id> [origin_dev] * * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) * dev_id: the internal device identifier + * origin_dev: a device external to the pool that should act as the origin + * + * If the pool device has discards disabled, they get disabled for the thin + * device as well. */ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) { int r; struct thin_c *tc; - struct dm_dev *pool_dev; + struct dm_dev *pool_dev, *origin_dev; struct mapped_device *pool_md; mutex_lock(&dm_thin_pool_table.mutex); - if (argc != 2) { + if (argc != 2 && argc != 3) { ti->error = "Invalid argument count"; r = -EINVAL; goto out_unlock; @@ -2237,6 +2517,15 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) goto out_unlock; } + if (argc == 3) { + r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); + if (r) { + ti->error = "Error opening origin device"; + goto bad_origin_dev; + } + tc->origin_dev = origin_dev; + } + r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); if (r) { ti->error = "Error opening pool device"; @@ -2273,8 +2562,12 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->split_io = tc->pool->sectors_per_block; ti->num_flush_requests = 1; - ti->num_discard_requests = 0; - ti->discards_supported = 0; + + /* In case the pool supports discards, pass them on. */ + if (tc->pool->pf.discard_enabled) { + ti->discards_supported = 1; + ti->num_discard_requests = 1; + } dm_put(pool_md); @@ -2289,6 +2582,9 @@ bad_pool_lookup: bad_common: dm_put_device(ti, tc->pool_dev); bad_pool_dev: + if (tc->origin_dev) + dm_put_device(ti, tc->origin_dev); +bad_origin_dev: kfree(tc); out_unlock: mutex_unlock(&dm_thin_pool_table.mutex); @@ -2299,11 +2595,46 @@ out_unlock: static int thin_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) { - bio->bi_sector -= ti->begin; + bio->bi_sector = dm_target_offset(ti, bio->bi_sector); return thin_bio_map(ti, bio, map_context); } +static int thin_endio(struct dm_target *ti, + struct bio *bio, int err, + union map_info *map_context) +{ + unsigned long flags; + struct endio_hook *h = map_context->ptr; + struct list_head work; + struct new_mapping *m, *tmp; + struct pool *pool = h->tc->pool; + + if (h->shared_read_entry) { + INIT_LIST_HEAD(&work); + ds_dec(h->shared_read_entry, &work); + + spin_lock_irqsave(&pool->lock, flags); + list_for_each_entry_safe(m, tmp, &work, list) { + list_del(&m->list); + m->quiesced = 1; + __maybe_add_mapping(m); + } + spin_unlock_irqrestore(&pool->lock, flags); + } + + if (h->all_io_entry) { + INIT_LIST_HEAD(&work); + ds_dec(h->all_io_entry, &work); + list_for_each_entry_safe(m, tmp, &work, list) + list_add(&m->list, &pool->prepared_discards); + } + + mempool_free(h, pool->endio_hook_pool); + + return 0; +} + static void thin_postsuspend(struct dm_target *ti) { if (dm_noflush_suspending(ti)) @@ -2347,6 +2678,8 @@ static int thin_status(struct dm_target *ti, status_type_t type, DMEMIT("%s %lu", format_dev_t(buf, tc->pool_dev->bdev->bd_dev), (unsigned long) tc->dev_id); + if (tc->origin_dev) + DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev)); break; } } @@ -2377,18 +2710,21 @@ static int thin_iterate_devices(struct dm_target *ti, static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct thin_c *tc = ti->private; + struct pool *pool = tc->pool; blk_limits_io_min(limits, 0); - blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT); + blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); + set_discard_limits(pool, limits); } static struct target_type thin_target = { .name = "thin", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, .map = thin_map, + .end_io = thin_endio, .postsuspend = thin_postsuspend, .status = thin_status, .iterate_devices = thin_iterate_devices, diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c new file mode 100644 index 000000000000..fa365d39b612 --- /dev/null +++ b/drivers/md/dm-verity.c @@ -0,0 +1,913 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * Author: Mikulas Patocka <mpatocka@redhat.com> + * + * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors + * + * This file is released under the GPLv2. + * + * In the file "/sys/module/dm_verity/parameters/prefetch_cluster" you can set + * default prefetch value. Data are read in "prefetch_cluster" chunks from the + * hash device. Setting this greatly improves performance when data and hash + * are on the same disk on different partitions on devices with poor random + * access behavior. + */ + +#include "dm-bufio.h" + +#include <linux/module.h> +#include <linux/device-mapper.h> +#include <crypto/hash.h> + +#define DM_MSG_PREFIX "verity" + +#define DM_VERITY_IO_VEC_INLINE 16 +#define DM_VERITY_MEMPOOL_SIZE 4 +#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 + +#define DM_VERITY_MAX_LEVELS 63 + +static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; + +module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR); + +struct dm_verity { + struct dm_dev *data_dev; + struct dm_dev *hash_dev; + struct dm_target *ti; + struct dm_bufio_client *bufio; + char *alg_name; + struct crypto_shash *tfm; + u8 *root_digest; /* digest of the root block */ + u8 *salt; /* salt: its size is salt_size */ + unsigned salt_size; + sector_t data_start; /* data offset in 512-byte sectors */ + sector_t hash_start; /* hash start in blocks */ + sector_t data_blocks; /* the number of data blocks */ + sector_t hash_blocks; /* the number of hash blocks */ + unsigned char data_dev_block_bits; /* log2(data blocksize) */ + unsigned char hash_dev_block_bits; /* log2(hash blocksize) */ + unsigned char hash_per_block_bits; /* log2(hashes in hash block) */ + unsigned char levels; /* the number of tree levels */ + unsigned char version; + unsigned digest_size; /* digest size for the current hash algorithm */ + unsigned shash_descsize;/* the size of temporary space for crypto */ + int hash_failed; /* set to 1 if hash of any block failed */ + + mempool_t *io_mempool; /* mempool of struct dm_verity_io */ + mempool_t *vec_mempool; /* mempool of bio vector */ + + struct workqueue_struct *verify_wq; + + /* starting blocks for each tree level. 0 is the lowest level. */ + sector_t hash_level_block[DM_VERITY_MAX_LEVELS]; +}; + +struct dm_verity_io { + struct dm_verity *v; + struct bio *bio; + + /* original values of bio->bi_end_io and bio->bi_private */ + bio_end_io_t *orig_bi_end_io; + void *orig_bi_private; + + sector_t block; + unsigned n_blocks; + + /* saved bio vector */ + struct bio_vec *io_vec; + unsigned io_vec_size; + + struct work_struct work; + + /* A space for short vectors; longer vectors are allocated separately. */ + struct bio_vec io_vec_inline[DM_VERITY_IO_VEC_INLINE]; + + /* + * Three variably-size fields follow this struct: + * + * u8 hash_desc[v->shash_descsize]; + * u8 real_digest[v->digest_size]; + * u8 want_digest[v->digest_size]; + * + * To access them use: io_hash_desc(), io_real_digest() and io_want_digest(). + */ +}; + +static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io) +{ + return (struct shash_desc *)(io + 1); +} + +static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io) +{ + return (u8 *)(io + 1) + v->shash_descsize; +} + +static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io) +{ + return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; +} + +/* + * Auxiliary structure appended to each dm-bufio buffer. If the value + * hash_verified is nonzero, hash of the block has been verified. + * + * The variable hash_verified is set to 0 when allocating the buffer, then + * it can be changed to 1 and it is never reset to 0 again. + * + * There is no lock around this value, a race condition can at worst cause + * that multiple processes verify the hash of the same buffer simultaneously + * and write 1 to hash_verified simultaneously. + * This condition is harmless, so we don't need locking. + */ +struct buffer_aux { + int hash_verified; +}; + +/* + * Initialize struct buffer_aux for a freshly created buffer. + */ +static void dm_bufio_alloc_callback(struct dm_buffer *buf) +{ + struct buffer_aux *aux = dm_bufio_get_aux_data(buf); + + aux->hash_verified = 0; +} + +/* + * Translate input sector number to the sector number on the target device. + */ +static sector_t verity_map_sector(struct dm_verity *v, sector_t bi_sector) +{ + return v->data_start + dm_target_offset(v->ti, bi_sector); +} + +/* + * Return hash position of a specified block at a specified tree level + * (0 is the lowest level). + * The lowest "hash_per_block_bits"-bits of the result denote hash position + * inside a hash block. The remaining bits denote location of the hash block. + */ +static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, + int level) +{ + return block >> (level * v->hash_per_block_bits); +} + +static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level, + sector_t *hash_block, unsigned *offset) +{ + sector_t position = verity_position_at_level(v, block, level); + unsigned idx; + + *hash_block = v->hash_level_block[level] + (position >> v->hash_per_block_bits); + + if (!offset) + return; + + idx = position & ((1 << v->hash_per_block_bits) - 1); + if (!v->version) + *offset = idx * v->digest_size; + else + *offset = idx << (v->hash_dev_block_bits - v->hash_per_block_bits); +} + +/* + * Verify hash of a metadata block pertaining to the specified data block + * ("block" argument) at a specified level ("level" argument). + * + * On successful return, io_want_digest(v, io) contains the hash value for + * a lower tree level or for the data block (if we're at the lowest leve). + * + * If "skip_unverified" is true, unverified buffer is skipped and 1 is returned. + * If "skip_unverified" is false, unverified buffer is hashed and verified + * against current value of io_want_digest(v, io). + */ +static int verity_verify_level(struct dm_verity_io *io, sector_t block, + int level, bool skip_unverified) +{ + struct dm_verity *v = io->v; + struct dm_buffer *buf; + struct buffer_aux *aux; + u8 *data; + int r; + sector_t hash_block; + unsigned offset; + + verity_hash_at_level(v, block, level, &hash_block, &offset); + + data = dm_bufio_read(v->bufio, hash_block, &buf); + if (unlikely(IS_ERR(data))) + return PTR_ERR(data); + + aux = dm_bufio_get_aux_data(buf); + + if (!aux->hash_verified) { + struct shash_desc *desc; + u8 *result; + + if (skip_unverified) { + r = 1; + goto release_ret_r; + } + + desc = io_hash_desc(v, io); + desc->tfm = v->tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + r = crypto_shash_init(desc); + if (r < 0) { + DMERR("crypto_shash_init failed: %d", r); + goto release_ret_r; + } + + if (likely(v->version >= 1)) { + r = crypto_shash_update(desc, v->salt, v->salt_size); + if (r < 0) { + DMERR("crypto_shash_update failed: %d", r); + goto release_ret_r; + } + } + + r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits); + if (r < 0) { + DMERR("crypto_shash_update failed: %d", r); + goto release_ret_r; + } + + if (!v->version) { + r = crypto_shash_update(desc, v->salt, v->salt_size); + if (r < 0) { + DMERR("crypto_shash_update failed: %d", r); + goto release_ret_r; + } + } + + result = io_real_digest(v, io); + r = crypto_shash_final(desc, result); + if (r < 0) { + DMERR("crypto_shash_final failed: %d", r); + goto release_ret_r; + } + if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { + DMERR_LIMIT("metadata block %llu is corrupted", + (unsigned long long)hash_block); + v->hash_failed = 1; + r = -EIO; + goto release_ret_r; + } else + aux->hash_verified = 1; + } + + data += offset; + + memcpy(io_want_digest(v, io), data, v->digest_size); + + dm_bufio_release(buf); + return 0; + +release_ret_r: + dm_bufio_release(buf); + + return r; +} + +/* + * Verify one "dm_verity_io" structure. + */ +static int verity_verify_io(struct dm_verity_io *io) +{ + struct dm_verity *v = io->v; + unsigned b; + int i; + unsigned vector = 0, offset = 0; + + for (b = 0; b < io->n_blocks; b++) { + struct shash_desc *desc; + u8 *result; + int r; + unsigned todo; + + if (likely(v->levels)) { + /* + * First, we try to get the requested hash for + * the current block. If the hash block itself is + * verified, zero is returned. If it isn't, this + * function returns 0 and we fall back to whole + * chain verification. + */ + int r = verity_verify_level(io, io->block + b, 0, true); + if (likely(!r)) + goto test_block_hash; + if (r < 0) + return r; + } + + memcpy(io_want_digest(v, io), v->root_digest, v->digest_size); + + for (i = v->levels - 1; i >= 0; i--) { + int r = verity_verify_level(io, io->block + b, i, false); + if (unlikely(r)) + return r; + } + +test_block_hash: + desc = io_hash_desc(v, io); + desc->tfm = v->tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + r = crypto_shash_init(desc); + if (r < 0) { + DMERR("crypto_shash_init failed: %d", r); + return r; + } + + if (likely(v->version >= 1)) { + r = crypto_shash_update(desc, v->salt, v->salt_size); + if (r < 0) { + DMERR("crypto_shash_update failed: %d", r); + return r; + } + } + + todo = 1 << v->data_dev_block_bits; + do { + struct bio_vec *bv; + u8 *page; + unsigned len; + + BUG_ON(vector >= io->io_vec_size); + bv = &io->io_vec[vector]; + page = kmap_atomic(bv->bv_page); + len = bv->bv_len - offset; + if (likely(len >= todo)) + len = todo; + r = crypto_shash_update(desc, + page + bv->bv_offset + offset, len); + kunmap_atomic(page); + if (r < 0) { + DMERR("crypto_shash_update failed: %d", r); + return r; + } + offset += len; + if (likely(offset == bv->bv_len)) { + offset = 0; + vector++; + } + todo -= len; + } while (todo); + + if (!v->version) { + r = crypto_shash_update(desc, v->salt, v->salt_size); + if (r < 0) { + DMERR("crypto_shash_update failed: %d", r); + return r; + } + } + + result = io_real_digest(v, io); + r = crypto_shash_final(desc, result); + if (r < 0) { + DMERR("crypto_shash_final failed: %d", r); + return r; + } + if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { + DMERR_LIMIT("data block %llu is corrupted", + (unsigned long long)(io->block + b)); + v->hash_failed = 1; + return -EIO; + } + } + BUG_ON(vector != io->io_vec_size); + BUG_ON(offset); + + return 0; +} + +/* + * End one "io" structure with a given error. + */ +static void verity_finish_io(struct dm_verity_io *io, int error) +{ + struct bio *bio = io->bio; + struct dm_verity *v = io->v; + + bio->bi_end_io = io->orig_bi_end_io; + bio->bi_private = io->orig_bi_private; + + if (io->io_vec != io->io_vec_inline) + mempool_free(io->io_vec, v->vec_mempool); + + mempool_free(io, v->io_mempool); + + bio_endio(bio, error); +} + +static void verity_work(struct work_struct *w) +{ + struct dm_verity_io *io = container_of(w, struct dm_verity_io, work); + + verity_finish_io(io, verity_verify_io(io)); +} + +static void verity_end_io(struct bio *bio, int error) +{ + struct dm_verity_io *io = bio->bi_private; + + if (error) { + verity_finish_io(io, error); + return; + } + + INIT_WORK(&io->work, verity_work); + queue_work(io->v->verify_wq, &io->work); +} + +/* + * Prefetch buffers for the specified io. + * The root buffer is not prefetched, it is assumed that it will be cached + * all the time. + */ +static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io) +{ + int i; + + for (i = v->levels - 2; i >= 0; i--) { + sector_t hash_block_start; + sector_t hash_block_end; + verity_hash_at_level(v, io->block, i, &hash_block_start, NULL); + verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL); + if (!i) { + unsigned cluster = *(volatile unsigned *)&dm_verity_prefetch_cluster; + + cluster >>= v->data_dev_block_bits; + if (unlikely(!cluster)) + goto no_prefetch_cluster; + + if (unlikely(cluster & (cluster - 1))) + cluster = 1 << (fls(cluster) - 1); + + hash_block_start &= ~(sector_t)(cluster - 1); + hash_block_end |= cluster - 1; + if (unlikely(hash_block_end >= v->hash_blocks)) + hash_block_end = v->hash_blocks - 1; + } +no_prefetch_cluster: + dm_bufio_prefetch(v->bufio, hash_block_start, + hash_block_end - hash_block_start + 1); + } +} + +/* + * Bio map function. It allocates dm_verity_io structure and bio vector and + * fills them. Then it issues prefetches and the I/O. + */ +static int verity_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + struct dm_verity *v = ti->private; + struct dm_verity_io *io; + + bio->bi_bdev = v->data_dev->bdev; + bio->bi_sector = verity_map_sector(v, bio->bi_sector); + + if (((unsigned)bio->bi_sector | bio_sectors(bio)) & + ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) { + DMERR_LIMIT("unaligned io"); + return -EIO; + } + + if ((bio->bi_sector + bio_sectors(bio)) >> + (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) { + DMERR_LIMIT("io out of range"); + return -EIO; + } + + if (bio_data_dir(bio) == WRITE) + return -EIO; + + io = mempool_alloc(v->io_mempool, GFP_NOIO); + io->v = v; + io->bio = bio; + io->orig_bi_end_io = bio->bi_end_io; + io->orig_bi_private = bio->bi_private; + io->block = bio->bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT); + io->n_blocks = bio->bi_size >> v->data_dev_block_bits; + + bio->bi_end_io = verity_end_io; + bio->bi_private = io; + io->io_vec_size = bio->bi_vcnt - bio->bi_idx; + if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE) + io->io_vec = io->io_vec_inline; + else + io->io_vec = mempool_alloc(v->vec_mempool, GFP_NOIO); + memcpy(io->io_vec, bio_iovec(bio), + io->io_vec_size * sizeof(struct bio_vec)); + + verity_prefetch_io(v, io); + + generic_make_request(bio); + + return DM_MAPIO_SUBMITTED; +} + +/* + * Status: V (valid) or C (corruption found) + */ +static int verity_status(struct dm_target *ti, status_type_t type, + char *result, unsigned maxlen) +{ + struct dm_verity *v = ti->private; + unsigned sz = 0; + unsigned x; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%c", v->hash_failed ? 'C' : 'V'); + break; + case STATUSTYPE_TABLE: + DMEMIT("%u %s %s %u %u %llu %llu %s ", + v->version, + v->data_dev->name, + v->hash_dev->name, + 1 << v->data_dev_block_bits, + 1 << v->hash_dev_block_bits, + (unsigned long long)v->data_blocks, + (unsigned long long)v->hash_start, + v->alg_name + ); + for (x = 0; x < v->digest_size; x++) + DMEMIT("%02x", v->root_digest[x]); + DMEMIT(" "); + if (!v->salt_size) + DMEMIT("-"); + else + for (x = 0; x < v->salt_size; x++) + DMEMIT("%02x", v->salt[x]); + break; + } + + return 0; +} + +static int verity_ioctl(struct dm_target *ti, unsigned cmd, + unsigned long arg) +{ + struct dm_verity *v = ti->private; + int r = 0; + + if (v->data_start || + ti->len != i_size_read(v->data_dev->bdev->bd_inode) >> SECTOR_SHIFT) + r = scsi_verify_blk_ioctl(NULL, cmd); + + return r ? : __blkdev_driver_ioctl(v->data_dev->bdev, v->data_dev->mode, + cmd, arg); +} + +static int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct dm_verity *v = ti->private; + struct request_queue *q = bdev_get_queue(v->data_dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = v->data_dev->bdev; + bvm->bi_sector = verity_map_sector(v, bvm->bi_sector); + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + +static int verity_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct dm_verity *v = ti->private; + + return fn(ti, v->data_dev, v->data_start, ti->len, data); +} + +static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct dm_verity *v = ti->private; + + if (limits->logical_block_size < 1 << v->data_dev_block_bits) + limits->logical_block_size = 1 << v->data_dev_block_bits; + + if (limits->physical_block_size < 1 << v->data_dev_block_bits) + limits->physical_block_size = 1 << v->data_dev_block_bits; + + blk_limits_io_min(limits, limits->logical_block_size); +} + +static void verity_dtr(struct dm_target *ti) +{ + struct dm_verity *v = ti->private; + + if (v->verify_wq) + destroy_workqueue(v->verify_wq); + + if (v->vec_mempool) + mempool_destroy(v->vec_mempool); + + if (v->io_mempool) + mempool_destroy(v->io_mempool); + + if (v->bufio) + dm_bufio_client_destroy(v->bufio); + + kfree(v->salt); + kfree(v->root_digest); + + if (v->tfm) + crypto_free_shash(v->tfm); + + kfree(v->alg_name); + + if (v->hash_dev) + dm_put_device(ti, v->hash_dev); + + if (v->data_dev) + dm_put_device(ti, v->data_dev); + + kfree(v); +} + +/* + * Target parameters: + * <version> The current format is version 1. + * Vsn 0 is compatible with original Chromium OS releases. + * <data device> + * <hash device> + * <data block size> + * <hash block size> + * <the number of data blocks> + * <hash start block> + * <algorithm> + * <digest> + * <salt> Hex string or "-" if no salt. + */ +static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + struct dm_verity *v; + unsigned num; + unsigned long long num_ll; + int r; + int i; + sector_t hash_position; + char dummy; + + v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL); + if (!v) { + ti->error = "Cannot allocate verity structure"; + return -ENOMEM; + } + ti->private = v; + v->ti = ti; + + if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) { + ti->error = "Device must be readonly"; + r = -EINVAL; + goto bad; + } + + if (argc != 10) { + ti->error = "Invalid argument count: exactly 10 arguments required"; + r = -EINVAL; + goto bad; + } + + if (sscanf(argv[0], "%d%c", &num, &dummy) != 1 || + num < 0 || num > 1) { + ti->error = "Invalid version"; + r = -EINVAL; + goto bad; + } + v->version = num; + + r = dm_get_device(ti, argv[1], FMODE_READ, &v->data_dev); + if (r) { + ti->error = "Data device lookup failed"; + goto bad; + } + + r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev); + if (r) { + ti->error = "Data device lookup failed"; + goto bad; + } + + if (sscanf(argv[3], "%u%c", &num, &dummy) != 1 || + !num || (num & (num - 1)) || + num < bdev_logical_block_size(v->data_dev->bdev) || + num > PAGE_SIZE) { + ti->error = "Invalid data device block size"; + r = -EINVAL; + goto bad; + } + v->data_dev_block_bits = ffs(num) - 1; + + if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 || + !num || (num & (num - 1)) || + num < bdev_logical_block_size(v->hash_dev->bdev) || + num > INT_MAX) { + ti->error = "Invalid hash device block size"; + r = -EINVAL; + goto bad; + } + v->hash_dev_block_bits = ffs(num) - 1; + + if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 || + num_ll << (v->data_dev_block_bits - SECTOR_SHIFT) != + (sector_t)num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) { + ti->error = "Invalid data blocks"; + r = -EINVAL; + goto bad; + } + v->data_blocks = num_ll; + + if (ti->len > (v->data_blocks << (v->data_dev_block_bits - SECTOR_SHIFT))) { + ti->error = "Data device is too small"; + r = -EINVAL; + goto bad; + } + + if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 || + num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT) != + (sector_t)num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT)) { + ti->error = "Invalid hash start"; + r = -EINVAL; + goto bad; + } + v->hash_start = num_ll; + + v->alg_name = kstrdup(argv[7], GFP_KERNEL); + if (!v->alg_name) { + ti->error = "Cannot allocate algorithm name"; + r = -ENOMEM; + goto bad; + } + + v->tfm = crypto_alloc_shash(v->alg_name, 0, 0); + if (IS_ERR(v->tfm)) { + ti->error = "Cannot initialize hash function"; + r = PTR_ERR(v->tfm); + v->tfm = NULL; + goto bad; + } + v->digest_size = crypto_shash_digestsize(v->tfm); + if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { + ti->error = "Digest size too big"; + r = -EINVAL; + goto bad; + } + v->shash_descsize = + sizeof(struct shash_desc) + crypto_shash_descsize(v->tfm); + + v->root_digest = kmalloc(v->digest_size, GFP_KERNEL); + if (!v->root_digest) { + ti->error = "Cannot allocate root digest"; + r = -ENOMEM; + goto bad; + } + if (strlen(argv[8]) != v->digest_size * 2 || + hex2bin(v->root_digest, argv[8], v->digest_size)) { + ti->error = "Invalid root digest"; + r = -EINVAL; + goto bad; + } + + if (strcmp(argv[9], "-")) { + v->salt_size = strlen(argv[9]) / 2; + v->salt = kmalloc(v->salt_size, GFP_KERNEL); + if (!v->salt) { + ti->error = "Cannot allocate salt"; + r = -ENOMEM; + goto bad; + } + if (strlen(argv[9]) != v->salt_size * 2 || + hex2bin(v->salt, argv[9], v->salt_size)) { + ti->error = "Invalid salt"; + r = -EINVAL; + goto bad; + } + } + + v->hash_per_block_bits = + fls((1 << v->hash_dev_block_bits) / v->digest_size) - 1; + + v->levels = 0; + if (v->data_blocks) + while (v->hash_per_block_bits * v->levels < 64 && + (unsigned long long)(v->data_blocks - 1) >> + (v->hash_per_block_bits * v->levels)) + v->levels++; + + if (v->levels > DM_VERITY_MAX_LEVELS) { + ti->error = "Too many tree levels"; + r = -E2BIG; + goto bad; + } + + hash_position = v->hash_start; + for (i = v->levels - 1; i >= 0; i--) { + sector_t s; + v->hash_level_block[i] = hash_position; + s = verity_position_at_level(v, v->data_blocks, i); + s = (s >> v->hash_per_block_bits) + + !!(s & ((1 << v->hash_per_block_bits) - 1)); + if (hash_position + s < hash_position) { + ti->error = "Hash device offset overflow"; + r = -E2BIG; + goto bad; + } + hash_position += s; + } + v->hash_blocks = hash_position; + + v->bufio = dm_bufio_client_create(v->hash_dev->bdev, + 1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux), + dm_bufio_alloc_callback, NULL); + if (IS_ERR(v->bufio)) { + ti->error = "Cannot initialize dm-bufio"; + r = PTR_ERR(v->bufio); + v->bufio = NULL; + goto bad; + } + + if (dm_bufio_get_device_size(v->bufio) < v->hash_blocks) { + ti->error = "Hash device is too small"; + r = -E2BIG; + goto bad; + } + + v->io_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE, + sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2); + if (!v->io_mempool) { + ti->error = "Cannot allocate io mempool"; + r = -ENOMEM; + goto bad; + } + + v->vec_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE, + BIO_MAX_PAGES * sizeof(struct bio_vec)); + if (!v->vec_mempool) { + ti->error = "Cannot allocate vector mempool"; + r = -ENOMEM; + goto bad; + } + + /* WQ_UNBOUND greatly improves performance when running on ramdisk */ + v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus()); + if (!v->verify_wq) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + + return 0; + +bad: + verity_dtr(ti); + + return r; +} + +static struct target_type verity_target = { + .name = "verity", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = verity_ctr, + .dtr = verity_dtr, + .map = verity_map, + .status = verity_status, + .ioctl = verity_ioctl, + .merge = verity_merge, + .iterate_devices = verity_iterate_devices, + .io_hints = verity_io_hints, +}; + +static int __init dm_verity_init(void) +{ + int r; + + r = dm_register_target(&verity_target); + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +static void __exit dm_verity_exit(void) +{ + dm_unregister_target(&verity_target); +} + +module_init(dm_verity_init); +module_exit(dm_verity_exit); + +MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>"); +MODULE_AUTHOR("Mandeep Baines <msb@chromium.org>"); +MODULE_AUTHOR("Will Drewry <wad@chromium.org>"); +MODULE_DESCRIPTION(DM_NAME " target for transparent disk integrity checking"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b89c548ec3f8..e24143cc2040 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1016,6 +1016,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, /* * Store bio_set for cleanup. */ + clone->bi_end_io = NULL; clone->bi_private = md->bs; bio_put(clone); free_tio(md, tio); diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index feb2c3c7bb44..45135f69509c 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -315,7 +315,7 @@ static int run(struct mddev *mddev) } conf->nfaults = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) conf->rdev = rdev; md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 627456542fb3..b0fcc7d02adb 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -68,10 +68,19 @@ static int linear_mergeable_bvec(struct request_queue *q, struct dev_info *dev0; unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); + int maxbytes = biovec->bv_len; + struct request_queue *subq; rcu_read_lock(); dev0 = which_dev(mddev, sector); maxsectors = dev0->end_sector - sector; + subq = bdev_get_queue(dev0->rdev->bdev); + if (subq->merge_bvec_fn) { + bvm->bi_bdev = dev0->rdev->bdev; + bvm->bi_sector -= dev0->end_sector - dev0->rdev->sectors; + maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm, + biovec)); + } rcu_read_unlock(); if (maxsectors < bio_sectors) @@ -80,12 +89,12 @@ static int linear_mergeable_bvec(struct request_queue *q, maxsectors -= bio_sectors; if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0) - return biovec->bv_len; - /* The bytes available at this offset could be really big, - * so we cap at 2^31 to avoid overflow */ - if (maxsectors > (1 << (31-9))) - return 1<<31; - return maxsectors << 9; + return maxbytes; + + if (maxsectors > (maxbytes >> 9)) + return maxbytes; + else + return maxsectors << 9; } static int linear_congested(void *data, int bits) @@ -138,7 +147,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) cnt = 0; conf->array_sectors = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { int j = rdev->raid_disk; struct dev_info *disk = conf->disks + j; sector_t sectors; @@ -158,15 +167,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); - /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit max_segments to 1 lying within - * a single page. - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); - } conf->array_sectors += rdev->sectors; cnt++; diff --git a/drivers/md/md.c b/drivers/md/md.c index ce88755baf4a..b572e1e386ce 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -439,7 +439,7 @@ static void submit_flushes(struct work_struct *ws) INIT_WORK(&mddev->flush_work, md_submit_flush_data); atomic_set(&mddev->flush_pending, 1); rcu_read_lock(); - list_for_each_entry_rcu(rdev, &mddev->disks, same_set) + rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) { /* Take two references, one is dropped @@ -749,7 +749,7 @@ static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr) { struct md_rdev *rdev; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->desc_nr == nr) return rdev; @@ -760,7 +760,7 @@ static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev) { struct md_rdev *rdev; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->bdev->bd_dev == dev) return rdev; @@ -1342,7 +1342,7 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) sb->state |= (1<<MD_SB_BITMAP_PRESENT); sb->disks[0].state = (1<<MD_DISK_REMOVED); - list_for_each_entry(rdev2, &mddev->disks, same_set) { + rdev_for_each(rdev2, mddev) { mdp_disk_t *d; int desc_nr; int is_active = test_bit(In_sync, &rdev2->flags); @@ -1805,18 +1805,18 @@ retry: | BB_LEN(internal_bb)); *bbp++ = cpu_to_le64(store_bb); } + bb->changed = 0; if (read_seqretry(&bb->lock, seq)) goto retry; bb->sector = (rdev->sb_start + (int)le32_to_cpu(sb->bblog_offset)); bb->size = le16_to_cpu(sb->bblog_size); - bb->changed = 0; } } max_dev = 0; - list_for_each_entry(rdev2, &mddev->disks, same_set) + rdev_for_each(rdev2, mddev) if (rdev2->desc_nr+1 > max_dev) max_dev = rdev2->desc_nr+1; @@ -1833,7 +1833,7 @@ retry: for (i=0; i<max_dev;i++) sb->dev_roles[i] = cpu_to_le16(0xfffe); - list_for_each_entry(rdev2, &mddev->disks, same_set) { + rdev_for_each(rdev2, mddev) { i = rdev2->desc_nr; if (test_bit(Faulty, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(0xfffe); @@ -1948,7 +1948,7 @@ int md_integrity_register(struct mddev *mddev) return 0; /* nothing to do */ if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) return 0; /* shouldn't register, or already is */ - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { /* skip spares and non-functional disks */ if (test_bit(Faulty, &rdev->flags)) continue; @@ -2175,7 +2175,7 @@ static void export_array(struct mddev *mddev) { struct md_rdev *rdev, *tmp; - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each_safe(rdev, tmp, mddev) { if (!rdev->mddev) { MD_BUG(); continue; @@ -2307,11 +2307,11 @@ static void md_print_devices(void) bitmap_print_sb(mddev->bitmap); else printk("%s: ", mdname(mddev)); - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) printk("<%s>", bdevname(rdev->bdev,b)); printk("\n"); - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) print_rdev(rdev, mddev->major_version); } printk("md: **********************************\n"); @@ -2328,7 +2328,7 @@ static void sync_sbs(struct mddev * mddev, int nospares) * with the rest of the array) */ struct md_rdev *rdev; - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (rdev->sb_events == mddev->events || (nospares && rdev->raid_disk < 0 && @@ -2351,7 +2351,7 @@ static void md_update_sb(struct mddev * mddev, int force_change) repeat: /* First make sure individual recovery_offsets are correct */ - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && !test_bit(In_sync, &rdev->flags) && @@ -2364,8 +2364,9 @@ repeat: clear_bit(MD_CHANGE_DEVS, &mddev->flags); if (!mddev->external) { clear_bit(MD_CHANGE_PENDING, &mddev->flags); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (rdev->badblocks.changed) { + rdev->badblocks.changed = 0; md_ack_all_badblocks(&rdev->badblocks); md_error(mddev, rdev); } @@ -2430,7 +2431,7 @@ repeat: mddev->events --; } - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (rdev->badblocks.changed) any_badblocks_changed++; if (test_bit(Faulty, &rdev->flags)) @@ -2444,7 +2445,7 @@ repeat: mdname(mddev), mddev->in_sync); bitmap_update_sb(mddev->bitmap); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { char b[BDEVNAME_SIZE]; if (rdev->sb_loaded != 1) @@ -2493,7 +2494,7 @@ repeat: if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) sysfs_notify(&mddev->kobj, NULL, "sync_completed"); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (test_and_clear_bit(FaultRecorded, &rdev->flags)) clear_bit(Blocked, &rdev->flags); @@ -2896,7 +2897,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) struct md_rdev *rdev2; mddev_lock(mddev); - list_for_each_entry(rdev2, &mddev->disks, same_set) + rdev_for_each(rdev2, mddev) if (rdev->bdev == rdev2->bdev && rdev != rdev2 && overlaps(rdev->data_offset, rdev->sectors, @@ -3193,7 +3194,7 @@ static void analyze_sbs(struct mddev * mddev) char b[BDEVNAME_SIZE]; freshest = NULL; - rdev_for_each(rdev, tmp, mddev) + rdev_for_each_safe(rdev, tmp, mddev) switch (super_types[mddev->major_version]. load_super(rdev, freshest, mddev->minor_version)) { case 1: @@ -3214,7 +3215,7 @@ static void analyze_sbs(struct mddev * mddev) validate_super(mddev, freshest); i = 0; - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each_safe(rdev, tmp, mddev) { if (mddev->max_disks && (rdev->desc_nr >= mddev->max_disks || i > mddev->max_disks)) { @@ -3403,7 +3404,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) return -EINVAL; } - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) rdev->new_raid_disk = rdev->raid_disk; /* ->takeover must set new_* and/or delta_disks @@ -3456,7 +3457,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->safemode = 0; } - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) continue; if (rdev->new_raid_disk >= mddev->raid_disks) @@ -3465,7 +3466,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) continue; sysfs_unlink_rdev(mddev, rdev); } - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) continue; if (rdev->new_raid_disk == rdev->raid_disk) @@ -4796,7 +4797,7 @@ int md_run(struct mddev *mddev) * the only valid external interface is through the md * device. */ - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (test_bit(Faulty, &rdev->flags)) continue; sync_blockdev(rdev->bdev); @@ -4867,8 +4868,8 @@ int md_run(struct mddev *mddev) struct md_rdev *rdev2; int warned = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) - list_for_each_entry(rdev2, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) + rdev_for_each(rdev2, mddev) { if (rdev < rdev2 && rdev->bdev->bd_contains == rdev2->bdev->bd_contains) { @@ -4945,7 +4946,7 @@ int md_run(struct mddev *mddev) mddev->in_sync = 1; smp_wmb(); mddev->ready = 1; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) if (sysfs_link_rdev(mddev, rdev)) /* failure here is OK */; @@ -5073,6 +5074,7 @@ static void md_clean(struct mddev *mddev) mddev->changed = 0; mddev->degraded = 0; mddev->safemode = 0; + mddev->merge_check_needed = 0; mddev->bitmap_info.offset = 0; mddev->bitmap_info.default_offset = 0; mddev->bitmap_info.chunksize = 0; @@ -5175,7 +5177,7 @@ static int do_md_stop(struct mddev * mddev, int mode, int is_open) /* tell userspace to handle 'inactive' */ sysfs_notify_dirent_safe(mddev->sysfs_state); - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) sysfs_unlink_rdev(mddev, rdev); @@ -5226,7 +5228,7 @@ static void autorun_array(struct mddev *mddev) printk(KERN_INFO "md: running: "); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { char b[BDEVNAME_SIZE]; printk("<%s>", bdevname(rdev->bdev,b)); } @@ -5356,7 +5358,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg) struct md_rdev *rdev; nr=working=insync=failed=spare=0; - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { nr++; if (test_bit(Faulty, &rdev->flags)) failed++; @@ -5923,7 +5925,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) * grow, and re-add. */ return -EBUSY; - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { sector_t avail = rdev->sectors; if (fit && (num_sectors == 0 || num_sectors > avail)) @@ -6724,7 +6726,6 @@ static int md_seq_show(struct seq_file *seq, void *v) struct mddev *mddev = v; sector_t sectors; struct md_rdev *rdev; - struct bitmap *bitmap; if (v == (void*)1) { struct md_personality *pers; @@ -6758,7 +6759,7 @@ static int md_seq_show(struct seq_file *seq, void *v) } sectors = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { char b[BDEVNAME_SIZE]; seq_printf(seq, " %s[%d]", bdevname(rdev->bdev,b), rdev->desc_nr); @@ -6812,27 +6813,7 @@ static int md_seq_show(struct seq_file *seq, void *v) } else seq_printf(seq, "\n "); - if ((bitmap = mddev->bitmap)) { - unsigned long chunk_kb; - unsigned long flags; - spin_lock_irqsave(&bitmap->lock, flags); - chunk_kb = mddev->bitmap_info.chunksize >> 10; - seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " - "%lu%s chunk", - bitmap->pages - bitmap->missing_pages, - bitmap->pages, - (bitmap->pages - bitmap->missing_pages) - << (PAGE_SHIFT - 10), - chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, - chunk_kb ? "KB" : "B"); - if (bitmap->file) { - seq_printf(seq, ", file: "); - seq_path(seq, &bitmap->file->f_path, " \t\n"); - } - - seq_printf(seq, "\n"); - spin_unlock_irqrestore(&bitmap->lock, flags); - } + bitmap_status(seq, mddev->bitmap); seq_printf(seq, "\n"); } @@ -7170,7 +7151,7 @@ void md_do_sync(struct mddev *mddev) max_sectors = mddev->dev_sectors; j = MaxSector; rcu_read_lock(); - list_for_each_entry_rcu(rdev, &mddev->disks, same_set) + rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && @@ -7342,7 +7323,7 @@ void md_do_sync(struct mddev *mddev) if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; rcu_read_lock(); - list_for_each_entry_rcu(rdev, &mddev->disks, same_set) + rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && !test_bit(Faulty, &rdev->flags) && @@ -7388,7 +7369,7 @@ static int remove_and_add_spares(struct mddev *mddev) mddev->curr_resync_completed = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && (test_bit(Faulty, &rdev->flags) || @@ -7406,7 +7387,7 @@ static int remove_and_add_spares(struct mddev *mddev) "degraded"); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) @@ -7451,7 +7432,7 @@ static void reap_sync_thread(struct mddev *mddev) * do the superblock for an incrementally recovered device * written out. */ - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (!mddev->degraded || test_bit(In_sync, &rdev->flags)) rdev->saved_raid_disk = -1; @@ -7529,7 +7510,7 @@ void md_check_recovery(struct mddev *mddev) * failed devices. */ struct md_rdev *rdev; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && test_bit(Faulty, &rdev->flags) && @@ -8040,7 +8021,7 @@ void md_ack_all_badblocks(struct badblocks *bb) return; write_seqlock_irq(&bb->lock); - if (bb->changed == 0) { + if (bb->changed == 0 && bb->unacked_exist) { u64 *p = bb->page; int i; for (i = 0; i < bb->count ; i++) { @@ -8157,30 +8138,23 @@ static int md_notify_reboot(struct notifier_block *this, struct mddev *mddev; int need_delay = 0; - if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { - - printk(KERN_INFO "md: stopping all md devices.\n"); - - for_each_mddev(mddev, tmp) { - if (mddev_trylock(mddev)) { - /* Force a switch to readonly even array - * appears to still be in use. Hence - * the '100'. - */ - md_set_readonly(mddev, 100); - mddev_unlock(mddev); - } - need_delay = 1; + for_each_mddev(mddev, tmp) { + if (mddev_trylock(mddev)) { + __md_stop_writes(mddev); + mddev->safemode = 2; + mddev_unlock(mddev); } - /* - * certain more exotic SCSI devices are known to be - * volatile wrt too early system reboots. While the - * right place to handle this issue is the given - * driver, we do want to have a safe RAID driver ... - */ - if (need_delay) - mdelay(1000*1); + need_delay = 1; } + /* + * certain more exotic SCSI devices are known to be + * volatile wrt too early system reboots. While the + * right place to handle this issue is the given + * driver, we do want to have a safe RAID driver ... + */ + if (need_delay) + mdelay(1000*1); + return NOTIFY_DONE; } diff --git a/drivers/md/md.h b/drivers/md/md.h index 44c63dfeeb2b..1c2063ccf48e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -128,6 +128,10 @@ struct md_rdev { enum flag_bits { Faulty, /* device is known to have a fault */ In_sync, /* device is in_sync with rest of array */ + Unmerged, /* device is being added to array and should + * be considerred for bvec_merge_fn but not + * yet for actual IO + */ WriteMostly, /* Avoid reading if at all possible */ AutoDetected, /* added by auto-detect */ Blocked, /* An error occurred but has not yet @@ -345,6 +349,10 @@ struct mddev { int degraded; /* whether md should consider * adding a spare */ + int merge_check_needed; /* at least one + * member device + * has a + * merge_bvec_fn */ atomic_t recovery_active; /* blocks scheduled, but not written */ wait_queue_head_t recovery_wait; @@ -519,7 +527,10 @@ static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) /* * iterates through the 'same array disks' ringlist */ -#define rdev_for_each(rdev, tmp, mddev) \ +#define rdev_for_each(rdev, mddev) \ + list_for_each_entry(rdev, &((mddev)->disks), same_set) + +#define rdev_for_each_safe(rdev, tmp, mddev) \ list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) #define rdev_for_each_rcu(rdev, mddev) \ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index a222f516660e..9339e67fcc79 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -428,7 +428,7 @@ static int multipath_run (struct mddev *mddev) } working_disks = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { disk_idx = rdev->raid_disk; if (disk_idx < 0 || disk_idx >= mddev->raid_disks) diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index d279c768f8f1..5709bfeab1e8 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -108,12 +108,9 @@ static inline void *value_base(struct node *n) return &n->keys[le32_to_cpu(n->header.max_entries)]; } -/* - * FIXME: Now that value size is stored in node we don't need the third parm. - */ -static inline void *value_ptr(struct node *n, uint32_t index, size_t value_size) +static inline void *value_ptr(struct node *n, uint32_t index) { - BUG_ON(value_size != le32_to_cpu(n->header.value_size)); + uint32_t value_size = le32_to_cpu(n->header.value_size); return value_base(n) + (value_size * index); } diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c index 023fbc2d389e..aa71e2359a07 100644 --- a/drivers/md/persistent-data/dm-btree-remove.c +++ b/drivers/md/persistent-data/dm-btree-remove.c @@ -61,20 +61,20 @@ static void node_shift(struct node *n, int shift) if (shift < 0) { shift = -shift; BUG_ON(shift > nr_entries); - BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift, value_size)); + BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift)); memmove(key_ptr(n, 0), key_ptr(n, shift), (nr_entries - shift) * sizeof(__le64)); - memmove(value_ptr(n, 0, value_size), - value_ptr(n, shift, value_size), + memmove(value_ptr(n, 0), + value_ptr(n, shift), (nr_entries - shift) * value_size); } else { BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries)); memmove(key_ptr(n, shift), key_ptr(n, 0), nr_entries * sizeof(__le64)); - memmove(value_ptr(n, shift, value_size), - value_ptr(n, 0, value_size), + memmove(value_ptr(n, shift), + value_ptr(n, 0), nr_entries * value_size); } } @@ -91,16 +91,16 @@ static void node_copy(struct node *left, struct node *right, int shift) memcpy(key_ptr(left, nr_left), key_ptr(right, 0), shift * sizeof(__le64)); - memcpy(value_ptr(left, nr_left, value_size), - value_ptr(right, 0, value_size), + memcpy(value_ptr(left, nr_left), + value_ptr(right, 0), shift * value_size); } else { BUG_ON(shift > le32_to_cpu(right->header.max_entries)); memcpy(key_ptr(right, 0), key_ptr(left, nr_left - shift), shift * sizeof(__le64)); - memcpy(value_ptr(right, 0, value_size), - value_ptr(left, nr_left - shift, value_size), + memcpy(value_ptr(right, 0), + value_ptr(left, nr_left - shift), shift * value_size); } } @@ -120,26 +120,17 @@ static void delete_at(struct node *n, unsigned index) key_ptr(n, index + 1), nr_to_copy * sizeof(__le64)); - memmove(value_ptr(n, index, value_size), - value_ptr(n, index + 1, value_size), + memmove(value_ptr(n, index), + value_ptr(n, index + 1), nr_to_copy * value_size); } n->header.nr_entries = cpu_to_le32(nr_entries - 1); } -static unsigned del_threshold(struct node *n) -{ - return le32_to_cpu(n->header.max_entries) / 3; -} - static unsigned merge_threshold(struct node *n) { - /* - * The extra one is because we know we're potentially going to - * delete an entry. - */ - return 2 * (le32_to_cpu(n->header.max_entries) / 3) + 1; + return le32_to_cpu(n->header.max_entries) / 3; } struct child { @@ -175,7 +166,7 @@ static int init_child(struct dm_btree_info *info, struct node *parent, if (inc) inc_children(info->tm, result->n, &le64_type); - *((__le64 *) value_ptr(parent, index, sizeof(__le64))) = + *((__le64 *) value_ptr(parent, index)) = cpu_to_le64(dm_block_location(result->block)); return 0; @@ -188,6 +179,15 @@ static int exit_child(struct dm_btree_info *info, struct child *c) static void shift(struct node *left, struct node *right, int count) { + uint32_t nr_left = le32_to_cpu(left->header.nr_entries); + uint32_t nr_right = le32_to_cpu(right->header.nr_entries); + uint32_t max_entries = le32_to_cpu(left->header.max_entries); + uint32_t r_max_entries = le32_to_cpu(right->header.max_entries); + + BUG_ON(max_entries != r_max_entries); + BUG_ON(nr_left - count > max_entries); + BUG_ON(nr_right + count > max_entries); + if (!count) return; @@ -199,13 +199,8 @@ static void shift(struct node *left, struct node *right, int count) node_shift(right, count); } - left->header.nr_entries = - cpu_to_le32(le32_to_cpu(left->header.nr_entries) - count); - BUG_ON(le32_to_cpu(left->header.nr_entries) > le32_to_cpu(left->header.max_entries)); - - right->header.nr_entries = - cpu_to_le32(le32_to_cpu(right->header.nr_entries) + count); - BUG_ON(le32_to_cpu(right->header.nr_entries) > le32_to_cpu(right->header.max_entries)); + left->header.nr_entries = cpu_to_le32(nr_left - count); + right->header.nr_entries = cpu_to_le32(nr_right + count); } static void __rebalance2(struct dm_btree_info *info, struct node *parent, @@ -215,8 +210,9 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent, struct node *right = r->n; uint32_t nr_left = le32_to_cpu(left->header.nr_entries); uint32_t nr_right = le32_to_cpu(right->header.nr_entries); + unsigned threshold = 2 * merge_threshold(left) + 1; - if (nr_left + nr_right <= merge_threshold(left)) { + if (nr_left + nr_right < threshold) { /* * Merge */ @@ -234,9 +230,6 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent, * Rebalance. */ unsigned target_left = (nr_left + nr_right) / 2; - unsigned shift_ = nr_left - target_left; - BUG_ON(le32_to_cpu(left->header.max_entries) <= nr_left - shift_); - BUG_ON(le32_to_cpu(right->header.max_entries) <= nr_right + shift_); shift(left, right, nr_left - target_left); *key_ptr(parent, r->index) = right->keys[0]; } @@ -272,6 +265,84 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, return exit_child(info, &right); } +/* + * We dump as many entries from center as possible into left, then the rest + * in right, then rebalance2. This wastes some cpu, but I want something + * simple atm. + */ +static void delete_center_node(struct dm_btree_info *info, struct node *parent, + struct child *l, struct child *c, struct child *r, + struct node *left, struct node *center, struct node *right, + uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) +{ + uint32_t max_entries = le32_to_cpu(left->header.max_entries); + unsigned shift = min(max_entries - nr_left, nr_center); + + BUG_ON(nr_left + shift > max_entries); + node_copy(left, center, -shift); + left->header.nr_entries = cpu_to_le32(nr_left + shift); + + if (shift != nr_center) { + shift = nr_center - shift; + BUG_ON((nr_right + shift) > max_entries); + node_shift(right, shift); + node_copy(center, right, shift); + right->header.nr_entries = cpu_to_le32(nr_right + shift); + } + *key_ptr(parent, r->index) = right->keys[0]; + + delete_at(parent, c->index); + r->index--; + + dm_tm_dec(info->tm, dm_block_location(c->block)); + __rebalance2(info, parent, l, r); +} + +/* + * Redistributes entries among 3 sibling nodes. + */ +static void redistribute3(struct dm_btree_info *info, struct node *parent, + struct child *l, struct child *c, struct child *r, + struct node *left, struct node *center, struct node *right, + uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) +{ + int s; + uint32_t max_entries = le32_to_cpu(left->header.max_entries); + unsigned target = (nr_left + nr_center + nr_right) / 3; + BUG_ON(target > max_entries); + + if (nr_left < nr_right) { + s = nr_left - target; + + if (s < 0 && nr_center < -s) { + /* not enough in central node */ + shift(left, center, nr_center); + s = nr_center - target; + shift(left, right, s); + nr_right += s; + } else + shift(left, center, s); + + shift(center, right, target - nr_right); + + } else { + s = target - nr_right; + if (s > 0 && nr_center < s) { + /* not enough in central node */ + shift(center, right, nr_center); + s = target - nr_center; + shift(left, right, s); + nr_left -= s; + } else + shift(center, right, s); + + shift(left, center, nr_left - target); + } + + *key_ptr(parent, c->index) = center->keys[0]; + *key_ptr(parent, r->index) = right->keys[0]; +} + static void __rebalance3(struct dm_btree_info *info, struct node *parent, struct child *l, struct child *c, struct child *r) { @@ -282,62 +353,18 @@ static void __rebalance3(struct dm_btree_info *info, struct node *parent, uint32_t nr_left = le32_to_cpu(left->header.nr_entries); uint32_t nr_center = le32_to_cpu(center->header.nr_entries); uint32_t nr_right = le32_to_cpu(right->header.nr_entries); - uint32_t max_entries = le32_to_cpu(left->header.max_entries); - unsigned target; + unsigned threshold = merge_threshold(left) * 4 + 1; BUG_ON(left->header.max_entries != center->header.max_entries); BUG_ON(center->header.max_entries != right->header.max_entries); - if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center)) { - /* - * Delete center node: - * - * We dump as many entries from center as possible into - * left, then the rest in right, then rebalance2. This - * wastes some cpu, but I want something simple atm. - */ - unsigned shift = min(max_entries - nr_left, nr_center); - - BUG_ON(nr_left + shift > max_entries); - node_copy(left, center, -shift); - left->header.nr_entries = cpu_to_le32(nr_left + shift); - - if (shift != nr_center) { - shift = nr_center - shift; - BUG_ON((nr_right + shift) >= max_entries); - node_shift(right, shift); - node_copy(center, right, shift); - right->header.nr_entries = cpu_to_le32(nr_right + shift); - } - *key_ptr(parent, r->index) = right->keys[0]; - - delete_at(parent, c->index); - r->index--; - - dm_tm_dec(info->tm, dm_block_location(c->block)); - __rebalance2(info, parent, l, r); - - return; - } - - /* - * Rebalance - */ - target = (nr_left + nr_center + nr_right) / 3; - BUG_ON(target > max_entries); - - /* - * Adjust the left node - */ - shift(left, center, nr_left - target); - - /* - * Adjust the right node - */ - shift(center, right, target - nr_right); - *key_ptr(parent, c->index) = center->keys[0]; - *key_ptr(parent, r->index) = right->keys[0]; + if ((nr_left + nr_center + nr_right) < threshold) + delete_center_node(info, parent, l, c, r, left, center, right, + nr_left, nr_center, nr_right); + else + redistribute3(info, parent, l, c, r, left, center, right, + nr_left, nr_center, nr_right); } static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, @@ -441,9 +468,6 @@ static int rebalance_children(struct shadow_spine *s, if (r) return r; - if (child_entries > del_threshold(n)) - return 0; - has_left_sibling = i > 0; has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); @@ -496,7 +520,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, */ if (shadow_has_parent(s)) { __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); - memcpy(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(__le64)), + memcpy(value_ptr(dm_block_data(shadow_parent(s)), i), &location, sizeof(__le64)); } @@ -553,7 +577,7 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, if (info->value_type.dec) info->value_type.dec(info->value_type.context, - value_ptr(n, index, info->value_type.size)); + value_ptr(n, index)); delete_at(n, index); } diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index bd1e7ffbe26c..d12b2cc51f1a 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -74,8 +74,7 @@ void inc_children(struct dm_transaction_manager *tm, struct node *n, dm_tm_inc(tm, value64(n, i)); else if (vt->inc) for (i = 0; i < nr_entries; i++) - vt->inc(vt->context, - value_ptr(n, i, vt->size)); + vt->inc(vt->context, value_ptr(n, i)); } static int insert_at(size_t value_size, struct node *node, unsigned index, @@ -281,7 +280,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root) for (i = 0; i < f->nr_children; i++) info->value_type.dec(info->value_type.context, - value_ptr(f->n, i, info->value_type.size)); + value_ptr(f->n, i)); } f->current_child = f->nr_children; } @@ -320,7 +319,7 @@ static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key, } while (!(flags & LEAF_NODE)); *result_key = le64_to_cpu(ro_node(s)->keys[i]); - memcpy(v, value_ptr(ro_node(s), i, value_size), value_size); + memcpy(v, value_ptr(ro_node(s), i), value_size); return 0; } @@ -432,7 +431,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root, size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ? sizeof(uint64_t) : s->info->value_type.size; - memcpy(value_ptr(rn, 0, size), value_ptr(ln, nr_left, size), + memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left), size * nr_right); /* @@ -443,7 +442,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root, pn = dm_block_data(parent); location = cpu_to_le64(dm_block_location(left)); __dm_bless_for_disk(&location); - memcpy_disk(value_ptr(pn, parent_index, sizeof(__le64)), + memcpy_disk(value_ptr(pn, parent_index), &location, sizeof(__le64)); location = cpu_to_le64(dm_block_location(right)); @@ -529,8 +528,8 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key) size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ? sizeof(__le64) : s->info->value_type.size; - memcpy(value_ptr(ln, 0, size), value_ptr(pn, 0, size), nr_left * size); - memcpy(value_ptr(rn, 0, size), value_ptr(pn, nr_left, size), + memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size); + memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left), nr_right * size); /* new_parent should just point to l and r now */ @@ -545,12 +544,12 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key) val = cpu_to_le64(dm_block_location(left)); __dm_bless_for_disk(&val); pn->keys[0] = ln->keys[0]; - memcpy_disk(value_ptr(pn, 0, sizeof(__le64)), &val, sizeof(__le64)); + memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64)); val = cpu_to_le64(dm_block_location(right)); __dm_bless_for_disk(&val); pn->keys[1] = rn->keys[0]; - memcpy_disk(value_ptr(pn, 1, sizeof(__le64)), &val, sizeof(__le64)); + memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64)); /* * rejig the spine. This is ugly, since it knows too @@ -595,7 +594,7 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root, __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); __dm_bless_for_disk(&location); - memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(uint64_t)), + memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i), &location, sizeof(__le64)); } @@ -710,12 +709,12 @@ static int insert(struct dm_btree_info *info, dm_block_t root, (!info->value_type.equal || !info->value_type.equal( info->value_type.context, - value_ptr(n, index, info->value_type.size), + value_ptr(n, index), value))) { info->value_type.dec(info->value_type.context, - value_ptr(n, index, info->value_type.size)); + value_ptr(n, index)); } - memcpy_disk(value_ptr(n, index, info->value_type.size), + memcpy_disk(value_ptr(n, index), value, info->value_type.size); } diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index df2494c06cdc..ff3beed6ad2d 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -405,8 +405,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b, if (r < 0) return r; -#if 0 - /* FIXME: dm_btree_remove doesn't handle this yet */ if (old > 2) { r = dm_btree_remove(&ll->ref_count_info, ll->ref_count_root, @@ -414,7 +412,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b, if (r) return r; } -#endif } else { __le32 le_rc = cpu_to_le32(ref_count); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 7294bd115e34..6f31f5596e01 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -91,7 +91,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) if (!conf) return -ENOMEM; - list_for_each_entry(rdev1, &mddev->disks, same_set) { + rdev_for_each(rdev1, mddev) { pr_debug("md/raid0:%s: looking at %s\n", mdname(mddev), bdevname(rdev1->bdev, b)); @@ -102,7 +102,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) sector_div(sectors, mddev->chunk_sectors); rdev1->sectors = sectors * mddev->chunk_sectors; - list_for_each_entry(rdev2, &mddev->disks, same_set) { + rdev_for_each(rdev2, mddev) { pr_debug("md/raid0:%s: comparing %s(%llu)" " with %s(%llu)\n", mdname(mddev), @@ -157,7 +157,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) smallest = NULL; dev = conf->devlist; err = -EINVAL; - list_for_each_entry(rdev1, &mddev->disks, same_set) { + rdev_for_each(rdev1, mddev) { int j = rdev1->raid_disk; if (mddev->level == 10) { @@ -188,16 +188,10 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) disk_stack_limits(mddev->gendisk, rdev1->bdev, rdev1->data_offset << 9); - /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit ->max_segments to 1, lying within - * a single page. - */ - if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); - } + if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) + conf->has_merge_bvec = 1; + if (!smallest || (rdev1->sectors < smallest->sectors)) smallest = rdev1; cnt++; @@ -290,8 +284,64 @@ abort: return err; } +/* Find the zone which holds a particular offset + * Update *sectorp to be an offset in that zone + */ +static struct strip_zone *find_zone(struct r0conf *conf, + sector_t *sectorp) +{ + int i; + struct strip_zone *z = conf->strip_zone; + sector_t sector = *sectorp; + + for (i = 0; i < conf->nr_strip_zones; i++) + if (sector < z[i].zone_end) { + if (i) + *sectorp = sector - z[i-1].zone_end; + return z + i; + } + BUG(); +} + +/* + * remaps the bio to the target device. we separate two flows. + * power 2 flow and a general flow for the sake of perfromance +*/ +static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone, + sector_t sector, sector_t *sector_offset) +{ + unsigned int sect_in_chunk; + sector_t chunk; + struct r0conf *conf = mddev->private; + int raid_disks = conf->strip_zone[0].nb_dev; + unsigned int chunk_sects = mddev->chunk_sectors; + + if (is_power_of_2(chunk_sects)) { + int chunksect_bits = ffz(~chunk_sects); + /* find the sector offset inside the chunk */ + sect_in_chunk = sector & (chunk_sects - 1); + sector >>= chunksect_bits; + /* chunk in zone */ + chunk = *sector_offset; + /* quotient is the chunk in real device*/ + sector_div(chunk, zone->nb_dev << chunksect_bits); + } else{ + sect_in_chunk = sector_div(sector, chunk_sects); + chunk = *sector_offset; + sector_div(chunk, chunk_sects * zone->nb_dev); + } + /* + * position the bio over the real device + * real sector = chunk in device + starting of zone + * + the position in the chunk + */ + *sector_offset = (chunk * chunk_sects) + sect_in_chunk; + return conf->devlist[(zone - conf->strip_zone)*raid_disks + + sector_div(sector, zone->nb_dev)]; +} + /** - * raid0_mergeable_bvec -- tell bio layer if a two requests can be merged + * raid0_mergeable_bvec -- tell bio layer if two requests can be merged * @q: request queue * @bvm: properties of new bio * @biovec: the request that could be merged to it. @@ -303,10 +353,15 @@ static int raid0_mergeable_bvec(struct request_queue *q, struct bio_vec *biovec) { struct mddev *mddev = q->queuedata; + struct r0conf *conf = mddev->private; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); + sector_t sector_offset = sector; int max; unsigned int chunk_sectors = mddev->chunk_sectors; unsigned int bio_sectors = bvm->bi_size >> 9; + struct strip_zone *zone; + struct md_rdev *rdev; + struct request_queue *subq; if (is_power_of_2(chunk_sectors)) max = (chunk_sectors - ((sector & (chunk_sectors-1)) @@ -314,10 +369,27 @@ static int raid0_mergeable_bvec(struct request_queue *q, else max = (chunk_sectors - (sector_div(sector, chunk_sectors) + bio_sectors)) << 9; - if (max < 0) max = 0; /* bio_add cannot handle a negative return */ + if (max < 0) + max = 0; /* bio_add cannot handle a negative return */ if (max <= biovec->bv_len && bio_sectors == 0) return biovec->bv_len; - else + if (max < biovec->bv_len) + /* too small already, no need to check further */ + return max; + if (!conf->has_merge_bvec) + return max; + + /* May need to check subordinate device */ + sector = sector_offset; + zone = find_zone(mddev->private, §or_offset); + rdev = map_sector(mddev, zone, sector, §or_offset); + subq = bdev_get_queue(rdev->bdev); + if (subq->merge_bvec_fn) { + bvm->bi_bdev = rdev->bdev; + bvm->bi_sector = sector_offset + zone->dev_start + + rdev->data_offset; + return min(max, subq->merge_bvec_fn(subq, bvm, biovec)); + } else return max; } @@ -329,7 +401,7 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks WARN_ONCE(sectors || raid_disks, "%s does not support generic reshape\n", __func__); - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) array_sectors += rdev->sectors; return array_sectors; @@ -397,62 +469,6 @@ static int raid0_stop(struct mddev *mddev) return 0; } -/* Find the zone which holds a particular offset - * Update *sectorp to be an offset in that zone - */ -static struct strip_zone *find_zone(struct r0conf *conf, - sector_t *sectorp) -{ - int i; - struct strip_zone *z = conf->strip_zone; - sector_t sector = *sectorp; - - for (i = 0; i < conf->nr_strip_zones; i++) - if (sector < z[i].zone_end) { - if (i) - *sectorp = sector - z[i-1].zone_end; - return z + i; - } - BUG(); -} - -/* - * remaps the bio to the target device. we separate two flows. - * power 2 flow and a general flow for the sake of perfromance -*/ -static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone, - sector_t sector, sector_t *sector_offset) -{ - unsigned int sect_in_chunk; - sector_t chunk; - struct r0conf *conf = mddev->private; - int raid_disks = conf->strip_zone[0].nb_dev; - unsigned int chunk_sects = mddev->chunk_sectors; - - if (is_power_of_2(chunk_sects)) { - int chunksect_bits = ffz(~chunk_sects); - /* find the sector offset inside the chunk */ - sect_in_chunk = sector & (chunk_sects - 1); - sector >>= chunksect_bits; - /* chunk in zone */ - chunk = *sector_offset; - /* quotient is the chunk in real device*/ - sector_div(chunk, zone->nb_dev << chunksect_bits); - } else{ - sect_in_chunk = sector_div(sector, chunk_sects); - chunk = *sector_offset; - sector_div(chunk, chunk_sects * zone->nb_dev); - } - /* - * position the bio over the real device - * real sector = chunk in device + starting of zone - * + the position in the chunk - */ - *sector_offset = (chunk * chunk_sects) + sect_in_chunk; - return conf->devlist[(zone - conf->strip_zone)*raid_disks - + sector_div(sector, zone->nb_dev)]; -} - /* * Is io distribute over 1 or more chunks ? */ @@ -505,7 +521,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) } sector_offset = bio->bi_sector; - zone = find_zone(mddev->private, §or_offset); + zone = find_zone(mddev->private, §or_offset); tmp_dev = map_sector(mddev, zone, bio->bi_sector, §or_offset); bio->bi_bdev = tmp_dev->bdev; @@ -543,7 +559,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev) return ERR_PTR(-EINVAL); } - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { /* check slot number for a disk */ if (rdev->raid_disk == mddev->raid_disks-1) { printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index 0884bba8df4c..05539d9c97f0 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -4,13 +4,16 @@ struct strip_zone { sector_t zone_end; /* Start of the next zone (in sectors) */ sector_t dev_start; /* Zone offset in real dev (in sectors) */ - int nb_dev; /* # of devices attached to the zone */ + int nb_dev; /* # of devices attached to the zone */ }; struct r0conf { - struct strip_zone *strip_zone; - struct md_rdev **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ - int nr_strip_zones; + struct strip_zone *strip_zone; + struct md_rdev **devlist; /* lists of rdevs, pointed to + * by strip_zone->dev */ + int nr_strip_zones; + int has_merge_bvec; /* at least one member has + * a merge_bvec_fn */ }; #endif diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a368db2431a5..4a40a200d769 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -523,6 +523,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect rdev = rcu_dereference(conf->mirrors[disk].rdev); if (r1_bio->bios[disk] == IO_BLOCKED || rdev == NULL + || test_bit(Unmerged, &rdev->flags) || test_bit(Faulty, &rdev->flags)) continue; if (!test_bit(In_sync, &rdev->flags) && @@ -614,6 +615,39 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect return best_disk; } +static int raid1_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) +{ + struct mddev *mddev = q->queuedata; + struct r1conf *conf = mddev->private; + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); + int max = biovec->bv_len; + + if (mddev->merge_check_needed) { + int disk; + rcu_read_lock(); + for (disk = 0; disk < conf->raid_disks * 2; disk++) { + struct md_rdev *rdev = rcu_dereference( + conf->mirrors[disk].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + struct request_queue *q = + bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) { + bvm->bi_sector = sector + + rdev->data_offset; + bvm->bi_bdev = rdev->bdev; + max = min(max, q->merge_bvec_fn( + q, bvm, biovec)); + } + } + } + rcu_read_unlock(); + } + return max; + +} + int md_raid1_congested(struct mddev *mddev, int bits) { struct r1conf *conf = mddev->private; @@ -624,7 +658,7 @@ int md_raid1_congested(struct mddev *mddev, int bits) return 1; rcu_read_lock(); - for (i = 0; i < conf->raid_disks; i++) { + for (i = 0; i < conf->raid_disks * 2; i++) { struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); @@ -737,9 +771,22 @@ static void wait_barrier(struct r1conf *conf) spin_lock_irq(&conf->resync_lock); if (conf->barrier) { conf->nr_waiting++; - wait_event_lock_irq(conf->wait_barrier, !conf->barrier, + /* Wait for the barrier to drop. + * However if there are already pending + * requests (preventing the barrier from + * rising completely), and the + * pre-process bio queue isn't empty, + * then don't wait, as we need to empty + * that queue to get the nr_pending + * count down. + */ + wait_event_lock_irq(conf->wait_barrier, + !conf->barrier || + (conf->nr_pending && + current->bio_list && + !bio_list_empty(current->bio_list)), conf->resync_lock, - ); + ); conf->nr_waiting--; } conf->nr_pending++; @@ -1002,7 +1049,8 @@ read_again: break; } r1_bio->bios[i] = NULL; - if (!rdev || test_bit(Faulty, &rdev->flags)) { + if (!rdev || test_bit(Faulty, &rdev->flags) + || test_bit(Unmerged, &rdev->flags)) { if (i < conf->raid_disks) set_bit(R1BIO_Degraded, &r1_bio->state); continue; @@ -1322,6 +1370,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) struct mirror_info *p; int first = 0; int last = conf->raid_disks - 1; + struct request_queue *q = bdev_get_queue(rdev->bdev); if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; @@ -1329,23 +1378,17 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; + if (q->merge_bvec_fn) { + set_bit(Unmerged, &rdev->flags); + mddev->merge_check_needed = 1; + } + for (mirror = first; mirror <= last; mirror++) { p = conf->mirrors+mirror; if (!p->rdev) { disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); - /* as we don't honour merge_bvec_fn, we must - * never risk violating it, so limit - * ->max_segments to one lying with a single - * page, as a one page request is never in - * violation. - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); - } p->head_position = 0; rdev->raid_disk = mirror; @@ -1370,6 +1413,19 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) break; } } + if (err == 0 && test_bit(Unmerged, &rdev->flags)) { + /* Some requests might not have seen this new + * merge_bvec_fn. We must wait for them to complete + * before merging the device fully. + * First we make sure any code which has tested + * our function has submitted the request, then + * we wait for all outstanding requests to complete. + */ + synchronize_sched(); + raise_barrier(conf); + lower_barrier(conf); + clear_bit(Unmerged, &rdev->flags); + } md_integrity_add_rdev(rdev, mddev); print_conf(conf); return err; @@ -2491,7 +2547,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) err = -EINVAL; spin_lock_init(&conf->device_lock); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { int disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks || disk_idx < 0) @@ -2609,20 +2665,11 @@ static int run(struct mddev *mddev) if (IS_ERR(conf)) return PTR_ERR(conf); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (!mddev->gendisk) continue; disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); - /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit ->max_segments to 1 lying within - * a single page, as a one page request is never in violation. - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); - } } mddev->degraded = 0; @@ -2656,6 +2703,7 @@ static int run(struct mddev *mddev) if (mddev->queue) { mddev->queue->backing_dev_info.congested_fn = raid1_congested; mddev->queue->backing_dev_info.congested_data = mddev; + blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec); } return md_integrity_register(mddev); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6e8aa213f0d5..3540316886f2 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -67,6 +67,7 @@ static int max_queued_requests = 1024; static void allow_barrier(struct r10conf *conf); static void lower_barrier(struct r10conf *conf); +static int enough(struct r10conf *conf, int ignore); static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) { @@ -347,6 +348,19 @@ static void raid10_end_read_request(struct bio *bio, int error) * wait for the 'master' bio. */ set_bit(R10BIO_Uptodate, &r10_bio->state); + } else { + /* If all other devices that store this block have + * failed, we want to return the error upwards rather + * than fail the last device. Here we redefine + * "uptodate" to mean "Don't want to retry" + */ + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + if (!enough(conf, rdev->raid_disk)) + uptodate = 1; + spin_unlock_irqrestore(&conf->device_lock, flags); + } + if (uptodate) { raid_end_bio_io(r10_bio); rdev_dec_pending(rdev, conf->mddev); } else { @@ -572,25 +586,68 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) * @biovec: the request that could be merged to it. * * Return amount of bytes we can accept at this offset - * If near_copies == raid_disk, there are no striping issues, - * but in that case, the function isn't called at all. + * This requires checking for end-of-chunk if near_copies != raid_disks, + * and for subordinate merge_bvec_fns if merge_check_needed. */ static int raid10_mergeable_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *biovec) { struct mddev *mddev = q->queuedata; + struct r10conf *conf = mddev->private; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_sectors; unsigned int bio_sectors = bvm->bi_size >> 9; - max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; - if (max < 0) max = 0; /* bio_add cannot handle a negative return */ - if (max <= biovec->bv_len && bio_sectors == 0) - return biovec->bv_len; - else - return max; + if (conf->near_copies < conf->raid_disks) { + max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + + bio_sectors)) << 9; + if (max < 0) + /* bio_add cannot handle a negative return */ + max = 0; + if (max <= biovec->bv_len && bio_sectors == 0) + return biovec->bv_len; + } else + max = biovec->bv_len; + + if (mddev->merge_check_needed) { + struct r10bio r10_bio; + int s; + r10_bio.sector = sector; + raid10_find_phys(conf, &r10_bio); + rcu_read_lock(); + for (s = 0; s < conf->copies; s++) { + int disk = r10_bio.devs[s].devnum; + struct md_rdev *rdev = rcu_dereference( + conf->mirrors[disk].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + struct request_queue *q = + bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) { + bvm->bi_sector = r10_bio.devs[s].addr + + rdev->data_offset; + bvm->bi_bdev = rdev->bdev; + max = min(max, q->merge_bvec_fn( + q, bvm, biovec)); + } + } + rdev = rcu_dereference(conf->mirrors[disk].replacement); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + struct request_queue *q = + bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) { + bvm->bi_sector = r10_bio.devs[s].addr + + rdev->data_offset; + bvm->bi_bdev = rdev->bdev; + max = min(max, q->merge_bvec_fn( + q, bvm, biovec)); + } + } + } + rcu_read_unlock(); + } + return max; } /* @@ -654,11 +711,12 @@ retry: disk = r10_bio->devs[slot].devnum; rdev = rcu_dereference(conf->mirrors[disk].replacement); if (rdev == NULL || test_bit(Faulty, &rdev->flags) || + test_bit(Unmerged, &rdev->flags) || r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) rdev = rcu_dereference(conf->mirrors[disk].rdev); - if (rdev == NULL) - continue; - if (test_bit(Faulty, &rdev->flags)) + if (rdev == NULL || + test_bit(Faulty, &rdev->flags) || + test_bit(Unmerged, &rdev->flags)) continue; if (!test_bit(In_sync, &rdev->flags) && r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) @@ -849,9 +907,22 @@ static void wait_barrier(struct r10conf *conf) spin_lock_irq(&conf->resync_lock); if (conf->barrier) { conf->nr_waiting++; - wait_event_lock_irq(conf->wait_barrier, !conf->barrier, + /* Wait for the barrier to drop. + * However if there are already pending + * requests (preventing the barrier from + * rising completely), and the + * pre-process bio queue isn't empty, + * then don't wait, as we need to empty + * that queue to get the nr_pending + * count down. + */ + wait_event_lock_irq(conf->wait_barrier, + !conf->barrier || + (conf->nr_pending && + current->bio_list && + !bio_list_empty(current->bio_list)), conf->resync_lock, - ); + ); conf->nr_waiting--; } conf->nr_pending++; @@ -1107,12 +1178,14 @@ retry_write: blocked_rdev = rrdev; break; } - if (rrdev && test_bit(Faulty, &rrdev->flags)) + if (rrdev && (test_bit(Faulty, &rrdev->flags) + || test_bit(Unmerged, &rrdev->flags))) rrdev = NULL; r10_bio->devs[i].bio = NULL; r10_bio->devs[i].repl_bio = NULL; - if (!rdev || test_bit(Faulty, &rdev->flags)) { + if (!rdev || test_bit(Faulty, &rdev->flags) || + test_bit(Unmerged, &rdev->flags)) { set_bit(R10BIO_Degraded, &r10_bio->state); continue; } @@ -1463,18 +1536,24 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) int mirror; int first = 0; int last = conf->raid_disks - 1; + struct request_queue *q = bdev_get_queue(rdev->bdev); if (mddev->recovery_cp < MaxSector) /* only hot-add to in-sync arrays, as recovery is * very different from resync */ return -EBUSY; - if (!enough(conf, -1)) + if (rdev->saved_raid_disk < 0 && !enough(conf, -1)) return -EINVAL; if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; + if (q->merge_bvec_fn) { + set_bit(Unmerged, &rdev->flags); + mddev->merge_check_needed = 1; + } + if (rdev->saved_raid_disk >= first && conf->mirrors[rdev->saved_raid_disk].rdev == NULL) mirror = rdev->saved_raid_disk; @@ -1494,11 +1573,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) err = 0; disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); - if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); - } conf->fullsync = 1; rcu_assign_pointer(p->replacement, rdev); break; @@ -1506,17 +1580,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); - /* as we don't honour merge_bvec_fn, we must - * never risk violating it, so limit - * ->max_segments to one lying with a single - * page, as a one page request is never in - * violation. - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); - } p->head_position = 0; p->recovery_disabled = mddev->recovery_disabled - 1; @@ -1527,7 +1590,19 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) rcu_assign_pointer(p->rdev, rdev); break; } - + if (err == 0 && test_bit(Unmerged, &rdev->flags)) { + /* Some requests might not have seen this new + * merge_bvec_fn. We must wait for them to complete + * before merging the device fully. + * First we make sure any code which has tested + * our function has submitted the request, then + * we wait for all outstanding requests to complete. + */ + synchronize_sched(); + raise_barrier(conf, 0); + lower_barrier(conf); + clear_bit(Unmerged, &rdev->flags); + } md_integrity_add_rdev(rdev, mddev); print_conf(conf); return err; @@ -1668,10 +1743,8 @@ static void end_sync_write(struct bio *bio, int error) d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); if (repl) rdev = conf->mirrors[d].replacement; - if (!rdev) { - smp_mb(); + else rdev = conf->mirrors[d].rdev; - } if (!uptodate) { if (repl) @@ -2052,6 +2125,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 "md/raid10:%s: %s: Failing raid device\n", mdname(mddev), b); md_error(mddev, conf->mirrors[d].rdev); + r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; return; } @@ -2072,6 +2146,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && + !test_bit(Unmerged, &rdev->flags) && test_bit(In_sync, &rdev->flags) && is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, &first_bad, &bad_sectors) == 0) { @@ -2105,8 +2180,11 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 rdev, r10_bio->devs[r10_bio->read_slot].addr + sect, - s, 0)) + s, 0)) { md_error(mddev, rdev); + r10_bio->devs[r10_bio->read_slot].bio + = IO_BLOCKED; + } break; } @@ -2122,6 +2200,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); if (!rdev || + test_bit(Unmerged, &rdev->flags) || !test_bit(In_sync, &rdev->flags)) continue; @@ -2299,17 +2378,20 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) * This is all done synchronously while the array is * frozen. */ + bio = r10_bio->devs[slot].bio; + bdevname(bio->bi_bdev, b); + bio_put(bio); + r10_bio->devs[slot].bio = NULL; + if (mddev->ro == 0) { freeze_array(conf); fix_read_error(conf, mddev, r10_bio); unfreeze_array(conf); - } + } else + r10_bio->devs[slot].bio = IO_BLOCKED; + rdev_dec_pending(rdev, mddev); - bio = r10_bio->devs[slot].bio; - bdevname(bio->bi_bdev, b); - r10_bio->devs[slot].bio = - mddev->ro ? IO_BLOCKED : NULL; read_more: rdev = read_balance(conf, r10_bio, &max_sectors); if (rdev == NULL) { @@ -2318,13 +2400,10 @@ read_more: mdname(mddev), b, (unsigned long long)r10_bio->sector); raid_end_bio_io(r10_bio); - bio_put(bio); return; } do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); - if (bio) - bio_put(bio); slot = r10_bio->read_slot; printk_ratelimited( KERN_ERR @@ -2360,7 +2439,6 @@ read_more: mbio->bi_phys_segments++; spin_unlock_irq(&conf->device_lock); generic_make_request(bio); - bio = NULL; r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); @@ -3225,7 +3303,7 @@ static int run(struct mddev *mddev) blk_queue_io_opt(mddev->queue, chunk_size * (conf->raid_disks / conf->near_copies)); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { disk_idx = rdev->raid_disk; if (disk_idx >= conf->raid_disks @@ -3243,18 +3321,8 @@ static int run(struct mddev *mddev) disk->rdev = rdev; } - disk->rdev = rdev; disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); - /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit max_segments to 1 lying - * within a single page. - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); - } disk->head_position = 0; } @@ -3318,8 +3386,7 @@ static int run(struct mddev *mddev) mddev->queue->backing_dev_info.ra_pages = 2* stripe; } - if (conf->near_copies < conf->raid_disks) - blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); + blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); if (md_integrity_register(mddev)) goto out_free_conf; @@ -3369,6 +3436,43 @@ static void raid10_quiesce(struct mddev *mddev, int state) } } +static int raid10_resize(struct mddev *mddev, sector_t sectors) +{ + /* Resize of 'far' arrays is not supported. + * For 'near' and 'offset' arrays we can set the + * number of sectors used to be an appropriate multiple + * of the chunk size. + * For 'offset', this is far_copies*chunksize. + * For 'near' the multiplier is the LCM of + * near_copies and raid_disks. + * So if far_copies > 1 && !far_offset, fail. + * Else find LCM(raid_disks, near_copy)*far_copies and + * multiply by chunk_size. Then round to this number. + * This is mostly done by raid10_size() + */ + struct r10conf *conf = mddev->private; + sector_t oldsize, size; + + if (conf->far_copies > 1 && !conf->far_offset) + return -EINVAL; + + oldsize = raid10_size(mddev, 0, 0); + size = raid10_size(mddev, sectors, 0); + md_set_array_sectors(mddev, size); + if (mddev->array_sectors > size) + return -EINVAL; + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + if (sectors > mddev->dev_sectors && + mddev->recovery_cp > oldsize) { + mddev->recovery_cp = oldsize; + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + } + mddev->dev_sectors = sectors; + mddev->resync_max_sectors = size; + return 0; +} + static void *raid10_takeover_raid0(struct mddev *mddev) { struct md_rdev *rdev; @@ -3392,7 +3496,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev) conf = setup_conf(mddev); if (!IS_ERR(conf)) { - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) rdev->new_raid_disk = rdev->raid_disk * 2; conf->barrier = 1; @@ -3438,6 +3542,7 @@ static struct md_personality raid10_personality = .sync_request = sync_request, .quiesce = raid10_quiesce, .size = raid10_size, + .resize = raid10_resize, .takeover = raid10_takeover, }; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 360f2b98f62b..23ac880bba9a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -208,11 +208,10 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) md_wakeup_thread(conf->mddev->thread); } else { BUG_ON(stripe_operations_active(sh)); - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - atomic_dec(&conf->preread_active_stripes); - if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + if (atomic_dec_return(&conf->preread_active_stripes) + < IO_THRESHOLD) md_wakeup_thread(conf->mddev->thread); - } atomic_dec(&conf->active_stripes); if (!test_bit(STRIPE_EXPANDING, &sh->state)) { list_add_tail(&sh->lru, &conf->inactive_list); @@ -4843,7 +4842,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) pr_debug("raid456: run(%s) called.\n", mdname(mddev)); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { raid_disk = rdev->raid_disk; if (raid_disk >= max_disks || raid_disk < 0) @@ -5178,7 +5177,7 @@ static int run(struct mddev *mddev) blk_queue_io_opt(mddev->queue, chunk_size * (conf->raid_disks - conf->max_degraded)); - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); } @@ -5362,7 +5361,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; - if (has_failed(conf)) + if (rdev->saved_raid_disk < 0 && has_failed(conf)) /* no point adding a device */ return -EINVAL; @@ -5501,7 +5500,7 @@ static int raid5_start_reshape(struct mddev *mddev) if (!check_stripe_cache(mddev)) return -ENOSPC; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (!test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) spares++; @@ -5547,16 +5546,14 @@ static int raid5_start_reshape(struct mddev *mddev) * such devices during the reshape and confusion could result. */ if (mddev->delta_disks >= 0) { - int added_devices = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { if (raid5_add_disk(mddev, rdev) == 0) { if (rdev->raid_disk - >= conf->previous_raid_disks) { + >= conf->previous_raid_disks) set_bit(In_sync, &rdev->flags); - added_devices++; - } else + else rdev->recovery_offset = 0; if (sysfs_link_rdev(mddev, rdev)) @@ -5566,7 +5563,6 @@ static int raid5_start_reshape(struct mddev *mddev) && !test_bit(Faulty, &rdev->flags)) { /* This is a spare that was manually added */ set_bit(In_sync, &rdev->flags); - added_devices++; } /* When a reshape changes the number of devices, @@ -5592,6 +5588,7 @@ static int raid5_start_reshape(struct mddev *mddev) spin_lock_irq(&conf->device_lock); mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; conf->reshape_progress = MaxSector; + mddev->reshape_position = MaxSector; spin_unlock_irq(&conf->device_lock); return -EAGAIN; } |