summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bitmap.c194
-rw-r--r--drivers/md/bitmap.h22
-rw-r--r--drivers/md/dm-bufio.c1
-rw-r--r--drivers/md/dm-crypt.c8
-rw-r--r--drivers/md/dm-flakey.c13
-rw-r--r--drivers/md/dm-io.c23
-rw-r--r--drivers/md/dm-ioctl.c2
-rw-r--r--drivers/md/dm-linear.c12
-rw-r--r--drivers/md/dm-mpath.c6
-rw-r--r--drivers/md/dm-raid.c45
-rw-r--r--drivers/md/dm-table.c6
-rw-r--r--drivers/md/dm-thin-metadata.c25
-rw-r--r--drivers/md/faulty.c2
-rw-r--r--drivers/md/linear.c32
-rw-r--r--drivers/md/md.c152
-rw-r--r--drivers/md/md.h13
-rw-r--r--drivers/md/multipath.c2
-rw-r--r--drivers/md/raid0.c164
-rw-r--r--drivers/md/raid0.h11
-rw-r--r--drivers/md/raid1.c111
-rw-r--r--drivers/md/raid10.c225
-rw-r--r--drivers/md/raid5.c25
22 files changed, 652 insertions, 442 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index cdf36b1e9aa6..3d0dfa7a89a2 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -26,6 +26,7 @@
#include <linux/file.h>
#include <linux/mount.h>
#include <linux/buffer_head.h>
+#include <linux/seq_file.h>
#include "md.h"
#include "bitmap.h"
@@ -35,31 +36,6 @@ static inline char *bmname(struct bitmap *bitmap)
}
/*
- * just a placeholder - calls kmalloc for bitmap pages
- */
-static unsigned char *bitmap_alloc_page(struct bitmap *bitmap)
-{
- unsigned char *page;
-
- page = kzalloc(PAGE_SIZE, GFP_NOIO);
- if (!page)
- printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap));
- else
- pr_debug("%s: bitmap_alloc_page: allocated page at %p\n",
- bmname(bitmap), page);
- return page;
-}
-
-/*
- * for now just a placeholder -- just calls kfree for bitmap pages
- */
-static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page)
-{
- pr_debug("%s: bitmap_free_page: free page %p\n", bmname(bitmap), page);
- kfree(page);
-}
-
-/*
* check a page and, if necessary, allocate it (or hijack it if the alloc fails)
*
* 1) check to see if this page is allocated, if it's not then try to alloc
@@ -96,7 +72,7 @@ __acquires(bitmap->lock)
/* this page has not been allocated yet */
spin_unlock_irq(&bitmap->lock);
- mappage = bitmap_alloc_page(bitmap);
+ mappage = kzalloc(PAGE_SIZE, GFP_NOIO);
spin_lock_irq(&bitmap->lock);
if (mappage == NULL) {
@@ -109,7 +85,7 @@ __acquires(bitmap->lock)
} else if (bitmap->bp[page].map ||
bitmap->bp[page].hijacked) {
/* somebody beat us to getting the page */
- bitmap_free_page(bitmap, mappage);
+ kfree(mappage);
return 0;
} else {
@@ -141,7 +117,7 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page)
ptr = bitmap->bp[page].map;
bitmap->bp[page].map = NULL;
bitmap->missing_pages++;
- bitmap_free_page(bitmap, ptr);
+ kfree(ptr);
}
}
@@ -171,7 +147,7 @@ static struct page *read_sb_page(struct mddev *mddev, loff_t offset,
did_alloc = 1;
}
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (! test_bit(In_sync, &rdev->flags)
|| test_bit(Faulty, &rdev->flags))
continue;
@@ -445,19 +421,14 @@ out:
void bitmap_update_sb(struct bitmap *bitmap)
{
bitmap_super_t *sb;
- unsigned long flags;
if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
return;
if (bitmap->mddev->bitmap_info.external)
return;
- spin_lock_irqsave(&bitmap->lock, flags);
- if (!bitmap->sb_page) { /* no superblock */
- spin_unlock_irqrestore(&bitmap->lock, flags);
+ if (!bitmap->sb_page) /* no superblock */
return;
- }
- spin_unlock_irqrestore(&bitmap->lock, flags);
- sb = kmap_atomic(bitmap->sb_page, KM_USER0);
+ sb = kmap_atomic(bitmap->sb_page);
sb->events = cpu_to_le64(bitmap->mddev->events);
if (bitmap->mddev->events < bitmap->events_cleared)
/* rocking back to read-only */
@@ -467,7 +438,7 @@ void bitmap_update_sb(struct bitmap *bitmap)
/* Just in case these have been changed via sysfs: */
sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
- kunmap_atomic(sb, KM_USER0);
+ kunmap_atomic(sb);
write_page(bitmap, bitmap->sb_page, 1);
}
@@ -478,7 +449,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
if (!bitmap || !bitmap->sb_page)
return;
- sb = kmap_atomic(bitmap->sb_page, KM_USER0);
+ sb = kmap_atomic(bitmap->sb_page);
printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic));
printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version));
@@ -497,7 +468,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
printk(KERN_DEBUG " sync size: %llu KB\n",
(unsigned long long)le64_to_cpu(sb->sync_size)/2);
printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind));
- kunmap_atomic(sb, KM_USER0);
+ kunmap_atomic(sb);
}
/*
@@ -525,7 +496,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
}
bitmap->sb_page->index = 0;
- sb = kmap_atomic(bitmap->sb_page, KM_USER0);
+ sb = kmap_atomic(bitmap->sb_page);
sb->magic = cpu_to_le32(BITMAP_MAGIC);
sb->version = cpu_to_le32(BITMAP_MAJOR_HI);
@@ -533,7 +504,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
chunksize = bitmap->mddev->bitmap_info.chunksize;
BUG_ON(!chunksize);
if (!is_power_of_2(chunksize)) {
- kunmap_atomic(sb, KM_USER0);
+ kunmap_atomic(sb);
printk(KERN_ERR "bitmap chunksize not a power of 2\n");
return -EINVAL;
}
@@ -571,7 +542,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
bitmap->flags |= BITMAP_HOSTENDIAN;
sb->version = cpu_to_le32(BITMAP_MAJOR_HOSTENDIAN);
- kunmap_atomic(sb, KM_USER0);
+ kunmap_atomic(sb);
return 0;
}
@@ -603,7 +574,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
return err;
}
- sb = kmap_atomic(bitmap->sb_page, KM_USER0);
+ sb = kmap_atomic(bitmap->sb_page);
chunksize = le32_to_cpu(sb->chunksize);
daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
@@ -632,26 +603,28 @@ static int bitmap_read_sb(struct bitmap *bitmap)
/* keep the array size field of the bitmap superblock up to date */
sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
- if (!bitmap->mddev->persistent)
- goto success;
-
- /*
- * if we have a persistent array superblock, compare the
- * bitmap's UUID and event counter to the mddev's
- */
- if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
- printk(KERN_INFO "%s: bitmap superblock UUID mismatch\n",
- bmname(bitmap));
- goto out;
- }
- events = le64_to_cpu(sb->events);
- if (events < bitmap->mddev->events) {
- printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) "
- "-- forcing full recovery\n", bmname(bitmap), events,
- (unsigned long long) bitmap->mddev->events);
- sb->state |= cpu_to_le32(BITMAP_STALE);
+ if (bitmap->mddev->persistent) {
+ /*
+ * We have a persistent array superblock, so compare the
+ * bitmap's UUID and event counter to the mddev's
+ */
+ if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
+ printk(KERN_INFO
+ "%s: bitmap superblock UUID mismatch\n",
+ bmname(bitmap));
+ goto out;
+ }
+ events = le64_to_cpu(sb->events);
+ if (events < bitmap->mddev->events) {
+ printk(KERN_INFO
+ "%s: bitmap file is out of date (%llu < %llu) "
+ "-- forcing full recovery\n",
+ bmname(bitmap), events,
+ (unsigned long long) bitmap->mddev->events);
+ sb->state |= cpu_to_le32(BITMAP_STALE);
+ }
}
-success:
+
/* assign fields using values from superblock */
bitmap->mddev->bitmap_info.chunksize = chunksize;
bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
@@ -664,7 +637,7 @@ success:
bitmap->events_cleared = bitmap->mddev->events;
err = 0;
out:
- kunmap_atomic(sb, KM_USER0);
+ kunmap_atomic(sb);
if (err)
bitmap_print_sb(bitmap);
return err;
@@ -680,16 +653,11 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
enum bitmap_mask_op op)
{
bitmap_super_t *sb;
- unsigned long flags;
int old;
- spin_lock_irqsave(&bitmap->lock, flags);
- if (!bitmap->sb_page) { /* can't set the state */
- spin_unlock_irqrestore(&bitmap->lock, flags);
+ if (!bitmap->sb_page) /* can't set the state */
return 0;
- }
- spin_unlock_irqrestore(&bitmap->lock, flags);
- sb = kmap_atomic(bitmap->sb_page, KM_USER0);
+ sb = kmap_atomic(bitmap->sb_page);
old = le32_to_cpu(sb->state) & bits;
switch (op) {
case MASK_SET:
@@ -703,7 +671,7 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
default:
BUG();
}
- kunmap_atomic(sb, KM_USER0);
+ kunmap_atomic(sb);
return old;
}
@@ -870,7 +838,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
unsigned long bit;
struct page *page;
void *kaddr;
- unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap);
+ unsigned long chunk = block >> bitmap->chunkshift;
if (!bitmap->filemap)
return;
@@ -881,12 +849,12 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
bit = file_page_offset(bitmap, chunk);
/* set the bit */
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
if (bitmap->flags & BITMAP_HOSTENDIAN)
set_bit(bit, kaddr);
else
__set_bit_le(bit, kaddr);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
pr_debug("set file bit %lu page %lu\n", bit, page->index);
/* record page number so it gets flushed to disk when unplug occurs */
set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
@@ -1050,10 +1018,10 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
* if bitmap is out of date, dirty the
* whole page and write it out
*/
- paddr = kmap_atomic(page, KM_USER0);
+ paddr = kmap_atomic(page);
memset(paddr + offset, 0xff,
PAGE_SIZE - offset);
- kunmap_atomic(paddr, KM_USER0);
+ kunmap_atomic(paddr);
write_page(bitmap, page, 1);
ret = -EIO;
@@ -1061,18 +1029,18 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
goto err;
}
}
- paddr = kmap_atomic(page, KM_USER0);
+ paddr = kmap_atomic(page);
if (bitmap->flags & BITMAP_HOSTENDIAN)
b = test_bit(bit, paddr);
else
b = test_bit_le(bit, paddr);
- kunmap_atomic(paddr, KM_USER0);
+ kunmap_atomic(paddr);
if (b) {
/* if the disk bit is set, set the memory bit */
- int needed = ((sector_t)(i+1) << (CHUNK_BLOCK_SHIFT(bitmap))
+ int needed = ((sector_t)(i+1) << bitmap->chunkshift
>= start);
bitmap_set_memory_bits(bitmap,
- (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap),
+ (sector_t)i << bitmap->chunkshift,
needed);
bit_cnt++;
}
@@ -1116,7 +1084,7 @@ void bitmap_write_all(struct bitmap *bitmap)
static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc)
{
- sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap);
+ sector_t chunk = offset >> bitmap->chunkshift;
unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
bitmap->bp[page].count += inc;
bitmap_checkfree(bitmap, page);
@@ -1209,10 +1177,10 @@ void bitmap_daemon_work(struct mddev *mddev)
mddev->bitmap_info.external == 0) {
bitmap_super_t *sb;
bitmap->need_sync = 0;
- sb = kmap_atomic(bitmap->sb_page, KM_USER0);
+ sb = kmap_atomic(bitmap->sb_page);
sb->events_cleared =
cpu_to_le64(bitmap->events_cleared);
- kunmap_atomic(sb, KM_USER0);
+ kunmap_atomic(sb);
write_page(bitmap, bitmap->sb_page, 1);
}
spin_lock_irqsave(&bitmap->lock, flags);
@@ -1222,7 +1190,7 @@ void bitmap_daemon_work(struct mddev *mddev)
bitmap->allclean = 0;
}
bmc = bitmap_get_counter(bitmap,
- (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
+ (sector_t)j << bitmap->chunkshift,
&blocks, 0);
if (!bmc)
j |= PAGE_COUNTER_MASK;
@@ -1231,11 +1199,11 @@ void bitmap_daemon_work(struct mddev *mddev)
/* we can clear the bit */
*bmc = 0;
bitmap_count_page(bitmap,
- (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
+ (sector_t)j << bitmap->chunkshift,
-1);
/* clear the bit */
- paddr = kmap_atomic(page, KM_USER0);
+ paddr = kmap_atomic(page);
if (bitmap->flags & BITMAP_HOSTENDIAN)
clear_bit(file_page_offset(bitmap, j),
paddr);
@@ -1244,7 +1212,7 @@ void bitmap_daemon_work(struct mddev *mddev)
file_page_offset(bitmap,
j),
paddr);
- kunmap_atomic(paddr, KM_USER0);
+ kunmap_atomic(paddr);
} else if (*bmc <= 2) {
*bmc = 1; /* maybe clear the bit next time */
set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
@@ -1285,7 +1253,7 @@ __acquires(bitmap->lock)
* The lock must have been taken with interrupts enabled.
* If !create, we don't release the lock.
*/
- sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap);
+ sector_t chunk = offset >> bitmap->chunkshift;
unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
sector_t csize;
@@ -1295,10 +1263,10 @@ __acquires(bitmap->lock)
if (bitmap->bp[page].hijacked ||
bitmap->bp[page].map == NULL)
- csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) +
+ csize = ((sector_t)1) << (bitmap->chunkshift +
PAGE_COUNTER_SHIFT - 1);
else
- csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap));
+ csize = ((sector_t)1) << bitmap->chunkshift;
*blocks = csize - (offset & (csize - 1));
if (err < 0)
@@ -1424,7 +1392,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
set_page_attr(bitmap,
filemap_get_page(
bitmap,
- offset >> CHUNK_BLOCK_SHIFT(bitmap)),
+ offset >> bitmap->chunkshift),
BITMAP_PAGE_PENDING);
bitmap->allclean = 0;
}
@@ -1512,7 +1480,7 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i
else {
if (*bmc <= 2) {
set_page_attr(bitmap,
- filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)),
+ filemap_get_page(bitmap, offset >> bitmap->chunkshift),
BITMAP_PAGE_PENDING);
bitmap->allclean = 0;
}
@@ -1559,7 +1527,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
bitmap->mddev->curr_resync_completed = sector;
set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
- sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1);
+ sector &= ~((1ULL << bitmap->chunkshift) - 1);
s = 0;
while (s < sector && s < bitmap->mddev->resync_max_sectors) {
bitmap_end_sync(bitmap, s, &blocks, 0);
@@ -1589,7 +1557,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
struct page *page;
*bmc = 2 | (needed ? NEEDED_MASK : 0);
bitmap_count_page(bitmap, offset, 1);
- page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap));
+ page = filemap_get_page(bitmap, offset >> bitmap->chunkshift);
set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
bitmap->allclean = 0;
}
@@ -1602,7 +1570,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
unsigned long chunk;
for (chunk = s; chunk <= e; chunk++) {
- sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap);
+ sector_t sec = (sector_t)chunk << bitmap->chunkshift;
bitmap_set_memory_bits(bitmap, sec, 1);
spin_lock_irq(&bitmap->lock);
bitmap_file_set_bit(bitmap, sec);
@@ -1759,11 +1727,12 @@ int bitmap_create(struct mddev *mddev)
goto error;
bitmap->daemon_lastrun = jiffies;
- bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize);
+ bitmap->chunkshift = (ffz(~mddev->bitmap_info.chunksize)
+ - BITMAP_BLOCK_SHIFT);
/* now that chunksize and chunkshift are set, we can use these macros */
- chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >>
- CHUNK_BLOCK_SHIFT(bitmap);
+ chunks = (blocks + bitmap->chunkshift - 1) >>
+ bitmap->chunkshift;
pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO;
BUG_ON(!pages);
@@ -1836,6 +1805,33 @@ out:
}
EXPORT_SYMBOL_GPL(bitmap_load);
+void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
+{
+ unsigned long chunk_kb;
+ unsigned long flags;
+
+ if (!bitmap)
+ return;
+
+ spin_lock_irqsave(&bitmap->lock, flags);
+ chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10;
+ seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
+ "%lu%s chunk",
+ bitmap->pages - bitmap->missing_pages,
+ bitmap->pages,
+ (bitmap->pages - bitmap->missing_pages)
+ << (PAGE_SHIFT - 10),
+ chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize,
+ chunk_kb ? "KB" : "B");
+ if (bitmap->file) {
+ seq_printf(seq, ", file: ");
+ seq_path(seq, &bitmap->file->f_path, " \t\n");
+ }
+
+ seq_printf(seq, "\n");
+ spin_unlock_irqrestore(&bitmap->lock, flags);
+}
+
static ssize_t
location_show(struct mddev *mddev, char *page)
{
@@ -1904,6 +1900,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
if (mddev->pers) {
mddev->pers->quiesce(mddev, 1);
rv = bitmap_create(mddev);
+ if (!rv)
+ rv = bitmap_load(mddev);
if (rv) {
bitmap_destroy(mddev);
mddev->bitmap_info.offset = 0;
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index a15436dd9b3e..55ca5aec84e4 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -13,8 +13,6 @@
#define BITMAP_MAJOR_HI 4
#define BITMAP_MAJOR_HOSTENDIAN 3
-#define BITMAP_MINOR 39
-
/*
* in-memory bitmap:
*
@@ -101,21 +99,10 @@ typedef __u16 bitmap_counter_t;
/* same, except a mask value for more efficient bitops */
#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
-#define BITMAP_BLOCK_SIZE 512
#define BITMAP_BLOCK_SHIFT 9
/* how many blocks per chunk? (this is variable) */
#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT)
-#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
-#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
-
-/* when hijacked, the counters and bits represent even larger "chunks" */
-/* there will be 1024 chunks represented by each counter in the page pointers */
-#define PAGEPTR_BLOCK_RATIO(bitmap) \
- (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1)
-#define PAGEPTR_BLOCK_SHIFT(bitmap) \
- (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
-#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
#endif
@@ -181,12 +168,6 @@ struct bitmap_page {
unsigned int count:31;
};
-/* keep track of bitmap file pages that have pending writes on them */
-struct page_list {
- struct list_head list;
- struct page *page;
-};
-
/* the main bitmap structure - one per mddev */
struct bitmap {
struct bitmap_page *bp;
@@ -196,7 +177,7 @@ struct bitmap {
struct mddev *mddev; /* the md device that the bitmap is for */
/* bitmap chunksize -- how much data does each bit represent? */
- unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
+ unsigned long chunkshift; /* chunksize = 2^(chunkshift+9) (for bitops) */
unsigned long chunks; /* total number of data chunks for the array */
__u64 events_cleared;
@@ -245,6 +226,7 @@ void bitmap_destroy(struct mddev *mddev);
void bitmap_print_sb(struct bitmap *bitmap);
void bitmap_update_sb(struct bitmap *bitmap);
+void bitmap_status(struct seq_file *seq, struct bitmap *bitmap);
int bitmap_setallbits(struct bitmap *bitmap);
void bitmap_write_all(struct bitmap *bitmap);
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 0a6806f80ab5..b6e58c7b6df5 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -12,7 +12,6 @@
#include <linux/dm-io.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
-#include <linux/version.h>
#include <linux/shrinker.h>
#include <linux/module.h>
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 8c2a000cf3f5..db6b51639cee 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -590,9 +590,9 @@ static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv,
int r = 0;
if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
- src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0);
+ src = kmap_atomic(sg_page(&dmreq->sg_in));
r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset);
- kunmap_atomic(src, KM_USER0);
+ kunmap_atomic(src);
} else
memset(iv, 0, cc->iv_size);
@@ -608,14 +608,14 @@ static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
if (bio_data_dir(dmreq->ctx->bio_in) == WRITE)
return 0;
- dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0);
+ dst = kmap_atomic(sg_page(&dmreq->sg_out));
r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset);
/* Tweak the first block of plaintext sector */
if (!r)
crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size);
- kunmap_atomic(dst, KM_USER0);
+ kunmap_atomic(dst);
return r;
}
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index f84c08029b21..b280c433e4a0 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -323,7 +323,7 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio,
* Corrupt successful READs while in down state.
* If flags were specified, only corrupt those that match.
*/
- if (!error && bio_submitted_while_down &&
+ if (fc->corrupt_bio_byte && !error && bio_submitted_while_down &&
(bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) &&
all_corrupt_bio_flags_match(bio, fc))
corrupt_bio_data(bio, fc);
@@ -368,8 +368,17 @@ static int flakey_status(struct dm_target *ti, status_type_t type,
static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg)
{
struct flakey_c *fc = ti->private;
+ struct dm_dev *dev = fc->dev;
+ int r = 0;
- return __blkdev_driver_ioctl(fc->dev->bdev, fc->dev->mode, cmd, arg);
+ /*
+ * Only pass ioctls through if the device sizes match exactly.
+ */
+ if (fc->start ||
+ ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
+ r = scsi_verify_blk_ioctl(NULL, cmd);
+
+ return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
}
static int flakey_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index ad2eba40e319..ea5dd289fe2a 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -296,6 +296,8 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
unsigned offset;
unsigned num_bvecs;
sector_t remaining = where->count;
+ struct request_queue *q = bdev_get_queue(where->bdev);
+ sector_t discard_sectors;
/*
* where->count may be zero if rw holds a flush and we need to
@@ -305,9 +307,12 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
/*
* Allocate a suitably sized-bio.
*/
- num_bvecs = dm_sector_div_up(remaining,
- (PAGE_SIZE >> SECTOR_SHIFT));
- num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs);
+ if (rw & REQ_DISCARD)
+ num_bvecs = 1;
+ else
+ num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev),
+ dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
+
bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
bio->bi_sector = where->sector + (where->count - remaining);
bio->bi_bdev = where->bdev;
@@ -315,10 +320,14 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
bio->bi_destructor = dm_bio_destructor;
store_io_and_region_in_bio(bio, io, region);
- /*
- * Try and add as many pages as possible.
- */
- while (remaining) {
+ if (rw & REQ_DISCARD) {
+ discard_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining);
+ bio->bi_size = discard_sectors << SECTOR_SHIFT;
+ remaining -= discard_sectors;
+ } else while (remaining) {
+ /*
+ * Try and add as many pages as possible.
+ */
dp->get_page(dp, &page, &len, &offset);
len = min(len, to_bytes(remaining));
if (!bio_add_page(bio, page, len, offset))
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 31c2dc25886d..1ce84ed0b765 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1437,7 +1437,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
if (!argc) {
DMWARN("Empty message received.");
- goto out;
+ goto out_argv;
}
table = dm_get_live_table(md);
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 3921e3bb43c1..9728839f844a 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -116,7 +116,17 @@ static int linear_ioctl(struct dm_target *ti, unsigned int cmd,
unsigned long arg)
{
struct linear_c *lc = (struct linear_c *) ti->private;
- return __blkdev_driver_ioctl(lc->dev->bdev, lc->dev->mode, cmd, arg);
+ struct dm_dev *dev = lc->dev;
+ int r = 0;
+
+ /*
+ * Only pass ioctls through if the device sizes match exactly.
+ */
+ if (lc->start ||
+ ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
+ r = scsi_verify_blk_ioctl(NULL, cmd);
+
+ return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
}
static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 5e0090ef4182..801d92d237cf 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1520,6 +1520,12 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
spin_unlock_irqrestore(&m->lock, flags);
+ /*
+ * Only pass ioctls through if the device sizes match exactly.
+ */
+ if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
+ r = scsi_verify_blk_ioctl(NULL, cmd);
+
return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
}
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index c2907d836e4e..c5a875d7b882 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -56,7 +56,8 @@ struct raid_dev {
struct raid_set {
struct dm_target *ti;
- uint64_t print_flags;
+ uint32_t bitmap_loaded;
+ uint32_t print_flags;
struct mddev md;
struct raid_type *raid_type;
@@ -614,14 +615,14 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
{
- struct md_rdev *r, *t;
+ struct md_rdev *r;
uint64_t failed_devices;
struct dm_raid_superblock *sb;
sb = page_address(rdev->sb_page);
failed_devices = le64_to_cpu(sb->failed_devices);
- rdev_for_each(r, t, mddev)
+ rdev_for_each(r, mddev)
if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
failed_devices |= (1ULL << r->raid_disk);
@@ -667,7 +668,14 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
return ret;
sb = page_address(rdev->sb_page);
- if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) {
+
+ /*
+ * Two cases that we want to write new superblocks and rebuild:
+ * 1) New device (no matching magic number)
+ * 2) Device specified for rebuild (!In_sync w/ offset == 0)
+ */
+ if ((sb->magic != cpu_to_le32(DM_RAID_MAGIC)) ||
+ (!test_bit(In_sync, &rdev->flags) && !rdev->recovery_offset)) {
super_sync(rdev->mddev, rdev);
set_bit(FirstUse, &rdev->flags);
@@ -699,7 +707,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
struct dm_raid_superblock *sb;
uint32_t new_devs = 0;
uint32_t rebuilds = 0;
- struct md_rdev *r, *t;
+ struct md_rdev *r;
struct dm_raid_superblock *sb2;
sb = page_address(rdev->sb_page);
@@ -742,13 +750,10 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
* case the In_sync bit will /not/ be set and
* recovery_cp must be MaxSector.
*/
- rdev_for_each(r, t, mddev) {
+ rdev_for_each(r, mddev) {
if (!test_bit(In_sync, &r->flags)) {
- if (!test_bit(FirstUse, &r->flags))
- DMERR("Superblock area of "
- "rebuild device %d should have been "
- "cleared.", r->raid_disk);
- set_bit(FirstUse, &r->flags);
+ DMINFO("Device %d specified for rebuild: "
+ "Clearing superblock", r->raid_disk);
rebuilds++;
} else if (test_bit(FirstUse, &r->flags))
new_devs++;
@@ -777,7 +782,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
* Now we set the Faulty bit for those devices that are
* recorded in the superblock as failed.
*/
- rdev_for_each(r, t, mddev) {
+ rdev_for_each(r, mddev) {
if (!r->sb_page)
continue;
sb2 = page_address(r->sb_page);
@@ -850,11 +855,11 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
{
int ret;
- struct md_rdev *rdev, *freshest, *tmp;
+ struct md_rdev *rdev, *freshest;
struct mddev *mddev = &rs->md;
freshest = NULL;
- rdev_for_each(rdev, tmp, mddev) {
+ rdev_for_each(rdev, mddev) {
if (!rdev->meta_bdev)
continue;
@@ -883,7 +888,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
if (super_validate(mddev, freshest))
return -EINVAL;
- rdev_for_each(rdev, tmp, mddev)
+ rdev_for_each(rdev, mddev)
if ((rdev != freshest) && super_validate(mddev, rdev))
return -EINVAL;
@@ -970,6 +975,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
INIT_WORK(&rs->md.event_work, do_table_event);
ti->private = rs;
+ ti->num_flush_requests = 1;
mutex_lock(&rs->md.reconfig_mutex);
ret = md_run(&rs->md);
@@ -1085,7 +1091,7 @@ static int raid_status(struct dm_target *ti, status_type_t type,
raid_param_cnt += 2;
}
- raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2);
+ raid_param_cnt += (hweight32(rs->print_flags & ~DMPF_REBUILD) * 2);
if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
raid_param_cnt--;
@@ -1197,7 +1203,12 @@ static void raid_resume(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
- bitmap_load(&rs->md);
+ if (!rs->bitmap_loaded) {
+ bitmap_load(&rs->md);
+ rs->bitmap_loaded = 1;
+ } else
+ md_wakeup_thread(rs->md.thread);
+
mddev_resume(&rs->md);
}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 8e9132130142..63cc54289aff 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -699,7 +699,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table,
while (i < dm_table_get_num_targets(table)) {
ti = dm_table_get_target(table, i++);
- blk_set_default_limits(&ti_limits);
+ blk_set_stacking_limits(&ti_limits);
/* combine all target devices' limits */
if (ti->type->iterate_devices)
@@ -1221,10 +1221,10 @@ int dm_calculate_queue_limits(struct dm_table *table,
struct queue_limits ti_limits;
unsigned i = 0;
- blk_set_default_limits(limits);
+ blk_set_stacking_limits(limits);
while (i < dm_table_get_num_targets(table)) {
- blk_set_default_limits(&ti_limits);
+ blk_set_stacking_limits(&ti_limits);
ti = dm_table_get_target(table, i++);
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 59c4f0446ffa..237571af77fd 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -385,6 +385,7 @@ static int init_pmd(struct dm_pool_metadata *pmd,
data_sm = dm_sm_disk_create(tm, nr_blocks);
if (IS_ERR(data_sm)) {
DMERR("sm_disk_create failed");
+ dm_tm_unlock(tm, sblock);
r = PTR_ERR(data_sm);
goto bad;
}
@@ -789,6 +790,11 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
return 0;
}
+/*
+ * __open_device: Returns @td corresponding to device with id @dev,
+ * creating it if @create is set and incrementing @td->open_count.
+ * On failure, @td is undefined.
+ */
static int __open_device(struct dm_pool_metadata *pmd,
dm_thin_id dev, int create,
struct dm_thin_device **td)
@@ -799,10 +805,16 @@ static int __open_device(struct dm_pool_metadata *pmd,
struct disk_device_details details_le;
/*
- * Check the device isn't already open.
+ * If the device is already open, return it.
*/
list_for_each_entry(td2, &pmd->thin_devices, list)
if (td2->id == dev) {
+ /*
+ * May not create an already-open device.
+ */
+ if (create)
+ return -EEXIST;
+
td2->open_count++;
*td = td2;
return 0;
@@ -817,6 +829,9 @@ static int __open_device(struct dm_pool_metadata *pmd,
if (r != -ENODATA || !create)
return r;
+ /*
+ * Create new device.
+ */
changed = 1;
details_le.mapped_blocks = 0;
details_le.transaction_id = cpu_to_le64(pmd->trans_id);
@@ -882,12 +897,10 @@ static int __create_thin(struct dm_pool_metadata *pmd,
r = __open_device(pmd, dev, 1, &td);
if (r) {
- __close_device(td);
dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
dm_btree_del(&pmd->bl_info, dev_root);
return r;
}
- td->changed = 1;
__close_device(td);
return r;
@@ -967,14 +980,14 @@ static int __create_snap(struct dm_pool_metadata *pmd,
goto bad;
r = __set_snapshot_details(pmd, td, origin, pmd->time);
+ __close_device(td);
+
if (r)
goto bad;
- __close_device(td);
return 0;
bad:
- __close_device(td);
dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
dm_btree_remove(&pmd->details_info, pmd->details_root,
&key, &pmd->details_root);
@@ -1211,6 +1224,8 @@ static int __remove(struct dm_thin_device *td, dm_block_t block)
if (r)
return r;
+ td->mapped_blocks--;
+ td->changed = 1;
pmd->need_commit = 1;
return 0;
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index feb2c3c7bb44..45135f69509c 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -315,7 +315,7 @@ static int run(struct mddev *mddev)
}
conf->nfaults = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
conf->rdev = rdev;
md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 627456542fb3..b0fcc7d02adb 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -68,10 +68,19 @@ static int linear_mergeable_bvec(struct request_queue *q,
struct dev_info *dev0;
unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
+ int maxbytes = biovec->bv_len;
+ struct request_queue *subq;
rcu_read_lock();
dev0 = which_dev(mddev, sector);
maxsectors = dev0->end_sector - sector;
+ subq = bdev_get_queue(dev0->rdev->bdev);
+ if (subq->merge_bvec_fn) {
+ bvm->bi_bdev = dev0->rdev->bdev;
+ bvm->bi_sector -= dev0->end_sector - dev0->rdev->sectors;
+ maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm,
+ biovec));
+ }
rcu_read_unlock();
if (maxsectors < bio_sectors)
@@ -80,12 +89,12 @@ static int linear_mergeable_bvec(struct request_queue *q,
maxsectors -= bio_sectors;
if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0)
- return biovec->bv_len;
- /* The bytes available at this offset could be really big,
- * so we cap at 2^31 to avoid overflow */
- if (maxsectors > (1 << (31-9)))
- return 1<<31;
- return maxsectors << 9;
+ return maxbytes;
+
+ if (maxsectors > (maxbytes >> 9))
+ return maxbytes;
+ else
+ return maxsectors << 9;
}
static int linear_congested(void *data, int bits)
@@ -138,7 +147,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
cnt = 0;
conf->array_sectors = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
int j = rdev->raid_disk;
struct dev_info *disk = conf->disks + j;
sector_t sectors;
@@ -158,15 +167,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- /* as we don't honour merge_bvec_fn, we must never risk
- * violating it, so limit max_segments to 1 lying within
- * a single page.
- */
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
conf->array_sectors += rdev->sectors;
cnt++;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ca8527fe77eb..b572e1e386ce 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -439,7 +439,7 @@ static void submit_flushes(struct work_struct *ws)
INIT_WORK(&mddev->flush_work, md_submit_flush_data);
atomic_set(&mddev->flush_pending, 1);
rcu_read_lock();
- list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
+ rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Faulty, &rdev->flags)) {
/* Take two references, one is dropped
@@ -749,7 +749,7 @@ static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr)
{
struct md_rdev *rdev;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->desc_nr == nr)
return rdev;
@@ -760,7 +760,7 @@ static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev)
{
struct md_rdev *rdev;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->bdev->bd_dev == dev)
return rdev;
@@ -1342,7 +1342,7 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->state |= (1<<MD_SB_BITMAP_PRESENT);
sb->disks[0].state = (1<<MD_DISK_REMOVED);
- list_for_each_entry(rdev2, &mddev->disks, same_set) {
+ rdev_for_each(rdev2, mddev) {
mdp_disk_t *d;
int desc_nr;
int is_active = test_bit(In_sync, &rdev2->flags);
@@ -1805,18 +1805,18 @@ retry:
| BB_LEN(internal_bb));
*bbp++ = cpu_to_le64(store_bb);
}
+ bb->changed = 0;
if (read_seqretry(&bb->lock, seq))
goto retry;
bb->sector = (rdev->sb_start +
(int)le32_to_cpu(sb->bblog_offset));
bb->size = le16_to_cpu(sb->bblog_size);
- bb->changed = 0;
}
}
max_dev = 0;
- list_for_each_entry(rdev2, &mddev->disks, same_set)
+ rdev_for_each(rdev2, mddev)
if (rdev2->desc_nr+1 > max_dev)
max_dev = rdev2->desc_nr+1;
@@ -1833,7 +1833,7 @@ retry:
for (i=0; i<max_dev;i++)
sb->dev_roles[i] = cpu_to_le16(0xfffe);
- list_for_each_entry(rdev2, &mddev->disks, same_set) {
+ rdev_for_each(rdev2, mddev) {
i = rdev2->desc_nr;
if (test_bit(Faulty, &rdev2->flags))
sb->dev_roles[i] = cpu_to_le16(0xfffe);
@@ -1948,7 +1948,7 @@ int md_integrity_register(struct mddev *mddev)
return 0; /* nothing to do */
if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
return 0; /* shouldn't register, or already is */
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
/* skip spares and non-functional disks */
if (test_bit(Faulty, &rdev->flags))
continue;
@@ -2175,7 +2175,7 @@ static void export_array(struct mddev *mddev)
{
struct md_rdev *rdev, *tmp;
- rdev_for_each(rdev, tmp, mddev) {
+ rdev_for_each_safe(rdev, tmp, mddev) {
if (!rdev->mddev) {
MD_BUG();
continue;
@@ -2307,11 +2307,11 @@ static void md_print_devices(void)
bitmap_print_sb(mddev->bitmap);
else
printk("%s: ", mdname(mddev));
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
printk("<%s>", bdevname(rdev->bdev,b));
printk("\n");
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
print_rdev(rdev, mddev->major_version);
}
printk("md: **********************************\n");
@@ -2328,7 +2328,7 @@ static void sync_sbs(struct mddev * mddev, int nospares)
* with the rest of the array)
*/
struct md_rdev *rdev;
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (rdev->sb_events == mddev->events ||
(nospares &&
rdev->raid_disk < 0 &&
@@ -2351,7 +2351,7 @@ static void md_update_sb(struct mddev * mddev, int force_change)
repeat:
/* First make sure individual recovery_offsets are correct */
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
@@ -2364,8 +2364,9 @@ repeat:
clear_bit(MD_CHANGE_DEVS, &mddev->flags);
if (!mddev->external) {
clear_bit(MD_CHANGE_PENDING, &mddev->flags);
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (rdev->badblocks.changed) {
+ rdev->badblocks.changed = 0;
md_ack_all_badblocks(&rdev->badblocks);
md_error(mddev, rdev);
}
@@ -2430,7 +2431,7 @@ repeat:
mddev->events --;
}
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (rdev->badblocks.changed)
any_badblocks_changed++;
if (test_bit(Faulty, &rdev->flags))
@@ -2444,7 +2445,7 @@ repeat:
mdname(mddev), mddev->in_sync);
bitmap_update_sb(mddev->bitmap);
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
char b[BDEVNAME_SIZE];
if (rdev->sb_loaded != 1)
@@ -2493,7 +2494,7 @@ repeat:
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (test_and_clear_bit(FaultRecorded, &rdev->flags))
clear_bit(Blocked, &rdev->flags);
@@ -2896,7 +2897,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
struct md_rdev *rdev2;
mddev_lock(mddev);
- list_for_each_entry(rdev2, &mddev->disks, same_set)
+ rdev_for_each(rdev2, mddev)
if (rdev->bdev == rdev2->bdev &&
rdev != rdev2 &&
overlaps(rdev->data_offset, rdev->sectors,
@@ -3193,7 +3194,7 @@ static void analyze_sbs(struct mddev * mddev)
char b[BDEVNAME_SIZE];
freshest = NULL;
- rdev_for_each(rdev, tmp, mddev)
+ rdev_for_each_safe(rdev, tmp, mddev)
switch (super_types[mddev->major_version].
load_super(rdev, freshest, mddev->minor_version)) {
case 1:
@@ -3214,7 +3215,7 @@ static void analyze_sbs(struct mddev * mddev)
validate_super(mddev, freshest);
i = 0;
- rdev_for_each(rdev, tmp, mddev) {
+ rdev_for_each_safe(rdev, tmp, mddev) {
if (mddev->max_disks &&
(rdev->desc_nr >= mddev->max_disks ||
i > mddev->max_disks)) {
@@ -3403,7 +3404,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
return -EINVAL;
}
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
rdev->new_raid_disk = rdev->raid_disk;
/* ->takeover must set new_* and/or delta_disks
@@ -3456,7 +3457,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->safemode = 0;
}
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (rdev->raid_disk < 0)
continue;
if (rdev->new_raid_disk >= mddev->raid_disks)
@@ -3465,7 +3466,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
continue;
sysfs_unlink_rdev(mddev, rdev);
}
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (rdev->raid_disk < 0)
continue;
if (rdev->new_raid_disk == rdev->raid_disk)
@@ -4666,6 +4667,7 @@ static int md_alloc(dev_t dev, char *name)
mddev->queue->queuedata = mddev;
blk_queue_make_request(mddev->queue, md_make_request);
+ blk_set_stacking_limits(&mddev->queue->limits);
disk = alloc_disk(1 << shift);
if (!disk) {
@@ -4795,7 +4797,7 @@ int md_run(struct mddev *mddev)
* the only valid external interface is through the md
* device.
*/
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (test_bit(Faulty, &rdev->flags))
continue;
sync_blockdev(rdev->bdev);
@@ -4866,8 +4868,8 @@ int md_run(struct mddev *mddev)
struct md_rdev *rdev2;
int warned = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set)
- list_for_each_entry(rdev2, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev)
+ rdev_for_each(rdev2, mddev) {
if (rdev < rdev2 &&
rdev->bdev->bd_contains ==
rdev2->bdev->bd_contains) {
@@ -4944,7 +4946,7 @@ int md_run(struct mddev *mddev)
mddev->in_sync = 1;
smp_wmb();
mddev->ready = 1;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0)
if (sysfs_link_rdev(mddev, rdev))
/* failure here is OK */;
@@ -5072,6 +5074,7 @@ static void md_clean(struct mddev *mddev)
mddev->changed = 0;
mddev->degraded = 0;
mddev->safemode = 0;
+ mddev->merge_check_needed = 0;
mddev->bitmap_info.offset = 0;
mddev->bitmap_info.default_offset = 0;
mddev->bitmap_info.chunksize = 0;
@@ -5174,7 +5177,7 @@ static int do_md_stop(struct mddev * mddev, int mode, int is_open)
/* tell userspace to handle 'inactive' */
sysfs_notify_dirent_safe(mddev->sysfs_state);
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0)
sysfs_unlink_rdev(mddev, rdev);
@@ -5225,7 +5228,7 @@ static void autorun_array(struct mddev *mddev)
printk(KERN_INFO "md: running: ");
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
char b[BDEVNAME_SIZE];
printk("<%s>", bdevname(rdev->bdev,b));
}
@@ -5355,7 +5358,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg)
struct md_rdev *rdev;
nr=working=insync=failed=spare=0;
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
nr++;
if (test_bit(Faulty, &rdev->flags))
failed++;
@@ -5922,7 +5925,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
* grow, and re-add.
*/
return -EBUSY;
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
sector_t avail = rdev->sectors;
if (fit && (num_sectors == 0 || num_sectors > avail))
@@ -6723,7 +6726,6 @@ static int md_seq_show(struct seq_file *seq, void *v)
struct mddev *mddev = v;
sector_t sectors;
struct md_rdev *rdev;
- struct bitmap *bitmap;
if (v == (void*)1) {
struct md_personality *pers;
@@ -6757,7 +6759,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
}
sectors = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
char b[BDEVNAME_SIZE];
seq_printf(seq, " %s[%d]",
bdevname(rdev->bdev,b), rdev->desc_nr);
@@ -6811,27 +6813,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
} else
seq_printf(seq, "\n ");
- if ((bitmap = mddev->bitmap)) {
- unsigned long chunk_kb;
- unsigned long flags;
- spin_lock_irqsave(&bitmap->lock, flags);
- chunk_kb = mddev->bitmap_info.chunksize >> 10;
- seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
- "%lu%s chunk",
- bitmap->pages - bitmap->missing_pages,
- bitmap->pages,
- (bitmap->pages - bitmap->missing_pages)
- << (PAGE_SHIFT - 10),
- chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
- chunk_kb ? "KB" : "B");
- if (bitmap->file) {
- seq_printf(seq, ", file: ");
- seq_path(seq, &bitmap->file->f_path, " \t\n");
- }
-
- seq_printf(seq, "\n");
- spin_unlock_irqrestore(&bitmap->lock, flags);
- }
+ bitmap_status(seq, mddev->bitmap);
seq_printf(seq, "\n");
}
@@ -7169,7 +7151,7 @@ void md_do_sync(struct mddev *mddev)
max_sectors = mddev->dev_sectors;
j = MaxSector;
rcu_read_lock();
- list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
+ rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
@@ -7332,7 +7314,8 @@ void md_do_sync(struct mddev *mddev)
printk(KERN_INFO
"md: checkpointing %s of %s.\n",
desc, mdname(mddev));
- mddev->recovery_cp = mddev->curr_resync;
+ mddev->recovery_cp =
+ mddev->curr_resync_completed;
}
} else
mddev->recovery_cp = MaxSector;
@@ -7340,7 +7323,7 @@ void md_do_sync(struct mddev *mddev)
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
mddev->curr_resync = MaxSector;
rcu_read_lock();
- list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
+ rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 &&
!test_bit(Faulty, &rdev->flags) &&
@@ -7350,9 +7333,9 @@ void md_do_sync(struct mddev *mddev)
rcu_read_unlock();
}
}
+ skip:
set_bit(MD_CHANGE_DEVS, &mddev->flags);
- skip:
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
/* We completed so min/max setting can be forgotten if used. */
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
@@ -7382,10 +7365,11 @@ static int remove_and_add_spares(struct mddev *mddev)
{
struct md_rdev *rdev;
int spares = 0;
+ int removed = 0;
mddev->curr_resync_completed = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Blocked, &rdev->flags) &&
(test_bit(Faulty, &rdev->flags) ||
@@ -7395,10 +7379,15 @@ static int remove_and_add_spares(struct mddev *mddev)
mddev, rdev) == 0) {
sysfs_unlink_rdev(mddev, rdev);
rdev->raid_disk = -1;
+ removed++;
}
}
+ if (removed)
+ sysfs_notify(&mddev->kobj, NULL,
+ "degraded");
+
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags))
@@ -7443,7 +7432,7 @@ static void reap_sync_thread(struct mddev *mddev)
* do the superblock for an incrementally recovered device
* written out.
*/
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (!mddev->degraded ||
test_bit(In_sync, &rdev->flags))
rdev->saved_raid_disk = -1;
@@ -7521,7 +7510,7 @@ void md_check_recovery(struct mddev *mddev)
* failed devices.
*/
struct md_rdev *rdev;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Blocked, &rdev->flags) &&
test_bit(Faulty, &rdev->flags) &&
@@ -8032,7 +8021,7 @@ void md_ack_all_badblocks(struct badblocks *bb)
return;
write_seqlock_irq(&bb->lock);
- if (bb->changed == 0) {
+ if (bb->changed == 0 && bb->unacked_exist) {
u64 *p = bb->page;
int i;
for (i = 0; i < bb->count ; i++) {
@@ -8149,30 +8138,23 @@ static int md_notify_reboot(struct notifier_block *this,
struct mddev *mddev;
int need_delay = 0;
- if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
-
- printk(KERN_INFO "md: stopping all md devices.\n");
-
- for_each_mddev(mddev, tmp) {
- if (mddev_trylock(mddev)) {
- /* Force a switch to readonly even array
- * appears to still be in use. Hence
- * the '100'.
- */
- md_set_readonly(mddev, 100);
- mddev_unlock(mddev);
- }
- need_delay = 1;
+ for_each_mddev(mddev, tmp) {
+ if (mddev_trylock(mddev)) {
+ __md_stop_writes(mddev);
+ mddev->safemode = 2;
+ mddev_unlock(mddev);
}
- /*
- * certain more exotic SCSI devices are known to be
- * volatile wrt too early system reboots. While the
- * right place to handle this issue is the given
- * driver, we do want to have a safe RAID driver ...
- */
- if (need_delay)
- mdelay(1000*1);
+ need_delay = 1;
}
+ /*
+ * certain more exotic SCSI devices are known to be
+ * volatile wrt too early system reboots. While the
+ * right place to handle this issue is the given
+ * driver, we do want to have a safe RAID driver ...
+ */
+ if (need_delay)
+ mdelay(1000*1);
+
return NOTIFY_DONE;
}
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 44c63dfeeb2b..1c2063ccf48e 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -128,6 +128,10 @@ struct md_rdev {
enum flag_bits {
Faulty, /* device is known to have a fault */
In_sync, /* device is in_sync with rest of array */
+ Unmerged, /* device is being added to array and should
+ * be considerred for bvec_merge_fn but not
+ * yet for actual IO
+ */
WriteMostly, /* Avoid reading if at all possible */
AutoDetected, /* added by auto-detect */
Blocked, /* An error occurred but has not yet
@@ -345,6 +349,10 @@ struct mddev {
int degraded; /* whether md should consider
* adding a spare
*/
+ int merge_check_needed; /* at least one
+ * member device
+ * has a
+ * merge_bvec_fn */
atomic_t recovery_active; /* blocks scheduled, but not written */
wait_queue_head_t recovery_wait;
@@ -519,7 +527,10 @@ static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
/*
* iterates through the 'same array disks' ringlist
*/
-#define rdev_for_each(rdev, tmp, mddev) \
+#define rdev_for_each(rdev, mddev) \
+ list_for_each_entry(rdev, &((mddev)->disks), same_set)
+
+#define rdev_for_each_safe(rdev, tmp, mddev) \
list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)
#define rdev_for_each_rcu(rdev, mddev) \
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index a222f516660e..9339e67fcc79 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -428,7 +428,7 @@ static int multipath_run (struct mddev *mddev)
}
working_disks = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
disk_idx = rdev->raid_disk;
if (disk_idx < 0 ||
disk_idx >= mddev->raid_disks)
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 7294bd115e34..6f31f5596e01 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -91,7 +91,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
if (!conf)
return -ENOMEM;
- list_for_each_entry(rdev1, &mddev->disks, same_set) {
+ rdev_for_each(rdev1, mddev) {
pr_debug("md/raid0:%s: looking at %s\n",
mdname(mddev),
bdevname(rdev1->bdev, b));
@@ -102,7 +102,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
sector_div(sectors, mddev->chunk_sectors);
rdev1->sectors = sectors * mddev->chunk_sectors;
- list_for_each_entry(rdev2, &mddev->disks, same_set) {
+ rdev_for_each(rdev2, mddev) {
pr_debug("md/raid0:%s: comparing %s(%llu)"
" with %s(%llu)\n",
mdname(mddev),
@@ -157,7 +157,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
smallest = NULL;
dev = conf->devlist;
err = -EINVAL;
- list_for_each_entry(rdev1, &mddev->disks, same_set) {
+ rdev_for_each(rdev1, mddev) {
int j = rdev1->raid_disk;
if (mddev->level == 10) {
@@ -188,16 +188,10 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
disk_stack_limits(mddev->gendisk, rdev1->bdev,
rdev1->data_offset << 9);
- /* as we don't honour merge_bvec_fn, we must never risk
- * violating it, so limit ->max_segments to 1, lying within
- * a single page.
- */
- if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
+ if (rdev1->bdev->bd_disk->queue->merge_bvec_fn)
+ conf->has_merge_bvec = 1;
+
if (!smallest || (rdev1->sectors < smallest->sectors))
smallest = rdev1;
cnt++;
@@ -290,8 +284,64 @@ abort:
return err;
}
+/* Find the zone which holds a particular offset
+ * Update *sectorp to be an offset in that zone
+ */
+static struct strip_zone *find_zone(struct r0conf *conf,
+ sector_t *sectorp)
+{
+ int i;
+ struct strip_zone *z = conf->strip_zone;
+ sector_t sector = *sectorp;
+
+ for (i = 0; i < conf->nr_strip_zones; i++)
+ if (sector < z[i].zone_end) {
+ if (i)
+ *sectorp = sector - z[i-1].zone_end;
+ return z + i;
+ }
+ BUG();
+}
+
+/*
+ * remaps the bio to the target device. we separate two flows.
+ * power 2 flow and a general flow for the sake of perfromance
+*/
+static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
+ sector_t sector, sector_t *sector_offset)
+{
+ unsigned int sect_in_chunk;
+ sector_t chunk;
+ struct r0conf *conf = mddev->private;
+ int raid_disks = conf->strip_zone[0].nb_dev;
+ unsigned int chunk_sects = mddev->chunk_sectors;
+
+ if (is_power_of_2(chunk_sects)) {
+ int chunksect_bits = ffz(~chunk_sects);
+ /* find the sector offset inside the chunk */
+ sect_in_chunk = sector & (chunk_sects - 1);
+ sector >>= chunksect_bits;
+ /* chunk in zone */
+ chunk = *sector_offset;
+ /* quotient is the chunk in real device*/
+ sector_div(chunk, zone->nb_dev << chunksect_bits);
+ } else{
+ sect_in_chunk = sector_div(sector, chunk_sects);
+ chunk = *sector_offset;
+ sector_div(chunk, chunk_sects * zone->nb_dev);
+ }
+ /*
+ * position the bio over the real device
+ * real sector = chunk in device + starting of zone
+ * + the position in the chunk
+ */
+ *sector_offset = (chunk * chunk_sects) + sect_in_chunk;
+ return conf->devlist[(zone - conf->strip_zone)*raid_disks
+ + sector_div(sector, zone->nb_dev)];
+}
+
/**
- * raid0_mergeable_bvec -- tell bio layer if a two requests can be merged
+ * raid0_mergeable_bvec -- tell bio layer if two requests can be merged
* @q: request queue
* @bvm: properties of new bio
* @biovec: the request that could be merged to it.
@@ -303,10 +353,15 @@ static int raid0_mergeable_bvec(struct request_queue *q,
struct bio_vec *biovec)
{
struct mddev *mddev = q->queuedata;
+ struct r0conf *conf = mddev->private;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
+ sector_t sector_offset = sector;
int max;
unsigned int chunk_sectors = mddev->chunk_sectors;
unsigned int bio_sectors = bvm->bi_size >> 9;
+ struct strip_zone *zone;
+ struct md_rdev *rdev;
+ struct request_queue *subq;
if (is_power_of_2(chunk_sectors))
max = (chunk_sectors - ((sector & (chunk_sectors-1))
@@ -314,10 +369,27 @@ static int raid0_mergeable_bvec(struct request_queue *q,
else
max = (chunk_sectors - (sector_div(sector, chunk_sectors)
+ bio_sectors)) << 9;
- if (max < 0) max = 0; /* bio_add cannot handle a negative return */
+ if (max < 0)
+ max = 0; /* bio_add cannot handle a negative return */
if (max <= biovec->bv_len && bio_sectors == 0)
return biovec->bv_len;
- else
+ if (max < biovec->bv_len)
+ /* too small already, no need to check further */
+ return max;
+ if (!conf->has_merge_bvec)
+ return max;
+
+ /* May need to check subordinate device */
+ sector = sector_offset;
+ zone = find_zone(mddev->private, &sector_offset);
+ rdev = map_sector(mddev, zone, sector, &sector_offset);
+ subq = bdev_get_queue(rdev->bdev);
+ if (subq->merge_bvec_fn) {
+ bvm->bi_bdev = rdev->bdev;
+ bvm->bi_sector = sector_offset + zone->dev_start +
+ rdev->data_offset;
+ return min(max, subq->merge_bvec_fn(subq, bvm, biovec));
+ } else
return max;
}
@@ -329,7 +401,7 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
WARN_ONCE(sectors || raid_disks,
"%s does not support generic reshape\n", __func__);
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
array_sectors += rdev->sectors;
return array_sectors;
@@ -397,62 +469,6 @@ static int raid0_stop(struct mddev *mddev)
return 0;
}
-/* Find the zone which holds a particular offset
- * Update *sectorp to be an offset in that zone
- */
-static struct strip_zone *find_zone(struct r0conf *conf,
- sector_t *sectorp)
-{
- int i;
- struct strip_zone *z = conf->strip_zone;
- sector_t sector = *sectorp;
-
- for (i = 0; i < conf->nr_strip_zones; i++)
- if (sector < z[i].zone_end) {
- if (i)
- *sectorp = sector - z[i-1].zone_end;
- return z + i;
- }
- BUG();
-}
-
-/*
- * remaps the bio to the target device. we separate two flows.
- * power 2 flow and a general flow for the sake of perfromance
-*/
-static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
- sector_t sector, sector_t *sector_offset)
-{
- unsigned int sect_in_chunk;
- sector_t chunk;
- struct r0conf *conf = mddev->private;
- int raid_disks = conf->strip_zone[0].nb_dev;
- unsigned int chunk_sects = mddev->chunk_sectors;
-
- if (is_power_of_2(chunk_sects)) {
- int chunksect_bits = ffz(~chunk_sects);
- /* find the sector offset inside the chunk */
- sect_in_chunk = sector & (chunk_sects - 1);
- sector >>= chunksect_bits;
- /* chunk in zone */
- chunk = *sector_offset;
- /* quotient is the chunk in real device*/
- sector_div(chunk, zone->nb_dev << chunksect_bits);
- } else{
- sect_in_chunk = sector_div(sector, chunk_sects);
- chunk = *sector_offset;
- sector_div(chunk, chunk_sects * zone->nb_dev);
- }
- /*
- * position the bio over the real device
- * real sector = chunk in device + starting of zone
- * + the position in the chunk
- */
- *sector_offset = (chunk * chunk_sects) + sect_in_chunk;
- return conf->devlist[(zone - conf->strip_zone)*raid_disks
- + sector_div(sector, zone->nb_dev)];
-}
-
/*
* Is io distribute over 1 or more chunks ?
*/
@@ -505,7 +521,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
}
sector_offset = bio->bi_sector;
- zone = find_zone(mddev->private, &sector_offset);
+ zone = find_zone(mddev->private, &sector_offset);
tmp_dev = map_sector(mddev, zone, bio->bi_sector,
&sector_offset);
bio->bi_bdev = tmp_dev->bdev;
@@ -543,7 +559,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
return ERR_PTR(-EINVAL);
}
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
/* check slot number for a disk */
if (rdev->raid_disk == mddev->raid_disks-1) {
printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n",
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 0884bba8df4c..05539d9c97f0 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -4,13 +4,16 @@
struct strip_zone {
sector_t zone_end; /* Start of the next zone (in sectors) */
sector_t dev_start; /* Zone offset in real dev (in sectors) */
- int nb_dev; /* # of devices attached to the zone */
+ int nb_dev; /* # of devices attached to the zone */
};
struct r0conf {
- struct strip_zone *strip_zone;
- struct md_rdev **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
- int nr_strip_zones;
+ struct strip_zone *strip_zone;
+ struct md_rdev **devlist; /* lists of rdevs, pointed to
+ * by strip_zone->dev */
+ int nr_strip_zones;
+ int has_merge_bvec; /* at least one member has
+ * a merge_bvec_fn */
};
#endif
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index cc24f0cb7ee3..4a40a200d769 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -523,6 +523,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (r1_bio->bios[disk] == IO_BLOCKED
|| rdev == NULL
+ || test_bit(Unmerged, &rdev->flags)
|| test_bit(Faulty, &rdev->flags))
continue;
if (!test_bit(In_sync, &rdev->flags) &&
@@ -531,8 +532,17 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
if (test_bit(WriteMostly, &rdev->flags)) {
/* Don't balance among write-mostly, just
* use the first as a last resort */
- if (best_disk < 0)
+ if (best_disk < 0) {
+ if (is_badblock(rdev, this_sector, sectors,
+ &first_bad, &bad_sectors)) {
+ if (first_bad < this_sector)
+ /* Cannot use this */
+ continue;
+ best_good_sectors = first_bad - this_sector;
+ } else
+ best_good_sectors = sectors;
best_disk = disk;
+ }
continue;
}
/* This is a reasonable device to use. It might
@@ -605,6 +615,39 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
return best_disk;
}
+static int raid1_mergeable_bvec(struct request_queue *q,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec)
+{
+ struct mddev *mddev = q->queuedata;
+ struct r1conf *conf = mddev->private;
+ sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
+ int max = biovec->bv_len;
+
+ if (mddev->merge_check_needed) {
+ int disk;
+ rcu_read_lock();
+ for (disk = 0; disk < conf->raid_disks * 2; disk++) {
+ struct md_rdev *rdev = rcu_dereference(
+ conf->mirrors[disk].rdev);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ struct request_queue *q =
+ bdev_get_queue(rdev->bdev);
+ if (q->merge_bvec_fn) {
+ bvm->bi_sector = sector +
+ rdev->data_offset;
+ bvm->bi_bdev = rdev->bdev;
+ max = min(max, q->merge_bvec_fn(
+ q, bvm, biovec));
+ }
+ }
+ }
+ rcu_read_unlock();
+ }
+ return max;
+
+}
+
int md_raid1_congested(struct mddev *mddev, int bits)
{
struct r1conf *conf = mddev->private;
@@ -615,7 +658,7 @@ int md_raid1_congested(struct mddev *mddev, int bits)
return 1;
rcu_read_lock();
- for (i = 0; i < conf->raid_disks; i++) {
+ for (i = 0; i < conf->raid_disks * 2; i++) {
struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
if (rdev && !test_bit(Faulty, &rdev->flags)) {
struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -728,9 +771,22 @@ static void wait_barrier(struct r1conf *conf)
spin_lock_irq(&conf->resync_lock);
if (conf->barrier) {
conf->nr_waiting++;
- wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
+ /* Wait for the barrier to drop.
+ * However if there are already pending
+ * requests (preventing the barrier from
+ * rising completely), and the
+ * pre-process bio queue isn't empty,
+ * then don't wait, as we need to empty
+ * that queue to get the nr_pending
+ * count down.
+ */
+ wait_event_lock_irq(conf->wait_barrier,
+ !conf->barrier ||
+ (conf->nr_pending &&
+ current->bio_list &&
+ !bio_list_empty(current->bio_list)),
conf->resync_lock,
- );
+ );
conf->nr_waiting--;
}
conf->nr_pending++;
@@ -993,7 +1049,8 @@ read_again:
break;
}
r1_bio->bios[i] = NULL;
- if (!rdev || test_bit(Faulty, &rdev->flags)) {
+ if (!rdev || test_bit(Faulty, &rdev->flags)
+ || test_bit(Unmerged, &rdev->flags)) {
if (i < conf->raid_disks)
set_bit(R1BIO_Degraded, &r1_bio->state);
continue;
@@ -1313,6 +1370,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
struct mirror_info *p;
int first = 0;
int last = conf->raid_disks - 1;
+ struct request_queue *q = bdev_get_queue(rdev->bdev);
if (mddev->recovery_disabled == conf->recovery_disabled)
return -EBUSY;
@@ -1320,23 +1378,17 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
+ if (q->merge_bvec_fn) {
+ set_bit(Unmerged, &rdev->flags);
+ mddev->merge_check_needed = 1;
+ }
+
for (mirror = first; mirror <= last; mirror++) {
p = conf->mirrors+mirror;
if (!p->rdev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- /* as we don't honour merge_bvec_fn, we must
- * never risk violating it, so limit
- * ->max_segments to one lying with a single
- * page, as a one page request is never in
- * violation.
- */
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
p->head_position = 0;
rdev->raid_disk = mirror;
@@ -1361,6 +1413,19 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
break;
}
}
+ if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
+ /* Some requests might not have seen this new
+ * merge_bvec_fn. We must wait for them to complete
+ * before merging the device fully.
+ * First we make sure any code which has tested
+ * our function has submitted the request, then
+ * we wait for all outstanding requests to complete.
+ */
+ synchronize_sched();
+ raise_barrier(conf);
+ lower_barrier(conf);
+ clear_bit(Unmerged, &rdev->flags);
+ }
md_integrity_add_rdev(rdev, mddev);
print_conf(conf);
return err;
@@ -2482,7 +2547,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
err = -EINVAL;
spin_lock_init(&conf->device_lock);
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
int disk_idx = rdev->raid_disk;
if (disk_idx >= mddev->raid_disks
|| disk_idx < 0)
@@ -2600,20 +2665,11 @@ static int run(struct mddev *mddev)
if (IS_ERR(conf))
return PTR_ERR(conf);
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (!mddev->gendisk)
continue;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- /* as we don't honour merge_bvec_fn, we must never risk
- * violating it, so limit ->max_segments to 1 lying within
- * a single page, as a one page request is never in violation.
- */
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
}
mddev->degraded = 0;
@@ -2647,6 +2703,7 @@ static int run(struct mddev *mddev)
if (mddev->queue) {
mddev->queue->backing_dev_info.congested_fn = raid1_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
+ blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
}
return md_integrity_register(mddev);
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6e8aa213f0d5..3540316886f2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -67,6 +67,7 @@ static int max_queued_requests = 1024;
static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
+static int enough(struct r10conf *conf, int ignore);
static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
@@ -347,6 +348,19 @@ static void raid10_end_read_request(struct bio *bio, int error)
* wait for the 'master' bio.
*/
set_bit(R10BIO_Uptodate, &r10_bio->state);
+ } else {
+ /* If all other devices that store this block have
+ * failed, we want to return the error upwards rather
+ * than fail the last device. Here we redefine
+ * "uptodate" to mean "Don't want to retry"
+ */
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (!enough(conf, rdev->raid_disk))
+ uptodate = 1;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ }
+ if (uptodate) {
raid_end_bio_io(r10_bio);
rdev_dec_pending(rdev, conf->mddev);
} else {
@@ -572,25 +586,68 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
* @biovec: the request that could be merged to it.
*
* Return amount of bytes we can accept at this offset
- * If near_copies == raid_disk, there are no striping issues,
- * but in that case, the function isn't called at all.
+ * This requires checking for end-of-chunk if near_copies != raid_disks,
+ * and for subordinate merge_bvec_fns if merge_check_needed.
*/
static int raid10_mergeable_bvec(struct request_queue *q,
struct bvec_merge_data *bvm,
struct bio_vec *biovec)
{
struct mddev *mddev = q->queuedata;
+ struct r10conf *conf = mddev->private;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int max;
unsigned int chunk_sectors = mddev->chunk_sectors;
unsigned int bio_sectors = bvm->bi_size >> 9;
- max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
- if (max < 0) max = 0; /* bio_add cannot handle a negative return */
- if (max <= biovec->bv_len && bio_sectors == 0)
- return biovec->bv_len;
- else
- return max;
+ if (conf->near_copies < conf->raid_disks) {
+ max = (chunk_sectors - ((sector & (chunk_sectors - 1))
+ + bio_sectors)) << 9;
+ if (max < 0)
+ /* bio_add cannot handle a negative return */
+ max = 0;
+ if (max <= biovec->bv_len && bio_sectors == 0)
+ return biovec->bv_len;
+ } else
+ max = biovec->bv_len;
+
+ if (mddev->merge_check_needed) {
+ struct r10bio r10_bio;
+ int s;
+ r10_bio.sector = sector;
+ raid10_find_phys(conf, &r10_bio);
+ rcu_read_lock();
+ for (s = 0; s < conf->copies; s++) {
+ int disk = r10_bio.devs[s].devnum;
+ struct md_rdev *rdev = rcu_dereference(
+ conf->mirrors[disk].rdev);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ struct request_queue *q =
+ bdev_get_queue(rdev->bdev);
+ if (q->merge_bvec_fn) {
+ bvm->bi_sector = r10_bio.devs[s].addr
+ + rdev->data_offset;
+ bvm->bi_bdev = rdev->bdev;
+ max = min(max, q->merge_bvec_fn(
+ q, bvm, biovec));
+ }
+ }
+ rdev = rcu_dereference(conf->mirrors[disk].replacement);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ struct request_queue *q =
+ bdev_get_queue(rdev->bdev);
+ if (q->merge_bvec_fn) {
+ bvm->bi_sector = r10_bio.devs[s].addr
+ + rdev->data_offset;
+ bvm->bi_bdev = rdev->bdev;
+ max = min(max, q->merge_bvec_fn(
+ q, bvm, biovec));
+ }
+ }
+ }
+ rcu_read_unlock();
+ }
+ return max;
}
/*
@@ -654,11 +711,12 @@ retry:
disk = r10_bio->devs[slot].devnum;
rdev = rcu_dereference(conf->mirrors[disk].replacement);
if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
+ test_bit(Unmerged, &rdev->flags) ||
r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
rdev = rcu_dereference(conf->mirrors[disk].rdev);
- if (rdev == NULL)
- continue;
- if (test_bit(Faulty, &rdev->flags))
+ if (rdev == NULL ||
+ test_bit(Faulty, &rdev->flags) ||
+ test_bit(Unmerged, &rdev->flags))
continue;
if (!test_bit(In_sync, &rdev->flags) &&
r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
@@ -849,9 +907,22 @@ static void wait_barrier(struct r10conf *conf)
spin_lock_irq(&conf->resync_lock);
if (conf->barrier) {
conf->nr_waiting++;
- wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
+ /* Wait for the barrier to drop.
+ * However if there are already pending
+ * requests (preventing the barrier from
+ * rising completely), and the
+ * pre-process bio queue isn't empty,
+ * then don't wait, as we need to empty
+ * that queue to get the nr_pending
+ * count down.
+ */
+ wait_event_lock_irq(conf->wait_barrier,
+ !conf->barrier ||
+ (conf->nr_pending &&
+ current->bio_list &&
+ !bio_list_empty(current->bio_list)),
conf->resync_lock,
- );
+ );
conf->nr_waiting--;
}
conf->nr_pending++;
@@ -1107,12 +1178,14 @@ retry_write:
blocked_rdev = rrdev;
break;
}
- if (rrdev && test_bit(Faulty, &rrdev->flags))
+ if (rrdev && (test_bit(Faulty, &rrdev->flags)
+ || test_bit(Unmerged, &rrdev->flags)))
rrdev = NULL;
r10_bio->devs[i].bio = NULL;
r10_bio->devs[i].repl_bio = NULL;
- if (!rdev || test_bit(Faulty, &rdev->flags)) {
+ if (!rdev || test_bit(Faulty, &rdev->flags) ||
+ test_bit(Unmerged, &rdev->flags)) {
set_bit(R10BIO_Degraded, &r10_bio->state);
continue;
}
@@ -1463,18 +1536,24 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int mirror;
int first = 0;
int last = conf->raid_disks - 1;
+ struct request_queue *q = bdev_get_queue(rdev->bdev);
if (mddev->recovery_cp < MaxSector)
/* only hot-add to in-sync arrays, as recovery is
* very different from resync
*/
return -EBUSY;
- if (!enough(conf, -1))
+ if (rdev->saved_raid_disk < 0 && !enough(conf, -1))
return -EINVAL;
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
+ if (q->merge_bvec_fn) {
+ set_bit(Unmerged, &rdev->flags);
+ mddev->merge_check_needed = 1;
+ }
+
if (rdev->saved_raid_disk >= first &&
conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
mirror = rdev->saved_raid_disk;
@@ -1494,11 +1573,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
err = 0;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
conf->fullsync = 1;
rcu_assign_pointer(p->replacement, rdev);
break;
@@ -1506,17 +1580,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- /* as we don't honour merge_bvec_fn, we must
- * never risk violating it, so limit
- * ->max_segments to one lying with a single
- * page, as a one page request is never in
- * violation.
- */
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
p->head_position = 0;
p->recovery_disabled = mddev->recovery_disabled - 1;
@@ -1527,7 +1590,19 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
rcu_assign_pointer(p->rdev, rdev);
break;
}
-
+ if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
+ /* Some requests might not have seen this new
+ * merge_bvec_fn. We must wait for them to complete
+ * before merging the device fully.
+ * First we make sure any code which has tested
+ * our function has submitted the request, then
+ * we wait for all outstanding requests to complete.
+ */
+ synchronize_sched();
+ raise_barrier(conf, 0);
+ lower_barrier(conf);
+ clear_bit(Unmerged, &rdev->flags);
+ }
md_integrity_add_rdev(rdev, mddev);
print_conf(conf);
return err;
@@ -1668,10 +1743,8 @@ static void end_sync_write(struct bio *bio, int error)
d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
if (repl)
rdev = conf->mirrors[d].replacement;
- if (!rdev) {
- smp_mb();
+ else
rdev = conf->mirrors[d].rdev;
- }
if (!uptodate) {
if (repl)
@@ -2052,6 +2125,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
"md/raid10:%s: %s: Failing raid device\n",
mdname(mddev), b);
md_error(mddev, conf->mirrors[d].rdev);
+ r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
return;
}
@@ -2072,6 +2146,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
d = r10_bio->devs[sl].devnum;
rdev = rcu_dereference(conf->mirrors[d].rdev);
if (rdev &&
+ !test_bit(Unmerged, &rdev->flags) &&
test_bit(In_sync, &rdev->flags) &&
is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
&first_bad, &bad_sectors) == 0) {
@@ -2105,8 +2180,11 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
rdev,
r10_bio->devs[r10_bio->read_slot].addr
+ sect,
- s, 0))
+ s, 0)) {
md_error(mddev, rdev);
+ r10_bio->devs[r10_bio->read_slot].bio
+ = IO_BLOCKED;
+ }
break;
}
@@ -2122,6 +2200,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
d = r10_bio->devs[sl].devnum;
rdev = rcu_dereference(conf->mirrors[d].rdev);
if (!rdev ||
+ test_bit(Unmerged, &rdev->flags) ||
!test_bit(In_sync, &rdev->flags))
continue;
@@ -2299,17 +2378,20 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
* This is all done synchronously while the array is
* frozen.
*/
+ bio = r10_bio->devs[slot].bio;
+ bdevname(bio->bi_bdev, b);
+ bio_put(bio);
+ r10_bio->devs[slot].bio = NULL;
+
if (mddev->ro == 0) {
freeze_array(conf);
fix_read_error(conf, mddev, r10_bio);
unfreeze_array(conf);
- }
+ } else
+ r10_bio->devs[slot].bio = IO_BLOCKED;
+
rdev_dec_pending(rdev, mddev);
- bio = r10_bio->devs[slot].bio;
- bdevname(bio->bi_bdev, b);
- r10_bio->devs[slot].bio =
- mddev->ro ? IO_BLOCKED : NULL;
read_more:
rdev = read_balance(conf, r10_bio, &max_sectors);
if (rdev == NULL) {
@@ -2318,13 +2400,10 @@ read_more:
mdname(mddev), b,
(unsigned long long)r10_bio->sector);
raid_end_bio_io(r10_bio);
- bio_put(bio);
return;
}
do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
- if (bio)
- bio_put(bio);
slot = r10_bio->read_slot;
printk_ratelimited(
KERN_ERR
@@ -2360,7 +2439,6 @@ read_more:
mbio->bi_phys_segments++;
spin_unlock_irq(&conf->device_lock);
generic_make_request(bio);
- bio = NULL;
r10_bio = mempool_alloc(conf->r10bio_pool,
GFP_NOIO);
@@ -3225,7 +3303,7 @@ static int run(struct mddev *mddev)
blk_queue_io_opt(mddev->queue, chunk_size *
(conf->raid_disks / conf->near_copies));
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
disk_idx = rdev->raid_disk;
if (disk_idx >= conf->raid_disks
@@ -3243,18 +3321,8 @@ static int run(struct mddev *mddev)
disk->rdev = rdev;
}
- disk->rdev = rdev;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- /* as we don't honour merge_bvec_fn, we must never risk
- * violating it, so limit max_segments to 1 lying
- * within a single page.
- */
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
disk->head_position = 0;
}
@@ -3318,8 +3386,7 @@ static int run(struct mddev *mddev)
mddev->queue->backing_dev_info.ra_pages = 2* stripe;
}
- if (conf->near_copies < conf->raid_disks)
- blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
+ blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
if (md_integrity_register(mddev))
goto out_free_conf;
@@ -3369,6 +3436,43 @@ static void raid10_quiesce(struct mddev *mddev, int state)
}
}
+static int raid10_resize(struct mddev *mddev, sector_t sectors)
+{
+ /* Resize of 'far' arrays is not supported.
+ * For 'near' and 'offset' arrays we can set the
+ * number of sectors used to be an appropriate multiple
+ * of the chunk size.
+ * For 'offset', this is far_copies*chunksize.
+ * For 'near' the multiplier is the LCM of
+ * near_copies and raid_disks.
+ * So if far_copies > 1 && !far_offset, fail.
+ * Else find LCM(raid_disks, near_copy)*far_copies and
+ * multiply by chunk_size. Then round to this number.
+ * This is mostly done by raid10_size()
+ */
+ struct r10conf *conf = mddev->private;
+ sector_t oldsize, size;
+
+ if (conf->far_copies > 1 && !conf->far_offset)
+ return -EINVAL;
+
+ oldsize = raid10_size(mddev, 0, 0);
+ size = raid10_size(mddev, sectors, 0);
+ md_set_array_sectors(mddev, size);
+ if (mddev->array_sectors > size)
+ return -EINVAL;
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ if (sectors > mddev->dev_sectors &&
+ mddev->recovery_cp > oldsize) {
+ mddev->recovery_cp = oldsize;
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ }
+ mddev->dev_sectors = sectors;
+ mddev->resync_max_sectors = size;
+ return 0;
+}
+
static void *raid10_takeover_raid0(struct mddev *mddev)
{
struct md_rdev *rdev;
@@ -3392,7 +3496,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev)
conf = setup_conf(mddev);
if (!IS_ERR(conf)) {
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0)
rdev->new_raid_disk = rdev->raid_disk * 2;
conf->barrier = 1;
@@ -3438,6 +3542,7 @@ static struct md_personality raid10_personality =
.sync_request = sync_request,
.quiesce = raid10_quiesce,
.size = raid10_size,
+ .resize = raid10_resize,
.takeover = raid10_takeover,
};
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 360f2b98f62b..23ac880bba9a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -208,11 +208,10 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
md_wakeup_thread(conf->mddev->thread);
} else {
BUG_ON(stripe_operations_active(sh));
- if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
- atomic_dec(&conf->preread_active_stripes);
- if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
+ if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ if (atomic_dec_return(&conf->preread_active_stripes)
+ < IO_THRESHOLD)
md_wakeup_thread(conf->mddev->thread);
- }
atomic_dec(&conf->active_stripes);
if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
list_add_tail(&sh->lru, &conf->inactive_list);
@@ -4843,7 +4842,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
pr_debug("raid456: run(%s) called.\n", mdname(mddev));
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
raid_disk = rdev->raid_disk;
if (raid_disk >= max_disks
|| raid_disk < 0)
@@ -5178,7 +5177,7 @@ static int run(struct mddev *mddev)
blk_queue_io_opt(mddev->queue, chunk_size *
(conf->raid_disks - conf->max_degraded));
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
}
@@ -5362,7 +5361,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (mddev->recovery_disabled == conf->recovery_disabled)
return -EBUSY;
- if (has_failed(conf))
+ if (rdev->saved_raid_disk < 0 && has_failed(conf))
/* no point adding a device */
return -EINVAL;
@@ -5501,7 +5500,7 @@ static int raid5_start_reshape(struct mddev *mddev)
if (!check_stripe_cache(mddev))
return -ENOSPC;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (!test_bit(In_sync, &rdev->flags)
&& !test_bit(Faulty, &rdev->flags))
spares++;
@@ -5547,16 +5546,14 @@ static int raid5_start_reshape(struct mddev *mddev)
* such devices during the reshape and confusion could result.
*/
if (mddev->delta_disks >= 0) {
- int added_devices = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->raid_disk < 0 &&
!test_bit(Faulty, &rdev->flags)) {
if (raid5_add_disk(mddev, rdev) == 0) {
if (rdev->raid_disk
- >= conf->previous_raid_disks) {
+ >= conf->previous_raid_disks)
set_bit(In_sync, &rdev->flags);
- added_devices++;
- } else
+ else
rdev->recovery_offset = 0;
if (sysfs_link_rdev(mddev, rdev))
@@ -5566,7 +5563,6 @@ static int raid5_start_reshape(struct mddev *mddev)
&& !test_bit(Faulty, &rdev->flags)) {
/* This is a spare that was manually added */
set_bit(In_sync, &rdev->flags);
- added_devices++;
}
/* When a reshape changes the number of devices,
@@ -5592,6 +5588,7 @@ static int raid5_start_reshape(struct mddev *mddev)
spin_lock_irq(&conf->device_lock);
mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
conf->reshape_progress = MaxSector;
+ mddev->reshape_position = MaxSector;
spin_unlock_irq(&conf->device_lock);
return -EAGAIN;
}