Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/dm-mpath.c |  3
-rw-r--r--  drivers/md/dm-raid1.c |  4
-rw-r--r--  drivers/md/linear.c   |  6
-rw-r--r--  drivers/md/md.c       | 99
-rw-r--r--  drivers/md/raid1.c    | 67
-rw-r--r--  drivers/md/raid10.c   |  4
-rw-r--r--  drivers/md/raid5.c    | 84
7 files changed, 173 insertions(+), 94 deletions(-)
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 217615b33223..93f701ea87bc 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -710,6 +710,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
return -EINVAL;
}
+ m->ti = ti;
+
r = parse_features(&as, m, ti);
if (r)
goto bad;
@@ -751,7 +753,6 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
}
ti->private = m;
- m->ti = ti;
return 0;
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index be48cedf986b..c54de989eb00 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -255,7 +255,9 @@ static struct region *__rh_alloc(struct region_hash *rh, region_t region)
struct region *reg, *nreg;
read_unlock(&rh->hash_lock);
- nreg = mempool_alloc(rh->region_pool, GFP_NOIO);
+ nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
+ if (unlikely(!nreg))
+ nreg = kmalloc(sizeof(struct region), GFP_NOIO);
nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
RH_CLEAN : RH_NOSYNC;
nreg->rh = rh;
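
The hunk above swaps a single blocking mempool_alloc(GFP_NOIO) for a non-blocking GFP_ATOMIC attempt that falls back to a plain kmalloc(GFP_NOIO) when the pool comes up empty. A minimal userspace analogue of that try-fast-then-fall-back pattern; the pool type and helper names below are made up for illustration and are not the kernel mempool API:

/* Userspace analogue of "try the pool without blocking, else allocate
 * directly" -- the pool here is a toy, not the kernel mempool API. */
#include <stdio.h>
#include <stdlib.h>

struct region { int state; };

struct region_pool {
    struct region slots[4];
    unsigned free_mask;                 /* bit i set => slots[i] is free */
};

/* Non-blocking attempt: returns NULL at once when the pool is exhausted,
 * the way a GFP_ATOMIC allocation may fail instead of sleeping. */
static struct region *pool_tryalloc(struct region_pool *p)
{
    for (int i = 0; i < 4; i++)
        if (p->free_mask & (1u << i)) {
            p->free_mask &= ~(1u << i);
            return &p->slots[i];
        }
    return NULL;
}

/* Mirrors the shape of the patched __rh_alloc(): fast non-blocking path
 * first, then fall back to an allocation that is allowed to block. */
static struct region *region_alloc(struct region_pool *p)
{
    struct region *reg = pool_tryalloc(p);
    if (!reg)
        reg = malloc(sizeof(*reg));     /* stands in for kmalloc(GFP_NOIO) */
    return reg;
}

int main(void)
{
    struct region_pool pool = { .free_mask = 0 };   /* start exhausted */
    struct region *reg = region_alloc(&pool);       /* takes the fallback path */
    printf("allocated region at %p\n", (void *)reg);
    free(reg);                          /* fallback came from malloc, so free() is right */
    return 0;
}
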
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index ff83c9b5979e..b99c19c7eb22 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -162,7 +162,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
goto out;
}
- min_spacing = mddev->array_size;
+ min_spacing = conf->array_size;
sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *));
/* min_spacing is the minimum spacing that will fit the hash
@@ -171,7 +171,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
* that is larger than min_spacing as use the size of that as
* the actual spacing
*/
- conf->hash_spacing = mddev->array_size;
+ conf->hash_spacing = conf->array_size;
for (i=0; i < cnt-1 ; i++) {
sector_t sz = 0;
int j;
@@ -228,7 +228,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
curr_offset = 0;
i = 0;
for (curr_offset = 0;
- curr_offset < mddev->array_size;
+ curr_offset < conf->array_size;
curr_offset += conf->hash_spacing) {
while (i < mddev->raid_disks-1 &&
diff --git a/drivers/md/md.c b/drivers/md/md.c
index e4e161372a3e..8dbab2ef3885 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -110,7 +110,7 @@ static ctl_table raid_table[] = {
.procname = "speed_limit_min",
.data = &sysctl_speed_limit_min,
.maxlen = sizeof(int),
- .mode = 0644,
+ .mode = S_IRUGO|S_IWUSR,
.proc_handler = &proc_dointvec,
},
{
@@ -118,7 +118,7 @@ static ctl_table raid_table[] = {
.procname = "speed_limit_max",
.data = &sysctl_speed_limit_max,
.maxlen = sizeof(int),
- .mode = 0644,
+ .mode = S_IRUGO|S_IWUSR,
.proc_handler = &proc_dointvec,
},
{ .ctl_name = 0 }
@@ -129,7 +129,7 @@ static ctl_table raid_dir_table[] = {
.ctl_name = DEV_RAID,
.procname = "raid",
.maxlen = 0,
- .mode = 0555,
+ .mode = S_IRUGO|S_IXUGO,
.child = raid_table,
},
{ .ctl_name = 0 }
@@ -1062,6 +1062,11 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
if (rdev->sb_size & bmask)
rdev-> sb_size = (rdev->sb_size | bmask)+1;
+ if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
+ rdev->desc_nr = -1;
+ else
+ rdev->desc_nr = le32_to_cpu(sb->dev_number);
+
if (refdev == 0)
ret = 1;
else {
@@ -1171,7 +1176,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
}
if (mddev->level != LEVEL_MULTIPATH) {
int role;
- rdev->desc_nr = le32_to_cpu(sb->dev_number);
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
switch(role) {
case 0xffff: /* spare */
@@ -1593,6 +1597,19 @@ void md_update_sb(mddev_t * mddev)
repeat:
spin_lock_irq(&mddev->write_lock);
+
+ if (mddev->degraded && mddev->sb_dirty == 3)
+ /* If the array is degraded, then skipping spares is both
+ * dangerous and fairly pointless.
+ * Dangerous because a device that was removed from the array
+ * might have a event_count that still looks up-to-date,
+ * so it can be re-added without a resync.
+ * Pointless because if there are any spares to skip,
+ * then a recovery will happen and soon that array won't
+ * be degraded any more and the spare can go back to sleep then.
+ */
+ mddev->sb_dirty = 1;
+
sync_req = mddev->in_sync;
mddev->utime = get_seconds();
if (mddev->sb_dirty == 3)
@@ -1779,8 +1796,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
}
return err ? err : len;
}
-static struct rdev_sysfs_entry
-rdev_state = __ATTR(state, 0644, state_show, state_store);
+static struct rdev_sysfs_entry rdev_state =
+__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
static ssize_t
super_show(mdk_rdev_t *rdev, char *page)
@@ -1811,7 +1828,7 @@ errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
return -EINVAL;
}
static struct rdev_sysfs_entry rdev_errors =
-__ATTR(errors, 0644, errors_show, errors_store);
+__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
static ssize_t
slot_show(mdk_rdev_t *rdev, char *page)
@@ -1845,7 +1862,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
static struct rdev_sysfs_entry rdev_slot =
-__ATTR(slot, 0644, slot_show, slot_store);
+__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
static ssize_t
offset_show(mdk_rdev_t *rdev, char *page)
@@ -1867,7 +1884,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
}
static struct rdev_sysfs_entry rdev_offset =
-__ATTR(offset, 0644, offset_show, offset_store);
+__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
static ssize_t
rdev_size_show(mdk_rdev_t *rdev, char *page)
@@ -1891,7 +1908,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
}
static struct rdev_sysfs_entry rdev_size =
-__ATTR(size, 0644, rdev_size_show, rdev_size_store);
+__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
static struct attribute *rdev_default_attrs[] = {
&rdev_state.attr,
@@ -1922,6 +1939,8 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
if (!entry->store)
return -EIO;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
return entry->store(rdev, page, length);
}
@@ -2128,7 +2147,7 @@ safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
return len;
}
static struct md_sysfs_entry md_safe_delay =
-__ATTR(safe_mode_delay, 0644,safe_delay_show, safe_delay_store);
+__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
static ssize_t
level_show(mddev_t *mddev, char *page)
@@ -2163,7 +2182,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
}
static struct md_sysfs_entry md_level =
-__ATTR(level, 0644, level_show, level_store);
+__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
static ssize_t
@@ -2188,7 +2207,7 @@ layout_store(mddev_t *mddev, const char *buf, size_t len)
return len;
}
static struct md_sysfs_entry md_layout =
-__ATTR(layout, 0655, layout_show, layout_store);
+__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
static ssize_t
@@ -2219,7 +2238,7 @@ raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
return rv ? rv : len;
}
static struct md_sysfs_entry md_raid_disks =
-__ATTR(raid_disks, 0644, raid_disks_show, raid_disks_store);
+__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
static ssize_t
chunk_size_show(mddev_t *mddev, char *page)
@@ -2243,7 +2262,7 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
return len;
}
static struct md_sysfs_entry md_chunk_size =
-__ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);
+__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
static ssize_t
resync_start_show(mddev_t *mddev, char *page)
@@ -2267,7 +2286,7 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len)
return len;
}
static struct md_sysfs_entry md_resync_start =
-__ATTR(resync_start, 0644, resync_start_show, resync_start_store);
+__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
/*
* The array state can be:
@@ -2437,7 +2456,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
else
return len;
}
-static struct md_sysfs_entry md_array_state = __ATTR(array_state, 0644, array_state_show, array_state_store);
+static struct md_sysfs_entry md_array_state =
+__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
static ssize_t
null_show(mddev_t *mddev, char *page)
@@ -2497,7 +2517,7 @@ new_dev_store(mddev_t *mddev, const char *buf, size_t len)
}
static struct md_sysfs_entry md_new_device =
-__ATTR(new_dev, 0200, null_show, new_dev_store);
+__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
static ssize_t
size_show(mddev_t *mddev, char *page)
@@ -2535,7 +2555,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
}
static struct md_sysfs_entry md_size =
-__ATTR(component_size, 0644, size_show, size_store);
+__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
/* Metdata version.
@@ -2583,7 +2603,7 @@ metadata_store(mddev_t *mddev, const char *buf, size_t len)
}
static struct md_sysfs_entry md_metadata =
-__ATTR(metadata_version, 0644, metadata_show, metadata_store);
+__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
static ssize_t
action_show(mddev_t *mddev, char *page)
@@ -2651,12 +2671,11 @@ mismatch_cnt_show(mddev_t *mddev, char *page)
(unsigned long long) mddev->resync_mismatches);
}
-static struct md_sysfs_entry
-md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
+static struct md_sysfs_entry md_scan_mode =
+__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
-static struct md_sysfs_entry
-md_mismatches = __ATTR_RO(mismatch_cnt);
+static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
static ssize_t
sync_min_show(mddev_t *mddev, char *page)
@@ -2715,15 +2734,14 @@ static ssize_t
sync_speed_show(mddev_t *mddev, char *page)
{
unsigned long resync, dt, db;
- resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
+ resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active));
dt = ((jiffies - mddev->resync_mark) / HZ);
if (!dt) dt++;
db = resync - (mddev->resync_mark_cnt);
return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
}
-static struct md_sysfs_entry
-md_sync_speed = __ATTR_RO(sync_speed);
+static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
static ssize_t
sync_completed_show(mddev_t *mddev, char *page)
@@ -2739,8 +2757,7 @@ sync_completed_show(mddev_t *mddev, char *page)
return sprintf(page, "%lu / %lu\n", resync, max_blocks);
}
-static struct md_sysfs_entry
-md_sync_completed = __ATTR_RO(sync_completed);
+static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
static ssize_t
suspend_lo_show(mddev_t *mddev, char *page)
@@ -2857,6 +2874,8 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
if (!entry->store)
return -EIO;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
rv = mddev_lock(mddev);
if (!rv) {
rv = entry->store(mddev, page, length);
@@ -3091,7 +3110,6 @@ static int do_md_run(mddev_t * mddev)
}
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- md_wakeup_thread(mddev->thread);
if (mddev->sb_dirty)
md_update_sb(mddev);
@@ -3112,7 +3130,7 @@ static int do_md_run(mddev_t * mddev)
* start recovery here. If we leave it to md_check_recovery,
* it will remove the drives and not do the right thing
*/
- if (mddev->degraded) {
+ if (mddev->degraded && !mddev->sync_thread) {
struct list_head *rtmp;
int spares = 0;
ITERATE_RDEV(mddev,rdev,rtmp)
@@ -3133,10 +3151,11 @@ static int do_md_run(mddev_t * mddev)
mdname(mddev));
/* leave the spares where they are, it shouldn't hurt */
mddev->recovery = 0;
- } else
- md_wakeup_thread(mddev->sync_thread);
+ }
}
}
+ md_wakeup_thread(mddev->thread);
+ md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
mddev->changed = 1;
md_new_event(mddev);
@@ -4586,6 +4605,8 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
__builtin_return_address(0),__builtin_return_address(1),
__builtin_return_address(2),__builtin_return_address(3));
*/
+ if (!mddev->pers)
+ return;
if (!mddev->pers->error_handler)
return;
mddev->pers->error_handler(mddev,rdev);
@@ -4683,12 +4704,13 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
*/
dt = ((jiffies - mddev->resync_mark) / HZ);
if (!dt) dt++;
- db = resync - (mddev->resync_mark_cnt/2);
- rt = (dt * ((unsigned long)(max_blocks-resync) / (db/100+1)))/100;
+ db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
+ - mddev->resync_mark_cnt;
+ rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100;
seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
- seq_printf(seq, " speed=%ldK/sec", db/dt);
+ seq_printf(seq, " speed=%ldK/sec", db/2/dt);
}
static void *md_seq_start(struct seq_file *seq, loff_t *pos)
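
The reworked rate calculation above counts only sectors actually completed since the last mark (curr_mark_cnt minus the still-in-flight recovery_active, minus resync_mark_cnt) and divides by 2 to turn 512-byte sectors into KiB before dividing by the elapsed seconds; the /100 scaling and the +1 in the kernel expression keep the intermediate division from overflowing or hitting zero. A standalone sketch of the same arithmetic, with made-up numbers:

/* Standalone sketch of the resync speed / ETA arithmetic shown above.
 * Sector counts are 512-byte sectors; dividing by 2 gives KiB.
 * All numbers are invented for illustration. */
#include <stdio.h>

int main(void)
{
    unsigned long curr_mark_cnt   = 2000000; /* sectors submitted so far            */
    unsigned long recovery_active =    4096; /* sectors still in flight             */
    unsigned long resync_mark_cnt = 1000000; /* completed count taken at last mark  */
    unsigned long dt = 30;                   /* seconds since the last mark         */

    /* count only I/O that has actually completed since the mark */
    unsigned long db = (curr_mark_cnt - recovery_active) - resync_mark_cnt;

    unsigned long speed = db / 2 / dt;       /* KiB per second, as in "speed=%ldK/sec" */

    unsigned long remaining_kib = 10000000;  /* stand-in for max_blocks - resync */
    unsigned long rt = remaining_kib / (speed ? speed : 1);   /* seconds left */

    printf("speed=%luK/sec finish=%lu.%lumin\n", speed, rt / 60, (rt % 60) / 6);
    return 0;
}
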
@@ -5199,6 +5221,7 @@ void md_do_sync(mddev_t *mddev)
j += sectors;
if (j>1) mddev->curr_resync = j;
+ mddev->curr_mark_cnt = io_sectors;
if (last_check == 0)
/* this is the earliers that rebuilt will be
* visible in /proc/mdstat
@@ -5645,8 +5668,8 @@ static int set_ro(const char *val, struct kernel_param *kp)
return -EINVAL;
}
-module_param_call(start_ro, set_ro, get_ro, NULL, 0600);
-module_param(start_dirty_degraded, int, 0644);
+module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
+module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
EXPORT_SYMBOL(register_md_personality);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index cead918578a7..3b4d69c05623 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -930,10 +930,13 @@ static void status(struct seq_file *seq, mddev_t *mddev)
seq_printf(seq, " [%d/%d] [", conf->raid_disks,
conf->working_disks);
- for (i = 0; i < conf->raid_disks; i++)
+ rcu_read_lock();
+ for (i = 0; i < conf->raid_disks; i++) {
+ mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
seq_printf(seq, "%s",
- conf->mirrors[i].rdev &&
- test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
+ rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
+ }
+ rcu_read_unlock();
seq_printf(seq, "]");
}
@@ -975,7 +978,6 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
static void print_conf(conf_t *conf)
{
int i;
- mirror_info_t *tmp;
printk("RAID1 conf printout:\n");
if (!conf) {
@@ -985,14 +987,17 @@ static void print_conf(conf_t *conf)
printk(" --- wd:%d rd:%d\n", conf->working_disks,
conf->raid_disks);
+ rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) {
char b[BDEVNAME_SIZE];
- tmp = conf->mirrors + i;
- if (tmp->rdev)
+ mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ if (rdev)
printk(" disk %d, wo:%d, o:%d, dev:%s\n",
- i, !test_bit(In_sync, &tmp->rdev->flags), !test_bit(Faulty, &tmp->rdev->flags),
- bdevname(tmp->rdev->bdev,b));
+ i, !test_bit(In_sync, &rdev->flags),
+ !test_bit(Faulty, &rdev->flags),
+ bdevname(rdev->bdev,b));
}
+ rcu_read_unlock();
}
static void close_sync(conf_t *conf)
@@ -1008,20 +1013,20 @@ static int raid1_spare_active(mddev_t *mddev)
{
int i;
conf_t *conf = mddev->private;
- mirror_info_t *tmp;
/*
* Find all failed disks within the RAID1 configuration
- * and mark them readable
+ * and mark them readable.
+ * Called under mddev lock, so rcu protection not needed.
*/
for (i = 0; i < conf->raid_disks; i++) {
- tmp = conf->mirrors + i;
- if (tmp->rdev
- && !test_bit(Faulty, &tmp->rdev->flags)
- && !test_bit(In_sync, &tmp->rdev->flags)) {
+ mdk_rdev_t *rdev = conf->mirrors[i].rdev;
+ if (rdev
+ && !test_bit(Faulty, &rdev->flags)
+ && !test_bit(In_sync, &rdev->flags)) {
conf->working_disks++;
mddev->degraded--;
- set_bit(In_sync, &tmp->rdev->flags);
+ set_bit(In_sync, &rdev->flags);
}
}
@@ -1145,7 +1150,7 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
long sectors_to_go = r1_bio->sectors;
/* make sure these bits doesn't get cleared. */
do {
- bitmap_end_sync(mddev->bitmap, r1_bio->sector,
+ bitmap_end_sync(mddev->bitmap, s,
&sync_blocks, 1);
s += sync_blocks;
sectors_to_go -= sync_blocks;
@@ -1237,7 +1242,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
/* ouch - failed to read all of that.
* Try some synchronous reads of other devices to get
* good data, much like with normal read errors. Only
- * read into the pages we already have so they we don't
+ * read into the pages we already have so we don't
* need to re-issue the read request.
* We don't need to freeze the array, because being in an
* active sync request, there is no normal IO, and
@@ -1257,6 +1262,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
s = PAGE_SIZE >> 9;
do {
if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
+ /* No rcu protection needed here devices
+ * can only be removed when no resync is
+ * active, and resync is currently active
+ */
rdev = conf->mirrors[d].rdev;
if (sync_page_io(rdev->bdev,
sect + rdev->data_offset,
@@ -1463,6 +1472,11 @@ static void raid1d(mddev_t *mddev)
s = PAGE_SIZE >> 9;
do {
+ /* Note: no rcu protection needed here
+ * as this is synchronous in the raid1d thread
+ * which is the thread that might remove
+ * a device. If raid1d ever becomes multi-threaded....
+ */
rdev = conf->mirrors[d].rdev;
if (rdev &&
test_bit(In_sync, &rdev->flags) &&
@@ -1486,7 +1500,6 @@ static void raid1d(mddev_t *mddev)
d = conf->raid_disks;
d--;
rdev = conf->mirrors[d].rdev;
- atomic_add(s, &rdev->corrected_errors);
if (rdev &&
test_bit(In_sync, &rdev->flags)) {
if (sync_page_io(rdev->bdev,
@@ -1509,6 +1522,11 @@ static void raid1d(mddev_t *mddev)
s<<9, conf->tmppage, READ) == 0)
/* Well, this device is dead */
md_error(mddev, rdev);
+ else {
+ atomic_add(s, &rdev->corrected_errors);
+ printk(KERN_INFO "raid1:%s: read error corrected (%d sectors at %llu on %s)\n",
+ mdname(mddev), s, (unsigned long long)(sect + rdev->data_offset), bdevname(rdev->bdev, b));
+ }
}
}
} else {
@@ -1622,15 +1640,16 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
return 0;
}
- /* before building a request, check if we can skip these blocks..
- * This call the bitmap_start_sync doesn't actually record anything
- */
if (mddev->bitmap == NULL &&
mddev->recovery_cp == MaxSector &&
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
conf->fullsync == 0) {
*skipped = 1;
return max_sector - sector_nr;
}
+ /* before building a request, check if we can skip these blocks..
+ * This call the bitmap_start_sync doesn't actually record anything
+ */
if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
/* We can skip this block, and probably several more */
@@ -1783,19 +1802,17 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
for (i=0; i<conf->raid_disks; i++) {
bio = r1_bio->bios[i];
if (bio->bi_end_io == end_sync_read) {
- md_sync_acct(conf->mirrors[i].rdev->bdev, nr_sectors);
+ md_sync_acct(bio->bi_bdev, nr_sectors);
generic_make_request(bio);
}
}
} else {
atomic_set(&r1_bio->remaining, 1);
bio = r1_bio->bios[r1_bio->read_disk];
- md_sync_acct(conf->mirrors[r1_bio->read_disk].rdev->bdev,
- nr_sectors);
+ md_sync_acct(bio->bi_bdev, nr_sectors);
generic_make_request(bio);
}
-
return nr_sectors;
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7f636283a1ba..016ddb831c9b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1492,6 +1492,10 @@ static void raid10d(mddev_t *mddev)
s<<9, conf->tmppage, READ) == 0)
/* Well, this device is dead */
md_error(mddev, rdev);
+ else
+ printk(KERN_INFO "raid10:%s: read error corrected (%d sectors at %llu on %s)\n",
+ mdname(mddev), s, (unsigned long long)(sect+rdev->data_offset), bdevname(rdev->bdev, b));
+
rdev_dec_pending(rdev, mddev);
rcu_read_lock();
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7433871f4b3a..450066007160 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -18,6 +18,30 @@
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
+/*
+ * BITMAP UNPLUGGING:
+ *
+ * The sequencing for updating the bitmap reliably is a little
+ * subtle (and I got it wrong the first time) so it deserves some
+ * explanation.
+ *
+ * We group bitmap updates into batches. Each batch has a number.
+ * We may write out several batches at once, but that isn't very important.
+ * conf->bm_write is the number of the last batch successfully written.
+ * conf->bm_flush is the number of the last batch that was closed to
+ * new additions.
+ * When we discover that we will need to write to any block in a stripe
+ * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
+ * the number of the batch it will be in. This is bm_flush+1.
+ * When we are ready to do a write, if that batch hasn't been written yet,
+ * we plug the array and queue the stripe for later.
+ * When an unplug happens, we increment bm_flush, thus closing the current
+ * batch.
+ * When we notice that bm_flush > bm_write, we write out all pending updates
+ * to the bitmap, and advance bm_write to where bm_flush was.
+ * This may occasionally write a bit out twice, but is sure never to
+ * miss any bits.
+ */
#include <linux/module.h>
#include <linux/slab.h>
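
The batching scheme the new comment block describes comes down to two counters, which the code calls conf->seq_flush and conf->seq_write (the comment calls them bm_flush and bm_write): an unplug closes the current batch by bumping seq_flush, the daemon writes the bitmap and advances seq_write, and a stripe stamped with bm_seq = seq_flush + 1 is only handled once seq_write has caught up with it. A compact userspace model of that sequencing; the names and structure below are illustrative, not the raid5 code:

#include <stdio.h>
#include <stdbool.h>

struct toy_conf {
    unsigned seq_flush;   /* last batch closed to new additions            */
    unsigned seq_write;   /* last batch whose bitmap updates reached disk  */
};

struct toy_stripe {
    unsigned bm_seq;      /* batch this stripe's bitmap update belongs to  */
};

/* add_stripe_bio(): the update joins the batch that is currently open */
static void toy_add_write(struct toy_conf *c, struct toy_stripe *sh)
{
    sh->bm_seq = c->seq_flush + 1;
}

/* release path: handle the stripe only once its batch has been written;
 * the signed difference tolerates counter wraparound, as in the patch */
static bool toy_can_handle(const struct toy_conf *c, const struct toy_stripe *sh)
{
    return (int)(sh->bm_seq - c->seq_write) <= 0;
}

/* unplug: close the current batch */
static void toy_unplug(struct toy_conf *c) { c->seq_flush++; }

/* daemon: when a batch is closed but unwritten, "write the bitmap",
 * then advance seq_write to where seq_flush was */
static void toy_flush_bitmap(struct toy_conf *c)
{
    if (c->seq_flush != c->seq_write) {
        unsigned seq = c->seq_flush;
        /* ... bitmap_unplug() would run here ... */
        c->seq_write = seq;
    }
}

int main(void)
{
    struct toy_conf c = { 0, 0 };
    struct toy_stripe sh;

    toy_add_write(&c, &sh);                       /* bm_seq = 1    */
    printf("before unplug: can handle? %d\n", toy_can_handle(&c, &sh)); /* 0 */
    toy_unplug(&c);                               /* seq_flush = 1 */
    toy_flush_bitmap(&c);                         /* seq_write = 1 */
    printf("after flush:   can handle? %d\n", toy_can_handle(&c, &sh)); /* 1 */
    return 0;
}
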
@@ -88,12 +112,14 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
BUG_ON(!list_empty(&sh->lru));
BUG_ON(atomic_read(&conf->active_stripes)==0);
if (test_bit(STRIPE_HANDLE, &sh->state)) {
- if (test_bit(STRIPE_DELAYED, &sh->state))
+ if (test_bit(STRIPE_DELAYED, &sh->state)) {
list_add_tail(&sh->lru, &conf->delayed_list);
- else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
- conf->seq_write == sh->bm_seq)
+ blk_plug_device(conf->mddev->queue);
+ } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+ sh->bm_seq - conf->seq_write > 0) {
list_add_tail(&sh->lru, &conf->bitmap_list);
- else {
+ blk_plug_device(conf->mddev->queue);
+ } else {
clear_bit(STRIPE_BIT_DELAY, &sh->state);
list_add_tail(&sh->lru, &conf->handle_list);
}
@@ -270,7 +296,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
< (conf->max_nr_stripes *3/4)
|| !conf->inactive_blocked),
conf->device_lock,
- unplug_slaves(conf->mddev)
+ raid5_unplug_device(conf->mddev->queue)
);
conf->inactive_blocked = 0;
} else
@@ -281,7 +307,8 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
} else {
if (!test_bit(STRIPE_HANDLE, &sh->state))
atomic_inc(&conf->active_stripes);
- if (list_empty(&sh->lru))
+ if (list_empty(&sh->lru) &&
+ !test_bit(STRIPE_EXPANDING, &sh->state))
BUG();
list_del_init(&sh->lru);
}
@@ -496,6 +523,8 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
raid5_conf_t *conf = sh->raid_conf;
int disks = sh->disks, i;
int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+ char b[BDEVNAME_SIZE];
+ mdk_rdev_t *rdev;
if (bi->bi_size)
return 1;
@@ -543,25 +572,39 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
set_bit(R5_UPTODATE, &sh->dev[i].flags);
#endif
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
- printk(KERN_INFO "raid5: read error corrected!!\n");
+ rdev = conf->disks[i].rdev;
+ printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n",
+ mdname(conf->mddev), STRIPE_SECTORS,
+ (unsigned long long)sh->sector + rdev->data_offset,
+ bdevname(rdev->bdev, b));
clear_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReWrite, &sh->dev[i].flags);
}
if (atomic_read(&conf->disks[i].rdev->read_errors))
atomic_set(&conf->disks[i].rdev->read_errors, 0);
} else {
+ const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
int retry = 0;
+ rdev = conf->disks[i].rdev;
+
clear_bit(R5_UPTODATE, &sh->dev[i].flags);
- atomic_inc(&conf->disks[i].rdev->read_errors);
+ atomic_inc(&rdev->read_errors);
if (conf->mddev->degraded)
- printk(KERN_WARNING "raid5: read error not correctable.\n");
+ printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n",
+ mdname(conf->mddev),
+ (unsigned long long)sh->sector + rdev->data_offset,
+ bdn);
else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
/* Oh, no!!! */
- printk(KERN_WARNING "raid5: read error NOT corrected!!\n");
- else if (atomic_read(&conf->disks[i].rdev->read_errors)
+ printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n",
+ mdname(conf->mddev),
+ (unsigned long long)sh->sector + rdev->data_offset,
+ bdn);
+ else if (atomic_read(&rdev->read_errors)
> conf->max_nr_stripes)
printk(KERN_WARNING
- "raid5: Too many read errors, failing device.\n");
+ "raid5:%s: Too many read errors, failing device %s.\n",
+ mdname(conf->mddev), bdn);
else
retry = 1;
if (retry)
@@ -569,7 +612,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
else {
clear_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReWrite, &sh->dev[i].flags);
- md_error(conf->mddev, conf->disks[i].rdev);
+ md_error(conf->mddev, rdev);
}
}
rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
@@ -1270,9 +1313,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
(unsigned long long)sh->sector, dd_idx);
if (conf->mddev->bitmap && firstwrite) {
- sh->bm_seq = conf->seq_write;
bitmap_startwrite(conf->mddev->bitmap, sh->sector,
STRIPE_SECTORS, 0);
+ sh->bm_seq = conf->seq_flush+1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
}
@@ -2554,13 +2597,6 @@ static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk,
return ret;
}
-static inline void raid5_plug_device(raid5_conf_t *conf)
-{
- spin_lock_irq(&conf->device_lock);
- blk_plug_device(conf->mddev->queue);
- spin_unlock_irq(&conf->device_lock);
-}
-
static int make_request(request_queue_t *q, struct bio * bi)
{
mddev_t *mddev = q->queuedata;
@@ -2670,7 +2706,6 @@ static int make_request(request_queue_t *q, struct bio * bi)
goto retry;
}
finish_wait(&conf->wait_for_overlap, &w);
- raid5_plug_device(conf);
handle_stripe(sh, NULL);
release_stripe(sh);
} else {
@@ -2923,7 +2958,7 @@ static void raid5d (mddev_t *mddev)
while (1) {
struct list_head *first;
- if (conf->seq_flush - conf->seq_write > 0) {
+ if (conf->seq_flush != conf->seq_write) {
int seq = conf->seq_flush;
spin_unlock_irq(&conf->device_lock);
bitmap_unplug(mddev->bitmap);
@@ -3246,9 +3281,6 @@ static int run(mddev_t *mddev)
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
mddev->sync_thread = md_register_thread(md_do_sync, mddev,
"%s_reshape");
- /* FIXME if md_register_thread fails?? */
- md_wakeup_thread(mddev->sync_thread);
-
}
/* read-ahead size must cover two whole stripes, which is