Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/dm-mpath.c |  3
-rw-r--r-- | drivers/md/dm-raid1.c |  4
-rw-r--r-- | drivers/md/linear.c   |  6
-rw-r--r-- | drivers/md/md.c       | 99
-rw-r--r-- | drivers/md/raid1.c    | 67
-rw-r--r-- | drivers/md/raid10.c   |  4
-rw-r--r-- | drivers/md/raid5.c    | 84
7 files changed, 173 insertions, 94 deletions
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 217615b33223..93f701ea87bc 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -710,6 +710,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 		return -EINVAL;
 	}
 
+	m->ti = ti;
+
 	r = parse_features(&as, m, ti);
 	if (r)
 		goto bad;
@@ -751,7 +753,6 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 	}
 
 	ti->private = m;
-	m->ti = ti;
 
 	return 0;
 
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index be48cedf986b..c54de989eb00 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -255,7 +255,9 @@ static struct region *__rh_alloc(struct region_hash *rh, region_t region)
 	struct region *reg, *nreg;
 
 	read_unlock(&rh->hash_lock);
-	nreg = mempool_alloc(rh->region_pool, GFP_NOIO);
+	nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
+	if (unlikely(!nreg))
+		nreg = kmalloc(sizeof(struct region), GFP_NOIO);
 	nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
 		RH_CLEAN : RH_NOSYNC;
 	nreg->rh = rh;
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index ff83c9b5979e..b99c19c7eb22 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -162,7 +162,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		goto out;
 	}
 
-	min_spacing = mddev->array_size;
+	min_spacing = conf->array_size;
 	sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *));
 
 	/* min_spacing is the minimum spacing that will fit the hash
@@ -171,7 +171,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 	 * that is larger than min_spacing as use the size of that as
 	 * the actual spacing */
-	conf->hash_spacing = mddev->array_size;
+	conf->hash_spacing = conf->array_size;
 	for (i=0; i < cnt-1 ; i++) {
 		sector_t sz = 0;
 		int j;
@@ -228,7 +228,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 	curr_offset = 0;
 	i = 0;
 	for (curr_offset = 0;
-	     curr_offset < mddev->array_size;
+	     curr_offset < conf->array_size;
 	     curr_offset += conf->hash_spacing) {
 
 		while (i < mddev->raid_disks-1 &&
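The dm-raid1.c hunk above swaps a potentially blocking mempool_alloc(GFP_NOIO) for a non-blocking pool allocation with a kmalloc() fallback, so the thread never sleeps waiting on a pool that only it refills. A condensed sketch of that pattern, not the driver's code (region_alloc() is a hypothetical helper, and it assumes the pool was created with mempool_create_kmalloc_pool() so objects from either path can later be released with mempool_free()):

static struct region *region_alloc(mempool_t *pool)
{
	struct region *nreg;

	/* Fast path: never blocks, so it cannot deadlock waiting for
	 * objects that only this thread would ever return to the pool.
	 */
	nreg = mempool_alloc(pool, GFP_ATOMIC);
	if (unlikely(!nreg))
		/* GFP_NOIO may sleep and reclaim, but will not recurse
		 * into this device's own I/O path.  Note it can still
		 * fail; callers would need to tolerate NULL.
		 */
		nreg = kmalloc(sizeof(*nreg), GFP_NOIO);

	return nreg;
}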
diff --git a/drivers/md/md.c b/drivers/md/md.c
index e4e161372a3e..8dbab2ef3885 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -110,7 +110,7 @@ static ctl_table raid_table[] = {
 		.procname = "speed_limit_min",
 		.data = &sysctl_speed_limit_min,
 		.maxlen = sizeof(int),
-		.mode = 0644,
+		.mode = S_IRUGO|S_IWUSR,
 		.proc_handler = &proc_dointvec,
 	},
 	{
@@ -118,7 +118,7 @@ static ctl_table raid_table[] = {
 		.procname = "speed_limit_max",
 		.data = &sysctl_speed_limit_max,
 		.maxlen = sizeof(int),
-		.mode = 0644,
+		.mode = S_IRUGO|S_IWUSR,
 		.proc_handler = &proc_dointvec,
 	},
 	{ .ctl_name = 0 }
@@ -129,7 +129,7 @@ static ctl_table raid_dir_table[] = {
 		.ctl_name = DEV_RAID,
 		.procname = "raid",
 		.maxlen = 0,
-		.mode = 0555,
+		.mode = S_IRUGO|S_IXUGO,
 		.child = raid_table,
 	},
 	{ .ctl_name = 0 }
@@ -1062,6 +1062,11 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	if (rdev->sb_size & bmask)
 		rdev-> sb_size = (rdev->sb_size | bmask)+1;
 
+	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
+		rdev->desc_nr = -1;
+	else
+		rdev->desc_nr = le32_to_cpu(sb->dev_number);
+
 	if (refdev == 0)
 		ret = 1;
 	else {
@@ -1171,7 +1176,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 	}
 	if (mddev->level != LEVEL_MULTIPATH) {
 		int role;
-		rdev->desc_nr = le32_to_cpu(sb->dev_number);
 		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
 		switch(role) {
 		case 0xffff: /* spare */
@@ -1593,6 +1597,19 @@ void md_update_sb(mddev_t * mddev)
 
 repeat:
 	spin_lock_irq(&mddev->write_lock);
+
+	if (mddev->degraded && mddev->sb_dirty == 3)
+		/* If the array is degraded, then skipping spares is both
+		 * dangerous and fairly pointless.
+		 * Dangerous because a device that was removed from the array
+		 * might have a event_count that still looks up-to-date,
+		 * so it can be re-added without a resync.
+		 * Pointless because if there are any spares to skip,
+		 * then a recovery will happen and soon that array won't
+		 * be degraded any more and the spare can go back to sleep then.
+		 */
+		mddev->sb_dirty = 1;
+
 	sync_req = mddev->in_sync;
 	mddev->utime = get_seconds();
 	if (mddev->sb_dirty == 3)
@@ -1779,8 +1796,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 	}
 	return err ? err : len;
 }
-static struct rdev_sysfs_entry
-rdev_state = __ATTR(state, 0644, state_show, state_store);
+static struct rdev_sysfs_entry rdev_state =
+__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
 
 static ssize_t
 super_show(mdk_rdev_t *rdev, char *page)
@@ -1811,7 +1828,7 @@ errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 	return -EINVAL;
 }
 static struct rdev_sysfs_entry rdev_errors =
-__ATTR(errors, 0644, errors_show, errors_store);
+__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
 
 static ssize_t
 slot_show(mdk_rdev_t *rdev, char *page)
@@ -1845,7 +1862,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 
 
 static struct rdev_sysfs_entry rdev_slot =
-__ATTR(slot, 0644, slot_show, slot_store);
+__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
 
 static ssize_t
 offset_show(mdk_rdev_t *rdev, char *page)
@@ -1867,7 +1884,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 }
 
 static struct rdev_sysfs_entry rdev_offset =
-__ATTR(offset, 0644, offset_show, offset_store);
+__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
 
 static ssize_t
 rdev_size_show(mdk_rdev_t *rdev, char *page)
@@ -1891,7 +1908,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 }
 
 static struct rdev_sysfs_entry rdev_size =
-__ATTR(size, 0644, rdev_size_show, rdev_size_store);
+__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
 
 static struct attribute *rdev_default_attrs[] = {
 	&rdev_state.attr,
@@ -1922,6 +1939,8 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
 
 	if (!entry->store)
 		return -EIO;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
 	return entry->store(rdev, page, length);
 }
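rdev_attr_store() above now refuses writes from tasks without CAP_SYS_ADMIN, and md_attr_store() later in this patch gains the identical check. The point is defence in depth: the mode bits on a sysfs file can be relaxed after creation, so the common ->store() dispatcher enforces the capability itself. A reduced sketch of the pattern, with hypothetical foo_* names standing in for the md types:

struct foo_dev;
struct foo_sysfs_entry {
	struct attribute attr;
	ssize_t (*show)(struct foo_dev *, char *);
	ssize_t (*store)(struct foo_dev *, const char *, size_t);
};

static ssize_t foo_attr_store(struct kobject *kobj, struct attribute *attr,
			      const char *page, size_t length)
{
	struct foo_sysfs_entry *entry =
		container_of(attr, struct foo_sysfs_entry, attr);
	struct foo_dev *dev = container_of(kobj, struct foo_dev, kobj);

	if (!entry->store)
		return -EIO;		/* attribute is read-only */
	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;		/* don't trust mode bits alone */
	return entry->store(dev, page, length);
}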
@@ -2128,7 +2147,7 @@ safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
 	return len;
 }
 static struct md_sysfs_entry md_safe_delay =
-__ATTR(safe_mode_delay, 0644,safe_delay_show, safe_delay_store);
+__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
 
 static ssize_t
 level_show(mddev_t *mddev, char *page)
@@ -2163,7 +2182,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 }
 
 static struct md_sysfs_entry md_level =
-__ATTR(level, 0644, level_show, level_store);
+__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
 
 static ssize_t
@@ -2188,7 +2207,7 @@ layout_store(mddev_t *mddev, const char *buf, size_t len)
 	return len;
 }
 static struct md_sysfs_entry md_layout =
-__ATTR(layout, 0655, layout_show, layout_store);
+__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
 
 static ssize_t
@@ -2219,7 +2238,7 @@ raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
 	return rv ? rv : len;
 }
 static struct md_sysfs_entry md_raid_disks =
-__ATTR(raid_disks, 0644, raid_disks_show, raid_disks_store);
+__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
 
 static ssize_t
 chunk_size_show(mddev_t *mddev, char *page)
@@ -2243,7 +2262,7 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
 	return len;
 }
 static struct md_sysfs_entry md_chunk_size =
-__ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);
+__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
 
 static ssize_t
 resync_start_show(mddev_t *mddev, char *page)
@@ -2267,7 +2286,7 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len)
 	return len;
 }
 static struct md_sysfs_entry md_resync_start =
-__ATTR(resync_start, 0644, resync_start_show, resync_start_store);
+__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
 
 /*
  * The array state can be:
@@ -2437,7 +2456,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
 	else
 		return len;
 }
-static struct md_sysfs_entry md_array_state = __ATTR(array_state, 0644, array_state_show, array_state_store);
+static struct md_sysfs_entry md_array_state =
+__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
 
 static ssize_t
 null_show(mddev_t *mddev, char *page)
@@ -2497,7 +2517,7 @@ new_dev_store(mddev_t *mddev, const char *buf, size_t len)
 }
 
 static struct md_sysfs_entry md_new_device =
-__ATTR(new_dev, 0200, null_show, new_dev_store);
+__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
 
 static ssize_t
 size_show(mddev_t *mddev, char *page)
@@ -2535,7 +2555,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
 }
 
 static struct md_sysfs_entry md_size =
-__ATTR(component_size, 0644, size_show, size_store);
+__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
 
 /* Metdata version.
@@ -2583,7 +2603,7 @@ metadata_store(mddev_t *mddev, const char *buf, size_t len)
 }
 
 static struct md_sysfs_entry md_metadata =
-__ATTR(metadata_version, 0644, metadata_show, metadata_store);
+__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
 
 static ssize_t
 action_show(mddev_t *mddev, char *page)
@@ -2651,12 +2671,11 @@ mismatch_cnt_show(mddev_t *mddev, char *page)
 		       (unsigned long long) mddev->resync_mismatches);
 }
 
-static struct md_sysfs_entry
-md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
+static struct md_sysfs_entry md_scan_mode =
+__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
 
-static struct md_sysfs_entry
-md_mismatches = __ATTR_RO(mismatch_cnt);
+static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
 
 static ssize_t
 sync_min_show(mddev_t *mddev, char *page)
@@ -2715,15 +2734,14 @@ static ssize_t
 sync_speed_show(mddev_t *mddev, char *page)
 {
 	unsigned long resync, dt, db;
-	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
+	resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active));
 	dt = ((jiffies - mddev->resync_mark) / HZ);
 	if (!dt) dt++;
 	db = resync - (mddev->resync_mark_cnt);
 	return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
 }
-static struct md_sysfs_entry
-md_sync_speed = __ATTR_RO(sync_speed);
+static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
 
 static ssize_t
 sync_completed_show(mddev_t *mddev, char *page)
@@ -2739,8 +2757,7 @@ sync_completed_show(mddev_t *mddev, char *page)
 	return sprintf(page, "%lu / %lu\n", resync, max_blocks);
 }
 
-static struct md_sysfs_entry
-md_sync_completed = __ATTR_RO(sync_completed);
+static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
 
 static ssize_t
 suspend_lo_show(mddev_t *mddev, char *page)
@@ -2857,6 +2874,8 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
 
 	if (!entry->store)
 		return -EIO;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
 	rv = mddev_lock(mddev);
 	if (!rv) {
 		rv = entry->store(mddev, page, length);
@@ -3091,7 +3110,6 @@ static int do_md_run(mddev_t * mddev)
 	}
 
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-	md_wakeup_thread(mddev->thread);
 
 	if (mddev->sb_dirty)
 		md_update_sb(mddev);
@@ -3112,7 +3130,7 @@ static int do_md_run(mddev_t * mddev)
 		 * start recovery here.  If we leave it to md_check_recovery,
 		 * it will remove the drives and not do the right thing
 		 */
-		if (mddev->degraded) {
+		if (mddev->degraded && !mddev->sync_thread) {
 			struct list_head *rtmp;
 			int spares = 0;
 			ITERATE_RDEV(mddev,rdev,rtmp)
@@ -3133,10 +3151,11 @@ static int do_md_run(mddev_t * mddev)
 					       mdname(mddev));
 				/* leave the spares where they are, it shouldn't hurt */
 				mddev->recovery = 0;
-			} else
-				md_wakeup_thread(mddev->sync_thread);
+			}
 		}
 	}
+	md_wakeup_thread(mddev->thread);
+	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
 	mddev->changed = 1;
 	md_new_event(mddev);
@@ -4586,6 +4605,8 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
 		__builtin_return_address(0),__builtin_return_address(1),
 		__builtin_return_address(2),__builtin_return_address(3));
 */
+	if (!mddev->pers)
+		return;
 	if (!mddev->pers->error_handler)
 		return;
 	mddev->pers->error_handler(mddev,rdev);
@@ -4683,12 +4704,13 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
 	 */
 	dt = ((jiffies - mddev->resync_mark) / HZ);
 	if (!dt) dt++;
-	db = resync - (mddev->resync_mark_cnt/2);
-	rt = (dt * ((unsigned long)(max_blocks-resync) / (db/100+1)))/100;
+	db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
+		- mddev->resync_mark_cnt;
+	rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100;
 
 	seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
 
-	seq_printf(seq, " speed=%ldK/sec", db/dt);
+	seq_printf(seq, " speed=%ldK/sec", db/2/dt);
 }
 
 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
@@ -5199,6 +5221,7 @@ void md_do_sync(mddev_t *mddev)
 		j += sectors;
 		if (j>1) mddev->curr_resync = j;
+		mddev->curr_mark_cnt = io_sectors;
 		if (last_check == 0)
 			/* this is the earliers that rebuilt will be
 			 * visible in /proc/mdstat
@@ -5645,8 +5668,8 @@ static int set_ro(const char *val, struct kernel_param *kp)
 	return -EINVAL;
 }
 
-module_param_call(start_ro, set_ro, get_ro, NULL, 0600);
-module_param(start_dirty_degraded, int, 0644);
+module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
+module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
 
 
 EXPORT_SYMBOL(register_md_personality);
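Both speed fixes in md.c above (sync_speed_show() and status_resync()) rest on the same arithmetic: the counters are kept in 512-byte sectors, so dividing by 2 yields KiB, and dividing by the elapsed seconds yields K/sec; sectors still in flight (recovery_active) are subtracted so the rate counts only acknowledged I/O. As standalone C, with names mirroring md.c but the function itself purely illustrative:

static unsigned long resync_speed_kps(unsigned long curr_mark_cnt,
				      unsigned long recovery_active,
				      unsigned long resync_mark_cnt,
				      unsigned long dt)
{
	unsigned long db;

	/* Sectors completed since the last mark, excluding I/O that
	 * has been submitted but not yet acknowledged.
	 */
	db = (curr_mark_cnt - recovery_active) - resync_mark_cnt;
	if (!dt)
		dt = 1;		/* md.c does "if (!dt) dt++;" */
	return db / 2 / dt;	/* sectors -> KiB, then KiB per second */
}

For example, 409600 sectors acknowledged over 10 seconds gives 409600/2/10 = 20480, which /proc/mdstat would print as speed=20480K/sec.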
"U" : "_"); + } + rcu_read_unlock(); seq_printf(seq, "]"); } @@ -975,7 +978,6 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) static void print_conf(conf_t *conf) { int i; - mirror_info_t *tmp; printk("RAID1 conf printout:\n"); if (!conf) { @@ -985,14 +987,17 @@ static void print_conf(conf_t *conf) printk(" --- wd:%d rd:%d\n", conf->working_disks, conf->raid_disks); + rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { char b[BDEVNAME_SIZE]; - tmp = conf->mirrors + i; - if (tmp->rdev) + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev) printk(" disk %d, wo:%d, o:%d, dev:%s\n", - i, !test_bit(In_sync, &tmp->rdev->flags), !test_bit(Faulty, &tmp->rdev->flags), - bdevname(tmp->rdev->bdev,b)); + i, !test_bit(In_sync, &rdev->flags), + !test_bit(Faulty, &rdev->flags), + bdevname(rdev->bdev,b)); } + rcu_read_unlock(); } static void close_sync(conf_t *conf) @@ -1008,20 +1013,20 @@ static int raid1_spare_active(mddev_t *mddev) { int i; conf_t *conf = mddev->private; - mirror_info_t *tmp; /* * Find all failed disks within the RAID1 configuration - * and mark them readable + * and mark them readable. + * Called under mddev lock, so rcu protection not needed. */ for (i = 0; i < conf->raid_disks; i++) { - tmp = conf->mirrors + i; - if (tmp->rdev - && !test_bit(Faulty, &tmp->rdev->flags) - && !test_bit(In_sync, &tmp->rdev->flags)) { + mdk_rdev_t *rdev = conf->mirrors[i].rdev; + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_bit(In_sync, &rdev->flags)) { conf->working_disks++; mddev->degraded--; - set_bit(In_sync, &tmp->rdev->flags); + set_bit(In_sync, &rdev->flags); } } @@ -1145,7 +1150,7 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) long sectors_to_go = r1_bio->sectors; /* make sure these bits doesn't get cleared. */ do { - bitmap_end_sync(mddev->bitmap, r1_bio->sector, + bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1); s += sync_blocks; sectors_to_go -= sync_blocks; @@ -1237,7 +1242,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) /* ouch - failed to read all of that. * Try some synchronous reads of other devices to get * good data, much like with normal read errors. Only - * read into the pages we already have so they we don't + * read into the pages we already have so we don't * need to re-issue the read request. * We don't need to freeze the array, because being in an * active sync request, there is no normal IO, and @@ -1257,6 +1262,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) s = PAGE_SIZE >> 9; do { if (r1_bio->bios[d]->bi_end_io == end_sync_read) { + /* No rcu protection needed here devices + * can only be removed when no resync is + * active, and resync is currently active + */ rdev = conf->mirrors[d].rdev; if (sync_page_io(rdev->bdev, sect + rdev->data_offset, @@ -1463,6 +1472,11 @@ static void raid1d(mddev_t *mddev) s = PAGE_SIZE >> 9; do { + /* Note: no rcu protection needed here + * as this is synchronous in the raid1d thread + * which is the thread that might remove + * a device. If raid1d ever becomes multi-threaded.... 
@@ -1463,6 +1472,11 @@ static void raid1d(mddev_t *mddev)
 				s = PAGE_SIZE >> 9;
 				do {
+					/* Note: no rcu protection needed here
+					 * as this is synchronous in the raid1d thread
+					 * which is the thread that might remove
+					 * a device.  If raid1d ever becomes multi-threaded....
+					 */
 					rdev = conf->mirrors[d].rdev;
 					if (rdev &&
 					    test_bit(In_sync, &rdev->flags) &&
@@ -1486,7 +1500,6 @@ static void raid1d(mddev_t *mddev)
 						d = conf->raid_disks;
 					d--;
 					rdev = conf->mirrors[d].rdev;
-					atomic_add(s, &rdev->corrected_errors);
 					if (rdev &&
 					    test_bit(In_sync, &rdev->flags)) {
 						if (sync_page_io(rdev->bdev,
@@ -1509,6 +1522,11 @@ static void raid1d(mddev_t *mddev)
 							     s<<9, conf->tmppage, READ) == 0)
 							/* Well, this device is dead */
 							md_error(mddev, rdev);
+						else {
+							atomic_add(s, &rdev->corrected_errors);
+							printk(KERN_INFO "raid1:%s: read error corrected (%d sectors at %llu on %s)\n",
+							       mdname(mddev), s, (unsigned long long)(sect + rdev->data_offset), bdevname(rdev->bdev, b));
+						}
 					}
 				}
 			} else {
@@ -1622,15 +1640,16 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		return 0;
 	}
 
-	/* before building a request, check if we can skip these blocks..
-	 * This call the bitmap_start_sync doesn't actually record anything
-	 */
 	if (mddev->bitmap == NULL &&
 	    mddev->recovery_cp == MaxSector &&
+	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
 	    conf->fullsync == 0) {
 		*skipped = 1;
 		return max_sector - sector_nr;
 	}
+	/* before building a request, check if we can skip these blocks..
+	 * This call the bitmap_start_sync doesn't actually record anything
+	 */
 	if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
 	    !conf->fullsync &&
 	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
 		/* We can skip this block, and probably several more */
@@ -1783,19 +1802,17 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		for (i=0; i<conf->raid_disks; i++) {
 			bio = r1_bio->bios[i];
 			if (bio->bi_end_io == end_sync_read) {
-				md_sync_acct(conf->mirrors[i].rdev->bdev, nr_sectors);
+				md_sync_acct(bio->bi_bdev, nr_sectors);
 				generic_make_request(bio);
 			}
 		}
 	} else {
 		atomic_set(&r1_bio->remaining, 1);
 		bio = r1_bio->bios[r1_bio->read_disk];
-		md_sync_acct(conf->mirrors[r1_bio->read_disk].rdev->bdev,
-			     nr_sectors);
+		md_sync_acct(bio->bi_bdev, nr_sectors);
 		generic_make_request(bio);
 	}
-
 	return nr_sectors;
 }
 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7f636283a1ba..016ddb831c9b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1492,6 +1492,10 @@ static void raid10d(mddev_t *mddev)
 							    s<<9, conf->tmppage, READ) == 0)
 						/* Well, this device is dead */
 						md_error(mddev, rdev);
+					else
+						printk(KERN_INFO "raid10:%s: read error corrected (%d sectors at %llu on %s)\n",
+						       mdname(mddev), s, (unsigned long long)(sect+rdev->data_offset), bdevname(rdev->bdev, b));
+
 					rdev_dec_pending(rdev, mddev);
 					rcu_read_lock();
 				}
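raid1d() and raid10d() share the fix-up sequence for a failed read: fetch the data from a healthy mirror, write it over the bad sectors, then re-read to verify. The raid1.c hunk above moves the corrected_errors accounting so it happens only after both steps succeed. Roughly, with heavily simplified control flow (fixup_read_error() is an illustration, not a function added by this patch; 'page' is assumed to already hold good data from another mirror, 's' is the sector count, 'sect' the start sector):

static void fixup_read_error(mddev_t *mddev, mdk_rdev_t *rdev,
			     sector_t sect, int s, struct page *page)
{
	if (sync_page_io(rdev->bdev, sect + rdev->data_offset,
			 s << 9, page, WRITE) == 0) {
		/* Overwrite failed: the device really is bad. */
		md_error(mddev, rdev);
		return;
	}
	if (sync_page_io(rdev->bdev, sect + rdev->data_offset,
			 s << 9, page, READ) == 0) {
		/* Re-read failed: the rewrite did not stick. */
		md_error(mddev, rdev);
		return;
	}
	/* Only now is the error known to be corrected; this ordering
	 * is exactly what the atomic_add() move above restores.
	 */
	atomic_add(s, &rdev->corrected_errors);
}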
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7433871f4b3a..450066007160 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -18,6 +18,30 @@
  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+/*
+ * BITMAP UNPLUGGING:
+ *
+ * The sequencing for updating the bitmap reliably is a little
+ * subtle (and I got it wrong the first time) so it deserves some
+ * explanation.
+ *
+ * We group bitmap updates into batches.  Each batch has a number.
+ * We may write out several batches at once, but that isn't very important.
+ * conf->bm_write is the number of the last batch successfully written.
+ * conf->bm_flush is the number of the last batch that was closed to
+ *    new additions.
+ * When we discover that we will need to write to any block in a stripe
+ * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
+ * the number of the batch it will be in. This is bm_flush+1.
+ * When we are ready to do a write, if that batch hasn't been written yet,
+ *    we plug the array and queue the stripe for later.
+ * When an unplug happens, we increment bm_flush, thus closing the current
+ *    batch.
+ * When we notice that bm_flush > bm_write, we write out all pending updates
+ * to the bitmap, and advance bm_write to where bm_flush was.
+ * This may occasionally write a bit out twice, but is sure never to
+ *    miss any bits.
+ */
+
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -88,12 +112,14 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
 			BUG_ON(!list_empty(&sh->lru));
 			BUG_ON(atomic_read(&conf->active_stripes)==0);
 			if (test_bit(STRIPE_HANDLE, &sh->state)) {
-				if (test_bit(STRIPE_DELAYED, &sh->state))
+				if (test_bit(STRIPE_DELAYED, &sh->state)) {
 					list_add_tail(&sh->lru, &conf->delayed_list);
-				else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
-					 conf->seq_write == sh->bm_seq)
+					blk_plug_device(conf->mddev->queue);
+				} else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+					   sh->bm_seq - conf->seq_write > 0) {
 					list_add_tail(&sh->lru, &conf->bitmap_list);
-				else {
+					blk_plug_device(conf->mddev->queue);
+				} else {
 					clear_bit(STRIPE_BIT_DELAY, &sh->state);
 					list_add_tail(&sh->lru, &conf->handle_list);
 				}
@@ -270,7 +296,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
 						     < (conf->max_nr_stripes *3/4)
 						    || !conf->inactive_blocked),
 						    conf->device_lock,
-						    unplug_slaves(conf->mddev)
+						    raid5_unplug_device(conf->mddev->queue)
 					);
 				conf->inactive_blocked = 0;
 			} else
@@ -281,7 +307,8 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
 		} else {
 			if (!test_bit(STRIPE_HANDLE, &sh->state))
 				atomic_inc(&conf->active_stripes);
-			if (list_empty(&sh->lru))
+			if (list_empty(&sh->lru) &&
+			    !test_bit(STRIPE_EXPANDING, &sh->state))
 				BUG();
 			list_del_init(&sh->lru);
 		}
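The new test sh->bm_seq - conf->seq_write > 0 in __release_stripe() replaces an equality check so that the ordering survives counter wraparound: interpreting the difference as signed says whether one sequence number is logically ahead of another, the same trick as the kernel's time_after(). A self-contained demonstration (userspace C; unsigned arithmetic keeps the wrap well-defined):

#include <assert.h>
#include <limits.h>

/* "a is logically after b", valid while the two counters stay
 * within INT_MAX of each other.
 */
static int seq_after(unsigned int a, unsigned int b)
{
	return (int)(a - b) > 0;
}

int main(void)
{
	assert(seq_after(5, 3));		/* plainly ahead */
	assert(!seq_after(3, 5));
	assert(!seq_after(7, 7));		/* equal is not "after" */
	assert(seq_after(0, UINT_MAX));		/* correct across the wrap */
	return 0;
}

The raid5d() hunk further down relaxes its own check from subtraction to a plain seq_flush != seq_write in the same spirit: the thread only needs to know whether a newer batch exists, not how much newer.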
@@ -496,6 +523,8 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
 	raid5_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks, i;
 	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+	char b[BDEVNAME_SIZE];
+	mdk_rdev_t *rdev;
 
 	if (bi->bi_size)
 		return 1;
@@ -543,25 +572,39 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
 		set_bit(R5_UPTODATE, &sh->dev[i].flags);
 #endif
 		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
-			printk(KERN_INFO "raid5: read error corrected!!\n");
+			rdev = conf->disks[i].rdev;
+			printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n",
+			       mdname(conf->mddev), STRIPE_SECTORS,
+			       (unsigned long long)sh->sector + rdev->data_offset,
+			       bdevname(rdev->bdev, b));
 			clear_bit(R5_ReadError, &sh->dev[i].flags);
 			clear_bit(R5_ReWrite, &sh->dev[i].flags);
 		}
 		if (atomic_read(&conf->disks[i].rdev->read_errors))
 			atomic_set(&conf->disks[i].rdev->read_errors, 0);
 	} else {
+		const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
 		int retry = 0;
+		rdev = conf->disks[i].rdev;
+
 		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
-		atomic_inc(&conf->disks[i].rdev->read_errors);
+		atomic_inc(&rdev->read_errors);
 		if (conf->mddev->degraded)
-			printk(KERN_WARNING "raid5: read error not correctable.\n");
+			printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n",
+			       mdname(conf->mddev),
+			       (unsigned long long)sh->sector + rdev->data_offset,
+			       bdn);
 		else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
 			/* Oh, no!!! */
-			printk(KERN_WARNING "raid5: read error NOT corrected!!\n");
+			printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n",
+			       mdname(conf->mddev),
+			       (unsigned long long)sh->sector + rdev->data_offset,
+			       bdn);
-		else if (atomic_read(&conf->disks[i].rdev->read_errors)
+		else if (atomic_read(&rdev->read_errors)
 			 > conf->max_nr_stripes)
 			printk(KERN_WARNING
-			       "raid5: Too many read errors, failing device.\n");
+			       "raid5:%s: Too many read errors, failing device %s.\n",
+			       mdname(conf->mddev), bdn);
 		else
 			retry = 1;
 		if (retry)
@@ -569,7 +612,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
 		else {
 			clear_bit(R5_ReadError, &sh->dev[i].flags);
 			clear_bit(R5_ReWrite, &sh->dev[i].flags);
-			md_error(conf->mddev, conf->disks[i].rdev);
+			md_error(conf->mddev, rdev);
 		}
 	}
 	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
@@ -1270,9 +1313,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 		(unsigned long long)sh->sector, dd_idx);
 
 	if (conf->mddev->bitmap && firstwrite) {
-		sh->bm_seq = conf->seq_write;
 		bitmap_startwrite(conf->mddev->bitmap, sh->sector,
 				  STRIPE_SECTORS, 0);
+		sh->bm_seq = conf->seq_flush+1;
 		set_bit(STRIPE_BIT_DELAY, &sh->state);
 	}
@@ -2554,13 +2597,6 @@ static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk,
 	return ret;
 }
 
-static inline void raid5_plug_device(raid5_conf_t *conf)
-{
-	spin_lock_irq(&conf->device_lock);
-	blk_plug_device(conf->mddev->queue);
-	spin_unlock_irq(&conf->device_lock);
-}
-
 static int make_request(request_queue_t *q, struct bio * bi)
 {
 	mddev_t *mddev = q->queuedata;
@@ -2670,7 +2706,6 @@ static int make_request(request_queue_t *q, struct bio * bi)
 				goto retry;
 			}
 			finish_wait(&conf->wait_for_overlap, &w);
-			raid5_plug_device(conf);
 			handle_stripe(sh, NULL);
 			release_stripe(sh);
 		} else {
@@ -2923,7 +2958,7 @@ static void raid5d (mddev_t *mddev)
 	while (1) {
 		struct list_head *first;
 
-		if (conf->seq_flush - conf->seq_write > 0) {
+		if (conf->seq_flush != conf->seq_write) {
 			int seq = conf->seq_flush;
 			spin_unlock_irq(&conf->device_lock);
 			bitmap_unplug(mddev->bitmap);
@@ -3246,9 +3281,6 @@ static int run(mddev_t *mddev)
 		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 		mddev->sync_thread = md_register_thread(md_do_sync, mddev,
 							"%s_reshape");
-		/* FIXME if md_register_thread fails?? */
-		md_wakeup_thread(mddev->sync_thread);
-
 	}
 
 	/* read-ahead size must cover two whole stripes, which is