diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-07-29 03:04:39 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-07-29 03:04:39 +0200 |
commit | 867900b5ec231b3386304e61a42bfc9b30f9076f (patch) | |
tree | 6dd822033f162f7a928b7c8cd152989564782145 /drivers/md | |
parent | Merge tag 'libnvdimm-for-4.8' of git://git.kernel.org/pub/scm/linux/kernel/gi... (diff) | |
parent | Merge branch 'mymd/for-next' into mymd/for-linus (diff) | |
download | linux-867900b5ec231b3386304e61a42bfc9b30f9076f.tar.xz linux-867900b5ec231b3386304e61a42bfc9b30f9076f.zip |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md
Pull MD updates from Shaohua Li:
- A bunch of patches from Neil Brown to fix RCU usage
- Two performance improvement patches from Tomasz Majchrzak
- Alexey Obitotskiy fixes module refcount issue
- Arnd Bergmann fixes time granularity
- Cong Wang fixes a list corruption issue
- Guoqing Jiang fixes a deadlock in md-cluster
- A null pointer deference fix from me
- Song Liu fixes misuse of raid6 rmw
- Other trival/cleanup fixes from Guoqing Jiang and Xiao Ni
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md: (28 commits)
MD: fix null pointer deference
raid10: improve random reads performance
md: add missing sysfs_notify on array_state update
Fix kernel module refcount handling
md: use seconds granularity for error logging
md: reduce the number of synchronize_rcu() calls when multiple devices fail.
md: be extra careful not to take a reference to a Faulty device.
md/multipath: add rcu protection to rdev access in multipath_status.
md/raid5: add rcu protection to rdev accesses in raid5_status.
md/raid5: add rcu protection to rdev accesses in want_replace
md/raid5: add rcu protection to rdev accesses in handle_failed_sync.
md/raid1: add rcu protection to rdev in fix_read_error
md/raid1: small code cleanup in end_sync_write
md/raid1: small cleanup in raid1_end_read/write_request
md/raid10: simplify print_conf a little.
md/raid10: minor code improvement in fix_read_error()
md/raid10: add rcu protection to rdev access during reshape.
md/raid10: add rcu protection to rdev access in raid10_sync_request.
md/raid10: add rcu protection in raid10_status.
md/raid10: fix refounct imbalance when resyncing an array with a replacement device.
...
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/md.c | 74 | ||||
-rw-r--r-- | drivers/md/md.h | 10 | ||||
-rw-r--r-- | drivers/md/multipath.c | 29 | ||||
-rw-r--r-- | drivers/md/raid1.c | 130 | ||||
-rw-r--r-- | drivers/md/raid10.c | 250 | ||||
-rw-r--r-- | drivers/md/raid10.h | 3 | ||||
-rw-r--r-- | drivers/md/raid5.c | 45 |
7 files changed, 328 insertions, 213 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index 1f123f5a29da..2c3ab6f5e6be 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2482,8 +2482,7 @@ static int add_bound_rdev(struct md_rdev *rdev) if (add_journal) mddev_resume(mddev); if (err) { - unbind_rdev_from_array(rdev); - export_rdev(rdev); + md_kick_rdev_from_array(rdev); return err; } } @@ -2600,6 +2599,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) else err = -EBUSY; } else if (cmd_match(buf, "remove")) { + if (rdev->mddev->pers) { + clear_bit(Blocked, &rdev->flags); + remove_and_add_spares(rdev->mddev, rdev); + } if (rdev->raid_disk >= 0) err = -EBUSY; else { @@ -3176,8 +3179,7 @@ int md_rdev_init(struct md_rdev *rdev) rdev->data_offset = 0; rdev->new_data_offset = 0; rdev->sb_events = 0; - rdev->last_read_error.tv_sec = 0; - rdev->last_read_error.tv_nsec = 0; + rdev->last_read_error = 0; rdev->sb_loaded = 0; rdev->bb_page = NULL; atomic_set(&rdev->nr_pending, 0); @@ -3583,6 +3585,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->to_remove = &md_redundancy_group; } + module_put(oldpers->owner); + rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) continue; @@ -3940,6 +3944,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) } else err = -EBUSY; } + if (!err) + sysfs_notify_dirent_safe(mddev->sysfs_state); spin_unlock(&mddev->lock); return err ?: len; } @@ -4191,7 +4197,8 @@ size_store(struct mddev *mddev, const char *buf, size_t len) return err; if (mddev->pers) { err = update_size(mddev, sectors); - md_update_sb(mddev, 1); + if (err == 0) + md_update_sb(mddev, 1); } else { if (mddev->dev_sectors == 0 || mddev->dev_sectors > sectors) @@ -7813,6 +7820,7 @@ void md_do_sync(struct md_thread *thread) if (ret) goto skip; + set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags); if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) @@ -8151,18 +8159,11 @@ void md_do_sync(struct md_thread *thread) } } skip: - if (mddev_is_clustered(mddev) && - ret == 0) { - /* set CHANGE_PENDING here since maybe another - * update is needed, so other nodes are informed */ - set_mask_bits(&mddev->flags, 0, - BIT(MD_CHANGE_PENDING) | BIT(MD_CHANGE_DEVS)); - md_wakeup_thread(mddev->thread); - wait_event(mddev->sb_wait, - !test_bit(MD_CHANGE_PENDING, &mddev->flags)); - md_cluster_ops->resync_finish(mddev); - } else - set_bit(MD_CHANGE_DEVS, &mddev->flags); + /* set CHANGE_PENDING here since maybe another update is needed, + * so other nodes are informed. It should be harmless for normal + * raid */ + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_PENDING) | BIT(MD_CHANGE_DEVS)); spin_lock(&mddev->lock); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { @@ -8188,15 +8189,34 @@ static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *rdev; int spares = 0; int removed = 0; + bool remove_some = false; - rdev_for_each(rdev, mddev) + rdev_for_each(rdev, mddev) { + if ((this == NULL || rdev == this) && + rdev->raid_disk >= 0 && + !test_bit(Blocked, &rdev->flags) && + test_bit(Faulty, &rdev->flags) && + atomic_read(&rdev->nr_pending)==0) { + /* Faulty non-Blocked devices with nr_pending == 0 + * never get nr_pending incremented, + * never get Faulty cleared, and never get Blocked set. + * So we can synchronize_rcu now rather than once per device + */ + remove_some = true; + set_bit(RemoveSynchronized, &rdev->flags); + } + } + + if (remove_some) + synchronize_rcu(); + rdev_for_each(rdev, mddev) { if ((this == NULL || rdev == this) && rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && - (test_bit(Faulty, &rdev->flags) || + ((test_bit(RemoveSynchronized, &rdev->flags) || (!test_bit(In_sync, &rdev->flags) && !test_bit(Journal, &rdev->flags))) && - atomic_read(&rdev->nr_pending)==0) { + atomic_read(&rdev->nr_pending)==0)) { if (mddev->pers->hot_remove_disk( mddev, rdev) == 0) { sysfs_unlink_rdev(mddev, rdev); @@ -8204,6 +8224,10 @@ static int remove_and_add_spares(struct mddev *mddev, removed++; } } + if (remove_some && test_bit(RemoveSynchronized, &rdev->flags)) + clear_bit(RemoveSynchronized, &rdev->flags); + } + if (removed && mddev->kobj.sd) sysfs_notify(&mddev->kobj, NULL, "degraded"); @@ -8506,6 +8530,11 @@ void md_reap_sync_thread(struct mddev *mddev) rdev->saved_raid_disk = -1; md_update_sb(mddev, 1); + /* MD_CHANGE_PENDING should be cleared by md_update_sb, so we can + * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by + * clustered raid */ + if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) + md_cluster_ops->resync_finish(mddev); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); @@ -8803,6 +8832,7 @@ EXPORT_SYMBOL(md_reload_sb); * at boot time. */ +static DEFINE_MUTEX(detected_devices_mutex); static LIST_HEAD(all_detected_devices); struct detected_devices_node { struct list_head list; @@ -8816,7 +8846,9 @@ void md_autodetect_dev(dev_t dev) node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); if (node_detected_dev) { node_detected_dev->dev = dev; + mutex_lock(&detected_devices_mutex); list_add_tail(&node_detected_dev->list, &all_detected_devices); + mutex_unlock(&detected_devices_mutex); } else { printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed" ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev)); @@ -8835,6 +8867,7 @@ static void autostart_arrays(int part) printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); + mutex_lock(&detected_devices_mutex); while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { i_scanned++; node_detected_dev = list_entry(all_detected_devices.next, @@ -8853,6 +8886,7 @@ static void autostart_arrays(int part) list_add(&rdev->same_set, &pending_raid_disks); i_passed++; } + mutex_unlock(&detected_devices_mutex); printk(KERN_INFO "md: Scanned %d and added %d devices.\n", i_scanned, i_passed); diff --git a/drivers/md/md.h b/drivers/md/md.h index b4f335245bd6..20c667579ede 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -99,7 +99,7 @@ struct md_rdev { atomic_t read_errors; /* number of consecutive read errors that * we have tried to ignore. */ - struct timespec last_read_error; /* monotonic time since our + time64_t last_read_error; /* monotonic time since our * last read error */ atomic_t corrected_errors; /* number of corrected read errors, @@ -163,6 +163,11 @@ enum flag_bits { * than other devices in the array */ ClusterRemove, + RemoveSynchronized, /* synchronize_rcu() was called after + * this device was known to be faulty, + * so it is safe to remove without + * another synchronize_rcu() call. + */ }; static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, @@ -204,6 +209,9 @@ struct mddev { #define MD_RELOAD_SB 7 /* Reload the superblock because another node * updated it. */ +#define MD_CLUSTER_RESYNC_LOCKED 8 /* cluster raid only, which means node + * already took resync lock, need to + * release the lock */ int suspended; atomic_t active_io; diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 72ea98e89e57..4974682842ae 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -43,7 +43,8 @@ static int multipath_map (struct mpconf *conf) rcu_read_lock(); for (i = 0; i < disks; i++) { struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); - if (rdev && test_bit(In_sync, &rdev->flags)) { + if (rdev && test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); rcu_read_unlock(); return i; @@ -141,17 +142,19 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) return; } -static void multipath_status (struct seq_file *seq, struct mddev *mddev) +static void multipath_status(struct seq_file *seq, struct mddev *mddev) { struct mpconf *conf = mddev->private; int i; seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); - for (i = 0; i < conf->raid_disks; i++) - seq_printf (seq, "%s", - conf->multipaths[i].rdev && - test_bit(In_sync, &conf->multipaths[i].rdev->flags) ? "U" : "_"); + rcu_read_lock(); + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); + seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); + } + rcu_read_unlock(); seq_printf (seq, "]"); } @@ -295,12 +298,14 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } p->rdev = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - goto abort; + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + p->rdev = rdev; + goto abort; + } } err = md_integrity_register(mddev); } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 4e6da4497553..46168ef2e279 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -319,14 +319,13 @@ static void raid1_end_read_request(struct bio *bio) { int uptodate = !bio->bi_error; struct r1bio *r1_bio = bio->bi_private; - int mirror; struct r1conf *conf = r1_bio->mddev->private; + struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev; - mirror = r1_bio->read_disk; /* * this branch is our 'one mirror IO has finished' event handler: */ - update_head_pos(mirror, r1_bio); + update_head_pos(r1_bio->read_disk, r1_bio); if (uptodate) set_bit(R1BIO_Uptodate, &r1_bio->state); @@ -339,14 +338,14 @@ static void raid1_end_read_request(struct bio *bio) spin_lock_irqsave(&conf->device_lock, flags); if (r1_bio->mddev->degraded == conf->raid_disks || (r1_bio->mddev->degraded == conf->raid_disks-1 && - test_bit(In_sync, &conf->mirrors[mirror].rdev->flags))) + test_bit(In_sync, &rdev->flags))) uptodate = 1; spin_unlock_irqrestore(&conf->device_lock, flags); } if (uptodate) { raid_end_bio_io(r1_bio); - rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); + rdev_dec_pending(rdev, conf->mddev); } else { /* * oops, read error: @@ -356,7 +355,7 @@ static void raid1_end_read_request(struct bio *bio) KERN_ERR "md/raid1:%s: %s: " "rescheduling sector %llu\n", mdname(conf->mddev), - bdevname(conf->mirrors[mirror].rdev->bdev, + bdevname(rdev->bdev, b), (unsigned long long)r1_bio->sector); set_bit(R1BIO_ReadError, &r1_bio->state); @@ -403,20 +402,18 @@ static void r1_bio_write_done(struct r1bio *r1_bio) static void raid1_end_write_request(struct bio *bio) { struct r1bio *r1_bio = bio->bi_private; - int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); + int behind = test_bit(R1BIO_BehindIO, &r1_bio->state); struct r1conf *conf = r1_bio->mddev->private; struct bio *to_put = NULL; - - mirror = find_bio_disk(r1_bio, bio); + int mirror = find_bio_disk(r1_bio, bio); + struct md_rdev *rdev = conf->mirrors[mirror].rdev; /* * 'one mirror IO has finished' event handler: */ if (bio->bi_error) { - set_bit(WriteErrorSeen, - &conf->mirrors[mirror].rdev->flags); - if (!test_and_set_bit(WantReplacement, - &conf->mirrors[mirror].rdev->flags)) + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) set_bit(MD_RECOVERY_NEEDED, & conf->mddev->recovery); @@ -445,13 +442,12 @@ static void raid1_end_write_request(struct bio *bio) * before rdev->recovery_offset, but for simplicity we don't * check this here. */ - if (test_bit(In_sync, &conf->mirrors[mirror].rdev->flags) && - !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)) + if (test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) set_bit(R1BIO_Uptodate, &r1_bio->state); /* Maybe we can clear some bad blocks. */ - if (is_badblock(conf->mirrors[mirror].rdev, - r1_bio->sector, r1_bio->sectors, + if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, &first_bad, &bad_sectors)) { r1_bio->bios[mirror] = IO_MADE_GOOD; set_bit(R1BIO_MadeGood, &r1_bio->state); @@ -459,7 +455,7 @@ static void raid1_end_write_request(struct bio *bio) } if (behind) { - if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) + if (test_bit(WriteMostly, &rdev->flags)) atomic_dec(&r1_bio->behind_remaining); /* @@ -483,8 +479,7 @@ static void raid1_end_write_request(struct bio *bio) } } if (r1_bio->bios[mirror] == NULL) - rdev_dec_pending(conf->mirrors[mirror].rdev, - conf->mddev); + rdev_dec_pending(rdev, conf->mddev); /* * Let's see if all mirrored write operations have finished @@ -689,13 +684,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect if (!rdev) goto retry; atomic_inc(&rdev->nr_pending); - if (test_bit(Faulty, &rdev->flags)) { - /* cannot risk returning a device that failed - * before we inc'ed nr_pending - */ - rdev_dec_pending(rdev, conf->mddev); - goto retry; - } sectors = best_good_sectors; if (conf->mirrors[best_disk].next_seq_sect != this_sector) @@ -1666,13 +1654,16 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } p->rdev = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - goto abort; - } else if (conf->mirrors[conf->raid_disks + number].rdev) { + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + p->rdev = rdev; + goto abort; + } + } + if (conf->mirrors[conf->raid_disks + number].rdev) { /* We just removed a device that is being replaced. * Move down the replacement. We drain all IO before * doing this to avoid confusion. @@ -1719,11 +1710,9 @@ static void end_sync_write(struct bio *bio) struct r1bio *r1_bio = bio->bi_private; struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; - int mirror=0; sector_t first_bad; int bad_sectors; - - mirror = find_bio_disk(r1_bio, bio); + struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev; if (!uptodate) { sector_t sync_blocks = 0; @@ -1736,16 +1725,12 @@ static void end_sync_write(struct bio *bio) s += sync_blocks; sectors_to_go -= sync_blocks; } while (sectors_to_go > 0); - set_bit(WriteErrorSeen, - &conf->mirrors[mirror].rdev->flags); - if (!test_and_set_bit(WantReplacement, - &conf->mirrors[mirror].rdev->flags)) + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) set_bit(MD_RECOVERY_NEEDED, & mddev->recovery); set_bit(R1BIO_WriteError, &r1_bio->state); - } else if (is_badblock(conf->mirrors[mirror].rdev, - r1_bio->sector, - r1_bio->sectors, + } else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, &first_bad, &bad_sectors) && !is_badblock(conf->mirrors[r1_bio->read_disk].rdev, r1_bio->sector, @@ -2072,29 +2057,30 @@ static void fix_read_error(struct r1conf *conf, int read_disk, s = PAGE_SIZE >> 9; do { - /* Note: no rcu protection needed here - * as this is synchronous in the raid1d thread - * which is the thread that might remove - * a device. If raid1d ever becomes multi-threaded.... - */ sector_t first_bad; int bad_sectors; - rdev = conf->mirrors[d].rdev; + rcu_read_lock(); + rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && (test_bit(In_sync, &rdev->flags) || (!test_bit(Faulty, &rdev->flags) && rdev->recovery_offset >= sect + s)) && is_badblock(rdev, sect, s, - &first_bad, &bad_sectors) == 0 && - sync_page_io(rdev, sect, s<<9, + &first_bad, &bad_sectors) == 0) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + if (sync_page_io(rdev, sect, s<<9, conf->tmppage, REQ_OP_READ, 0, false)) - success = 1; - else { - d++; - if (d == conf->raid_disks * 2) - d = 0; - } + success = 1; + rdev_dec_pending(rdev, mddev); + if (success) + break; + } else + rcu_read_unlock(); + d++; + if (d == conf->raid_disks * 2) + d = 0; } while (!success && d != read_disk); if (!success) { @@ -2110,11 +2096,17 @@ static void fix_read_error(struct r1conf *conf, int read_disk, if (d==0) d = conf->raid_disks * 2; d--; - rdev = conf->mirrors[d].rdev; + rcu_read_lock(); + rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && - !test_bit(Faulty, &rdev->flags)) + !test_bit(Faulty, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); r1_sync_page_io(rdev, sect, s, conf->tmppage, WRITE); + rdev_dec_pending(rdev, mddev); + } else + rcu_read_unlock(); } d = start; while (d != read_disk) { @@ -2122,9 +2114,12 @@ static void fix_read_error(struct r1conf *conf, int read_disk, if (d==0) d = conf->raid_disks * 2; d--; - rdev = conf->mirrors[d].rdev; + rcu_read_lock(); + rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); if (r1_sync_page_io(rdev, sect, s, conf->tmppage, READ)) { atomic_add(s, &rdev->corrected_errors); @@ -2133,10 +2128,12 @@ static void fix_read_error(struct r1conf *conf, int read_disk, "(%d sectors at %llu on %s)\n", mdname(mddev), s, (unsigned long long)(sect + - rdev->data_offset), + rdev->data_offset), bdevname(rdev->bdev, b)); } - } + rdev_dec_pending(rdev, mddev); + } else + rcu_read_unlock(); } sectors -= s; sect += s; @@ -2534,6 +2531,13 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, return sync_blocks; } + /* + * If there is non-resync activity waiting for a turn, then let it + * though before starting on this new sync request. + */ + if (conf->nr_waiting) + schedule_timeout_uninterruptible(1); + /* we are incrementing sector_nr below. To be safe, we check against * sector_nr + two times RESYNC_SECTORS */ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 26ae74fd0d01..ed29fc899f06 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -707,7 +707,6 @@ static struct md_rdev *read_balance(struct r10conf *conf, raid10_find_phys(conf, r10_bio); rcu_read_lock(); -retry: sectors = r10_bio->sectors; best_slot = -1; best_rdev = NULL; @@ -804,13 +803,6 @@ retry: if (slot >= 0) { atomic_inc(&rdev->nr_pending); - if (test_bit(Faulty, &rdev->flags)) { - /* Cannot risk returning a device that failed - * before we inc'ed nr_pending - */ - rdev_dec_pending(rdev, conf->mddev); - goto retry; - } r10_bio->read_slot = slot; } else rdev = NULL; @@ -913,7 +905,7 @@ static void raise_barrier(struct r10conf *conf, int force) /* Now wait for all pending IO to complete */ wait_event_lock_irq(conf->wait_barrier, - !conf->nr_pending && conf->barrier < RESYNC_DEPTH, + !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH, conf->resync_lock); spin_unlock_irq(&conf->resync_lock); @@ -944,23 +936,23 @@ static void wait_barrier(struct r10conf *conf) */ wait_event_lock_irq(conf->wait_barrier, !conf->barrier || - (conf->nr_pending && + (atomic_read(&conf->nr_pending) && current->bio_list && !bio_list_empty(current->bio_list)), conf->resync_lock); conf->nr_waiting--; + if (!conf->nr_waiting) + wake_up(&conf->wait_barrier); } - conf->nr_pending++; + atomic_inc(&conf->nr_pending); spin_unlock_irq(&conf->resync_lock); } static void allow_barrier(struct r10conf *conf) { - unsigned long flags; - spin_lock_irqsave(&conf->resync_lock, flags); - conf->nr_pending--; - spin_unlock_irqrestore(&conf->resync_lock, flags); - wake_up(&conf->wait_barrier); + if ((atomic_dec_and_test(&conf->nr_pending)) || + (conf->array_freeze_pending)) + wake_up(&conf->wait_barrier); } static void freeze_array(struct r10conf *conf, int extra) @@ -978,13 +970,15 @@ static void freeze_array(struct r10conf *conf, int extra) * we continue. */ spin_lock_irq(&conf->resync_lock); + conf->array_freeze_pending++; conf->barrier++; conf->nr_waiting++; wait_event_lock_irq_cmd(conf->wait_barrier, - conf->nr_pending == conf->nr_queued+extra, + atomic_read(&conf->nr_pending) == conf->nr_queued+extra, conf->resync_lock, flush_pending_writes(conf)); + conf->array_freeze_pending--; spin_unlock_irq(&conf->resync_lock); } @@ -1499,10 +1493,12 @@ static void raid10_status(struct seq_file *seq, struct mddev *mddev) } seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks, conf->geo.raid_disks - mddev->degraded); - for (i = 0; i < conf->geo.raid_disks; i++) - seq_printf(seq, "%s", - conf->mirrors[i].rdev && - test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); + rcu_read_lock(); + for (i = 0; i < conf->geo.raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); + } + rcu_read_unlock(); seq_printf(seq, "]"); } @@ -1600,7 +1596,7 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) static void print_conf(struct r10conf *conf) { int i; - struct raid10_info *tmp; + struct md_rdev *rdev; printk(KERN_DEBUG "RAID10 conf printout:\n"); if (!conf) { @@ -1610,14 +1606,16 @@ static void print_conf(struct r10conf *conf) printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, conf->geo.raid_disks); + /* This is only called with ->reconfix_mutex held, so + * rcu protection of rdev is not needed */ for (i = 0; i < conf->geo.raid_disks; i++) { char b[BDEVNAME_SIZE]; - tmp = conf->mirrors + i; - if (tmp->rdev) + rdev = conf->mirrors[i].rdev; + if (rdev) printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", - i, !test_bit(In_sync, &tmp->rdev->flags), - !test_bit(Faulty, &tmp->rdev->flags), - bdevname(tmp->rdev->bdev,b)); + i, !test_bit(In_sync, &rdev->flags), + !test_bit(Faulty, &rdev->flags), + bdevname(rdev->bdev,b)); } } @@ -1766,7 +1764,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) err = -EBUSY; goto abort; } - /* Only remove faulty devices if recovery + /* Only remove non-faulty devices if recovery * is not possible. */ if (!test_bit(Faulty, &rdev->flags) && @@ -1778,13 +1776,16 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } *rdevp = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - *rdevp = rdev; - goto abort; - } else if (p->replacement) { + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + *rdevp = rdev; + goto abort; + } + } + if (p->replacement) { /* We must have just cleared 'rdev' */ p->rdev = p->replacement; clear_bit(Replacement, &p->replacement->flags); @@ -2171,21 +2172,20 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) */ static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) { - struct timespec cur_time_mon; + long cur_time_mon; unsigned long hours_since_last; unsigned int read_errors = atomic_read(&rdev->read_errors); - ktime_get_ts(&cur_time_mon); + cur_time_mon = ktime_get_seconds(); - if (rdev->last_read_error.tv_sec == 0 && - rdev->last_read_error.tv_nsec == 0) { + if (rdev->last_read_error == 0) { /* first time we've seen a read error */ rdev->last_read_error = cur_time_mon; return; } - hours_since_last = (cur_time_mon.tv_sec - - rdev->last_read_error.tv_sec) / 3600; + hours_since_last = (long)(cur_time_mon - + rdev->last_read_error) / 3600; rdev->last_read_error = cur_time_mon; @@ -2264,7 +2264,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 printk(KERN_NOTICE "md/raid10:%s: %s: Failing raid device\n", mdname(mddev), b); - md_error(mddev, conf->mirrors[d].rdev); + md_error(mddev, rdev); r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; return; } @@ -2287,6 +2287,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags) && is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, &first_bad, &bad_sectors) == 0) { atomic_inc(&rdev->nr_pending); @@ -2340,6 +2341,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); if (!rdev || + test_bit(Faulty, &rdev->flags) || !test_bit(In_sync, &rdev->flags)) continue; @@ -2379,6 +2381,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); if (!rdev || + test_bit(Faulty, &rdev->flags) || !test_bit(In_sync, &rdev->flags)) continue; @@ -2876,11 +2879,14 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* Completed a full sync so the replacements * are now fully recovered. */ - for (i = 0; i < conf->geo.raid_disks; i++) - if (conf->mirrors[i].replacement) - conf->mirrors[i].replacement - ->recovery_offset - = MaxSector; + rcu_read_lock(); + for (i = 0; i < conf->geo.raid_disks; i++) { + struct md_rdev *rdev = + rcu_dereference(conf->mirrors[i].replacement); + if (rdev) + rdev->recovery_offset = MaxSector; + } + rcu_read_unlock(); } conf->fullsync = 0; } @@ -2911,6 +2917,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, max_sector > (sector_nr | chunk_mask)) max_sector = (sector_nr | chunk_mask) + 1; + /* + * If there is non-resync activity waiting for a turn, then let it + * though before starting on this new sync request. + */ + if (conf->nr_waiting) + schedule_timeout_uninterruptible(1); + /* Again, very different code for resync and recovery. * Both must result in an r10bio with a list of bios that * have bi_end_io, bi_sector, bi_bdev set, @@ -2939,14 +2952,20 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, int must_sync; int any_working; struct raid10_info *mirror = &conf->mirrors[i]; + struct md_rdev *mrdev, *mreplace; - if ((mirror->rdev == NULL || - test_bit(In_sync, &mirror->rdev->flags)) - && - (mirror->replacement == NULL || - test_bit(Faulty, - &mirror->replacement->flags))) + rcu_read_lock(); + mrdev = rcu_dereference(mirror->rdev); + mreplace = rcu_dereference(mirror->replacement); + + if ((mrdev == NULL || + test_bit(Faulty, &mrdev->flags) || + test_bit(In_sync, &mrdev->flags)) && + (mreplace == NULL || + test_bit(Faulty, &mreplace->flags))) { + rcu_read_unlock(); continue; + } still_degraded = 0; /* want to reconstruct this device */ @@ -2956,8 +2975,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* last stripe is not complete - don't * try to recover this sector. */ + rcu_read_unlock(); continue; } + if (mreplace && test_bit(Faulty, &mreplace->flags)) + mreplace = NULL; /* Unless we are doing a full sync, or a replacement * we only need to recover the block if it is set in * the bitmap @@ -2967,14 +2989,19 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (sync_blocks < max_sync) max_sync = sync_blocks; if (!must_sync && - mirror->replacement == NULL && + mreplace == NULL && !conf->fullsync) { /* yep, skip the sync_blocks here, but don't assume * that there will never be anything to do here */ chunks_skipped = -1; + rcu_read_unlock(); continue; } + atomic_inc(&mrdev->nr_pending); + if (mreplace) + atomic_inc(&mreplace->nr_pending); + rcu_read_unlock(); r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); r10_bio->state = 0; @@ -2993,12 +3020,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* Need to check if the array will still be * degraded */ - for (j = 0; j < conf->geo.raid_disks; j++) - if (conf->mirrors[j].rdev == NULL || - test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { + rcu_read_lock(); + for (j = 0; j < conf->geo.raid_disks; j++) { + struct md_rdev *rdev = rcu_dereference( + conf->mirrors[j].rdev); + if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { still_degraded = 1; break; } + } must_sync = bitmap_start_sync(mddev->bitmap, sect, &sync_blocks, still_degraded); @@ -3008,15 +3038,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, int k; int d = r10_bio->devs[j].devnum; sector_t from_addr, to_addr; - struct md_rdev *rdev; + struct md_rdev *rdev = + rcu_dereference(conf->mirrors[d].rdev); sector_t sector, first_bad; int bad_sectors; - if (!conf->mirrors[d].rdev || - !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) + if (!rdev || + !test_bit(In_sync, &rdev->flags)) continue; /* This is where we read from */ any_working = 1; - rdev = conf->mirrors[d].rdev; sector = r10_bio->devs[j].addr; if (is_badblock(rdev, sector, max_sync, @@ -3055,8 +3085,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio->devs[1].devnum = i; r10_bio->devs[1].addr = to_addr; - rdev = mirror->rdev; - if (!test_bit(In_sync, &rdev->flags)) { + if (!test_bit(In_sync, &mrdev->flags)) { bio = r10_bio->devs[1].bio; bio_reset(bio); bio->bi_next = biolist; @@ -3065,8 +3094,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_end_io = end_sync_write; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_iter.bi_sector = to_addr - + rdev->data_offset; - bio->bi_bdev = rdev->bdev; + + mrdev->data_offset; + bio->bi_bdev = mrdev->bdev; atomic_inc(&r10_bio->remaining); } else r10_bio->devs[1].bio->bi_end_io = NULL; @@ -3075,8 +3104,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio = r10_bio->devs[1].repl_bio; if (bio) bio->bi_end_io = NULL; - rdev = mirror->replacement; - /* Note: if rdev != NULL, then bio + /* Note: if mreplace != NULL, then bio * cannot be NULL as r10buf_pool_alloc will * have allocated it. * So the second test here is pointless. @@ -3084,8 +3112,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * this comment keeps human reviewers * happy. */ - if (rdev == NULL || bio == NULL || - test_bit(Faulty, &rdev->flags)) + if (mreplace == NULL || bio == NULL || + test_bit(Faulty, &mreplace->flags)) break; bio_reset(bio); bio->bi_next = biolist; @@ -3094,11 +3122,12 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_end_io = end_sync_write; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_iter.bi_sector = to_addr + - rdev->data_offset; - bio->bi_bdev = rdev->bdev; + mreplace->data_offset; + bio->bi_bdev = mreplace->bdev; atomic_inc(&r10_bio->remaining); break; } + rcu_read_unlock(); if (j == conf->copies) { /* Cannot recover, so abort the recovery or * record a bad block */ @@ -3111,15 +3140,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (r10_bio->devs[k].devnum == i) break; if (!test_bit(In_sync, - &mirror->rdev->flags) + &mrdev->flags) && !rdev_set_badblocks( - mirror->rdev, + mrdev, r10_bio->devs[k].addr, max_sync, 0)) any_working = 0; - if (mirror->replacement && + if (mreplace && !rdev_set_badblocks( - mirror->replacement, + mreplace, r10_bio->devs[k].addr, max_sync, 0)) any_working = 0; @@ -3137,8 +3166,14 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (rb2) atomic_dec(&rb2->remaining); r10_bio = rb2; + rdev_dec_pending(mrdev, mddev); + if (mreplace) + rdev_dec_pending(mreplace, mddev); break; } + rdev_dec_pending(mrdev, mddev); + if (mreplace) + rdev_dec_pending(mreplace, mddev); } if (biolist == NULL) { while (r10_bio) { @@ -3183,6 +3218,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, int d = r10_bio->devs[i].devnum; sector_t first_bad, sector; int bad_sectors; + struct md_rdev *rdev; if (r10_bio->devs[i].repl_bio) r10_bio->devs[i].repl_bio->bi_end_io = NULL; @@ -3190,12 +3226,14 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio = r10_bio->devs[i].bio; bio_reset(bio); bio->bi_error = -EIO; - if (conf->mirrors[d].rdev == NULL || - test_bit(Faulty, &conf->mirrors[d].rdev->flags)) + rcu_read_lock(); + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { + rcu_read_unlock(); continue; + } sector = r10_bio->devs[i].addr; - if (is_badblock(conf->mirrors[d].rdev, - sector, max_sync, + if (is_badblock(rdev, sector, max_sync, &first_bad, &bad_sectors)) { if (first_bad > sector) max_sync = first_bad - sector; @@ -3203,25 +3241,28 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bad_sectors -= (sector - first_bad); if (max_sync > bad_sectors) max_sync = bad_sectors; + rcu_read_unlock(); continue; } } - atomic_inc(&conf->mirrors[d].rdev->nr_pending); + atomic_inc(&rdev->nr_pending); atomic_inc(&r10_bio->remaining); bio->bi_next = biolist; biolist = bio; bio->bi_private = r10_bio; bio->bi_end_io = end_sync_read; bio_set_op_attrs(bio, REQ_OP_READ, 0); - bio->bi_iter.bi_sector = sector + - conf->mirrors[d].rdev->data_offset; - bio->bi_bdev = conf->mirrors[d].rdev->bdev; + bio->bi_iter.bi_sector = sector + rdev->data_offset; + bio->bi_bdev = rdev->bdev; count++; - if (conf->mirrors[d].replacement == NULL || - test_bit(Faulty, - &conf->mirrors[d].replacement->flags)) + rdev = rcu_dereference(conf->mirrors[d].replacement); + if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { + rcu_read_unlock(); continue; + } + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); /* Need to set up for writing to the replacement */ bio = r10_bio->devs[i].repl_bio; @@ -3229,15 +3270,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_error = -EIO; sector = r10_bio->devs[i].addr; - atomic_inc(&conf->mirrors[d].rdev->nr_pending); bio->bi_next = biolist; biolist = bio; bio->bi_private = r10_bio; bio->bi_end_io = end_sync_write; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - bio->bi_iter.bi_sector = sector + - conf->mirrors[d].replacement->data_offset; - bio->bi_bdev = conf->mirrors[d].replacement->bdev; + bio->bi_iter.bi_sector = sector + rdev->data_offset; + bio->bi_bdev = rdev->bdev; count++; } @@ -3504,6 +3543,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) spin_lock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); + atomic_set(&conf->nr_pending, 0); conf->thread = md_register_thread(raid10d, mddev, "raid10"); if (!conf->thread) @@ -4333,15 +4373,16 @@ read_more: blist = read_bio; read_bio->bi_next = NULL; + rcu_read_lock(); for (s = 0; s < conf->copies*2; s++) { struct bio *b; int d = r10_bio->devs[s/2].devnum; struct md_rdev *rdev2; if (s&1) { - rdev2 = conf->mirrors[d].replacement; + rdev2 = rcu_dereference(conf->mirrors[d].replacement); b = r10_bio->devs[s/2].repl_bio; } else { - rdev2 = conf->mirrors[d].rdev; + rdev2 = rcu_dereference(conf->mirrors[d].rdev); b = r10_bio->devs[s/2].bio; } if (!rdev2 || test_bit(Faulty, &rdev2->flags)) @@ -4386,6 +4427,7 @@ read_more: nr_sectors += len >> 9; } bio_full: + rcu_read_unlock(); r10_bio->sectors = nr_sectors; /* Now submit the read */ @@ -4437,16 +4479,20 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) struct bio *b; int d = r10_bio->devs[s/2].devnum; struct md_rdev *rdev; + rcu_read_lock(); if (s&1) { - rdev = conf->mirrors[d].replacement; + rdev = rcu_dereference(conf->mirrors[d].replacement); b = r10_bio->devs[s/2].repl_bio; } else { - rdev = conf->mirrors[d].rdev; + rdev = rcu_dereference(conf->mirrors[d].rdev); b = r10_bio->devs[s/2].bio; } - if (!rdev || test_bit(Faulty, &rdev->flags)) + if (!rdev || test_bit(Faulty, &rdev->flags)) { + rcu_read_unlock(); continue; + } atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); md_sync_acct(b->bi_bdev, r10_bio->sectors); atomic_inc(&r10_bio->remaining); b->bi_next = NULL; @@ -4507,9 +4553,10 @@ static int handle_reshape_read_error(struct mddev *mddev, if (s > (PAGE_SIZE >> 9)) s = PAGE_SIZE >> 9; + rcu_read_lock(); while (!success) { int d = r10b->devs[slot].devnum; - struct md_rdev *rdev = conf->mirrors[d].rdev; + struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); sector_t addr; if (rdev == NULL || test_bit(Faulty, &rdev->flags) || @@ -4517,11 +4564,15 @@ static int handle_reshape_read_error(struct mddev *mddev, goto failed; addr = r10b->devs[slot].addr + idx * PAGE_SIZE; + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); success = sync_page_io(rdev, addr, s << 9, bvec[idx].bv_page, REQ_OP_READ, 0, false); + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); if (success) break; failed: @@ -4531,6 +4582,7 @@ static int handle_reshape_read_error(struct mddev *mddev, if (slot == first_slot) break; } + rcu_read_unlock(); if (!success) { /* couldn't read this block, must give up */ set_bit(MD_RECOVERY_INTR, @@ -4600,16 +4652,18 @@ static void raid10_finish_reshape(struct mddev *mddev) } } else { int d; + rcu_read_lock(); for (d = conf->geo.raid_disks ; d < conf->geo.raid_disks - mddev->delta_disks; d++) { - struct md_rdev *rdev = conf->mirrors[d].rdev; + struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev) clear_bit(In_sync, &rdev->flags); - rdev = conf->mirrors[d].replacement; + rdev = rcu_dereference(conf->mirrors[d].replacement); if (rdev) clear_bit(In_sync, &rdev->flags); } + rcu_read_unlock(); } mddev->layout = mddev->new_layout; mddev->chunk_sectors = 1 << conf->geo.chunk_shift; diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 6fc2c75759bf..18ec1f7a98bf 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -64,10 +64,11 @@ struct r10conf { int pending_count; spinlock_t resync_lock; - int nr_pending; + atomic_t nr_pending; int nr_waiting; int nr_queued; int barrier; + int array_freeze_pending; sector_t next_resync; int fullsync; /* set to 1 if a full sync is needed, * (fresh device added). diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6953d78297b0..d189e894b921 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3080,7 +3080,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct md_rdev *rdev; rcu_read_lock(); rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && test_bit(In_sync, &rdev->flags)) + if (rdev && test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) atomic_inc(&rdev->nr_pending); else rdev = NULL; @@ -3210,15 +3211,16 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, /* During recovery devices cannot be removed, so * locking and refcounting of rdevs is not needed */ + rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = conf->disks[i].rdev; + struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && !rdev_set_badblocks(rdev, sh->sector, STRIPE_SECTORS, 0)) abort = 1; - rdev = conf->disks[i].replacement; + rdev = rcu_dereference(conf->disks[i].replacement); if (rdev && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) @@ -3226,6 +3228,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, STRIPE_SECTORS, 0)) abort = 1; } + rcu_read_unlock(); if (abort) conf->recovery_disabled = conf->mddev->recovery_disabled; @@ -3237,15 +3240,16 @@ static int want_replace(struct stripe_head *sh, int disk_idx) { struct md_rdev *rdev; int rv = 0; - /* Doing recovery so rcu locking not required */ - rdev = sh->raid_conf->disks[disk_idx].replacement; + + rcu_read_lock(); + rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement); if (rdev && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && (rdev->recovery_offset <= sh->sector || rdev->mddev->recovery_cp <= sh->sector)) rv = 1; - + rcu_read_unlock(); return rv; } @@ -3600,7 +3604,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, pr_debug("for sector %llu, rmw=%d rcw=%d\n", (unsigned long long)sh->sector, rmw, rcw); set_bit(STRIPE_HANDLE, &sh->state); - if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) { + if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { /* prefer read-modify-write, but need to get some data */ if (conf->mddev->queue) blk_add_trace_msg(conf->mddev->queue, @@ -3627,7 +3631,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, } } } - if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) { + if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) { /* want reconstruct write, but need to get some data */ int qread =0; rcw = 0; @@ -7066,10 +7070,12 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev) seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, conf->chunk_sectors / 2, mddev->layout); seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); - for (i = 0; i < conf->raid_disks; i++) - seq_printf (seq, "%s", - conf->disks[i].rdev && - test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); + rcu_read_lock(); + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); + seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); + } + rcu_read_unlock(); seq_printf (seq, "]"); } @@ -7191,12 +7197,15 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } *rdevp = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - *rdevp = rdev; - } else if (p->replacement) { + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + *rdevp = rdev; + } + } + if (p->replacement) { /* We must have just cleared 'rdev' */ p->rdev = p->replacement; clear_bit(Replacement, &p->replacement->flags); |