diff options
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r-- | drivers/md/md.c | 522 |
1 files changed, 380 insertions, 142 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index 4f5ecbe94ccb..807095f4c793 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -250,7 +250,7 @@ static DEFINE_SPINLOCK(all_mddevs_lock); * call has finished, the bio has been linked into some internal structure * and so is visible to ->quiesce(), so we don't need the refcount any more. */ -static void md_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) { const int rw = bio_data_dir(bio); struct mddev *mddev = q->queuedata; @@ -262,13 +262,13 @@ static void md_make_request(struct request_queue *q, struct bio *bio) if (mddev == NULL || mddev->pers == NULL || !mddev->ready) { bio_io_error(bio); - return; + return BLK_QC_T_NONE; } if (mddev->ro == 1 && unlikely(rw == WRITE)) { if (bio_sectors(bio) != 0) bio->bi_error = -EROFS; bio_endio(bio); - return; + return BLK_QC_T_NONE; } smp_rmb(); /* Ensure implications of 'active' are visible */ rcu_read_lock(); @@ -302,6 +302,8 @@ static void md_make_request(struct request_queue *q, struct bio *bio) if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) wake_up(&mddev->sb_wait); + + return BLK_QC_T_NONE; } /* mddev_suspend makes sure no new requests are submitted @@ -1608,7 +1610,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) ++ev1; if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && - le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe) + (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) if (ev1 < mddev->events) return -EINVAL; } else if (mddev->bitmap) { @@ -1628,16 +1631,29 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) int role; if (rdev->desc_nr < 0 || rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { - role = 0xffff; + role = MD_DISK_ROLE_SPARE; rdev->desc_nr = -1; } else role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); switch(role) { - case 0xffff: /* spare */ + case MD_DISK_ROLE_SPARE: /* spare */ break; - case 0xfffe: /* faulty */ + case MD_DISK_ROLE_FAULTY: /* faulty */ set_bit(Faulty, &rdev->flags); break; + case MD_DISK_ROLE_JOURNAL: /* journal device */ + if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { + /* journal device without journal feature */ + printk(KERN_WARNING + "md: journal device provided without journal feature, ignoring the device\n"); + return -EINVAL; + } + set_bit(Journal, &rdev->flags); + rdev->journal_tail = le64_to_cpu(sb->journal_tail); + if (mddev->recovery_cp == MaxSector) + set_bit(MD_JOURNAL_CLEAN, &mddev->flags); + rdev->raid_disk = mddev->raid_disks; + break; default: rdev->saved_raid_disk = role; if ((le32_to_cpu(sb->feature_map) & @@ -1655,6 +1671,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) set_bit(WriteMostly, &rdev->flags); if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) set_bit(Replacement, &rdev->flags); + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) + set_bit(MD_HAS_JOURNAL, &mddev->flags); } else /* MULTIPATH are always insync */ set_bit(In_sync, &rdev->flags); @@ -1679,6 +1697,8 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->events = cpu_to_le64(mddev->events); if (mddev->in_sync) sb->resync_offset = cpu_to_le64(mddev->recovery_cp); + else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) + sb->resync_offset = cpu_to_le64(MaxSector); else sb->resync_offset = cpu_to_le64(0); @@ -1702,7 +1722,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } - if (rdev->raid_disk >= 0 && + if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags)) { sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); @@ -1712,6 +1732,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); } + /* Note: recovery_offset and journal_tail share space */ + if (test_bit(Journal, &rdev->flags)) + sb->journal_tail = cpu_to_le64(rdev->journal_tail); if (test_bit(Replacement, &rdev->flags)) sb->feature_map |= cpu_to_le32(MD_FEATURE_REPLACEMENT); @@ -1735,6 +1758,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) } } + if (mddev_is_clustered(mddev)) + sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); + if (rdev->badblocks.count == 0) /* Nothing to do for bad blocks*/ ; else if (sb->bblog_offset == 0) @@ -1785,18 +1811,23 @@ retry: max_dev = le32_to_cpu(sb->max_dev); for (i=0; i<max_dev;i++) - sb->dev_roles[i] = cpu_to_le16(0xfffe); + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); + + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) + sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); rdev_for_each(rdev2, mddev) { i = rdev2->desc_nr; if (test_bit(Faulty, &rdev2->flags)) - sb->dev_roles[i] = cpu_to_le16(0xfffe); + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); else if (test_bit(In_sync, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); + else if (test_bit(Journal, &rdev2->flags)) + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); else if (rdev2->raid_disk >= 0) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else - sb->dev_roles[i] = cpu_to_le16(0xffff); + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); } sb->sb_csum = calc_sb_1_csum(sb); @@ -1912,13 +1943,23 @@ static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) struct md_rdev *rdev, *rdev2; rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev1) - rdev_for_each_rcu(rdev2, mddev2) + rdev_for_each_rcu(rdev, mddev1) { + if (test_bit(Faulty, &rdev->flags) || + test_bit(Journal, &rdev->flags) || + rdev->raid_disk == -1) + continue; + rdev_for_each_rcu(rdev2, mddev2) { + if (test_bit(Faulty, &rdev2->flags) || + test_bit(Journal, &rdev2->flags) || + rdev2->raid_disk == -1) + continue; if (rdev->bdev->bd_contains == rdev2->bdev->bd_contains) { rcu_read_unlock(); return 1; } + } + } rcu_read_unlock(); return 0; } @@ -1962,12 +2003,9 @@ int md_integrity_register(struct mddev *mddev) * All component devices are integrity capable and have matching * profiles, register the common profile for the md device. */ - if (blk_integrity_register(mddev->gendisk, - bdev_get_integrity(reference->bdev)) != 0) { - printk(KERN_ERR "md: failed to register integrity for %s\n", - mdname(mddev)); - return -EINVAL; - } + blk_integrity_register(mddev->gendisk, + bdev_get_integrity(reference->bdev)); + printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev)); if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { printk(KERN_ERR "md: failed to create integrity pool for %s\n", @@ -1997,6 +2035,7 @@ void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) if (bi_rdev && blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) >= 0) return; + WARN_ON_ONCE(!mddev->suspended); printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); blk_integrity_unregister(mddev->gendisk); } @@ -2196,23 +2235,77 @@ static void sync_sbs(struct mddev *mddev, int nospares) } } +static bool does_sb_need_changing(struct mddev *mddev) +{ + struct md_rdev *rdev; + struct mdp_superblock_1 *sb; + int role; + + /* Find a good rdev */ + rdev_for_each(rdev, mddev) + if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags)) + break; + + /* No good device found. */ + if (!rdev) + return false; + + sb = page_address(rdev->sb_page); + /* Check if a device has become faulty or a spare become active */ + rdev_for_each(rdev, mddev) { + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + /* Device activated? */ + if (role == 0xffff && rdev->raid_disk >=0 && + !test_bit(Faulty, &rdev->flags)) + return true; + /* Device turned faulty? */ + if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd)) + return true; + } + + /* Check if any mddev parameters have changed */ + if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || + (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || + (mddev->layout != le64_to_cpu(sb->layout)) || + (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || + (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) + return true; + + return false; +} + void md_update_sb(struct mddev *mddev, int force_change) { struct md_rdev *rdev; int sync_req; int nospares = 0; int any_badblocks_changed = 0; + int ret = -1; if (mddev->ro) { if (force_change) set_bit(MD_CHANGE_DEVS, &mddev->flags); return; } + + if (mddev_is_clustered(mddev)) { + if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) + force_change = 1; + ret = md_cluster_ops->metadata_update_start(mddev); + /* Has someone else has updated the sb */ + if (!does_sb_need_changing(mddev)) { + if (ret == 0) + md_cluster_ops->metadata_update_cancel(mddev); + clear_bit(MD_CHANGE_PENDING, &mddev->flags); + return; + } + } repeat: /* First make sure individual recovery_offsets are correct */ rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && + !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && mddev->curr_resync_completed > rdev->recovery_offset) rdev->recovery_offset = mddev->curr_resync_completed; @@ -2356,6 +2449,9 @@ repeat: clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); } + + if (mddev_is_clustered(mddev) && ret == 0) + md_cluster_ops->metadata_update_finish(mddev); } EXPORT_SYMBOL(md_update_sb); @@ -2431,6 +2527,10 @@ state_show(struct md_rdev *rdev, char *page) len += sprintf(page+len, "%sin_sync",sep); sep = ","; } + if (test_bit(Journal, &flags)) { + len += sprintf(page+len, "%sjournal",sep); + sep = ","; + } if (test_bit(WriteMostly, &flags)) { len += sprintf(page+len, "%swrite_mostly",sep); sep = ","; @@ -2442,6 +2542,7 @@ state_show(struct md_rdev *rdev, char *page) sep = ","; } if (!test_bit(Faulty, &flags) && + !test_bit(Journal, &flags) && !test_bit(In_sync, &flags)) { len += sprintf(page+len, "%sspare", sep); sep = ","; @@ -2490,17 +2591,16 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) err = -EBUSY; else { struct mddev *mddev = rdev->mddev; - if (mddev_is_clustered(mddev)) - md_cluster_ops->remove_disk(mddev, rdev); - md_kick_rdev_from_array(rdev); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); - if (mddev->pers) - md_update_sb(mddev, 1); - md_new_event(mddev); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); err = 0; + if (mddev_is_clustered(mddev)) + err = md_cluster_ops->remove_disk(mddev, rdev); + + if (err == 0) { + md_kick_rdev_from_array(rdev); + if (mddev->pers) + md_update_sb(mddev, 1); + md_new_event(mddev); + } } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); @@ -2529,7 +2629,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { set_bit(In_sync, &rdev->flags); err = 0; - } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) { + } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags)) { if (rdev->mddev->pers == NULL) { clear_bit(In_sync, &rdev->flags); rdev->saved_raid_disk = rdev->raid_disk; @@ -2548,6 +2649,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) * check if recovery is needed. */ if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && !test_bit(Replacement, &rdev->flags)) set_bit(WantReplacement, &rdev->flags); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); @@ -2625,7 +2727,9 @@ __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); static ssize_t slot_show(struct md_rdev *rdev, char *page) { - if (rdev->raid_disk < 0) + if (test_bit(Journal, &rdev->flags)) + return sprintf(page, "journal\n"); + else if (rdev->raid_disk < 0) return sprintf(page, "none\n"); else return sprintf(page, "%d\n", rdev->raid_disk); @@ -2637,6 +2741,8 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) int slot; int err; + if (test_bit(Journal, &rdev->flags)) + return -EBUSY; if (strncmp(buf, "none", 4)==0) slot = -1; else { @@ -2688,15 +2794,9 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) rdev->saved_raid_disk = -1; clear_bit(In_sync, &rdev->flags); clear_bit(Bitmap_sync, &rdev->flags); - err = rdev->mddev->pers-> - hot_add_disk(rdev->mddev, rdev); - if (err) { - rdev->raid_disk = -1; - return err; - } else - sysfs_notify_dirent_safe(rdev->sysfs_state); - if (sysfs_link_rdev(rdev->mddev, rdev)) - /* failure here is OK */; + remove_and_add_spares(rdev->mddev, rdev); + if (rdev->raid_disk == -1) + return -EBUSY; /* don't wakeup anyone, leave that to userspace. */ } else { if (slot >= rdev->mddev->raid_disks && @@ -2841,6 +2941,8 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) sector_t oldsectors = rdev->sectors; sector_t sectors; + if (test_bit(Journal, &rdev->flags)) + return -EBUSY; if (strict_blocks_to_sectors(buf, §ors) < 0) return -EINVAL; if (rdev->data_offset != rdev->new_data_offset) @@ -3198,20 +3300,14 @@ static void analyze_sbs(struct mddev *mddev) md_kick_rdev_from_array(rdev); continue; } - /* No device should have a Candidate flag - * when reading devices - */ - if (test_bit(Candidate, &rdev->flags)) { - pr_info("md: kicking Cluster Candidate %s from array!\n", - bdevname(rdev->bdev, b)); - md_kick_rdev_from_array(rdev); - } } if (mddev->level == LEVEL_MULTIPATH) { rdev->desc_nr = i++; rdev->raid_disk = rdev->desc_nr; set_bit(In_sync, &rdev->flags); - } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) { + } else if (rdev->raid_disk >= + (mddev->raid_disks - min(0, mddev->delta_disks)) && + !test_bit(Journal, &rdev->flags)) { rdev->raid_disk = -1; clear_bit(In_sync, &rdev->flags); } @@ -3269,6 +3365,11 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) { unsigned long msec; + if (mddev_is_clustered(mddev)) { + pr_info("md: Safemode is disabled for clustered mode\n"); + return -EINVAL; + } + if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) return -EINVAL; if (msec == 0) @@ -3869,7 +3970,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) break; case clean: if (mddev->pers) { - restart_array(mddev); + err = restart_array(mddev); + if (err) + break; spin_lock(&mddev->lock); if (atomic_read(&mddev->writes_pending) == 0) { if (mddev->in_sync == 0) { @@ -3887,7 +3990,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) break; case active: if (mddev->pers) { - restart_array(mddev); + err = restart_array(mddev); + if (err) + break; clear_bit(MD_CHANGE_PENDING, &mddev->flags); wake_up(&mddev->sb_wait); err = 0; @@ -4066,12 +4171,8 @@ size_store(struct mddev *mddev, const char *buf, size_t len) if (err) return err; if (mddev->pers) { - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); err = update_size(mddev, sectors); md_update_sb(mddev, 1); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); } else { if (mddev->dev_sectors == 0 || mddev->dev_sectors > sectors) @@ -5183,7 +5284,10 @@ int md_run(struct mddev *mddev) atomic_set(&mddev->max_corr_read_errors, MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); mddev->safemode = 0; - mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ + if (mddev_is_clustered(mddev)) + mddev->safemode_delay = 0; + else + mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->in_sync = 1; smp_wmb(); spin_lock(&mddev->lock); @@ -5226,6 +5330,9 @@ static int do_md_run(struct mddev *mddev) goto out; } + if (mddev_is_clustered(mddev)) + md_allow_write(mddev); + md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ @@ -5248,6 +5355,25 @@ static int restart_array(struct mddev *mddev) return -EINVAL; if (!mddev->ro) return -EBUSY; + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + struct md_rdev *rdev; + bool has_journal = false; + + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + if (test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) { + has_journal = true; + break; + } + } + rcu_read_unlock(); + + /* Don't restart rw with journal missing/faulty */ + if (!has_journal) + return -EINVAL; + } + mddev->safemode = 0; mddev->ro = 0; set_disk_ro(disk, 0); @@ -5309,8 +5435,6 @@ static void md_clean(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev) { - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); flush_workqueue(md_misc_wq); if (mddev->sync_thread) { @@ -5324,13 +5448,13 @@ static void __md_stop_writes(struct mddev *mddev) md_super_wait(mddev); if (mddev->ro == 0 && - (!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) { + ((!mddev->in_sync && !mddev_is_clustered(mddev)) || + (mddev->flags & MD_UPDATE_SB_FLAGS))) { /* mark array as shutdown cleanly */ - mddev->in_sync = 1; + if (!mddev_is_clustered(mddev)) + mddev->in_sync = 1; md_update_sb(mddev, 1); } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); } void md_stop_writes(struct mddev *mddev) @@ -5409,9 +5533,13 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) * which will now never happen */ wake_up_process(mddev->sync_thread->tsk); + if (mddev->external && test_bit(MD_CHANGE_PENDING, &mddev->flags)) + return -EBUSY; mddev_unlock(mddev); wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); + wait_event(mddev->sb_wait, + !test_bit(MD_CHANGE_PENDING, &mddev->flags)); mddev_lock_nointr(mddev); mutex_lock(&mddev->open_mutex); @@ -5538,7 +5666,6 @@ static int do_md_stop(struct mddev *mddev, int mode, if (mddev->hold_active == UNTIL_STOP) mddev->hold_active = 0; } - blk_integrity_unregister(disk); md_new_event(mddev); sysfs_notify_dirent_safe(mddev->sysfs_state); return 0; @@ -5788,6 +5915,8 @@ static int get_disk_info(struct mddev *mddev, void __user * arg) info.state |= (1<<MD_DISK_ACTIVE); info.state |= (1<<MD_DISK_SYNC); } + if (test_bit(Journal, &rdev->flags)) + info.state |= (1<<MD_DISK_JOURNAL); if (test_bit(WriteMostly, &rdev->flags)) info.state |= (1<<MD_DISK_WRITEMOSTLY); } else { @@ -5902,23 +6031,18 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) else clear_bit(WriteMostly, &rdev->flags); + if (info->state & (1<<MD_DISK_JOURNAL)) + set_bit(Journal, &rdev->flags); /* * check whether the device shows up in other nodes */ if (mddev_is_clustered(mddev)) { - if (info->state & (1 << MD_DISK_CANDIDATE)) { - /* Through --cluster-confirm */ + if (info->state & (1 << MD_DISK_CANDIDATE)) set_bit(Candidate, &rdev->flags); - err = md_cluster_ops->new_disk_ack(mddev, true); - if (err) { - export_rdev(rdev); - return err; - } - } else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { + else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { /* --add initiated by this node */ - err = md_cluster_ops->add_new_disk_start(mddev, rdev); + err = md_cluster_ops->add_new_disk(mddev, rdev); if (err) { - md_cluster_ops->add_new_disk_finish(mddev); export_rdev(rdev); return err; } @@ -5927,13 +6051,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); + if (err) export_rdev(rdev); - else + + if (mddev_is_clustered(mddev)) { + if (info->state & (1 << MD_DISK_CANDIDATE)) + md_cluster_ops->new_disk_ack(mddev, (err == 0)); + else { + if (err) + md_cluster_ops->add_new_disk_cancel(mddev); + else + err = add_bound_rdev(rdev); + } + + } else if (!err) err = add_bound_rdev(rdev); - if (mddev_is_clustered(mddev) && - (info->state & (1 << MD_DISK_CLUSTER_ADD))) - md_cluster_ops->add_new_disk_finish(mddev); + return err; } @@ -5989,13 +6123,17 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) { char b[BDEVNAME_SIZE]; struct md_rdev *rdev; + int ret = -1; rdev = find_rdev(mddev, dev); if (!rdev) return -ENXIO; if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); + ret = md_cluster_ops->metadata_update_start(mddev); + + if (rdev->raid_disk < 0) + goto kick_rdev; clear_bit(Blocked, &rdev->flags); remove_and_add_spares(mddev, rdev); @@ -6003,20 +6141,19 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) if (rdev->raid_disk >= 0) goto busy; - if (mddev_is_clustered(mddev)) +kick_rdev: + if (mddev_is_clustered(mddev) && ret == 0) md_cluster_ops->remove_disk(mddev, rdev); md_kick_rdev_from_array(rdev); md_update_sb(mddev, 1); md_new_event(mddev); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); - return 0; busy: - if (mddev_is_clustered(mddev)) + if (mddev_is_clustered(mddev) && ret == 0) md_cluster_ops->metadata_update_cancel(mddev); + printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", bdevname(rdev->bdev,b), mdname(mddev)); return -EBUSY; @@ -6067,14 +6204,12 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) goto abort_export; } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); clear_bit(In_sync, &rdev->flags); rdev->desc_nr = -1; rdev->saved_raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); if (err) - goto abort_clustered; + goto abort_export; /* * The rest should better be atomic, we can have disk failures @@ -6084,9 +6219,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) rdev->raid_disk = -1; md_update_sb(mddev, 1); - - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); /* * Kick recovery, maybe this spare has to be added to the * array immediately. @@ -6096,9 +6228,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) md_new_event(mddev); return 0; -abort_clustered: - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_cancel(mddev); abort_export: export_rdev(rdev); return err; @@ -6416,8 +6545,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) return rv; } } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) rv = update_size(mddev, (sector_t)info->size * 2); @@ -6475,12 +6602,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) } } md_update_sb(mddev, 1); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); return rv; err: - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_cancel(mddev); return rv; } @@ -7281,6 +7404,8 @@ static int md_seq_show(struct seq_file *seq, void *v) bdevname(rdev->bdev,b), rdev->desc_nr); if (test_bit(WriteMostly, &rdev->flags)) seq_printf(seq, "(W)"); + if (test_bit(Journal, &rdev->flags)) + seq_printf(seq, "(J)"); if (test_bit(Faulty, &rdev->flags)) { seq_printf(seq, "(F)"); continue; @@ -7593,11 +7718,7 @@ int md_allow_write(struct mddev *mddev) mddev->safemode == 0) mddev->safemode = 1; spin_unlock(&mddev->lock); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); md_update_sb(mddev, 0); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); sysfs_notify_dirent_safe(mddev->sysfs_state); } else spin_unlock(&mddev->lock); @@ -7629,6 +7750,7 @@ void md_do_sync(struct md_thread *thread) struct md_rdev *rdev; char *desc, *action = NULL; struct blk_plug plug; + bool cluster_resync_finished = false; /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) @@ -7738,6 +7860,7 @@ void md_do_sync(struct md_thread *thread) rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < j) @@ -7798,9 +7921,6 @@ void md_do_sync(struct md_thread *thread) md_new_event(mddev); update_time = jiffies; - if (mddev_is_clustered(mddev)) - md_cluster_ops->resync_start(mddev, j, max_sectors); - blk_start_plug(&plug); while (j < max_sectors) { sector_t sectors; @@ -7864,8 +7984,6 @@ void md_do_sync(struct md_thread *thread) j = max_sectors; if (j > 2) mddev->curr_resync = j; - if (mddev_is_clustered(mddev)) - md_cluster_ops->resync_info_update(mddev, j, max_sectors); mddev->curr_mark_cnt = io_sectors; if (last_check == 0) /* this is the earliest that rebuild will be @@ -7936,7 +8054,11 @@ void md_do_sync(struct md_thread *thread) mddev->curr_resync_completed = mddev->curr_resync; sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } - /* tell personality that we are finished */ + /* tell personality and other nodes that we are finished */ + if (mddev_is_clustered(mddev)) { + md_cluster_ops->resync_finish(mddev); + cluster_resync_finished = true; + } mddev->pers->sync_request(mddev, max_sectors, &skipped); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && @@ -7964,6 +8086,7 @@ void md_do_sync(struct md_thread *thread) rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && + !test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < mddev->curr_resync) @@ -7972,11 +8095,13 @@ void md_do_sync(struct md_thread *thread) } } skip: - if (mddev_is_clustered(mddev)) - md_cluster_ops->resync_finish(mddev); - set_bit(MD_CHANGE_DEVS, &mddev->flags); + if (mddev_is_clustered(mddev) && + test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + !cluster_resync_finished) + md_cluster_ops->resync_finish(mddev); + spin_lock(&mddev->lock); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* We completed so min/max setting can be forgotten if used. */ @@ -8007,7 +8132,8 @@ static int remove_and_add_spares(struct mddev *mddev, rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && (test_bit(Faulty, &rdev->flags) || - ! test_bit(In_sync, &rdev->flags)) && + (!test_bit(In_sync, &rdev->flags) && + !test_bit(Journal, &rdev->flags))) && atomic_read(&rdev->nr_pending)==0) { if (mddev->pers->hot_remove_disk( mddev, rdev) == 0) { @@ -8019,25 +8145,31 @@ static int remove_and_add_spares(struct mddev *mddev, if (removed && mddev->kobj.sd) sysfs_notify(&mddev->kobj, NULL, "degraded"); - if (this) + if (this && removed) goto no_add; rdev_for_each(rdev, mddev) { + if (this && this != rdev) + continue; + if (test_bit(Candidate, &rdev->flags)) + continue; if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && + !test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) spares++; if (rdev->raid_disk >= 0) continue; if (test_bit(Faulty, &rdev->flags)) continue; + if (test_bit(Journal, &rdev->flags)) + continue; if (mddev->ro && ! (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags))) continue; - if (rdev->saved_raid_disk < 0) - rdev->recovery_offset = 0; + rdev->recovery_offset = 0; if (mddev->pers-> hot_add_disk(mddev, rdev) == 0) { if (sysfs_link_rdev(mddev, rdev)) @@ -8056,14 +8188,25 @@ no_add: static void md_start_sync(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); + int ret = 0; + + if (mddev_is_clustered(mddev)) { + ret = md_cluster_ops->resync_start(mddev); + if (ret) { + mddev->sync_thread = NULL; + goto out; + } + } mddev->sync_thread = md_register_thread(md_do_sync, mddev, "resync"); +out: if (!mddev->sync_thread) { - printk(KERN_ERR "%s: could not start resync" - " thread...\n", - mdname(mddev)); + if (!(mddev_is_clustered(mddev) && ret == -EAGAIN)) + printk(KERN_ERR "%s: could not start resync" + " thread...\n", + mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); @@ -8160,6 +8303,7 @@ void md_check_recovery(struct mddev *mddev) md_reap_sync_thread(mddev); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + clear_bit(MD_CHANGE_PENDING, &mddev->flags); goto unlock; } @@ -8181,13 +8325,8 @@ void md_check_recovery(struct mddev *mddev) sysfs_notify_dirent_safe(mddev->sysfs_state); } - if (mddev->flags & MD_UPDATE_SB_FLAGS) { - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); + if (mddev->flags & MD_UPDATE_SB_FLAGS) md_update_sb(mddev, 0); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); - } if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { @@ -8285,8 +8424,6 @@ void md_reap_sync_thread(struct mddev *mddev) set_bit(MD_CHANGE_DEVS, &mddev->flags); } } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && mddev->pers->finish_reshape) mddev->pers->finish_reshape(mddev); @@ -8299,8 +8436,6 @@ void md_reap_sync_thread(struct mddev *mddev) rdev->saved_raid_disk = -1; md_update_sb(mddev, 1); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); @@ -8923,25 +9058,128 @@ err_wq: return ret; } -void md_reload_sb(struct mddev *mddev) +static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) { - struct md_rdev *rdev, *tmp; + struct mdp_superblock_1 *sb = page_address(rdev->sb_page); + struct md_rdev *rdev2; + int role, ret; + char b[BDEVNAME_SIZE]; - rdev_for_each_safe(rdev, tmp, mddev) { - rdev->sb_loaded = 0; - ClearPageUptodate(rdev->sb_page); + /* Check for change of roles in the active devices */ + rdev_for_each(rdev2, mddev) { + if (test_bit(Faulty, &rdev2->flags)) + continue; + + /* Check if the roles changed */ + role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); + + if (test_bit(Candidate, &rdev2->flags)) { + if (role == 0xfffe) { + pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b)); + md_kick_rdev_from_array(rdev2); + continue; + } + else + clear_bit(Candidate, &rdev2->flags); + } + + if (role != rdev2->raid_disk) { + /* got activated */ + if (rdev2->raid_disk == -1 && role != 0xffff) { + rdev2->saved_raid_disk = role; + ret = remove_and_add_spares(mddev, rdev2); + pr_info("Activated spare: %s\n", + bdevname(rdev2->bdev,b)); + continue; + } + /* device faulty + * We just want to do the minimum to mark the disk + * as faulty. The recovery is performed by the + * one who initiated the error. + */ + if ((role == 0xfffe) || (role == 0xfffd)) { + md_error(mddev, rdev2); + clear_bit(Blocked, &rdev2->flags); + } + } } - mddev->raid_disks = 0; - analyze_sbs(mddev); - rdev_for_each_safe(rdev, tmp, mddev) { - struct mdp_superblock_1 *sb = page_address(rdev->sb_page); - /* since we don't write to faulty devices, we figure out if the - * disk is faulty by comparing events - */ - if (mddev->events > sb->events) - set_bit(Faulty, &rdev->flags); + + if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) + update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); + + /* Finally set the event to be up to date */ + mddev->events = le64_to_cpu(sb->events); +} + +static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) +{ + int err; + struct page *swapout = rdev->sb_page; + struct mdp_superblock_1 *sb; + + /* Store the sb page of the rdev in the swapout temporary + * variable in case we err in the future + */ + rdev->sb_page = NULL; + alloc_disk_sb(rdev); + ClearPageUptodate(rdev->sb_page); + rdev->sb_loaded = 0; + err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version); + + if (err < 0) { + pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", + __func__, __LINE__, rdev->desc_nr, err); + put_page(rdev->sb_page); + rdev->sb_page = swapout; + rdev->sb_loaded = 1; + return err; + } + + sb = page_address(rdev->sb_page); + /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET + * is not set + */ + + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) + rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); + + /* The other node finished recovery, call spare_active to set + * device In_sync and mddev->degraded + */ + if (rdev->recovery_offset == MaxSector && + !test_bit(In_sync, &rdev->flags) && + mddev->pers->spare_active(mddev)) + sysfs_notify(&mddev->kobj, NULL, "degraded"); + + put_page(swapout); + return 0; +} + +void md_reload_sb(struct mddev *mddev, int nr) +{ + struct md_rdev *rdev; + int err; + + /* Find the rdev */ + rdev_for_each_rcu(rdev, mddev) { + if (rdev->desc_nr == nr) + break; + } + + if (!rdev || rdev->desc_nr != nr) { + pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); + return; } + err = read_rdev(mddev, rdev); + if (err < 0) + return; + + check_sb_changes(mddev, rdev); + + /* Read all rdev's to update recovery_offset */ + rdev_for_each_rcu(rdev, mddev) + read_rdev(mddev, rdev); } EXPORT_SYMBOL(md_reload_sb); |