summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-07-08 21:50:18 +0200
committerLinus Torvalds <torvalds@linux-foundation.org>2017-07-08 21:50:18 +0200
commit026d15f6b9878794fae1f794cae881ccd65052e5 (patch)
treed772991739c19d74d6ccdd1c9ae8e1ad72c5e061
parentMerge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dto... (diff)
parentMD: fix sleep in atomic (diff)
downloadlinux-026d15f6b9878794fae1f794cae881ccd65052e5.tar.xz
linux-026d15f6b9878794fae1f794cae881ccd65052e5.zip
Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md
Pull MD update from Shaohua Li: - fixed deadlock in MD suspend and a potential bug in bio allocation (Neil Brown) - fixed signal issue (Mikulas Patocka) - fixed typo in FailFast test (Guoqing Jiang) - other trivial fixes * 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md: MD: fix sleep in atomic MD: fix a null dereference md: use a separate bio_set for synchronous IO. md: change the initialization value for a spare device spot to MD_DISK_ROLE_SPARE md/raid1: remove unused bio in sync_request_write md/raid10: fix FailFast test for wrong device md: don't use flush_signals in userspace processes md: fix deadlock between mddev_suspend() and md_write_start()
-rw-r--r--drivers/md/faulty.c5
-rw-r--r--drivers/md/linear.c7
-rw-r--r--drivers/md/md.c47
-rw-r--r--drivers/md/md.h7
-rw-r--r--drivers/md/multipath.c8
-rw-r--r--drivers/md/raid0.c7
-rw-r--r--drivers/md/raid1.c20
-rw-r--r--drivers/md/raid10.c16
-rw-r--r--drivers/md/raid5.c22
9 files changed, 92 insertions, 47 deletions
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index b0536cfd8e17..06a64d5d8c6c 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -170,7 +170,7 @@ static void add_sector(struct faulty_conf *conf, sector_t start, int mode)
conf->nfaults = n+1;
}
-static void faulty_make_request(struct mddev *mddev, struct bio *bio)
+static bool faulty_make_request(struct mddev *mddev, struct bio *bio)
{
struct faulty_conf *conf = mddev->private;
int failit = 0;
@@ -182,7 +182,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio)
* just fail immediately
*/
bio_io_error(bio);
- return;
+ return true;
}
if (check_sector(conf, bio->bi_iter.bi_sector,
@@ -224,6 +224,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio)
bio->bi_bdev = conf->rdev->bdev;
generic_make_request(bio);
+ return true;
}
static void faulty_status(struct seq_file *seq, struct mddev *mddev)
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index df6f2c98eca7..5f1eb9189542 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -245,7 +245,7 @@ static void linear_free(struct mddev *mddev, void *priv)
kfree(conf);
}
-static void linear_make_request(struct mddev *mddev, struct bio *bio)
+static bool linear_make_request(struct mddev *mddev, struct bio *bio)
{
char b[BDEVNAME_SIZE];
struct dev_info *tmp_dev;
@@ -254,7 +254,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
md_flush_request(mddev, bio);
- return;
+ return true;
}
tmp_dev = which_dev(mddev, bio_sector);
@@ -292,7 +292,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
mddev_check_write_zeroes(mddev, bio);
generic_make_request(bio);
}
- return;
+ return true;
out_of_bounds:
pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %s: %llu sectors, offset %llu\n",
@@ -302,6 +302,7 @@ out_of_bounds:
(unsigned long long)tmp_dev->rdev->sectors,
(unsigned long long)start_sector);
bio_io_error(bio);
+ return true;
}
static void linear_status (struct seq_file *seq, struct mddev *mddev)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 31bcbfb09fef..8cdca0296749 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -203,6 +203,14 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
}
EXPORT_SYMBOL_GPL(bio_alloc_mddev);
+static struct bio *md_bio_alloc_sync(struct mddev *mddev)
+{
+ if (!mddev || !mddev->sync_set)
+ return bio_alloc(GFP_NOIO, 1);
+
+ return bio_alloc_bioset(GFP_NOIO, 1, mddev->sync_set);
+}
+
/*
* We have a system wide 'event count' that is incremented
* on any 'interesting' event, and readers of /proc/mdstat
@@ -277,7 +285,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
bio_endio(bio);
return BLK_QC_T_NONE;
}
- smp_rmb(); /* Ensure implications of 'active' are visible */
+check_suspended:
rcu_read_lock();
if (mddev->suspended) {
DEFINE_WAIT(__wait);
@@ -302,7 +310,11 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
sectors = bio_sectors(bio);
/* bio could be mergeable after passing to underlayer */
bio->bi_opf &= ~REQ_NOMERGE;
- mddev->pers->make_request(mddev, bio);
+ if (!mddev->pers->make_request(mddev, bio)) {
+ atomic_dec(&mddev->active_io);
+ wake_up(&mddev->sb_wait);
+ goto check_suspended;
+ }
cpu = part_stat_lock();
part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
@@ -327,6 +339,7 @@ void mddev_suspend(struct mddev *mddev)
if (mddev->suspended++)
return;
synchronize_rcu();
+ wake_up(&mddev->sb_wait);
wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
mddev->pers->quiesce(mddev, 1);
@@ -462,7 +475,7 @@ static void mddev_delayed_delete(struct work_struct *ws);
static void mddev_put(struct mddev *mddev)
{
- struct bio_set *bs = NULL;
+ struct bio_set *bs = NULL, *sync_bs = NULL;
if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
return;
@@ -472,7 +485,9 @@ static void mddev_put(struct mddev *mddev)
* so destroy it */
list_del_init(&mddev->all_mddevs);
bs = mddev->bio_set;
+ sync_bs = mddev->sync_set;
mddev->bio_set = NULL;
+ mddev->sync_set = NULL;
if (mddev->gendisk) {
/* We did a probe so need to clean up. Call
* queue_work inside the spinlock so that
@@ -487,6 +502,8 @@ static void mddev_put(struct mddev *mddev)
spin_unlock(&all_mddevs_lock);
if (bs)
bioset_free(bs);
+ if (sync_bs)
+ bioset_free(sync_bs);
}
static void md_safemode_timeout(unsigned long data);
@@ -751,7 +768,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
if (test_bit(Faulty, &rdev->flags))
return;
- bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
+ bio = md_bio_alloc_sync(mddev);
atomic_inc(&rdev->nr_pending);
@@ -783,7 +800,7 @@ int md_super_wait(struct mddev *mddev)
int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
struct page *page, int op, int op_flags, bool metadata_op)
{
- struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
+ struct bio *bio = md_bio_alloc_sync(rdev->mddev);
int ret;
bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
@@ -1852,7 +1869,7 @@ retry:
max_dev = le32_to_cpu(sb->max_dev);
for (i=0; i<max_dev;i++)
- sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
+ sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
@@ -5432,6 +5449,11 @@ int md_run(struct mddev *mddev)
if (!mddev->bio_set)
return -ENOMEM;
}
+ if (mddev->sync_set == NULL) {
+ mddev->sync_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+ if (!mddev->sync_set)
+ return -ENOMEM;
+ }
spin_lock(&pers_lock);
pers = find_pers(mddev->level, mddev->clevel);
@@ -7950,12 +7972,14 @@ EXPORT_SYMBOL(md_done_sync);
* If we need to update some array metadata (e.g. 'active' flag
* in superblock) before writing, schedule a superblock update
* and wait for it to complete.
+ * A return value of 'false' means that the write wasn't recorded
+ * and cannot proceed as the array is being suspended.
*/
-void md_write_start(struct mddev *mddev, struct bio *bi)
+bool md_write_start(struct mddev *mddev, struct bio *bi)
{
int did_change = 0;
if (bio_data_dir(bi) != WRITE)
- return;
+ return true;
BUG_ON(mddev->ro == 1);
if (mddev->ro == 2) {
@@ -7987,7 +8011,12 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
if (did_change)
sysfs_notify_dirent_safe(mddev->sysfs_state);
wait_event(mddev->sb_wait,
- !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && !mddev->suspended);
+ if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
+ percpu_ref_put(&mddev->writes_pending);
+ return false;
+ }
+ return true;
}
EXPORT_SYMBOL(md_write_start);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 0fa1de42c42b..991f0fe2dcc6 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -444,6 +444,9 @@ struct mddev {
struct attribute_group *to_remove;
struct bio_set *bio_set;
+ struct bio_set *sync_set; /* for sync operations like
+ * metadata and bitmap writes
+ */
/* Generic flush handling.
* The last to finish preflush schedules a worker to submit
@@ -510,7 +513,7 @@ struct md_personality
int level;
struct list_head list;
struct module *owner;
- void (*make_request)(struct mddev *mddev, struct bio *bio);
+ bool (*make_request)(struct mddev *mddev, struct bio *bio);
int (*run)(struct mddev *mddev);
void (*free)(struct mddev *mddev, void *priv);
void (*status)(struct seq_file *seq, struct mddev *mddev);
@@ -649,7 +652,7 @@ extern void md_wakeup_thread(struct md_thread *thread);
extern void md_check_recovery(struct mddev *mddev);
extern void md_reap_sync_thread(struct mddev *mddev);
extern int mddev_init_writes_pending(struct mddev *mddev);
-extern void md_write_start(struct mddev *mddev, struct bio *bi);
+extern bool md_write_start(struct mddev *mddev, struct bio *bi);
extern void md_write_inc(struct mddev *mddev, struct bio *bi);
extern void md_write_end(struct mddev *mddev);
extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 68d036e64041..23a162ba6c56 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -106,7 +106,7 @@ static void multipath_end_request(struct bio *bio)
rdev_dec_pending(rdev, conf->mddev);
}
-static void multipath_make_request(struct mddev *mddev, struct bio * bio)
+static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
{
struct mpconf *conf = mddev->private;
struct multipath_bh * mp_bh;
@@ -114,7 +114,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
md_flush_request(mddev, bio);
- return;
+ return true;
}
mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
@@ -126,7 +126,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
if (mp_bh->path < 0) {
bio_io_error(bio);
mempool_free(mp_bh, conf->pool);
- return;
+ return true;
}
multipath = conf->multipaths + mp_bh->path;
@@ -141,7 +141,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
mddev_check_writesame(mddev, &mp_bh->bio);
mddev_check_write_zeroes(mddev, &mp_bh->bio);
generic_make_request(&mp_bh->bio);
- return;
+ return true;
}
static void multipath_status(struct seq_file *seq, struct mddev *mddev)
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index d6c0bc76e837..94d9ae9b0fd0 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -548,7 +548,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
bio_endio(bio);
}
-static void raid0_make_request(struct mddev *mddev, struct bio *bio)
+static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
{
struct strip_zone *zone;
struct md_rdev *tmp_dev;
@@ -559,12 +559,12 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
md_flush_request(mddev, bio);
- return;
+ return true;
}
if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
raid0_handle_discard(mddev, bio);
- return;
+ return true;
}
bio_sector = bio->bi_iter.bi_sector;
@@ -599,6 +599,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
mddev_check_writesame(mddev, bio);
mddev_check_write_zeroes(mddev, bio);
generic_make_request(bio);
+ return true;
}
static void raid0_status(struct seq_file *seq, struct mddev *mddev)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 98ca2c1d3226..3febfc8391fb 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1321,7 +1321,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
* Continue immediately if no resync is active currently.
*/
- md_write_start(mddev, bio); /* wait on superblock update early */
if ((bio_end_sector(bio) > mddev->suspend_lo &&
bio->bi_iter.bi_sector < mddev->suspend_hi) ||
@@ -1335,7 +1334,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
*/
DEFINE_WAIT(w);
for (;;) {
- flush_signals(current);
+ sigset_t full, old;
prepare_to_wait(&conf->wait_barrier,
&w, TASK_INTERRUPTIBLE);
if (bio_end_sector(bio) <= mddev->suspend_lo ||
@@ -1345,7 +1344,10 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
bio->bi_iter.bi_sector,
bio_end_sector(bio))))
break;
+ sigfillset(&full);
+ sigprocmask(SIG_BLOCK, &full, &old);
schedule();
+ sigprocmask(SIG_SETMASK, &old, NULL);
}
finish_wait(&conf->wait_barrier, &w);
}
@@ -1550,13 +1552,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
wake_up(&conf->wait_barrier);
}
-static void raid1_make_request(struct mddev *mddev, struct bio *bio)
+static bool raid1_make_request(struct mddev *mddev, struct bio *bio)
{
sector_t sectors;
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
md_flush_request(mddev, bio);
- return;
+ return true;
}
/*
@@ -1571,8 +1573,12 @@ static void raid1_make_request(struct mddev *mddev, struct bio *bio)
if (bio_data_dir(bio) == READ)
raid1_read_request(mddev, bio, sectors, NULL);
- else
+ else {
+ if (!md_write_start(mddev,bio))
+ return false;
raid1_write_request(mddev, bio, sectors);
+ }
+ return true;
}
static void raid1_status(struct seq_file *seq, struct mddev *mddev)
@@ -2165,9 +2171,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
struct r1conf *conf = mddev->private;
int i;
int disks = conf->raid_disks * 2;
- struct bio *bio, *wbio;
-
- bio = r1_bio->bios[r1_bio->read_disk];
+ struct bio *wbio;
if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
/* ouch - failed to read all of that. */
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 57a250fdbbcc..5026e7ad51d3 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1303,8 +1303,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
sector_t sectors;
int max_sectors;
- md_write_start(mddev, bio);
-
/*
* Register the new request and wait if the reconstruction
* thread has put up a bar for new requests.
@@ -1525,7 +1523,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
raid10_write_request(mddev, bio, r10_bio);
}
-static void raid10_make_request(struct mddev *mddev, struct bio *bio)
+static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
{
struct r10conf *conf = mddev->private;
sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
@@ -1534,9 +1532,12 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio)
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
md_flush_request(mddev, bio);
- return;
+ return true;
}
+ if (!md_write_start(mddev, bio))
+ return false;
+
/*
* If this request crosses a chunk boundary, we need to split
* it.
@@ -1553,6 +1554,7 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio)
/* In case raid10d snuck in to freeze_array */
wake_up(&conf->wait_barrier);
+ return true;
}
static void raid10_status(struct seq_file *seq, struct mddev *mddev)
@@ -3293,7 +3295,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
biolist = bio;
bio->bi_end_io = end_sync_read;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
- if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
+ if (test_bit(FailFast, &rdev->flags))
bio->bi_opf |= MD_FAILFAST;
bio->bi_iter.bi_sector = sector + rdev->data_offset;
bio->bi_bdev = rdev->bdev;
@@ -3305,7 +3307,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
continue;
}
atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
/* Need to set up for writing to the replacement */
bio = r10_bio->devs[i].repl_bio;
@@ -3316,11 +3317,12 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
biolist = bio;
bio->bi_end_io = end_sync_write;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
- if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
+ if (test_bit(FailFast, &rdev->flags))
bio->bi_opf |= MD_FAILFAST;
bio->bi_iter.bi_sector = sector + rdev->data_offset;
bio->bi_bdev = rdev->bdev;
count++;
+ rcu_read_unlock();
}
if (count < 2) {
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 62c965be97e1..2ceb338b094b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5479,7 +5479,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
bi->bi_next = NULL;
- md_write_start(mddev, bi);
stripe_sectors = conf->chunk_sectors *
(conf->raid_disks - conf->max_degraded);
@@ -5549,11 +5548,10 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
release_stripe_plug(mddev, sh);
}
- md_write_end(mddev);
bio_endio(bi);
}
-static void raid5_make_request(struct mddev *mddev, struct bio * bi)
+static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
{
struct r5conf *conf = mddev->private;
int dd_idx;
@@ -5569,10 +5567,10 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
int ret = r5l_handle_flush_request(conf->log, bi);
if (ret == 0)
- return;
+ return true;
if (ret == -ENODEV) {
md_flush_request(mddev, bi);
- return;
+ return true;
}
/* ret == -EAGAIN, fallback */
/*
@@ -5582,6 +5580,8 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
do_flush = bi->bi_opf & REQ_PREFLUSH;
}
+ if (!md_write_start(mddev, bi))
+ return false;
/*
* If array is degraded, better not do chunk aligned read because
* later we might have to read it again in order to reconstruct
@@ -5591,18 +5591,18 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
mddev->reshape_position == MaxSector) {
bi = chunk_aligned_read(mddev, bi);
if (!bi)
- return;
+ return true;
}
if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
make_discard_request(mddev, bi);
- return;
+ md_write_end(mddev);
+ return true;
}
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
- md_write_start(mddev, bi);
prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
@@ -5693,12 +5693,15 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
* userspace, we want an interruptible
* wait.
*/
- flush_signals(current);
prepare_to_wait(&conf->wait_for_overlap,
&w, TASK_INTERRUPTIBLE);
if (logical_sector >= mddev->suspend_lo &&
logical_sector < mddev->suspend_hi) {
+ sigset_t full, old;
+ sigfillset(&full);
+ sigprocmask(SIG_BLOCK, &full, &old);
schedule();
+ sigprocmask(SIG_SETMASK, &old, NULL);
do_prepare = true;
}
goto retry;
@@ -5740,6 +5743,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
if (rw == WRITE)
md_write_end(mddev);
bio_endio(bi);
+ return true;
}
static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);