diff options
Diffstat (limited to 'fs')
247 files changed, 3663 insertions, 4284 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index b84c291ba1eb..d7b78d531e63 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -74,7 +74,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) v9fs_proto_dotu(v9ses)); fid = file->private_data; if (!fid) { - fid = v9fs_fid_clone(file->f_path.dentry); + fid = v9fs_fid_clone(file_dentry(file)); if (IS_ERR(fid)) return PTR_ERR(fid); @@ -100,7 +100,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) * because we want write after unlink usecase * to work. */ - fid = v9fs_writeback_fid(file->f_path.dentry); + fid = v9fs_writeback_fid(file_dentry(file)); if (IS_ERR(fid)) { err = PTR_ERR(fid); mutex_unlock(&v9inode->v_mutex); @@ -516,7 +516,7 @@ v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma) * because we want write after unlink usecase * to work. */ - fid = v9fs_writeback_fid(filp->f_path.dentry); + fid = v9fs_writeback_fid(file_dentry(filp)); if (IS_ERR(fid)) { retval = PTR_ERR(fid); mutex_unlock(&v9inode->v_mutex); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index f4645c515262..e2e7c749925a 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -853,7 +853,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, struct p9_fid *fid, *inode_fid; struct dentry *res = NULL; - if (d_unhashed(dentry)) { + if (d_in_lookup(dentry)) { res = v9fs_vfs_lookup(dir, dentry, 0); if (IS_ERR(res)) return PTR_ERR(res); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index a34702c998f5..1b51eaa5e2dd 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -254,7 +254,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, struct posix_acl *pacl = NULL, *dacl = NULL; struct dentry *res = NULL; - if (d_unhashed(dentry)) { + if (d_in_lookup(dentry)) { res = v9fs_vfs_lookup(dir, dentry, 0); if (IS_ERR(res)) return PTR_ERR(res); diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index f0d268b97d19..a439548de785 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -70,9 +70,13 @@ struct autofs_info { }; #define AUTOFS_INF_EXPIRING (1<<0) /* dentry in the process of expiring */ -#define AUTOFS_INF_NO_RCU (1<<1) /* the dentry is being considered +#define AUTOFS_INF_WANT_EXPIRE (1<<1) /* the dentry is being considered * for expiry, so RCU_walk is - * not permitted + * not permitted. If it progresses to + * actual expiry attempt, the flag is + * not cleared when EXPIRING is set - + * in that case it gets cleared only + * when it comes to clearing EXPIRING. */ #define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */ diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 9510d8d2e9cd..b493909e7492 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -316,19 +316,17 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, if (ino->flags & AUTOFS_INF_PENDING) goto out; if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { - ino->flags |= AUTOFS_INF_NO_RCU; + ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); spin_lock(&sbi->fs_lock); if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { ino->flags |= AUTOFS_INF_EXPIRING; - smp_mb(); - ino->flags &= ~AUTOFS_INF_NO_RCU; init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); return root; } - ino->flags &= ~AUTOFS_INF_NO_RCU; + ino->flags &= ~AUTOFS_INF_WANT_EXPIRE; } out: spin_unlock(&sbi->fs_lock); @@ -446,7 +444,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, while ((dentry = get_next_positive_subdir(dentry, root))) { spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(dentry); - if (ino->flags & AUTOFS_INF_NO_RCU) + if (ino->flags & AUTOFS_INF_WANT_EXPIRE) expired = NULL; else expired = should_expire(dentry, mnt, timeout, how); @@ -455,7 +453,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, continue; } ino = autofs4_dentry_ino(expired); - ino->flags |= AUTOFS_INF_NO_RCU; + ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); spin_lock(&sbi->fs_lock); @@ -465,7 +463,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, goto found; } - ino->flags &= ~AUTOFS_INF_NO_RCU; + ino->flags &= ~AUTOFS_INF_WANT_EXPIRE; if (expired != dentry) dput(expired); spin_unlock(&sbi->fs_lock); @@ -475,17 +473,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, found: pr_debug("returning %p %pd\n", expired, expired); ino->flags |= AUTOFS_INF_EXPIRING; - smp_mb(); - ino->flags &= ~AUTOFS_INF_NO_RCU; init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); - spin_lock(&sbi->lookup_lock); - spin_lock(&expired->d_parent->d_lock); - spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED); - list_move(&expired->d_parent->d_subdirs, &expired->d_child); - spin_unlock(&expired->d_lock); - spin_unlock(&expired->d_parent->d_lock); - spin_unlock(&sbi->lookup_lock); return expired; } @@ -496,7 +485,7 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) int status; /* Block on any pending expire */ - if (!(ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU))) + if (!(ino->flags & AUTOFS_INF_WANT_EXPIRE)) return 0; if (rcu_walk) return -ECHILD; @@ -554,7 +543,7 @@ int autofs4_expire_run(struct super_block *sb, ino = autofs4_dentry_ino(dentry); /* avoid rapid-fire expire attempts if expiry fails */ ino->last_used = now; - ino->flags &= ~AUTOFS_INF_EXPIRING; + ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE); complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); @@ -583,7 +572,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, spin_lock(&sbi->fs_lock); /* avoid rapid-fire expire attempts if expiry fails */ ino->last_used = now; - ino->flags &= ~AUTOFS_INF_EXPIRING; + ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE); complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); dput(dentry); diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 78bd80298528..3767f6641af1 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -458,7 +458,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) */ struct inode *inode; - if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU)) + if (ino->flags & AUTOFS_INF_WANT_EXPIRE) return 0; if (d_mountpoint(dentry)) return 0; diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 0146d911f468..631f1554c87b 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -66,11 +66,12 @@ static int autofs4_write(struct autofs_sb_info *sbi, set_fs(KERNEL_DS); mutex_lock(&sbi->pipe_mutex); - wr = __vfs_write(file, data, bytes, &file->f_pos); - while (bytes && wr) { + while (bytes) { + wr = __vfs_write(file, data, bytes, &file->f_pos); + if (wr <= 0) + break; data += wr; bytes -= wr; - wr = __vfs_write(file, data, bytes, &file->f_pos); } mutex_unlock(&sbi->pipe_mutex); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index e158b22ef32f..a7a28110dc80 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2275,7 +2275,7 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; /* Align to page */ - if (!dump_skip(cprm, dataoff - cprm->file->f_pos)) + if (!dump_skip(cprm, dataoff - cprm->pos)) goto end_coredump; for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 71ade0e556b7..203589311bf8 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1787,7 +1787,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto end_coredump; } - if (!dump_skip(cprm, dataoff - cprm->file->f_pos)) + if (!dump_skip(cprm, dataoff - cprm->pos)) goto end_coredump; if (!elf_fdpic_dump_segments(cprm)) diff --git a/fs/block_dev.c b/fs/block_dev.c index 71ccab1d22c6..d012be4ab977 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -493,7 +493,7 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) if (size < 0) return size; - if (!ops->direct_access) + if (!blk_queue_dax(bdev_get_queue(bdev)) || !ops->direct_access) return -EOPNOTSUPP; if ((sector + DIV_ROUND_UP(size, 512)) > part_nr_sects_read(bdev->bd_part)) @@ -1287,7 +1287,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; - if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && disk->fops->direct_access) + if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && + blk_queue_dax(disk->queue)) bdev->bd_inode->i_flags = S_DAX; else bdev->bd_inode->i_flags = 0; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index b677a6ea6001..5d5cae05818d 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1673,6 +1673,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, } bio->bi_bdev = block_ctx->dev->bdev; bio->bi_iter.bi_sector = dev_bytenr >> 9; + bio_set_op_attrs(bio, REQ_OP_READ, 0); for (j = i; j < num_pages; j++) { ret = bio_add_page(bio, block_ctx->pagev[j], @@ -1685,7 +1686,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, "btrfsic: error, failed to add a single page!\n"); return -1; } - if (submit_bio_wait(READ, bio)) { + if (submit_bio_wait(bio)) { printk(KERN_INFO "btrfsic: read error at logical %llu dev %s!\n", block_ctx->start, block_ctx->dev->name); @@ -2206,7 +2207,7 @@ static void btrfsic_bio_end_io(struct bio *bp) block->dev_bytenr, block->mirror_num); next_block = block->next_in_same_bio; block->iodone_w_error = iodone_w_error; - if (block->submit_bio_bh_rw & REQ_FLUSH) { + if (block->submit_bio_bh_rw & REQ_PREFLUSH) { dev_state->last_flush_gen++; if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) @@ -2242,7 +2243,7 @@ static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) block->dev_bytenr, block->mirror_num); block->iodone_w_error = iodone_w_error; - if (block->submit_bio_bh_rw & REQ_FLUSH) { + if (block->submit_bio_bh_rw & REQ_PREFLUSH) { dev_state->last_flush_gen++; if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) @@ -2645,7 +2646,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, * This algorithm is recursive because the amount of used stack space * is very small and the max recursion depth is limited. */ - indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)", + indent_add = sprintf(buf, "%c-%llu(%s/%llu/%u)", btrfsic_get_block_type(state, block), block->logical_bytenr, block->dev_state->name, block->dev_bytenr, block->mirror_num); @@ -2855,12 +2856,12 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup( return ds; } -int btrfsic_submit_bh(int rw, struct buffer_head *bh) +int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh) { struct btrfsic_dev_state *dev_state; if (!btrfsic_is_initialized) - return submit_bh(rw, bh); + return submit_bh(op, op_flags, bh); mutex_lock(&btrfsic_mutex); /* since btrfsic_submit_bh() might also be called before @@ -2869,26 +2870,26 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh) /* Only called to write the superblock (incl. FLUSH/FUA) */ if (NULL != dev_state && - (rw & WRITE) && bh->b_size > 0) { + (op == REQ_OP_WRITE) && bh->b_size > 0) { u64 dev_bytenr; dev_bytenr = 4096 * bh->b_blocknr; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO - "submit_bh(rw=0x%x, blocknr=%llu (bytenr %llu)," - " size=%zu, data=%p, bdev=%p)\n", - rw, (unsigned long long)bh->b_blocknr, + "submit_bh(op=0x%x,0x%x, blocknr=%llu " + "(bytenr %llu), size=%zu, data=%p, bdev=%p)\n", + op, op_flags, (unsigned long long)bh->b_blocknr, dev_bytenr, bh->b_size, bh->b_data, bh->b_bdev); btrfsic_process_written_block(dev_state, dev_bytenr, &bh->b_data, 1, NULL, - NULL, bh, rw); - } else if (NULL != dev_state && (rw & REQ_FLUSH)) { + NULL, bh, op_flags); + } else if (NULL != dev_state && (op_flags & REQ_PREFLUSH)) { if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO - "submit_bh(rw=0x%x FLUSH, bdev=%p)\n", - rw, bh->b_bdev); + "submit_bh(op=0x%x,0x%x FLUSH, bdev=%p)\n", + op, op_flags, bh->b_bdev); if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { if ((dev_state->state->print_mask & (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | @@ -2906,7 +2907,7 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh) block->never_written = 0; block->iodone_w_error = 0; block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = rw; + block->submit_bio_bh_rw = op_flags; block->orig_bio_bh_private = bh->b_private; block->orig_bio_bh_end_io.bh = bh->b_end_io; block->next_in_same_bio = NULL; @@ -2915,10 +2916,10 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh) } } mutex_unlock(&btrfsic_mutex); - return submit_bh(rw, bh); + return submit_bh(op, op_flags, bh); } -static void __btrfsic_submit_bio(int rw, struct bio *bio) +static void __btrfsic_submit_bio(struct bio *bio) { struct btrfsic_dev_state *dev_state; @@ -2930,7 +2931,7 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) * btrfsic_mount(), this might return NULL */ dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); if (NULL != dev_state && - (rw & WRITE) && NULL != bio->bi_io_vec) { + (bio_op(bio) == REQ_OP_WRITE) && NULL != bio->bi_io_vec) { unsigned int i; u64 dev_bytenr; u64 cur_bytenr; @@ -2942,9 +2943,9 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO - "submit_bio(rw=0x%x, bi_vcnt=%u," + "submit_bio(rw=%d,0x%x, bi_vcnt=%u," " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", - rw, bio->bi_vcnt, + bio_op(bio), bio->bi_rw, bio->bi_vcnt, (unsigned long long)bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); @@ -2975,18 +2976,18 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, bio->bi_vcnt, bio, &bio_is_patched, - NULL, rw); + NULL, bio->bi_rw); while (i > 0) { i--; kunmap(bio->bi_io_vec[i].bv_page); } kfree(mapped_datav); - } else if (NULL != dev_state && (rw & REQ_FLUSH)) { + } else if (NULL != dev_state && (bio->bi_rw & REQ_PREFLUSH)) { if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO - "submit_bio(rw=0x%x FLUSH, bdev=%p)\n", - rw, bio->bi_bdev); + "submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", + bio_op(bio), bio->bi_rw, bio->bi_bdev); if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { if ((dev_state->state->print_mask & (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | @@ -3004,7 +3005,7 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) block->never_written = 0; block->iodone_w_error = 0; block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = rw; + block->submit_bio_bh_rw = bio->bi_rw; block->orig_bio_bh_private = bio->bi_private; block->orig_bio_bh_end_io.bio = bio->bi_end_io; block->next_in_same_bio = NULL; @@ -3016,16 +3017,16 @@ leave: mutex_unlock(&btrfsic_mutex); } -void btrfsic_submit_bio(int rw, struct bio *bio) +void btrfsic_submit_bio(struct bio *bio) { - __btrfsic_submit_bio(rw, bio); - submit_bio(rw, bio); + __btrfsic_submit_bio(bio); + submit_bio(bio); } -int btrfsic_submit_bio_wait(int rw, struct bio *bio) +int btrfsic_submit_bio_wait(struct bio *bio) { - __btrfsic_submit_bio(rw, bio); - return submit_bio_wait(rw, bio); + __btrfsic_submit_bio(bio); + return submit_bio_wait(bio); } int btrfsic_mount(struct btrfs_root *root, diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h index 13b8566c97ab..f78dff1c7e86 100644 --- a/fs/btrfs/check-integrity.h +++ b/fs/btrfs/check-integrity.h @@ -20,9 +20,9 @@ #define __BTRFS_CHECK_INTEGRITY__ #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY -int btrfsic_submit_bh(int rw, struct buffer_head *bh); -void btrfsic_submit_bio(int rw, struct bio *bio); -int btrfsic_submit_bio_wait(int rw, struct bio *bio); +int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh); +void btrfsic_submit_bio(struct bio *bio); +int btrfsic_submit_bio_wait(struct bio *bio); #else #define btrfsic_submit_bh submit_bh #define btrfsic_submit_bio submit_bio diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 658c39b70fba..cefedabf0a92 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -363,6 +363,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, kfree(cb); return -ENOMEM; } + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; atomic_inc(&cb->pending_bios); @@ -373,7 +374,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, page = compressed_pages[pg_index]; page->mapping = inode->i_mapping; if (bio->bi_iter.bi_size) - ret = io_tree->ops->merge_bio_hook(WRITE, page, 0, + ret = io_tree->ops->merge_bio_hook(page, 0, PAGE_SIZE, bio, 0); else @@ -401,13 +402,14 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, BUG_ON(ret); /* -ENOMEM */ } - ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + ret = btrfs_map_bio(root, bio, 0, 1); BUG_ON(ret); /* -ENOMEM */ bio_put(bio); bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); BUG_ON(!bio); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; bio_add_page(bio, page, PAGE_SIZE, 0); @@ -431,7 +433,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, BUG_ON(ret); /* -ENOMEM */ } - ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + ret = btrfs_map_bio(root, bio, 0, 1); BUG_ON(ret); /* -ENOMEM */ bio_put(bio); @@ -646,6 +648,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); if (!comp_bio) goto fail2; + bio_set_op_attrs (comp_bio, REQ_OP_READ, 0); comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; atomic_inc(&cb->pending_bios); @@ -656,7 +659,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page->index = em_start >> PAGE_SHIFT; if (comp_bio->bi_iter.bi_size) - ret = tree->ops->merge_bio_hook(READ, page, 0, + ret = tree->ops->merge_bio_hook(page, 0, PAGE_SIZE, comp_bio, 0); else @@ -687,8 +690,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size, root->sectorsize); - ret = btrfs_map_bio(root, READ, comp_bio, - mirror_num, 0); + ret = btrfs_map_bio(root, comp_bio, mirror_num, 0); if (ret) { bio->bi_error = ret; bio_endio(comp_bio); @@ -699,6 +701,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); BUG_ON(!comp_bio); + bio_set_op_attrs(comp_bio, REQ_OP_READ, 0); comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; @@ -717,7 +720,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, BUG_ON(ret); /* -ENOMEM */ } - ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); + ret = btrfs_map_bio(root, comp_bio, mirror_num, 0); if (ret) { bio->bi_error = ret; bio_endio(comp_bio); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 427c36b430a6..a85cf7d23309 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1373,7 +1373,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { BUG_ON(tm->slot != 0); - eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start); + eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start, + eb->len); if (!eb_rewin) { btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); @@ -1454,7 +1455,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq) } else if (old_root) { btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - eb = alloc_dummy_extent_buffer(root->fs_info, logical); + eb = alloc_dummy_extent_buffer(root->fs_info, logical, + root->nodesize); } else { btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK); eb = btrfs_clone_extent_buffer(eb_root); @@ -1552,6 +1554,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, trans->transid, root->fs_info->generation); if (!should_cow_block(trans, root, buf)) { + trans->dirty = true; *cow_ret = buf; return 0; } @@ -1783,10 +1786,12 @@ static noinline int generic_bin_search(struct extent_buffer *eb, if (!err) { tmp = (struct btrfs_disk_key *)(kaddr + offset - map_start); - } else { + } else if (err == 1) { read_extent_buffer(eb, &unaligned, offset, sizeof(unaligned)); tmp = &unaligned; + } else { + return err; } } else { @@ -2510,6 +2515,8 @@ read_block_for_search(struct btrfs_trans_handle *trans, if (!btrfs_buffer_uptodate(tmp, 0, 0)) ret = -EIO; free_extent_buffer(tmp); + } else { + ret = PTR_ERR(tmp); } return ret; } @@ -2773,8 +2780,10 @@ again: * then we don't want to set the path blocking, * so we test it here */ - if (!should_cow_block(trans, root, b)) + if (!should_cow_block(trans, root, b)) { + trans->dirty = true; goto cow_done; + } /* * must have write locks on this node and the @@ -2823,6 +2832,8 @@ cow_done: } ret = key_search(b, key, level, &prev_cmp, &slot); + if (ret < 0) + goto done; if (level != 0) { int dec = 0; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 101c3cfd3f7c..b2620d1f883f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2518,7 +2518,7 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); int btrfs_async_run_delayed_refs(struct btrfs_root *root, - unsigned long count, int wait); + unsigned long count, u64 transid, int wait); int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, @@ -3091,7 +3091,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, struct btrfs_root *parent_root, u64 new_dirid); -int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 61561c2a3f96..d3aaabbfada0 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1606,15 +1606,23 @@ int btrfs_inode_delayed_dir_index_count(struct inode *inode) return 0; } -void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, - struct list_head *del_list) +bool btrfs_readdir_get_delayed_items(struct inode *inode, + struct list_head *ins_list, + struct list_head *del_list) { struct btrfs_delayed_node *delayed_node; struct btrfs_delayed_item *item; delayed_node = btrfs_get_delayed_node(inode); if (!delayed_node) - return; + return false; + + /* + * We can only do one readdir with delayed items at a time because of + * item->readdir_list. + */ + inode_unlock_shared(inode); + inode_lock(inode); mutex_lock(&delayed_node->mutex); item = __btrfs_first_delayed_insertion_item(delayed_node); @@ -1641,10 +1649,13 @@ void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, * requeue or dequeue this delayed node. */ atomic_dec(&delayed_node->refs); + + return true; } -void btrfs_put_delayed_items(struct list_head *ins_list, - struct list_head *del_list) +void btrfs_readdir_put_delayed_items(struct inode *inode, + struct list_head *ins_list, + struct list_head *del_list) { struct btrfs_delayed_item *curr, *next; @@ -1659,6 +1670,12 @@ void btrfs_put_delayed_items(struct list_head *ins_list, if (atomic_dec_and_test(&curr->refs)) kfree(curr); } + + /* + * The VFS is going to do up_read(), so we need to downgrade back to a + * read lock. + */ + downgrade_write(&inode->i_rwsem); } int btrfs_should_delete_dir_index(struct list_head *del_list, diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 0167853c84ae..2495b3d4075f 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -137,10 +137,12 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); void btrfs_destroy_delayed_inodes(struct btrfs_root *root); /* Used for readdir() */ -void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, - struct list_head *del_list); -void btrfs_put_delayed_items(struct list_head *ins_list, - struct list_head *del_list); +bool btrfs_readdir_get_delayed_items(struct inode *inode, + struct list_head *ins_list, + struct list_head *del_list); +void btrfs_readdir_put_delayed_items(struct inode *inode, + struct list_head *ins_list, + struct list_head *del_list); int btrfs_should_delete_dir_index(struct list_head *del_list, u64 index); int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6628fca9f4ed..9a726ded2c6d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -124,7 +124,6 @@ struct async_submit_bio { struct list_head list; extent_submit_bio_hook_t *submit_bio_start; extent_submit_bio_hook_t *submit_bio_done; - int rw; int mirror_num; unsigned long bio_flags; /* @@ -727,7 +726,7 @@ static void end_workqueue_bio(struct bio *bio) fs_info = end_io_wq->info; end_io_wq->error = bio->bi_error; - if (bio->bi_rw & REQ_WRITE) { + if (bio_op(bio) == REQ_OP_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { wq = fs_info->endio_meta_write_workers; func = btrfs_endio_meta_write_helper; @@ -797,7 +796,7 @@ static void run_one_async_start(struct btrfs_work *work) int ret; async = container_of(work, struct async_submit_bio, work); - ret = async->submit_bio_start(async->inode, async->rw, async->bio, + ret = async->submit_bio_start(async->inode, async->bio, async->mirror_num, async->bio_flags, async->bio_offset); if (ret) @@ -830,9 +829,8 @@ static void run_one_async_done(struct btrfs_work *work) return; } - async->submit_bio_done(async->inode, async->rw, async->bio, - async->mirror_num, async->bio_flags, - async->bio_offset); + async->submit_bio_done(async->inode, async->bio, async->mirror_num, + async->bio_flags, async->bio_offset); } static void run_one_async_free(struct btrfs_work *work) @@ -844,7 +842,7 @@ static void run_one_async_free(struct btrfs_work *work) } int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - int rw, struct bio *bio, int mirror_num, + struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset, extent_submit_bio_hook_t *submit_bio_start, @@ -857,7 +855,6 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, return -ENOMEM; async->inode = inode; - async->rw = rw; async->bio = bio; async->mirror_num = mirror_num; async->submit_bio_start = submit_bio_start; @@ -873,7 +870,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, atomic_inc(&fs_info->nr_async_submits); - if (rw & REQ_SYNC) + if (bio->bi_rw & REQ_SYNC) btrfs_set_work_high_priority(&async->work); btrfs_queue_work(fs_info->workers, &async->work); @@ -903,9 +900,8 @@ static int btree_csum_one_bio(struct bio *bio) return ret; } -static int __btree_submit_bio_start(struct inode *inode, int rw, - struct bio *bio, int mirror_num, - unsigned long bio_flags, +static int __btree_submit_bio_start(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags, u64 bio_offset) { /* @@ -915,7 +911,7 @@ static int __btree_submit_bio_start(struct inode *inode, int rw, return btree_csum_one_bio(bio); } -static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, +static int __btree_submit_bio_done(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { @@ -925,7 +921,7 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, * when we're called for a write, we're already in the async * submission context. Just jump into btrfs_map_bio */ - ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); + ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 1); if (ret) { bio->bi_error = ret; bio_endio(bio); @@ -944,14 +940,14 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags) return 1; } -static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, +static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { int async = check_async_write(inode, bio_flags); int ret; - if (!(rw & REQ_WRITE)) { + if (bio_op(bio) != REQ_OP_WRITE) { /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads @@ -960,21 +956,19 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, bio, BTRFS_WQ_ENDIO_METADATA); if (ret) goto out_w_error; - ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, - mirror_num, 0); + ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0); } else if (!async) { ret = btree_csum_one_bio(bio); if (ret) goto out_w_error; - ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, - mirror_num, 0); + ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0); } else { /* * kthread helpers are used to submit writes so that * checksumming can happen in parallel across all CPUs */ ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, - inode, rw, bio, mirror_num, 0, + inode, bio, mirror_num, 0, bio_offset, __btree_submit_bio_start, __btree_submit_bio_done); @@ -1098,7 +1092,7 @@ void readahead_tree_block(struct btrfs_root *root, u64 bytenr) struct inode *btree_inode = root->fs_info->btree_inode; buf = btrfs_find_create_tree_block(root, bytenr); - if (!buf) + if (IS_ERR(buf)) return; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0, WAIT_NONE, btree_get_extent, 0); @@ -1114,7 +1108,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, int ret; buf = btrfs_find_create_tree_block(root, bytenr); - if (!buf) + if (IS_ERR(buf)) return 0; set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); @@ -1147,7 +1141,8 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr) { if (btrfs_test_is_dummy_root(root)) - return alloc_test_extent_buffer(root->fs_info, bytenr); + return alloc_test_extent_buffer(root->fs_info, bytenr, + root->nodesize); return alloc_extent_buffer(root->fs_info, bytenr); } @@ -1171,8 +1166,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, int ret; buf = btrfs_find_create_tree_block(root, bytenr); - if (!buf) - return ERR_PTR(-ENOMEM); + if (IS_ERR(buf)) + return buf; ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); if (ret) { @@ -1314,14 +1309,16 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* Should only be used by the testing infrastructure */ -struct btrfs_root *btrfs_alloc_dummy_root(void) +struct btrfs_root *btrfs_alloc_dummy_root(u32 sectorsize, u32 nodesize) { struct btrfs_root *root; root = btrfs_alloc_root(NULL, GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); - __setup_root(4096, 4096, 4096, root, NULL, 1); + /* We don't use the stripesize in selftest, set it as sectorsize */ + __setup_root(nodesize, sectorsize, sectorsize, root, NULL, + BTRFS_ROOT_TREE_OBJECTID); set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state); root->alloc_bytenr = 0; @@ -1803,6 +1800,13 @@ static int cleaner_kthread(void *arg) if (btrfs_need_cleaner_sleep(root)) goto sleep; + /* + * Do not do anything if we might cause open_ctree() to block + * before we have finished mounting the filesystem. + */ + if (!root->fs_info->open) + goto sleep; + if (!mutex_trylock(&root->fs_info->cleaner_mutex)) goto sleep; @@ -2517,7 +2521,6 @@ int open_ctree(struct super_block *sb, int num_backups_tried = 0; int backup_index = 0; int max_active; - bool cleaner_mutex_locked = false; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL); @@ -2797,7 +2800,7 @@ int open_ctree(struct super_block *sb, nodesize = btrfs_super_nodesize(disk_super); sectorsize = btrfs_super_sectorsize(disk_super); - stripesize = btrfs_super_stripesize(disk_super); + stripesize = sectorsize; fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids)); fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); @@ -2996,13 +2999,6 @@ retry_root_backup: goto fail_sysfs; } - /* - * Hold the cleaner_mutex thread here so that we don't block - * for a long time on btrfs_recover_relocation. cleaner_kthread - * will wait for us to finish mounting the filesystem. - */ - mutex_lock(&fs_info->cleaner_mutex); - cleaner_mutex_locked = true; fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, "btrfs-cleaner"); if (IS_ERR(fs_info->cleaner_kthread)) @@ -3062,8 +3058,10 @@ retry_root_backup: ret = btrfs_cleanup_fs_roots(fs_info); if (ret) goto fail_qgroup; - /* We locked cleaner_mutex before creating cleaner_kthread. */ + + mutex_lock(&fs_info->cleaner_mutex); ret = btrfs_recover_relocation(tree_root); + mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { btrfs_warn(fs_info, "failed to recover relocation: %d", ret); @@ -3071,8 +3069,6 @@ retry_root_backup: goto fail_qgroup; } } - mutex_unlock(&fs_info->cleaner_mutex); - cleaner_mutex_locked = false; location.objectid = BTRFS_FS_TREE_OBJECTID; location.type = BTRFS_ROOT_ITEM_KEY; @@ -3186,10 +3182,6 @@ fail_cleaner: filemap_write_and_wait(fs_info->btree_inode->i_mapping); fail_sysfs: - if (cleaner_mutex_locked) { - mutex_unlock(&fs_info->cleaner_mutex); - cleaner_mutex_locked = false; - } btrfs_sysfs_remove_mounted(fs_info); fail_fsdev_sysfs: @@ -3420,9 +3412,9 @@ static int write_dev_supers(struct btrfs_device *device, * to go down lazy. */ if (i == 0) - ret = btrfsic_submit_bh(WRITE_FUA, bh); + ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_FUA, bh); else - ret = btrfsic_submit_bh(WRITE_SYNC, bh); + ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh); if (ret) errors++; } @@ -3486,12 +3478,13 @@ static int write_dev_flush(struct btrfs_device *device, int wait) bio->bi_end_io = btrfs_end_empty_barrier; bio->bi_bdev = device->bdev; + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; device->flush_bio = bio; bio_get(bio); - btrfsic_submit_bio(WRITE_FLUSH, bio); + btrfsic_submit_bio(bio); return 0; } @@ -4130,6 +4123,16 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, * Hint to catch really bogus numbers, bitflips or so, more exact checks are * done later */ + if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) { + btrfs_err(fs_info, "bytes_used is too small %llu", + btrfs_super_bytes_used(sb)); + ret = -EINVAL; + } + if (!is_power_of_2(btrfs_super_stripesize(sb))) { + btrfs_err(fs_info, "invalid stripesize %u", + btrfs_super_stripesize(sb)); + ret = -EINVAL; + } if (btrfs_super_num_devices(sb) > (1UL << 31)) printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n", btrfs_super_num_devices(sb)); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 8e79d0070bcf..dbf3e1aab69e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -90,7 +90,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, void btrfs_free_fs_root(struct btrfs_root *root); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -struct btrfs_root *btrfs_alloc_dummy_root(void); +struct btrfs_root *btrfs_alloc_dummy_root(u32 sectorsize, u32 nodesize); #endif /* @@ -122,7 +122,7 @@ void btrfs_csum_final(u32 crc, char *result); int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata); int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - int rw, struct bio *bio, int mirror_num, + struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset, extent_submit_bio_hook_t *submit_bio_start, extent_submit_bio_hook_t *submit_bio_done); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a400951e8678..b480fd555774 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2042,8 +2042,13 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, struct btrfs_bio *bbio = NULL; + /* + * Avoid races with device replace and make sure our bbio has devices + * associated to its stripes that don't go away while we are discarding. + */ + btrfs_bio_counter_inc_blocked(root->fs_info); /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(root->fs_info, REQ_DISCARD, + ret = btrfs_map_block(root->fs_info, REQ_OP_DISCARD, bytenr, &num_bytes, &bbio, 0); /* Error condition is -ENOMEM */ if (!ret) { @@ -2074,6 +2079,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, } btrfs_put_bbio(bbio); } + btrfs_bio_counter_dec(root->fs_info); if (actual_bytes) *actual_bytes = discarded_bytes; @@ -2829,6 +2835,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, struct async_delayed_refs { struct btrfs_root *root; + u64 transid; int count; int error; int sync; @@ -2844,6 +2851,10 @@ static void delayed_ref_async_start(struct btrfs_work *work) async = container_of(work, struct async_delayed_refs, work); + /* if the commit is already started, we don't need to wait here */ + if (btrfs_transaction_blocked(async->root->fs_info)) + goto done; + trans = btrfs_join_transaction(async->root); if (IS_ERR(trans)) { async->error = PTR_ERR(trans); @@ -2855,10 +2866,15 @@ static void delayed_ref_async_start(struct btrfs_work *work) * wait on delayed refs */ trans->sync = true; + + /* Don't bother flushing if we got into a different transaction */ + if (trans->transid > async->transid) + goto end; + ret = btrfs_run_delayed_refs(trans, async->root, async->count); if (ret) async->error = ret; - +end: ret = btrfs_end_transaction(trans, async->root); if (ret && !async->error) async->error = ret; @@ -2870,7 +2886,7 @@ done: } int btrfs_async_run_delayed_refs(struct btrfs_root *root, - unsigned long count, int wait) + unsigned long count, u64 transid, int wait) { struct async_delayed_refs *async; int ret; @@ -2882,6 +2898,7 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root, async->root = root->fs_info->tree_root; async->count = count; async->error = 0; + async->transid = transid; if (wait) async->sync = 1; else @@ -8010,8 +8027,9 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf; buf = btrfs_find_create_tree_block(root, bytenr); - if (!buf) - return ERR_PTR(-ENOMEM); + if (IS_ERR(buf)) + return buf; + btrfs_set_header_generation(buf, trans->transid); btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); @@ -8038,7 +8056,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, set_extent_dirty(&trans->transaction->dirty_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); } - trans->blocks_used++; + trans->dirty = true; /* this returns a buffer locked for blocking */ return buf; } @@ -8653,8 +8671,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, next = btrfs_find_tree_block(root->fs_info, bytenr); if (!next) { next = btrfs_find_create_tree_block(root, bytenr); - if (!next) - return -ENOMEM; + if (IS_ERR(next)) + return PTR_ERR(next); + btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, level - 1); reada = 1; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3cd57825c75f..cee4cb99b8ce 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2025,9 +2025,16 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, bio->bi_iter.bi_size = 0; map_length = length; + /* + * Avoid races with device replace and make sure our bbio has devices + * associated to its stripes that don't go away while we are doing the + * read repair operation. + */ + btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_block(fs_info, WRITE, logical, &map_length, &bbio, mirror_num); if (ret) { + btrfs_bio_counter_dec(fs_info); bio_put(bio); return -EIO; } @@ -2037,14 +2044,17 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, dev = bbio->stripes[mirror_num-1].dev; btrfs_put_bbio(bbio); if (!dev || !dev->bdev || !dev->writeable) { + btrfs_bio_counter_dec(fs_info); bio_put(bio); return -EIO; } bio->bi_bdev = dev->bdev; + bio->bi_rw = WRITE_SYNC; bio_add_page(bio, page, length, pg_offset); - if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) { + if (btrfsic_submit_bio_wait(bio)) { /* try to remap that extent elsewhere? */ + btrfs_bio_counter_dec(fs_info); bio_put(bio); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); return -EIO; @@ -2054,6 +2064,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, "read error corrected: ino %llu off %llu (dev %s sector %llu)", btrfs_ino(inode), start, rcu_str_deref(dev->name), sector); + btrfs_bio_counter_dec(fs_info); bio_put(bio); return 0; } @@ -2376,7 +2387,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, int read_mode; int ret; - BUG_ON(failed_bio->bi_rw & REQ_WRITE); + BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); ret = btrfs_get_io_failure_record(inode, start, end, &failrec); if (ret) @@ -2402,12 +2413,12 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, free_io_failure(inode, failrec); return -EIO; } + bio_set_op_attrs(bio, REQ_OP_READ, read_mode); pr_debug("Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d\n", read_mode, failrec->this_mirror, failrec->in_validation); - ret = tree->ops->submit_bio_hook(inode, read_mode, bio, - failrec->this_mirror, + ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror, failrec->bio_flags, 0); if (ret) { free_io_failure(inode, failrec); @@ -2713,8 +2724,8 @@ struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) } -static int __must_check submit_one_bio(int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) +static int __must_check submit_one_bio(struct bio *bio, int mirror_num, + unsigned long bio_flags) { int ret = 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; @@ -2725,33 +2736,32 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, start = page_offset(page) + bvec->bv_offset; bio->bi_private = NULL; - bio_get(bio); if (tree->ops && tree->ops->submit_bio_hook) - ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, + ret = tree->ops->submit_bio_hook(page->mapping->host, bio, mirror_num, bio_flags, start); else - btrfsic_submit_bio(rw, bio); + btrfsic_submit_bio(bio); bio_put(bio); return ret; } -static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page, +static int merge_bio(struct extent_io_tree *tree, struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags) { int ret = 0; if (tree->ops && tree->ops->merge_bio_hook) - ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio, + ret = tree->ops->merge_bio_hook(page, offset, size, bio, bio_flags); BUG_ON(ret < 0); return ret; } -static int submit_extent_page(int rw, struct extent_io_tree *tree, +static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree, struct writeback_control *wbc, struct page *page, sector_t sector, size_t size, unsigned long offset, @@ -2779,10 +2789,9 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, if (prev_bio_flags != bio_flags || !contig || force_bio_submit || - merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || + merge_bio(tree, page, offset, page_size, bio, bio_flags) || bio_add_page(bio, page, page_size, offset) < page_size) { - ret = submit_one_bio(rw, bio, mirror_num, - prev_bio_flags); + ret = submit_one_bio(bio, mirror_num, prev_bio_flags); if (ret < 0) { *bio_ret = NULL; return ret; @@ -2803,6 +2812,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; + bio_set_op_attrs(bio, op, op_flags); if (wbc) { wbc_init_bio(wbc, bio); wbc_account_io(wbc, page, page_size); @@ -2811,7 +2821,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, if (bio_ret) *bio_ret = bio; else - ret = submit_one_bio(rw, bio, mirror_num, bio_flags); + ret = submit_one_bio(bio, mirror_num, bio_flags); return ret; } @@ -2875,7 +2885,7 @@ static int __do_readpage(struct extent_io_tree *tree, get_extent_t *get_extent, struct extent_map **em_cached, struct bio **bio, int mirror_num, - unsigned long *bio_flags, int rw, + unsigned long *bio_flags, int read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; @@ -3058,8 +3068,8 @@ static int __do_readpage(struct extent_io_tree *tree, } pnr -= page->index; - ret = submit_extent_page(rw, tree, NULL, page, - sector, disk_io_size, pg_offset, + ret = submit_extent_page(REQ_OP_READ, read_flags, tree, NULL, + page, sector, disk_io_size, pg_offset, bdev, bio, pnr, end_bio_extent_readpage, mirror_num, *bio_flags, @@ -3090,7 +3100,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree, get_extent_t *get_extent, struct extent_map **em_cached, struct bio **bio, int mirror_num, - unsigned long *bio_flags, int rw, + unsigned long *bio_flags, u64 *prev_em_start) { struct inode *inode; @@ -3111,7 +3121,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree, for (index = 0; index < nr_pages; index++) { __do_readpage(tree, pages[index], get_extent, em_cached, bio, - mirror_num, bio_flags, rw, prev_em_start); + mirror_num, bio_flags, 0, prev_em_start); put_page(pages[index]); } } @@ -3121,7 +3131,7 @@ static void __extent_readpages(struct extent_io_tree *tree, int nr_pages, get_extent_t *get_extent, struct extent_map **em_cached, struct bio **bio, int mirror_num, - unsigned long *bio_flags, int rw, + unsigned long *bio_flags, u64 *prev_em_start) { u64 start = 0; @@ -3143,7 +3153,7 @@ static void __extent_readpages(struct extent_io_tree *tree, index - first_index, start, end, get_extent, em_cached, bio, mirror_num, bio_flags, - rw, prev_em_start); + prev_em_start); start = page_start; end = start + PAGE_SIZE - 1; first_index = index; @@ -3154,7 +3164,7 @@ static void __extent_readpages(struct extent_io_tree *tree, __do_contiguous_readpages(tree, &pages[first_index], index - first_index, start, end, get_extent, em_cached, bio, - mirror_num, bio_flags, rw, + mirror_num, bio_flags, prev_em_start); } @@ -3162,7 +3172,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, struct bio **bio, int mirror_num, - unsigned long *bio_flags, int rw) + unsigned long *bio_flags, int read_flags) { struct inode *inode = page->mapping->host; struct btrfs_ordered_extent *ordered; @@ -3182,7 +3192,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, } ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, - bio_flags, rw, NULL); + bio_flags, read_flags, NULL); return ret; } @@ -3194,9 +3204,9 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, int ret; ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, - &bio_flags, READ); + &bio_flags, 0); if (bio) - ret = submit_one_bio(READ, bio, mirror_num, bio_flags); + ret = submit_one_bio(bio, mirror_num, bio_flags); return ret; } @@ -3430,8 +3440,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, page->index, cur, end); } - ret = submit_extent_page(write_flags, tree, wbc, page, - sector, iosize, pg_offset, + ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc, + page, sector, iosize, pg_offset, bdev, &epd->bio, max_nr, end_bio_extent_writepage, 0, 0, 0, false); @@ -3470,13 +3480,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, size_t pg_offset = 0; loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_SHIFT; - int write_flags; + int write_flags = 0; unsigned long nr_written = 0; if (wbc->sync_mode == WB_SYNC_ALL) write_flags = WRITE_SYNC; - else - write_flags = WRITE; trace___extent_writepage(page, inode, wbc); @@ -3720,7 +3728,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, u64 offset = eb->start; unsigned long i, num_pages; unsigned long bio_flags = 0; - int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META; + int write_flags = (epd->sync_io ? WRITE_SYNC : 0) | REQ_META; int ret = 0; clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); @@ -3734,9 +3742,10 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); - ret = submit_extent_page(rw, tree, wbc, p, offset >> 9, - PAGE_SIZE, 0, bdev, &epd->bio, - -1, end_bio_extent_buffer_writepage, + ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc, + p, offset >> 9, PAGE_SIZE, 0, bdev, + &epd->bio, -1, + end_bio_extent_buffer_writepage, 0, epd->bio_flags, bio_flags, false); epd->bio_flags = bio_flags; if (ret) { @@ -4046,13 +4055,12 @@ retry: static void flush_epd_write_bio(struct extent_page_data *epd) { if (epd->bio) { - int rw = WRITE; int ret; - if (epd->sync_io) - rw = WRITE_SYNC; + bio_set_op_attrs(epd->bio, REQ_OP_WRITE, + epd->sync_io ? WRITE_SYNC : 0); - ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags); + ret = submit_one_bio(epd->bio, 0, epd->bio_flags); BUG_ON(ret < 0); /* -ENOMEM */ epd->bio = NULL; } @@ -4170,7 +4178,8 @@ int extent_readpages(struct extent_io_tree *tree, prefetchw(&page->flags); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, - page->index, GFP_NOFS)) { + page->index, + readahead_gfp_mask(mapping))) { put_page(page); continue; } @@ -4179,19 +4188,19 @@ int extent_readpages(struct extent_io_tree *tree, if (nr < ARRAY_SIZE(pagepool)) continue; __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, - &bio, 0, &bio_flags, READ, &prev_em_start); + &bio, 0, &bio_flags, &prev_em_start); nr = 0; } if (nr) __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, - &bio, 0, &bio_flags, READ, &prev_em_start); + &bio, 0, &bio_flags, &prev_em_start); if (em_cached) free_extent_map(em_cached); BUG_ON(!list_empty(pages)); if (bio) - return submit_one_bio(READ, bio, 0, bio_flags); + return submit_one_bio(bio, 0, bio_flags); return 0; } @@ -4718,16 +4727,16 @@ err: } struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) + u64 start, u32 nodesize) { unsigned long len; if (!fs_info) { /* * Called only from tests that don't always have a fs_info - * available, but we know that nodesize is 4096 + * available */ - len = 4096; + len = nodesize; } else { len = fs_info->tree_root->nodesize; } @@ -4823,7 +4832,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) + u64 start, u32 nodesize) { struct extent_buffer *eb, *exists = NULL; int ret; @@ -4831,7 +4840,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, eb = find_extent_buffer(fs_info, start); if (eb) return eb; - eb = alloc_dummy_extent_buffer(fs_info, start); + eb = alloc_dummy_extent_buffer(fs_info, start, nodesize); if (!eb) return NULL; eb->fs_info = fs_info; @@ -4882,18 +4891,25 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, int uptodate = 1; int ret; + if (!IS_ALIGNED(start, fs_info->tree_root->sectorsize)) { + btrfs_err(fs_info, "bad tree block start %llu", start); + return ERR_PTR(-EINVAL); + } + eb = find_extent_buffer(fs_info, start); if (eb) return eb; eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) - return NULL; + return ERR_PTR(-ENOMEM); for (i = 0; i < num_pages; i++, index++) { p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); - if (!p) + if (!p) { + exists = ERR_PTR(-ENOMEM); goto free_eb; + } spin_lock(&mapping->private_lock); if (PagePrivate(p)) { @@ -4938,8 +4954,10 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); again: ret = radix_tree_preload(GFP_NOFS); - if (ret) + if (ret) { + exists = ERR_PTR(ret); goto free_eb; + } spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, @@ -5217,7 +5235,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, err = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, &bio_flags, - READ | REQ_META); + REQ_META); if (err) ret = err; } else { @@ -5226,8 +5244,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, } if (bio) { - err = submit_one_bio(READ | REQ_META, bio, mirror_num, - bio_flags); + err = submit_one_bio(bio, mirror_num, bio_flags); if (err) return err; } @@ -5323,6 +5340,11 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv, return ret; } +/* + * return 0 if the item is found within a page. + * return 1 if the item spans two pages. + * return -EINVAL otherwise. + */ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, unsigned long min_len, char **map, unsigned long *map_start, @@ -5337,7 +5359,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, PAGE_SHIFT; if (i != end_i) - return -EINVAL; + return 1; if (i == 0) { offset = start_offset; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 1baf19c9b79d..bc2729a7612d 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -63,16 +63,16 @@ struct btrfs_root; struct btrfs_io_bio; struct io_failure_record; -typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, - struct bio *bio, int mirror_num, - unsigned long bio_flags, u64 bio_offset); +typedef int (extent_submit_bio_hook_t)(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset); struct extent_io_ops { int (*fill_delalloc)(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written); int (*writepage_start_hook)(struct page *page, u64 start, u64 end); extent_submit_bio_hook_t *submit_bio_hook; - int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset, + int (*merge_bio_hook)(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); @@ -348,7 +348,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start); + u64 start, u32 nodesize); struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); @@ -468,5 +468,5 @@ noinline u64 find_lock_delalloc_range(struct inode *inode, u64 *end, u64 max_bytes); #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start); + u64 start, u32 nodesize); #endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e0c9bd3fb02d..2234e88cf674 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1534,30 +1534,30 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, reserve_bytes = round_up(write_bytes + sector_offset, root->sectorsize); - if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | - BTRFS_INODE_PREALLOC)) && - check_can_nocow(inode, pos, &write_bytes) > 0) { - /* - * For nodata cow case, no need to reserve - * data space. - */ - only_release_metadata = true; - /* - * our prealloc extent may be smaller than - * write_bytes, so scale down. - */ - num_pages = DIV_ROUND_UP(write_bytes + offset, - PAGE_SIZE); - reserve_bytes = round_up(write_bytes + sector_offset, - root->sectorsize); - goto reserve_metadata; - } - ret = btrfs_check_data_free_space(inode, pos, write_bytes); - if (ret < 0) - break; + if (ret < 0) { + if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) && + check_can_nocow(inode, pos, &write_bytes) > 0) { + /* + * For nodata cow case, no need to reserve + * data space. + */ + only_release_metadata = true; + /* + * our prealloc extent may be smaller than + * write_bytes, so scale down. + */ + num_pages = DIV_ROUND_UP(write_bytes + offset, + PAGE_SIZE); + reserve_bytes = round_up(write_bytes + + sector_offset, + root->sectorsize); + } else { + break; + } + } -reserve_metadata: ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes); if (ret) { if (!only_release_metadata) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index c6dc1183f542..69d270f6602c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -29,7 +29,7 @@ #include "inode-map.h" #include "volumes.h" -#define BITS_PER_BITMAP (PAGE_SIZE * 8) +#define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_32K struct btrfs_trim_range { @@ -1415,11 +1415,11 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) { u64 bitmap_start; - u32 bytes_per_bitmap; + u64 bytes_per_bitmap; bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; bitmap_start = offset - ctl->start; - bitmap_start = div_u64(bitmap_start, bytes_per_bitmap); + bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); bitmap_start *= bytes_per_bitmap; bitmap_start += ctl->start; @@ -1638,10 +1638,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) u64 bitmap_bytes; u64 extent_bytes; u64 size = block_group->key.offset; - u32 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; - u32 max_bitmaps = div_u64(size + bytes_per_bg - 1, bytes_per_bg); + u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; + u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); - max_bitmaps = max_t(u32, max_bitmaps, 1); + max_bitmaps = max_t(u64, max_bitmaps, 1); ASSERT(ctl->total_bitmaps <= max_bitmaps); @@ -1660,7 +1660,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as * we add more bitmaps. */ - bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_SIZE; + bitmap_bytes = (ctl->total_bitmaps + 1) * ctl->unit; if (bitmap_bytes >= max_bytes) { ctl->extents_thresh = 0; @@ -3662,7 +3662,7 @@ have_info: if (tmp->offset + tmp->bytes < offset) break; if (offset + bytes < tmp->offset) { - n = rb_prev(&info->offset_index); + n = rb_prev(&tmp->offset_index); continue; } info = tmp; @@ -3676,7 +3676,7 @@ have_info: if (offset + bytes < tmp->offset) break; if (tmp->offset + tmp->bytes < offset) { - n = rb_next(&info->offset_index); + n = rb_next(&tmp->offset_index); continue; } info = tmp; diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c index aae520b2aee5..a97fdc156a03 100644 --- a/fs/btrfs/hash.c +++ b/fs/btrfs/hash.c @@ -24,6 +24,11 @@ int __init btrfs_hash_init(void) return PTR_ERR_OR_ZERO(tfm); } +const char* btrfs_crc32c_impl(void) +{ + return crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm)); +} + void btrfs_hash_exit(void) { crypto_free_shash(tfm); diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h index 118a2316e5d3..c3a2ec554361 100644 --- a/fs/btrfs/hash.h +++ b/fs/btrfs/hash.h @@ -22,6 +22,7 @@ int __init btrfs_hash_init(void); void btrfs_hash_exit(void); +const char* btrfs_crc32c_impl(void); u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 270499598ed4..df731c0ebec7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1823,7 +1823,7 @@ static void btrfs_clear_bit_hook(struct inode *inode, * extent_io.c merge_bio_hook, this must check the chunk tree to make sure * we don't create bios that span stripes or chunks */ -int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags) { @@ -1838,7 +1838,7 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, length = bio->bi_iter.bi_size; map_length = length; - ret = btrfs_map_block(root->fs_info, rw, logical, + ret = btrfs_map_block(root->fs_info, bio_op(bio), logical, &map_length, NULL, 0); /* Will always return 0 with map_multi == NULL */ BUG_ON(ret < 0); @@ -1855,9 +1855,8 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static int __btrfs_submit_bio_start(struct inode *inode, int rw, - struct bio *bio, int mirror_num, - unsigned long bio_flags, +static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags, u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -1876,14 +1875,14 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, +static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; - ret = btrfs_map_bio(root, rw, bio, mirror_num, 1); + ret = btrfs_map_bio(root, bio, mirror_num, 1); if (ret) { bio->bi_error = ret; bio_endio(bio); @@ -1895,7 +1894,7 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, * extent_io.c submission hook. This does the right thing for csum calculation * on write, or reading the csums from the tree before a read */ -static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, +static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { @@ -1910,7 +1909,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, if (btrfs_is_free_space_inode(inode)) metadata = BTRFS_WQ_ENDIO_FREE_SPACE; - if (!(rw & REQ_WRITE)) { + if (bio_op(bio) != REQ_OP_WRITE) { ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); if (ret) goto out; @@ -1932,7 +1931,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, goto mapit; /* we're doing a write, do the async checksumming */ ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, - inode, rw, bio, mirror_num, + inode, bio, mirror_num, bio_flags, bio_offset, __btrfs_submit_bio_start, __btrfs_submit_bio_done); @@ -1944,7 +1943,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, } mapit: - ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); + ret = btrfs_map_bio(root, bio, mirror_num, 0); out: if (ret < 0) { @@ -3271,7 +3270,16 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) /* grab metadata reservation from transaction handle */ if (reserve) { ret = btrfs_orphan_reserve_metadata(trans, inode); - BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */ + ASSERT(!ret); + if (ret) { + atomic_dec(&root->orphan_inodes); + clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, + &BTRFS_I(inode)->runtime_flags); + if (insert) + clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags); + return ret; + } } /* insert an orphan item to track this unlinked/truncated file */ @@ -4549,6 +4557,7 @@ delete: BUG_ON(ret); if (btrfs_should_throttle_delayed_refs(trans, root)) btrfs_async_run_delayed_refs(root, + trans->transid, trans->delayed_ref_updates * 2, 0); if (be_nice) { if (truncate_space_check(trans, root, @@ -5748,6 +5757,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) int name_len; int is_curr = 0; /* ctx->pos points to the current index? */ bool emitted; + bool put = false; /* FIXME, use a real flag for deciding about the key type */ if (root->fs_info->tree_root == root) @@ -5765,7 +5775,8 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (key_type == BTRFS_DIR_INDEX_KEY) { INIT_LIST_HEAD(&ins_list); INIT_LIST_HEAD(&del_list); - btrfs_get_delayed_items(inode, &ins_list, &del_list); + put = btrfs_readdir_get_delayed_items(inode, &ins_list, + &del_list); } key.type = key_type; @@ -5912,8 +5923,8 @@ next: nopos: ret = 0; err: - if (key_type == BTRFS_DIR_INDEX_KEY) - btrfs_put_delayed_items(&ins_list, &del_list); + if (put) + btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list); btrfs_free_path(path); return ret; } @@ -6979,7 +6990,18 @@ insert: * existing will always be non-NULL, since there must be * extent causing the -EEXIST. */ - if (start >= extent_map_end(existing) || + if (existing->start == em->start && + extent_map_end(existing) == extent_map_end(em) && + em->block_start == existing->block_start) { + /* + * these two extents are the same, it happens + * with inlines especially + */ + free_extent_map(em); + em = existing; + err = 0; + + } else if (start >= extent_map_end(existing) || start <= existing->start) { /* * The existing extent map is the one nearest to @@ -7767,12 +7789,12 @@ err: } static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio, - int rw, int mirror_num) + int mirror_num) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; - BUG_ON(rw & REQ_WRITE); + BUG_ON(bio_op(bio) == REQ_OP_WRITE); bio_get(bio); @@ -7781,7 +7803,7 @@ static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio, if (ret) goto err; - ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); + ret = btrfs_map_bio(root, bio, mirror_num, 0); err: bio_put(bio); return ret; @@ -7832,7 +7854,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, int read_mode; int ret; - BUG_ON(failed_bio->bi_rw & REQ_WRITE); + BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); ret = btrfs_get_io_failure_record(inode, start, end, &failrec); if (ret) @@ -7860,13 +7882,13 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, free_io_failure(inode, failrec); return -EIO; } + bio_set_op_attrs(bio, REQ_OP_READ, read_mode); btrfs_debug(BTRFS_I(inode)->root->fs_info, "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n", read_mode, failrec->this_mirror, failrec->in_validation); - ret = submit_dio_repair_bio(inode, bio, read_mode, - failrec->this_mirror); + ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror); if (ret) { free_io_failure(inode, failrec); bio_put(bio); @@ -8156,7 +8178,7 @@ static void btrfs_endio_direct_write(struct bio *bio) bio_put(bio); } -static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, +static int __btrfs_submit_bio_start_direct_io(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 offset) { @@ -8174,8 +8196,8 @@ static void btrfs_end_dio_bio(struct bio *bio) if (err) btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, - "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d", - btrfs_ino(dip->inode), bio->bi_rw, + "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", + btrfs_ino(dip->inode), bio_op(bio), bio->bi_rw, (unsigned long long)bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); @@ -8249,11 +8271,11 @@ static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root, } static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, - int rw, u64 file_offset, int skip_sum, + u64 file_offset, int skip_sum, int async_submit) { struct btrfs_dio_private *dip = bio->bi_private; - int write = rw & REQ_WRITE; + bool write = bio_op(bio) == REQ_OP_WRITE; struct btrfs_root *root = BTRFS_I(inode)->root; int ret; @@ -8274,8 +8296,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, if (write && async_submit) { ret = btrfs_wq_submit_bio(root->fs_info, - inode, rw, bio, 0, 0, - file_offset, + inode, bio, 0, 0, file_offset, __btrfs_submit_bio_start_direct_io, __btrfs_submit_bio_done); goto err; @@ -8294,13 +8315,13 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, goto err; } map: - ret = btrfs_map_bio(root, rw, bio, 0, async_submit); + ret = btrfs_map_bio(root, bio, 0, async_submit); err: bio_put(bio); return ret; } -static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, +static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, int skip_sum) { struct inode *inode = dip->inode; @@ -8319,8 +8340,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, int i; map_length = orig_bio->bi_iter.bi_size; - ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, - &map_length, NULL, 0); + ret = btrfs_map_block(root->fs_info, bio_op(orig_bio), + start_sector << 9, &map_length, NULL, 0); if (ret) return -EIO; @@ -8340,6 +8361,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, if (!bio) return -ENOMEM; + bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_rw); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; @@ -8359,7 +8381,7 @@ next_block: * before we're done setting it up */ atomic_inc(&dip->pending_bios); - ret = __btrfs_submit_dio_bio(bio, inode, rw, + ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, async_submit); if (ret) { @@ -8377,12 +8399,13 @@ next_block: start_sector, GFP_NOFS); if (!bio) goto out_err; + bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_rw); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; map_length = orig_bio->bi_iter.bi_size; - ret = btrfs_map_block(root->fs_info, rw, + ret = btrfs_map_block(root->fs_info, bio_op(orig_bio), start_sector << 9, &map_length, NULL, 0); if (ret) { @@ -8402,7 +8425,7 @@ next_block: } submit: - ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, + ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, async_submit); if (!ret) return 0; @@ -8422,14 +8445,14 @@ out_err: return 0; } -static void btrfs_submit_direct(int rw, struct bio *dio_bio, - struct inode *inode, loff_t file_offset) +static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, + loff_t file_offset) { struct btrfs_dio_private *dip = NULL; struct bio *io_bio = NULL; struct btrfs_io_bio *btrfs_bio; int skip_sum; - int write = rw & REQ_WRITE; + bool write = (bio_op(dio_bio) == REQ_OP_WRITE); int ret = 0; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; @@ -8480,7 +8503,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dio_data->unsubmitted_oe_range_end; } - ret = btrfs_submit_direct_hook(rw, dip, skip_sum); + ret = btrfs_submit_direct_hook(dip, skip_sum); if (!ret) return; @@ -10514,7 +10537,7 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = { static const struct file_operations btrfs_dir_file_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = btrfs_real_readdir, + .iterate_shared = btrfs_real_readdir, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_compat_ioctl, diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 559170464d7c..aca8264f4a49 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -718,12 +718,13 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, return count; } -void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, +int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, const u64 range_start, const u64 range_len) { struct btrfs_root *root; struct list_head splice; int done; + int total_done = 0; INIT_LIST_HEAD(&splice); @@ -742,6 +743,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, done = btrfs_wait_ordered_extents(root, nr, range_start, range_len); btrfs_put_fs_root(root); + total_done += done; spin_lock(&fs_info->ordered_root_lock); if (nr != -1) { @@ -752,6 +754,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, list_splice_tail(&splice, &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); mutex_unlock(&fs_info->ordered_operations_mutex); + + return total_done; } /* @@ -964,6 +968,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct rb_node *prev = NULL; struct btrfs_ordered_extent *test; int ret = 1; + u64 orig_offset = offset; spin_lock_irq(&tree->lock); if (ordered) { @@ -979,7 +984,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, /* truncate file */ if (disk_i_size > i_size) { - BTRFS_I(inode)->disk_i_size = i_size; + BTRFS_I(inode)->disk_i_size = orig_offset; ret = 0; goto out; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 2049c9be85ee..451507776ff5 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -199,7 +199,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum, int len); int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, const u64 range_start, const u64 range_len); -void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, +int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, const u64 range_start, const u64 range_len); void btrfs_get_logged_extents(struct inode *inode, struct list_head *logged_list, diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index f8b6d411a034..cd8d302a1f61 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1320,7 +1320,9 @@ write_data: bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; - submit_bio(WRITE, bio); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + submit_bio(bio); } return; @@ -1573,11 +1575,12 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) bio->bi_private = rbio; bio->bi_end_io = raid_rmw_end_io; + bio_set_op_attrs(bio, REQ_OP_READ, 0); btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - submit_bio(READ, bio); + submit_bio(bio); } /* the actual write will happen once the reads are done */ return 0; @@ -2097,11 +2100,12 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) bio->bi_private = rbio; bio->bi_end_io = raid_recover_end_io; + bio_set_op_attrs(bio, REQ_OP_READ, 0); btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - submit_bio(READ, bio); + submit_bio(bio); } out: return 0; @@ -2433,7 +2437,9 @@ submit_write: bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; - submit_bio(WRITE, bio); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + submit_bio(bio); } return; @@ -2610,11 +2616,12 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) bio->bi_private = rbio; bio->bi_end_io = raid56_parity_scrub_end_io; + bio_set_op_attrs(bio, REQ_OP_READ, 0); btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - submit_bio(READ, bio); + submit_bio(bio); } /* the actual write will happen once the reads are done */ return; diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 298631eaee78..8428db7cd88f 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -761,12 +761,14 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) do { enqueued = 0; + mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { if (atomic_read(&device->reada_in_flight) < MAX_IN_FLIGHT) enqueued += reada_start_machine_dev(fs_info, device); } + mutex_unlock(&fs_devices->device_list_mutex); total += enqueued; } while (enqueued && total < 10000); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 46d847f66e4b..e08b6bc676e3 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1504,8 +1504,9 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, sblock->no_io_error_seen = 0; } else { bio->bi_iter.bi_sector = page->physical >> 9; + bio_set_op_attrs(bio, REQ_OP_READ, 0); - if (btrfsic_submit_bio_wait(READ, bio)) + if (btrfsic_submit_bio_wait(bio)) sblock->no_io_error_seen = 0; } @@ -1583,6 +1584,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return -EIO; bio->bi_bdev = page_bad->dev->bdev; bio->bi_iter.bi_sector = page_bad->physical >> 9; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); if (PAGE_SIZE != ret) { @@ -1590,7 +1592,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return -EIO; } - if (btrfsic_submit_bio_wait(WRITE, bio)) { + if (btrfsic_submit_bio_wait(bio)) { btrfs_dev_stat_inc_and_print(page_bad->dev, BTRFS_DEV_STAT_WRITE_ERRS); btrfs_dev_replace_stats_inc( @@ -1684,6 +1686,7 @@ again: bio->bi_end_io = scrub_wr_bio_end_io; bio->bi_bdev = sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical_for_dev_replace || @@ -1731,7 +1734,7 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) * orders the requests before sending them to the driver which * doubled the write performance on spinning disks when measured * with Linux 3.5 */ - btrfsic_submit_bio(WRITE, sbio->bio); + btrfsic_submit_bio(sbio->bio); } static void scrub_wr_bio_end_io(struct bio *bio) @@ -2041,7 +2044,7 @@ static void scrub_submit(struct scrub_ctx *sctx) sbio = sctx->bios[sctx->curr]; sctx->curr = -1; scrub_pending_bio_inc(sctx); - btrfsic_submit_bio(READ, sbio->bio); + btrfsic_submit_bio(sbio->bio); } static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, @@ -2088,6 +2091,7 @@ again: bio->bi_end_io = scrub_bio_end_io; bio->bi_bdev = sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; + bio_set_op_attrs(bio, REQ_OP_READ, 0); sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || @@ -3582,6 +3586,46 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, */ scrub_pause_on(fs_info); ret = btrfs_inc_block_group_ro(root, cache); + if (!ret && is_dev_replace) { + /* + * If we are doing a device replace wait for any tasks + * that started dellaloc right before we set the block + * group to RO mode, as they might have just allocated + * an extent from it or decided they could do a nocow + * write. And if any such tasks did that, wait for their + * ordered extents to complete and then commit the + * current transaction, so that we can later see the new + * extent items in the extent tree - the ordered extents + * create delayed data references (for cow writes) when + * they complete, which will be run and insert the + * corresponding extent items into the extent tree when + * we commit the transaction they used when running + * inode.c:btrfs_finish_ordered_io(). We later use + * the commit root of the extent tree to find extents + * to copy from the srcdev into the tgtdev, and we don't + * want to miss any new extents. + */ + btrfs_wait_block_group_reservations(cache); + btrfs_wait_nocow_writers(cache); + ret = btrfs_wait_ordered_roots(fs_info, -1, + cache->key.objectid, + cache->key.offset); + if (ret > 0) { + struct btrfs_trans_handle *trans; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + ret = PTR_ERR(trans); + else + ret = btrfs_commit_transaction(trans, + root); + if (ret) { + scrub_pause_off(fs_info); + btrfs_put_block_group(cache); + break; + } + } + } scrub_pause_off(fs_info); if (ret == 0) { @@ -3602,9 +3646,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, break; } + btrfs_dev_replace_lock(&fs_info->dev_replace, 1); dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; + btrfs_dev_replace_unlock(&fs_info->dev_replace, 1); ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, found_key.offset, cache, is_dev_replace); @@ -3640,6 +3686,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, scrub_pause_off(fs_info); + btrfs_dev_replace_lock(&fs_info->dev_replace, 1); + dev_replace->cursor_left = dev_replace->cursor_right; + dev_replace->item_needs_writeback = 1; + btrfs_dev_replace_unlock(&fs_info->dev_replace, 1); + if (ro_set) btrfs_dec_block_group_ro(root, cache); @@ -3677,9 +3728,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, ret = -ENOMEM; break; } - - dev_replace->cursor_left = dev_replace->cursor_right; - dev_replace->item_needs_writeback = 1; skip: key.offset = found_key.offset + length; btrfs_release_path(path); @@ -4392,6 +4440,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; bio->bi_bdev = dev->bdev; + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); ret = bio_add_page(bio, page, PAGE_SIZE, 0); if (ret != PAGE_SIZE) { leave_with_eio: @@ -4400,7 +4449,7 @@ leave_with_eio: return -EIO; } - if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) + if (btrfsic_submit_bio_wait(bio)) goto leave_with_eio; bio_put(bio); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4e59a91a11e0..60e7179ed4b7 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -235,7 +235,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, trans->aborted = errno; /* Nothing used. The other threads that have joined this * transaction may be able to continue. */ - if (!trans->blocks_used && list_empty(&trans->new_bgs)) { + if (!trans->dirty && list_empty(&trans->new_bgs)) { const char *errstr; errstr = btrfs_decode_error(errno); @@ -1807,6 +1807,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } } sb->s_flags &= ~MS_RDONLY; + + fs_info->open = 1; } out: wake_up_process(fs_info->transaction_kthread); @@ -2303,7 +2305,7 @@ static void btrfs_interface_exit(void) static void btrfs_print_mod_info(void) { - printk(KERN_INFO "Btrfs loaded" + printk(KERN_INFO "Btrfs loaded, crc32c=%s" #ifdef CONFIG_BTRFS_DEBUG ", debug=on" #endif @@ -2313,33 +2315,48 @@ static void btrfs_print_mod_info(void) #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY ", integrity-checker=on" #endif - "\n"); + "\n", + btrfs_crc32c_impl()); } static int btrfs_run_sanity_tests(void) { - int ret; - + int ret, i; + u32 sectorsize, nodesize; + u32 test_sectorsize[] = { + PAGE_SIZE, + }; ret = btrfs_init_test_fs(); if (ret) return ret; - - ret = btrfs_test_free_space_cache(); - if (ret) - goto out; - ret = btrfs_test_extent_buffer_operations(); - if (ret) - goto out; - ret = btrfs_test_extent_io(); - if (ret) - goto out; - ret = btrfs_test_inodes(); - if (ret) - goto out; - ret = btrfs_test_qgroups(); - if (ret) - goto out; - ret = btrfs_test_free_space_tree(); + for (i = 0; i < ARRAY_SIZE(test_sectorsize); i++) { + sectorsize = test_sectorsize[i]; + for (nodesize = sectorsize; + nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE; + nodesize <<= 1) { + pr_info("BTRFS: selftest: sectorsize: %u nodesize: %u\n", + sectorsize, nodesize); + ret = btrfs_test_free_space_cache(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_extent_buffer_operations(sectorsize, + nodesize); + if (ret) + goto out; + ret = btrfs_test_extent_io(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_inodes(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_qgroups(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_free_space_tree(sectorsize, nodesize); + if (ret) + goto out; + } + } out: btrfs_destroy_test_fs(); return ret; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index f54bf450bad3..02223f3f78f4 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -68,7 +68,7 @@ int btrfs_init_test_fs(void) if (IS_ERR(test_mnt)) { printk(KERN_ERR "btrfs: cannot mount test file system\n"); unregister_filesystem(&test_type); - return ret; + return PTR_ERR(test_mnt); } return 0; } @@ -175,7 +175,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root) } struct btrfs_block_group_cache * -btrfs_alloc_dummy_block_group(unsigned long length) +btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize) { struct btrfs_block_group_cache *cache; @@ -192,8 +192,8 @@ btrfs_alloc_dummy_block_group(unsigned long length) cache->key.objectid = 0; cache->key.offset = length; cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = 4096; - cache->full_stripe_len = 4096; + cache->sectorsize = sectorsize; + cache->full_stripe_len = sectorsize; INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 054b8c73c951..66fb6b701eb7 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -26,27 +26,28 @@ struct btrfs_root; struct btrfs_trans_handle; -int btrfs_test_free_space_cache(void); -int btrfs_test_extent_buffer_operations(void); -int btrfs_test_extent_io(void); -int btrfs_test_inodes(void); -int btrfs_test_qgroups(void); -int btrfs_test_free_space_tree(void); +int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize); +int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize); +int btrfs_test_extent_io(u32 sectorsize, u32 nodesize); +int btrfs_test_inodes(u32 sectorsize, u32 nodesize); +int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); +int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); int btrfs_init_test_fs(void); void btrfs_destroy_test_fs(void); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void); void btrfs_free_dummy_root(struct btrfs_root *root); struct btrfs_block_group_cache * -btrfs_alloc_dummy_block_group(unsigned long length); +btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize); void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache); void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans); #else -static inline int btrfs_test_free_space_cache(void) +static inline int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) { return 0; } -static inline int btrfs_test_extent_buffer_operations(void) +static inline int btrfs_test_extent_buffer_operations(u32 sectorsize, + u32 nodesize) { return 0; } @@ -57,19 +58,19 @@ static inline int btrfs_init_test_fs(void) static inline void btrfs_destroy_test_fs(void) { } -static inline int btrfs_test_extent_io(void) +static inline int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) { return 0; } -static inline int btrfs_test_inodes(void) +static inline int btrfs_test_inodes(u32 sectorsize, u32 nodesize) { return 0; } -static inline int btrfs_test_qgroups(void) +static inline int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) { return 0; } -static inline int btrfs_test_free_space_tree(void) +static inline int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize) { return 0; } diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index f51963a8f929..4f8cbd1ec5ee 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -22,7 +22,7 @@ #include "../extent_io.h" #include "../disk-io.h" -static int test_btrfs_split_item(void) +static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) { struct btrfs_path *path; struct btrfs_root *root; @@ -40,7 +40,7 @@ static int test_btrfs_split_item(void) test_msg("Running btrfs_split_item tests\n"); - root = btrfs_alloc_dummy_root(); + root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(root)) { test_msg("Could not allocate root\n"); return PTR_ERR(root); @@ -53,7 +53,8 @@ static int test_btrfs_split_item(void) return -ENOMEM; } - path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, 4096); + path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, nodesize, + nodesize); if (!eb) { test_msg("Could not allocate dummy buffer\n"); ret = -ENOMEM; @@ -222,8 +223,8 @@ out: return ret; } -int btrfs_test_extent_buffer_operations(void) +int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize) { - test_msg("Running extent buffer operation tests"); - return test_btrfs_split_item(); + test_msg("Running extent buffer operation tests\n"); + return test_btrfs_split_item(sectorsize, nodesize); } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 55724607f79b..d19ab0317283 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -21,6 +21,7 @@ #include <linux/slab.h> #include <linux/sizes.h> #include "btrfs-tests.h" +#include "../ctree.h" #include "../extent_io.h" #define PROCESS_UNLOCK (1 << 0) @@ -65,7 +66,7 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, return count; } -static int test_find_delalloc(void) +static int test_find_delalloc(u32 sectorsize) { struct inode *inode; struct extent_io_tree tmp; @@ -113,7 +114,7 @@ static int test_find_delalloc(void) * |--- delalloc ---| * |--- search ---| */ - set_extent_delalloc(&tmp, 0, 4095, NULL); + set_extent_delalloc(&tmp, 0, sectorsize - 1, NULL); start = 0; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -122,9 +123,9 @@ static int test_find_delalloc(void) test_msg("Should have found at least one delalloc\n"); goto out_bits; } - if (start != 0 || end != 4095) { - test_msg("Expected start 0 end 4095, got start %Lu end %Lu\n", - start, end); + if (start != 0 || end != (sectorsize - 1)) { + test_msg("Expected start 0 end %u, got start %llu end %llu\n", + sectorsize - 1, start, end); goto out_bits; } unlock_extent(&tmp, start, end); @@ -144,7 +145,7 @@ static int test_find_delalloc(void) test_msg("Couldn't find the locked page\n"); goto out_bits; } - set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL); + set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, NULL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -172,7 +173,7 @@ static int test_find_delalloc(void) * |--- delalloc ---| * |--- search ---| */ - test_start = max_bytes + 4096; + test_start = max_bytes + sectorsize; locked_page = find_lock_page(inode->i_mapping, test_start >> PAGE_SHIFT); if (!locked_page) { @@ -272,6 +273,16 @@ out: return ret; } +/** + * test_bit_in_byte - Determine whether a bit is set in a byte + * @nr: bit number to test + * @addr: Address to start counting from + */ +static inline int test_bit_in_byte(int nr, const u8 *addr) +{ + return 1UL & (addr[nr / BITS_PER_BYTE] >> (nr & (BITS_PER_BYTE - 1))); +} + static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, unsigned long len) { @@ -298,25 +309,29 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, return -EINVAL; } - bitmap_set(bitmap, (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, - sizeof(long) * BITS_PER_BYTE); - extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0, - sizeof(long) * BITS_PER_BYTE); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { - test_msg("Setting straddling pages failed\n"); - return -EINVAL; - } + /* Straddling pages test */ + if (len > PAGE_SIZE) { + bitmap_set(bitmap, + (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, + sizeof(long) * BITS_PER_BYTE); + extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Setting straddling pages failed\n"); + return -EINVAL; + } - bitmap_set(bitmap, 0, len * BITS_PER_BYTE); - bitmap_clear(bitmap, - (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, - sizeof(long) * BITS_PER_BYTE); - extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); - extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0, - sizeof(long) * BITS_PER_BYTE); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { - test_msg("Clearing straddling pages failed\n"); - return -EINVAL; + bitmap_set(bitmap, 0, len * BITS_PER_BYTE); + bitmap_clear(bitmap, + (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, + sizeof(long) * BITS_PER_BYTE); + extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); + extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Clearing straddling pages failed\n"); + return -EINVAL; + } } /* @@ -333,7 +348,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, for (i = 0; i < len * BITS_PER_BYTE; i++) { int bit, bit1; - bit = !!test_bit(i, bitmap); + bit = !!test_bit_in_byte(i, (u8 *)bitmap); bit1 = !!extent_buffer_test_bit(eb, 0, i); if (bit1 != bit) { test_msg("Testing bit pattern failed\n"); @@ -351,15 +366,22 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, return 0; } -static int test_eb_bitmaps(void) +static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) { - unsigned long len = PAGE_SIZE * 4; + unsigned long len; unsigned long *bitmap; struct extent_buffer *eb; int ret; test_msg("Running extent buffer bitmap tests\n"); + /* + * In ppc64, sectorsize can be 64K, thus 4 * 64K will be larger than + * BTRFS_MAX_METADATA_BLOCKSIZE. + */ + len = (sectorsize < BTRFS_MAX_METADATA_BLOCKSIZE) + ? sectorsize * 4 : sectorsize; + bitmap = kmalloc(len, GFP_KERNEL); if (!bitmap) { test_msg("Couldn't allocate test bitmap\n"); @@ -379,7 +401,7 @@ static int test_eb_bitmaps(void) /* Do it over again with an extent buffer which isn't page-aligned. */ free_extent_buffer(eb); - eb = __alloc_dummy_extent_buffer(NULL, PAGE_SIZE / 2, len); + eb = __alloc_dummy_extent_buffer(NULL, nodesize / 2, len); if (!eb) { test_msg("Couldn't allocate test extent buffer\n"); kfree(bitmap); @@ -393,17 +415,17 @@ out: return ret; } -int btrfs_test_extent_io(void) +int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) { int ret; test_msg("Running extent I/O tests\n"); - ret = test_find_delalloc(); + ret = test_find_delalloc(sectorsize); if (ret) goto out; - ret = test_eb_bitmaps(); + ret = test_eb_bitmaps(sectorsize, nodesize); out: test_msg("Extent I/O tests finished\n"); return ret; diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 0eeb8f3d6b67..3956bb2ff84c 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -22,7 +22,7 @@ #include "../disk-io.h" #include "../free-space-cache.h" -#define BITS_PER_BITMAP (PAGE_SIZE * 8) +#define BITS_PER_BITMAP (PAGE_SIZE * 8UL) /* * This test just does basic sanity checking, making sure we can add an extent @@ -99,7 +99,8 @@ static int test_extents(struct btrfs_block_group_cache *cache) return 0; } -static int test_bitmaps(struct btrfs_block_group_cache *cache) +static int test_bitmaps(struct btrfs_block_group_cache *cache, + u32 sectorsize) { u64 next_bitmap_offset; int ret; @@ -139,7 +140,7 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache) * The first bitmap we have starts at offset 0 so the next one is just * at the end of the first bitmap. */ - next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); + next_bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize); /* Test a bit straddling two bitmaps */ ret = test_add_free_space_entry(cache, next_bitmap_offset - SZ_2M, @@ -167,9 +168,10 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache) } /* This is the high grade jackassery */ -static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) +static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache, + u32 sectorsize) { - u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); + u64 bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize); int ret; test_msg("Running bitmap and extent tests\n"); @@ -401,7 +403,8 @@ static int check_cache_empty(struct btrfs_block_group_cache *cache) * requests. */ static int -test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) +test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, + u32 sectorsize) { int ret; u64 offset; @@ -539,7 +542,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * The goal is to test that the bitmap entry space stealing doesn't * steal this space region. */ - ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, 4096); + ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, sectorsize); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; @@ -597,8 +600,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return -ENOENT; } - if (cache->free_space_ctl->free_space != (SZ_1M + 4096)) { - test_msg("Cache free space is not 1Mb + 4Kb\n"); + if (cache->free_space_ctl->free_space != (SZ_1M + sectorsize)) { + test_msg("Cache free space is not 1Mb + %u\n", sectorsize); return -EINVAL; } @@ -611,22 +614,25 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return -EINVAL; } - /* All that remains is a 4Kb free space region in a bitmap. Confirm. */ + /* + * All that remains is a sectorsize free space region in a bitmap. + * Confirm. + */ ret = check_num_extents_and_bitmaps(cache, 1, 1); if (ret) return ret; - if (cache->free_space_ctl->free_space != 4096) { - test_msg("Cache free space is not 4Kb\n"); + if (cache->free_space_ctl->free_space != sectorsize) { + test_msg("Cache free space is not %u\n", sectorsize); return -EINVAL; } offset = btrfs_find_space_for_alloc(cache, - 0, 4096, 0, + 0, sectorsize, 0, &max_extent_size); if (offset != (SZ_128M + SZ_16M)) { - test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n", - offset); + test_msg("Failed to allocate %u, returned offset : %llu\n", + sectorsize, offset); return -EINVAL; } @@ -733,7 +739,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * The goal is to test that the bitmap entry space stealing doesn't * steal this space region. */ - ret = btrfs_add_free_space(cache, SZ_32M, 8192); + ret = btrfs_add_free_space(cache, SZ_32M, 2 * sectorsize); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; @@ -757,7 +763,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) /* * Confirm that our extent entry didn't stole all free space from the - * bitmap, because of the small 8Kb free space region. + * bitmap, because of the small 2 * sectorsize free space region. */ ret = check_num_extents_and_bitmaps(cache, 2, 1); if (ret) @@ -783,8 +789,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return -ENOENT; } - if (cache->free_space_ctl->free_space != (SZ_1M + 8192)) { - test_msg("Cache free space is not 1Mb + 8Kb\n"); + if (cache->free_space_ctl->free_space != (SZ_1M + 2 * sectorsize)) { + test_msg("Cache free space is not 1Mb + %u\n", 2 * sectorsize); return -EINVAL; } @@ -796,21 +802,25 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return -EINVAL; } - /* All that remains is a 8Kb free space region in a bitmap. Confirm. */ + /* + * All that remains is 2 * sectorsize free space region + * in a bitmap. Confirm. + */ ret = check_num_extents_and_bitmaps(cache, 1, 1); if (ret) return ret; - if (cache->free_space_ctl->free_space != 8192) { - test_msg("Cache free space is not 8Kb\n"); + if (cache->free_space_ctl->free_space != 2 * sectorsize) { + test_msg("Cache free space is not %u\n", 2 * sectorsize); return -EINVAL; } offset = btrfs_find_space_for_alloc(cache, - 0, 8192, 0, + 0, 2 * sectorsize, 0, &max_extent_size); if (offset != SZ_32M) { - test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n", + test_msg("Failed to allocate %u, offset: %llu\n", + 2 * sectorsize, offset); return -EINVAL; } @@ -825,7 +835,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return 0; } -int btrfs_test_free_space_cache(void) +int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) { struct btrfs_block_group_cache *cache; struct btrfs_root *root = NULL; @@ -833,13 +843,19 @@ int btrfs_test_free_space_cache(void) test_msg("Running btrfs free space cache tests\n"); - cache = btrfs_alloc_dummy_block_group(1024 * 1024 * 1024); + /* + * For ppc64 (with 64k page size), bytes per bitmap might be + * larger than 1G. To make bitmap test available in ppc64, + * alloc dummy block group whose size cross bitmaps. + */ + cache = btrfs_alloc_dummy_block_group(BITS_PER_BITMAP * sectorsize + + PAGE_SIZE, sectorsize); if (!cache) { test_msg("Couldn't run the tests\n"); return 0; } - root = btrfs_alloc_dummy_root(); + root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out; @@ -855,14 +871,14 @@ int btrfs_test_free_space_cache(void) ret = test_extents(cache); if (ret) goto out; - ret = test_bitmaps(cache); + ret = test_bitmaps(cache, sectorsize); if (ret) goto out; - ret = test_bitmaps_and_extents(cache); + ret = test_bitmaps_and_extents(cache, sectorsize); if (ret) goto out; - ret = test_steal_space_from_bitmap_to_extent(cache); + ret = test_steal_space_from_bitmap_to_extent(cache, sectorsize); out: btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 7cea4462acd5..aac507085ab0 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. */ +#include <linux/types.h> #include "btrfs-tests.h" #include "../ctree.h" #include "../disk-io.h" @@ -30,7 +31,7 @@ struct free_space_extent { * The test cases align their operations to this in order to hit some of the * edge cases in the bitmap code. */ -#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * 4096) +#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * PAGE_SIZE) static int __check_free_space_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, @@ -439,7 +440,8 @@ typedef int (*test_func_t)(struct btrfs_trans_handle *, struct btrfs_block_group_cache *, struct btrfs_path *); -static int run_test(test_func_t test_func, int bitmaps) +static int run_test(test_func_t test_func, int bitmaps, + u32 sectorsize, u32 nodesize) { struct btrfs_root *root = NULL; struct btrfs_block_group_cache *cache = NULL; @@ -447,7 +449,7 @@ static int run_test(test_func_t test_func, int bitmaps) struct btrfs_path *path = NULL; int ret; - root = btrfs_alloc_dummy_root(); + root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(root)) { test_msg("Couldn't allocate dummy root\n"); ret = PTR_ERR(root); @@ -466,7 +468,8 @@ static int run_test(test_func_t test_func, int bitmaps) root->fs_info->free_space_root = root; root->fs_info->tree_root = root; - root->node = alloc_test_extent_buffer(root->fs_info, 4096); + root->node = alloc_test_extent_buffer(root->fs_info, + nodesize, nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); ret = -ENOMEM; @@ -474,9 +477,9 @@ static int run_test(test_func_t test_func, int bitmaps) } btrfs_set_header_level(root->node, 0); btrfs_set_header_nritems(root->node, 0); - root->alloc_bytenr += 8192; + root->alloc_bytenr += 2 * nodesize; - cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE); + cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE, sectorsize); if (!cache) { test_msg("Couldn't allocate dummy block group cache\n"); ret = -ENOMEM; @@ -534,17 +537,18 @@ out: return ret; } -static int run_test_both_formats(test_func_t test_func) +static int run_test_both_formats(test_func_t test_func, + u32 sectorsize, u32 nodesize) { int ret; - ret = run_test(test_func, 0); + ret = run_test(test_func, 0, sectorsize, nodesize); if (ret) return ret; - return run_test(test_func, 1); + return run_test(test_func, 1, sectorsize, nodesize); } -int btrfs_test_free_space_tree(void) +int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize) { test_func_t tests[] = { test_empty_block_group, @@ -561,9 +565,11 @@ int btrfs_test_free_space_tree(void) test_msg("Running free space tree tests\n"); for (i = 0; i < ARRAY_SIZE(tests); i++) { - int ret = run_test_both_formats(tests[i]); + int ret = run_test_both_formats(tests[i], sectorsize, + nodesize); if (ret) { - test_msg("%pf failed\n", tests[i]); + test_msg("%pf : sectorsize %u failed\n", + tests[i], sectorsize); return ret; } } diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 8a25fe8b7c45..29648c0a39f1 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. */ +#include <linux/types.h> #include "btrfs-tests.h" #include "../ctree.h" #include "../btrfs_inode.h" @@ -86,19 +87,19 @@ static void insert_inode_item_key(struct btrfs_root *root) * diagram of how the extents will look though this may not be possible we still * want to make sure everything acts normally (the last number is not inclusive) * - * [0 - 5][5 - 6][6 - 10][10 - 4096][ 4096 - 8192 ][8192 - 12288] - * [hole ][inline][ hole ][ regular ][regular1 split][ hole ] + * [0 - 5][5 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291] + * [hole ][inline][hole but no extent][ hole ][ regular ][regular1 split] * - * [ 12288 - 20480][20480 - 24576][ 24576 - 28672 ][28672 - 36864][36864 - 45056] - * [regular1 split][ prealloc1 ][prealloc1 written][ prealloc1 ][ compressed ] + * [12291 - 16387][16387 - 24579][24579 - 28675][ 28675 - 32771][32771 - 36867 ] + * [ hole ][regular1 split][ prealloc ][ prealloc1 ][prealloc1 written] * - * [45056 - 49152][49152-53248][53248-61440][61440-65536][ 65536+81920 ] - * [ compressed1 ][ regular ][compressed1][ regular ][ hole but no extent] + * [36867 - 45059][45059 - 53251][53251 - 57347][57347 - 61443][61443- 69635] + * [ prealloc1 ][ compressed ][ compressed1 ][ regular ][ compressed1] * - * [81920-86016] - * [ regular ] + * [69635-73731][ 73731 - 86019 ][86019-90115] + * [ regular ][ hole but no extent][ regular ] */ -static void setup_file_extents(struct btrfs_root *root) +static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) { int slot = 0; u64 disk_bytenr = SZ_1M; @@ -119,7 +120,7 @@ static void setup_file_extents(struct btrfs_root *root) insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0, slot); slot++; - offset = 4096; + offset = sectorsize; /* Now another hole */ insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, @@ -128,99 +129,106 @@ static void setup_file_extents(struct btrfs_root *root) offset += 4; /* Now for a regular extent */ - insert_extent(root, offset, 4095, 4095, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, 0, slot); + insert_extent(root, offset, sectorsize - 1, sectorsize - 1, 0, + disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - disk_bytenr += 4096; - offset += 4095; + disk_bytenr += sectorsize; + offset += sectorsize - 1; /* * Now for 3 extents that were split from a hole punch so we test * offsets properly. */ - insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384, - BTRFS_FILE_EXTENT_REG, 0, slot); + insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr, + 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 4096; - insert_extent(root, offset, 4096, 4096, 0, 0, 0, BTRFS_FILE_EXTENT_REG, - 0, slot); + offset += sectorsize; + insert_extent(root, offset, sectorsize, sectorsize, 0, 0, 0, + BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 4096; - insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384, + offset += sectorsize; + insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, + 2 * sectorsize, disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 8192; - disk_bytenr += 16384; + offset += 2 * sectorsize; + disk_bytenr += 4 * sectorsize; /* Now for a unwritten prealloc extent */ - insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); slot++; - offset += 4096; + offset += sectorsize; /* * We want to jack up disk_bytenr a little more so the em stuff doesn't * merge our records. */ - disk_bytenr += 8192; + disk_bytenr += 2 * sectorsize; /* * Now for a partially written prealloc extent, basically the same as * the hole punch example above. Ram_bytes never changes when you mark * extents written btw. */ - insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384, - BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr, + 4 * sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); slot++; - offset += 4096; - insert_extent(root, offset, 4096, 16384, 4096, disk_bytenr, 16384, - BTRFS_FILE_EXTENT_REG, 0, slot); + offset += sectorsize; + insert_extent(root, offset, sectorsize, 4 * sectorsize, sectorsize, + disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, + slot); slot++; - offset += 4096; - insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384, + offset += sectorsize; + insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, + 2 * sectorsize, disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); slot++; - offset += 8192; - disk_bytenr += 16384; + offset += 2 * sectorsize; + disk_bytenr += 4 * sectorsize; /* Now a normal compressed extent */ - insert_extent(root, offset, 8192, 8192, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); + insert_extent(root, offset, 2 * sectorsize, 2 * sectorsize, 0, + disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, + BTRFS_COMPRESS_ZLIB, slot); slot++; - offset += 8192; + offset += 2 * sectorsize; /* No merges */ - disk_bytenr += 8192; + disk_bytenr += 2 * sectorsize; /* Now a split compressed extent */ - insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); + insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_REG, + BTRFS_COMPRESS_ZLIB, slot); slot++; - offset += 4096; - insert_extent(root, offset, 4096, 4096, 0, disk_bytenr + 4096, 4096, + offset += sectorsize; + insert_extent(root, offset, sectorsize, sectorsize, 0, + disk_bytenr + sectorsize, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 4096; - insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 4096, + offset += sectorsize; + insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, + 2 * sectorsize, disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); slot++; - offset += 8192; - disk_bytenr += 8192; + offset += 2 * sectorsize; + disk_bytenr += 2 * sectorsize; /* Now extents that have a hole but no hole extent */ - insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, 0, slot); + insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 16384; - disk_bytenr += 4096; - insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, 0, slot); + offset += 4 * sectorsize; + disk_bytenr += sectorsize; + insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); } static unsigned long prealloc_only = 0; static unsigned long compressed_only = 0; static unsigned long vacancy_only = 0; -static noinline int test_btrfs_get_extent(void) +static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) { struct inode *inode = NULL; struct btrfs_root *root = NULL; @@ -240,7 +248,7 @@ static noinline int test_btrfs_get_extent(void) BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; - root = btrfs_alloc_dummy_root(); + root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(root)) { test_msg("Couldn't allocate root\n"); goto out; @@ -256,7 +264,7 @@ static noinline int test_btrfs_get_extent(void) goto out; } - root->node = alloc_dummy_extent_buffer(NULL, 4096); + root->node = alloc_dummy_extent_buffer(NULL, nodesize, nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); goto out; @@ -273,7 +281,7 @@ static noinline int test_btrfs_get_extent(void) /* First with no extents */ BTRFS_I(inode)->root = root; - em = btrfs_get_extent(inode, NULL, 0, 0, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, 0, sectorsize, 0); if (IS_ERR(em)) { em = NULL; test_msg("Got an error when we shouldn't have\n"); @@ -295,7 +303,7 @@ static noinline int test_btrfs_get_extent(void) * setup_file_extents, so if you change anything there you need to * update the comment and update the expected values below. */ - setup_file_extents(root); + setup_file_extents(root, sectorsize); em = btrfs_get_extent(inode, NULL, 0, 0, (u64)-1, 0); if (IS_ERR(em)) { @@ -318,7 +326,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -327,7 +335,8 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected an inline, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4091) { + + if (em->start != offset || em->len != (sectorsize - 5)) { test_msg("Unexpected extent wanted start %llu len 1, got start " "%llu len %llu\n", offset, em->start, em->len); goto out; @@ -344,7 +353,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -366,7 +375,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* Regular extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -375,7 +384,7 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4095) { + if (em->start != offset || em->len != sectorsize - 1) { test_msg("Unexpected extent wanted start %llu len 4095, got " "start %llu len %llu\n", offset, em->start, em->len); goto out; @@ -393,7 +402,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* The next 3 are split extents */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -402,9 +411,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -421,7 +431,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -430,9 +440,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a hole, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -442,7 +453,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -451,9 +462,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 8192) { - test_msg("Unexpected extent wanted start %llu len 8192, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 2 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -475,7 +487,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* Prealloc extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -484,9 +496,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != prealloc_only) { @@ -503,7 +516,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* The next 3 are a half written prealloc extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -512,9 +525,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != prealloc_only) { @@ -532,7 +546,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -541,9 +555,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -564,7 +579,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -573,9 +588,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 8192) { - test_msg("Unexpected extent wanted start %llu len 8192, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 2 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != prealloc_only) { @@ -598,7 +614,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* Now for the compressed extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -607,9 +623,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 8192) { - test_msg("Unexpected extent wanted start %llu len 8192, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 2 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u," + "got start %llu len %llu\n", + offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != compressed_only) { @@ -631,7 +648,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* Split compressed extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -640,9 +657,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u," + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != compressed_only) { @@ -665,7 +683,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -674,9 +692,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -691,7 +710,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -701,9 +720,10 @@ static noinline int test_btrfs_get_extent(void) disk_bytenr, em->block_start); goto out; } - if (em->start != offset || em->len != 8192) { - test_msg("Unexpected extent wanted start %llu len 8192, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 2 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != compressed_only) { @@ -725,7 +745,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* A hole between regular extents but no hole extent */ - em = btrfs_get_extent(inode, NULL, 0, offset + 6, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset + 6, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -734,9 +754,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -765,9 +786,10 @@ static noinline int test_btrfs_get_extent(void) * length of the actual hole, if this changes we'll have to change this * test. */ - if (em->start != offset || em->len != 12288) { - test_msg("Unexpected extent wanted start %llu len 12288, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 3 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, 3 * sectorsize, em->start, em->len); goto out; } if (em->flags != vacancy_only) { @@ -783,7 +805,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -792,9 +814,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u," + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -815,7 +838,7 @@ out: return ret; } -static int test_hole_first(void) +static int test_hole_first(u32 sectorsize, u32 nodesize) { struct inode *inode = NULL; struct btrfs_root *root = NULL; @@ -832,7 +855,7 @@ static int test_hole_first(void) BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; - root = btrfs_alloc_dummy_root(); + root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(root)) { test_msg("Couldn't allocate root\n"); goto out; @@ -844,7 +867,7 @@ static int test_hole_first(void) goto out; } - root->node = alloc_dummy_extent_buffer(NULL, 4096); + root->node = alloc_dummy_extent_buffer(NULL, nodesize, nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); goto out; @@ -861,9 +884,9 @@ static int test_hole_first(void) * btrfs_get_extent. */ insert_inode_item_key(root); - insert_extent(root, 4096, 4096, 4096, 0, 4096, 4096, - BTRFS_FILE_EXTENT_REG, 0, 1); - em = btrfs_get_extent(inode, NULL, 0, 0, 8192, 0); + insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize, + sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1); + em = btrfs_get_extent(inode, NULL, 0, 0, 2 * sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -872,9 +895,10 @@ static int test_hole_first(void) test_msg("Expected a hole, got %llu\n", em->block_start); goto out; } - if (em->start != 0 || em->len != 4096) { - test_msg("Unexpected extent wanted start 0 len 4096, got start " - "%llu len %llu\n", em->start, em->len); + if (em->start != 0 || em->len != sectorsize) { + test_msg("Unexpected extent wanted start 0 len %u, " + "got start %llu len %llu\n", + sectorsize, em->start, em->len); goto out; } if (em->flags != vacancy_only) { @@ -884,18 +908,19 @@ static int test_hole_first(void) } free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, 4096, 8192, 0); + em = btrfs_get_extent(inode, NULL, 0, sectorsize, 2 * sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; } - if (em->block_start != 4096) { + if (em->block_start != sectorsize) { test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != 4096 || em->len != 4096) { - test_msg("Unexpected extent wanted start 4096 len 4096, got " - "start %llu len %llu\n", em->start, em->len); + if (em->start != sectorsize || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %u len %u, " + "got start %llu len %llu\n", + sectorsize, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -912,7 +937,7 @@ out: return ret; } -static int test_extent_accounting(void) +static int test_extent_accounting(u32 sectorsize, u32 nodesize) { struct inode *inode = NULL; struct btrfs_root *root = NULL; @@ -924,7 +949,7 @@ static int test_extent_accounting(void) return ret; } - root = btrfs_alloc_dummy_root(); + root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(root)) { test_msg("Couldn't allocate root\n"); goto out; @@ -954,10 +979,11 @@ static int test_extent_accounting(void) goto out; } - /* [BTRFS_MAX_EXTENT_SIZE][4k] */ + /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, - BTRFS_MAX_EXTENT_SIZE + 4095, NULL); + BTRFS_MAX_EXTENT_SIZE + sectorsize - 1, + NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -969,10 +995,10 @@ static int test_extent_accounting(void) goto out; } - /* [BTRFS_MAX_EXTENT_SIZE/2][4K HOLE][the rest] */ + /* [BTRFS_MAX_EXTENT_SIZE/2][sectorsize HOLE][the rest] */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, BTRFS_MAX_EXTENT_SIZE >> 1, - (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, + (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_KERNEL); @@ -987,10 +1013,11 @@ static int test_extent_accounting(void) goto out; } - /* [BTRFS_MAX_EXTENT_SIZE][4K] */ + /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, - (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, + (BTRFS_MAX_EXTENT_SIZE >> 1) + + sectorsize - 1, NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); @@ -1004,16 +1031,17 @@ static int test_extent_accounting(void) } /* - * [BTRFS_MAX_EXTENT_SIZE+4K][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4K] + * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize] * * I'm artificially adding 2 to outstanding_extents because in the * buffered IO case we'd add things up as we go, but I don't feel like * doing that here, this isn't the interesting case we want to test. */ BTRFS_I(inode)->outstanding_extents += 2; - ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 8192, - (BTRFS_MAX_EXTENT_SIZE << 1) + 12287, - NULL); + ret = btrfs_set_extent_delalloc(inode, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize, + (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1, + NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1025,10 +1053,13 @@ static int test_extent_accounting(void) goto out; } - /* [BTRFS_MAX_EXTENT_SIZE+4k][4k][BTRFS_MAX_EXTENT_SIZE+4k] */ + /* + * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize] + */ BTRFS_I(inode)->outstanding_extents++; - ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096, - BTRFS_MAX_EXTENT_SIZE+8191, NULL); + ret = btrfs_set_extent_delalloc(inode, + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1042,8 +1073,8 @@ static int test_extent_accounting(void) /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, - BTRFS_MAX_EXTENT_SIZE+4096, - BTRFS_MAX_EXTENT_SIZE+8191, + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, NULL, GFP_KERNEL); @@ -1063,8 +1094,9 @@ static int test_extent_accounting(void) * might fail and I'd rather satisfy my paranoia at this point. */ BTRFS_I(inode)->outstanding_extents++; - ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096, - BTRFS_MAX_EXTENT_SIZE+8191, NULL); + ret = btrfs_set_extent_delalloc(inode, + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1103,7 +1135,7 @@ out: return ret; } -int btrfs_test_inodes(void) +int btrfs_test_inodes(u32 sectorsize, u32 nodesize) { int ret; @@ -1112,13 +1144,13 @@ int btrfs_test_inodes(void) set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only); test_msg("Running btrfs_get_extent tests\n"); - ret = test_btrfs_get_extent(); + ret = test_btrfs_get_extent(sectorsize, nodesize); if (ret) return ret; test_msg("Running hole first btrfs_get_extent test\n"); - ret = test_hole_first(); + ret = test_hole_first(sectorsize, nodesize); if (ret) return ret; test_msg("Running outstanding_extents tests\n"); - return test_extent_accounting(); + return test_extent_accounting(sectorsize, nodesize); } diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 8aa4ded31326..57a12c0d680b 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. */ +#include <linux/types.h> #include "btrfs-tests.h" #include "../ctree.h" #include "../transaction.h" @@ -216,7 +217,8 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, return ret; } -static int test_no_shared_qgroup(struct btrfs_root *root) +static int test_no_shared_qgroup(struct btrfs_root *root, + u32 sectorsize, u32 nodesize) { struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; @@ -227,7 +229,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root) btrfs_init_dummy_trans(&trans); test_msg("Qgroup basic add\n"); - ret = btrfs_create_qgroup(NULL, fs_info, 5); + ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FS_TREE_OBJECTID); if (ret) { test_msg("Couldn't create a qgroup %d\n", ret); return ret; @@ -238,18 +240,19 @@ static int test_no_shared_qgroup(struct btrfs_root *root) * we can only call btrfs_qgroup_account_extent() directly to test * quota. */ - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); + ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, + BTRFS_FS_TREE_OBJECTID); if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -257,32 +260,33 @@ static int test_no_shared_qgroup(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, nodesize)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } old_roots = NULL; new_roots = NULL; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = remove_extent_item(root, 4096, 4096); + ret = remove_extent_item(root, nodesize, nodesize); if (ret) return -EINVAL; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -290,14 +294,14 @@ static int test_no_shared_qgroup(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return -EINVAL; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 0, 0)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, 0, 0)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } @@ -310,7 +314,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root) * right, also remove one of the roots and make sure the exclusive count is * adjusted properly. */ -static int test_multiple_refs(struct btrfs_root *root) +static int test_multiple_refs(struct btrfs_root *root, + u32 sectorsize, u32 nodesize) { struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; @@ -322,25 +327,29 @@ static int test_multiple_refs(struct btrfs_root *root) test_msg("Qgroup multiple refs test\n"); - /* We have 5 created already from the previous test */ - ret = btrfs_create_qgroup(NULL, fs_info, 256); + /* + * We have BTRFS_FS_TREE_OBJECTID created already from the + * previous test. + */ + ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FIRST_FREE_OBJECTID); if (ret) { test_msg("Couldn't create a qgroup %d\n", ret); return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); + ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, + BTRFS_FS_TREE_OBJECTID); if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -348,30 +357,32 @@ static int test_multiple_refs(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, nodesize)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = add_tree_ref(root, 4096, 4096, 0, 256); + ret = add_tree_ref(root, nodesize, nodesize, 0, + BTRFS_FIRST_FREE_OBJECTID); if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -379,35 +390,38 @@ static int test_multiple_refs(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 0)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, 0)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } - if (btrfs_verify_qgroup_counts(fs_info, 256, 4096, 0)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID, + nodesize, 0)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = remove_extent_ref(root, 4096, 4096, 0, 256); + ret = remove_extent_ref(root, nodesize, nodesize, 0, + BTRFS_FIRST_FREE_OBJECTID); if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -415,19 +429,21 @@ static int test_multiple_refs(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } - if (btrfs_verify_qgroup_counts(fs_info, 256, 0, 0)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID, + 0, 0)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, nodesize)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } @@ -435,13 +451,13 @@ static int test_multiple_refs(struct btrfs_root *root) return 0; } -int btrfs_test_qgroups(void) +int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) { struct btrfs_root *root; struct btrfs_root *tmp_root; int ret = 0; - root = btrfs_alloc_dummy_root(); + root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(root)) { test_msg("Couldn't allocate root\n"); return PTR_ERR(root); @@ -468,7 +484,8 @@ int btrfs_test_qgroups(void) * Can't use bytenr 0, some things freak out * *cough*backref walking code*cough* */ - root->node = alloc_test_extent_buffer(root->fs_info, 4096); + root->node = alloc_test_extent_buffer(root->fs_info, nodesize, + nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); ret = -ENOMEM; @@ -476,16 +493,16 @@ int btrfs_test_qgroups(void) } btrfs_set_header_level(root->node, 0); btrfs_set_header_nritems(root->node, 0); - root->alloc_bytenr += 8192; + root->alloc_bytenr += 2 * nodesize; - tmp_root = btrfs_alloc_dummy_root(); + tmp_root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(tmp_root)) { test_msg("Couldn't allocate a fs root\n"); ret = PTR_ERR(tmp_root); goto out; } - tmp_root->root_key.objectid = 5; + tmp_root->root_key.objectid = BTRFS_FS_TREE_OBJECTID; root->fs_info->fs_root = tmp_root; ret = btrfs_insert_fs_root(root->fs_info, tmp_root); if (ret) { @@ -493,14 +510,14 @@ int btrfs_test_qgroups(void) goto out; } - tmp_root = btrfs_alloc_dummy_root(); + tmp_root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(tmp_root)) { test_msg("Couldn't allocate a fs root\n"); ret = PTR_ERR(tmp_root); goto out; } - tmp_root->root_key.objectid = 256; + tmp_root->root_key.objectid = BTRFS_FIRST_FREE_OBJECTID; ret = btrfs_insert_fs_root(root->fs_info, tmp_root); if (ret) { test_msg("Couldn't insert fs root %d\n", ret); @@ -508,10 +525,10 @@ int btrfs_test_qgroups(void) } test_msg("Running qgroup tests\n"); - ret = test_no_shared_qgroup(root); + ret = test_no_shared_qgroup(root, sectorsize, nodesize); if (ret) goto out; - ret = test_multiple_refs(root); + ret = test_multiple_refs(root, sectorsize, nodesize); out: btrfs_free_dummy_root(root); return ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f6e24cb423ae..948aa186b353 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -818,6 +818,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *info = root->fs_info; + u64 transid = trans->transid; unsigned long cur = trans->delayed_ref_updates; int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0; @@ -905,7 +906,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_trans_handle_cachep, trans); if (must_run_delayed_refs) { - btrfs_async_run_delayed_refs(root, cur, + btrfs_async_run_delayed_refs(root, cur, transid, must_run_delayed_refs == 1); } return err; @@ -1311,11 +1312,6 @@ int btrfs_defrag_root(struct btrfs_root *root) return ret; } -/* Bisesctability fixup, remove in 4.8 */ -#ifndef btrfs_std_error -#define btrfs_std_error btrfs_handle_fs_error -#endif - /* * Do all special snapshot related qgroup dirty hack. * @@ -1385,7 +1381,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, switch_commit_roots(trans->transaction, fs_info); ret = btrfs_write_and_wait_transaction(trans, src); if (ret) - btrfs_std_error(fs_info, ret, + btrfs_handle_fs_error(fs_info, ret, "Error while writing out transaction for qgroup"); out: diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 9fe0ec2bf0fe..c5abee4f01ad 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -110,7 +110,6 @@ struct btrfs_trans_handle { u64 chunk_bytes_reserved; unsigned long use_count; unsigned long blocks_reserved; - unsigned long blocks_used; unsigned long delayed_ref_updates; struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; @@ -121,6 +120,7 @@ struct btrfs_trans_handle { bool can_flush_pending_bgs; bool reloc_reserved; bool sync; + bool dirty; unsigned int type; /* * this root is only needed to validate that the root passed to diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b7665af471d8..c05f69a8ec42 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2422,8 +2422,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, root_owner = btrfs_header_owner(parent); next = btrfs_find_create_tree_block(root, bytenr); - if (!next) - return -ENOMEM; + if (IS_ERR(next)) + return PTR_ERR(next); if (*level == 1) { ret = wc->process_func(root, next, wc, ptr_gen); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bdc62561ede8..0fb4a959012e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -462,7 +462,7 @@ loop_lock: sync_pending = 0; } - btrfsic_submit_bio(cur->bi_rw, cur); + btrfsic_submit_bio(cur); num_run++; batch_run++; @@ -2761,6 +2761,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 dev_extent_len = 0; u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; int i, ret = 0; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; /* Just in case */ root = root->fs_info->chunk_root; @@ -2787,12 +2788,19 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, check_system_chunk(trans, extent_root, map->type); unlock_chunks(root->fs_info->chunk_root); + /* + * Take the device list mutex to prevent races with the final phase of + * a device replace operation that replaces the device object associated + * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()). + */ + mutex_lock(&fs_devices->device_list_mutex); for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; ret = btrfs_free_dev_extent(trans, device, map->stripes[i].physical, &dev_extent_len); if (ret) { + mutex_unlock(&fs_devices->device_list_mutex); btrfs_abort_transaction(trans, root, ret); goto out; } @@ -2811,11 +2819,14 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, if (map->stripes[i].dev) { ret = btrfs_update_device(trans, map->stripes[i].dev); if (ret) { + mutex_unlock(&fs_devices->device_list_mutex); btrfs_abort_transaction(trans, root, ret); goto out; } } } + mutex_unlock(&fs_devices->device_list_mutex); + ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset); if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -4230,6 +4241,7 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) if (IS_ERR(uuid_root)) { ret = PTR_ERR(uuid_root); btrfs_abort_transaction(trans, tree_root, ret); + btrfs_end_transaction(trans, tree_root); return ret; } @@ -4682,12 +4694,12 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (type & BTRFS_BLOCK_GROUP_RAID5) { raid_stripe_len = find_raid56_stripe_len(ndevs - 1, - btrfs_super_stripesize(info->super_copy)); + extent_root->stripesize); data_stripes = num_stripes - 1; } if (type & BTRFS_BLOCK_GROUP_RAID6) { raid_stripe_len = find_raid56_stripe_len(ndevs - 2, - btrfs_super_stripesize(info->super_copy)); + extent_root->stripesize); data_stripes = num_stripes - 2; } @@ -5248,7 +5260,7 @@ void btrfs_put_bbio(struct btrfs_bio *bbio) kfree(bbio); } -static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, +static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num, int need_raid_map) @@ -5334,7 +5346,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, raid56_full_stripe_start *= full_stripe_len; } - if (rw & REQ_DISCARD) { + if (op == REQ_OP_DISCARD) { /* we don't discard raid56 yet */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = -EOPNOTSUPP; @@ -5347,7 +5359,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, For other RAID types and for RAID[56] reads, just allow a single stripe (on a single disk). */ if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && - (rw & REQ_WRITE)) { + (op == REQ_OP_WRITE)) { max_len = stripe_len * nr_data_stripes(map) - (offset - raid56_full_stripe_start); } else { @@ -5372,8 +5384,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, btrfs_dev_replace_set_lock_blocking(dev_replace); if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && - !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && - dev_replace->tgtdev != NULL) { + op != REQ_OP_WRITE && op != REQ_OP_DISCARD && + op != REQ_GET_READ_MIRRORS && dev_replace->tgtdev != NULL) { /* * in dev-replace case, for repair case (that's the only * case where the mirror is selected explicitly when @@ -5460,15 +5472,17 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, (offset + *length); if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - if (rw & REQ_DISCARD) + if (op == REQ_OP_DISCARD) num_stripes = min_t(u64, map->num_stripes, stripe_nr_end - stripe_nr_orig); stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); - if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))) + if (op != REQ_OP_WRITE && op != REQ_OP_DISCARD && + op != REQ_GET_READ_MIRRORS) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) + if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD || + op == REQ_GET_READ_MIRRORS) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -5481,7 +5495,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) { + if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD || + op == REQ_GET_READ_MIRRORS) { num_stripes = map->num_stripes; } else if (mirror_num) { stripe_index = mirror_num - 1; @@ -5495,9 +5510,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); stripe_index *= map->sub_stripes; - if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) + if (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS) num_stripes = map->sub_stripes; - else if (rw & REQ_DISCARD) + else if (op == REQ_OP_DISCARD) num_stripes = min_t(u64, map->sub_stripes * (stripe_nr_end - stripe_nr_orig), map->num_stripes); @@ -5515,7 +5530,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { if (need_raid_map && - ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || + (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ stripe_nr = div_u64(raid56_full_stripe_start, @@ -5543,8 +5558,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, /* We distribute the parity blocks across stripes */ div_u64_rem(stripe_nr + stripe_index, map->num_stripes, &stripe_index); - if (!(rw & (REQ_WRITE | REQ_DISCARD | - REQ_GET_READ_MIRRORS)) && mirror_num <= 1) + if ((op != REQ_OP_WRITE && op != REQ_OP_DISCARD && + op != REQ_GET_READ_MIRRORS) && mirror_num <= 1) mirror_num = 1; } } else { @@ -5567,9 +5582,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, num_alloc_stripes = num_stripes; if (dev_replace_is_ongoing) { - if (rw & (REQ_WRITE | REQ_DISCARD)) + if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD) num_alloc_stripes <<= 1; - if (rw & REQ_GET_READ_MIRRORS) + if (op == REQ_GET_READ_MIRRORS) num_alloc_stripes++; tgtdev_indexes = num_stripes; } @@ -5584,7 +5599,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, /* build raid_map */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && - need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || + need_raid_map && + ((op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS) || mirror_num > 1)) { u64 tmp; unsigned rot; @@ -5609,7 +5625,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, RAID6_Q_STRIPE; } - if (rw & REQ_DISCARD) { + if (op == REQ_OP_DISCARD) { u32 factor = 0; u32 sub_stripes = 0; u64 stripes_per_dev = 0; @@ -5689,14 +5705,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } - if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) + if (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS) max_errors = btrfs_chunk_max_errors(map); if (bbio->raid_map) sort_parity_stripes(bbio, num_stripes); tgtdev_indexes = 0; - if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && + if (dev_replace_is_ongoing && + (op == REQ_OP_WRITE || op == REQ_OP_DISCARD) && dev_replace->tgtdev != NULL) { int index_where_to_add; u64 srcdev_devid = dev_replace->srcdev->devid; @@ -5731,7 +5748,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } num_stripes = index_where_to_add; - } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) && + } else if (dev_replace_is_ongoing && (op == REQ_GET_READ_MIRRORS) && dev_replace->tgtdev != NULL) { u64 srcdev_devid = dev_replace->srcdev->devid; int index_srcdev = 0; @@ -5762,20 +5779,17 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } if (found) { - if (physical_of_found + map->stripe_len <= - dev_replace->cursor_left) { - struct btrfs_bio_stripe *tgtdev_stripe = - bbio->stripes + num_stripes; + struct btrfs_bio_stripe *tgtdev_stripe = + bbio->stripes + num_stripes; - tgtdev_stripe->physical = physical_of_found; - tgtdev_stripe->length = - bbio->stripes[index_srcdev].length; - tgtdev_stripe->dev = dev_replace->tgtdev; - bbio->tgtdev_map[index_srcdev] = num_stripes; + tgtdev_stripe->physical = physical_of_found; + tgtdev_stripe->length = + bbio->stripes[index_srcdev].length; + tgtdev_stripe->dev = dev_replace->tgtdev; + bbio->tgtdev_map[index_srcdev] = num_stripes; - tgtdev_indexes++; - num_stripes++; - } + tgtdev_indexes++; + num_stripes++; } } @@ -5806,21 +5820,21 @@ out: return ret; } -int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, +int btrfs_map_block(struct btrfs_fs_info *fs_info, int op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num) { - return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, + return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, mirror_num, 0); } /* For Scrub/replace */ -int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, +int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num, int need_raid_map) { - return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, + return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, mirror_num, need_raid_map); } @@ -5934,7 +5948,7 @@ static void btrfs_end_bio(struct bio *bio) BUG_ON(stripe_index >= bbio->num_stripes); dev = bbio->stripes[stripe_index].dev; if (dev->bdev) { - if (bio->bi_rw & WRITE) + if (bio_op(bio) == REQ_OP_WRITE) btrfs_dev_stat_inc(dev, BTRFS_DEV_STAT_WRITE_ERRS); else @@ -5988,7 +6002,7 @@ static void btrfs_end_bio(struct bio *bio) */ static noinline void btrfs_schedule_bio(struct btrfs_root *root, struct btrfs_device *device, - int rw, struct bio *bio) + struct bio *bio) { int should_queue = 1; struct btrfs_pending_bios *pending_bios; @@ -5999,9 +6013,9 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, } /* don't bother with additional async steps for reads, right now */ - if (!(rw & REQ_WRITE)) { + if (bio_op(bio) == REQ_OP_READ) { bio_get(bio); - btrfsic_submit_bio(rw, bio); + btrfsic_submit_bio(bio); bio_put(bio); return; } @@ -6015,7 +6029,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, atomic_inc(&root->fs_info->nr_async_bios); WARN_ON(bio->bi_next); bio->bi_next = NULL; - bio->bi_rw |= rw; spin_lock(&device->io_lock); if (bio->bi_rw & REQ_SYNC) @@ -6041,7 +6054,7 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, struct bio *bio, u64 physical, int dev_nr, - int rw, int async) + int async) { struct btrfs_device *dev = bbio->stripes[dev_nr].dev; @@ -6055,8 +6068,8 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, rcu_read_lock(); name = rcu_dereference(dev->name); - pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " - "(%s id %llu), size=%u\n", rw, + pr_debug("btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu " + "(%s id %llu), size=%u\n", bio_op(bio), bio->bi_rw, (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev, name->str, dev->devid, bio->bi_iter.bi_size); rcu_read_unlock(); @@ -6067,9 +6080,9 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, btrfs_bio_counter_inc_noblocked(root->fs_info); if (async) - btrfs_schedule_bio(root, dev, rw, bio); + btrfs_schedule_bio(root, dev, bio); else - btrfsic_submit_bio(rw, bio); + btrfsic_submit_bio(bio); } static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) @@ -6086,7 +6099,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) } } -int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, +int btrfs_map_bio(struct btrfs_root *root, struct bio *bio, int mirror_num, int async_submit) { struct btrfs_device *dev; @@ -6103,8 +6116,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, map_length = length; btrfs_bio_counter_inc_blocked(root->fs_info); - ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, - mirror_num, 1); + ret = __btrfs_map_block(root->fs_info, bio_op(bio), logical, + &map_length, &bbio, mirror_num, 1); if (ret) { btrfs_bio_counter_dec(root->fs_info); return ret; @@ -6118,10 +6131,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, atomic_set(&bbio->stripes_pending, bbio->num_stripes); if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && - ((rw & WRITE) || (mirror_num > 1))) { + ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ - if (rw & WRITE) { + if (bio_op(bio) == REQ_OP_WRITE) { ret = raid56_parity_write(root, bio, bbio, map_length); } else { ret = raid56_parity_recover(root, bio, bbio, map_length, @@ -6140,7 +6153,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { dev = bbio->stripes[dev_nr].dev; - if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { + if (!dev || !dev->bdev || + (bio_op(bio) == REQ_OP_WRITE && !dev->writeable)) { bbio_error(bbio, first_bio, logical); continue; } @@ -6152,7 +6166,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, bio = first_bio; submit_stripe_bio(root, bbio, bio, - bbio->stripes[dev_nr].physical, dev_nr, rw, + bbio->stripes[dev_nr].physical, dev_nr, async_submit); } btrfs_bio_counter_dec(root->fs_info); @@ -6250,27 +6264,23 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, return dev; } -static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, - struct extent_buffer *leaf, - struct btrfs_chunk *chunk) +/* Return -EIO if any error, otherwise return 0. */ +static int btrfs_check_chunk_valid(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk, u64 logical) { - struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; - struct map_lookup *map; - struct extent_map *em; - u64 logical; u64 length; u64 stripe_len; - u64 devid; - u8 uuid[BTRFS_UUID_SIZE]; - int num_stripes; - int ret; - int i; + u16 num_stripes; + u16 sub_stripes; + u64 type; - logical = key->offset; length = btrfs_chunk_length(leaf, chunk); stripe_len = btrfs_chunk_stripe_len(leaf, chunk); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - /* Validation check */ + sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + type = btrfs_chunk_type(leaf, chunk); + if (!num_stripes) { btrfs_err(root->fs_info, "invalid chunk num_stripes: %u", num_stripes); @@ -6281,6 +6291,11 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, "invalid chunk logical %llu", logical); return -EIO; } + if (btrfs_chunk_sector_size(leaf, chunk) != root->sectorsize) { + btrfs_err(root->fs_info, "invalid chunk sectorsize %u", + btrfs_chunk_sector_size(leaf, chunk)); + return -EIO; + } if (!length || !IS_ALIGNED(length, root->sectorsize)) { btrfs_err(root->fs_info, "invalid chunk length %llu", length); @@ -6292,13 +6307,54 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, return -EIO; } if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & - btrfs_chunk_type(leaf, chunk)) { + type) { btrfs_err(root->fs_info, "unrecognized chunk type: %llu", ~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & btrfs_chunk_type(leaf, chunk)); return -EIO; } + if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || + (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) || + (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || + (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || + (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) || + ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && + num_stripes != 1)) { + btrfs_err(root->fs_info, + "invalid num_stripes:sub_stripes %u:%u for profile %llu", + num_stripes, sub_stripes, + type & BTRFS_BLOCK_GROUP_PROFILE_MASK); + return -EIO; + } + + return 0; +} + +static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + struct map_lookup *map; + struct extent_map *em; + u64 logical; + u64 length; + u64 stripe_len; + u64 devid; + u8 uuid[BTRFS_UUID_SIZE]; + int num_stripes; + int ret; + int i; + + logical = key->offset; + length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + + ret = btrfs_check_chunk_valid(root, leaf, chunk, logical); + if (ret) + return ret; read_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); @@ -6546,6 +6602,7 @@ int btrfs_read_sys_array(struct btrfs_root *root) u32 array_size; u32 len = 0; u32 cur_offset; + u64 type; struct btrfs_key key; ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize); @@ -6555,8 +6612,8 @@ int btrfs_read_sys_array(struct btrfs_root *root) * overallocate but we can keep it as-is, only the first page is used. */ sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET); - if (!sb) - return -ENOMEM; + if (IS_ERR(sb)) + return PTR_ERR(sb); set_extent_buffer_uptodate(sb); btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); /* @@ -6612,6 +6669,15 @@ int btrfs_read_sys_array(struct btrfs_root *root) break; } + type = btrfs_chunk_type(sb, chunk); + if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { + btrfs_err(root->fs_info, + "invalid chunk type %llu in sys_array at offset %u", + type, cur_offset); + ret = -EIO; + break; + } + len = btrfs_chunk_item_size(num_stripes); if (cur_offset + len > array_size) goto out_short_read; @@ -6630,12 +6696,14 @@ int btrfs_read_sys_array(struct btrfs_root *root) sb_array_offset += len; cur_offset += len; } + clear_extent_buffer_uptodate(sb); free_extent_buffer_stale(sb); return ret; out_short_read: printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n", len, cur_offset); + clear_extent_buffer_uptodate(sb); free_extent_buffer_stale(sb); return -EIO; } @@ -6648,6 +6716,7 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) struct btrfs_key found_key; int ret; int slot; + u64 total_dev = 0; root = root->fs_info->chunk_root; @@ -6689,6 +6758,7 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) ret = read_one_dev(root, leaf, dev_item); if (ret) goto error; + total_dev++; } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { struct btrfs_chunk *chunk; chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); @@ -6698,6 +6768,28 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) } path->slots[0]++; } + + /* + * After loading chunk tree, we've got all device information, + * do another round of validation checks. + */ + if (total_dev != root->fs_info->fs_devices->total_devices) { + btrfs_err(root->fs_info, + "super_num_devices %llu mismatch with num_devices %llu found here", + btrfs_super_num_devices(root->fs_info->super_copy), + total_dev); + ret = -EINVAL; + goto error; + } + if (btrfs_super_total_bytes(root->fs_info->super_copy) < + root->fs_info->fs_devices->total_rw_bytes) { + btrfs_err(root->fs_info, + "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", + btrfs_super_total_bytes(root->fs_info->super_copy), + root->fs_info->fs_devices->total_rw_bytes); + ret = -EINVAL; + goto error; + } ret = 0; error: unlock_chunks(root); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 0ac90f8d85bd..6613e6335ca2 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -375,10 +375,10 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length); void btrfs_get_bbio(struct btrfs_bio *bbio); void btrfs_put_bbio(struct btrfs_bio *bbio); -int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, +int btrfs_map_block(struct btrfs_fs_info *fs_info, int op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num); -int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, +int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num, int need_raid_map); @@ -391,7 +391,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 type); void btrfs_mapping_init(struct btrfs_mapping_tree *tree); void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); -int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, +int btrfs_map_bio(struct btrfs_root *root, struct bio *bio, int mirror_num, int async_submit); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); diff --git a/fs/buffer.c b/fs/buffer.c index 228288a7de38..9c8eb9b6db6a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -46,7 +46,7 @@ #include <trace/events/block.h> static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); -static int submit_bh_wbc(int rw, struct buffer_head *bh, +static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, unsigned long bio_flags, struct writeback_control *wbc); @@ -154,7 +154,7 @@ static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) if (uptodate) { set_buffer_uptodate(bh); } else { - /* This happens, due to failed READA attempts. */ + /* This happens, due to failed read-ahead attempts. */ clear_buffer_uptodate(bh); } unlock_buffer(bh); @@ -589,7 +589,7 @@ void write_boundary_block(struct block_device *bdev, struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); if (bh) { if (buffer_dirty(bh)) - ll_rw_block(WRITE, 1, &bh); + ll_rw_block(REQ_OP_WRITE, 0, 1, &bh); put_bh(bh); } } @@ -1226,7 +1226,7 @@ static struct buffer_head *__bread_slow(struct buffer_head *bh) } else { get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(READ, bh); + submit_bh(REQ_OP_READ, 0, bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; @@ -1396,7 +1396,7 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *bh = __getblk(bdev, block, size); if (likely(bh)) { - ll_rw_block(READA, 1, &bh); + ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh); brelse(bh); } } @@ -1688,7 +1688,7 @@ static struct buffer_head *create_page_buffers(struct page *page, struct inode * * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this * causes the writes to be flagged as synchronous writes. */ -static int __block_write_full_page(struct inode *inode, struct page *page, +int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block, struct writeback_control *wbc, bh_end_io_t *handler) { @@ -1698,7 +1698,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, struct buffer_head *bh, *head; unsigned int blocksize, bbits; int nr_underway = 0; - int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0); head = create_page_buffers(page, inode, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -1787,7 +1787,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh_wbc(write_op, bh, 0, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc); nr_underway++; } bh = next; @@ -1841,7 +1841,7 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh_wbc(write_op, bh, 0, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc); nr_underway++; } bh = next; @@ -1849,6 +1849,7 @@ recover: unlock_page(page); goto done; } +EXPORT_SYMBOL(__block_write_full_page); /* * If a page has any new buffers, zero them out here, and mark them uptodate @@ -2015,7 +2016,7 @@ int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { - ll_rw_block(READ, 1, &bh); + ll_rw_block(REQ_OP_READ, 0, 1, &bh); *wait_bh++=bh; } } @@ -2314,7 +2315,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block) if (buffer_uptodate(bh)) end_buffer_async_read(bh, 1); else - submit_bh(READ, bh); + submit_bh(REQ_OP_READ, 0, bh); } return 0; } @@ -2648,7 +2649,7 @@ int nobh_write_begin(struct address_space *mapping, if (block_start < from || block_end > to) { lock_buffer(bh); bh->b_end_io = end_buffer_read_nobh; - submit_bh(READ, bh); + submit_bh(REQ_OP_READ, 0, bh); nr_reads++; } } @@ -2918,7 +2919,7 @@ int block_truncate_page(struct address_space *mapping, if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { err = -EIO; - ll_rw_block(READ, 1, &bh); + ll_rw_block(REQ_OP_READ, 0, 1, &bh); wait_on_buffer(bh); /* Uhhuh. Read error. Complain and punt. */ if (!buffer_uptodate(bh)) @@ -3015,7 +3016,7 @@ static void end_bio_bh_io_sync(struct bio *bio) * errors, this only handles the "we need to be able to * do IO at the final sector" case. */ -void guard_bio_eod(int rw, struct bio *bio) +void guard_bio_eod(int op, struct bio *bio) { sector_t maxsector; struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; @@ -3045,13 +3046,13 @@ void guard_bio_eod(int rw, struct bio *bio) bvec->bv_len -= truncated_bytes; /* ..and clear the end of the buffer for reads */ - if ((rw & RW_MASK) == READ) { + if (op == REQ_OP_READ) { zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len, truncated_bytes); } } -static int submit_bh_wbc(int rw, struct buffer_head *bh, +static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, unsigned long bio_flags, struct writeback_control *wbc) { struct bio *bio; @@ -3065,7 +3066,7 @@ static int submit_bh_wbc(int rw, struct buffer_head *bh, /* * Only clear out a write error when rewriting */ - if (test_set_buffer_req(bh) && (rw & WRITE)) + if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE)) clear_buffer_write_io_error(bh); /* @@ -3090,39 +3091,42 @@ static int submit_bh_wbc(int rw, struct buffer_head *bh, bio->bi_flags |= bio_flags; /* Take care of bh's that straddle the end of the device */ - guard_bio_eod(rw, bio); + guard_bio_eod(op, bio); if (buffer_meta(bh)) - rw |= REQ_META; + op_flags |= REQ_META; if (buffer_prio(bh)) - rw |= REQ_PRIO; + op_flags |= REQ_PRIO; + bio_set_op_attrs(bio, op, op_flags); - submit_bio(rw, bio); + submit_bio(bio); return 0; } -int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) +int _submit_bh(int op, int op_flags, struct buffer_head *bh, + unsigned long bio_flags) { - return submit_bh_wbc(rw, bh, bio_flags, NULL); + return submit_bh_wbc(op, op_flags, bh, bio_flags, NULL); } EXPORT_SYMBOL_GPL(_submit_bh); -int submit_bh(int rw, struct buffer_head *bh) +int submit_bh(int op, int op_flags, struct buffer_head *bh) { - return submit_bh_wbc(rw, bh, 0, NULL); + return submit_bh_wbc(op, op_flags, bh, 0, NULL); } EXPORT_SYMBOL(submit_bh); /** * ll_rw_block: low-level access to block devices (DEPRECATED) - * @rw: whether to %READ or %WRITE or maybe %READA (readahead) + * @op: whether to %READ or %WRITE + * @op_flags: rq_flag_bits * @nr: number of &struct buffer_heads in the array * @bhs: array of pointers to &struct buffer_head * * ll_rw_block() takes an array of pointers to &struct buffer_heads, and - * requests an I/O operation on them, either a %READ or a %WRITE. The third - * %READA option is described in the documentation for generic_make_request() - * which ll_rw_block() calls. + * requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE. + * @op_flags contains flags modifying the detailed I/O behavior, most notably + * %REQ_RAHEAD. * * This function drops any buffer that it cannot get a lock on (with the * BH_Lock state bit), any buffer that appears to be clean when doing a write @@ -3138,7 +3142,7 @@ EXPORT_SYMBOL(submit_bh); * All of the buffers must be for the same device, and must also be a * multiple of the current approved size for the device. */ -void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) +void ll_rw_block(int op, int op_flags, int nr, struct buffer_head *bhs[]) { int i; @@ -3147,18 +3151,18 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) if (!trylock_buffer(bh)) continue; - if (rw == WRITE) { + if (op == WRITE) { if (test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(WRITE, bh); + submit_bh(op, op_flags, bh); continue; } } else { if (!buffer_uptodate(bh)) { bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(rw, bh); + submit_bh(op, op_flags, bh); continue; } } @@ -3167,7 +3171,7 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) } EXPORT_SYMBOL(ll_rw_block); -void write_dirty_buffer(struct buffer_head *bh, int rw) +void write_dirty_buffer(struct buffer_head *bh, int op_flags) { lock_buffer(bh); if (!test_clear_buffer_dirty(bh)) { @@ -3176,7 +3180,7 @@ void write_dirty_buffer(struct buffer_head *bh, int rw) } bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(rw, bh); + submit_bh(REQ_OP_WRITE, op_flags, bh); } EXPORT_SYMBOL(write_dirty_buffer); @@ -3185,7 +3189,7 @@ EXPORT_SYMBOL(write_dirty_buffer); * and then start new I/O and then wait upon it. The caller must have a ref on * the buffer_head. */ -int __sync_dirty_buffer(struct buffer_head *bh, int rw) +int __sync_dirty_buffer(struct buffer_head *bh, int op_flags) { int ret = 0; @@ -3194,7 +3198,7 @@ int __sync_dirty_buffer(struct buffer_head *bh, int rw) if (test_clear_buffer_dirty(bh)) { get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(rw, bh); + ret = submit_bh(REQ_OP_WRITE, op_flags, bh); wait_on_buffer(bh); if (!ret && !buffer_uptodate(bh)) ret = -EIO; @@ -3457,7 +3461,7 @@ int bh_submit_read(struct buffer_head *bh) get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(READ, bh); + submit_bh(REQ_OP_READ, 0, bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return 0; diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 861d611b8c05..ce5f345d70f5 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -380,7 +380,7 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache) * check if the backing cache is updated to FS-Cache * - called by FS-Cache when evaluates if need to invalidate the cache */ -static bool cachefiles_check_consistency(struct fscache_operation *op) +static int cachefiles_check_consistency(struct fscache_operation *op) { struct cachefiles_object *object; struct cachefiles_cache *cache; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index eeb71e5de27a..26a9d10d75e9 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -276,8 +276,10 @@ static void finish_read(struct ceph_osd_request *req) for (i = 0; i < num_pages; i++) { struct page *page = osd_data->pages[i]; - if (rc < 0 && rc != -ENOENT) + if (rc < 0 && rc != -ENOENT) { + ceph_fscache_readpage_cancel(inode, page); goto unlock; + } if (bytes < (int)PAGE_SIZE) { /* zero (remainder of) page */ int s = bytes < 0 ? 0 : bytes; @@ -535,8 +537,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); - ceph_readpage_to_fscache(inode, page); - set_page_writeback(page); err = ceph_osdc_writepages(osdc, ceph_vino(inode), &ci->i_layout, snapc, diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index c052b5bf219b..238c55b01723 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -25,6 +25,7 @@ #include "cache.h" struct ceph_aux_inode { + u64 version; struct timespec mtime; loff_t size; }; @@ -69,15 +70,8 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc) fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index, &ceph_fscache_fsid_object_def, fsc, true); - - if (fsc->fscache == NULL) { + if (!fsc->fscache) pr_err("Unable to resgister fsid: %p fscache cookie", fsc); - return 0; - } - - fsc->revalidate_wq = alloc_workqueue("ceph-revalidate", 0, 1); - if (fsc->revalidate_wq == NULL) - return -ENOMEM; return 0; } @@ -105,6 +99,7 @@ static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data, const struct inode* inode = &ci->vfs_inode; memset(&aux, 0, sizeof(aux)); + aux.version = ci->i_version; aux.mtime = inode->i_mtime; aux.size = i_size_read(inode); @@ -131,6 +126,7 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux( return FSCACHE_CHECKAUX_OBSOLETE; memset(&aux, 0, sizeof(aux)); + aux.version = ci->i_version; aux.mtime = inode->i_mtime; aux.size = i_size_read(inode); @@ -181,32 +177,26 @@ static const struct fscache_cookie_def ceph_fscache_inode_object_def = { .now_uncached = ceph_fscache_inode_now_uncached, }; -void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, - struct ceph_inode_info* ci) +void ceph_fscache_register_inode_cookie(struct inode *inode) { - struct inode* inode = &ci->vfs_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); /* No caching for filesystem */ if (fsc->fscache == NULL) return; /* Only cache for regular files that are read only */ - if ((ci->vfs_inode.i_mode & S_IFREG) == 0) + if (!S_ISREG(inode->i_mode)) return; - /* Avoid multiple racing open requests */ - inode_lock(inode); - - if (ci->fscache) - goto done; - - ci->fscache = fscache_acquire_cookie(fsc->fscache, - &ceph_fscache_inode_object_def, - ci, true); - fscache_check_consistency(ci->fscache); -done: + inode_lock_nested(inode, I_MUTEX_CHILD); + if (!ci->fscache) { + ci->fscache = fscache_acquire_cookie(fsc->fscache, + &ceph_fscache_inode_object_def, + ci, false); + } inode_unlock(inode); - } void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) @@ -222,6 +212,34 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) fscache_relinquish_cookie(cookie, 0); } +static bool ceph_fscache_can_enable(void *data) +{ + struct inode *inode = data; + return !inode_is_open_for_write(inode); +} + +void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + if (!fscache_cookie_valid(ci->fscache)) + return; + + if (inode_is_open_for_write(inode)) { + dout("fscache_file_set_cookie %p %p disabling cache\n", + inode, filp); + fscache_disable_cookie(ci->fscache, false); + fscache_uncache_all_inode_pages(ci->fscache, inode); + } else { + fscache_enable_cookie(ci->fscache, ceph_fscache_can_enable, + inode); + if (fscache_cookie_enabled(ci->fscache)) { + dout("fscache_file_set_cookie %p %p enabing cache\n", + inode, filp); + } + } +} + static void ceph_vfs_readpage_complete(struct page *page, void *data, int error) { if (!error) @@ -238,8 +256,7 @@ static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int static inline bool cache_valid(struct ceph_inode_info *ci) { - return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) && - (ci->i_fscache_gen == ci->i_rdcache_gen)); + return ci->i_fscache_gen == ci->i_rdcache_gen; } @@ -332,69 +349,27 @@ void ceph_invalidate_fscache_page(struct inode* inode, struct page *page) void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) { - if (fsc->revalidate_wq) - destroy_workqueue(fsc->revalidate_wq); - fscache_relinquish_cookie(fsc->fscache, 0); fsc->fscache = NULL; } -static void ceph_revalidate_work(struct work_struct *work) -{ - int issued; - u32 orig_gen; - struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, - i_revalidate_work); - struct inode *inode = &ci->vfs_inode; - - spin_lock(&ci->i_ceph_lock); - issued = __ceph_caps_issued(ci, NULL); - orig_gen = ci->i_rdcache_gen; - spin_unlock(&ci->i_ceph_lock); - - if (!(issued & CEPH_CAP_FILE_CACHE)) { - dout("revalidate_work lost cache before validation %p\n", - inode); - goto out; - } - - if (!fscache_check_consistency(ci->fscache)) - fscache_invalidate(ci->fscache); - - spin_lock(&ci->i_ceph_lock); - /* Update the new valid generation (backwards sanity check too) */ - if (orig_gen > ci->i_fscache_gen) { - ci->i_fscache_gen = orig_gen; - } - spin_unlock(&ci->i_ceph_lock); - -out: - iput(&ci->vfs_inode); -} - -void ceph_queue_revalidate(struct inode *inode) +/* + * caller should hold CEPH_CAP_FILE_{RD,CACHE} + */ +void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci) { - struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); - struct ceph_inode_info *ci = ceph_inode(inode); - - if (fsc->revalidate_wq == NULL || ci->fscache == NULL) + if (cache_valid(ci)) return; - ihold(inode); - - if (queue_work(ceph_sb_to_client(inode->i_sb)->revalidate_wq, - &ci->i_revalidate_work)) { - dout("ceph_queue_revalidate %p\n", inode); - } else { - dout("ceph_queue_revalidate %p failed\n)", inode); - iput(inode); + /* resue i_truncate_mutex. There should be no pending + * truncate while the caller holds CEPH_CAP_FILE_RD */ + mutex_lock(&ci->i_truncate_mutex); + if (!cache_valid(ci)) { + if (fscache_check_consistency(ci->fscache)) + fscache_invalidate(ci->fscache); + spin_lock(&ci->i_ceph_lock); + ci->i_fscache_gen = ci->i_rdcache_gen; + spin_unlock(&ci->i_ceph_lock); } -} - -void ceph_fscache_inode_init(struct ceph_inode_info *ci) -{ - ci->fscache = NULL; - /* The first load is verifed cookie open time */ - ci->i_fscache_gen = 1; - INIT_WORK(&ci->i_revalidate_work, ceph_revalidate_work); + mutex_unlock(&ci->i_truncate_mutex); } diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index 5ac591bd012b..7e72c7594f0c 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -34,10 +34,10 @@ void ceph_fscache_unregister(void); int ceph_fscache_register_fs(struct ceph_fs_client* fsc); void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc); -void ceph_fscache_inode_init(struct ceph_inode_info *ci); -void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, - struct ceph_inode_info* ci); +void ceph_fscache_register_inode_cookie(struct inode *inode); void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci); +void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp); +void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci); int ceph_readpage_from_fscache(struct inode *inode, struct page *page); int ceph_readpages_from_fscache(struct inode *inode, @@ -46,12 +46,11 @@ int ceph_readpages_from_fscache(struct inode *inode, unsigned *nr_pages); void ceph_readpage_to_fscache(struct inode *inode, struct page *page); void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); -void ceph_queue_revalidate(struct inode *inode); -static inline void ceph_fscache_update_objectsize(struct inode *inode) +static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) { - struct ceph_inode_info *ci = ceph_inode(inode); - fscache_attr_changed(ci->fscache); + ci->fscache = NULL; + ci->i_fscache_gen = 0; } static inline void ceph_fscache_invalidate(struct inode *inode) @@ -88,6 +87,11 @@ static inline void ceph_fscache_readpages_cancel(struct inode *inode, return fscache_readpages_cancel(ci->fscache, pages); } +static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci) +{ + ci->i_fscache_gen = ci->i_rdcache_gen - 1; +} + #else static inline int ceph_fscache_register(void) @@ -112,8 +116,20 @@ static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) { } -static inline void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc, - struct ceph_inode_info* ci) +static inline void ceph_fscache_register_inode_cookie(struct inode *inode) +{ +} + +static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +{ +} + +static inline void ceph_fscache_file_set_cookie(struct inode *inode, + struct file *filp) +{ +} + +static inline void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci) { } @@ -141,10 +157,6 @@ static inline void ceph_readpage_to_fscache(struct inode *inode, { } -static inline void ceph_fscache_update_objectsize(struct inode *inode) -{ -} - static inline void ceph_fscache_invalidate(struct inode *inode) { } @@ -154,10 +166,6 @@ static inline void ceph_invalidate_fscache_page(struct inode *inode, { } -static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) -{ -} - static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) { return 1; @@ -173,7 +181,7 @@ static inline void ceph_fscache_readpages_cancel(struct inode *inode, { } -static inline void ceph_queue_revalidate(struct inode *inode) +static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci) { } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index c17b5d76d75e..6f60d0a3d0f9 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2393,6 +2393,9 @@ again: snap_rwsem_locked = true; } *got = need | (have & want); + if ((need & CEPH_CAP_FILE_RD) && + !(*got & CEPH_CAP_FILE_CACHE)) + ceph_disable_fscache_readpage(ci); __take_cap_refs(ci, *got, true); ret = 1; } @@ -2554,6 +2557,9 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, break; } + if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE)) + ceph_fscache_revalidate_cookie(ci); + *got = _got; return 0; } @@ -2795,7 +2801,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, bool writeback = false; bool queue_trunc = false; bool queue_invalidate = false; - bool queue_revalidate = false; bool deleted_inode = false; bool fill_inline = false; @@ -2837,8 +2842,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, ci->i_rdcache_revoking = ci->i_rdcache_gen; } } - - ceph_fscache_invalidate(inode); } /* side effects now are allowed */ @@ -2880,11 +2883,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, } } - /* Do we need to revalidate our fscache cookie. Don't bother on the - * first cache cap as we already validate at cookie creation time. */ - if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1) - queue_revalidate = true; - if (newcaps & CEPH_CAP_ANY_RD) { /* ctime/mtime/atime? */ ceph_decode_timespec(&mtime, &grant->mtime); @@ -2993,11 +2991,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, if (fill_inline) ceph_fill_inline_data(inode, NULL, inline_data, inline_len); - if (queue_trunc) { + if (queue_trunc) ceph_queue_vmtruncate(inode); - ceph_queue_revalidate(inode); - } else if (queue_revalidate) - ceph_queue_revalidate(inode); if (writeback) /* @@ -3199,10 +3194,8 @@ static void handle_cap_trunc(struct inode *inode, truncate_seq, truncate_size, size); spin_unlock(&ci->i_ceph_lock); - if (queue_trunc) { + if (queue_trunc) ceph_queue_vmtruncate(inode); - ceph_fscache_invalidate(inode); - } } /* diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 6e72c98162d5..1780218a48f0 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -95,10 +95,8 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) } dentry = d_obtain_alias(inode); - if (IS_ERR(dentry)) { - iput(inode); + if (IS_ERR(dentry)) return dentry; - } err = ceph_init_dentry(dentry); if (err < 0) { dput(dentry); @@ -167,10 +165,8 @@ static struct dentry *__get_parent(struct super_block *sb, return ERR_PTR(-ENOENT); dentry = d_obtain_alias(inode); - if (IS_ERR(dentry)) { - iput(inode); + if (IS_ERR(dentry)) return dentry; - } err = ceph_init_dentry(dentry); if (err < 0) { dput(dentry); @@ -210,7 +206,7 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb, dout("fh_to_parent %llx\n", cfh->parent_ino); dentry = __get_parent(sb, NULL, cfh->ino); - if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENOENT) + if (unlikely(dentry == ERR_PTR(-ENOENT))) dentry = __fh_to_dentry(sb, cfh->parent_ino); return dentry; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index a888df6f2d71..0daaf7ceedc5 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -137,23 +137,11 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) { struct ceph_file_info *cf; int ret = 0; - struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; switch (inode->i_mode & S_IFMT) { case S_IFREG: - /* First file open request creates the cookie, we want to keep - * this cookie around for the filetime of the inode as not to - * have to worry about fscache register / revoke / operation - * races. - * - * Also, if we know the operation is going to invalidate data - * (non readonly) just nuke the cache right away. - */ - ceph_fscache_register_inode_cookie(mdsc->fsc, ci); - if ((fmode & CEPH_FILE_MODE_WR)) - ceph_fscache_invalidate(inode); + ceph_fscache_register_inode_cookie(inode); + ceph_fscache_file_set_cookie(inode, file); case S_IFDIR: dout("init_file %p %p 0%o (regular)\n", inode, file, inode->i_mode); @@ -406,7 +394,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); - if (d_unhashed(dentry)) { + if (d_in_lookup(dentry)) { dn = ceph_finish_lookup(req, dentry, err); if (IS_ERR(dn)) err = PTR_ERR(dn); @@ -1349,7 +1337,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) } retry_snap: - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) { err = -ENOSPC; goto out; } @@ -1407,7 +1395,6 @@ retry_snap: iov_iter_advance(from, written); ceph_put_snap_context(snapc); } else { - loff_t old_size = i_size_read(inode); /* * No need to acquire the i_truncate_mutex. Because * the MDS revokes Fwb caps before sending truncate @@ -1418,8 +1405,6 @@ retry_snap: written = generic_perform_write(file, from, pos); if (likely(written >= 0)) iocb->ki_pos = pos + written; - if (i_size_read(inode) > old_size) - ceph_fscache_update_objectsize(inode); inode_unlock(inode); } @@ -1440,7 +1425,7 @@ retry_snap: ceph_put_cap_refs(ci, got); if (written >= 0) { - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL)) + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_NEARFULL)) iocb->ki_flags |= IOCB_DSYNC; written = generic_write_sync(iocb, written); @@ -1672,8 +1657,8 @@ static long ceph_fallocate(struct file *file, int mode, goto unlock; } - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) && - !(mode & FALLOC_FL_PUNCH_HOLE)) { + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) && + !(mode & FALLOC_FL_PUNCH_HOLE)) { ret = -ENOSPC; goto unlock; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 0130a8592191..0168b49fb6ad 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -103,7 +103,6 @@ struct ceph_fs_client { #ifdef CONFIG_CEPH_FSCACHE struct fscache_cookie *fscache; - struct workqueue_struct *revalidate_wq; #endif }; @@ -360,8 +359,7 @@ struct ceph_inode_info { #ifdef CONFIG_CEPH_FSCACHE struct fscache_cookie *fscache; - u32 i_fscache_gen; /* sequence, for delayed fscache validate */ - struct work_struct i_revalidate_work; + u32 i_fscache_gen; #endif struct inode vfs_inode; /* at end */ }; diff --git a/fs/char_dev.c b/fs/char_dev.c index 687471dc04a0..6edd825231c5 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -92,7 +92,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, } if (i < CHRDEV_MAJOR_DYN_END) - pr_warn("CHRDEV \"%s\" major number %d goes below the dynamic allocation range", + pr_warn("CHRDEV \"%s\" major number %d goes below the dynamic allocation range\n", name, i); if (i == 0) { diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 5a53ac6b1e02..02b071bf3732 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -101,6 +101,12 @@ convert_sfm_char(const __u16 src_char, char *target) case SFM_SLASH: *target = '\\'; break; + case SFM_SPACE: + *target = ' '; + break; + case SFM_PERIOD: + *target = '.'; + break; default: return false; } @@ -404,7 +410,7 @@ static __le16 convert_to_sfu_char(char src_char) return dest_char; } -static __le16 convert_to_sfm_char(char src_char) +static __le16 convert_to_sfm_char(char src_char, bool end_of_string) { __le16 dest_char; @@ -427,6 +433,18 @@ static __le16 convert_to_sfm_char(char src_char) case '|': dest_char = cpu_to_le16(SFM_PIPE); break; + case '.': + if (end_of_string) + dest_char = cpu_to_le16(SFM_PERIOD); + else + dest_char = 0; + break; + case ' ': + if (end_of_string) + dest_char = cpu_to_le16(SFM_SPACE); + else + dest_char = 0; + break; default: dest_char = 0; } @@ -469,9 +487,16 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, /* see if we must remap this char */ if (map_chars == SFU_MAP_UNI_RSVD) dst_char = convert_to_sfu_char(src_char); - else if (map_chars == SFM_MAP_UNI_RSVD) - dst_char = convert_to_sfm_char(src_char); - else + else if (map_chars == SFM_MAP_UNI_RSVD) { + bool end_of_string; + + if (i == srclen - 1) + end_of_string = true; + else + end_of_string = false; + + dst_char = convert_to_sfm_char(src_char, end_of_string); + } else dst_char = 0; /* * FIXME: We can not handle remapping backslash (UNI_SLASH) diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h index bdc52cb9a676..479bc0a941f3 100644 --- a/fs/cifs/cifs_unicode.h +++ b/fs/cifs/cifs_unicode.h @@ -64,6 +64,8 @@ #define SFM_LESSTHAN ((__u16) 0xF023) #define SFM_PIPE ((__u16) 0xF027) #define SFM_SLASH ((__u16) 0xF026) +#define SFM_PERIOD ((__u16) 0xF028) +#define SFM_SPACE ((__u16) 0xF029) /* * Mapping mechanism to use when one of the seven reserved characters is diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 5d8b7edf8a8f..5d841f39c4b7 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -87,6 +87,7 @@ extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; struct workqueue_struct *cifsiod_wq; +__u32 cifs_lock_secret; /* * Bumps refcount for cifs super block. @@ -1266,6 +1267,8 @@ init_cifs(void) spin_lock_init(&cifs_file_list_lock); spin_lock_init(&GlobalMid_Lock); + get_random_bytes(&cifs_lock_secret, sizeof(cifs_lock_secret)); + if (cifs_max_pending < 2) { cifs_max_pending = 2; cifs_dbg(FYI, "cifs_max_pending set to min of 2\n"); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index bba106cdc43c..8f1d8c1e72be 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1619,6 +1619,7 @@ void cifs_oplock_break(struct work_struct *work); extern const struct slow_work_ops cifs_oplock_break_ops; extern struct workqueue_struct *cifsiod_wq; +extern __u32 cifs_lock_secret; extern mempool_t *cifs_mid_poolp; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 66736f57b5ab..7d2b15c06090 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -428,7 +428,9 @@ cifs_echo_request(struct work_struct *work) * server->ops->need_neg() == true. Also, no need to ping if * we got a response recently. */ - if (!server->ops->need_neg || server->ops->need_neg(server) || + + if (server->tcpStatus == CifsNeedReconnect || + server->tcpStatus == CifsExiting || server->tcpStatus == CifsNew || (server->ops->can_echo && !server->ops->can_echo(server)) || time_before(jiffies, server->lstrp + echo_interval - HZ)) goto requeue_echo; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index c3eb998a99bd..fb0903fffc22 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -445,7 +445,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, * Check for hashed negative dentry. We have already revalidated * the dentry and it is fine. No need to perform another lookup. */ - if (!d_unhashed(direntry)) + if (!d_in_lookup(direntry)) return -ENOENT; res = cifs_lookup(inode, direntry, 0); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 9793ae0bcaa2..579e41b350a2 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1112,6 +1112,12 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) return rc; } +static __u32 +hash_lockowner(fl_owner_t owner) +{ + return cifs_lock_secret ^ hash32_ptr((const void *)owner); +} + struct lock_to_push { struct list_head llist; __u64 offset; @@ -1178,7 +1184,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) else type = CIFS_WRLCK; lck = list_entry(el, struct lock_to_push, llist); - lck->pid = flock->fl_pid; + lck->pid = hash_lockowner(flock->fl_owner); lck->netfid = cfile->fid.netfid; lck->length = length; lck->type = type; @@ -1305,7 +1311,8 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, posix_lock_type = CIFS_RDLCK; else posix_lock_type = CIFS_WRLCK; - rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid, + rc = CIFSSMBPosixLock(xid, tcon, netfid, + hash_lockowner(flock->fl_owner), flock->fl_start, length, flock, posix_lock_type, wait_flag); return rc; @@ -1505,7 +1512,8 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, posix_lock_type = CIFS_UNLCK; rc = CIFSSMBPosixLock(xid, tcon, cfile->fid.netfid, - current->tgid, flock->fl_start, length, + hash_lockowner(flock->fl_owner), + flock->fl_start, length, NULL, posix_lock_type, wait_flag); goto out; } @@ -3358,7 +3366,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, struct page *page, *tpage; unsigned int expected_index; int rc; - gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); + gfp_t gfp = readahead_gfp_mask(mapping); INIT_LIST_HEAD(tmplist); diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h index 848249fa120f..3079b38f0afb 100644 --- a/fs/cifs/ntlmssp.h +++ b/fs/cifs/ntlmssp.h @@ -133,6 +133,6 @@ typedef struct _AUTHENTICATE_MESSAGE { int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, struct cifs_ses *ses); void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, struct cifs_ses *ses); -int build_ntlmssp_auth_blob(unsigned char *pbuffer, u16 *buflen, +int build_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen, struct cifs_ses *ses, const struct nls_table *nls_cp); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index af0ec2d5ad0e..538d9b55699a 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -364,19 +364,43 @@ void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, sec_blob->DomainName.MaximumLength = 0; } -/* We do not malloc the blob, it is passed in pbuffer, because its - maximum possible size is fixed and small, making this approach cleaner. - This function returns the length of the data in the blob */ -int build_ntlmssp_auth_blob(unsigned char *pbuffer, +static int size_of_ntlmssp_blob(struct cifs_ses *ses) +{ + int sz = sizeof(AUTHENTICATE_MESSAGE) + ses->auth_key.len + - CIFS_SESS_KEY_SIZE + CIFS_CPHTXT_SIZE + 2; + + if (ses->domainName) + sz += 2 * strnlen(ses->domainName, CIFS_MAX_DOMAINNAME_LEN); + else + sz += 2; + + if (ses->user_name) + sz += 2 * strnlen(ses->user_name, CIFS_MAX_USERNAME_LEN); + else + sz += 2; + + return sz; +} + +int build_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen, struct cifs_ses *ses, const struct nls_table *nls_cp) { int rc; - AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer; + AUTHENTICATE_MESSAGE *sec_blob; __u32 flags; unsigned char *tmp; + rc = setup_ntlmv2_rsp(ses, nls_cp); + if (rc) { + cifs_dbg(VFS, "Error %d during NTLMSSP authentication\n", rc); + *buflen = 0; + goto setup_ntlmv2_ret; + } + *pbuffer = kmalloc(size_of_ntlmssp_blob(ses), GFP_KERNEL); + sec_blob = (AUTHENTICATE_MESSAGE *)*pbuffer; + memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); sec_blob->MessageType = NtLmAuthenticate; @@ -391,7 +415,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, flags |= NTLMSSP_NEGOTIATE_KEY_XCH; } - tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE); + tmp = *pbuffer + sizeof(AUTHENTICATE_MESSAGE); sec_blob->NegotiateFlags = cpu_to_le32(flags); sec_blob->LmChallengeResponse.BufferOffset = @@ -399,13 +423,9 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, sec_blob->LmChallengeResponse.Length = 0; sec_blob->LmChallengeResponse.MaximumLength = 0; - sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->NtChallengeResponse.BufferOffset = + cpu_to_le32(tmp - *pbuffer); if (ses->user_name != NULL) { - rc = setup_ntlmv2_rsp(ses, nls_cp); - if (rc) { - cifs_dbg(VFS, "Error %d during NTLMSSP authentication\n", rc); - goto setup_ntlmv2_ret; - } memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE, ses->auth_key.len - CIFS_SESS_KEY_SIZE); tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE; @@ -423,23 +443,23 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, } if (ses->domainName == NULL) { - sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - *pbuffer); sec_blob->DomainName.Length = 0; sec_blob->DomainName.MaximumLength = 0; tmp += 2; } else { int len; len = cifs_strtoUTF16((__le16 *)tmp, ses->domainName, - CIFS_MAX_USERNAME_LEN, nls_cp); + CIFS_MAX_DOMAINNAME_LEN, nls_cp); len *= 2; /* unicode is 2 bytes each */ - sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - *pbuffer); sec_blob->DomainName.Length = cpu_to_le16(len); sec_blob->DomainName.MaximumLength = cpu_to_le16(len); tmp += len; } if (ses->user_name == NULL) { - sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - *pbuffer); sec_blob->UserName.Length = 0; sec_blob->UserName.MaximumLength = 0; tmp += 2; @@ -448,13 +468,13 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, len = cifs_strtoUTF16((__le16 *)tmp, ses->user_name, CIFS_MAX_USERNAME_LEN, nls_cp); len *= 2; /* unicode is 2 bytes each */ - sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - *pbuffer); sec_blob->UserName.Length = cpu_to_le16(len); sec_blob->UserName.MaximumLength = cpu_to_le16(len); tmp += len; } - sec_blob->WorkstationName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->WorkstationName.BufferOffset = cpu_to_le32(tmp - *pbuffer); sec_blob->WorkstationName.Length = 0; sec_blob->WorkstationName.MaximumLength = 0; tmp += 2; @@ -463,19 +483,19 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, (ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC)) && !calc_seckey(ses)) { memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); - sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - *pbuffer); sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE); sec_blob->SessionKey.MaximumLength = cpu_to_le16(CIFS_CPHTXT_SIZE); tmp += CIFS_CPHTXT_SIZE; } else { - sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - *pbuffer); sec_blob->SessionKey.Length = 0; sec_blob->SessionKey.MaximumLength = 0; } + *buflen = tmp - *pbuffer; setup_ntlmv2_ret: - *buflen = tmp - pbuffer; return rc; } @@ -690,6 +710,8 @@ sess_auth_lanman(struct sess_data *sess_data) rc = calc_lanman_hash(ses->password, ses->server->cryptkey, ses->server->sec_mode & SECMODE_PW_ENCRYPT ? true : false, lnm_session_key); + if (rc) + goto out; memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); bcc_ptr += CIFS_AUTH_RESP_SIZE; @@ -1266,7 +1288,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) struct cifs_ses *ses = sess_data->ses; __u16 bytes_remaining; char *bcc_ptr; - char *ntlmsspblob = NULL; + unsigned char *ntlmsspblob = NULL; u16 blob_len; cifs_dbg(FYI, "rawntlmssp session setup authenticate phase\n"); @@ -1279,19 +1301,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) /* Build security blob before we assemble the request */ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; smb_buf = (struct smb_hdr *)pSMB; - /* - * 5 is an empirical value, large enough to hold - * authenticate message plus max 10 of av paris, - * domain, user, workstation names, flags, etc. - */ - ntlmsspblob = kzalloc(5*sizeof(struct _AUTHENTICATE_MESSAGE), - GFP_KERNEL); - if (!ntlmsspblob) { - rc = -ENOMEM; - goto out; - } - - rc = build_ntlmssp_auth_blob(ntlmsspblob, + rc = build_ntlmssp_auth_blob(&ntlmsspblob, &blob_len, ses, sess_data->nls_cp); if (rc) goto out_free_ntlmsspblob; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 8f38e33d365b..29e06db5f187 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -588,7 +588,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, u16 blob_length = 0; struct key *spnego_key = NULL; char *security_blob = NULL; - char *ntlmssp_blob = NULL; + unsigned char *ntlmssp_blob = NULL; bool use_spnego = false; /* else use raw ntlmssp */ cifs_dbg(FYI, "Session Setup\n"); @@ -713,13 +713,7 @@ ssetup_ntlmssp_authenticate: iov[1].iov_len = blob_length; } else if (phase == NtLmAuthenticate) { req->hdr.SessionId = ses->Suid; - ntlmssp_blob = kzalloc(sizeof(struct _NEGOTIATE_MESSAGE) + 500, - GFP_KERNEL); - if (ntlmssp_blob == NULL) { - rc = -ENOMEM; - goto ssetup_exit; - } - rc = build_ntlmssp_auth_blob(ntlmssp_blob, &blob_length, ses, + rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses, nls_cp); if (rc) { cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n", @@ -1818,6 +1812,33 @@ SMB2_echo(struct TCP_Server_Info *server) cifs_dbg(FYI, "In echo request\n"); + if (server->tcpStatus == CifsNeedNegotiate) { + struct list_head *tmp, *tmp2; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + + cifs_dbg(FYI, "Need negotiate, reconnecting tcons\n"); + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &server->smb_ses_list) { + ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + list_for_each(tmp2, &ses->tcon_list) { + tcon = list_entry(tmp2, struct cifs_tcon, + tcon_list); + /* add check for persistent handle reconnect */ + if (tcon && tcon->need_reconnect) { + spin_unlock(&cifs_tcp_ses_lock); + rc = smb2_reconnect(SMB2_ECHO, tcon); + spin_lock(&cifs_tcp_ses_lock); + } + } + } + spin_unlock(&cifs_tcp_ses_lock); + } + + /* if no session, renegotiate failed above */ + if (server->tcpStatus == CifsNeedNegotiate) + return -EIO; + rc = small_smb2_init(SMB2_ECHO, NULL, (void **)&req); if (rc) return rc; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index bd01b92aad98..c1e9f29c924c 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -57,6 +57,7 @@ #include <linux/i2c-dev.h> #include <linux/atalk.h> #include <linux/gfp.h> +#include <linux/cec.h> #include "internal.h" @@ -1377,6 +1378,17 @@ COMPATIBLE_IOCTL(VIDEO_GET_NAVI) COMPATIBLE_IOCTL(VIDEO_SET_ATTRIBUTES) COMPATIBLE_IOCTL(VIDEO_GET_SIZE) COMPATIBLE_IOCTL(VIDEO_GET_FRAME_RATE) +/* cec */ +COMPATIBLE_IOCTL(CEC_ADAP_G_CAPS) +COMPATIBLE_IOCTL(CEC_ADAP_G_LOG_ADDRS) +COMPATIBLE_IOCTL(CEC_ADAP_S_LOG_ADDRS) +COMPATIBLE_IOCTL(CEC_ADAP_G_PHYS_ADDR) +COMPATIBLE_IOCTL(CEC_ADAP_S_PHYS_ADDR) +COMPATIBLE_IOCTL(CEC_G_MODE) +COMPATIBLE_IOCTL(CEC_S_MODE) +COMPATIBLE_IOCTL(CEC_TRANSMIT) +COMPATIBLE_IOCTL(CEC_RECEIVE) +COMPATIBLE_IOCTL(CEC_DQEVENT) /* joystick */ COMPATIBLE_IOCTL(JSIOCGVERSION) diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 33b7ee34eda5..bbc1252a59f5 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -357,8 +357,6 @@ configfs_write_bin_file(struct file *file, const char __user *buf, len = simple_write_to_buffer(buffer->bin_buffer, buffer->bin_buffer_size, ppos, buf, count); - if (len > 0) - *ppos += len; out: mutex_unlock(&buffer->mutex); return len; diff --git a/fs/coredump.c b/fs/coredump.c index 38a7ab87e10a..281b768000e6 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -794,6 +794,7 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr) return 0; file->f_pos = pos; cprm->written += n; + cprm->pos += n; nr -= n; } return 1; @@ -808,6 +809,7 @@ int dump_skip(struct coredump_params *cprm, size_t nr) if (dump_interrupted() || file->f_op->llseek(file, nr, SEEK_CUR) < 0) return 0; + cprm->pos += nr; return 1; } else { while (nr > PAGE_SIZE) { @@ -822,7 +824,7 @@ EXPORT_SYMBOL(dump_skip); int dump_align(struct coredump_params *cprm, int align) { - unsigned mod = cprm->file->f_pos & (align - 1); + unsigned mod = cprm->pos & (align - 1); if (align & (align - 1)) return 0; return mod ? dump_skip(cprm, align - mod) : 1; diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 2fc8c43ce531..c502c116924c 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -318,6 +318,7 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, bio->bi_bdev = inode->i_sb->s_bdev; bio->bi_iter.bi_sector = pblk << (inode->i_sb->s_blocksize_bits - 9); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); ret = bio_add_page(bio, ciphertext_page, inode->i_sb->s_blocksize, 0); if (ret != inode->i_sb->s_blocksize) { @@ -327,7 +328,7 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, err = -EIO; goto errout; } - err = submit_bio_wait(WRITE, bio); + err = submit_bio_wait(bio); if ((err == 0) && bio->bi_error) err = -EIO; bio_put(bio); @@ -208,7 +208,12 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, dax.addr += first; size = map_len - first; } - max = min(pos + size, end); + /* + * pos + size is one past the last offset for IO, + * so pos + size can overflow loff_t at extreme offsets. + * Cast to u64 to catch this and get the true minimum. + */ + max = min_t(u64, pos + size, end); } if (iov_iter_rw(iter) == WRITE) { @@ -814,16 +819,16 @@ static int dax_insert_mapping(struct address_space *mapping, } /** - * __dax_fault - handle a page fault on a DAX file + * dax_fault - handle a page fault on a DAX file * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @get_block: The filesystem method used to translate file offsets to blocks * * When a page fault occurs, filesystems may call this helper in their - * fault handler for DAX files. __dax_fault() assumes the caller has done all + * fault handler for DAX files. dax_fault() assumes the caller has done all * the necessary locking for the page fault to proceed successfully. */ -int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, +int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block) { struct file *file = vma->vm_file; @@ -908,33 +913,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, return VM_FAULT_SIGBUS | major; return VM_FAULT_NOPAGE | major; } -EXPORT_SYMBOL(__dax_fault); - -/** - * dax_fault - handle a page fault on a DAX file - * @vma: The virtual memory area where the fault occurred - * @vmf: The description of the fault - * @get_block: The filesystem method used to translate file offsets to blocks - * - * When a page fault occurs, filesystems may call this helper in their - * fault handler for DAX files. - */ -int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) -{ - int result; - struct super_block *sb = file_inode(vma->vm_file)->i_sb; - - if (vmf->flags & FAULT_FLAG_WRITE) { - sb_start_pagefault(sb); - file_update_time(vma->vm_file); - } - result = __dax_fault(vma, vmf, get_block); - if (vmf->flags & FAULT_FLAG_WRITE) - sb_end_pagefault(sb); - - return result; -} EXPORT_SYMBOL_GPL(dax_fault); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) @@ -962,7 +940,16 @@ static void __dax_dbg(struct buffer_head *bh, unsigned long address, #define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd") -int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, +/** + * dax_pmd_fault - handle a PMD fault on a DAX file + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * @get_block: The filesystem method used to translate file offsets to blocks + * + * When a page fault occurs, filesystems may call this helper in their + * pmd_fault handler for DAX files. + */ +int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags, get_block_t get_block) { struct file *file = vma->vm_file; @@ -1114,7 +1101,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, * * The PMD path doesn't have an equivalent to * dax_pfn_mkwrite(), though, so for a read followed by a - * write we traverse all the way through __dax_pmd_fault() + * write we traverse all the way through dax_pmd_fault() * twice. This means we can just skip inserting a radix tree * entry completely on the initial read and just wait until * the write to insert a dirty entry. @@ -1143,33 +1130,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, result = VM_FAULT_FALLBACK; goto out; } -EXPORT_SYMBOL_GPL(__dax_pmd_fault); - -/** - * dax_pmd_fault - handle a PMD fault on a DAX file - * @vma: The virtual memory area where the fault occurred - * @vmf: The description of the fault - * @get_block: The filesystem method used to translate file offsets to blocks - * - * When a page fault occurs, filesystems may call this helper in their - * pmd_fault handler for DAX files. - */ -int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, get_block_t get_block) -{ - int result; - struct super_block *sb = file_inode(vma->vm_file)->i_sb; - - if (flags & FAULT_FLAG_WRITE) { - sb_start_pagefault(sb); - file_update_time(vma->vm_file); - } - result = __dax_pmd_fault(vma, address, pmd, flags, get_block); - if (flags & FAULT_FLAG_WRITE) - sb_end_pagefault(sb); - - return result; -} EXPORT_SYMBOL_GPL(dax_pmd_fault); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/fs/dcache.c b/fs/dcache.c index ad4a542e9bab..d6847d7b123d 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -507,6 +507,44 @@ void d_drop(struct dentry *dentry) } EXPORT_SYMBOL(d_drop); +static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent) +{ + struct dentry *next; + /* + * Inform d_walk() and shrink_dentry_list() that we are no longer + * attached to the dentry tree + */ + dentry->d_flags |= DCACHE_DENTRY_KILLED; + if (unlikely(list_empty(&dentry->d_child))) + return; + __list_del_entry(&dentry->d_child); + /* + * Cursors can move around the list of children. While we'd been + * a normal list member, it didn't matter - ->d_child.next would've + * been updated. However, from now on it won't be and for the + * things like d_walk() it might end up with a nasty surprise. + * Normally d_walk() doesn't care about cursors moving around - + * ->d_lock on parent prevents that and since a cursor has no children + * of its own, we get through it without ever unlocking the parent. + * There is one exception, though - if we ascend from a child that + * gets killed as soon as we unlock it, the next sibling is found + * using the value left in its ->d_child.next. And if _that_ + * pointed to a cursor, and cursor got moved (e.g. by lseek()) + * before d_walk() regains parent->d_lock, we'll end up skipping + * everything the cursor had been moved past. + * + * Solution: make sure that the pointer left behind in ->d_child.next + * points to something that won't be moving around. I.e. skip the + * cursors. + */ + while (dentry->d_child.next != &parent->d_subdirs) { + next = list_entry(dentry->d_child.next, struct dentry, d_child); + if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR))) + break; + dentry->d_child.next = next->d_child.next; + } +} + static void __dentry_kill(struct dentry *dentry) { struct dentry *parent = NULL; @@ -532,12 +570,7 @@ static void __dentry_kill(struct dentry *dentry) } /* if it was on the hash then remove it */ __d_drop(dentry); - __list_del_entry(&dentry->d_child); - /* - * Inform d_walk() that we are no longer attached to the - * dentry tree - */ - dentry->d_flags |= DCACHE_DENTRY_KILLED; + dentry_unlist(dentry, parent); if (parent) spin_unlock(&parent->d_lock); dentry_iput(dentry); @@ -1203,6 +1236,9 @@ resume: struct dentry *dentry = list_entry(tmp, struct dentry, d_child); next = tmp->next; + if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR)) + continue; + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); ret = enter(data, dentry); @@ -1636,7 +1672,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) struct dentry *dentry = __d_alloc(parent->d_sb, name); if (!dentry) return NULL; - + dentry->d_flags |= DCACHE_RCUACCESS; spin_lock(&parent->d_lock); /* * don't need child lock because it is not subject @@ -1651,6 +1687,16 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) } EXPORT_SYMBOL(d_alloc); +struct dentry *d_alloc_cursor(struct dentry * parent) +{ + struct dentry *dentry = __d_alloc(parent->d_sb, NULL); + if (dentry) { + dentry->d_flags |= DCACHE_RCUACCESS | DCACHE_DENTRY_CURSOR; + dentry->d_parent = dget(parent); + } + return dentry; +} + /** * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems) * @sb: the superblock @@ -2358,7 +2404,6 @@ static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b) { BUG_ON(!d_unhashed(entry)); hlist_bl_lock(b); - entry->d_flags |= DCACHE_RCUACCESS; hlist_bl_add_head_rcu(&entry->d_hash, b); hlist_bl_unlock(b); } @@ -2458,7 +2503,6 @@ retry: rcu_read_unlock(); goto retry; } - rcu_read_unlock(); /* * No changes for the parent since the beginning of d_lookup(). * Since all removals from the chain happen with hlist_bl_lock(), @@ -2471,8 +2515,6 @@ retry: continue; if (dentry->d_parent != parent) continue; - if (d_unhashed(dentry)) - continue; if (parent->d_flags & DCACHE_OP_COMPARE) { int tlen = dentry->d_name.len; const char *tname = dentry->d_name.name; @@ -2484,9 +2526,18 @@ retry: if (dentry_cmp(dentry, str, len)) continue; } - dget(dentry); hlist_bl_unlock(b); - /* somebody is doing lookup for it right now; wait for it */ + /* now we can try to grab a reference */ + if (!lockref_get_not_dead(&dentry->d_lockref)) { + rcu_read_unlock(); + goto retry; + } + + rcu_read_unlock(); + /* + * somebody is likely to be still doing lookup for it; + * wait for them to finish + */ spin_lock(&dentry->d_lock); d_wait_lookup(dentry); /* @@ -2517,6 +2568,7 @@ retry: dput(new); return dentry; } + rcu_read_unlock(); /* we can't take ->d_lock here; it's OK, though. */ new->d_flags |= DCACHE_PAR_LOOKUP; new->d_wait = wq; @@ -2843,6 +2895,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target, /* ... and switch them in the tree */ if (IS_ROOT(dentry)) { /* splicing a tree */ + dentry->d_flags |= DCACHE_RCUACCESS; dentry->d_parent = target->d_parent; target->d_parent = target; list_del_init(&target->d_child); diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 9c1c9a01b7e5..592059f88e04 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -127,7 +127,6 @@ static int open_proxy_open(struct inode *inode, struct file *filp) r = real_fops->open(inode, filp); out: - fops_put(real_fops); debugfs_use_file_finish(srcu_idx); return r; } @@ -262,8 +261,10 @@ static int full_proxy_open(struct inode *inode, struct file *filp) if (real_fops->open) { r = real_fops->open(inode, filp); - - if (filp->f_op != proxy_fops) { + if (r) { + replace_fops(filp, d_inode(dentry)->i_fop); + goto free_proxy; + } else if (filp->f_op != proxy_fops) { /* No protection against file removal anymore. */ WARN(1, "debugfs file owner replaced proxy fops: %pd", dentry); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 0b2954d7172d..37c134a132c7 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -95,8 +95,6 @@ static struct ctl_table pty_root_table[] = { static DEFINE_MUTEX(allocated_ptys_lock); -static struct vfsmount *devpts_mnt; - struct pts_mount_opts { int setuid; int setgid; @@ -104,7 +102,7 @@ struct pts_mount_opts { kgid_t gid; umode_t mode; umode_t ptmxmode; - int newinstance; + int reserve; int max; }; @@ -117,11 +115,9 @@ static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_mode, "mode=%o"}, -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES {Opt_ptmxmode, "ptmxmode=%o"}, {Opt_newinstance, "newinstance"}, {Opt_max, "max=%d"}, -#endif {Opt_err, NULL} }; @@ -137,15 +133,48 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) return sb->s_fs_info; } -static inline struct super_block *pts_sb_from_inode(struct inode *inode) +struct pts_fs_info *devpts_acquire(struct file *filp) { -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES - if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) - return inode->i_sb; -#endif - if (!devpts_mnt) - return NULL; - return devpts_mnt->mnt_sb; + struct pts_fs_info *result; + struct path path; + struct super_block *sb; + int err; + + path = filp->f_path; + path_get(&path); + + /* Has the devpts filesystem already been found? */ + sb = path.mnt->mnt_sb; + if (sb->s_magic != DEVPTS_SUPER_MAGIC) { + /* Is a devpts filesystem at "pts" in the same directory? */ + err = path_pts(&path); + if (err) { + result = ERR_PTR(err); + goto out; + } + + /* Is the path the root of a devpts filesystem? */ + result = ERR_PTR(-ENODEV); + sb = path.mnt->mnt_sb; + if ((sb->s_magic != DEVPTS_SUPER_MAGIC) || + (path.mnt->mnt_root != sb->s_root)) + goto out; + } + + /* + * pty code needs to hold extra references in case of last /dev/tty close + */ + atomic_inc(&sb->s_active); + result = DEVPTS_SB(sb); + +out: + path_put(&path); + return result; +} + +void devpts_release(struct pts_fs_info *fsi) +{ + deactivate_super(fsi->sb); } #define PARSE_MOUNT 0 @@ -154,9 +183,7 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode) /* * parse_mount_options(): * Set @opts to mount options specified in @data. If an option is not - * specified in @data, set it to its default value. The exception is - * 'newinstance' option which can only be set/cleared on a mount (i.e. - * cannot be changed during remount). + * specified in @data, set it to its default value. * * Note: @data may be NULL (in which case all options are set to default). */ @@ -174,9 +201,12 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; opts->max = NR_UNIX98_PTY_MAX; - /* newinstance makes sense only on initial mount */ + /* Only allow instances mounted from the initial mount + * namespace to tap the reserve pool of ptys. + */ if (op == PARSE_MOUNT) - opts->newinstance = 0; + opts->reserve = + (current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns); while ((p = strsep(&data, ",")) != NULL) { substring_t args[MAX_OPT_ARGS]; @@ -211,16 +241,12 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) return -EINVAL; opts->mode = option & S_IALLUGO; break; -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES case Opt_ptmxmode: if (match_octal(&args[0], &option)) return -EINVAL; opts->ptmxmode = option & S_IALLUGO; break; case Opt_newinstance: - /* newinstance makes sense only on initial mount */ - if (op == PARSE_MOUNT) - opts->newinstance = 1; break; case Opt_max: if (match_int(&args[0], &option) || @@ -228,7 +254,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) return -EINVAL; opts->max = option; break; -#endif default: pr_err("called with bogus options\n"); return -EINVAL; @@ -238,7 +263,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) return 0; } -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES static int mknod_ptmx(struct super_block *sb) { int mode; @@ -305,12 +329,6 @@ static void update_ptmx_mode(struct pts_fs_info *fsi) inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode; } } -#else -static inline void update_ptmx_mode(struct pts_fs_info *fsi) -{ - return; -} -#endif static int devpts_remount(struct super_block *sb, int *flags, char *data) { @@ -344,11 +362,9 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, opts->gid)); seq_printf(seq, ",mode=%03o", opts->mode); -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode); if (opts->max < NR_UNIX98_PTY_MAX) seq_printf(seq, ",max=%d", opts->max); -#endif return 0; } @@ -410,40 +426,11 @@ fail: return -ENOMEM; } -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES -static int compare_init_pts_sb(struct super_block *s, void *p) -{ - if (devpts_mnt) - return devpts_mnt->mnt_sb == s; - return 0; -} - /* * devpts_mount() * - * If the '-o newinstance' mount option was specified, mount a new - * (private) instance of devpts. PTYs created in this instance are - * independent of the PTYs in other devpts instances. - * - * If the '-o newinstance' option was not specified, mount/remount the - * initial kernel mount of devpts. This type of mount gives the - * legacy, single-instance semantics. - * - * The 'newinstance' option is needed to support multiple namespace - * semantics in devpts while preserving backward compatibility of the - * current 'single-namespace' semantics. i.e all mounts of devpts - * without the 'newinstance' mount option should bind to the initial - * kernel mount, like mount_single(). - * - * Mounts with 'newinstance' option create a new, private namespace. - * - * NOTE: - * - * For single-mount semantics, devpts cannot use mount_single(), - * because mount_single()/sget() find and use the super-block from - * the most recent mount of devpts. But that recent mount may be a - * 'newinstance' mount and mount_single() would pick the newinstance - * super-block instead of the initial super-block. + * Mount a new (private) instance of devpts. PTYs created in this + * instance are independent of the PTYs in other devpts instances. */ static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) @@ -456,18 +443,7 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type, if (error) return ERR_PTR(error); - /* Require newinstance for all user namespace mounts to ensure - * the mount options are not changed. - */ - if ((current_user_ns() != &init_user_ns) && !opts.newinstance) - return ERR_PTR(-EINVAL); - - if (opts.newinstance) - s = sget(fs_type, NULL, set_anon_super, flags, NULL); - else - s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags, - NULL); - + s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); @@ -491,18 +467,6 @@ out_undo_sget: return ERR_PTR(error); } -#else -/* - * This supports only the legacy single-instance semantics (no - * multiple-instance semantics) - */ -static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) -{ - return mount_single(fs_type, flags, data, devpts_fill_super); -} -#endif - static void devpts_kill_sb(struct super_block *sb) { struct pts_fs_info *fsi = DEVPTS_SB(sb); @@ -516,9 +480,7 @@ static struct file_system_type devpts_fs_type = { .name = "devpts", .mount = devpts_mount, .kill_sb = devpts_kill_sb, -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES .fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT, -#endif }; /* @@ -531,16 +493,13 @@ int devpts_new_index(struct pts_fs_info *fsi) int index; int ida_ret; - if (!fsi) - return -ENODEV; - retry: if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL)) return -ENOMEM; mutex_lock(&allocated_ptys_lock); - if (pty_count >= pty_limit - - (fsi->mount_opts.newinstance ? pty_reserve : 0)) { + if (pty_count >= (pty_limit - + (fsi->mount_opts.reserve ? 0 : pty_reserve))) { mutex_unlock(&allocated_ptys_lock); return -ENOSPC; } @@ -571,30 +530,6 @@ void devpts_kill_index(struct pts_fs_info *fsi, int idx) mutex_unlock(&allocated_ptys_lock); } -/* - * pty code needs to hold extra references in case of last /dev/tty close - */ -struct pts_fs_info *devpts_get_ref(struct inode *ptmx_inode, struct file *file) -{ - struct super_block *sb; - struct pts_fs_info *fsi; - - sb = pts_sb_from_inode(ptmx_inode); - if (!sb) - return NULL; - fsi = DEVPTS_SB(sb); - if (!fsi) - return NULL; - - atomic_inc(&sb->s_active); - return fsi; -} - -void devpts_put_ref(struct pts_fs_info *fsi) -{ - deactivate_super(fsi->sb); -} - /** * devpts_pty_new -- create a new inode in /dev/pts/ * @ptmx_inode: inode of the master @@ -607,16 +542,12 @@ void devpts_put_ref(struct pts_fs_info *fsi) struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) { struct dentry *dentry; - struct super_block *sb; + struct super_block *sb = fsi->sb; struct inode *inode; struct dentry *root; struct pts_mount_opts *opts; char s[12]; - if (!fsi) - return ERR_PTR(-ENODEV); - - sb = fsi->sb; root = sb->s_root; opts = &fsi->mount_opts; @@ -676,20 +607,8 @@ void devpts_pty_kill(struct dentry *dentry) static int __init init_devpts_fs(void) { int err = register_filesystem(&devpts_fs_type); - struct ctl_table_header *table; - if (!err) { - struct vfsmount *mnt; - - table = register_sysctl_table(pty_root_table); - mnt = kern_mount(&devpts_fs_type); - if (IS_ERR(mnt)) { - err = PTR_ERR(mnt); - unregister_filesystem(&devpts_fs_type); - unregister_sysctl_table(table); - } else { - devpts_mnt = mnt; - } + register_sysctl_table(pty_root_table); } return err; } diff --git a/fs/direct-io.c b/fs/direct-io.c index f3b4408be590..7c3ce73cb617 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -108,7 +108,8 @@ struct dio_submit { /* dio_state communicated between submission path and end_io */ struct dio { int flags; /* doesn't change */ - int rw; + int op; + int op_flags; blk_qc_t bio_cookie; struct block_device *bio_bdev; struct inode *inode; @@ -163,7 +164,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES, &sdio->from); - if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { + if (ret < 0 && sdio->blocks_available && (dio->op == REQ_OP_WRITE)) { struct page *page = ZERO_PAGE(0); /* * A memory fault, but the filesystem has some outstanding @@ -242,7 +243,8 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) transferred = dio->result; /* Check for short read case */ - if ((dio->rw == READ) && ((offset + transferred) > dio->i_size)) + if ((dio->op == REQ_OP_READ) && + ((offset + transferred) > dio->i_size)) transferred = dio->i_size - offset; } @@ -273,7 +275,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) */ dio->iocb->ki_pos += transferred; - if (dio->rw & WRITE) + if (dio->op == REQ_OP_WRITE) ret = generic_write_sync(dio->iocb, transferred); dio->iocb->ki_complete(dio->iocb, ret, 0); } @@ -375,6 +377,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, bio->bi_bdev = bdev; bio->bi_iter.bi_sector = first_sector; + bio_set_op_attrs(bio, dio->op, dio->op_flags); if (dio->is_async) bio->bi_end_io = dio_bio_end_aio; else @@ -402,17 +405,16 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) dio->refcount++; spin_unlock_irqrestore(&dio->bio_lock, flags); - if (dio->is_async && dio->rw == READ && dio->should_dirty) + if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) bio_set_pages_dirty(bio); dio->bio_bdev = bio->bi_bdev; if (sdio->submit_io) { - sdio->submit_io(dio->rw, bio, dio->inode, - sdio->logical_offset_in_bio); + sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio); dio->bio_cookie = BLK_QC_T_NONE; } else - dio->bio_cookie = submit_bio(dio->rw, bio); + dio->bio_cookie = submit_bio(bio); sdio->bio = NULL; sdio->boundary = 0; @@ -478,14 +480,14 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) if (bio->bi_error) dio->io_error = -EIO; - if (dio->is_async && dio->rw == READ && dio->should_dirty) { + if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) { err = bio->bi_error; bio_check_pages_dirty(bio); /* transfers ownership */ } else { bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - if (dio->rw == READ && !PageCompound(page) && + if (dio->op == REQ_OP_READ && !PageCompound(page) && dio->should_dirty) set_page_dirty_lock(page); put_page(page); @@ -638,7 +640,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, * which may decide to handle it or also return an unmapped * buffer head. */ - create = dio->rw & WRITE; + create = dio->op == REQ_OP_WRITE; if (dio->flags & DIO_SKIP_HOLES) { if (fs_startblk <= ((i_size_read(dio->inode) - 1) >> i_blkbits)) @@ -788,7 +790,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, { int ret = 0; - if (dio->rw & WRITE) { + if (dio->op == REQ_OP_WRITE) { /* * Read accounting is performed in submit_bio() */ @@ -988,7 +990,7 @@ do_holes: loff_t i_size_aligned; /* AKPM: eargh, -ENOTBLK is a hack */ - if (dio->rw & WRITE) { + if (dio->op == REQ_OP_WRITE) { put_page(page); return -ENOTBLK; } @@ -1202,7 +1204,12 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio->is_async = true; dio->inode = inode; - dio->rw = iov_iter_rw(iter) == WRITE ? WRITE_ODIRECT : READ; + if (iov_iter_rw(iter) == WRITE) { + dio->op = REQ_OP_WRITE; + dio->op_flags = WRITE_ODIRECT; + } else { + dio->op = REQ_OP_READ; + } /* * For AIO O_(D)SYNC writes we need to defer completions to a workqueue diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 0d8eb3455b34..e5e29f8c920b 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -45,7 +45,7 @@ * ecryptfs_to_hex * @dst: Buffer to take hex character representation of contents of * src; must be at least of size (src_size * 2) - * @src: Buffer to be converted to a hex string respresentation + * @src: Buffer to be converted to a hex string representation * @src_size: number of bytes to convert */ void ecryptfs_to_hex(char *dst, char *src, size_t src_size) @@ -60,7 +60,7 @@ void ecryptfs_to_hex(char *dst, char *src, size_t src_size) * ecryptfs_from_hex * @dst: Buffer to take the bytes from src hex; must be at least of * size (src_size / 2) - * @src: Buffer to be converted from a hex string respresentation to raw value + * @src: Buffer to be converted from a hex string representation to raw value * @dst_size: size of dst buffer, or number of hex characters pairs to convert */ void ecryptfs_from_hex(char *dst, char *src, int dst_size) @@ -953,7 +953,7 @@ struct ecryptfs_cipher_code_str_map_elem { }; /* Add support for additional ciphers by adding elements here. The - * cipher_code is whatever OpenPGP applicatoins use to identify the + * cipher_code is whatever OpenPGP applications use to identify the * ciphers. List in order of probability. */ static struct ecryptfs_cipher_code_str_map_elem ecryptfs_cipher_code_str_map[] = { @@ -1410,7 +1410,7 @@ int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry, * * Common entry point for reading file metadata. From here, we could * retrieve the header information from the header region of the file, - * the xattr region of the file, or some other repostory that is + * the xattr region of the file, or some other repository that is * stored separately from the file itself. The current implementation * supports retrieving the metadata information from the file contents * and from the xattr region. diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 7000b96b783e..ca4e83750214 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -169,9 +169,22 @@ out: return rc; } +static int ecryptfs_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct file *lower_file = ecryptfs_file_to_lower(file); + /* + * Don't allow mmap on top of file systems that don't support it + * natively. If FILESYSTEM_MAX_STACK_DEPTH > 2 or ecryptfs + * allows recursive mounting, this will need to be extended. + */ + if (!lower_file->f_op->mmap) + return -ENODEV; + return generic_file_mmap(file, vma); +} + /** * ecryptfs_open - * @inode: inode speciying file to open + * @inode: inode specifying file to open * @file: Structure to return filled in * * Opens the file specified by inode. @@ -240,7 +253,7 @@ out: /** * ecryptfs_dir_open - * @inode: inode speciying file to open + * @inode: inode specifying file to open * @file: Structure to return filled in * * Opens the file specified by inode. @@ -403,7 +416,7 @@ const struct file_operations ecryptfs_main_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ecryptfs_mmap, .open = ecryptfs_open, .flush = ecryptfs_flush, .release = ecryptfs_release, diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 1698132d0e57..612004495141 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -738,8 +738,7 @@ static void ecryptfs_free_kmem_caches(void) struct ecryptfs_cache_info *info; info = &ecryptfs_cache_infos[i]; - if (*(info->cache)) - kmem_cache_destroy(*(info->cache)); + kmem_cache_destroy(*(info->cache)); } } diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 7bd8ac8dfb28..8bb72807e70d 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -878,7 +878,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) } else { bio = master_dev->bio; /* FIXME: bio_set_dir() */ - bio->bi_rw |= REQ_WRITE; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); } osd_req_write(or, _ios_obj(ios, cur_comp), diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 9f9992b37924..4c40c0786e16 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -1194,6 +1194,27 @@ static int ext2_has_free_blocks(struct ext2_sb_info *sbi) } /* + * Returns 1 if the passed-in block region is valid; 0 if some part overlaps + * with filesystem metadata blocksi. + */ +int ext2_data_block_valid(struct ext2_sb_info *sbi, ext2_fsblk_t start_blk, + unsigned int count) +{ + if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (start_blk + count < start_blk) || + (start_blk > le32_to_cpu(sbi->s_es->s_blocks_count))) + return 0; + + /* Ensure we do not step over superblock */ + if ((start_blk <= sbi->s_sb_block) && + (start_blk + count >= sbi->s_sb_block)) + return 0; + + + return 1; +} + +/* * ext2_new_blocks() -- core block(s) allocation function * @inode: file inode * @goal: given target block(filesystem wide) diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 170939f379d7..3fb93681bf7f 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -367,6 +367,7 @@ struct ext2_inode { */ #define EXT2_VALID_FS 0x0001 /* Unmounted cleanly */ #define EXT2_ERROR_FS 0x0002 /* Errors detected */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ /* * Mount flags @@ -739,6 +740,8 @@ extern unsigned long ext2_bg_num_gdb(struct super_block *sb, int group); extern ext2_fsblk_t ext2_new_block(struct inode *, unsigned long, int *); extern ext2_fsblk_t ext2_new_blocks(struct inode *, unsigned long, unsigned long *, int *); +extern int ext2_data_block_valid(struct ext2_sb_info *sbi, ext2_fsblk_t start_blk, + unsigned int count); extern void ext2_free_blocks (struct inode *, unsigned long, unsigned long); extern unsigned long ext2_count_free_blocks (struct super_block *); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 868c02317b05..5efeefe17abb 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -51,7 +51,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = __dax_fault(vma, vmf, ext2_get_block); + ret = dax_fault(vma, vmf, ext2_get_block); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) @@ -72,7 +72,7 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, } down_read(&ei->dax_sem); - ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block); + ret = dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block); up_read(&ei->dax_sem); if (flags & FAULT_FLAG_WRITE) diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index fcbe58641e40..d5c7d09919f3 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1389,6 +1389,16 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) ei->i_frag_size = raw_inode->i_fsize; ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); ei->i_dir_acl = 0; + + if (ei->i_file_acl && + !ext2_data_block_valid(EXT2_SB(sb), ei->i_file_acl, 1)) { + ext2_error(sb, "ext2_iget", "bad extended attribute block %u", + ei->i_file_acl); + brelse(bh); + ret = -EFSCORRUPTED; + goto bad_inode; + } + if (S_ISREG(inode->i_mode)) inode->i_size |= ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; else diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 1a5e3bff0b63..b7f896f3f7a7 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -759,10 +759,19 @@ void ext2_xattr_delete_inode(struct inode *inode) { struct buffer_head *bh = NULL; + struct ext2_sb_info *sbi = EXT2_SB(inode->i_sb); down_write(&EXT2_I(inode)->xattr_sem); if (!EXT2_I(inode)->i_file_acl) goto cleanup; + + if (!ext2_data_block_valid(sbi, EXT2_I(inode)->i_file_acl, 0)) { + ext2_error(inode->i_sb, "ext2_xattr_delete_inode", + "inode %ld: xattr block %d is out of data blocks range", + inode->i_ino, EXT2_I(inode)->i_file_acl); + goto cleanup; + } + bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl); if (!bh) { ext2_error(inode->i_sb, "ext2_xattr_delete_inode", diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index b46e9fc64196..e38039fd96ff 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -99,17 +99,9 @@ config EXT4_FS_SECURITY extended attributes for file security labels, say N. config EXT4_ENCRYPTION - tristate "Ext4 Encryption" + bool "Ext4 Encryption" depends on EXT4_FS - select CRYPTO_AES - select CRYPTO_CBC - select CRYPTO_ECB - select CRYPTO_XTS - select CRYPTO_CTS - select CRYPTO_CTR - select CRYPTO_SHA256 - select KEYS - select ENCRYPTED_KEYS + select FS_ENCRYPTION help Enable encryption of ext4 files and directories. This feature is similar to ecryptfs, but it is more memory diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index f52cf54f0cbc..354103f3490c 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -12,5 +12,3 @@ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o -ext4-$(CONFIG_EXT4_FS_ENCRYPTION) += crypto_policy.o crypto.o \ - crypto_key.o crypto_fname.o diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 3020fd70c392..e04ec868e37e 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -208,6 +208,9 @@ static int ext4_init_block_bitmap(struct super_block *sb, memset(bh->b_data, 0, sb->s_blocksize); bit_max = ext4_num_base_meta_clusters(sb, block_group); + if ((bit_max >> 3) >= bh->b_size) + return -EFSCORRUPTED; + for (bit = 0; bit < bit_max; bit++) ext4_set_bit(bit, bh->b_data); @@ -470,7 +473,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) trace_ext4_read_block_bitmap_load(sb, block_group); bh->b_end_io = ext4_end_bitmap_read; get_bh(bh); - submit_bh(READ | REQ_META | REQ_PRIO, bh); + submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); return bh; verify: err = ext4_validate_block_bitmap(sb, desc, block_group, bh); @@ -610,7 +613,9 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); - jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); + smp_mb(); + if (EXT4_SB(sb)->s_mb_free_pending) + jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); return 1; } diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c deleted file mode 100644 index 6a6c27373b54..000000000000 --- a/fs/ext4/crypto.c +++ /dev/null @@ -1,536 +0,0 @@ -/* - * linux/fs/ext4/crypto.c - * - * Copyright (C) 2015, Google, Inc. - * - * This contains encryption functions for ext4 - * - * Written by Michael Halcrow, 2014. - * - * Filename encryption additions - * Uday Savagaonkar, 2014 - * Encryption policy handling additions - * Ildar Muslukhov, 2014 - * - * This has not yet undergone a rigorous security audit. - * - * The usage of AES-XTS should conform to recommendations in NIST - * Special Publication 800-38E and IEEE P1619/D16. - */ - -#include <crypto/skcipher.h> -#include <keys/user-type.h> -#include <keys/encrypted-type.h> -#include <linux/ecryptfs.h> -#include <linux/gfp.h> -#include <linux/kernel.h> -#include <linux/key.h> -#include <linux/list.h> -#include <linux/mempool.h> -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/random.h> -#include <linux/scatterlist.h> -#include <linux/spinlock_types.h> -#include <linux/namei.h> - -#include "ext4_extents.h" -#include "xattr.h" - -/* Encryption added and removed here! (L: */ - -static unsigned int num_prealloc_crypto_pages = 32; -static unsigned int num_prealloc_crypto_ctxs = 128; - -module_param(num_prealloc_crypto_pages, uint, 0444); -MODULE_PARM_DESC(num_prealloc_crypto_pages, - "Number of crypto pages to preallocate"); -module_param(num_prealloc_crypto_ctxs, uint, 0444); -MODULE_PARM_DESC(num_prealloc_crypto_ctxs, - "Number of crypto contexts to preallocate"); - -static mempool_t *ext4_bounce_page_pool; - -static LIST_HEAD(ext4_free_crypto_ctxs); -static DEFINE_SPINLOCK(ext4_crypto_ctx_lock); - -static struct kmem_cache *ext4_crypto_ctx_cachep; -struct kmem_cache *ext4_crypt_info_cachep; - -/** - * ext4_release_crypto_ctx() - Releases an encryption context - * @ctx: The encryption context to release. - * - * If the encryption context was allocated from the pre-allocated pool, returns - * it to that pool. Else, frees it. - * - * If there's a bounce page in the context, this frees that. - */ -void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx) -{ - unsigned long flags; - - if (ctx->flags & EXT4_WRITE_PATH_FL && ctx->w.bounce_page) - mempool_free(ctx->w.bounce_page, ext4_bounce_page_pool); - ctx->w.bounce_page = NULL; - ctx->w.control_page = NULL; - if (ctx->flags & EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL) { - kmem_cache_free(ext4_crypto_ctx_cachep, ctx); - } else { - spin_lock_irqsave(&ext4_crypto_ctx_lock, flags); - list_add(&ctx->free_list, &ext4_free_crypto_ctxs); - spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags); - } -} - -/** - * ext4_get_crypto_ctx() - Gets an encryption context - * @inode: The inode for which we are doing the crypto - * - * Allocates and initializes an encryption context. - * - * Return: An allocated and initialized encryption context on success; error - * value or NULL otherwise. - */ -struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode, - gfp_t gfp_flags) -{ - struct ext4_crypto_ctx *ctx = NULL; - int res = 0; - unsigned long flags; - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - - if (ci == NULL) - return ERR_PTR(-ENOKEY); - - /* - * We first try getting the ctx from a free list because in - * the common case the ctx will have an allocated and - * initialized crypto tfm, so it's probably a worthwhile - * optimization. For the bounce page, we first try getting it - * from the kernel allocator because that's just about as fast - * as getting it from a list and because a cache of free pages - * should generally be a "last resort" option for a filesystem - * to be able to do its job. - */ - spin_lock_irqsave(&ext4_crypto_ctx_lock, flags); - ctx = list_first_entry_or_null(&ext4_free_crypto_ctxs, - struct ext4_crypto_ctx, free_list); - if (ctx) - list_del(&ctx->free_list); - spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags); - if (!ctx) { - ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, gfp_flags); - if (!ctx) { - res = -ENOMEM; - goto out; - } - ctx->flags |= EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL; - } else { - ctx->flags &= ~EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL; - } - ctx->flags &= ~EXT4_WRITE_PATH_FL; - -out: - if (res) { - if (!IS_ERR_OR_NULL(ctx)) - ext4_release_crypto_ctx(ctx); - ctx = ERR_PTR(res); - } - return ctx; -} - -struct workqueue_struct *ext4_read_workqueue; -static DEFINE_MUTEX(crypto_init); - -/** - * ext4_exit_crypto() - Shutdown the ext4 encryption system - */ -void ext4_exit_crypto(void) -{ - struct ext4_crypto_ctx *pos, *n; - - list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list) - kmem_cache_free(ext4_crypto_ctx_cachep, pos); - INIT_LIST_HEAD(&ext4_free_crypto_ctxs); - if (ext4_bounce_page_pool) - mempool_destroy(ext4_bounce_page_pool); - ext4_bounce_page_pool = NULL; - if (ext4_read_workqueue) - destroy_workqueue(ext4_read_workqueue); - ext4_read_workqueue = NULL; - if (ext4_crypto_ctx_cachep) - kmem_cache_destroy(ext4_crypto_ctx_cachep); - ext4_crypto_ctx_cachep = NULL; - if (ext4_crypt_info_cachep) - kmem_cache_destroy(ext4_crypt_info_cachep); - ext4_crypt_info_cachep = NULL; -} - -/** - * ext4_init_crypto() - Set up for ext4 encryption. - * - * We only call this when we start accessing encrypted files, since it - * results in memory getting allocated that wouldn't otherwise be used. - * - * Return: Zero on success, non-zero otherwise. - */ -int ext4_init_crypto(void) -{ - int i, res = -ENOMEM; - - mutex_lock(&crypto_init); - if (ext4_read_workqueue) - goto already_initialized; - ext4_read_workqueue = alloc_workqueue("ext4_crypto", WQ_HIGHPRI, 0); - if (!ext4_read_workqueue) - goto fail; - - ext4_crypto_ctx_cachep = KMEM_CACHE(ext4_crypto_ctx, - SLAB_RECLAIM_ACCOUNT); - if (!ext4_crypto_ctx_cachep) - goto fail; - - ext4_crypt_info_cachep = KMEM_CACHE(ext4_crypt_info, - SLAB_RECLAIM_ACCOUNT); - if (!ext4_crypt_info_cachep) - goto fail; - - for (i = 0; i < num_prealloc_crypto_ctxs; i++) { - struct ext4_crypto_ctx *ctx; - - ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, GFP_NOFS); - if (!ctx) { - res = -ENOMEM; - goto fail; - } - list_add(&ctx->free_list, &ext4_free_crypto_ctxs); - } - - ext4_bounce_page_pool = - mempool_create_page_pool(num_prealloc_crypto_pages, 0); - if (!ext4_bounce_page_pool) { - res = -ENOMEM; - goto fail; - } -already_initialized: - mutex_unlock(&crypto_init); - return 0; -fail: - ext4_exit_crypto(); - mutex_unlock(&crypto_init); - return res; -} - -void ext4_restore_control_page(struct page *data_page) -{ - struct ext4_crypto_ctx *ctx = - (struct ext4_crypto_ctx *)page_private(data_page); - - set_page_private(data_page, (unsigned long)NULL); - ClearPagePrivate(data_page); - unlock_page(data_page); - ext4_release_crypto_ctx(ctx); -} - -/** - * ext4_crypt_complete() - The completion callback for page encryption - * @req: The asynchronous encryption request context - * @res: The result of the encryption operation - */ -static void ext4_crypt_complete(struct crypto_async_request *req, int res) -{ - struct ext4_completion_result *ecr = req->data; - - if (res == -EINPROGRESS) - return; - ecr->res = res; - complete(&ecr->completion); -} - -typedef enum { - EXT4_DECRYPT = 0, - EXT4_ENCRYPT, -} ext4_direction_t; - -static int ext4_page_crypto(struct inode *inode, - ext4_direction_t rw, - pgoff_t index, - struct page *src_page, - struct page *dest_page, - gfp_t gfp_flags) - -{ - u8 xts_tweak[EXT4_XTS_TWEAK_SIZE]; - struct skcipher_request *req = NULL; - DECLARE_EXT4_COMPLETION_RESULT(ecr); - struct scatterlist dst, src; - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - struct crypto_skcipher *tfm = ci->ci_ctfm; - int res = 0; - - req = skcipher_request_alloc(tfm, gfp_flags); - if (!req) { - printk_ratelimited(KERN_ERR - "%s: crypto_request_alloc() failed\n", - __func__); - return -ENOMEM; - } - skcipher_request_set_callback( - req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - ext4_crypt_complete, &ecr); - - BUILD_BUG_ON(EXT4_XTS_TWEAK_SIZE < sizeof(index)); - memcpy(xts_tweak, &index, sizeof(index)); - memset(&xts_tweak[sizeof(index)], 0, - EXT4_XTS_TWEAK_SIZE - sizeof(index)); - - sg_init_table(&dst, 1); - sg_set_page(&dst, dest_page, PAGE_SIZE, 0); - sg_init_table(&src, 1); - sg_set_page(&src, src_page, PAGE_SIZE, 0); - skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, - xts_tweak); - if (rw == EXT4_DECRYPT) - res = crypto_skcipher_decrypt(req); - else - res = crypto_skcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - wait_for_completion(&ecr.completion); - res = ecr.res; - } - skcipher_request_free(req); - if (res) { - printk_ratelimited( - KERN_ERR - "%s: crypto_skcipher_encrypt() returned %d\n", - __func__, res); - return res; - } - return 0; -} - -static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx, - gfp_t gfp_flags) -{ - ctx->w.bounce_page = mempool_alloc(ext4_bounce_page_pool, gfp_flags); - if (ctx->w.bounce_page == NULL) - return ERR_PTR(-ENOMEM); - ctx->flags |= EXT4_WRITE_PATH_FL; - return ctx->w.bounce_page; -} - -/** - * ext4_encrypt() - Encrypts a page - * @inode: The inode for which the encryption should take place - * @plaintext_page: The page to encrypt. Must be locked. - * - * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx - * encryption context. - * - * Called on the page write path. The caller must call - * ext4_restore_control_page() on the returned ciphertext page to - * release the bounce buffer and the encryption context. - * - * Return: An allocated page with the encrypted content on success. Else, an - * error value or NULL. - */ -struct page *ext4_encrypt(struct inode *inode, - struct page *plaintext_page, - gfp_t gfp_flags) -{ - struct ext4_crypto_ctx *ctx; - struct page *ciphertext_page = NULL; - int err; - - BUG_ON(!PageLocked(plaintext_page)); - - ctx = ext4_get_crypto_ctx(inode, gfp_flags); - if (IS_ERR(ctx)) - return (struct page *) ctx; - - /* The encryption operation will require a bounce page. */ - ciphertext_page = alloc_bounce_page(ctx, gfp_flags); - if (IS_ERR(ciphertext_page)) - goto errout; - ctx->w.control_page = plaintext_page; - err = ext4_page_crypto(inode, EXT4_ENCRYPT, plaintext_page->index, - plaintext_page, ciphertext_page, gfp_flags); - if (err) { - ciphertext_page = ERR_PTR(err); - errout: - ext4_release_crypto_ctx(ctx); - return ciphertext_page; - } - SetPagePrivate(ciphertext_page); - set_page_private(ciphertext_page, (unsigned long)ctx); - lock_page(ciphertext_page); - return ciphertext_page; -} - -/** - * ext4_decrypt() - Decrypts a page in-place - * @ctx: The encryption context. - * @page: The page to decrypt. Must be locked. - * - * Decrypts page in-place using the ctx encryption context. - * - * Called from the read completion callback. - * - * Return: Zero on success, non-zero otherwise. - */ -int ext4_decrypt(struct page *page) -{ - BUG_ON(!PageLocked(page)); - - return ext4_page_crypto(page->mapping->host, EXT4_DECRYPT, - page->index, page, page, GFP_NOFS); -} - -int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, - ext4_fsblk_t pblk, ext4_lblk_t len) -{ - struct ext4_crypto_ctx *ctx; - struct page *ciphertext_page = NULL; - struct bio *bio; - int ret, err = 0; - -#if 0 - ext4_msg(inode->i_sb, KERN_CRIT, - "ext4_encrypted_zeroout ino %lu lblk %u len %u", - (unsigned long) inode->i_ino, lblk, len); -#endif - - BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); - - ctx = ext4_get_crypto_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT); - if (IS_ERR(ciphertext_page)) { - err = PTR_ERR(ciphertext_page); - goto errout; - } - - while (len--) { - err = ext4_page_crypto(inode, EXT4_ENCRYPT, lblk, - ZERO_PAGE(0), ciphertext_page, - GFP_NOFS); - if (err) - goto errout; - - bio = bio_alloc(GFP_NOWAIT, 1); - if (!bio) { - err = -ENOMEM; - goto errout; - } - bio->bi_bdev = inode->i_sb->s_bdev; - bio->bi_iter.bi_sector = - pblk << (inode->i_sb->s_blocksize_bits - 9); - ret = bio_add_page(bio, ciphertext_page, - inode->i_sb->s_blocksize, 0); - if (ret != inode->i_sb->s_blocksize) { - /* should never happen! */ - ext4_msg(inode->i_sb, KERN_ERR, - "bio_add_page failed: %d", ret); - WARN_ON(1); - bio_put(bio); - err = -EIO; - goto errout; - } - err = submit_bio_wait(WRITE, bio); - if ((err == 0) && bio->bi_error) - err = -EIO; - bio_put(bio); - if (err) - goto errout; - lblk++; pblk++; - } - err = 0; -errout: - ext4_release_crypto_ctx(ctx); - return err; -} - -bool ext4_valid_contents_enc_mode(uint32_t mode) -{ - return (mode == EXT4_ENCRYPTION_MODE_AES_256_XTS); -} - -/** - * ext4_validate_encryption_key_size() - Validate the encryption key size - * @mode: The key mode. - * @size: The key size to validate. - * - * Return: The validated key size for @mode. Zero if invalid. - */ -uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size) -{ - if (size == ext4_encryption_key_size(mode)) - return size; - return 0; -} - -/* - * Validate dentries for encrypted directories to make sure we aren't - * potentially caching stale data after a key has been added or - * removed. - */ -static int ext4_d_revalidate(struct dentry *dentry, unsigned int flags) -{ - struct dentry *dir; - struct ext4_crypt_info *ci; - int dir_has_key, cached_with_key; - - if (flags & LOOKUP_RCU) - return -ECHILD; - - dir = dget_parent(dentry); - if (!ext4_encrypted_inode(d_inode(dir))) { - dput(dir); - return 0; - } - ci = EXT4_I(d_inode(dir))->i_crypt_info; - if (ci && ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD)))) - ci = NULL; - - /* this should eventually be an flag in d_flags */ - cached_with_key = dentry->d_fsdata != NULL; - dir_has_key = (ci != NULL); - dput(dir); - - /* - * If the dentry was cached without the key, and it is a - * negative dentry, it might be a valid name. We can't check - * if the key has since been made available due to locking - * reasons, so we fail the validation so ext4_lookup() can do - * this check. - * - * We also fail the validation if the dentry was created with - * the key present, but we no longer have the key, or vice versa. - */ - if ((!cached_with_key && d_is_negative(dentry)) || - (!cached_with_key && dir_has_key) || - (cached_with_key && !dir_has_key)) { -#if 0 /* Revalidation debug */ - char buf[80]; - char *cp = simple_dname(dentry, buf, sizeof(buf)); - - if (IS_ERR(cp)) - cp = (char *) "???"; - pr_err("revalidate: %s %p %d %d %d\n", cp, dentry->d_fsdata, - cached_with_key, d_is_negative(dentry), - dir_has_key); -#endif - return 0; - } - return 1; -} - -const struct dentry_operations ext4_encrypted_d_ops = { - .d_revalidate = ext4_d_revalidate, -}; diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c deleted file mode 100644 index 1a2f360405db..000000000000 --- a/fs/ext4/crypto_fname.c +++ /dev/null @@ -1,468 +0,0 @@ -/* - * linux/fs/ext4/crypto_fname.c - * - * Copyright (C) 2015, Google, Inc. - * - * This contains functions for filename crypto management in ext4 - * - * Written by Uday Savagaonkar, 2014. - * - * This has not yet undergone a rigorous security audit. - * - */ - -#include <crypto/skcipher.h> -#include <keys/encrypted-type.h> -#include <keys/user-type.h> -#include <linux/gfp.h> -#include <linux/kernel.h> -#include <linux/key.h> -#include <linux/list.h> -#include <linux/mempool.h> -#include <linux/random.h> -#include <linux/scatterlist.h> -#include <linux/spinlock_types.h> - -#include "ext4.h" -#include "ext4_crypto.h" -#include "xattr.h" - -/** - * ext4_dir_crypt_complete() - - */ -static void ext4_dir_crypt_complete(struct crypto_async_request *req, int res) -{ - struct ext4_completion_result *ecr = req->data; - - if (res == -EINPROGRESS) - return; - ecr->res = res; - complete(&ecr->completion); -} - -bool ext4_valid_filenames_enc_mode(uint32_t mode) -{ - return (mode == EXT4_ENCRYPTION_MODE_AES_256_CTS); -} - -static unsigned max_name_len(struct inode *inode) -{ - return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize : - EXT4_NAME_LEN; -} - -/** - * ext4_fname_encrypt() - - * - * This function encrypts the input filename, and returns the length of the - * ciphertext. Errors are returned as negative numbers. We trust the caller to - * allocate sufficient memory to oname string. - */ -static int ext4_fname_encrypt(struct inode *inode, - const struct qstr *iname, - struct ext4_str *oname) -{ - u32 ciphertext_len; - struct skcipher_request *req = NULL; - DECLARE_EXT4_COMPLETION_RESULT(ecr); - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - struct crypto_skcipher *tfm = ci->ci_ctfm; - int res = 0; - char iv[EXT4_CRYPTO_BLOCK_SIZE]; - struct scatterlist src_sg, dst_sg; - int padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK); - char *workbuf, buf[32], *alloc_buf = NULL; - unsigned lim = max_name_len(inode); - - if (iname->len <= 0 || iname->len > lim) - return -EIO; - - ciphertext_len = (iname->len < EXT4_CRYPTO_BLOCK_SIZE) ? - EXT4_CRYPTO_BLOCK_SIZE : iname->len; - ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding); - ciphertext_len = (ciphertext_len > lim) - ? lim : ciphertext_len; - - if (ciphertext_len <= sizeof(buf)) { - workbuf = buf; - } else { - alloc_buf = kmalloc(ciphertext_len, GFP_NOFS); - if (!alloc_buf) - return -ENOMEM; - workbuf = alloc_buf; - } - - /* Allocate request */ - req = skcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - printk_ratelimited( - KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); - kfree(alloc_buf); - return -ENOMEM; - } - skcipher_request_set_callback(req, - CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - ext4_dir_crypt_complete, &ecr); - - /* Copy the input */ - memcpy(workbuf, iname->name, iname->len); - if (iname->len < ciphertext_len) - memset(workbuf + iname->len, 0, ciphertext_len - iname->len); - - /* Initialize IV */ - memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE); - - /* Create encryption request */ - sg_init_one(&src_sg, workbuf, ciphertext_len); - sg_init_one(&dst_sg, oname->name, ciphertext_len); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); - res = crypto_skcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - wait_for_completion(&ecr.completion); - res = ecr.res; - } - kfree(alloc_buf); - skcipher_request_free(req); - if (res < 0) { - printk_ratelimited( - KERN_ERR "%s: Error (error code %d)\n", __func__, res); - } - oname->len = ciphertext_len; - return res; -} - -/* - * ext4_fname_decrypt() - * This function decrypts the input filename, and returns - * the length of the plaintext. - * Errors are returned as negative numbers. - * We trust the caller to allocate sufficient memory to oname string. - */ -static int ext4_fname_decrypt(struct inode *inode, - const struct ext4_str *iname, - struct ext4_str *oname) -{ - struct ext4_str tmp_in[2], tmp_out[1]; - struct skcipher_request *req = NULL; - DECLARE_EXT4_COMPLETION_RESULT(ecr); - struct scatterlist src_sg, dst_sg; - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - struct crypto_skcipher *tfm = ci->ci_ctfm; - int res = 0; - char iv[EXT4_CRYPTO_BLOCK_SIZE]; - unsigned lim = max_name_len(inode); - - if (iname->len <= 0 || iname->len > lim) - return -EIO; - - tmp_in[0].name = iname->name; - tmp_in[0].len = iname->len; - tmp_out[0].name = oname->name; - - /* Allocate request */ - req = skcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - printk_ratelimited( - KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); - return -ENOMEM; - } - skcipher_request_set_callback(req, - CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - ext4_dir_crypt_complete, &ecr); - - /* Initialize IV */ - memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE); - - /* Create encryption request */ - sg_init_one(&src_sg, iname->name, iname->len); - sg_init_one(&dst_sg, oname->name, oname->len); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); - res = crypto_skcipher_decrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - wait_for_completion(&ecr.completion); - res = ecr.res; - } - skcipher_request_free(req); - if (res < 0) { - printk_ratelimited( - KERN_ERR "%s: Error in ext4_fname_encrypt (error code %d)\n", - __func__, res); - return res; - } - - oname->len = strnlen(oname->name, iname->len); - return oname->len; -} - -static const char *lookup_table = - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; - -/** - * ext4_fname_encode_digest() - - * - * Encodes the input digest using characters from the set [a-zA-Z0-9_+]. - * The encoded string is roughly 4/3 times the size of the input string. - */ -static int digest_encode(const char *src, int len, char *dst) -{ - int i = 0, bits = 0, ac = 0; - char *cp = dst; - - while (i < len) { - ac += (((unsigned char) src[i]) << bits); - bits += 8; - do { - *cp++ = lookup_table[ac & 0x3f]; - ac >>= 6; - bits -= 6; - } while (bits >= 6); - i++; - } - if (bits) - *cp++ = lookup_table[ac & 0x3f]; - return cp - dst; -} - -static int digest_decode(const char *src, int len, char *dst) -{ - int i = 0, bits = 0, ac = 0; - const char *p; - char *cp = dst; - - while (i < len) { - p = strchr(lookup_table, src[i]); - if (p == NULL || src[i] == 0) - return -2; - ac += (p - lookup_table) << bits; - bits += 6; - if (bits >= 8) { - *cp++ = ac & 0xff; - ac >>= 8; - bits -= 8; - } - i++; - } - if (ac) - return -1; - return cp - dst; -} - -/** - * ext4_fname_crypto_round_up() - - * - * Return: The next multiple of block size - */ -u32 ext4_fname_crypto_round_up(u32 size, u32 blksize) -{ - return ((size+blksize-1)/blksize)*blksize; -} - -unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen) -{ - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - int padding = 32; - - if (ci) - padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK); - if (ilen < EXT4_CRYPTO_BLOCK_SIZE) - ilen = EXT4_CRYPTO_BLOCK_SIZE; - return ext4_fname_crypto_round_up(ilen, padding); -} - -/* - * ext4_fname_crypto_alloc_buffer() - - * - * Allocates an output buffer that is sufficient for the crypto operation - * specified by the context and the direction. - */ -int ext4_fname_crypto_alloc_buffer(struct inode *inode, - u32 ilen, struct ext4_str *crypto_str) -{ - unsigned int olen = ext4_fname_encrypted_size(inode, ilen); - - crypto_str->len = olen; - if (olen < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) - olen = EXT4_FNAME_CRYPTO_DIGEST_SIZE*2; - /* Allocated buffer can hold one more character to null-terminate the - * string */ - crypto_str->name = kmalloc(olen+1, GFP_NOFS); - if (!(crypto_str->name)) - return -ENOMEM; - return 0; -} - -/** - * ext4_fname_crypto_free_buffer() - - * - * Frees the buffer allocated for crypto operation. - */ -void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str) -{ - if (!crypto_str) - return; - kfree(crypto_str->name); - crypto_str->name = NULL; -} - -/** - * ext4_fname_disk_to_usr() - converts a filename from disk space to user space - */ -int _ext4_fname_disk_to_usr(struct inode *inode, - struct dx_hash_info *hinfo, - const struct ext4_str *iname, - struct ext4_str *oname) -{ - char buf[24]; - int ret; - - if (iname->len < 3) { - /*Check for . and .. */ - if (iname->name[0] == '.' && iname->name[iname->len-1] == '.') { - oname->name[0] = '.'; - oname->name[iname->len-1] = '.'; - oname->len = iname->len; - return oname->len; - } - } - if (iname->len < EXT4_CRYPTO_BLOCK_SIZE) { - EXT4_ERROR_INODE(inode, "encrypted inode too small"); - return -EUCLEAN; - } - if (EXT4_I(inode)->i_crypt_info) - return ext4_fname_decrypt(inode, iname, oname); - - if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) { - ret = digest_encode(iname->name, iname->len, oname->name); - oname->len = ret; - return ret; - } - if (hinfo) { - memcpy(buf, &hinfo->hash, 4); - memcpy(buf+4, &hinfo->minor_hash, 4); - } else - memset(buf, 0, 8); - memcpy(buf + 8, iname->name + iname->len - 16, 16); - oname->name[0] = '_'; - ret = digest_encode(buf, 24, oname->name+1); - oname->len = ret + 1; - return ret + 1; -} - -int ext4_fname_disk_to_usr(struct inode *inode, - struct dx_hash_info *hinfo, - const struct ext4_dir_entry_2 *de, - struct ext4_str *oname) -{ - struct ext4_str iname = {.name = (unsigned char *) de->name, - .len = de->name_len }; - - return _ext4_fname_disk_to_usr(inode, hinfo, &iname, oname); -} - - -/** - * ext4_fname_usr_to_disk() - converts a filename from user space to disk space - */ -int ext4_fname_usr_to_disk(struct inode *inode, - const struct qstr *iname, - struct ext4_str *oname) -{ - int res; - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - - if (iname->len < 3) { - /*Check for . and .. */ - if (iname->name[0] == '.' && - iname->name[iname->len-1] == '.') { - oname->name[0] = '.'; - oname->name[iname->len-1] = '.'; - oname->len = iname->len; - return oname->len; - } - } - if (ci) { - res = ext4_fname_encrypt(inode, iname, oname); - return res; - } - /* Without a proper key, a user is not allowed to modify the filenames - * in a directory. Consequently, a user space name cannot be mapped to - * a disk-space name */ - return -EACCES; -} - -int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, - int lookup, struct ext4_filename *fname) -{ - struct ext4_crypt_info *ci; - int ret = 0, bigname = 0; - - memset(fname, 0, sizeof(struct ext4_filename)); - fname->usr_fname = iname; - - if (!ext4_encrypted_inode(dir) || - ((iname->name[0] == '.') && - ((iname->len == 1) || - ((iname->name[1] == '.') && (iname->len == 2))))) { - fname->disk_name.name = (unsigned char *) iname->name; - fname->disk_name.len = iname->len; - return 0; - } - ret = ext4_get_encryption_info(dir); - if (ret) - return ret; - ci = EXT4_I(dir)->i_crypt_info; - if (ci) { - ret = ext4_fname_crypto_alloc_buffer(dir, iname->len, - &fname->crypto_buf); - if (ret < 0) - return ret; - ret = ext4_fname_encrypt(dir, iname, &fname->crypto_buf); - if (ret < 0) - goto errout; - fname->disk_name.name = fname->crypto_buf.name; - fname->disk_name.len = fname->crypto_buf.len; - return 0; - } - if (!lookup) - return -EACCES; - - /* We don't have the key and we are doing a lookup; decode the - * user-supplied name - */ - if (iname->name[0] == '_') - bigname = 1; - if ((bigname && (iname->len != 33)) || - (!bigname && (iname->len > 43))) - return -ENOENT; - - fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); - if (fname->crypto_buf.name == NULL) - return -ENOMEM; - ret = digest_decode(iname->name + bigname, iname->len - bigname, - fname->crypto_buf.name); - if (ret < 0) { - ret = -ENOENT; - goto errout; - } - fname->crypto_buf.len = ret; - if (bigname) { - memcpy(&fname->hinfo.hash, fname->crypto_buf.name, 4); - memcpy(&fname->hinfo.minor_hash, fname->crypto_buf.name + 4, 4); - } else { - fname->disk_name.name = fname->crypto_buf.name; - fname->disk_name.len = fname->crypto_buf.len; - } - return 0; -errout: - kfree(fname->crypto_buf.name); - fname->crypto_buf.name = NULL; - return ret; -} - -void ext4_fname_free_filename(struct ext4_filename *fname) -{ - kfree(fname->crypto_buf.name); - fname->crypto_buf.name = NULL; - fname->usr_fname = NULL; - fname->disk_name.name = NULL; -} diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c deleted file mode 100644 index 0129d688d1f7..000000000000 --- a/fs/ext4/crypto_key.c +++ /dev/null @@ -1,274 +0,0 @@ -/* - * linux/fs/ext4/crypto_key.c - * - * Copyright (C) 2015, Google, Inc. - * - * This contains encryption key functions for ext4 - * - * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. - */ - -#include <crypto/skcipher.h> -#include <keys/encrypted-type.h> -#include <keys/user-type.h> -#include <linux/random.h> -#include <linux/scatterlist.h> -#include <uapi/linux/keyctl.h> - -#include "ext4.h" -#include "xattr.h" - -static void derive_crypt_complete(struct crypto_async_request *req, int rc) -{ - struct ext4_completion_result *ecr = req->data; - - if (rc == -EINPROGRESS) - return; - - ecr->res = rc; - complete(&ecr->completion); -} - -/** - * ext4_derive_key_aes() - Derive a key using AES-128-ECB - * @deriving_key: Encryption key used for derivation. - * @source_key: Source key to which to apply derivation. - * @derived_key: Derived key. - * - * Return: Zero on success; non-zero otherwise. - */ -static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE], - char source_key[EXT4_AES_256_XTS_KEY_SIZE], - char derived_key[EXT4_AES_256_XTS_KEY_SIZE]) -{ - int res = 0; - struct skcipher_request *req = NULL; - DECLARE_EXT4_COMPLETION_RESULT(ecr); - struct scatterlist src_sg, dst_sg; - struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); - - if (IS_ERR(tfm)) { - res = PTR_ERR(tfm); - tfm = NULL; - goto out; - } - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); - req = skcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - res = -ENOMEM; - goto out; - } - skcipher_request_set_callback(req, - CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - derive_crypt_complete, &ecr); - res = crypto_skcipher_setkey(tfm, deriving_key, - EXT4_AES_128_ECB_KEY_SIZE); - if (res < 0) - goto out; - sg_init_one(&src_sg, source_key, EXT4_AES_256_XTS_KEY_SIZE); - sg_init_one(&dst_sg, derived_key, EXT4_AES_256_XTS_KEY_SIZE); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, - EXT4_AES_256_XTS_KEY_SIZE, NULL); - res = crypto_skcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - wait_for_completion(&ecr.completion); - res = ecr.res; - } - -out: - skcipher_request_free(req); - crypto_free_skcipher(tfm); - return res; -} - -void ext4_free_crypt_info(struct ext4_crypt_info *ci) -{ - if (!ci) - return; - - if (ci->ci_keyring_key) - key_put(ci->ci_keyring_key); - crypto_free_skcipher(ci->ci_ctfm); - kmem_cache_free(ext4_crypt_info_cachep, ci); -} - -void ext4_free_encryption_info(struct inode *inode, - struct ext4_crypt_info *ci) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_crypt_info *prev; - - if (ci == NULL) - ci = ACCESS_ONCE(ei->i_crypt_info); - if (ci == NULL) - return; - prev = cmpxchg(&ei->i_crypt_info, ci, NULL); - if (prev != ci) - return; - - ext4_free_crypt_info(ci); -} - -int _ext4_get_encryption_info(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_crypt_info *crypt_info; - char full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE + - (EXT4_KEY_DESCRIPTOR_SIZE * 2) + 1]; - struct key *keyring_key = NULL; - struct ext4_encryption_key *master_key; - struct ext4_encryption_context ctx; - const struct user_key_payload *ukp; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct crypto_skcipher *ctfm; - const char *cipher_str; - char raw_key[EXT4_MAX_KEY_SIZE]; - char mode; - int res; - - if (!ext4_read_workqueue) { - res = ext4_init_crypto(); - if (res) - return res; - } - -retry: - crypt_info = ACCESS_ONCE(ei->i_crypt_info); - if (crypt_info) { - if (!crypt_info->ci_keyring_key || - key_validate(crypt_info->ci_keyring_key) == 0) - return 0; - ext4_free_encryption_info(inode, crypt_info); - goto retry; - } - - res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - &ctx, sizeof(ctx)); - if (res < 0) { - if (!DUMMY_ENCRYPTION_ENABLED(sbi)) - return res; - ctx.contents_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS; - ctx.filenames_encryption_mode = - EXT4_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; - } else if (res != sizeof(ctx)) - return -EINVAL; - res = 0; - - crypt_info = kmem_cache_alloc(ext4_crypt_info_cachep, GFP_KERNEL); - if (!crypt_info) - return -ENOMEM; - - crypt_info->ci_flags = ctx.flags; - crypt_info->ci_data_mode = ctx.contents_encryption_mode; - crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; - crypt_info->ci_ctfm = NULL; - crypt_info->ci_keyring_key = NULL; - memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, - sizeof(crypt_info->ci_master_key)); - if (S_ISREG(inode->i_mode)) - mode = crypt_info->ci_data_mode; - else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - mode = crypt_info->ci_filename_mode; - else - BUG(); - switch (mode) { - case EXT4_ENCRYPTION_MODE_AES_256_XTS: - cipher_str = "xts(aes)"; - break; - case EXT4_ENCRYPTION_MODE_AES_256_CTS: - cipher_str = "cts(cbc(aes))"; - break; - default: - printk_once(KERN_WARNING - "ext4: unsupported key mode %d (ino %u)\n", - mode, (unsigned) inode->i_ino); - res = -ENOKEY; - goto out; - } - if (DUMMY_ENCRYPTION_ENABLED(sbi)) { - memset(raw_key, 0x42, EXT4_AES_256_XTS_KEY_SIZE); - goto got_key; - } - memcpy(full_key_descriptor, EXT4_KEY_DESC_PREFIX, - EXT4_KEY_DESC_PREFIX_SIZE); - sprintf(full_key_descriptor + EXT4_KEY_DESC_PREFIX_SIZE, - "%*phN", EXT4_KEY_DESCRIPTOR_SIZE, - ctx.master_key_descriptor); - full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE + - (2 * EXT4_KEY_DESCRIPTOR_SIZE)] = '\0'; - keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); - if (IS_ERR(keyring_key)) { - res = PTR_ERR(keyring_key); - keyring_key = NULL; - goto out; - } - crypt_info->ci_keyring_key = keyring_key; - if (keyring_key->type != &key_type_logon) { - printk_once(KERN_WARNING - "ext4: key type must be logon\n"); - res = -ENOKEY; - goto out; - } - down_read(&keyring_key->sem); - ukp = user_key_payload(keyring_key); - if (ukp->datalen != sizeof(struct ext4_encryption_key)) { - res = -EINVAL; - up_read(&keyring_key->sem); - goto out; - } - master_key = (struct ext4_encryption_key *)ukp->data; - BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE != - EXT4_KEY_DERIVATION_NONCE_SIZE); - if (master_key->size != EXT4_AES_256_XTS_KEY_SIZE) { - printk_once(KERN_WARNING - "ext4: key size incorrect: %d\n", - master_key->size); - res = -ENOKEY; - up_read(&keyring_key->sem); - goto out; - } - res = ext4_derive_key_aes(ctx.nonce, master_key->raw, - raw_key); - up_read(&keyring_key->sem); - if (res) - goto out; -got_key: - ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); - if (!ctfm || IS_ERR(ctfm)) { - res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; - printk(KERN_DEBUG - "%s: error %d (inode %u) allocating crypto tfm\n", - __func__, res, (unsigned) inode->i_ino); - goto out; - } - crypt_info->ci_ctfm = ctfm; - crypto_skcipher_clear_flags(ctfm, ~0); - crypto_tfm_set_flags(crypto_skcipher_tfm(ctfm), - CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_skcipher_setkey(ctfm, raw_key, - ext4_encryption_key_size(mode)); - if (res) - goto out; - memzero_explicit(raw_key, sizeof(raw_key)); - if (cmpxchg(&ei->i_crypt_info, NULL, crypt_info) != NULL) { - ext4_free_crypt_info(crypt_info); - goto retry; - } - return 0; - -out: - if (res == -ENOKEY) - res = 0; - ext4_free_crypt_info(crypt_info); - memzero_explicit(raw_key, sizeof(raw_key)); - return res; -} - -int ext4_has_encryption_key(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - - return (ei->i_crypt_info != NULL); -} diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c deleted file mode 100644 index ad050698143f..000000000000 --- a/fs/ext4/crypto_policy.c +++ /dev/null @@ -1,229 +0,0 @@ -/* - * linux/fs/ext4/crypto_policy.c - * - * Copyright (C) 2015, Google, Inc. - * - * This contains encryption policy functions for ext4 - * - * Written by Michael Halcrow, 2015. - */ - -#include <linux/random.h> -#include <linux/string.h> -#include <linux/types.h> - -#include "ext4_jbd2.h" -#include "ext4.h" -#include "xattr.h" - -static int ext4_inode_has_encryption_context(struct inode *inode) -{ - int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0); - return (res > 0); -} - -/* - * check whether the policy is consistent with the encryption context - * for the inode - */ -static int ext4_is_encryption_context_consistent_with_policy( - struct inode *inode, const struct ext4_encryption_policy *policy) -{ - struct ext4_encryption_context ctx; - int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, - sizeof(ctx)); - if (res != sizeof(ctx)) - return 0; - return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, - EXT4_KEY_DESCRIPTOR_SIZE) == 0 && - (ctx.flags == - policy->flags) && - (ctx.contents_encryption_mode == - policy->contents_encryption_mode) && - (ctx.filenames_encryption_mode == - policy->filenames_encryption_mode)); -} - -static int ext4_create_encryption_context_from_policy( - struct inode *inode, const struct ext4_encryption_policy *policy) -{ - struct ext4_encryption_context ctx; - handle_t *handle; - int res, res2; - - res = ext4_convert_inline_data(inode); - if (res) - return res; - - ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; - memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, - EXT4_KEY_DESCRIPTOR_SIZE); - if (!ext4_valid_contents_enc_mode(policy->contents_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid contents encryption mode %d\n", __func__, - policy->contents_encryption_mode); - return -EINVAL; - } - if (!ext4_valid_filenames_enc_mode(policy->filenames_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid filenames encryption mode %d\n", __func__, - policy->filenames_encryption_mode); - return -EINVAL; - } - if (policy->flags & ~EXT4_POLICY_FLAGS_VALID) - return -EINVAL; - ctx.contents_encryption_mode = policy->contents_encryption_mode; - ctx.filenames_encryption_mode = policy->filenames_encryption_mode; - ctx.flags = policy->flags; - BUILD_BUG_ON(sizeof(ctx.nonce) != EXT4_KEY_DERIVATION_NONCE_SIZE); - get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); - - handle = ext4_journal_start(inode, EXT4_HT_MISC, - ext4_jbd2_credits_xattr(inode)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, - sizeof(ctx), 0); - if (!res) { - ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); - res = ext4_mark_inode_dirty(handle, inode); - if (res) - EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); - } - res2 = ext4_journal_stop(handle); - if (!res) - res = res2; - return res; -} - -int ext4_process_policy(const struct ext4_encryption_policy *policy, - struct inode *inode) -{ - if (policy->version != 0) - return -EINVAL; - - if (!ext4_inode_has_encryption_context(inode)) { - if (!S_ISDIR(inode->i_mode)) - return -EINVAL; - if (!ext4_empty_dir(inode)) - return -ENOTEMPTY; - return ext4_create_encryption_context_from_policy(inode, - policy); - } - - if (ext4_is_encryption_context_consistent_with_policy(inode, policy)) - return 0; - - printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", - __func__); - return -EINVAL; -} - -int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy) -{ - struct ext4_encryption_context ctx; - - int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - &ctx, sizeof(ctx)); - if (res != sizeof(ctx)) - return -ENOENT; - if (ctx.format != EXT4_ENCRYPTION_CONTEXT_FORMAT_V1) - return -EINVAL; - policy->version = 0; - policy->contents_encryption_mode = ctx.contents_encryption_mode; - policy->filenames_encryption_mode = ctx.filenames_encryption_mode; - policy->flags = ctx.flags; - memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, - EXT4_KEY_DESCRIPTOR_SIZE); - return 0; -} - -int ext4_is_child_context_consistent_with_parent(struct inode *parent, - struct inode *child) -{ - struct ext4_crypt_info *parent_ci, *child_ci; - int res; - - if ((parent == NULL) || (child == NULL)) { - pr_err("parent %p child %p\n", parent, child); - WARN_ON(1); /* Should never happen */ - return 0; - } - /* no restrictions if the parent directory is not encrypted */ - if (!ext4_encrypted_inode(parent)) - return 1; - /* if the child directory is not encrypted, this is always a problem */ - if (!ext4_encrypted_inode(child)) - return 0; - res = ext4_get_encryption_info(parent); - if (res) - return 0; - res = ext4_get_encryption_info(child); - if (res) - return 0; - parent_ci = EXT4_I(parent)->i_crypt_info; - child_ci = EXT4_I(child)->i_crypt_info; - if (!parent_ci && !child_ci) - return 1; - if (!parent_ci || !child_ci) - return 0; - - return (memcmp(parent_ci->ci_master_key, - child_ci->ci_master_key, - EXT4_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ci->ci_data_mode == child_ci->ci_data_mode) && - (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && - (parent_ci->ci_flags == child_ci->ci_flags)); -} - -/** - * ext4_inherit_context() - Sets a child context from its parent - * @parent: Parent inode from which the context is inherited. - * @child: Child inode that inherits the context from @parent. - * - * Return: Zero on success, non-zero otherwise - */ -int ext4_inherit_context(struct inode *parent, struct inode *child) -{ - struct ext4_encryption_context ctx; - struct ext4_crypt_info *ci; - int res; - - res = ext4_get_encryption_info(parent); - if (res < 0) - return res; - ci = EXT4_I(parent)->i_crypt_info; - if (ci == NULL) - return -ENOKEY; - - ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; - if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) { - ctx.contents_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS; - ctx.filenames_encryption_mode = - EXT4_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; - memset(ctx.master_key_descriptor, 0x42, - EXT4_KEY_DESCRIPTOR_SIZE); - res = 0; - } else { - ctx.contents_encryption_mode = ci->ci_data_mode; - ctx.filenames_encryption_mode = ci->ci_filename_mode; - ctx.flags = ci->ci_flags; - memcpy(ctx.master_key_descriptor, ci->ci_master_key, - EXT4_KEY_DESCRIPTOR_SIZE); - } - get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); - res = ext4_xattr_set(child, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, - sizeof(ctx), 0); - if (!res) { - ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT); - ext4_clear_inode_state(child, EXT4_STATE_MAY_INLINE_DATA); - res = ext4_get_encryption_info(child); - } - return res; -} diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 68323e3da3fa..67415e0e6af0 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -109,10 +109,10 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) struct super_block *sb = inode->i_sb; struct buffer_head *bh = NULL; int dir_has_error = 0; - struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; + struct fscrypt_str fstr = FSTR_INIT(NULL, 0); if (ext4_encrypted_inode(inode)) { - err = ext4_get_encryption_info(inode); + err = fscrypt_get_encryption_info(inode); if (err && err != -ENOKEY) return err; } @@ -139,8 +139,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) } if (ext4_encrypted_inode(inode)) { - err = ext4_fname_crypto_alloc_buffer(inode, EXT4_NAME_LEN, - &fname_crypto_str); + err = fscrypt_fname_alloc_buffer(inode, EXT4_NAME_LEN, &fstr); if (err < 0) return err; } @@ -253,16 +252,19 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) get_dtype(sb, de->file_type))) goto done; } else { - int save_len = fname_crypto_str.len; + int save_len = fstr.len; + struct fscrypt_str de_name = + FSTR_INIT(de->name, + de->name_len); /* Directory is encrypted */ - err = ext4_fname_disk_to_usr(inode, - NULL, de, &fname_crypto_str); - fname_crypto_str.len = save_len; + err = fscrypt_fname_disk_to_usr(inode, + 0, 0, &de_name, &fstr); + fstr.len = save_len; if (err < 0) goto errout; if (!dir_emit(ctx, - fname_crypto_str.name, err, + fstr.name, err, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto done; @@ -281,7 +283,7 @@ done: err = 0; errout: #ifdef CONFIG_EXT4_FS_ENCRYPTION - ext4_fname_crypto_free_buffer(&fname_crypto_str); + fscrypt_fname_free_buffer(&fstr); #endif brelse(bh); return err; @@ -432,7 +434,7 @@ void ext4_htree_free_dir_info(struct dir_private_info *p) int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent, - struct ext4_str *ent_name) + struct fscrypt_str *ent_name) { struct rb_node **p, *parent = NULL; struct fname *fname, *new_fn; @@ -609,7 +611,7 @@ finished: static int ext4_dir_open(struct inode * inode, struct file * filp) { if (ext4_encrypted_inode(inode)) - return ext4_get_encryption_info(inode) ? -EACCES : 0; + return fscrypt_get_encryption_info(inode) ? -EACCES : 0; return 0; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b84aa1ca480a..ea31931386ec 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -32,6 +32,7 @@ #include <linux/percpu_counter.h> #include <linux/ratelimit.h> #include <crypto/hash.h> +#include <linux/fscrypto.h> #include <linux/falloc.h> #include <linux/percpu-rwsem.h> #ifdef __KERNEL__ @@ -608,15 +609,6 @@ enum { #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 -/* Encryption algorithms */ -#define EXT4_ENCRYPTION_MODE_INVALID 0 -#define EXT4_ENCRYPTION_MODE_AES_256_XTS 1 -#define EXT4_ENCRYPTION_MODE_AES_256_GCM 2 -#define EXT4_ENCRYPTION_MODE_AES_256_CBC 3 -#define EXT4_ENCRYPTION_MODE_AES_256_CTS 4 - -#include "ext4_crypto.h" - /* * ioctl commands */ @@ -638,9 +630,9 @@ enum { #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) #define EXT4_IOC_SWAP_BOOT _IO('f', 17) #define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) -#define EXT4_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct ext4_encryption_policy) -#define EXT4_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) -#define EXT4_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct ext4_encryption_policy) +#define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY +#define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT +#define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY #ifndef FS_IOC_FSGETXATTR /* Until the uapi changes get merged for project quota... */ @@ -1082,10 +1074,6 @@ struct ext4_inode_info { /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ __u32 i_csum_seed; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - /* Encryption params */ - struct ext4_crypt_info *i_crypt_info; -#endif kprojid_t i_projid; }; @@ -1344,6 +1332,11 @@ struct ext4_super_block { /* Number of quota types we support */ #define EXT4_MAXQUOTAS 3 +#ifdef CONFIG_EXT4_FS_ENCRYPTION +#define EXT4_KEY_DESC_PREFIX "ext4:" +#define EXT4_KEY_DESC_PREFIX_SIZE 5 +#endif + /* * fourth extended-fs super-block data in memory */ @@ -1430,6 +1423,7 @@ struct ext4_sb_info { unsigned short *s_mb_offsets; unsigned int *s_mb_maxs; unsigned int s_group_info_size; + unsigned int s_mb_free_pending; /* tunables */ unsigned long s_stripe; @@ -1512,6 +1506,12 @@ struct ext4_sb_info { /* Barrier between changing inodes' journal flags and writepages ops. */ struct percpu_rw_semaphore s_journal_flag_rwsem; + + /* Encryption support */ +#ifdef CONFIG_EXT4_FS_ENCRYPTION + u8 key_prefix[EXT4_KEY_DESC_PREFIX_SIZE]; + u8 key_prefix_size; +#endif }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1610,15 +1610,6 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) /* * Returns true if the inode is inode is encrypted */ -static inline int ext4_encrypted_inode(struct inode *inode) -{ -#ifdef CONFIG_EXT4_FS_ENCRYPTION - return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT); -#else - return 0; -#endif -} - #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime /* @@ -2082,10 +2073,10 @@ struct dx_hash_info struct ext4_filename { const struct qstr *usr_fname; - struct ext4_str disk_name; + struct fscrypt_str disk_name; struct dx_hash_info hinfo; #ifdef CONFIG_EXT4_FS_ENCRYPTION - struct ext4_str crypto_buf; + struct fscrypt_str crypto_buf; #endif }; @@ -2296,132 +2287,82 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb, struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); -/* crypto_policy.c */ -int ext4_is_child_context_consistent_with_parent(struct inode *parent, - struct inode *child); -int ext4_inherit_context(struct inode *parent, struct inode *child); -void ext4_to_hex(char *dst, char *src, size_t src_size); -int ext4_process_policy(const struct ext4_encryption_policy *policy, - struct inode *inode); -int ext4_get_policy(struct inode *inode, - struct ext4_encryption_policy *policy); - -/* crypto.c */ -extern struct kmem_cache *ext4_crypt_info_cachep; -bool ext4_valid_contents_enc_mode(uint32_t mode); -uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size); -extern struct workqueue_struct *ext4_read_workqueue; -struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode, - gfp_t gfp_flags); -void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx); -void ext4_restore_control_page(struct page *data_page); -struct page *ext4_encrypt(struct inode *inode, - struct page *plaintext_page, - gfp_t gfp_flags); -int ext4_decrypt(struct page *page); -int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, - ext4_fsblk_t pblk, ext4_lblk_t len); -extern const struct dentry_operations ext4_encrypted_d_ops; - -#ifdef CONFIG_EXT4_FS_ENCRYPTION -int ext4_init_crypto(void); -void ext4_exit_crypto(void); static inline int ext4_sb_has_crypto(struct super_block *sb) { return ext4_has_feature_encrypt(sb); } -#else -static inline int ext4_init_crypto(void) { return 0; } -static inline void ext4_exit_crypto(void) { } -static inline int ext4_sb_has_crypto(struct super_block *sb) + +static inline bool ext4_encrypted_inode(struct inode *inode) { - return 0; + return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT); } -#endif -/* crypto_fname.c */ -bool ext4_valid_filenames_enc_mode(uint32_t mode); -u32 ext4_fname_crypto_round_up(u32 size, u32 blksize); -unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen); -int ext4_fname_crypto_alloc_buffer(struct inode *inode, - u32 ilen, struct ext4_str *crypto_str); -int _ext4_fname_disk_to_usr(struct inode *inode, - struct dx_hash_info *hinfo, - const struct ext4_str *iname, - struct ext4_str *oname); -int ext4_fname_disk_to_usr(struct inode *inode, - struct dx_hash_info *hinfo, - const struct ext4_dir_entry_2 *de, - struct ext4_str *oname); -int ext4_fname_usr_to_disk(struct inode *inode, - const struct qstr *iname, - struct ext4_str *oname); #ifdef CONFIG_EXT4_FS_ENCRYPTION -void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str); -int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, - int lookup, struct ext4_filename *fname); -void ext4_fname_free_filename(struct ext4_filename *fname); -#else -static inline -int ext4_setup_fname_crypto(struct inode *inode) -{ - return 0; -} -static inline void ext4_fname_crypto_free_buffer(struct ext4_str *p) { } static inline int ext4_fname_setup_filename(struct inode *dir, - const struct qstr *iname, - int lookup, struct ext4_filename *fname) + const struct qstr *iname, + int lookup, struct ext4_filename *fname) { - fname->usr_fname = iname; - fname->disk_name.name = (unsigned char *) iname->name; - fname->disk_name.len = iname->len; - return 0; -} -static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } -#endif - + struct fscrypt_name name; + int err; -/* crypto_key.c */ -void ext4_free_crypt_info(struct ext4_crypt_info *ci); -void ext4_free_encryption_info(struct inode *inode, struct ext4_crypt_info *ci); -int _ext4_get_encryption_info(struct inode *inode); + memset(fname, 0, sizeof(struct ext4_filename)); -#ifdef CONFIG_EXT4_FS_ENCRYPTION -int ext4_has_encryption_key(struct inode *inode); + err = fscrypt_setup_filename(dir, iname, lookup, &name); -static inline int ext4_get_encryption_info(struct inode *inode) -{ - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - - if (!ci || - (ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD))))) - return _ext4_get_encryption_info(inode); - return 0; + fname->usr_fname = name.usr_fname; + fname->disk_name = name.disk_name; + fname->hinfo.hash = name.hash; + fname->hinfo.minor_hash = name.minor_hash; + fname->crypto_buf = name.crypto_buf; + return err; } -static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode) +static inline void ext4_fname_free_filename(struct ext4_filename *fname) { - return EXT4_I(inode)->i_crypt_info; -} + struct fscrypt_name name; -#else -static inline int ext4_has_encryption_key(struct inode *inode) -{ - return 0; + name.crypto_buf = fname->crypto_buf; + fscrypt_free_filename(&name); + + fname->crypto_buf.name = NULL; + fname->usr_fname = NULL; + fname->disk_name.name = NULL; } -static inline int ext4_get_encryption_info(struct inode *inode) +#else +static inline int ext4_fname_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, struct ext4_filename *fname) { + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *) iname->name; + fname->disk_name.len = iname->len; return 0; } -static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode) -{ - return NULL; -} -#endif +static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } +#define fscrypt_set_d_op(i) +#define fscrypt_get_ctx fscrypt_notsupp_get_ctx +#define fscrypt_release_ctx fscrypt_notsupp_release_ctx +#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page +#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page +#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages +#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page +#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page +#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range +#define fscrypt_process_policy fscrypt_notsupp_process_policy +#define fscrypt_get_policy fscrypt_notsupp_get_policy +#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context +#define fscrypt_inherit_context fscrypt_notsupp_inherit_context +#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info +#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info +#define fscrypt_setup_filename fscrypt_notsupp_setup_filename +#define fscrypt_free_filename fscrypt_notsupp_free_filename +#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size +#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer +#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer +#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr +#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk +#endif /* dir.c */ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, @@ -2435,7 +2376,7 @@ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent, - struct ext4_str *ent_name); + struct fscrypt_str *ent_name); extern void ext4_htree_free_dir_info(struct dir_private_info *p); extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, struct buffer_head *bh, @@ -2623,7 +2564,7 @@ extern int ext4_generic_delete_entry(handle_t *handle, void *entry_buf, int buf_size, int csum_size); -extern int ext4_empty_dir(struct inode *inode); +extern bool ext4_empty_dir(struct inode *inode); /* resize.c */ extern int ext4_group_add(struct super_block *sb, @@ -3105,7 +3046,7 @@ extern int ext4_delete_inline_entry(handle_t *handle, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, int *has_inline_data); -extern int empty_inline_dir(struct inode *dir, int *has_inline_data); +extern bool empty_inline_dir(struct inode *dir, int *has_inline_data); extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, struct ext4_dir_entry_2 **parent_de, int *retval); diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h deleted file mode 100644 index 1f73c29717e1..000000000000 --- a/fs/ext4/ext4_crypto.h +++ /dev/null @@ -1,159 +0,0 @@ -/* - * linux/fs/ext4/ext4_crypto.h - * - * Copyright (C) 2015, Google, Inc. - * - * This contains encryption header content for ext4 - * - * Written by Michael Halcrow, 2015. - */ - -#ifndef _EXT4_CRYPTO_H -#define _EXT4_CRYPTO_H - -#include <linux/fs.h> - -#define EXT4_KEY_DESCRIPTOR_SIZE 8 - -/* Policy provided via an ioctl on the topmost directory */ -struct ext4_encryption_policy { - char version; - char contents_encryption_mode; - char filenames_encryption_mode; - char flags; - char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE]; -} __attribute__((__packed__)); - -#define EXT4_ENCRYPTION_CONTEXT_FORMAT_V1 1 -#define EXT4_KEY_DERIVATION_NONCE_SIZE 16 - -#define EXT4_POLICY_FLAGS_PAD_4 0x00 -#define EXT4_POLICY_FLAGS_PAD_8 0x01 -#define EXT4_POLICY_FLAGS_PAD_16 0x02 -#define EXT4_POLICY_FLAGS_PAD_32 0x03 -#define EXT4_POLICY_FLAGS_PAD_MASK 0x03 -#define EXT4_POLICY_FLAGS_VALID 0x03 - -/** - * Encryption context for inode - * - * Protector format: - * 1 byte: Protector format (1 = this version) - * 1 byte: File contents encryption mode - * 1 byte: File names encryption mode - * 1 byte: Reserved - * 8 bytes: Master Key descriptor - * 16 bytes: Encryption Key derivation nonce - */ -struct ext4_encryption_context { - char format; - char contents_encryption_mode; - char filenames_encryption_mode; - char flags; - char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE]; - char nonce[EXT4_KEY_DERIVATION_NONCE_SIZE]; -} __attribute__((__packed__)); - -/* Encryption parameters */ -#define EXT4_XTS_TWEAK_SIZE 16 -#define EXT4_AES_128_ECB_KEY_SIZE 16 -#define EXT4_AES_256_GCM_KEY_SIZE 32 -#define EXT4_AES_256_CBC_KEY_SIZE 32 -#define EXT4_AES_256_CTS_KEY_SIZE 32 -#define EXT4_AES_256_XTS_KEY_SIZE 64 -#define EXT4_MAX_KEY_SIZE 64 - -#define EXT4_KEY_DESC_PREFIX "ext4:" -#define EXT4_KEY_DESC_PREFIX_SIZE 5 - -/* This is passed in from userspace into the kernel keyring */ -struct ext4_encryption_key { - __u32 mode; - char raw[EXT4_MAX_KEY_SIZE]; - __u32 size; -} __attribute__((__packed__)); - -struct ext4_crypt_info { - char ci_data_mode; - char ci_filename_mode; - char ci_flags; - struct crypto_skcipher *ci_ctfm; - struct key *ci_keyring_key; - char ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE]; -}; - -#define EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 -#define EXT4_WRITE_PATH_FL 0x00000002 - -struct ext4_crypto_ctx { - union { - struct { - struct page *bounce_page; /* Ciphertext page */ - struct page *control_page; /* Original page */ - } w; - struct { - struct bio *bio; - struct work_struct work; - } r; - struct list_head free_list; /* Free list */ - }; - char flags; /* Flags */ - char mode; /* Encryption mode for tfm */ -}; - -struct ext4_completion_result { - struct completion completion; - int res; -}; - -#define DECLARE_EXT4_COMPLETION_RESULT(ecr) \ - struct ext4_completion_result ecr = { \ - COMPLETION_INITIALIZER((ecr).completion), 0 } - -static inline int ext4_encryption_key_size(int mode) -{ - switch (mode) { - case EXT4_ENCRYPTION_MODE_AES_256_XTS: - return EXT4_AES_256_XTS_KEY_SIZE; - case EXT4_ENCRYPTION_MODE_AES_256_GCM: - return EXT4_AES_256_GCM_KEY_SIZE; - case EXT4_ENCRYPTION_MODE_AES_256_CBC: - return EXT4_AES_256_CBC_KEY_SIZE; - case EXT4_ENCRYPTION_MODE_AES_256_CTS: - return EXT4_AES_256_CTS_KEY_SIZE; - default: - BUG(); - } - return 0; -} - -#define EXT4_FNAME_NUM_SCATTER_ENTRIES 4 -#define EXT4_CRYPTO_BLOCK_SIZE 16 -#define EXT4_FNAME_CRYPTO_DIGEST_SIZE 32 - -struct ext4_str { - unsigned char *name; - u32 len; -}; - -/** - * For encrypted symlinks, the ciphertext length is stored at the beginning - * of the string in little-endian format. - */ -struct ext4_encrypted_symlink_data { - __le16 len; - char encrypted_path[1]; -} __attribute__((__packed__)); - -/** - * This function is used to calculate the disk space required to - * store a filename of length l in encrypted symlink format. - */ -static inline u32 encrypted_symlink_data_len(u32 l) -{ - if (l < EXT4_CRYPTO_BLOCK_SIZE) - l = EXT4_CRYPTO_BLOCK_SIZE; - return (l + sizeof(struct ext4_encrypted_symlink_data) - 1); -} - -#endif /* _EXT4_CRYPTO_H */ diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 09c1ef38cbe6..b1d52c14098e 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -175,6 +175,13 @@ struct ext4_journal_cb_entry { * There is no guaranteed calling order of multiple registered callbacks on * the same transaction. */ +static inline void _ext4_journal_callback_add(handle_t *handle, + struct ext4_journal_cb_entry *jce) +{ + /* Add the jce to transaction's private list */ + list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); +} + static inline void ext4_journal_callback_add(handle_t *handle, void (*func)(struct super_block *sb, struct ext4_journal_cb_entry *jce, @@ -187,10 +194,11 @@ static inline void ext4_journal_callback_add(handle_t *handle, /* Add the jce to transaction's private list */ jce->jce_func = func; spin_lock(&sbi->s_md_lock); - list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); + _ext4_journal_callback_add(handle, jce); spin_unlock(&sbi->s_md_lock); } + /** * ext4_journal_callback_del: delete a registered callback * @handle: active journal transaction handle on which callback was registered diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2a2eef9c14e4..d7ccb7f51dfc 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -381,9 +381,13 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); - ext4_lblk_t last = lblock + len - 1; - if (len == 0 || lblock > last) + /* + * We allow neither: + * - zero length + * - overflow/wrap-around + */ + if (lblock + len <= lblock) return 0; return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } @@ -474,6 +478,10 @@ static int __ext4_ext_check(const char *function, unsigned int line, error_msg = "invalid extent entries"; goto corrupted; } + if (unlikely(depth > 32)) { + error_msg = "too large eh_depth"; + goto corrupted; + } /* Verify checksum on non-root extent tree nodes */ if (ext_depth(inode) != depth && !ext4_extent_block_csum_verify(inode, eh)) { diff --git a/fs/ext4/file.c b/fs/ext4/file.c index df44c877892a..261ac3734c58 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -202,7 +202,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else - result = __dax_fault(vma, vmf, ext4_dax_get_block); + result = dax_fault(vma, vmf, ext4_dax_get_block); if (write) { if (!IS_ERR(handle)) @@ -237,7 +237,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else - result = __dax_pmd_fault(vma, addr, pmd, flags, + result = dax_pmd_fault(vma, addr, pmd, flags, ext4_dax_get_block); if (write) { @@ -303,10 +303,10 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) struct inode *inode = file->f_mapping->host; if (ext4_encrypted_inode(inode)) { - int err = ext4_get_encryption_info(inode); + int err = fscrypt_get_encryption_info(inode); if (err) return 0; - if (ext4_encryption_info(inode) == NULL) + if (!fscrypt_has_encryption_key(inode)) return -ENOKEY; } file_accessed(file); @@ -362,16 +362,16 @@ static int ext4_file_open(struct inode * inode, struct file * filp) } } if (ext4_encrypted_inode(inode)) { - ret = ext4_get_encryption_info(inode); + ret = fscrypt_get_encryption_info(inode); if (ret) return -EACCES; - if (ext4_encryption_info(inode) == NULL) + if (!fscrypt_has_encryption_key(inode)) return -ENOKEY; } dir = dget_parent(file_dentry(filp)); if (ext4_encrypted_inode(d_inode(dir)) && - !ext4_is_child_context_consistent_with_parent(d_inode(dir), inode)) { + !fscrypt_has_permitted_context(d_inode(dir), inode)) { ext4_warning(inode->i_sb, "Inconsistent encryption contexts: %lu/%lu", (unsigned long) d_inode(dir)->i_ino, diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 8850254136ae..5c4372512ef7 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -106,9 +106,11 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } if (!journal) { - ret = generic_file_fsync(file, start, end, datasync); + ret = __generic_file_fsync(file, start, end, datasync); if (!ret && !hlist_empty(&inode->i_dentry)) ret = ext4_sync_parent(inode); + if (test_opt(inode->i_sb, BARRIER)) + goto issue_flush; goto out; } @@ -140,6 +142,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) needs_barrier = true; ret = jbd2_complete_transaction(journal, commit_tid); if (needs_barrier) { + issue_flush: err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); if (!ret) ret = err; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 3da4cf8d18b6..9e66cd1d7b78 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -214,7 +214,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) trace_ext4_load_inode_bitmap(sb, block_group); bh->b_end_io = ext4_end_bitmap_read; get_bh(bh); - submit_bh(READ | REQ_META | REQ_PRIO, bh); + submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { put_bh(bh); @@ -767,10 +767,10 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, if ((ext4_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { - err = ext4_get_encryption_info(dir); + err = fscrypt_get_encryption_info(dir); if (err) return ERR_PTR(err); - if (ext4_encryption_info(dir) == NULL) + if (!fscrypt_has_encryption_key(dir)) return ERR_PTR(-EPERM); if (!handle) nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb); @@ -1115,7 +1115,8 @@ got: } if (encrypt) { - err = ext4_inherit_context(dir, inode); + /* give pointer to avoid set_context with journal ops. */ + err = fscrypt_inherit_context(dir, inode, &encrypt, true); if (err) goto fail_free_drop; } diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index ff7538c26992..f74d5ee2cdec 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1326,7 +1326,7 @@ int htree_inlinedir_to_tree(struct file *dir_file, struct ext4_iloc iloc; void *dir_buf = NULL; struct ext4_dir_entry_2 fake; - struct ext4_str tmp_str; + struct fscrypt_str tmp_str; ret = ext4_get_inode_loc(inode, &iloc); if (ret) @@ -1739,20 +1739,20 @@ ext4_get_inline_entry(struct inode *inode, return (struct ext4_dir_entry_2 *)(inline_pos + offset); } -int empty_inline_dir(struct inode *dir, int *has_inline_data) +bool empty_inline_dir(struct inode *dir, int *has_inline_data) { int err, inline_size; struct ext4_iloc iloc; void *inline_pos; unsigned int offset; struct ext4_dir_entry_2 *de; - int ret = 1; + bool ret = true; err = ext4_get_inode_loc(dir, &iloc); if (err) { EXT4_ERROR_INODE(dir, "error %d getting inode %lu block", err, dir->i_ino); - return 1; + return true; } down_read(&EXT4_I(dir)->xattr_sem); @@ -1766,7 +1766,7 @@ int empty_inline_dir(struct inode *dir, int *has_inline_data) ext4_warning(dir->i_sb, "bad inline directory (dir #%lu) - no `..'", dir->i_ino); - ret = 1; + ret = true; goto out; } @@ -1784,11 +1784,11 @@ int empty_inline_dir(struct inode *dir, int *has_inline_data) dir->i_ino, le32_to_cpu(de->inode), le16_to_cpu(de->rec_len), de->name_len, inline_size); - ret = 1; + ret = true; goto out; } if (le32_to_cpu(de->inode)) { - ret = 0; + ret = false; goto out; } offset += ext4_rec_len_from_disk(de->rec_len, inline_size); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f7140ca66e3b..3131747199e1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -51,25 +51,31 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - __u16 csum_lo; - __u16 csum_hi = 0; __u32 csum; + __u16 dummy_csum = 0; + int offset = offsetof(struct ext4_inode, i_checksum_lo); + unsigned int csum_size = sizeof(dummy_csum); - csum_lo = le16_to_cpu(raw->i_checksum_lo); - raw->i_checksum_lo = 0; - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && - EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { - csum_hi = le16_to_cpu(raw->i_checksum_hi); - raw->i_checksum_hi = 0; - } - - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, - EXT4_INODE_SIZE(inode->i_sb)); + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset); + csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); + offset += csum_size; + csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + EXT4_GOOD_OLD_INODE_SIZE - offset); - raw->i_checksum_lo = cpu_to_le16(csum_lo); - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && - EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) - raw->i_checksum_hi = cpu_to_le16(csum_hi); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + offset = offsetof(struct ext4_inode, i_checksum_hi); + csum = ext4_chksum(sbi, csum, (__u8 *)raw + + EXT4_GOOD_OLD_INODE_SIZE, + offset - EXT4_GOOD_OLD_INODE_SIZE); + if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { + csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, + csum_size); + offset += csum_size; + csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + EXT4_INODE_SIZE(inode->i_sb) - + offset); + } + } return csum; } @@ -205,9 +211,9 @@ void ext4_evict_inode(struct inode *inode) * Note that directories do not have this problem because they * don't use page cache. */ - if (ext4_should_journal_data(inode) && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && - inode->i_ino != EXT4_JOURNAL_INO) { + if (inode->i_ino != EXT4_JOURNAL_INO && + ext4_should_journal_data(inode) && + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; @@ -386,7 +392,7 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, int ret; if (ext4_encrypted_inode(inode)) - return ext4_encrypted_zeroout(inode, lblk, pblk, len); + return fscrypt_zeroout_range(inode, lblk, pblk, len); ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); if (ret > 0) @@ -981,7 +987,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, return bh; if (!bh || buffer_uptodate(bh)) return bh; - ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); + ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; @@ -1135,7 +1141,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { - ll_rw_block(READ, 1, &bh); + ll_rw_block(REQ_OP_READ, 0, 1, &bh); *wait_bh++ = bh; decrypt = ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode); @@ -1152,7 +1158,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (unlikely(err)) page_zero_new_buffers(page, from, to); else if (decrypt) - err = ext4_decrypt(page); + err = fscrypt_decrypt_page(page); return err; } #endif @@ -2748,13 +2754,36 @@ retry: done = true; } } - ext4_journal_stop(handle); + /* + * Caution: If the handle is synchronous, + * ext4_journal_stop() can wait for transaction commit + * to finish which may depend on writeback of pages to + * complete or on page lock to be released. In that + * case, we have to wait until after after we have + * submitted all the IO, released page locks we hold, + * and dropped io_end reference (for extent conversion + * to be able to complete) before stopping the handle. + */ + if (!ext4_handle_valid(handle) || handle->h_sync == 0) { + ext4_journal_stop(handle); + handle = NULL; + } /* Submit prepared bio */ ext4_io_submit(&mpd.io_submit); /* Unlock pages we didn't use */ mpage_release_unused_pages(&mpd, give_up_on_write); - /* Drop our io_end reference we got from init */ - ext4_put_io_end(mpd.io_submit.io_end); + /* + * Drop our io_end reference we got from init. We have + * to be careful and use deferred io_end finishing if + * we are still holding the transaction as we can + * release the last reference to io_end which may end + * up doing unwritten extent conversion. + */ + if (handle) { + ext4_put_io_end_defer(mpd.io_submit.io_end); + ext4_journal_stop(handle); + } else + ext4_put_io_end(mpd.io_submit.io_end); if (ret == -ENOSPC && sbi->s_journal) { /* @@ -3698,7 +3727,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, if (!buffer_uptodate(bh)) { err = -EIO; - ll_rw_block(READ, 1, &bh); + ll_rw_block(REQ_OP_READ, 0, 1, &bh); wait_on_buffer(bh); /* Uhhuh. Read error. Complain and punt. */ if (!buffer_uptodate(bh)) @@ -3706,9 +3735,9 @@ static int __ext4_block_zero_page_range(handle_t *handle, if (S_ISREG(inode->i_mode) && ext4_encrypted_inode(inode)) { /* We expect the key to be set. */ - BUG_ON(!ext4_has_encryption_key(inode)); + BUG_ON(!fscrypt_has_encryption_key(inode)); BUG_ON(blocksize != PAGE_SIZE); - WARN_ON_ONCE(ext4_decrypt(page)); + WARN_ON_ONCE(fscrypt_decrypt_page(page)); } } if (ext4_should_journal_data(inode)) { @@ -4281,7 +4310,7 @@ make_io: trace_ext4_load_inode(inode); get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(READ | REQ_META | REQ_PRIO, bh); + submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { EXT4_ERROR_INODE_BLOCK(inode, block, diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 28cc412852af..10686fd67fb4 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -308,6 +308,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) kprojid_t kprojid; struct ext4_iloc iloc; struct ext4_inode *raw_inode; + struct dquot *transfer_to[MAXQUOTAS] = { }; if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT)) { @@ -361,17 +362,14 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) if (err) goto out_stop; - if (sb_has_quota_limits_enabled(sb, PRJQUOTA)) { - struct dquot *transfer_to[MAXQUOTAS] = { }; - - transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); - if (!IS_ERR(transfer_to[PRJQUOTA])) { - err = __dquot_transfer(inode, transfer_to); - dqput(transfer_to[PRJQUOTA]); - if (err) - goto out_dirty; - } + transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); + if (!IS_ERR(transfer_to[PRJQUOTA])) { + err = __dquot_transfer(inode, transfer_to); + dqput(transfer_to[PRJQUOTA]); + if (err) + goto out_dirty; } + EXT4_I(inode)->i_projid = kprojid; inode->i_ctime = ext4_current_time(inode); out_dirty: @@ -772,19 +770,13 @@ resizefs_out: return ext4_ext_precache(inode); case EXT4_IOC_SET_ENCRYPTION_POLICY: { #ifdef CONFIG_EXT4_FS_ENCRYPTION - struct ext4_encryption_policy policy; - int err = 0; + struct fscrypt_policy policy; if (copy_from_user(&policy, - (struct ext4_encryption_policy __user *)arg, - sizeof(policy))) { - err = -EFAULT; - goto encryption_policy_out; - } - - err = ext4_process_policy(&policy, inode); -encryption_policy_out: - return err; + (struct fscrypt_policy __user *)arg, + sizeof(policy))) + return -EFAULT; + return fscrypt_process_policy(inode, &policy); #else return -EOPNOTSUPP; #endif @@ -827,12 +819,12 @@ encryption_policy_out: } case EXT4_IOC_GET_ENCRYPTION_POLICY: { #ifdef CONFIG_EXT4_FS_ENCRYPTION - struct ext4_encryption_policy policy; + struct fscrypt_policy policy; int err = 0; if (!ext4_encrypted_inode(inode)) return -ENOENT; - err = ext4_get_policy(inode, &policy); + err = fscrypt_get_policy(inode, &policy); if (err) return err; if (copy_to_user((void __user *)arg, &policy, sizeof(policy))) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c1ab3ec30423..11562161e24a 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2627,6 +2627,7 @@ int ext4_mb_init(struct super_block *sb) spin_lock_init(&sbi->s_md_lock); spin_lock_init(&sbi->s_bal_lock); + sbi->s_mb_free_pending = 0; sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; @@ -2814,6 +2815,9 @@ static void ext4_free_data_callback(struct super_block *sb, /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); + spin_lock(&EXT4_SB(sb)->s_md_lock); + EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count; + spin_unlock(&EXT4_SB(sb)->s_md_lock); db = e4b.bd_info; /* there are blocks to put in buddy to make them really free */ @@ -2939,7 +2943,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata", block, block+len); /* File system mounted not to panic on error - * Fix the bitmap and repeat the block allocation + * Fix the bitmap and return EFSCORRUPTED * We leak some of the blocks here. */ ext4_lock_group(sb, ac->ac_b_ex.fe_group); @@ -2948,7 +2952,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ext4_unlock_group(sb, ac->ac_b_ex.fe_group); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!err) - err = -EAGAIN; + err = -EFSCORRUPTED; goto out_err; } @@ -4513,18 +4517,7 @@ repeat: } if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); - if (*errp == -EAGAIN) { - /* - * drop the reference that we took - * in ext4_mb_use_best_found - */ - ext4_mb_release_context(ac); - ac->ac_b_ex.fe_group = 0; - ac->ac_b_ex.fe_start = 0; - ac->ac_b_ex.fe_len = 0; - ac->ac_status = AC_STATUS_CONTINUE; - goto repeat; - } else if (*errp) { + if (*errp) { ext4_discard_allocated_blocks(ac); goto errout; } else { @@ -4583,6 +4576,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, { ext4_group_t group = e4b->bd_group; ext4_grpblk_t cluster; + ext4_grpblk_t clusters = new_entry->efd_count; struct ext4_free_data *entry; struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; @@ -4649,8 +4643,11 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, } } /* Add the extent to transaction's private list */ - ext4_journal_callback_add(handle, ext4_free_data_callback, - &new_entry->efd_jce); + new_entry->efd_jce.jce_func = ext4_free_data_callback; + spin_lock(&sbi->s_md_lock); + _ext4_journal_callback_add(handle, &new_entry->efd_jce); + sbi->s_mb_free_pending += clusters; + spin_unlock(&sbi->s_md_lock); return 0; } diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 23d436d6f8b8..d89754ef1aab 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -52,7 +52,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); + submit_bh(REQ_OP_WRITE, WRITE_SYNC | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); sb_end_write(sb); if (unlikely(!buffer_uptodate(bh))) @@ -88,7 +88,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, get_bh(*bh); lock_buffer(*bh); (*bh)->b_end_io = end_buffer_read_sync; - submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); + submit_bh(REQ_OP_READ, READ_SYNC | REQ_META | REQ_PRIO, *bh); wait_on_buffer(*bh); if (!buffer_uptodate(*bh)) { ret = -EIO; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index ec4c39952e84..34c0142caf6a 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -420,15 +420,14 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; - __le32 save_csum; int size; + __u32 dummy_csum = 0; + int offset = offsetof(struct dx_tail, dt_checksum); size = count_offset + (count * sizeof(struct dx_entry)); - save_csum = t->dt_checksum; - t->dt_checksum = 0; csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); - csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail)); - t->dt_checksum = save_csum; + csum = ext4_chksum(sbi, csum, (__u8 *)t, offset); + csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); return cpu_to_le32(csum); } @@ -446,14 +445,14 @@ static int ext4_dx_csum_verify(struct inode *inode, c = get_dx_countlimit(inode, dirent, &count_offset); if (!c) { EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D."); - return 1; + return 0; } limit = le16_to_cpu(c->limit); count = le16_to_cpu(c->count); if (count_offset + (limit * sizeof(struct dx_entry)) > EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { warn_no_space_for_csum(inode); - return 1; + return 0; } t = (struct dx_tail *)(((struct dx_entry *)c) + limit); @@ -612,19 +611,19 @@ static struct stats dx_show_leaf(struct inode *dir, #ifdef CONFIG_EXT4_FS_ENCRYPTION int len; char *name; - struct ext4_str fname_crypto_str - = {.name = NULL, .len = 0}; + struct fscrypt_str fname_crypto_str = + FSTR_INIT(NULL, 0); int res = 0; name = de->name; len = de->name_len; - if (ext4_encrypted_inode(inode)) - res = ext4_get_encryption_info(dir); + if (ext4_encrypted_inode(dir)) + res = fscrypt_get_encryption_info(dir); if (res) { printk(KERN_WARNING "Error setting up" " fname crypto: %d\n", res); } - if (ctx == NULL) { + if (!fscrypt_has_encryption_key(dir)) { /* Directory is not encrypted */ ext4fs_dirhash(de->name, de->name_len, &h); @@ -633,19 +632,21 @@ static struct stats dx_show_leaf(struct inode *dir, (unsigned) ((char *) de - base)); } else { + struct fscrypt_str de_name = + FSTR_INIT(name, len); + /* Directory is encrypted */ - res = ext4_fname_crypto_alloc_buffer( - ctx, de->name_len, + res = fscrypt_fname_alloc_buffer( + dir, len, &fname_crypto_str); - if (res < 0) { + if (res < 0) printk(KERN_WARNING "Error " "allocating crypto " "buffer--skipping " "crypto\n"); - ctx = NULL; - } - res = ext4_fname_disk_to_usr(ctx, NULL, de, - &fname_crypto_str); + res = fscrypt_fname_disk_to_usr(dir, + 0, 0, &de_name, + &fname_crypto_str); if (res < 0) { printk(KERN_WARNING "Error " "converting filename " @@ -662,8 +663,8 @@ static struct stats dx_show_leaf(struct inode *dir, printk("%*.s:(E)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); - ext4_fname_crypto_free_buffer( - &fname_crypto_str); + fscrypt_fname_free_buffer( + &fname_crypto_str); } #else int len = de->name_len; @@ -952,7 +953,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, struct buffer_head *bh; struct ext4_dir_entry_2 *de, *top; int err = 0, count = 0; - struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}, tmp_str; + struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str; dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); @@ -967,12 +968,12 @@ static int htree_dirblock_to_tree(struct file *dir_file, #ifdef CONFIG_EXT4_FS_ENCRYPTION /* Check if the directory is encrypted */ if (ext4_encrypted_inode(dir)) { - err = ext4_get_encryption_info(dir); + err = fscrypt_get_encryption_info(dir); if (err < 0) { brelse(bh); return err; } - err = ext4_fname_crypto_alloc_buffer(dir, EXT4_NAME_LEN, + err = fscrypt_fname_alloc_buffer(dir, EXT4_NAME_LEN, &fname_crypto_str); if (err < 0) { brelse(bh); @@ -1003,10 +1004,13 @@ static int htree_dirblock_to_tree(struct file *dir_file, &tmp_str); } else { int save_len = fname_crypto_str.len; + struct fscrypt_str de_name = FSTR_INIT(de->name, + de->name_len); /* Directory is encrypted */ - err = ext4_fname_disk_to_usr(dir, hinfo, de, - &fname_crypto_str); + err = fscrypt_fname_disk_to_usr(dir, hinfo->hash, + hinfo->minor_hash, &de_name, + &fname_crypto_str); if (err < 0) { count = err; goto errout; @@ -1025,7 +1029,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, errout: brelse(bh); #ifdef CONFIG_EXT4_FS_ENCRYPTION - ext4_fname_crypto_free_buffer(&fname_crypto_str); + fscrypt_fname_free_buffer(&fname_crypto_str); #endif return count; } @@ -1050,7 +1054,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, int count = 0; int ret, err; __u32 hashval; - struct ext4_str tmp_str; + struct fscrypt_str tmp_str; dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", start_hash, start_minor_hash)); @@ -1443,7 +1447,8 @@ restart: } bh_use[ra_max] = bh; if (bh) - ll_rw_block(READ | REQ_META | REQ_PRIO, + ll_rw_block(REQ_OP_READ, + REQ_META | REQ_PRIO, 1, &bh); } } @@ -1563,26 +1568,23 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi struct ext4_dir_entry_2 *de; struct buffer_head *bh; - if (ext4_encrypted_inode(dir)) { - int res = ext4_get_encryption_info(dir); + if (ext4_encrypted_inode(dir)) { + int res = fscrypt_get_encryption_info(dir); /* - * This should be a properly defined flag for - * dentry->d_flags when we uplift this to the VFS. - * d_fsdata is set to (void *) 1 if if the dentry is + * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is * created while the directory was encrypted and we - * don't have access to the key. + * have access to the key. */ - dentry->d_fsdata = NULL; - if (ext4_encryption_info(dir)) - dentry->d_fsdata = (void *) 1; - d_set_d_op(dentry, &ext4_encrypted_d_ops); - if (res && res != -ENOKEY) - return ERR_PTR(res); - } + if (fscrypt_has_encryption_key(dir)) + fscrypt_set_encrypted_dentry(dentry); + fscrypt_set_d_op(dentry); + if (res && res != -ENOKEY) + return ERR_PTR(res); + } - if (dentry->d_name.len > EXT4_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); + if (dentry->d_name.len > EXT4_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (IS_ERR(bh)) @@ -1609,11 +1611,9 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi } if (!IS_ERR(inode) && ext4_encrypted_inode(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && - !ext4_is_child_context_consistent_with_parent(dir, - inode)) { + !fscrypt_has_permitted_context(dir, inode)) { int nokey = ext4_encrypted_inode(inode) && - !ext4_encryption_info(inode); - + !fscrypt_has_encryption_key(inode); iput(inode); if (nokey) return ERR_PTR(-ENOKEY); @@ -2690,30 +2690,30 @@ out_stop: /* * routine to check that the specified directory is empty (for rmdir) */ -int ext4_empty_dir(struct inode *inode) +bool ext4_empty_dir(struct inode *inode) { unsigned int offset; struct buffer_head *bh; struct ext4_dir_entry_2 *de, *de1; struct super_block *sb; - int err = 0; if (ext4_has_inline_data(inode)) { int has_inline_data = 1; + int ret; - err = empty_inline_dir(inode, &has_inline_data); + ret = empty_inline_dir(inode, &has_inline_data); if (has_inline_data) - return err; + return ret; } sb = inode->i_sb; if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) { EXT4_ERROR_INODE(inode, "invalid size"); - return 1; + return true; } bh = ext4_read_dirblock(inode, 0, EITHER); if (IS_ERR(bh)) - return 1; + return true; de = (struct ext4_dir_entry_2 *) bh->b_data; de1 = ext4_next_entry(de, sb->s_blocksize); @@ -2722,7 +2722,7 @@ int ext4_empty_dir(struct inode *inode) strcmp(".", de->name) || strcmp("..", de1->name)) { ext4_warning_inode(inode, "directory missing '.' and/or '..'"); brelse(bh); - return 1; + return true; } offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) + ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize); @@ -2730,12 +2730,11 @@ int ext4_empty_dir(struct inode *inode) while (offset < inode->i_size) { if ((void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { unsigned int lblock; - err = 0; brelse(bh); lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb); bh = ext4_read_dirblock(inode, lblock, EITHER); if (IS_ERR(bh)) - return 1; + return true; de = (struct ext4_dir_entry_2 *) bh->b_data; } if (ext4_check_dir_entry(inode, NULL, de, bh, @@ -2747,13 +2746,13 @@ int ext4_empty_dir(struct inode *inode) } if (le32_to_cpu(de->inode)) { brelse(bh); - return 0; + return false; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); de = ext4_next_entry(de, sb->s_blocksize); } brelse(bh); - return 1; + return true; } /* @@ -3076,8 +3075,8 @@ static int ext4_symlink(struct inode *dir, int err, len = strlen(symname); int credits; bool encryption_required; - struct ext4_str disk_link; - struct ext4_encrypted_symlink_data *sd = NULL; + struct fscrypt_str disk_link; + struct fscrypt_symlink_data *sd = NULL; disk_link.len = len + 1; disk_link.name = (char *) symname; @@ -3085,13 +3084,13 @@ static int ext4_symlink(struct inode *dir, encryption_required = (ext4_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))); if (encryption_required) { - err = ext4_get_encryption_info(dir); + err = fscrypt_get_encryption_info(dir); if (err) return err; - if (ext4_encryption_info(dir) == NULL) + if (!fscrypt_has_encryption_key(dir)) return -EPERM; - disk_link.len = (ext4_fname_encrypted_size(dir, len) + - sizeof(struct ext4_encrypted_symlink_data)); + disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + + sizeof(struct fscrypt_symlink_data)); sd = kzalloc(disk_link.len, GFP_KERNEL); if (!sd) return -ENOMEM; @@ -3139,13 +3138,12 @@ static int ext4_symlink(struct inode *dir, if (encryption_required) { struct qstr istr; - struct ext4_str ostr; + struct fscrypt_str ostr = + FSTR_INIT(sd->encrypted_path, disk_link.len); istr.name = (const unsigned char *) symname; istr.len = len; - ostr.name = sd->encrypted_path; - ostr.len = disk_link.len; - err = ext4_fname_usr_to_disk(inode, &istr, &ostr); + err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); if (err < 0) goto err_drop_inode; sd->len = cpu_to_le16(ostr.len); @@ -3234,7 +3232,7 @@ static int ext4_link(struct dentry *old_dentry, if (inode->i_nlink >= EXT4_LINK_MAX) return -EMLINK; if (ext4_encrypted_inode(dir) && - !ext4_is_child_context_consistent_with_parent(dir, inode)) + !fscrypt_has_permitted_context(dir, inode)) return -EPERM; if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) && @@ -3557,8 +3555,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if ((old.dir != new.dir) && ext4_encrypted_inode(new.dir) && - !ext4_is_child_context_consistent_with_parent(new.dir, - old.inode)) { + !fscrypt_has_permitted_context(new.dir, old.inode)) { retval = -EPERM; goto end_rename; } @@ -3730,10 +3727,8 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if ((ext4_encrypted_inode(old_dir) || ext4_encrypted_inode(new_dir)) && (old_dir != new_dir) && - (!ext4_is_child_context_consistent_with_parent(new_dir, - old.inode) || - !ext4_is_child_context_consistent_with_parent(old_dir, - new.inode))) + (!fscrypt_has_permitted_context(new_dir, old.inode) || + !fscrypt_has_permitted_context(old_dir, new.inode))) return -EPERM; if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) && diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 2a01df9cc1c3..a6132a730967 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -24,6 +24,7 @@ #include <linux/slab.h> #include <linux/mm.h> #include <linux/backing-dev.h> +#include <linux/fscrypto.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -67,7 +68,6 @@ static void ext4_finish_bio(struct bio *bio) struct page *page = bvec->bv_page; #ifdef CONFIG_EXT4_FS_ENCRYPTION struct page *data_page = NULL; - struct ext4_crypto_ctx *ctx = NULL; #endif struct buffer_head *bh, *head; unsigned bio_start = bvec->bv_offset; @@ -82,8 +82,7 @@ static void ext4_finish_bio(struct bio *bio) if (!page->mapping) { /* The bounce data pages are unmapped. */ data_page = page; - ctx = (struct ext4_crypto_ctx *)page_private(data_page); - page = ctx->w.control_page; + fscrypt_pullback_bio_page(&page, false); } #endif @@ -113,8 +112,8 @@ static void ext4_finish_bio(struct bio *bio) local_irq_restore(flags); if (!under_io) { #ifdef CONFIG_EXT4_FS_ENCRYPTION - if (ctx) - ext4_restore_control_page(data_page); + if (data_page) + fscrypt_restore_control_page(data_page); #endif end_page_writeback(page); } @@ -340,9 +339,10 @@ void ext4_io_submit(struct ext4_io_submit *io) struct bio *bio = io->io_bio; if (bio) { - int io_op = io->io_wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC : WRITE; - submit_bio(io_op, io->io_bio); + int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC : 0; + bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags); + submit_bio(io->io_bio); } io->io_bio = NULL; } @@ -472,7 +472,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, gfp_t gfp_flags = GFP_NOFS; retry_encrypt: - data_page = ext4_encrypt(inode, page, gfp_flags); + data_page = fscrypt_encrypt_page(inode, page, gfp_flags); if (IS_ERR(data_page)) { ret = PTR_ERR(data_page); if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) { @@ -510,7 +510,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, if (ret) { out: if (data_page) - ext4_restore_control_page(data_page); + fscrypt_restore_control_page(data_page); printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); redirty_page_for_writepage(wbc, page); do { diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index dc54a4b60eba..a81b829d56de 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -46,37 +46,6 @@ #include "ext4.h" -/* - * Call ext4_decrypt on every single page, reusing the encryption - * context. - */ -static void completion_pages(struct work_struct *work) -{ -#ifdef CONFIG_EXT4_FS_ENCRYPTION - struct ext4_crypto_ctx *ctx = - container_of(work, struct ext4_crypto_ctx, r.work); - struct bio *bio = ctx->r.bio; - struct bio_vec *bv; - int i; - - bio_for_each_segment_all(bv, bio, i) { - struct page *page = bv->bv_page; - - int ret = ext4_decrypt(page); - if (ret) { - WARN_ON_ONCE(1); - SetPageError(page); - } else - SetPageUptodate(page); - unlock_page(page); - } - ext4_release_crypto_ctx(ctx); - bio_put(bio); -#else - BUG(); -#endif -} - static inline bool ext4_bio_encrypted(struct bio *bio) { #ifdef CONFIG_EXT4_FS_ENCRYPTION @@ -104,14 +73,10 @@ static void mpage_end_io(struct bio *bio) int i; if (ext4_bio_encrypted(bio)) { - struct ext4_crypto_ctx *ctx = bio->bi_private; - if (bio->bi_error) { - ext4_release_crypto_ctx(ctx); + fscrypt_release_ctx(bio->bi_private); } else { - INIT_WORK(&ctx->r.work, completion_pages); - ctx->r.bio = bio; - queue_work(ext4_read_workqueue, &ctx->r.work); + fscrypt_decrypt_bio_pages(bio->bi_private, bio); return; } } @@ -135,7 +100,6 @@ int ext4_mpage_readpages(struct address_space *mapping, unsigned nr_pages) { struct bio *bio = NULL; - unsigned page_idx; sector_t last_block_in_bio = 0; struct inode *inode = mapping->host; @@ -157,7 +121,7 @@ int ext4_mpage_readpages(struct address_space *mapping, map.m_len = 0; map.m_flags = 0; - for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { + for (; nr_pages; nr_pages--) { int fully_mapped = 1; unsigned first_hole = blocks_per_page; @@ -166,7 +130,7 @@ int ext4_mpage_readpages(struct address_space *mapping, page = list_entry(pages->prev, struct page, lru); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, - mapping_gfp_constraint(mapping, GFP_KERNEL))) + readahead_gfp_mask(mapping))) goto next_page; } @@ -271,15 +235,15 @@ int ext4_mpage_readpages(struct address_space *mapping, */ if (bio && (last_block_in_bio != blocks[0] - 1)) { submit_and_realloc: - submit_bio(READ, bio); + submit_bio(bio); bio = NULL; } if (bio == NULL) { - struct ext4_crypto_ctx *ctx = NULL; + struct fscrypt_ctx *ctx = NULL; if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - ctx = ext4_get_crypto_ctx(inode, GFP_NOFS); + ctx = fscrypt_get_ctx(inode, GFP_NOFS); if (IS_ERR(ctx)) goto set_error_page; } @@ -287,13 +251,14 @@ int ext4_mpage_readpages(struct address_space *mapping, min_t(int, nr_pages, BIO_MAX_PAGES)); if (!bio) { if (ctx) - ext4_release_crypto_ctx(ctx); + fscrypt_release_ctx(ctx); goto set_error_page; } bio->bi_bdev = bdev; bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; bio->bi_private = ctx; + bio_set_op_attrs(bio, REQ_OP_READ, 0); } length = first_hole << blkbits; @@ -303,14 +268,14 @@ int ext4_mpage_readpages(struct address_space *mapping, if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || (first_hole != blocks_per_page)) { - submit_bio(READ, bio); + submit_bio(bio); bio = NULL; } else last_block_in_bio = blocks[blocks_per_page - 1]; goto next_page; confused: if (bio) { - submit_bio(READ, bio); + submit_bio(bio); bio = NULL; } if (!PageUptodate(page)) @@ -323,6 +288,6 @@ int ext4_mpage_readpages(struct address_space *mapping, } BUG_ON(pages && !list_empty(pages)); if (bio) - submit_bio(READ, bio); + submit_bio(bio); return 0; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3822a5aedc61..1c593aa0218e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -945,9 +945,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_datasync_tid = 0; atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); -#ifdef CONFIG_EXT4_FS_ENCRYPTION - ei->i_crypt_info = NULL; -#endif return &ei->vfs_inode; } @@ -1026,8 +1023,7 @@ void ext4_clear_inode(struct inode *inode) EXT4_I(inode)->jinode = NULL; } #ifdef CONFIG_EXT4_FS_ENCRYPTION - if (EXT4_I(inode)->i_crypt_info) - ext4_free_encryption_info(inode, EXT4_I(inode)->i_crypt_info); + fscrypt_put_encryption_info(inode, NULL); #endif } @@ -1094,6 +1090,90 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page, return try_to_free_buffers(page); } +#ifdef CONFIG_EXT4_FS_ENCRYPTION +static int ext4_get_context(struct inode *inode, void *ctx, size_t len) +{ + return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len); +} + +static int ext4_key_prefix(struct inode *inode, u8 **key) +{ + *key = EXT4_SB(inode->i_sb)->key_prefix; + return EXT4_SB(inode->i_sb)->key_prefix_size; +} + +static int ext4_prepare_context(struct inode *inode) +{ + return ext4_convert_inline_data(inode); +} + +static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, + void *fs_data) +{ + handle_t *handle; + int res, res2; + + /* fs_data is null when internally used. */ + if (fs_data) { + res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, + len, 0); + if (!res) { + ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + ext4_clear_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA); + } + return res; + } + + handle = ext4_journal_start(inode, EXT4_HT_MISC, + ext4_jbd2_credits_xattr(inode)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, + len, 0); + if (!res) { + ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + res = ext4_mark_inode_dirty(handle, inode); + if (res) + EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); + } + res2 = ext4_journal_stop(handle); + if (!res) + res = res2; + return res; +} + +static int ext4_dummy_context(struct inode *inode) +{ + return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb)); +} + +static unsigned ext4_max_namelen(struct inode *inode) +{ + return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize : + EXT4_NAME_LEN; +} + +static struct fscrypt_operations ext4_cryptops = { + .get_context = ext4_get_context, + .key_prefix = ext4_key_prefix, + .prepare_context = ext4_prepare_context, + .set_context = ext4_set_context, + .dummy_context = ext4_dummy_context, + .is_encrypted = ext4_encrypted_inode, + .empty_dir = ext4_empty_dir, + .max_namelen = ext4_max_namelen, +}; +#else +static struct fscrypt_operations ext4_cryptops = { + .is_encrypted = ext4_encrypted_inode, +}; +#endif + #ifdef CONFIG_QUOTA static char *quotatypes[] = INITQFNAMES; #define QTYPE2NAME(t) (quotatypes[t]) @@ -2068,23 +2148,25 @@ failed: static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { - int offset; + int offset = offsetof(struct ext4_group_desc, bg_checksum); __u16 crc = 0; __le32 le_group = cpu_to_le32(block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_has_metadata_csum(sbi->s_sb)) { /* Use new metadata_csum algorithm */ - __le16 save_csum; __u32 csum32; + __u16 dummy_csum = 0; - save_csum = gdp->bg_checksum; - gdp->bg_checksum = 0; csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, sizeof(le_group)); - csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, - sbi->s_desc_size); - gdp->bg_checksum = save_csum; + csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset); + csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum, + sizeof(dummy_csum)); + offset += sizeof(dummy_csum); + if (offset < sbi->s_desc_size) + csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset, + sbi->s_desc_size - offset); crc = csum32 & 0xFFFF; goto out; @@ -2094,8 +2176,6 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, if (!ext4_has_feature_gdt_csum(sb)) return 0; - offset = offsetof(struct ext4_group_desc, bg_checksum); - crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); crc = crc16(crc, (__u8 *)gdp, offset); @@ -2278,6 +2358,16 @@ static void ext4_orphan_cleanup(struct super_block *sb, while (es->s_last_orphan) { struct inode *inode; + /* + * We may have encountered an error during cleanup; if + * so, skip the rest. + */ + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + es->s_last_orphan = 0; + break; + } + inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); if (IS_ERR(inode)) { es->s_last_orphan = 0; @@ -3416,6 +3506,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } + if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) { + ext4_msg(sb, KERN_ERR, + "Number of reserved GDT blocks insanely large: %d", + le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); + goto failed_mount; + } + if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { err = bdev_dax_supported(sb, blocksize); if (err) @@ -3686,6 +3783,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &ext4_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; + sb->s_cop = &ext4_cryptops; #ifdef CONFIG_QUOTA sb->dq_op = &ext4_quota_operations; if (ext4_has_feature_quota(sb)) @@ -3996,6 +4094,11 @@ no_journal: ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); kfree(orig_data); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + memcpy(sbi->key_prefix, EXT4_KEY_DESC_PREFIX, + EXT4_KEY_DESC_PREFIX_SIZE); + sbi->key_prefix_size = EXT4_KEY_DESC_PREFIX_SIZE; +#endif return 0; cantfind_ext4: @@ -4204,7 +4307,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, goto out_bdev; } journal->j_private = sb; - ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer); + ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer); wait_on_buffer(journal->j_sb_buffer); if (!buffer_uptodate(journal->j_sb_buffer)) { ext4_msg(sb, KERN_ERR, "I/O error on journal device"); @@ -4327,20 +4430,6 @@ static int ext4_commit_super(struct super_block *sb, int sync) if (!sbh || block_device_ejected(sb)) return error; - if (buffer_write_io_error(sbh)) { - /* - * Oh, dear. A previous attempt to write the - * superblock failed. This could happen because the - * USB device was yanked out. Or it could happen to - * be a transient write error and maybe the block will - * be remapped. Nothing we can do but to retry the - * write and hope for the best. - */ - ext4_msg(sb, KERN_ERR, "previous I/O error to " - "superblock detected"); - clear_buffer_write_io_error(sbh); - set_buffer_uptodate(sbh); - } /* * If the file system is mounted read-only, don't update the * superblock write time. This avoids updating the superblock @@ -4371,7 +4460,23 @@ static int ext4_commit_super(struct super_block *sb, int sync) &EXT4_SB(sb)->s_freeinodes_counter)); BUFFER_TRACE(sbh, "marking dirty"); ext4_superblock_csum_set(sb); + lock_buffer(sbh); + if (buffer_write_io_error(sbh)) { + /* + * Oh, dear. A previous attempt to write the + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + ext4_msg(sb, KERN_ERR, "previous I/O error to " + "superblock detected"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } mark_buffer_dirty(sbh); + unlock_buffer(sbh); if (sync) { error = __sync_dirty_buffer(sbh, test_opt(sb, BARRIER) ? WRITE_FUA : WRITE_SYNC); @@ -5422,7 +5527,6 @@ out5: static void __exit ext4_exit_fs(void) { - ext4_exit_crypto(); ext4_destroy_lazyinit_thread(); unregister_as_ext2(); unregister_as_ext3(); diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 75ed5c2f0c16..4d83d9e05f2e 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -22,23 +22,22 @@ #include "ext4.h" #include "xattr.h" -#ifdef CONFIG_EXT4_FS_ENCRYPTION static const char *ext4_encrypted_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct page *cpage = NULL; char *caddr, *paddr = NULL; - struct ext4_str cstr, pstr; - struct ext4_encrypted_symlink_data *sd; + struct fscrypt_str cstr, pstr; + struct fscrypt_symlink_data *sd; loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); int res; - u32 plen, max_size = inode->i_sb->s_blocksize; + u32 max_size = inode->i_sb->s_blocksize; if (!dentry) return ERR_PTR(-ECHILD); - res = ext4_get_encryption_info(inode); + res = fscrypt_get_encryption_info(inode); if (res) return ERR_PTR(res); @@ -54,30 +53,27 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, } /* Symlink is encrypted */ - sd = (struct ext4_encrypted_symlink_data *)caddr; + sd = (struct fscrypt_symlink_data *)caddr; cstr.name = sd->encrypted_path; cstr.len = le16_to_cpu(sd->len); - if ((cstr.len + - sizeof(struct ext4_encrypted_symlink_data) - 1) > - max_size) { + if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > max_size) { /* Symlink data on the disk is corrupted */ res = -EFSCORRUPTED; goto errout; } - plen = (cstr.len < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) ? - EXT4_FNAME_CRYPTO_DIGEST_SIZE*2 : cstr.len; - paddr = kmalloc(plen + 1, GFP_NOFS); - if (!paddr) { - res = -ENOMEM; + + res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); + if (res) goto errout; - } - pstr.name = paddr; - pstr.len = plen; - res = _ext4_fname_disk_to_usr(inode, NULL, &cstr, &pstr); + + res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); if (res < 0) goto errout; + + paddr = pstr.name; + /* Null-terminate the name */ - if (res <= plen) + if (res <= pstr.len) paddr[res] = '\0'; if (cpage) put_page(cpage); @@ -99,7 +95,6 @@ const struct inode_operations ext4_encrypted_symlink_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, }; -#endif const struct inode_operations ext4_symlink_inode_operations = { .readlink = generic_readlink, diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index e79bd32b9b79..39e9cfb1b371 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -121,17 +121,18 @@ static __le32 ext4_xattr_block_csum(struct inode *inode, { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; - __le32 save_csum; __le64 dsk_block_nr = cpu_to_le64(block_nr); + __u32 dummy_csum = 0; + int offset = offsetof(struct ext4_xattr_header, h_checksum); - save_csum = hdr->h_checksum; - hdr->h_checksum = 0; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - csum = ext4_chksum(sbi, csum, (__u8 *)hdr, - EXT4_BLOCK_SIZE(inode->i_sb)); + csum = ext4_chksum(sbi, csum, (__u8 *)hdr, offset); + csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); + offset += sizeof(dummy_csum); + csum = ext4_chksum(sbi, csum, (__u8 *)hdr + offset, + EXT4_BLOCK_SIZE(inode->i_sb) - offset); - hdr->h_checksum = save_csum; return cpu_to_le32(csum); } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 389160049993..124b4a3017b5 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -63,14 +63,15 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = READ_SYNC | REQ_META | REQ_PRIO, + .op = REQ_OP_READ, + .op_flags = READ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = index, .new_blkaddr = index, .encrypted_page = NULL, }; if (unlikely(!is_meta)) - fio.rw &= ~REQ_META; + fio.op_flags &= ~REQ_META; repeat: page = f2fs_grab_cache_page(mapping, index, false); if (!page) { @@ -157,13 +158,14 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA, + .op = REQ_OP_READ, + .op_flags = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : REQ_RAHEAD, .encrypted_page = NULL, }; struct blk_plug plug; if (unlikely(type == META_POR)) - fio.rw &= ~REQ_META; + fio.op_flags &= ~REQ_META; blk_start_plug(&plug); for (; nrpages-- > 0; blkno++) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9a8bbc1fb1fa..ded224518978 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -97,12 +97,11 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, return bio; } -static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, - struct bio *bio) +static inline void __submit_bio(struct f2fs_sb_info *sbi, struct bio *bio) { - if (!is_read_io(rw)) + if (!is_read_io(bio_op(bio))) atomic_inc(&sbi->nr_wb_bios); - submit_bio(rw, bio); + submit_bio(bio); } static void __submit_merged_bio(struct f2fs_bio_info *io) @@ -112,12 +111,14 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) if (!io->bio) return; - if (is_read_io(fio->rw)) + if (is_read_io(fio->op)) trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio); else trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio); - __submit_bio(io->sbi, fio->rw, io->bio); + bio_set_op_attrs(io->bio, fio->op, fio->op_flags); + + __submit_bio(io->sbi, io->bio); io->bio = NULL; } @@ -183,10 +184,12 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; + io->fio.op = REQ_OP_WRITE; if (test_opt(sbi, NOBARRIER)) - io->fio.rw = WRITE_FLUSH | REQ_META | REQ_PRIO; + io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO; else - io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; + io->fio.op_flags = WRITE_FLUSH_FUA | REQ_META | + REQ_PRIO; } __submit_merged_bio(io); out: @@ -228,14 +231,16 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) f2fs_trace_ios(fio, 0); /* Allocate a new bio */ - bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->rw)); + bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->op)); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); return -EFAULT; } + bio->bi_rw = fio->op_flags; + bio_set_op_attrs(bio, fio->op, fio->op_flags); - __submit_bio(fio->sbi, fio->rw, bio); + __submit_bio(fio->sbi, bio); return 0; } @@ -244,7 +249,7 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io; - bool is_read = is_read_io(fio->rw); + bool is_read = is_read_io(fio->op); struct page *bio_page; io = is_read ? &sbi->read_io : &sbi->write_io[btype]; @@ -256,7 +261,7 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) down_write(&io->io_rwsem); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || - io->fio.rw != fio->rw)) + (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags))) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { @@ -390,7 +395,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) } struct page *get_read_data_page(struct inode *inode, pgoff_t index, - int rw, bool for_write) + int op_flags, bool for_write) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; @@ -400,7 +405,8 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = rw, + .op = REQ_OP_READ, + .op_flags = op_flags, .encrypted_page = NULL, }; @@ -996,7 +1002,8 @@ static int f2fs_mpage_readpages(struct address_space *mapping, page = list_entry(pages->prev, struct page, lru); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, - page->index, GFP_KERNEL)) + page->index, + readahead_gfp_mask(mapping))) goto next_page; } @@ -1051,7 +1058,7 @@ got_it: */ if (bio && (last_block_in_bio != block_nr - 1)) { submit_and_realloc: - __submit_bio(F2FS_I_SB(inode), READ, bio); + __submit_bio(F2FS_I_SB(inode), bio); bio = NULL; } if (bio == NULL) { @@ -1080,6 +1087,7 @@ submit_and_realloc: bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(block_nr); bio->bi_end_io = f2fs_read_end_io; bio->bi_private = ctx; + bio_set_op_attrs(bio, REQ_OP_READ, 0); } if (bio_add_page(bio, page, blocksize, 0) < blocksize) @@ -1094,7 +1102,7 @@ set_error_page: goto next_page; confused: if (bio) { - __submit_bio(F2FS_I_SB(inode), READ, bio); + __submit_bio(F2FS_I_SB(inode), bio); bio = NULL; } unlock_page(page); @@ -1104,7 +1112,7 @@ next_page: } BUG_ON(pages && !list_empty(pages)); if (bio) - __submit_bio(F2FS_I_SB(inode), READ, bio); + __submit_bio(F2FS_I_SB(inode), bio); return 0; } @@ -1221,7 +1229,8 @@ static int f2fs_write_data_page(struct page *page, struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + .op = REQ_OP_WRITE, + .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0, .page = page, .encrypted_page = NULL, }; @@ -1662,7 +1671,8 @@ repeat: struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, - .rw = READ_SYNC, + .op = REQ_OP_READ, + .op_flags = READ_SYNC, .old_blkaddr = blkaddr, .new_blkaddr = blkaddr, .page = page, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 916e7c238e3d..23ae6a81ccd6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -686,14 +686,15 @@ enum page_type { struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ - int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */ + int op; /* contains REQ_OP_ */ + int op_flags; /* rq_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ }; -#define is_read_io(rw) (((rw) & 1) == READ) +#define is_read_io(rw) (rw == READ) struct f2fs_bio_info { struct f2fs_sb_info *sbi; /* f2fs superblock */ struct bio *bio; /* bios to merge */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 38d56f678912..f06ed73adf99 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -538,7 +538,8 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = READ_SYNC, + .op = REQ_OP_READ, + .op_flags = READ_SYNC, .encrypted_page = NULL, }; struct dnode_of_data dn; @@ -612,7 +613,8 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) /* allocate block address */ f2fs_wait_on_page_writeback(dn.node_page, NODE, true); - fio.rw = WRITE_SYNC; + fio.op = REQ_OP_WRITE; + fio.op_flags = WRITE_SYNC; fio.new_blkaddr = newaddr; f2fs_submit_page_mbio(&fio); @@ -649,7 +651,8 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = WRITE_SYNC, + .op = REQ_OP_WRITE, + .op_flags = WRITE_SYNC, .page = page, .encrypted_page = NULL, }; @@ -730,7 +733,8 @@ next_step: start_bidx = start_bidx_of_node(nofs, inode); data_page = get_read_data_page(inode, - start_bidx + ofs_in_node, READA, true); + start_bidx + ofs_in_node, REQ_RAHEAD, + true); if (IS_ERR(data_page)) { iput(inode); continue; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index a4bb155dd00a..c15e53c1d794 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -108,7 +108,8 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) struct f2fs_io_info fio = { .sbi = F2FS_I_SB(dn->inode), .type = DATA, - .rw = WRITE_SYNC | REQ_PRIO, + .op = REQ_OP_WRITE, + .op_flags = WRITE_SYNC | REQ_PRIO, .page = page, .encrypted_page = NULL, }; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1f21aae80c40..d1867698e601 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1070,14 +1070,15 @@ fail: * 0: f2fs_put_page(page, 0) * LOCKED_PAGE or error: f2fs_put_page(page, 1) */ -static int read_node_page(struct page *page, int rw) +static int read_node_page(struct page *page, int op_flags) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); struct node_info ni; struct f2fs_io_info fio = { .sbi = sbi, .type = NODE, - .rw = rw, + .op = REQ_OP_READ, + .op_flags = op_flags, .page = page, .encrypted_page = NULL, }; @@ -1118,7 +1119,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) if (!apage) return; - err = read_node_page(apage, READA); + err = read_node_page(apage, REQ_RAHEAD); f2fs_put_page(apage, err ? 1 : 0); } @@ -1568,7 +1569,8 @@ static int f2fs_write_node_page(struct page *page, struct f2fs_io_info fio = { .sbi = sbi, .type = NODE, - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + .op = REQ_OP_WRITE, + .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0, .page = page, .encrypted_page = NULL, }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 2e6f537a0e7d..4c2d1fa1e0e2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -257,7 +257,8 @@ static int __commit_inmem_pages(struct inode *inode, struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, - .rw = WRITE_SYNC | REQ_PRIO, + .op = REQ_OP_WRITE, + .op_flags = WRITE_SYNC | REQ_PRIO, .encrypted_page = NULL, }; bool submit_bio = false; @@ -406,7 +407,8 @@ repeat: fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); bio->bi_bdev = sbi->sb->s_bdev; - ret = submit_bio_wait(WRITE_FLUSH, bio); + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + ret = submit_bio_wait(bio); llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { @@ -438,7 +440,8 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) int ret; bio->bi_bdev = sbi->sb->s_bdev; - ret = submit_bio_wait(WRITE_FLUSH, bio); + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + ret = submit_bio_wait(bio); bio_put(bio); return ret; } @@ -1401,7 +1404,8 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = WRITE_SYNC | REQ_META | REQ_PRIO, + .op = REQ_OP_WRITE, + .op_flags = WRITE_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = page->index, .new_blkaddr = page->index, .page = page, @@ -1409,7 +1413,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) }; if (unlikely(page->index >= MAIN_BLKADDR(sbi))) - fio.rw &= ~REQ_META; + fio.op_flags &= ~REQ_META; set_page_writeback(page); f2fs_submit_page_mbio(&fio); diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 562ce0821559..73b4e1d1912a 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -25,11 +25,11 @@ static inline void __print_last_io(void) if (!last_io.len) return; - trace_printk("%3x:%3x %4x %-16s %2x %5x %12x %4x\n", + trace_printk("%3x:%3x %4x %-16s %2x %5x %5x %12x %4x\n", last_io.major, last_io.minor, last_io.pid, "----------------", last_io.type, - last_io.fio.rw, + last_io.fio.op, last_io.fio.op_flags, last_io.fio.new_blkaddr, last_io.len); memset(&last_io, 0, sizeof(last_io)); @@ -101,7 +101,8 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) if (last_io.major == major && last_io.minor == minor && last_io.pid == pid && last_io.type == __file_type(inode, pid) && - last_io.fio.rw == fio->rw && + last_io.fio.op == fio->op && + last_io.fio.op_flags == fio->op_flags && last_io.fio.new_blkaddr + last_io.len == fio->new_blkaddr) { last_io.len++; diff --git a/fs/fat/misc.c b/fs/fat/misc.c index c4589e981760..8a8698119ff7 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -267,7 +267,7 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs) int i, err = 0; for (i = 0; i < nr_bhs; i++) - write_dirty_buffer(bhs[i], WRITE); + write_dirty_buffer(bhs[i], 0); for (i = 0; i < nr_bhs; i++) { wait_on_buffer(bhs[i]); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 989a2cef6b76..6f9c9f6f5157 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -483,9 +483,9 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) goto out_free; } inode->i_state |= I_WB_SWITCH; + __iget(inode); spin_unlock(&inode->i_lock); - ihold(inode); isw->inode = inode; atomic_inc(&isw_nr_in_flight); @@ -981,6 +981,42 @@ void inode_io_list_del(struct inode *inode) } /* + * mark an inode as under writeback on the sb + */ +void sb_mark_inode_writeback(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + unsigned long flags; + + if (list_empty(&inode->i_wb_list)) { + spin_lock_irqsave(&sb->s_inode_wblist_lock, flags); + if (list_empty(&inode->i_wb_list)) { + list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb); + trace_sb_mark_inode_writeback(inode); + } + spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags); + } +} + +/* + * clear an inode as under writeback on the sb + */ +void sb_clear_inode_writeback(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + unsigned long flags; + + if (!list_empty(&inode->i_wb_list)) { + spin_lock_irqsave(&sb->s_inode_wblist_lock, flags); + if (!list_empty(&inode->i_wb_list)) { + list_del_init(&inode->i_wb_list); + trace_sb_clear_inode_writeback(inode); + } + spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags); + } +} + +/* * Redirty an inode: set its when-it-was dirtied timestamp and move it to the * furthest end of its superblock's dirty-inode list. * @@ -2154,7 +2190,7 @@ EXPORT_SYMBOL(__mark_inode_dirty); */ static void wait_sb_inodes(struct super_block *sb) { - struct inode *inode, *old_inode = NULL; + LIST_HEAD(sync_list); /* * We need to be protected against the filesystem going from @@ -2163,38 +2199,60 @@ static void wait_sb_inodes(struct super_block *sb) WARN_ON(!rwsem_is_locked(&sb->s_umount)); mutex_lock(&sb->s_sync_lock); - spin_lock(&sb->s_inode_list_lock); /* - * Data integrity sync. Must wait for all pages under writeback, - * because there may have been pages dirtied before our sync - * call, but which had writeout started before we write it out. - * In which case, the inode may not be on the dirty list, but - * we still have to wait for that writeout. + * Splice the writeback list onto a temporary list to avoid waiting on + * inodes that have started writeback after this point. + * + * Use rcu_read_lock() to keep the inodes around until we have a + * reference. s_inode_wblist_lock protects sb->s_inodes_wb as well as + * the local list because inodes can be dropped from either by writeback + * completion. + */ + rcu_read_lock(); + spin_lock_irq(&sb->s_inode_wblist_lock); + list_splice_init(&sb->s_inodes_wb, &sync_list); + + /* + * Data integrity sync. Must wait for all pages under writeback, because + * there may have been pages dirtied before our sync call, but which had + * writeout started before we write it out. In which case, the inode + * may not be on the dirty list, but we still have to wait for that + * writeout. */ - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + while (!list_empty(&sync_list)) { + struct inode *inode = list_first_entry(&sync_list, struct inode, + i_wb_list); struct address_space *mapping = inode->i_mapping; + /* + * Move each inode back to the wb list before we drop the lock + * to preserve consistency between i_wb_list and the mapping + * writeback tag. Writeback completion is responsible to remove + * the inode from either list once the writeback tag is cleared. + */ + list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb); + + /* + * The mapping can appear untagged while still on-list since we + * do not have the mapping lock. Skip it here, wb completion + * will remove it. + */ + if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) + continue; + + spin_unlock_irq(&sb->s_inode_wblist_lock); + spin_lock(&inode->i_lock); - if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || - (mapping->nrpages == 0)) { + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { spin_unlock(&inode->i_lock); + + spin_lock_irq(&sb->s_inode_wblist_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); - - /* - * We hold a reference to 'inode' so it couldn't have been - * removed from s_inodes list while we dropped the - * s_inode_list_lock. We cannot iput the inode now as we can - * be holding the last reference and we cannot iput it under - * s_inode_list_lock. So we keep the reference and iput it - * later. - */ - iput(old_inode); - old_inode = inode; + rcu_read_unlock(); /* * We keep the error status of individual mapping so that @@ -2205,10 +2263,13 @@ static void wait_sb_inodes(struct super_block *sb) cond_resched(); - spin_lock(&sb->s_inode_list_lock); + iput(inode); + + rcu_read_lock(); + spin_lock_irq(&sb->s_inode_wblist_lock); } - spin_unlock(&sb->s_inode_list_lock); - iput(old_inode); + spin_unlock_irq(&sb->s_inode_wblist_lock); + rcu_read_unlock(); mutex_unlock(&sb->s_sync_lock); } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 3078b679fcd1..c8c4f79c7ce1 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -887,6 +887,8 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) put_page(results[i]); } + wake_up_bit(&cookie->flags, 0); + _leave(""); } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index ccd4971cc6c1..cca7b048c07b 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -341,8 +341,10 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, struct dentry *newent; bool outarg_valid = true; + fuse_lock_inode(dir); err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, &outarg, &inode); + fuse_unlock_inode(dir); if (err == -ENOENT) { outarg_valid = false; err = 0; @@ -478,7 +480,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, struct fuse_conn *fc = get_fuse_conn(dir); struct dentry *res = NULL; - if (d_unhashed(entry)) { + if (d_in_lookup(entry)) { res = fuse_lookup(dir, entry, 0); if (IS_ERR(res)) return PTR_ERR(res); @@ -1341,7 +1343,9 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx) fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, FUSE_READDIR); } + fuse_lock_inode(inode); fuse_request_send(fc, req); + fuse_unlock_inode(inode); nbytes = req->out.args[0].size; err = req->out.h.error; fuse_put_request(fc, req); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index eddbe02c4028..929c383432b0 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -110,6 +110,9 @@ struct fuse_inode { /** Miscellaneous bits describing inode state */ unsigned long state; + + /** Lock for serializing lookup and readdir for back compatibility*/ + struct mutex mutex; }; /** FUSE inode state bits */ @@ -540,6 +543,9 @@ struct fuse_conn { /** write-back cache policy (default is write-through) */ unsigned writeback_cache:1; + /** allow parallel lookups and readdir (default is serialized) */ + unsigned parallel_dirops:1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction @@ -956,4 +962,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, void fuse_set_initialized(struct fuse_conn *fc); +void fuse_unlock_inode(struct inode *inode); +void fuse_lock_inode(struct inode *inode); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 1ce67668a8e1..9961d8432ce3 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -97,6 +97,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&fi->queued_writes); INIT_LIST_HEAD(&fi->writepages); init_waitqueue_head(&fi->page_waitq); + mutex_init(&fi->mutex); fi->forget = fuse_alloc_forget(); if (!fi->forget) { kmem_cache_free(fuse_inode_cachep, inode); @@ -117,6 +118,7 @@ static void fuse_destroy_inode(struct inode *inode) struct fuse_inode *fi = get_fuse_inode(inode); BUG_ON(!list_empty(&fi->write_files)); BUG_ON(!list_empty(&fi->queued_writes)); + mutex_destroy(&fi->mutex); kfree(fi->forget); call_rcu(&inode->i_rcu, fuse_i_callback); } @@ -351,6 +353,18 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, return 0; } +void fuse_lock_inode(struct inode *inode) +{ + if (!get_fuse_conn(inode)->parallel_dirops) + mutex_lock(&get_fuse_inode(inode)->mutex); +} + +void fuse_unlock_inode(struct inode *inode) +{ + if (!get_fuse_conn(inode)->parallel_dirops) + mutex_unlock(&get_fuse_inode(inode)->mutex); +} + static void fuse_umount_begin(struct super_block *sb) { fuse_abort_conn(get_fuse_conn_super(sb)); @@ -898,6 +912,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->async_dio = 1; if (arg->flags & FUSE_WRITEBACK_CACHE) fc->writeback_cache = 1; + if (arg->flags & FUSE_PARALLEL_DIROPS) + fc->parallel_dirops = 1; if (arg->time_gran && arg->time_gran <= 1000000000) fc->sb->s_time_gran = arg->time_gran; } else { @@ -928,7 +944,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | - FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT; + FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | + FUSE_PARALLEL_DIROPS; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 37b7bc14c8da..82df36886938 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -140,6 +140,32 @@ static int gfs2_writepage(struct page *page, struct writeback_control *wbc) return nobh_writepage(page, gfs2_get_block_noalloc, wbc); } +/* This is the same as calling block_write_full_page, but it also + * writes pages outside of i_size + */ +int gfs2_write_full_page(struct page *page, get_block_t *get_block, + struct writeback_control *wbc) +{ + struct inode * const inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + const pgoff_t end_index = i_size >> PAGE_SHIFT; + unsigned offset; + + /* + * The page straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + offset = i_size & (PAGE_SIZE-1); + if (page->index == end_index && offset) + zero_user_segment(page, offset, PAGE_SIZE); + + return __block_write_full_page(inode, page, get_block, wbc, + end_buffer_async_write); +} + /** * __gfs2_jdata_writepage - The core of jdata writepage * @page: The page to write @@ -165,7 +191,7 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w } gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1); } - return block_write_full_page(page, gfs2_get_block_noalloc, wbc); + return gfs2_write_full_page(page, gfs2_get_block_noalloc, wbc); } /** @@ -180,27 +206,20 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); int ret; - int done_trans = 0; - if (PageChecked(page)) { - if (wbc->sync_mode != WB_SYNC_ALL) - goto out_ignore; - ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); - if (ret) - goto out_ignore; - done_trans = 1; - } - ret = gfs2_writepage_common(page, wbc); - if (ret > 0) - ret = __gfs2_jdata_writepage(page, wbc); - if (done_trans) - gfs2_trans_end(sdp); + if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) + goto out; + if (PageChecked(page) || current->journal_info) + goto out_ignore; + ret = __gfs2_jdata_writepage(page, wbc); return ret; out_ignore: redirty_page_for_writepage(wbc, page); +out: unlock_page(page); return 0; } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 24ce1cdd434a..6e2bec1cd289 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -285,7 +285,8 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl, if (trylock_buffer(rabh)) { if (!buffer_uptodate(rabh)) { rabh->b_end_io = end_buffer_read_sync; - submit_bh(READA | REQ_META, rabh); + submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META, + rabh); continue; } unlock_buffer(rabh); @@ -974,7 +975,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from) if (!buffer_uptodate(bh)) { err = -EIO; - ll_rw_block(READ, 1, &bh); + ll_rw_block(REQ_OP_READ, 0, 1, &bh); wait_on_buffer(bh); /* Uhhuh. Read error. Complain and punt. */ if (!buffer_uptodate(bh)) diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 30822b148f3e..5173b98ca036 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -117,7 +117,7 @@ static int gfs2_dentry_delete(const struct dentry *dentry) return 0; ginode = GFS2_I(d_inode(dentry)); - if (!ginode->i_iopen_gh.gh_gl) + if (!gfs2_holder_initialized(&ginode->i_iopen_gh)) return 0; if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags)) diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 271d93905bac..fcb59b23f1e3 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1513,7 +1513,7 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index, continue; } bh->b_end_io = end_buffer_read_sync; - submit_bh(READA | REQ_META, bh); + submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META, bh); continue; } brelse(bh); @@ -1663,7 +1663,8 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name, brelse(bh); if (fail_on_exist) return ERR_PTR(-EEXIST); - inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino); + inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, + GFS2_BLKST_FREE /* ignore */); if (!IS_ERR(inode)) GFS2_I(inode)->i_rahead = rahead; return inode; diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index d5bda8513457..a332f3cd925e 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -137,21 +137,10 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, struct gfs2_sbd *sdp = sb->s_fs_info; struct inode *inode; - inode = gfs2_ilookup(sb, inum->no_addr); - if (inode) { - if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { - iput(inode); - return ERR_PTR(-ESTALE); - } - goto out_inode; - } - inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino, GFS2_BLKST_DINODE); if (IS_ERR(inode)) return ERR_CAST(inode); - -out_inode: return d_obtain_alias(inode); } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index e0f98e483aec..320e65e61938 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1098,7 +1098,7 @@ static void do_unflock(struct file *file, struct file_lock *fl) mutex_lock(&fp->f_fl_mutex); locks_lock_file_wait(file, fl); - if (fl_gh->gh_gl) { + if (gfs2_holder_initialized(fl_gh)) { gfs2_glock_dq(fl_gh); gfs2_holder_uninit(fl_gh); } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 706fd9352f36..3a90b2b5b9bb 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -575,7 +575,6 @@ static void delete_work_func(struct work_struct *work) { struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct gfs2_inode *ip; struct inode *inode; u64 no_addr = gl->gl_name.ln_number; @@ -585,13 +584,7 @@ static void delete_work_func(struct work_struct *work) if (test_bit(GLF_INODE_CREATING, &gl->gl_flags)) goto out; - ip = gl->gl_object; - /* Note: Unsafe to dereference ip as we don't hold right refs/locks */ - - if (ip) - inode = gfs2_ilookup(sdp->sd_vfs, no_addr); - else - inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED); + inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED); if (inode && !IS_ERR(inode)) { d_prune_aliases(inode); iput(inode); @@ -808,7 +801,7 @@ void gfs2_holder_uninit(struct gfs2_holder *gh) { put_pid(gh->gh_owner_pid); gfs2_glock_put(gh->gh_gl); - gh->gh_gl = NULL; + gfs2_holder_mark_uninitialized(gh); gh->gh_ip = 0; } diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 46ab67fc16da..ab1ef322f7a5 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -247,4 +247,14 @@ extern void gfs2_unregister_debugfs(void); extern const struct lm_lockops gfs2_dlm_ops; +static inline void gfs2_holder_mark_uninitialized(struct gfs2_holder *gh) +{ + gh->gh_gl = NULL; +} + +static inline bool gfs2_holder_initialized(struct gfs2_holder *gh) +{ + return gh->gh_gl; +} + #endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 21dc784f66c2..e0621cacf134 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -37,9 +37,35 @@ #include "super.h" #include "glops.h" -struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr) +static int iget_test(struct inode *inode, void *opaque) { - return ilookup(sb, (unsigned long)no_addr); + u64 no_addr = *(u64 *)opaque; + + return GFS2_I(inode)->i_no_addr == no_addr; +} + +static int iget_set(struct inode *inode, void *opaque) +{ + u64 no_addr = *(u64 *)opaque; + + GFS2_I(inode)->i_no_addr = no_addr; + inode->i_ino = no_addr; + return 0; +} + +static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr) +{ + struct inode *inode; + +repeat: + inode = iget5_locked(sb, no_addr, iget_test, iget_set, &no_addr); + if (!inode) + return inode; + if (is_bad_inode(inode)) { + iput(inode); + goto repeat; + } + return inode; } /** @@ -78,26 +104,37 @@ static void gfs2_set_iop(struct inode *inode) /** * gfs2_inode_lookup - Lookup an inode * @sb: The super block - * @no_addr: The inode number * @type: The type of the inode + * @no_addr: The inode number + * @no_formal_ino: The inode generation number + * @blktype: Requested block type (GFS2_BLKST_DINODE or GFS2_BLKST_UNLINKED; + * GFS2_BLKST_FREE do indicate not to verify) + * + * If @type is DT_UNKNOWN, the inode type is fetched from disk. + * + * If @blktype is anything other than GFS2_BLKST_FREE (which is used as a + * placeholder because it doesn't otherwise make sense), the on-disk block type + * is verified to be @blktype. * * Returns: A VFS inode, or an error */ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, - u64 no_addr, u64 no_formal_ino) + u64 no_addr, u64 no_formal_ino, + unsigned int blktype) { struct inode *inode; struct gfs2_inode *ip; struct gfs2_glock *io_gl = NULL; + struct gfs2_holder i_gh; int error; - inode = iget_locked(sb, (unsigned long)no_addr); + gfs2_holder_mark_uninitialized(&i_gh); + inode = gfs2_iget(sb, no_addr); if (!inode) return ERR_PTR(-ENOMEM); ip = GFS2_I(inode); - ip->i_no_addr = no_addr; if (inode->i_state & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -112,10 +149,29 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (unlikely(error)) goto fail_put; + if (type == DT_UNKNOWN || blktype != GFS2_BLKST_FREE) { + /* + * The GL_SKIP flag indicates to skip reading the inode + * block. We read the inode with gfs2_inode_refresh + * after possibly checking the block type. + */ + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, + GL_SKIP, &i_gh); + if (error) + goto fail_put; + + if (blktype != GFS2_BLKST_FREE) { + error = gfs2_check_blk_type(sdp, no_addr, + blktype); + if (error) + goto fail_put; + } + } + set_bit(GIF_INVALID, &ip->i_flags); error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (unlikely(error)) - goto fail_iopen; + goto fail_put; ip->i_iopen_gh.gh_gl->gl_object = ip; gfs2_glock_put(io_gl); @@ -134,6 +190,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, unlock_new_inode(inode); } + if (gfs2_holder_initialized(&i_gh)) + gfs2_glock_dq_uninit(&i_gh); return inode; fail_refresh: @@ -141,10 +199,11 @@ fail_refresh: ip->i_iopen_gh.gh_gl->gl_object = NULL; gfs2_glock_dq_wait(&ip->i_iopen_gh); gfs2_holder_uninit(&ip->i_iopen_gh); -fail_iopen: +fail_put: if (io_gl) gfs2_glock_put(io_gl); -fail_put: + if (gfs2_holder_initialized(&i_gh)) + gfs2_glock_dq_uninit(&i_gh); ip->i_gl->gl_object = NULL; fail: iget_failed(inode); @@ -155,23 +214,12 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, u64 *no_formal_ino, unsigned int blktype) { struct super_block *sb = sdp->sd_vfs; - struct gfs2_holder i_gh; - struct inode *inode = NULL; + struct inode *inode; int error; - /* Must not read in block until block type is verified */ - error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops, - LM_ST_EXCLUSIVE, GL_SKIP, &i_gh); - if (error) - return ERR_PTR(error); - - error = gfs2_check_blk_type(sdp, no_addr, blktype); - if (error) - goto fail; - - inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0); + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0, blktype); if (IS_ERR(inode)) - goto fail; + return inode; /* Two extra checks for NFS only */ if (no_formal_ino) { @@ -182,16 +230,12 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, error = -EIO; if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) goto fail_iput; - - error = 0; } + return inode; -fail: - gfs2_glock_dq_uninit(&i_gh); - return error ? ERR_PTR(error) : inode; fail_iput: iput(inode); - goto fail; + return ERR_PTR(error); } @@ -236,8 +280,8 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, struct gfs2_holder d_gh; int error = 0; struct inode *inode = NULL; - int unlock = 0; + gfs2_holder_mark_uninitialized(&d_gh); if (!name->len || name->len > GFS2_FNAMESIZE) return ERR_PTR(-ENAMETOOLONG); @@ -252,7 +296,6 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); if (error) return ERR_PTR(error); - unlock = 1; } if (!is_root) { @@ -265,7 +308,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, if (IS_ERR(inode)) error = PTR_ERR(inode); out: - if (unlock) + if (gfs2_holder_initialized(&d_gh)) gfs2_glock_dq_uninit(&d_gh); if (error == -ENOENT) return NULL; @@ -1189,7 +1232,7 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, struct dentry *d; bool excl = !!(flags & O_EXCL); - if (!d_unhashed(dentry)) + if (!d_in_lookup(dentry)) goto skip_lookup; d = __gfs2_lookup(dir, dentry, file, opened); @@ -1309,7 +1352,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, struct gfs2_inode *ip = GFS2_I(d_inode(odentry)); struct gfs2_inode *nip = NULL; struct gfs2_sbd *sdp = GFS2_SB(odir); - struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; + struct gfs2_holder ghs[5], r_gh; struct gfs2_rgrpd *nrgd; unsigned int num_gh; int dir_rename = 0; @@ -1317,6 +1360,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, unsigned int x; int error; + gfs2_holder_mark_uninitialized(&r_gh); if (d_really_is_positive(ndentry)) { nip = GFS2_I(d_inode(ndentry)); if (ip == nip) @@ -1506,7 +1550,7 @@ out_gunlock: gfs2_holder_uninit(ghs + x); } out_gunlock_r: - if (r_gh.gh_gl) + if (gfs2_holder_initialized(&r_gh)) gfs2_glock_dq_uninit(&r_gh); out: return error; @@ -1532,13 +1576,14 @@ static int gfs2_exchange(struct inode *odir, struct dentry *odentry, struct gfs2_inode *oip = GFS2_I(odentry->d_inode); struct gfs2_inode *nip = GFS2_I(ndentry->d_inode); struct gfs2_sbd *sdp = GFS2_SB(odir); - struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; + struct gfs2_holder ghs[5], r_gh; unsigned int num_gh; unsigned int x; umode_t old_mode = oip->i_inode.i_mode; umode_t new_mode = nip->i_inode.i_mode; int error; + gfs2_holder_mark_uninitialized(&r_gh); error = gfs2_rindex_update(sdp); if (error) return error; @@ -1646,7 +1691,7 @@ out_gunlock: gfs2_holder_uninit(ghs + x); } out_gunlock_r: - if (r_gh.gh_gl) + if (gfs2_holder_initialized(&r_gh)) gfs2_glock_dq_uninit(&r_gh); out: return error; @@ -1743,9 +1788,8 @@ int gfs2_permission(struct inode *inode, int mask) struct gfs2_inode *ip; struct gfs2_holder i_gh; int error; - int unlock = 0; - + gfs2_holder_mark_uninitialized(&i_gh); ip = GFS2_I(inode); if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { if (mask & MAY_NOT_BLOCK) @@ -1753,14 +1797,13 @@ int gfs2_permission(struct inode *inode, int mask) error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) return error; - unlock = 1; } if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) error = -EACCES; else error = generic_permission(inode, mask); - if (unlock) + if (gfs2_holder_initialized(&i_gh)) gfs2_glock_dq_uninit(&i_gh); return error; @@ -1932,17 +1975,16 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int error; - int unlock = 0; + gfs2_holder_mark_uninitialized(&gh); if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); if (error) return error; - unlock = 1; } generic_fillattr(inode, stat); - if (unlock) + if (gfs2_holder_initialized(&gh)) gfs2_glock_dq_uninit(&gh); return 0; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index e1af0d4aa308..7710dfd3af35 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -94,11 +94,11 @@ err: } extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, - u64 no_addr, u64 no_formal_ino); + u64 no_addr, u64 no_formal_ino, + unsigned int blktype); extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, u64 *no_formal_ino, unsigned int blktype); -extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); extern int gfs2_inode_refresh(struct gfs2_inode *ip); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 0ff028c15199..e58ccef09c91 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -657,7 +657,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) struct gfs2_log_header *lh; unsigned int tail; u32 hash; - int rw = WRITE_FLUSH_FUA | REQ_META; + int op_flags = WRITE_FLUSH_FUA | REQ_META; struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO); enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); lh = page_address(page); @@ -682,12 +682,12 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) { gfs2_ordered_wait(sdp); log_flush_wait(sdp); - rw = WRITE_SYNC | REQ_META | REQ_PRIO; + op_flags = WRITE_SYNC | REQ_META | REQ_PRIO; } sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); gfs2_log_write_page(sdp, page); - gfs2_log_flush_bio(sdp, rw); + gfs2_log_flush_bio(sdp, REQ_OP_WRITE, op_flags); log_flush_wait(sdp); if (sdp->sd_log_tail != tail) @@ -738,7 +738,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, gfs2_ordered_write(sdp); lops_before_commit(sdp, tr); - gfs2_log_flush_bio(sdp, WRITE); + gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0); if (sdp->sd_log_head != sdp->sd_log_flush_head) { log_flush_wait(sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index d5369a109781..49d5a1b61b06 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -230,17 +230,19 @@ static void gfs2_end_log_write(struct bio *bio) /** * gfs2_log_flush_bio - Submit any pending log bio * @sdp: The superblock - * @rw: The rw flags + * @op: REQ_OP + * @op_flags: rq_flag_bits * * Submit any pending part-built or full bio to the block device. If * there is no pending bio, then this is a no-op. */ -void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw) +void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int op, int op_flags) { if (sdp->sd_log_bio) { atomic_inc(&sdp->sd_log_in_flight); - submit_bio(rw, sdp->sd_log_bio); + bio_set_op_attrs(sdp->sd_log_bio, op, op_flags); + submit_bio(sdp->sd_log_bio); sdp->sd_log_bio = NULL; } } @@ -299,7 +301,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno) nblk >>= sdp->sd_fsb2bb_shift; if (blkno == nblk) return bio; - gfs2_log_flush_bio(sdp, WRITE); + gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0); } return gfs2_log_alloc_bio(sdp, blkno); @@ -328,7 +330,7 @@ static void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, bio = gfs2_log_get_bio(sdp, blkno); ret = bio_add_page(bio, page, size, offset); if (ret == 0) { - gfs2_log_flush_bio(sdp, WRITE); + gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0); bio = gfs2_log_alloc_bio(sdp, blkno); ret = bio_add_page(bio, page, size, offset); WARN_ON(ret == 0); @@ -535,9 +537,9 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA) return 0; - gfs2_replay_incr_blk(sdp, &start); + gfs2_replay_incr_blk(jd, &start); - for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { + for (; blks; gfs2_replay_incr_blk(jd, &start), blks--) { blkno = be64_to_cpu(*ptr++); jd->jd_found_blocks++; @@ -693,7 +695,7 @@ static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, offset = sizeof(struct gfs2_log_descriptor); - for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { + for (; blks; gfs2_replay_incr_blk(jd, &start), blks--) { error = gfs2_replay_read_block(jd, start, &bh); if (error) return error; @@ -762,7 +764,6 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, __be64 *ptr, int pass) { struct gfs2_inode *ip = GFS2_I(jd->jd_inode); - struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct gfs2_glock *gl = ip->i_gl; unsigned int blks = be32_to_cpu(ld->ld_data1); struct buffer_head *bh_log, *bh_ip; @@ -773,8 +774,8 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA) return 0; - gfs2_replay_incr_blk(sdp, &start); - for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { + gfs2_replay_incr_blk(jd, &start); + for (; blks; gfs2_replay_incr_blk(jd, &start), blks--) { blkno = be64_to_cpu(*ptr++); esc = be64_to_cpu(*ptr++); diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index a65a7ba32ffd..e529f536c117 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -27,7 +27,7 @@ extern const struct gfs2_log_operations gfs2_databuf_lops; extern const struct gfs2_log_operations *gfs2_log_ops[]; extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page); -extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw); +extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int op, int op_flags); extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); static inline unsigned int buf_limit(struct gfs2_sbd *sdp) diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index f99f8e94de3f..74fd0139e6c2 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -45,6 +45,7 @@ static void gfs2_init_inode_once(void *foo) memset(&ip->i_res, 0, sizeof(ip->i_res)); RB_CLEAR_NODE(&ip->i_res.rs_node); ip->i_hash_cache = NULL; + gfs2_holder_mark_uninitialized(&ip->i_iopen_gh); } static void gfs2_init_glock_once(void *foo) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 8eaadabbc771..950b8be68e41 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -37,8 +37,8 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb { struct buffer_head *bh, *head; int nr_underway = 0; - int write_op = REQ_META | REQ_PRIO | - (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + int write_flags = REQ_META | REQ_PRIO | + (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0); BUG_ON(!PageLocked(page)); BUG_ON(!page_has_buffers(page)); @@ -79,7 +79,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh(write_op, bh); + submit_bh(REQ_OP_WRITE, write_flags, bh); nr_underway++; } bh = next; @@ -213,7 +213,8 @@ static void gfs2_meta_read_endio(struct bio *bio) * Submit several consecutive buffer head I/O requests as a single bio I/O * request. (See submit_bh_wbc.) */ -static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num) +static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[], + int num) { struct buffer_head *bh = bhs[0]; struct bio *bio; @@ -230,7 +231,8 @@ static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num) bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); } bio->bi_end_io = gfs2_meta_read_endio; - submit_bio(rw, bio); + bio_set_op_attrs(bio, op, op_flags); + submit_bio(bio); } /** @@ -280,7 +282,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, } } - gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num); + gfs2_submit_bhs(REQ_OP_READ, READ_SYNC | REQ_META | REQ_PRIO, bhs, num); if (!(flags & DIO_WAIT)) return 0; @@ -448,7 +450,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh); + ll_rw_block(REQ_OP_READ, READ_SYNC | REQ_META, 1, &first_bh); dblock++; extlen--; @@ -457,7 +459,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) bh = gfs2_getbuf(gl, dblock, CREATE); if (!buffer_uptodate(bh) && !buffer_locked(bh)) - ll_rw_block(READA | REQ_META, 1, &bh); + ll_rw_block(REQ_OP_READ, REQ_RAHEAD | REQ_META, 1, &bh); brelse(bh); dblock++; extlen--; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 45463600fb81..ef1e1822977f 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -246,7 +246,8 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) bio->bi_end_io = end_bio_io_page; bio->bi_private = page; - submit_bio(READ_SYNC | REQ_META, bio); + bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC | REQ_META); + submit_bio(bio); wait_on_page_locked(page); bio_put(bio); if (!PageUptodate(page)) { @@ -454,7 +455,8 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr, struct dentry *dentry; struct inode *inode; - inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0); + inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, + GFS2_BLKST_FREE /* ignore */); if (IS_ERR(inode)) { fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode)); return PTR_ERR(inode); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index ce7d69a2fdc0..77930ca25303 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -730,7 +730,7 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, if (PageUptodate(page)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { - ll_rw_block(READ | REQ_META, 1, &bh); + ll_rw_block(REQ_OP_READ, REQ_META, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) goto unlock_out; @@ -883,7 +883,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), &data_blocks, &ind_blocks); - ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_NOFS); + ghs = kmalloc(num_qd * sizeof(struct gfs2_holder), GFP_NOFS); if (!ghs) return -ENOMEM; diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 1b645773c98e..113b6095a58d 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -338,7 +338,7 @@ static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start, struct gfs2_log_header_host lh; error = get_log_header(jd, start, &lh); if (!error) { - gfs2_replay_incr_blk(sdp, &start); + gfs2_replay_incr_blk(jd, &start); brelse(bh); continue; } @@ -360,7 +360,7 @@ static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start, } while (length--) - gfs2_replay_incr_blk(sdp, &start); + gfs2_replay_incr_blk(jd, &start); brelse(bh); } @@ -390,7 +390,7 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; lblock = head->lh_blkno; - gfs2_replay_incr_blk(sdp, &lblock); + gfs2_replay_incr_blk(jd, &lblock); bh_map.b_size = 1 << ip->i_inode.i_blkbits; error = gfs2_block_map(&ip->i_inode, lblock, &bh_map, 0); if (error) diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index 6142836cce96..11fdfab4bf99 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -14,9 +14,9 @@ extern struct workqueue_struct *gfs_recovery_wq; -static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk) +static inline void gfs2_replay_incr_blk(struct gfs2_jdesc *jd, unsigned int *blk) { - if (++*blk == sdp->sd_jdesc->jd_blocks) + if (++*blk == jd->jd_blocks) *blk = 0; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 5bd216901e89..86ccc0159393 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -658,6 +658,7 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs) if (rgd) { spin_lock(&rgd->rd_rsspin); __rs_deltree(rs); + BUG_ON(rs->rs_free); spin_unlock(&rgd->rd_rsspin); } } @@ -671,10 +672,8 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs) void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount) { down_write(&ip->i_rw_mutex); - if ((wcount == NULL) || (atomic_read(wcount) <= 1)) { + if ((wcount == NULL) || (atomic_read(wcount) <= 1)) gfs2_rs_deltree(&ip->i_res); - BUG_ON(ip->i_res.rs_free); - } up_write(&ip->i_rw_mutex); gfs2_qa_delete(ip, wcount); } @@ -722,6 +721,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) gfs2_free_clones(rgd); kfree(rgd->rd_bits); + rgd->rd_bits = NULL; return_all_reservations(rgd); kmem_cache_free(gfs2_rgrpd_cachep, rgd); } @@ -916,9 +916,6 @@ static int read_rindex_entry(struct gfs2_inode *ip) if (error) goto fail; - rgd->rd_gl->gl_object = rgd; - rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK; - rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1; rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED); if (rgd->rd_data > sdp->sd_max_rg_data) @@ -926,14 +923,20 @@ static int read_rindex_entry(struct gfs2_inode *ip) spin_lock(&sdp->sd_rindex_spin); error = rgd_insert(rgd); spin_unlock(&sdp->sd_rindex_spin); - if (!error) + if (!error) { + rgd->rd_gl->gl_object = rgd; + rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK; + rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr + + rgd->rd_length) * bsize) - 1; return 0; + } error = 0; /* someone else read in the rgrp; free it and ignore it */ gfs2_glock_put(rgd->rd_gl); fail: kfree(rgd->rd_bits); + rgd->rd_bits = NULL; kmem_cache_free(gfs2_rgrpd_cachep, rgd); return error; } @@ -2096,7 +2099,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip) { struct gfs2_blkreserv *rs = &ip->i_res; - if (rs->rs_rgd_gh.gh_gl) + if (gfs2_holder_initialized(&rs->rs_rgd_gh)) gfs2_glock_dq_uninit(&rs->rs_rgd_gh); } @@ -2596,7 +2599,7 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state) { unsigned int x; - rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder), + rlist->rl_ghs = kmalloc(rlist->rl_rgrps * sizeof(struct gfs2_holder), GFP_NOFS | __GFP_NOFAIL); for (x = 0; x < rlist->rl_rgrps; x++) gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 9b2ff353e45f..3a7e60bb39f8 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -855,7 +855,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) wait_event(sdp->sd_reserving_log_wait, atomic_read(&sdp->sd_reserving_log) == 0); gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks); - if (freeze_gh.gh_gl) + if (gfs2_holder_initialized(&freeze_gh)) gfs2_glock_dq_uninit(&freeze_gh); gfs2_quota_cleanup(sdp); @@ -1033,7 +1033,7 @@ static int gfs2_unfreeze(struct super_block *sb) mutex_lock(&sdp->sd_freeze_mutex); if (atomic_read(&sdp->sd_freeze_state) != SFS_FROZEN || - sdp->sd_freeze_gh.gh_gl == NULL) { + !gfs2_holder_initialized(&sdp->sd_freeze_gh)) { mutex_unlock(&sdp->sd_freeze_mutex); return 0; } @@ -1084,9 +1084,11 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host int error = 0, err; memset(sc, 0, sizeof(struct gfs2_statfs_change_host)); - gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL); + gha = kmalloc(slots * sizeof(struct gfs2_holder), GFP_KERNEL); if (!gha) return -ENOMEM; + for (x = 0; x < slots; x++) + gfs2_holder_mark_uninitialized(gha + x); rgd_next = gfs2_rgrpd_get_first(sdp); @@ -1096,7 +1098,7 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host for (x = 0; x < slots; x++) { gh = gha + x; - if (gh->gh_gl && gfs2_glock_poll(gh)) { + if (gfs2_holder_initialized(gh) && gfs2_glock_poll(gh)) { err = gfs2_glock_wait(gh); if (err) { gfs2_holder_uninit(gh); @@ -1109,7 +1111,7 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host } } - if (gh->gh_gl) + if (gfs2_holder_initialized(gh)) done = 0; else if (rgd_next && !error) { error = gfs2_glock_nq_init(rgd_next->rd_gl, @@ -1304,9 +1306,11 @@ static int gfs2_drop_inode(struct inode *inode) { struct gfs2_inode *ip = GFS2_I(inode); - if (!test_bit(GIF_FREE_VFS_INODE, &ip->i_flags) && inode->i_nlink) { + if (!test_bit(GIF_FREE_VFS_INODE, &ip->i_flags) && + inode->i_nlink && + gfs2_holder_initialized(&ip->i_iopen_gh)) { struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; - if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) + if (test_bit(GLF_DEMOTE, &gl->gl_flags)) clear_nlink(inode); } return generic_drop_inode(inode); @@ -1551,7 +1555,7 @@ static void gfs2_evict_inode(struct inode *inode) goto out_truncate; } - if (ip->i_iopen_gh.gh_gl && + if (gfs2_holder_initialized(&ip->i_iopen_gh) && test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_wait(&ip->i_iopen_gh); @@ -1610,7 +1614,7 @@ out_unlock: if (gfs2_rs_active(&ip->i_res)) gfs2_rs_deltree(&ip->i_res); - if (ip->i_iopen_gh.gh_gl) { + if (gfs2_holder_initialized(&ip->i_iopen_gh)) { if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_wait(&ip->i_iopen_gh); @@ -1632,7 +1636,7 @@ out: gfs2_glock_add_to_lru(ip->i_gl); gfs2_glock_put(ip->i_gl); ip->i_gl = NULL; - if (ip->i_iopen_gh.gh_gl) { + if (gfs2_holder_initialized(&ip->i_iopen_gh)) { ip->i_iopen_gh.gh_gl->gl_object = NULL; ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_wait(&ip->i_iopen_gh); diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index fdc3446d934a..047245bd2cd6 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -526,7 +526,7 @@ int hfsplus_compare_dentry(const struct dentry *parent, /* wrapper.c */ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf, - void **data, int rw); + void **data, int op, int op_flags); int hfsplus_read_wrapper(struct super_block *sb); /* time macros */ diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c index eb355d81e279..63164ebc52fa 100644 --- a/fs/hfsplus/part_tbl.c +++ b/fs/hfsplus/part_tbl.c @@ -112,7 +112,8 @@ static int hfs_parse_new_pmap(struct super_block *sb, void *buf, if ((u8 *)pm - (u8 *)buf >= buf_size) { res = hfsplus_submit_bio(sb, *part_start + HFS_PMAP_BLK + i, - buf, (void **)&pm, READ); + buf, (void **)&pm, REQ_OP_READ, + 0); if (res) return res; } @@ -136,7 +137,7 @@ int hfs_part_find(struct super_block *sb, return -ENOMEM; res = hfsplus_submit_bio(sb, *part_start + HFS_PMAP_BLK, - buf, &data, READ); + buf, &data, REQ_OP_READ, 0); if (res) goto out; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 755bf30ba1ce..11854dd84572 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -220,7 +220,8 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) error2 = hfsplus_submit_bio(sb, sbi->part_start + HFSPLUS_VOLHEAD_SECTOR, - sbi->s_vhdr_buf, NULL, WRITE_SYNC); + sbi->s_vhdr_buf, NULL, REQ_OP_WRITE, + WRITE_SYNC); if (!error) error = error2; if (!write_backup) @@ -228,7 +229,8 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) error2 = hfsplus_submit_bio(sb, sbi->part_start + sbi->sect_count - 2, - sbi->s_backup_vhdr_buf, NULL, WRITE_SYNC); + sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE, + WRITE_SYNC); if (!error) error2 = error; out: diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index cc6235671437..ebb85e5f6549 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -30,7 +30,8 @@ struct hfsplus_wd { * @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes * @buf: buffer for I/O * @data: output pointer for location of requested data - * @rw: direction of I/O + * @op: direction of I/O + * @op_flags: request op flags * * The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than * HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads @@ -44,7 +45,7 @@ struct hfsplus_wd { * will work correctly. */ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, - void *buf, void **data, int rw) + void *buf, void **data, int op, int op_flags) { struct bio *bio; int ret = 0; @@ -65,8 +66,9 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, bio = bio_alloc(GFP_NOIO, 1); bio->bi_iter.bi_sector = sector; bio->bi_bdev = sb->s_bdev; + bio_set_op_attrs(bio, op, op_flags); - if (!(rw & WRITE) && data) + if (op != WRITE && data) *data = (u8 *)buf + offset; while (io_size > 0) { @@ -83,7 +85,7 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, buf = (u8 *)buf + len; } - ret = submit_bio_wait(rw, bio); + ret = submit_bio_wait(bio); out: bio_put(bio); return ret < 0 ? ret : 0; @@ -181,7 +183,7 @@ int hfsplus_read_wrapper(struct super_block *sb) reread: error = hfsplus_submit_bio(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, sbi->s_vhdr_buf, (void **)&sbi->s_vhdr, - READ); + REQ_OP_READ, 0); if (error) goto out_free_backup_vhdr; @@ -213,7 +215,8 @@ reread: error = hfsplus_submit_bio(sb, part_start + part_size - 2, sbi->s_backup_vhdr_buf, - (void **)&sbi->s_backup_vhdr, READ); + (void **)&sbi->s_backup_vhdr, REQ_OP_READ, + 0); if (error) goto out_free_backup_vhdr; diff --git a/fs/inode.c b/fs/inode.c index 4ccbc21b30ce..e171f7b5f9e4 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -365,6 +365,7 @@ void inode_init_once(struct inode *inode) INIT_HLIST_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_io_list); + INIT_LIST_HEAD(&inode->i_wb_list); INIT_LIST_HEAD(&inode->i_lru); address_space_init_once(&inode->i_data); i_size_ordered_init(inode); @@ -507,6 +508,7 @@ void clear_inode(struct inode *inode) BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); + BUG_ON(!list_empty(&inode->i_wb_list)); /* don't need i_lock here, no concurrent mods to i_state */ inode->i_state = I_FREEING | I_CLEAR; } diff --git a/fs/internal.h b/fs/internal.h index c0c6f493ab8a..cef0913e5d41 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -133,6 +133,7 @@ extern int invalidate_inodes(struct super_block *, bool); extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); extern int d_set_mounted(struct dentry *dentry); extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc); +extern struct dentry *d_alloc_cursor(struct dentry *); /* * read_write.c diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index 2e4e834d1a98..2ce5b75ee9a5 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -81,7 +81,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, blocknum = block_start >> bufshift; memset(bhs, 0, (needblocks + 1) * sizeof(struct buffer_head *)); haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks); - ll_rw_block(READ, haveblocks, bhs); + ll_rw_block(REQ_OP_READ, 0, haveblocks, bhs); curbh = 0; curpage = 0; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 70078096117d..5bb565f9989c 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -124,7 +124,7 @@ static int journal_submit_commit_record(journal_t *journal, struct commit_header *tmp; struct buffer_head *bh; int ret; - struct timespec now = current_kernel_time(); + struct timespec64 now = current_kernel_time64(); *cbh = NULL; @@ -155,9 +155,9 @@ static int journal_submit_commit_record(journal_t *journal, if (journal->j_flags & JBD2_BARRIER && !jbd2_has_feature_async_commit(journal)) - ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh); + ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC | WRITE_FLUSH_FUA, bh); else - ret = submit_bh(WRITE_SYNC, bh); + ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh); *cbh = bh; return ret; @@ -718,7 +718,7 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(WRITE_SYNC, bh); + submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh); } cond_resched(); stats.run.rs_blocks_logged += bufs; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index b31852f76f46..46261a6f902d 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -691,6 +691,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) { int err = 0; + jbd2_might_wait_for_commit(journal); read_lock(&journal->j_state_lock); #ifdef CONFIG_JBD2_DEBUG if (!tid_geq(journal->j_commit_request, tid)) { @@ -1091,6 +1092,7 @@ static void jbd2_stats_proc_exit(journal_t *journal) static journal_t * journal_init_common (void) { + static struct lock_class_key jbd2_trans_commit_key; journal_t *journal; int err; @@ -1126,6 +1128,9 @@ static journal_t * journal_init_common (void) spin_lock_init(&journal->j_history_lock); + lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle", + &jbd2_trans_commit_key, 0); + return journal; } @@ -1346,15 +1351,15 @@ static int journal_reset(journal_t *journal) return jbd2_journal_start_thread(journal); } -static int jbd2_write_superblock(journal_t *journal, int write_op) +static int jbd2_write_superblock(journal_t *journal, int write_flags) { struct buffer_head *bh = journal->j_sb_buffer; journal_superblock_t *sb = journal->j_superblock; int ret; - trace_jbd2_write_superblock(journal, write_op); + trace_jbd2_write_superblock(journal, write_flags); if (!(journal->j_flags & JBD2_BARRIER)) - write_op &= ~(REQ_FUA | REQ_FLUSH); + write_flags &= ~(REQ_FUA | REQ_PREFLUSH); lock_buffer(bh); if (buffer_write_io_error(bh)) { /* @@ -1374,7 +1379,7 @@ static int jbd2_write_superblock(journal_t *journal, int write_op) jbd2_superblock_csum_set(journal, sb); get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(write_op, bh); + ret = submit_bh(REQ_OP_WRITE, write_flags, bh); wait_on_buffer(bh); if (buffer_write_io_error(bh)) { clear_buffer_write_io_error(bh); @@ -1498,7 +1503,7 @@ static int journal_get_superblock(journal_t *journal) J_ASSERT(bh != NULL); if (!buffer_uptodate(bh)) { - ll_rw_block(READ, 1, &bh); + ll_rw_block(REQ_OP_READ, 0, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { printk(KERN_ERR @@ -2329,18 +2334,10 @@ void *jbd2_alloc(size_t size, gfp_t flags) BUG_ON(size & (size-1)); /* Must be a power of 2 */ - flags |= __GFP_REPEAT; - if (size == PAGE_SIZE) - ptr = (void *)__get_free_pages(flags, 0); - else if (size > PAGE_SIZE) { - int order = get_order(size); - - if (order < 3) - ptr = (void *)__get_free_pages(flags, order); - else - ptr = vmalloc(size); - } else + if (size < PAGE_SIZE) ptr = kmem_cache_alloc(get_slab(size), flags); + else + ptr = (void *)__get_free_pages(flags, get_order(size)); /* Check alignment; SLUB has gotten this wrong in the past, * and this can lead to user data corruption! */ @@ -2351,20 +2348,10 @@ void *jbd2_alloc(size_t size, gfp_t flags) void jbd2_free(void *ptr, size_t size) { - if (size == PAGE_SIZE) { - free_pages((unsigned long)ptr, 0); - return; - } - if (size > PAGE_SIZE) { - int order = get_order(size); - - if (order < 3) - free_pages((unsigned long)ptr, order); - else - vfree(ptr); - return; - } - kmem_cache_free(get_slab(size), ptr); + if (size < PAGE_SIZE) + kmem_cache_free(get_slab(size), ptr); + else + free_pages((unsigned long)ptr, get_order(size)); }; /* diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 805bc6bcd8ab..02dd3360cb20 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -104,7 +104,7 @@ static int do_readahead(journal_t *journal, unsigned int start) if (!buffer_uptodate(bh) && !buffer_locked(bh)) { bufs[nbufs++] = bh; if (nbufs == MAXBUF) { - ll_rw_block(READ, nbufs, bufs); + ll_rw_block(REQ_OP_READ, 0, nbufs, bufs); journal_brelse_array(bufs, nbufs); nbufs = 0; } @@ -113,7 +113,7 @@ static int do_readahead(journal_t *journal, unsigned int start) } if (nbufs) - ll_rw_block(READ, nbufs, bufs); + ll_rw_block(REQ_OP_READ, 0, nbufs, bufs); err = 0; failed: diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 1749519b362f..b5bc3e249163 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -182,6 +182,8 @@ static int add_transaction_credits(journal_t *journal, int blocks, int needed; int total = blocks + rsv_blocks; + jbd2_might_wait_for_commit(journal); + /* * If the current transaction is locked down for commit, wait * for the lock to be released. @@ -382,13 +384,11 @@ repeat: read_unlock(&journal->j_state_lock); current->journal_info = handle; - lock_map_acquire(&handle->h_lockdep_map); + rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_); jbd2_journal_free_transaction(new_transaction); return 0; } -static struct lock_class_key jbd2_handle_key; - /* Allocate a new handle. This should probably be in a slab... */ static handle_t *new_handle(int nblocks) { @@ -398,9 +398,6 @@ static handle_t *new_handle(int nblocks) handle->h_buffer_credits = nblocks; handle->h_ref = 1; - lockdep_init_map(&handle->h_lockdep_map, "jbd2_handle", - &jbd2_handle_key, 0); - return handle; } @@ -672,7 +669,7 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) if (need_to_start) jbd2_log_start_commit(journal, tid); - lock_map_release(&handle->h_lockdep_map); + rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_); handle->h_buffer_credits = nblocks; ret = start_this_handle(journal, handle, gfp_mask); return ret; @@ -700,6 +697,8 @@ void jbd2_journal_lock_updates(journal_t *journal) { DEFINE_WAIT(wait); + jbd2_might_wait_for_commit(journal); + write_lock(&journal->j_state_lock); ++journal->j_barrier_count; @@ -1750,11 +1749,11 @@ int jbd2_journal_stop(handle_t *handle) wake_up(&journal->j_wait_transaction_locked); } + rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_); + if (wait_for_commit) err = jbd2_log_wait_commit(journal, tid); - lock_map_release(&handle->h_lockdep_map); - if (handle->h_rsv_handle) jbd2_journal_free_reserved(handle->h_rsv_handle); free_and_exit: diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 63759d723920..a74752146ec9 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2002,12 +2002,13 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio->bi_end_io = lbmIODone; bio->bi_private = bp; + bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC); /*check if journaling to disk has been disabled*/ if (log->no_integrity) { bio->bi_iter.bi_size = 0; lbmIODone(bio); } else { - submit_bio(READ_SYNC, bio); + submit_bio(bio); } wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD)); @@ -2145,13 +2146,14 @@ static void lbmStartIO(struct lbuf * bp) bio->bi_end_io = lbmIODone; bio->bi_private = bp; + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); /* check if journaling to disk has been disabled */ if (log->no_integrity) { bio->bi_iter.bi_size = 0; lbmIODone(bio); } else { - submit_bio(WRITE_SYNC, bio); + submit_bio(bio); INCREMENT(lmStat.submitted); } } diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index b60e015cc757..e7fa9e513040 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -411,7 +411,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) inc_io(page); if (!bio->bi_iter.bi_size) goto dump_bio; - submit_bio(WRITE, bio); + submit_bio(bio); nr_underway++; bio = NULL; } else @@ -434,6 +434,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_write_end_io; bio->bi_private = page; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); /* Don't call bio_add_page yet, we may add to this vec */ bio_offset = offset; @@ -448,7 +449,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) if (!bio->bi_iter.bi_size) goto dump_bio; - submit_bio(WRITE, bio); + submit_bio(bio); nr_underway++; } if (redirty) @@ -506,7 +507,7 @@ static int metapage_readpage(struct file *fp, struct page *page) insert_metapage(page, NULL); inc_io(page); if (bio) - submit_bio(READ, bio); + submit_bio(bio); bio = bio_alloc(GFP_NOFS, 1); bio->bi_bdev = inode->i_sb->s_bdev; @@ -514,6 +515,7 @@ static int metapage_readpage(struct file *fp, struct page *page) pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_read_end_io; bio->bi_private = page; + bio_set_op_attrs(bio, REQ_OP_READ, 0); len = xlen << inode->i_blkbits; offset = block_offset << inode->i_blkbits; if (bio_add_page(bio, page, len, offset) < len) @@ -523,7 +525,7 @@ static int metapage_readpage(struct file *fp, struct page *page) block_offset++; } if (bio) - submit_bio(READ, bio); + submit_bio(bio); else unlock_page(page); diff --git a/fs/libfs.c b/fs/libfs.c index 3db2721144c2..74dc8b9e7f53 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -71,9 +71,7 @@ EXPORT_SYMBOL(simple_lookup); int dcache_dir_open(struct inode *inode, struct file *file) { - static struct qstr cursor_name = QSTR_INIT(".", 1); - - file->private_data = d_alloc(file->f_path.dentry, &cursor_name); + file->private_data = d_alloc_cursor(file->f_path.dentry); return file->private_data ? 0 : -ENOMEM; } @@ -86,6 +84,61 @@ int dcache_dir_close(struct inode *inode, struct file *file) } EXPORT_SYMBOL(dcache_dir_close); +/* parent is locked at least shared */ +static struct dentry *next_positive(struct dentry *parent, + struct list_head *from, + int count) +{ + unsigned *seq = &parent->d_inode->i_dir_seq, n; + struct dentry *res; + struct list_head *p; + bool skipped; + int i; + +retry: + i = count; + skipped = false; + n = smp_load_acquire(seq) & ~1; + res = NULL; + rcu_read_lock(); + for (p = from->next; p != &parent->d_subdirs; p = p->next) { + struct dentry *d = list_entry(p, struct dentry, d_child); + if (!simple_positive(d)) { + skipped = true; + } else if (!--i) { + res = d; + break; + } + } + rcu_read_unlock(); + if (skipped) { + smp_rmb(); + if (unlikely(*seq != n)) + goto retry; + } + return res; +} + +static void move_cursor(struct dentry *cursor, struct list_head *after) +{ + struct dentry *parent = cursor->d_parent; + unsigned n, *seq = &parent->d_inode->i_dir_seq; + spin_lock(&parent->d_lock); + for (;;) { + n = *seq; + if (!(n & 1) && cmpxchg(seq, n, n + 1) == n) + break; + cpu_relax(); + } + __list_del(cursor->d_child.prev, cursor->d_child.next); + if (after) + list_add(&cursor->d_child, after); + else + list_add_tail(&cursor->d_child, &parent->d_subdirs); + smp_store_release(seq, n + 2); + spin_unlock(&parent->d_lock); +} + loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry *dentry = file->f_path.dentry; @@ -101,25 +154,14 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) if (offset != file->f_pos) { file->f_pos = offset; if (file->f_pos >= 2) { - struct list_head *p; struct dentry *cursor = file->private_data; + struct dentry *to; loff_t n = file->f_pos - 2; - spin_lock(&dentry->d_lock); - /* d_lock not required for cursor */ - list_del(&cursor->d_child); - p = dentry->d_subdirs.next; - while (n && p != &dentry->d_subdirs) { - struct dentry *next; - next = list_entry(p, struct dentry, d_child); - spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); - if (simple_positive(next)) - n--; - spin_unlock(&next->d_lock); - p = p->next; - } - list_add_tail(&cursor->d_child, p); - spin_unlock(&dentry->d_lock); + inode_lock_shared(dentry->d_inode); + to = next_positive(dentry, &dentry->d_subdirs, n); + move_cursor(cursor, to ? &to->d_child : NULL); + inode_unlock_shared(dentry->d_inode); } } return offset; @@ -142,36 +184,25 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct dentry *cursor = file->private_data; - struct list_head *p, *q = &cursor->d_child; + struct list_head *p = &cursor->d_child; + struct dentry *next; + bool moved = false; if (!dir_emit_dots(file, ctx)) return 0; - spin_lock(&dentry->d_lock); - if (ctx->pos == 2) - list_move(q, &dentry->d_subdirs); - for (p = q->next; p != &dentry->d_subdirs; p = p->next) { - struct dentry *next = list_entry(p, struct dentry, d_child); - spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); - if (!simple_positive(next)) { - spin_unlock(&next->d_lock); - continue; - } - - spin_unlock(&next->d_lock); - spin_unlock(&dentry->d_lock); + if (ctx->pos == 2) + p = &dentry->d_subdirs; + while ((next = next_positive(dentry, p, 1)) != NULL) { if (!dir_emit(ctx, next->d_name.name, next->d_name.len, d_inode(next)->i_ino, dt_type(d_inode(next)))) - return 0; - spin_lock(&dentry->d_lock); - spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); - /* next is still alive */ - list_move(q, p); - spin_unlock(&next->d_lock); - p = q; + break; + moved = true; + p = &next->d_child; ctx->pos++; } - spin_unlock(&dentry->d_lock); + if (moved) + move_cursor(cursor, p); return 0; } EXPORT_SYMBOL(dcache_readdir); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 154a107cd376..fc4084ef4736 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -335,12 +335,17 @@ static struct notifier_block lockd_inet6addr_notifier = { }; #endif -static void lockd_svc_exit_thread(void) +static void lockd_unregister_notifiers(void) { unregister_inetaddr_notifier(&lockd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&lockd_inet6addr_notifier); #endif +} + +static void lockd_svc_exit_thread(void) +{ + lockd_unregister_notifiers(); svc_exit_thread(nlmsvc_rqst); } @@ -462,7 +467,7 @@ int lockd_up(struct net *net) * Note: svc_serv structures have an initial use count of 1, * so we exit through here on both success and failure. */ -err_net: +err_put: svc_destroy(serv); err_create: mutex_unlock(&nlmsvc_mutex); @@ -470,7 +475,9 @@ err_create: err_start: lockd_down_net(serv, net); - goto err_net; +err_net: + lockd_unregister_notifiers(); + goto err_put; } EXPORT_SYMBOL_GPL(lockd_up); diff --git a/fs/locks.c b/fs/locks.c index 7c5f91be9b65..ee1b15f6fc13 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1628,7 +1628,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr { struct file_lock *fl, *my_fl = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = file_inode(filp); struct file_lock_context *ctx; bool is_deleg = (*flp)->fl_flags & FL_DELEG; int error; diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index cc26f8f215f5..a8329cc47dec 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -14,7 +14,7 @@ #define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) -static int sync_request(struct page *page, struct block_device *bdev, int rw) +static int sync_request(struct page *page, struct block_device *bdev, int op) { struct bio bio; struct bio_vec bio_vec; @@ -29,8 +29,9 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw) bio.bi_bdev = bdev; bio.bi_iter.bi_sector = page->index * (PAGE_SIZE >> 9); bio.bi_iter.bi_size = PAGE_SIZE; + bio_set_op_attrs(&bio, op, 0); - return submit_bio_wait(rw, &bio); + return submit_bio_wait(&bio); } static int bdev_readpage(void *_sb, struct page *page) @@ -95,8 +96,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = writeseg_end_io; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); atomic_inc(&super->s_pending_writes); - submit_bio(WRITE, bio); + submit_bio(bio); ofs += i * PAGE_SIZE; index += i; @@ -122,8 +124,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = writeseg_end_io; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); atomic_inc(&super->s_pending_writes); - submit_bio(WRITE, bio); + submit_bio(bio); return 0; } @@ -185,8 +188,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = erase_end_io; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); atomic_inc(&super->s_pending_writes); - submit_bio(WRITE, bio); + submit_bio(bio); ofs += i * PAGE_SIZE; index += i; @@ -206,8 +210,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = erase_end_io; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); atomic_inc(&super->s_pending_writes); - submit_bio(WRITE, bio); + submit_bio(bio); return 0; } diff --git a/fs/mpage.c b/fs/mpage.c index eedc644b78d7..2ca1f39c8cba 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -56,11 +56,12 @@ static void mpage_end_io(struct bio *bio) bio_put(bio); } -static struct bio *mpage_bio_submit(int rw, struct bio *bio) +static struct bio *mpage_bio_submit(int op, int op_flags, struct bio *bio) { bio->bi_end_io = mpage_end_io; - guard_bio_eod(rw, bio); - submit_bio(rw, bio); + bio_set_op_attrs(bio, op, op_flags); + guard_bio_eod(op, bio); + submit_bio(bio); return NULL; } @@ -71,6 +72,8 @@ mpage_alloc(struct block_device *bdev, { struct bio *bio; + /* Restrict the given (page cache) mask for slab allocations */ + gfp_flags &= GFP_KERNEL; bio = bio_alloc(gfp_flags, nr_vecs); if (bio == NULL && (current->flags & PF_MEMALLOC)) { @@ -269,7 +272,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, * This page will go to BIO. Do we need to send this BIO off first? */ if (bio && (*last_block_in_bio != blocks[0] - 1)) - bio = mpage_bio_submit(READ, bio); + bio = mpage_bio_submit(REQ_OP_READ, 0, bio); alloc_new: if (bio == NULL) { @@ -286,7 +289,7 @@ alloc_new: length = first_hole << blkbits; if (bio_add_page(bio, page, length, 0) < length) { - bio = mpage_bio_submit(READ, bio); + bio = mpage_bio_submit(REQ_OP_READ, 0, bio); goto alloc_new; } @@ -294,7 +297,7 @@ alloc_new: nblocks = map_bh->b_size >> blkbits; if ((buffer_boundary(map_bh) && relative_block == nblocks) || (first_hole != blocks_per_page)) - bio = mpage_bio_submit(READ, bio); + bio = mpage_bio_submit(REQ_OP_READ, 0, bio); else *last_block_in_bio = blocks[blocks_per_page - 1]; out: @@ -302,7 +305,7 @@ out: confused: if (bio) - bio = mpage_bio_submit(READ, bio); + bio = mpage_bio_submit(REQ_OP_READ, 0, bio); if (!PageUptodate(page)) block_read_full_page(page, get_block); else @@ -362,7 +365,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, sector_t last_block_in_bio = 0; struct buffer_head map_bh; unsigned long first_logical_block = 0; - gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); + gfp_t gfp = readahead_gfp_mask(mapping); map_bh.b_state = 0; map_bh.b_size = 0; @@ -384,7 +387,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, } BUG_ON(!list_empty(pages)); if (bio) - mpage_bio_submit(READ, bio); + mpage_bio_submit(REQ_OP_READ, 0, bio); return 0; } EXPORT_SYMBOL(mpage_readpages); @@ -405,7 +408,7 @@ int mpage_readpage(struct page *page, get_block_t get_block) bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, &map_bh, &first_logical_block, get_block, gfp); if (bio) - mpage_bio_submit(READ, bio); + mpage_bio_submit(REQ_OP_READ, 0, bio); return 0; } EXPORT_SYMBOL(mpage_readpage); @@ -486,7 +489,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; - int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0); if (page_has_buffers(page)) { struct buffer_head *head = page_buffers(page); @@ -595,7 +598,7 @@ page_is_mapped: * This page will go to BIO. Do we need to send this BIO off first? */ if (bio && mpd->last_block_in_bio != blocks[0] - 1) - bio = mpage_bio_submit(wr, bio); + bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); alloc_new: if (bio == NULL) { @@ -622,7 +625,7 @@ alloc_new: wbc_account_io(wbc, page, PAGE_SIZE); length = first_unmapped << blkbits; if (bio_add_page(bio, page, length, 0) < length) { - bio = mpage_bio_submit(wr, bio); + bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); goto alloc_new; } @@ -632,7 +635,7 @@ alloc_new: set_page_writeback(page); unlock_page(page); if (boundary || (first_unmapped != blocks_per_page)) { - bio = mpage_bio_submit(wr, bio); + bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); if (boundary_block) { write_boundary_block(boundary_bdev, boundary_block, 1 << blkbits); @@ -644,7 +647,7 @@ alloc_new: confused: if (bio) - bio = mpage_bio_submit(wr, bio); + bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); if (mpd->use_writepage) { ret = mapping->a_ops->writepage(page, wbc); @@ -701,9 +704,9 @@ mpage_writepages(struct address_space *mapping, ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); if (mpd.bio) { - int wr = (wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC : WRITE); - mpage_bio_submit(wr, mpd.bio); + int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC : 0); + mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio); } } blk_finish_plug(&plug); @@ -722,9 +725,9 @@ int mpage_writepage(struct page *page, get_block_t get_block, }; int ret = __mpage_writepage(page, wbc, &mpd); if (mpd.bio) { - int wr = (wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC : WRITE); - mpage_bio_submit(wr, mpd.bio); + int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC : 0); + mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio); } return ret; } diff --git a/fs/namei.c b/fs/namei.c index 4c4f95ac8aa5..70580ab1445c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1416,21 +1416,28 @@ static void follow_mount(struct path *path) } } +static int path_parent_directory(struct path *path) +{ + struct dentry *old = path->dentry; + /* rare case of legitimate dget_parent()... */ + path->dentry = dget_parent(path->dentry); + dput(old); + if (unlikely(!path_connected(path))) + return -ENOENT; + return 0; +} + static int follow_dotdot(struct nameidata *nd) { while(1) { - struct dentry *old = nd->path.dentry; - if (nd->path.dentry == nd->root.dentry && nd->path.mnt == nd->root.mnt) { break; } if (nd->path.dentry != nd->path.mnt->mnt_root) { - /* rare case of legitimate dget_parent()... */ - nd->path.dentry = dget_parent(nd->path.dentry); - dput(old); - if (unlikely(!path_connected(&nd->path))) - return -ENOENT; + int ret = path_parent_directory(&nd->path); + if (ret) + return ret; break; } if (!follow_up(&nd->path)) @@ -2514,6 +2521,34 @@ struct dentry *lookup_one_len_unlocked(const char *name, } EXPORT_SYMBOL(lookup_one_len_unlocked); +#ifdef CONFIG_UNIX98_PTYS +int path_pts(struct path *path) +{ + /* Find something mounted on "pts" in the same directory as + * the input path. + */ + struct dentry *child, *parent; + struct qstr this; + int ret; + + ret = path_parent_directory(path); + if (ret) + return ret; + + parent = path->dentry; + this.name = "pts"; + this.len = 3; + child = d_hash_and_lookup(parent, &this); + if (!child) + return -ENOENT; + + path->dentry = child; + dput(parent); + follow_mount(path); + return 0; +} +#endif + int user_path_at_empty(int dfd, const char __user *name, unsigned flags, struct path *path, int *empty) { @@ -2995,9 +3030,13 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, } if (*opened & FILE_CREATED) fsnotify_create(dir, dentry); - path->dentry = dentry; - path->mnt = nd->path.mnt; - return 1; + if (unlikely(d_is_negative(dentry))) { + error = -ENOENT; + } else { + path->dentry = dentry; + path->mnt = nd->path.mnt; + return 1; + } } } dput(dentry); @@ -3166,9 +3205,7 @@ static int do_last(struct nameidata *nd, int acc_mode = op->acc_mode; unsigned seq; struct inode *inode; - struct path save_parent = { .dentry = NULL, .mnt = NULL }; struct path path; - bool retried = false; int error; nd->flags &= ~LOOKUP_PARENT; @@ -3211,7 +3248,6 @@ static int do_last(struct nameidata *nd, return -EISDIR; } -retry_lookup: if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { error = mnt_want_write(nd->path.mnt); if (!error) @@ -3263,6 +3299,10 @@ retry_lookup: got_write = false; } + error = follow_managed(&path, nd); + if (unlikely(error < 0)) + return error; + if (unlikely(d_is_negative(path.dentry))) { path_to_nameidata(&path, nd); return -ENOENT; @@ -3278,10 +3318,6 @@ retry_lookup: return -EEXIST; } - error = follow_managed(&path, nd); - if (unlikely(error < 0)) - return error; - seq = 0; /* out of RCU mode, so the value doesn't matter */ inode = d_backing_inode(path.dentry); finish_lookup: @@ -3292,23 +3328,14 @@ finish_lookup: if (unlikely(error)) return error; - if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) { - path_to_nameidata(&path, nd); - } else { - save_parent.dentry = nd->path.dentry; - save_parent.mnt = mntget(path.mnt); - nd->path.dentry = path.dentry; - - } + path_to_nameidata(&path, nd); nd->inode = inode; nd->seq = seq; /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ finish_open: error = complete_walk(nd); - if (error) { - path_put(&save_parent); + if (error) return error; - } audit_inode(nd->name, nd->path.dentry, 0); error = -EISDIR; if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry)) @@ -3331,13 +3358,9 @@ finish_open_created: goto out; BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ error = vfs_open(&nd->path, file, current_cred()); - if (!error) { - *opened |= FILE_OPENED; - } else { - if (error == -EOPENSTALE) - goto stale_open; + if (error) goto out; - } + *opened |= FILE_OPENED; opened: error = open_check_o_direct(file); if (!error) @@ -3353,26 +3376,7 @@ out: } if (got_write) mnt_drop_write(nd->path.mnt); - path_put(&save_parent); return error; - -stale_open: - /* If no saved parent or already retried then can't retry */ - if (!save_parent.dentry || retried) - goto out; - - BUG_ON(save_parent.dentry != dir); - path_put(&nd->path); - nd->path = save_parent; - nd->inode = dir->d_inode; - save_parent.mnt = NULL; - save_parent.dentry = NULL; - if (got_write) { - mnt_drop_write(nd->path.mnt); - got_write = false; - } - retried = true; - goto retry_lookup; } static int do_tmpfile(struct nameidata *nd, unsigned flags, diff --git a/fs/namespace.c b/fs/namespace.c index 4fb1691b4355..419f746d851d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1562,6 +1562,7 @@ void __detach_mounts(struct dentry *dentry) goto out_unlock; lock_mount_hash(); + event++; while (!hlist_empty(&mp->m_list)) { mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); if (mnt->mnt.mnt_flags & MNT_UMOUNT) { @@ -2409,8 +2410,10 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; } if (type->fs_flags & FS_USERNS_VISIBLE) { - if (!fs_fully_visible(type, &mnt_flags)) + if (!fs_fully_visible(type, &mnt_flags)) { + put_filesystem(type); return -EPERM; + } } } @@ -3245,6 +3248,10 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) if (mnt->mnt.mnt_sb->s_iflags & SB_I_NOEXEC) mnt_flags &= ~(MNT_LOCK_NOSUID | MNT_LOCK_NOEXEC); + /* Don't miss readonly hidden in the superblock flags */ + if (mnt->mnt.mnt_sb->s_flags & MS_RDONLY) + mnt_flags |= MNT_LOCK_READONLY; + /* Verify the mount flags are equal to or more permissive * than the proposed new mount. */ @@ -3271,7 +3278,7 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { struct inode *inode = child->mnt_mountpoint->d_inode; /* Only worry about locked mounts */ - if (!(mnt_flags & MNT_LOCKED)) + if (!(child->mnt.mnt_flags & MNT_LOCKED)) continue; /* Is the directory permanetly empty? */ if (!is_empty_dir_inode(inode)) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 17a42e4eb872..f55a4e756047 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -102,14 +102,15 @@ static inline void put_parallel(struct parallel_io *p) } static struct bio * -bl_submit_bio(int rw, struct bio *bio) +bl_submit_bio(struct bio *bio) { if (bio) { get_parallel(bio->bi_private); dprintk("%s submitting %s bio %u@%llu\n", __func__, - rw == READ ? "read" : "write", bio->bi_iter.bi_size, + bio_op(bio) == READ ? "read" : "write", + bio->bi_iter.bi_size, (unsigned long long)bio->bi_iter.bi_sector); - submit_bio(rw, bio); + submit_bio(bio); } return NULL; } @@ -158,7 +159,7 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, if (disk_addr < map->start || disk_addr >= map->start + map->len) { if (!dev->map(dev, disk_addr, map)) return ERR_PTR(-EIO); - bio = bl_submit_bio(rw, bio); + bio = bl_submit_bio(bio); } disk_addr += map->disk_offset; disk_addr -= map->start; @@ -174,9 +175,10 @@ retry: disk_addr >> SECTOR_SHIFT, end_io, par); if (!bio) return ERR_PTR(-ENOMEM); + bio_set_op_attrs(bio, rw, 0); } if (bio_add_page(bio, page, *len, offset) < *len) { - bio = bl_submit_bio(rw, bio); + bio = bl_submit_bio(bio); goto retry; } return bio; @@ -252,7 +254,7 @@ bl_read_pagelist(struct nfs_pgio_header *header) for (i = pg_index; i < header->page_array.npages; i++) { if (extent_length <= 0) { /* We've used up the previous extent */ - bio = bl_submit_bio(READ, bio); + bio = bl_submit_bio(bio); /* Get the next one */ if (!ext_tree_lookup(bl, isect, &be, false)) { @@ -273,7 +275,7 @@ bl_read_pagelist(struct nfs_pgio_header *header) } if (is_hole(&be)) { - bio = bl_submit_bio(READ, bio); + bio = bl_submit_bio(bio); /* Fill hole w/ zeroes w/o accessing device */ dprintk("%s Zeroing page for hole\n", __func__); zero_user_segment(pages[i], pg_offset, pg_len); @@ -306,7 +308,7 @@ bl_read_pagelist(struct nfs_pgio_header *header) header->res.count = (isect << SECTOR_SHIFT) - header->args.offset; } out: - bl_submit_bio(READ, bio); + bl_submit_bio(bio); blk_finish_plug(&plug); put_parallel(par); return PNFS_ATTEMPTED; @@ -398,7 +400,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync) for (i = pg_index; i < header->page_array.npages; i++) { if (extent_length <= 0) { /* We've used up the previous extent */ - bio = bl_submit_bio(WRITE, bio); + bio = bl_submit_bio(bio); /* Get the next one */ if (!ext_tree_lookup(bl, isect, &be, true)) { header->pnfs_error = -EINVAL; @@ -427,7 +429,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync) header->res.count = header->args.count; out: - bl_submit_bio(WRITE, bio); + bl_submit_bio(bio); blk_finish_plug(&plug); put_parallel(par); return PNFS_ATTEMPTED; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index aaf7bd0cbae2..19d93d0cd400 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -424,12 +424,17 @@ static int xdr_decode(nfs_readdir_descriptor_t *desc, static int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) { + struct inode *inode; struct nfs_inode *nfsi; if (d_really_is_negative(dentry)) return 0; - nfsi = NFS_I(d_inode(dentry)); + inode = d_inode(dentry); + if (is_bad_inode(inode) || NFS_STALE(inode)) + return 0; + + nfsi = NFS_I(inode); if (entry->fattr->fileid == nfsi->fileid) return 1; if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0) @@ -1363,7 +1368,6 @@ EXPORT_SYMBOL_GPL(nfs_dentry_operations); struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { struct dentry *res; - struct dentry *parent; struct inode *inode = NULL; struct nfs_fh *fhandle = NULL; struct nfs_fattr *fattr = NULL; @@ -1393,7 +1397,6 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in if (IS_ERR(label)) goto out; - parent = dentry->d_parent; /* Protect against concurrent sillydeletes */ trace_nfs_lookup_enter(dir, dentry, flags); error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); @@ -1482,11 +1485,13 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned open_flags, umode_t mode, int *opened) { + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); struct nfs_open_context *ctx; struct dentry *res; struct iattr attr = { .ia_valid = ATTR_OPEN }; struct inode *inode; unsigned int lookup_flags = 0; + bool switched = false; int err; /* Expect a negative dentry */ @@ -1501,7 +1506,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, /* NFS only supports OPEN on regular files */ if ((open_flags & O_DIRECTORY)) { - if (!d_unhashed(dentry)) { + if (!d_in_lookup(dentry)) { /* * Hashed negative dentry with O_DIRECTORY: dentry was * revalidated and is fine, no need to perform lookup @@ -1525,6 +1530,17 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, attr.ia_size = 0; } + if (!(open_flags & O_CREAT) && !d_in_lookup(dentry)) { + d_drop(dentry); + switched = true; + dentry = d_alloc_parallel(dentry->d_parent, + &dentry->d_name, &wq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + if (unlikely(!d_in_lookup(dentry))) + return finish_no_open(file, dentry); + } + ctx = create_nfs_open_context(dentry, open_flags); err = PTR_ERR(ctx); if (IS_ERR(ctx)) @@ -1536,9 +1552,9 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, err = PTR_ERR(inode); trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); put_nfs_open_context(ctx); + d_drop(dentry); switch (err) { case -ENOENT: - d_drop(dentry); d_add(dentry, NULL); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); break; @@ -1560,14 +1576,23 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); put_nfs_open_context(ctx); out: + if (unlikely(switched)) { + d_lookup_done(dentry); + dput(dentry); + } return err; no_open: res = nfs_lookup(dir, dentry, lookup_flags); - err = PTR_ERR(res); + if (switched) { + d_lookup_done(dentry); + if (!res) + res = dentry; + else + dput(dentry); + } if (IS_ERR(res)) - goto out; - + return PTR_ERR(res); return finish_no_open(file, res); } EXPORT_SYMBOL_GPL(nfs_atomic_open); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 979b3c4dee6a..c7326c2af2c3 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -353,10 +353,12 @@ static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq) result = wait_for_completion_killable(&dreq->completion); + if (!result) { + result = dreq->count; + WARN_ON_ONCE(dreq->count < 0); + } if (!result) result = dreq->error; - if (!result) - result = dreq->count; out: return (ssize_t) result; @@ -386,8 +388,10 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) if (dreq->iocb) { long res = (long) dreq->error; - if (!res) + if (dreq->count != 0) { res = (long) dreq->count; + WARN_ON_ONCE(dreq->count < 0); + } dreq->iocb->ki_complete(dreq->iocb, res, 0); } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 52e7d6869e3b..dda689d7a8a7 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -282,6 +282,7 @@ nfs_init_locked(struct inode *inode, void *opaque) struct nfs_fattr *fattr = desc->fattr; set_nfs_fileid(inode, fattr->fileid); + inode->i_mode = fattr->mode; nfs_copy_fh(NFS_FH(inode), desc->fh); return 0; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index de97567795a5..ff416d0e24bc 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2882,12 +2882,11 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) call_close |= is_wronly; else if (is_wronly) calldata->arg.fmode |= FMODE_WRITE; + if (calldata->arg.fmode != (FMODE_READ|FMODE_WRITE)) + call_close |= is_rdwr; } else if (is_rdwr) calldata->arg.fmode |= FMODE_READ|FMODE_WRITE; - if (calldata->arg.fmode == 0) - call_close |= is_rdwr; - if (!nfs4_valid_open_stateid(state)) call_close = 0; spin_unlock(&state->owner->so_lock); @@ -7924,8 +7923,8 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, break; } lo = NFS_I(inode)->layout; - if (lo && nfs4_stateid_match(&lgp->args.stateid, - &lo->plh_stateid)) { + if (lo && !test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) && + nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) { LIST_HEAD(head); /* @@ -7936,10 +7935,10 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); + status = -EAGAIN; + goto out; } else spin_unlock(&inode->i_lock); - status = -EAGAIN; - goto out; } status = nfs4_handle_exception(server, status, exception); @@ -8036,7 +8035,10 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags) .flags = RPC_TASK_ASYNC, }; struct pnfs_layout_segment *lseg = NULL; - struct nfs4_exception exception = { .timeout = *timeout }; + struct nfs4_exception exception = { + .inode = inode, + .timeout = *timeout, + }; int status = 0; dprintk("--> %s\n", __func__); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 9679f4749364..834b875900d6 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1488,9 +1488,9 @@ restart: } spin_unlock(&state->state_lock); } - nfs4_put_open_state(state); clear_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); + nfs4_put_open_state(state); spin_lock(&sp->so_lock); goto restart; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0c7e0d45a4de..0fbe734cc38c 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -361,8 +361,10 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, list_del_init(&lseg->pls_list); /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ atomic_dec(&lo->plh_refcount); - if (list_empty(&lo->plh_segs)) + if (list_empty(&lo->plh_segs)) { + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); + } rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); } @@ -1290,6 +1292,7 @@ alloc_init_layout_hdr(struct inode *ino, INIT_LIST_HEAD(&lo->plh_bulk_destroy); lo->plh_inode = ino; lo->plh_lc_cred = get_rpccred(ctx->cred); + lo->plh_flags |= 1 << NFS_LAYOUT_INVALID_STID; return lo; } @@ -1297,6 +1300,8 @@ static struct pnfs_layout_hdr * pnfs_find_alloc_layout(struct inode *ino, struct nfs_open_context *ctx, gfp_t gfp_flags) + __releases(&ino->i_lock) + __acquires(&ino->i_lock) { struct nfs_inode *nfsi = NFS_I(ino); struct pnfs_layout_hdr *new = NULL; @@ -1565,8 +1570,7 @@ lookup_again: * stateid, or it has been invalidated, then we must use the open * stateid. */ - if (lo->plh_stateid.seqid == 0 || - test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) { + if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) { /* * The first layoutget for the file. Need to serialize per diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 0dfc476da3e1..b38e3c0dc790 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -247,7 +247,11 @@ void pnfs_fetch_commit_bucket_list(struct list_head *pages, } /* Helper function for pnfs_generic_commit_pagelist to catch an empty - * page list. This can happen when two commits race. */ + * page list. This can happen when two commits race. + * + * This must be called instead of nfs_init_commit - call one or the other, but + * not both! + */ static bool pnfs_generic_commit_cancel_empty_pagelist(struct list_head *pages, struct nfs_commit_data *data, @@ -256,7 +260,11 @@ pnfs_generic_commit_cancel_empty_pagelist(struct list_head *pages, if (list_empty(pages)) { if (atomic_dec_and_test(&cinfo->mds->rpcs_out)) wake_up_atomic_t(&cinfo->mds->rpcs_out); - nfs_commitdata_release(data); + /* don't call nfs_commitdata_release - it tries to put + * the open_context which is not acquired until nfs_init_commit + * which has not been called on @data */ + WARN_ON_ONCE(data->context); + nfs_commit_free(data); return true; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 6776d7a7839e..572e5b3b06f1 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -367,13 +367,13 @@ readpage_async_filler(void *data, struct page *page) nfs_list_remove_request(new); nfs_readpage_release(new); error = desc->pgio->pg_error; - goto out_unlock; + goto out; } return 0; out_error: error = PTR_ERR(new); -out_unlock: unlock_page(page); +out: return error; } diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 4df16ae1633a..ad2c05e80a83 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -291,7 +291,7 @@ out_free_buf: return error; } -#define NFSD_MDS_PR_KEY 0x0100000000000000 +#define NFSD_MDS_PR_KEY 0x0100000000000000ULL /* * We use the client ID as a unique key for the reservations. diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 1580ea6fd64d..d08cd88155c7 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -104,22 +104,21 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp, goto out; inode = d_inode(fh->fh_dentry); - if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) { - error = -EOPNOTSUPP; - goto out_errno; - } error = fh_want_write(fh); if (error) goto out_errno; - error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS); + fh_lock(fh); + + error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access); if (error) - goto out_drop_write; - error = inode->i_op->set_acl(inode, argp->acl_default, - ACL_TYPE_DEFAULT); + goto out_drop_lock; + error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default); if (error) - goto out_drop_write; + goto out_drop_lock; + + fh_unlock(fh); fh_drop_write(fh); @@ -131,7 +130,8 @@ out: posix_acl_release(argp->acl_access); posix_acl_release(argp->acl_default); return nfserr; -out_drop_write: +out_drop_lock: + fh_unlock(fh); fh_drop_write(fh); out_errno: nfserr = nfserrno(error); diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 01df4cd7c753..0c890347cde3 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -95,22 +95,20 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp, goto out; inode = d_inode(fh->fh_dentry); - if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) { - error = -EOPNOTSUPP; - goto out_errno; - } error = fh_want_write(fh); if (error) goto out_errno; - error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS); + fh_lock(fh); + + error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access); if (error) - goto out_drop_write; - error = inode->i_op->set_acl(inode, argp->acl_default, - ACL_TYPE_DEFAULT); + goto out_drop_lock; + error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default); -out_drop_write: +out_drop_lock: + fh_unlock(fh); fh_drop_write(fh); out_errno: nfserr = nfserrno(error); diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 6adabd6049b7..71292a0d6f09 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -770,9 +770,6 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, dentry = fhp->fh_dentry; inode = d_inode(dentry); - if (!inode->i_op->set_acl || !IS_POSIXACL(inode)) - return nfserr_attrnotsupp; - if (S_ISDIR(inode->i_mode)) flags = NFS4_ACL_DIR; @@ -782,16 +779,19 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, if (host_error < 0) goto out_nfserr; - host_error = inode->i_op->set_acl(inode, pacl, ACL_TYPE_ACCESS); + fh_lock(fhp); + + host_error = set_posix_acl(inode, ACL_TYPE_ACCESS, pacl); if (host_error < 0) - goto out_release; + goto out_drop_lock; if (S_ISDIR(inode->i_mode)) { - host_error = inode->i_op->set_acl(inode, dpacl, - ACL_TYPE_DEFAULT); + host_error = set_posix_acl(inode, ACL_TYPE_DEFAULT, dpacl); } -out_release: +out_drop_lock: + fh_unlock(fhp); + posix_acl_release(pacl); posix_acl_release(dpacl); out_nfserr: diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 7389cb1d7409..04c68d900324 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -710,22 +710,6 @@ static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc } } -static struct rpc_clnt *create_backchannel_client(struct rpc_create_args *args) -{ - struct rpc_xprt *xprt; - - if (args->protocol != XPRT_TRANSPORT_BC_TCP) - return rpc_create(args); - - xprt = args->bc_xprt->xpt_bc_xprt; - if (xprt) { - xprt_get(xprt); - return rpc_create_xprt(args, xprt); - } - - return rpc_create(args); -} - static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) { int maxtime = max_cb_time(clp->net); @@ -768,7 +752,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c args.authflavor = ses->se_cb_sec.flavor; } /* Create RPC client */ - client = create_backchannel_client(&args); + client = rpc_create(&args); if (IS_ERR(client)) { dprintk("NFSD: couldn't create callback client: %ld\n", PTR_ERR(client)); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index f5f82e145018..70d0b9b33031 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3480,12 +3480,17 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, } static struct nfs4_ol_stateid * -init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, - struct nfsd4_open *open) +init_open_stateid(struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_openowner *oo = open->op_openowner; struct nfs4_ol_stateid *retstp = NULL; + struct nfs4_ol_stateid *stp; + + stp = open->op_stp; + /* We are moving these outside of the spinlocks to avoid the warnings */ + mutex_init(&stp->st_mutex); + mutex_lock(&stp->st_mutex); spin_lock(&oo->oo_owner.so_client->cl_lock); spin_lock(&fp->fi_lock); @@ -3493,6 +3498,8 @@ init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, retstp = nfsd4_find_existing_open(fp, open); if (retstp) goto out_unlock; + + open->op_stp = NULL; atomic_inc(&stp->st_stid.sc_count); stp->st_stid.sc_type = NFS4_OPEN_STID; INIT_LIST_HEAD(&stp->st_locks); @@ -3502,14 +3509,19 @@ init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, stp->st_access_bmap = 0; stp->st_deny_bmap = 0; stp->st_openstp = NULL; - init_rwsem(&stp->st_rwsem); list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); out_unlock: spin_unlock(&fp->fi_lock); spin_unlock(&oo->oo_owner.so_client->cl_lock); - return retstp; + if (retstp) { + mutex_lock(&retstp->st_mutex); + /* To keep mutex tracking happy */ + mutex_unlock(&stp->st_mutex); + stp = retstp; + } + return stp; } /* @@ -4305,7 +4317,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf struct nfs4_client *cl = open->op_openowner->oo_owner.so_client; struct nfs4_file *fp = NULL; struct nfs4_ol_stateid *stp = NULL; - struct nfs4_ol_stateid *swapstp = NULL; struct nfs4_delegation *dp = NULL; __be32 status; @@ -4335,32 +4346,28 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf */ if (stp) { /* Stateid was found, this is an OPEN upgrade */ - down_read(&stp->st_rwsem); + mutex_lock(&stp->st_mutex); status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); if (status) { - up_read(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); goto out; } } else { - stp = open->op_stp; - open->op_stp = NULL; - swapstp = init_open_stateid(stp, fp, open); - if (swapstp) { - nfs4_put_stid(&stp->st_stid); - stp = swapstp; - down_read(&stp->st_rwsem); + /* stp is returned locked. */ + stp = init_open_stateid(fp, open); + /* See if we lost the race to some other thread */ + if (stp->st_access_bmap != 0) { status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); if (status) { - up_read(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); goto out; } goto upgrade_out; } - down_read(&stp->st_rwsem); status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open); if (status) { - up_read(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); release_open_stateid(stp); goto out; } @@ -4372,7 +4379,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf } upgrade_out: nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid); - up_read(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); if (nfsd4_has_session(&resp->cstate)) { if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) { @@ -4977,12 +4984,12 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ * revoked delegations are kept only for free_stateid. */ return nfserr_bad_stateid; - down_write(&stp->st_rwsem); + mutex_lock(&stp->st_mutex); status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); if (status == nfs_ok) status = nfs4_check_fh(current_fh, &stp->st_stid); if (status != nfs_ok) - up_write(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); return status; } @@ -5030,7 +5037,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs return status; oo = openowner(stp->st_stateowner); if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { - up_write(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); nfs4_put_stid(&stp->st_stid); return nfserr_bad_stateid; } @@ -5062,12 +5069,12 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, oo = openowner(stp->st_stateowner); status = nfserr_bad_stateid; if (oo->oo_flags & NFS4_OO_CONFIRMED) { - up_write(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); goto put_stateid; } oo->oo_flags |= NFS4_OO_CONFIRMED; nfs4_inc_and_copy_stateid(&oc->oc_resp_stateid, &stp->st_stid); - up_write(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid)); @@ -5143,7 +5150,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, nfs4_inc_and_copy_stateid(&od->od_stateid, &stp->st_stid); status = nfs_ok; put_stateid: - up_write(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); nfs4_put_stid(&stp->st_stid); out: nfsd4_bump_seqid(cstate, status); @@ -5196,7 +5203,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid); - up_write(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); nfsd4_close_open_stateid(stp); @@ -5422,7 +5429,7 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; - init_rwsem(&stp->st_rwsem); + mutex_init(&stp->st_mutex); list_add(&stp->st_locks, &open_stp->st_locks); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); spin_lock(&fp->fi_lock); @@ -5591,7 +5598,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &open_stp, nn); if (status) goto out; - up_write(&open_stp->st_rwsem); + mutex_unlock(&open_stp->st_mutex); open_sop = openowner(open_stp->st_stateowner); status = nfserr_bad_stateid; if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid, @@ -5600,7 +5607,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = lookup_or_create_lock_state(cstate, open_stp, lock, &lock_stp, &new); if (status == nfs_ok) - down_write(&lock_stp->st_rwsem); + mutex_lock(&lock_stp->st_mutex); } else { status = nfs4_preprocess_seqid_op(cstate, lock->lk_old_lock_seqid, @@ -5704,7 +5711,7 @@ out: seqid_mutating_err(ntohl(status))) lock_sop->lo_owner.so_seqid++; - up_write(&lock_stp->st_rwsem); + mutex_unlock(&lock_stp->st_mutex); /* * If this is a new, never-before-used stateid, and we are @@ -5874,7 +5881,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fput: fput(filp); put_stateid: - up_write(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); nfs4_put_stid(&stp->st_stid); out: nfsd4_bump_seqid(cstate, status); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 986e51e5ceac..64053eadeb81 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -535,7 +535,7 @@ struct nfs4_ol_stateid { unsigned char st_access_bmap; unsigned char st_deny_bmap; struct nfs4_ol_stateid *st_openstp; - struct rw_semaphore st_rwsem; + struct mutex st_mutex; }; static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 0576033699bc..4cca998ec7a0 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -62,7 +62,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) } int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, - sector_t pblocknr, int mode, + sector_t pblocknr, int mode, int mode_flags, struct buffer_head **pbh, sector_t *submit_ptr) { struct buffer_head *bh; @@ -95,7 +95,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, } } - if (mode == READA) { + if (mode_flags & REQ_RAHEAD) { if (pblocknr != *submit_ptr + 1 || !trylock_buffer(bh)) { err = -EBUSY; /* internal code */ brelse(bh); @@ -114,7 +114,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, bh->b_blocknr = pblocknr; /* set block address for read */ bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(mode, bh); + submit_bh(mode, mode_flags, bh); bh->b_blocknr = blocknr; /* set back to the given block address */ *submit_ptr = pblocknr; err = 0; diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index 2cc1b80e18f7..4e8aaa1aeb65 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -43,7 +43,7 @@ void nilfs_btnode_cache_clear(struct address_space *); struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr); int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, int, - struct buffer_head **, sector_t *); + int, struct buffer_head **, sector_t *); void nilfs_btnode_delete(struct buffer_head *); int nilfs_btnode_prepare_change_key(struct address_space *, struct nilfs_btnode_chkey_ctxt *); diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index eccb1c89ccbb..982d1e3df3a5 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -476,7 +476,8 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, sector_t submit_ptr = 0; int ret; - ret = nilfs_btnode_submit_block(btnc, ptr, 0, READ, &bh, &submit_ptr); + ret = nilfs_btnode_submit_block(btnc, ptr, 0, REQ_OP_READ, 0, &bh, + &submit_ptr); if (ret) { if (ret != -EEXIST) return ret; @@ -492,7 +493,8 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, n > 0 && i < ra->ncmax; n--, i++) { ptr2 = nilfs_btree_node_get_ptr(ra->node, i, ra->ncmax); - ret = nilfs_btnode_submit_block(btnc, ptr2, 0, READA, + ret = nilfs_btnode_submit_block(btnc, ptr2, 0, + REQ_OP_READ, REQ_RAHEAD, &ra_bh, &submit_ptr); if (likely(!ret || ret == -EEXIST)) brelse(ra_bh); diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 693aded72498..e9148f94d696 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -101,7 +101,7 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, bh->b_blocknr = pbn; bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(READ, bh); + submit_bh(REQ_OP_READ, 0, bh); if (vbn) bh->b_blocknr = vbn; out: @@ -138,7 +138,8 @@ int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn, int ret; ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache, - vbn ? : pbn, pbn, READ, out_bh, &pbn); + vbn ? : pbn, pbn, REQ_OP_READ, 0, + out_bh, &pbn); if (ret == -EEXIST) /* internal code (cache hit) */ ret = 0; return ret; diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 3417d859a03c..0d7b71fbeff8 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -121,7 +121,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, static int nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, - int mode, struct buffer_head **out_bh) + int mode, int mode_flags, struct buffer_head **out_bh) { struct buffer_head *bh; __u64 blknum = 0; @@ -135,7 +135,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, if (buffer_uptodate(bh)) goto out; - if (mode == READA) { + if (mode_flags & REQ_RAHEAD) { if (!trylock_buffer(bh)) { ret = -EBUSY; goto failed_bh; @@ -157,7 +157,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(mode, bh); + submit_bh(mode, mode_flags, bh); ret = 0; trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, mode); @@ -181,7 +181,7 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS; int err; - err = nilfs_mdt_submit_block(inode, block, READ, &first_bh); + err = nilfs_mdt_submit_block(inode, block, REQ_OP_READ, 0, &first_bh); if (err == -EEXIST) /* internal code */ goto out; @@ -191,7 +191,8 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, if (readahead) { blkoff = block + 1; for (i = 0; i < nr_ra_blocks; i++, blkoff++) { - err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh); + err = nilfs_mdt_submit_block(inode, blkoff, REQ_OP_READ, + REQ_RAHEAD, &bh); if (likely(!err || err == -EEXIST)) brelse(bh); else if (err != -EBUSY) diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index bf36df10540b..a962d7d83447 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -346,7 +346,8 @@ static void nilfs_end_bio_write(struct bio *bio) } static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, - struct nilfs_write_info *wi, int mode) + struct nilfs_write_info *wi, int mode, + int mode_flags) { struct bio *bio = wi->bio; int err; @@ -364,7 +365,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, bio->bi_end_io = nilfs_end_bio_write; bio->bi_private = segbuf; - submit_bio(mode, bio); + bio_set_op_attrs(bio, mode, mode_flags); + submit_bio(bio); segbuf->sb_nbio++; wi->bio = NULL; @@ -437,7 +439,7 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf, return 0; } /* bio is FULL */ - err = nilfs_segbuf_submit_bio(segbuf, wi, mode); + err = nilfs_segbuf_submit_bio(segbuf, wi, mode, 0); /* never submit current bh */ if (likely(!err)) goto repeat; @@ -461,19 +463,19 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, { struct nilfs_write_info wi; struct buffer_head *bh; - int res = 0, rw = WRITE; + int res = 0; wi.nilfs = nilfs; nilfs_segbuf_prepare_write(segbuf, &wi); list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { - res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw); + res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, REQ_OP_WRITE); if (unlikely(res)) goto failed_bio; } list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw); + res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, REQ_OP_WRITE); if (unlikely(res)) goto failed_bio; } @@ -483,8 +485,8 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, * Last BIO is always sent through the following * submission. */ - rw |= REQ_SYNC; - res = nilfs_segbuf_submit_bio(segbuf, &wi, rw); + res = nilfs_segbuf_submit_bio(segbuf, &wi, REQ_OP_WRITE, + REQ_SYNC); } failed_bio: diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 809bd2de7ad0..e9fd241b9a0a 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -439,7 +439,7 @@ static int nilfs_valid_sb(struct nilfs_super_block *sbp) if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC) return 0; bytes = le16_to_cpu(sbp->s_bytes); - if (bytes > BLOCK_SIZE) + if (bytes < sumoff + 4 || bytes > BLOCK_SIZE) return 0; crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp, sumoff); diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 97768a1379f2..fe251f187ff8 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -362,7 +362,7 @@ handle_zblock: for (i = 0; i < nr; i++) { tbh = arr[i]; if (likely(!buffer_uptodate(tbh))) - submit_bh(READ, tbh); + submit_bh(REQ_OP_READ, 0, tbh); else ntfs_end_buffer_async_read(tbh, 1); } @@ -877,7 +877,7 @@ lock_retry_remap: do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh(WRITE, bh); + submit_bh(REQ_OP_WRITE, 0, bh); need_end_writeback = false; } bh = next; @@ -1202,7 +1202,7 @@ lock_retry_remap: BUG_ON(!buffer_mapped(tbh)); get_bh(tbh); tbh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE, tbh); + submit_bh(REQ_OP_WRITE, 0, tbh); } /* Synchronize the mft mirror now if not @sync. */ if (is_mft && !sync) diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index f2b5e746f49b..f8eb04387ca4 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c @@ -670,7 +670,7 @@ lock_retry_remap: } get_bh(tbh); tbh->b_end_io = end_buffer_read_sync; - submit_bh(READ, tbh); + submit_bh(REQ_OP_READ, 0, tbh); } /* Wait for io completion on all buffer heads. */ diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 5622ed5a201e..f548629dfaac 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -553,7 +553,7 @@ static inline int ntfs_submit_bh_for_read(struct buffer_head *bh) lock_buffer(bh); get_bh(bh); bh->b_end_io = end_buffer_read_sync; - return submit_bh(READ, bh); + return submit_bh(REQ_OP_READ, 0, bh); } /** diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c index 9d71213ca81e..761f12f7f3ef 100644 --- a/fs/ntfs/logfile.c +++ b/fs/ntfs/logfile.c @@ -821,7 +821,7 @@ map_vcn: * completed ignore errors afterwards as we can assume * that if one buffer worked all of them will work. */ - submit_bh(WRITE, bh); + submit_bh(REQ_OP_WRITE, 0, bh); if (should_wait) { should_wait = false; wait_on_buffer(bh); diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 37b2501caaa4..d15d492ce47b 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -592,7 +592,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, clear_buffer_dirty(tbh); get_bh(tbh); tbh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE, tbh); + submit_bh(REQ_OP_WRITE, 0, tbh); } /* Wait on i/o completion of buffers. */ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { @@ -785,7 +785,7 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) clear_buffer_dirty(tbh); get_bh(tbh); tbh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE, tbh); + submit_bh(REQ_OP_WRITE, 0, tbh); } /* Synchronize the mft mirror now if not @sync. */ if (!sync && ni->mft_no < vol->mftmirr_size) diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index e27e6527912b..4342c7ee7d20 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -1,7 +1,5 @@ ccflags-y := -Ifs/ocfs2 -ccflags-y += -DCATCH_BH_JBD_RACES - obj-$(CONFIG_OCFS2_FS) += \ ocfs2.o \ ocfs2_stackglue.o diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index c034edf3ef38..e97a37179614 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -640,7 +640,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, !buffer_new(bh) && ocfs2_should_read_blk(inode, page, block_start) && (block_start < from || block_end > to)) { - ll_rw_block(READ, 1, &bh); + ll_rw_block(REQ_OP_READ, 0, 1, &bh); *wait_bh++=bh; } diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index fe50ded1b4ce..8f040f88ade4 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -79,7 +79,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, get_bh(bh); /* for end_buffer_write_sync() */ bh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE, bh); + submit_bh(REQ_OP_WRITE, 0, bh); wait_on_buffer(bh); @@ -139,17 +139,22 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, lock_buffer(bh); if (buffer_jbd(bh)) { +#ifdef CATCH_BH_JBD_RACES mlog(ML_ERROR, "block %llu had the JBD bit set " "while I was in lock_buffer!", (unsigned long long)bh->b_blocknr); BUG(); +#else + unlock_buffer(bh); + continue; +#endif } clear_buffer_uptodate(bh); get_bh(bh); /* for end_buffer_read_sync() */ bh->b_end_io = end_buffer_read_sync; - submit_bh(READ, bh); + submit_bh(REQ_OP_READ, 0, bh); } for (i = nr; i > 0; i--) { @@ -305,7 +310,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, if (validate) set_buffer_needs_validate(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(READ, bh); + submit_bh(REQ_OP_READ, 0, bh); continue; } } @@ -419,7 +424,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb, get_bh(bh); /* for end_buffer_write_sync() */ bh->b_end_io = end_buffer_write_sync; ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check); - submit_bh(WRITE, bh); + submit_bh(REQ_OP_WRITE, 0, bh); wait_on_buffer(bh); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 6aaf3e351391..636abcbd4650 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -530,7 +530,8 @@ static void o2hb_bio_end_io(struct bio *bio) static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, struct o2hb_bio_wait_ctxt *wc, unsigned int *current_slot, - unsigned int max_slots) + unsigned int max_slots, int op, + int op_flags) { int len, current_page; unsigned int vec_len, vec_start; @@ -556,6 +557,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, bio->bi_bdev = reg->hr_bdev; bio->bi_private = wc; bio->bi_end_io = o2hb_bio_end_io; + bio_set_op_attrs(bio, op, op_flags); vec_start = (cs << bits) % PAGE_SIZE; while(cs < max_slots) { @@ -591,7 +593,8 @@ static int o2hb_read_slots(struct o2hb_region *reg, o2hb_bio_wait_init(&wc); while(current_slot < max_slots) { - bio = o2hb_setup_one_bio(reg, &wc, ¤t_slot, max_slots); + bio = o2hb_setup_one_bio(reg, &wc, ¤t_slot, max_slots, + REQ_OP_READ, 0); if (IS_ERR(bio)) { status = PTR_ERR(bio); mlog_errno(status); @@ -599,7 +602,7 @@ static int o2hb_read_slots(struct o2hb_region *reg, } atomic_inc(&wc.wc_num_reqs); - submit_bio(READ, bio); + submit_bio(bio); } status = 0; @@ -623,7 +626,8 @@ static int o2hb_issue_node_write(struct o2hb_region *reg, slot = o2nm_this_node(); - bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1); + bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1, REQ_OP_WRITE, + WRITE_SYNC); if (IS_ERR(bio)) { status = PTR_ERR(bio); mlog_errno(status); @@ -631,7 +635,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg, } atomic_inc(&write_wc->wc_num_reqs); - submit_bio(WRITE_SYNC, bio); + submit_bio(bio); status = 0; bail: diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 4238eb28889f..1d67fcbf7160 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1618,16 +1618,12 @@ static void o2net_start_connect(struct work_struct *work) /* watch for racing with tearing a node down */ node = o2nm_get_node_by_num(o2net_num_from_nn(nn)); - if (node == NULL) { - ret = 0; + if (node == NULL) goto out; - } mynode = o2nm_get_node_by_num(o2nm_this_node()); - if (mynode == NULL) { - ret = 0; + if (mynode == NULL) goto out; - } spin_lock(&nn->nn_lock); /* diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 825136070d2c..e7b760deefae 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -347,26 +347,6 @@ static struct dentry *dlm_debugfs_root; #define DLM_DEBUGFS_PURGE_LIST "purge_list" /* begin - utils funcs */ -static void dlm_debug_free(struct kref *kref) -{ - struct dlm_debug_ctxt *dc; - - dc = container_of(kref, struct dlm_debug_ctxt, debug_refcnt); - - kfree(dc); -} - -static void dlm_debug_put(struct dlm_debug_ctxt *dc) -{ - if (dc) - kref_put(&dc->debug_refcnt, dlm_debug_free); -} - -static void dlm_debug_get(struct dlm_debug_ctxt *dc) -{ - kref_get(&dc->debug_refcnt); -} - static int debug_release(struct inode *inode, struct file *file) { free_page((unsigned long)file->private_data); @@ -932,11 +912,9 @@ int dlm_debug_init(struct dlm_ctxt *dlm) goto bail; } - dlm_debug_get(dc); return 0; bail: - dlm_debug_shutdown(dlm); return -ENOMEM; } @@ -949,7 +927,8 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm) debugfs_remove(dc->debug_mle_dentry); debugfs_remove(dc->debug_lockres_dentry); debugfs_remove(dc->debug_state_dentry); - dlm_debug_put(dc); + kfree(dc); + dc = NULL; } } @@ -969,7 +948,6 @@ int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) mlog_errno(-ENOMEM); goto bail; } - kref_init(&dlm->dlm_debug_ctxt->debug_refcnt); return 0; bail: diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h index 1f27c4812d1a..5ced5482e7d3 100644 --- a/fs/ocfs2/dlm/dlmdebug.h +++ b/fs/ocfs2/dlm/dlmdebug.h @@ -30,7 +30,6 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle); #ifdef CONFIG_DEBUG_FS struct dlm_debug_ctxt { - struct kref debug_refcnt; struct dentry *debug_state_dentry; struct dentry *debug_lockres_dentry; struct dentry *debug_mle_dentry; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 1eaa9100c889..83d576f6a287 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1635,7 +1635,6 @@ int ocfs2_create_new_inode_locks(struct inode *inode) int ret; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - BUG_ON(!inode); BUG_ON(!ocfs2_inode_is_new(inode)); mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -1665,10 +1664,8 @@ int ocfs2_create_new_inode_locks(struct inode *inode) } ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0); - if (ret) { + if (ret) mlog_errno(ret); - goto bail; - } bail: return ret; @@ -1680,8 +1677,6 @@ int ocfs2_rw_lock(struct inode *inode, int write) struct ocfs2_lock_res *lockres; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - BUG_ON(!inode); - mlog(0, "inode %llu take %s RW lock\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, write ? "EXMODE" : "PRMODE"); @@ -1724,8 +1719,6 @@ int ocfs2_open_lock(struct inode *inode) struct ocfs2_lock_res *lockres; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - BUG_ON(!inode); - mlog(0, "inode %llu take PRMODE open lock\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -1749,8 +1742,6 @@ int ocfs2_try_open_lock(struct inode *inode, int write) struct ocfs2_lock_res *lockres; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - BUG_ON(!inode); - mlog(0, "inode %llu try to take %s open lock\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, write ? "EXMODE" : "PRMODE"); @@ -2328,8 +2319,6 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *local_bh = NULL; - BUG_ON(!inode); - mlog(0, "inode %llu, take %s META lock\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, ex ? "EXMODE" : "PRMODE"); diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index d8f3fc8d2551..50cc55047443 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -145,22 +145,15 @@ int ocfs2_drop_inode(struct inode *inode); struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff); struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, int sysfile_type); -int ocfs2_inode_init_private(struct inode *inode); int ocfs2_inode_revalidate(struct dentry *dentry); void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, int create_ino); -void ocfs2_read_inode(struct inode *inode); -void ocfs2_read_inode2(struct inode *inode, void *opaque); -ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf, - size_t size, loff_t *offp); void ocfs2_sync_blockdev(struct super_block *sb); void ocfs2_refresh_inode(struct inode *inode, struct ocfs2_dinode *fe); int ocfs2_mark_inode_dirty(handle_t *handle, struct inode *inode, struct buffer_head *bh); -struct buffer_head *ocfs2_bread(struct inode *inode, - int block, int *err, int reada); void ocfs2_set_inode_flags(struct inode *inode); void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index e607419cdfa4..a244f14c6b87 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1159,10 +1159,8 @@ static int ocfs2_force_read_journal(struct inode *inode) int status = 0; int i; u64 v_blkno, p_blkno, p_blocks, num_blocks; -#define CONCURRENT_JOURNAL_FILL 32ULL - struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; - - memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); + struct buffer_head *bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); v_blkno = 0; @@ -1174,29 +1172,32 @@ static int ocfs2_force_read_journal(struct inode *inode) goto bail; } - if (p_blocks > CONCURRENT_JOURNAL_FILL) - p_blocks = CONCURRENT_JOURNAL_FILL; - - /* We are reading journal data which should not - * be put in the uptodate cache */ - status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb), - p_blkno, p_blocks, bhs); - if (status < 0) { - mlog_errno(status); - goto bail; - } + for (i = 0; i < p_blocks; i++, p_blkno++) { + bh = __find_get_block(osb->sb->s_bdev, p_blkno, + osb->sb->s_blocksize); + /* block not cached. */ + if (!bh) + continue; + + brelse(bh); + bh = NULL; + /* We are reading journal data which should not + * be put in the uptodate cache. + */ + status = ocfs2_read_blocks_sync(osb, p_blkno, 1, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } - for(i = 0; i < p_blocks; i++) { - brelse(bhs[i]); - bhs[i] = NULL; + brelse(bh); + bh = NULL; } v_blkno += p_blocks; } bail: - for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) - brelse(bhs[i]); return status; } diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 13219ed73e1d..52c07346bea3 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -735,8 +735,6 @@ static void __exit ocfs2_stack_glue_exit(void) { memset(&locking_max_version, 0, sizeof(struct ocfs2_protocol_version)); - locking_max_version.pv_major = 0; - locking_max_version.pv_minor = 0; ocfs2_sysfs_exit(); if (ocfs2_table_header) unregister_sysctl_table(ocfs2_table_header); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index d7cae3327de5..603b28d6f008 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1819,7 +1819,7 @@ static int ocfs2_get_sector(struct super_block *sb, if (!buffer_dirty(*bh)) clear_buffer_uptodate(*bh); unlock_buffer(*bh); - ll_rw_block(READ, 1, bh); + ll_rw_block(REQ_OP_READ, 0, 1, bh); wait_on_buffer(*bh); if (!buffer_uptodate(*bh)) { mlog_errno(-EIO); @@ -2072,7 +2072,6 @@ static int ocfs2_initialize_super(struct super_block *sb, osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash); osb->sb = sb; - /* Save off for ocfs2_rw_direct */ osb->s_sectsize_bits = blksize_bits(sector_size); BUG_ON(!osb->s_sectsize_bits); diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index 03f89dbb2512..28f2195cd798 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -18,10 +18,10 @@ struct posix_acl *orangefs_get_acl(struct inode *inode, int type) switch (type) { case ACL_TYPE_ACCESS: - key = ORANGEFS_XATTR_NAME_ACL_ACCESS; + key = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - key = ORANGEFS_XATTR_NAME_ACL_DEFAULT; + key = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: gossip_err("orangefs_get_acl: bogus value of type %d\n", type); @@ -43,11 +43,8 @@ struct posix_acl *orangefs_get_acl(struct inode *inode, int type) get_khandle_from_ino(inode), key, type); - ret = orangefs_inode_getxattr(inode, - "", - key, - value, - ORANGEFS_MAX_XATTR_VALUELEN); + ret = orangefs_inode_getxattr(inode, key, value, + ORANGEFS_MAX_XATTR_VALUELEN); /* if the key exists, convert it to an in-memory rep */ if (ret > 0) { acl = posix_acl_from_xattr(&init_user_ns, value, ret); @@ -74,7 +71,7 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) switch (type) { case ACL_TYPE_ACCESS: - name = ORANGEFS_XATTR_NAME_ACL_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { umode_t mode = inode->i_mode; /* @@ -98,7 +95,7 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) } break; case ACL_TYPE_DEFAULT: - name = ORANGEFS_XATTR_NAME_ACL_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: gossip_err("%s: invalid type %d!\n", __func__, type); @@ -131,7 +128,7 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) * will xlate to a removexattr. However, we don't want removexattr * complain if attributes does not exist. */ - error = orangefs_inode_setxattr(inode, "", name, value, size, 0); + error = orangefs_inode_setxattr(inode, name, value, size, 0); out: kfree(value); diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c index db170beba797..a287a66d94e3 100644 --- a/fs/orangefs/devorangefs-req.c +++ b/fs/orangefs/devorangefs-req.c @@ -116,6 +116,13 @@ static int orangefs_devreq_open(struct inode *inode, struct file *file) { int ret = -EINVAL; + /* in order to ensure that the filesystem driver sees correct UIDs */ + if (file->f_cred->user_ns != &init_user_ns) { + gossip_err("%s: device cannot be opened outside init_user_ns\n", + __func__); + goto out; + } + if (!(file->f_flags & O_NONBLOCK)) { gossip_err("%s: device cannot be opened in blocking mode\n", __func__); diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 491e82c6f705..526040e09f78 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -516,7 +516,6 @@ static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long ar if (cmd == FS_IOC_GETFLAGS) { val = 0; ret = orangefs_inode_getxattr(file_inode(file), - ORANGEFS_XATTR_NAME_DEFAULT_PREFIX, "user.pvfs2.meta_hint", &val, sizeof(val)); if (ret < 0 && ret != -ENODATA) @@ -549,7 +548,6 @@ static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long ar "orangefs_ioctl: FS_IOC_SETFLAGS: %llu\n", (unsigned long long)val); ret = orangefs_inode_setxattr(file_inode(file), - ORANGEFS_XATTR_NAME_DEFAULT_PREFIX, "user.pvfs2.meta_hint", &val, sizeof(val), 0); } diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 85640e955cde..8f2fa94cc4f6 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -80,7 +80,7 @@ static int orangefs_readpages(struct file *file, if (!add_to_page_cache(page, mapping, page->index, - GFP_KERNEL)) { + readahead_gfp_mask(mapping))) { ret = read_one_page(page); gossip_debug(GOSSIP_INODE_DEBUG, "failure adding page to cache, read_one_page returned: %d\n", @@ -124,19 +124,16 @@ static int orangefs_releasepage(struct page *page, gfp_t foo) * will need to be able to use O_DIRECT on open in order to support * AIO. Modeled after NFS, they do this too. */ -/* - * static ssize_t orangefs_direct_IO(int rw, - * struct kiocb *iocb, - * struct iov_iter *iter, - * loff_t offset) - *{ - * gossip_debug(GOSSIP_INODE_DEBUG, - * "orangefs_direct_IO: %s\n", - * iocb->ki_filp->f_path.dentry->d_name.name); - * - * return -EINVAL; - *} - */ + +static ssize_t orangefs_direct_IO(struct kiocb *iocb, + struct iov_iter *iter) +{ + gossip_debug(GOSSIP_INODE_DEBUG, + "orangefs_direct_IO: %s\n", + iocb->ki_filp->f_path.dentry->d_name.name); + + return -EINVAL; +} struct backing_dev_info orangefs_backing_dev_info = { .name = "orangefs", @@ -150,7 +147,7 @@ const struct address_space_operations orangefs_address_operations = { .readpages = orangefs_readpages, .invalidatepage = orangefs_invalidatepage, .releasepage = orangefs_releasepage, -/* .direct_IO = orangefs_direct_IO */ + .direct_IO = orangefs_direct_IO, }; static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr) diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c index 900a2e38e11b..b6edbe9fb309 100644 --- a/fs/orangefs/orangefs-cache.c +++ b/fs/orangefs/orangefs-cache.c @@ -136,10 +136,10 @@ struct orangefs_kernel_op_s *op_alloc(__s32 type) llu(new_op->tag), get_opname_string(new_op)); - new_op->upcall.uid = from_kuid(current_user_ns(), + new_op->upcall.uid = from_kuid(&init_user_ns, current_fsuid()); - new_op->upcall.gid = from_kgid(current_user_ns(), + new_op->upcall.gid = from_kgid(&init_user_ns, current_fsgid()); } else { gossip_err("op_alloc: kmem_cache_zalloc failed!\n"); diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 2281882f718e..c1181e5529af 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -119,17 +119,6 @@ struct client_debug_mask { #define ORANGEFS_CACHE_CREATE_FLAGS 0 #endif /* ((defined ORANGEFS_KERNEL_DEBUG) && (defined CONFIG_DEBUG_SLAB)) */ -/* orangefs xattr and acl related defines */ -#define ORANGEFS_XATTR_INDEX_POSIX_ACL_ACCESS 1 -#define ORANGEFS_XATTR_INDEX_POSIX_ACL_DEFAULT 2 -#define ORANGEFS_XATTR_INDEX_TRUSTED 3 -#define ORANGEFS_XATTR_INDEX_DEFAULT 4 - -#define ORANGEFS_XATTR_NAME_ACL_ACCESS XATTR_NAME_POSIX_ACL_ACCESS -#define ORANGEFS_XATTR_NAME_ACL_DEFAULT XATTR_NAME_POSIX_ACL_DEFAULT -#define ORANGEFS_XATTR_NAME_TRUSTED_PREFIX "trusted." -#define ORANGEFS_XATTR_NAME_DEFAULT_PREFIX "" - /* these functions are defined in orangefs-utils.c */ int orangefs_prepare_cdm_array(char *debug_array_string); int orangefs_prepare_debugfs_help_string(int); @@ -528,13 +517,11 @@ __s32 fsid_of_op(struct orangefs_kernel_op_s *op); int orangefs_flush_inode(struct inode *inode); ssize_t orangefs_inode_getxattr(struct inode *inode, - const char *prefix, const char *name, void *buffer, size_t size); int orangefs_inode_setxattr(struct inode *inode, - const char *prefix, const char *name, const void *value, size_t size, @@ -600,8 +587,8 @@ int service_operation(struct orangefs_kernel_op_s *op, #define fill_default_sys_attrs(sys_attr, type, mode) \ do { \ - sys_attr.owner = from_kuid(current_user_ns(), current_fsuid()); \ - sys_attr.group = from_kgid(current_user_ns(), current_fsgid()); \ + sys_attr.owner = from_kuid(&init_user_ns, current_fsuid()); \ + sys_attr.group = from_kgid(&init_user_ns, current_fsgid()); \ sys_attr.perms = ORANGEFS_util_translate_mode(mode); \ sys_attr.mtime = 0; \ sys_attr.atime = 0; \ diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 2d129b5886ee..c5fbc62357c6 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -153,12 +153,12 @@ static inline int copy_attributes_from_inode(struct inode *inode, */ attrs->mask = 0; if (iattr->ia_valid & ATTR_UID) { - attrs->owner = from_kuid(current_user_ns(), iattr->ia_uid); + attrs->owner = from_kuid(&init_user_ns, iattr->ia_uid); attrs->mask |= ORANGEFS_ATTR_SYS_UID; gossip_debug(GOSSIP_UTILS_DEBUG, "(UID) %d\n", attrs->owner); } if (iattr->ia_valid & ATTR_GID) { - attrs->group = from_kgid(current_user_ns(), iattr->ia_gid); + attrs->group = from_kgid(&init_user_ns, iattr->ia_gid); attrs->mask |= ORANGEFS_ATTR_SYS_GID; gossip_debug(GOSSIP_UTILS_DEBUG, "(GID) %d\n", attrs->group); } diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 5893ddde0e4b..2a9f07f06d10 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -59,8 +59,8 @@ static inline int convert_to_internal_xattr_flags(int setxattr_flags) * unless the key does not exist for the file and/or if * there were errors in fetching the attribute value. */ -ssize_t orangefs_inode_getxattr(struct inode *inode, const char *prefix, - const char *name, void *buffer, size_t size) +ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name, + void *buffer, size_t size) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_kernel_op_s *new_op = NULL; @@ -70,17 +70,17 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *prefix, int fsgid; gossip_debug(GOSSIP_XATTR_DEBUG, - "%s: prefix %s name %s, buffer_size %zd\n", - __func__, prefix, name, size); + "%s: name %s, buffer_size %zd\n", + __func__, name, size); - if ((strlen(name) + strlen(prefix)) >= ORANGEFS_MAX_XATTR_NAMELEN) { + if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) { gossip_err("Invalid key length (%d)\n", - (int)(strlen(name) + strlen(prefix))); + (int)strlen(name)); return -EINVAL; } - fsuid = from_kuid(current_user_ns(), current_fsuid()); - fsgid = from_kgid(current_user_ns(), current_fsgid()); + fsuid = from_kuid(&init_user_ns, current_fsuid()); + fsgid = from_kgid(&init_user_ns, current_fsgid()); gossip_debug(GOSSIP_XATTR_DEBUG, "getxattr on inode %pU, name %s " @@ -97,15 +97,14 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *prefix, goto out_unlock; new_op->upcall.req.getxattr.refn = orangefs_inode->refn; - ret = snprintf((char *)new_op->upcall.req.getxattr.key, - ORANGEFS_MAX_XATTR_NAMELEN, "%s%s", prefix, name); + strcpy(new_op->upcall.req.getxattr.key, name); /* * NOTE: Although keys are meant to be NULL terminated textual * strings, I am going to explicitly pass the length just in case * we change this later on... */ - new_op->upcall.req.getxattr.key_sz = ret + 1; + new_op->upcall.req.getxattr.key_sz = strlen(name) + 1; ret = service_operation(new_op, "orangefs_inode_getxattr", get_interruptible_flag(inode)); @@ -163,10 +162,8 @@ out_unlock: return ret; } -static int orangefs_inode_removexattr(struct inode *inode, - const char *prefix, - const char *name, - int flags) +static int orangefs_inode_removexattr(struct inode *inode, const char *name, + int flags) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_kernel_op_s *new_op = NULL; @@ -183,12 +180,8 @@ static int orangefs_inode_removexattr(struct inode *inode, * textual strings, I am going to explicitly pass the * length just in case we change this later on... */ - ret = snprintf((char *)new_op->upcall.req.removexattr.key, - ORANGEFS_MAX_XATTR_NAMELEN, - "%s%s", - (prefix ? prefix : ""), - name); - new_op->upcall.req.removexattr.key_sz = ret + 1; + strcpy(new_op->upcall.req.removexattr.key, name); + new_op->upcall.req.removexattr.key_sz = strlen(name) + 1; gossip_debug(GOSSIP_XATTR_DEBUG, "orangefs_inode_removexattr: key %s, key_sz %d\n", @@ -223,8 +216,8 @@ out_unlock: * Returns a -ve number on error and 0 on success. Key is text, but value * can be binary! */ -int orangefs_inode_setxattr(struct inode *inode, const char *prefix, - const char *name, const void *value, size_t size, int flags) +int orangefs_inode_setxattr(struct inode *inode, const char *name, + const void *value, size_t size, int flags) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_kernel_op_s *new_op; @@ -232,8 +225,8 @@ int orangefs_inode_setxattr(struct inode *inode, const char *prefix, int ret = -ENOMEM; gossip_debug(GOSSIP_XATTR_DEBUG, - "%s: prefix %s, name %s, buffer_size %zd\n", - __func__, prefix, name, size); + "%s: name %s, buffer_size %zd\n", + __func__, name, size); if (size >= ORANGEFS_MAX_XATTR_VALUELEN || flags < 0) { @@ -245,29 +238,19 @@ int orangefs_inode_setxattr(struct inode *inode, const char *prefix, internal_flag = convert_to_internal_xattr_flags(flags); - if (prefix) { - if (strlen(name) + strlen(prefix) >= ORANGEFS_MAX_XATTR_NAMELEN) { - gossip_err - ("orangefs_inode_setxattr: bogus key size (%d)\n", - (int)(strlen(name) + strlen(prefix))); - return -EINVAL; - } - } else { - if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) { - gossip_err - ("orangefs_inode_setxattr: bogus key size (%d)\n", - (int)(strlen(name))); - return -EINVAL; - } + if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) { + gossip_err + ("orangefs_inode_setxattr: bogus key size (%d)\n", + (int)(strlen(name))); + return -EINVAL; } /* This is equivalent to a removexattr */ if (size == 0 && value == NULL) { gossip_debug(GOSSIP_XATTR_DEBUG, - "removing xattr (%s%s)\n", - prefix, + "removing xattr (%s)\n", name); - return orangefs_inode_removexattr(inode, prefix, name, flags); + return orangefs_inode_removexattr(inode, name, flags); } gossip_debug(GOSSIP_XATTR_DEBUG, @@ -288,11 +271,8 @@ int orangefs_inode_setxattr(struct inode *inode, const char *prefix, * strings, I am going to explicitly pass the length just in * case we change this later on... */ - ret = snprintf((char *)new_op->upcall.req.setxattr.keyval.key, - ORANGEFS_MAX_XATTR_NAMELEN, - "%s%s", - prefix, name); - new_op->upcall.req.setxattr.keyval.key_sz = ret + 1; + strcpy(new_op->upcall.req.setxattr.keyval.key, name); + new_op->upcall.req.setxattr.keyval.key_sz = strlen(name) + 1; memcpy(new_op->upcall.req.setxattr.keyval.val, value, size); new_op->upcall.req.setxattr.keyval.val_sz = size; @@ -455,12 +435,7 @@ static int orangefs_xattr_set_default(const struct xattr_handler *handler, size_t size, int flags) { - return orangefs_inode_setxattr(inode, - ORANGEFS_XATTR_NAME_DEFAULT_PREFIX, - name, - buffer, - size, - flags); + return orangefs_inode_setxattr(inode, name, buffer, size, flags); } static int orangefs_xattr_get_default(const struct xattr_handler *handler, @@ -470,57 +445,12 @@ static int orangefs_xattr_get_default(const struct xattr_handler *handler, void *buffer, size_t size) { - return orangefs_inode_getxattr(inode, - ORANGEFS_XATTR_NAME_DEFAULT_PREFIX, - name, - buffer, - size); - -} + return orangefs_inode_getxattr(inode, name, buffer, size); -static int orangefs_xattr_set_trusted(const struct xattr_handler *handler, - struct dentry *unused, - struct inode *inode, - const char *name, - const void *buffer, - size_t size, - int flags) -{ - return orangefs_inode_setxattr(inode, - ORANGEFS_XATTR_NAME_TRUSTED_PREFIX, - name, - buffer, - size, - flags); } -static int orangefs_xattr_get_trusted(const struct xattr_handler *handler, - struct dentry *unused, - struct inode *inode, - const char *name, - void *buffer, - size_t size) -{ - return orangefs_inode_getxattr(inode, - ORANGEFS_XATTR_NAME_TRUSTED_PREFIX, - name, - buffer, - size); -} - -static struct xattr_handler orangefs_xattr_trusted_handler = { - .prefix = ORANGEFS_XATTR_NAME_TRUSTED_PREFIX, - .get = orangefs_xattr_get_trusted, - .set = orangefs_xattr_set_trusted, -}; - static struct xattr_handler orangefs_xattr_default_handler = { - /* - * NOTE: this is set to be the empty string. - * so that all un-prefixed xattrs keys get caught - * here! - */ - .prefix = ORANGEFS_XATTR_NAME_DEFAULT_PREFIX, + .prefix = "", /* match any name => handlers called with full name */ .get = orangefs_xattr_get_default, .set = orangefs_xattr_set_default, }; @@ -528,7 +458,6 @@ static struct xattr_handler orangefs_xattr_default_handler = { const struct xattr_handler *orangefs_xattr_handlers[] = { &posix_acl_access_xattr_handler, &posix_acl_default_xattr_handler, - &orangefs_xattr_trusted_handler, &orangefs_xattr_default_handler, NULL }; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 22f0253a3567..5c9d2d80ff70 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -405,12 +405,21 @@ static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev, err = ovl_create_upper(dentry, inode, &stat, link, hardlink); } else { const struct cred *old_cred; + struct cred *override_cred; old_cred = ovl_override_creds(dentry->d_sb); - err = ovl_create_over_whiteout(dentry, inode, &stat, link, - hardlink); + err = -ENOMEM; + override_cred = prepare_creds(); + if (override_cred) { + override_cred->fsuid = old_cred->fsuid; + override_cred->fsgid = old_cred->fsgid; + put_cred(override_creds(override_cred)); + put_cred(override_cred); + err = ovl_create_over_whiteout(dentry, inode, &stat, + link, hardlink); + } revert_creds(old_cred); } @@ -496,6 +505,7 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir) struct dentry *upper; struct dentry *opaquedir = NULL; int err; + int flags = 0; if (WARN_ON(!workdir)) return -EROFS; @@ -525,46 +535,39 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir) if (err) goto out_dput; - whiteout = ovl_whiteout(workdir, dentry); - err = PTR_ERR(whiteout); - if (IS_ERR(whiteout)) + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) goto out_unlock; - upper = ovl_dentry_upper(dentry); - if (!upper) { - upper = lookup_one_len(dentry->d_name.name, upperdir, - dentry->d_name.len); - err = PTR_ERR(upper); - if (IS_ERR(upper)) - goto kill_whiteout; - - err = ovl_do_rename(wdir, whiteout, udir, upper, 0); - dput(upper); - if (err) - goto kill_whiteout; - } else { - int flags = 0; + err = -ESTALE; + if ((opaquedir && upper != opaquedir) || + (!opaquedir && ovl_dentry_upper(dentry) && + upper != ovl_dentry_upper(dentry))) { + goto out_dput_upper; + } - if (opaquedir) - upper = opaquedir; - err = -ESTALE; - if (upper->d_parent != upperdir) - goto kill_whiteout; + whiteout = ovl_whiteout(workdir, dentry); + err = PTR_ERR(whiteout); + if (IS_ERR(whiteout)) + goto out_dput_upper; - if (is_dir) - flags |= RENAME_EXCHANGE; + if (d_is_dir(upper)) + flags = RENAME_EXCHANGE; - err = ovl_do_rename(wdir, whiteout, udir, upper, flags); - if (err) - goto kill_whiteout; + err = ovl_do_rename(wdir, whiteout, udir, upper, flags); + if (err) + goto kill_whiteout; + if (flags) + ovl_cleanup(wdir, upper); - if (is_dir) - ovl_cleanup(wdir, upper); - } ovl_dentry_version_inc(dentry->d_parent); out_d_drop: d_drop(dentry); dput(whiteout); +out_dput_upper: + dput(upper); out_unlock: unlock_rename(workdir, upperdir); out_dput: diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 0ed7c4012437..d1cdc60dd68f 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -59,16 +59,40 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) if (err) goto out; + if (attr->ia_valid & ATTR_SIZE) { + struct inode *realinode = d_inode(ovl_dentry_real(dentry)); + + err = -ETXTBSY; + if (atomic_read(&realinode->i_writecount) < 0) + goto out_drop_write; + } + err = ovl_copy_up(dentry); if (!err) { + struct inode *winode = NULL; + upperdentry = ovl_dentry_upper(dentry); + if (attr->ia_valid & ATTR_SIZE) { + winode = d_inode(upperdentry); + err = get_write_access(winode); + if (err) + goto out_drop_write; + } + + if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) + attr->ia_valid &= ~ATTR_MODE; + inode_lock(upperdentry->d_inode); err = notify_change(upperdentry, attr, NULL); if (!err) ovl_copyattr(upperdentry->d_inode, dentry->d_inode); inode_unlock(upperdentry->d_inode); + + if (winode) + put_write_access(winode); } +out_drop_write: ovl_drop_write(dentry); out: return err; @@ -121,16 +145,18 @@ int ovl_permission(struct inode *inode, int mask) err = vfs_getattr(&realpath, &stat); if (err) - return err; + goto out_dput; + err = -ESTALE; if ((stat.mode ^ inode->i_mode) & S_IFMT) - return -ESTALE; + goto out_dput; inode->i_mode = stat.mode; inode->i_uid = stat.uid; inode->i_gid = stat.gid; - return generic_permission(inode, mask); + err = generic_permission(inode, mask); + goto out_dput; } /* Careful in RCU walk mode */ @@ -238,41 +264,27 @@ out: return err; } -static bool ovl_need_xattr_filter(struct dentry *dentry, - enum ovl_path_type type) -{ - if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER) - return S_ISDIR(dentry->d_inode->i_mode); - else - return false; -} - ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { - struct path realpath; - enum ovl_path_type type = ovl_path_real(dentry, &realpath); + struct dentry *realdentry = ovl_dentry_real(dentry); - if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name)) + if (ovl_is_private_xattr(name)) return -ENODATA; - return vfs_getxattr(realpath.dentry, name, value, size); + return vfs_getxattr(realdentry, name, value, size); } ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) { - struct path realpath; - enum ovl_path_type type = ovl_path_real(dentry, &realpath); + struct dentry *realdentry = ovl_dentry_real(dentry); ssize_t res; int off; - res = vfs_listxattr(realpath.dentry, list, size); + res = vfs_listxattr(realdentry, list, size); if (res <= 0 || size == 0) return res; - if (!ovl_need_xattr_filter(dentry, type)) - return res; - /* filter out private xattrs */ for (off = 0; off < res;) { char *s = list + off; @@ -302,7 +314,7 @@ int ovl_removexattr(struct dentry *dentry, const char *name) goto out; err = -ENODATA; - if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name)) + if (ovl_is_private_xattr(name)) goto out_drop_write; if (!OVL_TYPE_UPPER(type)) { @@ -401,12 +413,11 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, if (!inode) return NULL; - mode &= S_IFMT; - inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_flags |= S_NOATIME | S_NOCMTIME; + mode &= S_IFMT; switch (mode) { case S_IFDIR: inode->i_private = oe; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 4bd9b5ba8f42..cfbca53590d0 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -187,6 +187,7 @@ static inline void ovl_copyattr(struct inode *from, struct inode *to) { to->i_uid = from->i_uid; to->i_gid = from->i_gid; + to->i_mode = from->i_mode; } /* dir.c */ diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index ce02f46029da..9a7693d5f8ff 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1082,11 +1082,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (err < 0) goto out_put_workdir; - if (!err) { - pr_err("overlayfs: upper fs needs to support d_type.\n"); - err = -EINVAL; - goto out_put_workdir; - } + /* + * We allowed this configuration and don't want to + * break users over kernel upgrade. So warn instead + * of erroring out. + */ + if (!err) + pr_warn("overlayfs: upper fs needs to support d_type.\n"); } } diff --git a/fs/pipe.c b/fs/pipe.c index 0d3f5165cb0b..4b32928f5426 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -21,6 +21,7 @@ #include <linux/audit.h> #include <linux/syscalls.h> #include <linux/fcntl.h> +#include <linux/memcontrol.h> #include <asm/uaccess.h> #include <asm/ioctls.h> @@ -137,6 +138,22 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, put_page(page); } +static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + + if (page_count(page) == 1) { + if (memcg_kmem_enabled()) { + memcg_kmem_uncharge(page, 0); + __ClearPageKmemcg(page); + } + __SetPageLocked(page); + return 0; + } + return 1; +} + /** * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer * @pipe: the pipe that the buffer belongs to @@ -219,7 +236,7 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, - .steal = generic_pipe_buf_steal, + .steal = anon_pipe_buf_steal, .get = generic_pipe_buf_get, }; @@ -227,7 +244,7 @@ static const struct pipe_buf_operations packet_pipe_buf_ops = { .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, - .steal = generic_pipe_buf_steal, + .steal = anon_pipe_buf_steal, .get = generic_pipe_buf_get, }; @@ -405,7 +422,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) int copied; if (!page) { - page = alloc_page(GFP_HIGHUSER); + page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); if (unlikely(!page)) { ret = ret ? : -ENOMEM; break; @@ -611,7 +628,7 @@ struct pipe_inode_info *alloc_pipe_info(void) { struct pipe_inode_info *pipe; - pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); + pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT); if (pipe) { unsigned long pipe_bufs = PIPE_DEF_BUFFERS; struct user_struct *user = get_current_user(); @@ -619,7 +636,9 @@ struct pipe_inode_info *alloc_pipe_info(void) if (!too_many_pipe_buffers_hard(user)) { if (too_many_pipe_buffers_soft(user)) pipe_bufs = 1; - pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * pipe_bufs, GFP_KERNEL); + pipe->bufs = kcalloc(pipe_bufs, + sizeof(struct pipe_buffer), + GFP_KERNEL_ACCOUNT); } if (pipe->bufs) { @@ -1010,7 +1029,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) if (nr_pages < pipe->nrbufs) return -EBUSY; - bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN); + bufs = kcalloc(nr_pages, sizeof(*bufs), + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (unlikely(!bufs)) return -ENOMEM; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 8a4a266beff3..edc452c2a563 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -820,39 +820,43 @@ posix_acl_xattr_get(const struct xattr_handler *handler, return error; } -static int -posix_acl_xattr_set(const struct xattr_handler *handler, - struct dentry *unused, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) +int +set_posix_acl(struct inode *inode, int type, struct posix_acl *acl) { - struct posix_acl *acl = NULL; - int ret; - if (!IS_POSIXACL(inode)) return -EOPNOTSUPP; if (!inode->i_op->set_acl) return -EOPNOTSUPP; - if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) - return value ? -EACCES : 0; + if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; if (!inode_owner_or_capable(inode)) return -EPERM; + if (acl) { + int ret = posix_acl_valid(acl); + if (ret) + return ret; + } + return inode->i_op->set_acl(inode, acl, type); +} +EXPORT_SYMBOL(set_posix_acl); + +static int +posix_acl_xattr_set(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + struct posix_acl *acl = NULL; + int ret; + if (value) { acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); - - if (acl) { - ret = posix_acl_valid(acl); - if (ret) - goto out; - } } - - ret = inode->i_op->set_acl(inode, acl, handler->flags); -out: + ret = set_posix_acl(inode, handler->flags, acl); posix_acl_release(acl); return ret; } diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 83720460c5bc..cf301a9ef512 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -105,6 +105,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE "AnonHugePages: %8lu kB\n" + "ShmemHugePages: %8lu kB\n" + "ShmemPmdMapped: %8lu kB\n" #endif #ifdef CONFIG_CMA "CmaTotal: %8lu kB\n" @@ -162,8 +164,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE - , K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * - HPAGE_PMD_NR) + , K(global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR) + , K(global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR) + , K(global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR) #endif #ifdef CONFIG_CMA , K(totalcma_pages) diff --git a/fs/proc/root.c b/fs/proc/root.c index 55bc7d6c8aac..06702783bf40 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -121,6 +121,13 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, if (IS_ERR(sb)) return ERR_CAST(sb); + /* + * procfs isn't actually a stacking filesystem; however, there is + * too much magic going on inside it to permit stacking things on + * top of it + */ + sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; + if (!proc_parse_options(options, ns)) { deactivate_locked_super(sb); return ERR_PTR(-EINVAL); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4648c7f63ae2..187d84ef9de9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -448,6 +448,7 @@ struct mem_size_stats { unsigned long referenced; unsigned long anonymous; unsigned long anonymous_thp; + unsigned long shmem_thp; unsigned long swap; unsigned long shared_hugetlb; unsigned long private_hugetlb; @@ -576,7 +577,12 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); if (IS_ERR_OR_NULL(page)) return; - mss->anonymous_thp += HPAGE_PMD_SIZE; + if (PageAnon(page)) + mss->anonymous_thp += HPAGE_PMD_SIZE; + else if (PageSwapBacked(page)) + mss->shmem_thp += HPAGE_PMD_SIZE; + else + VM_BUG_ON_PAGE(1, page); smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd)); } #else @@ -770,6 +776,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) "Referenced: %8lu kB\n" "Anonymous: %8lu kB\n" "AnonHugePages: %8lu kB\n" + "ShmemPmdMapped: %8lu kB\n" "Shared_Hugetlb: %8lu kB\n" "Private_Hugetlb: %7lu kB\n" "Swap: %8lu kB\n" @@ -787,6 +794,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) mss.referenced >> 10, mss.anonymous >> 10, mss.anonymous_thp >> 10, + mss.shmem_thp >> 10, mss.shared_hugetlb >> 10, mss.private_hugetlb >> 10, mss.swap >> 10, diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 360ae43f590c..be40813eff52 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -1,8 +1,6 @@ config PSTORE tristate "Persistent store support" default n - select ZLIB_DEFLATE - select ZLIB_INFLATE help This option enables generic access to platform level persistent storage via "pstore" filesystem that can @@ -14,6 +12,35 @@ config PSTORE If you don't have a platform persistent store driver, say N. +choice + prompt "Choose compression algorithm" + depends on PSTORE + default PSTORE_ZLIB_COMPRESS + help + This option chooses compression algorithm. + +config PSTORE_ZLIB_COMPRESS + bool "ZLIB" + select ZLIB_DEFLATE + select ZLIB_INFLATE + help + This option enables ZLIB compression algorithm support. + +config PSTORE_LZO_COMPRESS + bool "LZO" + select LZO_COMPRESS + select LZO_DECOMPRESS + help + This option enables LZO compression algorithm support. + +config PSTORE_LZ4_COMPRESS + bool "LZ4" + select LZ4_COMPRESS + select LZ4_DECOMPRESS + help + This option enables LZ4 compression algorithm support. +endchoice + config PSTORE_CONSOLE bool "Log kernel console messages" depends on PSTORE diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 45d6110744cb..ec9ddef5ae75 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -178,7 +178,6 @@ static loff_t pstore_file_llseek(struct file *file, loff_t off, int whence) } static const struct file_operations pstore_file_operations = { - .owner = THIS_MODULE, .open = pstore_file_open, .read = pstore_file_read, .llseek = pstore_file_llseek, diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 588461bb2dd4..16ecca5b72d8 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -28,7 +28,15 @@ #include <linux/console.h> #include <linux/module.h> #include <linux/pstore.h> +#ifdef CONFIG_PSTORE_ZLIB_COMPRESS #include <linux/zlib.h> +#endif +#ifdef CONFIG_PSTORE_LZO_COMPRESS +#include <linux/lzo.h> +#endif +#ifdef CONFIG_PSTORE_LZ4_COMPRESS +#include <linux/lz4.h> +#endif #include <linux/string.h> #include <linux/timer.h> #include <linux/slab.h> @@ -69,10 +77,23 @@ struct pstore_info *psinfo; static char *backend; /* Compression parameters */ +#ifdef CONFIG_PSTORE_ZLIB_COMPRESS #define COMPR_LEVEL 6 #define WINDOW_BITS 12 #define MEM_LEVEL 4 static struct z_stream_s stream; +#else +static unsigned char *workspace; +#endif + +struct pstore_zbackend { + int (*compress)(const void *in, void *out, size_t inlen, size_t outlen); + int (*decompress)(void *in, void *out, size_t inlen, size_t outlen); + void (*allocate)(void); + void (*free)(void); + + const char *name; +}; static char *big_oops_buf; static size_t big_oops_buf_sz; @@ -129,9 +150,9 @@ bool pstore_cannot_block_path(enum kmsg_dump_reason reason) } EXPORT_SYMBOL_GPL(pstore_cannot_block_path); +#ifdef CONFIG_PSTORE_ZLIB_COMPRESS /* Derived from logfs_compress() */ -static int pstore_compress(const void *in, void *out, size_t inlen, - size_t outlen) +static int compress_zlib(const void *in, void *out, size_t inlen, size_t outlen) { int err, ret; @@ -165,7 +186,7 @@ error: } /* Derived from logfs_uncompress */ -static int pstore_decompress(void *in, void *out, size_t inlen, size_t outlen) +static int decompress_zlib(void *in, void *out, size_t inlen, size_t outlen) { int err, ret; @@ -194,7 +215,7 @@ error: return ret; } -static void allocate_buf_for_compression(void) +static void allocate_zlib(void) { size_t size; size_t cmpr; @@ -237,12 +258,190 @@ static void allocate_buf_for_compression(void) } -static void free_buf_for_compression(void) +static void free_zlib(void) { kfree(stream.workspace); stream.workspace = NULL; kfree(big_oops_buf); big_oops_buf = NULL; + big_oops_buf_sz = 0; +} + +static struct pstore_zbackend backend_zlib = { + .compress = compress_zlib, + .decompress = decompress_zlib, + .allocate = allocate_zlib, + .free = free_zlib, + .name = "zlib", +}; +#endif + +#ifdef CONFIG_PSTORE_LZO_COMPRESS +static int compress_lzo(const void *in, void *out, size_t inlen, size_t outlen) +{ + int ret; + + ret = lzo1x_1_compress(in, inlen, out, &outlen, workspace); + if (ret != LZO_E_OK) { + pr_err("lzo_compress error, ret = %d!\n", ret); + return -EIO; + } + + return outlen; +} + +static int decompress_lzo(void *in, void *out, size_t inlen, size_t outlen) +{ + int ret; + + ret = lzo1x_decompress_safe(in, inlen, out, &outlen); + if (ret != LZO_E_OK) { + pr_err("lzo_decompress error, ret = %d!\n", ret); + return -EIO; + } + + return outlen; +} + +static void allocate_lzo(void) +{ + big_oops_buf_sz = lzo1x_worst_compress(psinfo->bufsize); + big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL); + if (big_oops_buf) { + workspace = kmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); + if (!workspace) { + pr_err("No memory for compression workspace; skipping compression\n"); + kfree(big_oops_buf); + big_oops_buf = NULL; + } + } else { + pr_err("No memory for uncompressed data; skipping compression\n"); + workspace = NULL; + } +} + +static void free_lzo(void) +{ + kfree(workspace); + kfree(big_oops_buf); + big_oops_buf = NULL; + big_oops_buf_sz = 0; +} + +static struct pstore_zbackend backend_lzo = { + .compress = compress_lzo, + .decompress = decompress_lzo, + .allocate = allocate_lzo, + .free = free_lzo, + .name = "lzo", +}; +#endif + +#ifdef CONFIG_PSTORE_LZ4_COMPRESS +static int compress_lz4(const void *in, void *out, size_t inlen, size_t outlen) +{ + int ret; + + ret = lz4_compress(in, inlen, out, &outlen, workspace); + if (ret) { + pr_err("lz4_compress error, ret = %d!\n", ret); + return -EIO; + } + + return outlen; +} + +static int decompress_lz4(void *in, void *out, size_t inlen, size_t outlen) +{ + int ret; + + ret = lz4_decompress_unknownoutputsize(in, inlen, out, &outlen); + if (ret) { + pr_err("lz4_decompress error, ret = %d!\n", ret); + return -EIO; + } + + return outlen; +} + +static void allocate_lz4(void) +{ + big_oops_buf_sz = lz4_compressbound(psinfo->bufsize); + big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL); + if (big_oops_buf) { + workspace = kmalloc(LZ4_MEM_COMPRESS, GFP_KERNEL); + if (!workspace) { + pr_err("No memory for compression workspace; skipping compression\n"); + kfree(big_oops_buf); + big_oops_buf = NULL; + } + } else { + pr_err("No memory for uncompressed data; skipping compression\n"); + workspace = NULL; + } +} + +static void free_lz4(void) +{ + kfree(workspace); + kfree(big_oops_buf); + big_oops_buf = NULL; + big_oops_buf_sz = 0; +} + +static struct pstore_zbackend backend_lz4 = { + .compress = compress_lz4, + .decompress = decompress_lz4, + .allocate = allocate_lz4, + .free = free_lz4, + .name = "lz4", +}; +#endif + +static struct pstore_zbackend *zbackend = +#if defined(CONFIG_PSTORE_ZLIB_COMPRESS) + &backend_zlib; +#elif defined(CONFIG_PSTORE_LZO_COMPRESS) + &backend_lzo; +#elif defined(CONFIG_PSTORE_LZ4_COMPRESS) + &backend_lz4; +#else + NULL; +#endif + +static int pstore_compress(const void *in, void *out, + size_t inlen, size_t outlen) +{ + if (zbackend) + return zbackend->compress(in, out, inlen, outlen); + else + return -EIO; +} + +static int pstore_decompress(void *in, void *out, size_t inlen, size_t outlen) +{ + if (zbackend) + return zbackend->decompress(in, out, inlen, outlen); + else + return -EIO; +} + +static void allocate_buf_for_compression(void) +{ + if (zbackend) { + pr_info("using %s compression\n", zbackend->name); + zbackend->allocate(); + } else { + pr_err("allocate compression buffer error!\n"); + } +} + +static void free_buf_for_compression(void) +{ + if (zbackend) + zbackend->free(); + else + pr_err("free compression buffer error!\n"); } /* @@ -284,7 +483,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, u64 id; unsigned int part = 1; unsigned long flags = 0; - int is_locked = 0; + int is_locked; int ret; why = get_reason_str(reason); @@ -295,8 +494,10 @@ static void pstore_dump(struct kmsg_dumper *dumper, pr_err("pstore dump routine blocked in %s path, may corrupt error record\n" , in_nmi() ? "NMI" : why); } - } else + } else { spin_lock_irqsave(&psinfo->buf_lock, flags); + is_locked = 1; + } oopscount++; while (total < kmsg_bytes) { char *dst; @@ -304,19 +505,25 @@ static void pstore_dump(struct kmsg_dumper *dumper, int hsize; int zipped_len = -1; size_t len; - bool compressed; + bool compressed = false; size_t total_len; if (big_oops_buf && is_locked) { dst = big_oops_buf; - hsize = sprintf(dst, "%s#%d Part%u\n", why, - oopscount, part); - size = big_oops_buf_sz - hsize; + size = big_oops_buf_sz; + } else { + dst = psinfo->buf; + size = psinfo->bufsize; + } - if (!kmsg_dump_get_buffer(dumper, true, dst + hsize, - size, &len)) - break; + hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount, part); + size -= hsize; + + if (!kmsg_dump_get_buffer(dumper, true, dst + hsize, + size, &len)) + break; + if (big_oops_buf && is_locked) { zipped_len = pstore_compress(dst, psinfo->buf, hsize + len, psinfo->bufsize); @@ -324,21 +531,9 @@ static void pstore_dump(struct kmsg_dumper *dumper, compressed = true; total_len = zipped_len; } else { - compressed = false; total_len = copy_kmsg_to_buffer(hsize, len); } } else { - dst = psinfo->buf; - hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount, - part); - size = psinfo->bufsize - hsize; - dst += hsize; - - if (!kmsg_dump_get_buffer(dumper, true, dst, - size, &len)) - break; - - compressed = false; total_len = hsize + len; } @@ -350,10 +545,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, total += total_len; part++; } - if (pstore_cannot_block_path(reason)) { - if (is_locked) - spin_unlock_irqrestore(&psinfo->buf_lock, flags); - } else + if (is_locked) spin_unlock_irqrestore(&psinfo->buf_lock, flags); } @@ -497,9 +689,11 @@ EXPORT_SYMBOL_GPL(pstore_register); void pstore_unregister(struct pstore_info *psi) { - pstore_unregister_pmsg(); - pstore_unregister_ftrace(); - pstore_unregister_console(); + if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) { + pstore_unregister_pmsg(); + pstore_unregister_ftrace(); + pstore_unregister_console(); + } pstore_unregister_kmsg(); free_buf_for_compression(); @@ -527,6 +721,7 @@ void pstore_get_records(int quiet) int failed = 0, rc; bool compressed; int unzipped_len = -1; + ssize_t ecc_notice_size = 0; if (!psi) return; @@ -536,7 +731,7 @@ void pstore_get_records(int quiet) goto out; while ((size = psi->read(&id, &type, &count, &time, &buf, &compressed, - psi)) > 0) { + &ecc_notice_size, psi)) > 0) { if (compressed && (type == PSTORE_TYPE_DMESG)) { if (big_oops_buf) unzipped_len = pstore_decompress(buf, @@ -544,6 +739,9 @@ void pstore_get_records(int quiet) big_oops_buf_sz); if (unzipped_len > 0) { + if (ecc_notice_size) + memcpy(big_oops_buf + unzipped_len, + buf + size, ecc_notice_size); kfree(buf); buf = big_oops_buf; size = unzipped_len; @@ -555,7 +753,8 @@ void pstore_get_records(int quiet) } } rc = pstore_mkfile(type, psi->name, id, count, buf, - compressed, (size_t)size, time, psi); + compressed, size + ecc_notice_size, + time, psi); if (unzipped_len < 0) { /* Free buffer other than big oops */ kfree(buf); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index bd9812e83461..47516a794011 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -34,6 +34,8 @@ #include <linux/slab.h> #include <linux/compiler.h> #include <linux/pstore_ram.h> +#include <linux/of.h> +#include <linux/of_address.h> #define RAMOOPS_KERNMSG_HDR "====" #define MIN_MEM_SIZE 4096UL @@ -181,10 +183,10 @@ static bool prz_ok(struct persistent_ram_zone *prz) static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, int *count, struct timespec *time, char **buf, bool *compressed, + ssize_t *ecc_notice_size, struct pstore_info *psi) { ssize_t size; - ssize_t ecc_notice_size; struct ramoops_context *cxt = psi->data; struct persistent_ram_zone *prz = NULL; int header_length = 0; @@ -229,16 +231,16 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, size = persistent_ram_old_size(prz) - header_length; /* ECC correction notice */ - ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); + *ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); - *buf = kmalloc(size + ecc_notice_size + 1, GFP_KERNEL); + *buf = kmalloc(size + *ecc_notice_size + 1, GFP_KERNEL); if (*buf == NULL) return -ENOMEM; memcpy(*buf, (char *)persistent_ram_old(prz) + header_length, size); - persistent_ram_ecc_string(prz, *buf + size, ecc_notice_size + 1); + persistent_ram_ecc_string(prz, *buf + size, *ecc_notice_size + 1); - return size + ecc_notice_size; + return size; } static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz, @@ -458,15 +460,98 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, return 0; } +static int ramoops_parse_dt_size(struct platform_device *pdev, + const char *propname, u32 *value) +{ + u32 val32 = 0; + int ret; + + ret = of_property_read_u32(pdev->dev.of_node, propname, &val32); + if (ret < 0 && ret != -EINVAL) { + dev_err(&pdev->dev, "failed to parse property %s: %d\n", + propname, ret); + return ret; + } + + if (val32 > INT_MAX) { + dev_err(&pdev->dev, "%s %u > INT_MAX\n", propname, val32); + return -EOVERFLOW; + } + + *value = val32; + return 0; +} + +static int ramoops_parse_dt(struct platform_device *pdev, + struct ramoops_platform_data *pdata) +{ + struct device_node *of_node = pdev->dev.of_node; + struct device_node *mem_region; + struct resource res; + u32 value; + int ret; + + dev_dbg(&pdev->dev, "using Device Tree\n"); + + mem_region = of_parse_phandle(of_node, "memory-region", 0); + if (!mem_region) { + dev_err(&pdev->dev, "no memory-region phandle\n"); + return -ENODEV; + } + + ret = of_address_to_resource(mem_region, 0, &res); + of_node_put(mem_region); + if (ret) { + dev_err(&pdev->dev, + "failed to translate memory-region to resource: %d\n", + ret); + return ret; + } + + pdata->mem_size = resource_size(&res); + pdata->mem_address = res.start; + pdata->mem_type = of_property_read_bool(of_node, "unbuffered"); + pdata->dump_oops = !of_property_read_bool(of_node, "no-dump-oops"); + +#define parse_size(name, field) { \ + ret = ramoops_parse_dt_size(pdev, name, &value); \ + if (ret < 0) \ + return ret; \ + field = value; \ + } + + parse_size("record-size", pdata->record_size); + parse_size("console-size", pdata->console_size); + parse_size("ftrace-size", pdata->ftrace_size); + parse_size("pmsg-size", pdata->pmsg_size); + parse_size("ecc-size", pdata->ecc_info.ecc_size); + +#undef parse_size + + return 0; +} + static int ramoops_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; - struct ramoops_platform_data *pdata = pdev->dev.platform_data; + struct ramoops_platform_data *pdata = dev->platform_data; struct ramoops_context *cxt = &oops_cxt; size_t dump_mem_sz; phys_addr_t paddr; int err = -EINVAL; + if (dev_of_node(dev) && !pdata) { + pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); + if (!pdata) { + err = -ENOMEM; + goto fail_out; + } + + err = ramoops_parse_dt(pdev, pdata); + if (err < 0) + goto fail_out; + } + /* Only a single ramoops area allowed at a time, so fail extra * probes. */ @@ -596,11 +681,17 @@ static int ramoops_remove(struct platform_device *pdev) return 0; } +static const struct of_device_id dt_match[] = { + { .compatible = "ramoops" }, + {} +}; + static struct platform_driver ramoops_driver = { .probe = ramoops_probe, .remove = ramoops_remove, .driver = { - .name = "ramoops", + .name = "ramoops", + .of_match_table = dt_match, }, }; diff --git a/fs/read_write.c b/fs/read_write.c index 933b53a375b4..66215a7b17cf 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1168,6 +1168,15 @@ COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, return do_compat_preadv64(fd, vec, vlen, pos, 0); } +#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 +COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, loff_t, pos, int, flags) +{ + return do_compat_preadv64(fd, vec, vlen, pos, flags); +} +#endif + COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, const struct compat_iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, @@ -1265,6 +1274,15 @@ COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, return do_compat_pwritev64(fd, vec, vlen, pos, 0); } +#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 +COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, loff_t, pos, int, flags) +{ + return do_compat_pwritev64(fd, vec, vlen, pos, flags); +} +#endif + COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, const struct compat_iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, int, flags) diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 825455d3e4ba..c2c59f9ff04b 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2668,7 +2668,7 @@ static int reiserfs_write_full_page(struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh(WRITE, bh); + submit_bh(REQ_OP_WRITE, 0, bh); nr++; } put_bh(bh); @@ -2728,7 +2728,7 @@ fail: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh(WRITE, bh); + submit_bh(REQ_OP_WRITE, 0, bh); nr++; } put_bh(bh); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 2ace90e981f0..bc2dde2423c2 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -652,7 +652,7 @@ static void submit_logged_buffer(struct buffer_head *bh) BUG(); if (!buffer_uptodate(bh)) BUG(); - submit_bh(WRITE, bh); + submit_bh(REQ_OP_WRITE, 0, bh); } static void submit_ordered_buffer(struct buffer_head *bh) @@ -662,7 +662,7 @@ static void submit_ordered_buffer(struct buffer_head *bh) clear_buffer_dirty(bh); if (!buffer_uptodate(bh)) BUG(); - submit_bh(WRITE, bh); + submit_bh(REQ_OP_WRITE, 0, bh); } #define CHUNK_SIZE 32 @@ -870,7 +870,7 @@ loop_next: */ if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) { spin_unlock(lock); - ll_rw_block(WRITE, 1, &bh); + ll_rw_block(REQ_OP_WRITE, 0, 1, &bh); spin_lock(lock); } put_bh(bh); @@ -1057,7 +1057,7 @@ static int flush_commit_list(struct super_block *s, if (tbh) { if (buffer_dirty(tbh)) { depth = reiserfs_write_unlock_nested(s); - ll_rw_block(WRITE, 1, &tbh); + ll_rw_block(REQ_OP_WRITE, 0, 1, &tbh); reiserfs_write_lock_nested(s, depth); } put_bh(tbh) ; @@ -2244,7 +2244,7 @@ abort_replay: } } /* read in the log blocks, memcpy to the corresponding real block */ - ll_rw_block(READ, get_desc_trans_len(desc), log_blocks); + ll_rw_block(REQ_OP_READ, 0, get_desc_trans_len(desc), log_blocks); for (i = 0; i < get_desc_trans_len(desc); i++) { wait_on_buffer(log_blocks[i]); @@ -2269,7 +2269,7 @@ abort_replay: /* flush out the real blocks */ for (i = 0; i < get_desc_trans_len(desc); i++) { set_buffer_dirty(real_blocks[i]); - write_dirty_buffer(real_blocks[i], WRITE); + write_dirty_buffer(real_blocks[i], 0); } for (i = 0; i < get_desc_trans_len(desc); i++) { wait_on_buffer(real_blocks[i]); @@ -2346,7 +2346,7 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev, } else bhlist[j++] = bh; } - ll_rw_block(READ, j, bhlist); + ll_rw_block(REQ_OP_READ, 0, j, bhlist); for (i = 1; i < j; i++) brelse(bhlist[i]); bh = bhlist[0]; diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 5feacd689241..4032d1e87c8f 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -551,7 +551,7 @@ static int search_by_key_reada(struct super_block *s, if (!buffer_uptodate(bh[j])) { if (depth == -1) depth = reiserfs_write_unlock_nested(s); - ll_rw_block(READA, 1, bh + j); + ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, bh + j); } brelse(bh[j]); } @@ -660,7 +660,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, if (!buffer_uptodate(bh) && depth == -1) depth = reiserfs_write_unlock_nested(sb); - ll_rw_block(READ, 1, &bh); + ll_rw_block(REQ_OP_READ, 0, 1, &bh); wait_on_buffer(bh); if (depth != -1) diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index b8f2d1e8c645..7a4a85a6821e 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1393,7 +1393,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) unsigned long safe_mask = 0; unsigned int commit_max_age = (unsigned int)-1; struct reiserfs_journal *journal = SB_JOURNAL(s); - char *new_opts = kstrdup(arg, GFP_KERNEL); + char *new_opts; int err; char *qf_names[REISERFS_MAXQUOTAS]; unsigned int qfmt = 0; @@ -1401,6 +1401,10 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) int i; #endif + new_opts = kstrdup(arg, GFP_KERNEL); + if (arg && !new_opts) + return -ENOMEM; + sync_filesystem(s); reiserfs_write_lock(s); @@ -1546,7 +1550,8 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) } out_ok_unlocked: - replace_mount_options(s, new_opts); + if (new_opts) + replace_mount_options(s, new_opts); return 0; out_err_unlock: @@ -1661,7 +1666,7 @@ static int read_super_block(struct super_block *s, int offset) /* after journal replay, reread all bitmap and super blocks */ static int reread_meta_blocks(struct super_block *s) { - ll_rw_block(READ, 1, &SB_BUFFER_WITH_SB(s)); + ll_rw_block(REQ_OP_READ, 0, 1, &SB_BUFFER_WITH_SB(s)); wait_on_buffer(SB_BUFFER_WITH_SB(s)); if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { reiserfs_warning(s, "reiserfs-2504", "error reading the super"); diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 2c2618410d51..ce62a380314f 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -124,7 +124,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, goto block_release; bytes += msblk->devblksize; } - ll_rw_block(READ, b, bh); + ll_rw_block(REQ_OP_READ, 0, b, bh); } else { /* * Metadata block. @@ -156,7 +156,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, goto block_release; bytes += msblk->devblksize; } - ll_rw_block(READ, b - 1, bh + 1); + ll_rw_block(REQ_OP_READ, 0, b - 1, bh + 1); } for (i = 0; i < b; i++) { diff --git a/fs/super.c b/fs/super.c index d78b9847e6cb..5806ffd45563 100644 --- a/fs/super.c +++ b/fs/super.c @@ -206,6 +206,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) mutex_init(&s->s_sync_lock); INIT_LIST_HEAD(&s->s_inodes); spin_lock_init(&s->s_inode_list_lock); + INIT_LIST_HEAD(&s->s_inodes_wb); + spin_lock_init(&s->s_inode_wblist_lock); if (list_lru_init_memcg(&s->s_dentry_lru)) goto fail; diff --git a/fs/timerfd.c b/fs/timerfd.c index 053818dd6c18..9ae4abb4110b 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -390,6 +390,11 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) clockid != CLOCK_BOOTTIME_ALARM)) return -EINVAL; + if (!capable(CAP_WAKE_ALARM) && + (clockid == CLOCK_REALTIME_ALARM || + clockid == CLOCK_BOOTTIME_ALARM)) + return -EPERM; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; @@ -433,6 +438,11 @@ static int do_timerfd_settime(int ufd, int flags, return ret; ctx = f.file->private_data; + if (!capable(CAP_WAKE_ALARM) && isalarm(ctx)) { + fdput(f); + return -EPERM; + } + timerfd_setup_cancel(ctx, flags); /* diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 08316972ff93..7bbf420d1289 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -52,6 +52,7 @@ #include "ubifs.h" #include <linux/mount.h> #include <linux/slab.h> +#include <linux/migrate.h> static int read_block(struct inode *inode, void *addr, unsigned int block, struct ubifs_data_node *dn) @@ -1452,6 +1453,26 @@ static int ubifs_set_page_dirty(struct page *page) return ret; } +#ifdef CONFIG_MIGRATION +static int ubifs_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode) +{ + int rc; + + rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + if (rc != MIGRATEPAGE_SUCCESS) + return rc; + + if (PagePrivate(page)) { + ClearPagePrivate(page); + SetPagePrivate(newpage); + } + + migrate_page_copy(newpage, page); + return MIGRATEPAGE_SUCCESS; +} +#endif + static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) { /* @@ -1591,6 +1612,9 @@ const struct address_space_operations ubifs_file_address_operations = { .write_end = ubifs_write_end, .invalidatepage = ubifs_invalidatepage, .set_page_dirty = ubifs_set_page_dirty, +#ifdef CONFIG_MIGRATION + .migratepage = ubifs_migrate_page, +#endif .releasepage = ubifs_releasepage, }; diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 4c5593abc553..aaec13c95253 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -113,7 +113,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) brelse(tmp); } if (num) { - ll_rw_block(READA, num, bha); + ll_rw_block(REQ_OP_READ, REQ_RAHEAD, num, bha); for (i = 0; i < num; i++) brelse(bha[i]); } diff --git a/fs/udf/directory.c b/fs/udf/directory.c index c763fda257bf..988d5352bdb8 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -87,7 +87,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, brelse(tmp); } if (num) { - ll_rw_block(READA, num, bha); + ll_rw_block(REQ_OP_READ, REQ_RAHEAD, num, bha); for (i = 0; i < num; i++) brelse(bha[i]); } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index f323aff740ef..55aa587bbc38 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1199,7 +1199,7 @@ struct buffer_head *udf_bread(struct inode *inode, int block, if (buffer_uptodate(bh)) return bh; - ll_rw_block(READ, 1, &bh); + ll_rw_block(REQ_OP_READ, 0, 1, &bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) diff --git a/fs/udf/partition.c b/fs/udf/partition.c index 5f861ed287c3..888c364b2fe9 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -295,7 +295,8 @@ static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block, map = &UDF_SB(sb)->s_partmaps[partition]; /* map to sparable/physical partition desc */ phyblock = udf_get_pblock(sb, eloc.logicalBlockNum, - map->s_partition_num, ext_offset + offset); + map->s_type_specific.s_metadata.s_phys_partition_ref, + ext_offset + offset); } brelse(epos.bh); @@ -317,14 +318,18 @@ uint32_t udf_get_pblock_meta25(struct super_block *sb, uint32_t block, mdata = &map->s_type_specific.s_metadata; inode = mdata->s_metadata_fe ? : mdata->s_mirror_fe; - /* We shouldn't mount such media... */ - BUG_ON(!inode); + if (!inode) + return 0xFFFFFFFF; + retblk = udf_try_read_meta(inode, block, partition, offset); if (retblk == 0xFFFFFFFF && mdata->s_metadata_fe) { udf_warn(sb, "error reading from METADATA, trying to read from MIRROR\n"); if (!(mdata->s_flags & MF_MIRROR_FE_LOADED)) { mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb, - mdata->s_mirror_file_loc, map->s_partition_num); + mdata->s_mirror_file_loc, + mdata->s_phys_partition_ref); + if (IS_ERR(mdata->s_mirror_fe)) + mdata->s_mirror_fe = NULL; mdata->s_flags |= MF_MIRROR_FE_LOADED; } diff --git a/fs/udf/super.c b/fs/udf/super.c index 5e2c8c814e1b..4942549e7dc8 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -951,13 +951,13 @@ out2: } struct inode *udf_find_metadata_inode_efe(struct super_block *sb, - u32 meta_file_loc, u32 partition_num) + u32 meta_file_loc, u32 partition_ref) { struct kernel_lb_addr addr; struct inode *metadata_fe; addr.logicalBlockNum = meta_file_loc; - addr.partitionReferenceNum = partition_num; + addr.partitionReferenceNum = partition_ref; metadata_fe = udf_iget_special(sb, &addr); @@ -974,7 +974,8 @@ struct inode *udf_find_metadata_inode_efe(struct super_block *sb, return metadata_fe; } -static int udf_load_metadata_files(struct super_block *sb, int partition) +static int udf_load_metadata_files(struct super_block *sb, int partition, + int type1_index) { struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map; @@ -984,20 +985,21 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) map = &sbi->s_partmaps[partition]; mdata = &map->s_type_specific.s_metadata; + mdata->s_phys_partition_ref = type1_index; /* metadata address */ udf_debug("Metadata file location: block = %d part = %d\n", - mdata->s_meta_file_loc, map->s_partition_num); + mdata->s_meta_file_loc, mdata->s_phys_partition_ref); fe = udf_find_metadata_inode_efe(sb, mdata->s_meta_file_loc, - map->s_partition_num); + mdata->s_phys_partition_ref); if (IS_ERR(fe)) { /* mirror file entry */ udf_debug("Mirror metadata file location: block = %d part = %d\n", - mdata->s_mirror_file_loc, map->s_partition_num); + mdata->s_mirror_file_loc, mdata->s_phys_partition_ref); fe = udf_find_metadata_inode_efe(sb, mdata->s_mirror_file_loc, - map->s_partition_num); + mdata->s_phys_partition_ref); if (IS_ERR(fe)) { udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n"); @@ -1015,7 +1017,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) */ if (mdata->s_bitmap_file_loc != 0xFFFFFFFF) { addr.logicalBlockNum = mdata->s_bitmap_file_loc; - addr.partitionReferenceNum = map->s_partition_num; + addr.partitionReferenceNum = mdata->s_phys_partition_ref; udf_debug("Bitmap file location: block = %d part = %d\n", addr.logicalBlockNum, addr.partitionReferenceNum); @@ -1283,7 +1285,7 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block) p = (struct partitionDesc *)bh->b_data; partitionNumber = le16_to_cpu(p->partitionNumber); - /* First scan for TYPE1, SPARABLE and METADATA partitions */ + /* First scan for TYPE1 and SPARABLE partitions */ for (i = 0; i < sbi->s_partitions; i++) { map = &sbi->s_partmaps[i]; udf_debug("Searching map: (%d == %d)\n", @@ -1333,7 +1335,7 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block) goto out_bh; if (map->s_partition_type == UDF_METADATA_MAP25) { - ret = udf_load_metadata_files(sb, i); + ret = udf_load_metadata_files(sb, i, type1_idx); if (ret < 0) { udf_err(sb, "error loading MetaData partition map %d\n", i); diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 27b5335730c9..c13875d669c0 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -61,6 +61,11 @@ struct udf_meta_data { __u32 s_bitmap_file_loc; __u32 s_alloc_unit_size; __u16 s_align_unit_size; + /* + * Partition Reference Number of the associated physical / sparable + * partition + */ + __u16 s_phys_partition_ref; int s_flags; struct inode *s_metadata_fe; struct inode *s_mirror_fe; diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 0447b949c7f5..67e085d591d8 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -292,7 +292,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, if (!buffer_mapped(bh)) map_bh(bh, inode->i_sb, oldb + pos); if (!buffer_uptodate(bh)) { - ll_rw_block(READ, 1, &bh); + ll_rw_block(REQ_OP_READ, 0, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { ufs_error(inode->i_sb, __func__, diff --git a/fs/ufs/util.c b/fs/ufs/util.c index a409e3e7827a..f41ad0a6106f 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -118,7 +118,7 @@ void ubh_sync_block(struct ufs_buffer_head *ubh) unsigned i; for (i = 0; i < ubh->count; i++) - write_dirty_buffer(ubh->bh[i], WRITE); + write_dirty_buffer(ubh->bh[i], 0); for (i = 0; i < ubh->count; i++) wait_on_buffer(ubh->bh[i]); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 2d97952e341a..85959d8324df 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -257,10 +257,9 @@ out: * fatal_signal_pending()s, and the mmap_sem must be released before * returning it. */ -int handle_userfault(struct vm_area_struct *vma, unsigned long address, - unsigned int flags, unsigned long reason) +int handle_userfault(struct fault_env *fe, unsigned long reason) { - struct mm_struct *mm = vma->vm_mm; + struct mm_struct *mm = fe->vma->vm_mm; struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue uwq; int ret; @@ -269,7 +268,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); ret = VM_FAULT_SIGBUS; - ctx = vma->vm_userfaultfd_ctx.ctx; + ctx = fe->vma->vm_userfaultfd_ctx.ctx; if (!ctx) goto out; @@ -302,17 +301,17 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, * without first stopping userland access to the memory. For * VM_UFFD_MISSING userfaults this is enough for now. */ - if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) { + if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) { /* * Validate the invariant that nowait must allow retry * to be sure not to return SIGBUS erroneously on * nowait invocations. */ - BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT); + BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT); #ifdef CONFIG_DEBUG_VM if (printk_ratelimit()) { printk(KERN_WARNING - "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags); + "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags); dump_stack(); } #endif @@ -324,7 +323,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, * and wait. */ ret = VM_FAULT_RETRY; - if (flags & FAULT_FLAG_RETRY_NOWAIT) + if (fe->flags & FAULT_FLAG_RETRY_NOWAIT) goto out; /* take the reference before dropping the mmap_sem */ @@ -332,10 +331,11 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.msg = userfault_msg(address, flags, reason); + uwq.msg = userfault_msg(fe->address, fe->flags, reason); uwq.ctx = ctx; - return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == + return_to_userland = + (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); spin_lock(&ctx->fault_pending_wqh.lock); @@ -353,7 +353,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, TASK_KILLABLE); spin_unlock(&ctx->fault_pending_wqh.lock); - must_wait = userfaultfd_must_wait(ctx, address, flags, reason); + must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason); up_read(&mm->mmap_sem); if (likely(must_wait && !ACCESS_ONCE(ctx->released) && diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index ebfde553da38..7575cfc3ad15 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -447,7 +447,8 @@ xfs_submit_ioend( ioend->io_bio->bi_private = ioend; ioend->io_bio->bi_end_io = xfs_end_bio; - + bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, + (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0); /* * If we are failing the IO now, just mark the ioend with an * error and finish it. This will run IO completion immediately @@ -460,8 +461,7 @@ xfs_submit_ioend( return status; } - submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, - ioend->io_bio); + submit_bio(ioend->io_bio); return 0; } @@ -519,8 +519,9 @@ xfs_chain_bio( bio_chain(ioend->io_bio, new); bio_get(ioend->io_bio); /* for xfs_destroy_ioend */ - submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, - ioend->io_bio); + bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, + (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0); + submit_bio(ioend->io_bio); ioend->io_bio = new; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 32fc5401a756..47a318ce82e0 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1201,7 +1201,8 @@ xfs_buf_ioapply_map( int map, int *buf_offset, int *count, - int rw) + int op, + int op_flags) { int page_index; int total_nr_pages = bp->b_page_count; @@ -1231,16 +1232,14 @@ xfs_buf_ioapply_map( next_chunk: atomic_inc(&bp->b_io_remaining); - nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); - if (nr_pages > total_nr_pages) - nr_pages = total_nr_pages; + nr_pages = min(total_nr_pages, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOIO, nr_pages); bio->bi_bdev = bp->b_target->bt_bdev; bio->bi_iter.bi_sector = sector; bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; - + bio_set_op_attrs(bio, op, op_flags); for (; size && nr_pages; nr_pages--, page_index++) { int rbytes, nbytes = PAGE_SIZE - offset; @@ -1264,7 +1263,7 @@ next_chunk: flush_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); } - submit_bio(rw, bio); + submit_bio(bio); if (size) goto next_chunk; } else { @@ -1284,7 +1283,8 @@ _xfs_buf_ioapply( struct xfs_buf *bp) { struct blk_plug plug; - int rw; + int op; + int op_flags = 0; int offset; int size; int i; @@ -1303,14 +1303,13 @@ _xfs_buf_ioapply( bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue; if (bp->b_flags & XBF_WRITE) { + op = REQ_OP_WRITE; if (bp->b_flags & XBF_SYNCIO) - rw = WRITE_SYNC; - else - rw = WRITE; + op_flags = WRITE_SYNC; if (bp->b_flags & XBF_FUA) - rw |= REQ_FUA; + op_flags |= REQ_FUA; if (bp->b_flags & XBF_FLUSH) - rw |= REQ_FLUSH; + op_flags |= REQ_PREFLUSH; /* * Run the write verifier callback function if it exists. If @@ -1340,13 +1339,14 @@ _xfs_buf_ioapply( } } } else if (bp->b_flags & XBF_READ_AHEAD) { - rw = READA; + op = REQ_OP_READ; + op_flags = REQ_RAHEAD; } else { - rw = READ; + op = REQ_OP_READ; } /* we only use the buffer cache for meta-data */ - rw |= REQ_META; + op_flags |= REQ_META; /* * Walk all the vectors issuing IO on them. Set up the initial offset @@ -1358,7 +1358,7 @@ _xfs_buf_ioapply( size = BBTOB(bp->b_io_length); blk_start_plug(&plug); for (i = 0; i < bp->b_map_count; i++) { - xfs_buf_ioapply_map(bp, i, &offset, &size, rw); + xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags); if (bp->b_error) break; if (size <= 0) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 35df7576cbab..ed95e5bb04e6 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1502,7 +1502,7 @@ xfs_filemap_page_mkwrite( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); + ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); } else { ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops); ret = block_page_mkwrite_return(ret); @@ -1536,7 +1536,7 @@ xfs_filemap_fault( * changes to xfs_get_blocks_direct() to map unwritten extent * ioend for conversion on read-only mappings. */ - ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault); + ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault); } else ret = filemap_fault(vma, vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); @@ -1573,7 +1573,7 @@ xfs_filemap_pmd_fault( } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault); + ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (flags & FAULT_FLAG_WRITE) |