Diffstat (limited to 'fs')
348 files changed, 10905 insertions, 31821 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 8aa56bb6e861..6caca025019d 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -52,7 +52,7 @@ enum {
 	/* Options that take integer arguments */
 	Opt_debug, Opt_dfltuid, Opt_dfltgid, Opt_afid,
 	/* String options */
-	Opt_uname, Opt_remotename, Opt_trans, Opt_cache, Opt_cachetag,
+	Opt_uname, Opt_remotename, Opt_cache, Opt_cachetag,
 	/* Options that take no arguments */
 	Opt_nodevmap,
 	/* Cache options */
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 1ef16bd8280b..3abc447783aa 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -381,7 +381,7 @@ static ssize_t
 v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
 	struct p9_fid *fid = iocb->ki_filp->private_data;
-	int ret, err;
+	int ret, err = 0;
 
 	p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n",
 		 iov_iter_count(to), iocb->ki_pos);
diff --git a/fs/Kconfig b/fs/Kconfig
index 011f43365d7b..da3f32f1a4e4 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -11,18 +11,15 @@ config DCACHE_WORD_ACCESS
 if BLOCK
 
 source "fs/ext2/Kconfig"
-source "fs/ext3/Kconfig"
 source "fs/ext4/Kconfig"
-source "fs/jbd/Kconfig"
 source "fs/jbd2/Kconfig"
 
 config FS_MBCACHE
 # Meta block cache for Extended Attributes (ext2/ext3/ext4)
 	tristate
 	default y if EXT2_FS=y && EXT2_FS_XATTR
-	default y if EXT3_FS=y && EXT3_FS_XATTR
 	default y if EXT4_FS=y
-	default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS
+	default m if EXT2_FS_XATTR || EXT4_FS
 
 source "fs/reiserfs/Kconfig"
 source "fs/jfs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index cb20e4bf2303..f79cf4043e60 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES)	+= anon_inodes.o
 obj-$(CONFIG_SIGNALFD)		+= signalfd.o
 obj-$(CONFIG_TIMERFD)		+= timerfd.o
 obj-$(CONFIG_EVENTFD)		+= eventfd.o
+obj-$(CONFIG_USERFAULTFD)	+= userfaultfd.o
 obj-$(CONFIG_AIO)		+= aio.o
 obj-$(CONFIG_FS_DAX)		+= dax.o
 obj-$(CONFIG_FILE_LOCKING)	+= locks.o
@@ -62,12 +63,10 @@ obj-$(CONFIG_DLM)		+= dlm/
 # Do not add any filesystems before this line
 obj-$(CONFIG_FSCACHE)		+= fscache/
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
-obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
 obj-$(CONFIG_EXT2_FS)		+= ext2/
 # We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2
 # unless explicitly requested by rootfstype
 obj-$(CONFIG_EXT4_FS)		+= ext4/
-obj-$(CONFIG_JBD)		+= jbd/
 obj-$(CONFIG_JBD2)		+= jbd2/
 obj-$(CONFIG_CRAMFS)		+= cramfs/
 obj-$(CONFIG_SQUASHFS)		+= squashfs/
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 3f89c9e05b40..5b50c4ca43a7 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -18,6 +18,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/writeback.h>
+#include <linux/blkdev.h>
 #include "affs.h"
 
 static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
@@ -352,18 +353,19 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 	 * blocks, we will have to change it.
 	 */
 
-	size = sb->s_bdev->bd_inode->i_size >> 9;
+	size = i_size_read(sb->s_bdev->bd_inode) >> 9;
 	pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size);
 
 	affs_set_blocksize(sb, PAGE_SIZE);
 	/* Try to find root block. Its location depends on the block size. */
 
-	i = 512;
-	j = 4096;
+	i = bdev_logical_block_size(sb->s_bdev);
+	j = PAGE_SIZE;
 	if (blocksize > 0) {
 		i = j = blocksize;
 		size = size / (blocksize / 512);
 	}
+
 	for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) {
 		sbi->s_root_block = root_block;
 		if (root_block < 0)
diff --git a/fs/aio.c b/fs/aio.c
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -308,15 +308,9 @@ static void aio_free_ring(struct kioctx *ctx)
 	}
 }
 
-static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	vma->vm_flags |= VM_DONTEXPAND;
-	vma->vm_ops = &generic_file_vm_ops;
-	return 0;
-}
-
-static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
+static int aio_ring_mremap(struct vm_area_struct *vma)
 {
+	struct file *file = vma->vm_file;
 	struct mm_struct *mm = vma->vm_mm;
 	struct kioctx_table *table;
 	int i, res = -EINVAL;
@@ -342,9 +336,24 @@ static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
 	return res;
 }
 
+static const struct vm_operations_struct aio_ring_vm_ops = {
+	.mremap		= aio_ring_mremap,
+#if IS_ENABLED(CONFIG_MMU)
+	.fault		= filemap_fault,
+	.map_pages	= filemap_map_pages,
+	.page_mkwrite	= filemap_page_mkwrite,
+#endif
+};
+
+static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_flags |= VM_DONTEXPAND;
+	vma->vm_ops = &aio_ring_vm_ops;
+	return 0;
+}
+
 static const struct file_operations aio_ring_fops = {
 	.mmap = aio_ring_mmap,
-	.mremap = aio_ring_remap,
 };
 
 #if IS_ENABLED(CONFIG_MIGRATION)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 198243717da5..22ea424ee741 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -28,6 +28,7 @@
 #include <linux/namei.h>
 #include <linux/log2.h>
 #include <linux/cleancache.h>
+#include <linux/dax.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -441,7 +442,7 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
 *  accessible at this address.
 */
 long bdev_direct_access(struct block_device *bdev, sector_t sector,
-			void **addr, unsigned long *pfn, long size)
+			void __pmem **addr, unsigned long *pfn, long size)
 {
 	long avail;
 	const struct block_device_operations *ops = bdev->bd_disk->fops;
@@ -462,7 +463,7 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector,
 	sector += get_start_sect(bdev);
 	if (sector % (PAGE_SIZE / 512))
 		return -EINVAL;
-	avail = ops->direct_access(bdev, sector, addr, pfn, size);
+	avail = ops->direct_access(bdev, sector, addr, pfn);
 	if (!avail)
 		return -ERANGE;
 	return min(avail, size);
@@ -1769,7 +1770,7 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
 {
 	struct inode *inode, *old_inode = NULL;
 
-	spin_lock(&inode_sb_list_lock);
+	spin_lock(&blockdev_superblock->s_inode_list_lock);
 	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
 		struct address_space *mapping = inode->i_mapping;
 
@@ -1781,13 +1782,13 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
 		}
 		__iget(inode);
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&inode_sb_list_lock);
+		spin_unlock(&blockdev_superblock->s_inode_list_lock);
 		/*
 		 * We hold a reference to 'inode' so it couldn't have been
 		 * removed from s_inodes list while we dropped the
-		 * inode_sb_list_lock. We cannot iput the inode now as we can
+		 * s_inode_list_lock. We cannot iput the inode now as we can
 		 * be holding the last reference and we cannot iput it under
-		 * inode_sb_list_lock. So we keep the reference and iput it
+		 * s_inode_list_lock. So we keep the reference and iput it
 		 * later.
*/ iput(old_inode); @@ -1795,8 +1796,8 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) func(I_BDEV(inode), arg); - spin_lock(&inode_sb_list_lock); + spin_lock(&blockdev_superblock->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&blockdev_superblock->s_inode_list_lock); iput(old_inode); } diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 802fabb30e15..ecbc63d3143e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -206,10 +206,33 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, return -ENOMEM; ref->root_id = root_id; - if (key) + if (key) { ref->key_for_search = *key; - else + /* + * We can often find data backrefs with an offset that is too + * large (>= LLONG_MAX, maximum allowed file offset) due to + * underflows when subtracting a file's offset with the data + * offset of its corresponding extent data item. This can + * happen for example in the clone ioctl. + * So if we detect such case we set the search key's offset to + * zero to make sure we will find the matching file extent item + * at add_all_parents(), otherwise we will miss it because the + * offset taken form the backref is much larger then the offset + * of the file extent item. This can make us scan a very large + * number of file extent items, but at least it will not make + * us miss any. + * This is an ugly workaround for a behaviour that should have + * never existed, but it does and a fix for the clone ioctl + * would touch a lot of places, cause backwards incompatibility + * and would not fix the problem for extents cloned with older + * kernels. + */ + if (ref->key_for_search.type == BTRFS_EXTENT_DATA_KEY && + ref->key_for_search.offset >= LLONG_MAX) + ref->key_for_search.offset = 0; + } else { memset(&ref->key_for_search, 0, sizeof(ref->key_for_search)); + } ref->inode_list = NULL; ref->level = level; @@ -632,7 +655,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, struct btrfs_delayed_tree_ref *ref; ref = btrfs_delayed_node_to_tree_ref(node); - ret = __add_prelim_ref(prefs, ref->root, NULL, + ret = __add_prelim_ref(prefs, 0, NULL, ref->level + 1, ref->parent, node->bytenr, node->ref_mod * sgn, GFP_ATOMIC); @@ -664,11 +687,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, struct btrfs_delayed_data_ref *ref; ref = btrfs_delayed_node_to_data_ref(node); - - key.objectid = ref->objectid; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = ref->offset; - ret = __add_prelim_ref(prefs, ref->root, &key, 0, + ret = __add_prelim_ref(prefs, 0, NULL, 0, ref->parent, node->bytenr, node->ref_mod * sgn, GFP_ATOMIC); break; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index ce7dec88f4b8..541fbfaed276 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -343,7 +343,7 @@ static int btrfsic_process_written_superblock( struct btrfsic_state *state, struct btrfsic_block *const block, struct btrfs_super_block *const super_hdr); -static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status); +static void btrfsic_bio_end_io(struct bio *bp); static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate); static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, const struct btrfsic_block *block, @@ -2207,7 +2207,7 @@ continue_loop: goto again; } -static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) +static void btrfsic_bio_end_io(struct bio *bp) { struct btrfsic_block *block = (struct 
btrfsic_block *)bp->bi_private; int iodone_w_error; @@ -2215,7 +2215,7 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) /* mutex is not held! This is not save if IO is not yet completed * on umount */ iodone_w_error = 0; - if (bio_error_status) + if (bp->bi_error) iodone_w_error = 1; BUG_ON(NULL == block); @@ -2230,7 +2230,7 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) printk(KERN_INFO "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", - bio_error_status, + bp->bi_error, btrfsic_get_block_type(dev_state->state, block), block->logical_bytenr, dev_state->name, block->dev_bytenr, block->mirror_num); @@ -2252,7 +2252,7 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) block = next_block; } while (NULL != block); - bp->bi_end_io(bp, bio_error_status); + bp->bi_end_io(bp); } static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ce62324c78e7..57ee8ca29b06 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -97,10 +97,7 @@ static inline int compressed_bio_size(struct btrfs_root *root, static struct bio *compressed_bio_alloc(struct block_device *bdev, u64 first_byte, gfp_t gfp_flags) { - int nr_vecs; - - nr_vecs = bio_get_nr_vecs(bdev); - return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags); + return btrfs_bio_alloc(bdev, first_byte >> 9, BIO_MAX_PAGES, gfp_flags); } static int check_compressed_csum(struct inode *inode, @@ -152,7 +149,7 @@ fail: * The compressed pages are freed here, and it must be run * in process context */ -static void end_compressed_bio_read(struct bio *bio, int err) +static void end_compressed_bio_read(struct bio *bio) { struct compressed_bio *cb = bio->bi_private; struct inode *inode; @@ -160,7 +157,7 @@ static void end_compressed_bio_read(struct bio *bio, int err) unsigned long index; int ret; - if (err) + if (bio->bi_error) cb->errors = 1; /* if there are more bios still pending for this compressed @@ -210,7 +207,7 @@ csum_failed: bio_for_each_segment_all(bvec, cb->orig_bio, i) SetPageChecked(bvec->bv_page); - bio_endio(cb->orig_bio, 0); + bio_endio(cb->orig_bio); } /* finally free the cb struct */ @@ -266,7 +263,7 @@ static noinline void end_compressed_writeback(struct inode *inode, * This also calls the writeback end hooks for the file pages so that * metadata and checksums can be updated in the file. */ -static void end_compressed_bio_write(struct bio *bio, int err) +static void end_compressed_bio_write(struct bio *bio) { struct extent_io_tree *tree; struct compressed_bio *cb = bio->bi_private; @@ -274,7 +271,7 @@ static void end_compressed_bio_write(struct bio *bio, int err) struct page *page; unsigned long index; - if (err) + if (bio->bi_error) cb->errors = 1; /* if there are more bios still pending for this compressed @@ -293,7 +290,7 @@ static void end_compressed_bio_write(struct bio *bio, int err) cb->start, cb->start + cb->len - 1, NULL, - err ? 0 : 1); + bio->bi_error ? 
0 : 1); cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); @@ -697,8 +694,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - if (ret) - bio_endio(comp_bio, ret); + if (ret) { + bio->bi_error = ret; + bio_endio(comp_bio); + } bio_put(comp_bio); @@ -724,8 +723,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, } ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - if (ret) - bio_endio(comp_bio, ret); + if (ret) { + bio->bi_error = ret; + bio_endio(comp_bio); + } bio_put(comp_bio); return 0; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 54114b4887dd..5f745eadf77d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1159,8 +1159,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); return ret; + } } if (buf == root->node) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index aac314e14188..938efe33be80 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1300,7 +1300,7 @@ struct btrfs_block_group_cache { /* for raid56, this is a full stripe, without parity */ unsigned long full_stripe_len; - unsigned int ro:1; + unsigned int ro; unsigned int iref:1; unsigned int has_caching_ctl:1; unsigned int removed:1; @@ -1518,12 +1518,6 @@ struct btrfs_fs_info { */ struct mutex ordered_operations_mutex; - /* - * Same as ordered_operations_mutex except this is for ordered extents - * and not the operations. - */ - struct mutex ordered_extent_flush_mutex; - struct rw_semaphore commit_root_sem; struct rw_semaphore cleanup_work_sem; @@ -3437,6 +3431,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start, struct extent_map *em); void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); +void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache); +void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); @@ -3495,9 +3491,9 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, void btrfs_block_rsv_release(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); -int btrfs_set_block_group_ro(struct btrfs_root *root, +int btrfs_inc_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache); -void btrfs_set_block_group_rw(struct btrfs_root *root, +void btrfs_dec_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); @@ -4073,6 +4069,7 @@ __cold void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); +const char *btrfs_decode_error(int errno); __cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, @@ -4185,8 +4182,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *cow); -void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, - struct 
btrfs_pending_snapshot *pending, +void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve); int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 862fbc206755..564a7de17d99 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -378,7 +378,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device); if (ret) - btrfs_error(root->fs_info, ret, "kobj add dev failed"); + btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); printk_in_rcu(KERN_INFO "BTRFS: dev_replace from %s (devid %llu) to %s started\n", diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a9aadb2ad525..9ebd34f1c677 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -703,7 +703,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) return -EIO; /* we fixed nothing */ } -static void end_workqueue_bio(struct bio *bio, int err) +static void end_workqueue_bio(struct bio *bio) { struct btrfs_end_io_wq *end_io_wq = bio->bi_private; struct btrfs_fs_info *fs_info; @@ -711,7 +711,7 @@ static void end_workqueue_bio(struct bio *bio, int err) btrfs_work_func_t func; fs_info = end_io_wq->info; - end_io_wq->error = err; + end_io_wq->error = bio->bi_error; if (bio->bi_rw & REQ_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { @@ -808,7 +808,8 @@ static void run_one_async_done(struct btrfs_work *work) /* If an error occured we just want to clean up the bio and move on */ if (async->error) { - bio_endio(async->bio, async->error); + async->bio->bi_error = async->error; + bio_endio(async->bio); return; } @@ -908,8 +909,10 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, * submission context. 
Just jump into btrfs_map_bio */ ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); - if (ret) - bio_endio(bio, ret); + if (ret) { + bio->bi_error = ret; + bio_endio(bio); + } return ret; } @@ -960,10 +963,13 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, __btree_submit_bio_done); } - if (ret) { + if (ret) + goto out_w_error; + return 0; + out_w_error: - bio_endio(bio, ret); - } + bio->bi_error = ret; + bio_endio(bio); return ret; } @@ -1724,6 +1730,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE; bdi->congested_fn = btrfs_congested_fn; bdi->congested_data = info; + bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; return 0; } @@ -1735,16 +1742,15 @@ static void end_workqueue_fn(struct btrfs_work *work) { struct bio *bio; struct btrfs_end_io_wq *end_io_wq; - int error; end_io_wq = container_of(work, struct btrfs_end_io_wq, work); bio = end_io_wq->bio; - error = end_io_wq->error; + bio->bi_error = end_io_wq->error; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); - bio_endio(bio, error); + bio_endio(bio); } static int cleaner_kthread(void *arg) @@ -2608,7 +2614,6 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->ordered_operations_mutex); - mutex_init(&fs_info->ordered_extent_flush_mutex); mutex_init(&fs_info->tree_log_mutex); mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); @@ -2842,6 +2847,7 @@ int open_ctree(struct super_block *sb, !extent_buffer_uptodate(chunk_root->node)) { printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n", sb->s_id); + chunk_root->node = NULL; goto fail_tree_roots; } btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); @@ -2879,7 +2885,7 @@ retry_root_backup: !extent_buffer_uptodate(tree_root->node)) { printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", sb->s_id); - + tree_root->node = NULL; goto recovery_tree_root; } @@ -2949,8 +2955,9 @@ retry_root_backup: if (fs_info->fs_devices->missing_devices > fs_info->num_tolerated_disk_barrier_failures && !(sb->s_flags & MS_RDONLY)) { - printk(KERN_WARNING "BTRFS: " - "too many missing devices, writeable mount is not allowed\n"); + pr_warn("BTRFS: missing devices(%llu) exceeds the limit(%d), writeable mount is not allowed\n", + fs_info->fs_devices->missing_devices, + fs_info->num_tolerated_disk_barrier_failures); goto fail_sysfs; } @@ -3323,10 +3330,8 @@ static int write_dev_supers(struct btrfs_device *device, * endio for the write_dev_flush, this will wake anyone waiting * for the barrier when it is done */ -static void btrfs_end_empty_barrier(struct bio *bio, int err) +static void btrfs_end_empty_barrier(struct bio *bio) { - if (err) - clear_bit(BIO_UPTODATE, &bio->bi_flags); if (bio->bi_private) complete(bio->bi_private); bio_put(bio); @@ -3354,8 +3359,8 @@ static int write_dev_flush(struct btrfs_device *device, int wait) wait_for_completion(&device->flush_wait); - if (!bio_flagged(bio, BIO_UPTODATE)) { - ret = -EIO; + if (bio->bi_error) { + ret = bio->bi_error; btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); } @@ -3759,6 +3764,15 @@ void close_ctree(struct btrfs_root *root) cancel_work_sync(&fs_info->async_reclaim_work); if (!(fs_info->sb->s_flags & MS_RDONLY)) { + /* + * If the cleaner thread is stopped and there are + * block groups queued for removal, the deletion will be + * skipped when we quit the cleaner 
thread. + */ + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_delete_unused_bgs(root->fs_info); + mutex_unlock(&root->fs_info->cleaner_mutex); + ret = btrfs_commit_super(root); if (ret) btrfs_err(fs_info, "commit super ret %d", ret); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 171312d51799..5411f0ab5683 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1316,8 +1316,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, return ret; } -static noinline u32 extent_data_ref_count(struct btrfs_root *root, - struct btrfs_path *path, +static noinline u32 extent_data_ref_count(struct btrfs_path *path, struct btrfs_extent_inline_ref *iref) { struct btrfs_key key; @@ -1883,10 +1882,77 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, return ret; } -static int btrfs_issue_discard(struct block_device *bdev, - u64 start, u64 len) +#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) +static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, + u64 *discarded_bytes) { - return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); + int j, ret = 0; + u64 bytes_left, end; + u64 aligned_start = ALIGN(start, 1 << 9); + + if (WARN_ON(start != aligned_start)) { + len -= aligned_start - start; + len = round_down(len, 1 << 9); + start = aligned_start; + } + + *discarded_bytes = 0; + + if (!len) + return 0; + + end = start + len; + bytes_left = len; + + /* Skip any superblocks on this device. */ + for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { + u64 sb_start = btrfs_sb_offset(j); + u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; + u64 size = sb_start - start; + + if (!in_range(sb_start, start, bytes_left) && + !in_range(sb_end, start, bytes_left) && + !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) + continue; + + /* + * Superblock spans beginning of range. Adjust start and + * try again. 
+ */ + if (sb_start <= start) { + start += sb_end - start; + if (start > end) { + bytes_left = 0; + break; + } + bytes_left = end - start; + continue; + } + + if (size) { + ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, + GFP_NOFS, 0); + if (!ret) + *discarded_bytes += size; + else if (ret != -EOPNOTSUPP) + return ret; + } + + start = sb_end; + if (start > end) { + bytes_left = 0; + break; + } + bytes_left = end - start; + } + + if (bytes_left) { + ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, + GFP_NOFS, 0); + if (!ret) + *discarded_bytes += bytes_left; + } + return ret; } int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, @@ -1907,14 +1973,16 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, for (i = 0; i < bbio->num_stripes; i++, stripe++) { + u64 bytes; if (!stripe->dev->can_discard) continue; ret = btrfs_issue_discard(stripe->dev->bdev, stripe->physical, - stripe->length); + stripe->length, + &bytes); if (!ret) - discarded_bytes += stripe->length; + discarded_bytes += bytes; else if (ret != -EOPNOTSUPP) break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ @@ -4227,6 +4295,24 @@ out: space_info->chunk_alloc = 0; spin_unlock(&space_info->lock); mutex_unlock(&fs_info->chunk_mutex); + /* + * When we allocate a new chunk we reserve space in the chunk block + * reserve to make sure we can COW nodes/leafs in the chunk tree or + * add new nodes/leafs to it if we end up needing to do it when + * inserting the chunk item and updating device items as part of the + * second phase of chunk allocation, performed by + * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a + * large number of new block groups to create in our transaction + * handle's new_bgs list to avoid exhausting the chunk block reserve + * in extreme cases - like having a single transaction create many new + * block groups when starting to write out the free space caches of all + * the block groups that were made dirty during the lifetime of the + * transaction. + */ + if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) { + btrfs_create_pending_block_groups(trans, trans->root); + btrfs_trans_release_chunk_metadata(trans); + } return ret; } @@ -6044,20 +6130,19 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_group_cache *block_group, *tmp; + struct list_head *deleted_bgs; struct extent_io_tree *unpin; u64 start; u64 end; int ret; - if (trans->aborted) - return 0; - if (fs_info->pinned_extents == &fs_info->freed_extents[0]) unpin = &fs_info->freed_extents[1]; else unpin = &fs_info->freed_extents[0]; - while (1) { + while (!trans->aborted) { mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, NULL); @@ -6076,6 +6161,34 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, cond_resched(); } + /* + * Transaction is finished. We don't need the lock anymore. We + * do need to clean up the block groups in case of a transaction + * abort. 
+ */ + deleted_bgs = &trans->transaction->deleted_bgs; + list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { + u64 trimmed = 0; + + ret = -EROFS; + if (!trans->aborted) + ret = btrfs_discard_extent(root, + block_group->key.objectid, + block_group->key.offset, + &trimmed); + + list_del_init(&block_group->bg_list); + btrfs_put_block_group_trimming(block_group); + btrfs_put_block_group(block_group); + + if (ret) { + const char *errstr = btrfs_decode_error(ret); + btrfs_warn(fs_info, + "Discard failed while removing blockgroup: errno=%d %s\n", + ret, errstr); + } + } + return 0; } @@ -6331,7 +6444,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } else { if (found_extent) { BUG_ON(is_data && refs_to_drop != - extent_data_ref_count(root, path, iref)); + extent_data_ref_count(path, iref)); if (iref) { BUG_ON(path->slots[0] != extent_slot); } else { @@ -7549,9 +7662,6 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info, /* * finds a free extent and does all the dirty work required for allocation - * returns the key for the extent through ins, and a tree buffer for - * the first block of the extent through buf. - * * returns the tree buffer or an ERR_PTR on error. */ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, @@ -8705,14 +8815,13 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) return flags; } -static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) +static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) { struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; u64 min_allocable_bytes; int ret = -ENOSPC; - /* * We need some metadata space and system metadata space for * allocating chunks in some corner cases until we force to set @@ -8729,6 +8838,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) spin_lock(&cache->lock); if (cache->ro) { + cache->ro++; ret = 0; goto out; } @@ -8740,7 +8850,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + min_allocable_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; - cache->ro = 1; + cache->ro++; list_add_tail(&cache->ro_list, &sinfo->ro_bgs); ret = 0; } @@ -8750,7 +8860,7 @@ out: return ret; } -int btrfs_set_block_group_ro(struct btrfs_root *root, +int btrfs_inc_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { @@ -8758,8 +8868,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, u64 alloc_flags; int ret; - BUG_ON(cache->ro); - again: trans = btrfs_join_transaction(root); if (IS_ERR(trans)) @@ -8802,7 +8910,7 @@ again: goto out; } - ret = set_block_group_ro(cache, 0); + ret = inc_block_group_ro(cache, 0); if (!ret) goto out; alloc_flags = get_alloc_profile(root, cache->space_info->flags); @@ -8810,7 +8918,7 @@ again: CHUNK_ALLOC_FORCE); if (ret < 0) goto out; - ret = set_block_group_ro(cache, 0); + ret = inc_block_group_ro(cache, 0); out: if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { alloc_flags = update_block_group_flags(root, cache->flags); @@ -8873,7 +8981,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) return free_bytes; } -void btrfs_set_block_group_rw(struct btrfs_root *root, +void btrfs_dec_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { struct btrfs_space_info *sinfo = cache->space_info; @@ -8883,11 +8991,13 @@ void 
btrfs_set_block_group_rw(struct btrfs_root *root, spin_lock(&sinfo->lock); spin_lock(&cache->lock); - num_bytes = cache->key.offset - cache->reserved - cache->pinned - - cache->bytes_super - btrfs_block_group_used(&cache->item); - sinfo->bytes_readonly -= num_bytes; - cache->ro = 0; - list_del_init(&cache->ro_list); + if (!--cache->ro) { + num_bytes = cache->key.offset - cache->reserved - + cache->pinned - cache->bytes_super - + btrfs_block_group_used(&cache->item); + sinfo->bytes_readonly -= num_bytes; + list_del_init(&cache->ro_list); + } spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); } @@ -9403,7 +9513,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) set_avail_alloc_bits(root->fs_info, cache->flags); if (btrfs_chunk_readonly(root, cache->key.objectid)) { - set_block_group_ro(cache, 1); + inc_block_group_ro(cache, 1); } else if (btrfs_block_group_used(&cache->item) == 0) { spin_lock(&info->unused_bgs_lock); /* Should always be true but just in case. */ @@ -9431,11 +9541,11 @@ int btrfs_read_block_groups(struct btrfs_root *root) list_for_each_entry(cache, &space_info->block_groups[BTRFS_RAID_RAID0], list) - set_block_group_ro(cache, 1); + inc_block_group_ro(cache, 1); list_for_each_entry(cache, &space_info->block_groups[BTRFS_RAID_SINGLE], list) - set_block_group_ro(cache, 1); + inc_block_group_ro(cache, 1); } init_global_block_rsv(info); @@ -9816,6 +9926,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * currently running transaction might finish and a new one start, * allowing for new block groups to be created that can reuse the same * physical device locations unless we take this special care. + * + * There may also be an implicit trim operation if the file system + * is mounted with -odiscard. The same protections must remain + * in place until the extents have been discarded completely when + * the transaction commit has completed. */ remove_em = (atomic_read(&block_group->trimming) == 0); /* @@ -9890,6 +10005,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->unused_bgs_lock); while (!list_empty(&fs_info->unused_bgs)) { u64 start, end; + int trimming; block_group = list_first_entry(&fs_info->unused_bgs, struct btrfs_block_group_cache, @@ -9923,7 +10039,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_unlock(&block_group->lock); /* We don't want to force the issue, only flip if it's ok. 
*/ - ret = set_block_group_ro(block_group, 0); + ret = inc_block_group_ro(block_group, 0); up_write(&space_info->groups_sem); if (ret < 0) { ret = 0; @@ -9937,7 +10053,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) /* 1 for btrfs_orphan_reserve_metadata() */ trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { - btrfs_set_block_group_rw(root, block_group); + btrfs_dec_block_group_ro(root, block_group); ret = PTR_ERR(trans); goto next; } @@ -9964,14 +10080,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) EXTENT_DIRTY, GFP_NOFS); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); - btrfs_set_block_group_rw(root, block_group); + btrfs_dec_block_group_ro(root, block_group); goto end_trans; } ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, EXTENT_DIRTY, GFP_NOFS); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); - btrfs_set_block_group_rw(root, block_group); + btrfs_dec_block_group_ro(root, block_group); goto end_trans; } mutex_unlock(&fs_info->unused_bg_unpin_mutex); @@ -9989,12 +10105,39 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); + /* DISCARD can flip during remount */ + trimming = btrfs_test_opt(root, DISCARD); + + /* Implicit trim during transaction commit. */ + if (trimming) + btrfs_get_block_group_trimming(block_group); + /* * Btrfs_remove_chunk will abort the transaction if things go * horribly wrong. */ ret = btrfs_remove_chunk(trans, root, block_group->key.objectid); + + if (ret) { + if (trimming) + btrfs_put_block_group_trimming(block_group); + goto end_trans; + } + + /* + * If we're not mounted with -odiscard, we can just forget + * about this block group. Otherwise we'll need to wait + * until transaction commit to do the actual discard. + */ + if (trimming) { + WARN_ON(!list_empty(&block_group->bg_list)); + spin_lock(&trans->transaction->deleted_bgs_lock); + list_move(&block_group->bg_list, + &trans->transaction->deleted_bgs); + spin_unlock(&trans->transaction->deleted_bgs_lock); + btrfs_get_block_group(block_group); + } end_trans: btrfs_end_transaction(trans, root); next: @@ -10048,10 +10191,99 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) return unpin_extent_range(root, start, end, false); } +/* + * It used to be that old block groups would be left around forever. + * Iterating over them would be enough to trim unused space. Since we + * now automatically remove them, we also need to iterate over unallocated + * space. + * + * We don't want a transaction for this since the discard may take a + * substantial amount of time. We don't require that a transaction be + * running, but we do need to take a running transaction into account + * to ensure that we're not discarding chunks that were released in + * the current transaction. + * + * Holding the chunks lock will prevent other threads from allocating + * or releasing chunks, but it won't prevent a running transaction + * from committing and releasing the memory that the pending chunks + * list head uses. For that, we need to take a reference to the + * transaction. + */ +static int btrfs_trim_free_extents(struct btrfs_device *device, + u64 minlen, u64 *trimmed) +{ + u64 start = 0, len = 0; + int ret; + + *trimmed = 0; + + /* Not writeable = nothing to do. */ + if (!device->writeable) + return 0; + + /* No free space = nothing to do. 
*/ + if (device->total_bytes <= device->bytes_used) + return 0; + + ret = 0; + + while (1) { + struct btrfs_fs_info *fs_info = device->dev_root->fs_info; + struct btrfs_transaction *trans; + u64 bytes; + + ret = mutex_lock_interruptible(&fs_info->chunk_mutex); + if (ret) + return ret; + + down_read(&fs_info->commit_root_sem); + + spin_lock(&fs_info->trans_lock); + trans = fs_info->running_transaction; + if (trans) + atomic_inc(&trans->use_count); + spin_unlock(&fs_info->trans_lock); + + ret = find_free_dev_extent_start(trans, device, minlen, start, + &start, &len); + if (trans) + btrfs_put_transaction(trans); + + if (ret) { + up_read(&fs_info->commit_root_sem); + mutex_unlock(&fs_info->chunk_mutex); + if (ret == -ENOSPC) + ret = 0; + break; + } + + ret = btrfs_issue_discard(device->bdev, start, len, &bytes); + up_read(&fs_info->commit_root_sem); + mutex_unlock(&fs_info->chunk_mutex); + + if (ret) + break; + + start += len; + *trimmed += bytes; + + if (fatal_signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + cond_resched(); + } + + return ret; +} + int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_group_cache *cache = NULL; + struct btrfs_device *device; + struct list_head *devices; u64 group_trimmed; u64 start; u64 end; @@ -10106,6 +10338,18 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) cache = next_block_group(fs_info->tree_root, cache); } + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + devices = &root->fs_info->fs_devices->alloc_list; + list_for_each_entry(device, devices, dev_alloc_list) { + ret = btrfs_trim_free_extents(device, range->minlen, + &group_trimmed); + if (ret) + break; + + trimmed += group_trimmed; + } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + range->len = trimmed; return ret; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 02d05817cbdf..f1018cfbfefa 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2486,7 +2486,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_writepage(struct bio *bio, int err) +static void end_bio_extent_writepage(struct bio *bio) { struct bio_vec *bvec; u64 start; @@ -2516,7 +2516,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err) start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; - if (end_extent_writepage(page, err, start, end)) + if (end_extent_writepage(page, bio->bi_error, start, end)) continue; end_page_writeback(page); @@ -2548,10 +2548,10 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. 
*/ -static void end_bio_extent_readpage(struct bio *bio, int err) +static void end_bio_extent_readpage(struct bio *bio) { struct bio_vec *bvec; - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + int uptodate = !bio->bi_error; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct extent_io_tree *tree; u64 offset = 0; @@ -2564,16 +2564,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err) int ret; int i; - if (err) - uptodate = 0; - bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " - "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err, - io_bio->mirror_num); + "mirror=%u\n", (u64)bio->bi_iter.bi_sector, + bio->bi_error, io_bio->mirror_num); tree = &BTRFS_I(inode)->io_tree; /* We always issue full-page reads, but if some block @@ -2614,8 +2611,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) if (tree->ops && tree->ops->readpage_io_failed_hook) { ret = tree->ops->readpage_io_failed_hook(page, mirror); - if (!ret && !err && - test_bit(BIO_UPTODATE, &bio->bi_flags)) + if (!ret && !bio->bi_error) uptodate = 1; } else { /* @@ -2631,10 +2627,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) ret = bio_readpage_error(bio, offset, page, start, end, mirror); if (ret == 0) { - uptodate = - test_bit(BIO_UPTODATE, &bio->bi_flags); - if (err) - uptodate = 0; + uptodate = !bio->bi_error; offset += len; continue; } @@ -2684,7 +2677,7 @@ readpage_ok: endio_readpage_release_extent(tree, extent_start, extent_len, uptodate); if (io_bio->end_io) - io_bio->end_io(io_bio, err); + io_bio->end_io(io_bio, bio->bi_error); bio_put(bio); } @@ -2730,6 +2723,12 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) btrfs_bio->csum = NULL; btrfs_bio->csum_allocated = NULL; btrfs_bio->end_io = NULL; + +#ifdef CONFIG_BLK_CGROUP + /* FIXME, put this into bio_clone_bioset */ + if (bio->bi_css) + bio_associate_blkcg(new, bio->bi_css); +#endif } return new; } @@ -2790,6 +2789,7 @@ static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page, } static int submit_extent_page(int rw, struct extent_io_tree *tree, + struct writeback_control *wbc, struct page *page, sector_t sector, size_t size, unsigned long offset, struct block_device *bdev, @@ -2802,9 +2802,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, { int ret = 0; struct bio *bio; - int nr; int contig = 0; - int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); @@ -2826,21 +2824,24 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, } bio = NULL; } else { + if (wbc) + wbc_account_io(wbc, page, page_size); return 0; } } - if (this_compressed) - nr = BIO_MAX_PAGES; - else - nr = bio_get_nr_vecs(bdev); - bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); + bio = btrfs_bio_alloc(bdev, sector, BIO_MAX_PAGES, + GFP_NOFS | __GFP_HIGH); if (!bio) return -ENOMEM; bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; + if (wbc) { + wbc_init_bio(wbc, bio); + wbc_account_io(wbc, page, page_size); + } if (bio_ret) *bio_ret = bio; @@ -3051,7 +3052,7 @@ static int __do_readpage(struct extent_io_tree *tree, } pnr -= page->index; - ret = submit_extent_page(rw, tree, page, + ret = submit_extent_page(rw, tree, NULL, page, sector, disk_io_size, pg_offset, bdev, bio, pnr, 
end_bio_extent_readpage, mirror_num, @@ -3446,7 +3447,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, page->index, cur, end); } - ret = submit_extent_page(write_flags, tree, page, + ret = submit_extent_page(write_flags, tree, wbc, page, sector, iosize, pg_offset, bdev, &epd->bio, max_nr, end_bio_extent_writepage, @@ -3696,7 +3697,7 @@ static void set_btree_ioerr(struct page *page) } } -static void end_bio_extent_buffer_writepage(struct bio *bio, int err) +static void end_bio_extent_buffer_writepage(struct bio *bio) { struct bio_vec *bvec; struct extent_buffer *eb; @@ -3709,7 +3710,8 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err) BUG_ON(!eb); done = atomic_dec_and_test(&eb->io_pages); - if (err || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { + if (bio->bi_error || + test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { ClearPageUptodate(page); set_btree_ioerr(page); } @@ -3749,7 +3751,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); - ret = submit_extent_page(rw, tree, p, offset >> 9, + ret = submit_extent_page(rw, tree, wbc, p, offset >> 9, PAGE_CACHE_SIZE, 0, bdev, &epd->bio, -1, end_bio_extent_buffer_writepage, 0, epd->bio_flags, bio_flags); @@ -4614,9 +4616,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, { struct extent_buffer *eb = NULL; - eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS); - if (eb == NULL) - return NULL; + eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); eb->start = start; eb->len = len; eb->fs_info = fs_info; @@ -4874,7 +4874,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, return NULL; for (i = 0; i < num_pages; i++, index++) { - p = find_or_create_page(mapping, index, GFP_NOFS); + p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); if (!p) goto free_eb; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index fb5a6b1c62a6..abe3a66bd3ba 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -3272,35 +3272,23 @@ next: return ret; } -int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, - u64 *trimmed, u64 start, u64 end, u64 minlen) +void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache) { - int ret; + atomic_inc(&cache->trimming); +} - *trimmed = 0; +void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + bool cleanup; spin_lock(&block_group->lock); - if (block_group->removed) { - spin_unlock(&block_group->lock); - return 0; - } - atomic_inc(&block_group->trimming); + cleanup = (atomic_dec_and_test(&block_group->trimming) && + block_group->removed); spin_unlock(&block_group->lock); - ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); - if (ret) - goto out; - - ret = trim_bitmaps(block_group, trimmed, start, end, minlen); -out: - spin_lock(&block_group->lock); - if (atomic_dec_and_test(&block_group->trimming) && - block_group->removed) { - struct extent_map_tree *em_tree; - struct extent_map *em; - - spin_unlock(&block_group->lock); - + if (cleanup) { lock_chunks(block_group->fs_info->chunk_root); em_tree = &block_group->fs_info->mapping_tree.map_tree; write_lock(&em_tree->lock); @@ -3324,10 +3312,31 @@ out: * this block group have left 1 entry each one. Free them. 
*/ __btrfs_remove_free_space_cache(block_group->free_space_ctl); - } else { + } +} + +int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen) +{ + int ret; + + *trimmed = 0; + + spin_lock(&block_group->lock); + if (block_group->removed) { spin_unlock(&block_group->lock); + return 0; } + btrfs_get_block_group_trimming(block_group); + spin_unlock(&block_group->lock); + + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); + if (ret) + goto out; + ret = trim_bitmaps(block_group, trimmed, start, end, minlen); +out: + btrfs_put_block_group_trimming(block_group); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e33dff356460..237da012f7d0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1845,8 +1845,10 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, int ret; ret = btrfs_map_bio(root, rw, bio, mirror_num, 1); - if (ret) - bio_endio(bio, ret); + if (ret) { + bio->bi_error = ret; + bio_endio(bio); + } return ret; } @@ -1906,8 +1908,10 @@ mapit: ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); out: - if (ret < 0) - bio_endio(bio, ret); + if (ret < 0) { + bio->bi_error = ret; + bio_endio(bio); + } return ret; } @@ -3654,6 +3658,35 @@ cache_index: set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + /* + * We don't persist the id of the transaction where an unlink operation + * against the inode was last made. So here we assume the inode might + * have been evicted, and therefore the exact value of last_unlink_trans + * lost, and set it to last_trans to avoid metadata inconsistencies + * between the inode and its parent if the inode is fsync'ed and the log + * replayed. For example, in the scenario: + * + * touch mydir/foo + * ln mydir/foo mydir/bar + * sync + * unlink mydir/bar + * echo 2 > /proc/sys/vm/drop_caches # evicts inode + * xfs_io -c fsync mydir/foo + * <power failure> + * mount fs, triggers fsync log replay + * + * We must make sure that when we fsync our inode foo we also log its + * parent inode, otherwise after log replay the parent still has the + * dentry with the "bar" name but our inode foo has a link count of 1 + * and doesn't have an inode ref with the name "bar" anymore. + * + * Setting last_unlink_trans to last_trans is a pessimistic approach, + * but it guarantees correctness at the expense of ocassional full + * transaction commits on fsync if our inode is a directory, or if our + * inode is not a directory, logging its parent unnecessarily. 
+ */ + BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + path->slots[0]++; if (inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) @@ -7688,13 +7721,13 @@ struct btrfs_retry_complete { int uptodate; }; -static void btrfs_retry_endio_nocsum(struct bio *bio, int err) +static void btrfs_retry_endio_nocsum(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; struct bio_vec *bvec; int i; - if (err) + if (bio->bi_error) goto end; done->uptodate = 1; @@ -7743,7 +7776,7 @@ try_again: return 0; } -static void btrfs_retry_endio(struct bio *bio, int err) +static void btrfs_retry_endio(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); @@ -7752,7 +7785,7 @@ static void btrfs_retry_endio(struct bio *bio, int err) int ret; int i; - if (err) + if (bio->bi_error) goto end; uptodate = 1; @@ -7835,12 +7868,13 @@ static int btrfs_subio_endio_read(struct inode *inode, } } -static void btrfs_endio_direct_read(struct bio *bio, int err) +static void btrfs_endio_direct_read(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; struct inode *inode = dip->inode; struct bio *dio_bio; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + int err = bio->bi_error; if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) err = btrfs_subio_endio_read(inode, io_bio, err); @@ -7851,17 +7885,14 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) kfree(dip); - /* If we had a csum failure make sure to clear the uptodate flag */ - if (err) - clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); - dio_end_io(dio_bio, err); + dio_end_io(dio_bio, bio->bi_error); if (io_bio->end_io) io_bio->end_io(io_bio, err); bio_put(bio); } -static void btrfs_endio_direct_write(struct bio *bio, int err) +static void btrfs_endio_direct_write(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; struct inode *inode = dip->inode; @@ -7875,7 +7906,8 @@ static void btrfs_endio_direct_write(struct bio *bio, int err) again: ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, - ordered_bytes, !err); + ordered_bytes, + !bio->bi_error); if (!ret) goto out_test; @@ -7898,10 +7930,7 @@ out_test: kfree(dip); - /* If we had an error make sure to clear the uptodate flag */ - if (err) - clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); - dio_end_io(dio_bio, err); + dio_end_io(dio_bio, bio->bi_error); bio_put(bio); } @@ -7916,9 +7945,10 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, return 0; } -static void btrfs_end_dio_bio(struct bio *bio, int err) +static void btrfs_end_dio_bio(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; + int err = bio->bi_error; if (err) btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, @@ -7947,8 +7977,8 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) if (dip->errors) { bio_io_error(dip->orig_bio); } else { - set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags); - bio_endio(dip->orig_bio, 0); + dip->dio_bio->bi_error = 0; + bio_endio(dip->orig_bio); } out: bio_put(bio); @@ -7957,8 +7987,11 @@ out: static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, u64 first_sector, gfp_t gfp_flags) { - int nr_vecs = bio_get_nr_vecs(bdev); - return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); + struct bio *bio; + bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags); + if (bio) + bio_associate_current(bio); + return bio; } static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root, @@ 
-8219,7 +8252,8 @@ free_ordered: * callbacks - they require an allocated dip and a clone of dio_bio. */ if (io_bio && dip) { - bio_endio(io_bio, ret); + io_bio->bi_error = -EIO; + bio_endio(io_bio); /* * The end io callbacks free our dip, do the final put on io_bio * and all the cleanup and final put for dio_bio (through @@ -8246,7 +8280,7 @@ free_ordered: unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, file_offset + dio_bio->bi_iter.bi_size - 1); } - clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); + dio_bio->bi_error = -EIO; /* * Releases and cleans up our dio_bio, no need to bio_put() * nor bio_endio()/bio_io_error() against dio_bio. diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0770c91586ca..0adf5422fce9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1030,6 +1030,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u32 thresh, struct extent_map *em; int ret = 1; bool next_mergeable = true; + bool prev_mergeable = true; /* * make sure that once we start defragging an extent, we keep on @@ -1050,13 +1051,16 @@ static int should_defrag_range(struct inode *inode, u64 start, u32 thresh, goto out; } + if (!*defrag_end) + prev_mergeable = false; + next_mergeable = defrag_check_next_extent(inode, em); /* * we hit a real extent, if it is big or the next extent is not a * real extent, don't bother defragging it */ if (!compress && (*last_len == 0 || *last_len >= thresh) && - (em->len >= thresh || !next_mergeable)) + (em->len >= thresh || (!next_mergeable && !prev_mergeable))) ret = 0; out: /* @@ -1933,6 +1937,7 @@ static noinline int copy_to_sk(struct btrfs_root *root, u64 found_transid; struct extent_buffer *leaf; struct btrfs_ioctl_search_header sh; + struct btrfs_key test; unsigned long item_off; unsigned long item_len; int nritems; @@ -2016,12 +2021,17 @@ static noinline int copy_to_sk(struct btrfs_root *root, } advance_key: ret = 0; - if (key->offset < (u64)-1 && key->offset < sk->max_offset) + test.objectid = sk->max_objectid; + test.type = sk->max_type; + test.offset = sk->max_offset; + if (btrfs_comp_cpu_keys(key, &test) >= 0) + ret = 1; + else if (key->offset < (u64)-1) key->offset++; - else if (key->type < (u8)-1 && key->type < sk->max_type) { + else if (key->type < (u8)-1) { key->offset = 0; key->type++; - } else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) { + } else if (key->objectid < (u64)-1) { key->offset = 0; key->type = 0; key->objectid++; @@ -2842,8 +2852,7 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) swap(inode1, inode2); mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); - if (inode1 != inode2) - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); } static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, @@ -2861,8 +2870,7 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, swap(loff1, loff2); } lock_extent_range(inode1, loff1, len); - if (inode1 != inode2) - lock_extent_range(inode2, loff2, len); + lock_extent_range(inode2, loff2, len); } struct cmp_pages { @@ -3787,13 +3795,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, goto out_fput; if (!same_inode) { - if (inode < src) { - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); - } else { - mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); - } + btrfs_double_inode_lock(src, inode); } else { 
mutex_lock(&src->i_mutex); } @@ -3843,8 +3845,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, lock_extent_range(src, lock_start, lock_len); } else { - lock_extent_range(src, off, len); - lock_extent_range(inode, destoff, len); + btrfs_double_extent_lock(src, off, inode, destoff, len); } ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); @@ -3855,9 +3856,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end); } else { - unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); - unlock_extent(&BTRFS_I(inode)->io_tree, destoff, - destoff + len - 1); + btrfs_double_extent_unlock(src, off, inode, destoff, len); } /* * Truncate page cache pages so that future reads will see the cloned @@ -3866,17 +3865,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, truncate_inode_pages_range(&inode->i_data, destoff, PAGE_CACHE_ALIGN(destoff + len) - 1); out_unlock: - if (!same_inode) { - if (inode < src) { - mutex_unlock(&src->i_mutex); - mutex_unlock(&inode->i_mutex); - } else { - mutex_unlock(&inode->i_mutex); - mutex_unlock(&src->i_mutex); - } - } else { + if (!same_inode) + btrfs_double_inode_unlock(src, inode); + else mutex_unlock(&src->i_mutex); - } out_fput: fdput(src_file); out_drop_write: diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index f8229ef1b46d..d7e6baf1b205 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -241,6 +241,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) */ void btrfs_tree_lock(struct extent_buffer *eb) { + WARN_ON(eb->lock_owner == current->pid); again: wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index e9ace099162c..d904ee1c5349 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -376,7 +376,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) qgroup = find_qgroup_rb(fs_info, found_key.offset); if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { - btrfs_err(fs_info, "inconsitent qgroup config"); + btrfs_err(fs_info, "inconsistent qgroup config"); flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } if (!qgroup) { @@ -1651,6 +1651,11 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, /* Exclusive -> exclusive, nothing changed */ } } + + /* For exclusive extent, free its reserved bytes too */ + if (nr_old_roots == 0 && nr_new_roots == 1 && + cur_new_count == nr_new_roots) + qg->reserved -= num_bytes; if (dirty) qgroup_dirty(fs_info, qg); } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index fa72068bd256..fcf7265ca46f 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -61,9 +61,10 @@ #define RBIO_CACHE_SIZE 1024 enum btrfs_rbio_ops { - BTRFS_RBIO_WRITE = 0, - BTRFS_RBIO_READ_REBUILD = 1, - BTRFS_RBIO_PARITY_SCRUB = 2, + BTRFS_RBIO_WRITE, + BTRFS_RBIO_READ_REBUILD, + BTRFS_RBIO_PARITY_SCRUB, + BTRFS_RBIO_REBUILD_MISSING, }; struct btrfs_raid_bio { @@ -602,6 +603,10 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, cur->operation == BTRFS_RBIO_PARITY_SCRUB) return 0; + if (last->operation == BTRFS_RBIO_REBUILD_MISSING || + cur->operation == BTRFS_RBIO_REBUILD_MISSING) + return 0; + return 1; } @@ -793,7 +798,10 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) if (next->operation == 
BTRFS_RBIO_READ_REBUILD) async_read_rebuild(next); - else if (next->operation == BTRFS_RBIO_WRITE) { + else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) { + steal_rbio(rbio, next); + async_read_rebuild(next); + } else if (next->operation == BTRFS_RBIO_WRITE) { steal_rbio(rbio, next); async_rmw_stripe(next); } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { @@ -851,7 +859,7 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio) * this frees the rbio and runs through all the bios in the * bio_list and calls end_io on them */ -static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) +static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) { struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *next; @@ -864,9 +872,8 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) while (cur) { next = cur->bi_next; cur->bi_next = NULL; - if (uptodate) - set_bit(BIO_UPTODATE, &cur->bi_flags); - bio_endio(cur, err); + cur->bi_error = err; + bio_endio(cur); cur = next; } } @@ -875,9 +882,10 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) * end io function used by finish_rmw. When we finally * get here, we've written a full stripe */ -static void raid_write_end_io(struct bio *bio, int err) +static void raid_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; + int err = bio->bi_error; if (err) fail_bio_stripe(rbio, bio); @@ -893,7 +901,7 @@ static void raid_write_end_io(struct bio *bio, int err) if (atomic_read(&rbio->error) > rbio->bbio->max_errors) err = -EIO; - rbio_orig_end_io(rbio, err, 0); + rbio_orig_end_io(rbio, err); return; } @@ -1071,7 +1079,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, * devices or if they are not contiguous */ if (last_end == disk_start && stripe->dev->bdev && - test_bit(BIO_UPTODATE, &last->bi_flags) && + !last->bi_error && last->bi_bdev == stripe->dev->bdev) { ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0); if (ret == PAGE_CACHE_SIZE) @@ -1087,7 +1095,6 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, bio->bi_iter.bi_size = 0; bio->bi_bdev = stripe->dev->bdev; bio->bi_iter.bi_sector = disk_start >> 9; - set_bit(BIO_UPTODATE, &bio->bi_flags); bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); bio_list_add(bio_list, bio); @@ -1312,13 +1319,12 @@ write_data: bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; - BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); submit_bio(WRITE, bio); } return; cleanup: - rbio_orig_end_io(rbio, -EIO, 0); + rbio_orig_end_io(rbio, -EIO); } /* @@ -1441,11 +1447,11 @@ static void set_bio_pages_uptodate(struct bio *bio) * This will usually kick off finish_rmw once all the bios are read in, but it * may trigger parity reconstruction if we had any errors along the way */ -static void raid_rmw_end_io(struct bio *bio, int err) +static void raid_rmw_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (err) + if (bio->bi_error) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -1455,7 +1461,6 @@ static void raid_rmw_end_io(struct bio *bio, int err) if (!atomic_dec_and_test(&rbio->stripes_pending)) return; - err = 0; if (atomic_read(&rbio->error) > rbio->bbio->max_errors) goto cleanup; @@ -1469,7 +1474,7 @@ static void raid_rmw_end_io(struct bio *bio, int err) cleanup: - rbio_orig_end_io(rbio, -EIO, 0); + rbio_orig_end_io(rbio, -EIO); } static void async_rmw_stripe(struct btrfs_raid_bio *rbio) @@ -1572,14 +1577,13 @@ static int 
raid56_rmw_stripe(struct btrfs_raid_bio *rbio) btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); submit_bio(READ, bio); } /* the actual write will happen once the reads are done */ return 0; cleanup: - rbio_orig_end_io(rbio, -EIO, 0); + rbio_orig_end_io(rbio, -EIO); return -EIO; finish: @@ -1809,7 +1813,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) faila = rbio->faila; failb = rbio->failb; - if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { + if (rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { spin_lock_irq(&rbio->bio_list_lock); set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); spin_unlock_irq(&rbio->bio_list_lock); @@ -1834,7 +1839,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) * if we're rebuilding a read, we have to use * pages from the bio list */ - if (rbio->operation == BTRFS_RBIO_READ_REBUILD && + if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && (stripe == faila || stripe == failb)) { page = page_in_rbio(rbio, stripe, pagenr, 0); } else { @@ -1943,7 +1949,8 @@ pstripe: * if we're rebuilding a read, we have to use * pages from the bio list */ - if (rbio->operation == BTRFS_RBIO_READ_REBUILD && + if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && (stripe == faila || stripe == failb)) { page = page_in_rbio(rbio, stripe, pagenr, 0); } else { @@ -1964,7 +1971,9 @@ cleanup_io: else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - rbio_orig_end_io(rbio, err, err == 0); + rbio_orig_end_io(rbio, err); + } else if (rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { + rbio_orig_end_io(rbio, err); } else if (err == 0) { rbio->faila = -1; rbio->failb = -1; @@ -1976,7 +1985,7 @@ cleanup_io: else BUG(); } else { - rbio_orig_end_io(rbio, err, 0); + rbio_orig_end_io(rbio, err); } } @@ -1984,7 +1993,7 @@ cleanup_io: * This is called only for stripes we've read from disk to * reconstruct the parity. */ -static void raid_recover_end_io(struct bio *bio, int err) +static void raid_recover_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; @@ -1992,7 +2001,7 @@ static void raid_recover_end_io(struct bio *bio, int err) * we only read stripe pages off the disk, set them * up to date if there were no errors */ - if (err) + if (bio->bi_error) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -2002,7 +2011,7 @@ static void raid_recover_end_io(struct bio *bio, int err) return; if (atomic_read(&rbio->error) > rbio->bbio->max_errors) - rbio_orig_end_io(rbio, -EIO, 0); + rbio_orig_end_io(rbio, -EIO); else __raid_recover_end_io(rbio); } @@ -2094,15 +2103,15 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); submit_bio(READ, bio); } out: return 0; cleanup: - if (rbio->operation == BTRFS_RBIO_READ_REBUILD) - rbio_orig_end_io(rbio, -EIO, 0); + if (rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING) + rbio_orig_end_io(rbio, -EIO); return -EIO; } @@ -2232,8 +2241,9 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, return rbio; } -void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, - struct page *page, u64 logical) +/* Used for both parity scrub and missing. 
*/ +void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, + u64 logical) { int stripe_offset; int index; @@ -2277,11 +2287,12 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) * end io function used by finish_rmw. When we finally * get here, we've written a full stripe */ -static void raid_write_parity_end_io(struct bio *bio, int err) +static void raid_write_parity_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; + int err = bio->bi_error; - if (err) + if (bio->bi_error) fail_bio_stripe(rbio, bio); bio_put(bio); @@ -2294,7 +2305,7 @@ static void raid_write_parity_end_io(struct bio *bio, int err) if (atomic_read(&rbio->error)) err = -EIO; - rbio_orig_end_io(rbio, err, 0); + rbio_orig_end_io(rbio, err); } static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, @@ -2437,7 +2448,7 @@ submit_write: nr_data = bio_list_size(&bio_list); if (!nr_data) { /* Every parity is right */ - rbio_orig_end_io(rbio, 0, 0); + rbio_orig_end_io(rbio, 0); return; } @@ -2450,13 +2461,12 @@ submit_write: bio->bi_private = rbio; bio->bi_end_io = raid_write_parity_end_io; - BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); submit_bio(WRITE, bio); } return; cleanup: - rbio_orig_end_io(rbio, -EIO, 0); + rbio_orig_end_io(rbio, -EIO); } static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) @@ -2524,7 +2534,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) return; cleanup: - rbio_orig_end_io(rbio, -EIO, 0); + rbio_orig_end_io(rbio, -EIO); } /* @@ -2535,11 +2545,11 @@ cleanup: * This will usually kick off finish_rmw once all the bios are read in, but it * may trigger parity reconstruction if we had any errors along the way */ -static void raid56_parity_scrub_end_io(struct bio *bio, int err) +static void raid56_parity_scrub_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (err) + if (bio->bi_error) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -2632,14 +2642,13 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); submit_bio(READ, bio); } /* the actual write will happen once the reads are done */ return; cleanup: - rbio_orig_end_io(rbio, -EIO, 0); + rbio_orig_end_io(rbio, -EIO); return; finish: @@ -2668,3 +2677,55 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) if (!lock_stripe_add(rbio)) async_scrub_parity(rbio); } + +/* The following code is used for dev replace of a missing RAID 5/6 device. 
*/ + +struct btrfs_raid_bio * +raid56_alloc_missing_rbio(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 length) +{ + struct btrfs_raid_bio *rbio; + + rbio = alloc_rbio(root, bbio, length); + if (IS_ERR(rbio)) + return NULL; + + rbio->operation = BTRFS_RBIO_REBUILD_MISSING; + bio_list_add(&rbio->bio_list, bio); + /* + * This is a special bio which is used to hold the completion handler + * and make the scrub rbio similar to the other types + */ + ASSERT(!bio->bi_iter.bi_size); + + rbio->faila = find_logical_bio_stripe(rbio, bio); + if (rbio->faila == -1) { + BUG(); + kfree(rbio); + return NULL; + } + + return rbio; +} + +static void missing_raid56_work(struct btrfs_work *work) +{ + struct btrfs_raid_bio *rbio; + + rbio = container_of(work, struct btrfs_raid_bio, work); + __raid56_parity_recover(rbio); +} + +static void async_missing_raid56(struct btrfs_raid_bio *rbio) +{ + btrfs_init_work(&rbio->work, btrfs_rmw_helper, + missing_raid56_work, NULL, NULL); + + btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); +} + +void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) +{ + if (!lock_stripe_add(rbio)) + async_missing_raid56(rbio); +} diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 2b5d7977d83b..8b694699d502 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -48,15 +48,21 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, int raid56_parity_write(struct btrfs_root *root, struct bio *bio, struct btrfs_bio *bbio, u64 stripe_len); +void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, + u64 logical); + struct btrfs_raid_bio * raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, struct btrfs_bio *bbio, u64 stripe_len, struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors); -void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, - struct page *page, u64 logical); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); +struct btrfs_raid_bio * +raid56_alloc_missing_rbio(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 length); +void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); + int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); #endif diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 0e7beea92b4c..4645cd16d5ba 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -328,6 +328,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, struct btrfs_device *prev_dev; u32 blocksize; u64 length; + int real_stripes; int nzones = 0; int i; unsigned long index = logical >> PAGE_CACHE_SHIFT; @@ -369,7 +370,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, goto error; } - for (nzones = 0; nzones < bbio->num_stripes; ++nzones) { + real_stripes = bbio->num_stripes - bbio->num_tgtdevs; + for (nzones = 0; nzones < real_stripes; ++nzones) { struct reada_zone *zone; dev = bbio->stripes[nzones].dev; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 88cbb5995667..303babeef505 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2523,8 +2523,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, * counted. return -ENOENT if the block is root of reloc tree.
*/ static noinline_for_stack -struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, - struct backref_node *node) +struct btrfs_root *select_one_root(struct backref_node *node) { struct backref_node *next; struct btrfs_root *root; @@ -2912,7 +2911,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, return 0; BUG_ON(node->processed); - root = select_one_root(trans, node); + root = select_one_root(node); if (root == ERR_PTR(-ENOENT)) { update_processed_blocks(rc, node); goto out; @@ -3755,8 +3754,7 @@ out: * helper to find next unprocessed extent */ static noinline_for_stack -int find_next_extent(struct btrfs_trans_handle *trans, - struct reloc_control *rc, struct btrfs_path *path, +int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, struct btrfs_key *extent_key) { struct btrfs_key key; @@ -3951,7 +3949,7 @@ restart: continue; } - ret = find_next_extent(trans, rc, path, &key); + ret = find_next_extent(rc, path, &key); if (ret < 0) err = ret; if (ret != 0) @@ -3976,6 +3974,10 @@ restart: sizeof(struct btrfs_extent_item_v0)); ret = get_ref_objectid_v0(rc, path, &key, &ref_owner, &path_change); + if (ret < 0) { + err = ret; + break; + } if (ref_owner < BTRFS_FIRST_FREE_OBJECTID) flags = BTRFS_EXTENT_FLAG_TREE_BLOCK; else @@ -4140,7 +4142,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans; struct btrfs_root *root; struct btrfs_key key; - u64 objectid = BTRFS_FIRST_FREE_OBJECTID; + u64 objectid; int err = 0; root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); @@ -4215,14 +4217,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) rc->block_group = btrfs_lookup_block_group(fs_info, group_start); BUG_ON(!rc->block_group); - if (!rc->block_group->ro) { - ret = btrfs_set_block_group_ro(extent_root, rc->block_group); - if (ret) { - err = ret; - goto out; - } - rw = 1; + ret = btrfs_inc_block_group_ro(extent_root, rc->block_group); + if (ret) { + err = ret; + goto out; } + rw = 1; path = btrfs_alloc_path(); if (!path) { @@ -4294,7 +4294,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); out: if (err && rw) - btrfs_set_block_group_rw(extent_root, rc->block_group); + btrfs_dec_block_group_ro(extent_root, rc->block_group); iput(rc->data_inode); btrfs_put_block_group(rc->block_group); kfree(rc); @@ -4594,8 +4594,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, * called before creating snapshot. 
it calculates metadata reservation * required for relocating tree blocks in the snapshot */ -void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending, +void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve) { struct btrfs_root *root; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 94db0fa5225a..9a11db0c47ee 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -125,6 +125,7 @@ struct scrub_block { /* It is for the data with checksum */ unsigned int data_corrected:1; }; + struct btrfs_work work; }; /* Used for the chunks with parity stripe such RAID5/6 */ @@ -278,7 +279,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u8 *csum, int force, u64 physical_for_dev_replace); -static void scrub_bio_end_io(struct bio *bio, int err); +static void scrub_bio_end_io(struct bio *bio); static void scrub_bio_end_io_worker(struct btrfs_work *work); static void scrub_block_complete(struct scrub_block *sblock); static void scrub_remap_extent(struct btrfs_fs_info *fs_info, @@ -295,7 +296,7 @@ static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx); static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage); static void scrub_wr_submit(struct scrub_ctx *sctx); -static void scrub_wr_bio_end_io(struct bio *bio, int err); +static void scrub_wr_bio_end_io(struct bio *bio); static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); static int write_page_nocow(struct scrub_ctx *sctx, u64 physical_for_dev_replace, struct page *page); @@ -332,11 +333,14 @@ static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) } } -static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) +static void scrub_pause_on(struct btrfs_fs_info *fs_info) { atomic_inc(&fs_info->scrubs_paused); wake_up(&fs_info->scrub_pause_wait); +} +static void scrub_pause_off(struct btrfs_fs_info *fs_info) +{ mutex_lock(&fs_info->scrub_lock); __scrub_blocked_if_needed(fs_info); atomic_dec(&fs_info->scrubs_paused); @@ -345,6 +349,12 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) wake_up(&fs_info->scrub_pause_wait); } +static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) +{ + scrub_pause_on(fs_info); + scrub_pause_off(fs_info); +} + /* * used for workers that require transaction commits (i.e., for the * NOCOW case) @@ -454,27 +464,14 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) struct scrub_ctx *sctx; int i; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; - int pages_per_rd_bio; int ret; - /* - * the setting of pages_per_rd_bio is correct for scrub but might - * be wrong for the dev_replace code where we might read from - * different devices in the initial huge bios. However, that - * code is able to correctly handle the case when adding a page - to a bio fails.
- */ - if (dev->bdev) - pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO, - bio_get_nr_vecs(dev->bdev)); - else - pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; sctx = kzalloc(sizeof(*sctx), GFP_NOFS); if (!sctx) goto nomem; atomic_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; - sctx->pages_per_rd_bio = pages_per_rd_bio; + sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; sctx->curr = -1; sctx->dev_root = dev->dev_root; for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { @@ -1429,11 +1426,11 @@ struct scrub_bio_ret { int error; }; -static void scrub_bio_wait_endio(struct bio *bio, int error) +static void scrub_bio_wait_endio(struct bio *bio) { struct scrub_bio_ret *ret = bio->bi_private; - ret->error = error; + ret->error = bio->bi_error; complete(&ret->event); } @@ -1790,12 +1787,12 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) btrfsic_submit_bio(WRITE, sbio->bio); } -static void scrub_wr_bio_end_io(struct bio *bio, int err) +static void scrub_wr_bio_end_io(struct bio *bio) { struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; - sbio->err = err; + sbio->err = bio->bi_error; sbio->bio = bio; btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper, @@ -2087,21 +2084,7 @@ static void scrub_submit(struct scrub_ctx *sctx) sbio = sctx->bios[sctx->curr]; sctx->curr = -1; scrub_pending_bio_inc(sctx); - - if (!sbio->bio->bi_bdev) { - /* - * this case should not happen. If btrfs_map_block() is - * wrong, it could happen for dev-replace operations on - * missing devices when no mirrors are available, but in - * this case it should already fail the mount. - * This case is handled correctly (but _very_ slowly). - */ - printk_ratelimited(KERN_WARNING - "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n"); - bio_endio(sbio->bio, -EIO); - } else { - btrfsic_submit_bio(READ, sbio->bio); - } + btrfsic_submit_bio(READ, sbio->bio); } static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, @@ -2178,6 +2161,134 @@ again: return 0; } +static void scrub_missing_raid56_end_io(struct bio *bio) +{ + struct scrub_block *sblock = bio->bi_private; + struct btrfs_fs_info *fs_info = sblock->sctx->dev_root->fs_info; + + if (bio->bi_error) + sblock->no_io_error_seen = 0; + + btrfs_queue_work(fs_info->scrub_workers, &sblock->work); +} + +static void scrub_missing_raid56_worker(struct btrfs_work *work) +{ + struct scrub_block *sblock = container_of(work, struct scrub_block, work); + struct scrub_ctx *sctx = sblock->sctx; + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + unsigned int is_metadata; + unsigned int have_csum; + u8 *csum; + u64 generation; + u64 logical; + struct btrfs_device *dev; + + is_metadata = !(sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA); + have_csum = sblock->pagev[0]->have_csum; + csum = sblock->pagev[0]->csum; + generation = sblock->pagev[0]->generation; + logical = sblock->pagev[0]->logical; + dev = sblock->pagev[0]->dev; + + if (sblock->no_io_error_seen) { + scrub_recheck_block_checksum(fs_info, sblock, is_metadata, + have_csum, csum, generation, + sctx->csum_size); + } + + if (!sblock->no_io_error_seen) { + spin_lock(&sctx->stat_lock); + sctx->stat.read_errors++; + spin_unlock(&sctx->stat_lock); + printk_ratelimited_in_rcu(KERN_ERR + "BTRFS: I/O error rebulding logical %llu for dev %s\n", + logical, rcu_str_deref(dev->name)); + } else if (sblock->header_error || sblock->checksum_error) { + spin_lock(&sctx->stat_lock); + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + 
printk_ratelimited_in_rcu(KERN_ERR + "BTRFS: failed to rebuild valid logical %llu for dev %s\n", + logical, rcu_str_deref(dev->name)); + } else { + scrub_write_block_to_dev_replace(sblock); + } + + scrub_block_put(sblock); + + if (sctx->is_dev_replace && + atomic_read(&sctx->wr_ctx.flush_all_writes)) { + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); + } + + scrub_pending_bio_dec(sctx); +} + +static void scrub_missing_raid56_pages(struct scrub_block *sblock) +{ + struct scrub_ctx *sctx = sblock->sctx; + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + u64 length = sblock->page_count * PAGE_SIZE; + u64 logical = sblock->pagev[0]->logical; + struct btrfs_bio *bbio; + struct bio *bio; + struct btrfs_raid_bio *rbio; + int ret; + int i; + + ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, &length, + &bbio, 0, 1); + if (ret || !bbio || !bbio->raid_map) + goto bbio_out; + + if (WARN_ON(!sctx->is_dev_replace || + !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) { + /* + * We shouldn't be scrubbing a missing device. Even for dev + * replace, we should only get here for RAID 5/6. We either + * managed to mount something with no mirrors remaining or + * there's a bug in scrub_remap_extent()/btrfs_map_block(). + */ + goto bbio_out; + } + + bio = btrfs_io_bio_alloc(GFP_NOFS, 0); + if (!bio) + goto bbio_out; + + bio->bi_iter.bi_sector = logical >> 9; + bio->bi_private = sblock; + bio->bi_end_io = scrub_missing_raid56_end_io; + + rbio = raid56_alloc_missing_rbio(sctx->dev_root, bio, bbio, length); + if (!rbio) + goto rbio_out; + + for (i = 0; i < sblock->page_count; i++) { + struct scrub_page *spage = sblock->pagev[i]; + + raid56_add_scrub_pages(rbio, spage->page, spage->logical); + } + + btrfs_init_work(&sblock->work, btrfs_scrub_helper, + scrub_missing_raid56_worker, NULL, NULL); + scrub_block_get(sblock); + scrub_pending_bio_inc(sctx); + raid56_submit_missing_rbio(rbio); + return; + +rbio_out: + bio_put(bio); +bbio_out: + btrfs_put_bbio(bbio); + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); +} + static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u8 *csum, int force, @@ -2241,31 +2352,39 @@ leave_nomem: } WARN_ON(sblock->page_count == 0); - for (index = 0; index < sblock->page_count; index++) { - struct scrub_page *spage = sblock->pagev[index]; - int ret; + if (dev->missing) { + /* + * This case should only be hit for RAID 5/6 device replace. See + * the comment in scrub_missing_raid56_pages() for details. 
+ */ + scrub_missing_raid56_pages(sblock); + } else { + for (index = 0; index < sblock->page_count; index++) { + struct scrub_page *spage = sblock->pagev[index]; + int ret; - ret = scrub_add_page_to_rd_bio(sctx, spage); - if (ret) { - scrub_block_put(sblock); - return ret; + ret = scrub_add_page_to_rd_bio(sctx, spage); + if (ret) { + scrub_block_put(sblock); + return ret; + } } - } - if (force) - scrub_submit(sctx); + if (force) + scrub_submit(sctx); + } /* last one frees, either here or in bio completion for last page */ scrub_block_put(sblock); return 0; } -static void scrub_bio_end_io(struct bio *bio, int err) +static void scrub_bio_end_io(struct bio *bio) { struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; - sbio->err = err; + sbio->err = bio->bi_error; sbio->bio = bio; btrfs_queue_work(fs_info->scrub_workers, &sbio->work); @@ -2564,6 +2683,11 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity, u8 csum[BTRFS_CSUM_SIZE]; u32 blocksize; + if (dev->missing) { + scrub_parity_mark_sectors_error(sparity, logical, len); + return 0; + } + if (flags & BTRFS_EXTENT_FLAG_DATA) { blocksize = sctx->sectorsize; } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { @@ -2672,11 +2796,11 @@ static void scrub_parity_bio_endio_worker(struct btrfs_work *work) scrub_pending_bio_dec(sctx); } -static void scrub_parity_bio_endio(struct bio *bio, int error) +static void scrub_parity_bio_endio(struct bio *bio) { struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; - if (error) + if (bio->bi_error) bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); @@ -2702,7 +2826,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) sparity->nsectors)) goto out; - length = sparity->logic_end - sparity->logic_start + 1; + length = sparity->logic_end - sparity->logic_start; ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE, sparity->logic_start, &length, &bbio, 0, 1); @@ -2725,8 +2849,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) goto rbio_out; list_for_each_entry(spage, &sparity->spages, list) - raid56_parity_add_scrub_pages(rbio, spage->page, - spage->logical); + raid56_add_scrub_pages(rbio, spage->page, spage->logical); scrub_pending_bio_inc(sctx); raid56_parity_submit_scrub_rbio(rbio); @@ -2774,6 +2897,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, struct btrfs_root *root = fs_info->extent_root; struct btrfs_root *csum_root = fs_info->csum_root; struct btrfs_extent_item *extent; + struct btrfs_bio *bbio = NULL; u64 flags; int ret; int slot; @@ -2783,6 +2907,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, u64 extent_logical; u64 extent_physical; u64 extent_len; + u64 mapped_length; struct btrfs_device *extent_dev; struct scrub_parity *sparity; int nsectors; @@ -2856,6 +2981,10 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, } btrfs_item_key_to_cpu(l, &key, slot); + if (key.type != BTRFS_EXTENT_ITEM_KEY && + key.type != BTRFS_METADATA_ITEM_KEY) + goto next; + if (key.type == BTRFS_METADATA_ITEM_KEY) bytes = root->nodesize; else @@ -2864,11 +2993,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, if (key.objectid + bytes <= logic_start) goto next; - if (key.type != BTRFS_EXTENT_ITEM_KEY && - key.type != BTRFS_METADATA_ITEM_KEY) - goto next; - - if (key.objectid > logic_end) { + if (key.objectid >= logic_end) { stop_loop = 1; break; } @@ 
-2881,11 +3006,12 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, flags = btrfs_extent_flags(l, extent); generation = btrfs_extent_generation(l, extent); - if (key.objectid < logic_start && - (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { - btrfs_err(fs_info, - "scrub: tree block %llu spanning stripes, ignored. logical=%llu", - key.objectid, logic_start); + if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && + (key.objectid < logic_start || + key.objectid + bytes > + logic_start + map->stripe_len)) { + btrfs_err(fs_info, "scrub: tree block %llu spanning stripes, ignored. logical=%llu", + key.objectid, logic_start); goto next; } again: @@ -2905,10 +3031,21 @@ again: scrub_parity_mark_sectors_data(sparity, extent_logical, extent_len); - scrub_remap_extent(fs_info, extent_logical, - extent_len, &extent_physical, - &extent_dev, - &extent_mirror_num); + mapped_length = extent_len; + ret = btrfs_map_block(fs_info, READ, extent_logical, + &mapped_length, &bbio, 0); + if (!ret) { + if (!bbio || mapped_length < extent_len) + ret = -EIO; + } + if (ret) { + btrfs_put_bbio(bbio); + goto out; + } + extent_physical = bbio->stripes[0].physical; + extent_mirror_num = bbio->mirror_num; + extent_dev = bbio->stripes[0].dev; + btrfs_put_bbio(bbio); ret = btrfs_lookup_csums_range(csum_root, extent_logical, @@ -2923,10 +3060,12 @@ again: extent_dev, flags, generation, extent_mirror_num); + + scrub_free_csums(sctx); + if (ret) goto out; - scrub_free_csums(sctx); if (extent_logical + extent_len < key.objectid + bytes) { logic_start += map->stripe_len; @@ -2955,7 +3094,7 @@ next: out: if (ret < 0) scrub_parity_mark_sectors_error(sparity, logic_start, - logic_end - logic_start + 1); + logic_end - logic_start); scrub_parity_put(sparity); scrub_submit(sctx); mutex_lock(&sctx->wr_ctx.wr_lock); @@ -3104,22 +3243,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, */ ret = 0; while (physical < physical_end) { - /* for raid56, we skip parity stripe */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - ret = get_raid56_logic_offset(physical, num, - map, &logical, &stripe_logical); - logical += base; - if (ret) { - stripe_logical += base; - stripe_end = stripe_logical + increment - 1; - ret = scrub_raid56_parity(sctx, map, scrub_dev, - ppath, stripe_logical, - stripe_end); - if (ret) - goto out; - goto skip; - } - } /* * canceled? 
*/ @@ -3144,6 +3267,24 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, scrub_blocked_if_needed(fs_info); } + /* for raid56, we skip parity stripe */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ret = get_raid56_logic_offset(physical, num, map, + &logical, + &stripe_logical); + logical += base; + if (ret) { + stripe_logical += base; + stripe_end = stripe_logical + increment; + ret = scrub_raid56_parity(sctx, map, scrub_dev, + ppath, stripe_logical, + stripe_end); + if (ret) + goto out; + goto skip; + } + } + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else @@ -3188,6 +3329,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } btrfs_item_key_to_cpu(l, &key, slot); + if (key.type != BTRFS_EXTENT_ITEM_KEY && + key.type != BTRFS_METADATA_ITEM_KEY) + goto next; + if (key.type == BTRFS_METADATA_ITEM_KEY) bytes = root->nodesize; else @@ -3196,10 +3341,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (key.objectid + bytes <= logical) goto next; - if (key.type != BTRFS_EXTENT_ITEM_KEY && - key.type != BTRFS_METADATA_ITEM_KEY) - goto next; - if (key.objectid >= logical + map->stripe_len) { /* out of this device extent */ if (key.objectid >= logic_end) @@ -3212,8 +3353,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, flags = btrfs_extent_flags(l, extent); generation = btrfs_extent_generation(l, extent); - if (key.objectid < logical && - (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { + if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && + (key.objectid < logical || + key.objectid + bytes > + logical + map->stripe_len)) { btrfs_err(fs_info, "scrub: tree block %llu spanning " "stripes, ignored. logical=%llu", @@ -3247,9 +3390,11 @@ again: &extent_dev, &extent_mirror_num); - ret = btrfs_lookup_csums_range(csum_root, logical, - logical + map->stripe_len - 1, - &sctx->csum_list, 1); + ret = btrfs_lookup_csums_range(csum_root, + extent_logical, + extent_logical + + extent_len - 1, + &sctx->csum_list, 1); if (ret) goto out; @@ -3257,10 +3402,12 @@ again: extent_physical, extent_dev, flags, generation, extent_mirror_num, extent_logical - logical + physical); + + scrub_free_csums(sctx); + if (ret) goto out; - scrub_free_csums(sctx); if (extent_logical + extent_len < key.objectid + bytes) { if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { @@ -3278,7 +3425,7 @@ loop: if (ret && physical < physical_end) { stripe_logical += base; stripe_end = stripe_logical + - increment - 1; + increment; ret = scrub_raid56_parity(sctx, map, scrub_dev, ppath, stripe_logical, @@ -3387,7 +3534,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, u64 chunk_tree; u64 chunk_objectid; u64 chunk_offset; - int ret; + int ret = 0; int slot; struct extent_buffer *l; struct btrfs_key key; @@ -3415,8 +3562,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); - if (ret) + if (ret < 0) + break; + if (ret > 0) { + ret = 0; break; + } + } else { + ret = 0; } } @@ -3458,6 +3611,22 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!cache) goto skip; + /* + * we need call btrfs_inc_block_group_ro() with scrubs_paused, + * to avoid deadlock caused by: + * btrfs_inc_block_group_ro() + * -> btrfs_wait_for_commit() + * -> btrfs_commit_transaction() + * -> btrfs_scrub_pause() + */ + scrub_pause_on(fs_info); + ret = btrfs_inc_block_group_ro(root, cache); + scrub_pause_off(fs_info); + if (ret) { + 
btrfs_put_block_group(cache); + break; + } + dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; @@ -3483,8 +3652,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); - atomic_inc(&fs_info->scrubs_paused); - wake_up(&fs_info->scrub_pause_wait); + + scrub_pause_on(fs_info); /* * must be called before we decrease @scrub_paused. @@ -3495,11 +3664,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, atomic_read(&sctx->workers_pending) == 0); atomic_set(&sctx->wr_ctx.flush_all_writes, 0); - mutex_lock(&fs_info->scrub_lock); - __scrub_blocked_if_needed(fs_info); - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - wake_up(&fs_info->scrub_pause_wait); + scrub_pause_off(fs_info); + + btrfs_dec_block_group_ro(root, cache); btrfs_put_block_group(cache); if (ret) @@ -3523,11 +3690,7 @@ skip: btrfs_free_path(path); - /* - * ret can still be 1 from search_slot or next_leaf, - * that's not an error - */ - return ret < 0 ? ret : 0; + return ret; } static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, @@ -3896,8 +4059,7 @@ static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, return 0; WARN_ON(!dev->bdev); - wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO, - bio_get_nr_vecs(dev->bdev)); + wr_ctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; wr_ctx->tgtdev = dev; atomic_set(&wr_ctx->flush_all_writes, 0); return 0; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index cd7ef34d2dce..2b07b3581781 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -69,7 +69,7 @@ static struct file_system_type btrfs_fs_type; static int btrfs_remount(struct super_block *sb, int *flags, char *data); -static const char *btrfs_decode_error(int errno) +const char *btrfs_decode_error(int errno) { char *errstr = "unknown"; @@ -1033,6 +1033,7 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_flags |= MS_POSIXACL; #endif sb->s_flags |= MS_I_VERSION; + sb->s_iflags |= SB_I_CGROUPWB; err = open_ctree(sb, fs_devices, (char *)data); if (err) { printk(KERN_ERR "BTRFS: open_ctree failed\n"); @@ -1650,6 +1651,17 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) sb->s_flags |= MS_RDONLY; + /* + * Setting MS_RDONLY will put the cleaner thread to + * sleep at the next loop if it's already active. + * If it's already asleep, we'll leave unused block + * groups on disk until we're mounted read-write again + * unless we clean them up here. 
+ */ + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_delete_unused_bgs(fs_info); + mutex_unlock(&root->fs_info->cleaner_mutex); + btrfs_dev_replace_suspend_for_unmount(fs_info); btrfs_scrub_cancel(fs_info); btrfs_pause_balance(fs_info); @@ -2163,8 +2175,7 @@ static int btrfs_interface_init(void) static void btrfs_interface_exit(void) { - if (misc_deregister(&btrfs_misc) < 0) - printk(KERN_INFO "BTRFS: misc_deregister failed for control device\n"); + misc_deregister(&btrfs_misc); } static void btrfs_print_info(void) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 51e0f0d0053e..8f259b3a66b3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -258,6 +258,8 @@ loop: mutex_init(&cur_trans->cache_write_mutex); cur_trans->num_dirty_bgs = 0; spin_lock_init(&cur_trans->dirty_bgs_lock); + INIT_LIST_HEAD(&cur_trans->deleted_bgs); + spin_lock_init(&cur_trans->deleted_bgs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -1301,7 +1303,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, */ btrfs_set_skip_qgroup(trans, objectid); - btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); + btrfs_reloc_pre_snapshot(pending, &to_reserve); if (to_reserve > 0) { pending->error = btrfs_block_rsv_add(root, @@ -1638,9 +1640,7 @@ static void do_async_commit(struct work_struct *work) * Tell lockdep about it. */ if (ac->newtrans->type & __TRANS_FREEZABLE) - rwsem_acquire_read( - &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); + __sb_writers_acquired(ac->root->fs_info->sb, SB_FREEZE_FS); current->journal_info = ac->newtrans; @@ -1679,9 +1679,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, * async commit thread will be the one to unlock it. 
*/ if (ac->newtrans->type & __TRANS_FREEZABLE) - rwsem_release( - &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], - 1, _THIS_IP_); + __sb_writers_release(root->fs_info->sb, SB_FREEZE_FS); schedule_work(&ac->work); @@ -1893,8 +1891,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, spin_unlock(&root->fs_info->trans_lock); wait_for_commit(root, prev_trans); + ret = prev_trans->aborted; btrfs_put_transaction(prev_trans); + if (ret) + goto cleanup_transaction; } else { spin_unlock(&root->fs_info->trans_lock); } @@ -2152,7 +2153,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_trans_handle_cachep, trans); - if (current != root->fs_info->transaction_kthread) + if (current != root->fs_info->transaction_kthread && + current != root->fs_info->cleaner_kthread) btrfs_run_delayed_iputs(root); return ret; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index eb09c2067fa8..edc2fbc262d7 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -74,6 +74,8 @@ struct btrfs_transaction { */ struct mutex cache_write_mutex; spinlock_t dirty_bgs_lock; + struct list_head deleted_bgs; + spinlock_t deleted_bgs_lock; struct btrfs_delayed_ref_root delayed_refs; int aborted; int dirty_bg_run; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9c45431e69ab..1bbaace73383 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -140,55 +140,46 @@ static int start_log_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_log_ctx *ctx) { - int index; - int ret; + int ret = 0; mutex_lock(&root->log_mutex); + if (root->log_root) { if (btrfs_need_log_full_commit(root->fs_info, trans)) { ret = -EAGAIN; goto out; } + if (!root->log_start_pid) { - root->log_start_pid = current->pid; clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); + root->log_start_pid = current->pid; } else if (root->log_start_pid != current->pid) { set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } + } else { + mutex_lock(&root->fs_info->tree_log_mutex); + if (!root->fs_info->log_root_tree) + ret = btrfs_init_log_root_tree(trans, root->fs_info); + mutex_unlock(&root->fs_info->tree_log_mutex); + if (ret) + goto out; - atomic_inc(&root->log_batch); - atomic_inc(&root->log_writers); - if (ctx) { - index = root->log_transid % 2; - list_add_tail(&ctx->list, &root->log_ctxs[index]); - ctx->log_transid = root->log_transid; - } - mutex_unlock(&root->log_mutex); - return 0; - } - - ret = 0; - mutex_lock(&root->fs_info->tree_log_mutex); - if (!root->fs_info->log_root_tree) - ret = btrfs_init_log_root_tree(trans, root->fs_info); - mutex_unlock(&root->fs_info->tree_log_mutex); - if (ret) - goto out; - - if (!root->log_root) { ret = btrfs_add_log_tree(trans, root); if (ret) goto out; + + clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); + root->log_start_pid = current->pid; } - clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); - root->log_start_pid = current->pid; + atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); if (ctx) { - index = root->log_transid % 2; + int index = root->log_transid % 2; list_add_tail(&ctx->list, &root->log_ctxs[index]); ctx->log_transid = root->log_transid; } + out: mutex_unlock(&root->log_mutex); return ret; @@ -731,12 +722,66 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, &ordered_sums, 0); if (ret) goto out; + /* + * Now delete all existing cums in the csum root that + * cover our range. 
We do this because we can have an + * extent that is completely referenced by one file + * extent item and partially referenced by another + * file extent item (like after using the clone or + * extent_same ioctls). In this case if we end up doing + * the replay of the one that partially references the + * extent first, and we do not do the csum deletion + * below, we can get 2 csum items in the csum tree that + * overlap each other. For example, imagine our log has + * the two following file extent items: + * + * key (257 EXTENT_DATA 409600) + * extent data disk byte 12845056 nr 102400 + * extent data offset 20480 nr 20480 ram 102400 + * + * key (257 EXTENT_DATA 819200) + * extent data disk byte 12845056 nr 102400 + * extent data offset 0 nr 102400 ram 102400 + * + * Where the second one fully references the 100K extent + * that starts at disk byte 12845056, and the log tree + * has a single csum item that covers the entire range + * of the extent: + * + * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 + * + * After the first file extent item is replayed, the + * csum tree gets the following csum item: + * + * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 + * + * Which covers the 20K sub-range starting at offset 20K + * of our extent. Now when we replay the second file + * extent item, if we do not delete existing csum items + * that cover any of its blocks, we end up getting two + * csum items in our csum tree that overlap each other: + * + * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 + * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 + * + * Which is a problem, because after this anyone trying + * to lookup up for the checksum of any block of our + * extent starting at an offset of 40K or higher, will + * end up looking at the second csum item only, which + * does not contain the checksum for any block starting + * at offset 40K or higher of our extent. + */ while (!list_empty(&ordered_sums)) { struct btrfs_ordered_sum *sums; sums = list_entry(ordered_sums.next, struct btrfs_ordered_sum, list); if (!ret) + ret = btrfs_del_csums(trans, + root->fs_info->csum_root, + sums->bytenr, + sums->len); + if (!ret) ret = btrfs_csum_file_blocks(trans, root->fs_info->csum_root, sums); @@ -1549,9 +1594,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, */ static noinline int insert_one_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, u64 dirid, u64 index, - char *name, int name_len, u8 type, + char *name, int name_len, struct btrfs_key *location) { struct inode *inode; @@ -1613,6 +1657,9 @@ static bool name_in_log_ref(struct btrfs_root *log_root, * not exist in the FS, it is skipped. fsyncs on directories * do not force down inodes inside that directory, just changes to the * names or unlinks in a directory. + * + * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a + * non-existing inode) and 1 if the name was replayed. 
*/ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -1631,6 +1678,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, int exists; int ret = 0; bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); + bool name_added = false; dir = read_one_inode(root, key->objectid); if (!dir) @@ -1708,6 +1756,8 @@ out: } kfree(name); iput(dir); + if (!ret && name_added) + ret = 1; return ret; insert: @@ -1719,10 +1769,12 @@ insert: goto out; } btrfs_release_path(path); - ret = insert_one_name(trans, root, path, key->objectid, key->offset, - name, name_len, log_type, &log_key); + ret = insert_one_name(trans, root, key->objectid, key->offset, + name, name_len, &log_key); if (ret && ret != -ENOENT && ret != -EEXIST) goto out; + if (!ret) + name_added = true; update_size = false; ret = 0; goto out; @@ -1740,12 +1792,13 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - int ret; + int ret = 0; u32 item_size = btrfs_item_size_nr(eb, slot); struct btrfs_dir_item *di; int name_len; unsigned long ptr; unsigned long ptr_end; + struct btrfs_path *fixup_path = NULL; ptr = btrfs_item_ptr_offset(eb, slot); ptr_end = ptr + item_size; @@ -1755,12 +1808,59 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, return -EIO; name_len = btrfs_dir_name_len(eb, di); ret = replay_one_name(trans, root, path, eb, di, key); - if (ret) - return ret; + if (ret < 0) + break; ptr = (unsigned long)(di + 1); ptr += name_len; + + /* + * If this entry refers to a non-directory (directories can not + * have a link count > 1) and it was added in the transaction + * that was not committed, make sure we fixup the link count of + * the inode it the entry points to. Otherwise something like + * the following would result in a directory pointing to an + * inode with a wrong link that does not account for this dir + * entry: + * + * mkdir testdir + * touch testdir/foo + * touch testdir/bar + * sync + * + * ln testdir/bar testdir/bar_link + * ln testdir/foo testdir/foo_link + * xfs_io -c "fsync" testdir/bar + * + * <power failure> + * + * mount fs, log replay happens + * + * File foo would remain with a link count of 1 when it has two + * entries pointing to it in the directory testdir. This would + * make it impossible to ever delete the parent directory has + * it would result in stale dentries that can never be deleted. 
+ */ + if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { + struct btrfs_key di_key; + + if (!fixup_path) { + fixup_path = btrfs_alloc_path(); + if (!fixup_path) { + ret = -ENOMEM; + break; + } + } + + btrfs_dir_item_key_to_cpu(eb, di, &di_key); + ret = link_to_fixup_dir(trans, root, fixup_path, + di_key.objectid); + if (ret) + break; + } + ret = 0; } - return 0; + btrfs_free_path(fixup_path); + return ret; } /* @@ -2535,8 +2635,7 @@ static int update_log_root(struct btrfs_trans_handle *trans, return ret; } -static void wait_log_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int transid) +static void wait_log_commit(struct btrfs_root *root, int transid) { DEFINE_WAIT(wait); int index = transid % 2; @@ -2561,8 +2660,7 @@ static void wait_log_commit(struct btrfs_trans_handle *trans, atomic_read(&root->log_commit[index])); } -static void wait_for_writer(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static void wait_for_writer(struct btrfs_root *root) { DEFINE_WAIT(wait); @@ -2642,7 +2740,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index1 = log_transid % 2; if (atomic_read(&root->log_commit[index1])) { - wait_log_commit(trans, root, log_transid); + wait_log_commit(root, log_transid); mutex_unlock(&root->log_mutex); return ctx->log_ret; } @@ -2651,7 +2749,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* wait for previous tree log sync to complete */ if (atomic_read(&root->log_commit[(index1 + 1) % 2])) - wait_log_commit(trans, root, log_transid - 1); + wait_log_commit(root, log_transid - 1); while (1) { int batch = atomic_read(&root->log_batch); @@ -2662,7 +2760,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, schedule_timeout_uninterruptible(1); mutex_lock(&root->log_mutex); } - wait_for_writer(trans, root); + wait_for_writer(root); if (batch == atomic_read(&root->log_batch)) break; } @@ -2759,7 +2857,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_wait_logged_extents(trans, log, log_transid); - wait_log_commit(trans, log_root_tree, + wait_log_commit(log_root_tree, root_log_ctx.log_transid); mutex_unlock(&log_root_tree->log_mutex); if (!ret) @@ -2770,11 +2868,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, atomic_set(&log_root_tree->log_commit[index2], 1); if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { - wait_log_commit(trans, log_root_tree, + wait_log_commit(log_root_tree, root_log_ctx.log_transid - 1); } - wait_for_writer(trans, log_root_tree); + wait_for_writer(log_root_tree); /* * now that we've moved on to the tree of log tree roots, @@ -4904,6 +5002,94 @@ next_dir_inode: return ret; } +static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_log_ctx *ctx) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_root *root = BTRFS_I(inode)->root; + const u64 ino = btrfs_ino(inode); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->skip_locking = 1; + path->search_commit_root = 1; + + key.objectid = ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + u32 cur_offset = 0; + u32 item_size; + unsigned long ptr; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret 
> 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ + if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) + break; + + item_size = btrfs_item_size_nr(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); + while (cur_offset < item_size) { + struct btrfs_key inode_key; + struct inode *dir_inode; + + inode_key.type = BTRFS_INODE_ITEM_KEY; + inode_key.offset = 0; + + if (key.type == BTRFS_INODE_EXTREF_KEY) { + struct btrfs_inode_extref *extref; + + extref = (struct btrfs_inode_extref *) + (ptr + cur_offset); + inode_key.objectid = btrfs_inode_extref_parent( + leaf, extref); + cur_offset += sizeof(*extref); + cur_offset += btrfs_inode_extref_name_len(leaf, + extref); + } else { + inode_key.objectid = key.offset; + cur_offset = item_size; + } + + dir_inode = btrfs_iget(root->fs_info->sb, &inode_key, + root, NULL); + /* If parent inode was deleted, skip it. */ + if (IS_ERR(dir_inode)) + continue; + + ret = btrfs_log_inode(trans, root, dir_inode, + LOG_INODE_ALL, 0, LLONG_MAX, ctx); + iput(dir_inode); + if (ret) + goto out; + } + path->slots[0]++; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. A minimal inode and backref @@ -4923,9 +5109,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct dentry *old_parent = NULL; int ret = 0; u64 last_committed = root->fs_info->last_trans_committed; - const struct dentry * const first_parent = parent; - const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans > - last_committed); bool log_dentries = false; struct inode *orig_inode = inode; @@ -4986,6 +5169,53 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries) log_dentries = true; + /* + * On unlink we must make sure all our current and old parent directores + * inodes are fully logged. This is to prevent leaving dangling + * directory index entries in directories that were our parents but are + * not anymore. Not doing this results in old parent directory being + * impossible to delete after log replay (rmdir will always fail with + * error -ENOTEMPTY). + * + * Example 1: + * + * mkdir testdir + * touch testdir/foo + * ln testdir/foo testdir/bar + * sync + * unlink testdir/bar + * xfs_io -c fsync testdir/foo + * <power failure> + * mount fs, triggers log replay + * + * If we don't log the parent directory (testdir), after log replay the + * directory still has an entry pointing to the file inode using the bar + * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and + * the file inode has a link count of 1. + * + * Example 2: + * + * mkdir testdir + * touch foo + * ln foo testdir/foo2 + * ln foo testdir/foo3 + * sync + * unlink testdir/foo3 + * xfs_io -c fsync foo + * <power failure> + * mount fs, triggers log replay + * + * Similar as the first example, after log replay the parent directory + * testdir still has an entry pointing to the inode file with name foo3 + * but the file inode does not have a matching BTRFS_INODE_REF_KEY item + * and has a link count of 2. 
+ */ + if (BTRFS_I(inode)->last_unlink_trans > last_committed) { + ret = btrfs_log_all_parents(trans, orig_inode, ctx); + if (ret) + goto end_trans; + } + while (1) { if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) break; @@ -4994,23 +5224,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (root != BTRFS_I(inode)->root) break; - /* - * On unlink we must make sure our immediate parent directory - * inode is fully logged. This is to prevent leaving dangling - * directory index entries and a wrong directory inode's i_size. - * Not doing so can result in a directory being impossible to - * delete after log replay (rmdir will always fail with error - * -ENOTEMPTY). - */ - if (did_unlink && parent == first_parent) - inode_only = LOG_INODE_ALL; - else - inode_only = LOG_INODE_EXISTS; - - if (BTRFS_I(inode)->generation > - root->fs_info->last_trans_committed || - inode_only == LOG_INODE_ALL) { - ret = btrfs_log_inode(trans, root, inode, inode_only, + if (BTRFS_I(inode)->generation > last_committed) { + ret = btrfs_log_inode(trans, root, inode, + LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); if (ret) goto end_trans; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fbe7c104531c..76201d6f6ce4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1116,15 +1116,18 @@ out: return ret; } -static int contains_pending_extent(struct btrfs_trans_handle *trans, +static int contains_pending_extent(struct btrfs_transaction *transaction, struct btrfs_device *device, u64 *start, u64 len) { + struct btrfs_fs_info *fs_info = device->dev_root->fs_info; struct extent_map *em; - struct list_head *search_list = &trans->transaction->pending_chunks; + struct list_head *search_list = &fs_info->pinned_chunks; int ret = 0; u64 physical_start = *start; + if (transaction) + search_list = &transaction->pending_chunks; again: list_for_each_entry(em, search_list, list) { struct map_lookup *map; @@ -1159,8 +1162,8 @@ again: } } } - if (search_list == &trans->transaction->pending_chunks) { - search_list = &trans->root->fs_info->pinned_chunks; + if (search_list != &fs_info->pinned_chunks) { + search_list = &fs_info->pinned_chunks; goto again; } @@ -1169,12 +1172,13 @@ again: /* - * find_free_dev_extent - find free space in the specified device - * @device: the device which we search the free space in - * @num_bytes: the size of the free space that we need - * @start: store the start of the free space. - * @len: the size of the free space. that we find, or the size of the max - * free space if we don't find suitable free space + * find_free_dev_extent_start - find free space in the specified device + * @device: the device which we search the free space in + * @num_bytes: the size of the free space that we need + * @search_start: the position from which to begin the search + * @start: store the start of the free space. + * @len: the size of the free space. that we find, or the size + * of the max free space if we don't find suitable free space * * this uses a pretty simple search, the expectation is that it is * called very infrequently and that a given device has a small number @@ -1188,9 +1192,9 @@ again: * But if we don't find suitable free space, it is used to store the size of * the max free space. 
*/ -int find_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 num_bytes, - u64 *start, u64 *len) +int find_free_dev_extent_start(struct btrfs_transaction *transaction, + struct btrfs_device *device, u64 num_bytes, + u64 search_start, u64 *start, u64 *len) { struct btrfs_key key; struct btrfs_root *root = device->dev_root; @@ -1200,19 +1204,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, u64 max_hole_start; u64 max_hole_size; u64 extent_end; - u64 search_start; u64 search_end = device->total_bytes; int ret; int slot; struct extent_buffer *l; - /* FIXME use last free of some kind */ - - /* we don't want to overwrite the superblock on the drive, - * so we make sure to start at an offset of at least 1MB - */ - search_start = max(root->fs_info->alloc_start, 1024ull * 1024); - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1273,7 +1269,7 @@ again: * Have to check before we set max_hole_start, otherwise * we could end up sending back this offset anyway. */ - if (contains_pending_extent(trans, device, + if (contains_pending_extent(transaction, device, &search_start, hole_size)) { if (key.offset >= search_start) { @@ -1322,7 +1318,7 @@ next: if (search_end > search_start) { hole_size = search_end - search_start; - if (contains_pending_extent(trans, device, &search_start, + if (contains_pending_extent(transaction, device, &search_start, hole_size)) { btrfs_release_path(path); goto again; @@ -1348,6 +1344,24 @@ out: return ret; } +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, + u64 *start, u64 *len) +{ + struct btrfs_root *root = device->dev_root; + u64 search_start; + + /* FIXME use last free of some kind */ + + /* + * we don't want to overwrite the superblock on the drive, + * so we make sure to start at an offset of at least 1MB + */ + search_start = max(root->fs_info->alloc_start, 1024ull * 1024); + return find_free_dev_extent_start(trans->transaction, device, + num_bytes, search_start, start, len); +} + static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 start, u64 *dev_extent_len) @@ -2755,9 +2769,7 @@ out: return ret; } -static int btrfs_relocate_chunk(struct btrfs_root *root, - u64 chunk_objectid, - u64 chunk_offset) +static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset) { struct btrfs_root *extent_root; struct btrfs_trans_handle *trans; @@ -2785,7 +2797,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, return -ENOSPC; /* step one, relocate all the extents inside this chunk */ + btrfs_scrub_pause(root); ret = btrfs_relocate_block_group(extent_root, chunk_offset); + btrfs_scrub_continue(root); if (ret) return ret; @@ -2855,7 +2869,6 @@ again: if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_relocate_chunk(chunk_root, - found_key.objectid, found_key.offset); if (ret == -ENOSPC) failed++; @@ -3375,7 +3388,6 @@ again: } ret = btrfs_relocate_chunk(chunk_root, - found_key.objectid, found_key.offset); mutex_unlock(&fs_info->delete_unused_bgs_mutex); if (ret && ret != -ENOSPC) @@ -4077,7 +4089,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; u64 length; - u64 chunk_objectid; u64 chunk_offset; int ret; int slot; @@ -4154,11 +4165,10 @@ again: break; } - chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); btrfs_release_path(path); 
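An illustrative aside on the find_free_dev_extent() split above, not part of the patch: the old entry point becomes a thin wrapper that keeps the historical 1MB superblock guard, while find_free_dev_extent_start() lets a caller choose its own starting offset and pass a bare btrfs_transaction, or even NULL, in which case contains_pending_extent() now consults fs_info->pinned_chunks. A minimal sketch of such a caller follows; the function name, device pointer and sizes are placeholders.

/*
 * Hypothetical caller of the new lower-level helper.  A NULL transaction
 * is tolerated because contains_pending_extent() falls back to
 * fs_info->pinned_chunks, as added above.
 */
static int example_find_device_hole(struct btrfs_device *device,
                                    u64 num_bytes, u64 *hole_start)
{
        u64 max_avail = 0;

        /* placeholder search offset; real callers pick this deliberately */
        return find_free_dev_extent_start(NULL, device, num_bytes,
                                          1024ull * 1024, hole_start,
                                          &max_avail);
}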
- ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset); + ret = btrfs_relocate_chunk(root, chunk_offset); mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); if (ret && ret != -ENOSPC) goto done; @@ -4200,7 +4210,8 @@ again: u64 start = new_size; u64 len = old_size - new_size; - if (contains_pending_extent(trans, device, &start, len)) { + if (contains_pending_extent(trans->transaction, device, + &start, len)) { unlock_chunks(root); checked_pending_chunks = true; failed = 0; @@ -5071,9 +5082,7 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) * and the stripes */ sizeof(u64) * (total_stripes), - GFP_NOFS); - if (!bbio) - return NULL; + GFP_NOFS|__GFP_NOFAIL); atomic_set(&bbio->error, 0); atomic_set(&bbio->refs, 1); @@ -5741,23 +5750,23 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, return 0; } -static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err) +static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) { bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; - bio_endio(bio, err); + bio_endio(bio); btrfs_put_bbio(bbio); } -static void btrfs_end_bio(struct bio *bio, int err) +static void btrfs_end_bio(struct bio *bio) { struct btrfs_bio *bbio = bio->bi_private; int is_orig_bio = 0; - if (err) { + if (bio->bi_error) { atomic_inc(&bbio->error); - if (err == -EIO || err == -EREMOTEIO) { + if (bio->bi_error == -EIO || bio->bi_error == -EREMOTEIO) { unsigned int stripe_index = btrfs_io_bio(bio)->stripe_index; struct btrfs_device *dev; @@ -5795,17 +5804,16 @@ static void btrfs_end_bio(struct bio *bio, int err) * beyond the tolerance of the btrfs bio */ if (atomic_read(&bbio->error) > bbio->max_errors) { - err = -EIO; + bio->bi_error = -EIO; } else { /* * this bio is actually up to date, we didn't * go over the max number of errors */ - set_bit(BIO_UPTODATE, &bio->bi_flags); - err = 0; + bio->bi_error = 0; } - btrfs_end_bbio(bbio, bio, err); + btrfs_end_bbio(bbio, bio); } else if (!is_orig_bio) { bio_put(bio); } @@ -5826,7 +5834,7 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, struct btrfs_pending_bios *pending_bios; if (device->missing || !device->bdev) { - bio_endio(bio, -EIO); + bio_io_error(bio); return; } @@ -5871,34 +5879,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, &device->work); } -static int bio_size_ok(struct block_device *bdev, struct bio *bio, - sector_t sector) -{ - struct bio_vec *prev; - struct request_queue *q = bdev_get_queue(bdev); - unsigned int max_sectors = queue_max_sectors(q); - struct bvec_merge_data bvm = { - .bi_bdev = bdev, - .bi_sector = sector, - .bi_rw = bio->bi_rw, - }; - - if (WARN_ON(bio->bi_vcnt == 0)) - return 1; - - prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; - if (bio_sectors(bio) > max_sectors) - return 0; - - if (!q->merge_bvec_fn) - return 1; - - bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len; - if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) - return 0; - return 1; -} - static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, struct bio *bio, u64 physical, int dev_nr, int rw, int async) @@ -5932,38 +5912,6 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, btrfsic_submit_bio(rw, bio); } -static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, - struct bio *first_bio, struct btrfs_device *dev, - int dev_nr, int rw, int async) -{ - struct bio_vec *bvec = first_bio->bi_io_vec; - struct bio *bio; - int nr_vecs = 
bio_get_nr_vecs(dev->bdev); - u64 physical = bbio->stripes[dev_nr].physical; - -again: - bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS); - if (!bio) - return -ENOMEM; - - while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { - if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, - bvec->bv_offset) < bvec->bv_len) { - u64 len = bio->bi_iter.bi_size; - - atomic_inc(&bbio->stripes_pending); - submit_stripe_bio(root, bbio, bio, physical, dev_nr, - rw, async); - physical += len; - goto again; - } - bvec++; - } - - submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async); - return 0; -} - static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) { atomic_inc(&bbio->error); @@ -5973,8 +5921,8 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_iter.bi_sector = logical >> 9; - - btrfs_end_bbio(bbio, bio, -EIO); + bio->bi_error = -EIO; + btrfs_end_bbio(bbio, bio); } } @@ -6036,18 +5984,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, continue; } - /* - * Check and see if we're ok with this bio based on it's size - * and offset with the given device. - */ - if (!bio_size_ok(dev->bdev, first_bio, - bbio->stripes[dev_nr].physical >> 9)) { - ret = breakup_stripe_bio(root, bbio, first_bio, dev, - dev_nr, rw, async_submit); - BUG_ON(ret); - continue; - } - if (dev_nr < total_devs - 1) { bio = btrfs_bio_clone(first_bio, GFP_NOFS); BUG_ON(!bio); /* -ENOMEM */ diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 95842a909e7f..2ca784a14e84 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -453,6 +453,9 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); +int find_free_dev_extent_start(struct btrfs_transaction *transaction, + struct btrfs_device *device, u64 num_bytes, + u64 search_start, u64 *start, u64 *max_avail); int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); diff --git a/fs/buffer.c b/fs/buffer.c index 1cf7a53a0277..82283abb2795 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2957,14 +2957,14 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block, } EXPORT_SYMBOL(generic_block_bmap); -static void end_bio_bh_io_sync(struct bio *bio, int err) +static void end_bio_bh_io_sync(struct bio *bio) { struct buffer_head *bh = bio->bi_private; - if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) + if (unlikely(bio_flagged(bio, BIO_QUIET))) set_bit(BH_Quiet, &bh->b_state); - bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); + bh->b_end_io(bh, !bio->bi_error); bio_put(bio); } @@ -3046,12 +3046,9 @@ static int submit_bh_wbc(int rw, struct buffer_head *bh, bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; - bio->bi_io_vec[0].bv_page = bh->b_page; - bio->bi_io_vec[0].bv_len = bh->b_size; - bio->bi_io_vec[0].bv_offset = bh_offset(bh); - bio->bi_vcnt = 1; - bio->bi_iter.bi_size = bh->b_size; + bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); + BUG_ON(bio->bi_iter.bi_size != bh->b_size); bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 890c50971a69..a268abfe60ac 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1593,7 +1593,7 @@ out: 
return err; } -static struct vm_operations_struct ceph_vmops = { +static const struct vm_operations_struct ceph_vmops = { .fault = ceph_filemap_fault, .page_mkwrite = ceph_page_mkwrite, }; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index dc10c9dd36c1..ddd5e9471290 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1506,7 +1506,6 @@ static int __mark_caps_flushing(struct inode *inode, swap(cf, ci->i_prealloc_cap_flush); cf->caps = flushing; - cf->kick = false; spin_lock(&mdsc->cap_dirty_lock); list_del_init(&ci->i_dirty_item); @@ -2123,8 +2122,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, static int __kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, - struct ceph_inode_info *ci, - bool kick_all) + struct ceph_inode_info *ci) { struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; @@ -2150,9 +2148,7 @@ static int __kick_flushing_caps(struct ceph_mds_client *mdsc, for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) { cf = rb_entry(n, struct ceph_cap_flush, i_node); - if (cf->tid < first_tid) - continue; - if (kick_all || cf->kick) + if (cf->tid >= first_tid) break; } if (!n) { @@ -2161,7 +2157,6 @@ static int __kick_flushing_caps(struct ceph_mds_client *mdsc, } cf = rb_entry(n, struct ceph_cap_flush, i_node); - cf->kick = false; first_tid = cf->tid + 1; @@ -2181,8 +2176,6 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, { struct ceph_inode_info *ci; struct ceph_cap *cap; - struct ceph_cap_flush *cf; - struct rb_node *n; dout("early_kick_flushing_caps mds%d\n", session->s_mds); list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { @@ -2205,16 +2198,11 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, if ((cap->issued & ci->i_flushing_caps) != ci->i_flushing_caps) { spin_unlock(&ci->i_ceph_lock); - if (!__kick_flushing_caps(mdsc, session, ci, true)) + if (!__kick_flushing_caps(mdsc, session, ci)) continue; spin_lock(&ci->i_ceph_lock); } - for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) { - cf = rb_entry(n, struct ceph_cap_flush, i_node); - cf->kick = true; - } - spin_unlock(&ci->i_ceph_lock); } } @@ -2228,7 +2216,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, dout("kick_flushing_caps mds%d\n", session->s_mds); list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { - int delayed = __kick_flushing_caps(mdsc, session, ci, false); + int delayed = __kick_flushing_caps(mdsc, session, ci); if (delayed) { spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(mdsc, ci); @@ -2261,7 +2249,7 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, spin_unlock(&ci->i_ceph_lock); - delayed = __kick_flushing_caps(mdsc, session, ci, true); + delayed = __kick_flushing_caps(mdsc, session, ci); if (delayed) { spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(mdsc, ci); diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 4347039ecc18..6706bde9ad1b 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -287,7 +287,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode, return 0; spin_lock(&ctx->flc_lock); - list_for_each_entry(lock, &ctx->flc_flock, fl_list) { + list_for_each_entry(lock, &ctx->flc_posix, fl_list) { ++seen_fcntl; if (seen_fcntl > num_fcntl_locks) { err = -ENOSPC; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index d1c833c321b9..7b6bfcbf801c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -479,7 +479,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (fsopt->max_readdir_bytes != 
CEPH_MAX_READDIR_BYTES_DEFAULT) seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) - seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name); + seq_show_option(m, "snapdirname", fsopt->snapdir_name); return 0; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 860cc016e70d..2f2460d23a06 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -189,7 +189,6 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) struct ceph_cap_flush { u64 tid; int caps; - bool kick; struct rb_node g_node; // global union { struct rb_node i_node; // inode diff --git a/fs/char_dev.c b/fs/char_dev.c index ea06a3d0364c..24b142569ca9 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -274,7 +274,7 @@ out2: } /** - * unregister_chrdev_region() - return a range of device numbers + * unregister_chrdev_region() - unregister a range of device numbers * @from: the first in the range of numbers to unregister * @count: the number of device numbers to unregister * diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h new file mode 100644 index 000000000000..0065256881d8 --- /dev/null +++ b/fs/cifs/cifs_ioctl.h @@ -0,0 +1,42 @@ +/* + * fs/cifs/cifs_ioctl.h + * + * Structure definitions for io control for cifs/smb3 + * + * Copyright (c) 2015 Steve French <steve.french@primarydata.com> + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + */ + +struct smb_mnt_fs_info { + __u32 version; /* 0001 */ + __u16 protocol_id; + __u16 tcon_flags; + __u32 vol_serial_number; + __u32 vol_create_time; + __u32 share_caps; + __u32 share_flags; + __u32 sector_flags; + __u32 optimal_sector_size; + __u32 max_bytes_chunk; + __u32 fs_attributes; + __u32 max_path_component; + __u32 device_type; + __u32 device_characteristics; + __u32 maximal_access; + __u64 cifs_posix_caps; +} __packed; + +#define CIFS_IOCTL_MAGIC 0xCF +#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) +#define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) +#define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 0a9fb6b53126..6a1119e87fbb 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -394,17 +394,17 @@ cifs_show_options(struct seq_file *s, struct dentry *root) struct sockaddr *srcaddr; srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr; - seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string); + seq_show_option(s, "vers", tcon->ses->server->vals->version_string); cifs_show_security(s, tcon->ses); cifs_show_cache_flavor(s, cifs_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) seq_puts(s, ",multiuser"); else if (tcon->ses->user_name) - seq_printf(s, ",username=%s", tcon->ses->user_name); + seq_show_option(s, "username", tcon->ses->user_name); if (tcon->ses->domainName) - seq_printf(s, ",domain=%s", tcon->ses->domainName); + seq_show_option(s, "domain", tcon->ses->domainName); if (srcaddr->sa_family != AF_UNSPEC) { struct sockaddr_in *saddr4; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index a782b22904e4..27aea110e923 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -136,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.06" +#define CIFS_VERSION "2.07" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 47b030da0781..f5b87303ce46 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -2245,6 +2245,20 @@ typedef struct { #define FILE_DEVICE_VIRTUAL_DISK 0x00000024 #define FILE_DEVICE_NETWORK_REDIRECTOR 0x00000028 +/* Device Characteristics */ +#define FILE_REMOVABLE_MEDIA 0x00000001 +#define FILE_READ_ONLY_DEVICE 0x00000002 +#define FILE_FLOPPY_DISKETTE 0x00000004 +#define FILE_WRITE_ONCE_MEDIA 0x00000008 +#define FILE_REMOTE_DEVICE 0x00000010 +#define FILE_DEVICE_IS_MOUNTED 0x00000020 +#define FILE_VIRTUAL_VOLUME 0x00000040 +#define FILE_DEVICE_SECURE_OPEN 0x00000100 +#define FILE_CHARACTERISTIC_TS_DEVICE 0x00001000 +#define FILE_CHARACTERISTIC_WEBDAV_DEVICE 0x00002000 +#define FILE_PORTABLE_DEVICE 0x00004000 +#define FILE_DEVICE_ALLOW_APPCONTAINER_TRAVERSAL 0x00020000 + typedef struct { __le32 DeviceType; __le32 DeviceCharacteristics; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 672ef35c9f73..90b4f9f7de66 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -696,7 +696,9 @@ cifs_echo_callback(struct mid_q_entry *mid) { struct TCP_Server_Info *server = mid->callback_data; + mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); + mutex_unlock(&server->srv_mutex); add_credits(server, 1, CIFS_ECHO_OP); } @@ -1572,7 +1574,9 @@ cifs_readv_callback(struct mid_q_entry *mid) } queue_work(cifsiod_wq, &rdata->work); + mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); + mutex_unlock(&server->srv_mutex); add_credits(server, 1, 0); } @@ 
-2032,6 +2036,7 @@ cifs_writev_callback(struct mid_q_entry *mid) { struct cifs_writedata *wdata = mid->callback_data; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; unsigned int written; WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf; @@ -2068,7 +2073,9 @@ cifs_writev_callback(struct mid_q_entry *mid) } queue_work(cifsiod_wq, &wdata->work); + mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); + mutex_unlock(&server->srv_mutex); add_credits(tcon->ses->server, 1, 0); } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 3f50cee79df9..e2a6af1508af 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3216,7 +3216,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_LOCKED; } -static struct vm_operations_struct cifs_file_vm_ops = { +static const struct vm_operations_struct cifs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = cifs_page_mkwrite, diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 49b8b6e41a18..c63f5227b681 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -31,12 +31,9 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "cifsfs.h" +#include "cifs_ioctl.h" #include <linux/btrfs.h> -#define CIFS_IOCTL_MAGIC 0xCF -#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) -#define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) - static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, unsigned long srcfd, u64 off, u64 len, u64 destoff, bool dup_extents) @@ -135,6 +132,43 @@ out_drop_write: return rc; } +static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon, + void __user *arg) +{ + int rc = 0; + struct smb_mnt_fs_info *fsinf; + + fsinf = kzalloc(sizeof(struct smb_mnt_fs_info), GFP_KERNEL); + if (fsinf == NULL) + return -ENOMEM; + + fsinf->version = 1; + fsinf->protocol_id = tcon->ses->server->vals->protocol_id; + fsinf->device_characteristics = + le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics); + fsinf->device_type = le32_to_cpu(tcon->fsDevInfo.DeviceType); + fsinf->fs_attributes = le32_to_cpu(tcon->fsAttrInfo.Attributes); + fsinf->max_path_component = + le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength); +#ifdef CONFIG_CIFS_SMB2 + fsinf->vol_serial_number = tcon->vol_serial_number; + fsinf->vol_create_time = le64_to_cpu(tcon->vol_create_time); + fsinf->share_flags = tcon->share_flags; + fsinf->share_caps = le32_to_cpu(tcon->capabilities); + fsinf->sector_flags = tcon->ss_flags; + fsinf->optimal_sector_size = tcon->perf_sector_size; + fsinf->max_bytes_chunk = tcon->max_bytes_chunk; + fsinf->maximal_access = tcon->maximal_access; +#endif /* SMB2 */ + fsinf->cifs_posix_caps = le64_to_cpu(tcon->fsUnixInfo.Capability); + + if (copy_to_user(arg, fsinf, sizeof(struct smb_mnt_fs_info))) + rc = -EFAULT; + + kfree(fsinf); + return rc; +} + long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) { struct inode *inode = file_inode(filep); @@ -148,8 +182,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) xid = get_xid(); - cifs_dbg(FYI, "ioctl file %p cmd %u arg %lu\n", filep, command, arg); - cifs_sb = CIFS_SB(inode->i_sb); switch (command) { @@ -228,6 +260,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) else rc = -EOPNOTSUPP; break; + case CIFS_IOC_GET_MNT_INFO: + tcon = tlink_tcon(pSMBFile->tlink); + rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg); + break; default: cifs_dbg(FYI, "unsupported ioctl\n"); 
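A usage note on the new CIFS_IOC_GET_MNT_INFO ioctl handled above: it fills a struct smb_mnt_fs_info and copies it back to the caller. The sketch below is a hypothetical userspace consumer, not part of the patch; it assumes the definitions from the new fs/cifs/cifs_ioctl.h have been made available to userspace as a local header and keeps error handling minimal.

/* Hypothetical userspace caller; "cifs_ioctl.h" is assumed to be a local,
 * userspace-friendly copy of the definitions added above. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include "cifs_ioctl.h"

int main(int argc, char **argv)
{
        struct smb_mnt_fs_info info = { 0 };
        int fd;

        if (argc < 2)
                return 1;
        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (ioctl(fd, CIFS_IOC_GET_MNT_INFO, &info) < 0) {
                perror("CIFS_IOC_GET_MNT_INFO");
                close(fd);
                return 1;
        }
        printf("version %u, protocol id 0x%x, max path component %u\n",
               info.version, info.protocol_id, info.max_path_component);
        close(fd);
        return 0;
}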
break; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index b8b4f08ee094..070fb2ad85ce 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1626,7 +1626,9 @@ smb2_echo_callback(struct mid_q_entry *mid) if (mid->mid_state == MID_RESPONSE_RECEIVED) credits_received = le16_to_cpu(smb2->hdr.CreditRequest); + mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); + mutex_unlock(&server->srv_mutex); add_credits(server, credits_received, CIFS_ECHO_OP); } @@ -1810,7 +1812,9 @@ smb2_readv_callback(struct mid_q_entry *mid) cifs_stats_fail_inc(tcon, SMB2_READ_HE); queue_work(cifsiod_wq, &rdata->work); + mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); + mutex_unlock(&server->srv_mutex); add_credits(server, credits_received, 0); } @@ -1938,6 +1942,7 @@ smb2_writev_callback(struct mid_q_entry *mid) { struct cifs_writedata *wdata = mid->callback_data; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; unsigned int written; struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf; unsigned int credits_received = 1; @@ -1977,7 +1982,9 @@ smb2_writev_callback(struct mid_q_entry *mid) cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); queue_work(cifsiod_wq, &wdata->work); + mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); + mutex_unlock(&server->srv_mutex); add_credits(tcon->ses->server, credits_received, 0); } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 126f46b887cc..2a24c524fb9a 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -644,7 +644,9 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) } spin_unlock(&GlobalMid_Lock); + mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); + mutex_unlock(&server->srv_mutex); return rc; } diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 9b1ffaa0572e..f6c6c8adbc01 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -353,7 +353,7 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid, char *result; insize = max_t(unsigned int, - INSIZE(readlink), OUTSIZE(readlink)+ *length + 1); + INSIZE(readlink), OUTSIZE(readlink)+ *length); UPARG(CODA_READLINK); inp->coda_readlink.VFid = *fid; @@ -361,8 +361,8 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid, error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); if (!error) { retlen = outp->coda_readlink.count; - if ( retlen > *length ) - retlen = *length; + if (retlen >= *length) + retlen = *length - 1; *length = retlen; result = (char *)outp + (long)outp->coda_readlink.data; memcpy(buffer, result, retlen); diff --git a/fs/coredump.c b/fs/coredump.c index c5ecde6f3eed..a8f75640ac86 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -513,10 +513,10 @@ void do_coredump(const siginfo_t *siginfo) const struct cred *old_cred; struct cred *cred; int retval = 0; - int flag = 0; int ispipe; struct files_struct *displaced; - bool need_nonrelative = false; + /* require nonrelative corefile path and be extra careful */ + bool need_suid_safe = false; bool core_dumped = false; static atomic_t core_dump_count = ATOMIC_INIT(0); struct coredump_params cprm = { @@ -550,9 +550,8 @@ void do_coredump(const siginfo_t *siginfo) */ if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) { /* Setuid core dump mode */ - flag = O_EXCL; /* Stop rewrite attacks */ cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */ - need_nonrelative = true; + need_suid_safe = true; } retval = coredump_wait(siginfo->si_signo, &core_state); @@ -633,7 +632,7 @@ void do_coredump(const siginfo_t 
*siginfo) if (cprm.limit < binfmt->min_coredump) goto fail_unlock; - if (need_nonrelative && cn.corename[0] != '/') { + if (need_suid_safe && cn.corename[0] != '/') { printk(KERN_WARNING "Pid %d(%s) can only dump core "\ "to fully qualified path!\n", task_tgid_vnr(current), current->comm); @@ -641,8 +640,35 @@ void do_coredump(const siginfo_t *siginfo) goto fail_unlock; } + /* + * Unlink the file if it exists unless this is a SUID + * binary - in that case, we're running around with root + * privs and don't want to unlink another user's coredump. + */ + if (!need_suid_safe) { + mm_segment_t old_fs; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + /* + * If it doesn't exist, that's fine. If there's some + * other problem, we'll catch it at the filp_open(). + */ + (void) sys_unlink((const char __user *)cn.corename); + set_fs(old_fs); + } + + /* + * There is a race between unlinking and creating the + * file, but if that causes an EEXIST here, that's + * fine - another process raced with us while creating + * the corefile, and the other process won. To userspace, + * what matters is that at least one of the two processes + * writes its coredump successfully, not which one. + */ cprm.file = filp_open(cn.corename, - O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, + O_CREAT | 2 | O_NOFOLLOW | + O_LARGEFILE | O_EXCL, 0600); if (IS_ERR(cprm.file)) goto fail_unlock; @@ -659,11 +685,15 @@ void do_coredump(const siginfo_t *siginfo) if (!S_ISREG(inode->i_mode)) goto close_fail; /* - * Dont allow local users get cute and trick others to coredump - * into their pre-created files. + * Don't dump core if the filesystem changed owner or mode + * of the file during file creation. This is an issue when + * a process dumps core while its cwd is e.g. on a vfat + * filesystem. 
*/ if (!uid_eq(inode->i_uid, current_fsuid())) goto close_fail; + if ((inode->i_mode & 0677) != 0600) + goto close_fail; if (!(cprm.file->f_mode & FMODE_CAN_WRITE)) goto close_fail; if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file)) @@ -17,12 +17,14 @@ #include <linux/atomic.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> +#include <linux/dax.h> #include <linux/fs.h> #include <linux/genhd.h> #include <linux/highmem.h> #include <linux/memcontrol.h> #include <linux/mm.h> #include <linux/mutex.h> +#include <linux/pmem.h> #include <linux/sched.h> #include <linux/uio.h> #include <linux/vmstat.h> @@ -34,7 +36,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size) might_sleep(); do { - void *addr; + void __pmem *addr; unsigned long pfn; long count; @@ -46,10 +48,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size) unsigned pgsz = PAGE_SIZE - offset_in_page(addr); if (pgsz > count) pgsz = count; - if (pgsz < PAGE_SIZE) - memset(addr, 0, pgsz); - else - clear_page(addr); + clear_pmem(addr, pgsz); addr += pgsz; size -= pgsz; count -= pgsz; @@ -59,26 +58,29 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size) } } while (size); + wmb_pmem(); return 0; } EXPORT_SYMBOL_GPL(dax_clear_blocks); -static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits) +static long dax_get_addr(struct buffer_head *bh, void __pmem **addr, + unsigned blkbits) { unsigned long pfn; sector_t sector = bh->b_blocknr << (blkbits - 9); return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size); } -static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos, - loff_t end) +/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */ +static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first, + loff_t pos, loff_t end) { loff_t final = end - pos + first; /* The final byte of the buffer */ if (first > 0) - memset(addr, 0, first); + clear_pmem(addr, first); if (final < size) - memset(addr + final, 0, size - final); + clear_pmem(addr + final, size - final); } static bool buffer_written(struct buffer_head *bh) @@ -106,14 +108,15 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, loff_t pos = start; loff_t max = start; loff_t bh_max = start; - void *addr; + void __pmem *addr; bool hole = false; + bool need_wmb = false; if (iov_iter_rw(iter) != WRITE) end = min(end, i_size_read(inode)); while (pos < end) { - unsigned len; + size_t len; if (pos == max) { unsigned blkbits = inode->i_blkbits; sector_t block = pos >> blkbits; @@ -145,19 +148,23 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, retval = dax_get_addr(bh, &addr, blkbits); if (retval < 0) break; - if (buffer_unwritten(bh) || buffer_new(bh)) + if (buffer_unwritten(bh) || buffer_new(bh)) { dax_new_buf(addr, retval, first, pos, end); + need_wmb = true; + } addr += first; size = retval - first; } max = min(pos + size, end); } - if (iov_iter_rw(iter) == WRITE) - len = copy_from_iter_nocache(addr, max - pos, iter); - else if (!hole) - len = copy_to_iter(addr, max - pos, iter); + if (iov_iter_rw(iter) == WRITE) { + len = copy_from_iter_pmem(addr, max - pos, iter); + need_wmb = true; + } else if (!hole) + len = copy_to_iter((void __force *)addr, max - pos, + iter); else len = iov_iter_zero(max - pos, iter); @@ -168,6 +175,9 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, addr += len; } + if (need_wmb) + wmb_pmem(); + return (pos == start) ? 
retval : pos - start; } @@ -260,11 +270,13 @@ static int dax_load_hole(struct address_space *mapping, struct page *page, static int copy_user_bh(struct page *to, struct buffer_head *bh, unsigned blkbits, unsigned long vaddr) { - void *vfrom, *vto; + void __pmem *vfrom; + void *vto; + if (dax_get_addr(bh, &vfrom, blkbits) < 0) return -EIO; vto = kmap_atomic(to); - copy_user_page(vto, vfrom, vaddr, to); + copy_user_page(vto, (void __force *)vfrom, vaddr, to); kunmap_atomic(vto); return 0; } @@ -272,16 +284,13 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh, static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, struct vm_area_struct *vma, struct vm_fault *vmf) { - struct address_space *mapping = inode->i_mapping; sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); unsigned long vaddr = (unsigned long)vmf->virtual_address; - void *addr; + void __pmem *addr; unsigned long pfn; pgoff_t size; int error; - i_mmap_lock_read(mapping); - /* * Check truncate didn't happen while we were allocating a block. * If it did, this block may or may not be still allocated to the @@ -303,14 +312,14 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, goto out; } - if (buffer_unwritten(bh) || buffer_new(bh)) - clear_page(addr); + if (buffer_unwritten(bh) || buffer_new(bh)) { + clear_pmem(addr, PAGE_SIZE); + wmb_pmem(); + } error = vm_insert_mixed(vma, vaddr, pfn); out: - i_mmap_unlock_read(mapping); - return error; } @@ -319,6 +328,12 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @get_block: The filesystem method used to translate file offsets to blocks + * @complete_unwritten: The filesystem method used to convert unwritten blocks + * to written so the data written to them is exposed. This is required for + * required by write faults for filesystems that will return unwritten + * extent mappings from @get_block, but it is optional for reads as + * dax_insert_mapping() will always zero unwritten blocks. If the fs does + * not support unwritten extents, the it should pass NULL. * * When a page fault occurs, filesystems may call this helper in their * fault handler for DAX files. __dax_fault() assumes the caller has done all @@ -366,15 +381,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, * from a read fault and we've raced with a truncate */ error = -EIO; - goto unlock_page; + goto unlock; } + } else { + i_mmap_lock_write(mapping); } error = get_block(inode, block, &bh, 0); if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; /* fs corruption? 
*/ if (error) - goto unlock_page; + goto unlock; if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { if (vmf->flags & FAULT_FLAG_WRITE) { @@ -385,8 +402,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; if (error) - goto unlock_page; + goto unlock; } else { + i_mmap_unlock_write(mapping); return dax_load_hole(mapping, page, vmf); } } @@ -398,17 +416,15 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, else clear_user_highpage(new_page, vaddr); if (error) - goto unlock_page; + goto unlock; vmf->page = page; if (!page) { - i_mmap_lock_read(mapping); /* Check we didn't race with truncate */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) { - i_mmap_unlock_read(mapping); error = -EIO; - goto out; + goto unlock; } } return VM_FAULT_LOCKED; @@ -437,9 +453,15 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, * as for normal BH based IO completions. */ error = dax_insert_mapping(inode, &bh, vma, vmf); - if (buffer_unwritten(&bh)) - complete_unwritten(&bh, !error); + if (buffer_unwritten(&bh)) { + if (complete_unwritten) + complete_unwritten(&bh, !error); + else + WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); + } + if (!page) + i_mmap_unlock_write(mapping); out: if (error == -ENOMEM) return VM_FAULT_OOM | major; @@ -448,11 +470,14 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, return VM_FAULT_SIGBUS | major; return VM_FAULT_NOPAGE | major; - unlock_page: + unlock: if (page) { unlock_page(page); page_cache_release(page); + } else { + i_mmap_unlock_write(mapping); } + goto out; } EXPORT_SYMBOL(__dax_fault); @@ -484,6 +509,177 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } EXPORT_SYMBOL_GPL(dax_fault); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * The 'colour' (ie low bits) within a PMD of a page offset. This comes up + * more often than one might expect in the below function. + */ +#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) + +int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, unsigned int flags, get_block_t get_block, + dax_iodone_t complete_unwritten) +{ + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct buffer_head bh; + unsigned blkbits = inode->i_blkbits; + unsigned long pmd_addr = address & PMD_MASK; + bool write = flags & FAULT_FLAG_WRITE; + long length; + void __pmem *kaddr; + pgoff_t size, pgoff; + sector_t block, sector; + unsigned long pfn; + int result = 0; + + /* Fall back to PTEs if we're going to COW */ + if (write && !(vma->vm_flags & VM_SHARED)) + return VM_FAULT_FALLBACK; + /* If the PMD would extend outside the VMA */ + if (pmd_addr < vma->vm_start) + return VM_FAULT_FALLBACK; + if ((pmd_addr + PMD_SIZE) > vma->vm_end) + return VM_FAULT_FALLBACK; + + pgoff = linear_page_index(vma, pmd_addr); + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (pgoff >= size) + return VM_FAULT_SIGBUS; + /* If the PMD would cover blocks out of the file */ + if ((pgoff | PG_PMD_COLOUR) >= size) + return VM_FAULT_FALLBACK; + + memset(&bh, 0, sizeof(bh)); + block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); + + bh.b_size = PMD_SIZE; + i_mmap_lock_write(mapping); + length = get_block(inode, block, &bh, write); + if (length) + return VM_FAULT_SIGBUS; + + /* + * If the filesystem isn't willing to tell us the length of a hole, + * just fall back to PTEs. 
Calling get_block 512 times in a loop + * would be silly. + */ + if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) + goto fallback; + + if (buffer_unwritten(&bh) || buffer_new(&bh)) { + int i; + for (i = 0; i < PTRS_PER_PMD; i++) + clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE); + wmb_pmem(); + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + result |= VM_FAULT_MAJOR; + } + + /* + * If we allocated new storage, make sure no process has any + * zero pages covering this hole + */ + if (buffer_new(&bh)) { + i_mmap_unlock_write(mapping); + unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0); + i_mmap_lock_write(mapping); + } + + /* + * If a truncate happened while we were allocating blocks, we may + * leave blocks allocated to the file that are beyond EOF. We can't + * take i_mutex here, so just leave them hanging; they'll be freed + * when the file is deleted. + */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (pgoff >= size) { + result = VM_FAULT_SIGBUS; + goto out; + } + if ((pgoff | PG_PMD_COLOUR) >= size) + goto fallback; + + if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { + spinlock_t *ptl; + pmd_t entry; + struct page *zero_page = get_huge_zero_page(); + + if (unlikely(!zero_page)) + goto fallback; + + ptl = pmd_lock(vma->vm_mm, pmd); + if (!pmd_none(*pmd)) { + spin_unlock(ptl); + goto fallback; + } + + entry = mk_pmd(zero_page, vma->vm_page_prot); + entry = pmd_mkhuge(entry); + set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry); + result = VM_FAULT_NOPAGE; + spin_unlock(ptl); + } else { + sector = bh.b_blocknr << (blkbits - 9); + length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn, + bh.b_size); + if (length < 0) { + result = VM_FAULT_SIGBUS; + goto out; + } + if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR)) + goto fallback; + + result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write); + } + + out: + if (buffer_unwritten(&bh)) + complete_unwritten(&bh, !(result & VM_FAULT_ERROR)); + + i_mmap_unlock_write(mapping); + + return result; + + fallback: + count_vm_event(THP_FAULT_FALLBACK); + result = VM_FAULT_FALLBACK; + goto out; +} +EXPORT_SYMBOL_GPL(__dax_pmd_fault); + +/** + * dax_pmd_fault - handle a PMD fault on a DAX file + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * @get_block: The filesystem method used to translate file offsets to blocks + * + * When a page fault occurs, filesystems may call this helper in their + * pmd_fault handler for DAX files. 
+ */ +int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, unsigned int flags, get_block_t get_block, + dax_iodone_t complete_unwritten) +{ + int result; + struct super_block *sb = file_inode(vma->vm_file)->i_sb; + + if (flags & FAULT_FLAG_WRITE) { + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + } + result = __dax_pmd_fault(vma, address, pmd, flags, get_block, + complete_unwritten); + if (flags & FAULT_FLAG_WRITE) + sb_end_pagefault(sb); + + return result; +} +EXPORT_SYMBOL_GPL(dax_pmd_fault); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + /** * dax_pfn_mkwrite - handle first write to DAX page * @vma: The virtual memory area where the fault occurred @@ -538,11 +734,12 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, if (err < 0) return err; if (buffer_written(&bh)) { - void *addr; + void __pmem *addr; err = dax_get_addr(&bh, &addr, inode->i_blkbits); if (err < 0) return err; - memset(addr + offset, 0, length); + clear_pmem(addr + offset, length); + wmb_pmem(); } return 0; diff --git a/fs/dcache.c b/fs/dcache.c index 5c8ea15e73a5..5c33aeb0f68f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2718,7 +2718,7 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding - * dentry->d_parent->d_inode->i_mutex, inode->i_lock and rename_lock + * dentry->d_parent->d_inode->i_mutex, and rename_lock * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... @@ -2744,7 +2744,6 @@ out_unalias: __d_move(alias, dentry, false); ret = 0; out_err: - spin_unlock(&inode->i_lock); if (m2) mutex_unlock(m2); if (m1) @@ -2790,10 +2789,11 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) if (S_ISDIR(inode->i_mode)) { struct dentry *new = __d_find_any_alias(inode); if (unlikely(new)) { + /* The reference to new ensures it remains an alias */ + spin_unlock(&inode->i_lock); write_seqlock(&rename_lock); if (unlikely(d_ancestor(new, dentry))) { write_sequnlock(&rename_lock); - spin_unlock(&inode->i_lock); dput(new); new = ERR_PTR(-ELOOP); pr_warn_ratelimited( @@ -2812,7 +2812,6 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) } else { __d_move(new, dentry, false); write_sequnlock(&rename_lock); - spin_unlock(&inode->i_lock); security_d_instantiate(new, inode); } iput(inode); @@ -2926,6 +2925,13 @@ restart: if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { struct mount *parent = ACCESS_ONCE(mnt->mnt_parent); + /* Escaped? */ + if (dentry != vfsmnt->mnt_root) { + bptr = *buffer; + blen = *buflen; + error = 3; + break; + } /* Global root? 
*/ if (mnt != parent) { dentry = ACCESS_ONCE(mnt->mnt_mountpoint); @@ -3442,22 +3448,15 @@ void __init vfs_caches_init_early(void) inode_init_early(); } -void __init vfs_caches_init(unsigned long mempages) +void __init vfs_caches_init(void) { - unsigned long reserve; - - /* Base hash sizes on available memory, with a reserve equal to - 150% of current kernel size */ - - reserve = min((mempages - nr_free_pages()) * 3/2, mempages - 1); - mempages -= reserve; - names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); dcache_init(); inode_init(); - files_init(mempages); + files_init(); + files_maxfiles_init(); mnt_init(); bdev_cache_init(); chrdev_init(); diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 284f9aa0028b..6c55ade071c3 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -435,8 +435,8 @@ struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode, } EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); -static ssize_t read_file_bool(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) +ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) { char buf[3]; u32 *val = file->private_data; @@ -449,9 +449,10 @@ static ssize_t read_file_bool(struct file *file, char __user *user_buf, buf[2] = 0x00; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); } +EXPORT_SYMBOL_GPL(debugfs_read_file_bool); -static ssize_t write_file_bool(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) +ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) { char buf[32]; size_t buf_size; @@ -468,10 +469,11 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf, return count; } +EXPORT_SYMBOL_GPL(debugfs_write_file_bool); static const struct file_operations fops_bool = { - .read = read_file_bool, - .write = write_file_bool, + .read = debugfs_read_file_bool, + .write = debugfs_write_file_bool, .open = simple_open, .llseek = default_llseek, }; diff --git a/fs/direct-io.c b/fs/direct-io.c index 745d2342651a..11256291642e 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -285,7 +285,7 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio); /* * Asynchronous IO callback. */ -static void dio_bio_end_aio(struct bio *bio, int error) +static void dio_bio_end_aio(struct bio *bio) { struct dio *dio = bio->bi_private; unsigned long remaining; @@ -318,7 +318,7 @@ static void dio_bio_end_aio(struct bio *bio, int error) * During I/O bi_private points at the dio. After I/O, bi_private is used to * implement a singly-linked list of completed BIOs, at dio->bio_list. 
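Stepping back to the debugfs change above: read_file_bool() and write_file_bool() are renamed and exported as debugfs_read_file_bool()/debugfs_write_file_bool(), so modules can now reuse them in their own file_operations. The fragment below is a hypothetical example, not part of the patch; the names are invented, and note that debugfs bool attributes are still backed by a u32 at this point, as the helpers themselves show.

#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/printk.h>

static u32 my_feature_enabled;

/* wrap the exported write helper to react whenever the value changes */
static ssize_t my_feature_write(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppos)
{
        ssize_t ret = debugfs_write_file_bool(file, buf, count, ppos);

        if (ret > 0)
                pr_info("my_feature_enabled is now %u\n", my_feature_enabled);
        return ret;
}

static const struct file_operations my_feature_fops = {
        .read   = debugfs_read_file_bool,
        .write  = my_feature_write,
        .open   = simple_open,
        .llseek = default_llseek,
};

/* typically registered with something like:
 *   debugfs_create_file("my_feature", 0644, parent,
 *                       &my_feature_enabled, &my_feature_fops);
 */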
*/ -static void dio_bio_end_io(struct bio *bio, int error) +static void dio_bio_end_io(struct bio *bio) { struct dio *dio = bio->bi_private; unsigned long flags; @@ -345,9 +345,9 @@ void dio_end_io(struct bio *bio, int error) struct dio *dio = bio->bi_private; if (dio->is_async) - dio_bio_end_aio(bio, error); + dio_bio_end_aio(bio); else - dio_bio_end_io(bio, error); + dio_bio_end_io(bio); } EXPORT_SYMBOL_GPL(dio_end_io); @@ -457,15 +457,16 @@ static struct bio *dio_await_one(struct dio *dio) */ static int dio_bio_complete(struct dio *dio, struct bio *bio) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec; unsigned i; + int err; - if (!uptodate) + if (bio->bi_error) dio->io_error = -EIO; if (dio->is_async && dio->rw == READ) { bio_check_pages_dirty(bio); /* transfers ownership */ + err = bio->bi_error; } else { bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; @@ -474,9 +475,10 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) set_page_dirty_lock(page); page_cache_release(page); } + err = bio->bi_error; bio_put(bio); } - return uptodate ? 0 : -EIO; + return err; } /* @@ -653,7 +655,7 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio, if (ret) goto out; sector = start_sector << (sdio->blkbits - 9); - nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev)); + nr_pages = min(sdio->pages_in_io, BIO_MAX_PAGES); BUG_ON(nr_pages <= 0); dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages); sdio->boundary = 0; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 754fd6c0b747..87e9d796cf7d 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -120,12 +120,11 @@ struct connection { struct cbuf cb; int retries; #define MAX_CONNECT_RETRIES 3 - int sctp_assoc; struct hlist_node list; struct connection *othercon; struct work_struct rwork; /* Receive workqueue */ struct work_struct swork; /* Send workqueue */ - bool try_new_addr; + void (*orig_error_report)(struct sock *sk); }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) @@ -252,26 +251,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation) return con; } -/* This is a bit drastic, but only called when things go wrong */ -static struct connection *assoc2con(int assoc_id) -{ - int i; - struct connection *con; - - mutex_lock(&connections_lock); - - for (i = 0 ; i < CONN_HASH_SIZE; i++) { - hlist_for_each_entry(con, &connection_hash[i], list) { - if (con->sctp_assoc == assoc_id) { - mutex_unlock(&connections_lock); - return con; - } - } - } - mutex_unlock(&connections_lock); - return NULL; -} - static struct dlm_node_addr *find_node_addr(int nodeid) { struct dlm_node_addr *na; @@ -322,14 +301,14 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, spin_lock(&dlm_node_addrs_spin); na = find_node_addr(nodeid); if (na && na->addr_count) { + memcpy(&sas, na->addr[na->curr_addr_index], + sizeof(struct sockaddr_storage)); + if (try_new_addr) { na->curr_addr_index++; if (na->curr_addr_index == na->addr_count) na->curr_addr_index = 0; } - - memcpy(&sas, na->addr[na->curr_addr_index ], - sizeof(struct sockaddr_storage)); } spin_unlock(&dlm_node_addrs_spin); @@ -459,18 +438,23 @@ static inline void lowcomms_connect_sock(struct connection *con) static void lowcomms_state_change(struct sock *sk) { - if (sk->sk_state == TCP_ESTABLISHED) + /* SCTP layer is not calling sk_data_ready when the connection + * is done, so we catch the signal through here. 
Also, it + * doesn't switch socket state when entering shutdown, so we + * skip the write in that case. + */ + if (sk->sk_shutdown) { + if (sk->sk_shutdown == RCV_SHUTDOWN) + lowcomms_data_ready(sk); + } else if (sk->sk_state == TCP_ESTABLISHED) { lowcomms_write_space(sk); + } } int dlm_lowcomms_connect_node(int nodeid) { struct connection *con; - /* with sctp there's no connecting without sending */ - if (dlm_config.ci_protocol != 0) - return 0; - if (nodeid == dlm_our_nodeid()) return 0; @@ -481,6 +465,43 @@ int dlm_lowcomms_connect_node(int nodeid) return 0; } +static void lowcomms_error_report(struct sock *sk) +{ + struct connection *con = sock2con(sk); + struct sockaddr_storage saddr; + + if (nodeid_to_addr(con->nodeid, &saddr, NULL, false)) { + printk_ratelimited(KERN_ERR "dlm: node %d: socket error " + "sending to node %d, port %d, " + "sk_err=%d/%d\n", dlm_our_nodeid(), + con->nodeid, dlm_config.ci_tcp_port, + sk->sk_err, sk->sk_err_soft); + return; + } else if (saddr.ss_family == AF_INET) { + struct sockaddr_in *sin4 = (struct sockaddr_in *)&saddr; + + printk_ratelimited(KERN_ERR "dlm: node %d: socket error " + "sending to node %d at %pI4, port %d, " + "sk_err=%d/%d\n", dlm_our_nodeid(), + con->nodeid, &sin4->sin_addr.s_addr, + dlm_config.ci_tcp_port, sk->sk_err, + sk->sk_err_soft); + } else { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&saddr; + + printk_ratelimited(KERN_ERR "dlm: node %d: socket error " + "sending to node %d at %u.%u.%u.%u, " + "port %d, sk_err=%d/%d\n", dlm_our_nodeid(), + con->nodeid, sin6->sin6_addr.s6_addr32[0], + sin6->sin6_addr.s6_addr32[1], + sin6->sin6_addr.s6_addr32[2], + sin6->sin6_addr.s6_addr32[3], + dlm_config.ci_tcp_port, sk->sk_err, + sk->sk_err_soft); + } + con->orig_error_report(sk); +} + /* Make a socket active */ static void add_sock(struct socket *sock, struct connection *con) { @@ -492,6 +513,8 @@ static void add_sock(struct socket *sock, struct connection *con) con->sock->sk->sk_state_change = lowcomms_state_change; con->sock->sk->sk_user_data = con; con->sock->sk->sk_allocation = GFP_NOFS; + con->orig_error_report = con->sock->sk->sk_error_report; + con->sock->sk->sk_error_report = lowcomms_error_report; } /* Add the port number to an IPv6 or 4 sockaddr and return the address @@ -514,17 +537,24 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, } /* Close a remote connection and tidy up */ -static void close_connection(struct connection *con, bool and_other) +static void close_connection(struct connection *con, bool and_other, + bool tx, bool rx) { - mutex_lock(&con->sock_mutex); + clear_bit(CF_CONNECT_PENDING, &con->flags); + clear_bit(CF_WRITE_PENDING, &con->flags); + if (tx && cancel_work_sync(&con->swork)) + log_print("canceled swork for node %d", con->nodeid); + if (rx && cancel_work_sync(&con->rwork)) + log_print("canceled rwork for node %d", con->nodeid); + mutex_lock(&con->sock_mutex); if (con->sock) { sock_release(con->sock); con->sock = NULL; } if (con->othercon && and_other) { /* Will only re-enter once. 
*/ - close_connection(con->othercon, false); + close_connection(con->othercon, false, true, true); } if (con->rx_page) { __free_page(con->rx_page); @@ -535,254 +565,6 @@ static void close_connection(struct connection *con, bool and_other) mutex_unlock(&con->sock_mutex); } -/* We only send shutdown messages to nodes that are not part of the cluster */ -static void sctp_send_shutdown(sctp_assoc_t associd) -{ - static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; - struct msghdr outmessage; - struct cmsghdr *cmsg; - struct sctp_sndrcvinfo *sinfo; - int ret; - struct connection *con; - - con = nodeid2con(0,0); - BUG_ON(con == NULL); - - outmessage.msg_name = NULL; - outmessage.msg_namelen = 0; - outmessage.msg_control = outcmsg; - outmessage.msg_controllen = sizeof(outcmsg); - outmessage.msg_flags = MSG_EOR; - - cmsg = CMSG_FIRSTHDR(&outmessage); - cmsg->cmsg_level = IPPROTO_SCTP; - cmsg->cmsg_type = SCTP_SNDRCV; - cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); - outmessage.msg_controllen = cmsg->cmsg_len; - sinfo = CMSG_DATA(cmsg); - memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); - - sinfo->sinfo_flags |= MSG_EOF; - sinfo->sinfo_assoc_id = associd; - - ret = kernel_sendmsg(con->sock, &outmessage, NULL, 0, 0); - - if (ret != 0) - log_print("send EOF to node failed: %d", ret); -} - -static void sctp_init_failed_foreach(struct connection *con) -{ - - /* - * Don't try to recover base con and handle race where the - * other node's assoc init creates a assoc and we get that - * notification, then we get a notification that our attempt - * failed due. This happens when we are still trying the primary - * address, but the other node has already tried secondary addrs - * and found one that worked. - */ - if (!con->nodeid || con->sctp_assoc) - return; - - log_print("Retrying SCTP association init for node %d\n", con->nodeid); - - con->try_new_addr = true; - con->sctp_assoc = 0; - if (test_and_clear_bit(CF_INIT_PENDING, &con->flags)) { - if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) - queue_work(send_workqueue, &con->swork); - } -} - -/* INIT failed but we don't know which node... - restart INIT on all pending nodes */ -static void sctp_init_failed(void) -{ - mutex_lock(&connections_lock); - - foreach_conn(sctp_init_failed_foreach); - - mutex_unlock(&connections_lock); -} - -static void retry_failed_sctp_send(struct connection *recv_con, - struct sctp_send_failed *sn_send_failed, - char *buf) -{ - int len = sn_send_failed->ssf_length - sizeof(struct sctp_send_failed); - struct dlm_mhandle *mh; - struct connection *con; - char *retry_buf; - int nodeid = sn_send_failed->ssf_info.sinfo_ppid; - - log_print("Retry sending %d bytes to node id %d", len, nodeid); - - if (!nodeid) { - log_print("Shouldn't resend data via listening connection."); - return; - } - - con = nodeid2con(nodeid, 0); - if (!con) { - log_print("Could not look up con for nodeid %d\n", - nodeid); - return; - } - - mh = dlm_lowcomms_get_buffer(nodeid, len, GFP_NOFS, &retry_buf); - if (!mh) { - log_print("Could not allocate buf for retry."); - return; - } - memcpy(retry_buf, buf + sizeof(struct sctp_send_failed), len); - dlm_lowcomms_commit_buffer(mh); - - /* - * If we got a assoc changed event before the send failed event then - * we only need to retry the send. 
- */ - if (con->sctp_assoc) { - if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) - queue_work(send_workqueue, &con->swork); - } else - sctp_init_failed_foreach(con); -} - -/* Something happened to an association */ -static void process_sctp_notification(struct connection *con, - struct msghdr *msg, char *buf) -{ - union sctp_notification *sn = (union sctp_notification *)buf; - struct linger linger; - - switch (sn->sn_header.sn_type) { - case SCTP_SEND_FAILED: - retry_failed_sctp_send(con, &sn->sn_send_failed, buf); - break; - case SCTP_ASSOC_CHANGE: - switch (sn->sn_assoc_change.sac_state) { - case SCTP_COMM_UP: - case SCTP_RESTART: - { - /* Check that the new node is in the lockspace */ - struct sctp_prim prim; - int nodeid; - int prim_len, ret; - int addr_len; - struct connection *new_con; - - /* - * We get this before any data for an association. - * We verify that the node is in the cluster and - * then peel off a socket for it. - */ - if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) { - log_print("COMM_UP for invalid assoc ID %d", - (int)sn->sn_assoc_change.sac_assoc_id); - sctp_init_failed(); - return; - } - memset(&prim, 0, sizeof(struct sctp_prim)); - prim_len = sizeof(struct sctp_prim); - prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id; - - ret = kernel_getsockopt(con->sock, - IPPROTO_SCTP, - SCTP_PRIMARY_ADDR, - (char*)&prim, - &prim_len); - if (ret < 0) { - log_print("getsockopt/sctp_primary_addr on " - "new assoc %d failed : %d", - (int)sn->sn_assoc_change.sac_assoc_id, - ret); - - /* Retry INIT later */ - new_con = assoc2con(sn->sn_assoc_change.sac_assoc_id); - if (new_con) - clear_bit(CF_CONNECT_PENDING, &con->flags); - return; - } - make_sockaddr(&prim.ssp_addr, 0, &addr_len); - if (addr_to_nodeid(&prim.ssp_addr, &nodeid)) { - unsigned char *b=(unsigned char *)&prim.ssp_addr; - log_print("reject connect from unknown addr"); - print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, - b, sizeof(struct sockaddr_storage)); - sctp_send_shutdown(prim.ssp_assoc_id); - return; - } - - new_con = nodeid2con(nodeid, GFP_NOFS); - if (!new_con) - return; - - /* Peel off a new sock */ - lock_sock(con->sock->sk); - ret = sctp_do_peeloff(con->sock->sk, - sn->sn_assoc_change.sac_assoc_id, - &new_con->sock); - release_sock(con->sock->sk); - if (ret < 0) { - log_print("Can't peel off a socket for " - "connection %d to node %d: err=%d", - (int)sn->sn_assoc_change.sac_assoc_id, - nodeid, ret); - return; - } - add_sock(new_con->sock, new_con); - - linger.l_onoff = 1; - linger.l_linger = 0; - ret = kernel_setsockopt(new_con->sock, SOL_SOCKET, SO_LINGER, - (char *)&linger, sizeof(linger)); - if (ret < 0) - log_print("set socket option SO_LINGER failed"); - - log_print("connecting to %d sctp association %d", - nodeid, (int)sn->sn_assoc_change.sac_assoc_id); - - new_con->sctp_assoc = sn->sn_assoc_change.sac_assoc_id; - new_con->try_new_addr = false; - /* Send any pending writes */ - clear_bit(CF_CONNECT_PENDING, &new_con->flags); - clear_bit(CF_INIT_PENDING, &new_con->flags); - if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) { - queue_work(send_workqueue, &new_con->swork); - } - if (!test_and_set_bit(CF_READ_PENDING, &new_con->flags)) - queue_work(recv_workqueue, &new_con->rwork); - } - break; - - case SCTP_COMM_LOST: - case SCTP_SHUTDOWN_COMP: - { - con = assoc2con(sn->sn_assoc_change.sac_assoc_id); - if (con) { - con->sctp_assoc = 0; - } - } - break; - - case SCTP_CANT_STR_ASSOC: - { - /* Will retry init when we get the send failed notification */ - log_print("Can't start SCTP association 
- retrying"); - } - break; - - default: - log_print("unexpected SCTP assoc change id=%d state=%d", - (int)sn->sn_assoc_change.sac_assoc_id, - sn->sn_assoc_change.sac_state); - } - default: - ; /* fall through */ - } -} - /* Data received from remote end */ static int receive_from_sock(struct connection *con) { @@ -793,7 +575,6 @@ static int receive_from_sock(struct connection *con) int r; int call_again_soon = 0; int nvec; - char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; mutex_lock(&con->sock_mutex); @@ -801,6 +582,10 @@ static int receive_from_sock(struct connection *con) ret = -EAGAIN; goto out_close; } + if (con->nodeid == 0) { + ret = -EINVAL; + goto out_close; + } if (con->rx_page == NULL) { /* @@ -813,11 +598,6 @@ static int receive_from_sock(struct connection *con) cbuf_init(&con->cb, PAGE_CACHE_SIZE); } - /* Only SCTP needs these really */ - memset(&incmsg, 0, sizeof(incmsg)); - msg.msg_control = incmsg; - msg.msg_controllen = sizeof(incmsg); - /* * iov[0] is the bit of the circular buffer between the current end * point (cb.base + cb.len) and the end of the buffer. @@ -843,31 +623,18 @@ static int receive_from_sock(struct connection *con) MSG_DONTWAIT | MSG_NOSIGNAL); if (ret <= 0) goto out_close; - - /* Process SCTP notifications */ - if (msg.msg_flags & MSG_NOTIFICATION) { - msg.msg_control = incmsg; - msg.msg_controllen = sizeof(incmsg); - - process_sctp_notification(con, &msg, - page_address(con->rx_page) + con->cb.base); - mutex_unlock(&con->sock_mutex); - return 0; - } - BUG_ON(con->nodeid == 0); - - if (ret == len) + else if (ret == len) call_again_soon = 1; + cbuf_add(&con->cb, ret); ret = dlm_process_incoming_buffer(con->nodeid, page_address(con->rx_page), con->cb.base, con->cb.len, PAGE_CACHE_SIZE); if (ret == -EBADMSG) { - log_print("lowcomms: addr=%p, base=%u, len=%u, " - "iov_len=%u, iov_base[0]=%p, read=%d", - page_address(con->rx_page), con->cb.base, con->cb.len, - len, iov[0].iov_base, r); + log_print("lowcomms: addr=%p, base=%u, len=%u, read=%d", + page_address(con->rx_page), con->cb.base, + con->cb.len, r); } if (ret < 0) goto out_close; @@ -892,7 +659,7 @@ out_resched: out_close: mutex_unlock(&con->sock_mutex); if (ret != -EAGAIN) { - close_connection(con, false); + close_connection(con, false, true, false); /* Reconnect when there is something to send */ } /* Don't return success if we really got EOF */ @@ -1033,6 +800,120 @@ accept_err: return result; } +static int sctp_accept_from_sock(struct connection *con) +{ + /* Check that the new node is in the lockspace */ + struct sctp_prim prim; + int nodeid; + int prim_len, ret; + int addr_len; + struct connection *newcon; + struct connection *addcon; + struct socket *newsock; + + mutex_lock(&connections_lock); + if (!dlm_allow_conn) { + mutex_unlock(&connections_lock); + return -1; + } + mutex_unlock(&connections_lock); + + mutex_lock_nested(&con->sock_mutex, 0); + + ret = kernel_accept(con->sock, &newsock, O_NONBLOCK); + if (ret < 0) + goto accept_err; + + memset(&prim, 0, sizeof(struct sctp_prim)); + prim_len = sizeof(struct sctp_prim); + + ret = kernel_getsockopt(newsock, IPPROTO_SCTP, SCTP_PRIMARY_ADDR, + (char *)&prim, &prim_len); + if (ret < 0) { + log_print("getsockopt/sctp_primary_addr failed: %d", ret); + goto accept_err; + } + + make_sockaddr(&prim.ssp_addr, 0, &addr_len); + if (addr_to_nodeid(&prim.ssp_addr, &nodeid)) { + unsigned char *b = (unsigned char *)&prim.ssp_addr; + + log_print("reject connect from unknown addr"); + print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, + b, sizeof(struct 
sockaddr_storage)); + goto accept_err; + } + + newcon = nodeid2con(nodeid, GFP_NOFS); + if (!newcon) { + ret = -ENOMEM; + goto accept_err; + } + + mutex_lock_nested(&newcon->sock_mutex, 1); + + if (newcon->sock) { + struct connection *othercon = newcon->othercon; + + if (!othercon) { + othercon = kmem_cache_zalloc(con_cache, GFP_NOFS); + if (!othercon) { + log_print("failed to allocate incoming socket"); + mutex_unlock(&newcon->sock_mutex); + ret = -ENOMEM; + goto accept_err; + } + othercon->nodeid = nodeid; + othercon->rx_action = receive_from_sock; + mutex_init(&othercon->sock_mutex); + INIT_WORK(&othercon->swork, process_send_sockets); + INIT_WORK(&othercon->rwork, process_recv_sockets); + set_bit(CF_IS_OTHERCON, &othercon->flags); + } + if (!othercon->sock) { + newcon->othercon = othercon; + othercon->sock = newsock; + newsock->sk->sk_user_data = othercon; + add_sock(newsock, othercon); + addcon = othercon; + } else { + printk("Extra connection from node %d attempted\n", nodeid); + ret = -EAGAIN; + mutex_unlock(&newcon->sock_mutex); + goto accept_err; + } + } else { + newsock->sk->sk_user_data = newcon; + newcon->rx_action = receive_from_sock; + add_sock(newsock, newcon); + addcon = newcon; + } + + log_print("connected to %d", nodeid); + + mutex_unlock(&newcon->sock_mutex); + + /* + * Add it to the active queue in case we got data + * between processing the accept adding the socket + * to the read_sockets list + */ + if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags)) + queue_work(recv_workqueue, &addcon->rwork); + mutex_unlock(&con->sock_mutex); + + return 0; + +accept_err: + mutex_unlock(&con->sock_mutex); + if (newsock) + sock_release(newsock); + if (ret != -EAGAIN) + log_print("error accepting connection from node: %d", ret); + + return ret; +} + static void free_entry(struct writequeue_entry *e) { __free_page(e->page); @@ -1057,97 +938,129 @@ static void writequeue_entry_complete(struct writequeue_entry *e, int completed) } } +/* + * sctp_bind_addrs - bind a SCTP socket to all our addresses + */ +static int sctp_bind_addrs(struct connection *con, uint16_t port) +{ + struct sockaddr_storage localaddr; + int i, addr_len, result = 0; + + for (i = 0; i < dlm_local_count; i++) { + memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr)); + make_sockaddr(&localaddr, port, &addr_len); + + if (!i) + result = kernel_bind(con->sock, + (struct sockaddr *)&localaddr, + addr_len); + else + result = kernel_setsockopt(con->sock, SOL_SCTP, + SCTP_SOCKOPT_BINDX_ADD, + (char *)&localaddr, addr_len); + + if (result < 0) { + log_print("Can't bind to %d addr number %d, %d.\n", + port, i + 1, result); + break; + } + } + return result; +} + /* Initiate an SCTP association. This is a special case of send_to_sock() in that we don't yet have a peeled-off socket for this association, so we use the listening socket and add the primary IP address of the remote node. 
*/ -static void sctp_init_assoc(struct connection *con) +static void sctp_connect_to_sock(struct connection *con) { - struct sockaddr_storage rem_addr; - char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; - struct msghdr outmessage; - struct cmsghdr *cmsg; - struct sctp_sndrcvinfo *sinfo; - struct connection *base_con; - struct writequeue_entry *e; - int len, offset; - int ret; - int addrlen; - struct kvec iov[1]; + struct sockaddr_storage daddr; + int one = 1; + int result; + int addr_len; + struct socket *sock; + + if (con->nodeid == 0) { + log_print("attempt to connect sock 0 foiled"); + return; + } mutex_lock(&con->sock_mutex); - if (test_and_set_bit(CF_INIT_PENDING, &con->flags)) - goto unlock; - if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr, - con->try_new_addr)) { + /* Some odd races can cause double-connects, ignore them */ + if (con->retries++ > MAX_CONNECT_RETRIES) + goto out; + + if (con->sock) { + log_print("node %d already connected.", con->nodeid); + goto out; + } + + memset(&daddr, 0, sizeof(daddr)); + result = nodeid_to_addr(con->nodeid, &daddr, NULL, true); + if (result < 0) { log_print("no address for nodeid %d", con->nodeid); - goto unlock; + goto out; } - base_con = nodeid2con(0, 0); - BUG_ON(base_con == NULL); - make_sockaddr(&rem_addr, dlm_config.ci_tcp_port, &addrlen); + /* Create a socket to communicate with */ + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_STREAM, IPPROTO_SCTP, &sock); + if (result < 0) + goto socket_err; - outmessage.msg_name = &rem_addr; - outmessage.msg_namelen = addrlen; - outmessage.msg_control = outcmsg; - outmessage.msg_controllen = sizeof(outcmsg); - outmessage.msg_flags = MSG_EOR; + sock->sk->sk_user_data = con; + con->rx_action = receive_from_sock; + con->connect_action = sctp_connect_to_sock; + add_sock(sock, con); - spin_lock(&con->writequeue_lock); + /* Bind to all addresses. 
*/ + if (sctp_bind_addrs(con, 0)) + goto bind_err; - if (list_empty(&con->writequeue)) { - spin_unlock(&con->writequeue_lock); - log_print("writequeue empty for nodeid %d", con->nodeid); - goto unlock; - } + make_sockaddr(&daddr, dlm_config.ci_tcp_port, &addr_len); - e = list_first_entry(&con->writequeue, struct writequeue_entry, list); - len = e->len; - offset = e->offset; + log_print("connecting to %d", con->nodeid); - /* Send the first block off the write queue */ - iov[0].iov_base = page_address(e->page)+offset; - iov[0].iov_len = len; - spin_unlock(&con->writequeue_lock); + /* Turn off Nagle's algorithm */ + kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, + sizeof(one)); - if (rem_addr.ss_family == AF_INET) { - struct sockaddr_in *sin = (struct sockaddr_in *)&rem_addr; - log_print("Trying to connect to %pI4", &sin->sin_addr.s_addr); - } else { - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&rem_addr; - log_print("Trying to connect to %pI6", &sin6->sin6_addr); - } + result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len, + O_NONBLOCK); + if (result == -EINPROGRESS) + result = 0; + if (result == 0) + goto out; - cmsg = CMSG_FIRSTHDR(&outmessage); - cmsg->cmsg_level = IPPROTO_SCTP; - cmsg->cmsg_type = SCTP_SNDRCV; - cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); - sinfo = CMSG_DATA(cmsg); - memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); - sinfo->sinfo_ppid = cpu_to_le32(con->nodeid); - outmessage.msg_controllen = cmsg->cmsg_len; - sinfo->sinfo_flags |= SCTP_ADDR_OVER; - ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len); - if (ret < 0) { - log_print("Send first packet to node %d failed: %d", - con->nodeid, ret); +bind_err: + con->sock = NULL; + sock_release(sock); - /* Try again later */ +socket_err: + /* + * Some errors are fatal and this list might need adjusting. For other + * errors we try again until the max number of retries is reached. + */ + if (result != -EHOSTUNREACH && + result != -ENETUNREACH && + result != -ENETDOWN && + result != -EINVAL && + result != -EPROTONOSUPPORT) { + log_print("connect %d try %d error %d", con->nodeid, + con->retries, result); + mutex_unlock(&con->sock_mutex); + msleep(1000); clear_bit(CF_CONNECT_PENDING, &con->flags); - clear_bit(CF_INIT_PENDING, &con->flags); - } - else { - spin_lock(&con->writequeue_lock); - writequeue_entry_complete(e, ret); - spin_unlock(&con->writequeue_lock); + lowcomms_connect_sock(con); + return; } -unlock: +out: mutex_unlock(&con->sock_mutex); + set_bit(CF_WRITE_PENDING, &con->flags); } /* Connect a new socket to its peer */ @@ -1236,11 +1149,13 @@ out_err: con->retries, result); mutex_unlock(&con->sock_mutex); msleep(1000); + clear_bit(CF_CONNECT_PENDING, &con->flags); lowcomms_connect_sock(con); return; } out: mutex_unlock(&con->sock_mutex); + set_bit(CF_WRITE_PENDING, &con->flags); return; } @@ -1325,37 +1240,11 @@ static void init_local(void) } } -/* Bind to an IP address. 
SCTP allows multiple address so it can do - multi-homing */ -static int add_sctp_bind_addr(struct connection *sctp_con, - struct sockaddr_storage *addr, - int addr_len, int num) -{ - int result = 0; - - if (num == 1) - result = kernel_bind(sctp_con->sock, - (struct sockaddr *) addr, - addr_len); - else - result = kernel_setsockopt(sctp_con->sock, SOL_SCTP, - SCTP_SOCKOPT_BINDX_ADD, - (char *)addr, addr_len); - - if (result < 0) - log_print("Can't bind to port %d addr number %d", - dlm_config.ci_tcp_port, num); - - return result; -} - /* Initialise SCTP socket and bind to all interfaces */ static int sctp_listen_for_all(void) { struct socket *sock = NULL; - struct sockaddr_storage localaddr; - struct sctp_event_subscribe subscribe; - int result = -EINVAL, num = 1, i, addr_len; + int result = -EINVAL; struct connection *con = nodeid2con(0, GFP_NOFS); int bufsize = NEEDED_RMEM; int one = 1; @@ -1366,33 +1255,17 @@ static int sctp_listen_for_all(void) log_print("Using SCTP for communications"); result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, - SOCK_SEQPACKET, IPPROTO_SCTP, &sock); + SOCK_STREAM, IPPROTO_SCTP, &sock); if (result < 0) { log_print("Can't create comms socket, check SCTP is loaded"); goto out; } - /* Listen for events */ - memset(&subscribe, 0, sizeof(subscribe)); - subscribe.sctp_data_io_event = 1; - subscribe.sctp_association_event = 1; - subscribe.sctp_send_failure_event = 1; - subscribe.sctp_shutdown_event = 1; - subscribe.sctp_partial_delivery_event = 1; - result = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUFFORCE, (char *)&bufsize, sizeof(bufsize)); if (result) log_print("Error increasing buffer space on socket %d", result); - result = kernel_setsockopt(sock, SOL_SCTP, SCTP_EVENTS, - (char *)&subscribe, sizeof(subscribe)); - if (result < 0) { - log_print("Failed to set SCTP_EVENTS on socket: result=%d", - result); - goto create_delsock; - } - result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one, sizeof(one)); if (result < 0) @@ -1402,19 +1275,12 @@ static int sctp_listen_for_all(void) sock->sk->sk_user_data = con; con->sock = sock; con->sock->sk->sk_data_ready = lowcomms_data_ready; - con->rx_action = receive_from_sock; - con->connect_action = sctp_init_assoc; + con->rx_action = sctp_accept_from_sock; + con->connect_action = sctp_connect_to_sock; - /* Bind to all interfaces. */ - for (i = 0; i < dlm_local_count; i++) { - memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr)); - make_sockaddr(&localaddr, dlm_config.ci_tcp_port, &addr_len); - - result = add_sctp_bind_addr(con, &localaddr, addr_len, num); - if (result) - goto create_delsock; - ++num; - } + /* Bind to all addresses. 
*/ + if (sctp_bind_addrs(con, dlm_config.ci_tcp_port)) + goto create_delsock; result = sock->ops->listen(sock, 5); if (result < 0) { @@ -1612,14 +1478,13 @@ out: send_error: mutex_unlock(&con->sock_mutex); - close_connection(con, false); + close_connection(con, false, false, true); lowcomms_connect_sock(con); return; out_connect: mutex_unlock(&con->sock_mutex); - if (!test_bit(CF_INIT_PENDING, &con->flags)) - lowcomms_connect_sock(con); + lowcomms_connect_sock(con); } static void clean_one_writequeue(struct connection *con) @@ -1644,15 +1509,9 @@ int dlm_lowcomms_close(int nodeid) log_print("closing connection to node %d", nodeid); con = nodeid2con(nodeid, 0); if (con) { - clear_bit(CF_CONNECT_PENDING, &con->flags); - clear_bit(CF_WRITE_PENDING, &con->flags); set_bit(CF_CLOSE, &con->flags); - if (cancel_work_sync(&con->swork)) - log_print("canceled swork for node %d", nodeid); - if (cancel_work_sync(&con->rwork)) - log_print("canceled rwork for node %d", nodeid); + close_connection(con, true, true, true); clean_one_writequeue(con); - close_connection(con, true); } spin_lock(&dlm_node_addrs_spin); @@ -1685,10 +1544,8 @@ static void process_send_sockets(struct work_struct *work) { struct connection *con = container_of(work, struct connection, swork); - if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { + if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) con->connect_action(con); - set_bit(CF_WRITE_PENDING, &con->flags); - } if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags)) send_to_sock(con); } @@ -1735,7 +1592,7 @@ static void stop_conn(struct connection *con) static void free_conn(struct connection *con) { - close_connection(con, true); + close_connection(con, true, true, true); if (con->othercon) kmem_cache_free(con_cache, con->othercon); hlist_del(&con->list); @@ -1806,7 +1663,7 @@ fail_unlisten: dlm_allow_conn = 0; con = nodeid2con(0,0); if (con) { - close_connection(con, false); + close_connection(con, false, true, true); kmem_cache_free(con_cache, con); } fail_destroy: diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index e0ab3a93eeff..5532f097f6da 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -509,7 +509,6 @@ int dlm_plock_init(void) void dlm_plock_exit(void) { - if (misc_deregister(&plock_dev_misc) < 0) - log_print("dlm_plock_exit: misc_deregister failed"); + misc_deregister(&plock_dev_misc); } diff --git a/fs/dlm/user.c b/fs/dlm/user.c index fb85f32e9eca..173b3873a4f4 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -362,18 +362,15 @@ fail: int dlm_device_deregister(struct dlm_ls *ls) { - int error; - /* The device is not registered. This happens when the lockspace was never used from userspace, or when device_create_lockspace() calls dlm_release_lockspace() after the register fails. 
*/ if (!ls->ls_device.name) return 0; - error = misc_deregister(&ls->ls_device); - if (!error) - kfree(ls->ls_device.name); - return error; + misc_deregister(&ls->ls_device); + kfree(ls->ls_device.name); + return 0; } static int device_user_purge(struct dlm_user_proc *proc, @@ -785,6 +782,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, DECLARE_WAITQUEUE(wait, current); struct dlm_callback cb; int rv, resid, copy_lvb = 0; + int old_mode, new_mode; if (count == sizeof(struct dlm_device_version)) { rv = copy_version_to_user(buf, count); @@ -841,6 +839,9 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_cb_list); + /* rem_lkb_callback sets a new lkb_last_cast */ + old_mode = lkb->lkb_last_cast.mode; + rv = dlm_rem_lkb_callback(lkb->lkb_resource->res_ls, lkb, &cb, &resid); if (rv < 0) { /* this shouldn't happen; lkb should have been removed from @@ -864,9 +865,6 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, } if (cb.flags & DLM_CB_CAST) { - int old_mode, new_mode; - - old_mode = lkb->lkb_last_cast.mode; new_mode = cb.mode; if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr && diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 5718cb9f7273..d72d52b90433 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -17,7 +17,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) { struct inode *inode, *toput_inode = NULL; - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || @@ -27,13 +27,15 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); + invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; - spin_lock(&inode_sb_list_lock); + + spin_lock(&sb->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); iput(toput_inode); } diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 97315f2f6816..80d6901493cf 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -258,8 +258,7 @@ void ecryptfs_destroy_mount_crypt_stat( &mount_crypt_stat->global_auth_tok_list, mount_crypt_stat_list) { list_del(&auth_tok->mount_crypt_stat_list); - if (auth_tok->global_auth_tok_key - && !(auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID)) + if (!(auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID)) key_put(auth_tok->global_auth_tok_key); kmem_cache_free(ecryptfs_global_auth_tok_cache, auth_tok); } diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 8db0b464483f..63cd2c147221 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -45,20 +45,20 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - int rc; - - if (!(lower_dentry->d_flags & DCACHE_OP_REVALIDATE)) - return 1; + int rc = 1; if (flags & LOOKUP_RCU) return -ECHILD; - rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags); + if (lower_dentry->d_flags & DCACHE_OP_REVALIDATE) + rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags); + if (d_really_is_positive(dentry)) { - struct inode *lower_inode = - ecryptfs_inode_to_lower(d_inode(dentry)); + struct inode *inode = d_inode(dentry); - 
fsstack_copy_attr_all(d_inode(dentry), lower_inode); + fsstack_copy_attr_all(inode, ecryptfs_inode_to_lower(inode)); + if (!inode->i_nlink) + return 0; } return rc; } diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index cf208522998e..caba848ac763 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -299,7 +299,7 @@ static int ecryptfs_write_begin(struct file *file, rc = ecryptfs_read_lower_page_segment( page, index, 0, PAGE_CACHE_SIZE, mapping->host); if (rc) { - printk(KERN_ERR "%s: Error attemping to read " + printk(KERN_ERR "%s: Error attempting to read " "lower page segment; rc = [%d]\n", __func__, rc); ClearPageUptodate(page); diff --git a/fs/exec.c b/fs/exec.c index 1977c2a553ac..b06623a9347f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -98,6 +98,12 @@ static inline void put_binfmt(struct linux_binfmt * fmt) module_put(fmt->module); } +bool path_noexec(const struct path *path) +{ + return (path->mnt->mnt_flags & MNT_NOEXEC) || + (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC); +} + #ifdef CONFIG_USELIB /* * Note that a shared library must be both readable and executable due to @@ -132,7 +138,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) goto exit; error = -EACCES; - if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) + if (path_noexec(&file->f_path)) goto exit; fsnotify_open(file); @@ -777,7 +783,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) if (!S_ISREG(file_inode(file)->i_mode)) goto exit; - if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) + if (path_noexec(&file->f_path)) goto exit; err = deny_write_access(file); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 3b57c9f83c9b..1982c3f11aec 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -20,6 +20,7 @@ #include <linux/time.h> #include <linux/pagemap.h> +#include <linux/dax.h> #include <linux/quotaops.h> #include "ext2.h" #include "xattr.h" @@ -31,6 +32,12 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return dax_fault(vma, vmf, ext2_get_block, NULL); } +static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, unsigned int flags) +{ + return dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL); +} + static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { return dax_mkwrite(vma, vmf, ext2_get_block, NULL); @@ -38,6 +45,7 @@ static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static const struct vm_operations_struct ext2_dax_vm_ops = { .fault = ext2_dax_fault, + .pmd_fault = ext2_dax_pmd_fault, .page_mkwrite = ext2_dax_mkwrite, .pfn_mkwrite = dax_pfn_mkwrite, }; @@ -49,7 +57,7 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); vma->vm_ops = &ext2_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; return 0; } #else diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 5c04a0ddea80..efe5fb21c533 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -577,7 +577,10 @@ got: goto fail; } - dquot_initialize(inode); + err = dquot_initialize(inode); + if (err) + goto fail_drop; + err = dquot_alloc_inode(inode); if (err) goto fail_drop; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 5c09776d347f..c60a248c640c 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -25,6 +25,7 @@ #include <linux/time.h> #include <linux/highuid.h> #include <linux/pagemap.h> +#include <linux/dax.h> #include <linux/quotaops.h> #include <linux/writeback.h> #include <linux/buffer_head.h> @@ -1552,8 +1553,11 
@@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) if (error) return error; - if (is_quota_modification(inode, iattr)) - dquot_initialize(inode); + if (is_quota_modification(inode, iattr)) { + error = dquot_initialize(inode); + if (error) + return error; + } if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { error = dquot_transfer(inode, iattr); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 13ec54a99c96..b4841e3066a5 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -96,8 +96,11 @@ struct dentry *ext2_get_parent(struct dentry *child) static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) { struct inode *inode; + int err; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + return err; inode = ext2_new_inode(dir, mode, &dentry->d_name); if (IS_ERR(inode)) @@ -143,7 +146,9 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, if (!new_valid_dev(rdev)) return -EINVAL; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + return err; inode = ext2_new_inode (dir, mode, &dentry->d_name); err = PTR_ERR(inode); @@ -169,7 +174,9 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry, if (l > sb->s_blocksize) goto out; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + goto out; inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO, &dentry->d_name); err = PTR_ERR(inode); @@ -212,7 +219,9 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, struct inode *inode = d_inode(old_dentry); int err; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + return err; inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); @@ -233,7 +242,9 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) struct inode * inode; int err; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + return err; inode_inc_link_count(dir); @@ -279,13 +290,17 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry) struct inode * inode = d_inode(dentry); struct ext2_dir_entry_2 * de; struct page * page; - int err = -ENOENT; + int err; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + goto out; de = ext2_find_entry (dir, &dentry->d_name, &page); - if (!de) + if (!de) { + err = -ENOENT; goto out; + } err = ext2_delete_entry (de, page); if (err) @@ -323,14 +338,21 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, struct ext2_dir_entry_2 * dir_de = NULL; struct page * old_page; struct ext2_dir_entry_2 * old_de; - int err = -ENOENT; + int err; + + err = dquot_initialize(old_dir); + if (err) + goto out; - dquot_initialize(old_dir); - dquot_initialize(new_dir); + err = dquot_initialize(new_dir); + if (err) + goto out; old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page); - if (!old_de) + if (!old_de) { + err = -ENOENT; goto out; + } if (S_ISDIR(old_inode->i_mode)) { err = -EIO; diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig deleted file mode 100644 index e8c6ba0e4a3e..000000000000 --- a/fs/ext3/Kconfig +++ /dev/null @@ -1,89 +0,0 @@ -config EXT3_FS - tristate "Ext3 journalling file system support" - select JBD - help - This is the journalling version of the Second extended file system - (often called ext3), the de facto standard Linux file system - (method to organize files on a storage device) for hard disks. 
- - The journalling code included in this driver means you do not have - to run e2fsck (file system checker) on your file systems after a - crash. The journal keeps track of any changes that were being made - at the time the system crashed, and can ensure that your file system - is consistent without the need for a lengthy check. - - Other than adding the journal to the file system, the on-disk format - of ext3 is identical to ext2. It is possible to freely switch - between using the ext3 driver and the ext2 driver, as long as the - file system has been cleanly unmounted, or e2fsck is run on the file - system. - - To add a journal on an existing ext2 file system or change the - behavior of ext3 file systems, you can use the tune2fs utility ("man - tune2fs"). To modify attributes of files and directories on ext3 - file systems, use chattr ("man chattr"). You need to be using - e2fsprogs version 1.20 or later in order to create ext3 journals - (available at <http://sourceforge.net/projects/e2fsprogs/>). - - To compile this file system support as a module, choose M here: the - module will be called ext3. - -config EXT3_DEFAULTS_TO_ORDERED - bool "Default to 'data=ordered' in ext3" - depends on EXT3_FS - default y - help - The journal mode options for ext3 have different tradeoffs - between when data is guaranteed to be on disk and - performance. The use of "data=writeback" can cause - unwritten data to appear in files after an system crash or - power failure, which can be a security issue. However, - "data=ordered" mode can also result in major performance - problems, including seconds-long delays before an fsync() - call returns. For details, see: - - http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs - - If you have been historically happy with ext3's performance, - data=ordered mode will be a safe choice and you should - answer 'y' here. If you understand the reliability and data - privacy issues of data=writeback and are willing to make - that trade off, answer 'n'. - -config EXT3_FS_XATTR - bool "Ext3 extended attributes" - depends on EXT3_FS - default y - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). - - If unsure, say N. - - You need this for POSIX ACL support on ext3. - -config EXT3_FS_POSIX_ACL - bool "Ext3 POSIX Access Control Lists" - depends on EXT3_FS_XATTR - select FS_POSIX_ACL - help - Posix Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. - - To learn more about Access Control Lists, visit the Posix ACLs for - Linux website <http://acl.bestbits.at/>. - - If you don't know what Access Control Lists are, say N - -config EXT3_FS_SECURITY - bool "Ext3 Security Labels" - depends on EXT3_FS_XATTR - help - Security labels support alternative access control models - implemented by security modules like SELinux. This option - enables an extended attribute handler for file security - labels in the ext3 filesystem. - - If you are not using a security module that requires using - extended attributes for file security labels, say N. diff --git a/fs/ext3/Makefile b/fs/ext3/Makefile deleted file mode 100644 index e77766a8b3f0..000000000000 --- a/fs/ext3/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -# -# Makefile for the linux ext3-filesystem routines. 
-# - -obj-$(CONFIG_EXT3_FS) += ext3.o - -ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o ext3_jbd.o - -ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o -ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o -ext3-$(CONFIG_EXT3_FS_SECURITY) += xattr_security.o diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c deleted file mode 100644 index 8bbaf5bcf982..000000000000 --- a/fs/ext3/acl.c +++ /dev/null @@ -1,281 +0,0 @@ -/* - * linux/fs/ext3/acl.c - * - * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> - */ - -#include "ext3.h" -#include "xattr.h" -#include "acl.h" - -/* - * Convert from filesystem to in-memory representation. - */ -static struct posix_acl * -ext3_acl_from_disk(const void *value, size_t size) -{ - const char *end = (char *)value + size; - int n, count; - struct posix_acl *acl; - - if (!value) - return NULL; - if (size < sizeof(ext3_acl_header)) - return ERR_PTR(-EINVAL); - if (((ext3_acl_header *)value)->a_version != - cpu_to_le32(EXT3_ACL_VERSION)) - return ERR_PTR(-EINVAL); - value = (char *)value + sizeof(ext3_acl_header); - count = ext3_acl_count(size); - if (count < 0) - return ERR_PTR(-EINVAL); - if (count == 0) - return NULL; - acl = posix_acl_alloc(count, GFP_NOFS); - if (!acl) - return ERR_PTR(-ENOMEM); - for (n=0; n < count; n++) { - ext3_acl_entry *entry = - (ext3_acl_entry *)value; - if ((char *)value + sizeof(ext3_acl_entry_short) > end) - goto fail; - acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); - acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); - switch(acl->a_entries[n].e_tag) { - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - value = (char *)value + - sizeof(ext3_acl_entry_short); - break; - - case ACL_USER: - value = (char *)value + sizeof(ext3_acl_entry); - if ((char *)value > end) - goto fail; - acl->a_entries[n].e_uid = - make_kuid(&init_user_ns, - le32_to_cpu(entry->e_id)); - break; - case ACL_GROUP: - value = (char *)value + sizeof(ext3_acl_entry); - if ((char *)value > end) - goto fail; - acl->a_entries[n].e_gid = - make_kgid(&init_user_ns, - le32_to_cpu(entry->e_id)); - break; - - default: - goto fail; - } - } - if (value != end) - goto fail; - return acl; - -fail: - posix_acl_release(acl); - return ERR_PTR(-EINVAL); -} - -/* - * Convert from in-memory to filesystem representation. 
- */ -static void * -ext3_acl_to_disk(const struct posix_acl *acl, size_t *size) -{ - ext3_acl_header *ext_acl; - char *e; - size_t n; - - *size = ext3_acl_size(acl->a_count); - ext_acl = kmalloc(sizeof(ext3_acl_header) + acl->a_count * - sizeof(ext3_acl_entry), GFP_NOFS); - if (!ext_acl) - return ERR_PTR(-ENOMEM); - ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION); - e = (char *)ext_acl + sizeof(ext3_acl_header); - for (n=0; n < acl->a_count; n++) { - const struct posix_acl_entry *acl_e = &acl->a_entries[n]; - ext3_acl_entry *entry = (ext3_acl_entry *)e; - entry->e_tag = cpu_to_le16(acl_e->e_tag); - entry->e_perm = cpu_to_le16(acl_e->e_perm); - switch(acl_e->e_tag) { - case ACL_USER: - entry->e_id = cpu_to_le32( - from_kuid(&init_user_ns, acl_e->e_uid)); - e += sizeof(ext3_acl_entry); - break; - case ACL_GROUP: - entry->e_id = cpu_to_le32( - from_kgid(&init_user_ns, acl_e->e_gid)); - e += sizeof(ext3_acl_entry); - break; - - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - e += sizeof(ext3_acl_entry_short); - break; - - default: - goto fail; - } - } - return (char *)ext_acl; - -fail: - kfree(ext_acl); - return ERR_PTR(-EINVAL); -} - -/* - * Inode operation get_posix_acl(). - * - * inode->i_mutex: don't care - */ -struct posix_acl * -ext3_get_acl(struct inode *inode, int type) -{ - int name_index; - char *value = NULL; - struct posix_acl *acl; - int retval; - - switch (type) { - case ACL_TYPE_ACCESS: - name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; - break; - case ACL_TYPE_DEFAULT: - name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT; - break; - default: - BUG(); - } - - retval = ext3_xattr_get(inode, name_index, "", NULL, 0); - if (retval > 0) { - value = kmalloc(retval, GFP_NOFS); - if (!value) - return ERR_PTR(-ENOMEM); - retval = ext3_xattr_get(inode, name_index, "", value, retval); - } - if (retval > 0) - acl = ext3_acl_from_disk(value, retval); - else if (retval == -ENODATA || retval == -ENOSYS) - acl = NULL; - else - acl = ERR_PTR(retval); - kfree(value); - - if (!IS_ERR(acl)) - set_cached_acl(inode, type, acl); - - return acl; -} - -/* - * Set the access or default ACL of an inode. - * - * inode->i_mutex: down unless called from ext3_new_inode - */ -static int -__ext3_set_acl(handle_t *handle, struct inode *inode, int type, - struct posix_acl *acl) -{ - int name_index; - void *value = NULL; - size_t size = 0; - int error; - - switch(type) { - case ACL_TYPE_ACCESS: - name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) - return error; - else { - inode->i_ctime = CURRENT_TIME_SEC; - ext3_mark_inode_dirty(handle, inode); - if (error == 0) - acl = NULL; - } - } - break; - - case ACL_TYPE_DEFAULT: - name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT; - if (!S_ISDIR(inode->i_mode)) - return acl ? 
-EACCES : 0; - break; - - default: - return -EINVAL; - } - if (acl) { - value = ext3_acl_to_disk(acl, &size); - if (IS_ERR(value)) - return (int)PTR_ERR(value); - } - - error = ext3_xattr_set_handle(handle, inode, name_index, "", - value, size, 0); - - kfree(value); - - if (!error) - set_cached_acl(inode, type, acl); - - return error; -} - -int -ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type) -{ - handle_t *handle; - int error, retries = 0; - -retry: - handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - error = __ext3_set_acl(handle, inode, type, acl); - ext3_journal_stop(handle); - if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - return error; -} - -/* - * Initialize the ACLs of a new inode. Called from ext3_new_inode. - * - * dir->i_mutex: down - * inode->i_mutex: up (access to inode is still exclusive) - */ -int -ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) -{ - struct posix_acl *default_acl, *acl; - int error; - - error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); - if (error) - return error; - - if (default_acl) { - error = __ext3_set_acl(handle, inode, ACL_TYPE_DEFAULT, - default_acl); - posix_acl_release(default_acl); - } - if (acl) { - if (!error) - error = __ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, - acl); - posix_acl_release(acl); - } - return error; -} diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h deleted file mode 100644 index ea1c69edab9e..000000000000 --- a/fs/ext3/acl.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - File: fs/ext3/acl.h - - (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> -*/ - -#include <linux/posix_acl_xattr.h> - -#define EXT3_ACL_VERSION 0x0001 - -typedef struct { - __le16 e_tag; - __le16 e_perm; - __le32 e_id; -} ext3_acl_entry; - -typedef struct { - __le16 e_tag; - __le16 e_perm; -} ext3_acl_entry_short; - -typedef struct { - __le32 a_version; -} ext3_acl_header; - -static inline size_t ext3_acl_size(int count) -{ - if (count <= 4) { - return sizeof(ext3_acl_header) + - count * sizeof(ext3_acl_entry_short); - } else { - return sizeof(ext3_acl_header) + - 4 * sizeof(ext3_acl_entry_short) + - (count - 4) * sizeof(ext3_acl_entry); - } -} - -static inline int ext3_acl_count(size_t size) -{ - ssize_t s; - size -= sizeof(ext3_acl_header); - s = size - 4 * sizeof(ext3_acl_entry_short); - if (s < 0) { - if (size % sizeof(ext3_acl_entry_short)) - return -1; - return size / sizeof(ext3_acl_entry_short); - } else { - if (s % sizeof(ext3_acl_entry)) - return -1; - return s / sizeof(ext3_acl_entry) + 4; - } -} - -#ifdef CONFIG_EXT3_FS_POSIX_ACL - -/* acl.c */ -extern struct posix_acl *ext3_get_acl(struct inode *inode, int type); -extern int ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type); -extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); - -#else /* CONFIG_EXT3_FS_POSIX_ACL */ -#include <linux/sched.h> -#define ext3_get_acl NULL -#define ext3_set_acl NULL - -static inline int -ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) -{ - return 0; -} -#endif /* CONFIG_EXT3_FS_POSIX_ACL */ - diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c deleted file mode 100644 index 158b5d4ce067..000000000000 --- a/fs/ext3/balloc.c +++ /dev/null @@ -1,2158 +0,0 @@ -/* - * linux/fs/ext3/balloc.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie 
(Paris VI) - * - * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 - */ - -#include <linux/quotaops.h> -#include <linux/blkdev.h> -#include "ext3.h" - -/* - * balloc.c contains the blocks allocation and deallocation routines - */ - -/* - * The free blocks are managed by bitmaps. A file system contains several - * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap - * block for inodes, N blocks for the inode table and data blocks. - * - * The file system contains group descriptors which are located after the - * super block. Each descriptor contains the number of the bitmap block and - * the free blocks count in the block. The descriptors are loaded in memory - * when a file system is mounted (see ext3_fill_super). - */ - - -#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - -/* - * Calculate the block group number and offset, given a block number - */ -static void ext3_get_group_no_and_offset(struct super_block *sb, - ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp) -{ - struct ext3_super_block *es = EXT3_SB(sb)->s_es; - - blocknr = blocknr - le32_to_cpu(es->s_first_data_block); - if (offsetp) - *offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb); - if (blockgrpp) - *blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb); -} - -/** - * ext3_get_group_desc() -- load group descriptor from disk - * @sb: super block - * @block_group: given block group - * @bh: pointer to the buffer head to store the block - * group descriptor - */ -struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, - unsigned int block_group, - struct buffer_head ** bh) -{ - unsigned long group_desc; - unsigned long offset; - struct ext3_group_desc * desc; - struct ext3_sb_info *sbi = EXT3_SB(sb); - - if (block_group >= sbi->s_groups_count) { - ext3_error (sb, "ext3_get_group_desc", - "block_group >= groups_count - " - "block_group = %d, groups_count = %lu", - block_group, sbi->s_groups_count); - - return NULL; - } - smp_rmb(); - - group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb); - offset = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1); - if (!sbi->s_group_desc[group_desc]) { - ext3_error (sb, "ext3_get_group_desc", - "Group descriptor not loaded - " - "block_group = %d, group_desc = %lu, desc = %lu", - block_group, group_desc, offset); - return NULL; - } - - desc = (struct ext3_group_desc *) sbi->s_group_desc[group_desc]->b_data; - if (bh) - *bh = sbi->s_group_desc[group_desc]; - return desc + offset; -} - -static int ext3_valid_block_bitmap(struct super_block *sb, - struct ext3_group_desc *desc, - unsigned int block_group, - struct buffer_head *bh) -{ - ext3_grpblk_t offset; - ext3_grpblk_t next_zero_bit; - ext3_fsblk_t bitmap_blk; - ext3_fsblk_t group_first_block; - - group_first_block = ext3_group_first_block_no(sb, block_group); - - /* check whether block bitmap block number is set */ - bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); - offset = bitmap_blk - group_first_block; - if (!ext3_test_bit(offset, bh->b_data)) - /* bad block bitmap */ - goto err_out; - - /* check whether the inode bitmap block number is set */ - bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap); - offset = bitmap_blk - group_first_block; - if (!ext3_test_bit(offset, bh->b_data)) - /* bad block bitmap */ - goto err_out; - - /* check whether the inode table block number is set */ - bitmap_blk = le32_to_cpu(desc->bg_inode_table); - offset = 
bitmap_blk - group_first_block; - next_zero_bit = ext3_find_next_zero_bit(bh->b_data, - offset + EXT3_SB(sb)->s_itb_per_group, - offset); - if (next_zero_bit >= offset + EXT3_SB(sb)->s_itb_per_group) - /* good bitmap for inode tables */ - return 1; - -err_out: - ext3_error(sb, __func__, - "Invalid block bitmap - " - "block_group = %d, block = %lu", - block_group, bitmap_blk); - return 0; -} - -/** - * read_block_bitmap() - * @sb: super block - * @block_group: given block group - * - * Read the bitmap for a given block_group,and validate the - * bits for block/inode/inode tables are set in the bitmaps - * - * Return buffer_head on success or NULL in case of failure. - */ -static struct buffer_head * -read_block_bitmap(struct super_block *sb, unsigned int block_group) -{ - struct ext3_group_desc * desc; - struct buffer_head * bh = NULL; - ext3_fsblk_t bitmap_blk; - - desc = ext3_get_group_desc(sb, block_group, NULL); - if (!desc) - return NULL; - trace_ext3_read_block_bitmap(sb, block_group); - bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); - bh = sb_getblk(sb, bitmap_blk); - if (unlikely(!bh)) { - ext3_error(sb, __func__, - "Cannot read block bitmap - " - "block_group = %d, block_bitmap = %u", - block_group, le32_to_cpu(desc->bg_block_bitmap)); - return NULL; - } - if (likely(bh_uptodate_or_lock(bh))) - return bh; - - if (bh_submit_read(bh) < 0) { - brelse(bh); - ext3_error(sb, __func__, - "Cannot read block bitmap - " - "block_group = %d, block_bitmap = %u", - block_group, le32_to_cpu(desc->bg_block_bitmap)); - return NULL; - } - ext3_valid_block_bitmap(sb, desc, block_group, bh); - /* - * file system mounted not to panic on error, continue with corrupt - * bitmap - */ - return bh; -} -/* - * The reservation window structure operations - * -------------------------------------------- - * Operations include: - * dump, find, add, remove, is_empty, find_next_reservable_window, etc. - * - * We use a red-black tree to represent per-filesystem reservation - * windows. - * - */ - -/** - * __rsv_window_dump() -- Dump the filesystem block allocation reservation map - * @rb_root: root of per-filesystem reservation rb tree - * @verbose: verbose mode - * @fn: function which wishes to dump the reservation map - * - * If verbose is turned on, it will print the whole block reservation - * windows(start, end). Otherwise, it will only print out the "bad" windows, - * those windows that overlap with their immediate neighbors. 
- */ -#if 1 -static void __rsv_window_dump(struct rb_root *root, int verbose, - const char *fn) -{ - struct rb_node *n; - struct ext3_reserve_window_node *rsv, *prev; - int bad; - -restart: - n = rb_first(root); - bad = 0; - prev = NULL; - - printk("Block Allocation Reservation Windows Map (%s):\n", fn); - while (n) { - rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node); - if (verbose) - printk("reservation window 0x%p " - "start: %lu, end: %lu\n", - rsv, rsv->rsv_start, rsv->rsv_end); - if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) { - printk("Bad reservation %p (start >= end)\n", - rsv); - bad = 1; - } - if (prev && prev->rsv_end >= rsv->rsv_start) { - printk("Bad reservation %p (prev->end >= start)\n", - rsv); - bad = 1; - } - if (bad) { - if (!verbose) { - printk("Restarting reservation walk in verbose mode\n"); - verbose = 1; - goto restart; - } - } - n = rb_next(n); - prev = rsv; - } - printk("Window map complete.\n"); - BUG_ON(bad); -} -#define rsv_window_dump(root, verbose) \ - __rsv_window_dump((root), (verbose), __func__) -#else -#define rsv_window_dump(root, verbose) do {} while (0) -#endif - -/** - * goal_in_my_reservation() - * @rsv: inode's reservation window - * @grp_goal: given goal block relative to the allocation block group - * @group: the current allocation block group - * @sb: filesystem super block - * - * Test if the given goal block (group relative) is within the file's - * own block reservation window range. - * - * If the reservation window is outside the goal allocation group, return 0; - * grp_goal (given goal block) could be -1, which means no specific - * goal block. In this case, always return 1. - * If the goal block is within the reservation window, return 1; - * otherwise, return 0; - */ -static int -goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal, - unsigned int group, struct super_block * sb) -{ - ext3_fsblk_t group_first_block, group_last_block; - - group_first_block = ext3_group_first_block_no(sb, group); - group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1); - - if ((rsv->_rsv_start > group_last_block) || - (rsv->_rsv_end < group_first_block)) - return 0; - if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start) - || (grp_goal + group_first_block > rsv->_rsv_end))) - return 0; - return 1; -} - -/** - * search_reserve_window() - * @rb_root: root of reservation tree - * @goal: target allocation block - * - * Find the reserved window which includes the goal, or the previous one - * if the goal is not in any window. - * Returns NULL if there are no windows or if all windows start after the goal. - */ -static struct ext3_reserve_window_node * -search_reserve_window(struct rb_root *root, ext3_fsblk_t goal) -{ - struct rb_node *n = root->rb_node; - struct ext3_reserve_window_node *rsv; - - if (!n) - return NULL; - - do { - rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node); - - if (goal < rsv->rsv_start) - n = n->rb_left; - else if (goal > rsv->rsv_end) - n = n->rb_right; - else - return rsv; - } while (n); - /* - * We've fallen off the end of the tree: the goal wasn't inside - * any particular node. OK, the previous node must be to one - * side of the interval containing the goal. If it's the RHS, - * we need to back up one. - */ - if (rsv->rsv_start > goal) { - n = rb_prev(&rsv->rsv_node); - rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node); - } - return rsv; -} - -/** - * ext3_rsv_window_add() -- Insert a window to the block reservation rb tree. 
- * @sb: super block - * @rsv: reservation window to add - * - * Must be called with rsv_lock hold. - */ -void ext3_rsv_window_add(struct super_block *sb, - struct ext3_reserve_window_node *rsv) -{ - struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root; - struct rb_node *node = &rsv->rsv_node; - ext3_fsblk_t start = rsv->rsv_start; - - struct rb_node ** p = &root->rb_node; - struct rb_node * parent = NULL; - struct ext3_reserve_window_node *this; - - trace_ext3_rsv_window_add(sb, rsv); - while (*p) - { - parent = *p; - this = rb_entry(parent, struct ext3_reserve_window_node, rsv_node); - - if (start < this->rsv_start) - p = &(*p)->rb_left; - else if (start > this->rsv_end) - p = &(*p)->rb_right; - else { - rsv_window_dump(root, 1); - BUG(); - } - } - - rb_link_node(node, parent, p); - rb_insert_color(node, root); -} - -/** - * ext3_rsv_window_remove() -- unlink a window from the reservation rb tree - * @sb: super block - * @rsv: reservation window to remove - * - * Mark the block reservation window as not allocated, and unlink it - * from the filesystem reservation window rb tree. Must be called with - * rsv_lock hold. - */ -static void rsv_window_remove(struct super_block *sb, - struct ext3_reserve_window_node *rsv) -{ - rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; - rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; - rsv->rsv_alloc_hit = 0; - rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root); -} - -/* - * rsv_is_empty() -- Check if the reservation window is allocated. - * @rsv: given reservation window to check - * - * returns 1 if the end block is EXT3_RESERVE_WINDOW_NOT_ALLOCATED. - */ -static inline int rsv_is_empty(struct ext3_reserve_window *rsv) -{ - /* a valid reservation end block could not be 0 */ - return rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED; -} - -/** - * ext3_init_block_alloc_info() - * @inode: file inode structure - * - * Allocate and initialize the reservation window structure, and - * link the window to the ext3 inode structure at last - * - * The reservation window structure is only dynamically allocated - * and linked to ext3 inode the first time the open file - * needs a new block. So, before every ext3_new_block(s) call, for - * regular files, we should check whether the reservation window - * structure exists or not. In the latter case, this function is called. - * Fail to do so will result in block reservation being turned off for that - * open file. - * - * This function is called from ext3_get_blocks_handle(), also called - * when setting the reservation window size through ioctl before the file - * is open for write (needs block allocation). - * - * Needs truncate_mutex protection prior to call this function. 
- */ -void ext3_init_block_alloc_info(struct inode *inode) -{ - struct ext3_inode_info *ei = EXT3_I(inode); - struct ext3_block_alloc_info *block_i; - struct super_block *sb = inode->i_sb; - - block_i = kmalloc(sizeof(*block_i), GFP_NOFS); - if (block_i) { - struct ext3_reserve_window_node *rsv = &block_i->rsv_window_node; - - rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; - rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; - - /* - * if filesystem is mounted with NORESERVATION, the goal - * reservation window size is set to zero to indicate - * block reservation is off - */ - if (!test_opt(sb, RESERVATION)) - rsv->rsv_goal_size = 0; - else - rsv->rsv_goal_size = EXT3_DEFAULT_RESERVE_BLOCKS; - rsv->rsv_alloc_hit = 0; - block_i->last_alloc_logical_block = 0; - block_i->last_alloc_physical_block = 0; - } - ei->i_block_alloc_info = block_i; -} - -/** - * ext3_discard_reservation() - * @inode: inode - * - * Discard(free) block reservation window on last file close, or truncate - * or at last iput(). - * - * It is being called in three cases: - * ext3_release_file(): last writer close the file - * ext3_clear_inode(): last iput(), when nobody link to this file. - * ext3_truncate(): when the block indirect map is about to change. - * - */ -void ext3_discard_reservation(struct inode *inode) -{ - struct ext3_inode_info *ei = EXT3_I(inode); - struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info; - struct ext3_reserve_window_node *rsv; - spinlock_t *rsv_lock = &EXT3_SB(inode->i_sb)->s_rsv_window_lock; - - if (!block_i) - return; - - rsv = &block_i->rsv_window_node; - if (!rsv_is_empty(&rsv->rsv_window)) { - spin_lock(rsv_lock); - if (!rsv_is_empty(&rsv->rsv_window)) { - trace_ext3_discard_reservation(inode, rsv); - rsv_window_remove(inode->i_sb, rsv); - } - spin_unlock(rsv_lock); - } -} - -/** - * ext3_free_blocks_sb() -- Free given blocks and update quota - * @handle: handle to this transaction - * @sb: super block - * @block: start physical block to free - * @count: number of blocks to free - * @pdquot_freed_blocks: pointer to quota - */ -void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb, - ext3_fsblk_t block, unsigned long count, - unsigned long *pdquot_freed_blocks) -{ - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *gd_bh; - unsigned long block_group; - ext3_grpblk_t bit; - unsigned long i; - unsigned long overflow; - struct ext3_group_desc * desc; - struct ext3_super_block * es; - struct ext3_sb_info *sbi; - int err = 0, ret; - ext3_grpblk_t group_freed; - - *pdquot_freed_blocks = 0; - sbi = EXT3_SB(sb); - es = sbi->s_es; - if (block < le32_to_cpu(es->s_first_data_block) || - block + count < block || - block + count > le32_to_cpu(es->s_blocks_count)) { - ext3_error (sb, "ext3_free_blocks", - "Freeing blocks not in datazone - " - "block = "E3FSBLK", count = %lu", block, count); - goto error_return; - } - - ext3_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1); - -do_more: - overflow = 0; - block_group = (block - le32_to_cpu(es->s_first_data_block)) / - EXT3_BLOCKS_PER_GROUP(sb); - bit = (block - le32_to_cpu(es->s_first_data_block)) % - EXT3_BLOCKS_PER_GROUP(sb); - /* - * Check to see if we are freeing blocks across a group - * boundary. 
- */ - if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { - overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); - count -= overflow; - } - brelse(bitmap_bh); - bitmap_bh = read_block_bitmap(sb, block_group); - if (!bitmap_bh) - goto error_return; - desc = ext3_get_group_desc (sb, block_group, &gd_bh); - if (!desc) - goto error_return; - - if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) || - in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) || - in_range (block, le32_to_cpu(desc->bg_inode_table), - sbi->s_itb_per_group) || - in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table), - sbi->s_itb_per_group)) { - ext3_error (sb, "ext3_free_blocks", - "Freeing blocks in system zones - " - "Block = "E3FSBLK", count = %lu", - block, count); - goto error_return; - } - - /* - * We are about to start releasing blocks in the bitmap, - * so we need undo access. - */ - /* @@@ check errors */ - BUFFER_TRACE(bitmap_bh, "getting undo access"); - err = ext3_journal_get_undo_access(handle, bitmap_bh); - if (err) - goto error_return; - - /* - * We are about to modify some metadata. Call the journal APIs - * to unshare ->b_data if a currently-committing transaction is - * using it - */ - BUFFER_TRACE(gd_bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, gd_bh); - if (err) - goto error_return; - - jbd_lock_bh_state(bitmap_bh); - - for (i = 0, group_freed = 0; i < count; i++) { - /* - * An HJ special. This is expensive... - */ -#ifdef CONFIG_JBD_DEBUG - jbd_unlock_bh_state(bitmap_bh); - { - struct buffer_head *debug_bh; - debug_bh = sb_find_get_block(sb, block + i); - if (debug_bh) { - BUFFER_TRACE(debug_bh, "Deleted!"); - if (!bh2jh(bitmap_bh)->b_committed_data) - BUFFER_TRACE(debug_bh, - "No committed data in bitmap"); - BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap"); - __brelse(debug_bh); - } - } - jbd_lock_bh_state(bitmap_bh); -#endif - if (need_resched()) { - jbd_unlock_bh_state(bitmap_bh); - cond_resched(); - jbd_lock_bh_state(bitmap_bh); - } - /* @@@ This prevents newly-allocated data from being - * freed and then reallocated within the same - * transaction. - * - * Ideally we would want to allow that to happen, but to - * do so requires making journal_forget() capable of - * revoking the queued write of a data block, which - * implies blocking on the journal lock. *forget() - * cannot block due to truncate races. - * - * Eventually we can fix this by making journal_forget() - * return a status indicating whether or not it was able - * to revoke the buffer. On successful revoke, it is - * safe not to set the allocation bit in the committed - * bitmap, because we know that there is no outstanding - * activity on the buffer any more and so it is safe to - * reallocate it. - */ - BUFFER_TRACE(bitmap_bh, "set in b_committed_data"); - J_ASSERT_BH(bitmap_bh, - bh2jh(bitmap_bh)->b_committed_data != NULL); - ext3_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i, - bh2jh(bitmap_bh)->b_committed_data); - - /* - * We clear the bit in the bitmap after setting the committed - * data bit, because this is the reverse order to that which - * the allocator uses. 
- */ - BUFFER_TRACE(bitmap_bh, "clear bit"); - if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group), - bit + i, bitmap_bh->b_data)) { - jbd_unlock_bh_state(bitmap_bh); - ext3_error(sb, __func__, - "bit already cleared for block "E3FSBLK, - block + i); - jbd_lock_bh_state(bitmap_bh); - BUFFER_TRACE(bitmap_bh, "bit already cleared"); - } else { - group_freed++; - } - } - jbd_unlock_bh_state(bitmap_bh); - - spin_lock(sb_bgl_lock(sbi, block_group)); - le16_add_cpu(&desc->bg_free_blocks_count, group_freed); - spin_unlock(sb_bgl_lock(sbi, block_group)); - percpu_counter_add(&sbi->s_freeblocks_counter, count); - - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext3_journal_dirty_metadata(handle, bitmap_bh); - - /* And the group descriptor block */ - BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext3_journal_dirty_metadata(handle, gd_bh); - if (!err) err = ret; - *pdquot_freed_blocks += group_freed; - - if (overflow && !err) { - block += count; - count = overflow; - goto do_more; - } - -error_return: - brelse(bitmap_bh); - ext3_std_error(sb, err); - return; -} - -/** - * ext3_free_blocks() -- Free given blocks and update quota - * @handle: handle for this transaction - * @inode: inode - * @block: start physical block to free - * @count: number of blocks to count - */ -void ext3_free_blocks(handle_t *handle, struct inode *inode, - ext3_fsblk_t block, unsigned long count) -{ - struct super_block *sb = inode->i_sb; - unsigned long dquot_freed_blocks; - - trace_ext3_free_blocks(inode, block, count); - ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); - if (dquot_freed_blocks) - dquot_free_block(inode, dquot_freed_blocks); - return; -} - -/** - * ext3_test_allocatable() - * @nr: given allocation block group - * @bh: bufferhead contains the bitmap of the given block group - * - * For ext3 allocations, we must not reuse any blocks which are - * allocated in the bitmap buffer's "last committed data" copy. This - * prevents deletes from freeing up the page for reuse until we have - * committed the delete transaction. - * - * If we didn't do this, then deleting something and reallocating it as - * data would allow the old block to be overwritten before the - * transaction committed (because we force data to disk before commit). - * This would lead to corruption if we crashed between overwriting the - * data and committing the delete. - * - * @@@ We may want to make this allocation behaviour conditional on - * data-writes at some point, and disable it for metadata allocations or - * sync-data inodes. - */ -static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh) -{ - int ret; - struct journal_head *jh = bh2jh(bh); - - if (ext3_test_bit(nr, bh->b_data)) - return 0; - - jbd_lock_bh_state(bh); - if (!jh->b_committed_data) - ret = 1; - else - ret = !ext3_test_bit(nr, jh->b_committed_data); - jbd_unlock_bh_state(bh); - return ret; -} - -/** - * bitmap_search_next_usable_block() - * @start: the starting block (group relative) of the search - * @bh: bufferhead contains the block group bitmap - * @maxblocks: the ending block (group relative) of the reservation - * - * The bitmap search --- search forward alternately through the actual - * bitmap on disk and the last-committed copy in journal, until we find a - * bit free in both bitmaps. 
- */ -static ext3_grpblk_t -bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh, - ext3_grpblk_t maxblocks) -{ - ext3_grpblk_t next; - struct journal_head *jh = bh2jh(bh); - - while (start < maxblocks) { - next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start); - if (next >= maxblocks) - return -1; - if (ext3_test_allocatable(next, bh)) - return next; - jbd_lock_bh_state(bh); - if (jh->b_committed_data) - start = ext3_find_next_zero_bit(jh->b_committed_data, - maxblocks, next); - jbd_unlock_bh_state(bh); - } - return -1; -} - -/** - * find_next_usable_block() - * @start: the starting block (group relative) to find next - * allocatable block in bitmap. - * @bh: bufferhead contains the block group bitmap - * @maxblocks: the ending block (group relative) for the search - * - * Find an allocatable block in a bitmap. We honor both the bitmap and - * its last-committed copy (if that exists), and perform the "most - * appropriate allocation" algorithm of looking for a free block near - * the initial goal; then for a free byte somewhere in the bitmap; then - * for any free bit in the bitmap. - */ -static ext3_grpblk_t -find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh, - ext3_grpblk_t maxblocks) -{ - ext3_grpblk_t here, next; - char *p, *r; - - if (start > 0) { - /* - * The goal was occupied; search forward for a free - * block within the next XX blocks. - * - * end_goal is more or less random, but it has to be - * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the - * next 64-bit boundary is simple.. - */ - ext3_grpblk_t end_goal = (start + 63) & ~63; - if (end_goal > maxblocks) - end_goal = maxblocks; - here = ext3_find_next_zero_bit(bh->b_data, end_goal, start); - if (here < end_goal && ext3_test_allocatable(here, bh)) - return here; - ext3_debug("Bit not found near goal\n"); - } - - here = start; - if (here < 0) - here = 0; - - p = bh->b_data + (here >> 3); - r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3)); - next = (r - bh->b_data) << 3; - - if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh)) - return next; - - /* - * The bitmap search --- search forward alternately through the actual - * bitmap and the last-committed copy until we find a bit free in - * both - */ - here = bitmap_search_next_usable_block(here, bh, maxblocks); - return here; -} - -/** - * claim_block() - * @lock: the spin lock for this block group - * @block: the free block (group relative) to allocate - * @bh: the buffer_head contains the block group bitmap - * - * We think we can allocate this block in this bitmap. Try to set the bit. - * If that succeeds then check that nobody has allocated and then freed the - * block since we saw that is was not marked in b_committed_data. If it _was_ - * allocated and freed then clear the bit in the bitmap again and return - * zero (failure). 
- */ -static inline int -claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh) -{ - struct journal_head *jh = bh2jh(bh); - int ret; - - if (ext3_set_bit_atomic(lock, block, bh->b_data)) - return 0; - jbd_lock_bh_state(bh); - if (jh->b_committed_data && ext3_test_bit(block,jh->b_committed_data)) { - ext3_clear_bit_atomic(lock, block, bh->b_data); - ret = 0; - } else { - ret = 1; - } - jbd_unlock_bh_state(bh); - return ret; -} - -/** - * ext3_try_to_allocate() - * @sb: superblock - * @handle: handle to this transaction - * @group: given allocation block group - * @bitmap_bh: bufferhead holds the block bitmap - * @grp_goal: given target block within the group - * @count: target number of blocks to allocate - * @my_rsv: reservation window - * - * Attempt to allocate blocks within a give range. Set the range of allocation - * first, then find the first free bit(s) from the bitmap (within the range), - * and at last, allocate the blocks by claiming the found free bit as allocated. - * - * To set the range of this allocation: - * if there is a reservation window, only try to allocate block(s) from the - * file's own reservation window; - * Otherwise, the allocation range starts from the give goal block, ends at - * the block group's last block. - * - * If we failed to allocate the desired block then we may end up crossing to a - * new bitmap. In that case we must release write access to the old one via - * ext3_journal_release_buffer(), else we'll run out of credits. - */ -static ext3_grpblk_t -ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group, - struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal, - unsigned long *count, struct ext3_reserve_window *my_rsv) -{ - ext3_fsblk_t group_first_block; - ext3_grpblk_t start, end; - unsigned long num = 0; - - /* we do allocation within the reservation window if we have a window */ - if (my_rsv) { - group_first_block = ext3_group_first_block_no(sb, group); - if (my_rsv->_rsv_start >= group_first_block) - start = my_rsv->_rsv_start - group_first_block; - else - /* reservation window cross group boundary */ - start = 0; - end = my_rsv->_rsv_end - group_first_block + 1; - if (end > EXT3_BLOCKS_PER_GROUP(sb)) - /* reservation window crosses group boundary */ - end = EXT3_BLOCKS_PER_GROUP(sb); - if ((start <= grp_goal) && (grp_goal < end)) - start = grp_goal; - else - grp_goal = -1; - } else { - if (grp_goal > 0) - start = grp_goal; - else - start = 0; - end = EXT3_BLOCKS_PER_GROUP(sb); - } - - BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb)); - -repeat: - if (grp_goal < 0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) { - grp_goal = find_next_usable_block(start, bitmap_bh, end); - if (grp_goal < 0) - goto fail_access; - if (!my_rsv) { - int i; - - for (i = 0; i < 7 && grp_goal > start && - ext3_test_allocatable(grp_goal - 1, - bitmap_bh); - i++, grp_goal--) - ; - } - } - start = grp_goal; - - if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), - grp_goal, bitmap_bh)) { - /* - * The block was allocated by another thread, or it was - * allocated and then freed by another thread - */ - start++; - grp_goal++; - if (start >= end) - goto fail_access; - goto repeat; - } - num++; - grp_goal++; - while (num < *count && grp_goal < end - && ext3_test_allocatable(grp_goal, bitmap_bh) - && claim_block(sb_bgl_lock(EXT3_SB(sb), group), - grp_goal, bitmap_bh)) { - num++; - grp_goal++; - } - *count = num; - return grp_goal - num; -fail_access: - *count = num; - return -1; -} - -/** - * find_next_reservable_window(): - * find a reservable 
space within the given range. - * It does not allocate the reservation window for now: - * alloc_new_reservation() will do the work later. - * - * @search_head: the head of the searching list; - * This is not necessarily the list head of the whole filesystem - * - * We have both head and start_block to assist the search - * for the reservable space. The list starts from head, - * but we will shift to the place where start_block is, - * then start from there, when looking for a reservable space. - * - * @my_rsv: the reservation window - * - * @sb: the super block - * - * @start_block: the first block we consider to start - * the real search from - * - * @last_block: - * the maximum block number that our goal reservable space - * could start from. This is normally the last block in this - * group. The search will end when we found the start of next - * possible reservable space is out of this boundary. - * This could handle the cross boundary reservation window - * request. - * - * basically we search from the given range, rather than the whole - * reservation double linked list, (start_block, last_block) - * to find a free region that is of my size and has not - * been reserved. - * - */ -static int find_next_reservable_window( - struct ext3_reserve_window_node *search_head, - struct ext3_reserve_window_node *my_rsv, - struct super_block * sb, - ext3_fsblk_t start_block, - ext3_fsblk_t last_block) -{ - struct rb_node *next; - struct ext3_reserve_window_node *rsv, *prev; - ext3_fsblk_t cur; - int size = my_rsv->rsv_goal_size; - - /* TODO: make the start of the reservation window byte-aligned */ - /* cur = *start_block & ~7;*/ - cur = start_block; - rsv = search_head; - if (!rsv) - return -1; - - while (1) { - if (cur <= rsv->rsv_end) - cur = rsv->rsv_end + 1; - - /* TODO? - * in the case we could not find a reservable space - * that is what is expected, during the re-search, we could - * remember what's the largest reservable space we could have - * and return that one. - * - * For now it will fail if we could not find the reservable - * space with expected-size (or more)... - */ - if (cur > last_block) - return -1; /* fail */ - - prev = rsv; - next = rb_next(&rsv->rsv_node); - rsv = rb_entry(next,struct ext3_reserve_window_node,rsv_node); - - /* - * Reached the last reservation, we can just append to the - * previous one. - */ - if (!next) - break; - - if (cur + size <= rsv->rsv_start) { - /* - * Found a reserveable space big enough. We could - * have a reservation across the group boundary here - */ - break; - } - } - /* - * we come here either : - * when we reach the end of the whole list, - * and there is empty reservable space after last entry in the list. - * append it to the end of the list. - * - * or we found one reservable space in the middle of the list, - * return the reservation window that we could append to. - * succeed. - */ - - if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window))) - rsv_window_remove(sb, my_rsv); - - /* - * Let's book the whole available window for now. We will check the - * disk bitmap later and then, if there are free blocks then we adjust - * the window size if it's larger than requested. - * Otherwise, we will remove this node from the tree next time - * call find_next_reservable_window. 
- */ - my_rsv->rsv_start = cur; - my_rsv->rsv_end = cur + size - 1; - my_rsv->rsv_alloc_hit = 0; - - if (prev != my_rsv) - ext3_rsv_window_add(sb, my_rsv); - - return 0; -} - -/** - * alloc_new_reservation()--allocate a new reservation window - * - * To make a new reservation, we search part of the filesystem - * reservation list (the list that inside the group). We try to - * allocate a new reservation window near the allocation goal, - * or the beginning of the group, if there is no goal. - * - * We first find a reservable space after the goal, then from - * there, we check the bitmap for the first free block after - * it. If there is no free block until the end of group, then the - * whole group is full, we failed. Otherwise, check if the free - * block is inside the expected reservable space, if so, we - * succeed. - * If the first free block is outside the reservable space, then - * start from the first free block, we search for next available - * space, and go on. - * - * on succeed, a new reservation will be found and inserted into the list - * It contains at least one free block, and it does not overlap with other - * reservation windows. - * - * failed: we failed to find a reservation window in this group - * - * @my_rsv: the reservation window - * - * @grp_goal: The goal (group-relative). It is where the search for a - * free reservable space should start from. - * if we have a grp_goal(grp_goal >0 ), then start from there, - * no grp_goal(grp_goal = -1), we start from the first block - * of the group. - * - * @sb: the super block - * @group: the group we are trying to allocate in - * @bitmap_bh: the block group block bitmap - * - */ -static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, - ext3_grpblk_t grp_goal, struct super_block *sb, - unsigned int group, struct buffer_head *bitmap_bh) -{ - struct ext3_reserve_window_node *search_head; - ext3_fsblk_t group_first_block, group_end_block, start_block; - ext3_grpblk_t first_free_block; - struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root; - unsigned long size; - int ret; - spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; - - group_first_block = ext3_group_first_block_no(sb, group); - group_end_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1); - - if (grp_goal < 0) - start_block = group_first_block; - else - start_block = grp_goal + group_first_block; - - trace_ext3_alloc_new_reservation(sb, start_block); - size = my_rsv->rsv_goal_size; - - if (!rsv_is_empty(&my_rsv->rsv_window)) { - /* - * if the old reservation is cross group boundary - * and if the goal is inside the old reservation window, - * we will come here when we just failed to allocate from - * the first part of the window. We still have another part - * that belongs to the next group. In this case, there is no - * point to discard our window and try to allocate a new one - * in this group(which will fail). we should - * keep the reservation window, just simply move on. - * - * Maybe we could shift the start block of the reservation - * window to the first block of next group. 
- */ - - if ((my_rsv->rsv_start <= group_end_block) && - (my_rsv->rsv_end > group_end_block) && - (start_block >= my_rsv->rsv_start)) - return -1; - - if ((my_rsv->rsv_alloc_hit > - (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) { - /* - * if the previously allocation hit ratio is - * greater than 1/2, then we double the size of - * the reservation window the next time, - * otherwise we keep the same size window - */ - size = size * 2; - if (size > EXT3_MAX_RESERVE_BLOCKS) - size = EXT3_MAX_RESERVE_BLOCKS; - my_rsv->rsv_goal_size= size; - } - } - - spin_lock(rsv_lock); - /* - * shift the search start to the window near the goal block - */ - search_head = search_reserve_window(fs_rsv_root, start_block); - - /* - * find_next_reservable_window() simply finds a reservable window - * inside the given range(start_block, group_end_block). - * - * To make sure the reservation window has a free bit inside it, we - * need to check the bitmap after we found a reservable window. - */ -retry: - ret = find_next_reservable_window(search_head, my_rsv, sb, - start_block, group_end_block); - - if (ret == -1) { - if (!rsv_is_empty(&my_rsv->rsv_window)) - rsv_window_remove(sb, my_rsv); - spin_unlock(rsv_lock); - return -1; - } - - /* - * On success, find_next_reservable_window() returns the - * reservation window where there is a reservable space after it. - * Before we reserve this reservable space, we need - * to make sure there is at least a free block inside this region. - * - * searching the first free bit on the block bitmap and copy of - * last committed bitmap alternatively, until we found a allocatable - * block. Search start from the start block of the reservable space - * we just found. - */ - spin_unlock(rsv_lock); - first_free_block = bitmap_search_next_usable_block( - my_rsv->rsv_start - group_first_block, - bitmap_bh, group_end_block - group_first_block + 1); - - if (first_free_block < 0) { - /* - * no free block left on the bitmap, no point - * to reserve the space. return failed. - */ - spin_lock(rsv_lock); - if (!rsv_is_empty(&my_rsv->rsv_window)) - rsv_window_remove(sb, my_rsv); - spin_unlock(rsv_lock); - return -1; /* failed */ - } - - start_block = first_free_block + group_first_block; - /* - * check if the first free block is within the - * free space we just reserved - */ - if (start_block >= my_rsv->rsv_start && - start_block <= my_rsv->rsv_end) { - trace_ext3_reserved(sb, start_block, my_rsv); - return 0; /* success */ - } - /* - * if the first free bit we found is out of the reservable space - * continue search for next reservable space, - * start from where the free block is, - * we also shift the list head to where we stopped last time - */ - search_head = my_rsv; - spin_lock(rsv_lock); - goto retry; -} - -/** - * try_to_extend_reservation() - * @my_rsv: given reservation window - * @sb: super block - * @size: the delta to extend - * - * Attempt to expand the reservation window large enough to have - * required number of free blocks - * - * Since ext3_try_to_allocate() will always allocate blocks within - * the reservation window range, if the window size is too small, - * multiple blocks allocation has to stop at the end of the reservation - * window. 
To make this more efficient, given the total number of - * blocks needed and the current size of the window, we try to - * expand the reservation window size if necessary on a best-effort - * basis before ext3_new_blocks() tries to allocate blocks, - */ -static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv, - struct super_block *sb, int size) -{ - struct ext3_reserve_window_node *next_rsv; - struct rb_node *next; - spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; - - if (!spin_trylock(rsv_lock)) - return; - - next = rb_next(&my_rsv->rsv_node); - - if (!next) - my_rsv->rsv_end += size; - else { - next_rsv = rb_entry(next, struct ext3_reserve_window_node, rsv_node); - - if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size) - my_rsv->rsv_end += size; - else - my_rsv->rsv_end = next_rsv->rsv_start - 1; - } - spin_unlock(rsv_lock); -} - -/** - * ext3_try_to_allocate_with_rsv() - * @sb: superblock - * @handle: handle to this transaction - * @group: given allocation block group - * @bitmap_bh: bufferhead holds the block bitmap - * @grp_goal: given target block within the group - * @my_rsv: reservation window - * @count: target number of blocks to allocate - * @errp: pointer to store the error code - * - * This is the main function used to allocate a new block and its reservation - * window. - * - * Each time when a new block allocation is need, first try to allocate from - * its own reservation. If it does not have a reservation window, instead of - * looking for a free bit on bitmap first, then look up the reservation list to - * see if it is inside somebody else's reservation window, we try to allocate a - * reservation window for it starting from the goal first. Then do the block - * allocation within the reservation window. - * - * This will avoid keeping on searching the reservation list again and - * again when somebody is looking for a free block (without - * reservation), and there are lots of free blocks, but they are all - * being reserved. - * - * We use a red-black tree for the per-filesystem reservation list. - * - */ -static ext3_grpblk_t -ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, - unsigned int group, struct buffer_head *bitmap_bh, - ext3_grpblk_t grp_goal, - struct ext3_reserve_window_node * my_rsv, - unsigned long *count, int *errp) -{ - ext3_fsblk_t group_first_block, group_last_block; - ext3_grpblk_t ret = 0; - int fatal; - unsigned long num = *count; - - *errp = 0; - - /* - * Make sure we use undo access for the bitmap, because it is critical - * that we do the frozen_data COW on bitmap buffers in all cases even - * if the buffer is in BJ_Forget state in the committing transaction. 
- */ - BUFFER_TRACE(bitmap_bh, "get undo access for new block"); - fatal = ext3_journal_get_undo_access(handle, bitmap_bh); - if (fatal) { - *errp = fatal; - return -1; - } - - /* - * we don't deal with reservation when - * filesystem is mounted without reservation - * or the file is not a regular file - * or last attempt to allocate a block with reservation turned on failed - */ - if (my_rsv == NULL ) { - ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, - grp_goal, count, NULL); - goto out; - } - /* - * grp_goal is a group relative block number (if there is a goal) - * 0 <= grp_goal < EXT3_BLOCKS_PER_GROUP(sb) - * first block is a filesystem wide block number - * first block is the block number of the first block in this group - */ - group_first_block = ext3_group_first_block_no(sb, group); - group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1); - - /* - * Basically we will allocate a new block from inode's reservation - * window. - * - * We need to allocate a new reservation window, if: - * a) inode does not have a reservation window; or - * b) last attempt to allocate a block from existing reservation - * failed; or - * c) we come here with a goal and with a reservation window - * - * We do not need to allocate a new reservation window if we come here - * at the beginning with a goal and the goal is inside the window, or - * we don't have a goal but already have a reservation window. - * then we could go to allocate from the reservation window directly. - */ - while (1) { - if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) || - !goal_in_my_reservation(&my_rsv->rsv_window, - grp_goal, group, sb)) { - if (my_rsv->rsv_goal_size < *count) - my_rsv->rsv_goal_size = *count; - ret = alloc_new_reservation(my_rsv, grp_goal, sb, - group, bitmap_bh); - if (ret < 0) - break; /* failed */ - - if (!goal_in_my_reservation(&my_rsv->rsv_window, - grp_goal, group, sb)) - grp_goal = -1; - } else if (grp_goal >= 0) { - int curr = my_rsv->rsv_end - - (grp_goal + group_first_block) + 1; - - if (curr < *count) - try_to_extend_reservation(my_rsv, sb, - *count - curr); - } - - if ((my_rsv->rsv_start > group_last_block) || - (my_rsv->rsv_end < group_first_block)) { - rsv_window_dump(&EXT3_SB(sb)->s_rsv_window_root, 1); - BUG(); - } - ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, - grp_goal, &num, &my_rsv->rsv_window); - if (ret >= 0) { - my_rsv->rsv_alloc_hit += num; - *count = num; - break; /* succeed */ - } - num = *count; - } -out: - if (ret >= 0) { - BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for " - "bitmap block"); - fatal = ext3_journal_dirty_metadata(handle, bitmap_bh); - if (fatal) { - *errp = fatal; - return -1; - } - return ret; - } - - BUFFER_TRACE(bitmap_bh, "journal_release_buffer"); - ext3_journal_release_buffer(handle, bitmap_bh); - return ret; -} - -/** - * ext3_has_free_blocks() - * @sbi: in-core super block structure. - * - * Check if filesystem has at least 1 free block available for allocation. 
- */ -static int ext3_has_free_blocks(struct ext3_sb_info *sbi, int use_reservation) -{ - ext3_fsblk_t free_blocks, root_blocks; - - free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); - root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); - if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && - !use_reservation && !uid_eq(sbi->s_resuid, current_fsuid()) && - (gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) || - !in_group_p (sbi->s_resgid))) { - return 0; - } - return 1; -} - -/** - * ext3_should_retry_alloc() - * @sb: super block - * @retries number of attemps has been made - * - * ext3_should_retry_alloc() is called when ENOSPC is returned, and if - * it is profitable to retry the operation, this function will wait - * for the current or committing transaction to complete, and then - * return TRUE. - * - * if the total number of retries exceed three times, return FALSE. - */ -int ext3_should_retry_alloc(struct super_block *sb, int *retries) -{ - if (!ext3_has_free_blocks(EXT3_SB(sb), 0) || (*retries)++ > 3) - return 0; - - jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); - - return journal_force_commit_nested(EXT3_SB(sb)->s_journal); -} - -/** - * ext3_new_blocks() -- core block(s) allocation function - * @handle: handle to this transaction - * @inode: file inode - * @goal: given target block(filesystem wide) - * @count: target number of blocks to allocate - * @errp: error code - * - * ext3_new_blocks uses a goal block to assist allocation. It tries to - * allocate block(s) from the block group contains the goal block first. If that - * fails, it will try to allocate block(s) from other block groups without - * any specific goal block. - * - */ -ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, - ext3_fsblk_t goal, unsigned long *count, int *errp) -{ - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *gdp_bh; - int group_no; - int goal_group; - ext3_grpblk_t grp_target_blk; /* blockgroup relative goal block */ - ext3_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/ - ext3_fsblk_t ret_block; /* filesyetem-wide allocated block */ - int bgi; /* blockgroup iteration index */ - int fatal = 0, err; - int performed_allocation = 0; - ext3_grpblk_t free_blocks; /* number of free blocks in a group */ - struct super_block *sb; - struct ext3_group_desc *gdp; - struct ext3_super_block *es; - struct ext3_sb_info *sbi; - struct ext3_reserve_window_node *my_rsv = NULL; - struct ext3_block_alloc_info *block_i; - unsigned short windowsz = 0; -#ifdef EXT3FS_DEBUG - static int goal_hits, goal_attempts; -#endif - unsigned long ngroups; - unsigned long num = *count; - - *errp = -ENOSPC; - sb = inode->i_sb; - - /* - * Check quota for allocation of this block. 
- */ - err = dquot_alloc_block(inode, num); - if (err) { - *errp = err; - return 0; - } - - trace_ext3_request_blocks(inode, goal, num); - - sbi = EXT3_SB(sb); - es = sbi->s_es; - ext3_debug("goal=%lu.\n", goal); - /* - * Allocate a block from reservation only when - * filesystem is mounted with reservation(default,-o reservation), and - * it's a regular file, and - * the desired window size is greater than 0 (One could use ioctl - * command EXT3_IOC_SETRSVSZ to set the window size to 0 to turn off - * reservation on that particular file) - */ - block_i = EXT3_I(inode)->i_block_alloc_info; - if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) - my_rsv = &block_i->rsv_window_node; - - if (!ext3_has_free_blocks(sbi, IS_NOQUOTA(inode))) { - *errp = -ENOSPC; - goto out; - } - - /* - * First, test whether the goal block is free. - */ - if (goal < le32_to_cpu(es->s_first_data_block) || - goal >= le32_to_cpu(es->s_blocks_count)) - goal = le32_to_cpu(es->s_first_data_block); - group_no = (goal - le32_to_cpu(es->s_first_data_block)) / - EXT3_BLOCKS_PER_GROUP(sb); - goal_group = group_no; -retry_alloc: - gdp = ext3_get_group_desc(sb, group_no, &gdp_bh); - if (!gdp) - goto io_error; - - free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); - /* - * if there is not enough free blocks to make a new resevation - * turn off reservation for this allocation - */ - if (my_rsv && (free_blocks < windowsz) - && (free_blocks > 0) - && (rsv_is_empty(&my_rsv->rsv_window))) - my_rsv = NULL; - - if (free_blocks > 0) { - grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) % - EXT3_BLOCKS_PER_GROUP(sb)); - bitmap_bh = read_block_bitmap(sb, group_no); - if (!bitmap_bh) - goto io_error; - grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle, - group_no, bitmap_bh, grp_target_blk, - my_rsv, &num, &fatal); - if (fatal) - goto out; - if (grp_alloc_blk >= 0) - goto allocated; - } - - ngroups = EXT3_SB(sb)->s_groups_count; - smp_rmb(); - - /* - * Now search the rest of the groups. We assume that - * group_no and gdp correctly point to the last group visited. - */ - for (bgi = 0; bgi < ngroups; bgi++) { - group_no++; - if (group_no >= ngroups) - group_no = 0; - gdp = ext3_get_group_desc(sb, group_no, &gdp_bh); - if (!gdp) - goto io_error; - free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); - /* - * skip this group (and avoid loading bitmap) if there - * are no free blocks - */ - if (!free_blocks) - continue; - /* - * skip this group if the number of - * free blocks is less than half of the reservation - * window size. - */ - if (my_rsv && (free_blocks <= (windowsz/2))) - continue; - - brelse(bitmap_bh); - bitmap_bh = read_block_bitmap(sb, group_no); - if (!bitmap_bh) - goto io_error; - /* - * try to allocate block(s) from this group, without a goal(-1). - */ - grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle, - group_no, bitmap_bh, -1, my_rsv, - &num, &fatal); - if (fatal) - goto out; - if (grp_alloc_blk >= 0) - goto allocated; - } - /* - * We may end up a bogus earlier ENOSPC error due to - * filesystem is "full" of reservations, but - * there maybe indeed free blocks available on disk - * In this case, we just forget about the reservations - * just do block allocation as without reservations. 
- */ - if (my_rsv) { - my_rsv = NULL; - windowsz = 0; - group_no = goal_group; - goto retry_alloc; - } - /* No space left on the device */ - *errp = -ENOSPC; - goto out; - -allocated: - - ext3_debug("using block group %d(%d)\n", - group_no, gdp->bg_free_blocks_count); - - BUFFER_TRACE(gdp_bh, "get_write_access"); - fatal = ext3_journal_get_write_access(handle, gdp_bh); - if (fatal) - goto out; - - ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no); - - if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) || - in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) || - in_range(ret_block, le32_to_cpu(gdp->bg_inode_table), - EXT3_SB(sb)->s_itb_per_group) || - in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table), - EXT3_SB(sb)->s_itb_per_group)) { - ext3_error(sb, "ext3_new_block", - "Allocating block in system zone - " - "blocks from "E3FSBLK", length %lu", - ret_block, num); - /* - * claim_block() marked the blocks we allocated as in use. So we - * may want to selectively mark some of the blocks as free. - */ - goto retry_alloc; - } - - performed_allocation = 1; - -#ifdef CONFIG_JBD_DEBUG - { - struct buffer_head *debug_bh; - - /* Record bitmap buffer state in the newly allocated block */ - debug_bh = sb_find_get_block(sb, ret_block); - if (debug_bh) { - BUFFER_TRACE(debug_bh, "state when allocated"); - BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state"); - brelse(debug_bh); - } - } - jbd_lock_bh_state(bitmap_bh); - spin_lock(sb_bgl_lock(sbi, group_no)); - if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) { - int i; - - for (i = 0; i < num; i++) { - if (ext3_test_bit(grp_alloc_blk+i, - bh2jh(bitmap_bh)->b_committed_data)) { - printk("%s: block was unexpectedly set in " - "b_committed_data\n", __func__); - } - } - } - ext3_debug("found bit %d\n", grp_alloc_blk); - spin_unlock(sb_bgl_lock(sbi, group_no)); - jbd_unlock_bh_state(bitmap_bh); -#endif - - if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) { - ext3_error(sb, "ext3_new_block", - "block("E3FSBLK") >= blocks count(%d) - " - "block_group = %d, es == %p ", ret_block, - le32_to_cpu(es->s_blocks_count), group_no, es); - goto out; - } - - /* - * It is up to the caller to add the new buffer to a journal - * list of some description. We don't know in advance whether - * the caller wants to use it as metadata or data. - */ - ext3_debug("allocating block %lu. 
Goal hits %d of %d.\n", - ret_block, goal_hits, goal_attempts); - - spin_lock(sb_bgl_lock(sbi, group_no)); - le16_add_cpu(&gdp->bg_free_blocks_count, -num); - spin_unlock(sb_bgl_lock(sbi, group_no)); - percpu_counter_sub(&sbi->s_freeblocks_counter, num); - - BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); - fatal = ext3_journal_dirty_metadata(handle, gdp_bh); - if (fatal) - goto out; - - *errp = 0; - brelse(bitmap_bh); - - if (num < *count) { - dquot_free_block(inode, *count-num); - *count = num; - } - - trace_ext3_allocate_blocks(inode, goal, num, - (unsigned long long)ret_block); - - return ret_block; - -io_error: - *errp = -EIO; -out: - if (fatal) { - *errp = fatal; - ext3_std_error(sb, fatal); - } - /* - * Undo the block allocation - */ - if (!performed_allocation) - dquot_free_block(inode, *count); - brelse(bitmap_bh); - return 0; -} - -ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode, - ext3_fsblk_t goal, int *errp) -{ - unsigned long count = 1; - - return ext3_new_blocks(handle, inode, goal, &count, errp); -} - -/** - * ext3_count_free_blocks() -- count filesystem free blocks - * @sb: superblock - * - * Adds up the number of free blocks from each block group. - */ -ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb) -{ - ext3_fsblk_t desc_count; - struct ext3_group_desc *gdp; - int i; - unsigned long ngroups = EXT3_SB(sb)->s_groups_count; -#ifdef EXT3FS_DEBUG - struct ext3_super_block *es; - ext3_fsblk_t bitmap_count; - unsigned long x; - struct buffer_head *bitmap_bh = NULL; - - es = EXT3_SB(sb)->s_es; - desc_count = 0; - bitmap_count = 0; - gdp = NULL; - - smp_rmb(); - for (i = 0; i < ngroups; i++) { - gdp = ext3_get_group_desc(sb, i, NULL); - if (!gdp) - continue; - desc_count += le16_to_cpu(gdp->bg_free_blocks_count); - brelse(bitmap_bh); - bitmap_bh = read_block_bitmap(sb, i); - if (bitmap_bh == NULL) - continue; - - x = ext3_count_free(bitmap_bh, sb->s_blocksize); - printk("group %d: stored = %d, counted = %lu\n", - i, le16_to_cpu(gdp->bg_free_blocks_count), x); - bitmap_count += x; - } - brelse(bitmap_bh); - printk("ext3_count_free_blocks: stored = "E3FSBLK - ", computed = "E3FSBLK", "E3FSBLK"\n", - (ext3_fsblk_t)le32_to_cpu(es->s_free_blocks_count), - desc_count, bitmap_count); - return bitmap_count; -#else - desc_count = 0; - smp_rmb(); - for (i = 0; i < ngroups; i++) { - gdp = ext3_get_group_desc(sb, i, NULL); - if (!gdp) - continue; - desc_count += le16_to_cpu(gdp->bg_free_blocks_count); - } - - return desc_count; -#endif -} - -static inline int test_root(int a, int b) -{ - int num = b; - - while (a > num) - num *= b; - return num == a; -} - -static int ext3_group_sparse(int group) -{ - if (group <= 1) - return 1; - if (!(group & 1)) - return 0; - return (test_root(group, 7) || test_root(group, 5) || - test_root(group, 3)); -} - -/** - * ext3_bg_has_super - number of blocks used by the superblock in group - * @sb: superblock for filesystem - * @group: group number to check - * - * Return the number of blocks used by the superblock (primary or backup) - * in this group. Currently this will be only 0 or 1. 
- */ -int ext3_bg_has_super(struct super_block *sb, int group) -{ - if (EXT3_HAS_RO_COMPAT_FEATURE(sb, - EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER) && - !ext3_group_sparse(group)) - return 0; - return 1; -} - -static unsigned long ext3_bg_num_gdb_meta(struct super_block *sb, int group) -{ - unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb); - unsigned long first = metagroup * EXT3_DESC_PER_BLOCK(sb); - unsigned long last = first + EXT3_DESC_PER_BLOCK(sb) - 1; - - if (group == first || group == first + 1 || group == last) - return 1; - return 0; -} - -static unsigned long ext3_bg_num_gdb_nometa(struct super_block *sb, int group) -{ - return ext3_bg_has_super(sb, group) ? EXT3_SB(sb)->s_gdb_count : 0; -} - -/** - * ext3_bg_num_gdb - number of blocks used by the group table in group - * @sb: superblock for filesystem - * @group: group number to check - * - * Return the number of blocks used by the group descriptor table - * (primary or backup) in this group. In the future there may be a - * different number of descriptor blocks in each group. - */ -unsigned long ext3_bg_num_gdb(struct super_block *sb, int group) -{ - unsigned long first_meta_bg = - le32_to_cpu(EXT3_SB(sb)->s_es->s_first_meta_bg); - unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb); - - if (!EXT3_HAS_INCOMPAT_FEATURE(sb,EXT3_FEATURE_INCOMPAT_META_BG) || - metagroup < first_meta_bg) - return ext3_bg_num_gdb_nometa(sb,group); - - return ext3_bg_num_gdb_meta(sb,group); - -} - -/** - * ext3_trim_all_free -- function to trim all free space in alloc. group - * @sb: super block for file system - * @group: allocation group to trim - * @start: first group block to examine - * @max: last group block to examine - * @gdp: allocation group description structure - * @minblocks: minimum extent block count - * - * ext3_trim_all_free walks through group's block bitmap searching for free - * blocks. When the free block is found, it tries to allocate this block and - * consequent free block to get the biggest free extent possible, until it - * reaches any used block. Then issue a TRIM command on this extent and free - * the extent in the block bitmap. This is done until whole group is scanned. 
- */ -static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, - unsigned int group, - ext3_grpblk_t start, ext3_grpblk_t max, - ext3_grpblk_t minblocks) -{ - handle_t *handle; - ext3_grpblk_t next, free_blocks, bit, freed, count = 0; - ext3_fsblk_t discard_block; - struct ext3_sb_info *sbi; - struct buffer_head *gdp_bh, *bitmap_bh = NULL; - struct ext3_group_desc *gdp; - int err = 0, ret = 0; - - /* - * We will update one block bitmap, and one group descriptor - */ - handle = ext3_journal_start_sb(sb, 2); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - bitmap_bh = read_block_bitmap(sb, group); - if (!bitmap_bh) { - err = -EIO; - goto err_out; - } - - BUFFER_TRACE(bitmap_bh, "getting undo access"); - err = ext3_journal_get_undo_access(handle, bitmap_bh); - if (err) - goto err_out; - - gdp = ext3_get_group_desc(sb, group, &gdp_bh); - if (!gdp) { - err = -EIO; - goto err_out; - } - - BUFFER_TRACE(gdp_bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, gdp_bh); - if (err) - goto err_out; - - free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); - sbi = EXT3_SB(sb); - - /* Walk through the whole group */ - while (start <= max) { - start = bitmap_search_next_usable_block(start, bitmap_bh, max); - if (start < 0) - break; - next = start; - - /* - * Allocate contiguous free extents by setting bits in the - * block bitmap - */ - while (next <= max - && claim_block(sb_bgl_lock(sbi, group), - next, bitmap_bh)) { - next++; - } - - /* We did not claim any blocks */ - if (next == start) - continue; - - discard_block = (ext3_fsblk_t)start + - ext3_group_first_block_no(sb, group); - - /* Update counters */ - spin_lock(sb_bgl_lock(sbi, group)); - le16_add_cpu(&gdp->bg_free_blocks_count, start - next); - spin_unlock(sb_bgl_lock(sbi, group)); - percpu_counter_sub(&sbi->s_freeblocks_counter, next - start); - - free_blocks -= next - start; - /* Do not issue a TRIM on extents smaller than minblocks */ - if ((next - start) < minblocks) - goto free_extent; - - trace_ext3_discard_blocks(sb, discard_block, next - start); - /* Send the TRIM command down to the device */ - err = sb_issue_discard(sb, discard_block, next - start, - GFP_NOFS, 0); - count += (next - start); -free_extent: - freed = 0; - - /* - * Clear bits in the bitmap - */ - for (bit = start; bit < next; bit++) { - BUFFER_TRACE(bitmap_bh, "clear bit"); - if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group), - bit, bitmap_bh->b_data)) { - ext3_error(sb, __func__, - "bit already cleared for block "E3FSBLK, - (unsigned long)bit); - BUFFER_TRACE(bitmap_bh, "bit already cleared"); - } else { - freed++; - } - } - - /* Update couters */ - spin_lock(sb_bgl_lock(sbi, group)); - le16_add_cpu(&gdp->bg_free_blocks_count, freed); - spin_unlock(sb_bgl_lock(sbi, group)); - percpu_counter_add(&sbi->s_freeblocks_counter, freed); - - start = next; - if (err < 0) { - if (err != -EOPNOTSUPP) - ext3_warning(sb, __func__, "Discard command " - "returned error %d\n", err); - break; - } - - if (fatal_signal_pending(current)) { - err = -ERESTARTSYS; - break; - } - - cond_resched(); - - /* No more suitable extents */ - if (free_blocks < minblocks) - break; - } - - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - ret = ext3_journal_dirty_metadata(handle, bitmap_bh); - if (!err) - err = ret; - - /* And the group descriptor block */ - BUFFER_TRACE(gdp_bh, "dirtied group descriptor block"); - ret = ext3_journal_dirty_metadata(handle, gdp_bh); - if (!err) - err = ret; - - ext3_debug("trimmed %d blocks in the group 
%d\n", - count, group); - -err_out: - if (err) - count = err; - ext3_journal_stop(handle); - brelse(bitmap_bh); - - return count; -} - -/** - * ext3_trim_fs() -- trim ioctl handle function - * @sb: superblock for filesystem - * @start: First Byte to trim - * @len: number of Bytes to trim from start - * @minlen: minimum extent length in Bytes - * - * ext3_trim_fs goes through all allocation groups containing Bytes from - * start to start+len. For each such a group ext3_trim_all_free function - * is invoked to trim all free space. - */ -int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) -{ - ext3_grpblk_t last_block, first_block; - unsigned long group, first_group, last_group; - struct ext3_group_desc *gdp; - struct ext3_super_block *es = EXT3_SB(sb)->s_es; - uint64_t start, minlen, end, trimmed = 0; - ext3_fsblk_t first_data_blk = - le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block); - ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count); - int ret = 0; - - start = range->start >> sb->s_blocksize_bits; - end = start + (range->len >> sb->s_blocksize_bits) - 1; - minlen = range->minlen >> sb->s_blocksize_bits; - - if (minlen > EXT3_BLOCKS_PER_GROUP(sb) || - start >= max_blks || - range->len < sb->s_blocksize) - return -EINVAL; - if (end >= max_blks) - end = max_blks - 1; - if (end <= first_data_blk) - goto out; - if (start < first_data_blk) - start = first_data_blk; - - smp_rmb(); - - /* Determine first and last group to examine based on start and len */ - ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start, - &first_group, &first_block); - ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) end, - &last_group, &last_block); - - /* end now represents the last block to discard in this group */ - end = EXT3_BLOCKS_PER_GROUP(sb) - 1; - - for (group = first_group; group <= last_group; group++) { - gdp = ext3_get_group_desc(sb, group, NULL); - if (!gdp) - break; - - /* - * For all the groups except the last one, last block will - * always be EXT3_BLOCKS_PER_GROUP(sb)-1, so we only need to - * change it for the last group, note that last_block is - * already computed earlier by ext3_get_group_no_and_offset() - */ - if (group == last_group) - end = last_block; - - if (le16_to_cpu(gdp->bg_free_blocks_count) >= minlen) { - ret = ext3_trim_all_free(sb, group, first_block, - end, minlen); - if (ret < 0) - break; - trimmed += ret; - } - - /* - * For every group except the first one, we are sure - * that the first block to discard will be block #0. 
- */ - first_block = 0; - } - - if (ret > 0) - ret = 0; - -out: - range->len = trimmed * sb->s_blocksize; - return ret; -} diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c deleted file mode 100644 index ef9c643e8e9d..000000000000 --- a/fs/ext3/bitmap.c +++ /dev/null @@ -1,20 +0,0 @@ -/* - * linux/fs/ext3/bitmap.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - */ - -#include "ext3.h" - -#ifdef EXT3FS_DEBUG - -unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars) -{ - return numchars * BITS_PER_BYTE - memweight(map->b_data, numchars); -} - -#endif /* EXT3FS_DEBUG */ - diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c deleted file mode 100644 index 17742eed2c16..000000000000 --- a/fs/ext3/dir.c +++ /dev/null @@ -1,537 +0,0 @@ -/* - * linux/fs/ext3/dir.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/dir.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * ext3 directory handling functions - * - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 - * - * Hash Tree Directory indexing (c) 2001 Daniel Phillips - * - */ - -#include <linux/compat.h> -#include "ext3.h" - -static unsigned char ext3_filetype_table[] = { - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK -}; - -static int ext3_dx_readdir(struct file *, struct dir_context *); - -static unsigned char get_dtype(struct super_block *sb, int filetype) -{ - if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) || - (filetype >= EXT3_FT_MAX)) - return DT_UNKNOWN; - - return (ext3_filetype_table[filetype]); -} - -/** - * Check if the given dir-inode refers to an htree-indexed directory - * (or a directory which could potentially get converted to use htree - * indexing). - * - * Return 1 if it is a dx dir, 0 if not - */ -static int is_dx_dir(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - - if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, - EXT3_FEATURE_COMPAT_DIR_INDEX) && - ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || - ((inode->i_size >> sb->s_blocksize_bits) == 1))) - return 1; - - return 0; -} - -int ext3_check_dir_entry (const char * function, struct inode * dir, - struct ext3_dir_entry_2 * de, - struct buffer_head * bh, - unsigned long offset) -{ - const char * error_msg = NULL; - const int rlen = ext3_rec_len_from_disk(de->rec_len); - - if (unlikely(rlen < EXT3_DIR_REC_LEN(1))) - error_msg = "rec_len is smaller than minimal"; - else if (unlikely(rlen % 4 != 0)) - error_msg = "rec_len % 4 != 0"; - else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len))) - error_msg = "rec_len is too small for name_len"; - else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize))) - error_msg = "directory entry across blocks"; - else if (unlikely(le32_to_cpu(de->inode) > - le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))) - error_msg = "inode out of bounds"; - - if (unlikely(error_msg != NULL)) - ext3_error (dir->i_sb, function, - "bad entry in directory #%lu: %s - " - "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", - dir->i_ino, error_msg, offset, - (unsigned long) le32_to_cpu(de->inode), - rlen, de->name_len); - - return error_msg == NULL ? 
1 : 0; -} - -static int ext3_readdir(struct file *file, struct dir_context *ctx) -{ - unsigned long offset; - int i; - struct ext3_dir_entry_2 *de; - int err; - struct inode *inode = file_inode(file); - struct super_block *sb = inode->i_sb; - int dir_has_error = 0; - - if (is_dx_dir(inode)) { - err = ext3_dx_readdir(file, ctx); - if (err != ERR_BAD_DX_DIR) - return err; - /* - * We don't set the inode dirty flag since it's not - * critical that it get flushed back to the disk. - */ - EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; - } - offset = ctx->pos & (sb->s_blocksize - 1); - - while (ctx->pos < inode->i_size) { - unsigned long blk = ctx->pos >> EXT3_BLOCK_SIZE_BITS(sb); - struct buffer_head map_bh; - struct buffer_head *bh = NULL; - - map_bh.b_state = 0; - err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0); - if (err > 0) { - pgoff_t index = map_bh.b_blocknr >> - (PAGE_CACHE_SHIFT - inode->i_blkbits); - if (!ra_has_index(&file->f_ra, index)) - page_cache_sync_readahead( - sb->s_bdev->bd_inode->i_mapping, - &file->f_ra, file, - index, 1); - file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; - bh = ext3_bread(NULL, inode, blk, 0, &err); - } - - /* - * We ignore I/O errors on directories so users have a chance - * of recovering data when there's a bad sector - */ - if (!bh) { - if (!dir_has_error) { - ext3_error(sb, __func__, "directory #%lu " - "contains a hole at offset %lld", - inode->i_ino, ctx->pos); - dir_has_error = 1; - } - /* corrupt size? Maybe no more blocks to read */ - if (ctx->pos > inode->i_blocks << 9) - break; - ctx->pos += sb->s_blocksize - offset; - continue; - } - - /* If the dir block has changed since the last call to - * readdir(2), then we might be pointing to an invalid - * dirent right now. Scan from the start of the block - * to make sure. */ - if (offset && file->f_version != inode->i_version) { - for (i = 0; i < sb->s_blocksize && i < offset; ) { - de = (struct ext3_dir_entry_2 *) - (bh->b_data + i); - /* It's too expensive to do a full - * dirent test each time round this - * loop, but we do have to test at - * least that it is non-zero. A - * failure will be detected in the - * dirent test below. */ - if (ext3_rec_len_from_disk(de->rec_len) < - EXT3_DIR_REC_LEN(1)) - break; - i += ext3_rec_len_from_disk(de->rec_len); - } - offset = i; - ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) - | offset; - file->f_version = inode->i_version; - } - - while (ctx->pos < inode->i_size - && offset < sb->s_blocksize) { - de = (struct ext3_dir_entry_2 *) (bh->b_data + offset); - if (!ext3_check_dir_entry ("ext3_readdir", inode, de, - bh, offset)) { - /* On error, skip the to the - next block. */ - ctx->pos = (ctx->pos | - (sb->s_blocksize - 1)) + 1; - break; - } - offset += ext3_rec_len_from_disk(de->rec_len); - if (le32_to_cpu(de->inode)) { - if (!dir_emit(ctx, de->name, de->name_len, - le32_to_cpu(de->inode), - get_dtype(sb, de->file_type))) { - brelse(bh); - return 0; - } - } - ctx->pos += ext3_rec_len_from_disk(de->rec_len); - } - offset = 0; - brelse (bh); - if (ctx->pos < inode->i_size) - if (!dir_relax(inode)) - return 0; - } - return 0; -} - -static inline int is_32bit_api(void) -{ -#ifdef CONFIG_COMPAT - return is_compat_task(); -#else - return (BITS_PER_LONG == 32); -#endif -} - -/* - * These functions convert from the major/minor hash to an f_pos - * value for dx directories - * - * Upper layer (for example NFS) should specify FMODE_32BITHASH or - * FMODE_64BITHASH explicitly. 
On the other hand, we allow ext3 to be mounted - * directly on both 32-bit and 64-bit nodes, under such case, neither - * FMODE_32BITHASH nor FMODE_64BITHASH is specified. - */ -static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) -{ - if ((filp->f_mode & FMODE_32BITHASH) || - (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) - return major >> 1; - else - return ((__u64)(major >> 1) << 32) | (__u64)minor; -} - -static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) -{ - if ((filp->f_mode & FMODE_32BITHASH) || - (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) - return (pos << 1) & 0xffffffff; - else - return ((pos >> 32) << 1) & 0xffffffff; -} - -static inline __u32 pos2min_hash(struct file *filp, loff_t pos) -{ - if ((filp->f_mode & FMODE_32BITHASH) || - (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) - return 0; - else - return pos & 0xffffffff; -} - -/* - * Return 32- or 64-bit end-of-file for dx directories - */ -static inline loff_t ext3_get_htree_eof(struct file *filp) -{ - if ((filp->f_mode & FMODE_32BITHASH) || - (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) - return EXT3_HTREE_EOF_32BIT; - else - return EXT3_HTREE_EOF_64BIT; -} - - -/* - * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both - * non-htree and htree directories, where the "offset" is in terms - * of the filename hash value instead of the byte offset. - * - * Because we may return a 64-bit hash that is well beyond s_maxbytes, - * we need to pass the max hash as the maximum allowable offset in - * the htree directory case. - * - * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX) - * will be invalid once the directory was converted into a dx directory - */ -static loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence) -{ - struct inode *inode = file->f_mapping->host; - int dx_dir = is_dx_dir(inode); - loff_t htree_max = ext3_get_htree_eof(file); - - if (likely(dx_dir)) - return generic_file_llseek_size(file, offset, whence, - htree_max, htree_max); - else - return generic_file_llseek(file, offset, whence); -} - -/* - * This structure holds the nodes of the red-black tree used to store - * the directory entry in hash order. - */ -struct fname { - __u32 hash; - __u32 minor_hash; - struct rb_node rb_hash; - struct fname *next; - __u32 inode; - __u8 name_len; - __u8 file_type; - char name[0]; -}; - -/* - * This functoin implements a non-recursive way of freeing all of the - * nodes in the red-black tree. - */ -static void free_rb_tree_fname(struct rb_root *root) -{ - struct fname *fname, *next; - - rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) - do { - struct fname *old = fname; - fname = fname->next; - kfree(old); - } while (fname); - - *root = RB_ROOT; -} - -static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp, - loff_t pos) -{ - struct dir_private_info *p; - - p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); - if (!p) - return NULL; - p->curr_hash = pos2maj_hash(filp, pos); - p->curr_minor_hash = pos2min_hash(filp, pos); - return p; -} - -void ext3_htree_free_dir_info(struct dir_private_info *p) -{ - free_rb_tree_fname(&p->root); - kfree(p); -} - -/* - * Given a directory entry, enter it into the fname rb tree. 
- */ -int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, - __u32 minor_hash, - struct ext3_dir_entry_2 *dirent) -{ - struct rb_node **p, *parent = NULL; - struct fname * fname, *new_fn; - struct dir_private_info *info; - int len; - - info = (struct dir_private_info *) dir_file->private_data; - p = &info->root.rb_node; - - /* Create and allocate the fname structure */ - len = sizeof(struct fname) + dirent->name_len + 1; - new_fn = kzalloc(len, GFP_KERNEL); - if (!new_fn) - return -ENOMEM; - new_fn->hash = hash; - new_fn->minor_hash = minor_hash; - new_fn->inode = le32_to_cpu(dirent->inode); - new_fn->name_len = dirent->name_len; - new_fn->file_type = dirent->file_type; - memcpy(new_fn->name, dirent->name, dirent->name_len); - new_fn->name[dirent->name_len] = 0; - - while (*p) { - parent = *p; - fname = rb_entry(parent, struct fname, rb_hash); - - /* - * If the hash and minor hash match up, then we put - * them on a linked list. This rarely happens... - */ - if ((new_fn->hash == fname->hash) && - (new_fn->minor_hash == fname->minor_hash)) { - new_fn->next = fname->next; - fname->next = new_fn; - return 0; - } - - if (new_fn->hash < fname->hash) - p = &(*p)->rb_left; - else if (new_fn->hash > fname->hash) - p = &(*p)->rb_right; - else if (new_fn->minor_hash < fname->minor_hash) - p = &(*p)->rb_left; - else /* if (new_fn->minor_hash > fname->minor_hash) */ - p = &(*p)->rb_right; - } - - rb_link_node(&new_fn->rb_hash, parent, p); - rb_insert_color(&new_fn->rb_hash, &info->root); - return 0; -} - - - -/* - * This is a helper function for ext3_dx_readdir. It calls filldir - * for all entres on the fname linked list. (Normally there is only - * one entry on the linked list, unless there are 62 bit hash collisions.) - */ -static bool call_filldir(struct file *file, struct dir_context *ctx, - struct fname *fname) -{ - struct dir_private_info *info = file->private_data; - struct inode *inode = file_inode(file); - struct super_block *sb = inode->i_sb; - - if (!fname) { - printk("call_filldir: called with null fname?!?\n"); - return true; - } - ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); - while (fname) { - if (!dir_emit(ctx, fname->name, fname->name_len, - fname->inode, - get_dtype(sb, fname->file_type))) { - info->extra_fname = fname; - return false; - } - fname = fname->next; - } - return true; -} - -static int ext3_dx_readdir(struct file *file, struct dir_context *ctx) -{ - struct dir_private_info *info = file->private_data; - struct inode *inode = file_inode(file); - struct fname *fname; - int ret; - - if (!info) { - info = ext3_htree_create_dir_info(file, ctx->pos); - if (!info) - return -ENOMEM; - file->private_data = info; - } - - if (ctx->pos == ext3_get_htree_eof(file)) - return 0; /* EOF */ - - /* Some one has messed with f_pos; reset the world */ - if (info->last_pos != ctx->pos) { - free_rb_tree_fname(&info->root); - info->curr_node = NULL; - info->extra_fname = NULL; - info->curr_hash = pos2maj_hash(file, ctx->pos); - info->curr_minor_hash = pos2min_hash(file, ctx->pos); - } - - /* - * If there are any leftover names on the hash collision - * chain, return them first. - */ - if (info->extra_fname) { - if (!call_filldir(file, ctx, info->extra_fname)) - goto finished; - info->extra_fname = NULL; - goto next_node; - } else if (!info->curr_node) - info->curr_node = rb_first(&info->root); - - while (1) { - /* - * Fill the rbtree if we have no more entries, - * or the inode has changed since we last read in the - * cached entries. 
- */ - if ((!info->curr_node) || - (file->f_version != inode->i_version)) { - info->curr_node = NULL; - free_rb_tree_fname(&info->root); - file->f_version = inode->i_version; - ret = ext3_htree_fill_tree(file, info->curr_hash, - info->curr_minor_hash, - &info->next_hash); - if (ret < 0) - return ret; - if (ret == 0) { - ctx->pos = ext3_get_htree_eof(file); - break; - } - info->curr_node = rb_first(&info->root); - } - - fname = rb_entry(info->curr_node, struct fname, rb_hash); - info->curr_hash = fname->hash; - info->curr_minor_hash = fname->minor_hash; - if (!call_filldir(file, ctx, fname)) - break; - next_node: - info->curr_node = rb_next(info->curr_node); - if (info->curr_node) { - fname = rb_entry(info->curr_node, struct fname, - rb_hash); - info->curr_hash = fname->hash; - info->curr_minor_hash = fname->minor_hash; - } else { - if (info->next_hash == ~0) { - ctx->pos = ext3_get_htree_eof(file); - break; - } - info->curr_hash = info->next_hash; - info->curr_minor_hash = 0; - } - } -finished: - info->last_pos = ctx->pos; - return 0; -} - -static int ext3_release_dir (struct inode * inode, struct file * filp) -{ - if (filp->private_data) - ext3_htree_free_dir_info(filp->private_data); - - return 0; -} - -const struct file_operations ext3_dir_operations = { - .llseek = ext3_dir_llseek, - .read = generic_read_dir, - .iterate = ext3_readdir, - .unlocked_ioctl = ext3_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ext3_compat_ioctl, -#endif - .fsync = ext3_sync_file, - .release = ext3_release_dir, -}; diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h deleted file mode 100644 index f483a80b3fe7..000000000000 --- a/fs/ext3/ext3.h +++ /dev/null @@ -1,1332 +0,0 @@ -/* - * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 - * - * Copyright 1998--1999 Red Hat corp --- All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/include/linux/minix_fs.h - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/magic.h> -#include <linux/bug.h> -#include <linux/blockgroup_lock.h> - -/* - * The second extended filesystem constants/structures - */ - -/* - * Define EXT3FS_DEBUG to produce debug messages - */ -#undef EXT3FS_DEBUG - -/* - * Define EXT3_RESERVATION to reserve data blocks for expanding files - */ -#define EXT3_DEFAULT_RESERVE_BLOCKS 8 -/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */ -#define EXT3_MAX_RESERVE_BLOCKS 1027 -#define EXT3_RESERVE_WINDOW_NOT_ALLOCATED 0 - -/* - * Debug code - */ -#ifdef EXT3FS_DEBUG -#define ext3_debug(f, a...) \ - do { \ - printk (KERN_DEBUG "EXT3-fs DEBUG (%s, %d): %s:", \ - __FILE__, __LINE__, __func__); \ - printk (KERN_DEBUG f, ## a); \ - } while (0) -#else -#define ext3_debug(f, a...) 
do {} while (0) -#endif - -/* - * Special inodes numbers - */ -#define EXT3_BAD_INO 1 /* Bad blocks inode */ -#define EXT3_ROOT_INO 2 /* Root inode */ -#define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */ -#define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */ -#define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */ -#define EXT3_JOURNAL_INO 8 /* Journal inode */ - -/* First non-reserved inode for old ext3 filesystems */ -#define EXT3_GOOD_OLD_FIRST_INO 11 - -/* - * Maximal count of links to a file - */ -#define EXT3_LINK_MAX 32000 - -/* - * Macro-instructions used to manage several block sizes - */ -#define EXT3_MIN_BLOCK_SIZE 1024 -#define EXT3_MAX_BLOCK_SIZE 65536 -#define EXT3_MIN_BLOCK_LOG_SIZE 10 -#define EXT3_BLOCK_SIZE(s) ((s)->s_blocksize) -#define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32)) -#define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) -#define EXT3_ADDR_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_addr_per_block_bits) -#define EXT3_INODE_SIZE(s) (EXT3_SB(s)->s_inode_size) -#define EXT3_FIRST_INO(s) (EXT3_SB(s)->s_first_ino) - -/* - * Macro-instructions used to manage fragments - */ -#define EXT3_MIN_FRAG_SIZE 1024 -#define EXT3_MAX_FRAG_SIZE 4096 -#define EXT3_MIN_FRAG_LOG_SIZE 10 -#define EXT3_FRAG_SIZE(s) (EXT3_SB(s)->s_frag_size) -#define EXT3_FRAGS_PER_BLOCK(s) (EXT3_SB(s)->s_frags_per_block) - -/* - * Structure of a blocks group descriptor - */ -struct ext3_group_desc -{ - __le32 bg_block_bitmap; /* Blocks bitmap block */ - __le32 bg_inode_bitmap; /* Inodes bitmap block */ - __le32 bg_inode_table; /* Inodes table block */ - __le16 bg_free_blocks_count; /* Free blocks count */ - __le16 bg_free_inodes_count; /* Free inodes count */ - __le16 bg_used_dirs_count; /* Directories count */ - __u16 bg_pad; - __le32 bg_reserved[3]; -}; - -/* - * Macro-instructions used to manage group descriptors - */ -#define EXT3_BLOCKS_PER_GROUP(s) (EXT3_SB(s)->s_blocks_per_group) -#define EXT3_DESC_PER_BLOCK(s) (EXT3_SB(s)->s_desc_per_block) -#define EXT3_INODES_PER_GROUP(s) (EXT3_SB(s)->s_inodes_per_group) -#define EXT3_DESC_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_desc_per_block_bits) - -/* - * Constants relative to the data blocks - */ -#define EXT3_NDIR_BLOCKS 12 -#define EXT3_IND_BLOCK EXT3_NDIR_BLOCKS -#define EXT3_DIND_BLOCK (EXT3_IND_BLOCK + 1) -#define EXT3_TIND_BLOCK (EXT3_DIND_BLOCK + 1) -#define EXT3_N_BLOCKS (EXT3_TIND_BLOCK + 1) - -/* - * Inode flags - */ -#define EXT3_SECRM_FL 0x00000001 /* Secure deletion */ -#define EXT3_UNRM_FL 0x00000002 /* Undelete */ -#define EXT3_COMPR_FL 0x00000004 /* Compress file */ -#define EXT3_SYNC_FL 0x00000008 /* Synchronous updates */ -#define EXT3_IMMUTABLE_FL 0x00000010 /* Immutable file */ -#define EXT3_APPEND_FL 0x00000020 /* writes to file may only append */ -#define EXT3_NODUMP_FL 0x00000040 /* do not dump file */ -#define EXT3_NOATIME_FL 0x00000080 /* do not update atime */ -/* Reserved for compression usage... 
*/ -#define EXT3_DIRTY_FL 0x00000100 -#define EXT3_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ -#define EXT3_NOCOMPR_FL 0x00000400 /* Don't compress */ -#define EXT3_ECOMPR_FL 0x00000800 /* Compression error */ -/* End compression flags --- maybe not all used */ -#define EXT3_INDEX_FL 0x00001000 /* hash-indexed directory */ -#define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */ -#define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ -#define EXT3_NOTAIL_FL 0x00008000 /* file tail should not be merged */ -#define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ -#define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ -#define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ - -#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ -#define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ - -/* Flags that should be inherited by new inodes from their parent. */ -#define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\ - EXT3_SYNC_FL | EXT3_NODUMP_FL |\ - EXT3_NOATIME_FL | EXT3_COMPRBLK_FL |\ - EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\ - EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL) - -/* Flags that are appropriate for regular files (all but dir-specific ones). */ -#define EXT3_REG_FLMASK (~(EXT3_DIRSYNC_FL | EXT3_TOPDIR_FL)) - -/* Flags that are appropriate for non-directories/regular files. */ -#define EXT3_OTHER_FLMASK (EXT3_NODUMP_FL | EXT3_NOATIME_FL) - -/* Mask out flags that are inappropriate for the given type of inode. */ -static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags) -{ - if (S_ISDIR(mode)) - return flags; - else if (S_ISREG(mode)) - return flags & EXT3_REG_FLMASK; - else - return flags & EXT3_OTHER_FLMASK; -} - -/* Used to pass group descriptor data when online resize is done */ -struct ext3_new_group_input { - __u32 group; /* Group number for this data */ - __u32 block_bitmap; /* Absolute block number of block bitmap */ - __u32 inode_bitmap; /* Absolute block number of inode bitmap */ - __u32 inode_table; /* Absolute block number of inode table start */ - __u32 blocks_count; /* Total number of blocks in this group */ - __u16 reserved_blocks; /* Number of reserved blocks in this group */ - __u16 unused; -}; - -/* The struct ext3_new_group_input in kernel space, with free_blocks_count */ -struct ext3_new_group_data { - __u32 group; - __u32 block_bitmap; - __u32 inode_bitmap; - __u32 inode_table; - __u32 blocks_count; - __u16 reserved_blocks; - __u16 unused; - __u32 free_blocks_count; -}; - - -/* - * ioctl commands - */ -#define EXT3_IOC_GETFLAGS FS_IOC_GETFLAGS -#define EXT3_IOC_SETFLAGS FS_IOC_SETFLAGS -#define EXT3_IOC_GETVERSION _IOR('f', 3, long) -#define EXT3_IOC_SETVERSION _IOW('f', 4, long) -#define EXT3_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) -#define EXT3_IOC_GROUP_ADD _IOW('f', 8,struct ext3_new_group_input) -#define EXT3_IOC_GETVERSION_OLD FS_IOC_GETVERSION -#define EXT3_IOC_SETVERSION_OLD FS_IOC_SETVERSION -#ifdef CONFIG_JBD_DEBUG -#define EXT3_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) -#endif -#define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) -#define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) - -/* - * ioctl commands in 32 bit emulation - */ -#define EXT3_IOC32_GETFLAGS FS_IOC32_GETFLAGS -#define EXT3_IOC32_SETFLAGS FS_IOC32_SETFLAGS -#define EXT3_IOC32_GETVERSION _IOR('f', 3, int) -#define EXT3_IOC32_SETVERSION _IOW('f', 4, int) -#define EXT3_IOC32_GETRSVSZ _IOR('f', 5, int) -#define EXT3_IOC32_SETRSVSZ _IOW('f', 6, int) -#define 
EXT3_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) -#ifdef CONFIG_JBD_DEBUG -#define EXT3_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) -#endif -#define EXT3_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION -#define EXT3_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION - -/* Number of supported quota types */ -#define EXT3_MAXQUOTAS 2 - -/* - * Mount options - */ -struct ext3_mount_options { - unsigned long s_mount_opt; - kuid_t s_resuid; - kgid_t s_resgid; - unsigned long s_commit_interval; -#ifdef CONFIG_QUOTA - int s_jquota_fmt; - char *s_qf_names[EXT3_MAXQUOTAS]; -#endif -}; - -/* - * Structure of an inode on the disk - */ -struct ext3_inode { - __le16 i_mode; /* File mode */ - __le16 i_uid; /* Low 16 bits of Owner Uid */ - __le32 i_size; /* Size in bytes */ - __le32 i_atime; /* Access time */ - __le32 i_ctime; /* Creation time */ - __le32 i_mtime; /* Modification time */ - __le32 i_dtime; /* Deletion Time */ - __le16 i_gid; /* Low 16 bits of Group Id */ - __le16 i_links_count; /* Links count */ - __le32 i_blocks; /* Blocks count */ - __le32 i_flags; /* File flags */ - union { - struct { - __u32 l_i_reserved1; - } linux1; - struct { - __u32 h_i_translator; - } hurd1; - struct { - __u32 m_i_reserved1; - } masix1; - } osd1; /* OS dependent 1 */ - __le32 i_block[EXT3_N_BLOCKS];/* Pointers to blocks */ - __le32 i_generation; /* File version (for NFS) */ - __le32 i_file_acl; /* File ACL */ - __le32 i_dir_acl; /* Directory ACL */ - __le32 i_faddr; /* Fragment address */ - union { - struct { - __u8 l_i_frag; /* Fragment number */ - __u8 l_i_fsize; /* Fragment size */ - __u16 i_pad1; - __le16 l_i_uid_high; /* these 2 fields */ - __le16 l_i_gid_high; /* were reserved2[0] */ - __u32 l_i_reserved2; - } linux2; - struct { - __u8 h_i_frag; /* Fragment number */ - __u8 h_i_fsize; /* Fragment size */ - __u16 h_i_mode_high; - __u16 h_i_uid_high; - __u16 h_i_gid_high; - __u32 h_i_author; - } hurd2; - struct { - __u8 m_i_frag; /* Fragment number */ - __u8 m_i_fsize; /* Fragment size */ - __u16 m_pad1; - __u32 m_i_reserved2[2]; - } masix2; - } osd2; /* OS dependent 2 */ - __le16 i_extra_isize; - __le16 i_pad1; -}; - -#define i_size_high i_dir_acl - -#define i_reserved1 osd1.linux1.l_i_reserved1 -#define i_frag osd2.linux2.l_i_frag -#define i_fsize osd2.linux2.l_i_fsize -#define i_uid_low i_uid -#define i_gid_low i_gid -#define i_uid_high osd2.linux2.l_i_uid_high -#define i_gid_high osd2.linux2.l_i_gid_high -#define i_reserved2 osd2.linux2.l_i_reserved2 - -/* - * File system states - */ -#define EXT3_VALID_FS 0x0001 /* Unmounted cleanly */ -#define EXT3_ERROR_FS 0x0002 /* Errors detected */ -#define EXT3_ORPHAN_FS 0x0004 /* Orphans being recovered */ - -/* - * Misc. 
filesystem flags - */ -#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ -#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ -#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ - -/* - * Mount flags - */ -#define EXT3_MOUNT_CHECK 0x00001 /* Do mount-time checks */ -/* EXT3_MOUNT_OLDALLOC was there */ -#define EXT3_MOUNT_GRPID 0x00004 /* Create files with directory's group */ -#define EXT3_MOUNT_DEBUG 0x00008 /* Some debugging messages */ -#define EXT3_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ -#define EXT3_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ -#define EXT3_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ -#define EXT3_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ -#define EXT3_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ -#define EXT3_MOUNT_ABORT 0x00200 /* Fatal error detected */ -#define EXT3_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ -#define EXT3_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ -#define EXT3_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ -#define EXT3_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ -#define EXT3_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ -#define EXT3_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ -#define EXT3_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ -#define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ -#define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ -#define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ -#define EXT3_MOUNT_QUOTA 0x80000 /* Some quota option set */ -#define EXT3_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ -#define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ -#define EXT3_MOUNT_DATA_ERR_ABORT 0x400000 /* Abort on file data write - * error in ordered mode */ - -/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ -#ifndef _LINUX_EXT2_FS_H -#define clear_opt(o, opt) o &= ~EXT3_MOUNT_##opt -#define set_opt(o, opt) o |= EXT3_MOUNT_##opt -#define test_opt(sb, opt) (EXT3_SB(sb)->s_mount_opt & \ - EXT3_MOUNT_##opt) -#else -#define EXT2_MOUNT_NOLOAD EXT3_MOUNT_NOLOAD -#define EXT2_MOUNT_ABORT EXT3_MOUNT_ABORT -#define EXT2_MOUNT_DATA_FLAGS EXT3_MOUNT_DATA_FLAGS -#endif - -#define ext3_set_bit __set_bit_le -#define ext3_set_bit_atomic ext2_set_bit_atomic -#define ext3_clear_bit __clear_bit_le -#define ext3_clear_bit_atomic ext2_clear_bit_atomic -#define ext3_test_bit test_bit_le -#define ext3_find_next_zero_bit find_next_zero_bit_le - -/* - * Maximal mount counts between two filesystem checks - */ -#define EXT3_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ -#define EXT3_DFL_CHECKINTERVAL 0 /* Don't use interval check */ - -/* - * Behaviour when detecting errors - */ -#define EXT3_ERRORS_CONTINUE 1 /* Continue execution */ -#define EXT3_ERRORS_RO 2 /* Remount fs read-only */ -#define EXT3_ERRORS_PANIC 3 /* Panic */ -#define EXT3_ERRORS_DEFAULT EXT3_ERRORS_CONTINUE - -/* - * Structure of the super block - */ -struct ext3_super_block { -/*00*/ __le32 s_inodes_count; /* Inodes count */ - __le32 s_blocks_count; /* Blocks count */ - __le32 s_r_blocks_count; /* Reserved blocks count */ - __le32 s_free_blocks_count; /* Free blocks count */ -/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ - __le32 s_first_data_block; /* First Data Block */ - __le32 s_log_block_size; /* Block size */ - __le32 s_log_frag_size; /* Fragment size */ -/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ - __le32 s_frags_per_group; /* 
# Fragments per group */ - __le32 s_inodes_per_group; /* # Inodes per group */ - __le32 s_mtime; /* Mount time */ -/*30*/ __le32 s_wtime; /* Write time */ - __le16 s_mnt_count; /* Mount count */ - __le16 s_max_mnt_count; /* Maximal mount count */ - __le16 s_magic; /* Magic signature */ - __le16 s_state; /* File system state */ - __le16 s_errors; /* Behaviour when detecting errors */ - __le16 s_minor_rev_level; /* minor revision level */ -/*40*/ __le32 s_lastcheck; /* time of last check */ - __le32 s_checkinterval; /* max. time between checks */ - __le32 s_creator_os; /* OS */ - __le32 s_rev_level; /* Revision level */ -/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ - __le16 s_def_resgid; /* Default gid for reserved blocks */ - /* - * These fields are for EXT3_DYNAMIC_REV superblocks only. - * - * Note: the difference between the compatible feature set and - * the incompatible feature set is that if there is a bit set - * in the incompatible feature set that the kernel doesn't - * know about, it should refuse to mount the filesystem. - * - * e2fsck's requirements are more strict; if it doesn't know - * about a feature in either the compatible or incompatible - * feature set, it must abort and not try to meddle with - * things it doesn't understand... - */ - __le32 s_first_ino; /* First non-reserved inode */ - __le16 s_inode_size; /* size of inode structure */ - __le16 s_block_group_nr; /* block group # of this superblock */ - __le32 s_feature_compat; /* compatible feature set */ -/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ - __le32 s_feature_ro_compat; /* readonly-compatible feature set */ -/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ -/*78*/ char s_volume_name[16]; /* volume name */ -/*88*/ char s_last_mounted[64]; /* directory where last mounted */ -/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ - /* - * Performance hints. Directory preallocation should only - * happen if the EXT3_FEATURE_COMPAT_DIR_PREALLOC flag is on. - */ - __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ - __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ - __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ - /* - * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set. 
- */ -/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ -/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ - __le32 s_journal_dev; /* device number of journal file */ - __le32 s_last_orphan; /* start of list of inodes to delete */ - __le32 s_hash_seed[4]; /* HTREE hash seed */ - __u8 s_def_hash_version; /* Default hash version to use */ - __u8 s_reserved_char_pad; - __u16 s_reserved_word_pad; - __le32 s_default_mount_opts; - __le32 s_first_meta_bg; /* First metablock block group */ - __le32 s_mkfs_time; /* When the filesystem was created */ - __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ - /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ -/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ - __le32 s_r_blocks_count_hi; /* Reserved blocks count */ - __le32 s_free_blocks_count_hi; /* Free blocks count */ - __le16 s_min_extra_isize; /* All inodes have at least # bytes */ - __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ - __le32 s_flags; /* Miscellaneous flags */ - __le16 s_raid_stride; /* RAID stride */ - __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ - __le64 s_mmp_block; /* Block for multi-mount protection */ - __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ - __u8 s_log_groups_per_flex; /* FLEX_BG group size */ - __u8 s_reserved_char_pad2; - __le16 s_reserved_pad; - __u32 s_reserved[162]; /* Padding to the end of the block */ -}; - -/* data type for block offset of block group */ -typedef int ext3_grpblk_t; - -/* data type for filesystem-wide blocks number */ -typedef unsigned long ext3_fsblk_t; - -#define E3FSBLK "%lu" - -struct ext3_reserve_window { - ext3_fsblk_t _rsv_start; /* First byte reserved */ - ext3_fsblk_t _rsv_end; /* Last byte reserved or 0 */ -}; - -struct ext3_reserve_window_node { - struct rb_node rsv_node; - __u32 rsv_goal_size; - __u32 rsv_alloc_hit; - struct ext3_reserve_window rsv_window; -}; - -struct ext3_block_alloc_info { - /* information about reservation window */ - struct ext3_reserve_window_node rsv_window_node; - /* - * was i_next_alloc_block in ext3_inode_info - * is the logical (file-relative) number of the - * most-recently-allocated block in this file. - * We use this for detecting linearly ascending allocation requests. - */ - __u32 last_alloc_logical_block; - /* - * Was i_next_alloc_goal in ext3_inode_info - * is the *physical* companion to i_next_alloc_block. - * it the physical block number of the block which was most-recentl - * allocated to this file. This give us the goal (target) for the next - * allocation when we detect linearly ascending requests. - */ - ext3_fsblk_t last_alloc_physical_block; -}; - -#define rsv_start rsv_window._rsv_start -#define rsv_end rsv_window._rsv_end - -/* - * third extended file system inode data in memory - */ -struct ext3_inode_info { - __le32 i_data[15]; /* unconverted */ - __u32 i_flags; -#ifdef EXT3_FRAGMENTS - __u32 i_faddr; - __u8 i_frag_no; - __u8 i_frag_size; -#endif - ext3_fsblk_t i_file_acl; - __u32 i_dir_acl; - __u32 i_dtime; - - /* - * i_block_group is the number of the block group which contains - * this file's inode. Constant across the lifetime of the inode, - * it is ued for making block allocation decisions - we try to - * place a file's data blocks near its inode block, and new inodes - * near to their parent directory's inode. 
- */ - __u32 i_block_group; - unsigned long i_state_flags; /* Dynamic state flags for ext3 */ - - /* block reservation info */ - struct ext3_block_alloc_info *i_block_alloc_info; - - __u32 i_dir_start_lookup; -#ifdef CONFIG_EXT3_FS_XATTR - /* - * Extended attributes can be read independently of the main file - * data. Taking i_mutex even when reading would cause contention - * between readers of EAs and writers of regular file data, so - * instead we synchronize on xattr_sem when reading or changing - * EAs. - */ - struct rw_semaphore xattr_sem; -#endif - - struct list_head i_orphan; /* unlinked but open inodes */ - - /* - * i_disksize keeps track of what the inode size is ON DISK, not - * in memory. During truncate, i_size is set to the new size by - * the VFS prior to calling ext3_truncate(), but the filesystem won't - * set i_disksize to 0 until the truncate is actually under way. - * - * The intent is that i_disksize always represents the blocks which - * are used by this file. This allows recovery to restart truncate - * on orphans if we crash during truncate. We actually write i_disksize - * into the on-disk inode when writing inodes out, instead of i_size. - * - * The only time when i_disksize and i_size may be different is when - * a truncate is in progress. The only things which change i_disksize - * are ext3_get_block (growth) and ext3_truncate (shrinkth). - */ - loff_t i_disksize; - - /* on-disk additional length */ - __u16 i_extra_isize; - - /* - * truncate_mutex is for serialising ext3_truncate() against - * ext3_getblock(). In the 2.4 ext2 design, great chunks of inode's - * data tree are chopped off during truncate. We can't do that in - * ext3 because whenever we perform intermediate commits during - * truncate, the inode and all the metadata blocks *must* be in a - * consistent state which allows truncation of the orphans to restart - * during recovery. Hence we must fix the get_block-vs-truncate race - * by other means, so we have truncate_mutex. - */ - struct mutex truncate_mutex; - - /* - * Transactions that contain inode's metadata needed to complete - * fsync and fdatasync, respectively. 
- */ - atomic_t i_sync_tid; - atomic_t i_datasync_tid; - -#ifdef CONFIG_QUOTA - struct dquot *i_dquot[MAXQUOTAS]; -#endif - - struct inode vfs_inode; -}; - -/* - * third extended-fs super-block data in memory - */ -struct ext3_sb_info { - unsigned long s_frag_size; /* Size of a fragment in bytes */ - unsigned long s_frags_per_block;/* Number of fragments per block */ - unsigned long s_inodes_per_block;/* Number of inodes per block */ - unsigned long s_frags_per_group;/* Number of fragments in a group */ - unsigned long s_blocks_per_group;/* Number of blocks in a group */ - unsigned long s_inodes_per_group;/* Number of inodes in a group */ - unsigned long s_itb_per_group; /* Number of inode table blocks per group */ - unsigned long s_gdb_count; /* Number of group descriptor blocks */ - unsigned long s_desc_per_block; /* Number of group descriptors per block */ - unsigned long s_groups_count; /* Number of groups in the fs */ - unsigned long s_overhead_last; /* Last calculated overhead */ - unsigned long s_blocks_last; /* Last seen block count */ - struct buffer_head * s_sbh; /* Buffer containing the super block */ - struct ext3_super_block * s_es; /* Pointer to the super block in the buffer */ - struct buffer_head ** s_group_desc; - unsigned long s_mount_opt; - ext3_fsblk_t s_sb_block; - kuid_t s_resuid; - kgid_t s_resgid; - unsigned short s_mount_state; - unsigned short s_pad; - int s_addr_per_block_bits; - int s_desc_per_block_bits; - int s_inode_size; - int s_first_ino; - spinlock_t s_next_gen_lock; - u32 s_next_generation; - u32 s_hash_seed[4]; - int s_def_hash_version; - int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ - struct percpu_counter s_freeblocks_counter; - struct percpu_counter s_freeinodes_counter; - struct percpu_counter s_dirs_counter; - struct blockgroup_lock *s_blockgroup_lock; - - /* root of the per fs reservation window tree */ - spinlock_t s_rsv_window_lock; - struct rb_root s_rsv_window_root; - struct ext3_reserve_window_node s_rsv_window_head; - - /* Journaling */ - struct inode * s_journal_inode; - struct journal_s * s_journal; - struct list_head s_orphan; - struct mutex s_orphan_lock; - struct mutex s_resize_lock; - unsigned long s_commit_interval; - struct block_device *journal_bdev; -#ifdef CONFIG_QUOTA - char *s_qf_names[EXT3_MAXQUOTAS]; /* Names of quota files with journalled quota */ - int s_jquota_fmt; /* Format of quota to use */ -#endif -}; - -static inline spinlock_t * -sb_bgl_lock(struct ext3_sb_info *sbi, unsigned int block_group) -{ - return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); -} - -static inline struct ext3_sb_info * EXT3_SB(struct super_block *sb) -{ - return sb->s_fs_info; -} -static inline struct ext3_inode_info *EXT3_I(struct inode *inode) -{ - return container_of(inode, struct ext3_inode_info, vfs_inode); -} - -static inline int ext3_valid_inum(struct super_block *sb, unsigned long ino) -{ - return ino == EXT3_ROOT_INO || - ino == EXT3_JOURNAL_INO || - ino == EXT3_RESIZE_INO || - (ino >= EXT3_FIRST_INO(sb) && - ino <= le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)); -} - -/* - * Inode dynamic state flags - */ -enum { - EXT3_STATE_JDATA, /* journaled data exists */ - EXT3_STATE_NEW, /* inode is newly created */ - EXT3_STATE_XATTR, /* has in-inode xattrs */ - EXT3_STATE_FLUSH_ON_CLOSE, /* flush dirty pages on close */ -}; - -static inline int ext3_test_inode_state(struct inode *inode, int bit) -{ - return test_bit(bit, &EXT3_I(inode)->i_state_flags); -} - -static inline void ext3_set_inode_state(struct inode *inode, int 
bit) -{ - set_bit(bit, &EXT3_I(inode)->i_state_flags); -} - -static inline void ext3_clear_inode_state(struct inode *inode, int bit) -{ - clear_bit(bit, &EXT3_I(inode)->i_state_flags); -} - -#define NEXT_ORPHAN(inode) EXT3_I(inode)->i_dtime - -/* - * Codes for operating systems - */ -#define EXT3_OS_LINUX 0 -#define EXT3_OS_HURD 1 -#define EXT3_OS_MASIX 2 -#define EXT3_OS_FREEBSD 3 -#define EXT3_OS_LITES 4 - -/* - * Revision levels - */ -#define EXT3_GOOD_OLD_REV 0 /* The good old (original) format */ -#define EXT3_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ - -#define EXT3_CURRENT_REV EXT3_GOOD_OLD_REV -#define EXT3_MAX_SUPP_REV EXT3_DYNAMIC_REV - -#define EXT3_GOOD_OLD_INODE_SIZE 128 - -/* - * Feature set definitions - */ - -#define EXT3_HAS_COMPAT_FEATURE(sb,mask) \ - ( EXT3_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) ) -#define EXT3_HAS_RO_COMPAT_FEATURE(sb,mask) \ - ( EXT3_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) ) -#define EXT3_HAS_INCOMPAT_FEATURE(sb,mask) \ - ( EXT3_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) ) -#define EXT3_SET_COMPAT_FEATURE(sb,mask) \ - EXT3_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) -#define EXT3_SET_RO_COMPAT_FEATURE(sb,mask) \ - EXT3_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) -#define EXT3_SET_INCOMPAT_FEATURE(sb,mask) \ - EXT3_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) -#define EXT3_CLEAR_COMPAT_FEATURE(sb,mask) \ - EXT3_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) -#define EXT3_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ - EXT3_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) -#define EXT3_CLEAR_INCOMPAT_FEATURE(sb,mask) \ - EXT3_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) - -#define EXT3_FEATURE_COMPAT_DIR_PREALLOC 0x0001 -#define EXT3_FEATURE_COMPAT_IMAGIC_INODES 0x0002 -#define EXT3_FEATURE_COMPAT_HAS_JOURNAL 0x0004 -#define EXT3_FEATURE_COMPAT_EXT_ATTR 0x0008 -#define EXT3_FEATURE_COMPAT_RESIZE_INODE 0x0010 -#define EXT3_FEATURE_COMPAT_DIR_INDEX 0x0020 - -#define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 -#define EXT3_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 -#define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 - -#define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001 -#define EXT3_FEATURE_INCOMPAT_FILETYPE 0x0002 -#define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ -#define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ -#define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 - -#define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR -#define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ - EXT3_FEATURE_INCOMPAT_RECOVER| \ - EXT3_FEATURE_INCOMPAT_META_BG) -#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ - EXT3_FEATURE_RO_COMPAT_BTREE_DIR) - -/* - * Default values for user and/or group using reserved blocks - */ -#define EXT3_DEF_RESUID 0 -#define EXT3_DEF_RESGID 0 - -/* - * Default mount options - */ -#define EXT3_DEFM_DEBUG 0x0001 -#define EXT3_DEFM_BSDGROUPS 0x0002 -#define EXT3_DEFM_XATTR_USER 0x0004 -#define EXT3_DEFM_ACL 0x0008 -#define EXT3_DEFM_UID16 0x0010 -#define EXT3_DEFM_JMODE 0x0060 -#define EXT3_DEFM_JMODE_DATA 0x0020 -#define EXT3_DEFM_JMODE_ORDERED 0x0040 -#define EXT3_DEFM_JMODE_WBACK 0x0060 - -/* - * Structure of a directory entry - */ -#define EXT3_NAME_LEN 255 - -struct ext3_dir_entry { - __le32 inode; /* Inode number */ - __le16 rec_len; /* Directory entry length */ - __le16 name_len; /* Name length */ - char name[EXT3_NAME_LEN]; /* File name */ -}; - -/* - 
* The new version of the directory entry. Since EXT3 structures are - * stored in intel byte order, and the name_len field could never be - * bigger than 255 chars, it's safe to reclaim the extra byte for the - * file_type field. - */ -struct ext3_dir_entry_2 { - __le32 inode; /* Inode number */ - __le16 rec_len; /* Directory entry length */ - __u8 name_len; /* Name length */ - __u8 file_type; - char name[EXT3_NAME_LEN]; /* File name */ -}; - -/* - * Ext3 directory file types. Only the low 3 bits are used. The - * other bits are reserved for now. - */ -#define EXT3_FT_UNKNOWN 0 -#define EXT3_FT_REG_FILE 1 -#define EXT3_FT_DIR 2 -#define EXT3_FT_CHRDEV 3 -#define EXT3_FT_BLKDEV 4 -#define EXT3_FT_FIFO 5 -#define EXT3_FT_SOCK 6 -#define EXT3_FT_SYMLINK 7 - -#define EXT3_FT_MAX 8 - -/* - * EXT3_DIR_PAD defines the directory entries boundaries - * - * NOTE: It must be a multiple of 4 - */ -#define EXT3_DIR_PAD 4 -#define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1) -#define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \ - ~EXT3_DIR_ROUND) -#define EXT3_MAX_REC_LEN ((1<<16)-1) - -/* - * Tests against MAX_REC_LEN etc were put in place for 64k block - * sizes; if that is not possible on this arch, we can skip - * those tests and speed things up. - */ -static inline unsigned ext3_rec_len_from_disk(__le16 dlen) -{ - unsigned len = le16_to_cpu(dlen); - -#if (PAGE_CACHE_SIZE >= 65536) - if (len == EXT3_MAX_REC_LEN) - return 1 << 16; -#endif - return len; -} - -static inline __le16 ext3_rec_len_to_disk(unsigned len) -{ -#if (PAGE_CACHE_SIZE >= 65536) - if (len == (1 << 16)) - return cpu_to_le16(EXT3_MAX_REC_LEN); - else if (len > (1 << 16)) - BUG(); -#endif - return cpu_to_le16(len); -} - -/* - * Hash Tree Directory indexing - * (c) Daniel Phillips, 2001 - */ - -#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ - EXT3_FEATURE_COMPAT_DIR_INDEX) && \ - (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) -#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) -#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) - -/* Legal values for the dx_root hash_version field: */ - -#define DX_HASH_LEGACY 0 -#define DX_HASH_HALF_MD4 1 -#define DX_HASH_TEA 2 -#define DX_HASH_LEGACY_UNSIGNED 3 -#define DX_HASH_HALF_MD4_UNSIGNED 4 -#define DX_HASH_TEA_UNSIGNED 5 - -/* hash info structure used by the directory hash */ -struct dx_hash_info -{ - u32 hash; - u32 minor_hash; - int hash_version; - u32 *seed; -}; - - -/* 32 and 64 bit signed EOF for dx directories */ -#define EXT3_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) -#define EXT3_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) - - -/* - * Control parameters used by ext3_htree_next_block - */ -#define HASH_NB_ALWAYS 1 - - -/* - * Describe an inode's exact location on disk and in memory - */ -struct ext3_iloc -{ - struct buffer_head *bh; - unsigned long offset; - unsigned long block_group; -}; - -static inline struct ext3_inode *ext3_raw_inode(struct ext3_iloc *iloc) -{ - return (struct ext3_inode *) (iloc->bh->b_data + iloc->offset); -} - -/* - * This structure is stuffed into the struct file's private_data field - * for directories. It is where we put information so that we can do - * readdir operations in hash tree order. 
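EXT3_DIR_REC_LEN() above rounds the 8-byte fixed part of a directory entry plus the name length up to a 4-byte boundary, and ext3_rec_len_to_disk()/ext3_rec_len_from_disk() escape the value 65536 as 0xffff so a single entry can cover a 64KB block (the kernel only compiles that escape in when the page cache size can reach 64KB). A small stand-alone sketch of the same arithmetic, with rec_len_to_disk()/rec_len_from_disk() as simplified stand-ins for the removed helpers:

/* Sketch of the EXT3_DIR_REC_LEN rounding and the 64KB rec_len escape;
 * constants copied from the deleted ext3.h. */
#include <stdint.h>
#include <stdio.h>

#define EXT3_DIR_PAD            4
#define EXT3_DIR_ROUND          (EXT3_DIR_PAD - 1)
#define EXT3_DIR_REC_LEN(name_len)      (((name_len) + 8 + EXT3_DIR_ROUND) & \
                                         ~EXT3_DIR_ROUND)
#define EXT3_MAX_REC_LEN        ((1 << 16) - 1)

/* 65536 cannot be stored in a __le16, so it goes to disk as 0xffff */
static uint16_t rec_len_to_disk(unsigned len)
{
        return len == (1 << 16) ? EXT3_MAX_REC_LEN : (uint16_t)len;
}

static unsigned rec_len_from_disk(uint16_t dlen)
{
        return dlen == EXT3_MAX_REC_LEN ? (1 << 16) : dlen;
}

int main(void)
{
        /* "foo" needs 8 + 3 = 11 bytes, rounded up to 12 */
        printf("rec_len(\"foo\") = %d\n", EXT3_DIR_REC_LEN(3));
        /* a block-spanning 64KB entry round-trips through the 0xffff escape */
        printf("64KB entry on disk: %#x -> %u\n",
               rec_len_to_disk(1 << 16), rec_len_from_disk(0xffff));
        return 0;
}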
- */ -struct dir_private_info { - struct rb_root root; - struct rb_node *curr_node; - struct fname *extra_fname; - loff_t last_pos; - __u32 curr_hash; - __u32 curr_minor_hash; - __u32 next_hash; -}; - -/* calculate the first block number of the group */ -static inline ext3_fsblk_t -ext3_group_first_block_no(struct super_block *sb, unsigned long group_no) -{ - return group_no * (ext3_fsblk_t)EXT3_BLOCKS_PER_GROUP(sb) + - le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block); -} - -/* - * Special error return code only used by dx_probe() and its callers. - */ -#define ERR_BAD_DX_DIR -75000 - -/* - * Function prototypes - */ - -/* - * Ok, these declarations are also in <linux/kernel.h> but none of the - * ext3 source programs needs to include it so they are duplicated here. - */ -# define NORET_TYPE /**/ -# define ATTRIB_NORET __attribute__((noreturn)) -# define NORET_AND noreturn, - -/* balloc.c */ -extern int ext3_bg_has_super(struct super_block *sb, int group); -extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); -extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode, - ext3_fsblk_t goal, int *errp); -extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode, - ext3_fsblk_t goal, unsigned long *count, int *errp); -extern void ext3_free_blocks (handle_t *handle, struct inode *inode, - ext3_fsblk_t block, unsigned long count); -extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb, - ext3_fsblk_t block, unsigned long count, - unsigned long *pdquot_freed_blocks); -extern ext3_fsblk_t ext3_count_free_blocks (struct super_block *); -extern void ext3_check_blocks_bitmap (struct super_block *); -extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, - unsigned int block_group, - struct buffer_head ** bh); -extern int ext3_should_retry_alloc(struct super_block *sb, int *retries); -extern void ext3_init_block_alloc_info(struct inode *); -extern void ext3_rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv); -extern int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range); - -/* dir.c */ -extern int ext3_check_dir_entry(const char *, struct inode *, - struct ext3_dir_entry_2 *, - struct buffer_head *, unsigned long); -extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, - __u32 minor_hash, - struct ext3_dir_entry_2 *dirent); -extern void ext3_htree_free_dir_info(struct dir_private_info *p); - -/* fsync.c */ -extern int ext3_sync_file(struct file *, loff_t, loff_t, int); - -/* hash.c */ -extern int ext3fs_dirhash(const char *name, int len, struct - dx_hash_info *hinfo); - -/* ialloc.c */ -extern struct inode * ext3_new_inode (handle_t *, struct inode *, - const struct qstr *, umode_t); -extern void ext3_free_inode (handle_t *, struct inode *); -extern struct inode * ext3_orphan_get (struct super_block *, unsigned long); -extern unsigned long ext3_count_free_inodes (struct super_block *); -extern unsigned long ext3_count_dirs (struct super_block *); -extern void ext3_check_inodes_bitmap (struct super_block *); -extern unsigned long ext3_count_free (struct buffer_head *, unsigned); - - -/* inode.c */ -int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, - struct buffer_head *bh, ext3_fsblk_t blocknr); -struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); -struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); -int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, - sector_t 
iblock, unsigned long maxblocks, struct buffer_head *bh_result, - int create); - -extern struct inode *ext3_iget(struct super_block *, unsigned long); -extern int ext3_write_inode (struct inode *, struct writeback_control *); -extern int ext3_setattr (struct dentry *, struct iattr *); -extern void ext3_evict_inode (struct inode *); -extern int ext3_sync_inode (handle_t *, struct inode *); -extern void ext3_discard_reservation (struct inode *); -extern void ext3_dirty_inode(struct inode *, int); -extern int ext3_change_inode_journal_flag(struct inode *, int); -extern int ext3_get_inode_loc(struct inode *, struct ext3_iloc *); -extern int ext3_can_truncate(struct inode *inode); -extern void ext3_truncate(struct inode *inode); -extern void ext3_set_inode_flags(struct inode *); -extern void ext3_get_inode_flags(struct ext3_inode_info *); -extern void ext3_set_aops(struct inode *inode); -extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - u64 start, u64 len); - -/* ioctl.c */ -extern long ext3_ioctl(struct file *, unsigned int, unsigned long); -extern long ext3_compat_ioctl(struct file *, unsigned int, unsigned long); - -/* namei.c */ -extern int ext3_orphan_add(handle_t *, struct inode *); -extern int ext3_orphan_del(handle_t *, struct inode *); -extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, - __u32 start_minor_hash, __u32 *next_hash); - -/* resize.c */ -extern int ext3_group_add(struct super_block *sb, - struct ext3_new_group_data *input); -extern int ext3_group_extend(struct super_block *sb, - struct ext3_super_block *es, - ext3_fsblk_t n_blocks_count); - -/* super.c */ -extern __printf(3, 4) -void ext3_error(struct super_block *, const char *, const char *, ...); -extern void __ext3_std_error (struct super_block *, const char *, int); -extern __printf(3, 4) -void ext3_abort(struct super_block *, const char *, const char *, ...); -extern __printf(3, 4) -void ext3_warning(struct super_block *, const char *, const char *, ...); -extern __printf(3, 4) -void ext3_msg(struct super_block *, const char *, const char *, ...); -extern void ext3_update_dynamic_rev (struct super_block *sb); - -#define ext3_std_error(sb, errno) \ -do { \ - if ((errno)) \ - __ext3_std_error((sb), __func__, (errno)); \ -} while (0) - -/* - * Inodes and files operations - */ - -/* dir.c */ -extern const struct file_operations ext3_dir_operations; - -/* file.c */ -extern const struct inode_operations ext3_file_inode_operations; -extern const struct file_operations ext3_file_operations; - -/* namei.c */ -extern const struct inode_operations ext3_dir_inode_operations; -extern const struct inode_operations ext3_special_inode_operations; - -/* symlink.c */ -extern const struct inode_operations ext3_symlink_inode_operations; -extern const struct inode_operations ext3_fast_symlink_inode_operations; - -#define EXT3_JOURNAL(inode) (EXT3_SB((inode)->i_sb)->s_journal) - -/* Define the number of blocks we need to account to a transaction to - * modify one block of data. - * - * We may have to touch one inode, one bitmap buffer, up to three - * indirection blocks, the group and superblock summaries, and the data - * block to complete the transaction. */ - -#define EXT3_SINGLEDATA_TRANS_BLOCKS 8U - -/* Extended attribute operations touch at most two data buffers, - * two bitmap buffers, and two group summaries, in addition to the inode - * and the superblock, which are already accounted for. 
*/ - -#define EXT3_XATTR_TRANS_BLOCKS 6U - -/* Define the minimum size for a transaction which modifies data. This - * needs to take into account the fact that we may end up modifying two - * quota files too (one for the group, one for the user quota). The - * superblock only gets updated once, of course, so don't bother - * counting that again for the quota updates. */ - -#define EXT3_DATA_TRANS_BLOCKS(sb) (EXT3_SINGLEDATA_TRANS_BLOCKS + \ - EXT3_XATTR_TRANS_BLOCKS - 2 + \ - EXT3_MAXQUOTAS_TRANS_BLOCKS(sb)) - -/* Delete operations potentially hit one directory's namespace plus an - * entire inode, plus arbitrary amounts of bitmap/indirection data. Be - * generous. We can grow the delete transaction later if necessary. */ - -#define EXT3_DELETE_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) + 64) - -/* Define an arbitrary limit for the amount of data we will anticipate - * writing to any given transaction. For unbounded transactions such as - * write(2) and truncate(2) we can write more than this, but we always - * start off at the maximum transaction size and grow the transaction - * optimistically as we go. */ - -#define EXT3_MAX_TRANS_DATA 64U - -/* We break up a large truncate or write transaction once the handle's - * buffer credits gets this low, we need either to extend the - * transaction or to start a new one. Reserve enough space here for - * inode, bitmap, superblock, group and indirection updates for at least - * one block, plus two quota updates. Quota allocations are not - * needed. */ - -#define EXT3_RESERVE_TRANS_BLOCKS 12U - -#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8 - -#ifdef CONFIG_QUOTA -/* Amount of blocks needed for quota update - we know that the structure was - * allocated so we need to update only inode+data */ -#define EXT3_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0) -/* Amount of blocks needed for quota insert/delete - we do some block writes - * but inode, sb and group updates are done only once */ -#define EXT3_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ - (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_INIT_REWRITE) : 0) -#define EXT3_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ - (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_DEL_REWRITE) : 0) -#else -#define EXT3_QUOTA_TRANS_BLOCKS(sb) 0 -#define EXT3_QUOTA_INIT_BLOCKS(sb) 0 -#define EXT3_QUOTA_DEL_BLOCKS(sb) 0 -#endif -#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb)) -#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb)) -#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb)) - -int -ext3_mark_iloc_dirty(handle_t *handle, - struct inode *inode, - struct ext3_iloc *iloc); - -/* - * On success, We end up with an outstanding reference count against - * iloc->bh. This _must_ be cleaned up later. - */ - -int ext3_reserve_inode_write(handle_t *handle, struct inode *inode, - struct ext3_iloc *iloc); - -int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode); - -/* - * Wrapper functions with which ext3 calls into JBD. The intent here is - * to allow these to be turned into appropriate stubs so ext3 can control - * ext2 filesystems, so ext2+ext3 systems only nee one fs. This work hasn't - * been done yet. 
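Plugging numbers into the credit macros above for the common case where quota is not compiled in (so EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) evaluates to 0) gives 8 + 6 - 2 + 0 = 12 credits for EXT3_DATA_TRANS_BLOCKS and 0 + 64 = 64 for EXT3_DELETE_TRANS_BLOCKS. A minimal sketch that just evaluates that arithmetic, with the quota term hard-coded to zero:

/* Worked example of the removed transaction-credit macros, assuming
 * quota is disabled so the quota term is 0. */
#include <stdio.h>

#define EXT3_SINGLEDATA_TRANS_BLOCKS    8U
#define EXT3_XATTR_TRANS_BLOCKS         6U
#define EXT3_MAXQUOTAS_TRANS_BLOCKS     0U      /* quota off */

#define EXT3_DATA_TRANS_BLOCKS          (EXT3_SINGLEDATA_TRANS_BLOCKS + \
                                         EXT3_XATTR_TRANS_BLOCKS - 2 + \
                                         EXT3_MAXQUOTAS_TRANS_BLOCKS)
#define EXT3_DELETE_TRANS_BLOCKS        (EXT3_MAXQUOTAS_TRANS_BLOCKS + 64)

int main(void)
{
        /* 8 + 6 - 2 + 0 = 12 credits to modify one block of data */
        printf("data trans blocks:   %u\n", EXT3_DATA_TRANS_BLOCKS);
        /* 0 + 64 = 64 credits reserved up front for a delete */
        printf("delete trans blocks: %u\n", EXT3_DELETE_TRANS_BLOCKS);
        return 0;
}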
- */ - -static inline void ext3_journal_release_buffer(handle_t *handle, - struct buffer_head *bh) -{ - journal_release_buffer(handle, bh); -} - -void ext3_journal_abort_handle(const char *caller, const char *err_fn, - struct buffer_head *bh, handle_t *handle, int err); - -int __ext3_journal_get_undo_access(const char *where, handle_t *handle, - struct buffer_head *bh); - -int __ext3_journal_get_write_access(const char *where, handle_t *handle, - struct buffer_head *bh); - -int __ext3_journal_forget(const char *where, handle_t *handle, - struct buffer_head *bh); - -int __ext3_journal_revoke(const char *where, handle_t *handle, - unsigned long blocknr, struct buffer_head *bh); - -int __ext3_journal_get_create_access(const char *where, - handle_t *handle, struct buffer_head *bh); - -int __ext3_journal_dirty_metadata(const char *where, - handle_t *handle, struct buffer_head *bh); - -#define ext3_journal_get_undo_access(handle, bh) \ - __ext3_journal_get_undo_access(__func__, (handle), (bh)) -#define ext3_journal_get_write_access(handle, bh) \ - __ext3_journal_get_write_access(__func__, (handle), (bh)) -#define ext3_journal_revoke(handle, blocknr, bh) \ - __ext3_journal_revoke(__func__, (handle), (blocknr), (bh)) -#define ext3_journal_get_create_access(handle, bh) \ - __ext3_journal_get_create_access(__func__, (handle), (bh)) -#define ext3_journal_dirty_metadata(handle, bh) \ - __ext3_journal_dirty_metadata(__func__, (handle), (bh)) -#define ext3_journal_forget(handle, bh) \ - __ext3_journal_forget(__func__, (handle), (bh)) - -int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh); - -handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks); -int __ext3_journal_stop(const char *where, handle_t *handle); - -static inline handle_t *ext3_journal_start(struct inode *inode, int nblocks) -{ - return ext3_journal_start_sb(inode->i_sb, nblocks); -} - -#define ext3_journal_stop(handle) \ - __ext3_journal_stop(__func__, (handle)) - -static inline handle_t *ext3_journal_current_handle(void) -{ - return journal_current_handle(); -} - -static inline int ext3_journal_extend(handle_t *handle, int nblocks) -{ - return journal_extend(handle, nblocks); -} - -static inline int ext3_journal_restart(handle_t *handle, int nblocks) -{ - return journal_restart(handle, nblocks); -} - -static inline int ext3_journal_blocks_per_page(struct inode *inode) -{ - return journal_blocks_per_page(inode); -} - -static inline int ext3_journal_force_commit(journal_t *journal) -{ - return journal_force_commit(journal); -} - -/* super.c */ -int ext3_force_commit(struct super_block *sb); - -static inline int ext3_should_journal_data(struct inode *inode) -{ - if (!S_ISREG(inode->i_mode)) - return 1; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA) - return 1; - if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL) - return 1; - return 0; -} - -static inline int ext3_should_order_data(struct inode *inode) -{ - if (!S_ISREG(inode->i_mode)) - return 0; - if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL) - return 0; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA) - return 1; - return 0; -} - -static inline int ext3_should_writeback_data(struct inode *inode) -{ - if (!S_ISREG(inode->i_mode)) - return 0; - if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL) - return 0; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA) - return 1; - return 0; -} - -#include <trace/events/ext3.h> diff --git a/fs/ext3/ext3_jbd.c b/fs/ext3/ext3_jbd.c deleted file mode 100644 
index 785a3261a26c..000000000000 --- a/fs/ext3/ext3_jbd.c +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Interface between ext3 and JBD - */ - -#include "ext3.h" - -int __ext3_journal_get_undo_access(const char *where, handle_t *handle, - struct buffer_head *bh) -{ - int err = journal_get_undo_access(handle, bh); - if (err) - ext3_journal_abort_handle(where, __func__, bh, handle,err); - return err; -} - -int __ext3_journal_get_write_access(const char *where, handle_t *handle, - struct buffer_head *bh) -{ - int err = journal_get_write_access(handle, bh); - if (err) - ext3_journal_abort_handle(where, __func__, bh, handle,err); - return err; -} - -int __ext3_journal_forget(const char *where, handle_t *handle, - struct buffer_head *bh) -{ - int err = journal_forget(handle, bh); - if (err) - ext3_journal_abort_handle(where, __func__, bh, handle,err); - return err; -} - -int __ext3_journal_revoke(const char *where, handle_t *handle, - unsigned long blocknr, struct buffer_head *bh) -{ - int err = journal_revoke(handle, blocknr, bh); - if (err) - ext3_journal_abort_handle(where, __func__, bh, handle,err); - return err; -} - -int __ext3_journal_get_create_access(const char *where, - handle_t *handle, struct buffer_head *bh) -{ - int err = journal_get_create_access(handle, bh); - if (err) - ext3_journal_abort_handle(where, __func__, bh, handle,err); - return err; -} - -int __ext3_journal_dirty_metadata(const char *where, - handle_t *handle, struct buffer_head *bh) -{ - int err = journal_dirty_metadata(handle, bh); - if (err) - ext3_journal_abort_handle(where, __func__, bh, handle,err); - return err; -} diff --git a/fs/ext3/file.c b/fs/ext3/file.c deleted file mode 100644 index 3b8f650de22c..000000000000 --- a/fs/ext3/file.c +++ /dev/null @@ -1,79 +0,0 @@ -/* - * linux/fs/ext3/file.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/file.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * ext3 fs regular file handling primitives - * - * 64-bit file support on 64-bit platforms by Jakub Jelinek - * (jj@sunsite.ms.mff.cuni.cz) - */ - -#include <linux/quotaops.h> -#include "ext3.h" -#include "xattr.h" -#include "acl.h" - -/* - * Called when an inode is released. Note that this is different - * from ext3_file_open: open gets called at every open, but release - * gets called only when /all/ the files are closed. 
- */ -static int ext3_release_file (struct inode * inode, struct file * filp) -{ - if (ext3_test_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE)) { - filemap_flush(inode->i_mapping); - ext3_clear_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); - } - /* if we are the last writer on the inode, drop the block reservation */ - if ((filp->f_mode & FMODE_WRITE) && - (atomic_read(&inode->i_writecount) == 1)) - { - mutex_lock(&EXT3_I(inode)->truncate_mutex); - ext3_discard_reservation(inode); - mutex_unlock(&EXT3_I(inode)->truncate_mutex); - } - if (is_dx(inode) && filp->private_data) - ext3_htree_free_dir_info(filp->private_data); - - return 0; -} - -const struct file_operations ext3_file_operations = { - .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, - .unlocked_ioctl = ext3_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ext3_compat_ioctl, -#endif - .mmap = generic_file_mmap, - .open = dquot_file_open, - .release = ext3_release_file, - .fsync = ext3_sync_file, - .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, -}; - -const struct inode_operations ext3_file_inode_operations = { - .setattr = ext3_setattr, -#ifdef CONFIG_EXT3_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = ext3_listxattr, - .removexattr = generic_removexattr, -#endif - .get_acl = ext3_get_acl, - .set_acl = ext3_set_acl, - .fiemap = ext3_fiemap, -}; - diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c deleted file mode 100644 index 1cb9c7e10c6f..000000000000 --- a/fs/ext3/fsync.c +++ /dev/null @@ -1,109 +0,0 @@ -/* - * linux/fs/ext3/fsync.c - * - * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com) - * from - * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * from - * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds - * - * ext3fs fsync primitive - * - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 - * - * Removed unnecessary code duplication for little endian machines - * and excessive __inline__s. - * Andi Kleen, 1997 - * - * Major simplications and cleanup - we only need to do the metadata, because - * we can depend on generic_block_fdatasync() to sync the data blocks. - */ - -#include <linux/blkdev.h> -#include <linux/writeback.h> -#include "ext3.h" - -/* - * akpm: A new design for ext3_sync_file(). - * - * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). - * There cannot be a transaction open by this task. - * Another task could have dirtied this inode. Its data can be in any - * state in the journalling system. - * - * What we do is just kick off a commit and wait on it. This will snapshot the - * inode to disk. 
- */ - -int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) -{ - struct inode *inode = file->f_mapping->host; - struct ext3_inode_info *ei = EXT3_I(inode); - journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; - int ret, needs_barrier = 0; - tid_t commit_tid; - - trace_ext3_sync_file_enter(file, datasync); - - if (inode->i_sb->s_flags & MS_RDONLY) { - /* Make sure that we read updated state */ - smp_rmb(); - if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS) - return -EROFS; - return 0; - } - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - goto out; - - J_ASSERT(ext3_journal_current_handle() == NULL); - - /* - * data=writeback,ordered: - * The caller's filemap_fdatawrite()/wait will sync the data. - * Metadata is in the journal, we wait for a proper transaction - * to commit here. - * - * data=journal: - * filemap_fdatawrite won't do anything (the buffers are clean). - * ext3_force_commit will write the file data into the journal and - * will wait on that. - * filemap_fdatawait() will encounter a ton of newly-dirtied pages - * (they were dirtied by commit). But that's OK - the blocks are - * safe in-journal, which is all fsync() needs to ensure. - */ - if (ext3_should_journal_data(inode)) { - ret = ext3_force_commit(inode->i_sb); - goto out; - } - - if (datasync) - commit_tid = atomic_read(&ei->i_datasync_tid); - else - commit_tid = atomic_read(&ei->i_sync_tid); - - if (test_opt(inode->i_sb, BARRIER) && - !journal_trans_will_send_data_barrier(journal, commit_tid)) - needs_barrier = 1; - log_start_commit(journal, commit_tid); - ret = log_wait_commit(journal, commit_tid); - - /* - * In case we didn't commit a transaction, we have to flush - * disk caches manually so that data really is on persistent - * storage - */ - if (needs_barrier) { - int err; - - err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - if (!ret) - ret = err; - } -out: - trace_ext3_sync_file_exit(inode, ret); - return ret; -} diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c deleted file mode 100644 index ede315cdf126..000000000000 --- a/fs/ext3/hash.c +++ /dev/null @@ -1,206 +0,0 @@ -/* - * linux/fs/ext3/hash.c - * - * Copyright (C) 2002 by Theodore Ts'o - * - * This file is released under the GPL v2. - * - * This file may be redistributed under the terms of the GNU Public - * License. 
- */ - -#include "ext3.h" -#include <linux/cryptohash.h> - -#define DELTA 0x9E3779B9 - -static void TEA_transform(__u32 buf[4], __u32 const in[]) -{ - __u32 sum = 0; - __u32 b0 = buf[0], b1 = buf[1]; - __u32 a = in[0], b = in[1], c = in[2], d = in[3]; - int n = 16; - - do { - sum += DELTA; - b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); - b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); - } while(--n); - - buf[0] += b0; - buf[1] += b1; -} - - -/* The old legacy hash */ -static __u32 dx_hack_hash_unsigned(const char *name, int len) -{ - __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; - const unsigned char *ucp = (const unsigned char *) name; - - while (len--) { - hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); - - if (hash & 0x80000000) - hash -= 0x7fffffff; - hash1 = hash0; - hash0 = hash; - } - return hash0 << 1; -} - -static __u32 dx_hack_hash_signed(const char *name, int len) -{ - __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; - const signed char *scp = (const signed char *) name; - - while (len--) { - hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); - - if (hash & 0x80000000) - hash -= 0x7fffffff; - hash1 = hash0; - hash0 = hash; - } - return hash0 << 1; -} - -static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) -{ - __u32 pad, val; - int i; - const signed char *scp = (const signed char *) msg; - - pad = (__u32)len | ((__u32)len << 8); - pad |= pad << 16; - - val = pad; - if (len > num*4) - len = num * 4; - for (i = 0; i < len; i++) { - if ((i % 4) == 0) - val = pad; - val = ((int) scp[i]) + (val << 8); - if ((i % 4) == 3) { - *buf++ = val; - val = pad; - num--; - } - } - if (--num >= 0) - *buf++ = val; - while (--num >= 0) - *buf++ = pad; -} - -static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) -{ - __u32 pad, val; - int i; - const unsigned char *ucp = (const unsigned char *) msg; - - pad = (__u32)len | ((__u32)len << 8); - pad |= pad << 16; - - val = pad; - if (len > num*4) - len = num * 4; - for (i=0; i < len; i++) { - if ((i % 4) == 0) - val = pad; - val = ((int) ucp[i]) + (val << 8); - if ((i % 4) == 3) { - *buf++ = val; - val = pad; - num--; - } - } - if (--num >= 0) - *buf++ = val; - while (--num >= 0) - *buf++ = pad; -} - -/* - * Returns the hash of a filename. If len is 0 and name is NULL, then - * this function can be used to test whether or not a hash version is - * supported. - * - * The seed is an 4 longword (32 bits) "secret" which can be used to - * uniquify a hash. If the seed is all zero's, then some default seed - * may be used. - * - * A particular hash version specifies whether or not the seed is - * represented, and whether or not the returned hash is 32 bits or 64 - * bits. 32 bit hashes will return 0 for the minor hash. 
- */ -int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) -{ - __u32 hash; - __u32 minor_hash = 0; - const char *p; - int i; - __u32 in[8], buf[4]; - void (*str2hashbuf)(const char *, int, __u32 *, int) = - str2hashbuf_signed; - - /* Initialize the default seed for the hash checksum functions */ - buf[0] = 0x67452301; - buf[1] = 0xefcdab89; - buf[2] = 0x98badcfe; - buf[3] = 0x10325476; - - /* Check to see if the seed is all zero's */ - if (hinfo->seed) { - for (i=0; i < 4; i++) { - if (hinfo->seed[i]) - break; - } - if (i < 4) - memcpy(buf, hinfo->seed, sizeof(buf)); - } - - switch (hinfo->hash_version) { - case DX_HASH_LEGACY_UNSIGNED: - hash = dx_hack_hash_unsigned(name, len); - break; - case DX_HASH_LEGACY: - hash = dx_hack_hash_signed(name, len); - break; - case DX_HASH_HALF_MD4_UNSIGNED: - str2hashbuf = str2hashbuf_unsigned; - case DX_HASH_HALF_MD4: - p = name; - while (len > 0) { - (*str2hashbuf)(p, len, in, 8); - half_md4_transform(buf, in); - len -= 32; - p += 32; - } - minor_hash = buf[2]; - hash = buf[1]; - break; - case DX_HASH_TEA_UNSIGNED: - str2hashbuf = str2hashbuf_unsigned; - case DX_HASH_TEA: - p = name; - while (len > 0) { - (*str2hashbuf)(p, len, in, 4); - TEA_transform(buf, in); - len -= 16; - p += 16; - } - hash = buf[0]; - minor_hash = buf[1]; - break; - default: - hinfo->hash = 0; - return -1; - } - hash = hash & ~1; - if (hash == (EXT3_HTREE_EOF_32BIT << 1)) - hash = (EXT3_HTREE_EOF_32BIT - 1) << 1; - hinfo->hash = hash; - hinfo->minor_hash = minor_hash; - return 0; -} diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c deleted file mode 100644 index 3ad242e5840e..000000000000 --- a/fs/ext3/ialloc.c +++ /dev/null @@ -1,706 +0,0 @@ -/* - * linux/fs/ext3/ialloc.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * BSD ufs-inspired inode and directory allocation by - * Stephen Tweedie (sct@redhat.com), 1993 - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 - */ - -#include <linux/quotaops.h> -#include <linux/random.h> - -#include "ext3.h" -#include "xattr.h" -#include "acl.h" - -/* - * ialloc.c contains the inodes allocation and deallocation routines - */ - -/* - * The free inodes are managed by bitmaps. A file system contains several - * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap - * block for inodes, N blocks for the inode table and data blocks. - * - * The file system contains group descriptors which are located after the - * super block. Each descriptor contains the number of the bitmap block and - * the free blocks count in the block. - */ - - -/* - * Read the inode allocation bitmap for a given block_group, reading - * into the specified slot in the superblock's bitmap cache. - * - * Return buffer_head of bitmap on success or NULL. - */ -static struct buffer_head * -read_inode_bitmap(struct super_block * sb, unsigned long block_group) -{ - struct ext3_group_desc *desc; - struct buffer_head *bh = NULL; - - desc = ext3_get_group_desc(sb, block_group, NULL); - if (!desc) - goto error_out; - - bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap)); - if (!bh) - ext3_error(sb, "read_inode_bitmap", - "Cannot read inode bitmap - " - "block_group = %lu, inode_bitmap = %u", - block_group, le32_to_cpu(desc->bg_inode_bitmap)); -error_out: - return bh; -} - -/* - * NOTE! 
When we get the inode, we're the only people - * that have access to it, and as such there are no - * race conditions we have to worry about. The inode - * is not on the hash-lists, and it cannot be reached - * through the filesystem because the directory entry - * has been deleted earlier. - * - * HOWEVER: we must make sure that we get no aliases, - * which means that we have to call "clear_inode()" - * _before_ we mark the inode not in use in the inode - * bitmaps. Otherwise a newly created file might use - * the same inode number (not actually the same pointer - * though), and then we'd have two inodes sharing the - * same inode number and space on the harddisk. - */ -void ext3_free_inode (handle_t *handle, struct inode * inode) -{ - struct super_block * sb = inode->i_sb; - int is_directory; - unsigned long ino; - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *bh2; - unsigned long block_group; - unsigned long bit; - struct ext3_group_desc * gdp; - struct ext3_super_block * es; - struct ext3_sb_info *sbi; - int fatal = 0, err; - - if (atomic_read(&inode->i_count) > 1) { - printk ("ext3_free_inode: inode has count=%d\n", - atomic_read(&inode->i_count)); - return; - } - if (inode->i_nlink) { - printk ("ext3_free_inode: inode has nlink=%d\n", - inode->i_nlink); - return; - } - if (!sb) { - printk("ext3_free_inode: inode on nonexistent device\n"); - return; - } - sbi = EXT3_SB(sb); - - ino = inode->i_ino; - ext3_debug ("freeing inode %lu\n", ino); - trace_ext3_free_inode(inode); - - is_directory = S_ISDIR(inode->i_mode); - - es = EXT3_SB(sb)->s_es; - if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { - ext3_error (sb, "ext3_free_inode", - "reserved or nonexistent inode %lu", ino); - goto error_return; - } - block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); - bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); - bitmap_bh = read_inode_bitmap(sb, block_group); - if (!bitmap_bh) - goto error_return; - - BUFFER_TRACE(bitmap_bh, "get_write_access"); - fatal = ext3_journal_get_write_access(handle, bitmap_bh); - if (fatal) - goto error_return; - - /* Ok, now we can actually update the inode bitmaps.. */ - if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group), - bit, bitmap_bh->b_data)) - ext3_error (sb, "ext3_free_inode", - "bit already cleared for inode %lu", ino); - else { - gdp = ext3_get_group_desc (sb, block_group, &bh2); - - BUFFER_TRACE(bh2, "get_write_access"); - fatal = ext3_journal_get_write_access(handle, bh2); - if (fatal) goto error_return; - - if (gdp) { - spin_lock(sb_bgl_lock(sbi, block_group)); - le16_add_cpu(&gdp->bg_free_inodes_count, 1); - if (is_directory) - le16_add_cpu(&gdp->bg_used_dirs_count, -1); - spin_unlock(sb_bgl_lock(sbi, block_group)); - percpu_counter_inc(&sbi->s_freeinodes_counter); - if (is_directory) - percpu_counter_dec(&sbi->s_dirs_counter); - - } - BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, bh2); - if (!fatal) fatal = err; - } - BUFFER_TRACE(bitmap_bh, "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, bitmap_bh); - if (!fatal) - fatal = err; - -error_return: - brelse(bitmap_bh); - ext3_std_error(sb, fatal); -} - -/* - * Orlov's allocator for directories. - * - * We always try to spread first-level directories. - * - * If there are blockgroups with both free inodes and free blocks counts - * not worse than average we return one with smallest directory count. - * Otherwise we simply return a random group. 
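[Editor's note] The top-level rule just stated (spread first-level directories into the emptiest group whose free counts are not below average) can be modelled outside the kernel. A toy sketch, with invented per-group statistics and a multiply-instead-of-divide average test; it is not the real find_group_orlov().

#include <stdio.h>

struct grp { unsigned int free_inodes, free_blocks, used_dirs; };

/* Among groups with free inode/block counts not below average, pick the
 * one with the fewest directories; -1 means "fall back to a random group". */
static int pick_topdir_group(const struct grp *g, int ngroups)
{
	unsigned long freei = 0, freeb = 0;
	unsigned int best_ndir = ~0u;
	int i, best = -1;

	for (i = 0; i < ngroups; i++) {
		freei += g[i].free_inodes;
		freeb += g[i].free_blocks;
	}
	for (i = 0; i < ngroups; i++) {
		if ((unsigned long)g[i].free_inodes * ngroups < freei)
			continue;	/* fewer free inodes than average */
		if ((unsigned long)g[i].free_blocks * ngroups < freeb)
			continue;	/* fewer free blocks than average */
		if (g[i].used_dirs < best_ndir) {
			best_ndir = g[i].used_dirs;
			best = i;
		}
	}
	return best;
}

int main(void)
{
	struct grp groups[] = {
		{ 100, 4000, 12 }, { 400, 9000, 3 }, { 350, 8000, 7 },
	};

	printf("chosen group: %d\n", pick_topdir_group(groups, 3));
	return 0;
}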
- * - * For the rest rules look so: - * - * It's OK to put directory into a group unless - * it has too many directories already (max_dirs) or - * it has too few free inodes left (min_inodes) or - * it has too few free blocks left (min_blocks). - * Parent's group is preferred, if it doesn't satisfy these - * conditions we search cyclically through the rest. If none - * of the groups look good we just look for a group with more - * free inodes than average (starting at parent's group). - * - * Debt is incremented each time we allocate a directory and decremented - * when we allocate an inode, within 0--255. - */ - -static int find_group_orlov(struct super_block *sb, struct inode *parent) -{ - int parent_group = EXT3_I(parent)->i_block_group; - struct ext3_sb_info *sbi = EXT3_SB(sb); - int ngroups = sbi->s_groups_count; - int inodes_per_group = EXT3_INODES_PER_GROUP(sb); - unsigned int freei, avefreei; - ext3_fsblk_t freeb, avefreeb; - unsigned int ndirs; - int max_dirs, min_inodes; - ext3_grpblk_t min_blocks; - int group = -1, i; - struct ext3_group_desc *desc; - - freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); - avefreei = freei / ngroups; - freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); - avefreeb = freeb / ngroups; - ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); - - if ((parent == d_inode(sb->s_root)) || - (EXT3_I(parent)->i_flags & EXT3_TOPDIR_FL)) { - int best_ndir = inodes_per_group; - int best_group = -1; - - group = prandom_u32(); - parent_group = (unsigned)group % ngroups; - for (i = 0; i < ngroups; i++) { - group = (parent_group + i) % ngroups; - desc = ext3_get_group_desc (sb, group, NULL); - if (!desc || !desc->bg_free_inodes_count) - continue; - if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) - continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) - continue; - if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) - continue; - best_group = group; - best_ndir = le16_to_cpu(desc->bg_used_dirs_count); - } - if (best_group >= 0) - return best_group; - goto fallback; - } - - max_dirs = ndirs / ngroups + inodes_per_group / 16; - min_inodes = avefreei - inodes_per_group / 4; - min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4; - - for (i = 0; i < ngroups; i++) { - group = (parent_group + i) % ngroups; - desc = ext3_get_group_desc (sb, group, NULL); - if (!desc || !desc->bg_free_inodes_count) - continue; - if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) - continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) - continue; - if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) - continue; - return group; - } - -fallback: - for (i = 0; i < ngroups; i++) { - group = (parent_group + i) % ngroups; - desc = ext3_get_group_desc (sb, group, NULL); - if (!desc || !desc->bg_free_inodes_count) - continue; - if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) - return group; - } - - if (avefreei) { - /* - * The free-inodes counter is approximate, and for really small - * filesystems the above test can fail to find any blockgroups - */ - avefreei = 0; - goto fallback; - } - - return -1; -} - -static int find_group_other(struct super_block *sb, struct inode *parent) -{ - int parent_group = EXT3_I(parent)->i_block_group; - int ngroups = EXT3_SB(sb)->s_groups_count; - struct ext3_group_desc *desc; - int group, i; - - /* - * Try to place the inode in its parent directory - */ - group = parent_group; - desc = ext3_get_group_desc (sb, group, NULL); - if (desc && 
le16_to_cpu(desc->bg_free_inodes_count) && - le16_to_cpu(desc->bg_free_blocks_count)) - return group; - - /* - * We're going to place this inode in a different blockgroup from its - * parent. We want to cause files in a common directory to all land in - * the same blockgroup. But we want files which are in a different - * directory which shares a blockgroup with our parent to land in a - * different blockgroup. - * - * So add our directory's i_ino into the starting point for the hash. - */ - group = (group + parent->i_ino) % ngroups; - - /* - * Use a quadratic hash to find a group with a free inode and some free - * blocks. - */ - for (i = 1; i < ngroups; i <<= 1) { - group += i; - if (group >= ngroups) - group -= ngroups; - desc = ext3_get_group_desc (sb, group, NULL); - if (desc && le16_to_cpu(desc->bg_free_inodes_count) && - le16_to_cpu(desc->bg_free_blocks_count)) - return group; - } - - /* - * That failed: try linear search for a free inode, even if that group - * has no free blocks. - */ - group = parent_group; - for (i = 0; i < ngroups; i++) { - if (++group >= ngroups) - group = 0; - desc = ext3_get_group_desc (sb, group, NULL); - if (desc && le16_to_cpu(desc->bg_free_inodes_count)) - return group; - } - - return -1; -} - -/* - * There are two policies for allocating an inode. If the new inode is - * a directory, then a forward search is made for a block group with both - * free space and a low directory-to-inode ratio; if that fails, then of - * the groups with above-average free space, that group with the fewest - * directories already is chosen. - * - * For other inodes, search forward from the parent directory's block - * group to find a free inode. - */ -struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, - const struct qstr *qstr, umode_t mode) -{ - struct super_block *sb; - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *bh2; - int group; - unsigned long ino = 0; - struct inode * inode; - struct ext3_group_desc * gdp = NULL; - struct ext3_super_block * es; - struct ext3_inode_info *ei; - struct ext3_sb_info *sbi; - int err = 0; - struct inode *ret; - int i; - - /* Cannot create files in a deleted directory */ - if (!dir || !dir->i_nlink) - return ERR_PTR(-EPERM); - - sb = dir->i_sb; - trace_ext3_request_inode(dir, mode); - inode = new_inode(sb); - if (!inode) - return ERR_PTR(-ENOMEM); - ei = EXT3_I(inode); - - sbi = EXT3_SB(sb); - es = sbi->s_es; - if (S_ISDIR(mode)) - group = find_group_orlov(sb, dir); - else - group = find_group_other(sb, dir); - - err = -ENOSPC; - if (group == -1) - goto out; - - for (i = 0; i < sbi->s_groups_count; i++) { - err = -EIO; - - gdp = ext3_get_group_desc(sb, group, &bh2); - if (!gdp) - goto fail; - - brelse(bitmap_bh); - bitmap_bh = read_inode_bitmap(sb, group); - if (!bitmap_bh) - goto fail; - - ino = 0; - -repeat_in_this_group: - ino = ext3_find_next_zero_bit((unsigned long *) - bitmap_bh->b_data, EXT3_INODES_PER_GROUP(sb), ino); - if (ino < EXT3_INODES_PER_GROUP(sb)) { - - BUFFER_TRACE(bitmap_bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, bitmap_bh); - if (err) - goto fail; - - if (!ext3_set_bit_atomic(sb_bgl_lock(sbi, group), - ino, bitmap_bh->b_data)) { - /* we won it */ - BUFFER_TRACE(bitmap_bh, - "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, - bitmap_bh); - if (err) - goto fail; - goto got; - } - /* we lost it */ - journal_release_buffer(handle, bitmap_bh); - - if (++ino < EXT3_INODES_PER_GROUP(sb)) - goto repeat_in_this_group; - } - - /* - * This case 
is possible in concurrent environment. It is very - * rare. We cannot repeat the find_group_xxx() call because - * that will simply return the same blockgroup, because the - * group descriptor metadata has not yet been updated. - * So we just go onto the next blockgroup. - */ - if (++group == sbi->s_groups_count) - group = 0; - } - err = -ENOSPC; - goto out; - -got: - ino += group * EXT3_INODES_PER_GROUP(sb) + 1; - if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { - ext3_error (sb, "ext3_new_inode", - "reserved inode or inode > inodes count - " - "block_group = %d, inode=%lu", group, ino); - err = -EIO; - goto fail; - } - - BUFFER_TRACE(bh2, "get_write_access"); - err = ext3_journal_get_write_access(handle, bh2); - if (err) goto fail; - spin_lock(sb_bgl_lock(sbi, group)); - le16_add_cpu(&gdp->bg_free_inodes_count, -1); - if (S_ISDIR(mode)) { - le16_add_cpu(&gdp->bg_used_dirs_count, 1); - } - spin_unlock(sb_bgl_lock(sbi, group)); - BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, bh2); - if (err) goto fail; - - percpu_counter_dec(&sbi->s_freeinodes_counter); - if (S_ISDIR(mode)) - percpu_counter_inc(&sbi->s_dirs_counter); - - - if (test_opt(sb, GRPID)) { - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = dir->i_gid; - } else - inode_init_owner(inode, dir, mode); - - inode->i_ino = ino; - /* This is the optimal IO size (for stat), not the fs block size */ - inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; - - memset(ei->i_data, 0, sizeof(ei->i_data)); - ei->i_dir_start_lookup = 0; - ei->i_disksize = 0; - - ei->i_flags = - ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED); -#ifdef EXT3_FRAGMENTS - ei->i_faddr = 0; - ei->i_frag_no = 0; - ei->i_frag_size = 0; -#endif - ei->i_file_acl = 0; - ei->i_dir_acl = 0; - ei->i_dtime = 0; - ei->i_block_alloc_info = NULL; - ei->i_block_group = group; - - ext3_set_inode_flags(inode); - if (IS_DIRSYNC(inode)) - handle->h_sync = 1; - if (insert_inode_locked(inode) < 0) { - /* - * Likely a bitmap corruption causing inode to be allocated - * twice. 
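[Editor's note] The claim loop above wins an inode only if the atomic test-and-set on the bitmap finds the bit still clear; otherwise it releases the buffer and retries. A minimal userspace analogue of that win/lose pattern using C11 atomics; the word size and bit layout are invented for the example.

#include <stdatomic.h>
#include <stdio.h>

#define NBITS 32

static atomic_ulong bitmap;	/* only the low 32 bits stand in for "inode" bits */

/* Returns the claimed bit, or -1 if the word is already full. */
static int claim_first_zero_bit(void)
{
	unsigned long old, new;
	int bit;

	old = atomic_load(&bitmap);
	for (;;) {
		for (bit = 0; bit < NBITS; bit++)
			if (!(old & (1UL << bit)))
				break;
		if (bit == NBITS)
			return -1;		/* nothing free: try another group */
		new = old | (1UL << bit);
		/* "we won it" only if nobody set the bit in the meantime */
		if (atomic_compare_exchange_weak(&bitmap, &old, new))
			return bit;
		/* "we lost it": old now holds the current word, retry */
	}
}

int main(void)
{
	printf("claimed bit %d\n", claim_first_zero_bit());
	printf("claimed bit %d\n", claim_first_zero_bit());
	return 0;
}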
- */ - err = -EIO; - goto fail; - } - spin_lock(&sbi->s_next_gen_lock); - inode->i_generation = sbi->s_next_generation++; - spin_unlock(&sbi->s_next_gen_lock); - - ei->i_state_flags = 0; - ext3_set_inode_state(inode, EXT3_STATE_NEW); - - /* See comment in ext3_iget for explanation */ - if (ino >= EXT3_FIRST_INO(sb) + 1 && - EXT3_INODE_SIZE(sb) > EXT3_GOOD_OLD_INODE_SIZE) { - ei->i_extra_isize = - sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE; - } else { - ei->i_extra_isize = 0; - } - - ret = inode; - dquot_initialize(inode); - err = dquot_alloc_inode(inode); - if (err) - goto fail_drop; - - err = ext3_init_acl(handle, inode, dir); - if (err) - goto fail_free_drop; - - err = ext3_init_security(handle, inode, dir, qstr); - if (err) - goto fail_free_drop; - - err = ext3_mark_inode_dirty(handle, inode); - if (err) { - ext3_std_error(sb, err); - goto fail_free_drop; - } - - ext3_debug("allocating inode %lu\n", inode->i_ino); - trace_ext3_allocate_inode(inode, dir, mode); - goto really_out; -fail: - ext3_std_error(sb, err); -out: - iput(inode); - ret = ERR_PTR(err); -really_out: - brelse(bitmap_bh); - return ret; - -fail_free_drop: - dquot_free_inode(inode); - -fail_drop: - dquot_drop(inode); - inode->i_flags |= S_NOQUOTA; - clear_nlink(inode); - unlock_new_inode(inode); - iput(inode); - brelse(bitmap_bh); - return ERR_PTR(err); -} - -/* Verify that we are loading a valid orphan from disk */ -struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) -{ - unsigned long max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count); - unsigned long block_group; - int bit; - struct buffer_head *bitmap_bh; - struct inode *inode = NULL; - long err = -EIO; - - /* Error cases - e2fsck has already cleaned up for us */ - if (ino > max_ino) { - ext3_warning(sb, __func__, - "bad orphan ino %lu! e2fsck was run?", ino); - goto error; - } - - block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); - bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); - bitmap_bh = read_inode_bitmap(sb, block_group); - if (!bitmap_bh) { - ext3_warning(sb, __func__, - "inode bitmap error for orphan %lu", ino); - goto error; - } - - /* Having the inode bit set should be a 100% indicator that this - * is a valid orphan (no e2fsck run on fs). Orphans also include - * inodes that were being truncated, so we can't check i_nlink==0. - */ - if (!ext3_test_bit(bit, bitmap_bh->b_data)) - goto bad_orphan; - - inode = ext3_iget(sb, ino); - if (IS_ERR(inode)) - goto iget_failed; - - /* - * If the orphans has i_nlinks > 0 then it should be able to be - * truncated, otherwise it won't be removed from the orphan list - * during processing and an infinite loop will result. - */ - if (inode->i_nlink && !ext3_can_truncate(inode)) - goto bad_orphan; - - if (NEXT_ORPHAN(inode) > max_ino) - goto bad_orphan; - brelse(bitmap_bh); - return inode; - -iget_failed: - err = PTR_ERR(inode); - inode = NULL; -bad_orphan: - ext3_warning(sb, __func__, - "bad orphan inode %lu! 
e2fsck was run?", ino); - printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n", - bit, (unsigned long long)bitmap_bh->b_blocknr, - ext3_test_bit(bit, bitmap_bh->b_data)); - printk(KERN_NOTICE "inode=%p\n", inode); - if (inode) { - printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", - is_bad_inode(inode)); - printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", - NEXT_ORPHAN(inode)); - printk(KERN_NOTICE "max_ino=%lu\n", max_ino); - printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); - /* Avoid freeing blocks if we got a bad deleted inode */ - if (inode->i_nlink == 0) - inode->i_blocks = 0; - iput(inode); - } - brelse(bitmap_bh); -error: - return ERR_PTR(err); -} - -unsigned long ext3_count_free_inodes (struct super_block * sb) -{ - unsigned long desc_count; - struct ext3_group_desc *gdp; - int i; -#ifdef EXT3FS_DEBUG - struct ext3_super_block *es; - unsigned long bitmap_count, x; - struct buffer_head *bitmap_bh = NULL; - - es = EXT3_SB(sb)->s_es; - desc_count = 0; - bitmap_count = 0; - gdp = NULL; - for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { - gdp = ext3_get_group_desc (sb, i, NULL); - if (!gdp) - continue; - desc_count += le16_to_cpu(gdp->bg_free_inodes_count); - brelse(bitmap_bh); - bitmap_bh = read_inode_bitmap(sb, i); - if (!bitmap_bh) - continue; - - x = ext3_count_free(bitmap_bh, EXT3_INODES_PER_GROUP(sb) / 8); - printk("group %d: stored = %d, counted = %lu\n", - i, le16_to_cpu(gdp->bg_free_inodes_count), x); - bitmap_count += x; - } - brelse(bitmap_bh); - printk("ext3_count_free_inodes: stored = %u, computed = %lu, %lu\n", - le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); - return desc_count; -#else - desc_count = 0; - for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { - gdp = ext3_get_group_desc (sb, i, NULL); - if (!gdp) - continue; - desc_count += le16_to_cpu(gdp->bg_free_inodes_count); - cond_resched(); - } - return desc_count; -#endif -} - -/* Called at mount-time, super-block is locked */ -unsigned long ext3_count_dirs (struct super_block * sb) -{ - unsigned long count = 0; - int i; - - for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { - struct ext3_group_desc *gdp = ext3_get_group_desc (sb, i, NULL); - if (!gdp) - continue; - count += le16_to_cpu(gdp->bg_used_dirs_count); - } - return count; -} - diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c deleted file mode 100644 index 6c7e5468a2f8..000000000000 --- a/fs/ext3/inode.c +++ /dev/null @@ -1,3574 +0,0 @@ -/* - * linux/fs/ext3/inode.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/inode.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * Goal-directed block allocation by Stephen Tweedie - * (sct@redhat.com), 1993, 1998 - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 - * 64-bit file support on 64-bit platforms by Jakub Jelinek - * (jj@sunsite.ms.mff.cuni.cz) - * - * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 - */ - -#include <linux/highuid.h> -#include <linux/quotaops.h> -#include <linux/writeback.h> -#include <linux/mpage.h> -#include <linux/namei.h> -#include <linux/uio.h> -#include "ext3.h" -#include "xattr.h" -#include "acl.h" - -static int ext3_writepage_trans_blocks(struct inode *inode); -static int ext3_block_truncate_page(struct inode *inode, loff_t from); - -/* - * Test whether an inode is a fast symlink. 
- */ -static int ext3_inode_is_fast_symlink(struct inode *inode) -{ - int ea_blocks = EXT3_I(inode)->i_file_acl ? - (inode->i_sb->s_blocksize >> 9) : 0; - - return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); -} - -/* - * The ext3 forget function must perform a revoke if we are freeing data - * which has been journaled. Metadata (eg. indirect blocks) must be - * revoked in all cases. - * - * "bh" may be NULL: a metadata block may have been freed from memory - * but there may still be a record of it in the journal, and that record - * still needs to be revoked. - */ -int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, - struct buffer_head *bh, ext3_fsblk_t blocknr) -{ - int err; - - might_sleep(); - - trace_ext3_forget(inode, is_metadata, blocknr); - BUFFER_TRACE(bh, "enter"); - - jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " - "data mode %lx\n", - bh, is_metadata, inode->i_mode, - test_opt(inode->i_sb, DATA_FLAGS)); - - /* Never use the revoke function if we are doing full data - * journaling: there is no need to, and a V1 superblock won't - * support it. Otherwise, only skip the revoke on un-journaled - * data blocks. */ - - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA || - (!is_metadata && !ext3_should_journal_data(inode))) { - if (bh) { - BUFFER_TRACE(bh, "call journal_forget"); - return ext3_journal_forget(handle, bh); - } - return 0; - } - - /* - * data!=journal && (is_metadata || should_journal_data(inode)) - */ - BUFFER_TRACE(bh, "call ext3_journal_revoke"); - err = ext3_journal_revoke(handle, blocknr, bh); - if (err) - ext3_abort(inode->i_sb, __func__, - "error %d when attempting revoke", err); - BUFFER_TRACE(bh, "exit"); - return err; -} - -/* - * Work out how many blocks we need to proceed with the next chunk of a - * truncate transaction. - */ -static unsigned long blocks_for_truncate(struct inode *inode) -{ - unsigned long needed; - - needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); - - /* Give ourselves just enough room to cope with inodes in which - * i_blocks is corrupt: we've seen disk corruptions in the past - * which resulted in random data in an inode which looked enough - * like a regular file for ext3 to try to delete it. Things - * will go a bit crazy if that happens, but at least we should - * try not to panic the whole kernel. */ - if (needed < 2) - needed = 2; - - /* But we need to bound the transaction so we don't overflow the - * journal. */ - if (needed > EXT3_MAX_TRANS_DATA) - needed = EXT3_MAX_TRANS_DATA; - - return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed; -} - -/* - * Truncate transactions can be complex and absolutely huge. So we need to - * be able to restart the transaction at a conventient checkpoint to make - * sure we don't overflow the journal. - * - * start_transaction gets us a new handle for a truncate transaction, - * and extend_transaction tries to extend the existing one a bit. If - * extend fails, we need to propagate the failure up and restart the - * transaction in the top-level truncate loop. --sct - */ -static handle_t *start_transaction(struct inode *inode) -{ - handle_t *result; - - result = ext3_journal_start(inode, blocks_for_truncate(inode)); - if (!IS_ERR(result)) - return result; - - ext3_std_error(inode->i_sb, PTR_ERR(result)); - return result; -} - -/* - * Try to extend this transaction for the purposes of truncation. - * - * Returns 0 if we managed to create more room. 
If we can't create more - * room, and the transaction must be restarted we return 1. - */ -static int try_to_extend_transaction(handle_t *handle, struct inode *inode) -{ - if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS) - return 0; - if (!ext3_journal_extend(handle, blocks_for_truncate(inode))) - return 0; - return 1; -} - -/* - * Restart the transaction associated with *handle. This does a commit, - * so before we call here everything must be consistently dirtied against - * this transaction. - */ -static int truncate_restart_transaction(handle_t *handle, struct inode *inode) -{ - int ret; - - jbd_debug(2, "restarting handle %p\n", handle); - /* - * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle - * At this moment, get_block can be called only for blocks inside - * i_size since page cache has been already dropped and writes are - * blocked by i_mutex. So we can safely drop the truncate_mutex. - */ - mutex_unlock(&EXT3_I(inode)->truncate_mutex); - ret = ext3_journal_restart(handle, blocks_for_truncate(inode)); - mutex_lock(&EXT3_I(inode)->truncate_mutex); - return ret; -} - -/* - * Called at inode eviction from icache - */ -void ext3_evict_inode (struct inode *inode) -{ - struct ext3_inode_info *ei = EXT3_I(inode); - struct ext3_block_alloc_info *rsv; - handle_t *handle; - int want_delete = 0; - - trace_ext3_evict_inode(inode); - if (!inode->i_nlink && !is_bad_inode(inode)) { - dquot_initialize(inode); - want_delete = 1; - } - - /* - * When journalling data dirty buffers are tracked only in the journal. - * So although mm thinks everything is clean and ready for reaping the - * inode might still have some pages to write in the running - * transaction or waiting to be checkpointed. Thus calling - * journal_invalidatepage() (via truncate_inode_pages()) to discard - * these buffers can cause data loss. Also even if we did not discard - * these buffers, we would have no way to find them after the inode - * is reaped and thus user could see stale data if he tries to read - * them before the transaction is checkpointed. So be careful and - * force everything to disk here... We use ei->i_datasync_tid to - * store the newest transaction containing inode's data. - * - * Note that directories do not have this problem because they don't - * use page cache. - * - * The s_journal check handles the case when ext3_get_journal() fails - * and puts the journal inode. - */ - if (inode->i_nlink && ext3_should_journal_data(inode) && - EXT3_SB(inode->i_sb)->s_journal && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && - inode->i_ino != EXT3_JOURNAL_INO) { - tid_t commit_tid = atomic_read(&ei->i_datasync_tid); - journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; - - log_start_commit(journal, commit_tid); - log_wait_commit(journal, commit_tid); - filemap_write_and_wait(&inode->i_data); - } - truncate_inode_pages_final(&inode->i_data); - - ext3_discard_reservation(inode); - rsv = ei->i_block_alloc_info; - ei->i_block_alloc_info = NULL; - if (unlikely(rsv)) - kfree(rsv); - - if (!want_delete) - goto no_delete; - - handle = start_transaction(inode); - if (IS_ERR(handle)) { - /* - * If we're going to skip the normal cleanup, we still need to - * make sure that the in-core orphan linked list is properly - * cleaned up. - */ - ext3_orphan_del(NULL, inode); - goto no_delete; - } - - if (IS_SYNC(inode)) - handle->h_sync = 1; - inode->i_size = 0; - if (inode->i_blocks) - ext3_truncate(inode); - /* - * Kill off the orphan record created when the inode lost the last - * link. 
Note that ext3_orphan_del() has to be able to cope with the - * deletion of a non-existent orphan - ext3_truncate() could - * have removed the record. - */ - ext3_orphan_del(handle, inode); - ei->i_dtime = get_seconds(); - - /* - * One subtle ordering requirement: if anything has gone wrong - * (transaction abort, IO errors, whatever), then we can still - * do these next steps (the fs will already have been marked as - * having errors), but we can't free the inode if the mark_dirty - * fails. - */ - if (ext3_mark_inode_dirty(handle, inode)) { - /* If that failed, just dquot_drop() and be done with that */ - dquot_drop(inode); - clear_inode(inode); - } else { - ext3_xattr_delete_inode(handle, inode); - dquot_free_inode(inode); - dquot_drop(inode); - clear_inode(inode); - ext3_free_inode(handle, inode); - } - ext3_journal_stop(handle); - return; -no_delete: - clear_inode(inode); - dquot_drop(inode); -} - -typedef struct { - __le32 *p; - __le32 key; - struct buffer_head *bh; -} Indirect; - -static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) -{ - p->key = *(p->p = v); - p->bh = bh; -} - -static int verify_chain(Indirect *from, Indirect *to) -{ - while (from <= to && from->key == *from->p) - from++; - return (from > to); -} - -/** - * ext3_block_to_path - parse the block number into array of offsets - * @inode: inode in question (we are only interested in its superblock) - * @i_block: block number to be parsed - * @offsets: array to store the offsets in - * @boundary: set this non-zero if the referred-to block is likely to be - * followed (on disk) by an indirect block. - * - * To store the locations of file's data ext3 uses a data structure common - * for UNIX filesystems - tree of pointers anchored in the inode, with - * data blocks at leaves and indirect blocks in intermediate nodes. - * This function translates the block number into path in that tree - - * return value is the path length and @offsets[n] is the offset of - * pointer to (n+1)th node in the nth one. If @block is out of range - * (negative or too large) warning is printed and zero returned. - * - * Note: function doesn't find node addresses, so no IO is needed. All - * we need to know is the capacity of indirect blocks (taken from the - * inode->i_sb). - */ - -/* - * Portability note: the last comparison (check that we fit into triple - * indirect block) is spelled differently, because otherwise on an - * architecture with 32-bit longs and 8Kb pages we might get into trouble - * if our filesystem had 8Kb blocks. We might use long long, but that would - * kill us on x86. Oh, well, at least the sign propagation does not matter - - * i_block would have to be negative in the very beginning, so we would not - * get there at all. 
- */ - -static int ext3_block_to_path(struct inode *inode, - long i_block, int offsets[4], int *boundary) -{ - int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb); - int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb); - const long direct_blocks = EXT3_NDIR_BLOCKS, - indirect_blocks = ptrs, - double_blocks = (1 << (ptrs_bits * 2)); - int n = 0; - int final = 0; - - if (i_block < 0) { - ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0"); - } else if (i_block < direct_blocks) { - offsets[n++] = i_block; - final = direct_blocks; - } else if ( (i_block -= direct_blocks) < indirect_blocks) { - offsets[n++] = EXT3_IND_BLOCK; - offsets[n++] = i_block; - final = ptrs; - } else if ((i_block -= indirect_blocks) < double_blocks) { - offsets[n++] = EXT3_DIND_BLOCK; - offsets[n++] = i_block >> ptrs_bits; - offsets[n++] = i_block & (ptrs - 1); - final = ptrs; - } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { - offsets[n++] = EXT3_TIND_BLOCK; - offsets[n++] = i_block >> (ptrs_bits * 2); - offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); - offsets[n++] = i_block & (ptrs - 1); - final = ptrs; - } else { - ext3_warning(inode->i_sb, "ext3_block_to_path", "block > big"); - } - if (boundary) - *boundary = final - 1 - (i_block & (ptrs - 1)); - return n; -} - -/** - * ext3_get_branch - read the chain of indirect blocks leading to data - * @inode: inode in question - * @depth: depth of the chain (1 - direct pointer, etc.) - * @offsets: offsets of pointers in inode/indirect blocks - * @chain: place to store the result - * @err: here we store the error value - * - * Function fills the array of triples <key, p, bh> and returns %NULL - * if everything went OK or the pointer to the last filled triple - * (incomplete one) otherwise. Upon the return chain[i].key contains - * the number of (i+1)-th block in the chain (as it is stored in memory, - * i.e. little-endian 32-bit), chain[i].p contains the address of that - * number (it points into struct inode for i==0 and into the bh->b_data - * for i>0) and chain[i].bh points to the buffer_head of i-th indirect - * block for i>0 and NULL for i==0. In other words, it holds the block - * numbers of the chain, addresses they were taken from (and where we can - * verify that chain did not change) and buffer_heads hosting these - * numbers. - * - * Function stops when it stumbles upon zero pointer (absent block) - * (pointer to last triple returned, *@err == 0) - * or when it gets an IO error reading an indirect block - * (ditto, *@err == -EIO) - * or when it notices that chain had been changed while it was reading - * (ditto, *@err == -EAGAIN) - * or when it reads all @depth-1 indirect blocks successfully and finds - * the whole chain, all way to the data (returns %NULL, *err == 0). 
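[Editor's note] ext3_block_to_path() above is purely arithmetic, so it can be exercised in userspace. A sketch assuming 4KiB blocks, i.e. 1024 pointers per indirect block and the standard 12 direct slots; the sample block numbers are arbitrary.

#include <stdio.h>

#define NDIR		12	/* EXT3_NDIR_BLOCKS */
#define PTRS		1024	/* 4096-byte block / 4-byte pointer */
#define PTRS_BITS	10

static int block_to_path(long i_block, int offsets[4])
{
	const long double_blocks = 1L << (PTRS_BITS * 2);
	int n = 0;

	if (i_block < NDIR) {
		offsets[n++] = i_block;
	} else if ((i_block -= NDIR) < PTRS) {
		offsets[n++] = 12;			/* EXT3_IND_BLOCK */
		offsets[n++] = i_block;
	} else if ((i_block -= PTRS) < double_blocks) {
		offsets[n++] = 13;			/* EXT3_DIND_BLOCK */
		offsets[n++] = i_block >> PTRS_BITS;
		offsets[n++] = i_block & (PTRS - 1);
	} else if (((i_block -= double_blocks) >> (PTRS_BITS * 2)) < PTRS) {
		offsets[n++] = 14;			/* EXT3_TIND_BLOCK */
		offsets[n++] = i_block >> (PTRS_BITS * 2);
		offsets[n++] = (i_block >> PTRS_BITS) & (PTRS - 1);
		offsets[n++] = i_block & (PTRS - 1);
	}
	return n;	/* depth of the chain; 0 means out of range */
}

int main(void)
{
	long blocks[] = { 5, 500, 2000000 };
	int off[4], depth, i, j;

	for (i = 0; i < 3; i++) {
		depth = block_to_path(blocks[i], off);
		printf("block %7ld -> depth %d, offsets", blocks[i], depth);
		for (j = 0; j < depth; j++)
			printf(" %d", off[j]);
		printf("\n");
	}
	return 0;
}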
- */ -static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets, - Indirect chain[4], int *err) -{ - struct super_block *sb = inode->i_sb; - Indirect *p = chain; - struct buffer_head *bh; - - *err = 0; - /* i_data is not going away, no lock needed */ - add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets); - if (!p->key) - goto no_block; - while (--depth) { - bh = sb_bread(sb, le32_to_cpu(p->key)); - if (!bh) - goto failure; - /* Reader: pointers */ - if (!verify_chain(chain, p)) - goto changed; - add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); - /* Reader: end */ - if (!p->key) - goto no_block; - } - return NULL; - -changed: - brelse(bh); - *err = -EAGAIN; - goto no_block; -failure: - *err = -EIO; -no_block: - return p; -} - -/** - * ext3_find_near - find a place for allocation with sufficient locality - * @inode: owner - * @ind: descriptor of indirect block. - * - * This function returns the preferred place for block allocation. - * It is used when heuristic for sequential allocation fails. - * Rules are: - * + if there is a block to the left of our position - allocate near it. - * + if pointer will live in indirect block - allocate near that block. - * + if pointer will live in inode - allocate in the same - * cylinder group. - * - * In the latter case we colour the starting block by the callers PID to - * prevent it from clashing with concurrent allocations for a different inode - * in the same block group. The PID is used here so that functionally related - * files will be close-by on-disk. - * - * Caller must make sure that @ind is valid and will stay that way. - */ -static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind) -{ - struct ext3_inode_info *ei = EXT3_I(inode); - __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; - __le32 *p; - ext3_fsblk_t bg_start; - ext3_grpblk_t colour; - - /* Try to find previous block */ - for (p = ind->p - 1; p >= start; p--) { - if (*p) - return le32_to_cpu(*p); - } - - /* No such thing, so let's try location of indirect block */ - if (ind->bh) - return ind->bh->b_blocknr; - - /* - * It is going to be referred to from the inode itself? OK, just put it - * into the same cylinder group then. - */ - bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group); - colour = (current->pid % 16) * - (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); - return bg_start + colour; -} - -/** - * ext3_find_goal - find a preferred place for allocation. - * @inode: owner - * @block: block we want - * @partial: pointer to the last triple within a chain - * - * Normally this function find the preferred place for block allocation, - * returns it. - */ - -static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block, - Indirect *partial) -{ - struct ext3_block_alloc_info *block_i; - - block_i = EXT3_I(inode)->i_block_alloc_info; - - /* - * try the heuristic for sequential allocation, - * failing that at least try to get decent locality. - */ - if (block_i && (block == block_i->last_alloc_logical_block + 1) - && (block_i->last_alloc_physical_block != 0)) { - return block_i->last_alloc_physical_block + 1; - } - - return ext3_find_near(inode, partial); -} - -/** - * ext3_blks_to_allocate - Look up the block map and count the number - * of direct blocks need to be allocated for the given branch. - * - * @branch: chain of indirect blocks - * @k: number of blocks need for indirect blocks - * @blks: number of data blocks to be mapped. 
- * @blocks_to_boundary: the offset in the indirect block - * - * return the total number of blocks to be allocate, including the - * direct and indirect blocks. - */ -static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks, - int blocks_to_boundary) -{ - unsigned long count = 0; - - /* - * Simple case, [t,d]Indirect block(s) has not allocated yet - * then it's clear blocks on that path have not allocated - */ - if (k > 0) { - /* right now we don't handle cross boundary allocation */ - if (blks < blocks_to_boundary + 1) - count += blks; - else - count += blocks_to_boundary + 1; - return count; - } - - count++; - while (count < blks && count <= blocks_to_boundary && - le32_to_cpu(*(branch[0].p + count)) == 0) { - count++; - } - return count; -} - -/** - * ext3_alloc_blocks - multiple allocate blocks needed for a branch - * @handle: handle for this transaction - * @inode: owner - * @goal: preferred place for allocation - * @indirect_blks: the number of blocks need to allocate for indirect - * blocks - * @blks: number of blocks need to allocated for direct blocks - * @new_blocks: on return it will store the new block numbers for - * the indirect blocks(if needed) and the first direct block, - * @err: here we store the error value - * - * return the number of direct blocks allocated - */ -static int ext3_alloc_blocks(handle_t *handle, struct inode *inode, - ext3_fsblk_t goal, int indirect_blks, int blks, - ext3_fsblk_t new_blocks[4], int *err) -{ - int target, i; - unsigned long count = 0; - int index = 0; - ext3_fsblk_t current_block = 0; - int ret = 0; - - /* - * Here we try to allocate the requested multiple blocks at once, - * on a best-effort basis. - * To build a branch, we should allocate blocks for - * the indirect blocks(if not allocated yet), and at least - * the first direct block of this branch. That's the - * minimum number of blocks need to allocate(required) - */ - target = blks + indirect_blks; - - while (1) { - count = target; - /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext3_new_blocks(handle,inode,goal,&count,err); - if (*err) - goto failed_out; - - target -= count; - /* allocate blocks for indirect blocks */ - while (index < indirect_blks && count) { - new_blocks[index++] = current_block++; - count--; - } - - if (count > 0) - break; - } - - /* save the new block number for the first direct block */ - new_blocks[index] = current_block; - - /* total number of blocks allocated for direct blocks */ - ret = count; - *err = 0; - return ret; -failed_out: - for (i = 0; i <index; i++) - ext3_free_blocks(handle, inode, new_blocks[i], 1); - return ret; -} - -/** - * ext3_alloc_branch - allocate and set up a chain of blocks. - * @handle: handle for this transaction - * @inode: owner - * @indirect_blks: number of allocated indirect blocks - * @blks: number of allocated direct blocks - * @goal: preferred place for allocation - * @offsets: offsets (in the blocks) to store the pointers to next. - * @branch: place to store the chain in. - * - * This function allocates blocks, zeroes out all but the last one, - * links them into chain and (if we are synchronous) writes them to disk. - * In other words, it prepares a branch that can be spliced onto the - * inode. It stores the information about that chain in the branch[], in - * the same format as ext3_get_branch() would do. We are calling it after - * we had read the existing part of chain and partial points to the last - * triple of that (one with zero ->key). 
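[Editor's note] ext3_blks_to_allocate() above only counts how far a single allocation can extend: up to the boundary when indirect blocks are still missing, otherwise across the run of zero pointers. A standalone sketch of the same two cases with an invented pointer array.

#include <stdio.h>

typedef unsigned int le32;	/* stand-in for the on-disk 32-bit pointer */

static int blks_to_allocate(const le32 *p, int k, unsigned long blks,
			    int blocks_to_boundary)
{
	unsigned long count = 0;

	if (k > 0) {
		/* indirect block(s) missing: the whole tail is unmapped,
		 * but do not cross the indirect-block boundary */
		if (blks < (unsigned long)blocks_to_boundary + 1)
			count += blks;
		else
			count += blocks_to_boundary + 1;
		return count;
	}

	count++;
	while (count < blks && count <= (unsigned long)blocks_to_boundary &&
	       p[count] == 0)
		count++;
	return count;
}

int main(void)
{
	le32 ptrs[8] = { 0, 0, 0, 77, 0, 0, 0, 0 };	/* 77 = already mapped */

	printf("%d\n", blks_to_allocate(ptrs, 0, 6, 7));	/* 3: stops at 77 */
	printf("%d\n", blks_to_allocate(ptrs, 2, 6, 3));	/* 4: boundary + 1 */
	return 0;
}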
Upon the exit we have the same - * picture as after the successful ext3_get_block(), except that in one - * place chain is disconnected - *branch->p is still zero (we did not - * set the last link), but branch->key contains the number that should - * be placed into *branch->p to fill that gap. - * - * If allocation fails we free all blocks we've allocated (and forget - * their buffer_heads) and return the error value the from failed - * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain - * as described above and return 0. - */ -static int ext3_alloc_branch(handle_t *handle, struct inode *inode, - int indirect_blks, int *blks, ext3_fsblk_t goal, - int *offsets, Indirect *branch) -{ - int blocksize = inode->i_sb->s_blocksize; - int i, n = 0; - int err = 0; - struct buffer_head *bh; - int num; - ext3_fsblk_t new_blocks[4]; - ext3_fsblk_t current_block; - - num = ext3_alloc_blocks(handle, inode, goal, indirect_blks, - *blks, new_blocks, &err); - if (err) - return err; - - branch[0].key = cpu_to_le32(new_blocks[0]); - /* - * metadata blocks and data blocks are allocated. - */ - for (n = 1; n <= indirect_blks; n++) { - /* - * Get buffer_head for parent block, zero it out - * and set the pointer to new one, then send - * parent to disk. - */ - bh = sb_getblk(inode->i_sb, new_blocks[n-1]); - if (unlikely(!bh)) { - err = -ENOMEM; - goto failed; - } - branch[n].bh = bh; - lock_buffer(bh); - BUFFER_TRACE(bh, "call get_create_access"); - err = ext3_journal_get_create_access(handle, bh); - if (err) { - unlock_buffer(bh); - brelse(bh); - goto failed; - } - - memset(bh->b_data, 0, blocksize); - branch[n].p = (__le32 *) bh->b_data + offsets[n]; - branch[n].key = cpu_to_le32(new_blocks[n]); - *branch[n].p = branch[n].key; - if ( n == indirect_blks) { - current_block = new_blocks[n]; - /* - * End of chain, update the last new metablock of - * the chain to point to the new allocated - * data blocks numbers - */ - for (i=1; i < num; i++) - *(branch[n].p + i) = cpu_to_le32(++current_block); - } - BUFFER_TRACE(bh, "marking uptodate"); - set_buffer_uptodate(bh); - unlock_buffer(bh); - - BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, bh); - if (err) - goto failed; - } - *blks = num; - return err; -failed: - /* Allocation failed, free what we already allocated */ - for (i = 1; i <= n ; i++) { - BUFFER_TRACE(branch[i].bh, "call journal_forget"); - ext3_journal_forget(handle, branch[i].bh); - } - for (i = 0; i < indirect_blks; i++) - ext3_free_blocks(handle, inode, new_blocks[i], 1); - - ext3_free_blocks(handle, inode, new_blocks[i], num); - - return err; -} - -/** - * ext3_splice_branch - splice the allocated branch onto inode. - * @handle: handle for this transaction - * @inode: owner - * @block: (logical) number of block we are adding - * @where: location of missing link - * @num: number of indirect blocks we are adding - * @blks: number of direct blocks we are adding - * - * This function fills the missing link and does all housekeeping needed in - * inode (->i_blocks, etc.). In case of success we end up with the full - * chain to new block and return 0. 
- */ -static int ext3_splice_branch(handle_t *handle, struct inode *inode, - long block, Indirect *where, int num, int blks) -{ - int i; - int err = 0; - struct ext3_block_alloc_info *block_i; - ext3_fsblk_t current_block; - struct ext3_inode_info *ei = EXT3_I(inode); - struct timespec now; - - block_i = ei->i_block_alloc_info; - /* - * If we're splicing into a [td]indirect block (as opposed to the - * inode) then we need to get write access to the [td]indirect block - * before the splice. - */ - if (where->bh) { - BUFFER_TRACE(where->bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, where->bh); - if (err) - goto err_out; - } - /* That's it */ - - *where->p = where->key; - - /* - * Update the host buffer_head or inode to point to more just allocated - * direct blocks blocks - */ - if (num == 0 && blks > 1) { - current_block = le32_to_cpu(where->key) + 1; - for (i = 1; i < blks; i++) - *(where->p + i ) = cpu_to_le32(current_block++); - } - - /* - * update the most recently allocated logical & physical block - * in i_block_alloc_info, to assist find the proper goal block for next - * allocation - */ - if (block_i) { - block_i->last_alloc_logical_block = block + blks - 1; - block_i->last_alloc_physical_block = - le32_to_cpu(where[num].key) + blks - 1; - } - - /* We are done with atomic stuff, now do the rest of housekeeping */ - now = CURRENT_TIME_SEC; - if (!timespec_equal(&inode->i_ctime, &now) || !where->bh) { - inode->i_ctime = now; - ext3_mark_inode_dirty(handle, inode); - } - /* ext3_mark_inode_dirty already updated i_sync_tid */ - atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); - - /* had we spliced it onto indirect block? */ - if (where->bh) { - /* - * If we spliced it onto an indirect block, we haven't - * altered the inode. Note however that if it is being spliced - * onto an indirect block at the very end of the file (the - * file is growing) then we *will* alter the inode to reflect - * the new i_size. But that is not done here - it is done in - * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode. - */ - jbd_debug(5, "splicing indirect only\n"); - BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, where->bh); - if (err) - goto err_out; - } else { - /* - * OK, we spliced it into the inode itself on a direct block. - * Inode was dirtied above. - */ - jbd_debug(5, "splicing direct\n"); - } - return err; - -err_out: - for (i = 1; i <= num; i++) { - BUFFER_TRACE(where[i].bh, "call journal_forget"); - ext3_journal_forget(handle, where[i].bh); - ext3_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1); - } - ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks); - - return err; -} - -/* - * Allocation strategy is simple: if we have to allocate something, we will - * have to go the whole way to leaf. So let's do it before attaching anything - * to tree, set linkage between the newborn blocks, write them if sync is - * required, recheck the path, free and repeat if check fails, otherwise - * set the last missing link (that will protect us from any truncate-generated - * removals - all blocks on the path are immune now) and possibly force the - * write on the parent block. - * That has a nice additional property: no special recovery from the failed - * allocations is needed - we simply release blocks and do not touch anything - * reachable from inode. - * - * `handle' can be NULL if create == 0. - * - * The BKL may not be held on entry here. Be sure to take it early. 
- * return > 0, # of blocks mapped or allocated. - * return = 0, if plain lookup failed. - * return < 0, error case. - */ -int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, - sector_t iblock, unsigned long maxblocks, - struct buffer_head *bh_result, - int create) -{ - int err = -EIO; - int offsets[4]; - Indirect chain[4]; - Indirect *partial; - ext3_fsblk_t goal; - int indirect_blks; - int blocks_to_boundary = 0; - int depth; - struct ext3_inode_info *ei = EXT3_I(inode); - int count = 0; - ext3_fsblk_t first_block = 0; - - - trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create); - J_ASSERT(handle != NULL || create == 0); - depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary); - - if (depth == 0) - goto out; - - partial = ext3_get_branch(inode, depth, offsets, chain, &err); - - /* Simplest case - block found, no allocation needed */ - if (!partial) { - first_block = le32_to_cpu(chain[depth - 1].key); - clear_buffer_new(bh_result); - count++; - /*map more blocks*/ - while (count < maxblocks && count <= blocks_to_boundary) { - ext3_fsblk_t blk; - - if (!verify_chain(chain, chain + depth - 1)) { - /* - * Indirect block might be removed by - * truncate while we were reading it. - * Handling of that case: forget what we've - * got now. Flag the err as EAGAIN, so it - * will reread. - */ - err = -EAGAIN; - count = 0; - break; - } - blk = le32_to_cpu(*(chain[depth-1].p + count)); - - if (blk == first_block + count) - count++; - else - break; - } - if (err != -EAGAIN) - goto got_it; - } - - /* Next simple case - plain lookup or failed read of indirect block */ - if (!create || err == -EIO) - goto cleanup; - - /* - * Block out ext3_truncate while we alter the tree - */ - mutex_lock(&ei->truncate_mutex); - - /* - * If the indirect block is missing while we are reading - * the chain(ext3_get_branch() returns -EAGAIN err), or - * if the chain has been changed after we grab the semaphore, - * (either because another process truncated this branch, or - * another get_block allocated this branch) re-grab the chain to see if - * the request block has been allocated or not. - * - * Since we already block the truncate/other get_block - * at this point, we will have the current copy of the chain when we - * splice the branch into the tree. - */ - if (err == -EAGAIN || !verify_chain(chain, partial)) { - while (partial > chain) { - brelse(partial->bh); - partial--; - } - partial = ext3_get_branch(inode, depth, offsets, chain, &err); - if (!partial) { - count++; - mutex_unlock(&ei->truncate_mutex); - if (err) - goto cleanup; - clear_buffer_new(bh_result); - goto got_it; - } - } - - /* - * Okay, we need to do block allocation. Lazily initialize the block - * allocation info here if necessary - */ - if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) - ext3_init_block_alloc_info(inode); - - goal = ext3_find_goal(inode, iblock, partial); - - /* the number of blocks need to allocate for [d,t]indirect blocks */ - indirect_blks = (chain + depth) - partial - 1; - - /* - * Next look up the indirect map to count the totoal number of - * direct blocks to allocate for this branch. 
- */ - count = ext3_blks_to_allocate(partial, indirect_blks, - maxblocks, blocks_to_boundary); - err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal, - offsets + (partial - chain), partial); - - /* - * The ext3_splice_branch call will free and forget any buffers - * on the new chain if there is a failure, but that risks using - * up transaction credits, especially for bitmaps where the - * credits cannot be returned. Can we handle this somehow? We - * may need to return -EAGAIN upwards in the worst case. --sct - */ - if (!err) - err = ext3_splice_branch(handle, inode, iblock, - partial, indirect_blks, count); - mutex_unlock(&ei->truncate_mutex); - if (err) - goto cleanup; - - set_buffer_new(bh_result); -got_it: - map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); - if (count > blocks_to_boundary) - set_buffer_boundary(bh_result); - err = count; - /* Clean up and exit */ - partial = chain + depth - 1; /* the whole chain */ -cleanup: - while (partial > chain) { - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - partial--; - } - BUFFER_TRACE(bh_result, "returned"); -out: - trace_ext3_get_blocks_exit(inode, iblock, - depth ? le32_to_cpu(chain[depth-1].key) : 0, - count, err); - return err; -} - -/* Maximum number of blocks we map for direct IO at once. */ -#define DIO_MAX_BLOCKS 4096 -/* - * Number of credits we need for writing DIO_MAX_BLOCKS: - * We need sb + group descriptor + bitmap + inode -> 4 - * For B blocks with A block pointers per block we need: - * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect). - * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25. - */ -#define DIO_CREDITS 25 - -static int ext3_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - handle_t *handle = ext3_journal_current_handle(); - int ret = 0, started = 0; - unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; - - if (create && !handle) { /* Direct IO write... */ - if (max_blocks > DIO_MAX_BLOCKS) - max_blocks = DIO_MAX_BLOCKS; - handle = ext3_journal_start(inode, DIO_CREDITS + - EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - started = 1; - } - - ret = ext3_get_blocks_handle(handle, inode, iblock, - max_blocks, bh_result, create); - if (ret > 0) { - bh_result->b_size = (ret << inode->i_blkbits); - ret = 0; - } - if (started) - ext3_journal_stop(handle); -out: - return ret; -} - -int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - u64 start, u64 len) -{ - return generic_block_fiemap(inode, fieinfo, start, len, - ext3_get_block); -} - -/* - * `handle' can be NULL if create is zero - */ -struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode, - long block, int create, int *errp) -{ - struct buffer_head dummy; - int fatal = 0, err; - - J_ASSERT(handle != NULL || create == 0); - - dummy.b_state = 0; - dummy.b_blocknr = -1000; - buffer_trace_init(&dummy.b_history); - err = ext3_get_blocks_handle(handle, inode, block, 1, - &dummy, create); - /* - * ext3_get_blocks_handle() returns number of blocks - * mapped. 0 in case of a HOLE. 
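[Editor's note] The DIO_CREDITS comment above is plain arithmetic: 4 fixed blocks (sb, group descriptor, bitmap, inode) plus one triple indirect, B/(A*A)+2 double indirect and B/A+2 indirect blocks. A sketch that evaluates it for the quoted 1KiB case and, as an extra illustration, for a 4KiB block size.

#include <stdio.h>

/* credits(B, A) = sb + group desc + bitmap + inode (4)
 *              + 1 triple indirect
 *              + B/(A*A) + 2 double indirect
 *              + B/A     + 2 indirect                     */
static unsigned long dio_credits(unsigned long B, unsigned long A)
{
	return 4 + 1 + (B / (A * A) + 2) + (B / A + 2);
}

int main(void)
{
	printf("1KiB blocks (A=256):  %lu\n", dio_credits(4096, 256));	/* 25 */
	printf("4KiB blocks (A=1024): %lu\n", dio_credits(4096, 1024));	/* 13 */
	return 0;
}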
- */ - if (err > 0) { - WARN_ON(err > 1); - err = 0; - } - *errp = err; - if (!err && buffer_mapped(&dummy)) { - struct buffer_head *bh; - bh = sb_getblk(inode->i_sb, dummy.b_blocknr); - if (unlikely(!bh)) { - *errp = -ENOMEM; - goto err; - } - if (buffer_new(&dummy)) { - J_ASSERT(create != 0); - J_ASSERT(handle != NULL); - - /* - * Now that we do not always journal data, we should - * keep in mind whether this should always journal the - * new buffer as metadata. For now, regular file - * writes use ext3_get_block instead, so it's not a - * problem. - */ - lock_buffer(bh); - BUFFER_TRACE(bh, "call get_create_access"); - fatal = ext3_journal_get_create_access(handle, bh); - if (!fatal && !buffer_uptodate(bh)) { - memset(bh->b_data,0,inode->i_sb->s_blocksize); - set_buffer_uptodate(bh); - } - unlock_buffer(bh); - BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, bh); - if (!fatal) - fatal = err; - } else { - BUFFER_TRACE(bh, "not a new buffer"); - } - if (fatal) { - *errp = fatal; - brelse(bh); - bh = NULL; - } - return bh; - } -err: - return NULL; -} - -struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode, - int block, int create, int *err) -{ - struct buffer_head * bh; - - bh = ext3_getblk(handle, inode, block, create, err); - if (!bh) - return bh; - if (bh_uptodate_or_lock(bh)) - return bh; - get_bh(bh); - bh->b_end_io = end_buffer_read_sync; - submit_bh(READ | REQ_META | REQ_PRIO, bh); - wait_on_buffer(bh); - if (buffer_uptodate(bh)) - return bh; - put_bh(bh); - *err = -EIO; - return NULL; -} - -static int walk_page_buffers( handle_t *handle, - struct buffer_head *head, - unsigned from, - unsigned to, - int *partial, - int (*fn)( handle_t *handle, - struct buffer_head *bh)) -{ - struct buffer_head *bh; - unsigned block_start, block_end; - unsigned blocksize = head->b_size; - int err, ret = 0; - struct buffer_head *next; - - for ( bh = head, block_start = 0; - ret == 0 && (bh != head || !block_start); - block_start = block_end, bh = next) - { - next = bh->b_this_page; - block_end = block_start + blocksize; - if (block_end <= from || block_start >= to) { - if (partial && !buffer_uptodate(bh)) - *partial = 1; - continue; - } - err = (*fn)(handle, bh); - if (!ret) - ret = err; - } - return ret; -} - -/* - * To preserve ordering, it is essential that the hole instantiation and - * the data write be encapsulated in a single transaction. We cannot - * close off a transaction and start a new one between the ext3_get_block() - * and the commit_write(). So doing the journal_start at the start of - * prepare_write() is the right place. - * - * Also, this function can nest inside ext3_writepage() -> - * block_write_full_page(). In that case, we *know* that ext3_writepage() - * has generated enough buffer credits to do the whole page. So we won't - * block on the journal in that case, which is good, because the caller may - * be PF_MEMALLOC. - * - * By accident, ext3 can be reentered when a transaction is open via - * quota file writes. If we were to commit the transaction while thus - * reentered, there can be a deadlock - we would be holding a quota - * lock, and the commit would never complete if another thread had a - * transaction open and was blocking on the quota lock - a ranking - * violation. - * - * So what we do is to rely on the fact that journal_stop/journal_start - * will _not_ run commit under these circumstances because handle->h_ref - * is elevated. We'll still have enough credits for the tiny quotafile - * write. 
- */ -static int do_journal_get_write_access(handle_t *handle, - struct buffer_head *bh) -{ - int dirty = buffer_dirty(bh); - int ret; - - if (!buffer_mapped(bh) || buffer_freed(bh)) - return 0; - /* - * __block_prepare_write() could have dirtied some buffers. Clean - * the dirty bit as jbd2_journal_get_write_access() could complain - * otherwise about fs integrity issues. Setting of the dirty bit - * by __block_prepare_write() isn't a real problem here as we clear - * the bit before releasing a page lock and thus writeback cannot - * ever write the buffer. - */ - if (dirty) - clear_buffer_dirty(bh); - ret = ext3_journal_get_write_access(handle, bh); - if (!ret && dirty) - ret = ext3_journal_dirty_metadata(handle, bh); - return ret; -} - -/* - * Truncate blocks that were not used by write. We have to truncate the - * pagecache as well so that corresponding buffers get properly unmapped. - */ -static void ext3_truncate_failed_write(struct inode *inode) -{ - truncate_inode_pages(inode->i_mapping, inode->i_size); - ext3_truncate(inode); -} - -/* - * Truncate blocks that were not used by direct IO write. We have to zero out - * the last file block as well because direct IO might have written to it. - */ -static void ext3_truncate_failed_direct_write(struct inode *inode) -{ - ext3_block_truncate_page(inode, inode->i_size); - ext3_truncate(inode); -} - -static int ext3_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - struct inode *inode = mapping->host; - int ret; - handle_t *handle; - int retries = 0; - struct page *page; - pgoff_t index; - unsigned from, to; - /* Reserve one block more for addition to orphan list in case - * we allocate blocks but write fails for some reason */ - int needed_blocks = ext3_writepage_trans_blocks(inode) + 1; - - trace_ext3_write_begin(inode, pos, len, flags); - - index = pos >> PAGE_CACHE_SHIFT; - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + len; - -retry: - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; - *pagep = page; - - handle = ext3_journal_start(inode, needed_blocks); - if (IS_ERR(handle)) { - unlock_page(page); - page_cache_release(page); - ret = PTR_ERR(handle); - goto out; - } - ret = __block_write_begin(page, pos, len, ext3_get_block); - if (ret) - goto write_begin_failed; - - if (ext3_should_journal_data(inode)) { - ret = walk_page_buffers(handle, page_buffers(page), - from, to, NULL, do_journal_get_write_access); - } -write_begin_failed: - if (ret) { - /* - * block_write_begin may have instantiated a few blocks - * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. - * - * Add inode to orphan list in case we crash before truncate - * finishes. Do this only if ext3_can_truncate() agrees so - * that orphan processing code is happy. 
- */ - if (pos + len > inode->i_size && ext3_can_truncate(inode)) - ext3_orphan_add(handle, inode); - ext3_journal_stop(handle); - unlock_page(page); - page_cache_release(page); - if (pos + len > inode->i_size) - ext3_truncate_failed_write(inode); - } - if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) - goto retry; -out: - return ret; -} - - -int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh) -{ - int err = journal_dirty_data(handle, bh); - if (err) - ext3_journal_abort_handle(__func__, __func__, - bh, handle, err); - return err; -} - -/* For ordered writepage and write_end functions */ -static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) -{ - /* - * Write could have mapped the buffer but it didn't copy the data in - * yet. So avoid filing such buffer into a transaction. - */ - if (buffer_mapped(bh) && buffer_uptodate(bh)) - return ext3_journal_dirty_data(handle, bh); - return 0; -} - -/* For write_end() in data=journal mode */ -static int write_end_fn(handle_t *handle, struct buffer_head *bh) -{ - if (!buffer_mapped(bh) || buffer_freed(bh)) - return 0; - set_buffer_uptodate(bh); - return ext3_journal_dirty_metadata(handle, bh); -} - -/* - * This is nasty and subtle: ext3_write_begin() could have allocated blocks - * for the whole page but later we failed to copy the data in. Update inode - * size according to what we managed to copy. The rest is going to be - * truncated in write_end function. - */ -static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied) -{ - /* What matters to us is i_disksize. We don't write i_size anywhere */ - if (pos + copied > inode->i_size) - i_size_write(inode, pos + copied); - if (pos + copied > EXT3_I(inode)->i_disksize) { - EXT3_I(inode)->i_disksize = pos + copied; - mark_inode_dirty(inode); - } -} - -/* - * We need to pick up the new inode size which generic_commit_write gave us - * `file' can be NULL - eg, when called from page_symlink(). - * - * ext3 never places buffers on inode->i_mapping->private_list. metadata - * buffers are managed internally. - */ -static int ext3_ordered_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = file->f_mapping->host; - unsigned from, to; - int ret = 0, ret2; - - trace_ext3_ordered_write_end(inode, pos, len, copied); - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); - - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + copied; - ret = walk_page_buffers(handle, page_buffers(page), - from, to, NULL, journal_dirty_data_fn); - - if (ret == 0) - update_file_sizes(inode, pos, copied); - /* - * There may be allocated blocks outside of i_size because - * we failed to copy some data. Prepare for truncate. - */ - if (pos + len > inode->i_size && ext3_can_truncate(inode)) - ext3_orphan_add(handle, inode); - ret2 = ext3_journal_stop(handle); - if (!ret) - ret = ret2; - unlock_page(page); - page_cache_release(page); - - if (pos + len > inode->i_size) - ext3_truncate_failed_write(inode); - return ret ? 
ret : copied; -} - -static int ext3_writeback_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = file->f_mapping->host; - int ret; - - trace_ext3_writeback_write_end(inode, pos, len, copied); - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); - update_file_sizes(inode, pos, copied); - /* - * There may be allocated blocks outside of i_size because - * we failed to copy some data. Prepare for truncate. - */ - if (pos + len > inode->i_size && ext3_can_truncate(inode)) - ext3_orphan_add(handle, inode); - ret = ext3_journal_stop(handle); - unlock_page(page); - page_cache_release(page); - - if (pos + len > inode->i_size) - ext3_truncate_failed_write(inode); - return ret ? ret : copied; -} - -static int ext3_journalled_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = mapping->host; - struct ext3_inode_info *ei = EXT3_I(inode); - int ret = 0, ret2; - int partial = 0; - unsigned from, to; - - trace_ext3_journalled_write_end(inode, pos, len, copied); - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + len; - - if (copied < len) { - if (!PageUptodate(page)) - copied = 0; - page_zero_new_buffers(page, from + copied, to); - to = from + copied; - } - - ret = walk_page_buffers(handle, page_buffers(page), from, - to, &partial, write_end_fn); - if (!partial) - SetPageUptodate(page); - - if (pos + copied > inode->i_size) - i_size_write(inode, pos + copied); - /* - * There may be allocated blocks outside of i_size because - * we failed to copy some data. Prepare for truncate. - */ - if (pos + len > inode->i_size && ext3_can_truncate(inode)) - ext3_orphan_add(handle, inode); - ext3_set_inode_state(inode, EXT3_STATE_JDATA); - atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); - if (inode->i_size > ei->i_disksize) { - ei->i_disksize = inode->i_size; - ret2 = ext3_mark_inode_dirty(handle, inode); - if (!ret) - ret = ret2; - } - - ret2 = ext3_journal_stop(handle); - if (!ret) - ret = ret2; - unlock_page(page); - page_cache_release(page); - - if (pos + len > inode->i_size) - ext3_truncate_failed_write(inode); - return ret ? ret : copied; -} - -/* - * bmap() is special. It gets used by applications such as lilo and by - * the swapper to find the on-disk block of a specific piece of data. - * - * Naturally, this is dangerous if the block concerned is still in the - * journal. If somebody makes a swapfile on an ext3 data-journaling - * filesystem and enables swap, then they may get a nasty shock when the - * data getting swapped to that swapfile suddenly gets overwritten by - * the original zero's written out previously to the journal and - * awaiting writeback in the kernel's buffer cache. - * - * So, if we see any bmap calls here on a modified, data-journaled file, - * take extra steps to flush any blocks which might be in the cache. 
- */ -static sector_t ext3_bmap(struct address_space *mapping, sector_t block) -{ - struct inode *inode = mapping->host; - journal_t *journal; - int err; - - if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) { - /* - * This is a REALLY heavyweight approach, but the use of - * bmap on dirty files is expected to be extremely rare: - * only if we run lilo or swapon on a freshly made file - * do we expect this to happen. - * - * (bmap requires CAP_SYS_RAWIO so this does not - * represent an unprivileged user DOS attack --- we'd be - * in trouble if mortal users could trigger this path at - * will.) - * - * NB. EXT3_STATE_JDATA is not set on files other than - * regular files. If somebody wants to bmap a directory - * or symlink and gets confused because the buffer - * hasn't yet been flushed to disk, they deserve - * everything they get. - */ - - ext3_clear_inode_state(inode, EXT3_STATE_JDATA); - journal = EXT3_JOURNAL(inode); - journal_lock_updates(journal); - err = journal_flush(journal); - journal_unlock_updates(journal); - - if (err) - return 0; - } - - return generic_block_bmap(mapping,block,ext3_get_block); -} - -static int bget_one(handle_t *handle, struct buffer_head *bh) -{ - get_bh(bh); - return 0; -} - -static int bput_one(handle_t *handle, struct buffer_head *bh) -{ - put_bh(bh); - return 0; -} - -static int buffer_unmapped(handle_t *handle, struct buffer_head *bh) -{ - return !buffer_mapped(bh); -} - -/* - * Note that whenever we need to map blocks we start a transaction even if - * we're not journalling data. This is to preserve ordering: any hole - * instantiation within __block_write_full_page -> ext3_get_block() should be - * journalled along with the data so we don't crash and then get metadata which - * refers to old data. - * - * In all journalling modes block_write_full_page() will start the I/O. - * - * We don't honour synchronous mounts for writepage(). That would be - * disastrous. Any write() or metadata operation will sync the fs for - * us. - */ -static int ext3_ordered_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - struct buffer_head *page_bufs; - handle_t *handle = NULL; - int ret = 0; - int err; - - J_ASSERT(PageLocked(page)); - /* - * We don't want to warn for emergency remount. The condition is - * ordered to avoid dereferencing inode->i_sb in non-error case to - * avoid slow-downs. - */ - WARN_ON_ONCE(IS_RDONLY(inode) && - !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); - - /* - * We give up here if we're reentered, because it might be for a - * different filesystem. - */ - if (ext3_journal_current_handle()) - goto out_fail; - - trace_ext3_ordered_writepage(page); - if (!page_has_buffers(page)) { - create_empty_buffers(page, inode->i_sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); - page_bufs = page_buffers(page); - } else { - page_bufs = page_buffers(page); - if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE, - NULL, buffer_unmapped)) { - /* Provide NULL get_block() to catch bugs if buffers - * weren't really mapped */ - return block_write_full_page(page, NULL, wbc); - } - } - handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); - - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_fail; - } - - walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, bget_one); - - ret = block_write_full_page(page, ext3_get_block, wbc); - - /* - * The page can become unlocked at any point now, and - * truncate can then come in and change things. 
So we - * can't touch *page from now on. But *page_bufs is - * safe due to elevated refcount. - */ - - /* - * And attach them to the current transaction. But only if - * block_write_full_page() succeeded. Otherwise they are unmapped, - * and generally junk. - */ - if (ret == 0) - ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, - NULL, journal_dirty_data_fn); - walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, bput_one); - err = ext3_journal_stop(handle); - if (!ret) - ret = err; - return ret; - -out_fail: - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return ret; -} - -static int ext3_writeback_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - handle_t *handle = NULL; - int ret = 0; - int err; - - J_ASSERT(PageLocked(page)); - /* - * We don't want to warn for emergency remount. The condition is - * ordered to avoid dereferencing inode->i_sb in non-error case to - * avoid slow-downs. - */ - WARN_ON_ONCE(IS_RDONLY(inode) && - !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); - - if (ext3_journal_current_handle()) - goto out_fail; - - trace_ext3_writeback_writepage(page); - if (page_has_buffers(page)) { - if (!walk_page_buffers(NULL, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { - /* Provide NULL get_block() to catch bugs if buffers - * weren't really mapped */ - return block_write_full_page(page, NULL, wbc); - } - } - - handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_fail; - } - - ret = block_write_full_page(page, ext3_get_block, wbc); - - err = ext3_journal_stop(handle); - if (!ret) - ret = err; - return ret; - -out_fail: - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return ret; -} - -static int ext3_journalled_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - handle_t *handle = NULL; - int ret = 0; - int err; - - J_ASSERT(PageLocked(page)); - /* - * We don't want to warn for emergency remount. The condition is - * ordered to avoid dereferencing inode->i_sb in non-error case to - * avoid slow-downs. - */ - WARN_ON_ONCE(IS_RDONLY(inode) && - !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); - - trace_ext3_journalled_writepage(page); - if (!page_has_buffers(page) || PageChecked(page)) { - if (ext3_journal_current_handle()) - goto no_write; - - handle = ext3_journal_start(inode, - ext3_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto no_write; - } - /* - * It's mmapped pagecache. Add buffers and journal it. There - * doesn't seem much point in redirtying the page here. - */ - ClearPageChecked(page); - ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE, - ext3_get_block); - if (ret != 0) { - ext3_journal_stop(handle); - goto out_unlock; - } - ret = walk_page_buffers(handle, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); - - err = walk_page_buffers(handle, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, write_end_fn); - if (ret == 0) - ret = err; - ext3_set_inode_state(inode, EXT3_STATE_JDATA); - atomic_set(&EXT3_I(inode)->i_datasync_tid, - handle->h_transaction->t_tid); - unlock_page(page); - err = ext3_journal_stop(handle); - if (!ret) - ret = err; - } else { - /* - * It is a page full of checkpoint-mode buffers. Go and write - * them. 
They should have been already mapped when they went - * to the journal so provide NULL get_block function to catch - * errors. - */ - ret = block_write_full_page(page, NULL, wbc); - } -out: - return ret; - -no_write: - redirty_page_for_writepage(wbc, page); -out_unlock: - unlock_page(page); - goto out; -} - -static int ext3_readpage(struct file *file, struct page *page) -{ - trace_ext3_readpage(page); - return mpage_readpage(page, ext3_get_block); -} - -static int -ext3_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) -{ - return mpage_readpages(mapping, pages, nr_pages, ext3_get_block); -} - -static void ext3_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) -{ - journal_t *journal = EXT3_JOURNAL(page->mapping->host); - - trace_ext3_invalidatepage(page, offset, length); - - /* - * If it's a full truncate we just forget about the pending dirtying - */ - if (offset == 0 && length == PAGE_CACHE_SIZE) - ClearPageChecked(page); - - journal_invalidatepage(journal, page, offset, length); -} - -static int ext3_releasepage(struct page *page, gfp_t wait) -{ - journal_t *journal = EXT3_JOURNAL(page->mapping->host); - - trace_ext3_releasepage(page); - WARN_ON(PageChecked(page)); - if (!page_has_buffers(page)) - return 0; - return journal_try_to_free_buffers(journal, page, wait); -} - -/* - * If the O_DIRECT write will extend the file then add this inode to the - * orphan list. So recovery will truncate it back to the original size - * if the machine crashes during the write. - * - * If the O_DIRECT write is intantiating holes inside i_size and the machine - * crashes then stale disk data _may_ be exposed inside the file. But current - * VFS code falls back into buffered path in that case so we are safe. - */ -static ssize_t ext3_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct ext3_inode_info *ei = EXT3_I(inode); - handle_t *handle; - ssize_t ret; - int orphan = 0; - size_t count = iov_iter_count(iter); - int retries = 0; - - trace_ext3_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); - - if (iov_iter_rw(iter) == WRITE) { - loff_t final_size = offset + count; - - if (final_size > inode->i_size) { - /* Credits for sb + inode write */ - handle = ext3_journal_start(inode, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - ret = ext3_orphan_add(handle, inode); - if (ret) { - ext3_journal_stop(handle); - goto out; - } - orphan = 1; - ei->i_disksize = inode->i_size; - ext3_journal_stop(handle); - } - } - -retry: - ret = blockdev_direct_IO(iocb, inode, iter, offset, ext3_get_block); - /* - * In case of error extending write may have instantiated a few - * blocks outside i_size. Trim these off again. - */ - if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { - loff_t isize = i_size_read(inode); - loff_t end = offset + count; - - if (end > isize) - ext3_truncate_failed_direct_write(inode); - } - if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - - if (orphan) { - int err; - - /* Credits for sb + inode write */ - handle = ext3_journal_start(inode, 2); - if (IS_ERR(handle)) { - /* This is really bad luck. We've written the data - * but cannot extend i_size. Truncate allocated blocks - * and pretend the write failed... 
*/ - ext3_truncate_failed_direct_write(inode); - ret = PTR_ERR(handle); - if (inode->i_nlink) - ext3_orphan_del(NULL, inode); - goto out; - } - if (inode->i_nlink) - ext3_orphan_del(handle, inode); - if (ret > 0) { - loff_t end = offset + ret; - if (end > inode->i_size) { - ei->i_disksize = end; - i_size_write(inode, end); - /* - * We're going to return a positive `ret' - * here due to non-zero-length I/O, so there's - * no way of reporting error returns from - * ext3_mark_inode_dirty() to userspace. So - * ignore it. - */ - ext3_mark_inode_dirty(handle, inode); - } - } - err = ext3_journal_stop(handle); - if (ret == 0) - ret = err; - } -out: - trace_ext3_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); - return ret; -} - -/* - * Pages can be marked dirty completely asynchronously from ext3's journalling - * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do - * much here because ->set_page_dirty is called under VFS locks. The page is - * not necessarily locked. - * - * We cannot just dirty the page and leave attached buffers clean, because the - * buffers' dirty state is "definitive". We cannot just set the buffers dirty - * or jbddirty because all the journalling code will explode. - * - * So what we do is to mark the page "pending dirty" and next time writepage - * is called, propagate that into the buffers appropriately. - */ -static int ext3_journalled_set_page_dirty(struct page *page) -{ - SetPageChecked(page); - return __set_page_dirty_nobuffers(page); -} - -static const struct address_space_operations ext3_ordered_aops = { - .readpage = ext3_readpage, - .readpages = ext3_readpages, - .writepage = ext3_ordered_writepage, - .write_begin = ext3_write_begin, - .write_end = ext3_ordered_write_end, - .bmap = ext3_bmap, - .invalidatepage = ext3_invalidatepage, - .releasepage = ext3_releasepage, - .direct_IO = ext3_direct_IO, - .migratepage = buffer_migrate_page, - .is_partially_uptodate = block_is_partially_uptodate, - .is_dirty_writeback = buffer_check_dirty_writeback, - .error_remove_page = generic_error_remove_page, -}; - -static const struct address_space_operations ext3_writeback_aops = { - .readpage = ext3_readpage, - .readpages = ext3_readpages, - .writepage = ext3_writeback_writepage, - .write_begin = ext3_write_begin, - .write_end = ext3_writeback_write_end, - .bmap = ext3_bmap, - .invalidatepage = ext3_invalidatepage, - .releasepage = ext3_releasepage, - .direct_IO = ext3_direct_IO, - .migratepage = buffer_migrate_page, - .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, -}; - -static const struct address_space_operations ext3_journalled_aops = { - .readpage = ext3_readpage, - .readpages = ext3_readpages, - .writepage = ext3_journalled_writepage, - .write_begin = ext3_write_begin, - .write_end = ext3_journalled_write_end, - .set_page_dirty = ext3_journalled_set_page_dirty, - .bmap = ext3_bmap, - .invalidatepage = ext3_invalidatepage, - .releasepage = ext3_releasepage, - .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, -}; - -void ext3_set_aops(struct inode *inode) -{ - if (ext3_should_order_data(inode)) - inode->i_mapping->a_ops = &ext3_ordered_aops; - else if (ext3_should_writeback_data(inode)) - inode->i_mapping->a_ops = &ext3_writeback_aops; - else - inode->i_mapping->a_ops = &ext3_journalled_aops; -} - -/* - * ext3_block_truncate_page() zeroes out a mapping from file offset `from' - * up to the end of the block which corresponds to 
`from'. - * This required during truncate. We need to physically zero the tail end - * of that block so it doesn't yield old data if the file is later grown. - */ -static int ext3_block_truncate_page(struct inode *inode, loff_t from) -{ - ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE - 1); - unsigned blocksize, iblock, length, pos; - struct page *page; - handle_t *handle = NULL; - struct buffer_head *bh; - int err = 0; - - /* Truncated on block boundary - nothing to do */ - blocksize = inode->i_sb->s_blocksize; - if ((from & (blocksize - 1)) == 0) - return 0; - - page = grab_cache_page(inode->i_mapping, index); - if (!page) - return -ENOMEM; - length = blocksize - (offset & (blocksize - 1)); - iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); - - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - - /* Find the buffer that contains "offset" */ - bh = page_buffers(page); - pos = blocksize; - while (offset >= pos) { - bh = bh->b_this_page; - iblock++; - pos += blocksize; - } - - err = 0; - if (buffer_freed(bh)) { - BUFFER_TRACE(bh, "freed: skip"); - goto unlock; - } - - if (!buffer_mapped(bh)) { - BUFFER_TRACE(bh, "unmapped"); - ext3_get_block(inode, iblock, bh, 0); - /* unmapped? It's a hole - nothing to do */ - if (!buffer_mapped(bh)) { - BUFFER_TRACE(bh, "still unmapped"); - goto unlock; - } - } - - /* Ok, it's mapped. Make sure it's up-to-date */ - if (PageUptodate(page)) - set_buffer_uptodate(bh); - - if (!bh_uptodate_or_lock(bh)) { - err = bh_submit_read(bh); - /* Uhhuh. Read error. Complain and punt. */ - if (err) - goto unlock; - } - - /* data=writeback mode doesn't need transaction to zero-out data */ - if (!ext3_should_writeback_data(inode)) { - /* We journal at most one block */ - handle = ext3_journal_start(inode, 1); - if (IS_ERR(handle)) { - clear_highpage(page); - flush_dcache_page(page); - err = PTR_ERR(handle); - goto unlock; - } - } - - if (ext3_should_journal_data(inode)) { - BUFFER_TRACE(bh, "get write access"); - err = ext3_journal_get_write_access(handle, bh); - if (err) - goto stop; - } - - zero_user(page, offset, length); - BUFFER_TRACE(bh, "zeroed end of block"); - - err = 0; - if (ext3_should_journal_data(inode)) { - err = ext3_journal_dirty_metadata(handle, bh); - } else { - if (ext3_should_order_data(inode)) - err = ext3_journal_dirty_data(handle, bh); - mark_buffer_dirty(bh); - } -stop: - if (handle) - ext3_journal_stop(handle); - -unlock: - unlock_page(page); - page_cache_release(page); - return err; -} - -/* - * Probably it should be a library function... search for first non-zero word - * or memcmp with zero_page, whatever is better for particular architecture. - * Linus? - */ -static inline int all_zeroes(__le32 *p, __le32 *q) -{ - while (p < q) - if (*p++) - return 0; - return 1; -} - -/** - * ext3_find_shared - find the indirect blocks for partial truncation. - * @inode: inode in question - * @depth: depth of the affected branch - * @offsets: offsets of pointers in that branch (see ext3_block_to_path) - * @chain: place to store the pointers to partial indirect blocks - * @top: place to the (detached) top of branch - * - * This is a helper function used by ext3_truncate(). - * - * When we do truncate() we may have to clean the ends of several - * indirect blocks but leave the blocks themselves alive. Block is - * partially truncated if some data below the new i_size is referred - * from it (and it is on the path to the first completely truncated - * data block, indeed). 
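To make the offset arithmetic in ext3_block_truncate_page() above concrete, here is an illustrative trace with assumed sizes (4096-byte pages, a 1024-byte block size and from = 5000; none of these numbers come from the source):

    index  = 5000 >> 12           = 1
    offset = 5000 & 4095          = 904
    length = 1024 - (904 & 1023)  = 120
    iblock = 1 << (12 - 10)       = 4

so zero_user(page, 904, 120) clears page bytes 904..1023, i.e. file offsets 5000 through 5119, the tail of the 1KB block that contains the new end of file.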
We have to free the top of that path along - * with everything to the right of the path. Since no allocation - * past the truncation point is possible until ext3_truncate() - * finishes, we may safely do the latter, but top of branch may - * require special attention - pageout below the truncation point - * might try to populate it. - * - * We atomically detach the top of branch from the tree, store the - * block number of its root in *@top, pointers to buffer_heads of - * partially truncated blocks - in @chain[].bh and pointers to - * their last elements that should not be removed - in - * @chain[].p. Return value is the pointer to last filled element - * of @chain. - * - * The work left to caller to do the actual freeing of subtrees: - * a) free the subtree starting from *@top - * b) free the subtrees whose roots are stored in - * (@chain[i].p+1 .. end of @chain[i].bh->b_data) - * c) free the subtrees growing from the inode past the @chain[0]. - * (no partially truncated stuff there). */ - -static Indirect *ext3_find_shared(struct inode *inode, int depth, - int offsets[4], Indirect chain[4], __le32 *top) -{ - Indirect *partial, *p; - int k, err; - - *top = 0; - /* Make k index the deepest non-null offset + 1 */ - for (k = depth; k > 1 && !offsets[k-1]; k--) - ; - partial = ext3_get_branch(inode, k, offsets, chain, &err); - /* Writer: pointers */ - if (!partial) - partial = chain + k-1; - /* - * If the branch acquired continuation since we've looked at it - - * fine, it should all survive and (new) top doesn't belong to us. - */ - if (!partial->key && *partial->p) - /* Writer: end */ - goto no_top; - for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) - ; - /* - * OK, we've found the last block that must survive. The rest of our - * branch should be detached before unlocking. However, if that rest - * of branch is all ours and does not grow immediately from the inode - * it's easier to cheat and just decrement partial->p. - */ - if (p == chain + k - 1 && p > chain) { - p->p--; - } else { - *top = *p->p; - /* Nope, don't do this in ext3. Must leave the tree intact */ -#if 0 - *p->p = 0; -#endif - } - /* Writer: end */ - - while(partial > p) { - brelse(partial->bh); - partial--; - } -no_top: - return partial; -} - -/* - * Zero a number of block pointers in either an inode or an indirect block. - * If we restart the transaction we must again get write access to the - * indirect block for further modification. - * - * We release `count' blocks on disk, but (last - first) may be greater - * than `count' because there can be holes in there. - */ -static void ext3_clear_blocks(handle_t *handle, struct inode *inode, - struct buffer_head *bh, ext3_fsblk_t block_to_free, - unsigned long count, __le32 *first, __le32 *last) -{ - __le32 *p; - if (try_to_extend_transaction(handle, inode)) { - if (bh) { - BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - if (ext3_journal_dirty_metadata(handle, bh)) - return; - } - ext3_mark_inode_dirty(handle, inode); - truncate_restart_transaction(handle, inode); - if (bh) { - BUFFER_TRACE(bh, "retaking write access"); - if (ext3_journal_get_write_access(handle, bh)) - return; - } - } - - /* - * Any buffers which are on the journal will be in memory. We find - * them on the hash table so journal_revoke() will run journal_forget() - * on them. We've already detached each block from the file, so - * bforget() in journal_forget() should be safe. - * - * AKPM: turn on bforget in journal_forget()!!! 
- */ - for (p = first; p < last; p++) { - u32 nr = le32_to_cpu(*p); - if (nr) { - struct buffer_head *bh; - - *p = 0; - bh = sb_find_get_block(inode->i_sb, nr); - ext3_forget(handle, 0, inode, bh, nr); - } - } - - ext3_free_blocks(handle, inode, block_to_free, count); -} - -/** - * ext3_free_data - free a list of data blocks - * @handle: handle for this transaction - * @inode: inode we are dealing with - * @this_bh: indirect buffer_head which contains *@first and *@last - * @first: array of block numbers - * @last: points immediately past the end of array - * - * We are freeing all blocks referred from that array (numbers are stored as - * little-endian 32-bit) and updating @inode->i_blocks appropriately. - * - * We accumulate contiguous runs of blocks to free. Conveniently, if these - * blocks are contiguous then releasing them at one time will only affect one - * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't - * actually use a lot of journal space. - * - * @this_bh will be %NULL if @first and @last point into the inode's direct - * block pointers. - */ -static void ext3_free_data(handle_t *handle, struct inode *inode, - struct buffer_head *this_bh, - __le32 *first, __le32 *last) -{ - ext3_fsblk_t block_to_free = 0; /* Starting block # of a run */ - unsigned long count = 0; /* Number of blocks in the run */ - __le32 *block_to_free_p = NULL; /* Pointer into inode/ind - corresponding to - block_to_free */ - ext3_fsblk_t nr; /* Current block # */ - __le32 *p; /* Pointer into inode/ind - for current block */ - int err; - - if (this_bh) { /* For indirect block */ - BUFFER_TRACE(this_bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, this_bh); - /* Important: if we can't update the indirect pointers - * to the blocks, we can't free them. */ - if (err) - return; - } - - for (p = first; p < last; p++) { - nr = le32_to_cpu(*p); - if (nr) { - /* accumulate blocks to free if they're contiguous */ - if (count == 0) { - block_to_free = nr; - block_to_free_p = p; - count = 1; - } else if (nr == block_to_free + count) { - count++; - } else { - ext3_clear_blocks(handle, inode, this_bh, - block_to_free, - count, block_to_free_p, p); - block_to_free = nr; - block_to_free_p = p; - count = 1; - } - } - } - - if (count > 0) - ext3_clear_blocks(handle, inode, this_bh, block_to_free, - count, block_to_free_p, p); - - if (this_bh) { - BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata"); - - /* - * The buffer head should have an attached journal head at this - * point. However, if the data is corrupted and an indirect - * block pointed to itself, it would have been detached when - * the block was cleared. Check for this instead of OOPSing. - */ - if (bh2jh(this_bh)) - ext3_journal_dirty_metadata(handle, this_bh); - else - ext3_error(inode->i_sb, "ext3_free_data", - "circular indirect block detected, " - "inode=%lu, block=%llu", - inode->i_ino, - (unsigned long long)this_bh->b_blocknr); - } -} - -/** - * ext3_free_branches - free an array of branches - * @handle: JBD handle for this transaction - * @inode: inode we are dealing with - * @parent_bh: the buffer_head which contains *@first and *@last - * @first: array of block numbers - * @last: pointer immediately past the end of array - * @depth: depth of the branches to free - * - * We are freeing all blocks referred from these branches (numbers are - * stored as little-endian 32-bit) and updating @inode->i_blocks - * appropriately. 
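The contiguous-run accumulation that ext3_free_data() above performs on block pointers can be sketched in isolation. This is an editor's stand-alone illustration with a made-up block list; free_run() is a stand-in for ext3_clear_blocks() and nothing here is taken from the ext3 sources:

#include <stdio.h>

static void free_run(unsigned long start, unsigned long count)
{
	printf("free %lu block(s) starting at %lu\n", count, start);
}

int main(void)
{
	/* hypothetical block pointers, already in cpu order here */
	unsigned long blocks[] = { 100, 101, 0, 102, 200, 50, 51 };
	unsigned long start = 0, count = 0;
	size_t i;

	for (i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++) {
		unsigned long nr = blocks[i];

		if (!nr)			/* a hole: skipped, run kept */
			continue;
		if (count == 0) {		/* start a new run */
			start = nr;
			count = 1;
		} else if (nr == start + count) {
			count++;		/* still contiguous */
		} else {			/* run broken: flush and restart */
			free_run(start, count);
			start = nr;
			count = 1;
		}
	}
	if (count)				/* flush the final run */
		free_run(start, count);
	return 0;
}

For this list it reports runs (100, 3), (200, 1) and (50, 2); note that the zero entry between 101 and 102 is skipped without breaking the run, exactly as in the original loop.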
- */ -static void ext3_free_branches(handle_t *handle, struct inode *inode, - struct buffer_head *parent_bh, - __le32 *first, __le32 *last, int depth) -{ - ext3_fsblk_t nr; - __le32 *p; - - if (is_handle_aborted(handle)) - return; - - if (depth--) { - struct buffer_head *bh; - int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); - p = last; - while (--p >= first) { - nr = le32_to_cpu(*p); - if (!nr) - continue; /* A hole */ - - /* Go read the buffer for the next level down */ - bh = sb_bread(inode->i_sb, nr); - - /* - * A read failure? Report error and clear slot - * (should be rare). - */ - if (!bh) { - ext3_error(inode->i_sb, "ext3_free_branches", - "Read failure, inode=%lu, block="E3FSBLK, - inode->i_ino, nr); - continue; - } - - /* This zaps the entire block. Bottom up. */ - BUFFER_TRACE(bh, "free child branches"); - ext3_free_branches(handle, inode, bh, - (__le32*)bh->b_data, - (__le32*)bh->b_data + addr_per_block, - depth); - - /* - * Everything below this this pointer has been - * released. Now let this top-of-subtree go. - * - * We want the freeing of this indirect block to be - * atomic in the journal with the updating of the - * bitmap block which owns it. So make some room in - * the journal. - * - * We zero the parent pointer *after* freeing its - * pointee in the bitmaps, so if extend_transaction() - * for some reason fails to put the bitmap changes and - * the release into the same transaction, recovery - * will merely complain about releasing a free block, - * rather than leaking blocks. - */ - if (is_handle_aborted(handle)) - return; - if (try_to_extend_transaction(handle, inode)) { - ext3_mark_inode_dirty(handle, inode); - truncate_restart_transaction(handle, inode); - } - - /* - * We've probably journalled the indirect block several - * times during the truncate. But it's no longer - * needed and we now drop it from the transaction via - * journal_revoke(). - * - * That's easy if it's exclusively part of this - * transaction. But if it's part of the committing - * transaction then journal_forget() will simply - * brelse() it. That means that if the underlying - * block is reallocated in ext3_get_block(), - * unmap_underlying_metadata() will find this block - * and will try to get rid of it. damn, damn. Thus - * we don't allow a block to be reallocated until - * a transaction freeing it has fully committed. - * - * We also have to make sure journal replay after a - * crash does not overwrite non-journaled data blocks - * with old metadata when the block got reallocated for - * data. Thus we have to store a revoke record for a - * block in the same transaction in which we free the - * block. - */ - ext3_forget(handle, 1, inode, bh, bh->b_blocknr); - - ext3_free_blocks(handle, inode, nr, 1); - - if (parent_bh) { - /* - * The block which we have just freed is - * pointed to by an indirect block: journal it - */ - BUFFER_TRACE(parent_bh, "get_write_access"); - if (!ext3_journal_get_write_access(handle, - parent_bh)){ - *p = 0; - BUFFER_TRACE(parent_bh, - "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, - parent_bh); - } - } - } - } else { - /* We have reached the bottom of the tree. 
*/ - BUFFER_TRACE(parent_bh, "free data blocks"); - ext3_free_data(handle, inode, parent_bh, first, last); - } -} - -int ext3_can_truncate(struct inode *inode) -{ - if (S_ISREG(inode->i_mode)) - return 1; - if (S_ISDIR(inode->i_mode)) - return 1; - if (S_ISLNK(inode->i_mode)) - return !ext3_inode_is_fast_symlink(inode); - return 0; -} - -/* - * ext3_truncate() - * - * We block out ext3_get_block() block instantiations across the entire - * transaction, and VFS/VM ensures that ext3_truncate() cannot run - * simultaneously on behalf of the same inode. - * - * As we work through the truncate and commit bits of it to the journal there - * is one core, guiding principle: the file's tree must always be consistent on - * disk. We must be able to restart the truncate after a crash. - * - * The file's tree may be transiently inconsistent in memory (although it - * probably isn't), but whenever we close off and commit a journal transaction, - * the contents of (the filesystem + the journal) must be consistent and - * restartable. It's pretty simple, really: bottom up, right to left (although - * left-to-right works OK too). - * - * Note that at recovery time, journal replay occurs *before* the restart of - * truncate against the orphan inode list. - * - * The committed inode has the new, desired i_size (which is the same as - * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see - * that this inode's truncate did not complete and it will again call - * ext3_truncate() to have another go. So there will be instantiated blocks - * to the right of the truncation point in a crashed ext3 filesystem. But - * that's fine - as long as they are linked from the inode, the post-crash - * ext3_truncate() run will find them and release them. - */ -void ext3_truncate(struct inode *inode) -{ - handle_t *handle; - struct ext3_inode_info *ei = EXT3_I(inode); - __le32 *i_data = ei->i_data; - int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); - int offsets[4]; - Indirect chain[4]; - Indirect *partial; - __le32 nr = 0; - int n; - long last_block; - unsigned blocksize = inode->i_sb->s_blocksize; - - trace_ext3_truncate_enter(inode); - - if (!ext3_can_truncate(inode)) - goto out_notrans; - - if (inode->i_size == 0 && ext3_should_writeback_data(inode)) - ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); - - handle = start_transaction(inode); - if (IS_ERR(handle)) - goto out_notrans; - - last_block = (inode->i_size + blocksize-1) - >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); - n = ext3_block_to_path(inode, last_block, offsets, NULL); - if (n == 0) - goto out_stop; /* error */ - - /* - * OK. This truncate is going to happen. We add the inode to the - * orphan list, so that if this truncate spans multiple transactions, - * and we crash, we will resume the truncate when the filesystem - * recovers. It also marks the inode dirty, to catch the new size. - * - * Implication: the file must always be in a sane, consistent - * truncatable state while each transaction commits. - */ - if (ext3_orphan_add(handle, inode)) - goto out_stop; - - /* - * The orphan list entry will now protect us from any crash which - * occurs before the truncate completes, so it is now safe to propagate - * the new, shorter inode size (held for now in i_size) into the - * on-disk inode. We do this via i_disksize, which is the value which - * ext3 *really* writes onto the disk inode. - */ - ei->i_disksize = inode->i_size; - - /* - * From here we block out all ext3_get_block() callers who want to - * modify the block allocation tree. 
- */ - mutex_lock(&ei->truncate_mutex); - - if (n == 1) { /* direct blocks */ - ext3_free_data(handle, inode, NULL, i_data+offsets[0], - i_data + EXT3_NDIR_BLOCKS); - goto do_indirects; - } - - partial = ext3_find_shared(inode, n, offsets, chain, &nr); - /* Kill the top of shared branch (not detached) */ - if (nr) { - if (partial == chain) { - /* Shared branch grows from the inode */ - ext3_free_branches(handle, inode, NULL, - &nr, &nr+1, (chain+n-1) - partial); - *partial->p = 0; - /* - * We mark the inode dirty prior to restart, - * and prior to stop. No need for it here. - */ - } else { - /* Shared branch grows from an indirect block */ - ext3_free_branches(handle, inode, partial->bh, - partial->p, - partial->p+1, (chain+n-1) - partial); - } - } - /* Clear the ends of indirect blocks on the shared branch */ - while (partial > chain) { - ext3_free_branches(handle, inode, partial->bh, partial->p + 1, - (__le32*)partial->bh->b_data+addr_per_block, - (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse (partial->bh); - partial--; - } -do_indirects: - /* Kill the remaining (whole) subtrees */ - switch (offsets[0]) { - default: - nr = i_data[EXT3_IND_BLOCK]; - if (nr) { - ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 1); - i_data[EXT3_IND_BLOCK] = 0; - } - case EXT3_IND_BLOCK: - nr = i_data[EXT3_DIND_BLOCK]; - if (nr) { - ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 2); - i_data[EXT3_DIND_BLOCK] = 0; - } - case EXT3_DIND_BLOCK: - nr = i_data[EXT3_TIND_BLOCK]; - if (nr) { - ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 3); - i_data[EXT3_TIND_BLOCK] = 0; - } - case EXT3_TIND_BLOCK: - ; - } - - ext3_discard_reservation(inode); - - mutex_unlock(&ei->truncate_mutex); - inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; - ext3_mark_inode_dirty(handle, inode); - - /* - * In a multi-transaction truncate, we only make the final transaction - * synchronous - */ - if (IS_SYNC(inode)) - handle->h_sync = 1; -out_stop: - /* - * If this was a simple ftruncate(), and the file will remain alive - * then we need to clear up the orphan record which we created above. - * However, if this was a real unlink then we were called by - * ext3_evict_inode(), and we allow that function to clean up the - * orphan info for us. - */ - if (inode->i_nlink) - ext3_orphan_del(handle, inode); - - ext3_journal_stop(handle); - trace_ext3_truncate_exit(inode); - return; -out_notrans: - /* - * Delete the inode from orphan list so that it doesn't stay there - * forever and trigger assertion on umount. 
- */ - if (inode->i_nlink) - ext3_orphan_del(NULL, inode); - trace_ext3_truncate_exit(inode); -} - -static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, - unsigned long ino, struct ext3_iloc *iloc) -{ - unsigned long block_group; - unsigned long offset; - ext3_fsblk_t block; - struct ext3_group_desc *gdp; - - if (!ext3_valid_inum(sb, ino)) { - /* - * This error is already checked for in namei.c unless we are - * looking at an NFS filehandle, in which case no error - * report is needed - */ - return 0; - } - - block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); - gdp = ext3_get_group_desc(sb, block_group, NULL); - if (!gdp) - return 0; - /* - * Figure out the offset within the block group inode table - */ - offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) * - EXT3_INODE_SIZE(sb); - block = le32_to_cpu(gdp->bg_inode_table) + - (offset >> EXT3_BLOCK_SIZE_BITS(sb)); - - iloc->block_group = block_group; - iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1); - return block; -} - -/* - * ext3_get_inode_loc returns with an extra refcount against the inode's - * underlying buffer_head on success. If 'in_mem' is true, we have all - * data in memory that is needed to recreate the on-disk version of this - * inode. - */ -static int __ext3_get_inode_loc(struct inode *inode, - struct ext3_iloc *iloc, int in_mem) -{ - ext3_fsblk_t block; - struct buffer_head *bh; - - block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc); - if (!block) - return -EIO; - - bh = sb_getblk(inode->i_sb, block); - if (unlikely(!bh)) { - ext3_error (inode->i_sb, "ext3_get_inode_loc", - "unable to read inode block - " - "inode=%lu, block="E3FSBLK, - inode->i_ino, block); - return -ENOMEM; - } - if (!buffer_uptodate(bh)) { - lock_buffer(bh); - - /* - * If the buffer has the write error flag, we have failed - * to write out another inode in the same block. In this - * case, we don't have to read the block because we may - * read the old inode data successfully. - */ - if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) - set_buffer_uptodate(bh); - - if (buffer_uptodate(bh)) { - /* someone brought it uptodate while we waited */ - unlock_buffer(bh); - goto has_buffer; - } - - /* - * If we have all information of the inode in memory and this - * is the only valid inode in the block, we need not read the - * block. - */ - if (in_mem) { - struct buffer_head *bitmap_bh; - struct ext3_group_desc *desc; - int inodes_per_buffer; - int inode_offset, i; - int block_group; - int start; - - block_group = (inode->i_ino - 1) / - EXT3_INODES_PER_GROUP(inode->i_sb); - inodes_per_buffer = bh->b_size / - EXT3_INODE_SIZE(inode->i_sb); - inode_offset = ((inode->i_ino - 1) % - EXT3_INODES_PER_GROUP(inode->i_sb)); - start = inode_offset & ~(inodes_per_buffer - 1); - - /* Is the inode bitmap in cache? */ - desc = ext3_get_group_desc(inode->i_sb, - block_group, NULL); - if (!desc) - goto make_io; - - bitmap_bh = sb_getblk(inode->i_sb, - le32_to_cpu(desc->bg_inode_bitmap)); - if (unlikely(!bitmap_bh)) - goto make_io; - - /* - * If the inode bitmap isn't in cache then the - * optimisation may end up performing two reads instead - * of one, so skip it. 
- */ - if (!buffer_uptodate(bitmap_bh)) { - brelse(bitmap_bh); - goto make_io; - } - for (i = start; i < start + inodes_per_buffer; i++) { - if (i == inode_offset) - continue; - if (ext3_test_bit(i, bitmap_bh->b_data)) - break; - } - brelse(bitmap_bh); - if (i == start + inodes_per_buffer) { - /* all other inodes are free, so skip I/O */ - memset(bh->b_data, 0, bh->b_size); - set_buffer_uptodate(bh); - unlock_buffer(bh); - goto has_buffer; - } - } - -make_io: - /* - * There are other valid inodes in the buffer, this inode - * has in-inode xattrs, or we don't have this inode in memory. - * Read the block from disk. - */ - trace_ext3_load_inode(inode); - get_bh(bh); - bh->b_end_io = end_buffer_read_sync; - submit_bh(READ | REQ_META | REQ_PRIO, bh); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - ext3_error(inode->i_sb, "ext3_get_inode_loc", - "unable to read inode block - " - "inode=%lu, block="E3FSBLK, - inode->i_ino, block); - brelse(bh); - return -EIO; - } - } -has_buffer: - iloc->bh = bh; - return 0; -} - -int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc) -{ - /* We have all inode data except xattrs in memory here. */ - return __ext3_get_inode_loc(inode, iloc, - !ext3_test_inode_state(inode, EXT3_STATE_XATTR)); -} - -void ext3_set_inode_flags(struct inode *inode) -{ - unsigned int flags = EXT3_I(inode)->i_flags; - - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - if (flags & EXT3_SYNC_FL) - inode->i_flags |= S_SYNC; - if (flags & EXT3_APPEND_FL) - inode->i_flags |= S_APPEND; - if (flags & EXT3_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; - if (flags & EXT3_NOATIME_FL) - inode->i_flags |= S_NOATIME; - if (flags & EXT3_DIRSYNC_FL) - inode->i_flags |= S_DIRSYNC; -} - -/* Propagate flags from i_flags to EXT3_I(inode)->i_flags */ -void ext3_get_inode_flags(struct ext3_inode_info *ei) -{ - unsigned int flags = ei->vfs_inode.i_flags; - - ei->i_flags &= ~(EXT3_SYNC_FL|EXT3_APPEND_FL| - EXT3_IMMUTABLE_FL|EXT3_NOATIME_FL|EXT3_DIRSYNC_FL); - if (flags & S_SYNC) - ei->i_flags |= EXT3_SYNC_FL; - if (flags & S_APPEND) - ei->i_flags |= EXT3_APPEND_FL; - if (flags & S_IMMUTABLE) - ei->i_flags |= EXT3_IMMUTABLE_FL; - if (flags & S_NOATIME) - ei->i_flags |= EXT3_NOATIME_FL; - if (flags & S_DIRSYNC) - ei->i_flags |= EXT3_DIRSYNC_FL; -} - -struct inode *ext3_iget(struct super_block *sb, unsigned long ino) -{ - struct ext3_iloc iloc; - struct ext3_inode *raw_inode; - struct ext3_inode_info *ei; - struct buffer_head *bh; - struct inode *inode; - journal_t *journal = EXT3_SB(sb)->s_journal; - transaction_t *transaction; - long ret; - int block; - uid_t i_uid; - gid_t i_gid; - - inode = iget_locked(sb, ino); - if (!inode) - return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) - return inode; - - ei = EXT3_I(inode); - ei->i_block_alloc_info = NULL; - - ret = __ext3_get_inode_loc(inode, &iloc, 0); - if (ret < 0) - goto bad_inode; - bh = iloc.bh; - raw_inode = ext3_raw_inode(&iloc); - inode->i_mode = le16_to_cpu(raw_inode->i_mode); - i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); - i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); - if(!(test_opt (inode->i_sb, NO_UID32))) { - i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; - i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; - } - i_uid_write(inode, i_uid); - i_gid_write(inode, i_gid); - set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); - inode->i_size = le32_to_cpu(raw_inode->i_size); - inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); - inode->i_ctime.tv_sec = 
(signed)le32_to_cpu(raw_inode->i_ctime); - inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); - inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; - - ei->i_state_flags = 0; - ei->i_dir_start_lookup = 0; - ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); - /* We now have enough fields to check if the inode was active or not. - * This is needed because nfsd might try to access dead inodes - * the test is that same one that e2fsck uses - * NeilBrown 1999oct15 - */ - if (inode->i_nlink == 0) { - if (inode->i_mode == 0 || - !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) { - /* this inode is deleted */ - brelse (bh); - ret = -ESTALE; - goto bad_inode; - } - /* The only unlinked inodes we let through here have - * valid i_mode and are being read by the orphan - * recovery code: that's fine, we're about to complete - * the process of deleting those. */ - } - inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); - ei->i_flags = le32_to_cpu(raw_inode->i_flags); -#ifdef EXT3_FRAGMENTS - ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); - ei->i_frag_no = raw_inode->i_frag; - ei->i_frag_size = raw_inode->i_fsize; -#endif - ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); - if (!S_ISREG(inode->i_mode)) { - ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); - } else { - inode->i_size |= - ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; - } - ei->i_disksize = inode->i_size; - inode->i_generation = le32_to_cpu(raw_inode->i_generation); - ei->i_block_group = iloc.block_group; - /* - * NOTE! The in-memory inode i_data array is in little-endian order - * even on big-endian machines: we do NOT byteswap the block numbers! - */ - for (block = 0; block < EXT3_N_BLOCKS; block++) - ei->i_data[block] = raw_inode->i_block[block]; - INIT_LIST_HEAD(&ei->i_orphan); - - /* - * Set transaction id's of transactions that have to be committed - * to finish f[data]sync. We set them to currently running transaction - * as we cannot be sure that the inode or some of its metadata isn't - * part of the transaction - the inode could have been reclaimed and - * now it is reread from disk. - */ - if (journal) { - tid_t tid; - - spin_lock(&journal->j_state_lock); - if (journal->j_running_transaction) - transaction = journal->j_running_transaction; - else - transaction = journal->j_committing_transaction; - if (transaction) - tid = transaction->t_tid; - else - tid = journal->j_commit_sequence; - spin_unlock(&journal->j_state_lock); - atomic_set(&ei->i_sync_tid, tid); - atomic_set(&ei->i_datasync_tid, tid); - } - - if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 && - EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { - /* - * When mke2fs creates big inodes it does not zero out - * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE, - * so ignore those first few inodes. - */ - ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); - if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > - EXT3_INODE_SIZE(inode->i_sb)) { - brelse (bh); - ret = -EIO; - goto bad_inode; - } - if (ei->i_extra_isize == 0) { - /* The extra space is currently unused. Use it. 
*/ - ei->i_extra_isize = sizeof(struct ext3_inode) - - EXT3_GOOD_OLD_INODE_SIZE; - } else { - __le32 *magic = (void *)raw_inode + - EXT3_GOOD_OLD_INODE_SIZE + - ei->i_extra_isize; - if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC)) - ext3_set_inode_state(inode, EXT3_STATE_XATTR); - } - } else - ei->i_extra_isize = 0; - - if (S_ISREG(inode->i_mode)) { - inode->i_op = &ext3_file_inode_operations; - inode->i_fop = &ext3_file_operations; - ext3_set_aops(inode); - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &ext3_dir_inode_operations; - inode->i_fop = &ext3_dir_operations; - } else if (S_ISLNK(inode->i_mode)) { - if (ext3_inode_is_fast_symlink(inode)) { - inode->i_op = &ext3_fast_symlink_inode_operations; - nd_terminate_link(ei->i_data, inode->i_size, - sizeof(ei->i_data) - 1); - inode->i_link = (char *)ei->i_data; - } else { - inode->i_op = &ext3_symlink_inode_operations; - ext3_set_aops(inode); - } - } else { - inode->i_op = &ext3_special_inode_operations; - if (raw_inode->i_block[0]) - init_special_inode(inode, inode->i_mode, - old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); - else - init_special_inode(inode, inode->i_mode, - new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); - } - brelse (iloc.bh); - ext3_set_inode_flags(inode); - unlock_new_inode(inode); - return inode; - -bad_inode: - iget_failed(inode); - return ERR_PTR(ret); -} - -/* - * Post the struct inode info into an on-disk inode location in the - * buffer-cache. This gobbles the caller's reference to the - * buffer_head in the inode location struct. - * - * The caller must have write access to iloc->bh. - */ -static int ext3_do_update_inode(handle_t *handle, - struct inode *inode, - struct ext3_iloc *iloc) -{ - struct ext3_inode *raw_inode = ext3_raw_inode(iloc); - struct ext3_inode_info *ei = EXT3_I(inode); - struct buffer_head *bh = iloc->bh; - int err = 0, rc, block; - int need_datasync = 0; - __le32 disksize; - uid_t i_uid; - gid_t i_gid; - -again: - /* we can't allow multiple procs in here at once, its a bit racey */ - lock_buffer(bh); - - /* For fields not not tracking in the in-memory inode, - * initialise them to zero for new inodes. */ - if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) - memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); - - ext3_get_inode_flags(ei); - raw_inode->i_mode = cpu_to_le16(inode->i_mode); - i_uid = i_uid_read(inode); - i_gid = i_gid_read(inode); - if(!(test_opt(inode->i_sb, NO_UID32))) { - raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); - raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); -/* - * Fix up interoperability with old kernels. 
Otherwise, old inodes get - * re-used with the upper 16 bits of the uid/gid intact - */ - if(!ei->i_dtime) { - raw_inode->i_uid_high = - cpu_to_le16(high_16_bits(i_uid)); - raw_inode->i_gid_high = - cpu_to_le16(high_16_bits(i_gid)); - } else { - raw_inode->i_uid_high = 0; - raw_inode->i_gid_high = 0; - } - } else { - raw_inode->i_uid_low = - cpu_to_le16(fs_high2lowuid(i_uid)); - raw_inode->i_gid_low = - cpu_to_le16(fs_high2lowgid(i_gid)); - raw_inode->i_uid_high = 0; - raw_inode->i_gid_high = 0; - } - raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); - disksize = cpu_to_le32(ei->i_disksize); - if (disksize != raw_inode->i_size) { - need_datasync = 1; - raw_inode->i_size = disksize; - } - raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); - raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); - raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); - raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); - raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); - raw_inode->i_flags = cpu_to_le32(ei->i_flags); -#ifdef EXT3_FRAGMENTS - raw_inode->i_faddr = cpu_to_le32(ei->i_faddr); - raw_inode->i_frag = ei->i_frag_no; - raw_inode->i_fsize = ei->i_frag_size; -#endif - raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); - if (!S_ISREG(inode->i_mode)) { - raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); - } else { - disksize = cpu_to_le32(ei->i_disksize >> 32); - if (disksize != raw_inode->i_size_high) { - raw_inode->i_size_high = disksize; - need_datasync = 1; - } - if (ei->i_disksize > 0x7fffffffULL) { - struct super_block *sb = inode->i_sb; - if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, - EXT3_FEATURE_RO_COMPAT_LARGE_FILE) || - EXT3_SB(sb)->s_es->s_rev_level == - cpu_to_le32(EXT3_GOOD_OLD_REV)) { - /* If this is the first large file - * created, add a flag to the superblock. - */ - unlock_buffer(bh); - err = ext3_journal_get_write_access(handle, - EXT3_SB(sb)->s_sbh); - if (err) - goto out_brelse; - - ext3_update_dynamic_rev(sb); - EXT3_SET_RO_COMPAT_FEATURE(sb, - EXT3_FEATURE_RO_COMPAT_LARGE_FILE); - handle->h_sync = 1; - err = ext3_journal_dirty_metadata(handle, - EXT3_SB(sb)->s_sbh); - /* get our lock and start over */ - goto again; - } - } - } - raw_inode->i_generation = cpu_to_le32(inode->i_generation); - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { - if (old_valid_dev(inode->i_rdev)) { - raw_inode->i_block[0] = - cpu_to_le32(old_encode_dev(inode->i_rdev)); - raw_inode->i_block[1] = 0; - } else { - raw_inode->i_block[0] = 0; - raw_inode->i_block[1] = - cpu_to_le32(new_encode_dev(inode->i_rdev)); - raw_inode->i_block[2] = 0; - } - } else for (block = 0; block < EXT3_N_BLOCKS; block++) - raw_inode->i_block[block] = ei->i_data[block]; - - if (ei->i_extra_isize) - raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); - - BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - unlock_buffer(bh); - rc = ext3_journal_dirty_metadata(handle, bh); - if (!err) - err = rc; - ext3_clear_inode_state(inode, EXT3_STATE_NEW); - - atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); - if (need_datasync) - atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); -out_brelse: - brelse (bh); - ext3_std_error(inode->i_sb, err); - return err; -} - -/* - * ext3_write_inode() - * - * We are called from a few places: - * - * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. - * Here, there will be no transaction running. We wait for any running - * transaction to commit. - * - * - Within flush work (for sys_sync(), kupdate and such). 
- * We wait on commit, if told to. - * - * - Within iput_final() -> write_inode_now() - * We wait on commit, if told to. - * - * In all cases it is actually safe for us to return without doing anything, - * because the inode has been copied into a raw inode buffer in - * ext3_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL - * writeback. - * - * Note that we are absolutely dependent upon all inode dirtiers doing the - * right thing: they *must* call mark_inode_dirty() after dirtying info in - * which we are interested. - * - * It would be a bug for them to not do this. The code: - * - * mark_inode_dirty(inode) - * stuff(); - * inode->i_size = expr; - * - * is in error because write_inode() could occur while `stuff()' is running, - * and the new i_size will be lost. Plus the inode will no longer be on the - * superblock's dirty inode list. - */ -int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) -{ - if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) - return 0; - - if (ext3_journal_current_handle()) { - jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); - dump_stack(); - return -EIO; - } - - /* - * No need to force transaction in WB_SYNC_NONE mode. Also - * ext3_sync_fs() will force the commit after everything is - * written. - */ - if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) - return 0; - - return ext3_force_commit(inode->i_sb); -} - -/* - * ext3_setattr() - * - * Called from notify_change. - * - * We want to trap VFS attempts to truncate the file as soon as - * possible. In particular, we want to make sure that when the VFS - * shrinks i_size, we put the inode on the orphan list and modify - * i_disksize immediately, so that during the subsequent flushing of - * dirty pages and freeing of disk blocks, we can guarantee that any - * commit will leave the blocks being flushed in an unused state on - * disk. (On recovery, the inode will get truncated and the blocks will - * be freed, so we have a strong guarantee that no future commit will - * leave these blocks visible to the user.) - * - * Called with inode->sem down. - */ -int ext3_setattr(struct dentry *dentry, struct iattr *attr) -{ - struct inode *inode = d_inode(dentry); - int error, rc = 0; - const unsigned int ia_valid = attr->ia_valid; - - error = inode_change_ok(inode, attr); - if (error) - return error; - - if (is_quota_modification(inode, attr)) - dquot_initialize(inode); - if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || - (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { - handle_t *handle; - - /* (user+group)*(old+new) structure, inode write (sb, - * inode block, ? 
- but truncate inode update has it) */ - handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ - EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - goto err_out; - } - error = dquot_transfer(inode, attr); - if (error) { - ext3_journal_stop(handle); - return error; - } - /* Update corresponding info in inode so that everything is in - * one transaction */ - if (attr->ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (attr->ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; - error = ext3_mark_inode_dirty(handle, inode); - ext3_journal_stop(handle); - } - - if (attr->ia_valid & ATTR_SIZE) - inode_dio_wait(inode); - - if (S_ISREG(inode->i_mode) && - attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { - handle_t *handle; - - handle = ext3_journal_start(inode, 3); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - goto err_out; - } - - error = ext3_orphan_add(handle, inode); - if (error) { - ext3_journal_stop(handle); - goto err_out; - } - EXT3_I(inode)->i_disksize = attr->ia_size; - error = ext3_mark_inode_dirty(handle, inode); - ext3_journal_stop(handle); - if (error) { - /* Some hard fs error must have happened. Bail out. */ - ext3_orphan_del(NULL, inode); - goto err_out; - } - rc = ext3_block_truncate_page(inode, attr->ia_size); - if (rc) { - /* Cleanup orphan list and exit */ - handle = ext3_journal_start(inode, 3); - if (IS_ERR(handle)) { - ext3_orphan_del(NULL, inode); - goto err_out; - } - ext3_orphan_del(handle, inode); - ext3_journal_stop(handle); - goto err_out; - } - } - - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - truncate_setsize(inode, attr->ia_size); - ext3_truncate(inode); - } - - setattr_copy(inode, attr); - mark_inode_dirty(inode); - - if (ia_valid & ATTR_MODE) - rc = posix_acl_chmod(inode, inode->i_mode); - -err_out: - ext3_std_error(inode->i_sb, error); - if (!error) - error = rc; - return error; -} - - -/* - * How many blocks doth make a writepage()? - * - * With N blocks per page, it may be: - * N data blocks - * 2 indirect block - * 2 dindirect - * 1 tindirect - * N+5 bitmap blocks (from the above) - * N+5 group descriptor summary blocks - * 1 inode block - * 1 superblock. - * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files - * - * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS - * - * With ordered or writeback data it's the same, less the N data blocks. - * - * If the inode's direct blocks can hold an integral number of pages then a - * page cannot straddle two indirect blocks, and we can only touch one indirect - * and dindirect block, and the "5" above becomes "3". - * - * This still overestimates under most circumstances. If we were to pass the - * start and end offsets in here as well we could do block_to_path() on each - * block and work out the exact number of indirects which are touched. Pah. - */ - -static int ext3_writepage_trans_blocks(struct inode *inode) -{ - int bpp = ext3_journal_blocks_per_page(inode); - int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; - int ret; - - if (ext3_should_journal_data(inode)) - ret = 3 * (bpp + indirects) + 2; - else - ret = 2 * (bpp + indirects) + indirects + 2; - -#ifdef CONFIG_QUOTA - /* We know that structure was already allocated during dquot_initialize so - * we will be updating only the data blocks + inodes */ - ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); -#endif - - return ret; -} - -/* - * The caller must have previously called ext3_reserve_inode_write(). 
- * Give this, we know that the caller already has write access to iloc->bh. - */ -int ext3_mark_iloc_dirty(handle_t *handle, - struct inode *inode, struct ext3_iloc *iloc) -{ - int err = 0; - - /* the do_update_inode consumes one bh->b_count */ - get_bh(iloc->bh); - - /* ext3_do_update_inode() does journal_dirty_metadata */ - err = ext3_do_update_inode(handle, inode, iloc); - put_bh(iloc->bh); - return err; -} - -/* - * On success, We end up with an outstanding reference count against - * iloc->bh. This _must_ be cleaned up later. - */ - -int -ext3_reserve_inode_write(handle_t *handle, struct inode *inode, - struct ext3_iloc *iloc) -{ - int err = 0; - if (handle) { - err = ext3_get_inode_loc(inode, iloc); - if (!err) { - BUFFER_TRACE(iloc->bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, iloc->bh); - if (err) { - brelse(iloc->bh); - iloc->bh = NULL; - } - } - } - ext3_std_error(inode->i_sb, err); - return err; -} - -/* - * What we do here is to mark the in-core inode as clean with respect to inode - * dirtiness (it may still be data-dirty). - * This means that the in-core inode may be reaped by prune_icache - * without having to perform any I/O. This is a very good thing, - * because *any* task may call prune_icache - even ones which - * have a transaction open against a different journal. - * - * Is this cheating? Not really. Sure, we haven't written the - * inode out, but prune_icache isn't a user-visible syncing function. - * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) - * we start and wait on commits. - */ -int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) -{ - struct ext3_iloc iloc; - int err; - - might_sleep(); - trace_ext3_mark_inode_dirty(inode, _RET_IP_); - err = ext3_reserve_inode_write(handle, inode, &iloc); - if (!err) - err = ext3_mark_iloc_dirty(handle, inode, &iloc); - return err; -} - -/* - * ext3_dirty_inode() is called from __mark_inode_dirty() - * - * We're really interested in the case where a file is being extended. - * i_size has been changed by generic_commit_write() and we thus need - * to include the updated inode in the current transaction. - * - * Also, dquot_alloc_space() will always dirty the inode when blocks - * are allocated to the file. - * - * If the inode is marked synchronous, we don't honour that here - doing - * so would cause a commit on atime updates, which we don't bother doing. - * We handle synchronous inodes at the highest possible level. - */ -void ext3_dirty_inode(struct inode *inode, int flags) -{ - handle_t *current_handle = ext3_journal_current_handle(); - handle_t *handle; - - handle = ext3_journal_start(inode, 2); - if (IS_ERR(handle)) - goto out; - if (current_handle && - current_handle->h_transaction != handle->h_transaction) { - /* This task has a transaction open against a different fs */ - printk(KERN_EMERG "%s: transactions do not match!\n", - __func__); - } else { - jbd_debug(5, "marking dirty. outer handle=%p\n", - current_handle); - ext3_mark_inode_dirty(handle, inode); - } - ext3_journal_stop(handle); -out: - return; -} - -#if 0 -/* - * Bind an inode's backing buffer_head into this transaction, to prevent - * it from being flushed to disk early. Unlike - * ext3_reserve_inode_write, this leaves behind no bh reference and - * returns no iloc structure, so the caller needs to repeat the iloc - * lookup to mark the inode dirty later. 
- */ -static int ext3_pin_inode(handle_t *handle, struct inode *inode) -{ - struct ext3_iloc iloc; - - int err = 0; - if (handle) { - err = ext3_get_inode_loc(inode, &iloc); - if (!err) { - BUFFER_TRACE(iloc.bh, "get_write_access"); - err = journal_get_write_access(handle, iloc.bh); - if (!err) - err = ext3_journal_dirty_metadata(handle, - iloc.bh); - brelse(iloc.bh); - } - } - ext3_std_error(inode->i_sb, err); - return err; -} -#endif - -int ext3_change_inode_journal_flag(struct inode *inode, int val) -{ - journal_t *journal; - handle_t *handle; - int err; - - /* - * We have to be very careful here: changing a data block's - * journaling status dynamically is dangerous. If we write a - * data block to the journal, change the status and then delete - * that block, we risk forgetting to revoke the old log record - * from the journal and so a subsequent replay can corrupt data. - * So, first we make sure that the journal is empty and that - * nobody is changing anything. - */ - - journal = EXT3_JOURNAL(inode); - if (is_journal_aborted(journal)) - return -EROFS; - - journal_lock_updates(journal); - journal_flush(journal); - - /* - * OK, there are no updates running now, and all cached data is - * synced to disk. We are now in a completely consistent state - * which doesn't have anything in the journal, and we know that - * no filesystem updates are running, so it is safe to modify - * the inode's in-core data-journaling state flag now. - */ - - if (val) - EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL; - else - EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL; - ext3_set_aops(inode); - - journal_unlock_updates(journal); - - /* Finally we can mark the inode as dirty. */ - - handle = ext3_journal_start(inode, 1); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - err = ext3_mark_inode_dirty(handle, inode); - handle->h_sync = 1; - ext3_journal_stop(handle); - ext3_std_error(inode->i_sb, err); - - return err; -} diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c deleted file mode 100644 index 4d96e9a64532..000000000000 --- a/fs/ext3/ioctl.c +++ /dev/null @@ -1,327 +0,0 @@ -/* - * linux/fs/ext3/ioctl.c - * - * Copyright (C) 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - */ - -#include <linux/mount.h> -#include <linux/compat.h> -#include <asm/uaccess.h> -#include "ext3.h" - -long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) -{ - struct inode *inode = file_inode(filp); - struct ext3_inode_info *ei = EXT3_I(inode); - unsigned int flags; - unsigned short rsv_window_size; - - ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); - - switch (cmd) { - case EXT3_IOC_GETFLAGS: - ext3_get_inode_flags(ei); - flags = ei->i_flags & EXT3_FL_USER_VISIBLE; - return put_user(flags, (int __user *) arg); - case EXT3_IOC_SETFLAGS: { - handle_t *handle = NULL; - int err; - struct ext3_iloc iloc; - unsigned int oldflags; - unsigned int jflag; - - if (!inode_owner_or_capable(inode)) - return -EACCES; - - if (get_user(flags, (int __user *) arg)) - return -EFAULT; - - err = mnt_want_write_file(filp); - if (err) - return err; - - flags = ext3_mask_flags(inode->i_mode, flags); - - mutex_lock(&inode->i_mutex); - - /* Is it quota file? 
Do not allow user to mess with it */ - err = -EPERM; - if (IS_NOQUOTA(inode)) - goto flags_out; - - oldflags = ei->i_flags; - - /* The JOURNAL_DATA flag is modifiable only by root */ - jflag = flags & EXT3_JOURNAL_DATA_FL; - - /* - * The IMMUTABLE and APPEND_ONLY flags can only be changed by - * the relevant capability. - * - * This test looks nicer. Thanks to Pauline Middelink - */ - if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) - goto flags_out; - } - - /* - * The JOURNAL_DATA flag can only be changed by - * the relevant capability. - */ - if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) - goto flags_out; - } - - handle = ext3_journal_start(inode, 1); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto flags_out; - } - if (IS_SYNC(inode)) - handle->h_sync = 1; - err = ext3_reserve_inode_write(handle, inode, &iloc); - if (err) - goto flags_err; - - flags = flags & EXT3_FL_USER_MODIFIABLE; - flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE; - ei->i_flags = flags; - - ext3_set_inode_flags(inode); - inode->i_ctime = CURRENT_TIME_SEC; - - err = ext3_mark_iloc_dirty(handle, inode, &iloc); -flags_err: - ext3_journal_stop(handle); - if (err) - goto flags_out; - - if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) - err = ext3_change_inode_journal_flag(inode, jflag); -flags_out: - mutex_unlock(&inode->i_mutex); - mnt_drop_write_file(filp); - return err; - } - case EXT3_IOC_GETVERSION: - case EXT3_IOC_GETVERSION_OLD: - return put_user(inode->i_generation, (int __user *) arg); - case EXT3_IOC_SETVERSION: - case EXT3_IOC_SETVERSION_OLD: { - handle_t *handle; - struct ext3_iloc iloc; - __u32 generation; - int err; - - if (!inode_owner_or_capable(inode)) - return -EPERM; - - err = mnt_want_write_file(filp); - if (err) - return err; - if (get_user(generation, (int __user *) arg)) { - err = -EFAULT; - goto setversion_out; - } - - mutex_lock(&inode->i_mutex); - handle = ext3_journal_start(inode, 1); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto unlock_out; - } - err = ext3_reserve_inode_write(handle, inode, &iloc); - if (err == 0) { - inode->i_ctime = CURRENT_TIME_SEC; - inode->i_generation = generation; - err = ext3_mark_iloc_dirty(handle, inode, &iloc); - } - ext3_journal_stop(handle); - -unlock_out: - mutex_unlock(&inode->i_mutex); -setversion_out: - mnt_drop_write_file(filp); - return err; - } - case EXT3_IOC_GETRSVSZ: - if (test_opt(inode->i_sb, RESERVATION) - && S_ISREG(inode->i_mode) - && ei->i_block_alloc_info) { - rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size; - return put_user(rsv_window_size, (int __user *)arg); - } - return -ENOTTY; - case EXT3_IOC_SETRSVSZ: { - int err; - - if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) - return -ENOTTY; - - err = mnt_want_write_file(filp); - if (err) - return err; - - if (!inode_owner_or_capable(inode)) { - err = -EACCES; - goto setrsvsz_out; - } - - if (get_user(rsv_window_size, (int __user *)arg)) { - err = -EFAULT; - goto setrsvsz_out; - } - - if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS) - rsv_window_size = EXT3_MAX_RESERVE_BLOCKS; - - /* - * need to allocate reservation structure for this inode - * before set the window size - */ - mutex_lock(&ei->truncate_mutex); - if (!ei->i_block_alloc_info) - ext3_init_block_alloc_info(inode); - - if (ei->i_block_alloc_info){ - struct ext3_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node; - rsv->rsv_goal_size = rsv_window_size; - } - 
mutex_unlock(&ei->truncate_mutex); -setrsvsz_out: - mnt_drop_write_file(filp); - return err; - } - case EXT3_IOC_GROUP_EXTEND: { - ext3_fsblk_t n_blocks_count; - struct super_block *sb = inode->i_sb; - int err, err2; - - if (!capable(CAP_SYS_RESOURCE)) - return -EPERM; - - err = mnt_want_write_file(filp); - if (err) - return err; - - if (get_user(n_blocks_count, (__u32 __user *)arg)) { - err = -EFAULT; - goto group_extend_out; - } - err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count); - journal_lock_updates(EXT3_SB(sb)->s_journal); - err2 = journal_flush(EXT3_SB(sb)->s_journal); - journal_unlock_updates(EXT3_SB(sb)->s_journal); - if (err == 0) - err = err2; -group_extend_out: - mnt_drop_write_file(filp); - return err; - } - case EXT3_IOC_GROUP_ADD: { - struct ext3_new_group_data input; - struct super_block *sb = inode->i_sb; - int err, err2; - - if (!capable(CAP_SYS_RESOURCE)) - return -EPERM; - - err = mnt_want_write_file(filp); - if (err) - return err; - - if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg, - sizeof(input))) { - err = -EFAULT; - goto group_add_out; - } - - err = ext3_group_add(sb, &input); - journal_lock_updates(EXT3_SB(sb)->s_journal); - err2 = journal_flush(EXT3_SB(sb)->s_journal); - journal_unlock_updates(EXT3_SB(sb)->s_journal); - if (err == 0) - err = err2; -group_add_out: - mnt_drop_write_file(filp); - return err; - } - case FITRIM: { - - struct super_block *sb = inode->i_sb; - struct fstrim_range range; - int ret = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&range, (struct fstrim_range __user *)arg, - sizeof(range))) - return -EFAULT; - - ret = ext3_trim_fs(sb, &range); - if (ret < 0) - return ret; - - if (copy_to_user((struct fstrim_range __user *)arg, &range, - sizeof(range))) - return -EFAULT; - - return 0; - } - - default: - return -ENOTTY; - } -} - -#ifdef CONFIG_COMPAT -long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - /* These are just misnamed, they actually get/put from/to user an int */ - switch (cmd) { - case EXT3_IOC32_GETFLAGS: - cmd = EXT3_IOC_GETFLAGS; - break; - case EXT3_IOC32_SETFLAGS: - cmd = EXT3_IOC_SETFLAGS; - break; - case EXT3_IOC32_GETVERSION: - cmd = EXT3_IOC_GETVERSION; - break; - case EXT3_IOC32_SETVERSION: - cmd = EXT3_IOC_SETVERSION; - break; - case EXT3_IOC32_GROUP_EXTEND: - cmd = EXT3_IOC_GROUP_EXTEND; - break; - case EXT3_IOC32_GETVERSION_OLD: - cmd = EXT3_IOC_GETVERSION_OLD; - break; - case EXT3_IOC32_SETVERSION_OLD: - cmd = EXT3_IOC_SETVERSION_OLD; - break; -#ifdef CONFIG_JBD_DEBUG - case EXT3_IOC32_WAIT_FOR_READONLY: - cmd = EXT3_IOC_WAIT_FOR_READONLY; - break; -#endif - case EXT3_IOC32_GETRSVSZ: - cmd = EXT3_IOC_GETRSVSZ; - break; - case EXT3_IOC32_SETRSVSZ: - cmd = EXT3_IOC_SETRSVSZ; - break; - case EXT3_IOC_GROUP_ADD: - break; - default: - return -ENOIOCTLCMD; - } - return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); -} -#endif diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c deleted file mode 100644 index c9e767cd4b67..000000000000 --- a/fs/ext3/namei.c +++ /dev/null @@ -1,2586 +0,0 @@ -/* - * linux/fs/ext3/namei.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/namei.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. 
Miller (davem@caip.rutgers.edu), 1995 - * Directory entry file type support and forward compatibility hooks - * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 - * Hash Tree Directory indexing (c) - * Daniel Phillips, 2001 - * Hash Tree Directory indexing porting - * Christopher Li, 2002 - * Hash Tree Directory indexing cleanup - * Theodore Ts'o, 2002 - */ - -#include <linux/quotaops.h> -#include "ext3.h" -#include "namei.h" -#include "xattr.h" -#include "acl.h" - -/* - * define how far ahead to read directories while searching them. - */ -#define NAMEI_RA_CHUNKS 2 -#define NAMEI_RA_BLOCKS 4 -#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) - -static struct buffer_head *ext3_append(handle_t *handle, - struct inode *inode, - u32 *block, int *err) -{ - struct buffer_head *bh; - - *block = inode->i_size >> inode->i_sb->s_blocksize_bits; - - if ((bh = ext3_dir_bread(handle, inode, *block, 1, err))) { - inode->i_size += inode->i_sb->s_blocksize; - EXT3_I(inode)->i_disksize = inode->i_size; - *err = ext3_journal_get_write_access(handle, bh); - if (*err) { - brelse(bh); - bh = NULL; - } - } - return bh; -} - -#ifndef assert -#define assert(test) J_ASSERT(test) -#endif - -#ifdef DX_DEBUG -#define dxtrace(command) command -#else -#define dxtrace(command) -#endif - -struct fake_dirent -{ - __le32 inode; - __le16 rec_len; - u8 name_len; - u8 file_type; -}; - -struct dx_countlimit -{ - __le16 limit; - __le16 count; -}; - -struct dx_entry -{ - __le32 hash; - __le32 block; -}; - -/* - * dx_root_info is laid out so that if it should somehow get overlaid by a - * dirent the two low bits of the hash version will be zero. Therefore, the - * hash version mod 4 should never be 0. Sincerely, the paranoia department. - */ - -struct dx_root -{ - struct fake_dirent dot; - char dot_name[4]; - struct fake_dirent dotdot; - char dotdot_name[4]; - struct dx_root_info - { - __le32 reserved_zero; - u8 hash_version; - u8 info_length; /* 8 */ - u8 indirect_levels; - u8 unused_flags; - } - info; - struct dx_entry entries[0]; -}; - -struct dx_node -{ - struct fake_dirent fake; - struct dx_entry entries[0]; -}; - - -struct dx_frame -{ - struct buffer_head *bh; - struct dx_entry *entries; - struct dx_entry *at; -}; - -struct dx_map_entry -{ - u32 hash; - u16 offs; - u16 size; -}; - -static inline unsigned dx_get_block (struct dx_entry *entry); -static void dx_set_block (struct dx_entry *entry, unsigned value); -static inline unsigned dx_get_hash (struct dx_entry *entry); -static void dx_set_hash (struct dx_entry *entry, unsigned value); -static unsigned dx_get_count (struct dx_entry *entries); -static unsigned dx_get_limit (struct dx_entry *entries); -static void dx_set_count (struct dx_entry *entries, unsigned value); -static void dx_set_limit (struct dx_entry *entries, unsigned value); -static unsigned dx_root_limit (struct inode *dir, unsigned infosize); -static unsigned dx_node_limit (struct inode *dir); -static struct dx_frame *dx_probe(struct qstr *entry, - struct inode *dir, - struct dx_hash_info *hinfo, - struct dx_frame *frame, - int *err); -static void dx_release (struct dx_frame *frames); -static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize, - struct dx_hash_info *hinfo, struct dx_map_entry map[]); -static void dx_sort_map(struct dx_map_entry *map, unsigned count); -static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, - struct dx_map_entry *offsets, int count); -static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize); -static void 
dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); -static int ext3_htree_next_block(struct inode *dir, __u32 hash, - struct dx_frame *frame, - struct dx_frame *frames, - __u32 *start_hash); -static struct buffer_head * ext3_dx_find_entry(struct inode *dir, - struct qstr *entry, struct ext3_dir_entry_2 **res_dir, - int *err); -static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode); - -/* - * p is at least 6 bytes before the end of page - */ -static inline struct ext3_dir_entry_2 * -ext3_next_entry(struct ext3_dir_entry_2 *p) -{ - return (struct ext3_dir_entry_2 *)((char *)p + - ext3_rec_len_from_disk(p->rec_len)); -} - -/* - * Future: use high four bits of block for coalesce-on-delete flags - * Mask them off for now. - */ - -static inline unsigned dx_get_block (struct dx_entry *entry) -{ - return le32_to_cpu(entry->block) & 0x00ffffff; -} - -static inline void dx_set_block (struct dx_entry *entry, unsigned value) -{ - entry->block = cpu_to_le32(value); -} - -static inline unsigned dx_get_hash (struct dx_entry *entry) -{ - return le32_to_cpu(entry->hash); -} - -static inline void dx_set_hash (struct dx_entry *entry, unsigned value) -{ - entry->hash = cpu_to_le32(value); -} - -static inline unsigned dx_get_count (struct dx_entry *entries) -{ - return le16_to_cpu(((struct dx_countlimit *) entries)->count); -} - -static inline unsigned dx_get_limit (struct dx_entry *entries) -{ - return le16_to_cpu(((struct dx_countlimit *) entries)->limit); -} - -static inline void dx_set_count (struct dx_entry *entries, unsigned value) -{ - ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); -} - -static inline void dx_set_limit (struct dx_entry *entries, unsigned value) -{ - ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); -} - -static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) -{ - unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - - EXT3_DIR_REC_LEN(2) - infosize; - return entry_space / sizeof(struct dx_entry); -} - -static inline unsigned dx_node_limit (struct inode *dir) -{ - unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); - return entry_space / sizeof(struct dx_entry); -} - -/* - * Debug - */ -#ifdef DX_DEBUG -static void dx_show_index (char * label, struct dx_entry *entries) -{ - int i, n = dx_get_count (entries); - printk("%s index ", label); - for (i = 0; i < n; i++) - { - printk("%x->%u ", i? 
dx_get_hash(entries + i): 0, dx_get_block(entries + i)); - } - printk("\n"); -} - -struct stats -{ - unsigned names; - unsigned space; - unsigned bcount; -}; - -static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de, - int size, int show_names) -{ - unsigned names = 0, space = 0; - char *base = (char *) de; - struct dx_hash_info h = *hinfo; - - printk("names: "); - while ((char *) de < base + size) - { - if (de->inode) - { - if (show_names) - { - int len = de->name_len; - char *name = de->name; - while (len--) printk("%c", *name++); - ext3fs_dirhash(de->name, de->name_len, &h); - printk(":%x.%u ", h.hash, - (unsigned) ((char *) de - base)); - } - space += EXT3_DIR_REC_LEN(de->name_len); - names++; - } - de = ext3_next_entry(de); - } - printk("(%i)\n", names); - return (struct stats) { names, space, 1 }; -} - -struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, - struct dx_entry *entries, int levels) -{ - unsigned blocksize = dir->i_sb->s_blocksize; - unsigned count = dx_get_count (entries), names = 0, space = 0, i; - unsigned bcount = 0; - struct buffer_head *bh; - int err; - printk("%i indexed blocks...\n", count); - for (i = 0; i < count; i++, entries++) - { - u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0; - u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; - struct stats stats; - printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); - if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue; - stats = levels? - dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): - dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0); - names += stats.names; - space += stats.space; - bcount += stats.bcount; - brelse (bh); - } - if (bcount) - printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", - names, space/bcount,(space/bcount)*100/blocksize); - return (struct stats) { names, space, bcount}; -} -#endif /* DX_DEBUG */ - -/* - * Probe for a directory leaf block to search. - * - * dx_probe can return ERR_BAD_DX_DIR, which means there was a format - * error in the directory index, and the caller should fall back to - * searching the directory normally. The callers of dx_probe **MUST** - * check for this error code, and make sure it never gets reflected - * back to userspace. 
- */ -static struct dx_frame * -dx_probe(struct qstr *entry, struct inode *dir, - struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) -{ - unsigned count, indirect; - struct dx_entry *at, *entries, *p, *q, *m; - struct dx_root *root; - struct buffer_head *bh; - struct dx_frame *frame = frame_in; - u32 hash; - - frame->bh = NULL; - if (!(bh = ext3_dir_bread(NULL, dir, 0, 0, err))) { - *err = ERR_BAD_DX_DIR; - goto fail; - } - root = (struct dx_root *) bh->b_data; - if (root->info.hash_version != DX_HASH_TEA && - root->info.hash_version != DX_HASH_HALF_MD4 && - root->info.hash_version != DX_HASH_LEGACY) { - ext3_warning(dir->i_sb, __func__, - "Unrecognised inode hash code %d", - root->info.hash_version); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail; - } - hinfo->hash_version = root->info.hash_version; - if (hinfo->hash_version <= DX_HASH_TEA) - hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; - hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; - if (entry) - ext3fs_dirhash(entry->name, entry->len, hinfo); - hash = hinfo->hash; - - if (root->info.unused_flags & 1) { - ext3_warning(dir->i_sb, __func__, - "Unimplemented inode hash flags: %#06x", - root->info.unused_flags); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail; - } - - if ((indirect = root->info.indirect_levels) > 1) { - ext3_warning(dir->i_sb, __func__, - "Unimplemented inode hash depth: %#06x", - root->info.indirect_levels); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail; - } - - entries = (struct dx_entry *) (((char *)&root->info) + - root->info.info_length); - - if (dx_get_limit(entries) != dx_root_limit(dir, - root->info.info_length)) { - ext3_warning(dir->i_sb, __func__, - "dx entry: limit != root limit"); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail; - } - - dxtrace (printk("Look up %x", hash)); - while (1) - { - count = dx_get_count(entries); - if (!count || count > dx_get_limit(entries)) { - ext3_warning(dir->i_sb, __func__, - "dx entry: no count or count > limit"); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail2; - } - - p = entries + 1; - q = entries + count - 1; - while (p <= q) - { - m = p + (q - p)/2; - dxtrace(printk(".")); - if (dx_get_hash(m) > hash) - q = m - 1; - else - p = m + 1; - } - - if (0) // linear search cross check - { - unsigned n = count - 1; - at = entries; - while (n--) - { - dxtrace(printk(",")); - if (dx_get_hash(++at) > hash) - { - at--; - break; - } - } - assert (at == p - 1); - } - - at = p - 1; - dxtrace(printk(" %x->%u\n", at == entries? 
0: dx_get_hash(at), dx_get_block(at))); - frame->bh = bh; - frame->entries = entries; - frame->at = at; - if (!indirect--) return frame; - if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(at), 0, err))) { - *err = ERR_BAD_DX_DIR; - goto fail2; - } - at = entries = ((struct dx_node *) bh->b_data)->entries; - if (dx_get_limit(entries) != dx_node_limit (dir)) { - ext3_warning(dir->i_sb, __func__, - "dx entry: limit != node limit"); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail2; - } - frame++; - frame->bh = NULL; - } -fail2: - while (frame >= frame_in) { - brelse(frame->bh); - frame--; - } -fail: - if (*err == ERR_BAD_DX_DIR) - ext3_warning(dir->i_sb, __func__, - "Corrupt dir inode %ld, running e2fsck is " - "recommended.", dir->i_ino); - return NULL; -} - -static void dx_release (struct dx_frame *frames) -{ - if (frames[0].bh == NULL) - return; - - if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) - brelse(frames[1].bh); - brelse(frames[0].bh); -} - -/* - * This function increments the frame pointer to search the next leaf - * block, and reads in the necessary intervening nodes if the search - * should be necessary. Whether or not the search is necessary is - * controlled by the hash parameter. If the hash value is even, then - * the search is only continued if the next block starts with that - * hash value. This is used if we are searching for a specific file. - * - * If the hash value is HASH_NB_ALWAYS, then always go to the next block. - * - * This function returns 1 if the caller should continue to search, - * or 0 if it should not. If there is an error reading one of the - * index blocks, it will a negative error code. - * - * If start_hash is non-null, it will be filled in with the starting - * hash of the next page. - */ -static int ext3_htree_next_block(struct inode *dir, __u32 hash, - struct dx_frame *frame, - struct dx_frame *frames, - __u32 *start_hash) -{ - struct dx_frame *p; - struct buffer_head *bh; - int err, num_frames = 0; - __u32 bhash; - - p = frame; - /* - * Find the next leaf page by incrementing the frame pointer. - * If we run out of entries in the interior node, loop around and - * increment pointer in the parent node. When we break out of - * this loop, num_frames indicates the number of interior - * nodes need to be read. - */ - while (1) { - if (++(p->at) < p->entries + dx_get_count(p->entries)) - break; - if (p == frames) - return 0; - num_frames++; - p--; - } - - /* - * If the hash is 1, then continue only if the next page has a - * continuation hash of any value. This is used for readdir - * handling. Otherwise, check to see if the hash matches the - * desired contiuation hash. If it doesn't, return since - * there's no point to read in the successive index pages. - */ - bhash = dx_get_hash(p->at); - if (start_hash) - *start_hash = bhash; - if ((hash & 1) == 0) { - if ((bhash & ~1) != hash) - return 0; - } - /* - * If the hash is HASH_NB_ALWAYS, we always go to the next - * block so no check is necessary - */ - while (num_frames--) { - if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(p->at), - 0, &err))) - return err; /* Failure */ - p++; - brelse (p->bh); - p->bh = bh; - p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; - } - return 1; -} - - -/* - * This function fills a red-black tree with information from a - * directory block. It returns the number directory entries loaded - * into the tree. If there is an error it is returned in err. 
- */ -static int htree_dirblock_to_tree(struct file *dir_file, - struct inode *dir, int block, - struct dx_hash_info *hinfo, - __u32 start_hash, __u32 start_minor_hash) -{ - struct buffer_head *bh; - struct ext3_dir_entry_2 *de, *top; - int err = 0, count = 0; - - dxtrace(printk("In htree dirblock_to_tree: block %d\n", block)); - - if (!(bh = ext3_dir_bread(NULL, dir, block, 0, &err))) - return err; - - de = (struct ext3_dir_entry_2 *) bh->b_data; - top = (struct ext3_dir_entry_2 *) ((char *) de + - dir->i_sb->s_blocksize - - EXT3_DIR_REC_LEN(0)); - for (; de < top; de = ext3_next_entry(de)) { - if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, - (block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb)) - +((char *)de - bh->b_data))) { - /* silently ignore the rest of the block */ - break; - } - ext3fs_dirhash(de->name, de->name_len, hinfo); - if ((hinfo->hash < start_hash) || - ((hinfo->hash == start_hash) && - (hinfo->minor_hash < start_minor_hash))) - continue; - if (de->inode == 0) - continue; - if ((err = ext3_htree_store_dirent(dir_file, - hinfo->hash, hinfo->minor_hash, de)) != 0) { - brelse(bh); - return err; - } - count++; - } - brelse(bh); - return count; -} - - -/* - * This function fills a red-black tree with information from a - * directory. We start scanning the directory in hash order, starting - * at start_hash and start_minor_hash. - * - * This function returns the number of entries inserted into the tree, - * or a negative error code. - */ -int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, - __u32 start_minor_hash, __u32 *next_hash) -{ - struct dx_hash_info hinfo; - struct ext3_dir_entry_2 *de; - struct dx_frame frames[2], *frame; - struct inode *dir; - int block, err; - int count = 0; - int ret; - __u32 hashval; - - dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, - start_minor_hash)); - dir = file_inode(dir_file); - if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { - hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; - if (hinfo.hash_version <= DX_HASH_TEA) - hinfo.hash_version += - EXT3_SB(dir->i_sb)->s_hash_unsigned; - hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; - count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, - start_hash, start_minor_hash); - *next_hash = ~0; - return count; - } - hinfo.hash = start_hash; - hinfo.minor_hash = 0; - frame = dx_probe(NULL, file_inode(dir_file), &hinfo, frames, &err); - if (!frame) - return err; - - /* Add '.' and '..' 
from the htree header */ - if (!start_hash && !start_minor_hash) { - de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data; - if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0) - goto errout; - count++; - } - if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { - de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data; - de = ext3_next_entry(de); - if ((err = ext3_htree_store_dirent(dir_file, 2, 0, de)) != 0) - goto errout; - count++; - } - - while (1) { - block = dx_get_block(frame->at); - ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, - start_hash, start_minor_hash); - if (ret < 0) { - err = ret; - goto errout; - } - count += ret; - hashval = ~0; - ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, - frame, frames, &hashval); - *next_hash = hashval; - if (ret < 0) { - err = ret; - goto errout; - } - /* - * Stop if: (a) there are no more entries, or - * (b) we have inserted at least one entry and the - * next hash value is not a continuation - */ - if ((ret == 0) || - (count && ((hashval & 1) == 0))) - break; - } - dx_release(frames); - dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", - count, *next_hash)); - return count; -errout: - dx_release(frames); - return (err); -} - - -/* - * Directory block splitting, compacting - */ - -/* - * Create map of hash values, offsets, and sizes, stored at end of block. - * Returns number of entries mapped. - */ -static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize, - struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) -{ - int count = 0; - char *base = (char *) de; - struct dx_hash_info h = *hinfo; - - while ((char *) de < base + blocksize) - { - if (de->name_len && de->inode) { - ext3fs_dirhash(de->name, de->name_len, &h); - map_tail--; - map_tail->hash = h.hash; - map_tail->offs = (u16) ((char *) de - base); - map_tail->size = le16_to_cpu(de->rec_len); - count++; - cond_resched(); - } - /* XXX: do we need to check rec_len == 0 case? -Chris */ - de = ext3_next_entry(de); - } - return count; -} - -/* Sort map by hash value */ -static void dx_sort_map (struct dx_map_entry *map, unsigned count) -{ - struct dx_map_entry *p, *q, *top = map + count - 1; - int more; - /* Combsort until bubble sort doesn't suck */ - while (count > 2) - { - count = count*10/13; - if (count - 9 < 2) /* 9, 10 -> 11 */ - count = 11; - for (p = top, q = p - count; q >= map; p--, q--) - if (p->hash < q->hash) - swap(*p, *q); - } - /* Garden variety bubble sort */ - do { - more = 0; - q = top; - while (q-- > map) - { - if (q[1].hash >= q[0].hash) - continue; - swap(*(q+1), *q); - more = 1; - } - } while(more); -} - -static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) -{ - struct dx_entry *entries = frame->entries; - struct dx_entry *old = frame->at, *new = old + 1; - int count = dx_get_count(entries); - - assert(count < dx_get_limit(entries)); - assert(old < entries + count); - memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); - dx_set_hash(new, hash); - dx_set_block(new, block); - dx_set_count(entries, count + 1); -} - -static void ext3_update_dx_flag(struct inode *inode) -{ - if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb, - EXT3_FEATURE_COMPAT_DIR_INDEX)) - EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; -} - -/* - * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure. - * - * `len <= EXT3_NAME_LEN' is guaranteed by caller. - * `de != NULL' is guaranteed by caller. 
- */ -static inline int ext3_match (int len, const char * const name, - struct ext3_dir_entry_2 * de) -{ - if (len != de->name_len) - return 0; - if (!de->inode) - return 0; - return !memcmp(name, de->name, len); -} - -/* - * Returns 0 if not found, -1 on failure, and 1 on success - */ -static inline int search_dirblock(struct buffer_head * bh, - struct inode *dir, - struct qstr *child, - unsigned long offset, - struct ext3_dir_entry_2 ** res_dir) -{ - struct ext3_dir_entry_2 * de; - char * dlimit; - int de_len; - const char *name = child->name; - int namelen = child->len; - - de = (struct ext3_dir_entry_2 *) bh->b_data; - dlimit = bh->b_data + dir->i_sb->s_blocksize; - while ((char *) de < dlimit) { - /* this code is executed quadratically often */ - /* do minimal checking `by hand' */ - - if ((char *) de + namelen <= dlimit && - ext3_match (namelen, name, de)) { - /* found a match - just to be sure, do a full check */ - if (!ext3_check_dir_entry("ext3_find_entry", - dir, de, bh, offset)) - return -1; - *res_dir = de; - return 1; - } - /* prevent looping on a bad block */ - de_len = ext3_rec_len_from_disk(de->rec_len); - if (de_len <= 0) - return -1; - offset += de_len; - de = (struct ext3_dir_entry_2 *) ((char *) de + de_len); - } - return 0; -} - - -/* - * ext3_find_entry() - * - * finds an entry in the specified directory with the wanted name. It - * returns the cache buffer in which the entry was found, and the entry - * itself (as a parameter - res_dir). It does NOT read the inode of the - * entry - you'll have to do that yourself if you want to. - * - * The returned buffer_head has ->b_count elevated. The caller is expected - * to brelse() it when appropriate. - */ -static struct buffer_head *ext3_find_entry(struct inode *dir, - struct qstr *entry, - struct ext3_dir_entry_2 **res_dir) -{ - struct super_block * sb; - struct buffer_head * bh_use[NAMEI_RA_SIZE]; - struct buffer_head * bh, *ret = NULL; - unsigned long start, block, b; - const u8 *name = entry->name; - int ra_max = 0; /* Number of bh's in the readahead - buffer, bh_use[] */ - int ra_ptr = 0; /* Current index into readahead - buffer */ - int num = 0; - int nblocks, i, err; - int namelen; - - *res_dir = NULL; - sb = dir->i_sb; - namelen = entry->len; - if (namelen > EXT3_NAME_LEN) - return NULL; - if ((namelen <= 2) && (name[0] == '.') && - (name[1] == '.' || name[1] == 0)) { - /* - * "." or ".." will only be in the first block - * NFS may look up ".."; "." should be handled by the VFS - */ - block = start = 0; - nblocks = 1; - goto restart; - } - if (is_dx(dir)) { - bh = ext3_dx_find_entry(dir, entry, res_dir, &err); - /* - * On success, or if the error was file not found, - * return. Otherwise, fall back to doing a search the - * old fashioned way. - */ - if (bh || (err != ERR_BAD_DX_DIR)) - return bh; - dxtrace(printk("ext3_find_entry: dx failed, falling back\n")); - } - nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); - start = EXT3_I(dir)->i_dir_start_lookup; - if (start >= nblocks) - start = 0; - block = start; -restart: - do { - /* - * We deal with the read-ahead logic here. - */ - if (ra_ptr >= ra_max) { - /* Refill the readahead buffer */ - ra_ptr = 0; - b = block; - for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { - /* - * Terminate if we reach the end of the - * directory and must wrap, or if our - * search has finished at this block. 
- */ - if (b >= nblocks || (num && block == start)) { - bh_use[ra_max] = NULL; - break; - } - num++; - bh = ext3_getblk(NULL, dir, b++, 0, &err); - bh_use[ra_max] = bh; - if (bh && !bh_uptodate_or_lock(bh)) { - get_bh(bh); - bh->b_end_io = end_buffer_read_sync; - submit_bh(READ | REQ_META | REQ_PRIO, - bh); - } - } - } - if ((bh = bh_use[ra_ptr++]) == NULL) - goto next; - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - /* read error, skip block & hope for the best */ - ext3_error(sb, __func__, "reading directory #%lu " - "offset %lu", dir->i_ino, block); - brelse(bh); - goto next; - } - i = search_dirblock(bh, dir, entry, - block << EXT3_BLOCK_SIZE_BITS(sb), res_dir); - if (i == 1) { - EXT3_I(dir)->i_dir_start_lookup = block; - ret = bh; - goto cleanup_and_exit; - } else { - brelse(bh); - if (i < 0) - goto cleanup_and_exit; - } - next: - if (++block >= nblocks) - block = 0; - } while (block != start); - - /* - * If the directory has grown while we were searching, then - * search the last part of the directory before giving up. - */ - block = nblocks; - nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); - if (block < nblocks) { - start = 0; - goto restart; - } - -cleanup_and_exit: - /* Clean up the read-ahead blocks */ - for (; ra_ptr < ra_max; ra_ptr++) - brelse (bh_use[ra_ptr]); - return ret; -} - -static struct buffer_head * ext3_dx_find_entry(struct inode *dir, - struct qstr *entry, struct ext3_dir_entry_2 **res_dir, - int *err) -{ - struct super_block *sb = dir->i_sb; - struct dx_hash_info hinfo; - struct dx_frame frames[2], *frame; - struct buffer_head *bh; - unsigned long block; - int retval; - - if (!(frame = dx_probe(entry, dir, &hinfo, frames, err))) - return NULL; - do { - block = dx_get_block(frame->at); - if (!(bh = ext3_dir_bread (NULL, dir, block, 0, err))) - goto errout; - - retval = search_dirblock(bh, dir, entry, - block << EXT3_BLOCK_SIZE_BITS(sb), - res_dir); - if (retval == 1) { - dx_release(frames); - return bh; - } - brelse(bh); - if (retval == -1) { - *err = ERR_BAD_DX_DIR; - goto errout; - } - - /* Check to see if we should continue to search */ - retval = ext3_htree_next_block(dir, hinfo.hash, frame, - frames, NULL); - if (retval < 0) { - ext3_warning(sb, __func__, - "error reading index page in directory #%lu", - dir->i_ino); - *err = retval; - goto errout; - } - } while (retval == 1); - - *err = -ENOENT; -errout: - dxtrace(printk("%s not found\n", entry->name)); - dx_release (frames); - return NULL; -} - -static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags) -{ - struct inode * inode; - struct ext3_dir_entry_2 * de; - struct buffer_head * bh; - - if (dentry->d_name.len > EXT3_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); - - bh = ext3_find_entry(dir, &dentry->d_name, &de); - inode = NULL; - if (bh) { - unsigned long ino = le32_to_cpu(de->inode); - brelse (bh); - if (!ext3_valid_inum(dir->i_sb, ino)) { - ext3_error(dir->i_sb, "ext3_lookup", - "bad inode number: %lu", ino); - return ERR_PTR(-EIO); - } - inode = ext3_iget(dir->i_sb, ino); - if (inode == ERR_PTR(-ESTALE)) { - ext3_error(dir->i_sb, __func__, - "deleted inode referenced: %lu", - ino); - return ERR_PTR(-EIO); - } - } - return d_splice_alias(inode, dentry); -} - - -struct dentry *ext3_get_parent(struct dentry *child) -{ - unsigned long ino; - struct qstr dotdot = QSTR_INIT("..", 2); - struct ext3_dir_entry_2 * de; - struct buffer_head *bh; - - bh = ext3_find_entry(d_inode(child), &dotdot, &de); - if (!bh) - return ERR_PTR(-ENOENT); - ino = 
le32_to_cpu(de->inode); - brelse(bh); - - if (!ext3_valid_inum(d_inode(child)->i_sb, ino)) { - ext3_error(d_inode(child)->i_sb, "ext3_get_parent", - "bad inode number: %lu", ino); - return ERR_PTR(-EIO); - } - - return d_obtain_alias(ext3_iget(d_inode(child)->i_sb, ino)); -} - -#define S_SHIFT 12 -static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = EXT3_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = EXT3_FT_DIR, - [S_IFCHR >> S_SHIFT] = EXT3_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = EXT3_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = EXT3_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = EXT3_FT_SOCK, - [S_IFLNK >> S_SHIFT] = EXT3_FT_SYMLINK, -}; - -static inline void ext3_set_de_type(struct super_block *sb, - struct ext3_dir_entry_2 *de, - umode_t mode) { - if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE)) - de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; -} - -/* - * Move count entries from end of map between two memory locations. - * Returns pointer to last entry moved. - */ -static struct ext3_dir_entry_2 * -dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) -{ - unsigned rec_len = 0; - - while (count--) { - struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); - rec_len = EXT3_DIR_REC_LEN(de->name_len); - memcpy (to, de, rec_len); - ((struct ext3_dir_entry_2 *) to)->rec_len = - ext3_rec_len_to_disk(rec_len); - de->inode = 0; - map++; - to += rec_len; - } - return (struct ext3_dir_entry_2 *) (to - rec_len); -} - -/* - * Compact each dir entry in the range to the minimal rec_len. - * Returns pointer to last entry in range. - */ -static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize) -{ - struct ext3_dir_entry_2 *next, *to, *prev; - struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base; - unsigned rec_len = 0; - - prev = to = de; - while ((char *)de < base + blocksize) { - next = ext3_next_entry(de); - if (de->inode && de->name_len) { - rec_len = EXT3_DIR_REC_LEN(de->name_len); - if (de > to) - memmove(to, de, rec_len); - to->rec_len = ext3_rec_len_to_disk(rec_len); - prev = to; - to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); - } - de = next; - } - return prev; -} - -/* - * Split a full leaf block to make room for a new dir entry. - * Allocate a new block, and move entries so that they are approx. equally full. - * Returns pointer to de in block into which the new entry will be inserted. 
- */ -static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, - struct buffer_head **bh,struct dx_frame *frame, - struct dx_hash_info *hinfo, int *error) -{ - unsigned blocksize = dir->i_sb->s_blocksize; - unsigned count, continued; - struct buffer_head *bh2; - u32 newblock; - u32 hash2; - struct dx_map_entry *map; - char *data1 = (*bh)->b_data, *data2; - unsigned split, move, size; - struct ext3_dir_entry_2 *de = NULL, *de2; - int err = 0, i; - - bh2 = ext3_append (handle, dir, &newblock, &err); - if (!(bh2)) { - brelse(*bh); - *bh = NULL; - goto errout; - } - - BUFFER_TRACE(*bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, *bh); - if (err) - goto journal_error; - - BUFFER_TRACE(frame->bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, frame->bh); - if (err) - goto journal_error; - - data2 = bh2->b_data; - - /* create map in the end of data2 block */ - map = (struct dx_map_entry *) (data2 + blocksize); - count = dx_make_map ((struct ext3_dir_entry_2 *) data1, - blocksize, hinfo, map); - map -= count; - dx_sort_map (map, count); - /* Split the existing block in the middle, size-wise */ - size = 0; - move = 0; - for (i = count-1; i >= 0; i--) { - /* is more than half of this entry in 2nd half of the block? */ - if (size + map[i].size/2 > blocksize/2) - break; - size += map[i].size; - move++; - } - /* map index at which we will split */ - split = count - move; - hash2 = map[split].hash; - continued = hash2 == map[split - 1].hash; - dxtrace(printk("Split block %i at %x, %i/%i\n", - dx_get_block(frame->at), hash2, split, count-split)); - - /* Fancy dance to stay within two buffers */ - de2 = dx_move_dirents(data1, data2, map + split, count - split); - de = dx_pack_dirents(data1,blocksize); - de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de); - de2->rec_len = ext3_rec_len_to_disk(data2 + blocksize - (char *) de2); - dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); - dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); - - /* Which block gets the new entry? */ - if (hinfo->hash >= hash2) - { - swap(*bh, bh2); - de = de2; - } - dx_insert_block (frame, hash2 + continued, newblock); - err = ext3_journal_dirty_metadata (handle, bh2); - if (err) - goto journal_error; - err = ext3_journal_dirty_metadata (handle, frame->bh); - if (err) - goto journal_error; - brelse (bh2); - dxtrace(dx_show_index ("frame", frame->entries)); - return de; - -journal_error: - brelse(*bh); - brelse(bh2); - *bh = NULL; - ext3_std_error(dir->i_sb, err); -errout: - *error = err; - return NULL; -} - - -/* - * Add a new entry into a directory (leaf) block. If de is non-NULL, - * it points to a directory entry which is guaranteed to be large - * enough for new directory entry. If de is NULL, then - * add_dirent_to_buf will attempt search the directory block for - * space. It will return -ENOSPC if no space is available, and -EIO - * and -EEXIST if directory entry already exists. - * - * NOTE! bh is NOT released in the case where ENOSPC is returned. In - * all other cases bh is released. 
- */ -static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, - struct inode *inode, struct ext3_dir_entry_2 *de, - struct buffer_head * bh) -{ - struct inode *dir = d_inode(dentry->d_parent); - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; - unsigned long offset = 0; - unsigned short reclen; - int nlen, rlen, err; - char *top; - - reclen = EXT3_DIR_REC_LEN(namelen); - if (!de) { - de = (struct ext3_dir_entry_2 *)bh->b_data; - top = bh->b_data + dir->i_sb->s_blocksize - reclen; - while ((char *) de <= top) { - if (!ext3_check_dir_entry("ext3_add_entry", dir, de, - bh, offset)) { - brelse (bh); - return -EIO; - } - if (ext3_match (namelen, name, de)) { - brelse (bh); - return -EEXIST; - } - nlen = EXT3_DIR_REC_LEN(de->name_len); - rlen = ext3_rec_len_from_disk(de->rec_len); - if ((de->inode? rlen - nlen: rlen) >= reclen) - break; - de = (struct ext3_dir_entry_2 *)((char *)de + rlen); - offset += rlen; - } - if ((char *) de > top) - return -ENOSPC; - } - BUFFER_TRACE(bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, bh); - if (err) { - ext3_std_error(dir->i_sb, err); - brelse(bh); - return err; - } - - /* By now the buffer is marked for journaling */ - nlen = EXT3_DIR_REC_LEN(de->name_len); - rlen = ext3_rec_len_from_disk(de->rec_len); - if (de->inode) { - struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); - de1->rec_len = ext3_rec_len_to_disk(rlen - nlen); - de->rec_len = ext3_rec_len_to_disk(nlen); - de = de1; - } - de->file_type = EXT3_FT_UNKNOWN; - if (inode) { - de->inode = cpu_to_le32(inode->i_ino); - ext3_set_de_type(dir->i_sb, de, inode->i_mode); - } else - de->inode = 0; - de->name_len = namelen; - memcpy (de->name, name, namelen); - /* - * XXX shouldn't update any times until successful - * completion of syscall, but too many callers depend - * on this. - * - * XXX similarly, too many callers depend on - * ext3_new_inode() setting the times, but error - * recovery deletes the inode, so the worst that can - * happen is that the times are slightly out of date - * and/or different from the directory change time. - */ - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; - ext3_update_dx_flag(dir); - dir->i_version++; - ext3_mark_inode_dirty(handle, dir); - BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, bh); - if (err) - ext3_std_error(dir->i_sb, err); - brelse(bh); - return 0; -} - -/* - * This converts a one block unindexed directory to a 3 block indexed - * directory, and adds the dentry to the indexed directory. 
- */ -static int make_indexed_dir(handle_t *handle, struct dentry *dentry, - struct inode *inode, struct buffer_head *bh) -{ - struct inode *dir = d_inode(dentry->d_parent); - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; - struct buffer_head *bh2; - struct dx_root *root; - struct dx_frame frames[2], *frame; - struct dx_entry *entries; - struct ext3_dir_entry_2 *de, *de2; - char *data1, *top; - unsigned len; - int retval; - unsigned blocksize; - struct dx_hash_info hinfo; - u32 block; - struct fake_dirent *fde; - - blocksize = dir->i_sb->s_blocksize; - dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); - retval = ext3_journal_get_write_access(handle, bh); - if (retval) { - ext3_std_error(dir->i_sb, retval); - brelse(bh); - return retval; - } - root = (struct dx_root *) bh->b_data; - - /* The 0th block becomes the root, move the dirents out */ - fde = &root->dotdot; - de = (struct ext3_dir_entry_2 *)((char *)fde + - ext3_rec_len_from_disk(fde->rec_len)); - if ((char *) de >= (((char *) root) + blocksize)) { - ext3_error(dir->i_sb, __func__, - "invalid rec_len for '..' in inode %lu", - dir->i_ino); - brelse(bh); - return -EIO; - } - len = ((char *) root) + blocksize - (char *) de; - - bh2 = ext3_append (handle, dir, &block, &retval); - if (!(bh2)) { - brelse(bh); - return retval; - } - EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; - data1 = bh2->b_data; - - memcpy (data1, de, len); - de = (struct ext3_dir_entry_2 *) data1; - top = data1 + len; - while ((char *)(de2 = ext3_next_entry(de)) < top) - de = de2; - de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de); - /* Initialize the root; the dot dirents already exist */ - de = (struct ext3_dir_entry_2 *) (&root->dotdot); - de->rec_len = ext3_rec_len_to_disk(blocksize - EXT3_DIR_REC_LEN(2)); - memset (&root->info, 0, sizeof(root->info)); - root->info.info_length = sizeof(root->info); - root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; - entries = root->entries; - dx_set_block (entries, 1); - dx_set_count (entries, 1); - dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); - - /* Initialize as for dx_probe */ - hinfo.hash_version = root->info.hash_version; - if (hinfo.hash_version <= DX_HASH_TEA) - hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; - hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; - ext3fs_dirhash(name, namelen, &hinfo); - frame = frames; - frame->entries = entries; - frame->at = entries; - frame->bh = bh; - bh = bh2; - /* - * Mark buffers dirty here so that if do_split() fails we write a - * consistent set of buffers to disk. - */ - ext3_journal_dirty_metadata(handle, frame->bh); - ext3_journal_dirty_metadata(handle, bh); - de = do_split(handle,dir, &bh, frame, &hinfo, &retval); - if (!de) { - ext3_mark_inode_dirty(handle, dir); - dx_release(frames); - return retval; - } - dx_release(frames); - - return add_dirent_to_buf(handle, dentry, inode, de, bh); -} - -/* - * ext3_add_entry() - * - * adds a file entry to the specified directory, using the same - * semantics as ext3_find_entry(). It returns NULL if it failed. - * - * NOTE!! The inode part of 'de' is left at 0 - which means you - * may not sleep between calling this and putting something into - * the entry, as someone else might have used it while you slept. 
- */ -static int ext3_add_entry (handle_t *handle, struct dentry *dentry, - struct inode *inode) -{ - struct inode *dir = d_inode(dentry->d_parent); - struct buffer_head * bh; - struct ext3_dir_entry_2 *de; - struct super_block * sb; - int retval; - int dx_fallback=0; - unsigned blocksize; - u32 block, blocks; - - sb = dir->i_sb; - blocksize = sb->s_blocksize; - if (!dentry->d_name.len) - return -EINVAL; - if (is_dx(dir)) { - retval = ext3_dx_add_entry(handle, dentry, inode); - if (!retval || (retval != ERR_BAD_DX_DIR)) - return retval; - EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL; - dx_fallback++; - ext3_mark_inode_dirty(handle, dir); - } - blocks = dir->i_size >> sb->s_blocksize_bits; - for (block = 0; block < blocks; block++) { - if (!(bh = ext3_dir_bread(handle, dir, block, 0, &retval))) - return retval; - - retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); - if (retval != -ENOSPC) - return retval; - - if (blocks == 1 && !dx_fallback && - EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) - return make_indexed_dir(handle, dentry, inode, bh); - brelse(bh); - } - bh = ext3_append(handle, dir, &block, &retval); - if (!bh) - return retval; - de = (struct ext3_dir_entry_2 *) bh->b_data; - de->inode = 0; - de->rec_len = ext3_rec_len_to_disk(blocksize); - return add_dirent_to_buf(handle, dentry, inode, de, bh); -} - -/* - * Returns 0 for success, or a negative error value - */ -static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode) -{ - struct dx_frame frames[2], *frame; - struct dx_entry *entries, *at; - struct dx_hash_info hinfo; - struct buffer_head * bh; - struct inode *dir = d_inode(dentry->d_parent); - struct super_block * sb = dir->i_sb; - struct ext3_dir_entry_2 *de; - int err; - - frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); - if (!frame) - return err; - entries = frame->entries; - at = frame->at; - - if (!(bh = ext3_dir_bread(handle, dir, dx_get_block(frame->at), 0, &err))) - goto cleanup; - - BUFFER_TRACE(bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, bh); - if (err) - goto journal_error; - - err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); - if (err != -ENOSPC) { - bh = NULL; - goto cleanup; - } - - /* Block full, should compress but for now just split */ - dxtrace(printk("using %u of %u node entries\n", - dx_get_count(entries), dx_get_limit(entries))); - /* Need to split index? 
*/ - if (dx_get_count(entries) == dx_get_limit(entries)) { - u32 newblock; - unsigned icount = dx_get_count(entries); - int levels = frame - frames; - struct dx_entry *entries2; - struct dx_node *node2; - struct buffer_head *bh2; - - if (levels && (dx_get_count(frames->entries) == - dx_get_limit(frames->entries))) { - ext3_warning(sb, __func__, - "Directory index full!"); - err = -ENOSPC; - goto cleanup; - } - bh2 = ext3_append (handle, dir, &newblock, &err); - if (!(bh2)) - goto cleanup; - node2 = (struct dx_node *)(bh2->b_data); - entries2 = node2->entries; - memset(&node2->fake, 0, sizeof(struct fake_dirent)); - node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize); - BUFFER_TRACE(frame->bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, frame->bh); - if (err) - goto journal_error; - if (levels) { - unsigned icount1 = icount/2, icount2 = icount - icount1; - unsigned hash2 = dx_get_hash(entries + icount1); - dxtrace(printk("Split index %i/%i\n", icount1, icount2)); - - BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ - err = ext3_journal_get_write_access(handle, - frames[0].bh); - if (err) - goto journal_error; - - memcpy ((char *) entries2, (char *) (entries + icount1), - icount2 * sizeof(struct dx_entry)); - dx_set_count (entries, icount1); - dx_set_count (entries2, icount2); - dx_set_limit (entries2, dx_node_limit(dir)); - - /* Which index block gets the new entry? */ - if (at - entries >= icount1) { - frame->at = at = at - entries - icount1 + entries2; - frame->entries = entries = entries2; - swap(frame->bh, bh2); - } - dx_insert_block (frames + 0, hash2, newblock); - dxtrace(dx_show_index ("node", frames[1].entries)); - dxtrace(dx_show_index ("node", - ((struct dx_node *) bh2->b_data)->entries)); - err = ext3_journal_dirty_metadata(handle, bh2); - if (err) - goto journal_error; - brelse (bh2); - } else { - dxtrace(printk("Creating second level index...\n")); - memcpy((char *) entries2, (char *) entries, - icount * sizeof(struct dx_entry)); - dx_set_limit(entries2, dx_node_limit(dir)); - - /* Set up root */ - dx_set_count(entries, 1); - dx_set_block(entries + 0, newblock); - ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; - - /* Add new access path frame */ - frame = frames + 1; - frame->at = at = at - entries + entries2; - frame->entries = entries = entries2; - frame->bh = bh2; - err = ext3_journal_get_write_access(handle, - frame->bh); - if (err) - goto journal_error; - } - err = ext3_journal_dirty_metadata(handle, frames[0].bh); - if (err) - goto journal_error; - } - de = do_split(handle, dir, &bh, frame, &hinfo, &err); - if (!de) - goto cleanup; - err = add_dirent_to_buf(handle, dentry, inode, de, bh); - bh = NULL; - goto cleanup; - -journal_error: - ext3_std_error(dir->i_sb, err); -cleanup: - if (bh) - brelse(bh); - dx_release(frames); - return err; -} - -/* - * ext3_delete_entry deletes a directory entry by merging it with the - * previous entry - */ -static int ext3_delete_entry (handle_t *handle, - struct inode * dir, - struct ext3_dir_entry_2 * de_del, - struct buffer_head * bh) -{ - struct ext3_dir_entry_2 * de, * pde; - int i; - - i = 0; - pde = NULL; - de = (struct ext3_dir_entry_2 *) bh->b_data; - while (i < bh->b_size) { - if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i)) - return -EIO; - if (de == de_del) { - int err; - - BUFFER_TRACE(bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, bh); - if (err) - goto journal_error; - - if (pde) - pde->rec_len = ext3_rec_len_to_disk( - 
ext3_rec_len_from_disk(pde->rec_len) + - ext3_rec_len_from_disk(de->rec_len)); - else - de->inode = 0; - dir->i_version++; - BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, bh); - if (err) { -journal_error: - ext3_std_error(dir->i_sb, err); - return err; - } - return 0; - } - i += ext3_rec_len_from_disk(de->rec_len); - pde = de; - de = ext3_next_entry(de); - } - return -ENOENT; -} - -static int ext3_add_nondir(handle_t *handle, - struct dentry *dentry, struct inode *inode) -{ - int err = ext3_add_entry(handle, dentry, inode); - if (!err) { - ext3_mark_inode_dirty(handle, inode); - unlock_new_inode(inode); - d_instantiate(dentry, inode); - return 0; - } - drop_nlink(inode); - unlock_new_inode(inode); - iput(inode); - return err; -} - -/* - * By the time this is called, we already have created - * the directory cache entry for the new file, but it - * is so far negative - it has no inode. - * - * If the create succeeds, we fill in the inode information - * with d_instantiate(). - */ -static int ext3_create (struct inode * dir, struct dentry * dentry, umode_t mode, - bool excl) -{ - handle_t *handle; - struct inode * inode; - int err, retries = 0; - - dquot_initialize(dir); - -retry: - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - - inode = ext3_new_inode (handle, dir, &dentry->d_name, mode); - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - inode->i_op = &ext3_file_inode_operations; - inode->i_fop = &ext3_file_operations; - ext3_set_aops(inode); - err = ext3_add_nondir(handle, dentry, inode); - } - ext3_journal_stop(handle); - if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) - goto retry; - return err; -} - -static int ext3_mknod (struct inode * dir, struct dentry *dentry, - umode_t mode, dev_t rdev) -{ - handle_t *handle; - struct inode *inode; - int err, retries = 0; - - if (!new_valid_dev(rdev)) - return -EINVAL; - - dquot_initialize(dir); - -retry: - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - - inode = ext3_new_inode (handle, dir, &dentry->d_name, mode); - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - init_special_inode(inode, inode->i_mode, rdev); -#ifdef CONFIG_EXT3_FS_XATTR - inode->i_op = &ext3_special_inode_operations; -#endif - err = ext3_add_nondir(handle, dentry, inode); - } - ext3_journal_stop(handle); - if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) - goto retry; - return err; -} - -static int ext3_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - handle_t *handle; - struct inode *inode; - int err, retries = 0; - - dquot_initialize(dir); - -retry: - handle = ext3_journal_start(dir, EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + - 4 + EXT3_XATTR_TRANS_BLOCKS); - - if (IS_ERR(handle)) - return PTR_ERR(handle); - - inode = ext3_new_inode (handle, dir, NULL, mode); - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - inode->i_op = &ext3_file_inode_operations; - inode->i_fop = &ext3_file_operations; - ext3_set_aops(inode); - d_tmpfile(dentry, inode); - err = ext3_orphan_add(handle, inode); - if (err) - goto err_unlock_inode; - mark_inode_dirty(inode); - unlock_new_inode(inode); - } - 
ext3_journal_stop(handle); - if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) - goto retry; - return err; -err_unlock_inode: - ext3_journal_stop(handle); - unlock_new_inode(inode); - return err; -} - -static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) -{ - handle_t *handle; - struct inode * inode; - struct buffer_head * dir_block = NULL; - struct ext3_dir_entry_2 * de; - int err, retries = 0; - - if (dir->i_nlink >= EXT3_LINK_MAX) - return -EMLINK; - - dquot_initialize(dir); - -retry: - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - - inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFDIR | mode); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; - - inode->i_op = &ext3_dir_inode_operations; - inode->i_fop = &ext3_dir_operations; - inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; - if (!(dir_block = ext3_dir_bread(handle, inode, 0, 1, &err))) - goto out_clear_inode; - - BUFFER_TRACE(dir_block, "get_write_access"); - err = ext3_journal_get_write_access(handle, dir_block); - if (err) - goto out_clear_inode; - - de = (struct ext3_dir_entry_2 *) dir_block->b_data; - de->inode = cpu_to_le32(inode->i_ino); - de->name_len = 1; - de->rec_len = ext3_rec_len_to_disk(EXT3_DIR_REC_LEN(de->name_len)); - strcpy (de->name, "."); - ext3_set_de_type(dir->i_sb, de, S_IFDIR); - de = ext3_next_entry(de); - de->inode = cpu_to_le32(dir->i_ino); - de->rec_len = ext3_rec_len_to_disk(inode->i_sb->s_blocksize - - EXT3_DIR_REC_LEN(1)); - de->name_len = 2; - strcpy (de->name, ".."); - ext3_set_de_type(dir->i_sb, de, S_IFDIR); - set_nlink(inode, 2); - BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, dir_block); - if (err) - goto out_clear_inode; - - err = ext3_mark_inode_dirty(handle, inode); - if (!err) - err = ext3_add_entry (handle, dentry, inode); - - if (err) { -out_clear_inode: - clear_nlink(inode); - unlock_new_inode(inode); - ext3_mark_inode_dirty(handle, inode); - iput (inode); - goto out_stop; - } - inc_nlink(dir); - ext3_update_dx_flag(dir); - err = ext3_mark_inode_dirty(handle, dir); - if (err) - goto out_clear_inode; - - unlock_new_inode(inode); - d_instantiate(dentry, inode); -out_stop: - brelse(dir_block); - ext3_journal_stop(handle); - if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) - goto retry; - return err; -} - -/* - * routine to check that the specified directory is empty (for rmdir) - */ -static int empty_dir (struct inode * inode) -{ - unsigned long offset; - struct buffer_head * bh; - struct ext3_dir_entry_2 * de, * de1; - struct super_block * sb; - int err = 0; - - sb = inode->i_sb; - if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) || - !(bh = ext3_dir_bread(NULL, inode, 0, 0, &err))) { - if (err) - ext3_error(inode->i_sb, __func__, - "error %d reading directory #%lu offset 0", - err, inode->i_ino); - else - ext3_warning(inode->i_sb, __func__, - "bad directory (dir #%lu) - no data block", - inode->i_ino); - return 1; - } - de = (struct ext3_dir_entry_2 *) bh->b_data; - de1 = ext3_next_entry(de); - if (le32_to_cpu(de->inode) != inode->i_ino || - !le32_to_cpu(de1->inode) || - strcmp (".", de->name) || - strcmp ("..", de1->name)) { - ext3_warning (inode->i_sb, "empty_dir", - "bad directory (dir #%lu) - no `.' 
or `..'", - inode->i_ino); - brelse (bh); - return 1; - } - offset = ext3_rec_len_from_disk(de->rec_len) + - ext3_rec_len_from_disk(de1->rec_len); - de = ext3_next_entry(de1); - while (offset < inode->i_size ) { - if (!bh || - (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { - err = 0; - brelse (bh); - if (!(bh = ext3_dir_bread (NULL, inode, - offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err))) { - if (err) - ext3_error(sb, __func__, - "error %d reading directory" - " #%lu offset %lu", - err, inode->i_ino, offset); - offset += sb->s_blocksize; - continue; - } - de = (struct ext3_dir_entry_2 *) bh->b_data; - } - if (!ext3_check_dir_entry("empty_dir", inode, de, bh, offset)) { - de = (struct ext3_dir_entry_2 *)(bh->b_data + - sb->s_blocksize); - offset = (offset | (sb->s_blocksize - 1)) + 1; - continue; - } - if (le32_to_cpu(de->inode)) { - brelse (bh); - return 0; - } - offset += ext3_rec_len_from_disk(de->rec_len); - de = ext3_next_entry(de); - } - brelse (bh); - return 1; -} - -/* ext3_orphan_add() links an unlinked or truncated inode into a list of - * such inodes, starting at the superblock, in case we crash before the - * file is closed/deleted, or in case the inode truncate spans multiple - * transactions and the last transaction is not recovered after a crash. - * - * At filesystem recovery time, we walk this list deleting unlinked - * inodes and truncating linked inodes in ext3_orphan_cleanup(). - */ -int ext3_orphan_add(handle_t *handle, struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - struct ext3_iloc iloc; - int err = 0, rc; - - mutex_lock(&EXT3_SB(sb)->s_orphan_lock); - if (!list_empty(&EXT3_I(inode)->i_orphan)) - goto out_unlock; - - /* Orphan handling is only valid for files with data blocks - * being truncated, or files being unlinked. */ - - /* @@@ FIXME: Observation from aviro: - * I think I can trigger J_ASSERT in ext3_orphan_add(). We block - * here (on s_orphan_lock), so race with ext3_link() which might bump - * ->i_nlink. For, say it, character device. Not a regular file, - * not a directory, not a symlink and ->i_nlink > 0. - * - * tytso, 4/25/2009: I'm not sure how that could happen; - * shouldn't the fs core protect us from these sort of - * unlink()/link() races? - */ - J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); - - BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access"); - err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); - if (err) - goto out_unlock; - - err = ext3_reserve_inode_write(handle, inode, &iloc); - if (err) - goto out_unlock; - - /* Insert this inode at the head of the on-disk orphan list... */ - NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan); - EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); - err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); - rc = ext3_mark_iloc_dirty(handle, inode, &iloc); - if (!err) - err = rc; - - /* Only add to the head of the in-memory list if all the - * previous operations succeeded. If the orphan_add is going to - * fail (possibly taking the journal offline), we can't risk - * leaving the inode on the orphan list: stray orphan-list - * entries can cause panics at unmount time. - * - * This is safe: on error we're going to ignore the orphan list - * anyway on the next recovery. 
*/ - if (!err) - list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); - - jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); - jbd_debug(4, "orphan inode %lu will point to %d\n", - inode->i_ino, NEXT_ORPHAN(inode)); -out_unlock: - mutex_unlock(&EXT3_SB(sb)->s_orphan_lock); - ext3_std_error(inode->i_sb, err); - return err; -} - -/* - * ext3_orphan_del() removes an unlinked or truncated inode from the list - * of such inodes stored on disk, because it is finally being cleaned up. - */ -int ext3_orphan_del(handle_t *handle, struct inode *inode) -{ - struct list_head *prev; - struct ext3_inode_info *ei = EXT3_I(inode); - struct ext3_sb_info *sbi; - unsigned long ino_next; - struct ext3_iloc iloc; - int err = 0; - - mutex_lock(&EXT3_SB(inode->i_sb)->s_orphan_lock); - if (list_empty(&ei->i_orphan)) - goto out; - - ino_next = NEXT_ORPHAN(inode); - prev = ei->i_orphan.prev; - sbi = EXT3_SB(inode->i_sb); - - jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); - - list_del_init(&ei->i_orphan); - - /* If we're on an error path, we may not have a valid - * transaction handle with which to update the orphan list on - * disk, but we still need to remove the inode from the linked - * list in memory. */ - if (!handle) - goto out; - - err = ext3_reserve_inode_write(handle, inode, &iloc); - if (err) - goto out_err; - - if (prev == &sbi->s_orphan) { - jbd_debug(4, "superblock will point to %lu\n", ino_next); - BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext3_journal_get_write_access(handle, sbi->s_sbh); - if (err) - goto out_brelse; - sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); - err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); - } else { - struct ext3_iloc iloc2; - struct inode *i_prev = - &list_entry(prev, struct ext3_inode_info, i_orphan)->vfs_inode; - - jbd_debug(4, "orphan inode %lu will point to %lu\n", - i_prev->i_ino, ino_next); - err = ext3_reserve_inode_write(handle, i_prev, &iloc2); - if (err) - goto out_brelse; - NEXT_ORPHAN(i_prev) = ino_next; - err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2); - } - if (err) - goto out_brelse; - NEXT_ORPHAN(inode) = 0; - err = ext3_mark_iloc_dirty(handle, inode, &iloc); - -out_err: - ext3_std_error(inode->i_sb, err); -out: - mutex_unlock(&EXT3_SB(inode->i_sb)->s_orphan_lock); - return err; - -out_brelse: - brelse(iloc.bh); - goto out_err; -} - -static int ext3_rmdir (struct inode * dir, struct dentry *dentry) -{ - int retval; - struct inode * inode; - struct buffer_head * bh; - struct ext3_dir_entry_2 * de; - handle_t *handle; - - /* Initialize quotas before so that eventual writes go in - * separate transaction */ - dquot_initialize(dir); - dquot_initialize(d_inode(dentry)); - - handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - retval = -ENOENT; - bh = ext3_find_entry(dir, &dentry->d_name, &de); - if (!bh) - goto end_rmdir; - - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - - inode = d_inode(dentry); - - retval = -EIO; - if (le32_to_cpu(de->inode) != inode->i_ino) - goto end_rmdir; - - retval = -ENOTEMPTY; - if (!empty_dir (inode)) - goto end_rmdir; - - retval = ext3_delete_entry(handle, dir, de, bh); - if (retval) - goto end_rmdir; - if (inode->i_nlink != 2) - ext3_warning (inode->i_sb, "ext3_rmdir", - "empty directory has nlink!=2 (%d)", - inode->i_nlink); - inode->i_version++; - clear_nlink(inode); - /* There's no need to set i_disksize: the fact that i_nlink is - * zero will ensure that the right thing happens during any - * 
recovery. */ - inode->i_size = 0; - ext3_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; - ext3_mark_inode_dirty(handle, inode); - drop_nlink(dir); - ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - -end_rmdir: - ext3_journal_stop(handle); - brelse (bh); - return retval; -} - -static int ext3_unlink(struct inode * dir, struct dentry *dentry) -{ - int retval; - struct inode * inode; - struct buffer_head * bh; - struct ext3_dir_entry_2 * de; - handle_t *handle; - - trace_ext3_unlink_enter(dir, dentry); - /* Initialize quotas before so that eventual writes go - * in separate transaction */ - dquot_initialize(dir); - dquot_initialize(d_inode(dentry)); - - handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - - retval = -ENOENT; - bh = ext3_find_entry(dir, &dentry->d_name, &de); - if (!bh) - goto end_unlink; - - inode = d_inode(dentry); - - retval = -EIO; - if (le32_to_cpu(de->inode) != inode->i_ino) - goto end_unlink; - - if (!inode->i_nlink) { - ext3_warning (inode->i_sb, "ext3_unlink", - "Deleting nonexistent file (%lu), %d", - inode->i_ino, inode->i_nlink); - set_nlink(inode, 1); - } - retval = ext3_delete_entry(handle, dir, de, bh); - if (retval) - goto end_unlink; - dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; - ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - drop_nlink(inode); - if (!inode->i_nlink) - ext3_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime; - ext3_mark_inode_dirty(handle, inode); - retval = 0; - -end_unlink: - ext3_journal_stop(handle); - brelse (bh); - trace_ext3_unlink_exit(dentry, retval); - return retval; -} - -static int ext3_symlink (struct inode * dir, - struct dentry *dentry, const char * symname) -{ - handle_t *handle; - struct inode * inode; - int l, err, retries = 0; - int credits; - - l = strlen(symname)+1; - if (l > dir->i_sb->s_blocksize) - return -ENAMETOOLONG; - - dquot_initialize(dir); - - if (l > EXT3_N_BLOCKS * 4) { - /* - * For non-fast symlinks, we just allocate inode and put it on - * orphan list in the first transaction => we need bitmap, - * group descriptor, sb, inode block, quota blocks, and - * possibly selinux xattr blocks. - */ - credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + - EXT3_XATTR_TRANS_BLOCKS; - } else { - /* - * Fast symlink. We have to add entry to directory - * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS), - * allocate new inode (bitmap, group descriptor, inode block, - * quota blocks, sb is already counted in previous macros). - */ - credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); - } -retry: - handle = ext3_journal_start(dir, credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - - inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFLNK|S_IRWXUGO); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; - - if (l > EXT3_N_BLOCKS * 4) { - inode->i_op = &ext3_symlink_inode_operations; - ext3_set_aops(inode); - /* - * We cannot call page_symlink() with transaction started - * because it calls into ext3_write_begin() which acquires page - * lock which ranks below transaction start (and it can also - * wait for journal commit if we are running out of space). So - * we have to stop transaction now and restart it when symlink - * contents is written. 
- * - * To keep fs consistent in case of crash, we have to put inode - * to orphan list in the mean time. - */ - drop_nlink(inode); - err = ext3_orphan_add(handle, inode); - ext3_journal_stop(handle); - if (err) - goto err_drop_inode; - err = __page_symlink(inode, symname, l, 1); - if (err) - goto err_drop_inode; - /* - * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS - * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified - */ - handle = ext3_journal_start(dir, - EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto err_drop_inode; - } - set_nlink(inode, 1); - err = ext3_orphan_del(handle, inode); - if (err) { - ext3_journal_stop(handle); - drop_nlink(inode); - goto err_drop_inode; - } - } else { - inode->i_op = &ext3_fast_symlink_inode_operations; - inode->i_link = (char*)&EXT3_I(inode)->i_data; - memcpy(inode->i_link, symname, l); - inode->i_size = l-1; - } - EXT3_I(inode)->i_disksize = inode->i_size; - err = ext3_add_nondir(handle, dentry, inode); -out_stop: - ext3_journal_stop(handle); - if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) - goto retry; - return err; -err_drop_inode: - unlock_new_inode(inode); - iput(inode); - return err; -} - -static int ext3_link (struct dentry * old_dentry, - struct inode * dir, struct dentry *dentry) -{ - handle_t *handle; - struct inode *inode = d_inode(old_dentry); - int err, retries = 0; - - if (inode->i_nlink >= EXT3_LINK_MAX) - return -EMLINK; - - dquot_initialize(dir); - -retry: - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - - inode->i_ctime = CURRENT_TIME_SEC; - inc_nlink(inode); - ihold(inode); - - err = ext3_add_entry(handle, dentry, inode); - if (!err) { - ext3_mark_inode_dirty(handle, inode); - /* this can happen only for tmpfile being - * linked the first time - */ - if (inode->i_nlink == 1) - ext3_orphan_del(handle, inode); - d_instantiate(dentry, inode); - } else { - drop_nlink(inode); - iput(inode); - } - ext3_journal_stop(handle); - if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) - goto retry; - return err; -} - -#define PARENT_INO(buffer) \ - (ext3_next_entry((struct ext3_dir_entry_2 *)(buffer))->inode) - -/* - * Anybody can rename anything with this: the permission checks are left to the - * higher-level routines. - */ -static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, - struct inode * new_dir,struct dentry *new_dentry) -{ - handle_t *handle; - struct inode * old_inode, * new_inode; - struct buffer_head * old_bh, * new_bh, * dir_bh; - struct ext3_dir_entry_2 * old_de, * new_de; - int retval, flush_file = 0; - - dquot_initialize(old_dir); - dquot_initialize(new_dir); - - old_bh = new_bh = dir_bh = NULL; - - /* Initialize quotas before so that eventual writes go - * in separate transaction */ - if (d_really_is_positive(new_dentry)) - dquot_initialize(d_inode(new_dentry)); - handle = ext3_journal_start(old_dir, 2 * - EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) - handle->h_sync = 1; - - old_bh = ext3_find_entry(old_dir, &old_dentry->d_name, &old_de); - /* - * Check for inode number is _not_ due to possible IO errors. 
- * We might rmdir the source, keep it as pwd of some process - * and merrily kill the link to whatever was created under the - * same name. Goodbye sticky bit ;-< - */ - old_inode = d_inode(old_dentry); - retval = -ENOENT; - if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) - goto end_rename; - - new_inode = d_inode(new_dentry); - new_bh = ext3_find_entry(new_dir, &new_dentry->d_name, &new_de); - if (new_bh) { - if (!new_inode) { - brelse (new_bh); - new_bh = NULL; - } - } - if (S_ISDIR(old_inode->i_mode)) { - if (new_inode) { - retval = -ENOTEMPTY; - if (!empty_dir (new_inode)) - goto end_rename; - } - retval = -EIO; - dir_bh = ext3_dir_bread(handle, old_inode, 0, 0, &retval); - if (!dir_bh) - goto end_rename; - if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) - goto end_rename; - retval = -EMLINK; - if (!new_inode && new_dir!=old_dir && - new_dir->i_nlink >= EXT3_LINK_MAX) - goto end_rename; - } - if (!new_bh) { - retval = ext3_add_entry (handle, new_dentry, old_inode); - if (retval) - goto end_rename; - } else { - BUFFER_TRACE(new_bh, "get write access"); - retval = ext3_journal_get_write_access(handle, new_bh); - if (retval) - goto journal_error; - new_de->inode = cpu_to_le32(old_inode->i_ino); - if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb, - EXT3_FEATURE_INCOMPAT_FILETYPE)) - new_de->file_type = old_de->file_type; - new_dir->i_version++; - new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC; - ext3_mark_inode_dirty(handle, new_dir); - BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata"); - retval = ext3_journal_dirty_metadata(handle, new_bh); - if (retval) - goto journal_error; - brelse(new_bh); - new_bh = NULL; - } - - /* - * Like most other Unix systems, set the ctime for inodes on a - * rename. - */ - old_inode->i_ctime = CURRENT_TIME_SEC; - ext3_mark_inode_dirty(handle, old_inode); - - /* - * ok, that's it - */ - if (le32_to_cpu(old_de->inode) != old_inode->i_ino || - old_de->name_len != old_dentry->d_name.len || - strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || - (retval = ext3_delete_entry(handle, old_dir, - old_de, old_bh)) == -ENOENT) { - /* old_de could have moved from under us during htree split, so - * make sure that we are deleting the right entry. We might - * also be pointing to a stale entry in the unused part of - * old_bh so just checking inum and the name isn't enough. 
*/ - struct buffer_head *old_bh2; - struct ext3_dir_entry_2 *old_de2; - - old_bh2 = ext3_find_entry(old_dir, &old_dentry->d_name, - &old_de2); - if (old_bh2) { - retval = ext3_delete_entry(handle, old_dir, - old_de2, old_bh2); - brelse(old_bh2); - } - } - if (retval) { - ext3_warning(old_dir->i_sb, "ext3_rename", - "Deleting old file (%lu), %d, error=%d", - old_dir->i_ino, old_dir->i_nlink, retval); - } - - if (new_inode) { - drop_nlink(new_inode); - new_inode->i_ctime = CURRENT_TIME_SEC; - } - old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; - ext3_update_dx_flag(old_dir); - if (dir_bh) { - BUFFER_TRACE(dir_bh, "get_write_access"); - retval = ext3_journal_get_write_access(handle, dir_bh); - if (retval) - goto journal_error; - PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); - BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); - retval = ext3_journal_dirty_metadata(handle, dir_bh); - if (retval) { -journal_error: - ext3_std_error(new_dir->i_sb, retval); - goto end_rename; - } - drop_nlink(old_dir); - if (new_inode) { - drop_nlink(new_inode); - } else { - inc_nlink(new_dir); - ext3_update_dx_flag(new_dir); - ext3_mark_inode_dirty(handle, new_dir); - } - } - ext3_mark_inode_dirty(handle, old_dir); - if (new_inode) { - ext3_mark_inode_dirty(handle, new_inode); - if (!new_inode->i_nlink) - ext3_orphan_add(handle, new_inode); - if (ext3_should_writeback_data(new_inode)) - flush_file = 1; - } - retval = 0; - -end_rename: - brelse (dir_bh); - brelse (old_bh); - brelse (new_bh); - ext3_journal_stop(handle); - if (retval == 0 && flush_file) - filemap_flush(old_inode->i_mapping); - return retval; -} - -/* - * directories can handle most operations... - */ -const struct inode_operations ext3_dir_inode_operations = { - .create = ext3_create, - .lookup = ext3_lookup, - .link = ext3_link, - .unlink = ext3_unlink, - .symlink = ext3_symlink, - .mkdir = ext3_mkdir, - .rmdir = ext3_rmdir, - .mknod = ext3_mknod, - .tmpfile = ext3_tmpfile, - .rename = ext3_rename, - .setattr = ext3_setattr, -#ifdef CONFIG_EXT3_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = ext3_listxattr, - .removexattr = generic_removexattr, -#endif - .get_acl = ext3_get_acl, - .set_acl = ext3_set_acl, -}; - -const struct inode_operations ext3_special_inode_operations = { - .setattr = ext3_setattr, -#ifdef CONFIG_EXT3_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = ext3_listxattr, - .removexattr = generic_removexattr, -#endif - .get_acl = ext3_get_acl, - .set_acl = ext3_set_acl, -}; diff --git a/fs/ext3/namei.h b/fs/ext3/namei.h deleted file mode 100644 index 46304d8c9f0a..000000000000 --- a/fs/ext3/namei.h +++ /dev/null @@ -1,27 +0,0 @@ -/* linux/fs/ext3/namei.h - * - * Copyright (C) 2005 Simtec Electronics - * Ben Dooks <ben@simtec.co.uk> - * -*/ - -extern struct dentry *ext3_get_parent(struct dentry *child); - -static inline struct buffer_head *ext3_dir_bread(handle_t *handle, - struct inode *inode, - int block, int create, - int *err) -{ - struct buffer_head *bh; - - bh = ext3_bread(handle, inode, block, create, err); - - if (!bh && !(*err)) { - *err = -EIO; - ext3_error(inode->i_sb, __func__, - "Directory hole detected on inode %lu\n", - inode->i_ino); - return NULL; - } - return bh; -} diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c deleted file mode 100644 index 27105655502c..000000000000 --- a/fs/ext3/resize.c +++ /dev/null @@ -1,1117 +0,0 @@ -/* - * linux/fs/ext3/resize.c - * - * Support for resizing an ext3 filesystem while it 
is mounted. - * - * Copyright (C) 2001, 2002 Andreas Dilger <adilger@clusterfs.com> - * - * This could probably be made into a module, because it is not often in use. - */ - - -#define EXT3FS_DEBUG - -#include "ext3.h" - - -#define outside(b, first, last) ((b) < (first) || (b) >= (last)) -#define inside(b, first, last) ((b) >= (first) && (b) < (last)) - -static int verify_group_input(struct super_block *sb, - struct ext3_new_group_data *input) -{ - struct ext3_sb_info *sbi = EXT3_SB(sb); - struct ext3_super_block *es = sbi->s_es; - ext3_fsblk_t start = le32_to_cpu(es->s_blocks_count); - ext3_fsblk_t end = start + input->blocks_count; - unsigned group = input->group; - ext3_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; - unsigned overhead = ext3_bg_has_super(sb, group) ? - (1 + ext3_bg_num_gdb(sb, group) + - le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; - ext3_fsblk_t metaend = start + overhead; - struct buffer_head *bh = NULL; - ext3_grpblk_t free_blocks_count; - int err = -EINVAL; - - input->free_blocks_count = free_blocks_count = - input->blocks_count - 2 - overhead - sbi->s_itb_per_group; - - if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT3-fs: adding %s group %u: %u blocks " - "(%d free, %u reserved)\n", - ext3_bg_has_super(sb, input->group) ? "normal" : - "no-super", input->group, input->blocks_count, - free_blocks_count, input->reserved_blocks); - - if (group != sbi->s_groups_count) - ext3_warning(sb, __func__, - "Cannot add at group %u (only %lu groups)", - input->group, sbi->s_groups_count); - else if ((start - le32_to_cpu(es->s_first_data_block)) % - EXT3_BLOCKS_PER_GROUP(sb)) - ext3_warning(sb, __func__, "Last group not full"); - else if (input->reserved_blocks > input->blocks_count / 5) - ext3_warning(sb, __func__, "Reserved blocks too high (%u)", - input->reserved_blocks); - else if (free_blocks_count < 0) - ext3_warning(sb, __func__, "Bad blocks count %u", - input->blocks_count); - else if (!(bh = sb_bread(sb, end - 1))) - ext3_warning(sb, __func__, - "Cannot read last block ("E3FSBLK")", - end - 1); - else if (outside(input->block_bitmap, start, end)) - ext3_warning(sb, __func__, - "Block bitmap not in group (block %u)", - input->block_bitmap); - else if (outside(input->inode_bitmap, start, end)) - ext3_warning(sb, __func__, - "Inode bitmap not in group (block %u)", - input->inode_bitmap); - else if (outside(input->inode_table, start, end) || - outside(itend - 1, start, end)) - ext3_warning(sb, __func__, - "Inode table not in group (blocks %u-"E3FSBLK")", - input->inode_table, itend - 1); - else if (input->inode_bitmap == input->block_bitmap) - ext3_warning(sb, __func__, - "Block bitmap same as inode bitmap (%u)", - input->block_bitmap); - else if (inside(input->block_bitmap, input->inode_table, itend)) - ext3_warning(sb, __func__, - "Block bitmap (%u) in inode table (%u-"E3FSBLK")", - input->block_bitmap, input->inode_table, itend-1); - else if (inside(input->inode_bitmap, input->inode_table, itend)) - ext3_warning(sb, __func__, - "Inode bitmap (%u) in inode table (%u-"E3FSBLK")", - input->inode_bitmap, input->inode_table, itend-1); - else if (inside(input->block_bitmap, start, metaend)) - ext3_warning(sb, __func__, - "Block bitmap (%u) in GDT table" - " ("E3FSBLK"-"E3FSBLK")", - input->block_bitmap, start, metaend - 1); - else if (inside(input->inode_bitmap, start, metaend)) - ext3_warning(sb, __func__, - "Inode bitmap (%u) in GDT table" - " ("E3FSBLK"-"E3FSBLK")", - input->inode_bitmap, start, metaend - 1); - else if (inside(input->inode_table, start, 
metaend) || - inside(itend - 1, start, metaend)) - ext3_warning(sb, __func__, - "Inode table (%u-"E3FSBLK") overlaps" - "GDT table ("E3FSBLK"-"E3FSBLK")", - input->inode_table, itend - 1, start, metaend - 1); - else - err = 0; - brelse(bh); - - return err; -} - -static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, - ext3_fsblk_t blk) -{ - struct buffer_head *bh; - int err; - - bh = sb_getblk(sb, blk); - if (unlikely(!bh)) - return ERR_PTR(-ENOMEM); - if ((err = ext3_journal_get_write_access(handle, bh))) { - brelse(bh); - bh = ERR_PTR(err); - } else { - lock_buffer(bh); - memset(bh->b_data, 0, sb->s_blocksize); - set_buffer_uptodate(bh); - unlock_buffer(bh); - } - - return bh; -} - -/* - * To avoid calling the atomic setbit hundreds or thousands of times, we only - * need to use it within a single byte (to ensure we get endianness right). - * We can use memset for the rest of the bitmap as there are no other users. - */ -static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) -{ - int i; - - if (start_bit >= end_bit) - return; - - ext3_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); - for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) - ext3_set_bit(i, bitmap); - if (i < end_bit) - memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); -} - -/* - * If we have fewer than thresh credits, extend by EXT3_MAX_TRANS_DATA. - * If that fails, restart the transaction & regain write access for the - * buffer head which is used for block_bitmap modifications. - */ -static int extend_or_restart_transaction(handle_t *handle, int thresh, - struct buffer_head *bh) -{ - int err; - - if (handle->h_buffer_credits >= thresh) - return 0; - - err = ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA); - if (err < 0) - return err; - if (err) { - err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA); - if (err) - return err; - err = ext3_journal_get_write_access(handle, bh); - if (err) - return err; - } - - return 0; -} - -/* - * Set up the block and inode bitmaps, and the inode table for the new group. - * This doesn't need to be part of the main transaction, since we are only - * changing blocks outside the actual filesystem. We still do journaling to - * ensure the recovery is correct in case of a failure just after resize. - * If any part of this fails, we simply abort the resize. - */ -static int setup_new_group_blocks(struct super_block *sb, - struct ext3_new_group_data *input) -{ - struct ext3_sb_info *sbi = EXT3_SB(sb); - ext3_fsblk_t start = ext3_group_first_block_no(sb, input->group); - int reserved_gdb = ext3_bg_has_super(sb, input->group) ? 
- le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0; - unsigned long gdblocks = ext3_bg_num_gdb(sb, input->group); - struct buffer_head *bh; - handle_t *handle; - ext3_fsblk_t block; - ext3_grpblk_t bit; - int i; - int err = 0, err2; - - /* This transaction may be extended/restarted along the way */ - handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA); - - if (IS_ERR(handle)) - return PTR_ERR(handle); - - mutex_lock(&sbi->s_resize_lock); - if (input->group != sbi->s_groups_count) { - err = -EBUSY; - goto exit_journal; - } - - if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) { - err = PTR_ERR(bh); - goto exit_journal; - } - - if (ext3_bg_has_super(sb, input->group)) { - ext3_debug("mark backup superblock %#04lx (+0)\n", start); - ext3_set_bit(0, bh->b_data); - } - - /* Copy all of the GDT blocks into the backup in this group */ - for (i = 0, bit = 1, block = start + 1; - i < gdblocks; i++, block++, bit++) { - struct buffer_head *gdb; - - ext3_debug("update backup group %#04lx (+%d)\n", block, bit); - - err = extend_or_restart_transaction(handle, 1, bh); - if (err) - goto exit_bh; - - gdb = sb_getblk(sb, block); - if (unlikely(!gdb)) { - err = -ENOMEM; - goto exit_bh; - } - if ((err = ext3_journal_get_write_access(handle, gdb))) { - brelse(gdb); - goto exit_bh; - } - lock_buffer(gdb); - memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); - set_buffer_uptodate(gdb); - unlock_buffer(gdb); - err = ext3_journal_dirty_metadata(handle, gdb); - if (err) { - brelse(gdb); - goto exit_bh; - } - ext3_set_bit(bit, bh->b_data); - brelse(gdb); - } - - /* Zero out all of the reserved backup group descriptor table blocks */ - for (i = 0, bit = gdblocks + 1, block = start + bit; - i < reserved_gdb; i++, block++, bit++) { - struct buffer_head *gdb; - - ext3_debug("clear reserved block %#04lx (+%d)\n", block, bit); - - err = extend_or_restart_transaction(handle, 1, bh); - if (err) - goto exit_bh; - - if (IS_ERR(gdb = bclean(handle, sb, block))) { - err = PTR_ERR(gdb); - goto exit_bh; - } - err = ext3_journal_dirty_metadata(handle, gdb); - if (err) { - brelse(gdb); - goto exit_bh; - } - ext3_set_bit(bit, bh->b_data); - brelse(gdb); - } - ext3_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap, - input->block_bitmap - start); - ext3_set_bit(input->block_bitmap - start, bh->b_data); - ext3_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap, - input->inode_bitmap - start); - ext3_set_bit(input->inode_bitmap - start, bh->b_data); - - /* Zero out all of the inode table blocks */ - for (i = 0, block = input->inode_table, bit = block - start; - i < sbi->s_itb_per_group; i++, bit++, block++) { - struct buffer_head *it; - - ext3_debug("clear inode block %#04lx (+%d)\n", block, bit); - - err = extend_or_restart_transaction(handle, 1, bh); - if (err) - goto exit_bh; - - if (IS_ERR(it = bclean(handle, sb, block))) { - err = PTR_ERR(it); - goto exit_bh; - } - err = ext3_journal_dirty_metadata(handle, it); - if (err) { - brelse(it); - goto exit_bh; - } - brelse(it); - ext3_set_bit(bit, bh->b_data); - } - - err = extend_or_restart_transaction(handle, 2, bh); - if (err) - goto exit_bh; - - mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb), - bh->b_data); - err = ext3_journal_dirty_metadata(handle, bh); - if (err) - goto exit_bh; - brelse(bh); - - /* Mark unused entries in inode bitmap used */ - ext3_debug("clear inode bitmap %#04x (+%ld)\n", - input->inode_bitmap, input->inode_bitmap - start); - if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { - err = 
PTR_ERR(bh); - goto exit_journal; - } - - mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb), - bh->b_data); - err = ext3_journal_dirty_metadata(handle, bh); -exit_bh: - brelse(bh); - -exit_journal: - mutex_unlock(&sbi->s_resize_lock); - if ((err2 = ext3_journal_stop(handle)) && !err) - err = err2; - - return err; -} - -/* - * Iterate through the groups which hold BACKUP superblock/GDT copies in an - * ext3 filesystem. The counters should be initialized to 1, 5, and 7 before - * calling this for the first time. In a sparse filesystem it will be the - * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ... - * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ... - */ -static unsigned ext3_list_backups(struct super_block *sb, unsigned *three, - unsigned *five, unsigned *seven) -{ - unsigned *min = three; - int mult = 3; - unsigned ret; - - if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, - EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) { - ret = *min; - *min += 1; - return ret; - } - - if (*five < *min) { - min = five; - mult = 5; - } - if (*seven < *min) { - min = seven; - mult = 7; - } - - ret = *min; - *min *= mult; - - return ret; -} - -/* - * Check that all of the backup GDT blocks are held in the primary GDT block. - * It is assumed that they are stored in group order. Returns the number of - * groups in current filesystem that have BACKUPS, or -ve error code. - */ -static int verify_reserved_gdb(struct super_block *sb, - struct buffer_head *primary) -{ - const ext3_fsblk_t blk = primary->b_blocknr; - const unsigned long end = EXT3_SB(sb)->s_groups_count; - unsigned three = 1; - unsigned five = 5; - unsigned seven = 7; - unsigned grp; - __le32 *p = (__le32 *)primary->b_data; - int gdbackups = 0; - - while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) { - if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){ - ext3_warning(sb, __func__, - "reserved GDT "E3FSBLK - " missing grp %d ("E3FSBLK")", - blk, grp, - grp * EXT3_BLOCKS_PER_GROUP(sb) + blk); - return -EINVAL; - } - if (++gdbackups > EXT3_ADDR_PER_BLOCK(sb)) - return -EFBIG; - } - - return gdbackups; -} - -/* - * Called when we need to bring a reserved group descriptor table block into - * use from the resize inode. The primary copy of the new GDT block currently - * is an indirect block (under the double indirect block in the resize inode). - * The new backup GDT blocks will be stored as leaf blocks in this indirect - * block, in group order. Even though we know all the block numbers we need, - * we check to ensure that the resize inode has actually reserved these blocks. - * - * Don't need to update the block bitmaps because the blocks are still in use. - * - * We get all of the error cases out of the way, so that we are sure to not - * fail once we start modifying the data on disk, because JBD has no rollback. 
- */ -static int add_new_gdb(handle_t *handle, struct inode *inode, - struct ext3_new_group_data *input, - struct buffer_head **primary) -{ - struct super_block *sb = inode->i_sb; - struct ext3_super_block *es = EXT3_SB(sb)->s_es; - unsigned long gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb); - ext3_fsblk_t gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; - struct buffer_head **o_group_desc, **n_group_desc; - struct buffer_head *dind; - int gdbackups; - struct ext3_iloc iloc; - __le32 *data; - int err; - - if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG - "EXT3-fs: ext3_add_new_gdb: adding group block %lu\n", - gdb_num); - - /* - * If we are not using the primary superblock/GDT copy don't resize, - * because the user tools have no way of handling this. Probably a - * bad time to do it anyways. - */ - if (EXT3_SB(sb)->s_sbh->b_blocknr != - le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) { - ext3_warning(sb, __func__, - "won't resize using backup superblock at %llu", - (unsigned long long)EXT3_SB(sb)->s_sbh->b_blocknr); - return -EPERM; - } - - *primary = sb_bread(sb, gdblock); - if (!*primary) - return -EIO; - - if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) { - err = gdbackups; - goto exit_bh; - } - - data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK; - dind = sb_bread(sb, le32_to_cpu(*data)); - if (!dind) { - err = -EIO; - goto exit_bh; - } - - data = (__le32 *)dind->b_data; - if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) { - ext3_warning(sb, __func__, - "new group %u GDT block "E3FSBLK" not reserved", - input->group, gdblock); - err = -EINVAL; - goto exit_dind; - } - - if ((err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh))) - goto exit_dind; - - if ((err = ext3_journal_get_write_access(handle, *primary))) - goto exit_sbh; - - if ((err = ext3_journal_get_write_access(handle, dind))) - goto exit_primary; - - /* ext3_reserve_inode_write() gets a reference on the iloc */ - if ((err = ext3_reserve_inode_write(handle, inode, &iloc))) - goto exit_dindj; - - n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), - GFP_NOFS); - if (!n_group_desc) { - err = -ENOMEM; - ext3_warning (sb, __func__, - "not enough memory for %lu groups", gdb_num + 1); - goto exit_inode; - } - - /* - * Finally, we have all of the possible failures behind us... - * - * Remove new GDT block from inode double-indirect block and clear out - * the new GDT block for use (which also "frees" the backup GDT blocks - * from the reserved inode). We don't need to change the bitmaps for - * these blocks, because they are marked as in-use from being in the - * reserved inode, and will become GDT blocks (primary and backup). 
- */ - data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0; - err = ext3_journal_dirty_metadata(handle, dind); - if (err) - goto exit_group_desc; - brelse(dind); - dind = NULL; - inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; - err = ext3_mark_iloc_dirty(handle, inode, &iloc); - if (err) - goto exit_group_desc; - memset((*primary)->b_data, 0, sb->s_blocksize); - err = ext3_journal_dirty_metadata(handle, *primary); - if (err) - goto exit_group_desc; - - o_group_desc = EXT3_SB(sb)->s_group_desc; - memcpy(n_group_desc, o_group_desc, - EXT3_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); - n_group_desc[gdb_num] = *primary; - EXT3_SB(sb)->s_group_desc = n_group_desc; - EXT3_SB(sb)->s_gdb_count++; - kfree(o_group_desc); - - le16_add_cpu(&es->s_reserved_gdt_blocks, -1); - err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); - if (err) - goto exit_inode; - - return 0; - -exit_group_desc: - kfree(n_group_desc); -exit_inode: - //ext3_journal_release_buffer(handle, iloc.bh); - brelse(iloc.bh); -exit_dindj: - //ext3_journal_release_buffer(handle, dind); -exit_primary: - //ext3_journal_release_buffer(handle, *primary); -exit_sbh: - //ext3_journal_release_buffer(handle, *primary); -exit_dind: - brelse(dind); -exit_bh: - brelse(*primary); - - ext3_debug("leaving with error %d\n", err); - return err; -} - -/* - * Called when we are adding a new group which has a backup copy of each of - * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. - * We need to add these reserved backup GDT blocks to the resize inode, so - * that they are kept for future resizing and not allocated to files. - * - * Each reserved backup GDT block will go into a different indirect block. - * The indirect blocks are actually the primary reserved GDT blocks, - * so we know in advance what their block numbers are. We only get the - * double-indirect block to verify it is pointing to the primary reserved - * GDT blocks so we don't overwrite a data block by accident. The reserved - * backup GDT blocks are stored in their reserved primary GDT block. 
- */ -static int reserve_backup_gdb(handle_t *handle, struct inode *inode, - struct ext3_new_group_data *input) -{ - struct super_block *sb = inode->i_sb; - int reserved_gdb =le16_to_cpu(EXT3_SB(sb)->s_es->s_reserved_gdt_blocks); - struct buffer_head **primary; - struct buffer_head *dind; - struct ext3_iloc iloc; - ext3_fsblk_t blk; - __le32 *data, *end; - int gdbackups = 0; - int res, i; - int err; - - primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS); - if (!primary) - return -ENOMEM; - - data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK; - dind = sb_bread(sb, le32_to_cpu(*data)); - if (!dind) { - err = -EIO; - goto exit_free; - } - - blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count; - data = (__le32 *)dind->b_data + (EXT3_SB(sb)->s_gdb_count % - EXT3_ADDR_PER_BLOCK(sb)); - end = (__le32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb); - - /* Get each reserved primary GDT block and verify it holds backups */ - for (res = 0; res < reserved_gdb; res++, blk++) { - if (le32_to_cpu(*data) != blk) { - ext3_warning(sb, __func__, - "reserved block "E3FSBLK - " not at offset %ld", - blk, - (long)(data - (__le32 *)dind->b_data)); - err = -EINVAL; - goto exit_bh; - } - primary[res] = sb_bread(sb, blk); - if (!primary[res]) { - err = -EIO; - goto exit_bh; - } - if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) { - brelse(primary[res]); - err = gdbackups; - goto exit_bh; - } - if (++data >= end) - data = (__le32 *)dind->b_data; - } - - for (i = 0; i < reserved_gdb; i++) { - if ((err = ext3_journal_get_write_access(handle, primary[i]))) { - /* - int j; - for (j = 0; j < i; j++) - ext3_journal_release_buffer(handle, primary[j]); - */ - goto exit_bh; - } - } - - if ((err = ext3_reserve_inode_write(handle, inode, &iloc))) - goto exit_bh; - - /* - * Finally we can add each of the reserved backup GDT blocks from - * the new group to its reserved primary GDT block. - */ - blk = input->group * EXT3_BLOCKS_PER_GROUP(sb); - for (i = 0; i < reserved_gdb; i++) { - int err2; - data = (__le32 *)primary[i]->b_data; - /* printk("reserving backup %lu[%u] = %lu\n", - primary[i]->b_blocknr, gdbackups, - blk + primary[i]->b_blocknr); */ - data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); - err2 = ext3_journal_dirty_metadata(handle, primary[i]); - if (!err) - err = err2; - } - inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9; - ext3_mark_iloc_dirty(handle, inode, &iloc); - -exit_bh: - while (--res >= 0) - brelse(primary[res]); - brelse(dind); - -exit_free: - kfree(primary); - - return err; -} - -/* - * Update the backup copies of the ext3 metadata. These don't need to be part - * of the main resize transaction, because e2fsck will re-write them if there - * is a problem (basically only OOM will cause a problem). However, we - * _should_ update the backups if possible, in case the primary gets trashed - * for some reason and we need to run e2fsck from a backup superblock. The - * important part is that the new block and inode counts are in the backup - * superblocks, and the location of the new group metadata in the GDT backups. - * - * We do not need take the s_resize_lock for this, because these - * blocks are not otherwise touched by the filesystem code when it is - * mounted. We don't need to worry about last changing from - * sbi->s_groups_count, because the worst that can happen is that we - * do not copy the full number of backups at this time. The resize - * which changed s_groups_count will backup again. 
- */ -static void update_backups(struct super_block *sb, - int blk_off, char *data, int size) -{ - struct ext3_sb_info *sbi = EXT3_SB(sb); - const unsigned long last = sbi->s_groups_count; - const int bpg = EXT3_BLOCKS_PER_GROUP(sb); - unsigned three = 1; - unsigned five = 5; - unsigned seven = 7; - unsigned group; - int rest = sb->s_blocksize - size; - handle_t *handle; - int err = 0, err2; - - handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA); - if (IS_ERR(handle)) { - group = 1; - err = PTR_ERR(handle); - goto exit_err; - } - - while ((group = ext3_list_backups(sb, &three, &five, &seven)) < last) { - struct buffer_head *bh; - - /* Out of journal space, and can't get more - abort - so sad */ - if (handle->h_buffer_credits == 0 && - ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA) && - (err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA))) - break; - - bh = sb_getblk(sb, group * bpg + blk_off); - if (unlikely(!bh)) { - err = -ENOMEM; - break; - } - ext3_debug("update metadata backup %#04lx\n", - (unsigned long)bh->b_blocknr); - if ((err = ext3_journal_get_write_access(handle, bh))) { - brelse(bh); - break; - } - lock_buffer(bh); - memcpy(bh->b_data, data, size); - if (rest) - memset(bh->b_data + size, 0, rest); - set_buffer_uptodate(bh); - unlock_buffer(bh); - err = ext3_journal_dirty_metadata(handle, bh); - brelse(bh); - if (err) - break; - } - if ((err2 = ext3_journal_stop(handle)) && !err) - err = err2; - - /* - * Ugh! Need to have e2fsck write the backup copies. It is too - * late to revert the resize, we shouldn't fail just because of - * the backup copies (they are only needed in case of corruption). - * - * However, if we got here we have a journal problem too, so we - * can't really start a transaction to mark the superblock. - * Chicken out and just set the flag on the hope it will be written - * to disk, and if not - we will simply wait until next fsck. - */ -exit_err: - if (err) { - ext3_warning(sb, __func__, - "can't update backup for group %d (err %d), " - "forcing fsck on next reboot", group, err); - sbi->s_mount_state &= ~EXT3_VALID_FS; - sbi->s_es->s_state &= cpu_to_le16(~EXT3_VALID_FS); - mark_buffer_dirty(sbi->s_sbh); - } -} - -/* Add group descriptor data to an existing or new group descriptor block. - * Ensure we handle all possible error conditions _before_ we start modifying - * the filesystem, because we cannot abort the transaction and not have it - * write the data to disk. - * - * If we are on a GDT block boundary, we need to get the reserved GDT block. - * Otherwise, we may need to add backup GDT blocks for a sparse group. - * - * We only need to hold the superblock lock while we are actually adding - * in the new group's counts to the superblock. Prior to that we have - * not really "added" the group at all. We re-check that we are still - * adding in the last group in case things have changed since verifying. - */ -int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) -{ - struct ext3_sb_info *sbi = EXT3_SB(sb); - struct ext3_super_block *es = sbi->s_es; - int reserved_gdb = ext3_bg_has_super(sb, input->group) ? 
- le16_to_cpu(es->s_reserved_gdt_blocks) : 0; - struct buffer_head *primary = NULL; - struct ext3_group_desc *gdp; - struct inode *inode = NULL; - handle_t *handle; - int gdb_off, gdb_num; - int err, err2; - - gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb); - gdb_off = input->group % EXT3_DESC_PER_BLOCK(sb); - - if (gdb_off == 0 && !EXT3_HAS_RO_COMPAT_FEATURE(sb, - EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) { - ext3_warning(sb, __func__, - "Can't resize non-sparse filesystem further"); - return -EPERM; - } - - if (le32_to_cpu(es->s_blocks_count) + input->blocks_count < - le32_to_cpu(es->s_blocks_count)) { - ext3_warning(sb, __func__, "blocks_count overflow\n"); - return -EINVAL; - } - - if (le32_to_cpu(es->s_inodes_count) + EXT3_INODES_PER_GROUP(sb) < - le32_to_cpu(es->s_inodes_count)) { - ext3_warning(sb, __func__, "inodes_count overflow\n"); - return -EINVAL; - } - - if (reserved_gdb || gdb_off == 0) { - if (!EXT3_HAS_COMPAT_FEATURE(sb, - EXT3_FEATURE_COMPAT_RESIZE_INODE) - || !le16_to_cpu(es->s_reserved_gdt_blocks)) { - ext3_warning(sb, __func__, - "No reserved GDT blocks, can't resize"); - return -EPERM; - } - inode = ext3_iget(sb, EXT3_RESIZE_INO); - if (IS_ERR(inode)) { - ext3_warning(sb, __func__, - "Error opening resize inode"); - return PTR_ERR(inode); - } - } - - if ((err = verify_group_input(sb, input))) - goto exit_put; - - if ((err = setup_new_group_blocks(sb, input))) - goto exit_put; - - /* - * We will always be modifying at least the superblock and a GDT - * block. If we are adding a group past the last current GDT block, - * we will also modify the inode and the dindirect block. If we - * are adding a group with superblock/GDT backups we will also - * modify each of the reserved GDT dindirect blocks. - */ - handle = ext3_journal_start_sb(sb, - ext3_bg_has_super(sb, input->group) ? - 3 + reserved_gdb : 4); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto exit_put; - } - - mutex_lock(&sbi->s_resize_lock); - if (input->group != sbi->s_groups_count) { - ext3_warning(sb, __func__, - "multiple resizers run on filesystem!"); - err = -EBUSY; - goto exit_journal; - } - - if ((err = ext3_journal_get_write_access(handle, sbi->s_sbh))) - goto exit_journal; - - /* - * We will only either add reserved group blocks to a backup group - * or remove reserved blocks for the first group in a new group block. - * Doing both would mean more complex code, and sane people don't - * use non-sparse filesystems anymore. This is already checked above. - */ - if (gdb_off) { - primary = sbi->s_group_desc[gdb_num]; - if ((err = ext3_journal_get_write_access(handle, primary))) - goto exit_journal; - - if (reserved_gdb && ext3_bg_num_gdb(sb, input->group) && - (err = reserve_backup_gdb(handle, inode, input))) - goto exit_journal; - } else if ((err = add_new_gdb(handle, inode, input, &primary))) - goto exit_journal; - - /* - * OK, now we've set up the new group. Time to make it active. - * - * We do not lock all allocations via s_resize_lock - * so we have to be safe wrt. concurrent accesses to the group - * data. So we need to be careful to set all of the relevant - * group descriptor data etc. *before* we enable the group. - * - * The key field here is sbi->s_groups_count: as long as - * that retains its old value, nobody is going to access the new - * group. 
- * - * So first we update all the descriptor metadata for the new - * group; then we update the total disk blocks count; then we - * update the groups count to enable the group; then finally we - * update the free space counts so that the system can start - * using the new disk blocks. - */ - - /* Update group descriptor block for new group */ - gdp = (struct ext3_group_desc *)primary->b_data + gdb_off; - - gdp->bg_block_bitmap = cpu_to_le32(input->block_bitmap); - gdp->bg_inode_bitmap = cpu_to_le32(input->inode_bitmap); - gdp->bg_inode_table = cpu_to_le32(input->inode_table); - gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); - gdp->bg_free_inodes_count = cpu_to_le16(EXT3_INODES_PER_GROUP(sb)); - - /* - * Make the new blocks and inodes valid next. We do this before - * increasing the group count so that once the group is enabled, - * all of its blocks and inodes are already valid. - * - * We always allocate group-by-group, then block-by-block or - * inode-by-inode within a group, so enabling these - * blocks/inodes before the group is live won't actually let us - * allocate the new space yet. - */ - le32_add_cpu(&es->s_blocks_count, input->blocks_count); - le32_add_cpu(&es->s_inodes_count, EXT3_INODES_PER_GROUP(sb)); - - /* - * We need to protect s_groups_count against other CPUs seeing - * inconsistent state in the superblock. - * - * The precise rules we use are: - * - * * Writers of s_groups_count *must* hold s_resize_lock - * AND - * * Writers must perform a smp_wmb() after updating all dependent - * data and before modifying the groups count - * - * * Readers must hold s_resize_lock over the access - * OR - * * Readers must perform an smp_rmb() after reading the groups count - * and before reading any dependent data. - * - * NB. These rules can be relaxed when checking the group count - * while freeing data, as we can only allocate from a block - * group after serialising against the group count, and we can - * only then free after serialising in turn against that - * allocation. - */ - smp_wmb(); - - /* Update the global fs size fields */ - sbi->s_groups_count++; - - err = ext3_journal_dirty_metadata(handle, primary); - if (err) - goto exit_journal; - - /* Update the reserved block counts only once the new group is - * active. */ - le32_add_cpu(&es->s_r_blocks_count, input->reserved_blocks); - - /* Update the free space counts */ - percpu_counter_add(&sbi->s_freeblocks_counter, - input->free_blocks_count); - percpu_counter_add(&sbi->s_freeinodes_counter, - EXT3_INODES_PER_GROUP(sb)); - - err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); - -exit_journal: - mutex_unlock(&sbi->s_resize_lock); - if ((err2 = ext3_journal_stop(handle)) && !err) - err = err2; - if (!err) { - update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext3_super_block)); - update_backups(sb, primary->b_blocknr, primary->b_data, - primary->b_size); - } -exit_put: - iput(inode); - return err; -} /* ext3_group_add */ - -/* Extend the filesystem to the new number of blocks specified. This entry - * point is only used to extend the current filesystem to the end of the last - * existing group. It can be accessed via ioctl, or by "remount,resize=<size>" - * for emergencies (because it has no dependencies on reserved blocks). - * - * If we _really_ wanted, we could use default values to call ext3_group_add() - * to allow the "remount" trick to work for arbitrary resizing, assuming enough - * GDT blocks are reserved to grow to the desired size. 
- */ -int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, - ext3_fsblk_t n_blocks_count) -{ - ext3_fsblk_t o_blocks_count; - ext3_grpblk_t last; - ext3_grpblk_t add; - struct buffer_head * bh; - handle_t *handle; - int err; - unsigned long freed_blocks; - - /* We don't need to worry about locking wrt other resizers just - * yet: we're going to revalidate es->s_blocks_count after - * taking the s_resize_lock below. */ - o_blocks_count = le32_to_cpu(es->s_blocks_count); - - if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK - " up to "E3FSBLK" blocks\n", - o_blocks_count, n_blocks_count); - - if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) - return 0; - - if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { - printk(KERN_ERR "EXT3-fs: filesystem on %s:" - " too large to resize to "E3FSBLK" blocks safely\n", - sb->s_id, n_blocks_count); - if (sizeof(sector_t) < 8) - ext3_warning(sb, __func__, - "CONFIG_LBDAF not enabled\n"); - return -EINVAL; - } - - if (n_blocks_count < o_blocks_count) { - ext3_warning(sb, __func__, - "can't shrink FS - resize aborted"); - return -EBUSY; - } - - /* Handle the remaining blocks in the last group only. */ - last = (o_blocks_count - le32_to_cpu(es->s_first_data_block)) % - EXT3_BLOCKS_PER_GROUP(sb); - - if (last == 0) { - ext3_warning(sb, __func__, - "need to use ext2online to resize further"); - return -EPERM; - } - - add = EXT3_BLOCKS_PER_GROUP(sb) - last; - - if (o_blocks_count + add < o_blocks_count) { - ext3_warning(sb, __func__, "blocks_count overflow"); - return -EINVAL; - } - - if (o_blocks_count + add > n_blocks_count) - add = n_blocks_count - o_blocks_count; - - if (o_blocks_count + add < n_blocks_count) - ext3_warning(sb, __func__, - "will only finish group ("E3FSBLK - " blocks, %u new)", - o_blocks_count + add, add); - - /* See if the device is actually as big as what was requested */ - bh = sb_bread(sb, o_blocks_count + add -1); - if (!bh) { - ext3_warning(sb, __func__, - "can't read last block, resize aborted"); - return -ENOSPC; - } - brelse(bh); - - /* We will update the superblock, one block bitmap, and - * one group descriptor via ext3_free_blocks(). 
- */ - handle = ext3_journal_start_sb(sb, 3); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - ext3_warning(sb, __func__, "error %d on journal start",err); - goto exit_put; - } - - mutex_lock(&EXT3_SB(sb)->s_resize_lock); - if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { - ext3_warning(sb, __func__, - "multiple resizers run on filesystem!"); - mutex_unlock(&EXT3_SB(sb)->s_resize_lock); - ext3_journal_stop(handle); - err = -EBUSY; - goto exit_put; - } - - if ((err = ext3_journal_get_write_access(handle, - EXT3_SB(sb)->s_sbh))) { - ext3_warning(sb, __func__, - "error %d on journal write access", err); - mutex_unlock(&EXT3_SB(sb)->s_resize_lock); - ext3_journal_stop(handle); - goto exit_put; - } - es->s_blocks_count = cpu_to_le32(o_blocks_count + add); - err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); - mutex_unlock(&EXT3_SB(sb)->s_resize_lock); - if (err) { - ext3_warning(sb, __func__, - "error %d on journal dirty metadata", err); - ext3_journal_stop(handle); - goto exit_put; - } - ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n", - o_blocks_count, o_blocks_count + add); - ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); - ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", - o_blocks_count, o_blocks_count + add); - if ((err = ext3_journal_stop(handle))) - goto exit_put; - if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT3-fs: extended group to %u blocks\n", - le32_to_cpu(es->s_blocks_count)); - update_backups(sb, EXT3_SB(sb)->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext3_super_block)); -exit_put: - return err; -} /* ext3_group_extend */ diff --git a/fs/ext3/super.c b/fs/ext3/super.c deleted file mode 100644 index 5ed0044fbb37..000000000000 --- a/fs/ext3/super.c +++ /dev/null @@ -1,3165 +0,0 @@ -/* - * linux/fs/ext3/super.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/inode.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. 
Miller (davem@caip.rutgers.edu), 1995 - */ - -#include <linux/module.h> -#include <linux/blkdev.h> -#include <linux/parser.h> -#include <linux/exportfs.h> -#include <linux/statfs.h> -#include <linux/random.h> -#include <linux/mount.h> -#include <linux/quotaops.h> -#include <linux/seq_file.h> -#include <linux/log2.h> -#include <linux/cleancache.h> -#include <linux/namei.h> - -#include <asm/uaccess.h> - -#define CREATE_TRACE_POINTS - -#include "ext3.h" -#include "xattr.h" -#include "acl.h" -#include "namei.h" - -#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED - #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA -#else - #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA -#endif - -static int ext3_load_journal(struct super_block *, struct ext3_super_block *, - unsigned long journal_devnum); -static int ext3_create_journal(struct super_block *, struct ext3_super_block *, - unsigned int); -static int ext3_commit_super(struct super_block *sb, - struct ext3_super_block *es, - int sync); -static void ext3_mark_recovery_complete(struct super_block * sb, - struct ext3_super_block * es); -static void ext3_clear_journal_err(struct super_block * sb, - struct ext3_super_block * es); -static int ext3_sync_fs(struct super_block *sb, int wait); -static const char *ext3_decode_error(struct super_block * sb, int errno, - char nbuf[16]); -static int ext3_remount (struct super_block * sb, int * flags, char * data); -static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); -static int ext3_unfreeze(struct super_block *sb); -static int ext3_freeze(struct super_block *sb); - -/* - * Wrappers for journal_start/end. - */ -handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks) -{ - journal_t *journal; - - if (sb->s_flags & MS_RDONLY) - return ERR_PTR(-EROFS); - - /* Special case here: if the journal has aborted behind our - * backs (eg. EIO in the commit thread), then we still need to - * take the FS itself readonly cleanly. */ - journal = EXT3_SB(sb)->s_journal; - if (is_journal_aborted(journal)) { - ext3_abort(sb, __func__, - "Detected aborted journal"); - return ERR_PTR(-EROFS); - } - - return journal_start(journal, nblocks); -} - -int __ext3_journal_stop(const char *where, handle_t *handle) -{ - struct super_block *sb; - int err; - int rc; - - sb = handle->h_transaction->t_journal->j_private; - err = handle->h_err; - rc = journal_stop(handle); - - if (!err) - err = rc; - if (err) - __ext3_std_error(sb, where, err); - return err; -} - -void ext3_journal_abort_handle(const char *caller, const char *err_fn, - struct buffer_head *bh, handle_t *handle, int err) -{ - char nbuf[16]; - const char *errstr = ext3_decode_error(NULL, err, nbuf); - - if (bh) - BUFFER_TRACE(bh, "abort"); - - if (!handle->h_err) - handle->h_err = err; - - if (is_handle_aborted(handle)) - return; - - printk(KERN_ERR "EXT3-fs: %s: aborting transaction: %s in %s\n", - caller, errstr, err_fn); - - journal_abort_handle(handle); -} - -void ext3_msg(struct super_block *sb, const char *prefix, - const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf); - - va_end(args); -} - -/* Deal with the reporting of failure conditions on a filesystem such as - * inconsistencies detected or read IO failures. - * - * On ext2, we can store the error state of the filesystem in the - * superblock. 
That is not possible on ext3, because we may have other - * write ordering constraints on the superblock which prevent us from - * writing it out straight away; and given that the journal is about to - * be aborted, we can't rely on the current, or future, transactions to - * write out the superblock safely. - * - * We'll just use the journal_abort() error code to record an error in - * the journal instead. On recovery, the journal will complain about - * that error until we've noted it down and cleared it. - */ - -static void ext3_handle_error(struct super_block *sb) -{ - struct ext3_super_block *es = EXT3_SB(sb)->s_es; - - EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; - es->s_state |= cpu_to_le16(EXT3_ERROR_FS); - - if (sb->s_flags & MS_RDONLY) - return; - - if (!test_opt (sb, ERRORS_CONT)) { - journal_t *journal = EXT3_SB(sb)->s_journal; - - set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); - if (journal) - journal_abort(journal, -EIO); - } - if (test_opt (sb, ERRORS_RO)) { - ext3_msg(sb, KERN_CRIT, - "error: remounting filesystem read-only"); - /* - * Make sure updated value of ->s_mount_state will be visible - * before ->s_flags update. - */ - smp_wmb(); - sb->s_flags |= MS_RDONLY; - } - ext3_commit_super(sb, es, 1); - if (test_opt(sb, ERRORS_PANIC)) - panic("EXT3-fs (%s): panic forced after error\n", - sb->s_id); -} - -void ext3_error(struct super_block *sb, const char *function, - const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n", - sb->s_id, function, &vaf); - - va_end(args); - - ext3_handle_error(sb); -} - -static const char *ext3_decode_error(struct super_block * sb, int errno, - char nbuf[16]) -{ - char *errstr = NULL; - - switch (errno) { - case -EIO: - errstr = "IO failure"; - break; - case -ENOMEM: - errstr = "Out of memory"; - break; - case -EROFS: - if (!sb || EXT3_SB(sb)->s_journal->j_flags & JFS_ABORT) - errstr = "Journal has aborted"; - else - errstr = "Readonly filesystem"; - break; - default: - /* If the caller passed in an extra buffer for unknown - * errors, textualise them now. Else we just return - * NULL. */ - if (nbuf) { - /* Check for truncated error codes... */ - if (snprintf(nbuf, 16, "error %d", -errno) >= 0) - errstr = nbuf; - } - break; - } - - return errstr; -} - -/* __ext3_std_error decodes expected errors from journaling functions - * automatically and invokes the appropriate error response. */ - -void __ext3_std_error (struct super_block * sb, const char * function, - int errno) -{ - char nbuf[16]; - const char *errstr; - - /* Special case: if the error is EROFS, and we're not already - * inside a transaction, then there's really no point in logging - * an error. */ - if (errno == -EROFS && journal_current_handle() == NULL && - (sb->s_flags & MS_RDONLY)) - return; - - errstr = ext3_decode_error(sb, errno, nbuf); - ext3_msg(sb, KERN_CRIT, "error in %s: %s", function, errstr); - - ext3_handle_error(sb); -} - -/* - * ext3_abort is a much stronger failure handler than ext3_error. The - * abort function may be used to deal with unrecoverable failures such - * as journal IO errors or ENOMEM at a critical moment in log management. - * - * We unconditionally force the filesystem into an ABORT|READONLY state, - * unless the error response on the fs has been set to panic in which - * case we take the easy way out and panic immediately. - */ - -void ext3_abort(struct super_block *sb, const char *function, - const char *fmt, ...) 
-{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n", - sb->s_id, function, &vaf); - - va_end(args); - - if (test_opt(sb, ERRORS_PANIC)) - panic("EXT3-fs: panic from previous error\n"); - - if (sb->s_flags & MS_RDONLY) - return; - - ext3_msg(sb, KERN_CRIT, - "error: remounting filesystem read-only"); - EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; - set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); - /* - * Make sure updated value of ->s_mount_state will be visible - * before ->s_flags update. - */ - smp_wmb(); - sb->s_flags |= MS_RDONLY; - - if (EXT3_SB(sb)->s_journal) - journal_abort(EXT3_SB(sb)->s_journal, -EIO); -} - -void ext3_warning(struct super_block *sb, const char *function, - const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n", - sb->s_id, function, &vaf); - - va_end(args); -} - -void ext3_update_dynamic_rev(struct super_block *sb) -{ - struct ext3_super_block *es = EXT3_SB(sb)->s_es; - - if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV) - return; - - ext3_msg(sb, KERN_WARNING, - "warning: updating to rev %d because of " - "new feature flag, running e2fsck is recommended", - EXT3_DYNAMIC_REV); - - es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO); - es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE); - es->s_rev_level = cpu_to_le32(EXT3_DYNAMIC_REV); - /* leave es->s_feature_*compat flags alone */ - /* es->s_uuid will be set by e2fsck if empty */ - - /* - * The rest of the superblock fields should be zero, and if not it - * means they are likely already in use, so leave them alone. We - * can leave it up to e2fsck to clean up any inconsistencies there. 
- */ -} - -/* - * Open the external journal device - */ -static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb) -{ - struct block_device *bdev; - char b[BDEVNAME_SIZE]; - - bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); - if (IS_ERR(bdev)) - goto fail; - return bdev; - -fail: - ext3_msg(sb, KERN_ERR, "error: failed to open journal device %s: %ld", - __bdevname(dev, b), PTR_ERR(bdev)); - - return NULL; -} - -/* - * Release the journal device - */ -static void ext3_blkdev_put(struct block_device *bdev) -{ - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); -} - -static void ext3_blkdev_remove(struct ext3_sb_info *sbi) -{ - struct block_device *bdev; - bdev = sbi->journal_bdev; - if (bdev) { - ext3_blkdev_put(bdev); - sbi->journal_bdev = NULL; - } -} - -static inline struct inode *orphan_list_entry(struct list_head *l) -{ - return &list_entry(l, struct ext3_inode_info, i_orphan)->vfs_inode; -} - -static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi) -{ - struct list_head *l; - - ext3_msg(sb, KERN_ERR, "error: sb orphan head is %d", - le32_to_cpu(sbi->s_es->s_last_orphan)); - - ext3_msg(sb, KERN_ERR, "sb_info orphan list:"); - list_for_each(l, &sbi->s_orphan) { - struct inode *inode = orphan_list_entry(l); - ext3_msg(sb, KERN_ERR, " " - "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", - inode->i_sb->s_id, inode->i_ino, inode, - inode->i_mode, inode->i_nlink, - NEXT_ORPHAN(inode)); - } -} - -static void ext3_put_super (struct super_block * sb) -{ - struct ext3_sb_info *sbi = EXT3_SB(sb); - struct ext3_super_block *es = sbi->s_es; - int i, err; - - dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); - ext3_xattr_put_super(sb); - err = journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; - if (err < 0) - ext3_abort(sb, __func__, "Couldn't clean up the journal"); - - if (!(sb->s_flags & MS_RDONLY)) { - EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); - es->s_state = cpu_to_le16(sbi->s_mount_state); - BUFFER_TRACE(sbi->s_sbh, "marking dirty"); - mark_buffer_dirty(sbi->s_sbh); - ext3_commit_super(sb, es, 1); - } - - for (i = 0; i < sbi->s_gdb_count; i++) - brelse(sbi->s_group_desc[i]); - kfree(sbi->s_group_desc); - percpu_counter_destroy(&sbi->s_freeblocks_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - brelse(sbi->s_sbh); -#ifdef CONFIG_QUOTA - for (i = 0; i < EXT3_MAXQUOTAS; i++) - kfree(sbi->s_qf_names[i]); -#endif - - /* Debugging code just in case the in-memory inode orphan list - * isn't empty. The on-disk one can be non-empty if we've - * detected an error and taken the fs readonly, but the - * in-memory list had better be clean by this point. */ - if (!list_empty(&sbi->s_orphan)) - dump_orphan_list(sb, sbi); - J_ASSERT(list_empty(&sbi->s_orphan)); - - invalidate_bdev(sb->s_bdev); - if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { - /* - * Invalidate the journal device's buffers. We don't want them - * floating about in memory - the physical journal device may - * be hotswapped, and it breaks the `ro-after' testing code. 
- */ - sync_blockdev(sbi->journal_bdev); - invalidate_bdev(sbi->journal_bdev); - ext3_blkdev_remove(sbi); - } - sb->s_fs_info = NULL; - kfree(sbi->s_blockgroup_lock); - mutex_destroy(&sbi->s_orphan_lock); - mutex_destroy(&sbi->s_resize_lock); - kfree(sbi); -} - -static struct kmem_cache *ext3_inode_cachep; - -/* - * Called inside transaction, so use GFP_NOFS - */ -static struct inode *ext3_alloc_inode(struct super_block *sb) -{ - struct ext3_inode_info *ei; - - ei = kmem_cache_alloc(ext3_inode_cachep, GFP_NOFS); - if (!ei) - return NULL; - ei->i_block_alloc_info = NULL; - ei->vfs_inode.i_version = 1; - atomic_set(&ei->i_datasync_tid, 0); - atomic_set(&ei->i_sync_tid, 0); -#ifdef CONFIG_QUOTA - memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); -#endif - - return &ei->vfs_inode; -} - -static int ext3_drop_inode(struct inode *inode) -{ - int drop = generic_drop_inode(inode); - - trace_ext3_drop_inode(inode, drop); - return drop; -} - -static void ext3_i_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); -} - -static void ext3_destroy_inode(struct inode *inode) -{ - if (!list_empty(&(EXT3_I(inode)->i_orphan))) { - printk("EXT3 Inode %p: orphan list check failed!\n", - EXT3_I(inode)); - print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, - EXT3_I(inode), sizeof(struct ext3_inode_info), - false); - dump_stack(); - } - call_rcu(&inode->i_rcu, ext3_i_callback); -} - -static void init_once(void *foo) -{ - struct ext3_inode_info *ei = (struct ext3_inode_info *) foo; - - INIT_LIST_HEAD(&ei->i_orphan); -#ifdef CONFIG_EXT3_FS_XATTR - init_rwsem(&ei->xattr_sem); -#endif - mutex_init(&ei->truncate_mutex); - inode_init_once(&ei->vfs_inode); -} - -static int __init init_inodecache(void) -{ - ext3_inode_cachep = kmem_cache_create("ext3_inode_cache", - sizeof(struct ext3_inode_info), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), - init_once); - if (ext3_inode_cachep == NULL) - return -ENOMEM; - return 0; -} - -static void destroy_inodecache(void) -{ - /* - * Make sure all delayed rcu free inodes are flushed before we - * destroy cache. 
- */ - rcu_barrier(); - kmem_cache_destroy(ext3_inode_cachep); -} - -static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb) -{ -#if defined(CONFIG_QUOTA) - struct ext3_sb_info *sbi = EXT3_SB(sb); - - if (sbi->s_jquota_fmt) { - char *fmtname = ""; - - switch (sbi->s_jquota_fmt) { - case QFMT_VFS_OLD: - fmtname = "vfsold"; - break; - case QFMT_VFS_V0: - fmtname = "vfsv0"; - break; - case QFMT_VFS_V1: - fmtname = "vfsv1"; - break; - } - seq_printf(seq, ",jqfmt=%s", fmtname); - } - - if (sbi->s_qf_names[USRQUOTA]) - seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); - - if (sbi->s_qf_names[GRPQUOTA]) - seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); - - if (test_opt(sb, USRQUOTA)) - seq_puts(seq, ",usrquota"); - - if (test_opt(sb, GRPQUOTA)) - seq_puts(seq, ",grpquota"); -#endif -} - -static char *data_mode_string(unsigned long mode) -{ - switch (mode) { - case EXT3_MOUNT_JOURNAL_DATA: - return "journal"; - case EXT3_MOUNT_ORDERED_DATA: - return "ordered"; - case EXT3_MOUNT_WRITEBACK_DATA: - return "writeback"; - } - return "unknown"; -} - -/* - * Show an option if - * - it's set to a non-default value OR - * - if the per-sb default is different from the global default - */ -static int ext3_show_options(struct seq_file *seq, struct dentry *root) -{ - struct super_block *sb = root->d_sb; - struct ext3_sb_info *sbi = EXT3_SB(sb); - struct ext3_super_block *es = sbi->s_es; - unsigned long def_mount_opts; - - def_mount_opts = le32_to_cpu(es->s_default_mount_opts); - - if (sbi->s_sb_block != 1) - seq_printf(seq, ",sb=%lu", sbi->s_sb_block); - if (test_opt(sb, MINIX_DF)) - seq_puts(seq, ",minixdf"); - if (test_opt(sb, GRPID)) - seq_puts(seq, ",grpid"); - if (!test_opt(sb, GRPID) && (def_mount_opts & EXT3_DEFM_BSDGROUPS)) - seq_puts(seq, ",nogrpid"); - if (!uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT3_DEF_RESUID)) || - le16_to_cpu(es->s_def_resuid) != EXT3_DEF_RESUID) { - seq_printf(seq, ",resuid=%u", - from_kuid_munged(&init_user_ns, sbi->s_resuid)); - } - if (!gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT3_DEF_RESGID)) || - le16_to_cpu(es->s_def_resgid) != EXT3_DEF_RESGID) { - seq_printf(seq, ",resgid=%u", - from_kgid_munged(&init_user_ns, sbi->s_resgid)); - } - if (test_opt(sb, ERRORS_RO)) { - int def_errors = le16_to_cpu(es->s_errors); - - if (def_errors == EXT3_ERRORS_PANIC || - def_errors == EXT3_ERRORS_CONTINUE) { - seq_puts(seq, ",errors=remount-ro"); - } - } - if (test_opt(sb, ERRORS_CONT)) - seq_puts(seq, ",errors=continue"); - if (test_opt(sb, ERRORS_PANIC)) - seq_puts(seq, ",errors=panic"); - if (test_opt(sb, NO_UID32)) - seq_puts(seq, ",nouid32"); - if (test_opt(sb, DEBUG)) - seq_puts(seq, ",debug"); -#ifdef CONFIG_EXT3_FS_XATTR - if (test_opt(sb, XATTR_USER)) - seq_puts(seq, ",user_xattr"); - if (!test_opt(sb, XATTR_USER) && - (def_mount_opts & EXT3_DEFM_XATTR_USER)) { - seq_puts(seq, ",nouser_xattr"); - } -#endif -#ifdef CONFIG_EXT3_FS_POSIX_ACL - if (test_opt(sb, POSIX_ACL)) - seq_puts(seq, ",acl"); - if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT3_DEFM_ACL)) - seq_puts(seq, ",noacl"); -#endif - if (!test_opt(sb, RESERVATION)) - seq_puts(seq, ",noreservation"); - if (sbi->s_commit_interval) { - seq_printf(seq, ",commit=%u", - (unsigned) (sbi->s_commit_interval / HZ)); - } - - /* - * Always display barrier state so it's clear what the status is. - */ - seq_puts(seq, ",barrier="); - seq_puts(seq, test_opt(sb, BARRIER) ? 
"1" : "0"); - seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS))); - if (test_opt(sb, DATA_ERR_ABORT)) - seq_puts(seq, ",data_err=abort"); - - if (test_opt(sb, NOLOAD)) - seq_puts(seq, ",norecovery"); - - ext3_show_quota_options(seq, sb); - - return 0; -} - - -static struct inode *ext3_nfs_get_inode(struct super_block *sb, - u64 ino, u32 generation) -{ - struct inode *inode; - - if (ino < EXT3_FIRST_INO(sb) && ino != EXT3_ROOT_INO) - return ERR_PTR(-ESTALE); - if (ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)) - return ERR_PTR(-ESTALE); - - /* iget isn't really right if the inode is currently unallocated!! - * - * ext3_read_inode will return a bad_inode if the inode had been - * deleted, so we should be safe. - * - * Currently we don't know the generation for parent directory, so - * a generation of 0 means "accept any" - */ - inode = ext3_iget(sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); - if (generation && inode->i_generation != generation) { - iput(inode); - return ERR_PTR(-ESTALE); - } - - return inode; -} - -static struct dentry *ext3_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - return generic_fh_to_dentry(sb, fid, fh_len, fh_type, - ext3_nfs_get_inode); -} - -static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - return generic_fh_to_parent(sb, fid, fh_len, fh_type, - ext3_nfs_get_inode); -} - -/* - * Try to release metadata pages (indirect blocks, directories) which are - * mapped via the block device. Since these pages could have journal heads - * which would prevent try_to_free_buffers() from freeing them, we must use - * jbd layer's try_to_free_buffers() function to release them. - */ -static int bdev_try_to_free_page(struct super_block *sb, struct page *page, - gfp_t wait) -{ - journal_t *journal = EXT3_SB(sb)->s_journal; - - WARN_ON(PageChecked(page)); - if (!page_has_buffers(page)) - return 0; - if (journal) - return journal_try_to_free_buffers(journal, page, - wait & ~__GFP_WAIT); - return try_to_free_buffers(page); -} - -#ifdef CONFIG_QUOTA -#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") -#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) - -static int ext3_write_dquot(struct dquot *dquot); -static int ext3_acquire_dquot(struct dquot *dquot); -static int ext3_release_dquot(struct dquot *dquot); -static int ext3_mark_dquot_dirty(struct dquot *dquot); -static int ext3_write_info(struct super_block *sb, int type); -static int ext3_quota_on(struct super_block *sb, int type, int format_id, - struct path *path); -static int ext3_quota_on_mount(struct super_block *sb, int type); -static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, - size_t len, loff_t off); -static ssize_t ext3_quota_write(struct super_block *sb, int type, - const char *data, size_t len, loff_t off); -static struct dquot **ext3_get_dquots(struct inode *inode) -{ - return EXT3_I(inode)->i_dquot; -} - -static const struct dquot_operations ext3_quota_operations = { - .write_dquot = ext3_write_dquot, - .acquire_dquot = ext3_acquire_dquot, - .release_dquot = ext3_release_dquot, - .mark_dirty = ext3_mark_dquot_dirty, - .write_info = ext3_write_info, - .alloc_dquot = dquot_alloc, - .destroy_dquot = dquot_destroy, -}; - -static const struct quotactl_ops ext3_qctl_operations = { - .quota_on = ext3_quota_on, - .quota_off = dquot_quota_off, - .quota_sync = dquot_quota_sync, - .get_state = dquot_get_state, - .set_info = 
dquot_set_dqinfo, - .get_dqblk = dquot_get_dqblk, - .set_dqblk = dquot_set_dqblk -}; -#endif - -static const struct super_operations ext3_sops = { - .alloc_inode = ext3_alloc_inode, - .destroy_inode = ext3_destroy_inode, - .write_inode = ext3_write_inode, - .dirty_inode = ext3_dirty_inode, - .drop_inode = ext3_drop_inode, - .evict_inode = ext3_evict_inode, - .put_super = ext3_put_super, - .sync_fs = ext3_sync_fs, - .freeze_fs = ext3_freeze, - .unfreeze_fs = ext3_unfreeze, - .statfs = ext3_statfs, - .remount_fs = ext3_remount, - .show_options = ext3_show_options, -#ifdef CONFIG_QUOTA - .quota_read = ext3_quota_read, - .quota_write = ext3_quota_write, - .get_dquots = ext3_get_dquots, -#endif - .bdev_try_to_free_page = bdev_try_to_free_page, -}; - -static const struct export_operations ext3_export_ops = { - .fh_to_dentry = ext3_fh_to_dentry, - .fh_to_parent = ext3_fh_to_parent, - .get_parent = ext3_get_parent, -}; - -enum { - Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, - Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, - Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, - Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, - Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, - Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, - Opt_journal_path, - Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_data_err_abort, Opt_data_err_ignore, - Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, - Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_resize, Opt_usrquota, Opt_grpquota -}; - -static const match_table_t tokens = { - {Opt_bsd_df, "bsddf"}, - {Opt_minix_df, "minixdf"}, - {Opt_grpid, "grpid"}, - {Opt_grpid, "bsdgroups"}, - {Opt_nogrpid, "nogrpid"}, - {Opt_nogrpid, "sysvgroups"}, - {Opt_resgid, "resgid=%u"}, - {Opt_resuid, "resuid=%u"}, - {Opt_sb, "sb=%u"}, - {Opt_err_cont, "errors=continue"}, - {Opt_err_panic, "errors=panic"}, - {Opt_err_ro, "errors=remount-ro"}, - {Opt_nouid32, "nouid32"}, - {Opt_nocheck, "nocheck"}, - {Opt_nocheck, "check=none"}, - {Opt_debug, "debug"}, - {Opt_oldalloc, "oldalloc"}, - {Opt_orlov, "orlov"}, - {Opt_user_xattr, "user_xattr"}, - {Opt_nouser_xattr, "nouser_xattr"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_reservation, "reservation"}, - {Opt_noreservation, "noreservation"}, - {Opt_noload, "noload"}, - {Opt_noload, "norecovery"}, - {Opt_nobh, "nobh"}, - {Opt_bh, "bh"}, - {Opt_commit, "commit=%u"}, - {Opt_journal_update, "journal=update"}, - {Opt_journal_inum, "journal=%u"}, - {Opt_journal_dev, "journal_dev=%u"}, - {Opt_journal_path, "journal_path=%s"}, - {Opt_abort, "abort"}, - {Opt_data_journal, "data=journal"}, - {Opt_data_ordered, "data=ordered"}, - {Opt_data_writeback, "data=writeback"}, - {Opt_data_err_abort, "data_err=abort"}, - {Opt_data_err_ignore, "data_err=ignore"}, - {Opt_offusrjquota, "usrjquota="}, - {Opt_usrjquota, "usrjquota=%s"}, - {Opt_offgrpjquota, "grpjquota="}, - {Opt_grpjquota, "grpjquota=%s"}, - {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, - {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, - {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, - {Opt_grpquota, "grpquota"}, - {Opt_noquota, "noquota"}, - {Opt_quota, "quota"}, - {Opt_usrquota, "usrquota"}, - {Opt_barrier, "barrier=%u"}, - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_resize, "resize"}, - {Opt_err, NULL}, -}; - -static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb) -{ - ext3_fsblk_t 
sb_block; - char *options = (char *) *data; - - if (!options || strncmp(options, "sb=", 3) != 0) - return 1; /* Default location */ - options += 3; - /*todo: use simple_strtoll with >32bit ext3 */ - sb_block = simple_strtoul(options, &options, 0); - if (*options && *options != ',') { - ext3_msg(sb, KERN_ERR, "error: invalid sb specification: %s", - (char *) *data); - return 1; - } - if (*options == ',') - options++; - *data = (void *) options; - return sb_block; -} - -#ifdef CONFIG_QUOTA -static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) -{ - struct ext3_sb_info *sbi = EXT3_SB(sb); - char *qname; - - if (sb_any_quota_loaded(sb) && - !sbi->s_qf_names[qtype]) { - ext3_msg(sb, KERN_ERR, - "Cannot change journaled " - "quota options when quota turned on"); - return 0; - } - qname = match_strdup(args); - if (!qname) { - ext3_msg(sb, KERN_ERR, - "Not enough memory for storing quotafile name"); - return 0; - } - if (sbi->s_qf_names[qtype]) { - int same = !strcmp(sbi->s_qf_names[qtype], qname); - - kfree(qname); - if (!same) { - ext3_msg(sb, KERN_ERR, - "%s quota file already specified", - QTYPE2NAME(qtype)); - } - return same; - } - if (strchr(qname, '/')) { - ext3_msg(sb, KERN_ERR, - "quotafile must be on filesystem root"); - kfree(qname); - return 0; - } - sbi->s_qf_names[qtype] = qname; - set_opt(sbi->s_mount_opt, QUOTA); - return 1; -} - -static int clear_qf_name(struct super_block *sb, int qtype) { - - struct ext3_sb_info *sbi = EXT3_SB(sb); - - if (sb_any_quota_loaded(sb) && - sbi->s_qf_names[qtype]) { - ext3_msg(sb, KERN_ERR, "Cannot change journaled quota options" - " when quota turned on"); - return 0; - } - if (sbi->s_qf_names[qtype]) { - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; - } - return 1; -} -#endif - -static int parse_options (char *options, struct super_block *sb, - unsigned int *inum, unsigned long *journal_devnum, - ext3_fsblk_t *n_blocks_count, int is_remount) -{ - struct ext3_sb_info *sbi = EXT3_SB(sb); - char * p; - substring_t args[MAX_OPT_ARGS]; - int data_opt = 0; - int option; - kuid_t uid; - kgid_t gid; - char *journal_path; - struct inode *journal_inode; - struct path path; - int error; - -#ifdef CONFIG_QUOTA - int qfmt; -#endif - - if (!options) - return 1; - - while ((p = strsep (&options, ",")) != NULL) { - int token; - if (!*p) - continue; - /* - * Initialize args struct so we know whether arg was - * found; some options take optional arguments. 
- */ - args[0].to = args[0].from = NULL; - token = match_token(p, tokens, args); - switch (token) { - case Opt_bsd_df: - clear_opt (sbi->s_mount_opt, MINIX_DF); - break; - case Opt_minix_df: - set_opt (sbi->s_mount_opt, MINIX_DF); - break; - case Opt_grpid: - set_opt (sbi->s_mount_opt, GRPID); - break; - case Opt_nogrpid: - clear_opt (sbi->s_mount_opt, GRPID); - break; - case Opt_resuid: - if (match_int(&args[0], &option)) - return 0; - uid = make_kuid(current_user_ns(), option); - if (!uid_valid(uid)) { - ext3_msg(sb, KERN_ERR, "Invalid uid value %d", option); - return 0; - - } - sbi->s_resuid = uid; - break; - case Opt_resgid: - if (match_int(&args[0], &option)) - return 0; - gid = make_kgid(current_user_ns(), option); - if (!gid_valid(gid)) { - ext3_msg(sb, KERN_ERR, "Invalid gid value %d", option); - return 0; - } - sbi->s_resgid = gid; - break; - case Opt_sb: - /* handled by get_sb_block() instead of here */ - /* *sb_block = match_int(&args[0]); */ - break; - case Opt_err_panic: - clear_opt (sbi->s_mount_opt, ERRORS_CONT); - clear_opt (sbi->s_mount_opt, ERRORS_RO); - set_opt (sbi->s_mount_opt, ERRORS_PANIC); - break; - case Opt_err_ro: - clear_opt (sbi->s_mount_opt, ERRORS_CONT); - clear_opt (sbi->s_mount_opt, ERRORS_PANIC); - set_opt (sbi->s_mount_opt, ERRORS_RO); - break; - case Opt_err_cont: - clear_opt (sbi->s_mount_opt, ERRORS_RO); - clear_opt (sbi->s_mount_opt, ERRORS_PANIC); - set_opt (sbi->s_mount_opt, ERRORS_CONT); - break; - case Opt_nouid32: - set_opt (sbi->s_mount_opt, NO_UID32); - break; - case Opt_nocheck: - clear_opt (sbi->s_mount_opt, CHECK); - break; - case Opt_debug: - set_opt (sbi->s_mount_opt, DEBUG); - break; - case Opt_oldalloc: - ext3_msg(sb, KERN_WARNING, - "Ignoring deprecated oldalloc option"); - break; - case Opt_orlov: - ext3_msg(sb, KERN_WARNING, - "Ignoring deprecated orlov option"); - break; -#ifdef CONFIG_EXT3_FS_XATTR - case Opt_user_xattr: - set_opt (sbi->s_mount_opt, XATTR_USER); - break; - case Opt_nouser_xattr: - clear_opt (sbi->s_mount_opt, XATTR_USER); - break; -#else - case Opt_user_xattr: - case Opt_nouser_xattr: - ext3_msg(sb, KERN_INFO, - "(no)user_xattr options not supported"); - break; -#endif -#ifdef CONFIG_EXT3_FS_POSIX_ACL - case Opt_acl: - set_opt(sbi->s_mount_opt, POSIX_ACL); - break; - case Opt_noacl: - clear_opt(sbi->s_mount_opt, POSIX_ACL); - break; -#else - case Opt_acl: - case Opt_noacl: - ext3_msg(sb, KERN_INFO, - "(no)acl options not supported"); - break; -#endif - case Opt_reservation: - set_opt(sbi->s_mount_opt, RESERVATION); - break; - case Opt_noreservation: - clear_opt(sbi->s_mount_opt, RESERVATION); - break; - case Opt_journal_update: - /* @@@ FIXME */ - /* Eventually we will want to be able to create - a journal file here. For now, only allow the - user to specify an existing inode to be the - journal file. 
*/ - if (is_remount) { - ext3_msg(sb, KERN_ERR, "error: cannot specify " - "journal on remount"); - return 0; - } - set_opt (sbi->s_mount_opt, UPDATE_JOURNAL); - break; - case Opt_journal_inum: - if (is_remount) { - ext3_msg(sb, KERN_ERR, "error: cannot specify " - "journal on remount"); - return 0; - } - if (match_int(&args[0], &option)) - return 0; - *inum = option; - break; - case Opt_journal_dev: - if (is_remount) { - ext3_msg(sb, KERN_ERR, "error: cannot specify " - "journal on remount"); - return 0; - } - if (match_int(&args[0], &option)) - return 0; - *journal_devnum = option; - break; - case Opt_journal_path: - if (is_remount) { - ext3_msg(sb, KERN_ERR, "error: cannot specify " - "journal on remount"); - return 0; - } - - journal_path = match_strdup(&args[0]); - if (!journal_path) { - ext3_msg(sb, KERN_ERR, "error: could not dup " - "journal device string"); - return 0; - } - - error = kern_path(journal_path, LOOKUP_FOLLOW, &path); - if (error) { - ext3_msg(sb, KERN_ERR, "error: could not find " - "journal device path: error %d", error); - kfree(journal_path); - return 0; - } - - journal_inode = d_inode(path.dentry); - if (!S_ISBLK(journal_inode->i_mode)) { - ext3_msg(sb, KERN_ERR, "error: journal path %s " - "is not a block device", journal_path); - path_put(&path); - kfree(journal_path); - return 0; - } - - *journal_devnum = new_encode_dev(journal_inode->i_rdev); - path_put(&path); - kfree(journal_path); - break; - case Opt_noload: - set_opt (sbi->s_mount_opt, NOLOAD); - break; - case Opt_commit: - if (match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - if (option == 0) - option = JBD_DEFAULT_MAX_COMMIT_AGE; - sbi->s_commit_interval = HZ * option; - break; - case Opt_data_journal: - data_opt = EXT3_MOUNT_JOURNAL_DATA; - goto datacheck; - case Opt_data_ordered: - data_opt = EXT3_MOUNT_ORDERED_DATA; - goto datacheck; - case Opt_data_writeback: - data_opt = EXT3_MOUNT_WRITEBACK_DATA; - datacheck: - if (is_remount) { - if (test_opt(sb, DATA_FLAGS) == data_opt) - break; - ext3_msg(sb, KERN_ERR, - "error: cannot change " - "data mode on remount. 
The filesystem " - "is mounted in data=%s mode and you " - "try to remount it in data=%s mode.", - data_mode_string(test_opt(sb, - DATA_FLAGS)), - data_mode_string(data_opt)); - return 0; - } else { - clear_opt(sbi->s_mount_opt, DATA_FLAGS); - sbi->s_mount_opt |= data_opt; - } - break; - case Opt_data_err_abort: - set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); - break; - case Opt_data_err_ignore: - clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); - break; -#ifdef CONFIG_QUOTA - case Opt_usrjquota: - if (!set_qf_name(sb, USRQUOTA, &args[0])) - return 0; - break; - case Opt_grpjquota: - if (!set_qf_name(sb, GRPQUOTA, &args[0])) - return 0; - break; - case Opt_offusrjquota: - if (!clear_qf_name(sb, USRQUOTA)) - return 0; - break; - case Opt_offgrpjquota: - if (!clear_qf_name(sb, GRPQUOTA)) - return 0; - break; - case Opt_jqfmt_vfsold: - qfmt = QFMT_VFS_OLD; - goto set_qf_format; - case Opt_jqfmt_vfsv0: - qfmt = QFMT_VFS_V0; - goto set_qf_format; - case Opt_jqfmt_vfsv1: - qfmt = QFMT_VFS_V1; -set_qf_format: - if (sb_any_quota_loaded(sb) && - sbi->s_jquota_fmt != qfmt) { - ext3_msg(sb, KERN_ERR, "error: cannot change " - "journaled quota options when " - "quota turned on."); - return 0; - } - sbi->s_jquota_fmt = qfmt; - break; - case Opt_quota: - case Opt_usrquota: - set_opt(sbi->s_mount_opt, QUOTA); - set_opt(sbi->s_mount_opt, USRQUOTA); - break; - case Opt_grpquota: - set_opt(sbi->s_mount_opt, QUOTA); - set_opt(sbi->s_mount_opt, GRPQUOTA); - break; - case Opt_noquota: - if (sb_any_quota_loaded(sb)) { - ext3_msg(sb, KERN_ERR, "error: cannot change " - "quota options when quota turned on."); - return 0; - } - clear_opt(sbi->s_mount_opt, QUOTA); - clear_opt(sbi->s_mount_opt, USRQUOTA); - clear_opt(sbi->s_mount_opt, GRPQUOTA); - break; -#else - case Opt_quota: - case Opt_usrquota: - case Opt_grpquota: - ext3_msg(sb, KERN_ERR, - "error: quota options not supported."); - break; - case Opt_usrjquota: - case Opt_grpjquota: - case Opt_offusrjquota: - case Opt_offgrpjquota: - case Opt_jqfmt_vfsold: - case Opt_jqfmt_vfsv0: - case Opt_jqfmt_vfsv1: - ext3_msg(sb, KERN_ERR, - "error: journaled quota options not " - "supported."); - break; - case Opt_noquota: - break; -#endif - case Opt_abort: - set_opt(sbi->s_mount_opt, ABORT); - break; - case Opt_nobarrier: - clear_opt(sbi->s_mount_opt, BARRIER); - break; - case Opt_barrier: - if (args[0].from) { - if (match_int(&args[0], &option)) - return 0; - } else - option = 1; /* No argument, default to 1 */ - if (option) - set_opt(sbi->s_mount_opt, BARRIER); - else - clear_opt(sbi->s_mount_opt, BARRIER); - break; - case Opt_ignore: - break; - case Opt_resize: - if (!is_remount) { - ext3_msg(sb, KERN_ERR, - "error: resize option only available " - "for remount"); - return 0; - } - if (match_int(&args[0], &option) != 0) - return 0; - *n_blocks_count = option; - break; - case Opt_nobh: - ext3_msg(sb, KERN_WARNING, - "warning: ignoring deprecated nobh option"); - break; - case Opt_bh: - ext3_msg(sb, KERN_WARNING, - "warning: ignoring deprecated bh option"); - break; - default: - ext3_msg(sb, KERN_ERR, - "error: unrecognized mount option \"%s\" " - "or missing value", p); - return 0; - } - } -#ifdef CONFIG_QUOTA - if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { - if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) - clear_opt(sbi->s_mount_opt, USRQUOTA); - if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) - clear_opt(sbi->s_mount_opt, GRPQUOTA); - - if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { - ext3_msg(sb, KERN_ERR, "error: old and new 
quota " - "format mixing."); - return 0; - } - - if (!sbi->s_jquota_fmt) { - ext3_msg(sb, KERN_ERR, "error: journaled quota format " - "not specified."); - return 0; - } - } -#endif - return 1; -} - -static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, - int read_only) -{ - struct ext3_sb_info *sbi = EXT3_SB(sb); - int res = 0; - - if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) { - ext3_msg(sb, KERN_ERR, - "error: revision level too high, " - "forcing read-only mode"); - res = MS_RDONLY; - } - if (read_only) - return res; - if (!(sbi->s_mount_state & EXT3_VALID_FS)) - ext3_msg(sb, KERN_WARNING, - "warning: mounting unchecked fs, " - "running e2fsck is recommended"); - else if ((sbi->s_mount_state & EXT3_ERROR_FS)) - ext3_msg(sb, KERN_WARNING, - "warning: mounting fs with errors, " - "running e2fsck is recommended"); - else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && - le16_to_cpu(es->s_mnt_count) >= - le16_to_cpu(es->s_max_mnt_count)) - ext3_msg(sb, KERN_WARNING, - "warning: maximal mount count reached, " - "running e2fsck is recommended"); - else if (le32_to_cpu(es->s_checkinterval) && - (le32_to_cpu(es->s_lastcheck) + - le32_to_cpu(es->s_checkinterval) <= get_seconds())) - ext3_msg(sb, KERN_WARNING, - "warning: checktime reached, " - "running e2fsck is recommended"); -#if 0 - /* @@@ We _will_ want to clear the valid bit if we find - inconsistencies, to force a fsck at reboot. But for - a plain journaled filesystem we can keep it set as - valid forever! :) */ - es->s_state &= cpu_to_le16(~EXT3_VALID_FS); -#endif - if (!le16_to_cpu(es->s_max_mnt_count)) - es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT); - le16_add_cpu(&es->s_mnt_count, 1); - es->s_mtime = cpu_to_le32(get_seconds()); - ext3_update_dynamic_rev(sb); - EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); - - ext3_commit_super(sb, es, 1); - if (test_opt(sb, DEBUG)) - ext3_msg(sb, KERN_INFO, "[bs=%lu, gc=%lu, " - "bpg=%lu, ipg=%lu, mo=%04lx]", - sb->s_blocksize, - sbi->s_groups_count, - EXT3_BLOCKS_PER_GROUP(sb), - EXT3_INODES_PER_GROUP(sb), - sbi->s_mount_opt); - - if (EXT3_SB(sb)->s_journal->j_inode == NULL) { - char b[BDEVNAME_SIZE]; - ext3_msg(sb, KERN_INFO, "using external journal on %s", - bdevname(EXT3_SB(sb)->s_journal->j_dev, b)); - } else { - ext3_msg(sb, KERN_INFO, "using internal journal"); - } - cleancache_init_fs(sb); - return res; -} - -/* Called at mount-time, super-block is locked */ -static int ext3_check_descriptors(struct super_block *sb) -{ - struct ext3_sb_info *sbi = EXT3_SB(sb); - int i; - - ext3_debug ("Checking group descriptors"); - - for (i = 0; i < sbi->s_groups_count; i++) { - struct ext3_group_desc *gdp = ext3_get_group_desc(sb, i, NULL); - ext3_fsblk_t first_block = ext3_group_first_block_no(sb, i); - ext3_fsblk_t last_block; - - if (i == sbi->s_groups_count - 1) - last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1; - else - last_block = first_block + - (EXT3_BLOCKS_PER_GROUP(sb) - 1); - - if (le32_to_cpu(gdp->bg_block_bitmap) < first_block || - le32_to_cpu(gdp->bg_block_bitmap) > last_block) - { - ext3_error (sb, "ext3_check_descriptors", - "Block bitmap for group %d" - " not in group (block %lu)!", - i, (unsigned long) - le32_to_cpu(gdp->bg_block_bitmap)); - return 0; - } - if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block || - le32_to_cpu(gdp->bg_inode_bitmap) > last_block) - { - ext3_error (sb, "ext3_check_descriptors", - "Inode bitmap for group %d" - " not in group (block %lu)!", - i, (unsigned long) - 
le32_to_cpu(gdp->bg_inode_bitmap)); - return 0; - } - if (le32_to_cpu(gdp->bg_inode_table) < first_block || - le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group - 1 > - last_block) - { - ext3_error (sb, "ext3_check_descriptors", - "Inode table for group %d" - " not in group (block %lu)!", - i, (unsigned long) - le32_to_cpu(gdp->bg_inode_table)); - return 0; - } - } - - sbi->s_es->s_free_blocks_count=cpu_to_le32(ext3_count_free_blocks(sb)); - sbi->s_es->s_free_inodes_count=cpu_to_le32(ext3_count_free_inodes(sb)); - return 1; -} - - -/* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at - * the superblock) which were deleted from all directories, but held open by - * a process at the time of a crash. We walk the list and try to delete these - * inodes at recovery time (only with a read-write filesystem). - * - * In order to keep the orphan inode chain consistent during traversal (in - * case of crash during recovery), we link each inode into the superblock - * orphan list_head and handle it the same way as an inode deletion during - * normal operation (which journals the operations for us). - * - * We only do an iget() and an iput() on each inode, which is very safe if we - * accidentally point at an in-use or already deleted inode. The worst that - * can happen in this case is that we get a "bit already cleared" message from - * ext3_free_inode(). The only reason we would point at a wrong inode is if - * e2fsck was run on this filesystem, and it must have already done the orphan - * inode cleanup for us, so we can safely abort without any further action. - */ -static void ext3_orphan_cleanup (struct super_block * sb, - struct ext3_super_block * es) -{ - unsigned int s_flags = sb->s_flags; - int nr_orphans = 0, nr_truncates = 0; -#ifdef CONFIG_QUOTA - int i; -#endif - if (!es->s_last_orphan) { - jbd_debug(4, "no orphan inodes to clean up\n"); - return; - } - - if (bdev_read_only(sb->s_bdev)) { - ext3_msg(sb, KERN_ERR, "error: write access " - "unavailable, skipping orphan cleanup."); - return; - } - - /* Check if feature set allows readwrite operations */ - if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) { - ext3_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " - "unknown ROCOMPAT features"); - return; - } - - if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) { - /* don't clear list on RO mount w/ errors */ - if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { - jbd_debug(1, "Errors on filesystem, " - "clearing orphan list.\n"); - es->s_last_orphan = 0; - } - jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); - return; - } - - if (s_flags & MS_RDONLY) { - ext3_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); - sb->s_flags &= ~MS_RDONLY; - } -#ifdef CONFIG_QUOTA - /* Needed for iput() to work correctly and not trash data */ - sb->s_flags |= MS_ACTIVE; - /* Turn on quotas so that they are updated correctly */ - for (i = 0; i < EXT3_MAXQUOTAS; i++) { - if (EXT3_SB(sb)->s_qf_names[i]) { - int ret = ext3_quota_on_mount(sb, i); - if (ret < 0) - ext3_msg(sb, KERN_ERR, - "error: cannot turn on journaled " - "quota: %d", ret); - } - } -#endif - - while (es->s_last_orphan) { - struct inode *inode; - - inode = ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); - if (IS_ERR(inode)) { - es->s_last_orphan = 0; - break; - } - - list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); - dquot_initialize(inode); - if (inode->i_nlink) { - printk(KERN_DEBUG - "%s: truncating inode %lu to %Ld bytes\n", - __func__, inode->i_ino, inode->i_size); - 
jbd_debug(2, "truncating inode %lu to %Ld bytes\n", - inode->i_ino, inode->i_size); - ext3_truncate(inode); - nr_truncates++; - } else { - printk(KERN_DEBUG - "%s: deleting unreferenced inode %lu\n", - __func__, inode->i_ino); - jbd_debug(2, "deleting unreferenced inode %lu\n", - inode->i_ino); - nr_orphans++; - } - iput(inode); /* The delete magic happens here! */ - } - -#define PLURAL(x) (x), ((x)==1) ? "" : "s" - - if (nr_orphans) - ext3_msg(sb, KERN_INFO, "%d orphan inode%s deleted", - PLURAL(nr_orphans)); - if (nr_truncates) - ext3_msg(sb, KERN_INFO, "%d truncate%s cleaned up", - PLURAL(nr_truncates)); -#ifdef CONFIG_QUOTA - /* Turn quotas off */ - for (i = 0; i < EXT3_MAXQUOTAS; i++) { - if (sb_dqopt(sb)->files[i]) - dquot_quota_off(sb, i); - } -#endif - sb->s_flags = s_flags; /* Restore MS_RDONLY status */ -} - -/* - * Maximal file size. There is a direct, and {,double-,triple-}indirect - * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks. - * We need to be 1 filesystem block less than the 2^32 sector limit. - */ -static loff_t ext3_max_size(int bits) -{ - loff_t res = EXT3_NDIR_BLOCKS; - int meta_blocks; - loff_t upper_limit; - - /* This is calculated to be the largest file size for a - * dense, file such that the total number of - * sectors in the file, including data and all indirect blocks, - * does not exceed 2^32 -1 - * __u32 i_blocks representing the total number of - * 512 bytes blocks of the file - */ - upper_limit = (1LL << 32) - 1; - - /* total blocks in file system block size */ - upper_limit >>= (bits - 9); - - - /* indirect blocks */ - meta_blocks = 1; - /* double indirect blocks */ - meta_blocks += 1 + (1LL << (bits-2)); - /* tripple indirect blocks */ - meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); - - upper_limit -= meta_blocks; - upper_limit <<= bits; - - res += 1LL << (bits-2); - res += 1LL << (2*(bits-2)); - res += 1LL << (3*(bits-2)); - res <<= bits; - if (res > upper_limit) - res = upper_limit; - - if (res > MAX_LFS_FILESIZE) - res = MAX_LFS_FILESIZE; - - return res; -} - -static ext3_fsblk_t descriptor_loc(struct super_block *sb, - ext3_fsblk_t logic_sb_block, - int nr) -{ - struct ext3_sb_info *sbi = EXT3_SB(sb); - unsigned long bg, first_meta_bg; - int has_super = 0; - - first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); - - if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) || - nr < first_meta_bg) - return (logic_sb_block + nr + 1); - bg = sbi->s_desc_per_block * nr; - if (ext3_bg_has_super(sb, bg)) - has_super = 1; - return (has_super + ext3_group_first_block_no(sb, bg)); -} - - -static int ext3_fill_super (struct super_block *sb, void *data, int silent) -{ - struct buffer_head * bh; - struct ext3_super_block *es = NULL; - struct ext3_sb_info *sbi; - ext3_fsblk_t block; - ext3_fsblk_t sb_block = get_sb_block(&data, sb); - ext3_fsblk_t logic_sb_block; - unsigned long offset = 0; - unsigned int journal_inum = 0; - unsigned long journal_devnum = 0; - unsigned long def_mount_opts; - struct inode *root; - int blocksize; - int hblock; - int db_count; - int i; - int needs_recovery; - int ret = -EINVAL; - __le32 features; - int err; - - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); - if (!sbi) - return -ENOMEM; - - sbi->s_blockgroup_lock = - kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); - if (!sbi->s_blockgroup_lock) { - kfree(sbi); - return -ENOMEM; - } - sb->s_fs_info = sbi; - sbi->s_sb_block = sb_block; - - blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); - if (!blocksize) { - ext3_msg(sb, 
KERN_ERR, "error: unable to set blocksize"); - goto out_fail; - } - - /* - * The ext3 superblock will not be buffer aligned for other than 1kB - * block sizes. We need to calculate the offset from buffer start. - */ - if (blocksize != EXT3_MIN_BLOCK_SIZE) { - logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; - offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; - } else { - logic_sb_block = sb_block; - } - - if (!(bh = sb_bread(sb, logic_sb_block))) { - ext3_msg(sb, KERN_ERR, "error: unable to read superblock"); - goto out_fail; - } - /* - * Note: s_es must be initialized as soon as possible because - * some ext3 macro-instructions depend on its value - */ - es = (struct ext3_super_block *) (bh->b_data + offset); - sbi->s_es = es; - sb->s_magic = le16_to_cpu(es->s_magic); - if (sb->s_magic != EXT3_SUPER_MAGIC) - goto cantfind_ext3; - - /* Set defaults before we parse the mount options */ - def_mount_opts = le32_to_cpu(es->s_default_mount_opts); - if (def_mount_opts & EXT3_DEFM_DEBUG) - set_opt(sbi->s_mount_opt, DEBUG); - if (def_mount_opts & EXT3_DEFM_BSDGROUPS) - set_opt(sbi->s_mount_opt, GRPID); - if (def_mount_opts & EXT3_DEFM_UID16) - set_opt(sbi->s_mount_opt, NO_UID32); -#ifdef CONFIG_EXT3_FS_XATTR - if (def_mount_opts & EXT3_DEFM_XATTR_USER) - set_opt(sbi->s_mount_opt, XATTR_USER); -#endif -#ifdef CONFIG_EXT3_FS_POSIX_ACL - if (def_mount_opts & EXT3_DEFM_ACL) - set_opt(sbi->s_mount_opt, POSIX_ACL); -#endif - if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA) - set_opt(sbi->s_mount_opt, JOURNAL_DATA); - else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED) - set_opt(sbi->s_mount_opt, ORDERED_DATA); - else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK) - set_opt(sbi->s_mount_opt, WRITEBACK_DATA); - - if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC) - set_opt(sbi->s_mount_opt, ERRORS_PANIC); - else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_CONTINUE) - set_opt(sbi->s_mount_opt, ERRORS_CONT); - else - set_opt(sbi->s_mount_opt, ERRORS_RO); - - sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); - sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); - - /* enable barriers by default */ - set_opt(sbi->s_mount_opt, BARRIER); - set_opt(sbi->s_mount_opt, RESERVATION); - - if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, - NULL, 0)) - goto failed_mount; - - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); - - if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV && - (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || - EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) || - EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U))) - ext3_msg(sb, KERN_WARNING, - "warning: feature flags set on rev 0 fs, " - "running e2fsck is recommended"); - /* - * Check feature flags regardless of the revision level, since we - * previously didn't change the revision level when setting the flags, - * so there is a chance incompat flags are set on a rev 0 filesystem. 
- */ - features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP); - if (features) { - ext3_msg(sb, KERN_ERR, - "error: couldn't mount because of unsupported " - "optional features (%x)", le32_to_cpu(features)); - goto failed_mount; - } - features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP); - if (!(sb->s_flags & MS_RDONLY) && features) { - ext3_msg(sb, KERN_ERR, - "error: couldn't mount RDWR because of unsupported " - "optional features (%x)", le32_to_cpu(features)); - goto failed_mount; - } - blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); - - if (blocksize < EXT3_MIN_BLOCK_SIZE || - blocksize > EXT3_MAX_BLOCK_SIZE) { - ext3_msg(sb, KERN_ERR, - "error: couldn't mount because of unsupported " - "filesystem blocksize %d", blocksize); - goto failed_mount; - } - - hblock = bdev_logical_block_size(sb->s_bdev); - if (sb->s_blocksize != blocksize) { - /* - * Make sure the blocksize for the filesystem is larger - * than the hardware sectorsize for the machine. - */ - if (blocksize < hblock) { - ext3_msg(sb, KERN_ERR, - "error: fsblocksize %d too small for " - "hardware sectorsize %d", blocksize, hblock); - goto failed_mount; - } - - brelse (bh); - if (!sb_set_blocksize(sb, blocksize)) { - ext3_msg(sb, KERN_ERR, - "error: bad blocksize %d", blocksize); - goto out_fail; - } - logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; - offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; - bh = sb_bread(sb, logic_sb_block); - if (!bh) { - ext3_msg(sb, KERN_ERR, - "error: can't read superblock on 2nd try"); - goto failed_mount; - } - es = (struct ext3_super_block *)(bh->b_data + offset); - sbi->s_es = es; - if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) { - ext3_msg(sb, KERN_ERR, - "error: magic mismatch"); - goto failed_mount; - } - } - - sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits); - - if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) { - sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE; - sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO; - } else { - sbi->s_inode_size = le16_to_cpu(es->s_inode_size); - sbi->s_first_ino = le32_to_cpu(es->s_first_ino); - if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) || - (!is_power_of_2(sbi->s_inode_size)) || - (sbi->s_inode_size > blocksize)) { - ext3_msg(sb, KERN_ERR, - "error: unsupported inode size: %d", - sbi->s_inode_size); - goto failed_mount; - } - } - sbi->s_frag_size = EXT3_MIN_FRAG_SIZE << - le32_to_cpu(es->s_log_frag_size); - if (blocksize != sbi->s_frag_size) { - ext3_msg(sb, KERN_ERR, - "error: fragsize %lu != blocksize %u (unsupported)", - sbi->s_frag_size, blocksize); - goto failed_mount; - } - sbi->s_frags_per_block = 1; - sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); - sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group); - sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); - if (EXT3_INODE_SIZE(sb) == 0 || EXT3_INODES_PER_GROUP(sb) == 0) - goto cantfind_ext3; - sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb); - if (sbi->s_inodes_per_block == 0) - goto cantfind_ext3; - sbi->s_itb_per_group = sbi->s_inodes_per_group / - sbi->s_inodes_per_block; - sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc); - sbi->s_sbh = bh; - sbi->s_mount_state = le16_to_cpu(es->s_state); - sbi->s_addr_per_block_bits = ilog2(EXT3_ADDR_PER_BLOCK(sb)); - sbi->s_desc_per_block_bits = ilog2(EXT3_DESC_PER_BLOCK(sb)); - for (i = 0; i < 4; i++) - sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); - sbi->s_def_hash_version = es->s_def_hash_version; - i = 
le32_to_cpu(es->s_flags); - if (i & EXT2_FLAGS_UNSIGNED_HASH) - sbi->s_hash_unsigned = 3; - else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { -#ifdef __CHAR_UNSIGNED__ - es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); - sbi->s_hash_unsigned = 3; -#else - es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); -#endif - } - - if (sbi->s_blocks_per_group > blocksize * 8) { - ext3_msg(sb, KERN_ERR, - "#blocks per group too big: %lu", - sbi->s_blocks_per_group); - goto failed_mount; - } - if (sbi->s_frags_per_group > blocksize * 8) { - ext3_msg(sb, KERN_ERR, - "error: #fragments per group too big: %lu", - sbi->s_frags_per_group); - goto failed_mount; - } - if (sbi->s_inodes_per_group > blocksize * 8) { - ext3_msg(sb, KERN_ERR, - "error: #inodes per group too big: %lu", - sbi->s_inodes_per_group); - goto failed_mount; - } - - err = generic_check_addressable(sb->s_blocksize_bits, - le32_to_cpu(es->s_blocks_count)); - if (err) { - ext3_msg(sb, KERN_ERR, - "error: filesystem is too large to mount safely"); - if (sizeof(sector_t) < 8) - ext3_msg(sb, KERN_ERR, - "error: CONFIG_LBDAF not enabled"); - ret = err; - goto failed_mount; - } - - if (EXT3_BLOCKS_PER_GROUP(sb) == 0) - goto cantfind_ext3; - sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - - le32_to_cpu(es->s_first_data_block) - 1) - / EXT3_BLOCKS_PER_GROUP(sb)) + 1; - db_count = DIV_ROUND_UP(sbi->s_groups_count, EXT3_DESC_PER_BLOCK(sb)); - sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), - GFP_KERNEL); - if (sbi->s_group_desc == NULL) { - ext3_msg(sb, KERN_ERR, - "error: not enough memory"); - ret = -ENOMEM; - goto failed_mount; - } - - bgl_lock_init(sbi->s_blockgroup_lock); - - for (i = 0; i < db_count; i++) { - block = descriptor_loc(sb, logic_sb_block, i); - sbi->s_group_desc[i] = sb_bread(sb, block); - if (!sbi->s_group_desc[i]) { - ext3_msg(sb, KERN_ERR, - "error: can't read group descriptor %d", i); - db_count = i; - goto failed_mount2; - } - } - if (!ext3_check_descriptors (sb)) { - ext3_msg(sb, KERN_ERR, - "error: group descriptors corrupted"); - goto failed_mount2; - } - sbi->s_gdb_count = db_count; - get_random_bytes(&sbi->s_next_generation, sizeof(u32)); - spin_lock_init(&sbi->s_next_gen_lock); - - /* per fileystem reservation list head & lock */ - spin_lock_init(&sbi->s_rsv_window_lock); - sbi->s_rsv_window_root = RB_ROOT; - /* Add a single, static dummy reservation to the start of the - * reservation window list --- it gives us a placeholder for - * append-at-start-of-list which makes the allocation logic - * _much_ simpler. */ - sbi->s_rsv_window_head.rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; - sbi->s_rsv_window_head.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; - sbi->s_rsv_window_head.rsv_alloc_hit = 0; - sbi->s_rsv_window_head.rsv_goal_size = 0; - ext3_rsv_window_add(sb, &sbi->s_rsv_window_head); - - /* - * set up enough so that it can read an inode - */ - sb->s_op = &ext3_sops; - sb->s_export_op = &ext3_export_ops; - sb->s_xattr = ext3_xattr_handlers; -#ifdef CONFIG_QUOTA - sb->s_qcop = &ext3_qctl_operations; - sb->dq_op = &ext3_quota_operations; - sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; -#endif - memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); - INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ - mutex_init(&sbi->s_orphan_lock); - mutex_init(&sbi->s_resize_lock); - - sb->s_root = NULL; - - needs_recovery = (es->s_last_orphan != 0 || - EXT3_HAS_INCOMPAT_FEATURE(sb, - EXT3_FEATURE_INCOMPAT_RECOVER)); - - /* - * The first inode we look at is the journal inode. 
Don't try - * root first: it may be modified in the journal! - */ - if (!test_opt(sb, NOLOAD) && - EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { - if (ext3_load_journal(sb, es, journal_devnum)) - goto failed_mount2; - } else if (journal_inum) { - if (ext3_create_journal(sb, es, journal_inum)) - goto failed_mount2; - } else { - if (!silent) - ext3_msg(sb, KERN_ERR, - "error: no journal found. " - "mounting ext3 over ext2?"); - goto failed_mount2; - } - err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext3_count_free_blocks(sb), GFP_KERNEL); - if (!err) { - err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext3_count_free_inodes(sb), GFP_KERNEL); - } - if (!err) { - err = percpu_counter_init(&sbi->s_dirs_counter, - ext3_count_dirs(sb), GFP_KERNEL); - } - if (err) { - ext3_msg(sb, KERN_ERR, "error: insufficient memory"); - ret = err; - goto failed_mount3; - } - - /* We have now updated the journal if required, so we can - * validate the data journaling mode. */ - switch (test_opt(sb, DATA_FLAGS)) { - case 0: - /* No mode set, assume a default based on the journal - capabilities: ORDERED_DATA if the journal can - cope, else JOURNAL_DATA */ - if (journal_check_available_features - (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) - set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE); - else - set_opt(sbi->s_mount_opt, JOURNAL_DATA); - break; - - case EXT3_MOUNT_ORDERED_DATA: - case EXT3_MOUNT_WRITEBACK_DATA: - if (!journal_check_available_features - (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) { - ext3_msg(sb, KERN_ERR, - "error: journal does not support " - "requested data journaling mode"); - goto failed_mount3; - } - default: - break; - } - - /* - * The journal_load will have done any necessary log recovery, - * so we can safely mount the rest of the filesystem now. - */ - - root = ext3_iget(sb, EXT3_ROOT_INO); - if (IS_ERR(root)) { - ext3_msg(sb, KERN_ERR, "error: get root inode failed"); - ret = PTR_ERR(root); - goto failed_mount3; - } - if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { - iput(root); - ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck"); - goto failed_mount3; - } - sb->s_root = d_make_root(root); - if (!sb->s_root) { - ext3_msg(sb, KERN_ERR, "error: get root dentry failed"); - ret = -ENOMEM; - goto failed_mount3; - } - - if (ext3_setup_super(sb, es, sb->s_flags & MS_RDONLY)) - sb->s_flags |= MS_RDONLY; - - EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; - ext3_orphan_cleanup(sb, es); - EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; - if (needs_recovery) { - ext3_mark_recovery_complete(sb, es); - ext3_msg(sb, KERN_INFO, "recovery complete"); - } - ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode", - test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": - test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? 
"ordered": - "writeback"); - - return 0; - -cantfind_ext3: - if (!silent) - ext3_msg(sb, KERN_INFO, - "error: can't find ext3 filesystem on dev %s.", - sb->s_id); - goto failed_mount; - -failed_mount3: - percpu_counter_destroy(&sbi->s_freeblocks_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - journal_destroy(sbi->s_journal); -failed_mount2: - for (i = 0; i < db_count; i++) - brelse(sbi->s_group_desc[i]); - kfree(sbi->s_group_desc); -failed_mount: -#ifdef CONFIG_QUOTA - for (i = 0; i < EXT3_MAXQUOTAS; i++) - kfree(sbi->s_qf_names[i]); -#endif - ext3_blkdev_remove(sbi); - brelse(bh); -out_fail: - sb->s_fs_info = NULL; - kfree(sbi->s_blockgroup_lock); - kfree(sbi); - return ret; -} - -/* - * Setup any per-fs journal parameters now. We'll do this both on - * initial mount, once the journal has been initialised but before we've - * done any recovery; and again on any subsequent remount. - */ -static void ext3_init_journal_params(struct super_block *sb, journal_t *journal) -{ - struct ext3_sb_info *sbi = EXT3_SB(sb); - - if (sbi->s_commit_interval) - journal->j_commit_interval = sbi->s_commit_interval; - /* We could also set up an ext3-specific default for the commit - * interval here, but for now we'll just fall back to the jbd - * default. */ - - spin_lock(&journal->j_state_lock); - if (test_opt(sb, BARRIER)) - journal->j_flags |= JFS_BARRIER; - else - journal->j_flags &= ~JFS_BARRIER; - if (test_opt(sb, DATA_ERR_ABORT)) - journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR; - else - journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR; - spin_unlock(&journal->j_state_lock); -} - -static journal_t *ext3_get_journal(struct super_block *sb, - unsigned int journal_inum) -{ - struct inode *journal_inode; - journal_t *journal; - - /* First, test for the existence of a valid inode on disk. Bad - * things happen if we iget() an unused inode, as the subsequent - * iput() will try to delete it. 
*/ - - journal_inode = ext3_iget(sb, journal_inum); - if (IS_ERR(journal_inode)) { - ext3_msg(sb, KERN_ERR, "error: no journal found"); - return NULL; - } - if (!journal_inode->i_nlink) { - make_bad_inode(journal_inode); - iput(journal_inode); - ext3_msg(sb, KERN_ERR, "error: journal inode is deleted"); - return NULL; - } - - jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", - journal_inode, journal_inode->i_size); - if (!S_ISREG(journal_inode->i_mode)) { - ext3_msg(sb, KERN_ERR, "error: invalid journal inode"); - iput(journal_inode); - return NULL; - } - - journal = journal_init_inode(journal_inode); - if (!journal) { - ext3_msg(sb, KERN_ERR, "error: could not load journal inode"); - iput(journal_inode); - return NULL; - } - journal->j_private = sb; - ext3_init_journal_params(sb, journal); - return journal; -} - -static journal_t *ext3_get_dev_journal(struct super_block *sb, - dev_t j_dev) -{ - struct buffer_head * bh; - journal_t *journal; - ext3_fsblk_t start; - ext3_fsblk_t len; - int hblock, blocksize; - ext3_fsblk_t sb_block; - unsigned long offset; - struct ext3_super_block * es; - struct block_device *bdev; - - bdev = ext3_blkdev_get(j_dev, sb); - if (bdev == NULL) - return NULL; - - blocksize = sb->s_blocksize; - hblock = bdev_logical_block_size(bdev); - if (blocksize < hblock) { - ext3_msg(sb, KERN_ERR, - "error: blocksize too small for journal device"); - goto out_bdev; - } - - sb_block = EXT3_MIN_BLOCK_SIZE / blocksize; - offset = EXT3_MIN_BLOCK_SIZE % blocksize; - set_blocksize(bdev, blocksize); - if (!(bh = __bread(bdev, sb_block, blocksize))) { - ext3_msg(sb, KERN_ERR, "error: couldn't read superblock of " - "external journal"); - goto out_bdev; - } - - es = (struct ext3_super_block *) (bh->b_data + offset); - if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) || - !(le32_to_cpu(es->s_feature_incompat) & - EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) { - ext3_msg(sb, KERN_ERR, "error: external journal has " - "bad superblock"); - brelse(bh); - goto out_bdev; - } - - if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { - ext3_msg(sb, KERN_ERR, "error: journal UUID does not match"); - brelse(bh); - goto out_bdev; - } - - len = le32_to_cpu(es->s_blocks_count); - start = sb_block + 1; - brelse(bh); /* we're done with the superblock */ - - journal = journal_init_dev(bdev, sb->s_bdev, - start, len, blocksize); - if (!journal) { - ext3_msg(sb, KERN_ERR, - "error: failed to create device journal"); - goto out_bdev; - } - journal->j_private = sb; - if (!bh_uptodate_or_lock(journal->j_sb_buffer)) { - if (bh_submit_read(journal->j_sb_buffer)) { - ext3_msg(sb, KERN_ERR, "I/O error on journal device"); - goto out_journal; - } - } - if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { - ext3_msg(sb, KERN_ERR, - "error: external journal has more than one " - "user (unsupported) - %d", - be32_to_cpu(journal->j_superblock->s_nr_users)); - goto out_journal; - } - EXT3_SB(sb)->journal_bdev = bdev; - ext3_init_journal_params(sb, journal); - return journal; -out_journal: - journal_destroy(journal); -out_bdev: - ext3_blkdev_put(bdev); - return NULL; -} - -static int ext3_load_journal(struct super_block *sb, - struct ext3_super_block *es, - unsigned long journal_devnum) -{ - journal_t *journal; - unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); - dev_t journal_dev; - int err = 0; - int really_read_only; - - if (journal_devnum && - journal_devnum != le32_to_cpu(es->s_journal_dev)) { - ext3_msg(sb, KERN_INFO, "external journal device major/minor " - "numbers have changed"); 
- journal_dev = new_decode_dev(journal_devnum); - } else - journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); - - really_read_only = bdev_read_only(sb->s_bdev); - - /* - * Are we loading a blank journal or performing recovery after a - * crash? For recovery, we need to check in advance whether we - * can get read-write access to the device. - */ - - if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) { - if (sb->s_flags & MS_RDONLY) { - ext3_msg(sb, KERN_INFO, - "recovery required on readonly filesystem"); - if (really_read_only) { - ext3_msg(sb, KERN_ERR, "error: write access " - "unavailable, cannot proceed"); - return -EROFS; - } - ext3_msg(sb, KERN_INFO, - "write access will be enabled during recovery"); - } - } - - if (journal_inum && journal_dev) { - ext3_msg(sb, KERN_ERR, "error: filesystem has both journal " - "and inode journals"); - return -EINVAL; - } - - if (journal_inum) { - if (!(journal = ext3_get_journal(sb, journal_inum))) - return -EINVAL; - } else { - if (!(journal = ext3_get_dev_journal(sb, journal_dev))) - return -EINVAL; - } - - if (!(journal->j_flags & JFS_BARRIER)) - printk(KERN_INFO "EXT3-fs: barriers not enabled\n"); - - if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { - err = journal_update_format(journal); - if (err) { - ext3_msg(sb, KERN_ERR, "error updating journal"); - journal_destroy(journal); - return err; - } - } - - if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) - err = journal_wipe(journal, !really_read_only); - if (!err) - err = journal_load(journal); - - if (err) { - ext3_msg(sb, KERN_ERR, "error loading journal"); - journal_destroy(journal); - return err; - } - - EXT3_SB(sb)->s_journal = journal; - ext3_clear_journal_err(sb, es); - - if (!really_read_only && journal_devnum && - journal_devnum != le32_to_cpu(es->s_journal_dev)) { - es->s_journal_dev = cpu_to_le32(journal_devnum); - - /* Make sure we flush the recovery flag to disk. */ - ext3_commit_super(sb, es, 1); - } - - return 0; -} - -static int ext3_create_journal(struct super_block *sb, - struct ext3_super_block *es, - unsigned int journal_inum) -{ - journal_t *journal; - int err; - - if (sb->s_flags & MS_RDONLY) { - ext3_msg(sb, KERN_ERR, - "error: readonly filesystem when trying to " - "create journal"); - return -EROFS; - } - - journal = ext3_get_journal(sb, journal_inum); - if (!journal) - return -EINVAL; - - ext3_msg(sb, KERN_INFO, "creating new journal on inode %u", - journal_inum); - - err = journal_create(journal); - if (err) { - ext3_msg(sb, KERN_ERR, "error creating journal"); - journal_destroy(journal); - return -EIO; - } - - EXT3_SB(sb)->s_journal = journal; - - ext3_update_dynamic_rev(sb); - EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); - EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL); - - es->s_journal_inum = cpu_to_le32(journal_inum); - - /* Make sure we flush the recovery flag to disk. */ - ext3_commit_super(sb, es, 1); - - return 0; -} - -static int ext3_commit_super(struct super_block *sb, - struct ext3_super_block *es, - int sync) -{ - struct buffer_head *sbh = EXT3_SB(sb)->s_sbh; - int error = 0; - - if (!sbh) - return error; - - if (buffer_write_io_error(sbh)) { - /* - * Oh, dear. A previous attempt to write the - * superblock failed. This could happen because the - * USB device was yanked out. Or it could happen to - * be a transient write error and maybe the block will - * be remapped. Nothing we can do but to retry the - * write and hope for the best. 
- */ - ext3_msg(sb, KERN_ERR, "previous I/O error to " - "superblock detected"); - clear_buffer_write_io_error(sbh); - set_buffer_uptodate(sbh); - } - /* - * If the file system is mounted read-only, don't update the - * superblock write time. This avoids updating the superblock - * write time when we are mounting the root file system - * read/only but we need to replay the journal; at that point, - * for people who are east of GMT and who make their clock - * tick in localtime for Windows bug-for-bug compatibility, - * the clock is set in the future, and this will cause e2fsck - * to complain and force a full file system check. - */ - if (!(sb->s_flags & MS_RDONLY)) - es->s_wtime = cpu_to_le32(get_seconds()); - es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb)); - es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); - BUFFER_TRACE(sbh, "marking dirty"); - mark_buffer_dirty(sbh); - if (sync) { - error = sync_dirty_buffer(sbh); - if (buffer_write_io_error(sbh)) { - ext3_msg(sb, KERN_ERR, "I/O error while writing " - "superblock"); - clear_buffer_write_io_error(sbh); - set_buffer_uptodate(sbh); - } - } - return error; -} - - -/* - * Have we just finished recovery? If so, and if we are mounting (or - * remounting) the filesystem readonly, then we will end up with a - * consistent fs on disk. Record that fact. - */ -static void ext3_mark_recovery_complete(struct super_block * sb, - struct ext3_super_block * es) -{ - journal_t *journal = EXT3_SB(sb)->s_journal; - - journal_lock_updates(journal); - if (journal_flush(journal) < 0) - goto out; - - if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && - sb->s_flags & MS_RDONLY) { - EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); - ext3_commit_super(sb, es, 1); - } - -out: - journal_unlock_updates(journal); -} - -/* - * If we are mounting (or read-write remounting) a filesystem whose journal - * has recorded an error from a previous lifetime, move that error to the - * main filesystem now. - */ -static void ext3_clear_journal_err(struct super_block *sb, - struct ext3_super_block *es) -{ - journal_t *journal; - int j_errno; - const char *errstr; - - journal = EXT3_SB(sb)->s_journal; - - /* - * Now check for any error status which may have been recorded in the - * journal by a prior ext3_error() or ext3_abort() - */ - - j_errno = journal_errno(journal); - if (j_errno) { - char nbuf[16]; - - errstr = ext3_decode_error(sb, j_errno, nbuf); - ext3_warning(sb, __func__, "Filesystem error recorded " - "from previous mount: %s", errstr); - ext3_warning(sb, __func__, "Marking fs in need of " - "filesystem check."); - - EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; - es->s_state |= cpu_to_le16(EXT3_ERROR_FS); - ext3_commit_super (sb, es, 1); - - journal_clear_err(journal); - } -} - -/* - * Force the running and committing transactions to commit, - * and wait on the commit. 
- */ -int ext3_force_commit(struct super_block *sb) -{ - journal_t *journal; - int ret; - - if (sb->s_flags & MS_RDONLY) - return 0; - - journal = EXT3_SB(sb)->s_journal; - ret = ext3_journal_force_commit(journal); - return ret; -} - -static int ext3_sync_fs(struct super_block *sb, int wait) -{ - tid_t target; - - trace_ext3_sync_fs(sb, wait); - /* - * Writeback quota in non-journalled quota case - journalled quota has - * no dirty dquots - */ - dquot_writeback_dquots(sb, -1); - if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { - if (wait) - log_wait_commit(EXT3_SB(sb)->s_journal, target); - } - return 0; -} - -/* - * LVM calls this function before a (read-only) snapshot is created. This - * gives us a chance to flush the journal completely and mark the fs clean. - */ -static int ext3_freeze(struct super_block *sb) -{ - int error = 0; - journal_t *journal; - - if (!(sb->s_flags & MS_RDONLY)) { - journal = EXT3_SB(sb)->s_journal; - - /* Now we set up the journal barrier. */ - journal_lock_updates(journal); - - /* - * We don't want to clear needs_recovery flag when we failed - * to flush the journal. - */ - error = journal_flush(journal); - if (error < 0) - goto out; - - /* Journal blocked and flushed, clear needs_recovery flag. */ - EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); - error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); - if (error) - goto out; - } - return 0; - -out: - journal_unlock_updates(journal); - return error; -} - -/* - * Called by LVM after the snapshot is done. We need to reset the RECOVER - * flag here, even though the filesystem is not technically dirty yet. - */ -static int ext3_unfreeze(struct super_block *sb) -{ - if (!(sb->s_flags & MS_RDONLY)) { - /* Reser the needs_recovery flag before the fs is unlocked. */ - EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); - ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); - journal_unlock_updates(EXT3_SB(sb)->s_journal); - } - return 0; -} - -static int ext3_remount (struct super_block * sb, int * flags, char * data) -{ - struct ext3_super_block * es; - struct ext3_sb_info *sbi = EXT3_SB(sb); - ext3_fsblk_t n_blocks_count = 0; - unsigned long old_sb_flags; - struct ext3_mount_options old_opts; - int enable_quota = 0; - int err; -#ifdef CONFIG_QUOTA - int i; -#endif - - sync_filesystem(sb); - - /* Store the original options */ - old_sb_flags = sb->s_flags; - old_opts.s_mount_opt = sbi->s_mount_opt; - old_opts.s_resuid = sbi->s_resuid; - old_opts.s_resgid = sbi->s_resgid; - old_opts.s_commit_interval = sbi->s_commit_interval; -#ifdef CONFIG_QUOTA - old_opts.s_jquota_fmt = sbi->s_jquota_fmt; - for (i = 0; i < EXT3_MAXQUOTAS; i++) - if (sbi->s_qf_names[i]) { - old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], - GFP_KERNEL); - if (!old_opts.s_qf_names[i]) { - int j; - - for (j = 0; j < i; j++) - kfree(old_opts.s_qf_names[j]); - return -ENOMEM; - } - } else - old_opts.s_qf_names[i] = NULL; -#endif - - /* - * Allow the "check" option to be passed as a remount option. - */ - if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { - err = -EINVAL; - goto restore_opts; - } - - if (test_opt(sb, ABORT)) - ext3_abort(sb, __func__, "Abort forced by user"); - - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sb, POSIX_ACL) ? 
MS_POSIXACL : 0); - - es = sbi->s_es; - - ext3_init_journal_params(sb, sbi->s_journal); - - if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || - n_blocks_count > le32_to_cpu(es->s_blocks_count)) { - if (test_opt(sb, ABORT)) { - err = -EROFS; - goto restore_opts; - } - - if (*flags & MS_RDONLY) { - err = dquot_suspend(sb, -1); - if (err < 0) - goto restore_opts; - - /* - * First of all, the unconditional stuff we have to do - * to disable replay of the journal when we next remount - */ - sb->s_flags |= MS_RDONLY; - - /* - * OK, test if we are remounting a valid rw partition - * readonly, and if so set the rdonly flag and then - * mark the partition as valid again. - */ - if (!(es->s_state & cpu_to_le16(EXT3_VALID_FS)) && - (sbi->s_mount_state & EXT3_VALID_FS)) - es->s_state = cpu_to_le16(sbi->s_mount_state); - - ext3_mark_recovery_complete(sb, es); - } else { - __le32 ret; - if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb, - ~EXT3_FEATURE_RO_COMPAT_SUPP))) { - ext3_msg(sb, KERN_WARNING, - "warning: couldn't remount RDWR " - "because of unsupported optional " - "features (%x)", le32_to_cpu(ret)); - err = -EROFS; - goto restore_opts; - } - - /* - * If we have an unprocessed orphan list hanging - * around from a previously readonly bdev mount, - * require a full umount & mount for now. - */ - if (es->s_last_orphan) { - ext3_msg(sb, KERN_WARNING, "warning: couldn't " - "remount RDWR because of unprocessed " - "orphan inode list. Please " - "umount & mount instead."); - err = -EINVAL; - goto restore_opts; - } - - /* - * Mounting a RDONLY partition read-write, so reread - * and store the current valid flag. (It may have - * been changed by e2fsck since we originally mounted - * the partition.) - */ - ext3_clear_journal_err(sb, es); - sbi->s_mount_state = le16_to_cpu(es->s_state); - if ((err = ext3_group_extend(sb, es, n_blocks_count))) - goto restore_opts; - if (!ext3_setup_super (sb, es, 0)) - sb->s_flags &= ~MS_RDONLY; - enable_quota = 1; - } - } -#ifdef CONFIG_QUOTA - /* Release old quota file names */ - for (i = 0; i < EXT3_MAXQUOTAS; i++) - kfree(old_opts.s_qf_names[i]); -#endif - if (enable_quota) - dquot_resume(sb, -1); - return 0; -restore_opts: - sb->s_flags = old_sb_flags; - sbi->s_mount_opt = old_opts.s_mount_opt; - sbi->s_resuid = old_opts.s_resuid; - sbi->s_resgid = old_opts.s_resgid; - sbi->s_commit_interval = old_opts.s_commit_interval; -#ifdef CONFIG_QUOTA - sbi->s_jquota_fmt = old_opts.s_jquota_fmt; - for (i = 0; i < EXT3_MAXQUOTAS; i++) { - kfree(sbi->s_qf_names[i]); - sbi->s_qf_names[i] = old_opts.s_qf_names[i]; - } -#endif - return err; -} - -static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) -{ - struct super_block *sb = dentry->d_sb; - struct ext3_sb_info *sbi = EXT3_SB(sb); - struct ext3_super_block *es = sbi->s_es; - u64 fsid; - - if (test_opt(sb, MINIX_DF)) { - sbi->s_overhead_last = 0; - } else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) { - unsigned long ngroups = sbi->s_groups_count, i; - ext3_fsblk_t overhead = 0; - smp_rmb(); - - /* - * Compute the overhead (FS structures). This is constant - * for a given filesystem unless the number of block groups - * changes so we cache the previous value until it does. - */ - - /* - * All of the blocks before first_data_block are - * overhead - */ - overhead = le32_to_cpu(es->s_first_data_block); - - /* - * Add the overhead attributed to the superblock and - * block group descriptors. If the sparse superblocks - * feature is turned on, then not all groups have this. 
- */ - for (i = 0; i < ngroups; i++) { - overhead += ext3_bg_has_super(sb, i) + - ext3_bg_num_gdb(sb, i); - cond_resched(); - } - - /* - * Every block group has an inode bitmap, a block - * bitmap, and an inode table. - */ - overhead += ngroups * (2 + sbi->s_itb_per_group); - - /* Add the internal journal blocks as well */ - if (sbi->s_journal && !sbi->journal_bdev) - overhead += sbi->s_journal->j_maxlen; - - sbi->s_overhead_last = overhead; - smp_wmb(); - sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count); - } - - buf->f_type = EXT3_SUPER_MAGIC; - buf->f_bsize = sb->s_blocksize; - buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last; - buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter); - buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); - if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) - buf->f_bavail = 0; - buf->f_files = le32_to_cpu(es->s_inodes_count); - buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); - buf->f_namelen = EXT3_NAME_LEN; - fsid = le64_to_cpup((void *)es->s_uuid) ^ - le64_to_cpup((void *)es->s_uuid + sizeof(u64)); - buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; - buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; - return 0; -} - -/* Helper function for writing quotas on sync - we need to start transaction before quota file - * is locked for write. Otherwise the are possible deadlocks: - * Process 1 Process 2 - * ext3_create() quota_sync() - * journal_start() write_dquot() - * dquot_initialize() down(dqio_mutex) - * down(dqio_mutex) journal_start() - * - */ - -#ifdef CONFIG_QUOTA - -static inline struct inode *dquot_to_inode(struct dquot *dquot) -{ - return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; -} - -static int ext3_write_dquot(struct dquot *dquot) -{ - int ret, err; - handle_t *handle; - struct inode *inode; - - inode = dquot_to_inode(dquot); - handle = ext3_journal_start(inode, - EXT3_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - ret = dquot_commit(dquot); - err = ext3_journal_stop(handle); - if (!ret) - ret = err; - return ret; -} - -static int ext3_acquire_dquot(struct dquot *dquot) -{ - int ret, err; - handle_t *handle; - - handle = ext3_journal_start(dquot_to_inode(dquot), - EXT3_QUOTA_INIT_BLOCKS(dquot->dq_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - ret = dquot_acquire(dquot); - err = ext3_journal_stop(handle); - if (!ret) - ret = err; - return ret; -} - -static int ext3_release_dquot(struct dquot *dquot) -{ - int ret, err; - handle_t *handle; - - handle = ext3_journal_start(dquot_to_inode(dquot), - EXT3_QUOTA_DEL_BLOCKS(dquot->dq_sb)); - if (IS_ERR(handle)) { - /* Release dquot anyway to avoid endless cycle in dqput() */ - dquot_release(dquot); - return PTR_ERR(handle); - } - ret = dquot_release(dquot); - err = ext3_journal_stop(handle); - if (!ret) - ret = err; - return ret; -} - -static int ext3_mark_dquot_dirty(struct dquot *dquot) -{ - /* Are we journaling quotas? 
*/ - if (EXT3_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || - EXT3_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { - dquot_mark_dquot_dirty(dquot); - return ext3_write_dquot(dquot); - } else { - return dquot_mark_dquot_dirty(dquot); - } -} - -static int ext3_write_info(struct super_block *sb, int type) -{ - int ret, err; - handle_t *handle; - - /* Data block + inode block */ - handle = ext3_journal_start(d_inode(sb->s_root), 2); - if (IS_ERR(handle)) - return PTR_ERR(handle); - ret = dquot_commit_info(sb, type); - err = ext3_journal_stop(handle); - if (!ret) - ret = err; - return ret; -} - -/* - * Turn on quotas during mount time - we need to find - * the quota file and such... - */ -static int ext3_quota_on_mount(struct super_block *sb, int type) -{ - return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type], - EXT3_SB(sb)->s_jquota_fmt, type); -} - -/* - * Standard function to be called on quota_on - */ -static int ext3_quota_on(struct super_block *sb, int type, int format_id, - struct path *path) -{ - int err; - - if (!test_opt(sb, QUOTA)) - return -EINVAL; - - /* Quotafile not on the same filesystem? */ - if (path->dentry->d_sb != sb) - return -EXDEV; - /* Journaling quota? */ - if (EXT3_SB(sb)->s_qf_names[type]) { - /* Quotafile not of fs root? */ - if (path->dentry->d_parent != sb->s_root) - ext3_msg(sb, KERN_WARNING, - "warning: Quota file not on filesystem root. " - "Journaled quota will not work."); - } - - /* - * When we journal data on quota file, we have to flush journal to see - * all updates to the file when we bypass pagecache... - */ - if (ext3_should_journal_data(d_inode(path->dentry))) { - /* - * We don't need to lock updates but journal_flush() could - * otherwise be livelocked... - */ - journal_lock_updates(EXT3_SB(sb)->s_journal); - err = journal_flush(EXT3_SB(sb)->s_journal); - journal_unlock_updates(EXT3_SB(sb)->s_journal); - if (err) - return err; - } - - return dquot_quota_on(sb, type, format_id, path); -} - -/* Read data from quotafile - avoid pagecache and such because we cannot afford - * acquiring the locks... As quota files are never truncated and quota code - * itself serializes the operations (and no one else should touch the files) - * we don't have to be afraid of races */ -static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, - size_t len, loff_t off) -{ - struct inode *inode = sb_dqopt(sb)->files[type]; - sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb); - int err = 0; - int offset = off & (sb->s_blocksize - 1); - int tocopy; - size_t toread; - struct buffer_head *bh; - loff_t i_size = i_size_read(inode); - - if (off > i_size) - return 0; - if (off+len > i_size) - len = i_size-off; - toread = len; - while (toread > 0) { - tocopy = sb->s_blocksize - offset < toread ? - sb->s_blocksize - offset : toread; - bh = ext3_bread(NULL, inode, blk, 0, &err); - if (err) - return err; - if (!bh) /* A hole? 
*/ - memset(data, 0, tocopy); - else - memcpy(data, bh->b_data+offset, tocopy); - brelse(bh); - offset = 0; - toread -= tocopy; - data += tocopy; - blk++; - } - return len; -} - -/* Write to quotafile (we know the transaction is already started and has - * enough credits) */ -static ssize_t ext3_quota_write(struct super_block *sb, int type, - const char *data, size_t len, loff_t off) -{ - struct inode *inode = sb_dqopt(sb)->files[type]; - sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb); - int err = 0; - int offset = off & (sb->s_blocksize - 1); - int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL; - struct buffer_head *bh; - handle_t *handle = journal_current_handle(); - - if (!handle) { - ext3_msg(sb, KERN_WARNING, - "warning: quota write (off=%llu, len=%llu)" - " cancelled because transaction is not started.", - (unsigned long long)off, (unsigned long long)len); - return -EIO; - } - - /* - * Since we account only one data block in transaction credits, - * then it is impossible to cross a block boundary. - */ - if (sb->s_blocksize - offset < len) { - ext3_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" - " cancelled because not block aligned", - (unsigned long long)off, (unsigned long long)len); - return -EIO; - } - bh = ext3_bread(handle, inode, blk, 1, &err); - if (!bh) - goto out; - if (journal_quota) { - err = ext3_journal_get_write_access(handle, bh); - if (err) { - brelse(bh); - goto out; - } - } - lock_buffer(bh); - memcpy(bh->b_data+offset, data, len); - flush_dcache_page(bh->b_page); - unlock_buffer(bh); - if (journal_quota) - err = ext3_journal_dirty_metadata(handle, bh); - else { - /* Always do at least ordered writes for quotas */ - err = ext3_journal_dirty_data(handle, bh); - mark_buffer_dirty(bh); - } - brelse(bh); -out: - if (err) - return err; - if (inode->i_size < off + len) { - i_size_write(inode, off + len); - EXT3_I(inode)->i_disksize = inode->i_size; - } - inode->i_version++; - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - ext3_mark_inode_dirty(handle, inode); - return len; -} - -#endif - -static struct dentry *ext3_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_bdev(fs_type, flags, dev_name, data, ext3_fill_super); -} - -static struct file_system_type ext3_fs_type = { - .owner = THIS_MODULE, - .name = "ext3", - .mount = ext3_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; -MODULE_ALIAS_FS("ext3"); - -static int __init init_ext3_fs(void) -{ - int err = init_ext3_xattr(); - if (err) - return err; - err = init_inodecache(); - if (err) - goto out1; - err = register_filesystem(&ext3_fs_type); - if (err) - goto out; - return 0; -out: - destroy_inodecache(); -out1: - exit_ext3_xattr(); - return err; -} - -static void __exit exit_ext3_fs(void) -{ - unregister_filesystem(&ext3_fs_type); - destroy_inodecache(); - exit_ext3_xattr(); -} - -MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); -MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); -MODULE_LICENSE("GPL"); -module_init(init_ext3_fs) -module_exit(exit_ext3_fs) diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c deleted file mode 100644 index c08c59094ae6..000000000000 --- a/fs/ext3/symlink.c +++ /dev/null @@ -1,46 +0,0 @@ -/* - * linux/fs/ext3/symlink.c - * - * Only fast symlinks left here - the rest is done by generic code. 
AV, 1999 - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/symlink.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * ext3 symlink handling code - */ - -#include "ext3.h" -#include "xattr.h" - -const struct inode_operations ext3_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, - .setattr = ext3_setattr, -#ifdef CONFIG_EXT3_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = ext3_listxattr, - .removexattr = generic_removexattr, -#endif -}; - -const struct inode_operations ext3_fast_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = simple_follow_link, - .setattr = ext3_setattr, -#ifdef CONFIG_EXT3_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = ext3_listxattr, - .removexattr = generic_removexattr, -#endif -}; diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c deleted file mode 100644 index 7cf36501ccf4..000000000000 --- a/fs/ext3/xattr.c +++ /dev/null @@ -1,1330 +0,0 @@ -/* - * linux/fs/ext3/xattr.c - * - * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> - * - * Fix by Harrison Xing <harrison@mountainviewdata.com>. - * Ext3 code with a lot of help from Eric Jarman <ejarman@acm.org>. - * Extended attributes for symlinks and special files added per - * suggestion of Luka Renko <luka.renko@hermes.si>. - * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>, - * Red Hat Inc. - * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz - * and Andreas Gruenbacher <agruen@suse.de>. - */ - -/* - * Extended attributes are stored directly in inodes (on file systems with - * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl - * field contains the block number if an inode uses an additional block. All - * attributes must fit in the inode and one additional block. Blocks that - * contain the identical set of attributes may be shared among several inodes. - * Identical blocks are detected by keeping a cache of blocks that have - * recently been accessed. - * - * The attributes in inodes and on blocks have a different header; the entries - * are stored in the same format: - * - * +------------------+ - * | header | - * | entry 1 | | - * | entry 2 | | growing downwards - * | entry 3 | v - * | four null bytes | - * | . . . | - * | value 1 | ^ - * | value 3 | | growing upwards - * | value 2 | | - * +------------------+ - * - * The header is followed by multiple entry descriptors. In disk blocks, the - * entry descriptors are kept sorted. In inodes, they are unsorted. The - * attribute values are aligned to the end of the block in no specific order. - * - * Locking strategy - * ---------------- - * EXT3_I(inode)->i_file_acl is protected by EXT3_I(inode)->xattr_sem. - * EA blocks are only changed if they are exclusive to an inode, so - * holding xattr_sem also means that nothing but the EA block's reference - * count can change. Multiple writers to the same block are synchronized - * by the buffer lock. 
- */ - -#include "ext3.h" -#include <linux/mbcache.h> -#include <linux/quotaops.h> -#include "xattr.h" -#include "acl.h" - -#define BHDR(bh) ((struct ext3_xattr_header *)((bh)->b_data)) -#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr)) -#define BFIRST(bh) ENTRY(BHDR(bh)+1) -#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) - -#define IHDR(inode, raw_inode) \ - ((struct ext3_xattr_ibody_header *) \ - ((void *)raw_inode + \ - EXT3_GOOD_OLD_INODE_SIZE + \ - EXT3_I(inode)->i_extra_isize)) -#define IFIRST(hdr) ((struct ext3_xattr_entry *)((hdr)+1)) - -#ifdef EXT3_XATTR_DEBUG -# define ea_idebug(inode, f...) do { \ - printk(KERN_DEBUG "inode %s:%lu: ", \ - inode->i_sb->s_id, inode->i_ino); \ - printk(f); \ - printk("\n"); \ - } while (0) -# define ea_bdebug(bh, f...) do { \ - char b[BDEVNAME_SIZE]; \ - printk(KERN_DEBUG "block %s:%lu: ", \ - bdevname(bh->b_bdev, b), \ - (unsigned long) bh->b_blocknr); \ - printk(f); \ - printk("\n"); \ - } while (0) -#else -# define ea_idebug(f...) -# define ea_bdebug(f...) -#endif - -static void ext3_xattr_cache_insert(struct buffer_head *); -static struct buffer_head *ext3_xattr_cache_find(struct inode *, - struct ext3_xattr_header *, - struct mb_cache_entry **); -static void ext3_xattr_rehash(struct ext3_xattr_header *, - struct ext3_xattr_entry *); -static int ext3_xattr_list(struct dentry *dentry, char *buffer, - size_t buffer_size); - -static struct mb_cache *ext3_xattr_cache; - -static const struct xattr_handler *ext3_xattr_handler_map[] = { - [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler, -#ifdef CONFIG_EXT3_FS_POSIX_ACL - [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, - [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, -#endif - [EXT3_XATTR_INDEX_TRUSTED] = &ext3_xattr_trusted_handler, -#ifdef CONFIG_EXT3_FS_SECURITY - [EXT3_XATTR_INDEX_SECURITY] = &ext3_xattr_security_handler, -#endif -}; - -const struct xattr_handler *ext3_xattr_handlers[] = { - &ext3_xattr_user_handler, - &ext3_xattr_trusted_handler, -#ifdef CONFIG_EXT3_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif -#ifdef CONFIG_EXT3_FS_SECURITY - &ext3_xattr_security_handler, -#endif - NULL -}; - -static inline const struct xattr_handler * -ext3_xattr_handler(int name_index) -{ - const struct xattr_handler *handler = NULL; - - if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map)) - handler = ext3_xattr_handler_map[name_index]; - return handler; -} - -/* - * Inode operation listxattr() - * - * d_inode(dentry)->i_mutex: don't care - */ -ssize_t -ext3_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ - return ext3_xattr_list(dentry, buffer, size); -} - -static int -ext3_xattr_check_names(struct ext3_xattr_entry *entry, void *end) -{ - while (!IS_LAST_ENTRY(entry)) { - struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(entry); - if ((void *)next >= end) - return -EIO; - entry = next; - } - return 0; -} - -static inline int -ext3_xattr_check_block(struct buffer_head *bh) -{ - int error; - - if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || - BHDR(bh)->h_blocks != cpu_to_le32(1)) - return -EIO; - error = ext3_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); - return error; -} - -static inline int -ext3_xattr_check_entry(struct ext3_xattr_entry *entry, size_t size) -{ - size_t value_size = le32_to_cpu(entry->e_value_size); - - if (entry->e_value_block != 0 || value_size > size || - le16_to_cpu(entry->e_value_offs) + value_size > size) - return -EIO; - return 
0; -} - -static int -ext3_xattr_find_entry(struct ext3_xattr_entry **pentry, int name_index, - const char *name, size_t size, int sorted) -{ - struct ext3_xattr_entry *entry; - size_t name_len; - int cmp = 1; - - if (name == NULL) - return -EINVAL; - name_len = strlen(name); - entry = *pentry; - for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) { - cmp = name_index - entry->e_name_index; - if (!cmp) - cmp = name_len - entry->e_name_len; - if (!cmp) - cmp = memcmp(name, entry->e_name, name_len); - if (cmp <= 0 && (sorted || cmp == 0)) - break; - } - *pentry = entry; - if (!cmp && ext3_xattr_check_entry(entry, size)) - return -EIO; - return cmp ? -ENODATA : 0; -} - -static int -ext3_xattr_block_get(struct inode *inode, int name_index, const char *name, - void *buffer, size_t buffer_size) -{ - struct buffer_head *bh = NULL; - struct ext3_xattr_entry *entry; - size_t size; - int error; - - ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", - name_index, name, buffer, (long)buffer_size); - - error = -ENODATA; - if (!EXT3_I(inode)->i_file_acl) - goto cleanup; - ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl); - bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); - if (!bh) - goto cleanup; - ea_bdebug(bh, "b_count=%d, refcount=%d", - atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); - if (ext3_xattr_check_block(bh)) { -bad_block: ext3_error(inode->i_sb, __func__, - "inode %lu: bad block "E3FSBLK, inode->i_ino, - EXT3_I(inode)->i_file_acl); - error = -EIO; - goto cleanup; - } - ext3_xattr_cache_insert(bh); - entry = BFIRST(bh); - error = ext3_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); - if (error == -EIO) - goto bad_block; - if (error) - goto cleanup; - size = le32_to_cpu(entry->e_value_size); - if (buffer) { - error = -ERANGE; - if (size > buffer_size) - goto cleanup; - memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), - size); - } - error = size; - -cleanup: - brelse(bh); - return error; -} - -static int -ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name, - void *buffer, size_t buffer_size) -{ - struct ext3_xattr_ibody_header *header; - struct ext3_xattr_entry *entry; - struct ext3_inode *raw_inode; - struct ext3_iloc iloc; - size_t size; - void *end; - int error; - - if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR)) - return -ENODATA; - error = ext3_get_inode_loc(inode, &iloc); - if (error) - return error; - raw_inode = ext3_raw_inode(&iloc); - header = IHDR(inode, raw_inode); - entry = IFIRST(header); - end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; - error = ext3_xattr_check_names(entry, end); - if (error) - goto cleanup; - error = ext3_xattr_find_entry(&entry, name_index, name, - end - (void *)entry, 0); - if (error) - goto cleanup; - size = le32_to_cpu(entry->e_value_size); - if (buffer) { - error = -ERANGE; - if (size > buffer_size) - goto cleanup; - memcpy(buffer, (void *)IFIRST(header) + - le16_to_cpu(entry->e_value_offs), size); - } - error = size; - -cleanup: - brelse(iloc.bh); - return error; -} - -/* - * ext3_xattr_get() - * - * Copy an extended attribute into the buffer - * provided, or compute the buffer size required. - * Buffer is NULL to compute the size of the buffer required. - * - * Returns a negative error number on failure, or the number of bytes - * used / required on success. 
- */ -int -ext3_xattr_get(struct inode *inode, int name_index, const char *name, - void *buffer, size_t buffer_size) -{ - int error; - - down_read(&EXT3_I(inode)->xattr_sem); - error = ext3_xattr_ibody_get(inode, name_index, name, buffer, - buffer_size); - if (error == -ENODATA) - error = ext3_xattr_block_get(inode, name_index, name, buffer, - buffer_size); - up_read(&EXT3_I(inode)->xattr_sem); - return error; -} - -static int -ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry, - char *buffer, size_t buffer_size) -{ - size_t rest = buffer_size; - - for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) { - const struct xattr_handler *handler = - ext3_xattr_handler(entry->e_name_index); - - if (handler) { - size_t size = handler->list(dentry, buffer, rest, - entry->e_name, - entry->e_name_len, - handler->flags); - if (buffer) { - if (size > rest) - return -ERANGE; - buffer += size; - } - rest -= size; - } - } - return buffer_size - rest; -} - -static int -ext3_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) -{ - struct inode *inode = d_inode(dentry); - struct buffer_head *bh = NULL; - int error; - - ea_idebug(inode, "buffer=%p, buffer_size=%ld", - buffer, (long)buffer_size); - - error = 0; - if (!EXT3_I(inode)->i_file_acl) - goto cleanup; - ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl); - bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); - error = -EIO; - if (!bh) - goto cleanup; - ea_bdebug(bh, "b_count=%d, refcount=%d", - atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); - if (ext3_xattr_check_block(bh)) { - ext3_error(inode->i_sb, __func__, - "inode %lu: bad block "E3FSBLK, inode->i_ino, - EXT3_I(inode)->i_file_acl); - error = -EIO; - goto cleanup; - } - ext3_xattr_cache_insert(bh); - error = ext3_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); - -cleanup: - brelse(bh); - - return error; -} - -static int -ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) -{ - struct inode *inode = d_inode(dentry); - struct ext3_xattr_ibody_header *header; - struct ext3_inode *raw_inode; - struct ext3_iloc iloc; - void *end; - int error; - - if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR)) - return 0; - error = ext3_get_inode_loc(inode, &iloc); - if (error) - return error; - raw_inode = ext3_raw_inode(&iloc); - header = IHDR(inode, raw_inode); - end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; - error = ext3_xattr_check_names(IFIRST(header), end); - if (error) - goto cleanup; - error = ext3_xattr_list_entries(dentry, IFIRST(header), - buffer, buffer_size); - -cleanup: - brelse(iloc.bh); - return error; -} - -/* - * ext3_xattr_list() - * - * Copy a list of attribute names into the buffer - * provided, or compute the buffer size required. - * Buffer is NULL to compute the size of the buffer required. - * - * Returns a negative error number on failure, or the number of bytes - * used / required on success. 
- */ -static int -ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) -{ - int i_error, b_error; - - down_read(&EXT3_I(d_inode(dentry))->xattr_sem); - i_error = ext3_xattr_ibody_list(dentry, buffer, buffer_size); - if (i_error < 0) { - b_error = 0; - } else { - if (buffer) { - buffer += i_error; - buffer_size -= i_error; - } - b_error = ext3_xattr_block_list(dentry, buffer, buffer_size); - if (b_error < 0) - i_error = 0; - } - up_read(&EXT3_I(d_inode(dentry))->xattr_sem); - return i_error + b_error; -} - -/* - * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is - * not set, set it. - */ -static void ext3_xattr_update_super_block(handle_t *handle, - struct super_block *sb) -{ - if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR)) - return; - - if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) { - EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR); - ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); - } -} - -/* - * Release the xattr block BH: If the reference count is > 1, decrement - * it; otherwise free the block. - */ -static void -ext3_xattr_release_block(handle_t *handle, struct inode *inode, - struct buffer_head *bh) -{ - struct mb_cache_entry *ce = NULL; - int error = 0; - - ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_bdev, bh->b_blocknr); - error = ext3_journal_get_write_access(handle, bh); - if (error) - goto out; - - lock_buffer(bh); - - if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { - ea_bdebug(bh, "refcount now=0; freeing"); - if (ce) - mb_cache_entry_free(ce); - ext3_free_blocks(handle, inode, bh->b_blocknr, 1); - get_bh(bh); - ext3_forget(handle, 1, inode, bh, bh->b_blocknr); - } else { - le32_add_cpu(&BHDR(bh)->h_refcount, -1); - error = ext3_journal_dirty_metadata(handle, bh); - if (IS_SYNC(inode)) - handle->h_sync = 1; - dquot_free_block(inode, 1); - ea_bdebug(bh, "refcount now=%d; releasing", - le32_to_cpu(BHDR(bh)->h_refcount)); - if (ce) - mb_cache_entry_release(ce); - } - unlock_buffer(bh); -out: - ext3_std_error(inode->i_sb, error); - return; -} - -struct ext3_xattr_info { - int name_index; - const char *name; - const void *value; - size_t value_len; -}; - -struct ext3_xattr_search { - struct ext3_xattr_entry *first; - void *base; - void *end; - struct ext3_xattr_entry *here; - int not_found; -}; - -static int -ext3_xattr_set_entry(struct ext3_xattr_info *i, struct ext3_xattr_search *s) -{ - struct ext3_xattr_entry *last; - size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); - - /* Compute min_offs and last. */ - last = s->first; - for (; !IS_LAST_ENTRY(last); last = EXT3_XATTR_NEXT(last)) { - if (!last->e_value_block && last->e_value_size) { - size_t offs = le16_to_cpu(last->e_value_offs); - if (offs < min_offs) - min_offs = offs; - } - } - free = min_offs - ((void *)last - s->base) - sizeof(__u32); - if (!s->not_found) { - if (!s->here->e_value_block && s->here->e_value_size) { - size_t size = le32_to_cpu(s->here->e_value_size); - free += EXT3_XATTR_SIZE(size); - } - free += EXT3_XATTR_LEN(name_len); - } - if (i->value) { - if (free < EXT3_XATTR_LEN(name_len) + - EXT3_XATTR_SIZE(i->value_len)) - return -ENOSPC; - } - - if (i->value && s->not_found) { - /* Insert the new name. 
*/ - size_t size = EXT3_XATTR_LEN(name_len); - size_t rest = (void *)last - (void *)s->here + sizeof(__u32); - memmove((void *)s->here + size, s->here, rest); - memset(s->here, 0, size); - s->here->e_name_index = i->name_index; - s->here->e_name_len = name_len; - memcpy(s->here->e_name, i->name, name_len); - } else { - if (!s->here->e_value_block && s->here->e_value_size) { - void *first_val = s->base + min_offs; - size_t offs = le16_to_cpu(s->here->e_value_offs); - void *val = s->base + offs; - size_t size = EXT3_XATTR_SIZE( - le32_to_cpu(s->here->e_value_size)); - - if (i->value && size == EXT3_XATTR_SIZE(i->value_len)) { - /* The old and the new value have the same - size. Just replace. */ - s->here->e_value_size = - cpu_to_le32(i->value_len); - memset(val + size - EXT3_XATTR_PAD, 0, - EXT3_XATTR_PAD); /* Clear pad bytes. */ - memcpy(val, i->value, i->value_len); - return 0; - } - - /* Remove the old value. */ - memmove(first_val + size, first_val, val - first_val); - memset(first_val, 0, size); - s->here->e_value_size = 0; - s->here->e_value_offs = 0; - min_offs += size; - - /* Adjust all value offsets. */ - last = s->first; - while (!IS_LAST_ENTRY(last)) { - size_t o = le16_to_cpu(last->e_value_offs); - if (!last->e_value_block && - last->e_value_size && o < offs) - last->e_value_offs = - cpu_to_le16(o + size); - last = EXT3_XATTR_NEXT(last); - } - } - if (!i->value) { - /* Remove the old name. */ - size_t size = EXT3_XATTR_LEN(name_len); - last = ENTRY((void *)last - size); - memmove(s->here, (void *)s->here + size, - (void *)last - (void *)s->here + sizeof(__u32)); - memset(last, 0, size); - } - } - - if (i->value) { - /* Insert the new value. */ - s->here->e_value_size = cpu_to_le32(i->value_len); - if (i->value_len) { - size_t size = EXT3_XATTR_SIZE(i->value_len); - void *val = s->base + min_offs - size; - s->here->e_value_offs = cpu_to_le16(min_offs - size); - memset(val + size - EXT3_XATTR_PAD, 0, - EXT3_XATTR_PAD); /* Clear the pad bytes. */ - memcpy(val, i->value, i->value_len); - } - } - return 0; -} - -struct ext3_xattr_block_find { - struct ext3_xattr_search s; - struct buffer_head *bh; -}; - -static int -ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i, - struct ext3_xattr_block_find *bs) -{ - struct super_block *sb = inode->i_sb; - int error; - - ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", - i->name_index, i->name, i->value, (long)i->value_len); - - if (EXT3_I(inode)->i_file_acl) { - /* The inode already has an extended attribute block. */ - bs->bh = sb_bread(sb, EXT3_I(inode)->i_file_acl); - error = -EIO; - if (!bs->bh) - goto cleanup; - ea_bdebug(bs->bh, "b_count=%d, refcount=%d", - atomic_read(&(bs->bh->b_count)), - le32_to_cpu(BHDR(bs->bh)->h_refcount)); - if (ext3_xattr_check_block(bs->bh)) { - ext3_error(sb, __func__, - "inode %lu: bad block "E3FSBLK, inode->i_ino, - EXT3_I(inode)->i_file_acl); - error = -EIO; - goto cleanup; - } - /* Find the named attribute. 
*/ - bs->s.base = BHDR(bs->bh); - bs->s.first = BFIRST(bs->bh); - bs->s.end = bs->bh->b_data + bs->bh->b_size; - bs->s.here = bs->s.first; - error = ext3_xattr_find_entry(&bs->s.here, i->name_index, - i->name, bs->bh->b_size, 1); - if (error && error != -ENODATA) - goto cleanup; - bs->s.not_found = error; - } - error = 0; - -cleanup: - return error; -} - -static int -ext3_xattr_block_set(handle_t *handle, struct inode *inode, - struct ext3_xattr_info *i, - struct ext3_xattr_block_find *bs) -{ - struct super_block *sb = inode->i_sb; - struct buffer_head *new_bh = NULL; - struct ext3_xattr_search *s = &bs->s; - struct mb_cache_entry *ce = NULL; - int error = 0; - -#define header(x) ((struct ext3_xattr_header *)(x)) - - if (i->value && i->value_len > sb->s_blocksize) - return -ENOSPC; - if (s->base) { - ce = mb_cache_entry_get(ext3_xattr_cache, bs->bh->b_bdev, - bs->bh->b_blocknr); - error = ext3_journal_get_write_access(handle, bs->bh); - if (error) - goto cleanup; - lock_buffer(bs->bh); - - if (header(s->base)->h_refcount == cpu_to_le32(1)) { - if (ce) { - mb_cache_entry_free(ce); - ce = NULL; - } - ea_bdebug(bs->bh, "modifying in-place"); - error = ext3_xattr_set_entry(i, s); - if (!error) { - if (!IS_LAST_ENTRY(s->first)) - ext3_xattr_rehash(header(s->base), - s->here); - ext3_xattr_cache_insert(bs->bh); - } - unlock_buffer(bs->bh); - if (error == -EIO) - goto bad_block; - if (!error) - error = ext3_journal_dirty_metadata(handle, - bs->bh); - if (error) - goto cleanup; - goto inserted; - } else { - int offset = (char *)s->here - bs->bh->b_data; - - unlock_buffer(bs->bh); - journal_release_buffer(handle, bs->bh); - - if (ce) { - mb_cache_entry_release(ce); - ce = NULL; - } - ea_bdebug(bs->bh, "cloning"); - s->base = kmalloc(bs->bh->b_size, GFP_NOFS); - error = -ENOMEM; - if (s->base == NULL) - goto cleanup; - memcpy(s->base, BHDR(bs->bh), bs->bh->b_size); - s->first = ENTRY(header(s->base)+1); - header(s->base)->h_refcount = cpu_to_le32(1); - s->here = ENTRY(s->base + offset); - s->end = s->base + bs->bh->b_size; - } - } else { - /* Allocate a buffer where we construct the new block. */ - s->base = kzalloc(sb->s_blocksize, GFP_NOFS); - /* assert(header == s->base) */ - error = -ENOMEM; - if (s->base == NULL) - goto cleanup; - header(s->base)->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); - header(s->base)->h_blocks = cpu_to_le32(1); - header(s->base)->h_refcount = cpu_to_le32(1); - s->first = ENTRY(header(s->base)+1); - s->here = ENTRY(header(s->base)+1); - s->end = s->base + sb->s_blocksize; - } - - error = ext3_xattr_set_entry(i, s); - if (error == -EIO) - goto bad_block; - if (error) - goto cleanup; - if (!IS_LAST_ENTRY(s->first)) - ext3_xattr_rehash(header(s->base), s->here); - -inserted: - if (!IS_LAST_ENTRY(s->first)) { - new_bh = ext3_xattr_cache_find(inode, header(s->base), &ce); - if (new_bh) { - /* We found an identical block in the cache. */ - if (new_bh == bs->bh) - ea_bdebug(new_bh, "keeping"); - else { - /* The old block is released after updating - the inode. 
*/ - error = dquot_alloc_block(inode, 1); - if (error) - goto cleanup; - error = ext3_journal_get_write_access(handle, - new_bh); - if (error) - goto cleanup_dquot; - lock_buffer(new_bh); - le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); - ea_bdebug(new_bh, "reusing; refcount now=%d", - le32_to_cpu(BHDR(new_bh)->h_refcount)); - unlock_buffer(new_bh); - error = ext3_journal_dirty_metadata(handle, - new_bh); - if (error) - goto cleanup_dquot; - } - mb_cache_entry_release(ce); - ce = NULL; - } else if (bs->bh && s->base == bs->bh->b_data) { - /* We were modifying this block in-place. */ - ea_bdebug(bs->bh, "keeping this block"); - new_bh = bs->bh; - get_bh(new_bh); - } else { - /* We need to allocate a new block */ - ext3_fsblk_t goal = ext3_group_first_block_no(sb, - EXT3_I(inode)->i_block_group); - ext3_fsblk_t block; - - /* - * Protect us agaist concurrent allocations to the - * same inode from ext3_..._writepage(). Reservation - * code does not expect racing allocations. - */ - mutex_lock(&EXT3_I(inode)->truncate_mutex); - block = ext3_new_block(handle, inode, goal, &error); - mutex_unlock(&EXT3_I(inode)->truncate_mutex); - if (error) - goto cleanup; - ea_idebug(inode, "creating block %d", block); - - new_bh = sb_getblk(sb, block); - if (unlikely(!new_bh)) { -getblk_failed: - ext3_free_blocks(handle, inode, block, 1); - error = -ENOMEM; - goto cleanup; - } - lock_buffer(new_bh); - error = ext3_journal_get_create_access(handle, new_bh); - if (error) { - unlock_buffer(new_bh); - goto getblk_failed; - } - memcpy(new_bh->b_data, s->base, new_bh->b_size); - set_buffer_uptodate(new_bh); - unlock_buffer(new_bh); - ext3_xattr_cache_insert(new_bh); - error = ext3_journal_dirty_metadata(handle, new_bh); - if (error) - goto cleanup; - } - } - - /* Update the inode. */ - EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; - - /* Drop the previous xattr block. */ - if (bs->bh && bs->bh != new_bh) - ext3_xattr_release_block(handle, inode, bs->bh); - error = 0; - -cleanup: - if (ce) - mb_cache_entry_release(ce); - brelse(new_bh); - if (!(bs->bh && s->base == bs->bh->b_data)) - kfree(s->base); - - return error; - -cleanup_dquot: - dquot_free_block(inode, 1); - goto cleanup; - -bad_block: - ext3_error(inode->i_sb, __func__, - "inode %lu: bad block "E3FSBLK, inode->i_ino, - EXT3_I(inode)->i_file_acl); - goto cleanup; - -#undef header -} - -struct ext3_xattr_ibody_find { - struct ext3_xattr_search s; - struct ext3_iloc iloc; -}; - -static int -ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i, - struct ext3_xattr_ibody_find *is) -{ - struct ext3_xattr_ibody_header *header; - struct ext3_inode *raw_inode; - int error; - - if (EXT3_I(inode)->i_extra_isize == 0) - return 0; - raw_inode = ext3_raw_inode(&is->iloc); - header = IHDR(inode, raw_inode); - is->s.base = is->s.first = IFIRST(header); - is->s.here = is->s.first; - is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; - if (ext3_test_inode_state(inode, EXT3_STATE_XATTR)) { - error = ext3_xattr_check_names(IFIRST(header), is->s.end); - if (error) - return error; - /* Find the named attribute. 
*/ - error = ext3_xattr_find_entry(&is->s.here, i->name_index, - i->name, is->s.end - - (void *)is->s.base, 0); - if (error && error != -ENODATA) - return error; - is->s.not_found = error; - } - return 0; -} - -static int -ext3_xattr_ibody_set(handle_t *handle, struct inode *inode, - struct ext3_xattr_info *i, - struct ext3_xattr_ibody_find *is) -{ - struct ext3_xattr_ibody_header *header; - struct ext3_xattr_search *s = &is->s; - int error; - - if (EXT3_I(inode)->i_extra_isize == 0) - return -ENOSPC; - error = ext3_xattr_set_entry(i, s); - if (error) - return error; - header = IHDR(inode, ext3_raw_inode(&is->iloc)); - if (!IS_LAST_ENTRY(s->first)) { - header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); - ext3_set_inode_state(inode, EXT3_STATE_XATTR); - } else { - header->h_magic = cpu_to_le32(0); - ext3_clear_inode_state(inode, EXT3_STATE_XATTR); - } - return 0; -} - -/* - * ext3_xattr_set_handle() - * - * Create, replace or remove an extended attribute for this inode. Value - * is NULL to remove an existing extended attribute, and non-NULL to - * either replace an existing extended attribute, or create a new extended - * attribute. The flags XATTR_REPLACE and XATTR_CREATE - * specify that an extended attribute must exist and must not exist - * previous to the call, respectively. - * - * Returns 0, or a negative error number on failure. - */ -int -ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, - const char *name, const void *value, size_t value_len, - int flags) -{ - struct ext3_xattr_info i = { - .name_index = name_index, - .name = name, - .value = value, - .value_len = value_len, - - }; - struct ext3_xattr_ibody_find is = { - .s = { .not_found = -ENODATA, }, - }; - struct ext3_xattr_block_find bs = { - .s = { .not_found = -ENODATA, }, - }; - int error; - - if (!name) - return -EINVAL; - if (strlen(name) > 255) - return -ERANGE; - down_write(&EXT3_I(inode)->xattr_sem); - error = ext3_get_inode_loc(inode, &is.iloc); - if (error) - goto cleanup; - - error = ext3_journal_get_write_access(handle, is.iloc.bh); - if (error) - goto cleanup; - - if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) { - struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc); - memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); - ext3_clear_inode_state(inode, EXT3_STATE_NEW); - } - - error = ext3_xattr_ibody_find(inode, &i, &is); - if (error) - goto cleanup; - if (is.s.not_found) - error = ext3_xattr_block_find(inode, &i, &bs); - if (error) - goto cleanup; - if (is.s.not_found && bs.s.not_found) { - error = -ENODATA; - if (flags & XATTR_REPLACE) - goto cleanup; - error = 0; - if (!value) - goto cleanup; - } else { - error = -EEXIST; - if (flags & XATTR_CREATE) - goto cleanup; - } - if (!value) { - if (!is.s.not_found) - error = ext3_xattr_ibody_set(handle, inode, &i, &is); - else if (!bs.s.not_found) - error = ext3_xattr_block_set(handle, inode, &i, &bs); - } else { - error = ext3_xattr_ibody_set(handle, inode, &i, &is); - if (!error && !bs.s.not_found) { - i.value = NULL; - error = ext3_xattr_block_set(handle, inode, &i, &bs); - } else if (error == -ENOSPC) { - if (EXT3_I(inode)->i_file_acl && !bs.s.base) { - error = ext3_xattr_block_find(inode, &i, &bs); - if (error) - goto cleanup; - } - error = ext3_xattr_block_set(handle, inode, &i, &bs); - if (error) - goto cleanup; - if (!is.s.not_found) { - i.value = NULL; - error = ext3_xattr_ibody_set(handle, inode, &i, - &is); - } - } - } - if (!error) { - ext3_xattr_update_super_block(handle, inode->i_sb); - inode->i_ctime = 
CURRENT_TIME_SEC; - error = ext3_mark_iloc_dirty(handle, inode, &is.iloc); - /* - * The bh is consumed by ext3_mark_iloc_dirty, even with - * error != 0. - */ - is.iloc.bh = NULL; - if (IS_SYNC(inode)) - handle->h_sync = 1; - } - -cleanup: - brelse(is.iloc.bh); - brelse(bs.bh); - up_write(&EXT3_I(inode)->xattr_sem); - return error; -} - -/* - * ext3_xattr_set() - * - * Like ext3_xattr_set_handle, but start from an inode. This extended - * attribute modification is a filesystem transaction by itself. - * - * Returns 0, or a negative error number on failure. - */ -int -ext3_xattr_set(struct inode *inode, int name_index, const char *name, - const void *value, size_t value_len, int flags) -{ - handle_t *handle; - int error, retries = 0; - -retry: - handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - } else { - int error2; - - error = ext3_xattr_set_handle(handle, inode, name_index, name, - value, value_len, flags); - error2 = ext3_journal_stop(handle); - if (error == -ENOSPC && - ext3_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - if (error == 0) - error = error2; - } - - return error; -} - -/* - * ext3_xattr_delete_inode() - * - * Free extended attribute resources associated with this inode. This - * is called immediately before an inode is freed. We have exclusive - * access to the inode. - */ -void -ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) -{ - struct buffer_head *bh = NULL; - - if (!EXT3_I(inode)->i_file_acl) - goto cleanup; - bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); - if (!bh) { - ext3_error(inode->i_sb, __func__, - "inode %lu: block "E3FSBLK" read error", inode->i_ino, - EXT3_I(inode)->i_file_acl); - goto cleanup; - } - if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || - BHDR(bh)->h_blocks != cpu_to_le32(1)) { - ext3_error(inode->i_sb, __func__, - "inode %lu: bad block "E3FSBLK, inode->i_ino, - EXT3_I(inode)->i_file_acl); - goto cleanup; - } - ext3_xattr_release_block(handle, inode, bh); - EXT3_I(inode)->i_file_acl = 0; - -cleanup: - brelse(bh); -} - -/* - * ext3_xattr_put_super() - * - * This is called when a file system is unmounted. - */ -void -ext3_xattr_put_super(struct super_block *sb) -{ - mb_cache_shrink(sb->s_bdev); -} - -/* - * ext3_xattr_cache_insert() - * - * Create a new entry in the extended attribute cache, and insert - * it unless such an entry is already in the cache. - * - * Returns 0, or a negative error number on failure. - */ -static void -ext3_xattr_cache_insert(struct buffer_head *bh) -{ - __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); - struct mb_cache_entry *ce; - int error; - - ce = mb_cache_entry_alloc(ext3_xattr_cache, GFP_NOFS); - if (!ce) { - ea_bdebug(bh, "out of memory"); - return; - } - error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); - if (error) { - mb_cache_entry_free(ce); - if (error == -EBUSY) { - ea_bdebug(bh, "already in cache"); - error = 0; - } - } else { - ea_bdebug(bh, "inserting [%x]", (int)hash); - mb_cache_entry_release(ce); - } -} - -/* - * ext3_xattr_cmp() - * - * Compare two extended attribute blocks for equality. - * - * Returns 0 if the blocks are equal, 1 if they differ, and - * a negative error number on errors. 
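[Editor's note] The XATTR_CREATE / XATTR_REPLACE semantics spelled out in the ext3_xattr_set_handle() comment above map directly onto the flags accepted by setxattr(2). A minimal userspace sketch, with an illustrative path and user.* attribute name; note that removal goes through removexattr(2) in user space, while the kernel helper uses a NULL value:

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/xattr.h>

    int main(void)
    {
        const char *path = "/tmp/example";   /* illustrative; assumed to exist */
        const char *name = "user.example";   /* illustrative attribute name */

        /* XATTR_CREATE: the attribute must not exist yet, else EEXIST */
        if (setxattr(path, name, "v1", 2, XATTR_CREATE) < 0)
            fprintf(stderr, "create: %s\n", strerror(errno));

        /* XATTR_REPLACE: the attribute must already exist, else ENODATA */
        if (setxattr(path, name, "v2", 2, XATTR_REPLACE) < 0)
            fprintf(stderr, "replace: %s\n", strerror(errno));

        /* removal is a separate call in user space */
        if (removexattr(path, name) < 0)
            fprintf(stderr, "remove: %s\n", strerror(errno));
        return 0;
    }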
- */ -static int -ext3_xattr_cmp(struct ext3_xattr_header *header1, - struct ext3_xattr_header *header2) -{ - struct ext3_xattr_entry *entry1, *entry2; - - entry1 = ENTRY(header1+1); - entry2 = ENTRY(header2+1); - while (!IS_LAST_ENTRY(entry1)) { - if (IS_LAST_ENTRY(entry2)) - return 1; - if (entry1->e_hash != entry2->e_hash || - entry1->e_name_index != entry2->e_name_index || - entry1->e_name_len != entry2->e_name_len || - entry1->e_value_size != entry2->e_value_size || - memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) - return 1; - if (entry1->e_value_block != 0 || entry2->e_value_block != 0) - return -EIO; - if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), - (char *)header2 + le16_to_cpu(entry2->e_value_offs), - le32_to_cpu(entry1->e_value_size))) - return 1; - - entry1 = EXT3_XATTR_NEXT(entry1); - entry2 = EXT3_XATTR_NEXT(entry2); - } - if (!IS_LAST_ENTRY(entry2)) - return 1; - return 0; -} - -/* - * ext3_xattr_cache_find() - * - * Find an identical extended attribute block. - * - * Returns a pointer to the block found, or NULL if such a block was - * not found or an error occurred. - */ -static struct buffer_head * -ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header, - struct mb_cache_entry **pce) -{ - __u32 hash = le32_to_cpu(header->h_hash); - struct mb_cache_entry *ce; - - if (!header->h_hash) - return NULL; /* never share */ - ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); -again: - ce = mb_cache_entry_find_first(ext3_xattr_cache, inode->i_sb->s_bdev, - hash); - while (ce) { - struct buffer_head *bh; - - if (IS_ERR(ce)) { - if (PTR_ERR(ce) == -EAGAIN) - goto again; - break; - } - bh = sb_bread(inode->i_sb, ce->e_block); - if (!bh) { - ext3_error(inode->i_sb, __func__, - "inode %lu: block %lu read error", - inode->i_ino, (unsigned long) ce->e_block); - } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= - EXT3_XATTR_REFCOUNT_MAX) { - ea_idebug(inode, "block %lu refcount %d>=%d", - (unsigned long) ce->e_block, - le32_to_cpu(BHDR(bh)->h_refcount), - EXT3_XATTR_REFCOUNT_MAX); - } else if (ext3_xattr_cmp(header, BHDR(bh)) == 0) { - *pce = ce; - return bh; - } - brelse(bh); - ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); - } - return NULL; -} - -#define NAME_HASH_SHIFT 5 -#define VALUE_HASH_SHIFT 16 - -/* - * ext3_xattr_hash_entry() - * - * Compute the hash of an extended attribute. - */ -static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header, - struct ext3_xattr_entry *entry) -{ - __u32 hash = 0; - char *name = entry->e_name; - int n; - - for (n=0; n < entry->e_name_len; n++) { - hash = (hash << NAME_HASH_SHIFT) ^ - (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ - *name++; - } - - if (entry->e_value_block == 0 && entry->e_value_size != 0) { - __le32 *value = (__le32 *)((char *)header + - le16_to_cpu(entry->e_value_offs)); - for (n = (le32_to_cpu(entry->e_value_size) + - EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) { - hash = (hash << VALUE_HASH_SHIFT) ^ - (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ - le32_to_cpu(*value++); - } - } - entry->e_hash = cpu_to_le32(hash); -} - -#undef NAME_HASH_SHIFT -#undef VALUE_HASH_SHIFT - -#define BLOCK_HASH_SHIFT 16 - -/* - * ext3_xattr_rehash() - * - * Re-compute the extended attribute hash value after an entry has changed. 
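[Editor's note] As an aside, the name-hash loop in ext3_xattr_hash_entry() above can be lifted out as a standalone function. The sketch below re-expresses only the name part (the on-disk hash additionally folds in the value words using VALUE_HASH_SHIFT, as the removed code shows), assuming an ASCII attribute name:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define NAME_HASH_SHIFT 5

    /* name part of the legacy ext3 xattr entry hash: shift-and-XOR rotate */
    static uint32_t xattr_name_hash(const char *name, size_t len)
    {
        uint32_t hash = 0;
        size_t n;

        for (n = 0; n < len; n++)
            hash = (hash << NAME_HASH_SHIFT) ^
                   (hash >> (8 * sizeof(hash) - NAME_HASH_SHIFT)) ^
                   (unsigned char)name[n];
        return hash;
    }

    int main(void)
    {
        const char *name = "user.example";   /* illustrative attribute name */

        printf("hash(\"%s\") = 0x%08x\n", name,
               xattr_name_hash(name, strlen(name)));
        return 0;
    }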
- */ -static void ext3_xattr_rehash(struct ext3_xattr_header *header, - struct ext3_xattr_entry *entry) -{ - struct ext3_xattr_entry *here; - __u32 hash = 0; - - ext3_xattr_hash_entry(header, entry); - here = ENTRY(header+1); - while (!IS_LAST_ENTRY(here)) { - if (!here->e_hash) { - /* Block is not shared if an entry's hash value == 0 */ - hash = 0; - break; - } - hash = (hash << BLOCK_HASH_SHIFT) ^ - (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ - le32_to_cpu(here->e_hash); - here = EXT3_XATTR_NEXT(here); - } - header->h_hash = cpu_to_le32(hash); -} - -#undef BLOCK_HASH_SHIFT - -int __init -init_ext3_xattr(void) -{ - ext3_xattr_cache = mb_cache_create("ext3_xattr", 6); - if (!ext3_xattr_cache) - return -ENOMEM; - return 0; -} - -void -exit_ext3_xattr(void) -{ - if (ext3_xattr_cache) - mb_cache_destroy(ext3_xattr_cache); - ext3_xattr_cache = NULL; -} diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h deleted file mode 100644 index 32e93ebf8031..000000000000 --- a/fs/ext3/xattr.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - File: fs/ext3/xattr.h - - On-disk format of extended attributes for the ext3 filesystem. - - (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> -*/ - -#include <linux/xattr.h> - -/* Magic value in attribute blocks */ -#define EXT3_XATTR_MAGIC 0xEA020000 - -/* Maximum number of references to one attribute block */ -#define EXT3_XATTR_REFCOUNT_MAX 1024 - -/* Name indexes */ -#define EXT3_XATTR_INDEX_USER 1 -#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS 2 -#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT 3 -#define EXT3_XATTR_INDEX_TRUSTED 4 -#define EXT3_XATTR_INDEX_LUSTRE 5 -#define EXT3_XATTR_INDEX_SECURITY 6 - -struct ext3_xattr_header { - __le32 h_magic; /* magic number for identification */ - __le32 h_refcount; /* reference count */ - __le32 h_blocks; /* number of disk blocks used */ - __le32 h_hash; /* hash value of all attributes */ - __u32 h_reserved[4]; /* zero right now */ -}; - -struct ext3_xattr_ibody_header { - __le32 h_magic; /* magic number for identification */ -}; - -struct ext3_xattr_entry { - __u8 e_name_len; /* length of name */ - __u8 e_name_index; /* attribute name index */ - __le16 e_value_offs; /* offset in disk block of value */ - __le32 e_value_block; /* disk block attribute is stored on (n/i) */ - __le32 e_value_size; /* size of attribute value */ - __le32 e_hash; /* hash value of name and value */ - char e_name[0]; /* attribute name */ -}; - -#define EXT3_XATTR_PAD_BITS 2 -#define EXT3_XATTR_PAD (1<<EXT3_XATTR_PAD_BITS) -#define EXT3_XATTR_ROUND (EXT3_XATTR_PAD-1) -#define EXT3_XATTR_LEN(name_len) \ - (((name_len) + EXT3_XATTR_ROUND + \ - sizeof(struct ext3_xattr_entry)) & ~EXT3_XATTR_ROUND) -#define EXT3_XATTR_NEXT(entry) \ - ( (struct ext3_xattr_entry *)( \ - (char *)(entry) + EXT3_XATTR_LEN((entry)->e_name_len)) ) -#define EXT3_XATTR_SIZE(size) \ - (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND) - -# ifdef CONFIG_EXT3_FS_XATTR - -extern const struct xattr_handler ext3_xattr_user_handler; -extern const struct xattr_handler ext3_xattr_trusted_handler; -extern const struct xattr_handler ext3_xattr_security_handler; - -extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); - -extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t); -extern int ext3_xattr_set(struct inode *, int, const char *, const void *, size_t, int); -extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); - -extern void ext3_xattr_delete_inode(handle_t *, struct inode *); -extern void 
ext3_xattr_put_super(struct super_block *); - -extern int init_ext3_xattr(void); -extern void exit_ext3_xattr(void); - -extern const struct xattr_handler *ext3_xattr_handlers[]; - -# else /* CONFIG_EXT3_FS_XATTR */ - -static inline int -ext3_xattr_get(struct inode *inode, int name_index, const char *name, - void *buffer, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline int -ext3_xattr_set(struct inode *inode, int name_index, const char *name, - const void *value, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline int -ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, - const char *name, const void *value, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline void -ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) -{ -} - -static inline void -ext3_xattr_put_super(struct super_block *sb) -{ -} - -static inline int -init_ext3_xattr(void) -{ - return 0; -} - -static inline void -exit_ext3_xattr(void) -{ -} - -#define ext3_xattr_handlers NULL - -# endif /* CONFIG_EXT3_FS_XATTR */ - -#ifdef CONFIG_EXT3_FS_SECURITY -extern int ext3_init_security(handle_t *handle, struct inode *inode, - struct inode *dir, const struct qstr *qstr); -#else -static inline int ext3_init_security(handle_t *handle, struct inode *inode, - struct inode *dir, const struct qstr *qstr) -{ - return 0; -} -#endif diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c deleted file mode 100644 index c9506d5e3b13..000000000000 --- a/fs/ext3/xattr_security.c +++ /dev/null @@ -1,78 +0,0 @@ -/* - * linux/fs/ext3/xattr_security.c - * Handler for storing security labels as extended attributes. - */ - -#include <linux/security.h> -#include "ext3.h" -#include "xattr.h" - -static size_t -ext3_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - - if (list && total_len <= list_size) { - memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - -static int -ext3_xattr_security_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY, - name, buffer, size); -} - -static int -ext3_xattr_security_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY, - name, value, size, flags); -} - -static int ext3_initxattrs(struct inode *inode, - const struct xattr *xattr_array, - void *fs_info) -{ - const struct xattr *xattr; - handle_t *handle = fs_info; - int err = 0; - - for (xattr = xattr_array; xattr->name != NULL; xattr++) { - err = ext3_xattr_set_handle(handle, inode, - EXT3_XATTR_INDEX_SECURITY, - xattr->name, xattr->value, - xattr->value_len, 0); - if (err < 0) - break; - } - return err; -} - -int -ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir, - const struct qstr *qstr) -{ - return security_inode_init_security(inode, dir, qstr, - &ext3_initxattrs, handle); -} - -const struct xattr_handler ext3_xattr_security_handler = { - .prefix = XATTR_SECURITY_PREFIX, - .list = ext3_xattr_security_list, - .get = ext3_xattr_security_get, - .set = 
ext3_xattr_security_set, -}; diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c deleted file mode 100644 index 206cc66dc285..000000000000 --- a/fs/ext3/xattr_trusted.c +++ /dev/null @@ -1,54 +0,0 @@ -/* - * linux/fs/ext3/xattr_trusted.c - * Handler for trusted extended attributes. - * - * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org> - */ - -#include "ext3.h" -#include "xattr.h" - -static size_t -ext3_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (!capable(CAP_SYS_ADMIN)) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - -static int -ext3_xattr_trusted_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED, - name, buffer, size); -} - -static int -ext3_xattr_trusted_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED, name, - value, size, flags); -} - -const struct xattr_handler ext3_xattr_trusted_handler = { - .prefix = XATTR_TRUSTED_PREFIX, - .list = ext3_xattr_trusted_list, - .get = ext3_xattr_trusted_get, - .set = ext3_xattr_trusted_set, -}; diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c deleted file mode 100644 index 021508ad1616..000000000000 --- a/fs/ext3/xattr_user.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * linux/fs/ext3/xattr_user.c - * Handler for extended user attributes. 
- * - * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> - */ - -#include "ext3.h" -#include "xattr.h" - -static size_t -ext3_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - const size_t prefix_len = XATTR_USER_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (!test_opt(dentry->d_sb, XATTR_USER)) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_USER_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - -static int -ext3_xattr_user_get(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, XATTR_USER)) - return -EOPNOTSUPP; - return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_USER, - name, buffer, size); -} - -static int -ext3_xattr_user_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, XATTR_USER)) - return -EOPNOTSUPP; - return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_USER, - name, value, size, flags); -} - -const struct xattr_handler ext3_xattr_user_handler = { - .prefix = XATTR_USER_PREFIX, - .list = ext3_xattr_user_list, - .get = ext3_xattr_user_get, - .set = ext3_xattr_user_set, -}; diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index bf8bc8aba471..47728da7702c 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -1,5 +1,38 @@ +# Ext3 configs are here for backward compatibility with old configs which may +# have EXT3_FS set but not EXT4_FS set and thus would result in non-bootable +# kernels after the removal of ext3 driver. +config EXT3_FS + tristate "The Extended 3 (ext3) filesystem" + # These must match EXT4_FS selects... + select EXT4_FS + select JBD2 + select CRC16 + select CRYPTO + select CRYPTO_CRC32C + help + This config option is here only for backward compatibility. ext3 + filesystem is now handled by the ext4 driver. + +config EXT3_FS_POSIX_ACL + bool "Ext3 POSIX Access Control Lists" + depends on EXT3_FS + select EXT4_FS_POSIX_ACL + select FS_POSIX_ACL + help + This config option is here only for backward compatibility. ext3 + filesystem is now handled by the ext4 driver. + +config EXT3_FS_SECURITY + bool "Ext3 Security Labels" + depends on EXT3_FS + select EXT4_FS_SECURITY + help + This config option is here only for backward compatibility. ext3 + filesystem is now handled by the ext4 driver. + config EXT4_FS tristate "The Extended 4 (ext4) filesystem" + # Please update EXT3_FS selects when changing these select JBD2 select CRC16 select CRYPTO @@ -16,26 +49,27 @@ config EXT4_FS up fsck time. For more information, please see the web pages at http://ext4.wiki.kernel.org. - The ext4 filesystem will support mounting an ext3 - filesystem; while there will be some performance gains from - the delayed allocation and inode table readahead, the best - performance gains will require enabling ext4 features in the - filesystem, or formatting a new filesystem as an ext4 - filesystem initially. + The ext4 filesystem supports mounting an ext3 filesystem; while there + are some performance gains from the delayed allocation and inode + table readahead, the best performance gains require enabling ext4 + features in the filesystem using tune2fs, or formatting a new + filesystem as an ext4 filesystem initially. 
Without explicit enabling + of ext4 features, the on disk filesystem format stays fully backward + compatible. To compile this file system support as a module, choose M here. The module will be called ext4. If unsure, say N. -config EXT4_USE_FOR_EXT23 +config EXT4_USE_FOR_EXT2 bool "Use ext4 for ext2/ext3 file systems" depends on EXT4_FS - depends on EXT3_FS=n || EXT2_FS=n + depends on EXT2_FS=n default y help - Allow the ext4 file system driver code to be used for ext2 or - ext3 file system mounts. This allows users to reduce their + Allow the ext4 file system driver code to be used for ext2 + file system mounts. This allows users to reduce their compiled kernel size by using one file system driver for ext2, ext3, and ext4 file systems. diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 7dc4eb55913c..847f919c84d9 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -19,7 +19,6 @@ #include <linux/gfp.h> #include <linux/kernel.h> #include <linux/key.h> -#include <linux/key.h> #include <linux/list.h> #include <linux/mempool.h> #include <linux/random.h> @@ -329,6 +328,10 @@ int _ext4_fname_disk_to_usr(struct inode *inode, return oname->len; } } + if (iname->len < EXT4_CRYPTO_BLOCK_SIZE) { + EXT4_ERROR_INODE(inode, "encrypted inode too small"); + return -EUCLEAN; + } if (EXT4_I(inode)->i_crypt_info) return ext4_fname_decrypt(inode, iname, oname); diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 442d24e8efc0..1d510c11b100 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -30,7 +30,7 @@ static void derive_crypt_complete(struct crypto_async_request *req, int rc) /** * ext4_derive_key_aes() - Derive a key using AES-128-ECB - * @deriving_key: Encryption key used for derivatio. + * @deriving_key: Encryption key used for derivation. * @source_key: Source key to which to apply derivation. * @derived_key: Derived key. 
* @@ -220,6 +220,8 @@ retry: BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE); res = ext4_derive_key_aes(ctx.nonce, master_key->raw, raw_key); + if (res) + goto out; got_key: ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0); if (!ctfm || IS_ERR(ctfm)) { diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index 02c4e5df7afb..a640ec2c4b13 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -12,6 +12,7 @@ #include <linux/string.h> #include <linux/types.h> +#include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" @@ -49,7 +50,8 @@ static int ext4_create_encryption_context_from_policy( struct inode *inode, const struct ext4_encryption_policy *policy) { struct ext4_encryption_context ctx; - int res = 0; + handle_t *handle; + int res, res2; res = ext4_convert_inline_data(inode); if (res) @@ -78,11 +80,22 @@ static int ext4_create_encryption_context_from_policy( BUILD_BUG_ON(sizeof(ctx.nonce) != EXT4_KEY_DERIVATION_NONCE_SIZE); get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); + handle = ext4_journal_start(inode, EXT4_HT_MISC, + ext4_jbd2_credits_xattr(inode)); + if (IS_ERR(handle)) + return PTR_ERR(handle); res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, sizeof(ctx), 0); - if (!res) + if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + res = ext4_mark_inode_dirty(handle, inode); + if (res) + EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); + } + res2 = ext4_journal_stop(handle); + if (!res) + res = res2; return res; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f5e9f04220c1..fd1f28be5296 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -187,7 +187,7 @@ typedef struct ext4_io_end { } ext4_io_end_t; struct ext4_io_submit { - int io_op; + struct writeback_control *io_wbc; struct bio *io_bio; ext4_io_end_t *io_end; sector_t io_next_block; @@ -2272,6 +2272,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); +int ext4_get_block_dax(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, diff --git a/fs/ext4/file.c b/fs/ext4/file.c index bc313ac5d3fa..113837e7ba98 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -22,6 +22,7 @@ #include <linux/fs.h> #include <linux/mount.h> #include <linux/path.h> +#include <linux/dax.h> #include <linux/quotaops.h> #include <linux/pagevec.h> #include <linux/uio.h> @@ -195,7 +196,7 @@ out: static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) { struct inode *inode = bh->b_assoc_map->host; - /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ + /* XXX: breaks on 32-bit > 16TB. Is that even supported? */ loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; int err; if (!uptodate) @@ -206,17 +207,74 @@ static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten); - /* Is this the right get_block? 
*/ + int result; + handle_t *handle = NULL; + struct super_block *sb = file_inode(vma->vm_file)->i_sb; + bool write = vmf->flags & FAULT_FLAG_WRITE; + + if (write) { + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, + EXT4_DATA_TRANS_BLOCKS(sb)); + } + + if (IS_ERR(handle)) + result = VM_FAULT_SIGBUS; + else + result = __dax_fault(vma, vmf, ext4_get_block_dax, + ext4_end_io_unwritten); + + if (write) { + if (!IS_ERR(handle)) + ext4_journal_stop(handle); + sb_end_pagefault(sb); + } + + return result; +} + +static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, unsigned int flags) +{ + int result; + handle_t *handle = NULL; + struct inode *inode = file_inode(vma->vm_file); + struct super_block *sb = inode->i_sb; + bool write = flags & FAULT_FLAG_WRITE; + + if (write) { + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, + ext4_chunk_trans_blocks(inode, + PMD_SIZE / PAGE_SIZE)); + } + + if (IS_ERR(handle)) + result = VM_FAULT_SIGBUS; + else + result = __dax_pmd_fault(vma, addr, pmd, flags, + ext4_get_block_dax, ext4_end_io_unwritten); + + if (write) { + if (!IS_ERR(handle)) + ext4_journal_stop(handle); + sb_end_pagefault(sb); + } + + return result; } static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten); + return dax_mkwrite(vma, vmf, ext4_get_block_dax, + ext4_end_io_unwritten); } static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, + .pmd_fault = ext4_dax_pmd_fault, .page_mkwrite = ext4_dax_mkwrite, .pfn_mkwrite = dax_pfn_mkwrite, }; @@ -244,7 +302,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; } else { vma->vm_ops = &ext4_file_vm_ops; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 173c1ae21395..619bfc1fda8c 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -721,7 +721,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, struct ext4_group_desc *gdp = NULL; struct ext4_inode_info *ei; struct ext4_sb_info *sbi; - int ret2, err = 0; + int ret2, err; struct inode *ret; ext4_group_t i; ext4_group_t flex_group; @@ -769,7 +769,9 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, inode->i_gid = dir->i_gid; } else inode_init_owner(inode, dir, mode); - dquot_initialize(inode); + err = dquot_initialize(inode); + if (err) + goto out; if (!goal) goal = sbi->s_inode_goal; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 4f6ac499f09e..2468261748b2 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -22,6 +22,7 @@ #include "ext4_jbd2.h" #include "truncate.h" +#include <linux/dax.h> #include <linux/uio.h> #include <trace/events/ext4.h> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cecf9aa10811..612fbcf76b5c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -22,6 +22,7 @@ #include <linux/time.h> #include <linux/highuid.h> #include <linux/pagemap.h> +#include <linux/dax.h> #include <linux/quotaops.h> #include <linux/string.h> #include <linux/buffer_head.h> @@ -3020,6 +3021,17 @@ static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, EXT4_GET_BLOCKS_NO_LOCK); } +int ext4_get_block_dax(struct inode *inode, sector_t iblock, 
+ struct buffer_head *bh_result, int create) +{ + int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT; + if (create) + flags |= EXT4_GET_BLOCKS_CREATE; + ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n", + inode->i_ino, create); + return _ext4_get_block(inode, iblock, bh_result, flags); +} + static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ssize_t size, void *private) { @@ -4661,8 +4673,11 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; - if (is_quota_modification(inode, attr)) - dquot_initialize(inode); + if (is_quota_modification(inode, attr)) { + error = dquot_initialize(inode); + if (error) + return error; + } if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { handle_t *handle; @@ -4725,6 +4740,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) error = ext4_orphan_add(handle, inode); orphan = 1; } + /* + * Update c/mtime on truncate up, ext4_truncate() will + * update c/mtime in shrink case below + */ + if (!shrink) { + inode->i_mtime = ext4_current_time(inode); + inode->i_ctime = inode->i_mtime; + } down_write(&EXT4_I(inode)->i_data_sem); EXT4_I(inode)->i_disksize = attr->ia_size; rc = ext4_mark_inode_dirty(handle, inode); diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 8313ca3324ec..6eb1a619890c 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -69,6 +69,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, ext4_fsblk_t mmp_block) { struct mmp_struct *mmp; + int ret; if (*bh) clear_buffer_uptodate(*bh); @@ -76,33 +77,36 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, /* This would be sb_bread(sb, mmp_block), except we need to be sure * that the MD RAID device cache has been bypassed, and that the read * is not blocked in the elevator. 
*/ - if (!*bh) + if (!*bh) { *bh = sb_getblk(sb, mmp_block); - if (!*bh) - return -ENOMEM; - if (*bh) { - get_bh(*bh); - lock_buffer(*bh); - (*bh)->b_end_io = end_buffer_read_sync; - submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); - wait_on_buffer(*bh); - if (!buffer_uptodate(*bh)) { - brelse(*bh); - *bh = NULL; + if (!*bh) { + ret = -ENOMEM; + goto warn_exit; } } - if (unlikely(!*bh)) { - ext4_warning(sb, "Error while reading MMP block %llu", - mmp_block); - return -EIO; + + get_bh(*bh); + lock_buffer(*bh); + (*bh)->b_end_io = end_buffer_read_sync; + submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); + wait_on_buffer(*bh); + if (!buffer_uptodate(*bh)) { + brelse(*bh); + *bh = NULL; + ret = -EIO; + goto warn_exit; } mmp = (struct mmp_struct *)((*bh)->b_data); - if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC || - !ext4_mmp_csum_verify(sb, mmp)) - return -EINVAL; - - return 0; + if (le32_to_cpu(mmp->mmp_magic) == EXT4_MMP_MAGIC && + ext4_mmp_csum_verify(sb, mmp)) + return 0; + ret = -EINVAL; + +warn_exit: + ext4_warning(sb, "Error %d while reading MMP block %llu", + ret, mmp_block); + return ret; } /* @@ -111,7 +115,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, const char *function, unsigned int line, const char *msg) { - __ext4_warning(sb, function, line, msg); + __ext4_warning(sb, function, line, "%s", msg); __ext4_warning(sb, function, line, "MMP failure info: last update time: %llu, last update " "node: %s, last update device: %s\n", diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 011dcfb5cce3..9f61e7679a6d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2436,7 +2436,9 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct inode *inode; int err, credits, retries = 0; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + return err; credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); @@ -2470,7 +2472,9 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + return err; credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); @@ -2499,7 +2503,9 @@ static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err, retries = 0; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + return err; retry: inode = ext4_new_inode_start_handle(dir, mode, @@ -2612,7 +2618,9 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (EXT4_DIR_LINK_MAX(dir)) return -EMLINK; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + return err; credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); @@ -2910,8 +2918,12 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go in * separate transaction */ - dquot_initialize(dir); - dquot_initialize(d_inode(dentry)); + retval = dquot_initialize(dir); + if (retval) + return retval; + retval = dquot_initialize(d_inode(dentry)); + if (retval) + return retval; retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); @@ -2980,8 +2992,12 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) trace_ext4_unlink_enter(dir, dentry); /* Initialize quotas before so that eventual writes go * in separate transaction */ - 
dquot_initialize(dir); - dquot_initialize(d_inode(dentry)); + retval = dquot_initialize(dir); + if (retval) + return retval; + retval = dquot_initialize(d_inode(dentry)); + if (retval) + return retval; retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); @@ -3066,7 +3082,9 @@ static int ext4_symlink(struct inode *dir, goto err_free_sd; } - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + goto err_free_sd; if ((disk_link.len > EXT4_N_BLOCKS * 4)) { /* @@ -3197,7 +3215,9 @@ static int ext4_link(struct dentry *old_dentry, if (ext4_encrypted_inode(dir) && !ext4_is_child_context_consistent_with_parent(dir, inode)) return -EPERM; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + return err; retry: handle = ext4_journal_start(dir, EXT4_HT_DIR, @@ -3476,13 +3496,20 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, int credits; u8 old_file_type; - dquot_initialize(old.dir); - dquot_initialize(new.dir); + retval = dquot_initialize(old.dir); + if (retval) + return retval; + retval = dquot_initialize(new.dir); + if (retval) + return retval; /* Initialize quotas before so that eventual writes go * in separate transaction */ - if (new.inode) - dquot_initialize(new.inode); + if (new.inode) { + retval = dquot_initialize(new.inode); + if (retval) + return retval; + } old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL); if (IS_ERR(old.bh)) @@ -3678,8 +3705,12 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, new.inode))) return -EPERM; - dquot_initialize(old.dir); - dquot_initialize(new.dir); + retval = dquot_initialize(old.dir); + if (retval) + return retval; + retval = dquot_initialize(new.dir); + if (retval) + return retval; old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, &old.inlined); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 5602450f03f6..84ba4d2b3a35 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -61,7 +61,6 @@ static void buffer_io_error(struct buffer_head *bh) static void ext4_finish_bio(struct bio *bio) { int i; - int error = !test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec; bio_for_each_segment_all(bvec, bio, i) { @@ -88,7 +87,7 @@ static void ext4_finish_bio(struct bio *bio) } #endif - if (error) { + if (bio->bi_error) { SetPageError(page); set_bit(AS_EIO, &page->mapping->flags); } @@ -107,7 +106,7 @@ static void ext4_finish_bio(struct bio *bio) continue; } clear_buffer_async_write(bh); - if (error) + if (bio->bi_error) buffer_io_error(bh); } while ((bh = bh->b_this_page) != head); bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); @@ -310,27 +309,25 @@ ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) } /* BIO completion function for page writeback */ -static void ext4_end_bio(struct bio *bio, int error) +static void ext4_end_bio(struct bio *bio) { ext4_io_end_t *io_end = bio->bi_private; sector_t bi_sector = bio->bi_iter.bi_sector; BUG_ON(!io_end); bio->bi_end_io = NULL; - if (test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = 0; - if (error) { + if (bio->bi_error) { struct inode *inode = io_end->inode; ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " "(offset %llu size %ld starting block %llu)", - error, inode->i_ino, + bio->bi_error, inode->i_ino, (unsigned long long) io_end->offset, (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); - mapping_set_error(inode->i_mapping, error); + mapping_set_error(inode->i_mapping, bio->bi_error); } if (io_end->flag & 
EXT4_IO_END_UNWRITTEN) { @@ -357,8 +354,10 @@ void ext4_io_submit(struct ext4_io_submit *io) struct bio *bio = io->io_bio; if (bio) { + int io_op = io->io_wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC : WRITE; bio_get(io->io_bio); - submit_bio(io->io_op, io->io_bio); + submit_bio(io_op, io->io_bio); bio_put(io->io_bio); } io->io_bio = NULL; @@ -367,7 +366,7 @@ void ext4_io_submit(struct ext4_io_submit *io) void ext4_io_submit_init(struct ext4_io_submit *io, struct writeback_control *wbc) { - io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + io->io_wbc = wbc; io->io_bio = NULL; io->io_end = NULL; } @@ -375,12 +374,12 @@ void ext4_io_submit_init(struct ext4_io_submit *io, static int io_submit_init_bio(struct ext4_io_submit *io, struct buffer_head *bh) { - int nvecs = bio_get_nr_vecs(bh->b_bdev); struct bio *bio; - bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); + bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); if (!bio) return -ENOMEM; + wbc_init_bio(io->io_wbc, bio); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_end_io = ext4_end_bio; @@ -409,6 +408,7 @@ submit_and_retry: ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) goto submit_and_retry; + wbc_account_io(io->io_wbc, page, bh->b_size); io->io_next_block++; return 0; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index ec3ef93a52db..e26803fb210d 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -98,7 +98,7 @@ static inline bool ext4_bio_encrypted(struct bio *bio) * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. */ -static void mpage_end_io(struct bio *bio, int err) +static void mpage_end_io(struct bio *bio) { struct bio_vec *bv; int i; @@ -106,7 +106,7 @@ static void mpage_end_io(struct bio *bio, int err) if (ext4_bio_encrypted(bio)) { struct ext4_crypto_ctx *ctx = bio->bi_private; - if (err) { + if (bio->bi_error) { ext4_release_crypto_ctx(ctx); } else { INIT_WORK(&ctx->r.work, completion_pages); @@ -118,7 +118,7 @@ static void mpage_end_io(struct bio *bio, int err) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - if (!err) { + if (!bio->bi_error) { SetPageUptodate(page); } else { ClearPageUptodate(page); @@ -284,7 +284,7 @@ int ext4_mpage_readpages(struct address_space *mapping, goto set_error_page; } bio = bio_alloc(GFP_KERNEL, - min_t(int, nr_pages, bio_get_nr_vecs(bdev))); + min_t(int, nr_pages, BIO_MAX_PAGES)); if (!bio) { if (ctx) ext4_release_crypto_ctx(ctx); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 58987b5c514b..a63c7b0a10cf 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -60,6 +60,7 @@ static struct ext4_lazy_init *ext4_li_info; static struct mutex ext4_li_mtx; static struct ext4_features *ext4_feat; static int ext4_mballoc_ready; +static struct ratelimit_state ext4_mount_msg_ratelimit; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); @@ -84,7 +85,7 @@ static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t); -#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, .name = "ext2", @@ -100,7 
+101,6 @@ MODULE_ALIAS("ext2"); #endif -#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static struct file_system_type ext3_fs_type = { .owner = THIS_MODULE, .name = "ext3", @@ -111,9 +111,6 @@ static struct file_system_type ext3_fs_type = { MODULE_ALIAS_FS("ext3"); MODULE_ALIAS("ext3"); #define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) -#else -#define IS_EXT3_SB(sb) (0) -#endif static int ext4_verify_csum_type(struct super_block *sb, struct ext4_super_block *es) @@ -325,6 +322,22 @@ static void save_error_info(struct super_block *sb, const char *func, ext4_commit_super(sb, 1); } +/* + * The del_gendisk() function uninitializes the disk-specific data + * structures, including the bdi structure, without telling anyone + * else. Once this happens, any attempt to call mark_buffer_dirty() + * (for example, by ext4_commit_super), will cause a kernel OOPS. + * This is a kludge to prevent these oops until we can put in a proper + * hook in del_gendisk() to inform the VFS and file system layers. + */ +static int block_device_ejected(struct super_block *sb) +{ + struct inode *bd_inode = sb->s_bdev->bd_inode; + struct backing_dev_info *bdi = inode_to_bdi(bd_inode); + + return bdi->dev == NULL; +} + static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) { struct super_block *sb = journal->j_private; @@ -1394,9 +1407,9 @@ static const struct mount_opts { {Opt_stripe, 0, MOPT_GTE0}, {Opt_resuid, 0, MOPT_GTE0}, {Opt_resgid, 0, MOPT_GTE0}, - {Opt_journal_dev, 0, MOPT_GTE0}, - {Opt_journal_path, 0, MOPT_STRING}, - {Opt_journal_ioprio, 0, MOPT_GTE0}, + {Opt_journal_dev, 0, MOPT_NO_EXT2 | MOPT_GTE0}, + {Opt_journal_path, 0, MOPT_NO_EXT2 | MOPT_STRING}, + {Opt_journal_ioprio, 0, MOPT_NO_EXT2 | MOPT_GTE0}, {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, @@ -1763,10 +1776,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq, } if (sbi->s_qf_names[USRQUOTA]) - seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); + seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); if (sbi->s_qf_names[GRPQUOTA]) - seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); + seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); #endif } @@ -3643,6 +3656,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); + } else { + sb->s_iflags |= SB_I_CGROUPWB; } sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | @@ -4275,9 +4290,10 @@ no_journal: "the device does not support discard"); } - ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " - "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, - *sbi->s_es->s_mount_opts ? "; " : "", orig_data); + if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) + ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " + "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, + *sbi->s_es->s_mount_opts ? 
"; " : "", orig_data); if (es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ @@ -4617,7 +4633,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; int error = 0; - if (!sbh) + if (!sbh || block_device_ejected(sb)) return error; if (buffer_write_io_error(sbh)) { /* @@ -4665,7 +4681,8 @@ static int ext4_commit_super(struct super_block *sb, int sync) ext4_superblock_csum_set(sb); mark_buffer_dirty(sbh); if (sync) { - error = sync_dirty_buffer(sbh); + error = __sync_dirty_buffer(sbh, + test_opt(sb, BARRIER) ? WRITE_FUA : WRITE_SYNC); if (error) return error; @@ -4833,10 +4850,11 @@ static int ext4_freeze(struct super_block *sb) error = jbd2_journal_flush(journal); if (error < 0) goto out; + + /* Journal blocked and flushed, clear needs_recovery flag. */ + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); } - /* Journal blocked and flushed, clear needs_recovery flag. */ - EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); error = ext4_commit_super(sb, 1); out: if (journal) @@ -4854,8 +4872,11 @@ static int ext4_unfreeze(struct super_block *sb) if (sb->s_flags & MS_RDONLY) return 0; - /* Reset the needs_recovery flag before the fs is unlocked. */ - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + if (EXT4_SB(sb)->s_journal) { + /* Reset the needs_recovery flag before the fs is unlocked. */ + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + } + ext4_commit_super(sb, 1); return 0; } @@ -5500,7 +5521,7 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super); } -#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static inline void register_as_ext2(void) { int err = register_filesystem(&ext2_fs_type); @@ -5530,7 +5551,6 @@ static inline void unregister_as_ext2(void) { } static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; } #endif -#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static inline void register_as_ext3(void) { int err = register_filesystem(&ext3_fs_type); @@ -5556,11 +5576,6 @@ static inline int ext3_feature_set_ok(struct super_block *sb) return 0; return 1; } -#else -static inline void register_as_ext3(void) { } -static inline void unregister_as_ext3(void) { } -static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; } -#endif static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, @@ -5610,6 +5625,7 @@ static int __init ext4_init_fs(void) { int i, err; + ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64); ext4_li_info = NULL; mutex_init(&ext4_li_mtx); diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index c629762005bc..b0a9dc929f88 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -45,7 +45,7 @@ config F2FS_FS_POSIX_ACL default y help Posix Access Control Lists (ACLs) support permissions for users and - gourps beyond the owner/group/world scheme. + groups beyond the owner/group/world scheme. To learn more about Access Control Lists, visit the POSIX ACLs for Linux website <http://acl.bestbits.at/>. 
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index 396be1a39e55..08e101ed914c 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -2,6 +2,7 @@ obj-$(CONFIG_F2FS_FS) += f2fs.o f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o +f2fs-y += shrinker.o extent_cache.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index b70bbe1a6a8c..c5a38e352a80 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -69,14 +69,24 @@ repeat: fio.page = page; - if (f2fs_submit_page_bio(&fio)) + if (f2fs_submit_page_bio(&fio)) { + f2fs_put_page(page, 1); goto repeat; + } lock_page(page); if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } + + /* + * if there is any IO error when accessing device, make our filesystem + * readonly and make sure do not write checkpoint with non-uptodate + * meta page. + */ + if (unlikely(!PageUptodate(page))) + f2fs_stop_checkpoint(sbi); out: return page; } @@ -326,26 +336,18 @@ const struct address_space_operations f2fs_meta_aops = { static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { struct inode_management *im = &sbi->im[type]; - struct ino_entry *e; + struct ino_entry *e, *tmp; + + tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS); retry: - if (radix_tree_preload(GFP_NOFS)) { - cond_resched(); - goto retry; - } + radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); spin_lock(&im->ino_lock); - e = radix_tree_lookup(&im->ino_root, ino); if (!e) { - e = kmem_cache_alloc(ino_entry_slab, GFP_ATOMIC); - if (!e) { - spin_unlock(&im->ino_lock); - radix_tree_preload_end(); - goto retry; - } + e = tmp; if (radix_tree_insert(&im->ino_root, ino, e)) { spin_unlock(&im->ino_lock); - kmem_cache_free(ino_entry_slab, e); radix_tree_preload_end(); goto retry; } @@ -358,6 +360,9 @@ retry: } spin_unlock(&im->ino_lock); radix_tree_preload_end(); + + if (e != tmp) + kmem_cache_free(ino_entry_slab, tmp); } static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) @@ -458,24 +463,34 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) __remove_ino_entry(sbi, ino, ORPHAN_INO); } -static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct inode *inode = f2fs_iget(sbi->sb, ino); - f2fs_bug_on(sbi, IS_ERR(inode)); + struct inode *inode; + + inode = f2fs_iget(sbi->sb, ino); + if (IS_ERR(inode)) { + /* + * there should be a bug that we can't find the entry + * to orphan inode. 
+ */ + f2fs_bug_on(sbi, PTR_ERR(inode) == -ENOENT); + return PTR_ERR(inode); + } + clear_nlink(inode); /* truncate all the data during iput */ iput(inode); + return 0; } -void recover_orphan_inodes(struct f2fs_sb_info *sbi) +int recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blocks, i, j; + int err; if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) - return; - - set_sbi_flag(sbi, SBI_POR_DOING); + return 0; start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); @@ -489,14 +504,17 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi) orphan_blk = (struct f2fs_orphan_block *)page_address(page); for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) { nid_t ino = le32_to_cpu(orphan_blk->ino[j]); - recover_orphan_inode(sbi, ino); + err = recover_orphan_inode(sbi, ino); + if (err) { + f2fs_put_page(page, 1); + return err; + } } f2fs_put_page(page, 1); } /* clear Orphan Flag */ clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); - clear_sbi_flag(sbi, SBI_POR_DOING); - return; + return 0; } static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) @@ -504,7 +522,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) struct list_head *head; struct f2fs_orphan_block *orphan_blk = NULL; unsigned int nentries = 0; - unsigned short index; + unsigned short index = 1; unsigned short orphan_blocks; struct page *page = NULL; struct ino_entry *orphan = NULL; @@ -512,11 +530,6 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) orphan_blocks = GET_ORPHAN_BLOCKS(im->ino_num); - for (index = 0; index < orphan_blocks; index++) - grab_meta_page(sbi, start_blk + index); - - index = 1; - /* * we don't need to do spin_lock(&im->ino_lock) here, since all the * orphan inode operations are covered under f2fs_lock_op(). @@ -527,12 +540,10 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) /* loop for each orphan inode entry and write them in Jornal block */ list_for_each_entry(orphan, head, list) { if (!page) { - page = find_get_page(META_MAPPING(sbi), start_blk++); - f2fs_bug_on(sbi, !page); + page = grab_meta_page(sbi, start_blk++); orphan_blk = (struct f2fs_orphan_block *)page_address(page); memset(orphan_blk, 0, sizeof(*orphan_blk)); - f2fs_put_page(page, 0); } orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino); @@ -704,7 +715,8 @@ void update_dirty_page(struct inode *inode, struct page *page) struct inode_entry *new; int ret = 0; - if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode)) + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) return; if (!S_ISDIR(inode->i_mode)) { @@ -892,12 +904,15 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) __u32 crc32 = 0; int i; int cp_payload_blks = __cp_payload(sbi); + block_t discard_blk = NEXT_FREE_BLKADDR(sbi, curseg); + bool invalidate = false; /* * This avoids to conduct wrong roll-forward operations and uses * metapages, so should be called prior to sync_meta_pages below. 
*/ - discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg)); + if (discard_next_dnode(sbi, discard_blk)) + invalidate = true; /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { @@ -1026,6 +1041,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); + /* + * invalidate meta page which is used temporarily for zeroing out + * block at the end of warm node chain. + */ + if (invalidate) + invalidate_mapping_pages(META_MAPPING(sbi), discard_blk, + discard_blk); + release_dirty_inode(sbi); if (unlikely(f2fs_cp_error(sbi))) diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c index 95b8f936f00b..9f77de2ef317 100644 --- a/fs/f2fs/crypto_key.c +++ b/fs/f2fs/crypto_key.c @@ -92,8 +92,7 @@ static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci) if (!ci) return; - if (ci->ci_keyring_key) - key_put(ci->ci_keyring_key); + key_put(ci->ci_keyring_key); crypto_free_ablkcipher(ci->ci_ctfm); kmem_cache_free(f2fs_crypt_info_cachep, ci); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9bedfa8dd3a5..a82abe921b89 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -14,6 +14,7 @@ #include <linux/mpage.h> #include <linux/writeback.h> #include <linux/backing-dev.h> +#include <linux/pagevec.h> #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/prefetch.h> @@ -26,16 +27,13 @@ #include "trace.h" #include <trace/events/f2fs.h> -static struct kmem_cache *extent_tree_slab; -static struct kmem_cache *extent_node_slab; - -static void f2fs_read_end_io(struct bio *bio, int err) +static void f2fs_read_end_io(struct bio *bio) { struct bio_vec *bvec; int i; if (f2fs_bio_encrypted(bio)) { - if (err) { + if (bio->bi_error) { f2fs_release_crypto_ctx(bio->bi_private); } else { f2fs_end_io_crypto_work(bio->bi_private, bio); @@ -46,7 +44,7 @@ static void f2fs_read_end_io(struct bio *bio, int err) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - if (!err) { + if (!bio->bi_error) { SetPageUptodate(page); } else { ClearPageUptodate(page); @@ -57,7 +55,7 @@ static void f2fs_read_end_io(struct bio *bio, int err) bio_put(bio); } -static void f2fs_write_end_io(struct bio *bio, int err) +static void f2fs_write_end_io(struct bio *bio) { struct f2fs_sb_info *sbi = bio->bi_private; struct bio_vec *bvec; @@ -68,7 +66,7 @@ static void f2fs_write_end_io(struct bio *bio, int err) f2fs_restore_and_release_control_page(&page); - if (unlikely(err)) { + if (unlikely(bio->bi_error)) { set_page_dirty(page); set_bit(AS_EIO, &page->mapping->flags); f2fs_stop_checkpoint(sbi); @@ -92,8 +90,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, { struct bio *bio; - /* No failure on bio allocation */ - bio = bio_alloc(GFP_NOIO, npages); + bio = f2fs_bio_alloc(npages); bio->bi_bdev = sbi->sb->s_bdev; bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); @@ -158,7 +155,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { bio_put(bio); - f2fs_put_page(page, 1); return -EFAULT; } @@ -266,645 +262,17 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) return err; } -static bool lookup_extent_info(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - pgoff_t start_fofs, end_fofs; - block_t start_blkaddr; - - read_lock(&fi->ext_lock); - if (fi->ext.len == 0) { - read_unlock(&fi->ext_lock); - return false; 
- } - - stat_inc_total_hit(inode->i_sb); - - start_fofs = fi->ext.fofs; - end_fofs = fi->ext.fofs + fi->ext.len - 1; - start_blkaddr = fi->ext.blk; - - if (pgofs >= start_fofs && pgofs <= end_fofs) { - *ei = fi->ext; - stat_inc_read_hit(inode->i_sb); - read_unlock(&fi->ext_lock); - return true; - } - read_unlock(&fi->ext_lock); - return false; -} - -static bool update_extent_info(struct inode *inode, pgoff_t fofs, - block_t blkaddr) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - pgoff_t start_fofs, end_fofs; - block_t start_blkaddr, end_blkaddr; - int need_update = true; - - write_lock(&fi->ext_lock); - - start_fofs = fi->ext.fofs; - end_fofs = fi->ext.fofs + fi->ext.len - 1; - start_blkaddr = fi->ext.blk; - end_blkaddr = fi->ext.blk + fi->ext.len - 1; - - /* Drop and initialize the matched extent */ - if (fi->ext.len == 1 && fofs == start_fofs) - fi->ext.len = 0; - - /* Initial extent */ - if (fi->ext.len == 0) { - if (blkaddr != NULL_ADDR) { - fi->ext.fofs = fofs; - fi->ext.blk = blkaddr; - fi->ext.len = 1; - } - goto end_update; - } - - /* Front merge */ - if (fofs == start_fofs - 1 && blkaddr == start_blkaddr - 1) { - fi->ext.fofs--; - fi->ext.blk--; - fi->ext.len++; - goto end_update; - } - - /* Back merge */ - if (fofs == end_fofs + 1 && blkaddr == end_blkaddr + 1) { - fi->ext.len++; - goto end_update; - } - - /* Split the existing extent */ - if (fi->ext.len > 1 && - fofs >= start_fofs && fofs <= end_fofs) { - if ((end_fofs - fofs) < (fi->ext.len >> 1)) { - fi->ext.len = fofs - start_fofs; - } else { - fi->ext.fofs = fofs + 1; - fi->ext.blk = start_blkaddr + fofs - start_fofs + 1; - fi->ext.len -= fofs - start_fofs + 1; - } - } else { - need_update = false; - } - - /* Finally, if the extent is very fragmented, let's drop the cache. */ - if (fi->ext.len < F2FS_MIN_EXTENT_LEN) { - fi->ext.len = 0; - set_inode_flag(fi, FI_NO_EXTENT); - need_update = true; - } -end_update: - write_unlock(&fi->ext_lock); - return need_update; -} - -static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_info *ei, - struct rb_node *parent, struct rb_node **p) -{ - struct extent_node *en; - - en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC); - if (!en) - return NULL; - - en->ei = *ei; - INIT_LIST_HEAD(&en->list); - - rb_link_node(&en->rb_node, parent, p); - rb_insert_color(&en->rb_node, &et->root); - et->count++; - atomic_inc(&sbi->total_ext_node); - return en; -} - -static void __detach_extent_node(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_node *en) -{ - rb_erase(&en->rb_node, &et->root); - et->count--; - atomic_dec(&sbi->total_ext_node); - - if (et->cached_en == en) - et->cached_en = NULL; -} - -static struct extent_tree *__find_extent_tree(struct f2fs_sb_info *sbi, - nid_t ino) -{ - struct extent_tree *et; - - down_read(&sbi->extent_tree_lock); - et = radix_tree_lookup(&sbi->extent_tree_root, ino); - if (!et) { - up_read(&sbi->extent_tree_lock); - return NULL; - } - atomic_inc(&et->refcount); - up_read(&sbi->extent_tree_lock); - - return et; -} - -static struct extent_tree *__grab_extent_tree(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et; - nid_t ino = inode->i_ino; - - down_write(&sbi->extent_tree_lock); - et = radix_tree_lookup(&sbi->extent_tree_root, ino); - if (!et) { - et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS); - f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); - memset(et, 0, sizeof(struct extent_tree)); - et->ino = ino; - et->root = 
RB_ROOT; - et->cached_en = NULL; - rwlock_init(&et->lock); - atomic_set(&et->refcount, 0); - et->count = 0; - sbi->total_ext_tree++; - } - atomic_inc(&et->refcount); - up_write(&sbi->extent_tree_lock); - - return et; -} - -static struct extent_node *__lookup_extent_tree(struct extent_tree *et, - unsigned int fofs) -{ - struct rb_node *node = et->root.rb_node; - struct extent_node *en; - - if (et->cached_en) { - struct extent_info *cei = &et->cached_en->ei; - - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) - return et->cached_en; - } - - while (node) { - en = rb_entry(node, struct extent_node, rb_node); - - if (fofs < en->ei.fofs) { - node = node->rb_left; - } else if (fofs >= en->ei.fofs + en->ei.len) { - node = node->rb_right; - } else { - et->cached_en = en; - return en; - } - } - return NULL; -} - -static struct extent_node *__try_back_merge(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_node *en) -{ - struct extent_node *prev; - struct rb_node *node; - - node = rb_prev(&en->rb_node); - if (!node) - return NULL; - - prev = rb_entry(node, struct extent_node, rb_node); - if (__is_back_mergeable(&en->ei, &prev->ei)) { - en->ei.fofs = prev->ei.fofs; - en->ei.blk = prev->ei.blk; - en->ei.len += prev->ei.len; - __detach_extent_node(sbi, et, prev); - return prev; - } - return NULL; -} - -static struct extent_node *__try_front_merge(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_node *en) -{ - struct extent_node *next; - struct rb_node *node; - - node = rb_next(&en->rb_node); - if (!node) - return NULL; - - next = rb_entry(node, struct extent_node, rb_node); - if (__is_front_mergeable(&en->ei, &next->ei)) { - en->ei.len += next->ei.len; - __detach_extent_node(sbi, et, next); - return next; - } - return NULL; -} - -static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_info *ei, - struct extent_node **den) -{ - struct rb_node **p = &et->root.rb_node; - struct rb_node *parent = NULL; - struct extent_node *en; - - while (*p) { - parent = *p; - en = rb_entry(parent, struct extent_node, rb_node); - - if (ei->fofs < en->ei.fofs) { - if (__is_front_mergeable(ei, &en->ei)) { - f2fs_bug_on(sbi, !den); - en->ei.fofs = ei->fofs; - en->ei.blk = ei->blk; - en->ei.len += ei->len; - *den = __try_back_merge(sbi, et, en); - return en; - } - p = &(*p)->rb_left; - } else if (ei->fofs >= en->ei.fofs + en->ei.len) { - if (__is_back_mergeable(ei, &en->ei)) { - f2fs_bug_on(sbi, !den); - en->ei.len += ei->len; - *den = __try_front_merge(sbi, et, en); - return en; - } - p = &(*p)->rb_right; - } else { - f2fs_bug_on(sbi, 1); - } - } - - return __attach_extent_node(sbi, et, ei, parent, p); -} - -static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, bool free_all) +int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) { - struct rb_node *node, *next; - struct extent_node *en; - unsigned int count = et->count; - - node = rb_first(&et->root); - while (node) { - next = rb_next(node); - en = rb_entry(node, struct extent_node, rb_node); - - if (free_all) { - spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) - list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); - } - - if (free_all || list_empty(&en->list)) { - __detach_extent_node(sbi, et, en); - kmem_cache_free(extent_node_slab, en); - } - node = next; - } - - return count - et->count; -} - -static void f2fs_init_extent_tree(struct inode *inode, - struct f2fs_extent *i_ext) -{ - struct f2fs_sb_info *sbi = 
F2FS_I_SB(inode); - struct extent_tree *et; - struct extent_node *en; struct extent_info ei; + struct inode *inode = dn->inode; - if (le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN) - return; - - et = __grab_extent_tree(inode); - - write_lock(&et->lock); - if (et->count) - goto out; - - set_extent_info(&ei, le32_to_cpu(i_ext->fofs), - le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len)); - - en = __insert_extent_tree(sbi, et, &ei, NULL); - if (en) { - et->cached_en = en; - - spin_lock(&sbi->extent_lock); - list_add_tail(&en->list, &sbi->extent_list); - spin_unlock(&sbi->extent_lock); - } -out: - write_unlock(&et->lock); - atomic_dec(&et->refcount); -} - -static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et; - struct extent_node *en; - - trace_f2fs_lookup_extent_tree_start(inode, pgofs); - - et = __find_extent_tree(sbi, inode->i_ino); - if (!et) - return false; - - read_lock(&et->lock); - en = __lookup_extent_tree(et, pgofs); - if (en) { - *ei = en->ei; - spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) - list_move_tail(&en->list, &sbi->extent_list); - spin_unlock(&sbi->extent_lock); - stat_inc_read_hit(sbi->sb); - } - stat_inc_total_hit(sbi->sb); - read_unlock(&et->lock); - - trace_f2fs_lookup_extent_tree_end(inode, pgofs, en); - - atomic_dec(&et->refcount); - return en ? true : false; -} - -static void f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs, - block_t blkaddr) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et; - struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL; - struct extent_node *den = NULL; - struct extent_info ei, dei; - unsigned int endofs; - - trace_f2fs_update_extent_tree(inode, fofs, blkaddr); - - et = __grab_extent_tree(inode); - - write_lock(&et->lock); - - /* 1. lookup and remove existing extent info in cache */ - en = __lookup_extent_tree(et, fofs); - if (!en) - goto update_extent; - - dei = en->ei; - __detach_extent_node(sbi, et, en); - - /* 2. if extent can be split more, split and insert the left part */ - if (dei.len > 1) { - /* insert left part of split extent into cache */ - if (fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN) { - set_extent_info(&ei, dei.fofs, dei.blk, - fofs - dei.fofs); - en1 = __insert_extent_tree(sbi, et, &ei, NULL); - } - - /* insert right part of split extent into cache */ - endofs = dei.fofs + dei.len - 1; - if (endofs - fofs >= F2FS_MIN_EXTENT_LEN) { - set_extent_info(&ei, fofs + 1, - fofs - dei.fofs + dei.blk, endofs - fofs); - en2 = __insert_extent_tree(sbi, et, &ei, NULL); - } - } - -update_extent: - /* 3. update extent in extent cache */ - if (blkaddr) { - set_extent_info(&ei, fofs, blkaddr, 1); - en3 = __insert_extent_tree(sbi, et, &ei, &den); - } - - /* 4. update in global extent list */ - spin_lock(&sbi->extent_lock); - if (en && !list_empty(&en->list)) - list_del(&en->list); - /* - * en1 and en2 split from en, they will become more and more smaller - * fragments after splitting several times. So if the length is smaller - * than F2FS_MIN_EXTENT_LEN, we will not add them into extent tree. 
- */ - if (en1) - list_add_tail(&en1->list, &sbi->extent_list); - if (en2) - list_add_tail(&en2->list, &sbi->extent_list); - if (en3) { - if (list_empty(&en3->list)) - list_add_tail(&en3->list, &sbi->extent_list); - else - list_move_tail(&en3->list, &sbi->extent_list); - } - if (den && !list_empty(&den->list)) - list_del(&den->list); - spin_unlock(&sbi->extent_lock); - - /* 5. release extent node */ - if (en) - kmem_cache_free(extent_node_slab, en); - if (den) - kmem_cache_free(extent_node_slab, den); - - write_unlock(&et->lock); - atomic_dec(&et->refcount); -} - -void f2fs_preserve_extent_tree(struct inode *inode) -{ - struct extent_tree *et; - struct extent_info *ext = &F2FS_I(inode)->ext; - bool sync = false; - - if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE)) - return; - - et = __find_extent_tree(F2FS_I_SB(inode), inode->i_ino); - if (!et) { - if (ext->len) { - ext->len = 0; - update_inode_page(inode); - } - return; - } - - read_lock(&et->lock); - if (et->count) { - struct extent_node *en; - - if (et->cached_en) { - en = et->cached_en; - } else { - struct rb_node *node = rb_first(&et->root); - - if (!node) - node = rb_last(&et->root); - en = rb_entry(node, struct extent_node, rb_node); - } - - if (__is_extent_same(ext, &en->ei)) - goto out; - - *ext = en->ei; - sync = true; - } else if (ext->len) { - ext->len = 0; - sync = true; - } -out: - read_unlock(&et->lock); - atomic_dec(&et->refcount); - - if (sync) - update_inode_page(inode); -} - -void f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) -{ - struct extent_tree *treevec[EXT_TREE_VEC_SIZE]; - struct extent_node *en, *tmp; - unsigned long ino = F2FS_ROOT_INO(sbi); - struct radix_tree_iter iter; - void **slot; - unsigned int found; - unsigned int node_cnt = 0, tree_cnt = 0; - - if (!test_opt(sbi, EXTENT_CACHE)) - return; - - if (available_free_memory(sbi, EXTENT_CACHE)) - return; - - spin_lock(&sbi->extent_lock); - list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) { - if (!nr_shrink--) - break; - list_del_init(&en->list); - } - spin_unlock(&sbi->extent_lock); - - down_read(&sbi->extent_tree_lock); - while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root, - (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { - unsigned i; - - ino = treevec[found - 1]->ino + 1; - for (i = 0; i < found; i++) { - struct extent_tree *et = treevec[i]; - - atomic_inc(&et->refcount); - write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et, false); - write_unlock(&et->lock); - atomic_dec(&et->refcount); - } - } - up_read(&sbi->extent_tree_lock); - - down_write(&sbi->extent_tree_lock); - radix_tree_for_each_slot(slot, &sbi->extent_tree_root, &iter, - F2FS_ROOT_INO(sbi)) { - struct extent_tree *et = (struct extent_tree *)*slot; - - if (!atomic_read(&et->refcount) && !et->count) { - radix_tree_delete(&sbi->extent_tree_root, et->ino); - kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; - tree_cnt++; - } - } - up_write(&sbi->extent_tree_lock); - - trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); -} - -void f2fs_destroy_extent_tree(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et; - unsigned int node_cnt = 0; - - if (!test_opt(sbi, EXTENT_CACHE)) - return; - - et = __find_extent_tree(sbi, inode->i_ino); - if (!et) - goto out; - - /* free all extent info belong to this extent tree */ - write_lock(&et->lock); - node_cnt = __free_extent_tree(sbi, et, true); - write_unlock(&et->lock); - - atomic_dec(&et->refcount); - - /* try to find and delete extent tree entry in 
radix tree */ - down_write(&sbi->extent_tree_lock); - et = radix_tree_lookup(&sbi->extent_tree_root, inode->i_ino); - if (!et) { - up_write(&sbi->extent_tree_lock); - goto out; + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn->data_blkaddr = ei.blk + index - ei.fofs; + return 0; } - f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count); - radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); - kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; - up_write(&sbi->extent_tree_lock); -out: - trace_f2fs_destroy_extent_tree(inode, node_cnt); - return; -} - -void f2fs_init_extent_cache(struct inode *inode, struct f2fs_extent *i_ext) -{ - if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE)) - f2fs_init_extent_tree(inode, i_ext); - - write_lock(&F2FS_I(inode)->ext_lock); - get_extent_info(&F2FS_I(inode)->ext, *i_ext); - write_unlock(&F2FS_I(inode)->ext_lock); -} -static bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) -{ - if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) - return false; - - if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE)) - return f2fs_lookup_extent_tree(inode, pgofs, ei); - - return lookup_extent_info(inode, pgofs, ei); -} - -void f2fs_update_extent_cache(struct dnode_of_data *dn) -{ - struct f2fs_inode_info *fi = F2FS_I(dn->inode); - pgoff_t fofs; - - f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR); - - if (is_inode_flag_set(fi, FI_NO_EXTENT)) - return; - - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + - dn->ofs_in_node; - - if (test_opt(F2FS_I_SB(dn->inode), EXTENT_CACHE)) - return f2fs_update_extent_tree(dn->inode, fofs, - dn->data_blkaddr); - - if (update_extent_info(dn->inode, fofs, dn->data_blkaddr)) - sync_inode_page(dn); + return f2fs_reserve_block(dn, index); } struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw) @@ -935,15 +303,13 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw) set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, index, LOOKUP_NODE); - if (err) { - f2fs_put_page(page, 1); - return ERR_PTR(err); - } + if (err) + goto put_err; f2fs_put_dnode(&dn); if (unlikely(dn.data_blkaddr == NULL_ADDR)) { - f2fs_put_page(page, 1); - return ERR_PTR(-ENOENT); + err = -ENOENT; + goto put_err; } got_it: if (PageUptodate(page)) { @@ -968,8 +334,12 @@ got_it: fio.page = page; err = f2fs_submit_page_bio(&fio); if (err) - return ERR_PTR(err); + goto put_err; return page; + +put_err: + f2fs_put_page(page, 1); + return ERR_PTR(err); } struct page *find_data_page(struct inode *inode, pgoff_t index) @@ -1030,7 +400,8 @@ repeat: * * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). - * Note that, ipage is set only by make_empty_dir. + * Note that, ipage is set only by make_empty_dir, and if any error occur, + * ipage should be released by this function. */ struct page *get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size) @@ -1041,8 +412,14 @@ struct page *get_new_data_page(struct inode *inode, int err; repeat: page = grab_cache_page(mapping, index); - if (!page) + if (!page) { + /* + * before exiting, we should make sure ipage will be released + * if any error occur. 
+ */ + f2fs_put_page(ipage, 1); return ERR_PTR(-ENOMEM); + } set_new_dnode(&dn, inode, ipage, NULL, 0); err = f2fs_reserve_block(&dn, index); @@ -1107,8 +484,6 @@ alloc: allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, &sum, seg); - - /* direct IO doesn't use extent cache to maximize the performance */ set_data_blkaddr(dn); /* update i_size */ @@ -1117,6 +492,9 @@ alloc: if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT)) i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT)); + /* direct IO doesn't use extent cache to maximize the performance */ + f2fs_drop_largest_extent(dn->inode, fofs); + return 0; } @@ -1183,7 +561,7 @@ out: * c. give the block addresses to blockdev */ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, - int create, bool fiemap) + int create, int flag) { unsigned int maxblocks = map->m_len; struct dnode_of_data dn; @@ -1217,8 +595,19 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, err = 0; goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR && !fiemap) - goto put_out; + if (dn.data_blkaddr == NEW_ADDR) { + if (flag == F2FS_GET_BLOCK_BMAP) { + err = -ENOENT; + goto put_out; + } else if (flag == F2FS_GET_BLOCK_READ || + flag == F2FS_GET_BLOCK_DIO) { + goto put_out; + } + /* + * if it is in fiemap call path (flag = F2FS_GET_BLOCK_FIEMAP), + * mark it as mapped and unwritten block. + */ + } if (dn.data_blkaddr != NULL_ADDR) { map->m_flags = F2FS_MAP_MAPPED; @@ -1233,6 +622,8 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, map->m_flags = F2FS_MAP_NEW | F2FS_MAP_MAPPED; map->m_pblk = dn.data_blkaddr; } else { + if (flag == F2FS_GET_BLOCK_BMAP) + err = -ENOENT; goto put_out; } @@ -1255,7 +646,9 @@ get_next: err = 0; goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR && !fiemap) + + if (dn.data_blkaddr == NEW_ADDR && + flag != F2FS_GET_BLOCK_FIEMAP) goto put_out; end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); @@ -1297,7 +690,7 @@ out: } static int __get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create, bool fiemap) + struct buffer_head *bh, int create, int flag) { struct f2fs_map_blocks map; int ret; @@ -1305,7 +698,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock, map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; - ret = f2fs_map_blocks(inode, &map, create, fiemap); + ret = f2fs_map_blocks(inode, &map, create, flag); if (!ret) { map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; @@ -1315,15 +708,23 @@ static int __get_data_block(struct inode *inode, sector_t iblock, } static int get_data_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create, int flag) +{ + return __get_data_block(inode, iblock, bh_result, create, flag); +} + +static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - return __get_data_block(inode, iblock, bh_result, create, false); + return __get_data_block(inode, iblock, bh_result, create, + F2FS_GET_BLOCK_DIO); } -static int get_data_block_fiemap(struct inode *inode, sector_t iblock, +static int get_data_block_bmap(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - return __get_data_block(inode, iblock, bh_result, create, true); + return __get_data_block(inode, iblock, bh_result, create, + F2FS_GET_BLOCK_BMAP); } static inline sector_t logical_to_blk(struct inode *inode, loff_t 
offset) @@ -1367,7 +768,8 @@ next: memset(&map_bh, 0, sizeof(struct buffer_head)); map_bh.b_size = len; - ret = get_data_block_fiemap(inode, start_blk, &map_bh, 0); + ret = get_data_block(inode, start_blk, &map_bh, 0, + F2FS_GET_BLOCK_FIEMAP); if (ret) goto out; @@ -1552,7 +954,7 @@ submit_and_realloc: } bio = bio_alloc(GFP_KERNEL, - min_t(int, nr_pages, bio_get_nr_vecs(bdev))); + min_t(int, nr_pages, BIO_MAX_PAGES)); if (!bio) { if (ctx) f2fs_release_crypto_ctx(ctx); @@ -1770,6 +1172,137 @@ static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, return ret; } +/* + * This function was copied from write_cche_pages from mm/page-writeback.c. + * The major change is making write step of cold data page separately from + * warm/hot data page. + */ +static int f2fs_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, writepage_t writepage, + void *data) +{ + int ret = 0; + int done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t uninitialized_var(writeback_index); + pgoff_t index; + pgoff_t end; /* Inclusive */ + pgoff_t done_index; + int cycled; + int range_whole = 0; + int tag; + int step = 0; + + pagevec_init(&pvec, 0); +next: + if (wbc->range_cyclic) { + writeback_index = mapping->writeback_index; /* prev offset */ + index = writeback_index; + if (index == 0) + cycled = 1; + else + cycled = 0; + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + cycled = 1; /* ignore range_cyclic tests */ + } + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; +retry: + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, index, end); + done_index = index; + while (!done && (index <= end)) { + int i; + + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (page->index > end) { + done = 1; + break; + } + + done_index = page->index; + + lock_page(page); + + if (unlikely(page->mapping != mapping)) { +continue_unlock: + unlock_page(page); + continue; + } + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (step == is_cold_data(page)) + goto continue_unlock; + + if (PageWriteback(page)) { + if (wbc->sync_mode != WB_SYNC_NONE) + f2fs_wait_on_page_writeback(page, DATA); + else + goto continue_unlock; + } + + BUG_ON(PageWriteback(page)); + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + ret = (*writepage)(page, wbc, data); + if (unlikely(ret)) { + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + } else { + done_index = page->index + 1; + done = 1; + break; + } + } + + if (--wbc->nr_to_write <= 0 && + wbc->sync_mode == WB_SYNC_NONE) { + done = 1; + break; + } + } + pagevec_release(&pvec); + cond_resched(); + } + + if (step < 1) { + step++; + goto next; + } + + if (!cycled && !done) { + cycled = 1; + index = 0; + end = writeback_index - 1; + goto retry; + } + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = done_index; + + return ret; +} + static int f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1785,6 +1318,10 @@ static int f2fs_write_data_pages(struct address_space 
*mapping, if (!mapping->a_ops->writepage) return 0; + /* skip writing if there is no dirty page in this inode */ + if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) + return 0; + if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && available_free_memory(sbi, DIRTY_DENTS)) @@ -1800,12 +1337,11 @@ static int f2fs_write_data_pages(struct address_space *mapping, mutex_lock(&sbi->writepages); locked = true; } - ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); + ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); + f2fs_submit_merged_bio(sbi, DATA, WRITE); if (locked) mutex_unlock(&sbi->writepages); - f2fs_submit_merged_bio(sbi, DATA, WRITE); - remove_dirty_dir_inode(inode); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); @@ -1832,7 +1368,8 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct page *page, *ipage; + struct page *page = NULL; + struct page *ipage; pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; struct dnode_of_data dn; int err = 0; @@ -1882,25 +1419,28 @@ repeat: if (err) goto put_fail; } - err = f2fs_reserve_block(&dn, index); + + err = f2fs_get_block(&dn, index); if (err) goto put_fail; put_next: f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); - if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) - return 0; - f2fs_wait_on_page_writeback(page, DATA); + if (len == PAGE_CACHE_SIZE) + goto out_update; + if (PageUptodate(page)) + goto out_clear; + if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { unsigned start = pos & (PAGE_CACHE_SIZE - 1); unsigned end = start + len; /* Reading beyond i_size is simple: memset to zero */ zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); - goto out; + goto out_update; } if (dn.data_blkaddr == NEW_ADDR) { @@ -1920,7 +1460,6 @@ put_next: lock_page(page); if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 1); err = -EIO; goto fail; } @@ -1932,14 +1471,13 @@ put_next: /* avoid symlink page */ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { err = f2fs_decrypt_one(inode, page); - if (err) { - f2fs_put_page(page, 1); + if (err) goto fail; - } } } -out: +out_update: SetPageUptodate(page); +out_clear: clear_cold_data(page); return 0; @@ -1947,8 +1485,8 @@ put_fail: f2fs_put_dnode(&dn); unlock_fail: f2fs_unlock_op(sbi); - f2fs_put_page(page, 1); fail: + f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); return err; } @@ -1979,9 +1517,6 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter, { unsigned blocksize_mask = inode->i_sb->s_blocksize - 1; - if (iov_iter_rw(iter) == READ) - return 0; - if (offset & blocksize_mask) return -EINVAL; @@ -2010,15 +1545,16 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) return 0; - if (check_direct_IO(inode, iter, offset)) - return 0; + err = check_direct_IO(inode, iter, offset); + if (err) + return err; trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); if (iov_iter_rw(iter) == WRITE) __allocate_data_blocks(inode, offset, count); - err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block); + err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); if (err < 0 && iov_iter_rw(iter) == WRITE) f2fs_write_failed(mapping, offset + count); @@ -2045,6 +1581,11 @@ void 
f2fs_invalidate_page(struct page *page, unsigned int offset, else inode_dec_dirty_pages(inode); } + + /* This is atomic written page, keep Private */ + if (IS_ATOMIC_WRITTEN_PAGE(page)) + return; + ClearPagePrivate(page); } @@ -2054,6 +1595,10 @@ int f2fs_release_page(struct page *page, gfp_t wait) if (PageDirty(page)) return 0; + /* This is atomic written page, keep Private */ + if (IS_ATOMIC_WRITTEN_PAGE(page)) + return 0; + ClearPagePrivate(page); return 1; } @@ -2068,12 +1613,17 @@ static int f2fs_set_data_page_dirty(struct page *page) SetPageUptodate(page); if (f2fs_is_atomic_file(inode)) { - register_inmem_page(inode, page); - return 1; + if (!IS_ATOMIC_WRITTEN_PAGE(page)) { + register_inmem_page(inode, page); + return 1; + } + /* + * Previously, this page has been registered, we just + * return here. + */ + return 0; } - mark_inode_dirty(inode); - if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); update_dirty_page(inode, page); @@ -2092,38 +1642,7 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) if (err) return err; } - return generic_block_bmap(mapping, block, get_data_block); -} - -void init_extent_cache_info(struct f2fs_sb_info *sbi) -{ - INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); - init_rwsem(&sbi->extent_tree_lock); - INIT_LIST_HEAD(&sbi->extent_list); - spin_lock_init(&sbi->extent_lock); - sbi->total_ext_tree = 0; - atomic_set(&sbi->total_ext_node, 0); -} - -int __init create_extent_cache(void) -{ - extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree", - sizeof(struct extent_tree)); - if (!extent_tree_slab) - return -ENOMEM; - extent_node_slab = f2fs_kmem_cache_create("f2fs_extent_node", - sizeof(struct extent_node)); - if (!extent_node_slab) { - kmem_cache_destroy(extent_tree_slab); - return -ENOMEM; - } - return 0; -} - -void destroy_extent_cache(void) -{ - kmem_cache_destroy(extent_node_slab); - kmem_cache_destroy(extent_tree_slab); + return generic_block_bmap(mapping, block, get_data_block_bmap); } const struct address_space_operations f2fs_dblock_aops = { diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 75176e0dd6c8..d013d8479753 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -33,8 +33,11 @@ static void update_general_status(struct f2fs_sb_info *sbi) int i; /* validation check of the segment numbers */ - si->hit_ext = sbi->read_hit_ext; - si->total_ext = sbi->total_hit_ext; + si->hit_largest = atomic_read(&sbi->read_hit_largest); + si->hit_cached = atomic_read(&sbi->read_hit_cached); + si->hit_rbtree = atomic_read(&sbi->read_hit_rbtree); + si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; + si->total_ext = atomic_read(&sbi->total_hit_ext); si->ext_tree = sbi->total_ext_tree; si->ext_node = atomic_read(&sbi->total_ext_node); si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); @@ -49,6 +52,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->valid_count = valid_user_blocks(sbi); si->valid_node_count = valid_node_count(sbi); si->valid_inode_count = valid_inode_count(sbi); + si->inline_xattr = atomic_read(&sbi->inline_xattr); si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); si->utilization = utilization(sbi); @@ -226,6 +230,8 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "Other: %u)\n - Data: %u\n", si->valid_node_count - si->valid_inode_count, si->valid_count - si->valid_node_count); + seq_printf(s, " - Inline_xattr Inode: %u\n", + si->inline_xattr); seq_printf(s, " - Inline_data Inode: %u\n", 
si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", @@ -276,10 +282,16 @@ static int stat_show(struct seq_file *s, void *v) si->bg_data_blks); seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, si->bg_node_blks); - seq_printf(s, "\nExtent Hit Ratio: %d / %d\n", - si->hit_ext, si->total_ext); - seq_printf(s, "\nExtent Tree Count: %d\n", si->ext_tree); - seq_printf(s, "\nExtent Node Count: %d\n", si->ext_node); + seq_puts(s, "\nExtent Cache:\n"); + seq_printf(s, " - Hit Count: L1-1:%d L1-2:%d L2:%d\n", + si->hit_largest, si->hit_cached, + si->hit_rbtree); + seq_printf(s, " - Hit Ratio: %d%% (%d / %d)\n", + !si->total_ext ? 0 : + (si->hit_total * 100) / si->total_ext, + si->hit_total, si->total_ext); + seq_printf(s, " - Inner Struct Count: tree: %d, node: %d\n", + si->ext_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - inmem: %4d, wb: %4d\n", si->inmem_pages, si->wb_pages); @@ -366,6 +378,12 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) si->sbi = sbi; sbi->stat_info = si; + atomic_set(&sbi->total_hit_ext, 0); + atomic_set(&sbi->read_hit_rbtree, 0); + atomic_set(&sbi->read_hit_largest, 0); + atomic_set(&sbi->read_hit_cached, 0); + + atomic_set(&sbi->inline_xattr, 0); atomic_set(&sbi->inline_inode, 0); atomic_set(&sbi->inline_dir, 0); atomic_set(&sbi->inplace_count, 0); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index a34ebd8312ab..8f15fc134040 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -718,8 +718,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (inode) f2fs_drop_nlink(dir, inode, NULL); - if (bit_pos == NR_DENTRY_IN_BLOCK) { - truncate_hole(dir, page->index, page->index + 1); + if (bit_pos == NR_DENTRY_IN_BLOCK && + !truncate_hole(dir, page->index, page->index + 1)) { clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c new file mode 100644 index 000000000000..997ac86f2a1d --- /dev/null +++ b/fs/f2fs/extent_cache.c @@ -0,0 +1,791 @@ +/* + * f2fs extent cache support + * + * Copyright (c) 2015 Motorola Mobility + * Copyright (c) 2015 Samsung Electronics + * Authors: Jaegeuk Kim <jaegeuk@kernel.org> + * Chao Yu <chao2.yu@samsung.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/fs.h> +#include <linux/f2fs_fs.h> + +#include "f2fs.h" +#include "node.h" +#include <trace/events/f2fs.h> + +static struct kmem_cache *extent_tree_slab; +static struct kmem_cache *extent_node_slab; + +static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct rb_node *parent, struct rb_node **p) +{ + struct extent_node *en; + + en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC); + if (!en) + return NULL; + + en->ei = *ei; + INIT_LIST_HEAD(&en->list); + + rb_link_node(&en->rb_node, parent, p); + rb_insert_color(&en->rb_node, &et->root); + et->count++; + atomic_inc(&sbi->total_ext_node); + return en; +} + +static void __detach_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + rb_erase(&en->rb_node, &et->root); + et->count--; + atomic_dec(&sbi->total_ext_node); + + if (et->cached_en == en) + et->cached_en = NULL; +} + +static struct extent_tree *__grab_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et; + nid_t ino = inode->i_ino; + + down_write(&sbi->extent_tree_lock); + et = radix_tree_lookup(&sbi->extent_tree_root, ino); + if (!et) { + et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS); + f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); + memset(et, 0, sizeof(struct extent_tree)); + et->ino = ino; + et->root = RB_ROOT; + et->cached_en = NULL; + rwlock_init(&et->lock); + atomic_set(&et->refcount, 0); + et->count = 0; + sbi->total_ext_tree++; + } + atomic_inc(&et->refcount); + up_write(&sbi->extent_tree_lock); + + /* never died until evict_inode */ + F2FS_I(inode)->extent_tree = et; + + return et; +} + +static struct extent_node *__lookup_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et, unsigned int fofs) +{ + struct rb_node *node = et->root.rb_node; + struct extent_node *en = et->cached_en; + + if (en) { + struct extent_info *cei = &en->ei; + + if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) { + stat_inc_cached_node_hit(sbi); + return en; + } + } + + while (node) { + en = rb_entry(node, struct extent_node, rb_node); + + if (fofs < en->ei.fofs) { + node = node->rb_left; + } else if (fofs >= en->ei.fofs + en->ei.len) { + node = node->rb_right; + } else { + stat_inc_rbtree_node_hit(sbi); + return en; + } + } + return NULL; +} + +static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei) +{ + struct rb_node **p = &et->root.rb_node; + struct extent_node *en; + + en = __attach_extent_node(sbi, et, ei, NULL, p); + if (!en) + return NULL; + + et->largest = en->ei; + et->cached_en = en; + return en; +} + +static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et, bool free_all) +{ + struct rb_node *node, *next; + struct extent_node *en; + unsigned int count = et->count; + + node = rb_first(&et->root); + while (node) { + next = rb_next(node); + en = rb_entry(node, struct extent_node, rb_node); + + if (free_all) { + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) + list_del_init(&en->list); + spin_unlock(&sbi->extent_lock); + } + + if (free_all || list_empty(&en->list)) { + __detach_extent_node(sbi, et, en); + kmem_cache_free(extent_node_slab, en); + } + node = next; + } + + return count - et->count; +} + +static void __drop_largest_extent(struct inode *inode, pgoff_t fofs) +{ + struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest; + + if (largest->fofs 
<= fofs && largest->fofs + largest->len > fofs) + largest->len = 0; +} + +void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs) +{ + if (!f2fs_may_extent_tree(inode)) + return; + + __drop_largest_extent(inode, fofs); +} + +void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et; + struct extent_node *en; + struct extent_info ei; + + if (!f2fs_may_extent_tree(inode)) + return; + + et = __grab_extent_tree(inode); + + if (!i_ext || le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN) + return; + + set_extent_info(&ei, le32_to_cpu(i_ext->fofs), + le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len)); + + write_lock(&et->lock); + if (et->count) + goto out; + + en = __init_extent_tree(sbi, et, &ei); + if (en) { + spin_lock(&sbi->extent_lock); + list_add_tail(&en->list, &sbi->extent_list); + spin_unlock(&sbi->extent_lock); + } +out: + write_unlock(&et->lock); +} + +static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_node *en; + bool ret = false; + + f2fs_bug_on(sbi, !et); + + trace_f2fs_lookup_extent_tree_start(inode, pgofs); + + read_lock(&et->lock); + + if (et->largest.fofs <= pgofs && + et->largest.fofs + et->largest.len > pgofs) { + *ei = et->largest; + ret = true; + stat_inc_largest_node_hit(sbi); + goto out; + } + + en = __lookup_extent_tree(sbi, et, pgofs); + if (en) { + *ei = en->ei; + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) + list_move_tail(&en->list, &sbi->extent_list); + et->cached_en = en; + spin_unlock(&sbi->extent_lock); + ret = true; + } +out: + stat_inc_total_hit(sbi); + read_unlock(&et->lock); + + trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei); + return ret; +} + + +/* + * lookup extent at @fofs, if hit, return the extent + * if not, return NULL and + * @prev_ex: extent before fofs + * @next_ex: extent after fofs + * @insert_p: insert point for new extent at fofs + * in order to simpfy the insertion after. + * tree must stay unchanged between lookup and insertion. + */ +static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, + unsigned int fofs, + struct extent_node **prev_ex, + struct extent_node **next_ex, + struct rb_node ***insert_p, + struct rb_node **insert_parent) +{ + struct rb_node **pnode = &et->root.rb_node; + struct rb_node *parent = NULL, *tmp_node; + struct extent_node *en = et->cached_en; + + *insert_p = NULL; + *insert_parent = NULL; + *prev_ex = NULL; + *next_ex = NULL; + + if (RB_EMPTY_ROOT(&et->root)) + return NULL; + + if (en) { + struct extent_info *cei = &en->ei; + + if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) + goto lookup_neighbors; + } + + while (*pnode) { + parent = *pnode; + en = rb_entry(*pnode, struct extent_node, rb_node); + + if (fofs < en->ei.fofs) + pnode = &(*pnode)->rb_left; + else if (fofs >= en->ei.fofs + en->ei.len) + pnode = &(*pnode)->rb_right; + else + goto lookup_neighbors; + } + + *insert_p = pnode; + *insert_parent = parent; + + en = rb_entry(parent, struct extent_node, rb_node); + tmp_node = parent; + if (parent && fofs > en->ei.fofs) + tmp_node = rb_next(parent); + *next_ex = tmp_node ? + rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + + tmp_node = parent; + if (parent && fofs < en->ei.fofs) + tmp_node = rb_prev(parent); + *prev_ex = tmp_node ? 
+ rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + return NULL; + +lookup_neighbors: + if (fofs == en->ei.fofs) { + /* lookup prev node for merging backward later */ + tmp_node = rb_prev(&en->rb_node); + *prev_ex = tmp_node ? + rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + } + if (fofs == en->ei.fofs + en->ei.len - 1) { + /* lookup next node for merging frontward later */ + tmp_node = rb_next(&en->rb_node); + *next_ex = tmp_node ? + rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + } + return en; +} + +static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct extent_node **den, + struct extent_node *prev_ex, + struct extent_node *next_ex) +{ + struct extent_node *en = NULL; + + if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) { + prev_ex->ei.len += ei->len; + ei = &prev_ex->ei; + en = prev_ex; + } + + if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { + if (en) { + __detach_extent_node(sbi, et, prev_ex); + *den = prev_ex; + } + next_ex->ei.fofs = ei->fofs; + next_ex->ei.blk = ei->blk; + next_ex->ei.len += ei->len; + en = next_ex; + } + + if (en) { + if (en->ei.len > et->largest.len) + et->largest = en->ei; + et->cached_en = en; + } + return en; +} + +static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct rb_node **insert_p, + struct rb_node *insert_parent) +{ + struct rb_node **p = &et->root.rb_node; + struct rb_node *parent = NULL; + struct extent_node *en = NULL; + + if (insert_p && insert_parent) { + parent = insert_parent; + p = insert_p; + goto do_insert; + } + + while (*p) { + parent = *p; + en = rb_entry(parent, struct extent_node, rb_node); + + if (ei->fofs < en->ei.fofs) + p = &(*p)->rb_left; + else if (ei->fofs >= en->ei.fofs + en->ei.len) + p = &(*p)->rb_right; + else + f2fs_bug_on(sbi, 1); + } +do_insert: + en = __attach_extent_node(sbi, et, ei, parent, p); + if (!en) + return NULL; + + if (en->ei.len > et->largest.len) + et->largest = en->ei; + et->cached_en = en; + return en; +} + +unsigned int f2fs_update_extent_tree_range(struct inode *inode, + pgoff_t fofs, block_t blkaddr, unsigned int len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL; + struct extent_node *prev_en = NULL, *next_en = NULL; + struct extent_info ei, dei, prev; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + unsigned int end = fofs + len; + unsigned int pos = (unsigned int)fofs; + + if (!et) + return false; + + write_lock(&et->lock); + + if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) { + write_unlock(&et->lock); + return false; + } + + prev = et->largest; + dei.len = 0; + + /* we do not guarantee that the largest extent is cached all the time */ + __drop_largest_extent(inode, fofs); + + /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ + en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en, + &insert_p, &insert_parent); + if (!en) { + if (next_en) { + en = next_en; + f2fs_bug_on(sbi, en->ei.fofs <= pos); + pos = en->ei.fofs; + } else { + /* + * skip searching in the tree since there is no + * larger extent node in the cache. + */ + goto update_extent; + } + } + + /* 2. 
invlidate all extent nodes in range [fofs, fofs + len - 1] */ + while (en) { + struct rb_node *node; + + if (pos >= end) + break; + + dei = en->ei; + en1 = en2 = NULL; + + node = rb_next(&en->rb_node); + + /* + * 2.1 there are four cases when we invalidate blkaddr in extent + * node, |V: valid address, X: will be invalidated| + */ + /* case#1, invalidate right part of extent node |VVVVVXXXXX| */ + if (pos > dei.fofs && end >= dei.fofs + dei.len) { + en->ei.len = pos - dei.fofs; + + if (en->ei.len < F2FS_MIN_EXTENT_LEN) { + __detach_extent_node(sbi, et, en); + insert_p = NULL; + insert_parent = NULL; + goto update; + } + + if (__is_extent_same(&dei, &et->largest)) + et->largest = en->ei; + goto next; + } + + /* case#2, invalidate left part of extent node |XXXXXVVVVV| */ + if (pos <= dei.fofs && end < dei.fofs + dei.len) { + en->ei.fofs = end; + en->ei.blk += end - dei.fofs; + en->ei.len -= end - dei.fofs; + + if (en->ei.len < F2FS_MIN_EXTENT_LEN) { + __detach_extent_node(sbi, et, en); + insert_p = NULL; + insert_parent = NULL; + goto update; + } + + if (__is_extent_same(&dei, &et->largest)) + et->largest = en->ei; + goto next; + } + + __detach_extent_node(sbi, et, en); + + /* + * if we remove node in rb-tree, our parent node pointer may + * point the wrong place, discard them. + */ + insert_p = NULL; + insert_parent = NULL; + + /* case#3, invalidate entire extent node |XXXXXXXXXX| */ + if (pos <= dei.fofs && end >= dei.fofs + dei.len) { + if (__is_extent_same(&dei, &et->largest)) + et->largest.len = 0; + goto update; + } + + /* + * case#4, invalidate data in the middle of extent node + * |VVVXXXXVVV| + */ + if (dei.len > F2FS_MIN_EXTENT_LEN) { + unsigned int endofs; + + /* insert left part of split extent into cache */ + if (pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { + set_extent_info(&ei, dei.fofs, dei.blk, + pos - dei.fofs); + en1 = __insert_extent_tree(sbi, et, &ei, + NULL, NULL); + } + + /* insert right part of split extent into cache */ + endofs = dei.fofs + dei.len; + if (endofs - end >= F2FS_MIN_EXTENT_LEN) { + set_extent_info(&ei, end, + end - dei.fofs + dei.blk, + endofs - end); + en2 = __insert_extent_tree(sbi, et, &ei, + NULL, NULL); + } + } +update: + /* 2.2 update in global extent list */ + spin_lock(&sbi->extent_lock); + if (en && !list_empty(&en->list)) + list_del(&en->list); + if (en1) + list_add_tail(&en1->list, &sbi->extent_list); + if (en2) + list_add_tail(&en2->list, &sbi->extent_list); + spin_unlock(&sbi->extent_lock); + + /* 2.3 release extent node */ + if (en) + kmem_cache_free(extent_node_slab, en); +next: + en = node ? rb_entry(node, struct extent_node, rb_node) : NULL; + next_en = en; + if (en) + pos = en->ei.fofs; + } + +update_extent: + /* 3. 
update extent in extent cache */ + if (blkaddr) { + struct extent_node *den = NULL; + + set_extent_info(&ei, fofs, blkaddr, len); + en3 = __try_merge_extent_node(sbi, et, &ei, &den, + prev_en, next_en); + if (!en3) + en3 = __insert_extent_tree(sbi, et, &ei, + insert_p, insert_parent); + + /* give up extent_cache, if split and small updates happen */ + if (dei.len >= 1 && + prev.len < F2FS_MIN_EXTENT_LEN && + et->largest.len < F2FS_MIN_EXTENT_LEN) { + et->largest.len = 0; + set_inode_flag(F2FS_I(inode), FI_NO_EXTENT); + } + + spin_lock(&sbi->extent_lock); + if (en3) { + if (list_empty(&en3->list)) + list_add_tail(&en3->list, &sbi->extent_list); + else + list_move_tail(&en3->list, &sbi->extent_list); + } + if (den && !list_empty(&den->list)) + list_del(&den->list); + spin_unlock(&sbi->extent_lock); + + if (den) + kmem_cache_free(extent_node_slab, den); + } + + if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) + __free_extent_tree(sbi, et, true); + + write_unlock(&et->lock); + + return !__is_extent_same(&prev, &et->largest); +} + +unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + struct extent_tree *treevec[EXT_TREE_VEC_SIZE]; + struct extent_node *en, *tmp; + unsigned long ino = F2FS_ROOT_INO(sbi); + struct radix_tree_root *root = &sbi->extent_tree_root; + unsigned int found; + unsigned int node_cnt = 0, tree_cnt = 0; + int remained; + + if (!test_opt(sbi, EXTENT_CACHE)) + return 0; + + if (!down_write_trylock(&sbi->extent_tree_lock)) + goto out; + + /* 1. remove unreferenced extent tree */ + while ((found = radix_tree_gang_lookup(root, + (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { + unsigned i; + + ino = treevec[found - 1]->ino + 1; + for (i = 0; i < found; i++) { + struct extent_tree *et = treevec[i]; + + if (!atomic_read(&et->refcount)) { + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et, true); + write_unlock(&et->lock); + + radix_tree_delete(root, et->ino); + kmem_cache_free(extent_tree_slab, et); + sbi->total_ext_tree--; + tree_cnt++; + + if (node_cnt + tree_cnt >= nr_shrink) + goto unlock_out; + } + } + } + up_write(&sbi->extent_tree_lock); + + /* 2. 
remove LRU extent entries */ + if (!down_write_trylock(&sbi->extent_tree_lock)) + goto out; + + remained = nr_shrink - (node_cnt + tree_cnt); + + spin_lock(&sbi->extent_lock); + list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) { + if (!remained--) + break; + list_del_init(&en->list); + } + spin_unlock(&sbi->extent_lock); + + while ((found = radix_tree_gang_lookup(root, + (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { + unsigned i; + + ino = treevec[found - 1]->ino + 1; + for (i = 0; i < found; i++) { + struct extent_tree *et = treevec[i]; + + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et, false); + write_unlock(&et->lock); + + if (node_cnt + tree_cnt >= nr_shrink) + break; + } + } +unlock_out: + up_write(&sbi->extent_tree_lock); +out: + trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); + + return node_cnt + tree_cnt; +} + +unsigned int f2fs_destroy_extent_node(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + unsigned int node_cnt = 0; + + if (!et) + return 0; + + write_lock(&et->lock); + node_cnt = __free_extent_tree(sbi, et, true); + write_unlock(&et->lock); + + return node_cnt; +} + +void f2fs_destroy_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + unsigned int node_cnt = 0; + + if (!et) + return; + + if (inode->i_nlink && !is_bad_inode(inode) && et->count) { + atomic_dec(&et->refcount); + return; + } + + /* free all extent info belong to this extent tree */ + node_cnt = f2fs_destroy_extent_node(inode); + + /* delete extent tree entry in radix tree */ + down_write(&sbi->extent_tree_lock); + atomic_dec(&et->refcount); + f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count); + radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); + kmem_cache_free(extent_tree_slab, et); + sbi->total_ext_tree--; + up_write(&sbi->extent_tree_lock); + + F2FS_I(inode)->extent_tree = NULL; + + trace_f2fs_destroy_extent_tree(inode, node_cnt); +} + +bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (!f2fs_may_extent_tree(inode)) + return false; + + return f2fs_lookup_extent_tree(inode, pgofs, ei); +} + +void f2fs_update_extent_cache(struct dnode_of_data *dn) +{ + struct f2fs_inode_info *fi = F2FS_I(dn->inode); + pgoff_t fofs; + + if (!f2fs_may_extent_tree(dn->inode)) + return; + + f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR); + + + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + + dn->ofs_in_node; + + if (f2fs_update_extent_tree_range(dn->inode, fofs, dn->data_blkaddr, 1)) + sync_inode_page(dn); +} + +void f2fs_update_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len) + +{ + if (!f2fs_may_extent_tree(dn->inode)) + return; + + if (f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len)) + sync_inode_page(dn); +} + +void init_extent_cache_info(struct f2fs_sb_info *sbi) +{ + INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); + init_rwsem(&sbi->extent_tree_lock); + INIT_LIST_HEAD(&sbi->extent_list); + spin_lock_init(&sbi->extent_lock); + sbi->total_ext_tree = 0; + atomic_set(&sbi->total_ext_node, 0); +} + +int __init create_extent_cache(void) +{ + extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree", + sizeof(struct extent_tree)); + if (!extent_tree_slab) + return -ENOMEM; + extent_node_slab = f2fs_kmem_cache_create("f2fs_extent_node", + sizeof(struct extent_node)); + if 
(!extent_node_slab) { + kmem_cache_destroy(extent_tree_slab); + return -ENOMEM; + } + return 0; +} + +void destroy_extent_cache(void) +{ + kmem_cache_destroy(extent_node_slab); + kmem_cache_destroy(extent_tree_slab); +} diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a8327ed73898..f1a90ffd7cad 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -19,6 +19,7 @@ #include <linux/magic.h> #include <linux/kobject.h> #include <linux/sched.h> +#include <linux/bio.h> #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) @@ -228,6 +229,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, #define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) +#define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) #define F2FS_IOC_SET_ENCRYPTION_POLICY \ _IOR('f', 19, struct f2fs_encryption_policy) @@ -320,7 +322,7 @@ enum { */ }; -#define F2FS_LINK_MAX 32000 /* maximum link count per file */ +#define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ #define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ @@ -349,6 +351,7 @@ struct extent_tree { nid_t ino; /* inode number */ struct rb_root root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ + struct extent_info largest; /* largest extent info */ rwlock_t lock; /* protect extent info rb-tree */ atomic_t refcount; /* reference count of rb-tree */ unsigned int count; /* # of extent node in rb-tree*/ @@ -372,6 +375,12 @@ struct f2fs_map_blocks { unsigned int m_flags; }; +/* for flag in get_data_block */ +#define F2FS_GET_BLOCK_READ 0 +#define F2FS_GET_BLOCK_DIO 1 +#define F2FS_GET_BLOCK_FIEMAP 2 +#define F2FS_GET_BLOCK_BMAP 3 + /* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
*/ @@ -420,14 +429,13 @@ struct f2fs_inode_info { unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ unsigned long long xattr_ver; /* cp version of xattr modification */ - struct extent_info ext; /* in-memory extent cache entry */ - rwlock_t ext_lock; /* rwlock for single extent cache */ struct inode_entry *dirty_dir; /* the pointer of dirty dir */ - struct radix_tree_root inmem_root; /* radix tree for inmem pages */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct mutex inmem_lock; /* lock for inmemory pages */ + struct extent_tree *extent_tree; /* cached extent_tree entry */ + #ifdef CONFIG_F2FS_FS_ENCRYPTION /* Encryption params */ struct f2fs_crypt_info *i_crypt_info; @@ -779,7 +787,11 @@ struct f2fs_sb_info { unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ atomic_t inplace_count; /* # of inplace update */ - int total_hit_ext, read_hit_ext; /* extent cache hit ratio */ + atomic_t total_hit_ext; /* # of lookup extent cache */ + atomic_t read_hit_rbtree; /* # of hit rbtree extent node */ + atomic_t read_hit_largest; /* # of hit largest extent node */ + atomic_t read_hit_cached; /* # of hit cached extent node */ + atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ int bg_gc; /* background gc calls */ @@ -791,6 +803,11 @@ struct f2fs_sb_info { /* For sysfs support */ struct kobject s_kobj; struct completion s_kobj_unregister; + + /* For shrinker support */ + struct list_head s_list; + struct mutex umount_mutex; + unsigned int shrinker_run_no; }; /* @@ -1039,7 +1056,8 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_dec_dirty_pages(struct inode *inode) { - if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode)) + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) return; atomic_dec(&F2FS_I(inode)->dirty_pages); @@ -1234,16 +1252,24 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *entry; -retry: - entry = kmem_cache_alloc(cachep, flags); - if (!entry) { - cond_resched(); - goto retry; - } + entry = kmem_cache_alloc(cachep, flags); + if (!entry) + entry = kmem_cache_alloc(cachep, flags | __GFP_NOFAIL); return entry; } +static inline struct bio *f2fs_bio_alloc(int npages) +{ + struct bio *bio; + + /* No failure on bio allocation */ + bio = bio_alloc(GFP_NOIO, npages); + if (!bio) + bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); + return bio; +} + static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item) { @@ -1342,6 +1368,7 @@ enum { FI_INC_LINK, /* need to increment i_nlink */ FI_ACL_MODE, /* indicate acl mode */ FI_NO_ALLOC, /* should not allocate any blocks */ + FI_FREE_NID, /* free allocated nid */ FI_UPDATE_DIR, /* should update inode block for consistency */ FI_DELAY_IPUT, /* used for the recovery */ FI_NO_EXTENT, /* not to use the extent cache */ @@ -1541,6 +1568,17 @@ static inline bool is_dot_dotdot(const struct qstr *str) return false; } +static inline bool f2fs_may_extent_tree(struct inode *inode) +{ + mode_t mode = inode->i_mode; + + if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) || + is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) + return false; + + return S_ISREG(mode); +} + #define get_inode_mode(i) \ ((is_inode_flag_set(F2FS_I(i),
FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) @@ -1557,7 +1595,7 @@ static inline bool is_dot_dotdot(const struct qstr *str) int f2fs_sync_file(struct file *, loff_t, loff_t, int); void truncate_data_blocks(struct dnode_of_data *); int truncate_blocks(struct inode *, u64, bool); -void f2fs_truncate(struct inode *); +int f2fs_truncate(struct inode *, bool); int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); int f2fs_setattr(struct dentry *, struct iattr *); int truncate_hole(struct inode *, pgoff_t, pgoff_t); @@ -1649,7 +1687,7 @@ int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); int truncate_xattr_node(struct inode *, struct page *); int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); -void remove_inode_page(struct inode *); +int remove_inode_page(struct inode *); struct page *new_inode_page(struct inode *); struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); void ra_node_page(struct f2fs_sb_info *, nid_t); @@ -1660,6 +1698,7 @@ int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); +int try_to_free_nids(struct f2fs_sb_info *, int); void recover_inline_xattr(struct inode *, struct page *); void recover_xattr_data(struct inode *, struct page *, block_t); int recover_inode_page(struct f2fs_sb_info *, struct page *); @@ -1675,7 +1714,7 @@ void destroy_node_manager_caches(void); * segment.c */ void register_inmem_page(struct inode *, struct page *); -void commit_inmem_pages(struct inode *, bool); +int commit_inmem_pages(struct inode *, bool); void f2fs_balance_fs(struct f2fs_sb_info *); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct f2fs_sb_info *); @@ -1685,7 +1724,7 @@ void invalidate_blocks(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); -void discard_next_dnode(struct f2fs_sb_info *, block_t); +bool discard_next_dnode(struct f2fs_sb_info *, block_t); int npages_for_summary_flush(struct f2fs_sb_info *, bool); void allocate_new_segments(struct f2fs_sb_info *); int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); @@ -1727,7 +1766,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); void add_orphan_inode(struct f2fs_sb_info *, nid_t); void remove_orphan_inode(struct f2fs_sb_info *, nid_t); -void recover_orphan_inodes(struct f2fs_sb_info *); +int recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); void update_dirty_page(struct inode *, struct page *); void add_dirty_dir_inode(struct inode *); @@ -1746,21 +1785,14 @@ int f2fs_submit_page_bio(struct f2fs_io_info *); void f2fs_submit_page_mbio(struct f2fs_io_info *); void set_data_blkaddr(struct dnode_of_data *); int reserve_new_block(struct dnode_of_data *); +int f2fs_get_block(struct dnode_of_data *, pgoff_t); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); -void f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); -void f2fs_destroy_extent_tree(struct inode *); -void f2fs_init_extent_cache(struct inode *, struct f2fs_extent *); -void f2fs_update_extent_cache(struct dnode_of_data *); -void f2fs_preserve_extent_tree(struct inode 
*); struct page *get_read_data_page(struct inode *, pgoff_t, int); struct page *find_data_page(struct inode *, pgoff_t); struct page *get_lock_data_page(struct inode *, pgoff_t); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); int do_write_data_page(struct f2fs_io_info *); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); -void init_extent_cache_info(struct f2fs_sb_info *); -int __init create_extent_cache(void); -void destroy_extent_cache(void); void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); int f2fs_release_page(struct page *, gfp_t); @@ -1788,11 +1820,13 @@ struct f2fs_stat_info { struct f2fs_sb_info *sbi; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; - int hit_ext, total_ext, ext_tree, ext_node; + int hit_largest, hit_cached, hit_rbtree, hit_total, total_ext; + int ext_tree, ext_node; int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; - int bg_gc, inline_inode, inline_dir, inmem_pages, wb_pages; + int bg_gc, inmem_pages, wb_pages; + int inline_xattr, inline_inode, inline_dir; unsigned int valid_count, valid_node_count, valid_inode_count; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -1823,8 +1857,20 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) #define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) #define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) -#define stat_inc_total_hit(sb) ((F2FS_SB(sb))->total_hit_ext++) -#define stat_inc_read_hit(sb) ((F2FS_SB(sb))->read_hit_ext++) +#define stat_inc_total_hit(sbi) (atomic_inc(&(sbi)->total_hit_ext)) +#define stat_inc_rbtree_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_rbtree)) +#define stat_inc_largest_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_largest)) +#define stat_inc_cached_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_cached)) +#define stat_inc_inline_xattr(inode) \ + do { \ + if (f2fs_has_inline_xattr(inode)) \ + (atomic_inc(&F2FS_I_SB(inode)->inline_xattr)); \ + } while (0) +#define stat_dec_inline_xattr(inode) \ + do { \ + if (f2fs_has_inline_xattr(inode)) \ + (atomic_dec(&F2FS_I_SB(inode)->inline_xattr)); \ + } while (0) #define stat_inc_inline_inode(inode) \ do { \ if (f2fs_has_inline_data(inode)) \ @@ -1894,7 +1940,11 @@ void f2fs_destroy_root_stats(void); #define stat_inc_dirty_dir(sbi) #define stat_dec_dirty_dir(sbi) #define stat_inc_total_hit(sb) -#define stat_inc_read_hit(sb) +#define stat_inc_rbtree_node_hit(sb) +#define stat_inc_largest_node_hit(sbi) +#define stat_inc_cached_node_hit(sbi) +#define stat_inc_inline_xattr(inode) +#define stat_dec_inline_xattr(inode) #define stat_inc_inline_inode(inode) #define stat_dec_inline_inode(inode) #define stat_inc_inline_dir(inode) @@ -1950,6 +2000,30 @@ int f2fs_read_inline_dir(struct file *, struct dir_context *, struct f2fs_str *); /* + * shrinker.c + */ +unsigned long f2fs_shrink_count(struct shrinker *, struct shrink_control *); +unsigned long f2fs_shrink_scan(struct shrinker *, struct shrink_control *); +void f2fs_join_shrinker(struct f2fs_sb_info *); +void f2fs_leave_shrinker(struct f2fs_sb_info *); + +/* + * extent_cache.c + */ +unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); +void f2fs_drop_largest_extent(struct inode *, pgoff_t); +void f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); +unsigned int 
f2fs_destroy_extent_node(struct inode *); +void f2fs_destroy_extent_tree(struct inode *); +bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *); +void f2fs_update_extent_cache(struct dnode_of_data *); +void f2fs_update_extent_cache_range(struct dnode_of_data *dn, + pgoff_t, block_t, unsigned int); +void init_extent_cache_info(struct f2fs_sb_info *); +int __init create_extent_cache(void); +void destroy_extent_cache(void); + +/* * crypto support */ static inline int f2fs_encrypted_inode(struct inode *inode) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ada2a3dd701a..8120f8685141 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -27,6 +27,7 @@ #include "segment.h" #include "xattr.h" #include "acl.h" +#include "gc.h" #include "trace.h" #include <trace/events/f2fs.h> @@ -85,6 +86,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, mapped: /* fill the page */ f2fs_wait_on_page_writeback(page, DATA); + /* if gced page is attached, don't write to cold segment */ + clear_cold_data(page); out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(err); @@ -203,8 +206,8 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } /* if the inode is dirty, let's recover all the time */ - if (!datasync && is_inode_flag_set(fi, FI_DIRTY_INODE)) { - update_inode_page(inode); + if (!datasync) { + f2fs_write_inode(inode, NULL); goto go_write; } @@ -442,9 +445,9 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) int truncate_data_blocks_range(struct dnode_of_data *dn, int count) { - int nr_free = 0, ofs = dn->ofs_in_node; struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_node *raw_node; + int nr_free = 0, ofs = dn->ofs_in_node, len = count; __le32 *addr; raw_node = F2FS_NODE(dn->node_page); @@ -457,14 +460,22 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) dn->data_blkaddr = NULL_ADDR; set_data_blkaddr(dn); - f2fs_update_extent_cache(dn); invalidate_blocks(sbi, blkaddr); if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) clear_inode_flag(F2FS_I(dn->inode), FI_FIRST_BLOCK_WRITTEN); nr_free++; } + if (nr_free) { + pgoff_t fofs; + /* + * once we invalidate valid blkaddr in range [ofs, ofs + count], + * we will invalidate all blkaddr in the whole range. 
+ */ + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), + F2FS_I(dn->inode)) + ofs; + f2fs_update_extent_cache_range(dn, fofs, 0, len); dec_valid_block_count(sbi, dn->inode, nr_free); set_page_dirty(dn->node_page); sync_inode_page(dn); @@ -576,24 +587,30 @@ out: return err; } -void f2fs_truncate(struct inode *inode) +int f2fs_truncate(struct inode *inode, bool lock) { + int err; + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) - return; + return 0; trace_f2fs_truncate(inode); /* we should check inline_data size */ if (f2fs_has_inline_data(inode) && !f2fs_may_inline_data(inode)) { - if (f2fs_convert_inline_inode(inode)) - return; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; } - if (!truncate_blocks(inode, i_size_read(inode), true)) { - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); - } + err = truncate_blocks(inode, i_size_read(inode), lock); + if (err) + return err; + + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + return 0; } int f2fs_getattr(struct vfsmount *mnt, @@ -653,7 +670,9 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_size <= i_size_read(inode)) { truncate_setsize(inode, attr->ia_size); - f2fs_truncate(inode); + err = f2fs_truncate(inode, true); + if (err) + return err; f2fs_balance_fs(F2FS_I_SB(inode)); } else { /* @@ -692,14 +711,14 @@ const struct inode_operations f2fs_file_inode_operations = { .fiemap = f2fs_fiemap, }; -static void fill_zero(struct inode *inode, pgoff_t index, +static int fill_zero(struct inode *inode, pgoff_t index, loff_t start, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page; if (!len) - return; + return 0; f2fs_balance_fs(sbi); @@ -707,12 +726,14 @@ static void fill_zero(struct inode *inode, pgoff_t index, page = get_new_data_page(inode, NULL, index, false); f2fs_unlock_op(sbi); - if (!IS_ERR(page)) { - f2fs_wait_on_page_writeback(page, DATA); - zero_user(page, start, len); - set_page_dirty(page); - f2fs_put_page(page, 1); - } + if (IS_ERR(page)) + return PTR_ERR(page); + + f2fs_wait_on_page_writeback(page, DATA); + zero_user(page, start, len); + set_page_dirty(page); + f2fs_put_page(page, 1); + return 0; } int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) @@ -760,14 +781,22 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); if (pg_start == pg_end) { - fill_zero(inode, pg_start, off_start, + ret = fill_zero(inode, pg_start, off_start, off_end - off_start); + if (ret) + return ret; } else { - if (off_start) - fill_zero(inode, pg_start++, off_start, - PAGE_CACHE_SIZE - off_start); - if (off_end) - fill_zero(inode, pg_end, 0, off_end); + if (off_start) { + ret = fill_zero(inode, pg_start++, off_start, + PAGE_CACHE_SIZE - off_start); + if (ret) + return ret; + } + if (off_end) { + ret = fill_zero(inode, pg_end, 0, off_end); + if (ret) + return ret; + } if (pg_start < pg_end) { struct address_space *mapping = inode->i_mapping; @@ -797,11 +826,11 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; int ret = 0; - f2fs_lock_op(sbi); - for (; end < nrpages; start++, end++) { block_t new_addr, old_addr; + f2fs_lock_op(sbi); + set_new_dnode(&dn, inode, NULL, NULL, 0); ret = get_dnode_of_data(&dn, end, LOOKUP_NODE_RA); if (ret && ret != -ENOENT) { @@ -817,13 +846,16 @@ static int f2fs_do_collapse(struct 
inode *inode, pgoff_t start, pgoff_t end) if (new_addr == NULL_ADDR) { set_new_dnode(&dn, inode, NULL, NULL, 0); ret = get_dnode_of_data(&dn, start, LOOKUP_NODE_RA); - if (ret && ret != -ENOENT) + if (ret && ret != -ENOENT) { goto out; - else if (ret == -ENOENT) + } else if (ret == -ENOENT) { + f2fs_unlock_op(sbi); continue; + } if (dn.data_blkaddr == NULL_ADDR) { f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); continue; } else { truncate_data_blocks_range(&dn, 1); @@ -862,8 +894,9 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) f2fs_put_dnode(&dn); } + f2fs_unlock_op(sbi); } - ret = 0; + return 0; out: f2fs_unlock_op(sbi); return ret; @@ -885,6 +918,14 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) return -EINVAL; + f2fs_balance_fs(F2FS_I_SB(inode)); + + if (f2fs_has_inline_data(inode)) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } + pg_start = offset >> PAGE_CACHE_SHIFT; pg_end = (offset + len) >> PAGE_CACHE_SHIFT; @@ -946,14 +987,21 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); if (pg_start == pg_end) { - fill_zero(inode, pg_start, off_start, off_end - off_start); + ret = fill_zero(inode, pg_start, off_start, + off_end - off_start); + if (ret) + return ret; + if (offset + len > new_size) new_size = offset + len; new_size = max_t(loff_t, new_size, offset + len); } else { if (off_start) { - fill_zero(inode, pg_start++, off_start, - PAGE_CACHE_SIZE - off_start); + ret = fill_zero(inode, pg_start++, off_start, + PAGE_CACHE_SIZE - off_start); + if (ret) + return ret; + new_size = max_t(loff_t, new_size, pg_start << PAGE_CACHE_SHIFT); } @@ -995,7 +1043,10 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, } if (off_end) { - fill_zero(inode, pg_end, 0, off_end); + ret = fill_zero(inode, pg_end, 0, off_end); + if (ret) + goto out; + new_size = max_t(loff_t, new_size, offset + len); } } @@ -1033,6 +1084,12 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi); + if (f2fs_has_inline_data(inode)) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } + ret = truncate_blocks(inode, i_size_read(inode), true); if (ret) return ret; @@ -1302,6 +1359,7 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg) static int f2fs_ioc_start_atomic_write(struct file *filp) { struct inode *inode = file_inode(filp); + int ret; if (!inode_owner_or_capable(inode)) return -EACCES; @@ -1311,9 +1369,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (f2fs_is_atomic_file(inode)) return 0; - set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - return f2fs_convert_inline_inode(inode); + set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + return 0; } static int f2fs_ioc_commit_atomic_write(struct file *filp) @@ -1331,18 +1392,23 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (ret) return ret; - if (f2fs_is_atomic_file(inode)) - commit_inmem_pages(inode, false); + if (f2fs_is_atomic_file(inode)) { + clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + ret = commit_inmem_pages(inode, false); + if (ret) + goto err_out; + } - ret = f2fs_sync_file(filp, 0, LONG_MAX, 0); + ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); +err_out: mnt_drop_write_file(filp); - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); 
return ret; } static int f2fs_ioc_start_volatile_write(struct file *filp) { struct inode *inode = file_inode(filp); + int ret; if (!inode_owner_or_capable(inode)) return -EACCES; @@ -1350,9 +1416,12 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (f2fs_is_volatile_file(inode)) return 0; - set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - return f2fs_convert_inline_inode(inode); + set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + return 0; } static int f2fs_ioc_release_volatile_write(struct file *filp) @@ -1387,8 +1456,8 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) f2fs_balance_fs(F2FS_I_SB(inode)); if (f2fs_is_atomic_file(inode)) { - commit_inmem_pages(inode, false); clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + commit_inmem_pages(inode, true); } if (f2fs_is_volatile_file(inode)) @@ -1543,6 +1612,35 @@ got_it: return 0; } +static int f2fs_ioc_gc(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + __u32 i, count; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(count, (__u32 __user *)arg)) + return -EFAULT; + + if (!count || count > F2FS_BATCH_GC_MAX_NUM) + return -EINVAL; + + for (i = 0; i < count; i++) { + if (!mutex_trylock(&sbi->gc_mutex)) + break; + + if (f2fs_gc(sbi)) + break; + } + + if (put_user(i, (__u32 __user *)arg)) + return -EFAULT; + + return 0; +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -1572,6 +1670,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_get_encryption_policy(filp, arg); case F2FS_IOC_GET_ENCRYPTION_PWSALT: return f2fs_ioc_get_encryption_pwsalt(filp, arg); + case F2FS_IOC_GARBAGE_COLLECT: + return f2fs_ioc_gc(filp, arg); default: return -ENOTTY; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e1e73617d13b..782b8e72c094 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -391,23 +391,27 @@ static int check_valid_map(struct f2fs_sb_info *sbi, * On validity, copy that node with cold status, otherwise (invalid node) * ignore that. */ -static void gc_node_segment(struct f2fs_sb_info *sbi, +static int gc_node_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, unsigned int segno, int gc_type) { bool initial = true; struct f2fs_summary *entry; + block_t start_addr; int off; + start_addr = START_BLOCK(sbi, segno); + next_step: entry = sum; for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { nid_t nid = le32_to_cpu(entry->nid); struct page *node_page; + struct node_info ni; /* stop BG_GC if there is not enough free sections. */ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) - return; + return 0; if (check_valid_map(sbi, segno, off) == 0) continue; @@ -426,6 +430,12 @@ next_step: continue; } + get_node_info(sbi, nid, &ni); + if (ni.blk_addr != start_addr + off) { + f2fs_put_page(node_page, 1); + continue; + } + /* set page dirty and write it */ if (gc_type == FG_GC) { f2fs_wait_on_page_writeback(node_page, NODE); @@ -451,13 +461,11 @@ next_step: }; sync_node_pages(sbi, 0, &wbc); - /* - * In the case of FG_GC, it'd be better to reclaim this victim - * completely. 
- */ - if (get_valid_blocks(sbi, segno, 1) != 0) - goto next_step; + /* return 1 only if FG_GC successfully reclaimed one */ + if (get_valid_blocks(sbi, segno, 1) == 0) + return 1; } + return 0; } /* @@ -487,7 +495,7 @@ block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi) return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi); } -static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct node_info *dni, block_t blkaddr, unsigned int *nofs) { struct page *node_page; @@ -500,13 +508,13 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, node_page = get_node_page(sbi, nid); if (IS_ERR(node_page)) - return 0; + return false; get_node_info(sbi, nid, dni); if (sum->version != dni->version) { f2fs_put_page(node_page, 1); - return 0; + return false; } *nofs = ofs_of_node(node_page); @@ -514,8 +522,8 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, f2fs_put_page(node_page, 1); if (source_blkaddr != blkaddr) - return 0; - return 1; + return false; + return true; } static void move_encrypted_block(struct inode *inode, block_t bidx) @@ -552,31 +560,46 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) fio.page = page; fio.blk_addr = dn.data_blkaddr; - fio.encrypted_page = grab_cache_page(META_MAPPING(fio.sbi), fio.blk_addr); + fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), + fio.blk_addr, + FGP_LOCK|FGP_CREAT, + GFP_NOFS); if (!fio.encrypted_page) goto put_out; - f2fs_submit_page_bio(&fio); + err = f2fs_submit_page_bio(&fio); + if (err) + goto put_page_out; + + /* write page */ + lock_page(fio.encrypted_page); + + if (unlikely(!PageUptodate(fio.encrypted_page))) + goto put_page_out; + if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) + goto put_page_out; + + set_page_dirty(fio.encrypted_page); + f2fs_wait_on_page_writeback(fio.encrypted_page, META); + if (clear_page_dirty_for_io(fio.encrypted_page)) + dec_page_count(fio.sbi, F2FS_DIRTY_META); + + set_page_writeback(fio.encrypted_page); /* allocate block address */ f2fs_wait_on_page_writeback(dn.node_page, NODE); - allocate_data_block(fio.sbi, NULL, fio.blk_addr, &fio.blk_addr, &sum, CURSEG_COLD_DATA); - dn.data_blkaddr = fio.blk_addr; - - /* write page */ - lock_page(fio.encrypted_page); - set_page_writeback(fio.encrypted_page); fio.rw = WRITE_SYNC; f2fs_submit_page_mbio(&fio); + dn.data_blkaddr = fio.blk_addr; set_data_blkaddr(&dn); f2fs_update_extent_cache(&dn); set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); if (page->index == 0) set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); - +put_page_out: f2fs_put_page(fio.encrypted_page, 1); put_out: f2fs_put_dnode(&dn); @@ -605,8 +628,8 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) .page = page, .encrypted_page = NULL, }; + set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA); - if (clear_page_dirty_for_io(page)) inode_dec_dirty_pages(inode); set_cold_data(page); @@ -624,7 +647,7 @@ out: * If the parent node is not valid or the data block address is different, * the victim data block is ignored. */ -static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct gc_inode_list *gc_list, unsigned int segno, int gc_type) { struct super_block *sb = sbi->sb; @@ -647,7 +670,7 @@ next_step: /* stop BG_GC if there is not enough free sections.
*/ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) - return; + return 0; if (check_valid_map(sbi, segno, off) == 0) continue; @@ -658,7 +681,7 @@ next_step: } /* Get an inode by ino with checking validity */ - if (check_dnode(sbi, entry, &dni, start_addr + off, &nofs) == 0) + if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs)) continue; if (phase == 1) { @@ -712,15 +735,11 @@ next_step: if (gc_type == FG_GC) { f2fs_submit_merged_bio(sbi, DATA, WRITE); - /* - * In the case of FG_GC, it'd be better to reclaim this victim - * completely. - */ - if (get_valid_blocks(sbi, segno, 1) != 0) { - phase = 2; - goto next_step; - } + /* return 1 only if FG_GC successfully reclaimed one */ + if (get_valid_blocks(sbi, segno, 1) == 0) + return 1; } + return 0; } static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, @@ -736,12 +755,13 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, return ret; } -static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, +static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, struct gc_inode_list *gc_list, int gc_type) { struct page *sum_page; struct f2fs_summary_block *sum; struct blk_plug plug; + int nfree = 0; /* read segment summary of victim */ sum_page = get_sum_page(sbi, segno); @@ -761,10 +781,11 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, switch (GET_SUM_TYPE((&sum->footer))) { case SUM_TYPE_NODE: - gc_node_segment(sbi, sum->entries, segno, gc_type); + nfree = gc_node_segment(sbi, sum->entries, segno, gc_type); break; case SUM_TYPE_DATA: - gc_data_segment(sbi, sum->entries, gc_list, segno, gc_type); + nfree = gc_data_segment(sbi, sum->entries, gc_list, + segno, gc_type); break; } blk_finish_plug(&plug); @@ -773,11 +794,13 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, stat_inc_call_count(sbi->stat_info); f2fs_put_page(sum_page, 0); + return nfree; } int f2fs_gc(struct f2fs_sb_info *sbi) { - unsigned int segno, i; + unsigned int segno = NULL_SEGNO; + unsigned int i; int gc_type = BG_GC; int nfree = 0; int ret = -1; @@ -796,10 +819,11 @@ gc_more: if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { gc_type = FG_GC; - write_checkpoint(sbi, &cpc); + if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi)) + write_checkpoint(sbi, &cpc); } - if (!__get_victim(sbi, &segno, gc_type)) + if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) goto stop; ret = 0; @@ -809,13 +833,10 @@ gc_more: META_SSA); for (i = 0; i < sbi->segs_per_sec; i++) - do_garbage_collect(sbi, segno + i, &gc_list, gc_type); + nfree += do_garbage_collect(sbi, segno + i, &gc_list, gc_type); - if (gc_type == FG_GC) { + if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - nfree++; - WARN_ON(get_valid_blocks(sbi, segno, sbi->segs_per_sec)); - } if (has_not_enough_free_secs(sbi, nfree)) goto gc_more; diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index b4a65be9f7d3..c5a055b3376e 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -19,6 +19,12 @@ #define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ +/* + * with this macro, we can control the max number of times we do garbage + * collection, when user triggers batch mode gc by ioctl. + */ +#define F2FS_BATCH_GC_MAX_NUM 16 + /* Search max.
number of dirty segments to select a victim segment */ #define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */ diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 38e75fb1e488..3d143be42895 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -141,6 +141,8 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) kunmap_atomic(dst_addr); SetPageUptodate(page); no_update: + set_page_dirty(page); + /* clear dirty state */ dirty = clear_page_dirty_for_io(page); @@ -358,6 +360,10 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, return 0; } +/* + * NOTE: ipage is grabbed by caller, but if any error occurs, we should + * release ipage in this function. + */ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, struct f2fs_inline_dentry *inline_dentry) { @@ -367,8 +373,10 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, int err; page = grab_cache_page(dir->i_mapping, 0); - if (!page) + if (!page) { + f2fs_put_page(ipage, 1); return -ENOMEM; + } set_new_dnode(&dn, dir, ipage, NULL, 0); err = f2fs_reserve_block(&dn, 0); @@ -376,13 +384,21 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, goto out; f2fs_wait_on_page_writeback(page, DATA); - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); dentry_blk = kmap_atomic(page); /* copy data from inline dentry block to new dentry block */ memcpy(dentry_blk->dentry_bitmap, inline_dentry->dentry_bitmap, INLINE_DENTRY_BITMAP_SIZE); + memset(dentry_blk->dentry_bitmap + INLINE_DENTRY_BITMAP_SIZE, 0, + SIZE_OF_DENTRY_BITMAP - INLINE_DENTRY_BITMAP_SIZE); + /* + * we do not need to zero out the remainder of the dentry and filename + * fields, since we have used a bitmap for marking their usage status; + * besides, we can also ignore copying/zeroing the reserved space of + * the dentry block, because it hasn't been used so far.
+ */ memcpy(dentry_blk->dentry, inline_dentry->dentry, sizeof(struct f2fs_dir_entry) * NR_INLINE_DENTRY); memcpy(dentry_blk->filename, inline_dentry->filename, @@ -432,8 +448,9 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, slots, NR_INLINE_DENTRY); if (bit_pos >= NR_INLINE_DENTRY) { err = f2fs_convert_inline_dir(dir, ipage, dentry_blk); - if (!err) - err = -EAGAIN; + if (err) + return err; + err = -EAGAIN; goto out; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2550868dc651..35aae65b3e5d 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -12,7 +12,6 @@ #include <linux/f2fs_fs.h> #include <linux/buffer_head.h> #include <linux/writeback.h> -#include <linux/bitops.h> #include "f2fs.h" #include "node.h" @@ -34,8 +33,8 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & FS_DIRSYNC_FL) new_fl |= S_DIRSYNC; - set_mask_bits(&inode->i_flags, - S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl); + inode_set_flags(inode, new_fl, + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) @@ -139,7 +138,7 @@ static int do_read_inode(struct inode *inode) fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; - f2fs_init_extent_cache(inode, &ri->i_ext); + f2fs_init_extent_tree(inode, &ri->i_ext); get_inline_info(fi, ri); @@ -155,6 +154,7 @@ static int do_read_inode(struct inode *inode) f2fs_put_page(node_page, 1); + stat_inc_inline_xattr(inode); stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); @@ -237,10 +237,11 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(inode->i_blocks); - read_lock(&F2FS_I(inode)->ext_lock); - set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext); - read_unlock(&F2FS_I(inode)->ext_lock); - + if (F2FS_I(inode)->extent_tree) + set_raw_extent(&F2FS_I(inode)->extent_tree->largest, + &ri->i_ext); + else + memset(&ri->i_ext, 0, sizeof(ri->i_ext)); set_raw_inline(F2FS_I(inode), ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); @@ -314,7 +315,9 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) void f2fs_evict_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - nid_t xnid = F2FS_I(inode)->i_xattr_nid; + struct f2fs_inode_info *fi = F2FS_I(inode); + nid_t xnid = fi->i_xattr_nid; + int err = 0; /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) @@ -330,41 +333,62 @@ void f2fs_evict_inode(struct inode *inode) f2fs_bug_on(sbi, get_dirty_pages(inode)); remove_dirty_dir_inode(inode); + f2fs_destroy_extent_tree(inode); + if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; sb_start_intwrite(inode->i_sb); - set_inode_flag(F2FS_I(inode), FI_NO_ALLOC); + set_inode_flag(fi, FI_NO_ALLOC); i_size_write(inode, 0); if (F2FS_HAS_BLOCKS(inode)) - f2fs_truncate(inode); + err = f2fs_truncate(inode, true); - f2fs_lock_op(sbi); - remove_inode_page(inode); - f2fs_unlock_op(sbi); + if (!err) { + f2fs_lock_op(sbi); + err = remove_inode_page(inode); + f2fs_unlock_op(sbi); + } sb_end_intwrite(inode->i_sb); no_delete: + stat_dec_inline_xattr(inode); stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); - /* update extent info in inode */ - if (inode->i_nlink) - f2fs_preserve_extent_tree(inode); - f2fs_destroy_extent_tree(inode); - invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); - 
if (is_inode_flag_set(F2FS_I(inode), FI_APPEND_WRITE)) + if (is_inode_flag_set(fi, FI_APPEND_WRITE)) add_dirty_inode(sbi, inode->i_ino, APPEND_INO); - if (is_inode_flag_set(F2FS_I(inode), FI_UPDATE_WRITE)) + if (is_inode_flag_set(fi, FI_UPDATE_WRITE)) add_dirty_inode(sbi, inode->i_ino, UPDATE_INO); + if (is_inode_flag_set(fi, FI_FREE_NID)) { + if (err && err != -ENOENT) + alloc_nid_done(sbi, inode->i_ino); + else + alloc_nid_failed(sbi, inode->i_ino); + clear_inode_flag(fi, FI_FREE_NID); + } + + if (err && err != -ENOENT) { + if (!exist_written_data(sbi, inode->i_ino, ORPHAN_INO)) { + /* + * get here because we failed to release resource + * of inode previously, remind our user to run fsck + * to fix it. + */ + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "inode (ino:%lu) resource leak, run fsck " + "to fix this issue!", inode->i_ino); + } + } out_clear: #ifdef CONFIG_F2FS_FS_ENCRYPTION - if (F2FS_I(inode)->i_crypt_info) - f2fs_free_encryption_info(inode, F2FS_I(inode)->i_crypt_info); + if (fi->i_crypt_info) + f2fs_free_encryption_info(inode, fi->i_crypt_info); #endif clear_inode(inode); } @@ -373,6 +397,7 @@ out_clear: void handle_failed_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int err = 0; clear_nlink(inode); make_bad_inode(inode); @@ -380,13 +405,29 @@ void handle_failed_inode(struct inode *inode) i_size_write(inode, 0); if (F2FS_HAS_BLOCKS(inode)) - f2fs_truncate(inode); + err = f2fs_truncate(inode, false); - remove_inode_page(inode); + if (!err) + err = remove_inode_page(inode); + + /* + * if we skip truncate_node in remove_inode_page because we failed + * before, it's better to find another way to release resource of + * this inode (e.g. valid block count, node block or nid). Here we + * choose to add this inode to orphan list, so that we can call iput + * for releasing in orphan recovery flow. + * + * Note: we should add inode to orphan list before f2fs_unlock_op() + * so we can prevent losing this orphan when encountering checkpoint + * and a following sudden power-off. + */ + if (err && err != -ENOENT) { + err = acquire_orphan_inode(sbi); + if (!err) + add_orphan_inode(sbi, inode->i_ino); + } - clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); - clear_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); - alloc_nid_failed(sbi, inode->i_ino); + set_inode_flag(F2FS_I(inode), FI_FREE_NID); f2fs_unlock_op(sbi); /* iput will drop the inode object */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index fdbae21ee8fb..a680bf38e4f0 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -53,7 +53,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (err) { err = -EINVAL; nid_free = true; - goto out; + goto fail; } /* If the directory is encrypted, then we should encrypt the inode.
*/ @@ -65,6 +65,9 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (f2fs_may_inline_dentry(inode)) set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); + f2fs_init_extent_tree(inode, NULL); + + stat_inc_inline_xattr(inode); stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); @@ -72,15 +75,12 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) mark_inode_dirty(inode); return inode; -out: - clear_nlink(inode); - unlock_new_inode(inode); fail: trace_f2fs_new_inode(inode, err); make_bad_inode(inode); - iput(inode); if (nid_free) - alloc_nid_failed(sbi, ino); + set_inode_flag(F2FS_I(inode), FI_FREE_NID); + iput(inode); return ERR_PTR(err); } @@ -89,7 +89,14 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) size_t slen = strlen(s); size_t sublen = strlen(sub); - if (sublen > slen) + /* + * filename format of multimedia file should be defined as: + * "filename + '.' + extension". + */ + if (slen < sublen + 2) + return 0; + + if (s[slen - sublen - 1] != '.') return 0; return !strncasecmp(s + slen - sublen, sub, sublen); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7dd63b794bfb..27d1a74dd6f3 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -159,7 +159,7 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, head = radix_tree_lookup(&nm_i->nat_set_root, set); if (!head) { - head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC); + head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS); INIT_LIST_HEAD(&head->entry_list); INIT_LIST_HEAD(&head->set_list); @@ -246,7 +246,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) { struct nat_entry *new; - new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_ATOMIC); + new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS); f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); memset(new, 0, sizeof(struct nat_entry)); nat_set_nid(new, nid); @@ -306,6 +306,10 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { unsigned char version = nat_get_version(e); nat_set_version(e, inc_node_version(version)); + + /* in order to reuse the nid */ + if (nm_i->next_scan_nid > ni->nid) + nm_i->next_scan_nid = ni->nid; } /* change address */ @@ -328,11 +332,11 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); + int nr = nr_shrink; - if (available_free_memory(sbi, NAT_ENTRIES)) + if (!down_write_trylock(&nm_i->nat_tree_lock)) return 0; - down_write(&nm_i->nat_tree_lock); while (nr_shrink && !list_empty(&nm_i->nat_entries)) { struct nat_entry *ne; ne = list_first_entry(&nm_i->nat_entries, @@ -341,7 +345,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) nr_shrink--; } up_write(&nm_i->nat_tree_lock); - return nr_shrink; + return nr - nr_shrink; } /* @@ -898,17 +902,20 @@ int truncate_xattr_node(struct inode *inode, struct page *page) * Caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). 
*/ -void remove_inode_page(struct inode *inode) +int remove_inode_page(struct inode *inode) { struct dnode_of_data dn; + int err; set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); - if (get_dnode_of_data(&dn, 0, LOOKUP_NODE)) - return; + err = get_dnode_of_data(&dn, 0, LOOKUP_NODE); + if (err) + return err; - if (truncate_xattr_node(inode, dn.inode_page)) { + err = truncate_xattr_node(inode, dn.inode_page); + if (err) { f2fs_put_dnode(&dn); - return; + return err; } /* remove potential inline_data blocks */ @@ -922,6 +929,7 @@ void remove_inode_page(struct inode *inode) /* will put inode & node pages */ truncate_node(&dn); + return 0; } struct page *new_inode_page(struct inode *inode) @@ -991,8 +999,7 @@ fail: /* * Caller should do after getting the following values. * 0: f2fs_put_page(page, 0) - * LOCKED_PAGE: f2fs_put_page(page, 1) - * error: nothing + * LOCKED_PAGE or error: f2fs_put_page(page, 1) */ static int read_node_page(struct page *page, int rw) { @@ -1010,7 +1017,6 @@ static int read_node_page(struct page *page, int rw) if (unlikely(ni.blk_addr == NULL_ADDR)) { ClearPageUptodate(page); - f2fs_put_page(page, 1); return -ENOENT; } @@ -1041,10 +1047,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) return; err = read_node_page(apage, READA); - if (err == 0) - f2fs_put_page(apage, 0); - else if (err == LOCKED_PAGE) - f2fs_put_page(apage, 1); + f2fs_put_page(apage, err ? 1 : 0); } struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) @@ -1057,10 +1060,12 @@ repeat: return ERR_PTR(-ENOMEM); err = read_node_page(page, READ_SYNC); - if (err < 0) + if (err < 0) { + f2fs_put_page(page, 1); return ERR_PTR(err); - else if (err != LOCKED_PAGE) + } else if (err != LOCKED_PAGE) { lock_page(page); + } if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) { ClearPageUptodate(page); @@ -1096,10 +1101,12 @@ repeat: return ERR_PTR(-ENOMEM); err = read_node_page(page, READ_SYNC); - if (err < 0) + if (err < 0) { + f2fs_put_page(page, 1); return ERR_PTR(err); - else if (err == LOCKED_PAGE) + } else if (err == LOCKED_PAGE) { goto page_hit; + } blk_start_plug(&plug); @@ -1533,7 +1540,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi) if (unlikely(nid >= nm_i->max_nid)) nid = 0; - if (i++ == FREE_NID_PAGES) + if (++i >= FREE_NID_PAGES) break; } @@ -1570,6 +1577,8 @@ retry: /* We should not use stale free nids created by build_free_nids */ if (nm_i->fcnt && !on_build_free_nids(nm_i)) { + struct node_info ni; + f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); list_for_each_entry(i, &nm_i->free_nid_list, list) if (i->state == NID_NEW) @@ -1580,6 +1589,13 @@ retry: i->state = NID_ALLOC; nm_i->fcnt--; spin_unlock(&nm_i->free_nid_list_lock); + + /* check nid is allocated already */ + get_node_info(sbi, *nid, &ni); + if (ni.blk_addr != NULL_ADDR) { + alloc_nid_done(sbi, *nid); + goto retry; + } return true; } spin_unlock(&nm_i->free_nid_list_lock); @@ -1636,6 +1652,32 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } +int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i, *next; + int nr = nr_shrink; + + if (!mutex_trylock(&nm_i->build_lock)) + return 0; + + spin_lock(&nm_i->free_nid_list_lock); + list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { + if (nr_shrink <= 0 || nm_i->fcnt <= NAT_ENTRY_PER_BLOCK) + break; + if (i->state == NID_ALLOC) + continue; + __del_from_free_nid_list(nm_i, i); + kmem_cache_free(free_nid_slab, i); + 
nm_i->fcnt--; + nr_shrink--; + } + spin_unlock(&nm_i->free_nid_list_lock); + mutex_unlock(&nm_i->build_lock); + + return nr - nr_shrink; +} + void recover_inline_xattr(struct inode *inode, struct page *page) { void *src_addr, *dst_addr; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 24a8c1d4f45f..faec2ca004b9 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -399,14 +399,35 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, f2fs_bug_on(sbi, ni.ino != ino_of_node(page)); f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page)); - for (; start < end; start++) { + for (; start < end; start++, dn.ofs_in_node++) { block_t src, dest; src = datablock_addr(dn.node_page, dn.ofs_in_node); dest = datablock_addr(page, dn.ofs_in_node); - if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR && - is_valid_blkaddr(sbi, dest, META_POR)) { + /* skip recovering if dest is the same as src */ + if (src == dest) + continue; + + /* dest is invalid, just invalidate src block */ + if (dest == NULL_ADDR) { + truncate_data_blocks_range(&dn, 1); + continue; + } + + /* + * dest is reserved block, invalidate src block + * and then reserve one new block in dnode page. + */ + if (dest == NEW_ADDR) { + truncate_data_blocks_range(&dn, 1); + err = reserve_new_block(&dn); + f2fs_bug_on(sbi, err); + continue; + } + + /* dest is valid block, try to recover from src to dest */ + if (is_valid_blkaddr(sbi, dest, META_POR)) { if (src == NULL_ADDR) { err = reserve_new_block(&dn); @@ -424,7 +445,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, ni.version, false); recovered++; } - dn.ofs_in_node++; } if (IS_INODE(dn.node_page)) @@ -525,14 +545,12 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&inode_list); - /* step #1: find fsynced inode numbers */ - set_sbi_flag(sbi, SBI_POR_DOING); - /* prevent checkpoint */ mutex_lock(&sbi->cp_mutex); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list); if (err) goto out; @@ -561,11 +579,20 @@ out: clear_sbi_flag(sbi, SBI_POR_DOING); if (err) { - discard_next_dnode(sbi, blkaddr); + bool invalidate = false; + + if (discard_next_dnode(sbi, blkaddr)) + invalidate = true; /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) sync_meta_pages(sbi, META, LONG_MAX); + + /* invalidate temporary meta page */ + if (invalidate) + invalidate_mapping_pages(META_MAPPING(sbi), + blkaddr, blkaddr); + set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); mutex_unlock(&sbi->cp_mutex); } else if (need_writecp) { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1eb343768781..78e6d0696847 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -197,28 +197,20 @@ void register_inmem_page(struct inode *inode, struct page *page) { struct f2fs_inode_info *fi = F2FS_I(inode); struct inmem_pages *new; - int err; - SetPagePrivate(page); f2fs_trace_pid(page); + set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); + SetPagePrivate(page); + new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); /* add atomic page indices to the list */ new->page = page; INIT_LIST_HEAD(&new->list); -retry: + /* increase reference count with clean state */ mutex_lock(&fi->inmem_lock); - err = radix_tree_insert(&fi->inmem_root, page->index, new); - if (err == -EEXIST) { - mutex_unlock(&fi->inmem_lock); - kmem_cache_free(inmem_entry_slab, new); - return; - } else if (err) { - mutex_unlock(&fi->inmem_lock); - goto retry; - } get_page(page); 
list_add_tail(&new->list, &fi->inmem_pages); inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); @@ -227,7 +219,7 @@ retry: trace_f2fs_register_inmem_page(page, INMEM); } -void commit_inmem_pages(struct inode *inode, bool abort) +int commit_inmem_pages(struct inode *inode, bool abort) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -239,6 +231,7 @@ void commit_inmem_pages(struct inode *inode, bool abort) .rw = WRITE_SYNC | REQ_PRIO, .encrypted_page = NULL, }; + int err = 0; /* * The abort is true only when f2fs_evict_inode is called. @@ -254,23 +247,29 @@ void commit_inmem_pages(struct inode *inode, bool abort) mutex_lock(&fi->inmem_lock); list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { + lock_page(cur->page); if (!abort) { - lock_page(cur->page); if (cur->page->mapping == inode->i_mapping) { + set_page_dirty(cur->page); f2fs_wait_on_page_writeback(cur->page, DATA); if (clear_page_dirty_for_io(cur->page)) inode_dec_dirty_pages(inode); trace_f2fs_commit_inmem_page(cur->page, INMEM); fio.page = cur->page; - do_write_data_page(&fio); + err = do_write_data_page(&fio); submit_bio = true; + if (err) { + unlock_page(cur->page); + break; + } } - f2fs_put_page(cur->page, 1); } else { trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP); - put_page(cur->page); } - radix_tree_delete(&fi->inmem_root, cur->page->index); + set_page_private(cur->page, 0); + ClearPagePrivate(cur->page); + f2fs_put_page(cur->page, 1); + list_del(&cur->list); kmem_cache_free(inmem_entry_slab, cur); dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); @@ -282,6 +281,7 @@ void commit_inmem_pages(struct inode *inode, bool abort) if (submit_bio) f2fs_submit_merged_bio(sbi, DATA, WRITE); } + return err; } /* @@ -303,10 +303,18 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi) void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) { /* try to shrink extent cache when there is no enough memory */ - f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); + if (!available_free_memory(sbi, EXTENT_CACHE)) + f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); + + /* check the # of cached NAT entries */ + if (!available_free_memory(sbi, NAT_ENTRIES)) + try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); + + if (!available_free_memory(sbi, FREE_NIDS)) + try_to_free_nids(sbi, NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES); - /* check the # of cached NAT entries and prefree segments */ - if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) || + /* checkpoint is the only way to shrink partial cached entries */ + if (!available_free_memory(sbi, NAT_ENTRIES) || excess_prefree_segs(sbi) || !available_free_memory(sbi, INO_ENTRIES)) f2fs_sync_fs(sbi->sb, true); @@ -322,10 +330,12 @@ repeat: return 0; if (!llist_empty(&fcc->issue_list)) { - struct bio *bio = bio_alloc(GFP_NOIO, 0); + struct bio *bio; struct flush_cmd *cmd, *next; int ret; + bio = f2fs_bio_alloc(0); + fcc->dispatch_list = llist_del_all(&fcc->issue_list); fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); @@ -357,8 +367,15 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE)) - return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL); + if (!test_opt(sbi, FLUSH_MERGE)) { + struct bio *bio = f2fs_bio_alloc(0); + int ret; + + bio->bi_bdev = sbi->sb->s_bdev; + ret = submit_bio_wait(WRITE_FLUSH, bio); + bio_put(bio); + return ret; + } init_completion(&cmd.wait); @@ -502,7 +519,7 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, return 
blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); } -void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) +bool discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) { int err = -ENOTSUPP; @@ -512,13 +529,16 @@ void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) unsigned int offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); if (f2fs_test_bit(offset, se->discard_map)) - return; + return false; err = f2fs_issue_discard(sbi, blkaddr, 1); } - if (err) + if (err) { update_meta_page(sbi, NULL, blkaddr); + return true; + } + return false; } static void __add_discard_entry(struct f2fs_sb_info *sbi, @@ -1217,7 +1237,8 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, mutex_lock(&sit_i->sentry_lock); /* direct_io'ed data is aligned to the segment for better performance */ - if (direct_io && curseg->next_blkoff) + if (direct_io && curseg->next_blkoff && + !has_not_enough_free_secs(sbi, 0)) __allocate_new_segments(sbi, type); *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); @@ -1732,7 +1753,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, static struct sit_entry_set *grab_sit_entry_set(void) { struct sit_entry_set *ses = - f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_ATOMIC); + f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_NOFS); ses->entry_cnt = 0; INIT_LIST_HEAD(&ses->set_list); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 79e7b879a753..b6e4ed15c698 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -177,6 +177,15 @@ struct segment_allocation { void (*allocate_segment)(struct f2fs_sb_info *, int, bool); }; +/* + * this value is set in page as a private data which indicate that + * the page is atomically written, and it is in inmem_pages list. + */ +#define ATOMIC_WRITTEN_PAGE 0x0000ffff + +#define IS_ATOMIC_WRITTEN_PAGE(page) \ + (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) + struct inmem_pages { struct list_head list; struct page *page; @@ -555,16 +564,15 @@ static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type) return curseg->next_blkoff; } -#ifdef CONFIG_F2FS_CHECK_FS static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) { - BUG_ON(segno > TOTAL_SEGS(sbi) - 1); + f2fs_bug_on(sbi, segno > TOTAL_SEGS(sbi) - 1); } static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) { - BUG_ON(blk_addr < SEG0_BLKADDR(sbi)); - BUG_ON(blk_addr >= MAX_BLKADDR(sbi)); + f2fs_bug_on(sbi, blk_addr < SEG0_BLKADDR(sbi) + || blk_addr >= MAX_BLKADDR(sbi)); } /* @@ -573,16 +581,11 @@ static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) static inline void check_block_count(struct f2fs_sb_info *sbi, int segno, struct f2fs_sit_entry *raw_sit) { +#ifdef CONFIG_F2FS_CHECK_FS bool is_valid = test_bit_le(0, raw_sit->valid_map) ? 
true : false; int valid_blocks = 0; int cur_pos = 0, next_pos; - /* check segment usage */ - BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg); - - /* check boundary of a given segment number */ - BUG_ON(segno > TOTAL_SEGS(sbi) - 1); - /* check bitmap with valid block count */ do { if (is_valid) { @@ -598,35 +601,11 @@ static inline void check_block_count(struct f2fs_sb_info *sbi, is_valid = !is_valid; } while (cur_pos < sbi->blocks_per_seg); BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks); -} -#else -static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) -{ - if (segno > TOTAL_SEGS(sbi) - 1) - set_sbi_flag(sbi, SBI_NEED_FSCK); -} - -static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) -{ - if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi)) - set_sbi_flag(sbi, SBI_NEED_FSCK); -} - -/* - * Summary block is always treated as an invalid block - */ -static inline void check_block_count(struct f2fs_sb_info *sbi, - int segno, struct f2fs_sit_entry *raw_sit) -{ - /* check segment usage */ - if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg) - set_sbi_flag(sbi, SBI_NEED_FSCK); - - /* check boundary of a given segment number */ - if (segno > TOTAL_SEGS(sbi) - 1) - set_sbi_flag(sbi, SBI_NEED_FSCK); -} #endif + /* check segment usage, and check boundary of a given segment number */ + f2fs_bug_on(sbi, GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg + || segno > TOTAL_SEGS(sbi) - 1); +} static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, unsigned int start) diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c new file mode 100644 index 000000000000..da0d8e0b55a5 --- /dev/null +++ b/fs/f2fs/shrinker.c @@ -0,0 +1,139 @@ +/* + * f2fs shrinker support + * the basic infra was copied from fs/ubifs/shrinker.c + * + * Copyright (c) 2015 Motorola Mobility + * Copyright (c) 2015 Jaegeuk Kim <jaegeuk@kernel.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> + +#include "f2fs.h" + +static LIST_HEAD(f2fs_list); +static DEFINE_SPINLOCK(f2fs_list_lock); +static unsigned int shrinker_run_no; + +static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) +{ + return NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; +} + +static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) +{ + if (NM_I(sbi)->fcnt > NAT_ENTRY_PER_BLOCK) + return NM_I(sbi)->fcnt - NAT_ENTRY_PER_BLOCK; + return 0; +} + +static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) +{ + return sbi->total_ext_tree + atomic_read(&sbi->total_ext_node); +} + +unsigned long f2fs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct f2fs_sb_info *sbi; + struct list_head *p; + unsigned long count = 0; + + spin_lock(&f2fs_list_lock); + p = f2fs_list.next; + while (p != &f2fs_list) { + sbi = list_entry(p, struct f2fs_sb_info, s_list); + + /* stop f2fs_put_super */ + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + spin_unlock(&f2fs_list_lock); + + /* count extent cache entries */ + count += __count_extent_cache(sbi); + + /* shrink clean nat cache entries */ + count += __count_nat_entries(sbi); + + /* count free nids cache entries */ + count += __count_free_nids(sbi); + + spin_lock(&f2fs_list_lock); + p = p->next; + mutex_unlock(&sbi->umount_mutex); + } + spin_unlock(&f2fs_list_lock); + return count; +} + +unsigned long f2fs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + unsigned long nr = sc->nr_to_scan; + struct f2fs_sb_info *sbi; + struct list_head *p; + unsigned int run_no; + unsigned long freed = 0; + + spin_lock(&f2fs_list_lock); + do { + run_no = ++shrinker_run_no; + } while (run_no == 0); + p = f2fs_list.next; + while (p != &f2fs_list) { + sbi = list_entry(p, struct f2fs_sb_info, s_list); + + if (sbi->shrinker_run_no == run_no) + break; + + /* stop f2fs_put_super */ + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + spin_unlock(&f2fs_list_lock); + + sbi->shrinker_run_no = run_no; + + /* shrink extent cache entries */ + freed += f2fs_shrink_extent_tree(sbi, nr >> 1); + + /* shrink clean nat cache entries */ + if (freed < nr) + freed += try_to_free_nats(sbi, nr - freed); + + /* shrink free nids cache entries */ + if (freed < nr) + freed += try_to_free_nids(sbi, nr - freed); + + spin_lock(&f2fs_list_lock); + p = p->next; + list_move_tail(&sbi->s_list, &f2fs_list); + mutex_unlock(&sbi->umount_mutex); + if (freed >= nr) + break; + } + spin_unlock(&f2fs_list_lock); + return freed; +} + +void f2fs_join_shrinker(struct f2fs_sb_info *sbi) +{ + spin_lock(&f2fs_list_lock); + list_add_tail(&sbi->s_list, &f2fs_list); + spin_unlock(&f2fs_list_lock); +} + +void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) +{ + f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi)); + + spin_lock(&f2fs_list_lock); + list_del(&sbi->s_list); + spin_unlock(&f2fs_list_lock); +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a06b0b46fe69..f79478115d37 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -39,6 +39,13 @@ static struct proc_dir_entry *f2fs_proc_root; static struct kmem_cache *f2fs_inode_cachep; static struct kset *f2fs_kset; +/* f2fs-wide shrinker description */ +static struct shrinker f2fs_shrinker_info = { + .scan_objects = f2fs_shrink_scan, + .count_objects = f2fs_shrink_count, + .seeks = DEFAULT_SEEKS, +}; + enum { Opt_gc_background, Opt_disable_roll_forward, @@ -58,6 +65,7 @@ enum { Opt_nobarrier, Opt_fastboot, 
Opt_extent_cache, + Opt_noextent_cache, Opt_noinline_data, Opt_err, }; @@ -81,6 +89,7 @@ static match_table_t f2fs_tokens = { {Opt_nobarrier, "nobarrier"}, {Opt_fastboot, "fastboot"}, {Opt_extent_cache, "extent_cache"}, + {Opt_noextent_cache, "noextent_cache"}, {Opt_noinline_data, "noinline_data"}, {Opt_err, NULL}, }; @@ -382,6 +391,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_extent_cache: set_opt(sbi, EXTENT_CACHE); break; + case Opt_noextent_cache: + clear_opt(sbi, EXTENT_CACHE); + break; case Opt_noinline_data: clear_opt(sbi, INLINE_DATA); break; @@ -410,9 +422,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; - rwlock_init(&fi->ext_lock); init_rwsem(&fi->i_sem); - INIT_RADIX_TREE(&fi->inmem_root, GFP_NOFS); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); @@ -441,17 +451,22 @@ static int f2fs_drop_inode(struct inode *inode) */ if (!inode_unhashed(inode) && inode->i_state & I_SYNC) { if (!inode->i_nlink && !is_bad_inode(inode)) { + /* to avoid evict_inode call simultaneously */ + atomic_inc(&inode->i_count); spin_unlock(&inode->i_lock); /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) commit_inmem_pages(inode, true); + /* should remain fi->extent_tree for writepage */ + f2fs_destroy_extent_node(inode); + sb_start_intwrite(inode->i_sb); i_size_write(inode, 0); if (F2FS_HAS_BLOCKS(inode)) - f2fs_truncate(inode); + f2fs_truncate(inode, true); sb_end_intwrite(inode->i_sb); @@ -461,6 +476,7 @@ static int f2fs_drop_inode(struct inode *inode) F2FS_I(inode)->i_crypt_info); #endif spin_lock(&inode->i_lock); + atomic_dec(&inode->i_count); } return 0; } @@ -498,9 +514,11 @@ static void f2fs_put_super(struct super_block *sb) } kobject_del(&sbi->s_kobj); - f2fs_destroy_stats(sbi); stop_gc_thread(sbi); + /* prevent remaining shrinker jobs */ + mutex_lock(&sbi->umount_mutex); + /* * We don't need to do checkpoint when superblock is clean. * But, the previous checkpoint was not done by umount, it needs to do @@ -514,6 +532,9 @@ static void f2fs_put_super(struct super_block *sb) write_checkpoint(sbi, &cpc); } + /* write_checkpoint can update stat informaion */ + f2fs_destroy_stats(sbi); + /* * normally superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. 
@@ -521,6 +542,9 @@ static void f2fs_put_super(struct super_block *sb) release_dirty_inode(sbi); release_discard_addrs(sbi); + f2fs_leave_shrinker(sbi); + mutex_unlock(&sbi->umount_mutex); + iput(sbi->node_inode); iput(sbi->meta_inode); @@ -647,6 +671,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",fastboot"); if (test_opt(sbi, EXTENT_CACHE)) seq_puts(seq, ",extent_cache"); + else + seq_puts(seq, ",noextent_cache"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -667,7 +693,7 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) struct seg_entry *se = get_seg_entry(sbi, i); if ((i % 10) == 0) - seq_printf(seq, "%-5d", i); + seq_printf(seq, "%-10d", i); seq_printf(seq, "%d|%-3u", se->type, get_valid_blocks(sbi, i, 1)); if ((i % 10) == 9 || i == (total_segs - 1)) @@ -699,6 +725,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, BG_GC); set_opt(sbi, INLINE_DATA); + set_opt(sbi, EXTENT_CACHE); #ifdef CONFIG_F2FS_FS_XATTR set_opt(sbi, XATTR_USER); @@ -970,6 +997,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->dir_level = DEF_DIR_LEVEL; clear_sbi_flag(sbi, SBI_NEED_FSCK); + + INIT_LIST_HEAD(&sbi->s_list); + mutex_init(&sbi->umount_mutex); } /* @@ -1135,7 +1165,9 @@ try_onemore: mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); - clear_sbi_flag(sbi, SBI_POR_DOING); + + /* disallow all the data/node/meta page writes */ + set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); init_rwsem(&sbi->read_io.io_rwsem); @@ -1212,8 +1244,12 @@ try_onemore: goto free_nm; } + f2fs_join_shrinker(sbi); + /* if there are nt orphan nodes free them */ - recover_orphan_inodes(sbi); + err = recover_orphan_inodes(sbi); + if (err) + goto free_node_inode; /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); @@ -1275,6 +1311,8 @@ try_onemore: goto free_kobj; } } + /* recover_fsync_data() cleared this already */ + clear_sbi_flag(sbi, SBI_POR_DOING); /* * If filesystem is not mounted as read-only then @@ -1308,7 +1346,10 @@ free_root_inode: dput(sb->s_root); sb->s_root = NULL; free_node_inode: + mutex_lock(&sbi->umount_mutex); + f2fs_leave_shrinker(sbi); iput(sbi->node_inode); + mutex_unlock(&sbi->umount_mutex); free_nm: destroy_node_manager(sbi); free_sm: @@ -1404,13 +1445,20 @@ static int __init init_f2fs_fs(void) err = f2fs_init_crypto(); if (err) goto free_kset; - err = register_filesystem(&f2fs_fs_type); + + err = register_shrinker(&f2fs_shrinker_info); if (err) goto free_crypto; + + err = register_filesystem(&f2fs_fs_type); + if (err) + goto free_shrinker; f2fs_create_root_stats(); f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); return 0; +free_shrinker: + unregister_shrinker(&f2fs_shrinker_info); free_crypto: f2fs_exit_crypto(); free_kset: @@ -1433,6 +1481,7 @@ static void __exit exit_f2fs_fs(void) { remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); + unregister_shrinker(&f2fs_shrinker_info); unregister_filesystem(&f2fs_fs_type); f2fs_exit_crypto(); destroy_extent_cache(); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 07449b980acb..4de2286c0e4d 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -499,9 +499,12 @@ static int __f2fs_setxattr(struct inode *inode, int index, len = strlen(name); - if (len > F2FS_NAME_LEN || size > MAX_VALUE_LEN(inode)) + if (len > F2FS_NAME_LEN) return -ERANGE; + if (size > MAX_VALUE_LEN(inode)) + return -E2BIG; + base_addr = read_all_xattrs(inode, ipage); if (!base_addr) goto exit; 
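The init_f2fs_fs()/exit_f2fs_fs() hunks above follow the usual module-init unwind pattern: the shrinker is registered before the filesystem type, each later failure backs out through a matching goto label in reverse order, and module exit tears everything down in the opposite order of setup. Below is a minimal, self-contained sketch of that same pattern; all names are hypothetical and not part of this patch, and the count/scan callbacks only manipulate a dummy counter to stand in for real cache bookkeeping.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/shrinker.h>
#include <linux/atomic.h>

/* stand-in for the number of reclaimable cached objects this module holds */
static atomic_long_t demo_nr_cached = ATOMIC_LONG_INIT(0);

static unsigned long demo_shrink_count(struct shrinker *shrink,
                                       struct shrink_control *sc)
{
        /* report how many objects could be reclaimed right now */
        return atomic_long_read(&demo_nr_cached);
}

static unsigned long demo_shrink_scan(struct shrinker *shrink,
                                      struct shrink_control *sc)
{
        unsigned long nr = min_t(unsigned long, sc->nr_to_scan,
                                 atomic_long_read(&demo_nr_cached));

        if (!nr)
                return SHRINK_STOP;

        /* a real implementation would walk its lists and free objects here */
        atomic_long_sub(nr, &demo_nr_cached);
        return nr;      /* number of objects actually freed */
}

static struct shrinker demo_shrinker = {
        .count_objects  = demo_shrink_count,
        .scan_objects   = demo_shrink_scan,
        .seeks          = DEFAULT_SEEKS,
};

static struct file_system_type demo_fs_type = {
        .owner  = THIS_MODULE,
        .name   = "demofs",
        /* .mount and .kill_sb are elided in this sketch */
};

static int __init demo_init(void)
{
        int err;

        err = register_shrinker(&demo_shrinker);
        if (err)
                return err;

        err = register_filesystem(&demo_fs_type);
        if (err)
                goto free_shrinker;

        return 0;

free_shrinker:
        unregister_shrinker(&demo_shrinker);
        return err;
}

static void __exit demo_exit(void)
{
        unregister_filesystem(&demo_fs_type);
        unregister_shrinker(&demo_shrinker);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Registering the reclaim hook before the filesystem becomes mountable means no superblock can start populating caches that the shrinker does not yet know about, which is the same ordering the f2fs init path adopts.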
diff --git a/fs/file_table.c b/fs/file_table.c index 7f9d407c7595..ad17e05ebf95 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -25,6 +25,7 @@ #include <linux/hardirq.h> #include <linux/task_work.h> #include <linux/ima.h> +#include <linux/swap.h> #include <linux/atomic.h> @@ -308,19 +309,24 @@ void put_filp(struct file *file) } } -void __init files_init(unsigned long mempages) +void __init files_init(void) { - unsigned long n; - filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + percpu_counter_init(&nr_files, 0, GFP_KERNEL); +} - /* - * One file with associated inode and dcache is very roughly 1K. - * Per default don't use more than 10% of our memory for files. - */ +/* + * One file with associated inode and dcache is very roughly 1K. Per default + * do not use more than 10% of our memory for files. + */ +void __init files_maxfiles_init(void) +{ + unsigned long n; + unsigned long memreserve = (totalram_pages - nr_free_pages()) * 3/2; + + memreserve = min(memreserve, totalram_pages - 1); + n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10; - n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); - percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index 484b32d3234a..1cff72df0389 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -192,7 +192,7 @@ vxfs_inode_by_name(struct inode *dip, struct dentry *dp) * by @dp in @dip. * * Returns: - * A NULL-pointer on success, else an negative error code encoded + * A NULL-pointer on success, else a negative error code encoded * in the return pointer. */ static struct dentry * diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f4f0f228a530..24489126f8ca 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -86,7 +86,7 @@ unsigned int dirtytime_expire_interval = 12 * 60 * 60; static inline struct inode *wb_inode(struct list_head *head) { - return list_entry(head, struct inode, i_wb_list); + return list_entry(head, struct inode, i_io_list); } /* @@ -123,22 +123,22 @@ static void wb_io_lists_depopulated(struct bdi_writeback *wb) } /** - * inode_wb_list_move_locked - move an inode onto a bdi_writeback IO list + * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list * @inode: inode to be moved * @wb: target bdi_writeback * @head: one of @wb->b_{dirty|io|more_io} * - * Move @inode->i_wb_list to @list of @wb and set %WB_has_dirty_io. + * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io. * Returns %true if @inode is the first occupant of the !dirty_time IO * lists; otherwise, %false. */ -static bool inode_wb_list_move_locked(struct inode *inode, +static bool inode_io_list_move_locked(struct inode *inode, struct bdi_writeback *wb, struct list_head *head) { assert_spin_locked(&wb->list_lock); - list_move(&inode->i_wb_list, head); + list_move(&inode->i_io_list, head); /* dirty_time doesn't count as dirty_io until expiration */ if (head != &wb->b_dirty_time) @@ -149,19 +149,19 @@ static bool inode_wb_list_move_locked(struct inode *inode, } /** - * inode_wb_list_del_locked - remove an inode from its bdi_writeback IO list + * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list * @inode: inode to be removed * @wb: bdi_writeback @inode is being removed from * * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and * clear %WB_has_dirty_io if all are empty afterwards. 
*/ -static void inode_wb_list_del_locked(struct inode *inode, +static void inode_io_list_del_locked(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&wb->list_lock); - list_del_init(&inode->i_wb_list); + list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); } @@ -346,7 +346,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) /* * Once I_FREEING is visible under i_lock, the eviction path owns - * the inode and we shouldn't modify ->i_wb_list. + * the inode and we shouldn't modify ->i_io_list. */ if (unlikely(inode->i_state & I_FREEING)) goto skip_switch; @@ -385,16 +385,16 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) * is always correct including from ->b_dirty_time. The transfer * preserves @inode->dirtied_when ordering. */ - if (!list_empty(&inode->i_wb_list)) { + if (!list_empty(&inode->i_io_list)) { struct inode *pos; - inode_wb_list_del_locked(inode, old_wb); + inode_io_list_del_locked(inode, old_wb); inode->i_wb = new_wb; - list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list) + list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) if (time_after_eq(inode->dirtied_when, pos->dirtied_when)) break; - inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev); + inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev); } else { inode->i_wb = new_wb; } @@ -783,9 +783,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, struct wb_iter iter; might_sleep(); - - if (!bdi_has_dirty_io(bdi)) - return; restart: rcu_read_lock(); bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) { @@ -794,8 +791,12 @@ restart: struct wb_writeback_work *work; long nr_pages; - if (!wb_has_dirty_io(wb) || - (skip_if_busy && writeback_in_progress(wb))) + /* SYNC_ALL writes out I_DIRTY_TIME too */ + if (!wb_has_dirty_io(wb) && + (base_work->sync_mode == WB_SYNC_NONE || + list_empty(&wb->b_dirty_time))) + continue; + if (skip_if_busy && writeback_in_progress(wb)) continue; nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages); @@ -860,8 +861,7 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, { might_sleep(); - if (bdi_has_dirty_io(bdi) && - (!skip_if_busy || !writeback_in_progress(&bdi->wb))) { + if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) { base_work->auto_free = 0; wb_queue_work(&bdi->wb, base_work); } @@ -920,12 +920,12 @@ void wb_start_background_writeback(struct bdi_writeback *wb) /* * Remove the inode from the writeback list it is on. 
*/ -void inode_wb_list_del(struct inode *inode) +void inode_io_list_del(struct inode *inode) { struct bdi_writeback *wb; wb = inode_to_wb_and_lock_list(inode); - inode_wb_list_del_locked(inode, wb); + inode_io_list_del_locked(inode, wb); spin_unlock(&wb->list_lock); } @@ -947,7 +947,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) if (time_before(inode->dirtied_when, tail->dirtied_when)) inode->dirtied_when = jiffies; } - inode_wb_list_move_locked(inode, wb, &wb->b_dirty); + inode_io_list_move_locked(inode, wb, &wb->b_dirty); } /* @@ -955,7 +955,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) */ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) { - inode_wb_list_move_locked(inode, wb, &wb->b_more_io); + inode_io_list_move_locked(inode, wb, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) @@ -1014,7 +1014,7 @@ static int move_expired_inodes(struct list_head *delaying_queue, if (older_than_this && inode_dirtied_after(inode, *older_than_this)) break; - list_move(&inode->i_wb_list, &tmp); + list_move(&inode->i_io_list, &tmp); moved++; if (flags & EXPIRE_DIRTY_ATIME) set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state); @@ -1037,7 +1037,7 @@ static int move_expired_inodes(struct list_head *delaying_queue, list_for_each_prev_safe(pos, node, &tmp) { inode = wb_inode(pos); if (inode->i_sb == sb) - list_move(&inode->i_wb_list, dispatch_queue); + list_move(&inode->i_io_list, dispatch_queue); } } out: @@ -1191,10 +1191,10 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, redirty_tail(inode, wb); } else if (inode->i_state & I_DIRTY_TIME) { inode->dirtied_when = jiffies; - inode_wb_list_move_locked(inode, wb, &wb->b_dirty_time); + inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); } else { /* The inode is clean. Remove from writeback lists. */ - inode_wb_list_del_locked(inode, wb); + inode_io_list_del_locked(inode, wb); } } @@ -1337,7 +1337,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * touch it. See comment above for explanation. */ if (!(inode->i_state & I_DIRTY_ALL)) - inode_wb_list_del_locked(inode, wb); + inode_io_list_del_locked(inode, wb); spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: @@ -1398,7 +1398,9 @@ static long writeback_sb_inodes(struct super_block *sb, unsigned long start_time = jiffies; long write_chunk; long wrote = 0; /* count both pages and inodes */ + struct blk_plug plug; + blk_start_plug(&plug); while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); @@ -1496,6 +1498,7 @@ static long writeback_sb_inodes(struct super_block *sb, break; } } + blk_finish_plug(&plug); return wrote; } @@ -2035,7 +2038,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) else dirty_list = &wb->b_dirty_time; - wakeup_bdi = inode_wb_list_move_locked(inode, wb, + wakeup_bdi = inode_io_list_move_locked(inode, wb, dirty_list); spin_unlock(&wb->list_lock); @@ -2058,6 +2061,15 @@ out_unlock_inode: } EXPORT_SYMBOL(__mark_inode_dirty); +/* + * The @s_sync_lock is used to serialise concurrent sync operations + * to avoid lock contention problems with concurrent wait_sb_inodes() calls. + * Concurrent callers will block on the s_sync_lock rather than doing contending + * walks. 
The queueing maintains sync(2) required behaviour as all the IO that + * has been issued up to the time this function is enter is guaranteed to be + * completed by the time we have gained the lock and waited for all IO that is + * in progress regardless of the order callers are granted the lock. + */ static void wait_sb_inodes(struct super_block *sb) { struct inode *inode, *old_inode = NULL; @@ -2068,7 +2080,8 @@ static void wait_sb_inodes(struct super_block *sb) */ WARN_ON(!rwsem_is_locked(&sb->s_umount)); - spin_lock(&inode_sb_list_lock); + mutex_lock(&sb->s_sync_lock); + spin_lock(&sb->s_inode_list_lock); /* * Data integrity sync. Must wait for all pages under writeback, @@ -2088,14 +2101,14 @@ static void wait_sb_inodes(struct super_block *sb) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the - * inode_sb_list_lock. We cannot iput the inode now as we can + * s_inode_list_lock. We cannot iput the inode now as we can * be holding the last reference and we cannot iput it under - * inode_sb_list_lock. So we keep the reference and iput it + * s_inode_list_lock. So we keep the reference and iput it * later. */ iput(old_inode); @@ -2105,10 +2118,11 @@ static void wait_sb_inodes(struct super_block *sb) cond_resched(); - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); iput(old_inode); + mutex_unlock(&sb->s_sync_lock); } static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, @@ -2222,8 +2236,12 @@ void sync_inodes_sb(struct super_block *sb) }; struct backing_dev_info *bdi = sb->s_bdi; - /* Nothing to do? */ - if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info) + /* + * Can't skip on !bdi_has_dirty() because we should wait for !dirty + * inodes under writeback and I_DIRTY_TIME inodes ignored by + * bdi_has_dirty() need to be written out too. + */ + if (bdi == &noop_backing_dev_info) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 80cc1b35d460..ebb5e37455a0 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2246,7 +2246,15 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, err = -EINVAL; if (old) { - struct fuse_dev *fud = fuse_get_dev(old); + struct fuse_dev *fud = NULL; + + /* + * Check against file->f_op because CUSE + * uses the same ioctl handler. 
+ */ + if (old->f_op == file->f_op && + old->f_cred->user_ns == file->f_cred->user_ns) + fud = fuse_get_dev(old); if (fud) { mutex_lock(&fuse_mutex); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 2c1ae861dc94..92324ac58290 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -202,22 +202,22 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, * */ -static void gfs2_end_log_write(struct bio *bio, int error) +static void gfs2_end_log_write(struct bio *bio) { struct gfs2_sbd *sdp = bio->bi_private; struct bio_vec *bvec; struct page *page; int i; - if (error) { - sdp->sd_log_error = error; - fs_err(sdp, "Error %d writing to log\n", error); + if (bio->bi_error) { + sdp->sd_log_error = bio->bi_error; + fs_err(sdp, "Error %d writing to log\n", bio->bi_error); } bio_for_each_segment_all(bvec, bio, i) { page = bvec->bv_page; if (page_has_buffers(page)) - gfs2_end_log_write_bh(sdp, bvec, error); + gfs2_end_log_write_bh(sdp, bvec, bio->bi_error); else mempool_free(page, gfs2_page_pool); } @@ -261,18 +261,11 @@ void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw) static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) { struct super_block *sb = sdp->sd_vfs; - unsigned nrvecs = bio_get_nr_vecs(sb->s_bdev); struct bio *bio; BUG_ON(sdp->sd_log_bio); - while (1) { - bio = bio_alloc(GFP_NOIO, nrvecs); - if (likely(bio)) - break; - nrvecs = max(nrvecs/2, 1U); - } - + bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9); bio->bi_bdev = sb->s_bdev; bio->bi_end_io = gfs2_end_log_write; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 1e3a93f2f71d..02586e7eb964 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -171,14 +171,14 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent) return -EINVAL; } -static void end_bio_io_page(struct bio *bio, int error) +static void end_bio_io_page(struct bio *bio) { struct page *page = bio->bi_private; - if (!error) + if (!bio->bi_error) SetPageUptodate(page); else - pr_warn("error %d reading superblock\n", error); + pr_warn("error %d reading superblock\n", bio->bi_error); unlock_page(page); } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 2982445947e1..894fb01a91da 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1334,11 +1334,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) if (is_ancestor(root, sdp->sd_master_dir)) seq_puts(s, ",meta"); if (args->ar_lockproto[0]) - seq_printf(s, ",lockproto=%s", args->ar_lockproto); + seq_show_option(s, "lockproto", args->ar_lockproto); if (args->ar_locktable[0]) - seq_printf(s, ",locktable=%s", args->ar_locktable); + seq_show_option(s, "locktable", args->ar_locktable); if (args->ar_hostdata[0]) - seq_printf(s, ",hostdata=%s", args->ar_hostdata); + seq_show_option(s, "hostdata", args->ar_hostdata); if (args->ar_spectator) seq_puts(s, ",spectator"); if (args->ar_localflocks) diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index d3fa6bd9503e..221719eac5de 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -288,7 +288,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) page_cache_release(page); goto fail; } - page_cache_release(page); node->page[i] = page; } @@ -398,11 +397,11 @@ node_error: void hfs_bnode_free(struct hfs_bnode *node) { - //int i; + int i; - //for (i = 0; i < node->tree->pages_per_bnode; i++) - // if (node->page[i]) - // page_cache_release(node->page[i]); + for (i = 0; i < node->tree->pages_per_bnode; i++) + if (node->page[i]) + 
page_cache_release(node->page[i]); kfree(node); } diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c index 9f4ee7f52026..6fc766df0461 100644 --- a/fs/hfs/brec.c +++ b/fs/hfs/brec.c @@ -131,13 +131,16 @@ skip: hfs_bnode_write(node, entry, data_off + key_len, entry_len); hfs_bnode_dump(node); - if (new_node) { - /* update parent key if we inserted a key - * at the start of the first node - */ - if (!rec && new_node != node) - hfs_brec_update_parent(fd); + /* + * update parent key if we inserted a key + * at the start of the node and it is not the new node + */ + if (!rec && new_node != node) { + hfs_bnode_read_key(node, fd->search_key, data_off + size); + hfs_brec_update_parent(fd); + } + if (new_node) { hfs_bnode_put(fd->bnode); if (!new_node->parent) { hfs_btree_inc_height(tree); @@ -166,9 +169,6 @@ skip: goto again; } - if (!rec) - hfs_brec_update_parent(fd); - return 0; } @@ -366,6 +366,8 @@ again: if (IS_ERR(parent)) return PTR_ERR(parent); __hfs_brec_find(parent, fd); + if (fd->record < 0) + return -ENOENT; hfs_bnode_dump(parent); rec = fd->record; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 55c03b9e9070..4574fdd3d421 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -136,9 +136,9 @@ static int hfs_show_options(struct seq_file *seq, struct dentry *root) struct hfs_sb_info *sbi = HFS_SB(root->d_sb); if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f)) - seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator); + seq_show_option_n(seq, "creator", (char *)&sbi->s_creator, 4); if (sbi->s_type != cpu_to_be32(0x3f3f3f3f)) - seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type); + seq_show_option_n(seq, "type", (char *)&sbi->s_type, 4); seq_printf(seq, ",uid=%u,gid=%u", from_kuid_munged(&init_user_ns, sbi->s_uid), from_kgid_munged(&init_user_ns, sbi->s_gid)); diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 759708fd9331..63924662aaf3 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -454,7 +454,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) page_cache_release(page); goto fail; } - page_cache_release(page); node->page[i] = page; } @@ -566,13 +565,11 @@ node_error: void hfs_bnode_free(struct hfs_bnode *node) { -#if 0 int i; for (i = 0; i < node->tree->pages_per_bnode; i++) if (node->page[i]) page_cache_release(node->page[i]); -#endif kfree(node); } diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index c90b72ee676d..bb806e58c977 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -218,9 +218,9 @@ int hfsplus_show_options(struct seq_file *seq, struct dentry *root) struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb); if (sbi->creator != HFSPLUS_DEF_CR_TYPE) - seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); + seq_show_option_n(seq, "creator", (char *)&sbi->creator, 4); if (sbi->type != HFSPLUS_DEF_CR_TYPE) - seq_printf(seq, ",type=%.4s", (char *)&sbi->type); + seq_show_option_n(seq, "type", (char *)&sbi->type, 4); seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, from_kuid_munged(&init_user_ns, sbi->uid), from_kgid_munged(&init_user_ns, sbi->gid)); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 059597b23f67..2ac99db3750e 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -260,7 +260,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root) size_t offset = strlen(root_ino) + 1; if (strlen(root_path) > offset) - seq_printf(seq, ",%s", root_path + offset); + seq_show_option(seq, root_path + offset, NULL); if (append) seq_puts(seq, ",append"); diff --git 
a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c index 8057fe4e6574..f626114449e4 100644 --- a/fs/hpfs/buffer.c +++ b/fs/hpfs/buffer.c @@ -10,6 +10,30 @@ #include <linux/blkdev.h> #include "hpfs_fn.h" +secno hpfs_search_hotfix_map(struct super_block *s, secno sec) +{ + unsigned i; + struct hpfs_sb_info *sbi = hpfs_sb(s); + for (i = 0; unlikely(i < sbi->n_hotfixes); i++) { + if (sbi->hotfix_from[i] == sec) { + return sbi->hotfix_to[i]; + } + } + return sec; +} + +unsigned hpfs_search_hotfix_map_for_range(struct super_block *s, secno sec, unsigned n) +{ + unsigned i; + struct hpfs_sb_info *sbi = hpfs_sb(s); + for (i = 0; unlikely(i < sbi->n_hotfixes); i++) { + if (sbi->hotfix_from[i] >= sec && sbi->hotfix_from[i] < sec + n) { + n = sbi->hotfix_from[i] - sec; + } + } + return n; +} + void hpfs_prefetch_sectors(struct super_block *s, unsigned secno, int n) { struct buffer_head *bh; @@ -18,6 +42,9 @@ void hpfs_prefetch_sectors(struct super_block *s, unsigned secno, int n) if (n <= 0 || unlikely(secno >= hpfs_sb(s)->sb_fs_size)) return; + if (unlikely(hpfs_search_hotfix_map_for_range(s, secno, n) != n)) + return; + bh = sb_find_get_block(s, secno); if (bh) { if (buffer_uptodate(bh)) { @@ -51,7 +78,7 @@ void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head cond_resched(); - *bhp = bh = sb_bread(s, secno); + *bhp = bh = sb_bread(s, hpfs_search_hotfix_map(s, secno)); if (bh != NULL) return bh->b_data; else { @@ -71,7 +98,7 @@ void *hpfs_get_sector(struct super_block *s, unsigned secno, struct buffer_head cond_resched(); - if ((*bhp = bh = sb_getblk(s, secno)) != NULL) { + if ((*bhp = bh = sb_getblk(s, hpfs_search_hotfix_map(s, secno))) != NULL) { if (!buffer_uptodate(bh)) wait_on_buffer(bh); set_buffer_uptodate(bh); return bh->b_data; @@ -99,10 +126,10 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe hpfs_prefetch_sectors(s, secno, 4 + ahead); - if (!(qbh->bh[0] = sb_bread(s, secno + 0))) goto bail0; - if (!(qbh->bh[1] = sb_bread(s, secno + 1))) goto bail1; - if (!(qbh->bh[2] = sb_bread(s, secno + 2))) goto bail2; - if (!(qbh->bh[3] = sb_bread(s, secno + 3))) goto bail3; + if (!hpfs_map_sector(s, secno + 0, &qbh->bh[0], 0)) goto bail0; + if (!hpfs_map_sector(s, secno + 1, &qbh->bh[1], 0)) goto bail1; + if (!hpfs_map_sector(s, secno + 2, &qbh->bh[2], 0)) goto bail2; + if (!hpfs_map_sector(s, secno + 3, &qbh->bh[3], 0)) goto bail3; if (likely(qbh->bh[1]->b_data == qbh->bh[0]->b_data + 1 * 512) && likely(qbh->bh[2]->b_data == qbh->bh[0]->b_data + 2 * 512) && diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 7ca28d604bf7..d3bcdd975700 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -83,6 +83,11 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he if (s) { if (bh_result->b_size >> 9 < n_secs) n_secs = bh_result->b_size >> 9; + n_secs = hpfs_search_hotfix_map_for_range(inode->i_sb, s, n_secs); + if (unlikely(!n_secs)) { + s = hpfs_search_hotfix_map(inode->i_sb, s); + n_secs = 1; + } map_bh(bh_result, inode->i_sb, s); bh_result->b_size = n_secs << 9; goto ret_0; @@ -101,7 +106,7 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he inode->i_blocks++; hpfs_i(inode)->mmu_private += 512; set_buffer_new(bh_result); - map_bh(bh_result, inode->i_sb, s); + map_bh(bh_result, inode->i_sb, hpfs_search_hotfix_map(inode->i_sb, s)); ret_0: r = 0; ret_r: @@ -181,7 +186,7 @@ static int hpfs_write_end(struct file *file, struct address_space *mapping, static sector_t _hpfs_bmap(struct address_space *mapping, 
sector_t block) { - return generic_block_bmap(mapping,block,hpfs_get_block); + return generic_block_bmap(mapping, block, hpfs_get_block); } const struct address_space_operations hpfs_aops = { diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index c4867b5116dd..975654a63c13 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -88,6 +88,10 @@ struct hpfs_sb_info { unsigned sb_max_fwd_alloc; /* max forwad allocation */ int sb_timeshift; struct rcu_head rcu; + + unsigned n_hotfixes; + secno hotfix_from[256]; + secno hotfix_to[256]; }; /* Four 512-byte buffers and the 2k block obtained by concatenating them */ @@ -217,6 +221,8 @@ void hpfs_remove_fnode(struct super_block *, fnode_secno fno); /* buffer.c */ +secno hpfs_search_hotfix_map(struct super_block *s, secno sec); +unsigned hpfs_search_hotfix_map_for_range(struct super_block *s, secno sec, unsigned n); void hpfs_prefetch_sectors(struct super_block *, unsigned, int); void *hpfs_map_sector(struct super_block *, unsigned, struct buffer_head **, int); void *hpfs_get_sector(struct super_block *, unsigned, struct buffer_head **); @@ -285,6 +291,7 @@ __le32 *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head void hpfs_prefetch_bitmap(struct super_block *, unsigned); unsigned char *hpfs_load_code_page(struct super_block *, secno); __le32 *hpfs_load_bitmap_directory(struct super_block *, secno bmp); +void hpfs_load_hotfix_map(struct super_block *s, struct hpfs_spare_block *spareblock); struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **); struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **); struct dnode *hpfs_map_dnode(struct super_block *s, dnode_secno, struct quad_buffer_head *); diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c index 442770edcdc7..a69bbc1e87f8 100644 --- a/fs/hpfs/map.c +++ b/fs/hpfs/map.c @@ -130,6 +130,32 @@ __le32 *hpfs_load_bitmap_directory(struct super_block *s, secno bmp) return b; } +void hpfs_load_hotfix_map(struct super_block *s, struct hpfs_spare_block *spareblock) +{ + struct quad_buffer_head qbh; + u32 *directory; + u32 n_hotfixes, n_used_hotfixes; + unsigned i; + + n_hotfixes = le32_to_cpu(spareblock->n_spares); + n_used_hotfixes = le32_to_cpu(spareblock->n_spares_used); + + if (n_hotfixes > 256 || n_used_hotfixes > n_hotfixes) { + hpfs_error(s, "invalid number of hotfixes: %u, used: %u", n_hotfixes, n_used_hotfixes); + return; + } + if (!(directory = hpfs_map_4sectors(s, le32_to_cpu(spareblock->hotfix_map), &qbh, 0))) { + hpfs_error(s, "can't load hotfix map"); + return; + } + for (i = 0; i < n_used_hotfixes; i++) { + hpfs_sb(s)->hotfix_from[i] = le32_to_cpu(directory[i]); + hpfs_sb(s)->hotfix_to[i] = le32_to_cpu(directory[n_hotfixes + i]); + } + hpfs_sb(s)->n_hotfixes = n_used_hotfixes; + hpfs_brelse4(&qbh); +} + /* * Load fnode to memory */ diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index a0872f239f04..9e92c9c2d319 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -8,6 +8,17 @@ #include <linux/sched.h> #include "hpfs_fn.h" +static void hpfs_update_directory_times(struct inode *dir) +{ + time_t t = get_seconds(); + if (t == dir->i_mtime.tv_sec && + t == dir->i_ctime.tv_sec) + return; + dir->i_mtime.tv_sec = dir->i_ctime.tv_sec = t; + dir->i_mtime.tv_nsec = dir->i_ctime.tv_nsec = 0; + hpfs_write_inode_nolock(dir); +} + static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { const unsigned char *name = dentry->d_name.name; @@ -99,6 +110,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry 
*dentry, umode_t mode) result->i_mode = mode | S_IFDIR; hpfs_write_inode_nolock(result); } + hpfs_update_directory_times(dir); d_instantiate(dentry, result); hpfs_unlock(dir->i_sb); return 0; @@ -187,6 +199,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, b result->i_mode = mode | S_IFREG; hpfs_write_inode_nolock(result); } + hpfs_update_directory_times(dir); d_instantiate(dentry, result); hpfs_unlock(dir->i_sb); return 0; @@ -262,6 +275,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, de insert_inode_hash(result); hpfs_write_inode_nolock(result); + hpfs_update_directory_times(dir); d_instantiate(dentry, result); brelse(bh); hpfs_unlock(dir->i_sb); @@ -340,6 +354,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy insert_inode_hash(result); hpfs_write_inode_nolock(result); + hpfs_update_directory_times(dir); d_instantiate(dentry, result); hpfs_unlock(dir->i_sb); return 0; @@ -423,6 +438,8 @@ again: out1: hpfs_brelse4(&qbh); out: + if (!err) + hpfs_update_directory_times(dir); hpfs_unlock(dir->i_sb); return err; } @@ -477,6 +494,8 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) out1: hpfs_brelse4(&qbh); out: + if (!err) + hpfs_update_directory_times(dir); hpfs_unlock(dir->i_sb); return err; } @@ -595,7 +614,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto end1; } - end: +end: hpfs_i(i)->i_parent_dir = new_dir->i_ino; if (S_ISDIR(i->i_mode)) { inc_nlink(new_dir); @@ -610,6 +629,10 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, brelse(bh); } end1: + if (!err) { + hpfs_update_directory_times(old_dir); + hpfs_update_directory_times(new_dir); + } hpfs_unlock(i->i_sb); return err; } diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 68a9bed05628..a561591896bd 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -628,6 +628,9 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) goto bail4; } + if (spareblock->n_spares_used) + hpfs_load_hotfix_map(s, spareblock); + /* Load bitmap directory */ if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, le32_to_cpu(superblock->bitmaps)))) goto bail4; @@ -647,18 +650,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) mark_buffer_dirty(bh2); } - if (spareblock->hotfixes_used || spareblock->n_spares_used) { - if (errs >= 2) { - pr_err("Hotfixes not supported here, try chkdsk\n"); - mark_dirty(s, 0); - goto bail4; - } - hpfs_error(s, "hotfixes not supported here, try chkdsk"); - if (errs == 0) - pr_err("Proceeding, but your filesystem will be probably corrupted by this driver...\n"); - else - pr_err("This driver may read bad files or crash when operating on disk with hotfixes.\n"); - } if (le32_to_cpu(spareblock->n_dnode_spares) != le32_to_cpu(spareblock->n_dnode_spares_free)) { if (errs >= 2) { pr_err("Spare dnodes used, try chkdsk\n"); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 0cf74df68617..316adb968b65 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -12,6 +12,7 @@ #include <linux/thread_info.h> #include <asm/current.h> #include <linux/sched.h> /* remove ASAP */ +#include <linux/falloc.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/file.h> @@ -84,6 +85,29 @@ static const match_table_t tokens = { {Opt_err, NULL}, }; +#ifdef CONFIG_NUMA +static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma, + struct inode *inode, pgoff_t index) +{ + vma->vm_policy 
= mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy, + index); +} + +static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma) +{ + mpol_cond_put(vma->vm_policy); +} +#else +static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma, + struct inode *inode, pgoff_t index) +{ +} + +static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma) +{ +} +#endif + static void huge_pagevec_release(struct pagevec *pvec) { int i; @@ -293,26 +317,61 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, return -EINVAL; } -static void truncate_huge_page(struct page *page) +static void remove_huge_page(struct page *page) { ClearPageDirty(page); ClearPageUptodate(page); delete_from_page_cache(page); } -static void truncate_hugepages(struct inode *inode, loff_t lstart) + +/* + * remove_inode_hugepages handles two distinct cases: truncation and hole + * punch. There are subtle differences in operation for each case. + + * truncation is indicated by end of range being LLONG_MAX + * In this case, we first scan the range and release found pages. + * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv + * maps and global counts. + * hole punch is indicated if end is not LLONG_MAX + * In the hole punch case we scan the range and release found pages. + * Only when releasing a page is the associated region/reserv map + * deleted. The region/reserv map for ranges without associated + * pages are not modified. + * Note: If the passed end of range value is beyond the end of file, but + * not LLONG_MAX this routine still performs a hole punch operation. + */ +static void remove_inode_hugepages(struct inode *inode, loff_t lstart, + loff_t lend) { struct hstate *h = hstate_inode(inode); struct address_space *mapping = &inode->i_data; const pgoff_t start = lstart >> huge_page_shift(h); + const pgoff_t end = lend >> huge_page_shift(h); + struct vm_area_struct pseudo_vma; struct pagevec pvec; pgoff_t next; int i, freed = 0; + long lookup_nr = PAGEVEC_SIZE; + bool truncate_op = (lend == LLONG_MAX); + memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); + pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pagevec_init(&pvec, 0); next = start; - while (1) { - if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + while (next < end) { + /* + * Make sure to never grab more pages that we + * might possibly need. + */ + if (end - next < lookup_nr) + lookup_nr = end - next; + + /* + * This pagevec_lookup() may return pages past 'end', + * so we must check for page->index > end. + */ + if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) { if (next == start) break; next = start; @@ -321,26 +380,69 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart) for (i = 0; i < pagevec_count(&pvec); ++i) { struct page *page = pvec.pages[i]; + u32 hash; + + hash = hugetlb_fault_mutex_hash(h, current->mm, + &pseudo_vma, + mapping, next, 0); + mutex_lock(&hugetlb_fault_mutex_table[hash]); lock_page(page); + if (page->index >= end) { + unlock_page(page); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + next = end; /* we are done */ + break; + } + + /* + * If page is mapped, it was faulted in after being + * unmapped. Do nothing in this race case. In the + * normal case page is not mapped. + */ + if (!page_mapped(page)) { + bool rsv_on_error = !PagePrivate(page); + /* + * We must free the huge page and remove + * from page cache (remove_huge_page) BEFORE + * removing the region/reserve map + * (hugetlb_unreserve_pages). 
In rare out + * of memory conditions, removal of the + * region/reserve map could fail. Before + * free'ing the page, note PagePrivate which + * is used in case of error. + */ + remove_huge_page(page); + freed++; + if (!truncate_op) { + if (unlikely(hugetlb_unreserve_pages( + inode, next, + next + 1, 1))) + hugetlb_fix_reserve_counts( + inode, rsv_on_error); + } + } + if (page->index > next) next = page->index; + ++next; - truncate_huge_page(page); unlock_page(page); - freed++; + + mutex_unlock(&hugetlb_fault_mutex_table[hash]); } huge_pagevec_release(&pvec); } - BUG_ON(!lstart && mapping->nrpages); - hugetlb_unreserve_pages(inode, start, freed); + + if (truncate_op) + (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed); } static void hugetlbfs_evict_inode(struct inode *inode) { struct resv_map *resv_map; - truncate_hugepages(inode, 0); + remove_inode_hugepages(inode, 0, LLONG_MAX); resv_map = (struct resv_map *)inode->i_mapping->private_data; /* root inode doesn't have the resv_map, so we should check it */ if (resv_map) @@ -349,11 +451,15 @@ static void hugetlbfs_evict_inode(struct inode *inode) } static inline void -hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff) +hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end) { struct vm_area_struct *vma; - vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) { + /* + * end == 0 indicates that the entire range after + * start should be unmapped. + */ + vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) { unsigned long v_offset; /* @@ -362,13 +468,20 @@ hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff) * which overlap the truncated area starting at pgoff, * and no vma on a 32-bit arch can span beyond the 4GB. */ - if (vma->vm_pgoff < pgoff) - v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT; + if (vma->vm_pgoff < start) + v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT; else v_offset = 0; - unmap_hugepage_range(vma, vma->vm_start + v_offset, - vma->vm_end, NULL); + if (end) { + end = ((end - start) << PAGE_SHIFT) + + vma->vm_start + v_offset; + if (end > vma->vm_end) + end = vma->vm_end; + } else + end = vma->vm_end; + + unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL); } } @@ -384,12 +497,164 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) i_size_write(inode, offset); i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap)) - hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); + hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0); i_mmap_unlock_write(mapping); - truncate_hugepages(inode, offset); + remove_inode_hugepages(inode, offset, LLONG_MAX); return 0; } +static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) +{ + struct hstate *h = hstate_inode(inode); + loff_t hpage_size = huge_page_size(h); + loff_t hole_start, hole_end; + + /* + * For hole punch round up the beginning offset of the hole and + * round down the end. 
+ */ + hole_start = round_up(offset, hpage_size); + hole_end = round_down(offset + len, hpage_size); + + if (hole_end > hole_start) { + struct address_space *mapping = inode->i_mapping; + + mutex_lock(&inode->i_mutex); + i_mmap_lock_write(mapping); + if (!RB_EMPTY_ROOT(&mapping->i_mmap)) + hugetlb_vmdelete_list(&mapping->i_mmap, + hole_start >> PAGE_SHIFT, + hole_end >> PAGE_SHIFT); + i_mmap_unlock_write(mapping); + remove_inode_hugepages(inode, hole_start, hole_end); + mutex_unlock(&inode->i_mutex); + } + + return 0; +} + +static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + struct address_space *mapping = inode->i_mapping; + struct hstate *h = hstate_inode(inode); + struct vm_area_struct pseudo_vma; + struct mm_struct *mm = current->mm; + loff_t hpage_size = huge_page_size(h); + unsigned long hpage_shift = huge_page_shift(h); + pgoff_t start, index, end; + int error; + u32 hash; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + if (mode & FALLOC_FL_PUNCH_HOLE) + return hugetlbfs_punch_hole(inode, offset, len); + + /* + * Default preallocate case. + * For this range, start is rounded down and end is rounded up + * as well as being converted to page offsets. + */ + start = offset >> hpage_shift; + end = (offset + len + hpage_size - 1) >> hpage_shift; + + mutex_lock(&inode->i_mutex); + + /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ + error = inode_newsize_ok(inode, offset + len); + if (error) + goto out; + + /* + * Initialize a pseudo vma as this is required by the huge page + * allocation routines. If NUMA is configured, use page index + * as input to create an allocation policy. + */ + memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); + pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); + pseudo_vma.vm_file = file; + + for (index = start; index < end; index++) { + /* + * This is supposed to be the vaddr where the page is being + * faulted in, but we have no vaddr here. + */ + struct page *page; + unsigned long addr; + int avoid_reserve = 0; + + cond_resched(); + + /* + * fallocate(2) manpage permits EINTR; we may have been + * interrupted because we are using up too much memory. 
+ */ + if (signal_pending(current)) { + error = -EINTR; + break; + } + + /* Set numa allocation policy based on index */ + hugetlb_set_vma_policy(&pseudo_vma, inode, index); + + /* addr is the offset within the file (zero based) */ + addr = index * hpage_size; + + /* mutex taken here, fault path and hole punch */ + hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping, + index, addr); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + + /* See if already present in mapping to avoid alloc/free */ + page = find_get_page(mapping, index); + if (page) { + put_page(page); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + hugetlb_drop_vma_policy(&pseudo_vma); + continue; + } + + /* Allocate page and add to page cache */ + page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve); + hugetlb_drop_vma_policy(&pseudo_vma); + if (IS_ERR(page)) { + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + error = PTR_ERR(page); + goto out; + } + clear_huge_page(page, addr, pages_per_huge_page(h)); + __SetPageUptodate(page); + error = huge_add_to_page_cache(page, mapping, index); + if (unlikely(error)) { + put_page(page); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + goto out; + } + + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + + /* + * page_put due to reference from alloc_huge_page() + * unlock_page because locked by add_to_page_cache() + */ + put_page(page); + unlock_page(page); + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) + i_size_write(inode, offset + len); + inode->i_ctime = CURRENT_TIME; + spin_lock(&inode->i_lock); + inode->i_private = NULL; + spin_unlock(&inode->i_lock); +out: + mutex_unlock(&inode->i_mutex); + return error; +} + static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -701,7 +966,8 @@ const struct file_operations hugetlbfs_file_operations = { .mmap = hugetlbfs_file_mmap, .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, - .llseek = default_llseek, + .llseek = default_llseek, + .fallocate = hugetlbfs_fallocate, }; static const struct inode_operations hugetlbfs_dir_inode_operations = { @@ -1010,6 +1276,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size, inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0); if (!inode) goto out_dentry; + if (creat_flags == HUGETLB_SHMFS_INODE) + inode->i_flags |= S_PRIVATE; file = ERR_PTR(-ENOMEM); if (hugetlb_reserve_pages(inode, 0, diff --git a/fs/inode.c b/fs/inode.c index d30640f7a193..78a17b8859e1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -28,16 +28,16 @@ * inode->i_state, inode->i_hash, __iget() * Inode LRU list locks protect: * inode->i_sb->s_inode_lru, inode->i_lru - * inode_sb_list_lock protects: - * sb->s_inodes, inode->i_sb_list + * inode->i_sb->s_inode_list_lock protects: + * inode->i_sb->s_inodes, inode->i_sb_list * bdi->wb.list_lock protects: - * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list + * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list * inode_hash_lock protects: * inode_hashtable, inode->i_hash * * Lock ordering: * - * inode_sb_list_lock + * inode->i_sb->s_inode_list_lock * inode->i_lock * Inode LRU list locks * @@ -45,7 +45,7 @@ * inode->i_lock * * inode_hash_lock - * inode_sb_list_lock + * inode->i_sb->s_inode_list_lock * inode->i_lock * * iunique_lock @@ -57,8 +57,6 @@ static unsigned int i_hash_shift __read_mostly; static struct hlist_head *inode_hashtable __read_mostly; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); 
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); - /* * Empty aops. Can be used for the cases where the user does not * define any of the address_space operations. @@ -359,7 +357,7 @@ void inode_init_once(struct inode *inode) memset(inode, 0, sizeof(*inode)); INIT_HLIST_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_devices); - INIT_LIST_HEAD(&inode->i_wb_list); + INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_lru); address_space_init_once(&inode->i_data); i_size_ordered_init(inode); @@ -426,18 +424,18 @@ static void inode_lru_list_del(struct inode *inode) */ void inode_sb_list_add(struct inode *inode) { - spin_lock(&inode_sb_list_lock); + spin_lock(&inode->i_sb->s_inode_list_lock); list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&inode->i_sb->s_inode_list_lock); } EXPORT_SYMBOL_GPL(inode_sb_list_add); static inline void inode_sb_list_del(struct inode *inode) { if (!list_empty(&inode->i_sb_list)) { - spin_lock(&inode_sb_list_lock); + spin_lock(&inode->i_sb->s_inode_list_lock); list_del_init(&inode->i_sb_list); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&inode->i_sb->s_inode_list_lock); } } @@ -527,8 +525,8 @@ static void evict(struct inode *inode) BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(!list_empty(&inode->i_lru)); - if (!list_empty(&inode->i_wb_list)) - inode_wb_list_del(inode); + if (!list_empty(&inode->i_io_list)) + inode_io_list_del(inode); inode_sb_list_del(inode); @@ -577,6 +575,7 @@ static void dispose_list(struct list_head *head) list_del_init(&inode->i_lru); evict(inode); + cond_resched(); } } @@ -594,7 +593,8 @@ void evict_inodes(struct super_block *sb) struct inode *inode, *next; LIST_HEAD(dispose); - spin_lock(&inode_sb_list_lock); +again: + spin_lock(&sb->s_inode_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (atomic_read(&inode->i_count)) continue; @@ -609,8 +609,20 @@ void evict_inodes(struct super_block *sb) inode_lru_list_del(inode); spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); + + /* + * We can have a ton of inodes to evict at unmount time given + * enough memory, check to see if we need to go to sleep for a + * bit so we don't livelock. 
+ */ + if (need_resched()) { + spin_unlock(&sb->s_inode_list_lock); + cond_resched(); + dispose_list(&dispose); + goto again; + } } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); } @@ -631,7 +643,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) struct inode *inode, *next; LIST_HEAD(dispose); - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { @@ -654,7 +666,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); @@ -890,7 +902,7 @@ struct inode *new_inode(struct super_block *sb) { struct inode *inode; - spin_lock_prefetch(&inode_sb_list_lock); + spin_lock_prefetch(&sb->s_inode_list_lock); inode = new_inode_pseudo(sb); if (inode) diff --git a/fs/internal.h b/fs/internal.h index 4d5af583ab03..71859c4d0b41 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -112,14 +112,13 @@ extern int vfs_open(const struct path *, struct file *, const struct cred *); /* * inode.c */ -extern spinlock_t inode_sb_list_lock; extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); extern void inode_add_lru(struct inode *inode); /* * fs-writeback.c */ -extern void inode_wb_list_del(struct inode *inode); +extern void inode_io_list_del(struct inode *inode); extern long get_nr_dirty_inodes(void); extern void evict_inodes(struct super_block *); diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig deleted file mode 100644 index 4e28beeed157..000000000000 --- a/fs/jbd/Kconfig +++ /dev/null @@ -1,30 +0,0 @@ -config JBD - tristate - help - This is a generic journalling layer for block devices. It is - currently used by the ext3 file system, but it could also be - used to add journal support to other file systems or block - devices such as RAID or LVM. - - If you are using the ext3 file system, you need to say Y here. - If you are not using ext3 then you will probably want to say N. - - To compile this device as a module, choose M here: the module will be - called jbd. If you are compiling ext3 into the kernel, you - cannot compile this code as a module. - -config JBD_DEBUG - bool "JBD (ext3) debugging support" - depends on JBD && DEBUG_FS - help - If you are using the ext3 journaled file system (or potentially any - other file system/device using JBD), this option allows you to - enable debugging output while the system is running, in order to - help track down any problems you are having. By default the - debugging output will be turned off. - - If you select Y here, then you will be able to turn on debugging - with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a - number between 1 and 5, the higher the number, the more debugging - output is generated. To turn debugging off again, do - "echo 0 > /sys/kernel/debug/jbd/jbd-debug". diff --git a/fs/jbd/Makefile b/fs/jbd/Makefile deleted file mode 100644 index 54aca4868a36..000000000000 --- a/fs/jbd/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# -# Makefile for the linux journaling routines. 
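The evict_inodes() change above avoids livelocking on very long inode lists by dropping the per-sb list lock when a reschedule is pending, disposing of the batch collected so far, and restarting the walk. Below is a standalone user-space sketch of that "collect a batch, drop the lock, yield, dispose, restart" pattern; the item type, list, and batch size are invented for illustration and are not part of the patch.

/* Illustrative user-space sketch of the batching pattern used by
 * evict_inodes() above.  Compile with -pthread. */
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

struct item {
	struct item *next;
	int id;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *all_items;		/* protected by list_lock */

static void dispose_batch(struct item *batch)
{
	while (batch) {
		struct item *next = batch->next;

		printf("disposing item %d\n", batch->id);
		free(batch);
		batch = next;
	}
}

static void evict_all(void)
{
	enum { BATCH = 64 };
	struct item *batch;
	int n;

again:
	batch = NULL;
	n = 0;
	pthread_mutex_lock(&list_lock);
	while (all_items && n < BATCH) {
		struct item *it = all_items;

		all_items = it->next;
		it->next = batch;
		batch = it;
		n++;
	}
	pthread_mutex_unlock(&list_lock);

	if (!batch)
		return;

	/* Do the expensive work without holding the lock, give other
	 * threads a chance to run, then go back for the next batch. */
	sched_yield();
	dispose_batch(batch);
	goto again;
}

int main(void)
{
	for (int i = 0; i < 200; i++) {
		struct item *it = malloc(sizeof(*it));

		it->id = i;
		it->next = all_items;
		all_items = it;
	}
	evict_all();
	return 0;
}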
-# - -obj-$(CONFIG_JBD) += jbd.o - -jbd-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c deleted file mode 100644 index 08c03044abdd..000000000000 --- a/fs/jbd/checkpoint.c +++ /dev/null @@ -1,782 +0,0 @@ -/* - * linux/fs/jbd/checkpoint.c - * - * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 - * - * Copyright 1999 Red Hat Software --- All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * - * Checkpoint routines for the generic filesystem journaling code. - * Part of the ext2fs journaling system. - * - * Checkpointing is the process of ensuring that a section of the log is - * committed fully to disk, so that that portion of the log can be - * reused. - */ - -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/blkdev.h> -#include <trace/events/jbd.h> - -/* - * Unlink a buffer from a transaction checkpoint list. - * - * Called with j_list_lock held. - */ -static inline void __buffer_unlink_first(struct journal_head *jh) -{ - transaction_t *transaction = jh->b_cp_transaction; - - jh->b_cpnext->b_cpprev = jh->b_cpprev; - jh->b_cpprev->b_cpnext = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) { - transaction->t_checkpoint_list = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) - transaction->t_checkpoint_list = NULL; - } -} - -/* - * Unlink a buffer from a transaction checkpoint(io) list. - * - * Called with j_list_lock held. - */ -static inline void __buffer_unlink(struct journal_head *jh) -{ - transaction_t *transaction = jh->b_cp_transaction; - - __buffer_unlink_first(jh); - if (transaction->t_checkpoint_io_list == jh) { - transaction->t_checkpoint_io_list = jh->b_cpnext; - if (transaction->t_checkpoint_io_list == jh) - transaction->t_checkpoint_io_list = NULL; - } -} - -/* - * Move a buffer from the checkpoint list to the checkpoint io list - * - * Called with j_list_lock held - */ -static inline void __buffer_relink_io(struct journal_head *jh) -{ - transaction_t *transaction = jh->b_cp_transaction; - - __buffer_unlink_first(jh); - - if (!transaction->t_checkpoint_io_list) { - jh->b_cpnext = jh->b_cpprev = jh; - } else { - jh->b_cpnext = transaction->t_checkpoint_io_list; - jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; - jh->b_cpprev->b_cpnext = jh; - jh->b_cpnext->b_cpprev = jh; - } - transaction->t_checkpoint_io_list = jh; -} - -/* - * Try to release a checkpointed buffer from its transaction. - * Returns 1 if we released it and 2 if we also released the - * whole transaction. - * - * Requires j_list_lock - * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it - */ -static int __try_to_free_cp_buf(struct journal_head *jh) -{ - int ret = 0; - struct buffer_head *bh = jh2bh(jh); - - if (jh->b_jlist == BJ_None && !buffer_locked(bh) && - !buffer_dirty(bh) && !buffer_write_io_error(bh)) { - /* - * Get our reference so that bh cannot be freed before - * we unlock it - */ - get_bh(bh); - JBUFFER_TRACE(jh, "remove from checkpoint list"); - ret = __journal_remove_checkpoint(jh) + 1; - jbd_unlock_bh_state(bh); - BUFFER_TRACE(bh, "release"); - __brelse(bh); - } else { - jbd_unlock_bh_state(bh); - } - return ret; -} - -/* - * __log_wait_for_space: wait until there is space in the journal. 
- * - * Called under j-state_lock *only*. It will be unlocked if we have to wait - * for a checkpoint to free up some space in the log. - */ -void __log_wait_for_space(journal_t *journal) -{ - int nblocks, space_left; - assert_spin_locked(&journal->j_state_lock); - - nblocks = jbd_space_needed(journal); - while (__log_space_left(journal) < nblocks) { - if (journal->j_flags & JFS_ABORT) - return; - spin_unlock(&journal->j_state_lock); - mutex_lock(&journal->j_checkpoint_mutex); - - /* - * Test again, another process may have checkpointed while we - * were waiting for the checkpoint lock. If there are no - * transactions ready to be checkpointed, try to recover - * journal space by calling cleanup_journal_tail(), and if - * that doesn't work, by waiting for the currently committing - * transaction to complete. If there is absolutely no way - * to make progress, this is either a BUG or corrupted - * filesystem, so abort the journal and leave a stack - * trace for forensic evidence. - */ - spin_lock(&journal->j_state_lock); - spin_lock(&journal->j_list_lock); - nblocks = jbd_space_needed(journal); - space_left = __log_space_left(journal); - if (space_left < nblocks) { - int chkpt = journal->j_checkpoint_transactions != NULL; - tid_t tid = 0; - - if (journal->j_committing_transaction) - tid = journal->j_committing_transaction->t_tid; - spin_unlock(&journal->j_list_lock); - spin_unlock(&journal->j_state_lock); - if (chkpt) { - log_do_checkpoint(journal); - } else if (cleanup_journal_tail(journal) == 0) { - /* We were able to recover space; yay! */ - ; - } else if (tid) { - log_wait_commit(journal, tid); - } else { - printk(KERN_ERR "%s: needed %d blocks and " - "only had %d space available\n", - __func__, nblocks, space_left); - printk(KERN_ERR "%s: no way to get more " - "journal space\n", __func__); - WARN_ON(1); - journal_abort(journal, 0); - } - spin_lock(&journal->j_state_lock); - } else { - spin_unlock(&journal->j_list_lock); - } - mutex_unlock(&journal->j_checkpoint_mutex); - } -} - -/* - * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. - * The caller must restart a list walk. Wait for someone else to run - * jbd_unlock_bh_state(). - */ -static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) - __releases(journal->j_list_lock) -{ - get_bh(bh); - spin_unlock(&journal->j_list_lock); - jbd_lock_bh_state(bh); - jbd_unlock_bh_state(bh); - put_bh(bh); -} - -/* - * Clean up transaction's list of buffers submitted for io. - * We wait for any pending IO to complete and remove any clean - * buffers. Note that we take the buffers in the opposite ordering - * from the one in which they were submitted for IO. - * - * Return 0 on success, and return <0 if some buffers have failed - * to be written out. - * - * Called with j_list_lock held. - */ -static int __wait_cp_io(journal_t *journal, transaction_t *transaction) -{ - struct journal_head *jh; - struct buffer_head *bh; - tid_t this_tid; - int released = 0; - int ret = 0; - - this_tid = transaction->t_tid; -restart: - /* Did somebody clean up the transaction in the meanwhile? 
*/ - if (journal->j_checkpoint_transactions != transaction || - transaction->t_tid != this_tid) - return ret; - while (!released && transaction->t_checkpoint_io_list) { - jh = transaction->t_checkpoint_io_list; - bh = jh2bh(jh); - if (!jbd_trylock_bh_state(bh)) { - jbd_sync_bh(journal, bh); - spin_lock(&journal->j_list_lock); - goto restart; - } - get_bh(bh); - if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - wait_on_buffer(bh); - /* the journal_head may have gone by now */ - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - spin_lock(&journal->j_list_lock); - goto restart; - } - if (unlikely(buffer_write_io_error(bh))) - ret = -EIO; - - /* - * Now in whatever state the buffer currently is, we know that - * it has been written out and so we can drop it from the list - */ - released = __journal_remove_checkpoint(jh); - jbd_unlock_bh_state(bh); - __brelse(bh); - } - - return ret; -} - -#define NR_BATCH 64 - -static void -__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) -{ - int i; - struct blk_plug plug; - - blk_start_plug(&plug); - for (i = 0; i < *batch_count; i++) - write_dirty_buffer(bhs[i], WRITE_SYNC); - blk_finish_plug(&plug); - - for (i = 0; i < *batch_count; i++) { - struct buffer_head *bh = bhs[i]; - clear_buffer_jwrite(bh); - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - } - *batch_count = 0; -} - -/* - * Try to flush one buffer from the checkpoint list to disk. - * - * Return 1 if something happened which requires us to abort the current - * scan of the checkpoint list. Return <0 if the buffer has failed to - * be written out. - * - * Called with j_list_lock held and drops it if 1 is returned - * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it - */ -static int __process_buffer(journal_t *journal, struct journal_head *jh, - struct buffer_head **bhs, int *batch_count) -{ - struct buffer_head *bh = jh2bh(jh); - int ret = 0; - - if (buffer_locked(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - wait_on_buffer(bh); - /* the journal_head may have gone by now */ - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - ret = 1; - } else if (jh->b_transaction != NULL) { - transaction_t *t = jh->b_transaction; - tid_t tid = t->t_tid; - - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - log_start_commit(journal, tid); - log_wait_commit(journal, tid); - ret = 1; - } else if (!buffer_dirty(bh)) { - ret = 1; - if (unlikely(buffer_write_io_error(bh))) - ret = -EIO; - get_bh(bh); - J_ASSERT_JH(jh, !buffer_jbddirty(bh)); - BUFFER_TRACE(bh, "remove from checkpoint"); - __journal_remove_checkpoint(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - __brelse(bh); - } else { - /* - * Important: we are about to write the buffer, and - * possibly block, while still holding the journal lock. - * We cannot afford to let the transaction logic start - * messing around with this buffer before we write it to - * disk, as that would break recoverability. - */ - BUFFER_TRACE(bh, "queue"); - get_bh(bh); - J_ASSERT_BH(bh, !buffer_jwrite(bh)); - set_buffer_jwrite(bh); - bhs[*batch_count] = bh; - __buffer_relink_io(jh); - jbd_unlock_bh_state(bh); - (*batch_count)++; - if (*batch_count == NR_BATCH) { - spin_unlock(&journal->j_list_lock); - __flush_batch(journal, bhs, batch_count); - ret = 1; - } - } - return ret; -} - -/* - * Perform an actual checkpoint. We take the first transaction on the - * list of transactions to be checkpointed and send all its buffers - * to disk. 
We submit larger chunks of data at once. - * - * The journal should be locked before calling this function. - * Called with j_checkpoint_mutex held. - */ -int log_do_checkpoint(journal_t *journal) -{ - transaction_t *transaction; - tid_t this_tid; - int result; - - jbd_debug(1, "Start checkpoint\n"); - - /* - * First thing: if there are any transactions in the log which - * don't need checkpointing, just eliminate them from the - * journal straight away. - */ - result = cleanup_journal_tail(journal); - trace_jbd_checkpoint(journal, result); - jbd_debug(1, "cleanup_journal_tail returned %d\n", result); - if (result <= 0) - return result; - - /* - * OK, we need to start writing disk blocks. Take one transaction - * and write it. - */ - result = 0; - spin_lock(&journal->j_list_lock); - if (!journal->j_checkpoint_transactions) - goto out; - transaction = journal->j_checkpoint_transactions; - this_tid = transaction->t_tid; -restart: - /* - * If someone cleaned up this transaction while we slept, we're - * done (maybe it's a new transaction, but it fell at the same - * address). - */ - if (journal->j_checkpoint_transactions == transaction && - transaction->t_tid == this_tid) { - int batch_count = 0; - struct buffer_head *bhs[NR_BATCH]; - struct journal_head *jh; - int retry = 0, err; - - while (!retry && transaction->t_checkpoint_list) { - struct buffer_head *bh; - - jh = transaction->t_checkpoint_list; - bh = jh2bh(jh); - if (!jbd_trylock_bh_state(bh)) { - jbd_sync_bh(journal, bh); - retry = 1; - break; - } - retry = __process_buffer(journal, jh, bhs,&batch_count); - if (retry < 0 && !result) - result = retry; - if (!retry && (need_resched() || - spin_needbreak(&journal->j_list_lock))) { - spin_unlock(&journal->j_list_lock); - retry = 1; - break; - } - } - - if (batch_count) { - if (!retry) { - spin_unlock(&journal->j_list_lock); - retry = 1; - } - __flush_batch(journal, bhs, &batch_count); - } - - if (retry) { - spin_lock(&journal->j_list_lock); - goto restart; - } - /* - * Now we have cleaned up the first transaction's checkpoint - * list. Let's clean up the second one - */ - err = __wait_cp_io(journal, transaction); - if (!result) - result = err; - } -out: - spin_unlock(&journal->j_list_lock); - if (result < 0) - journal_abort(journal, result); - else - result = cleanup_journal_tail(journal); - - return (result < 0) ? result : 0; -} - -/* - * Check the list of checkpoint transactions for the journal to see if - * we have already got rid of any since the last update of the log tail - * in the journal superblock. If so, we can instantly roll the - * superblock forward to remove those transactions from the log. - * - * Return <0 on error, 0 on success, 1 if there was nothing to clean up. - * - * This is the only part of the journaling code which really needs to be - * aware of transaction aborts. Checkpointing involves writing to the - * main filesystem area rather than to the journal, so it can proceed - * even in abort state, but we must not update the super block if - * checkpointing may have failed. Otherwise, we would lose some metadata - * buffers which should be written-back to the filesystem. - */ - -int cleanup_journal_tail(journal_t *journal) -{ - transaction_t * transaction; - tid_t first_tid; - unsigned int blocknr, freed; - - if (is_journal_aborted(journal)) - return 1; - - /* - * OK, work out the oldest transaction remaining in the log, and - * the log block it starts at. 
- * - * If the log is now empty, we need to work out which is the - * next transaction ID we will write, and where it will - * start. - */ - spin_lock(&journal->j_state_lock); - spin_lock(&journal->j_list_lock); - transaction = journal->j_checkpoint_transactions; - if (transaction) { - first_tid = transaction->t_tid; - blocknr = transaction->t_log_start; - } else if ((transaction = journal->j_committing_transaction) != NULL) { - first_tid = transaction->t_tid; - blocknr = transaction->t_log_start; - } else if ((transaction = journal->j_running_transaction) != NULL) { - first_tid = transaction->t_tid; - blocknr = journal->j_head; - } else { - first_tid = journal->j_transaction_sequence; - blocknr = journal->j_head; - } - spin_unlock(&journal->j_list_lock); - J_ASSERT(blocknr != 0); - - /* If the oldest pinned transaction is at the tail of the log - already then there's not much we can do right now. */ - if (journal->j_tail_sequence == first_tid) { - spin_unlock(&journal->j_state_lock); - return 1; - } - spin_unlock(&journal->j_state_lock); - - /* - * We need to make sure that any blocks that were recently written out - * --- perhaps by log_do_checkpoint() --- are flushed out before we - * drop the transactions from the journal. Similarly we need to be sure - * superblock makes it to disk before next transaction starts reusing - * freed space (otherwise we could replay some blocks of the new - * transaction thinking they belong to the old one). So we use - * WRITE_FLUSH_FUA. It's unlikely this will be necessary, especially - * with an appropriately sized journal, but we need this to guarantee - * correctness. Fortunately cleanup_journal_tail() doesn't get called - * all that often. - */ - journal_update_sb_log_tail(journal, first_tid, blocknr, - WRITE_FLUSH_FUA); - - spin_lock(&journal->j_state_lock); - /* OK, update the superblock to recover the freed space. - * Physical blocks come first: have we wrapped beyond the end of - * the log? */ - freed = blocknr - journal->j_tail; - if (blocknr < journal->j_tail) - freed = freed + journal->j_last - journal->j_first; - - trace_jbd_cleanup_journal_tail(journal, first_tid, blocknr, freed); - jbd_debug(1, - "Cleaning journal tail from %d to %d (offset %u), " - "freeing %u\n", - journal->j_tail_sequence, first_tid, blocknr, freed); - - journal->j_free += freed; - journal->j_tail_sequence = first_tid; - journal->j_tail = blocknr; - spin_unlock(&journal->j_state_lock); - return 0; -} - - -/* Checkpoint list management */ - -/* - * journal_clean_one_cp_list - * - * Find all the written-back checkpoint buffers in the given list and release - * them. - * - * Called with j_list_lock held. - * Returns number of buffers reaped (for debug) - */ - -static int journal_clean_one_cp_list(struct journal_head *jh, int *released) -{ - struct journal_head *last_jh; - struct journal_head *next_jh = jh; - int ret, freed = 0; - - *released = 0; - if (!jh) - return 0; - - last_jh = jh->b_cpprev; - do { - jh = next_jh; - next_jh = jh->b_cpnext; - /* Use trylock because of the ranking */ - if (jbd_trylock_bh_state(jh2bh(jh))) { - ret = __try_to_free_cp_buf(jh); - if (ret) { - freed++; - if (ret == 2) { - *released = 1; - return freed; - } - } - } - /* - * This function only frees up some memory - * if possible so we dont have an obligation - * to finish processing. 
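The cleanup_journal_tail() code above recovers log space with a wrap-around computation: the number of freed blocks is the new tail minus the old tail, plus the journal length (j_last - j_first) when the tail has wrapped past the end of the log area. A tiny standalone check of that arithmetic is sketched below; the journal geometry used is made up for the example.

/* Illustrative sketch of the wrap-around "freed blocks" arithmetic in
 * cleanup_journal_tail() above.  The journal geometry is invented. */
#include <assert.h>
#include <stdio.h>

/* Blocks freed when the tail moves from old_tail to new_tail in a log
 * occupying blocks [first, last). */
static unsigned int blocks_freed(unsigned int first, unsigned int last,
				 unsigned int old_tail, unsigned int new_tail)
{
	unsigned int freed = new_tail - old_tail;

	if (new_tail < old_tail)	/* the tail wrapped past the end */
		freed += last - first;
	return freed;
}

int main(void)
{
	/* A log of 1024 blocks starting at block 8. */
	unsigned int first = 8, last = 1032;

	/* Simple forward move: 100 -> 300 frees 200 blocks. */
	assert(blocks_freed(first, last, 100, 300) == 200);

	/* Wrapped move: 1000 -> 50 frees (50 - 1000) + 1024 = 74 blocks. */
	assert(blocks_freed(first, last, 1000, 50) == 74);

	printf("freed (wrapped case): %u\n",
	       blocks_freed(first, last, 1000, 50));
	return 0;
}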
Bail out if preemption - * requested: - */ - if (need_resched()) - return freed; - } while (jh != last_jh); - - return freed; -} - -/* - * journal_clean_checkpoint_list - * - * Find all the written-back checkpoint buffers in the journal and release them. - * - * Called with the journal locked. - * Called with j_list_lock held. - * Returns number of buffers reaped (for debug) - */ - -int __journal_clean_checkpoint_list(journal_t *journal) -{ - transaction_t *transaction, *last_transaction, *next_transaction; - int ret = 0; - int released; - - transaction = journal->j_checkpoint_transactions; - if (!transaction) - goto out; - - last_transaction = transaction->t_cpprev; - next_transaction = transaction; - do { - transaction = next_transaction; - next_transaction = transaction->t_cpnext; - ret += journal_clean_one_cp_list(transaction-> - t_checkpoint_list, &released); - /* - * This function only frees up some memory if possible so we - * dont have an obligation to finish processing. Bail out if - * preemption requested: - */ - if (need_resched()) - goto out; - if (released) - continue; - /* - * It is essential that we are as careful as in the case of - * t_checkpoint_list with removing the buffer from the list as - * we can possibly see not yet submitted buffers on io_list - */ - ret += journal_clean_one_cp_list(transaction-> - t_checkpoint_io_list, &released); - if (need_resched()) - goto out; - } while (transaction != last_transaction); -out: - return ret; -} - -/* - * journal_remove_checkpoint: called after a buffer has been committed - * to disk (either by being write-back flushed to disk, or being - * committed to the log). - * - * We cannot safely clean a transaction out of the log until all of the - * buffer updates committed in that transaction have safely been stored - * elsewhere on disk. To achieve this, all of the buffers in a - * transaction need to be maintained on the transaction's checkpoint - * lists until they have been rewritten, at which point this function is - * called to remove the buffer from the existing transaction's - * checkpoint lists. - * - * The function returns 1 if it frees the transaction, 0 otherwise. - * The function can free jh and bh. - * - * This function is called with j_list_lock held. - * This function is called with jbd_lock_bh_state(jh2bh(jh)) - */ - -int __journal_remove_checkpoint(struct journal_head *jh) -{ - transaction_t *transaction; - journal_t *journal; - int ret = 0; - - JBUFFER_TRACE(jh, "entry"); - - if ((transaction = jh->b_cp_transaction) == NULL) { - JBUFFER_TRACE(jh, "not on transaction"); - goto out; - } - journal = transaction->t_journal; - - JBUFFER_TRACE(jh, "removing from transaction"); - __buffer_unlink(jh); - jh->b_cp_transaction = NULL; - journal_put_journal_head(jh); - - if (transaction->t_checkpoint_list != NULL || - transaction->t_checkpoint_io_list != NULL) - goto out; - - /* - * There is one special case to worry about: if we have just pulled the - * buffer off a running or committing transaction's checkpoing list, - * then even if the checkpoint list is empty, the transaction obviously - * cannot be dropped! - * - * The locking here around t_state is a bit sleazy. - * See the comment at the end of journal_commit_transaction(). 
- */ - if (transaction->t_state != T_FINISHED) - goto out; - - /* OK, that was the last buffer for the transaction: we can now - safely remove this transaction from the log */ - - __journal_drop_transaction(journal, transaction); - - /* Just in case anybody was waiting for more transactions to be - checkpointed... */ - wake_up(&journal->j_wait_logspace); - ret = 1; -out: - return ret; -} - -/* - * journal_insert_checkpoint: put a committed buffer onto a checkpoint - * list so that we know when it is safe to clean the transaction out of - * the log. - * - * Called with the journal locked. - * Called with j_list_lock held. - */ -void __journal_insert_checkpoint(struct journal_head *jh, - transaction_t *transaction) -{ - JBUFFER_TRACE(jh, "entry"); - J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); - J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); - - /* Get reference for checkpointing transaction */ - journal_grab_journal_head(jh2bh(jh)); - jh->b_cp_transaction = transaction; - - if (!transaction->t_checkpoint_list) { - jh->b_cpnext = jh->b_cpprev = jh; - } else { - jh->b_cpnext = transaction->t_checkpoint_list; - jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; - jh->b_cpprev->b_cpnext = jh; - jh->b_cpnext->b_cpprev = jh; - } - transaction->t_checkpoint_list = jh; -} - -/* - * We've finished with this transaction structure: adios... - * - * The transaction must have no links except for the checkpoint by this - * point. - * - * Called with the journal locked. - * Called with j_list_lock held. - */ - -void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) -{ - assert_spin_locked(&journal->j_list_lock); - if (transaction->t_cpnext) { - transaction->t_cpnext->t_cpprev = transaction->t_cpprev; - transaction->t_cpprev->t_cpnext = transaction->t_cpnext; - if (journal->j_checkpoint_transactions == transaction) - journal->j_checkpoint_transactions = - transaction->t_cpnext; - if (journal->j_checkpoint_transactions == transaction) - journal->j_checkpoint_transactions = NULL; - } - - J_ASSERT(transaction->t_state == T_FINISHED); - J_ASSERT(transaction->t_buffers == NULL); - J_ASSERT(transaction->t_sync_datalist == NULL); - J_ASSERT(transaction->t_forget == NULL); - J_ASSERT(transaction->t_iobuf_list == NULL); - J_ASSERT(transaction->t_shadow_list == NULL); - J_ASSERT(transaction->t_log_list == NULL); - J_ASSERT(transaction->t_checkpoint_list == NULL); - J_ASSERT(transaction->t_checkpoint_io_list == NULL); - J_ASSERT(transaction->t_updates == 0); - J_ASSERT(journal->j_committing_transaction != transaction); - J_ASSERT(journal->j_running_transaction != transaction); - - trace_jbd_drop_transaction(journal, transaction); - jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); - kfree(transaction); -} diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c deleted file mode 100644 index bb217dcb41af..000000000000 --- a/fs/jbd/commit.c +++ /dev/null @@ -1,1021 +0,0 @@ -/* - * linux/fs/jbd/commit.c - * - * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 - * - * Copyright 1998 Red Hat corp --- All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * - * Journal commit routines for the generic filesystem journaling code; - * part of the ext2fs journaling system. 
- */ - -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/errno.h> -#include <linux/mm.h> -#include <linux/pagemap.h> -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <trace/events/jbd.h> - -/* - * Default IO end handler for temporary BJ_IO buffer_heads. - */ -static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) -{ - BUFFER_TRACE(bh, ""); - if (uptodate) - set_buffer_uptodate(bh); - else - clear_buffer_uptodate(bh); - unlock_buffer(bh); -} - -/* - * When an ext3-ordered file is truncated, it is possible that many pages are - * not successfully freed, because they are attached to a committing transaction. - * After the transaction commits, these pages are left on the LRU, with no - * ->mapping, and with attached buffers. These pages are trivially reclaimable - * by the VM, but their apparent absence upsets the VM accounting, and it makes - * the numbers in /proc/meminfo look odd. - * - * So here, we have a buffer which has just come off the forget list. Look to - * see if we can strip all buffers from the backing page. - * - * Called under journal->j_list_lock. The caller provided us with a ref - * against the buffer, and we drop that here. - */ -static void release_buffer_page(struct buffer_head *bh) -{ - struct page *page; - - if (buffer_dirty(bh)) - goto nope; - if (atomic_read(&bh->b_count) != 1) - goto nope; - page = bh->b_page; - if (!page) - goto nope; - if (page->mapping) - goto nope; - - /* OK, it's a truncated page */ - if (!trylock_page(page)) - goto nope; - - page_cache_get(page); - __brelse(bh); - try_to_free_buffers(page); - unlock_page(page); - page_cache_release(page); - return; - -nope: - __brelse(bh); -} - -/* - * Decrement reference counter for data buffer. If it has been marked - * 'BH_Freed', release it and the page to which it belongs if possible. - */ -static void release_data_buffer(struct buffer_head *bh) -{ - if (buffer_freed(bh)) { - WARN_ON_ONCE(buffer_dirty(bh)); - clear_buffer_freed(bh); - clear_buffer_mapped(bh); - clear_buffer_new(bh); - clear_buffer_req(bh); - bh->b_bdev = NULL; - release_buffer_page(bh); - } else - put_bh(bh); -} - -/* - * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is - * held. For ranking reasons we must trylock. If we lose, schedule away and - * return 0. j_list_lock is dropped in this case. - */ -static int inverted_lock(journal_t *journal, struct buffer_head *bh) -{ - if (!jbd_trylock_bh_state(bh)) { - spin_unlock(&journal->j_list_lock); - schedule(); - return 0; - } - return 1; -} - -/* Done it all: now write the commit record. We should have - * cleaned up our previous buffers by now, so if we are in abort - * mode we can now just skip the rest of the journal write - * entirely. 
- * - * Returns 1 if the journal needs to be aborted or 0 on success - */ -static int journal_write_commit_record(journal_t *journal, - transaction_t *commit_transaction) -{ - struct journal_head *descriptor; - struct buffer_head *bh; - journal_header_t *header; - int ret; - - if (is_journal_aborted(journal)) - return 0; - - descriptor = journal_get_descriptor_buffer(journal); - if (!descriptor) - return 1; - - bh = jh2bh(descriptor); - - header = (journal_header_t *)(bh->b_data); - header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); - header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK); - header->h_sequence = cpu_to_be32(commit_transaction->t_tid); - - JBUFFER_TRACE(descriptor, "write commit block"); - set_buffer_dirty(bh); - - if (journal->j_flags & JFS_BARRIER) - ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA); - else - ret = sync_dirty_buffer(bh); - - put_bh(bh); /* One for getblk() */ - journal_put_journal_head(descriptor); - - return (ret == -EIO); -} - -static void journal_do_submit_data(struct buffer_head **wbuf, int bufs, - int write_op) -{ - int i; - - for (i = 0; i < bufs; i++) { - wbuf[i]->b_end_io = end_buffer_write_sync; - /* - * Here we write back pagecache data that may be mmaped. Since - * we cannot afford to clean the page and set PageWriteback - * here due to lock ordering (page lock ranks above transaction - * start), the data can change while IO is in flight. Tell the - * block layer it should bounce the bio pages if stable data - * during write is required. - * - * We use up our safety reference in submit_bh(). - */ - _submit_bh(write_op, wbuf[i], 1 << BIO_SNAP_STABLE); - } -} - -/* - * Submit all the data buffers to disk - */ -static int journal_submit_data_buffers(journal_t *journal, - transaction_t *commit_transaction, - int write_op) -{ - struct journal_head *jh; - struct buffer_head *bh; - int locked; - int bufs = 0; - struct buffer_head **wbuf = journal->j_wbuf; - int err = 0; - - /* - * Whenever we unlock the journal and sleep, things can get added - * onto ->t_sync_datalist, so we have to keep looping back to - * write_out_data until we *know* that the list is empty. - * - * Cleanup any flushed data buffers from the data list. Even in - * abort mode, we want to flush this out as soon as possible. - */ -write_out_data: - cond_resched(); - spin_lock(&journal->j_list_lock); - - while (commit_transaction->t_sync_datalist) { - jh = commit_transaction->t_sync_datalist; - bh = jh2bh(jh); - locked = 0; - - /* Get reference just to make sure buffer does not disappear - * when we are forced to drop various locks */ - get_bh(bh); - /* If the buffer is dirty, we need to submit IO and hence - * we need the buffer lock. We try to lock the buffer without - * blocking. If we fail, we need to drop j_list_lock and do - * blocking lock_buffer(). - */ - if (buffer_dirty(bh)) { - if (!trylock_buffer(bh)) { - BUFFER_TRACE(bh, "needs blocking lock"); - spin_unlock(&journal->j_list_lock); - trace_jbd_do_submit_data(journal, - commit_transaction); - /* Write out all data to prevent deadlocks */ - journal_do_submit_data(wbuf, bufs, write_op); - bufs = 0; - lock_buffer(bh); - spin_lock(&journal->j_list_lock); - } - locked = 1; - } - /* We have to get bh_state lock. Again out of order, sigh. */ - if (!inverted_lock(journal, bh)) { - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - } - /* Someone already cleaned up the buffer? 
*/ - if (!buffer_jbd(bh) || bh2jh(bh) != jh - || jh->b_transaction != commit_transaction - || jh->b_jlist != BJ_SyncData) { - jbd_unlock_bh_state(bh); - if (locked) - unlock_buffer(bh); - BUFFER_TRACE(bh, "already cleaned up"); - release_data_buffer(bh); - continue; - } - if (locked && test_clear_buffer_dirty(bh)) { - BUFFER_TRACE(bh, "needs writeout, adding to array"); - wbuf[bufs++] = bh; - __journal_file_buffer(jh, commit_transaction, - BJ_Locked); - jbd_unlock_bh_state(bh); - if (bufs == journal->j_wbufsize) { - spin_unlock(&journal->j_list_lock); - trace_jbd_do_submit_data(journal, - commit_transaction); - journal_do_submit_data(wbuf, bufs, write_op); - bufs = 0; - goto write_out_data; - } - } else if (!locked && buffer_locked(bh)) { - __journal_file_buffer(jh, commit_transaction, - BJ_Locked); - jbd_unlock_bh_state(bh); - put_bh(bh); - } else { - BUFFER_TRACE(bh, "writeout complete: unfile"); - if (unlikely(!buffer_uptodate(bh))) - err = -EIO; - __journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - if (locked) - unlock_buffer(bh); - release_data_buffer(bh); - } - - if (need_resched() || spin_needbreak(&journal->j_list_lock)) { - spin_unlock(&journal->j_list_lock); - goto write_out_data; - } - } - spin_unlock(&journal->j_list_lock); - trace_jbd_do_submit_data(journal, commit_transaction); - journal_do_submit_data(wbuf, bufs, write_op); - - return err; -} - -/* - * journal_commit_transaction - * - * The primary function for committing a transaction to the log. This - * function is called by the journal thread to begin a complete commit. - */ -void journal_commit_transaction(journal_t *journal) -{ - transaction_t *commit_transaction; - struct journal_head *jh, *new_jh, *descriptor; - struct buffer_head **wbuf = journal->j_wbuf; - int bufs; - int flags; - int err; - unsigned int blocknr; - ktime_t start_time; - u64 commit_time; - char *tagp = NULL; - journal_header_t *header; - journal_block_tag_t *tag = NULL; - int space_left = 0; - int first_tag = 0; - int tag_flag; - int i; - struct blk_plug plug; - int write_op = WRITE; - - /* - * First job: lock down the current transaction and wait for - * all outstanding updates to complete. - */ - - /* Do we need to erase the effects of a prior journal_flush? */ - if (journal->j_flags & JFS_FLUSHED) { - jbd_debug(3, "super block updated\n"); - mutex_lock(&journal->j_checkpoint_mutex); - /* - * We hold j_checkpoint_mutex so tail cannot change under us. - * We don't need any special data guarantees for writing sb - * since journal is empty and it is ok for write to be - * flushed only with transaction commit. 
- */ - journal_update_sb_log_tail(journal, journal->j_tail_sequence, - journal->j_tail, WRITE_SYNC); - mutex_unlock(&journal->j_checkpoint_mutex); - } else { - jbd_debug(3, "superblock not updated\n"); - } - - J_ASSERT(journal->j_running_transaction != NULL); - J_ASSERT(journal->j_committing_transaction == NULL); - - commit_transaction = journal->j_running_transaction; - - trace_jbd_start_commit(journal, commit_transaction); - jbd_debug(1, "JBD: starting commit of transaction %d\n", - commit_transaction->t_tid); - - spin_lock(&journal->j_state_lock); - J_ASSERT(commit_transaction->t_state == T_RUNNING); - commit_transaction->t_state = T_LOCKED; - - trace_jbd_commit_locking(journal, commit_transaction); - spin_lock(&commit_transaction->t_handle_lock); - while (commit_transaction->t_updates) { - DEFINE_WAIT(wait); - - prepare_to_wait(&journal->j_wait_updates, &wait, - TASK_UNINTERRUPTIBLE); - if (commit_transaction->t_updates) { - spin_unlock(&commit_transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); - schedule(); - spin_lock(&journal->j_state_lock); - spin_lock(&commit_transaction->t_handle_lock); - } - finish_wait(&journal->j_wait_updates, &wait); - } - spin_unlock(&commit_transaction->t_handle_lock); - - J_ASSERT (commit_transaction->t_outstanding_credits <= - journal->j_max_transaction_buffers); - - /* - * First thing we are allowed to do is to discard any remaining - * BJ_Reserved buffers. Note, it is _not_ permissible to assume - * that there are no such buffers: if a large filesystem - * operation like a truncate needs to split itself over multiple - * transactions, then it may try to do a journal_restart() while - * there are still BJ_Reserved buffers outstanding. These must - * be released cleanly from the current transaction. - * - * In this case, the filesystem must still reserve write access - * again before modifying the buffer in the new transaction, but - * we do not require it to remember exactly which old buffers it - * has reserved. This is consistent with the existing behaviour - * that multiple journal_get_write_access() calls to the same - * buffer are perfectly permissible. - */ - while (commit_transaction->t_reserved_list) { - jh = commit_transaction->t_reserved_list; - JBUFFER_TRACE(jh, "reserved, unused: refile"); - /* - * A journal_get_undo_access()+journal_release_buffer() may - * leave undo-committed data. - */ - if (jh->b_committed_data) { - struct buffer_head *bh = jh2bh(jh); - - jbd_lock_bh_state(bh); - jbd_free(jh->b_committed_data, bh->b_size); - jh->b_committed_data = NULL; - jbd_unlock_bh_state(bh); - } - journal_refile_buffer(journal, jh); - } - - /* - * Now try to drop any written-back buffers from the journal's - * checkpoint lists. We do this *before* commit because it potentially - * frees some memory - */ - spin_lock(&journal->j_list_lock); - __journal_clean_checkpoint_list(journal); - spin_unlock(&journal->j_list_lock); - - jbd_debug (3, "JBD: commit phase 1\n"); - - /* - * Clear revoked flag to reflect there is no revoked buffers - * in the next transaction which is going to be started. - */ - journal_clear_buffer_revoked_flags(journal); - - /* - * Switch to a new revoke table. 
- */ - journal_switch_revoke_table(journal); - - trace_jbd_commit_flushing(journal, commit_transaction); - commit_transaction->t_state = T_FLUSH; - journal->j_committing_transaction = commit_transaction; - journal->j_running_transaction = NULL; - start_time = ktime_get(); - commit_transaction->t_log_start = journal->j_head; - wake_up(&journal->j_wait_transaction_locked); - spin_unlock(&journal->j_state_lock); - - jbd_debug (3, "JBD: commit phase 2\n"); - - if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid)) - write_op = WRITE_SYNC; - - /* - * Now start flushing things to disk, in the order they appear - * on the transaction lists. Data blocks go first. - */ - blk_start_plug(&plug); - err = journal_submit_data_buffers(journal, commit_transaction, - write_op); - blk_finish_plug(&plug); - - /* - * Wait for all previously submitted IO to complete. - */ - spin_lock(&journal->j_list_lock); - while (commit_transaction->t_locked_list) { - struct buffer_head *bh; - - jh = commit_transaction->t_locked_list->b_tprev; - bh = jh2bh(jh); - get_bh(bh); - if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); - wait_on_buffer(bh); - spin_lock(&journal->j_list_lock); - } - if (unlikely(!buffer_uptodate(bh))) { - if (!trylock_page(bh->b_page)) { - spin_unlock(&journal->j_list_lock); - lock_page(bh->b_page); - spin_lock(&journal->j_list_lock); - } - if (bh->b_page->mapping) - set_bit(AS_EIO, &bh->b_page->mapping->flags); - - unlock_page(bh->b_page); - SetPageError(bh->b_page); - err = -EIO; - } - if (!inverted_lock(journal, bh)) { - put_bh(bh); - spin_lock(&journal->j_list_lock); - continue; - } - if (buffer_jbd(bh) && bh2jh(bh) == jh && - jh->b_transaction == commit_transaction && - jh->b_jlist == BJ_Locked) - __journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - release_data_buffer(bh); - cond_resched_lock(&journal->j_list_lock); - } - spin_unlock(&journal->j_list_lock); - - if (err) { - char b[BDEVNAME_SIZE]; - - printk(KERN_WARNING - "JBD: Detected IO errors while flushing file data " - "on %s\n", bdevname(journal->j_fs_dev, b)); - if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR) - journal_abort(journal, err); - err = 0; - } - - blk_start_plug(&plug); - - journal_write_revoke_records(journal, commit_transaction, write_op); - - /* - * If we found any dirty or locked buffers, then we should have - * looped back up to the write_out_data label. If there weren't - * any then journal_clean_data_list should have wiped the list - * clean by now, so check that it is in fact empty. - */ - J_ASSERT (commit_transaction->t_sync_datalist == NULL); - - jbd_debug (3, "JBD: commit phase 3\n"); - - /* - * Way to go: we have now written out all of the data for a - * transaction! Now comes the tricky part: we need to write out - * metadata. Loop over the transaction's entire buffer list: - */ - spin_lock(&journal->j_state_lock); - commit_transaction->t_state = T_COMMIT; - spin_unlock(&journal->j_state_lock); - - trace_jbd_commit_logging(journal, commit_transaction); - J_ASSERT(commit_transaction->t_nr_buffers <= - commit_transaction->t_outstanding_credits); - - descriptor = NULL; - bufs = 0; - while (commit_transaction->t_buffers) { - - /* Find the next buffer to be journaled... */ - - jh = commit_transaction->t_buffers; - - /* If we're in abort mode, we just un-journal the buffer and - release it. 
*/ - - if (is_journal_aborted(journal)) { - clear_buffer_jbddirty(jh2bh(jh)); - JBUFFER_TRACE(jh, "journal is aborting: refile"); - journal_refile_buffer(journal, jh); - /* If that was the last one, we need to clean up - * any descriptor buffers which may have been - * already allocated, even if we are now - * aborting. */ - if (!commit_transaction->t_buffers) - goto start_journal_io; - continue; - } - - /* Make sure we have a descriptor block in which to - record the metadata buffer. */ - - if (!descriptor) { - struct buffer_head *bh; - - J_ASSERT (bufs == 0); - - jbd_debug(4, "JBD: get descriptor\n"); - - descriptor = journal_get_descriptor_buffer(journal); - if (!descriptor) { - journal_abort(journal, -EIO); - continue; - } - - bh = jh2bh(descriptor); - jbd_debug(4, "JBD: got buffer %llu (%p)\n", - (unsigned long long)bh->b_blocknr, bh->b_data); - header = (journal_header_t *)&bh->b_data[0]; - header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); - header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK); - header->h_sequence = cpu_to_be32(commit_transaction->t_tid); - - tagp = &bh->b_data[sizeof(journal_header_t)]; - space_left = bh->b_size - sizeof(journal_header_t); - first_tag = 1; - set_buffer_jwrite(bh); - set_buffer_dirty(bh); - wbuf[bufs++] = bh; - - /* Record it so that we can wait for IO - completion later */ - BUFFER_TRACE(bh, "ph3: file as descriptor"); - journal_file_buffer(descriptor, commit_transaction, - BJ_LogCtl); - } - - /* Where is the buffer to be written? */ - - err = journal_next_log_block(journal, &blocknr); - /* If the block mapping failed, just abandon the buffer - and repeat this loop: we'll fall into the - refile-on-abort condition above. */ - if (err) { - journal_abort(journal, err); - continue; - } - - /* - * start_this_handle() uses t_outstanding_credits to determine - * the free space in the log, but this counter is changed - * by journal_next_log_block() also. - */ - commit_transaction->t_outstanding_credits--; - - /* Bump b_count to prevent truncate from stumbling over - the shadowed buffer! @@@ This can go if we ever get - rid of the BJ_IO/BJ_Shadow pairing of buffers. */ - get_bh(jh2bh(jh)); - - /* Make a temporary IO buffer with which to write it out - (this will requeue both the metadata buffer and the - temporary IO buffer). new_bh goes on BJ_IO*/ - - set_buffer_jwrite(jh2bh(jh)); - /* - * akpm: journal_write_metadata_buffer() sets - * new_bh->b_transaction to commit_transaction. - * We need to clean this up before we release new_bh - * (which is of type BJ_IO) - */ - JBUFFER_TRACE(jh, "ph3: write metadata"); - flags = journal_write_metadata_buffer(commit_transaction, - jh, &new_jh, blocknr); - set_buffer_jwrite(jh2bh(new_jh)); - wbuf[bufs++] = jh2bh(new_jh); - - /* Record the new block's tag in the current descriptor - buffer */ - - tag_flag = 0; - if (flags & 1) - tag_flag |= JFS_FLAG_ESCAPE; - if (!first_tag) - tag_flag |= JFS_FLAG_SAME_UUID; - - tag = (journal_block_tag_t *) tagp; - tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr); - tag->t_flags = cpu_to_be32(tag_flag); - tagp += sizeof(journal_block_tag_t); - space_left -= sizeof(journal_block_tag_t); - - if (first_tag) { - memcpy (tagp, journal->j_uuid, 16); - tagp += 16; - space_left -= 16; - first_tag = 0; - } - - /* If there's no more to do, or if the descriptor is full, - let the IO rip! 
*/ - - if (bufs == journal->j_wbufsize || - commit_transaction->t_buffers == NULL || - space_left < sizeof(journal_block_tag_t) + 16) { - - jbd_debug(4, "JBD: Submit %d IOs\n", bufs); - - /* Write an end-of-descriptor marker before - submitting the IOs. "tag" still points to - the last tag we set up. */ - - tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG); - -start_journal_io: - for (i = 0; i < bufs; i++) { - struct buffer_head *bh = wbuf[i]; - lock_buffer(bh); - clear_buffer_dirty(bh); - set_buffer_uptodate(bh); - bh->b_end_io = journal_end_buffer_io_sync; - /* - * In data=journal mode, here we can end up - * writing pagecache data that might be - * mmapped. Since we can't afford to clean the - * page and set PageWriteback (see the comment - * near the other use of _submit_bh()), the - * data can change while the write is in - * flight. Tell the block layer to bounce the - * bio pages if stable pages are required. - */ - _submit_bh(write_op, bh, 1 << BIO_SNAP_STABLE); - } - cond_resched(); - - /* Force a new descriptor to be generated next - time round the loop. */ - descriptor = NULL; - bufs = 0; - } - } - - blk_finish_plug(&plug); - - /* Lo and behold: we have just managed to send a transaction to - the log. Before we can commit it, wait for the IO so far to - complete. Control buffers being written are on the - transaction's t_log_list queue, and metadata buffers are on - the t_iobuf_list queue. - - Wait for the buffers in reverse order. That way we are - less likely to be woken up until all IOs have completed, and - so we incur less scheduling load. - */ - - jbd_debug(3, "JBD: commit phase 4\n"); - - /* - * akpm: these are BJ_IO, and j_list_lock is not needed. - * See __journal_try_to_free_buffer. - */ -wait_for_iobuf: - while (commit_transaction->t_iobuf_list != NULL) { - struct buffer_head *bh; - - jh = commit_transaction->t_iobuf_list->b_tprev; - bh = jh2bh(jh); - if (buffer_locked(bh)) { - wait_on_buffer(bh); - goto wait_for_iobuf; - } - if (cond_resched()) - goto wait_for_iobuf; - - if (unlikely(!buffer_uptodate(bh))) - err = -EIO; - - clear_buffer_jwrite(bh); - - JBUFFER_TRACE(jh, "ph4: unfile after journal write"); - journal_unfile_buffer(journal, jh); - - /* - * ->t_iobuf_list should contain only dummy buffer_heads - * which were created by journal_write_metadata_buffer(). - */ - BUFFER_TRACE(bh, "dumping temporary bh"); - journal_put_journal_head(jh); - __brelse(bh); - J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); - free_buffer_head(bh); - - /* We also have to unlock and free the corresponding - shadowed buffer */ - jh = commit_transaction->t_shadow_list->b_tprev; - bh = jh2bh(jh); - clear_buffer_jwrite(bh); - J_ASSERT_BH(bh, buffer_jbddirty(bh)); - - /* The metadata is now released for reuse, but we need - to remember it against this transaction so that when - we finally commit, we can do any checkpointing - required. */ - JBUFFER_TRACE(jh, "file as BJ_Forget"); - journal_file_buffer(jh, commit_transaction, BJ_Forget); - /* - * Wake up any transactions which were waiting for this - * IO to complete. The barrier must be here so that changes - * by journal_file_buffer() take effect before wake_up_bit() - * does the waitqueue check. 
- */ - smp_mb(); - wake_up_bit(&bh->b_state, BH_Unshadow); - JBUFFER_TRACE(jh, "brelse shadowed buffer"); - __brelse(bh); - } - - J_ASSERT (commit_transaction->t_shadow_list == NULL); - - jbd_debug(3, "JBD: commit phase 5\n"); - - /* Here we wait for the revoke record and descriptor record buffers */ - wait_for_ctlbuf: - while (commit_transaction->t_log_list != NULL) { - struct buffer_head *bh; - - jh = commit_transaction->t_log_list->b_tprev; - bh = jh2bh(jh); - if (buffer_locked(bh)) { - wait_on_buffer(bh); - goto wait_for_ctlbuf; - } - if (cond_resched()) - goto wait_for_ctlbuf; - - if (unlikely(!buffer_uptodate(bh))) - err = -EIO; - - BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); - clear_buffer_jwrite(bh); - journal_unfile_buffer(journal, jh); - journal_put_journal_head(jh); - __brelse(bh); /* One for getblk */ - /* AKPM: bforget here */ - } - - if (err) - journal_abort(journal, err); - - jbd_debug(3, "JBD: commit phase 6\n"); - - /* All metadata is written, now write commit record and do cleanup */ - spin_lock(&journal->j_state_lock); - J_ASSERT(commit_transaction->t_state == T_COMMIT); - commit_transaction->t_state = T_COMMIT_RECORD; - spin_unlock(&journal->j_state_lock); - - if (journal_write_commit_record(journal, commit_transaction)) - err = -EIO; - - if (err) - journal_abort(journal, err); - - /* End of a transaction! Finally, we can do checkpoint - processing: any buffers committed as a result of this - transaction can be removed from any checkpoint list it was on - before. */ - - jbd_debug(3, "JBD: commit phase 7\n"); - - J_ASSERT(commit_transaction->t_sync_datalist == NULL); - J_ASSERT(commit_transaction->t_buffers == NULL); - J_ASSERT(commit_transaction->t_checkpoint_list == NULL); - J_ASSERT(commit_transaction->t_iobuf_list == NULL); - J_ASSERT(commit_transaction->t_shadow_list == NULL); - J_ASSERT(commit_transaction->t_log_list == NULL); - -restart_loop: - /* - * As there are other places (journal_unmap_buffer()) adding buffers - * to this list we have to be careful and hold the j_list_lock. - */ - spin_lock(&journal->j_list_lock); - while (commit_transaction->t_forget) { - transaction_t *cp_transaction; - struct buffer_head *bh; - int try_to_free = 0; - - jh = commit_transaction->t_forget; - spin_unlock(&journal->j_list_lock); - bh = jh2bh(jh); - /* - * Get a reference so that bh cannot be freed before we are - * done with it. - */ - get_bh(bh); - jbd_lock_bh_state(bh); - J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || - jh->b_transaction == journal->j_running_transaction); - - /* - * If there is undo-protected committed data against - * this buffer, then we can remove it now. If it is a - * buffer needing such protection, the old frozen_data - * field now points to a committed version of the - * buffer, so rotate that field to the new committed - * data. - * - * Otherwise, we can just throw away the frozen data now. - */ - if (jh->b_committed_data) { - jbd_free(jh->b_committed_data, bh->b_size); - jh->b_committed_data = NULL; - if (jh->b_frozen_data) { - jh->b_committed_data = jh->b_frozen_data; - jh->b_frozen_data = NULL; - } - } else if (jh->b_frozen_data) { - jbd_free(jh->b_frozen_data, bh->b_size); - jh->b_frozen_data = NULL; - } - - spin_lock(&journal->j_list_lock); - cp_transaction = jh->b_cp_transaction; - if (cp_transaction) { - JBUFFER_TRACE(jh, "remove from old cp transaction"); - __journal_remove_checkpoint(jh); - } - - /* Only re-checkpoint the buffer_head if it is marked - * dirty. 
If the buffer was added to the BJ_Forget list - * by journal_forget, it may no longer be dirty and - * there's no point in keeping a checkpoint record for - * it. */ - - /* - * A buffer which has been freed while still being journaled by - * a previous transaction. - */ - if (buffer_freed(bh)) { - /* - * If the running transaction is the one containing - * "add to orphan" operation (b_next_transaction != - * NULL), we have to wait for that transaction to - * commit before we can really get rid of the buffer. - * So just clear b_modified to not confuse transaction - * credit accounting and refile the buffer to - * BJ_Forget of the running transaction. If the just - * committed transaction contains "add to orphan" - * operation, we can completely invalidate the buffer - * now. We are rather throughout in that since the - * buffer may be still accessible when blocksize < - * pagesize and it is attached to the last partial - * page. - */ - jh->b_modified = 0; - if (!jh->b_next_transaction) { - clear_buffer_freed(bh); - clear_buffer_jbddirty(bh); - clear_buffer_mapped(bh); - clear_buffer_new(bh); - clear_buffer_req(bh); - bh->b_bdev = NULL; - } - } - - if (buffer_jbddirty(bh)) { - JBUFFER_TRACE(jh, "add to new checkpointing trans"); - __journal_insert_checkpoint(jh, commit_transaction); - if (is_journal_aborted(journal)) - clear_buffer_jbddirty(bh); - } else { - J_ASSERT_BH(bh, !buffer_dirty(bh)); - /* - * The buffer on BJ_Forget list and not jbddirty means - * it has been freed by this transaction and hence it - * could not have been reallocated until this - * transaction has committed. *BUT* it could be - * reallocated once we have written all the data to - * disk and before we process the buffer on BJ_Forget - * list. - */ - if (!jh->b_next_transaction) - try_to_free = 1; - } - JBUFFER_TRACE(jh, "refile or unfile freed buffer"); - __journal_refile_buffer(jh); - jbd_unlock_bh_state(bh); - if (try_to_free) - release_buffer_page(bh); - else - __brelse(bh); - cond_resched_lock(&journal->j_list_lock); - } - spin_unlock(&journal->j_list_lock); - /* - * This is a bit sleazy. We use j_list_lock to protect transition - * of a transaction into T_FINISHED state and calling - * __journal_drop_transaction(). Otherwise we could race with - * other checkpointing code processing the transaction... - */ - spin_lock(&journal->j_state_lock); - spin_lock(&journal->j_list_lock); - /* - * Now recheck if some buffers did not get attached to the transaction - * while the lock was dropped... - */ - if (commit_transaction->t_forget) { - spin_unlock(&journal->j_list_lock); - spin_unlock(&journal->j_state_lock); - goto restart_loop; - } - - /* Done with this transaction! 
*/ - - jbd_debug(3, "JBD: commit phase 8\n"); - - J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD); - - commit_transaction->t_state = T_FINISHED; - J_ASSERT(commit_transaction == journal->j_committing_transaction); - journal->j_commit_sequence = commit_transaction->t_tid; - journal->j_committing_transaction = NULL; - commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); - - /* - * weight the commit time higher than the average time so we don't - * react too strongly to vast changes in commit time - */ - if (likely(journal->j_average_commit_time)) - journal->j_average_commit_time = (commit_time*3 + - journal->j_average_commit_time) / 4; - else - journal->j_average_commit_time = commit_time; - - spin_unlock(&journal->j_state_lock); - - if (commit_transaction->t_checkpoint_list == NULL && - commit_transaction->t_checkpoint_io_list == NULL) { - __journal_drop_transaction(journal, commit_transaction); - } else { - if (journal->j_checkpoint_transactions == NULL) { - journal->j_checkpoint_transactions = commit_transaction; - commit_transaction->t_cpnext = commit_transaction; - commit_transaction->t_cpprev = commit_transaction; - } else { - commit_transaction->t_cpnext = - journal->j_checkpoint_transactions; - commit_transaction->t_cpprev = - commit_transaction->t_cpnext->t_cpprev; - commit_transaction->t_cpnext->t_cpprev = - commit_transaction; - commit_transaction->t_cpprev->t_cpnext = - commit_transaction; - } - } - spin_unlock(&journal->j_list_lock); - - trace_jbd_end_commit(journal, commit_transaction); - jbd_debug(1, "JBD: commit %d complete, head %d\n", - journal->j_commit_sequence, journal->j_tail_sequence); - - wake_up(&journal->j_wait_done_commit); -} diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c deleted file mode 100644 index c46a79adb6ad..000000000000 --- a/fs/jbd/journal.c +++ /dev/null @@ -1,2145 +0,0 @@ -/* - * linux/fs/jbd/journal.c - * - * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 - * - * Copyright 1998 Red Hat corp --- All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * - * Generic filesystem journal-writing code; part of the ext2fs - * journaling system. - * - * This file manages journals: areas of disk reserved for logging - * transactional updates. This includes the kernel journaling thread - * which is responsible for scheduling updates to the log. - * - * We do not actually manage the physical storage of the journal in this - * file: that is left to a per-journal policy function, which allows us - * to store the journal within a filesystem-specified area for ext2 - * journaling (ext2 can use a reserved inode for storing the log). 
- */ - -#include <linux/module.h> -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/mm.h> -#include <linux/freezer.h> -#include <linux/pagemap.h> -#include <linux/kthread.h> -#include <linux/poison.h> -#include <linux/proc_fs.h> -#include <linux/debugfs.h> -#include <linux/ratelimit.h> - -#define CREATE_TRACE_POINTS -#include <trace/events/jbd.h> - -#include <asm/uaccess.h> -#include <asm/page.h> - -EXPORT_SYMBOL(journal_start); -EXPORT_SYMBOL(journal_restart); -EXPORT_SYMBOL(journal_extend); -EXPORT_SYMBOL(journal_stop); -EXPORT_SYMBOL(journal_lock_updates); -EXPORT_SYMBOL(journal_unlock_updates); -EXPORT_SYMBOL(journal_get_write_access); -EXPORT_SYMBOL(journal_get_create_access); -EXPORT_SYMBOL(journal_get_undo_access); -EXPORT_SYMBOL(journal_dirty_data); -EXPORT_SYMBOL(journal_dirty_metadata); -EXPORT_SYMBOL(journal_release_buffer); -EXPORT_SYMBOL(journal_forget); -#if 0 -EXPORT_SYMBOL(journal_sync_buffer); -#endif -EXPORT_SYMBOL(journal_flush); -EXPORT_SYMBOL(journal_revoke); - -EXPORT_SYMBOL(journal_init_dev); -EXPORT_SYMBOL(journal_init_inode); -EXPORT_SYMBOL(journal_update_format); -EXPORT_SYMBOL(journal_check_used_features); -EXPORT_SYMBOL(journal_check_available_features); -EXPORT_SYMBOL(journal_set_features); -EXPORT_SYMBOL(journal_create); -EXPORT_SYMBOL(journal_load); -EXPORT_SYMBOL(journal_destroy); -EXPORT_SYMBOL(journal_abort); -EXPORT_SYMBOL(journal_errno); -EXPORT_SYMBOL(journal_ack_err); -EXPORT_SYMBOL(journal_clear_err); -EXPORT_SYMBOL(log_wait_commit); -EXPORT_SYMBOL(log_start_commit); -EXPORT_SYMBOL(journal_start_commit); -EXPORT_SYMBOL(journal_force_commit_nested); -EXPORT_SYMBOL(journal_wipe); -EXPORT_SYMBOL(journal_blocks_per_page); -EXPORT_SYMBOL(journal_invalidatepage); -EXPORT_SYMBOL(journal_try_to_free_buffers); -EXPORT_SYMBOL(journal_force_commit); - -static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); -static void __journal_abort_soft (journal_t *journal, int errno); -static const char *journal_dev_name(journal_t *journal, char *buffer); - -#ifdef CONFIG_JBD_DEBUG -void __jbd_debug(int level, const char *file, const char *func, - unsigned int line, const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - if (level > journal_enable_debug) - return; - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf); - va_end(args); -} -EXPORT_SYMBOL(__jbd_debug); -#endif - -/* - * Helper function used to manage commit timeouts - */ - -static void commit_timeout(unsigned long __data) -{ - struct task_struct * p = (struct task_struct *) __data; - - wake_up_process(p); -} - -/* - * kjournald: The main thread function used to manage a logging device - * journal. - * - * This kernel thread is responsible for two things: - * - * 1) COMMIT: Every so often we need to commit the current state of the - * filesystem to disk. The journal thread is responsible for writing - * all of the metadata buffers to disk. - * - * 2) CHECKPOINT: We cannot reuse a used section of the log file until all - * of the data in that part of the log has been rewritten elsewhere on - * the disk. Flushing these old buffers to reclaim space in the log is - * known as checkpointing, and this thread is responsible for that job. 
- */ - -static int kjournald(void *arg) -{ - journal_t *journal = arg; - transaction_t *transaction; - - /* - * Set up an interval timer which can be used to trigger a commit wakeup - * after the commit interval expires - */ - setup_timer(&journal->j_commit_timer, commit_timeout, - (unsigned long)current); - - set_freezable(); - - /* Record that the journal thread is running */ - journal->j_task = current; - wake_up(&journal->j_wait_done_commit); - - printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n", - journal->j_commit_interval / HZ); - - /* - * And now, wait forever for commit wakeup events. - */ - spin_lock(&journal->j_state_lock); - -loop: - if (journal->j_flags & JFS_UNMOUNT) - goto end_loop; - - jbd_debug(1, "commit_sequence=%d, commit_request=%d\n", - journal->j_commit_sequence, journal->j_commit_request); - - if (journal->j_commit_sequence != journal->j_commit_request) { - jbd_debug(1, "OK, requests differ\n"); - spin_unlock(&journal->j_state_lock); - del_timer_sync(&journal->j_commit_timer); - journal_commit_transaction(journal); - spin_lock(&journal->j_state_lock); - goto loop; - } - - wake_up(&journal->j_wait_done_commit); - if (freezing(current)) { - /* - * The simpler the better. Flushing journal isn't a - * good idea, because that depends on threads that may - * be already stopped. - */ - jbd_debug(1, "Now suspending kjournald\n"); - spin_unlock(&journal->j_state_lock); - try_to_freeze(); - spin_lock(&journal->j_state_lock); - } else { - /* - * We assume on resume that commits are already there, - * so we don't sleep - */ - DEFINE_WAIT(wait); - int should_sleep = 1; - - prepare_to_wait(&journal->j_wait_commit, &wait, - TASK_INTERRUPTIBLE); - if (journal->j_commit_sequence != journal->j_commit_request) - should_sleep = 0; - transaction = journal->j_running_transaction; - if (transaction && time_after_eq(jiffies, - transaction->t_expires)) - should_sleep = 0; - if (journal->j_flags & JFS_UNMOUNT) - should_sleep = 0; - if (should_sleep) { - spin_unlock(&journal->j_state_lock); - schedule(); - spin_lock(&journal->j_state_lock); - } - finish_wait(&journal->j_wait_commit, &wait); - } - - jbd_debug(1, "kjournald wakes\n"); - - /* - * Were we woken up by a commit wakeup event? - */ - transaction = journal->j_running_transaction; - if (transaction && time_after_eq(jiffies, transaction->t_expires)) { - journal->j_commit_request = transaction->t_tid; - jbd_debug(1, "woke because of timeout\n"); - } - goto loop; - -end_loop: - spin_unlock(&journal->j_state_lock); - del_timer_sync(&journal->j_commit_timer); - journal->j_task = NULL; - wake_up(&journal->j_wait_done_commit); - jbd_debug(1, "Journal thread exiting.\n"); - return 0; -} - -static int journal_start_thread(journal_t *journal) -{ - struct task_struct *t; - - t = kthread_run(kjournald, journal, "kjournald"); - if (IS_ERR(t)) - return PTR_ERR(t); - - wait_event(journal->j_wait_done_commit, journal->j_task != NULL); - return 0; -} - -static void journal_kill_thread(journal_t *journal) -{ - spin_lock(&journal->j_state_lock); - journal->j_flags |= JFS_UNMOUNT; - - while (journal->j_task) { - wake_up(&journal->j_wait_commit); - spin_unlock(&journal->j_state_lock); - wait_event(journal->j_wait_done_commit, - journal->j_task == NULL); - spin_lock(&journal->j_state_lock); - } - spin_unlock(&journal->j_state_lock); -} - -/* - * journal_write_metadata_buffer: write a metadata buffer to the journal. - * - * Writes a metadata buffer to a given disk block. 
The actual IO is not - * performed but a new buffer_head is constructed which labels the data - * to be written with the correct destination disk block. - * - * Any magic-number escaping which needs to be done will cause a - * copy-out here. If the buffer happens to start with the - * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the - * magic number is only written to the log for descripter blocks. In - * this case, we copy the data and replace the first word with 0, and we - * return a result code which indicates that this buffer needs to be - * marked as an escaped buffer in the corresponding log descriptor - * block. The missing word can then be restored when the block is read - * during recovery. - * - * If the source buffer has already been modified by a new transaction - * since we took the last commit snapshot, we use the frozen copy of - * that data for IO. If we end up using the existing buffer_head's data - * for the write, then we *have* to lock the buffer to prevent anyone - * else from using and possibly modifying it while the IO is in - * progress. - * - * The function returns a pointer to the buffer_heads to be used for IO. - * - * We assume that the journal has already been locked in this function. - * - * Return value: - * <0: Error - * >=0: Finished OK - * - * On success: - * Bit 0 set == escape performed on the data - * Bit 1 set == buffer copy-out performed (kfree the data after IO) - */ - -int journal_write_metadata_buffer(transaction_t *transaction, - struct journal_head *jh_in, - struct journal_head **jh_out, - unsigned int blocknr) -{ - int need_copy_out = 0; - int done_copy_out = 0; - int do_escape = 0; - char *mapped_data; - struct buffer_head *new_bh; - struct journal_head *new_jh; - struct page *new_page; - unsigned int new_offset; - struct buffer_head *bh_in = jh2bh(jh_in); - journal_t *journal = transaction->t_journal; - - /* - * The buffer really shouldn't be locked: only the current committing - * transaction is allowed to write it, so nobody else is allowed - * to do any IO. - * - * akpm: except if we're journalling data, and write() output is - * also part of a shared mapping, and another thread has - * decided to launch a writepage() against this buffer. - */ - J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); - - new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); - /* keep subsequent assertions sane */ - atomic_set(&new_bh->b_count, 1); - new_jh = journal_add_journal_head(new_bh); /* This sleeps */ - - /* - * If a new transaction has already done a buffer copy-out, then - * we use that version of the data for the commit. - */ - jbd_lock_bh_state(bh_in); -repeat: - if (jh_in->b_frozen_data) { - done_copy_out = 1; - new_page = virt_to_page(jh_in->b_frozen_data); - new_offset = offset_in_page(jh_in->b_frozen_data); - } else { - new_page = jh2bh(jh_in)->b_page; - new_offset = offset_in_page(jh2bh(jh_in)->b_data); - } - - mapped_data = kmap_atomic(new_page); - /* - * Check for escaping - */ - if (*((__be32 *)(mapped_data + new_offset)) == - cpu_to_be32(JFS_MAGIC_NUMBER)) { - need_copy_out = 1; - do_escape = 1; - } - kunmap_atomic(mapped_data); - - /* - * Do we need to do a data copy? 
- */ - if (need_copy_out && !done_copy_out) { - char *tmp; - - jbd_unlock_bh_state(bh_in); - tmp = jbd_alloc(bh_in->b_size, GFP_NOFS); - jbd_lock_bh_state(bh_in); - if (jh_in->b_frozen_data) { - jbd_free(tmp, bh_in->b_size); - goto repeat; - } - - jh_in->b_frozen_data = tmp; - mapped_data = kmap_atomic(new_page); - memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); - kunmap_atomic(mapped_data); - - new_page = virt_to_page(tmp); - new_offset = offset_in_page(tmp); - done_copy_out = 1; - } - - /* - * Did we need to do an escaping? Now we've done all the - * copying, we can finally do so. - */ - if (do_escape) { - mapped_data = kmap_atomic(new_page); - *((unsigned int *)(mapped_data + new_offset)) = 0; - kunmap_atomic(mapped_data); - } - - set_bh_page(new_bh, new_page, new_offset); - new_jh->b_transaction = NULL; - new_bh->b_size = jh2bh(jh_in)->b_size; - new_bh->b_bdev = transaction->t_journal->j_dev; - new_bh->b_blocknr = blocknr; - set_buffer_mapped(new_bh); - set_buffer_dirty(new_bh); - - *jh_out = new_jh; - - /* - * The to-be-written buffer needs to get moved to the io queue, - * and the original buffer whose contents we are shadowing or - * copying is moved to the transaction's shadow queue. - */ - JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); - spin_lock(&journal->j_list_lock); - __journal_file_buffer(jh_in, transaction, BJ_Shadow); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh_in); - - JBUFFER_TRACE(new_jh, "file as BJ_IO"); - journal_file_buffer(new_jh, transaction, BJ_IO); - - return do_escape | (done_copy_out << 1); -} - -/* - * Allocation code for the journal file. Manage the space left in the - * journal, so that we can begin checkpointing when appropriate. - */ - -/* - * __log_space_left: Return the number of free blocks left in the journal. - * - * Called with the journal already locked. - * - * Called under j_state_lock - */ - -int __log_space_left(journal_t *journal) -{ - int left = journal->j_free; - - assert_spin_locked(&journal->j_state_lock); - - /* - * Be pessimistic here about the number of those free blocks which - * might be required for log descriptor control blocks. - */ - -#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */ - - left -= MIN_LOG_RESERVED_BLOCKS; - - if (left <= 0) - return 0; - left -= (left >> 3); - return left; -} - -/* - * Called under j_state_lock. Returns true if a transaction commit was started. - */ -int __log_start_commit(journal_t *journal, tid_t target) -{ - /* - * The only transaction we can possibly wait upon is the - * currently running transaction (if it exists). Otherwise, - * the target tid must be an old one. - */ - if (journal->j_commit_request != target && - journal->j_running_transaction && - journal->j_running_transaction->t_tid == target) { - /* - * We want a new commit: OK, mark the request and wakeup the - * commit thread. We do _not_ do the commit ourselves. - */ - - journal->j_commit_request = target; - jbd_debug(1, "JBD: requesting commit %d/%d\n", - journal->j_commit_request, - journal->j_commit_sequence); - wake_up(&journal->j_wait_commit); - return 1; - } else if (!tid_geq(journal->j_commit_request, target)) - /* This should never happen, but if it does, preserve - the evidence before kjournald goes into a loop and - increments j_commit_sequence beyond all recognition. */ - WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", - journal->j_commit_request, journal->j_commit_sequence, - target, journal->j_running_transaction ? 
- journal->j_running_transaction->t_tid : 0); - return 0; -} - -int log_start_commit(journal_t *journal, tid_t tid) -{ - int ret; - - spin_lock(&journal->j_state_lock); - ret = __log_start_commit(journal, tid); - spin_unlock(&journal->j_state_lock); - return ret; -} - -/* - * Force and wait upon a commit if the calling process is not within - * transaction. This is used for forcing out undo-protected data which contains - * bitmaps, when the fs is running out of space. - * - * We can only force the running transaction if we don't have an active handle; - * otherwise, we will deadlock. - * - * Returns true if a transaction was started. - */ -int journal_force_commit_nested(journal_t *journal) -{ - transaction_t *transaction = NULL; - tid_t tid; - - spin_lock(&journal->j_state_lock); - if (journal->j_running_transaction && !current->journal_info) { - transaction = journal->j_running_transaction; - __log_start_commit(journal, transaction->t_tid); - } else if (journal->j_committing_transaction) - transaction = journal->j_committing_transaction; - - if (!transaction) { - spin_unlock(&journal->j_state_lock); - return 0; /* Nothing to retry */ - } - - tid = transaction->t_tid; - spin_unlock(&journal->j_state_lock); - log_wait_commit(journal, tid); - return 1; -} - -/* - * Start a commit of the current running transaction (if any). Returns true - * if a transaction is going to be committed (or is currently already - * committing), and fills its tid in at *ptid - */ -int journal_start_commit(journal_t *journal, tid_t *ptid) -{ - int ret = 0; - - spin_lock(&journal->j_state_lock); - if (journal->j_running_transaction) { - tid_t tid = journal->j_running_transaction->t_tid; - - __log_start_commit(journal, tid); - /* There's a running transaction and we've just made sure - * it's commit has been scheduled. */ - if (ptid) - *ptid = tid; - ret = 1; - } else if (journal->j_committing_transaction) { - /* - * If commit has been started, then we have to wait for - * completion of that transaction. - */ - if (ptid) - *ptid = journal->j_committing_transaction->t_tid; - ret = 1; - } - spin_unlock(&journal->j_state_lock); - return ret; -} - -/* - * Wait for a specified commit to complete. - * The caller may not hold the journal lock. - */ -int log_wait_commit(journal_t *journal, tid_t tid) -{ - int err = 0; - -#ifdef CONFIG_JBD_DEBUG - spin_lock(&journal->j_state_lock); - if (!tid_geq(journal->j_commit_request, tid)) { - printk(KERN_ERR - "%s: error: j_commit_request=%d, tid=%d\n", - __func__, journal->j_commit_request, tid); - } - spin_unlock(&journal->j_state_lock); -#endif - spin_lock(&journal->j_state_lock); - /* - * Not running or committing trans? Must be already committed. This - * saves us from waiting for a *long* time when tid overflows. 
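The "tid overflows" remark above relies on jbd's wrap-safe transaction-ID comparisons, tid_gt()/tid_geq(), which live in the jbd header rather than in this file. A minimal standalone sketch of that idiom, assuming the usual signed-difference trick for 32-bit sequence numbers:

	typedef unsigned int tid_t;

	/* Ordering survives 32-bit wrap-around because the unsigned
	 * subtraction is reinterpreted as a signed value before the
	 * comparison against zero. */
	static inline int tid_gt(tid_t x, tid_t y)
	{
		int difference = (int)(x - y);
		return difference > 0;
	}

	static inline int tid_geq(tid_t x, tid_t y)
	{
		int difference = (int)(x - y);
		return difference >= 0;
	}

With these semantics a waiter comparing its wanted tid against j_commit_sequence keeps making progress even after the sequence counter wraps, provided the two values are less than 2^31 apart.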
- */ - if (!((journal->j_running_transaction && - journal->j_running_transaction->t_tid == tid) || - (journal->j_committing_transaction && - journal->j_committing_transaction->t_tid == tid))) - goto out_unlock; - - if (!tid_geq(journal->j_commit_waited, tid)) - journal->j_commit_waited = tid; - while (tid_gt(tid, journal->j_commit_sequence)) { - jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", - tid, journal->j_commit_sequence); - wake_up(&journal->j_wait_commit); - spin_unlock(&journal->j_state_lock); - wait_event(journal->j_wait_done_commit, - !tid_gt(tid, journal->j_commit_sequence)); - spin_lock(&journal->j_state_lock); - } -out_unlock: - spin_unlock(&journal->j_state_lock); - - if (unlikely(is_journal_aborted(journal))) - err = -EIO; - return err; -} - -/* - * Return 1 if a given transaction has not yet sent barrier request - * connected with a transaction commit. If 0 is returned, transaction - * may or may not have sent the barrier. Used to avoid sending barrier - * twice in common cases. - */ -int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid) -{ - int ret = 0; - transaction_t *commit_trans; - - if (!(journal->j_flags & JFS_BARRIER)) - return 0; - spin_lock(&journal->j_state_lock); - /* Transaction already committed? */ - if (tid_geq(journal->j_commit_sequence, tid)) - goto out; - /* - * Transaction is being committed and we already proceeded to - * writing commit record? - */ - commit_trans = journal->j_committing_transaction; - if (commit_trans && commit_trans->t_tid == tid && - commit_trans->t_state >= T_COMMIT_RECORD) - goto out; - ret = 1; -out: - spin_unlock(&journal->j_state_lock); - return ret; -} -EXPORT_SYMBOL(journal_trans_will_send_data_barrier); - -/* - * Log buffer allocation routines: - */ - -int journal_next_log_block(journal_t *journal, unsigned int *retp) -{ - unsigned int blocknr; - - spin_lock(&journal->j_state_lock); - J_ASSERT(journal->j_free > 1); - - blocknr = journal->j_head; - journal->j_head++; - journal->j_free--; - if (journal->j_head == journal->j_last) - journal->j_head = journal->j_first; - spin_unlock(&journal->j_state_lock); - return journal_bmap(journal, blocknr, retp); -} - -/* - * Conversion of logical to physical block numbers for the journal - * - * On external journals the journal blocks are identity-mapped, so - * this is a no-op. If needed, we can use j_blk_offset - everything is - * ready. - */ -int journal_bmap(journal_t *journal, unsigned int blocknr, - unsigned int *retp) -{ - int err = 0; - unsigned int ret; - - if (journal->j_inode) { - ret = bmap(journal->j_inode, blocknr); - if (ret) - *retp = ret; - else { - char b[BDEVNAME_SIZE]; - - printk(KERN_ALERT "%s: journal block not found " - "at offset %u on %s\n", - __func__, - blocknr, - bdevname(journal->j_dev, b)); - err = -EIO; - __journal_abort_soft(journal, err); - } - } else { - *retp = blocknr; /* +journal->j_blk_offset */ - } - return err; -} - -/* - * We play buffer_head aliasing tricks to write data/metadata blocks to - * the journal without copying their contents, but for journal - * descriptor blocks we do need to generate bona fide buffers. - * - * After the caller of journal_get_descriptor_buffer() has finished modifying - * the buffer's contents they really should run flush_dcache_page(bh->b_page). - * But we don't bother doing that, so there will be coherency problems with - * mmaps of blockdevs which hold live JBD-controlled filesystems. 
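Given that caveat, a strictly coherent caller of journal_get_descriptor_buffer() would look roughly like the hypothetical sketch below; as the comment notes, the in-tree code deliberately omits the flush:

	struct journal_head *descriptor;
	struct buffer_head *dbh;

	descriptor = journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return -EIO;		/* aborted journal or allocation failure */
	dbh = jh2bh(descriptor);
	/* ... format tags or revoke records into dbh->b_data ... */
	flush_dcache_page(dbh->b_page);	/* keep mmap'ed views of the blockdev coherent */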
- */ -struct journal_head *journal_get_descriptor_buffer(journal_t *journal) -{ - struct buffer_head *bh; - unsigned int blocknr; - int err; - - err = journal_next_log_block(journal, &blocknr); - - if (err) - return NULL; - - bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - if (!bh) - return NULL; - lock_buffer(bh); - memset(bh->b_data, 0, journal->j_blocksize); - set_buffer_uptodate(bh); - unlock_buffer(bh); - BUFFER_TRACE(bh, "return this buffer"); - return journal_add_journal_head(bh); -} - -/* - * Management for journal control blocks: functions to create and - * destroy journal_t structures, and to initialise and read existing - * journal blocks from disk. */ - -/* First: create and setup a journal_t object in memory. We initialise - * very few fields yet: that has to wait until we have created the - * journal structures from from scratch, or loaded them from disk. */ - -static journal_t * journal_init_common (void) -{ - journal_t *journal; - int err; - - journal = kzalloc(sizeof(*journal), GFP_KERNEL); - if (!journal) - goto fail; - - init_waitqueue_head(&journal->j_wait_transaction_locked); - init_waitqueue_head(&journal->j_wait_logspace); - init_waitqueue_head(&journal->j_wait_done_commit); - init_waitqueue_head(&journal->j_wait_checkpoint); - init_waitqueue_head(&journal->j_wait_commit); - init_waitqueue_head(&journal->j_wait_updates); - mutex_init(&journal->j_checkpoint_mutex); - spin_lock_init(&journal->j_revoke_lock); - spin_lock_init(&journal->j_list_lock); - spin_lock_init(&journal->j_state_lock); - - journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE); - - /* The journal is marked for error until we succeed with recovery! */ - journal->j_flags = JFS_ABORT; - - /* Set up a default-sized revoke table for the new mount. */ - err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); - if (err) { - kfree(journal); - goto fail; - } - return journal; -fail: - return NULL; -} - -/* journal_init_dev and journal_init_inode: - * - * Create a journal structure assigned some fixed set of disk blocks to - * the journal. We don't actually touch those disk blocks yet, but we - * need to set up all of the mapping information to tell the journaling - * system where the journal blocks are. - * - */ - -/** - * journal_t * journal_init_dev() - creates and initialises a journal structure - * @bdev: Block device on which to create the journal - * @fs_dev: Device which hold journalled filesystem for this journal. - * @start: Block nr Start of journal. - * @len: Length of the journal in blocks. - * @blocksize: blocksize of journalling device - * - * Returns: a newly created journal_t * - * - * journal_init_dev creates a journal which maps a fixed contiguous - * range of blocks on an arbitrary block device. 
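A hypothetical caller sketch (device pointers, block numbers and sizes below are placeholders, not taken from this patch) of the pattern the description above implies, setting up a journal on a dedicated device and then loading it:

	journal_t *journal;

	journal = journal_init_dev(journal_bdev, fs_bdev,
				   /* start */ 1, /* len */ 8192,
				   /* blocksize */ 4096);
	if (!journal)
		return -ENOMEM;
	if (journal_load(journal)) {	/* validate and, if needed, replay the log */
		journal_destroy(journal);
		return -EIO;
	}

The separate fs_dev argument matters because commit and recovery issue cache flushes against the filesystem device (j_fs_dev), which may differ from the device holding the journal itself.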
- * - */ -journal_t * journal_init_dev(struct block_device *bdev, - struct block_device *fs_dev, - int start, int len, int blocksize) -{ - journal_t *journal = journal_init_common(); - struct buffer_head *bh; - int n; - - if (!journal) - return NULL; - - /* journal descriptor can store up to n blocks -bzzz */ - journal->j_blocksize = blocksize; - n = journal->j_blocksize / sizeof(journal_block_tag_t); - journal->j_wbufsize = n; - journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); - if (!journal->j_wbuf) { - printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", - __func__); - goto out_err; - } - journal->j_dev = bdev; - journal->j_fs_dev = fs_dev; - journal->j_blk_offset = start; - journal->j_maxlen = len; - - bh = __getblk(journal->j_dev, start, journal->j_blocksize); - if (!bh) { - printk(KERN_ERR - "%s: Cannot get buffer for journal superblock\n", - __func__); - goto out_err; - } - journal->j_sb_buffer = bh; - journal->j_superblock = (journal_superblock_t *)bh->b_data; - - return journal; -out_err: - kfree(journal->j_wbuf); - kfree(journal); - return NULL; -} - -/** - * journal_t * journal_init_inode () - creates a journal which maps to a inode. - * @inode: An inode to create the journal in - * - * journal_init_inode creates a journal which maps an on-disk inode as - * the journal. The inode must exist already, must support bmap() and - * must have all data blocks preallocated. - */ -journal_t * journal_init_inode (struct inode *inode) -{ - struct buffer_head *bh; - journal_t *journal = journal_init_common(); - int err; - int n; - unsigned int blocknr; - - if (!journal) - return NULL; - - journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; - journal->j_inode = inode; - jbd_debug(1, - "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", - journal, inode->i_sb->s_id, inode->i_ino, - (long long) inode->i_size, - inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); - - journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; - journal->j_blocksize = inode->i_sb->s_blocksize; - - /* journal descriptor can store up to n blocks -bzzz */ - n = journal->j_blocksize / sizeof(journal_block_tag_t); - journal->j_wbufsize = n; - journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); - if (!journal->j_wbuf) { - printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", - __func__); - goto out_err; - } - - err = journal_bmap(journal, 0, &blocknr); - /* If that failed, give up */ - if (err) { - printk(KERN_ERR "%s: Cannot locate journal superblock\n", - __func__); - goto out_err; - } - - bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize); - if (!bh) { - printk(KERN_ERR - "%s: Cannot get buffer for journal superblock\n", - __func__); - goto out_err; - } - journal->j_sb_buffer = bh; - journal->j_superblock = (journal_superblock_t *)bh->b_data; - - return journal; -out_err: - kfree(journal->j_wbuf); - kfree(journal); - return NULL; -} - -/* - * If the journal init or create aborts, we need to mark the journal - * superblock as being NULL to prevent the journal destroy from writing - * back a bogus superblock. - */ -static void journal_fail_superblock (journal_t *journal) -{ - struct buffer_head *bh = journal->j_sb_buffer; - brelse(bh); - journal->j_sb_buffer = NULL; -} - -/* - * Given a journal_t structure, initialise the various fields for - * startup of a new journaling session. We use this both when creating - * a journal, and after recovering an old journal to reset it for - * subsequent use. 
- */ - -static int journal_reset(journal_t *journal) -{ - journal_superblock_t *sb = journal->j_superblock; - unsigned int first, last; - - first = be32_to_cpu(sb->s_first); - last = be32_to_cpu(sb->s_maxlen); - if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) { - printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n", - first, last); - journal_fail_superblock(journal); - return -EINVAL; - } - - journal->j_first = first; - journal->j_last = last; - - journal->j_head = first; - journal->j_tail = first; - journal->j_free = last - first; - - journal->j_tail_sequence = journal->j_transaction_sequence; - journal->j_commit_sequence = journal->j_transaction_sequence - 1; - journal->j_commit_request = journal->j_commit_sequence; - - journal->j_max_transaction_buffers = journal->j_maxlen / 4; - - /* - * As a special case, if the on-disk copy is already marked as needing - * no recovery (s_start == 0), then we can safely defer the superblock - * update until the next commit by setting JFS_FLUSHED. This avoids - * attempting a write to a potential-readonly device. - */ - if (sb->s_start == 0) { - jbd_debug(1,"JBD: Skipping superblock update on recovered sb " - "(start %u, seq %d, errno %d)\n", - journal->j_tail, journal->j_tail_sequence, - journal->j_errno); - journal->j_flags |= JFS_FLUSHED; - } else { - /* Lock here to make assertions happy... */ - mutex_lock(&journal->j_checkpoint_mutex); - /* - * Update log tail information. We use WRITE_FUA since new - * transaction will start reusing journal space and so we - * must make sure information about current log tail is on - * disk before that. - */ - journal_update_sb_log_tail(journal, - journal->j_tail_sequence, - journal->j_tail, - WRITE_FUA); - mutex_unlock(&journal->j_checkpoint_mutex); - } - return journal_start_thread(journal); -} - -/** - * int journal_create() - Initialise the new journal file - * @journal: Journal to create. This structure must have been initialised - * - * Given a journal_t structure which tells us which disk blocks we can - * use, create a new journal superblock and initialise all of the - * journal fields from scratch. - **/ -int journal_create(journal_t *journal) -{ - unsigned int blocknr; - struct buffer_head *bh; - journal_superblock_t *sb; - int i, err; - - if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) { - printk (KERN_ERR "Journal length (%d blocks) too short.\n", - journal->j_maxlen); - journal_fail_superblock(journal); - return -EINVAL; - } - - if (journal->j_inode == NULL) { - /* - * We don't know what block to start at! - */ - printk(KERN_EMERG - "%s: creation of journal on external device!\n", - __func__); - BUG(); - } - - /* Zero out the entire journal on disk. We cannot afford to - have any blocks on disk beginning with JFS_MAGIC_NUMBER. 
*/ - jbd_debug(1, "JBD: Zeroing out journal blocks...\n"); - for (i = 0; i < journal->j_maxlen; i++) { - err = journal_bmap(journal, i, &blocknr); - if (err) - return err; - bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - if (unlikely(!bh)) - return -ENOMEM; - lock_buffer(bh); - memset (bh->b_data, 0, journal->j_blocksize); - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - BUFFER_TRACE(bh, "marking uptodate"); - set_buffer_uptodate(bh); - unlock_buffer(bh); - __brelse(bh); - } - - sync_blockdev(journal->j_dev); - jbd_debug(1, "JBD: journal cleared.\n"); - - /* OK, fill in the initial static fields in the new superblock */ - sb = journal->j_superblock; - - sb->s_header.h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); - sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); - - sb->s_blocksize = cpu_to_be32(journal->j_blocksize); - sb->s_maxlen = cpu_to_be32(journal->j_maxlen); - sb->s_first = cpu_to_be32(1); - - journal->j_transaction_sequence = 1; - - journal->j_flags &= ~JFS_ABORT; - journal->j_format_version = 2; - - return journal_reset(journal); -} - -static void journal_write_superblock(journal_t *journal, int write_op) -{ - struct buffer_head *bh = journal->j_sb_buffer; - int ret; - - trace_journal_write_superblock(journal, write_op); - if (!(journal->j_flags & JFS_BARRIER)) - write_op &= ~(REQ_FUA | REQ_FLUSH); - lock_buffer(bh); - if (buffer_write_io_error(bh)) { - char b[BDEVNAME_SIZE]; - /* - * Oh, dear. A previous attempt to write the journal - * superblock failed. This could happen because the - * USB device was yanked out. Or it could happen to - * be a transient write error and maybe the block will - * be remapped. Nothing we can do but to retry the - * write and hope for the best. - */ - printk(KERN_ERR "JBD: previous I/O error detected " - "for journal superblock update for %s.\n", - journal_dev_name(journal, b)); - clear_buffer_write_io_error(bh); - set_buffer_uptodate(bh); - } - - get_bh(bh); - bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(write_op, bh); - wait_on_buffer(bh); - if (buffer_write_io_error(bh)) { - clear_buffer_write_io_error(bh); - set_buffer_uptodate(bh); - ret = -EIO; - } - if (ret) { - char b[BDEVNAME_SIZE]; - printk(KERN_ERR "JBD: Error %d detected " - "when updating journal superblock for %s.\n", - ret, journal_dev_name(journal, b)); - } -} - -/** - * journal_update_sb_log_tail() - Update log tail in journal sb on disk. - * @journal: The journal to update. - * @tail_tid: TID of the new transaction at the tail of the log - * @tail_block: The first block of the transaction at the tail of the log - * @write_op: With which operation should we write the journal sb - * - * Update a journal's superblock information about log tail and write it to - * disk, waiting for the IO to complete. - */ -void journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, - unsigned int tail_block, int write_op) -{ - journal_superblock_t *sb = journal->j_superblock; - - BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); - jbd_debug(1,"JBD: updating superblock (start %u, seq %u)\n", - tail_block, tail_tid); - - sb->s_sequence = cpu_to_be32(tail_tid); - sb->s_start = cpu_to_be32(tail_block); - - journal_write_superblock(journal, write_op); - - /* Log is no longer empty */ - spin_lock(&journal->j_state_lock); - WARN_ON(!sb->s_sequence); - journal->j_flags &= ~JFS_FLUSHED; - spin_unlock(&journal->j_state_lock); -} - -/** - * mark_journal_empty() - Mark on disk journal as empty. - * @journal: The journal to update. 
- * - * Update a journal's dynamic superblock fields to show that journal is empty. - * Write updated superblock to disk waiting for IO to complete. - */ -static void mark_journal_empty(journal_t *journal) -{ - journal_superblock_t *sb = journal->j_superblock; - - BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); - spin_lock(&journal->j_state_lock); - /* Is it already empty? */ - if (sb->s_start == 0) { - spin_unlock(&journal->j_state_lock); - return; - } - jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n", - journal->j_tail_sequence); - - sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); - sb->s_start = cpu_to_be32(0); - spin_unlock(&journal->j_state_lock); - - journal_write_superblock(journal, WRITE_FUA); - - spin_lock(&journal->j_state_lock); - /* Log is empty */ - journal->j_flags |= JFS_FLUSHED; - spin_unlock(&journal->j_state_lock); -} - -/** - * journal_update_sb_errno() - Update error in the journal. - * @journal: The journal to update. - * - * Update a journal's errno. Write updated superblock to disk waiting for IO - * to complete. - */ -static void journal_update_sb_errno(journal_t *journal) -{ - journal_superblock_t *sb = journal->j_superblock; - - spin_lock(&journal->j_state_lock); - jbd_debug(1, "JBD: updating superblock error (errno %d)\n", - journal->j_errno); - sb->s_errno = cpu_to_be32(journal->j_errno); - spin_unlock(&journal->j_state_lock); - - journal_write_superblock(journal, WRITE_SYNC); -} - -/* - * Read the superblock for a given journal, performing initial - * validation of the format. - */ - -static int journal_get_superblock(journal_t *journal) -{ - struct buffer_head *bh; - journal_superblock_t *sb; - int err = -EIO; - - bh = journal->j_sb_buffer; - - J_ASSERT(bh != NULL); - if (!buffer_uptodate(bh)) { - ll_rw_block(READ, 1, &bh); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - printk (KERN_ERR - "JBD: IO error reading journal superblock\n"); - goto out; - } - } - - sb = journal->j_superblock; - - err = -EINVAL; - - if (sb->s_header.h_magic != cpu_to_be32(JFS_MAGIC_NUMBER) || - sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { - printk(KERN_WARNING "JBD: no valid journal superblock found\n"); - goto out; - } - - switch(be32_to_cpu(sb->s_header.h_blocktype)) { - case JFS_SUPERBLOCK_V1: - journal->j_format_version = 1; - break; - case JFS_SUPERBLOCK_V2: - journal->j_format_version = 2; - break; - default: - printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); - goto out; - } - - if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) - journal->j_maxlen = be32_to_cpu(sb->s_maxlen); - else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { - printk (KERN_WARNING "JBD: journal file too short\n"); - goto out; - } - - if (be32_to_cpu(sb->s_first) == 0 || - be32_to_cpu(sb->s_first) >= journal->j_maxlen) { - printk(KERN_WARNING - "JBD: Invalid start block of journal: %u\n", - be32_to_cpu(sb->s_first)); - goto out; - } - - return 0; - -out: - journal_fail_superblock(journal); - return err; -} - -/* - * Load the on-disk journal superblock and read the key fields into the - * journal_t. 
- */ - -static int load_superblock(journal_t *journal) -{ - int err; - journal_superblock_t *sb; - - err = journal_get_superblock(journal); - if (err) - return err; - - sb = journal->j_superblock; - - journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); - journal->j_tail = be32_to_cpu(sb->s_start); - journal->j_first = be32_to_cpu(sb->s_first); - journal->j_last = be32_to_cpu(sb->s_maxlen); - journal->j_errno = be32_to_cpu(sb->s_errno); - - return 0; -} - - -/** - * int journal_load() - Read journal from disk. - * @journal: Journal to act on. - * - * Given a journal_t structure which tells us which disk blocks contain - * a journal, read the journal from disk to initialise the in-memory - * structures. - */ -int journal_load(journal_t *journal) -{ - int err; - journal_superblock_t *sb; - - err = load_superblock(journal); - if (err) - return err; - - sb = journal->j_superblock; - /* If this is a V2 superblock, then we have to check the - * features flags on it. */ - - if (journal->j_format_version >= 2) { - if ((sb->s_feature_ro_compat & - ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) || - (sb->s_feature_incompat & - ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) { - printk (KERN_WARNING - "JBD: Unrecognised features on journal\n"); - return -EINVAL; - } - } - - /* Let the recovery code check whether it needs to recover any - * data from the journal. */ - if (journal_recover(journal)) - goto recovery_error; - - /* OK, we've finished with the dynamic journal bits: - * reinitialise the dynamic contents of the superblock in memory - * and reset them on disk. */ - if (journal_reset(journal)) - goto recovery_error; - - journal->j_flags &= ~JFS_ABORT; - journal->j_flags |= JFS_LOADED; - return 0; - -recovery_error: - printk (KERN_WARNING "JBD: recovery failed\n"); - return -EIO; -} - -/** - * void journal_destroy() - Release a journal_t structure. - * @journal: Journal to act on. - * - * Release a journal_t structure once it is no longer in use by the - * journaled object. - * Return <0 if we couldn't clean up the journal. - */ -int journal_destroy(journal_t *journal) -{ - int err = 0; - - - /* Wait for the commit thread to wake up and die. */ - journal_kill_thread(journal); - - /* Force a final log commit */ - if (journal->j_running_transaction) - journal_commit_transaction(journal); - - /* Force any old transactions to disk */ - - /* We cannot race with anybody but must keep assertions happy */ - mutex_lock(&journal->j_checkpoint_mutex); - /* Totally anal locking here... */ - spin_lock(&journal->j_list_lock); - while (journal->j_checkpoint_transactions != NULL) { - spin_unlock(&journal->j_list_lock); - log_do_checkpoint(journal); - spin_lock(&journal->j_list_lock); - } - - J_ASSERT(journal->j_running_transaction == NULL); - J_ASSERT(journal->j_committing_transaction == NULL); - J_ASSERT(journal->j_checkpoint_transactions == NULL); - spin_unlock(&journal->j_list_lock); - - if (journal->j_sb_buffer) { - if (!is_journal_aborted(journal)) { - journal->j_tail_sequence = - ++journal->j_transaction_sequence; - mark_journal_empty(journal); - } else - err = -EIO; - brelse(journal->j_sb_buffer); - } - mutex_unlock(&journal->j_checkpoint_mutex); - - iput(journal->j_inode); - if (journal->j_revoke) - journal_destroy_revoke(journal); - kfree(journal->j_wbuf); - kfree(journal); - - return err; -} - - -/** - *int journal_check_used_features () - Check if features specified are used. - * @journal: Journal to check. 
- * @compat: bitmask of compatible features - * @ro: bitmask of features that force read-only mount - * @incompat: bitmask of incompatible features - * - * Check whether the journal uses all of a given set of - * features. Return true (non-zero) if it does. - **/ - -int journal_check_used_features (journal_t *journal, unsigned long compat, - unsigned long ro, unsigned long incompat) -{ - journal_superblock_t *sb; - - if (!compat && !ro && !incompat) - return 1; - if (journal->j_format_version == 1) - return 0; - - sb = journal->j_superblock; - - if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) && - ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) && - ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat)) - return 1; - - return 0; -} - -/** - * int journal_check_available_features() - Check feature set in journalling layer - * @journal: Journal to check. - * @compat: bitmask of compatible features - * @ro: bitmask of features that force read-only mount - * @incompat: bitmask of incompatible features - * - * Check whether the journaling code supports the use of - * all of a given set of features on this journal. Return true - * (non-zero) if it can. */ - -int journal_check_available_features (journal_t *journal, unsigned long compat, - unsigned long ro, unsigned long incompat) -{ - if (!compat && !ro && !incompat) - return 1; - - /* We can support any known requested features iff the - * superblock is in version 2. Otherwise we fail to support any - * extended sb features. */ - - if (journal->j_format_version != 2) - return 0; - - if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat && - (ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro && - (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat) - return 1; - - return 0; -} - -/** - * int journal_set_features () - Mark a given journal feature in the superblock - * @journal: Journal to act on. - * @compat: bitmask of compatible features - * @ro: bitmask of features that force read-only mount - * @incompat: bitmask of incompatible features - * - * Mark a given journal feature as present on the - * superblock. Returns true if the requested features could be set. - * - */ - -int journal_set_features (journal_t *journal, unsigned long compat, - unsigned long ro, unsigned long incompat) -{ - journal_superblock_t *sb; - - if (journal_check_used_features(journal, compat, ro, incompat)) - return 1; - - if (!journal_check_available_features(journal, compat, ro, incompat)) - return 0; - - jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", - compat, ro, incompat); - - sb = journal->j_superblock; - - sb->s_feature_compat |= cpu_to_be32(compat); - sb->s_feature_ro_compat |= cpu_to_be32(ro); - sb->s_feature_incompat |= cpu_to_be32(incompat); - - return 1; -} - - -/** - * int journal_update_format () - Update on-disk journal structure. - * @journal: Journal to act on. - * - * Given an initialised but unloaded journal struct, poke about in the - * on-disk structure to update it to the most recent supported version. 
- */ -int journal_update_format (journal_t *journal) -{ - journal_superblock_t *sb; - int err; - - err = journal_get_superblock(journal); - if (err) - return err; - - sb = journal->j_superblock; - - switch (be32_to_cpu(sb->s_header.h_blocktype)) { - case JFS_SUPERBLOCK_V2: - return 0; - case JFS_SUPERBLOCK_V1: - return journal_convert_superblock_v1(journal, sb); - default: - break; - } - return -EINVAL; -} - -static int journal_convert_superblock_v1(journal_t *journal, - journal_superblock_t *sb) -{ - int offset, blocksize; - struct buffer_head *bh; - - printk(KERN_WARNING - "JBD: Converting superblock from version 1 to 2.\n"); - - /* Pre-initialise new fields to zero */ - offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); - blocksize = be32_to_cpu(sb->s_blocksize); - memset(&sb->s_feature_compat, 0, blocksize-offset); - - sb->s_nr_users = cpu_to_be32(1); - sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); - journal->j_format_version = 2; - - bh = journal->j_sb_buffer; - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - sync_dirty_buffer(bh); - return 0; -} - - -/** - * int journal_flush () - Flush journal - * @journal: Journal to act on. - * - * Flush all data for a given journal to disk and empty the journal. - * Filesystems can use this when remounting readonly to ensure that - * recovery does not need to happen on remount. - */ - -int journal_flush(journal_t *journal) -{ - int err = 0; - transaction_t *transaction = NULL; - - spin_lock(&journal->j_state_lock); - - /* Force everything buffered to the log... */ - if (journal->j_running_transaction) { - transaction = journal->j_running_transaction; - __log_start_commit(journal, transaction->t_tid); - } else if (journal->j_committing_transaction) - transaction = journal->j_committing_transaction; - - /* Wait for the log commit to complete... */ - if (transaction) { - tid_t tid = transaction->t_tid; - - spin_unlock(&journal->j_state_lock); - log_wait_commit(journal, tid); - } else { - spin_unlock(&journal->j_state_lock); - } - - /* ...and flush everything in the log out to disk. */ - spin_lock(&journal->j_list_lock); - while (!err && journal->j_checkpoint_transactions != NULL) { - spin_unlock(&journal->j_list_lock); - mutex_lock(&journal->j_checkpoint_mutex); - err = log_do_checkpoint(journal); - mutex_unlock(&journal->j_checkpoint_mutex); - spin_lock(&journal->j_list_lock); - } - spin_unlock(&journal->j_list_lock); - - if (is_journal_aborted(journal)) - return -EIO; - - mutex_lock(&journal->j_checkpoint_mutex); - cleanup_journal_tail(journal); - - /* Finally, mark the journal as really needing no recovery. - * This sets s_start==0 in the underlying superblock, which is - * the magic code for a fully-recovered superblock. Any future - * commits of data to the journal will restore the current - * s_start value. */ - mark_journal_empty(journal); - mutex_unlock(&journal->j_checkpoint_mutex); - spin_lock(&journal->j_state_lock); - J_ASSERT(!journal->j_running_transaction); - J_ASSERT(!journal->j_committing_transaction); - J_ASSERT(!journal->j_checkpoint_transactions); - J_ASSERT(journal->j_head == journal->j_tail); - J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); - spin_unlock(&journal->j_state_lock); - return 0; -} - -/** - * int journal_wipe() - Wipe journal contents - * @journal: Journal to act on. - * @write: flag (see below) - * - * Wipe out all of the contents of a journal, safely. This will produce - * a warning if the journal contains any valid recovery information. 
- * Must be called between journal_init_*() and journal_load(). - * - * If 'write' is non-zero, then we wipe out the journal on disk; otherwise - * we merely suppress recovery. - */ - -int journal_wipe(journal_t *journal, int write) -{ - int err = 0; - - J_ASSERT (!(journal->j_flags & JFS_LOADED)); - - err = load_superblock(journal); - if (err) - return err; - - if (!journal->j_tail) - goto no_recovery; - - printk (KERN_WARNING "JBD: %s recovery information on journal\n", - write ? "Clearing" : "Ignoring"); - - err = journal_skip_recovery(journal); - if (write) { - /* Lock to make assertions happy... */ - mutex_lock(&journal->j_checkpoint_mutex); - mark_journal_empty(journal); - mutex_unlock(&journal->j_checkpoint_mutex); - } - - no_recovery: - return err; -} - -/* - * journal_dev_name: format a character string to describe on what - * device this journal is present. - */ - -static const char *journal_dev_name(journal_t *journal, char *buffer) -{ - struct block_device *bdev; - - if (journal->j_inode) - bdev = journal->j_inode->i_sb->s_bdev; - else - bdev = journal->j_dev; - - return bdevname(bdev, buffer); -} - -/* - * Journal abort has very specific semantics, which we describe - * for journal abort. - * - * Two internal function, which provide abort to te jbd layer - * itself are here. - */ - -/* - * Quick version for internal journal use (doesn't lock the journal). - * Aborts hard --- we mark the abort as occurred, but do _nothing_ else, - * and don't attempt to make any other journal updates. - */ -static void __journal_abort_hard(journal_t *journal) -{ - transaction_t *transaction; - char b[BDEVNAME_SIZE]; - - if (journal->j_flags & JFS_ABORT) - return; - - printk(KERN_ERR "Aborting journal on device %s.\n", - journal_dev_name(journal, b)); - - spin_lock(&journal->j_state_lock); - journal->j_flags |= JFS_ABORT; - transaction = journal->j_running_transaction; - if (transaction) - __log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); -} - -/* Soft abort: record the abort error status in the journal superblock, - * but don't do any other IO. */ -static void __journal_abort_soft (journal_t *journal, int errno) -{ - if (journal->j_flags & JFS_ABORT) - return; - - if (!journal->j_errno) - journal->j_errno = errno; - - __journal_abort_hard(journal); - - if (errno) - journal_update_sb_errno(journal); -} - -/** - * void journal_abort () - Shutdown the journal immediately. - * @journal: the journal to shutdown. - * @errno: an error number to record in the journal indicating - * the reason for the shutdown. - * - * Perform a complete, immediate shutdown of the ENTIRE - * journal (not of a single transaction). This operation cannot be - * undone without closing and reopening the journal. - * - * The journal_abort function is intended to support higher level error - * recovery mechanisms such as the ext2/ext3 remount-readonly error - * mode. - * - * Journal abort has very specific semantics. Any existing dirty, - * unjournaled buffers in the main filesystem will still be written to - * disk by bdflush, but the journaling mechanism will be suspended - * immediately and no further transaction commits will be honoured. - * - * Any dirty, journaled buffers will be written back to disk without - * hitting the journal. Atomicity cannot be guaranteed on an aborted - * filesystem, but we _do_ attempt to leave as much data as possible - * behind for fsck to use for cleanup. 
- * - * Any attempt to get a new transaction handle on a journal which is in - * ABORT state will just result in an -EROFS error return. A - * journal_stop on an existing handle will return -EIO if we have - * entered abort state during the update. - * - * Recursive transactions are not disturbed by journal abort until the - * final journal_stop, which will receive the -EIO error. - * - * Finally, the journal_abort call allows the caller to supply an errno - * which will be recorded (if possible) in the journal superblock. This - * allows a client to record failure conditions in the middle of a - * transaction without having to complete the transaction to record the - * failure to disk. ext3_error, for example, now uses this - * functionality. - * - * Errors which originate from within the journaling layer will NOT - * supply an errno; a null errno implies that absolutely no further - * writes are done to the journal (unless there are any already in - * progress). - * - */ - -void journal_abort(journal_t *journal, int errno) -{ - __journal_abort_soft(journal, errno); -} - -/** - * int journal_errno () - returns the journal's error state. - * @journal: journal to examine. - * - * This is the errno numbet set with journal_abort(), the last - * time the journal was mounted - if the journal was stopped - * without calling abort this will be 0. - * - * If the journal has been aborted on this mount time -EROFS will - * be returned. - */ -int journal_errno(journal_t *journal) -{ - int err; - - spin_lock(&journal->j_state_lock); - if (journal->j_flags & JFS_ABORT) - err = -EROFS; - else - err = journal->j_errno; - spin_unlock(&journal->j_state_lock); - return err; -} - -/** - * int journal_clear_err () - clears the journal's error state - * @journal: journal to act on. - * - * An error must be cleared or Acked to take a FS out of readonly - * mode. - */ -int journal_clear_err(journal_t *journal) -{ - int err = 0; - - spin_lock(&journal->j_state_lock); - if (journal->j_flags & JFS_ABORT) - err = -EROFS; - else - journal->j_errno = 0; - spin_unlock(&journal->j_state_lock); - return err; -} - -/** - * void journal_ack_err() - Ack journal err. - * @journal: journal to act on. - * - * An error must be cleared or Acked to take a FS out of readonly - * mode. 
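A hypothetical sketch of how a filesystem could honour that contract when bringing the journal back into service (ext3 does something along these lines in its mount path; the exact sequence here is illustrative rather than copied from ext3):

	int err = journal_errno(journal);

	if (err) {
		/* record the failure in the filesystem's own superblock first ... */
		journal_clear_err(journal);	/* reset the recorded errno */
		journal_ack_err(journal);	/* acknowledge the aborted state */
	}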
- */ -void journal_ack_err(journal_t *journal) -{ - spin_lock(&journal->j_state_lock); - if (journal->j_errno) - journal->j_flags |= JFS_ACK_ERR; - spin_unlock(&journal->j_state_lock); -} - -int journal_blocks_per_page(struct inode *inode) -{ - return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); -} - -/* - * Journal_head storage management - */ -static struct kmem_cache *journal_head_cache; -#ifdef CONFIG_JBD_DEBUG -static atomic_t nr_journal_heads = ATOMIC_INIT(0); -#endif - -static int journal_init_journal_head_cache(void) -{ - int retval; - - J_ASSERT(journal_head_cache == NULL); - journal_head_cache = kmem_cache_create("journal_head", - sizeof(struct journal_head), - 0, /* offset */ - SLAB_TEMPORARY, /* flags */ - NULL); /* ctor */ - retval = 0; - if (!journal_head_cache) { - retval = -ENOMEM; - printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); - } - return retval; -} - -static void journal_destroy_journal_head_cache(void) -{ - if (journal_head_cache) { - kmem_cache_destroy(journal_head_cache); - journal_head_cache = NULL; - } -} - -/* - * journal_head splicing and dicing - */ -static struct journal_head *journal_alloc_journal_head(void) -{ - struct journal_head *ret; - -#ifdef CONFIG_JBD_DEBUG - atomic_inc(&nr_journal_heads); -#endif - ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS); - if (ret == NULL) { - jbd_debug(1, "out of memory for journal_head\n"); - printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n", - __func__); - - while (ret == NULL) { - yield(); - ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS); - } - } - return ret; -} - -static void journal_free_journal_head(struct journal_head *jh) -{ -#ifdef CONFIG_JBD_DEBUG - atomic_dec(&nr_journal_heads); - memset(jh, JBD_POISON_FREE, sizeof(*jh)); -#endif - kmem_cache_free(journal_head_cache, jh); -} - -/* - * A journal_head is attached to a buffer_head whenever JBD has an - * interest in the buffer. - * - * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit - * is set. This bit is tested in core kernel code where we need to take - * JBD-specific actions. Testing the zeroness of ->b_private is not reliable - * there. - * - * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one. - * - * When a buffer has its BH_JBD bit set it is immune from being released by - * core kernel code, mainly via ->b_count. - * - * A journal_head is detached from its buffer_head when the journal_head's - * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint - * transaction (b_cp_transaction) hold their references to b_jcount. - * - * Various places in the kernel want to attach a journal_head to a buffer_head - * _before_ attaching the journal_head to a transaction. To protect the - * journal_head in this situation, journal_add_journal_head elevates the - * journal_head's b_jcount refcount by one. The caller must call - * journal_put_journal_head() to undo this. - * - * So the typical usage would be: - * - * (Attach a journal_head if needed. Increments b_jcount) - * struct journal_head *jh = journal_add_journal_head(bh); - * ... - * (Get another reference for transaction) - * journal_grab_journal_head(bh); - * jh->b_transaction = xxx; - * (Put original reference) - * journal_put_journal_head(jh); - */ - -/* - * Give a buffer_head a journal_head. - * - * May sleep. 
- */ -struct journal_head *journal_add_journal_head(struct buffer_head *bh) -{ - struct journal_head *jh; - struct journal_head *new_jh = NULL; - -repeat: - if (!buffer_jbd(bh)) - new_jh = journal_alloc_journal_head(); - - jbd_lock_bh_journal_head(bh); - if (buffer_jbd(bh)) { - jh = bh2jh(bh); - } else { - J_ASSERT_BH(bh, - (atomic_read(&bh->b_count) > 0) || - (bh->b_page && bh->b_page->mapping)); - - if (!new_jh) { - jbd_unlock_bh_journal_head(bh); - goto repeat; - } - - jh = new_jh; - new_jh = NULL; /* We consumed it */ - set_buffer_jbd(bh); - bh->b_private = jh; - jh->b_bh = bh; - get_bh(bh); - BUFFER_TRACE(bh, "added journal_head"); - } - jh->b_jcount++; - jbd_unlock_bh_journal_head(bh); - if (new_jh) - journal_free_journal_head(new_jh); - return bh->b_private; -} - -/* - * Grab a ref against this buffer_head's journal_head. If it ended up not - * having a journal_head, return NULL - */ -struct journal_head *journal_grab_journal_head(struct buffer_head *bh) -{ - struct journal_head *jh = NULL; - - jbd_lock_bh_journal_head(bh); - if (buffer_jbd(bh)) { - jh = bh2jh(bh); - jh->b_jcount++; - } - jbd_unlock_bh_journal_head(bh); - return jh; -} - -static void __journal_remove_journal_head(struct buffer_head *bh) -{ - struct journal_head *jh = bh2jh(bh); - - J_ASSERT_JH(jh, jh->b_jcount >= 0); - J_ASSERT_JH(jh, jh->b_transaction == NULL); - J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); - J_ASSERT_JH(jh, jh->b_jlist == BJ_None); - J_ASSERT_BH(bh, buffer_jbd(bh)); - J_ASSERT_BH(bh, jh2bh(jh) == bh); - BUFFER_TRACE(bh, "remove journal_head"); - if (jh->b_frozen_data) { - printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); - jbd_free(jh->b_frozen_data, bh->b_size); - } - if (jh->b_committed_data) { - printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); - jbd_free(jh->b_committed_data, bh->b_size); - } - bh->b_private = NULL; - jh->b_bh = NULL; /* debug, really */ - clear_buffer_jbd(bh); - journal_free_journal_head(jh); -} - -/* - * Drop a reference on the passed journal_head. If it fell to zero then - * release the journal_head from the buffer_head. 
- */ -void journal_put_journal_head(struct journal_head *jh) -{ - struct buffer_head *bh = jh2bh(jh); - - jbd_lock_bh_journal_head(bh); - J_ASSERT_JH(jh, jh->b_jcount > 0); - --jh->b_jcount; - if (!jh->b_jcount) { - __journal_remove_journal_head(bh); - jbd_unlock_bh_journal_head(bh); - __brelse(bh); - } else - jbd_unlock_bh_journal_head(bh); -} - -/* - * debugfs tunables - */ -#ifdef CONFIG_JBD_DEBUG - -u8 journal_enable_debug __read_mostly; -EXPORT_SYMBOL(journal_enable_debug); - -static struct dentry *jbd_debugfs_dir; -static struct dentry *jbd_debug; - -static void __init jbd_create_debugfs_entry(void) -{ - jbd_debugfs_dir = debugfs_create_dir("jbd", NULL); - if (jbd_debugfs_dir) - jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO | S_IWUSR, - jbd_debugfs_dir, - &journal_enable_debug); -} - -static void __exit jbd_remove_debugfs_entry(void) -{ - debugfs_remove(jbd_debug); - debugfs_remove(jbd_debugfs_dir); -} - -#else - -static inline void jbd_create_debugfs_entry(void) -{ -} - -static inline void jbd_remove_debugfs_entry(void) -{ -} - -#endif - -struct kmem_cache *jbd_handle_cache; - -static int __init journal_init_handle_cache(void) -{ - jbd_handle_cache = kmem_cache_create("journal_handle", - sizeof(handle_t), - 0, /* offset */ - SLAB_TEMPORARY, /* flags */ - NULL); /* ctor */ - if (jbd_handle_cache == NULL) { - printk(KERN_EMERG "JBD: failed to create handle cache\n"); - return -ENOMEM; - } - return 0; -} - -static void journal_destroy_handle_cache(void) -{ - if (jbd_handle_cache) - kmem_cache_destroy(jbd_handle_cache); -} - -/* - * Module startup and shutdown - */ - -static int __init journal_init_caches(void) -{ - int ret; - - ret = journal_init_revoke_caches(); - if (ret == 0) - ret = journal_init_journal_head_cache(); - if (ret == 0) - ret = journal_init_handle_cache(); - return ret; -} - -static void journal_destroy_caches(void) -{ - journal_destroy_revoke_caches(); - journal_destroy_journal_head_cache(); - journal_destroy_handle_cache(); -} - -static int __init journal_init(void) -{ - int ret; - - BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024); - - ret = journal_init_caches(); - if (ret != 0) - journal_destroy_caches(); - jbd_create_debugfs_entry(); - return ret; -} - -static void __exit journal_exit(void) -{ -#ifdef CONFIG_JBD_DEBUG - int n = atomic_read(&nr_journal_heads); - if (n) - printk(KERN_ERR "JBD: leaked %d journal_heads!\n", n); -#endif - jbd_remove_debugfs_entry(); - journal_destroy_caches(); -} - -MODULE_LICENSE("GPL"); -module_init(journal_init); -module_exit(journal_exit); - diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c deleted file mode 100644 index a748fe21465a..000000000000 --- a/fs/jbd/recovery.c +++ /dev/null @@ -1,594 +0,0 @@ -/* - * linux/fs/jbd/recovery.c - * - * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 - * - * Copyright 1999-2000 Red Hat Software --- All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * - * Journal recovery routines for the generic filesystem journaling code; - * part of the ext2fs journaling system. - */ - -#ifndef __KERNEL__ -#include "jfs_user.h" -#else -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/errno.h> -#include <linux/blkdev.h> -#endif - -/* - * Maintain information about the progress of the recovery job, so that - * the different passes can carry information between them. 
- */ -struct recovery_info -{ - tid_t start_transaction; - tid_t end_transaction; - - int nr_replays; - int nr_revokes; - int nr_revoke_hits; -}; - -enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; -static int do_one_pass(journal_t *journal, - struct recovery_info *info, enum passtype pass); -static int scan_revoke_records(journal_t *, struct buffer_head *, - tid_t, struct recovery_info *); - -#ifdef __KERNEL__ - -/* Release readahead buffers after use */ -static void journal_brelse_array(struct buffer_head *b[], int n) -{ - while (--n >= 0) - brelse (b[n]); -} - - -/* - * When reading from the journal, we are going through the block device - * layer directly and so there is no readahead being done for us. We - * need to implement any readahead ourselves if we want it to happen at - * all. Recovery is basically one long sequential read, so make sure we - * do the IO in reasonably large chunks. - * - * This is not so critical that we need to be enormously clever about - * the readahead size, though. 128K is a purely arbitrary, good-enough - * fixed value. - */ - -#define MAXBUF 8 -static int do_readahead(journal_t *journal, unsigned int start) -{ - int err; - unsigned int max, nbufs, next; - unsigned int blocknr; - struct buffer_head *bh; - - struct buffer_head * bufs[MAXBUF]; - - /* Do up to 128K of readahead */ - max = start + (128 * 1024 / journal->j_blocksize); - if (max > journal->j_maxlen) - max = journal->j_maxlen; - - /* Do the readahead itself. We'll submit MAXBUF buffer_heads at - * a time to the block device IO layer. */ - - nbufs = 0; - - for (next = start; next < max; next++) { - err = journal_bmap(journal, next, &blocknr); - - if (err) { - printk (KERN_ERR "JBD: bad block at offset %u\n", - next); - goto failed; - } - - bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - if (!bh) { - err = -ENOMEM; - goto failed; - } - - if (!buffer_uptodate(bh) && !buffer_locked(bh)) { - bufs[nbufs++] = bh; - if (nbufs == MAXBUF) { - ll_rw_block(READ, nbufs, bufs); - journal_brelse_array(bufs, nbufs); - nbufs = 0; - } - } else - brelse(bh); - } - - if (nbufs) - ll_rw_block(READ, nbufs, bufs); - err = 0; - -failed: - if (nbufs) - journal_brelse_array(bufs, nbufs); - return err; -} - -#endif /* __KERNEL__ */ - - -/* - * Read a block from the journal - */ - -static int jread(struct buffer_head **bhp, journal_t *journal, - unsigned int offset) -{ - int err; - unsigned int blocknr; - struct buffer_head *bh; - - *bhp = NULL; - - if (offset >= journal->j_maxlen) { - printk(KERN_ERR "JBD: corrupted journal superblock\n"); - return -EIO; - } - - err = journal_bmap(journal, offset, &blocknr); - - if (err) { - printk (KERN_ERR "JBD: bad block at offset %u\n", - offset); - return err; - } - - bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - if (!bh) - return -ENOMEM; - - if (!buffer_uptodate(bh)) { - /* If this is a brand new buffer, start readahead. - Otherwise, we assume we are already reading it. */ - if (!buffer_req(bh)) - do_readahead(journal, offset); - wait_on_buffer(bh); - } - - if (!buffer_uptodate(bh)) { - printk (KERN_ERR "JBD: Failed to read block at offset %u\n", - offset); - brelse(bh); - return -EIO; - } - - *bhp = bh; - return 0; -} - - -/* - * Count the number of in-use tags in a journal descriptor block. 
- */ - -static int count_tags(struct buffer_head *bh, int size) -{ - char * tagp; - journal_block_tag_t * tag; - int nr = 0; - - tagp = &bh->b_data[sizeof(journal_header_t)]; - - while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) { - tag = (journal_block_tag_t *) tagp; - - nr++; - tagp += sizeof(journal_block_tag_t); - if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID))) - tagp += 16; - - if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG)) - break; - } - - return nr; -} - - -/* Make sure we wrap around the log correctly! */ -#define wrap(journal, var) \ -do { \ - if (var >= (journal)->j_last) \ - var -= ((journal)->j_last - (journal)->j_first); \ -} while (0) - -/** - * journal_recover - recovers a on-disk journal - * @journal: the journal to recover - * - * The primary function for recovering the log contents when mounting a - * journaled device. - * - * Recovery is done in three passes. In the first pass, we look for the - * end of the log. In the second, we assemble the list of revoke - * blocks. In the third and final pass, we replay any un-revoked blocks - * in the log. - */ -int journal_recover(journal_t *journal) -{ - int err, err2; - journal_superblock_t * sb; - - struct recovery_info info; - - memset(&info, 0, sizeof(info)); - sb = journal->j_superblock; - - /* - * The journal superblock's s_start field (the current log head) - * is always zero if, and only if, the journal was cleanly - * unmounted. - */ - - if (!sb->s_start) { - jbd_debug(1, "No recovery required, last transaction %d\n", - be32_to_cpu(sb->s_sequence)); - journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; - return 0; - } - - err = do_one_pass(journal, &info, PASS_SCAN); - if (!err) - err = do_one_pass(journal, &info, PASS_REVOKE); - if (!err) - err = do_one_pass(journal, &info, PASS_REPLAY); - - jbd_debug(1, "JBD: recovery, exit status %d, " - "recovered transactions %u to %u\n", - err, info.start_transaction, info.end_transaction); - jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n", - info.nr_replays, info.nr_revoke_hits, info.nr_revokes); - - /* Restart the log at the next transaction ID, thus invalidating - * any existing commit records in the log. */ - journal->j_transaction_sequence = ++info.end_transaction; - - journal_clear_revoke(journal); - err2 = sync_blockdev(journal->j_fs_dev); - if (!err) - err = err2; - /* Flush disk caches to get replayed data on the permanent storage */ - if (journal->j_flags & JFS_BARRIER) { - err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); - if (!err) - err = err2; - } - - return err; -} - -/** - * journal_skip_recovery - Start journal and wipe exiting records - * @journal: journal to startup - * - * Locate any valid recovery information from the journal and set up the - * journal structures in memory to ignore it (presumably because the - * caller has evidence that it is out of date). - * This function does'nt appear to be exorted.. - * - * We perform one pass over the journal to allow us to tell the user how - * much recovery information is being erased, and to let us initialise - * the journal transaction sequence numbers to the next unused ID. 
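journal_recover() above walks the log inside the circular area that the wrap() macro keeps it in. A minimal, runnable check of that wrap arithmetic, written as a stand-alone function rather than the kernel macro:

#include <assert.h>
#include <stdio.h>

/* Same arithmetic as the wrap() macro: the usable journal area is
 * [first, last); stepping past the end folds back to the start. */
static unsigned int wrap_block(unsigned int var, unsigned int first,
                               unsigned int last)
{
    if (var >= last)
        var -= (last - first);
    return var;
}

int main(void)
{
    /* e.g. a journal whose usable area is blocks 1..1023 (last == 1024) */
    assert(wrap_block(1023, 1, 1024) == 1023);
    assert(wrap_block(1024, 1, 1024) == 1);    /* one step past the end wraps */
    printf("wrap model ok\n");
    return 0;
}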
- */ -int journal_skip_recovery(journal_t *journal) -{ - int err; - struct recovery_info info; - - memset (&info, 0, sizeof(info)); - - err = do_one_pass(journal, &info, PASS_SCAN); - - if (err) { - printk(KERN_ERR "JBD: error %d scanning journal\n", err); - ++journal->j_transaction_sequence; - } else { -#ifdef CONFIG_JBD_DEBUG - int dropped = info.end_transaction - - be32_to_cpu(journal->j_superblock->s_sequence); - jbd_debug(1, - "JBD: ignoring %d transaction%s from the journal.\n", - dropped, (dropped == 1) ? "" : "s"); -#endif - journal->j_transaction_sequence = ++info.end_transaction; - } - - journal->j_tail = 0; - return err; -} - -static int do_one_pass(journal_t *journal, - struct recovery_info *info, enum passtype pass) -{ - unsigned int first_commit_ID, next_commit_ID; - unsigned int next_log_block; - int err, success = 0; - journal_superblock_t * sb; - journal_header_t * tmp; - struct buffer_head * bh; - unsigned int sequence; - int blocktype; - - /* - * First thing is to establish what we expect to find in the log - * (in terms of transaction IDs), and where (in terms of log - * block offsets): query the superblock. - */ - - sb = journal->j_superblock; - next_commit_ID = be32_to_cpu(sb->s_sequence); - next_log_block = be32_to_cpu(sb->s_start); - - first_commit_ID = next_commit_ID; - if (pass == PASS_SCAN) - info->start_transaction = first_commit_ID; - - jbd_debug(1, "Starting recovery pass %d\n", pass); - - /* - * Now we walk through the log, transaction by transaction, - * making sure that each transaction has a commit block in the - * expected place. Each complete transaction gets replayed back - * into the main filesystem. - */ - - while (1) { - int flags; - char * tagp; - journal_block_tag_t * tag; - struct buffer_head * obh; - struct buffer_head * nbh; - - cond_resched(); - - /* If we already know where to stop the log traversal, - * check right now that we haven't gone past the end of - * the log. */ - - if (pass != PASS_SCAN) - if (tid_geq(next_commit_ID, info->end_transaction)) - break; - - jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n", - next_commit_ID, next_log_block, journal->j_last); - - /* Skip over each chunk of the transaction looking - * either the next descriptor block or the final commit - * record. */ - - jbd_debug(3, "JBD: checking block %u\n", next_log_block); - err = jread(&bh, journal, next_log_block); - if (err) - goto failed; - - next_log_block++; - wrap(journal, next_log_block); - - /* What kind of buffer is it? - * - * If it is a descriptor block, check that it has the - * expected sequence number. Otherwise, we're all done - * here. */ - - tmp = (journal_header_t *)bh->b_data; - - if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) { - brelse(bh); - break; - } - - blocktype = be32_to_cpu(tmp->h_blocktype); - sequence = be32_to_cpu(tmp->h_sequence); - jbd_debug(3, "Found magic %d, sequence %d\n", - blocktype, sequence); - - if (sequence != next_commit_ID) { - brelse(bh); - break; - } - - /* OK, we have a valid descriptor block which matches - * all of the sequence number checks. What are we going - * to do with it? That depends on the pass... */ - - switch(blocktype) { - case JFS_DESCRIPTOR_BLOCK: - /* If it is a valid descriptor block, replay it - * in pass REPLAY; otherwise, just skip over the - * blocks it describes. */ - if (pass != PASS_REPLAY) { - next_log_block += - count_tags(bh, journal->j_blocksize); - wrap(journal, next_log_block); - brelse(bh); - continue; - } - - /* A descriptor block: we can now write all of - * the data blocks. 
Yay, useful work is finally - * getting done here! */ - - tagp = &bh->b_data[sizeof(journal_header_t)]; - while ((tagp - bh->b_data +sizeof(journal_block_tag_t)) - <= journal->j_blocksize) { - unsigned int io_block; - - tag = (journal_block_tag_t *) tagp; - flags = be32_to_cpu(tag->t_flags); - - io_block = next_log_block++; - wrap(journal, next_log_block); - err = jread(&obh, journal, io_block); - if (err) { - /* Recover what we can, but - * report failure at the end. */ - success = err; - printk (KERN_ERR - "JBD: IO error %d recovering " - "block %u in log\n", - err, io_block); - } else { - unsigned int blocknr; - - J_ASSERT(obh != NULL); - blocknr = be32_to_cpu(tag->t_blocknr); - - /* If the block has been - * revoked, then we're all done - * here. */ - if (journal_test_revoke - (journal, blocknr, - next_commit_ID)) { - brelse(obh); - ++info->nr_revoke_hits; - goto skip_write; - } - - /* Find a buffer for the new - * data being restored */ - nbh = __getblk(journal->j_fs_dev, - blocknr, - journal->j_blocksize); - if (nbh == NULL) { - printk(KERN_ERR - "JBD: Out of memory " - "during recovery.\n"); - err = -ENOMEM; - brelse(bh); - brelse(obh); - goto failed; - } - - lock_buffer(nbh); - memcpy(nbh->b_data, obh->b_data, - journal->j_blocksize); - if (flags & JFS_FLAG_ESCAPE) { - *((__be32 *)nbh->b_data) = - cpu_to_be32(JFS_MAGIC_NUMBER); - } - - BUFFER_TRACE(nbh, "marking dirty"); - set_buffer_uptodate(nbh); - mark_buffer_dirty(nbh); - BUFFER_TRACE(nbh, "marking uptodate"); - ++info->nr_replays; - /* ll_rw_block(WRITE, 1, &nbh); */ - unlock_buffer(nbh); - brelse(obh); - brelse(nbh); - } - - skip_write: - tagp += sizeof(journal_block_tag_t); - if (!(flags & JFS_FLAG_SAME_UUID)) - tagp += 16; - - if (flags & JFS_FLAG_LAST_TAG) - break; - } - - brelse(bh); - continue; - - case JFS_COMMIT_BLOCK: - /* Found an expected commit block: not much to - * do other than move on to the next sequence - * number. */ - brelse(bh); - next_commit_ID++; - continue; - - case JFS_REVOKE_BLOCK: - /* If we aren't in the REVOKE pass, then we can - * just skip over this block. */ - if (pass != PASS_REVOKE) { - brelse(bh); - continue; - } - - err = scan_revoke_records(journal, bh, - next_commit_ID, info); - brelse(bh); - if (err) - goto failed; - continue; - - default: - jbd_debug(3, "Unrecognised magic %d, end of scan.\n", - blocktype); - brelse(bh); - goto done; - } - } - - done: - /* - * We broke out of the log scan loop: either we came to the - * known end of the log or we found an unexpected block in the - * log. If the latter happened, then we know that the "current" - * transaction marks the end of the valid log. - */ - - if (pass == PASS_SCAN) - info->end_transaction = next_commit_ID; - else { - /* It's really bad news if different passes end up at - * different places (but possible due to IO errors). */ - if (info->end_transaction != next_commit_ID) { - printk (KERN_ERR "JBD: recovery pass %d ended at " - "transaction %u, expected %u\n", - pass, next_commit_ID, info->end_transaction); - if (!success) - success = -EIO; - } - } - - return success; - - failed: - return err; -} - - -/* Scan a revoke record, marking all blocks mentioned as revoked. 
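A stand-alone model of the revoke-block layout that scan_revoke_records() below walks: a block header followed by a big-endian 32-bit r_count giving the number of bytes in use (header included), then packed big-endian 32-bit block numbers. The 12-byte header size and the r_count offset are assumptions of this model, as are all names; only the walking loop mirrors the code below.

#include <stdint.h>
#include <stdio.h>

/* Big-endian 32-bit read; the on-disk journal format is big-endian. */
static uint32_t get_be32(const unsigned char *p)
{
    return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
           ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

#define MODEL_REVOKE_HDR 16    /* assumed: 12-byte block header + 4-byte r_count */

/* Walk the revoked block numbers the way scan_revoke_records() does:
 * start past the header and stop once r_count bytes have been consumed. */
static void dump_revoked_blocks(const unsigned char *block)
{
    uint32_t used = get_be32(block + 12);      /* model offset of r_count */

    for (uint32_t off = MODEL_REVOKE_HDR; off < used; off += 4)
        printf("revoked block %u\n", get_be32(block + off));
}

int main(void)
{
    unsigned char blk[32] = { 0 };

    blk[15] = 24;      /* r_count = 24: header plus two 4-byte entries */
    blk[19] = 7;       /* first revoked block:  7 */
    blk[23] = 9;       /* second revoked block: 9 */
    dump_revoked_blocks(blk);
    return 0;
}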
*/ - -static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, - tid_t sequence, struct recovery_info *info) -{ - journal_revoke_header_t *header; - int offset, max; - - header = (journal_revoke_header_t *) bh->b_data; - offset = sizeof(journal_revoke_header_t); - max = be32_to_cpu(header->r_count); - - while (offset < max) { - unsigned int blocknr; - int err; - - blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); - offset += 4; - err = journal_set_revoke(journal, blocknr, sequence); - if (err) - return err; - ++info->nr_revokes; - } - return 0; -} diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c deleted file mode 100644 index dcead636c33b..000000000000 --- a/fs/jbd/revoke.c +++ /dev/null @@ -1,733 +0,0 @@ -/* - * linux/fs/jbd/revoke.c - * - * Written by Stephen C. Tweedie <sct@redhat.com>, 2000 - * - * Copyright 2000 Red Hat corp --- All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * - * Journal revoke routines for the generic filesystem journaling code; - * part of the ext2fs journaling system. - * - * Revoke is the mechanism used to prevent old log records for deleted - * metadata from being replayed on top of newer data using the same - * blocks. The revoke mechanism is used in two separate places: - * - * + Commit: during commit we write the entire list of the current - * transaction's revoked blocks to the journal - * - * + Recovery: during recovery we record the transaction ID of all - * revoked blocks. If there are multiple revoke records in the log - * for a single block, only the last one counts, and if there is a log - * entry for a block beyond the last revoke, then that log entry still - * gets replayed. - * - * We can get interactions between revokes and new log data within a - * single transaction: - * - * Block is revoked and then journaled: - * The desired end result is the journaling of the new block, so we - * cancel the revoke before the transaction commits. - * - * Block is journaled and then revoked: - * The revoke must take precedence over the write of the block, so we - * need either to cancel the journal entry or to write the revoke - * later in the log than the log block. In this case, we choose the - * latter: journaling a block cancels any revoke record for that block - * in the current transaction, so any revoke for that block in the - * transaction must have happened after the block was journaled and so - * the revoke must take precedence. - * - * Block is revoked and then written as data: - * The data write is allowed to succeed, but the revoke is _not_ - * cancelled. We still need to prevent old log records from - * overwriting the new data. We don't even need to clear the revoke - * bit here. - * - * We cache revoke status of a buffer in the current transaction in b_states - * bits. As the name says, revokevalid flag indicates that the cached revoke - * status of a buffer is valid and we can rely on the cached status. - * - * Revoke information on buffers is a tri-state value: - * - * RevokeValid clear: no cached revoke status, need to look it up - * RevokeValid set, Revoked clear: - * buffer has not been revoked, and cancel_revoke - * need do nothing. - * RevokeValid set, Revoked set: - * buffer has been revoked. - * - * Locking rules: - * We keep two hash tables of revoke records. 
One hashtable belongs to the - * running transaction (is pointed to by journal->j_revoke), the other one - * belongs to the committing transaction. Accesses to the second hash table - * happen only from the kjournald and no other thread touches this table. Also - * journal_switch_revoke_table() which switches which hashtable belongs to the - * running and which to the committing transaction is called only from - * kjournald. Therefore we need no locks when accessing the hashtable belonging - * to the committing transaction. - * - * All users operating on the hash table belonging to the running transaction - * have a handle to the transaction. Therefore they are safe from kjournald - * switching hash tables under them. For operations on the lists of entries in - * the hash table j_revoke_lock is used. - * - * Finally, also replay code uses the hash tables but at this moment no one else - * can touch them (filesystem isn't mounted yet) and hence no locking is - * needed. - */ - -#ifndef __KERNEL__ -#include "jfs_user.h" -#else -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/list.h> -#include <linux/init.h> -#include <linux/bio.h> -#endif -#include <linux/log2.h> -#include <linux/hash.h> - -static struct kmem_cache *revoke_record_cache; -static struct kmem_cache *revoke_table_cache; - -/* Each revoke record represents one single revoked block. During - journal replay, this involves recording the transaction ID of the - last transaction to revoke this block. */ - -struct jbd_revoke_record_s -{ - struct list_head hash; - tid_t sequence; /* Used for recovery only */ - unsigned int blocknr; -}; - - -/* The revoke table is just a simple hash table of revoke records. */ -struct jbd_revoke_table_s -{ - /* It is conceivable that we might want a larger hash table - * for recovery. Must be a power of two. */ - int hash_size; - int hash_shift; - struct list_head *hash_table; -}; - - -#ifdef __KERNEL__ -static void write_one_revoke_record(journal_t *, transaction_t *, - struct journal_head **, int *, - struct jbd_revoke_record_s *, int); -static void flush_descriptor(journal_t *, struct journal_head *, int, int); -#endif - -/* Utility functions to maintain the revoke table */ - -static inline int hash(journal_t *journal, unsigned int block) -{ - struct jbd_revoke_table_s *table = journal->j_revoke; - - return hash_32(block, table->hash_shift); -} - -static int insert_revoke_hash(journal_t *journal, unsigned int blocknr, - tid_t seq) -{ - struct list_head *hash_list; - struct jbd_revoke_record_s *record; - -repeat: - record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS); - if (!record) - goto oom; - - record->sequence = seq; - record->blocknr = blocknr; - hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; - spin_lock(&journal->j_revoke_lock); - list_add(&record->hash, hash_list); - spin_unlock(&journal->j_revoke_lock); - return 0; - -oom: - if (!journal_oom_retry) - return -ENOMEM; - jbd_debug(1, "ENOMEM in %s, retrying\n", __func__); - yield(); - goto repeat; -} - -/* Find a revoke record in the journal's hash table. 
*/ - -static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, - unsigned int blocknr) -{ - struct list_head *hash_list; - struct jbd_revoke_record_s *record; - - hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; - - spin_lock(&journal->j_revoke_lock); - record = (struct jbd_revoke_record_s *) hash_list->next; - while (&(record->hash) != hash_list) { - if (record->blocknr == blocknr) { - spin_unlock(&journal->j_revoke_lock); - return record; - } - record = (struct jbd_revoke_record_s *) record->hash.next; - } - spin_unlock(&journal->j_revoke_lock); - return NULL; -} - -void journal_destroy_revoke_caches(void) -{ - if (revoke_record_cache) { - kmem_cache_destroy(revoke_record_cache); - revoke_record_cache = NULL; - } - if (revoke_table_cache) { - kmem_cache_destroy(revoke_table_cache); - revoke_table_cache = NULL; - } -} - -int __init journal_init_revoke_caches(void) -{ - J_ASSERT(!revoke_record_cache); - J_ASSERT(!revoke_table_cache); - - revoke_record_cache = kmem_cache_create("revoke_record", - sizeof(struct jbd_revoke_record_s), - 0, - SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, - NULL); - if (!revoke_record_cache) - goto record_cache_failure; - - revoke_table_cache = kmem_cache_create("revoke_table", - sizeof(struct jbd_revoke_table_s), - 0, SLAB_TEMPORARY, NULL); - if (!revoke_table_cache) - goto table_cache_failure; - - return 0; - -table_cache_failure: - journal_destroy_revoke_caches(); -record_cache_failure: - return -ENOMEM; -} - -static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size) -{ - int i; - struct jbd_revoke_table_s *table; - - table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); - if (!table) - goto out; - - table->hash_size = hash_size; - table->hash_shift = ilog2(hash_size); - table->hash_table = - kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); - if (!table->hash_table) { - kmem_cache_free(revoke_table_cache, table); - table = NULL; - goto out; - } - - for (i = 0; i < hash_size; i++) - INIT_LIST_HEAD(&table->hash_table[i]); - -out: - return table; -} - -static void journal_destroy_revoke_table(struct jbd_revoke_table_s *table) -{ - int i; - struct list_head *hash_list; - - for (i = 0; i < table->hash_size; i++) { - hash_list = &table->hash_table[i]; - J_ASSERT(list_empty(hash_list)); - } - - kfree(table->hash_table); - kmem_cache_free(revoke_table_cache, table); -} - -/* Initialise the revoke table for a given journal to a given size. */ -int journal_init_revoke(journal_t *journal, int hash_size) -{ - J_ASSERT(journal->j_revoke_table[0] == NULL); - J_ASSERT(is_power_of_2(hash_size)); - - journal->j_revoke_table[0] = journal_init_revoke_table(hash_size); - if (!journal->j_revoke_table[0]) - goto fail0; - - journal->j_revoke_table[1] = journal_init_revoke_table(hash_size); - if (!journal->j_revoke_table[1]) - goto fail1; - - journal->j_revoke = journal->j_revoke_table[1]; - - spin_lock_init(&journal->j_revoke_lock); - - return 0; - -fail1: - journal_destroy_revoke_table(journal->j_revoke_table[0]); -fail0: - return -ENOMEM; -} - -/* Destroy a journal's revoke table. The table must already be empty! */ -void journal_destroy_revoke(journal_t *journal) -{ - journal->j_revoke = NULL; - if (journal->j_revoke_table[0]) - journal_destroy_revoke_table(journal->j_revoke_table[0]); - if (journal->j_revoke_table[1]) - journal_destroy_revoke_table(journal->j_revoke_table[1]); -} - - -#ifdef __KERNEL__ - -/* - * journal_revoke: revoke a given buffer_head from the journal. 
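A runnable user-space model of the revoke table set up above by journal_init_revoke_table() and friends: a power-of-two array of chained records keyed by block number, each remembering only the newest revoking tid. The kernel keeps two such tables, one for the running and one for the committing transaction; this sketch models a single table, the hash constant only mirrors the multiplicative hash_32() idea, and all names are illustrative.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct revoke_rec {
    struct revoke_rec *next;
    uint32_t blocknr;
    uint32_t sequence;          /* tid of the newest revoke for this block */
};

struct revoke_table {
    unsigned int hash_size;     /* power of two    */
    unsigned int hash_shift;    /* log2(hash_size) */
    struct revoke_rec **buckets;
};

static unsigned int hash_block(const struct revoke_table *t, uint32_t block)
{
    return (uint32_t)(block * 2654435769u) >> (32 - t->hash_shift);
}

static struct revoke_rec *find_rec(const struct revoke_table *t, uint32_t block)
{
    struct revoke_rec *r = t->buckets[hash_block(t, block)];

    while (r && r->blocknr != block)
        r = r->next;
    return r;
}

static int set_revoke(struct revoke_table *t, uint32_t block, uint32_t seq)
{
    struct revoke_rec *r = find_rec(t, block);
    unsigned int h = hash_block(t, block);

    if (r) {
        if (seq > r->sequence)      /* only the newest revoke matters */
            r->sequence = seq;
        return 0;
    }
    r = malloc(sizeof(*r));
    if (!r)
        return -1;
    r->blocknr = block;
    r->sequence = seq;
    r->next = t->buckets[h];
    t->buckets[h] = r;
    return 0;
}

int main(void)
{
    struct revoke_rec *buckets[16] = { NULL };
    struct revoke_table t = { 16, 4, buckets };
    struct revoke_rec *r;

    set_revoke(&t, 1234, 7);
    set_revoke(&t, 1234, 9);        /* a later revoke supersedes the first */
    r = find_rec(&t, 1234);
    printf("block 1234 last revoked in tid %u\n", r->sequence);
    free(r);
    return 0;
}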
This - * prevents the block from being replayed during recovery if we take a - * crash after this current transaction commits. Any subsequent - * metadata writes of the buffer in this transaction cancel the - * revoke. - * - * Note that this call may block --- it is up to the caller to make - * sure that there are no further calls to journal_write_metadata - * before the revoke is complete. In ext3, this implies calling the - * revoke before clearing the block bitmap when we are deleting - * metadata. - * - * Revoke performs a journal_forget on any buffer_head passed in as a - * parameter, but does _not_ forget the buffer_head if the bh was only - * found implicitly. - * - * bh_in may not be a journalled buffer - it may have come off - * the hash tables without an attached journal_head. - * - * If bh_in is non-zero, journal_revoke() will decrement its b_count - * by one. - */ - -int journal_revoke(handle_t *handle, unsigned int blocknr, - struct buffer_head *bh_in) -{ - struct buffer_head *bh = NULL; - journal_t *journal; - struct block_device *bdev; - int err; - - might_sleep(); - if (bh_in) - BUFFER_TRACE(bh_in, "enter"); - - journal = handle->h_transaction->t_journal; - if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){ - J_ASSERT (!"Cannot set revoke feature!"); - return -EINVAL; - } - - bdev = journal->j_fs_dev; - bh = bh_in; - - if (!bh) { - bh = __find_get_block(bdev, blocknr, journal->j_blocksize); - if (bh) - BUFFER_TRACE(bh, "found on hash"); - } -#ifdef JBD_EXPENSIVE_CHECKING - else { - struct buffer_head *bh2; - - /* If there is a different buffer_head lying around in - * memory anywhere... */ - bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize); - if (bh2) { - /* ... and it has RevokeValid status... */ - if (bh2 != bh && buffer_revokevalid(bh2)) - /* ...then it better be revoked too, - * since it's illegal to create a revoke - * record against a buffer_head which is - * not marked revoked --- that would - * risk missing a subsequent revoke - * cancel. */ - J_ASSERT_BH(bh2, buffer_revoked(bh2)); - put_bh(bh2); - } - } -#endif - - /* We really ought not ever to revoke twice in a row without - first having the revoke cancelled: it's illegal to free a - block twice without allocating it in between! */ - if (bh) { - if (!J_EXPECT_BH(bh, !buffer_revoked(bh), - "inconsistent data on disk")) { - if (!bh_in) - brelse(bh); - return -EIO; - } - set_buffer_revoked(bh); - set_buffer_revokevalid(bh); - if (bh_in) { - BUFFER_TRACE(bh_in, "call journal_forget"); - journal_forget(handle, bh_in); - } else { - BUFFER_TRACE(bh, "call brelse"); - __brelse(bh); - } - } - - jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in); - err = insert_revoke_hash(journal, blocknr, - handle->h_transaction->t_tid); - BUFFER_TRACE(bh_in, "exit"); - return err; -} - -/* - * Cancel an outstanding revoke. For use only internally by the - * journaling code (called from journal_get_write_access). - * - * We trust buffer_revoked() on the buffer if the buffer is already - * being journaled: if there is no revoke pending on the buffer, then we - * don't do anything here. - * - * This would break if it were possible for a buffer to be revoked and - * discarded, and then reallocated within the same transaction. In such - * a case we would have lost the revoked bit, but when we arrived here - * the second time we would still have a pending revoke to cancel. So, - * do not trust the Revoked bit on buffers unless RevokeValid is also - * set. 
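A compact restatement of the tri-state cache described above, as a plain predicate over the two buffer bits (plain integers here instead of the buffer-head flag helpers; illustrative only):

/* Cached revoke state of a buffer in the current transaction:
 *  -1  RevokeValid clear: unknown, the revoke hash table must be consulted
 *   0  RevokeValid set, Revoked clear: known not revoked, nothing to cancel
 *   1  RevokeValid set, Revoked set:   known revoked, a cancel is required */
static int cached_revoke_state(int revokevalid, int revoked)
{
    if (!revokevalid)
        return -1;
    return revoked ? 1 : 0;
}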
- */ -int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) -{ - struct jbd_revoke_record_s *record; - journal_t *journal = handle->h_transaction->t_journal; - int need_cancel; - int did_revoke = 0; /* akpm: debug */ - struct buffer_head *bh = jh2bh(jh); - - jbd_debug(4, "journal_head %p, cancelling revoke\n", jh); - - /* Is the existing Revoke bit valid? If so, we trust it, and - * only perform the full cancel if the revoke bit is set. If - * not, we can't trust the revoke bit, and we need to do the - * full search for a revoke record. */ - if (test_set_buffer_revokevalid(bh)) { - need_cancel = test_clear_buffer_revoked(bh); - } else { - need_cancel = 1; - clear_buffer_revoked(bh); - } - - if (need_cancel) { - record = find_revoke_record(journal, bh->b_blocknr); - if (record) { - jbd_debug(4, "cancelled existing revoke on " - "blocknr %llu\n", (unsigned long long)bh->b_blocknr); - spin_lock(&journal->j_revoke_lock); - list_del(&record->hash); - spin_unlock(&journal->j_revoke_lock); - kmem_cache_free(revoke_record_cache, record); - did_revoke = 1; - } - } - -#ifdef JBD_EXPENSIVE_CHECKING - /* There better not be one left behind by now! */ - record = find_revoke_record(journal, bh->b_blocknr); - J_ASSERT_JH(jh, record == NULL); -#endif - - /* Finally, have we just cleared revoke on an unhashed - * buffer_head? If so, we'd better make sure we clear the - * revoked status on any hashed alias too, otherwise the revoke - * state machine will get very upset later on. */ - if (need_cancel) { - struct buffer_head *bh2; - bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size); - if (bh2) { - if (bh2 != bh) - clear_buffer_revoked(bh2); - __brelse(bh2); - } - } - return did_revoke; -} - -/* - * journal_clear_revoked_flags clears revoked flag of buffers in - * revoke table to reflect there is no revoked buffer in the next - * transaction which is going to be started. - */ -void journal_clear_buffer_revoked_flags(journal_t *journal) -{ - struct jbd_revoke_table_s *revoke = journal->j_revoke; - int i = 0; - - for (i = 0; i < revoke->hash_size; i++) { - struct list_head *hash_list; - struct list_head *list_entry; - hash_list = &revoke->hash_table[i]; - - list_for_each(list_entry, hash_list) { - struct jbd_revoke_record_s *record; - struct buffer_head *bh; - record = (struct jbd_revoke_record_s *)list_entry; - bh = __find_get_block(journal->j_fs_dev, - record->blocknr, - journal->j_blocksize); - if (bh) { - clear_buffer_revoked(bh); - __brelse(bh); - } - } - } -} - -/* journal_switch_revoke table select j_revoke for next transaction - * we do not want to suspend any processing until all revokes are - * written -bzzz - */ -void journal_switch_revoke_table(journal_t *journal) -{ - int i; - - if (journal->j_revoke == journal->j_revoke_table[0]) - journal->j_revoke = journal->j_revoke_table[1]; - else - journal->j_revoke = journal->j_revoke_table[0]; - - for (i = 0; i < journal->j_revoke->hash_size; i++) - INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); -} - -/* - * Write revoke records to the journal for all entries in the current - * revoke hash, deleting the entries as we go. 
- */ -void journal_write_revoke_records(journal_t *journal, - transaction_t *transaction, int write_op) -{ - struct journal_head *descriptor; - struct jbd_revoke_record_s *record; - struct jbd_revoke_table_s *revoke; - struct list_head *hash_list; - int i, offset, count; - - descriptor = NULL; - offset = 0; - count = 0; - - /* select revoke table for committing transaction */ - revoke = journal->j_revoke == journal->j_revoke_table[0] ? - journal->j_revoke_table[1] : journal->j_revoke_table[0]; - - for (i = 0; i < revoke->hash_size; i++) { - hash_list = &revoke->hash_table[i]; - - while (!list_empty(hash_list)) { - record = (struct jbd_revoke_record_s *) - hash_list->next; - write_one_revoke_record(journal, transaction, - &descriptor, &offset, - record, write_op); - count++; - list_del(&record->hash); - kmem_cache_free(revoke_record_cache, record); - } - } - if (descriptor) - flush_descriptor(journal, descriptor, offset, write_op); - jbd_debug(1, "Wrote %d revoke records\n", count); -} - -/* - * Write out one revoke record. We need to create a new descriptor - * block if the old one is full or if we have not already created one. - */ - -static void write_one_revoke_record(journal_t *journal, - transaction_t *transaction, - struct journal_head **descriptorp, - int *offsetp, - struct jbd_revoke_record_s *record, - int write_op) -{ - struct journal_head *descriptor; - int offset; - journal_header_t *header; - - /* If we are already aborting, this all becomes a noop. We - still need to go round the loop in - journal_write_revoke_records in order to free all of the - revoke records: only the IO to the journal is omitted. */ - if (is_journal_aborted(journal)) - return; - - descriptor = *descriptorp; - offset = *offsetp; - - /* Make sure we have a descriptor with space left for the record */ - if (descriptor) { - if (offset == journal->j_blocksize) { - flush_descriptor(journal, descriptor, offset, write_op); - descriptor = NULL; - } - } - - if (!descriptor) { - descriptor = journal_get_descriptor_buffer(journal); - if (!descriptor) - return; - header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; - header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); - header->h_blocktype = cpu_to_be32(JFS_REVOKE_BLOCK); - header->h_sequence = cpu_to_be32(transaction->t_tid); - - /* Record it so that we can wait for IO completion later */ - JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); - journal_file_buffer(descriptor, transaction, BJ_LogCtl); - - offset = sizeof(journal_revoke_header_t); - *descriptorp = descriptor; - } - - * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = - cpu_to_be32(record->blocknr); - offset += 4; - *offsetp = offset; -} - -/* - * Flush a revoke descriptor out to the journal. If we are aborting, - * this is a noop; otherwise we are generating a buffer which needs to - * be waited for during commit, so it has to go onto the appropriate - * journal buffer list. - */ - -static void flush_descriptor(journal_t *journal, - struct journal_head *descriptor, - int offset, int write_op) -{ - journal_revoke_header_t *header; - struct buffer_head *bh = jh2bh(descriptor); - - if (is_journal_aborted(journal)) { - put_bh(bh); - return; - } - - header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data; - header->r_count = cpu_to_be32(offset); - set_buffer_jwrite(bh); - BUFFER_TRACE(bh, "write"); - set_buffer_dirty(bh); - write_dirty_buffer(bh, write_op); -} -#endif - -/* - * Revoke support for recovery. 
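Stated as a stand-alone predicate (plain integers, ignoring the wraparound-safe tid comparison the kernel uses), the replay rule that journal_test_revoke() further below implements is: a block logged in transaction `seq` is replayed unless its newest revoke comes from `seq` or a later transaction.

static int should_replay(unsigned int seq, int has_revoke,
                         unsigned int latest_revoke_seq)
{
    if (!has_revoke)
        return 1;                    /* never revoked: replay it        */
    return seq > latest_revoke_seq;  /* logged after the revoke: replay */
}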
- * - * Recovery needs to be able to: - * - * record all revoke records, including the tid of the latest instance - * of each revoke in the journal - * - * check whether a given block in a given transaction should be replayed - * (ie. has not been revoked by a revoke record in that or a subsequent - * transaction) - * - * empty the revoke table after recovery. - */ - -/* - * First, setting revoke records. We create a new revoke record for - * every block ever revoked in the log as we scan it for recovery, and - * we update the existing records if we find multiple revokes for a - * single block. - */ - -int journal_set_revoke(journal_t *journal, - unsigned int blocknr, - tid_t sequence) -{ - struct jbd_revoke_record_s *record; - - record = find_revoke_record(journal, blocknr); - if (record) { - /* If we have multiple occurrences, only record the - * latest sequence number in the hashed record */ - if (tid_gt(sequence, record->sequence)) - record->sequence = sequence; - return 0; - } - return insert_revoke_hash(journal, blocknr, sequence); -} - -/* - * Test revoke records. For a given block referenced in the log, has - * that block been revoked? A revoke record with a given transaction - * sequence number revokes all blocks in that transaction and earlier - * ones, but later transactions still need replayed. - */ - -int journal_test_revoke(journal_t *journal, - unsigned int blocknr, - tid_t sequence) -{ - struct jbd_revoke_record_s *record; - - record = find_revoke_record(journal, blocknr); - if (!record) - return 0; - if (tid_gt(sequence, record->sequence)) - return 0; - return 1; -} - -/* - * Finally, once recovery is over, we need to clear the revoke table so - * that it can be reused by the running filesystem. - */ - -void journal_clear_revoke(journal_t *journal) -{ - int i; - struct list_head *hash_list; - struct jbd_revoke_record_s *record; - struct jbd_revoke_table_s *revoke; - - revoke = journal->j_revoke; - - for (i = 0; i < revoke->hash_size; i++) { - hash_list = &revoke->hash_table[i]; - while (!list_empty(hash_list)) { - record = (struct jbd_revoke_record_s*) hash_list->next; - list_del(&record->hash); - kmem_cache_free(revoke_record_cache, record); - } - } -} diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c deleted file mode 100644 index 1695ba8334a2..000000000000 --- a/fs/jbd/transaction.c +++ /dev/null @@ -1,2237 +0,0 @@ -/* - * linux/fs/jbd/transaction.c - * - * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 - * - * Copyright 1998 Red Hat corp --- All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * - * Generic filesystem transaction handling code; part of the ext2fs - * journaling system. - * - * This file manages transactions (compound commits managed by the - * journaling code) and handles (individual atomic operations by the - * filesystem). - */ - -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/timer.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/hrtimer.h> - -static void __journal_temp_unlink_buffer(struct journal_head *jh); - -/* - * get_transaction: obtain a new transaction_t object. - * - * Simply allocate and initialise a new transaction. 
Create it in - * RUNNING state and add it to the current journal (which should not - * have an existing running transaction: we only make a new transaction - * once we have started to commit the old one). - * - * Preconditions: - * The journal MUST be locked. We don't perform atomic mallocs on the - * new transaction and we can't block without protecting against other - * processes trying to touch the journal while it is in transition. - * - * Called under j_state_lock - */ - -static transaction_t * -get_transaction(journal_t *journal, transaction_t *transaction) -{ - transaction->t_journal = journal; - transaction->t_state = T_RUNNING; - transaction->t_start_time = ktime_get(); - transaction->t_tid = journal->j_transaction_sequence++; - transaction->t_expires = jiffies + journal->j_commit_interval; - spin_lock_init(&transaction->t_handle_lock); - - /* Set up the commit timer for the new transaction. */ - journal->j_commit_timer.expires = - round_jiffies_up(transaction->t_expires); - add_timer(&journal->j_commit_timer); - - J_ASSERT(journal->j_running_transaction == NULL); - journal->j_running_transaction = transaction; - - return transaction; -} - -/* - * Handle management. - * - * A handle_t is an object which represents a single atomic update to a - * filesystem, and which tracks all of the modifications which form part - * of that one update. - */ - -/* - * start_this_handle: Given a handle, deal with any locking or stalling - * needed to make sure that there is enough journal space for the handle - * to begin. Attach the handle to a transaction and set up the - * transaction's buffer credits. - */ - -static int start_this_handle(journal_t *journal, handle_t *handle) -{ - transaction_t *transaction; - int needed; - int nblocks = handle->h_buffer_credits; - transaction_t *new_transaction = NULL; - int ret = 0; - - if (nblocks > journal->j_max_transaction_buffers) { - printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", - current->comm, nblocks, - journal->j_max_transaction_buffers); - ret = -ENOSPC; - goto out; - } - -alloc_transaction: - if (!journal->j_running_transaction) { - new_transaction = kzalloc(sizeof(*new_transaction), - GFP_NOFS|__GFP_NOFAIL); - if (!new_transaction) { - ret = -ENOMEM; - goto out; - } - } - - jbd_debug(3, "New handle %p going live.\n", handle); - -repeat: - - /* - * We need to hold j_state_lock until t_updates has been incremented, - * for proper journal barrier handling - */ - spin_lock(&journal->j_state_lock); -repeat_locked: - if (is_journal_aborted(journal) || - (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) { - spin_unlock(&journal->j_state_lock); - ret = -EROFS; - goto out; - } - - /* Wait on the journal's transaction barrier if necessary */ - if (journal->j_barrier_count) { - spin_unlock(&journal->j_state_lock); - wait_event(journal->j_wait_transaction_locked, - journal->j_barrier_count == 0); - goto repeat; - } - - if (!journal->j_running_transaction) { - if (!new_transaction) { - spin_unlock(&journal->j_state_lock); - goto alloc_transaction; - } - get_transaction(journal, new_transaction); - new_transaction = NULL; - } - - transaction = journal->j_running_transaction; - - /* - * If the current transaction is locked down for commit, wait for the - * lock to be released. 
- */ - if (transaction->t_state == T_LOCKED) { - DEFINE_WAIT(wait); - - prepare_to_wait(&journal->j_wait_transaction_locked, - &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&journal->j_state_lock); - schedule(); - finish_wait(&journal->j_wait_transaction_locked, &wait); - goto repeat; - } - - /* - * If there is not enough space left in the log to write all potential - * buffers requested by this operation, we need to stall pending a log - * checkpoint to free some more log space. - */ - spin_lock(&transaction->t_handle_lock); - needed = transaction->t_outstanding_credits + nblocks; - - if (needed > journal->j_max_transaction_buffers) { - /* - * If the current transaction is already too large, then start - * to commit it: we can then go back and attach this handle to - * a new transaction. - */ - DEFINE_WAIT(wait); - - jbd_debug(2, "Handle %p starting new commit...\n", handle); - spin_unlock(&transaction->t_handle_lock); - prepare_to_wait(&journal->j_wait_transaction_locked, &wait, - TASK_UNINTERRUPTIBLE); - __log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); - schedule(); - finish_wait(&journal->j_wait_transaction_locked, &wait); - goto repeat; - } - - /* - * The commit code assumes that it can get enough log space - * without forcing a checkpoint. This is *critical* for - * correctness: a checkpoint of a buffer which is also - * associated with a committing transaction creates a deadlock, - * so commit simply cannot force through checkpoints. - * - * We must therefore ensure the necessary space in the journal - * *before* starting to dirty potentially checkpointed buffers - * in the new transaction. - * - * The worst part is, any transaction currently committing can - * reduce the free space arbitrarily. Be careful to account for - * those buffers when checkpointing. - */ - - /* - * @@@ AKPM: This seems rather over-defensive. We're giving commit - * a _lot_ of headroom: 1/4 of the journal plus the size of - * the committing transaction. Really, we only need to give it - * committing_transaction->t_outstanding_credits plus "enough" for - * the log control blocks. - * Also, this test is inconsistent with the matching one in - * journal_extend(). - */ - if (__log_space_left(journal) < jbd_space_needed(journal)) { - jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); - spin_unlock(&transaction->t_handle_lock); - __log_wait_for_space(journal); - goto repeat_locked; - } - - /* OK, account for the buffers that this operation expects to - * use and add the handle to the running transaction. */ - - handle->h_transaction = transaction; - transaction->t_outstanding_credits += nblocks; - transaction->t_updates++; - transaction->t_handle_count++; - jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", - handle, nblocks, transaction->t_outstanding_credits, - __log_space_left(journal)); - spin_unlock(&transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); - - lock_map_acquire(&handle->h_lockdep_map); -out: - if (unlikely(new_transaction)) /* It's usually NULL */ - kfree(new_transaction); - return ret; -} - -static struct lock_class_key jbd_handle_key; - -/* Allocate a new handle. This should probably be in a slab... 
*/ -static handle_t *new_handle(int nblocks) -{ - handle_t *handle = jbd_alloc_handle(GFP_NOFS); - if (!handle) - return NULL; - handle->h_buffer_credits = nblocks; - handle->h_ref = 1; - - lockdep_init_map(&handle->h_lockdep_map, "jbd_handle", &jbd_handle_key, 0); - - return handle; -} - -/** - * handle_t *journal_start() - Obtain a new handle. - * @journal: Journal to start transaction on. - * @nblocks: number of block buffer we might modify - * - * We make sure that the transaction can guarantee at least nblocks of - * modified buffers in the log. We block until the log can guarantee - * that much space. - * - * This function is visible to journal users (like ext3fs), so is not - * called with the journal already locked. - * - * Return a pointer to a newly allocated handle, or an ERR_PTR() value - * on failure. - */ -handle_t *journal_start(journal_t *journal, int nblocks) -{ - handle_t *handle = journal_current_handle(); - int err; - - if (!journal) - return ERR_PTR(-EROFS); - - if (handle) { - J_ASSERT(handle->h_transaction->t_journal == journal); - handle->h_ref++; - return handle; - } - - handle = new_handle(nblocks); - if (!handle) - return ERR_PTR(-ENOMEM); - - current->journal_info = handle; - - err = start_this_handle(journal, handle); - if (err < 0) { - jbd_free_handle(handle); - current->journal_info = NULL; - handle = ERR_PTR(err); - } - return handle; -} - -/** - * int journal_extend() - extend buffer credits. - * @handle: handle to 'extend' - * @nblocks: nr blocks to try to extend by. - * - * Some transactions, such as large extends and truncates, can be done - * atomically all at once or in several stages. The operation requests - * a credit for a number of buffer modications in advance, but can - * extend its credit if it needs more. - * - * journal_extend tries to give the running handle more buffer credits. - * It does not guarantee that allocation - this is a best-effort only. - * The calling process MUST be able to deal cleanly with a failure to - * extend here. - * - * Return 0 on success, non-zero on failure. - * - * return code < 0 implies an error - * return code > 0 implies normal transaction-full status. - */ -int journal_extend(handle_t *handle, int nblocks) -{ - transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; - int result; - int wanted; - - result = -EIO; - if (is_handle_aborted(handle)) - goto out; - - result = 1; - - spin_lock(&journal->j_state_lock); - - /* Don't extend a locked-down transaction! */ - if (handle->h_transaction->t_state != T_RUNNING) { - jbd_debug(3, "denied handle %p %d blocks: " - "transaction not running\n", handle, nblocks); - goto error_out; - } - - spin_lock(&transaction->t_handle_lock); - wanted = transaction->t_outstanding_credits + nblocks; - - if (wanted > journal->j_max_transaction_buffers) { - jbd_debug(3, "denied handle %p %d blocks: " - "transaction too large\n", handle, nblocks); - goto unlock; - } - - if (wanted > __log_space_left(journal)) { - jbd_debug(3, "denied handle %p %d blocks: " - "insufficient log space\n", handle, nblocks); - goto unlock; - } - - handle->h_buffer_credits += nblocks; - transaction->t_outstanding_credits += nblocks; - result = 0; - - jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); -unlock: - spin_unlock(&transaction->t_handle_lock); -error_out: - spin_unlock(&journal->j_state_lock); -out: - return result; -} - - -/** - * int journal_restart() - restart a handle. 
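Before the restart path documented next, the checks that journal_extend() above applies reduce to a small amount of arithmetic. A stand-alone sketch (names are illustrative; the kernel additionally refuses when the transaction is no longer running):

static int can_extend(int outstanding_credits, int nblocks,
                      int max_transaction_buffers, int log_space_left)
{
    int wanted = outstanding_credits + nblocks;

    if (wanted > max_transaction_buffers)
        return 0;        /* transaction would become too large        */
    if (wanted > log_space_left)
        return 0;        /* not enough journal space left right now   */
    return 1;
}

A caller that is refused here is expected to fall back to journal_restart(), committing what it has so far and reattaching to a fresh transaction.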
- * @handle: handle to restart - * @nblocks: nr credits requested - * - * Restart a handle for a multi-transaction filesystem - * operation. - * - * If the journal_extend() call above fails to grant new buffer credits - * to a running handle, a call to journal_restart will commit the - * handle's transaction so far and reattach the handle to a new - * transaction capabable of guaranteeing the requested number of - * credits. - */ - -int journal_restart(handle_t *handle, int nblocks) -{ - transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; - int ret; - - /* If we've had an abort of any type, don't even think about - * actually doing the restart! */ - if (is_handle_aborted(handle)) - return 0; - - /* - * First unlink the handle from its current transaction, and start the - * commit on that. - */ - J_ASSERT(transaction->t_updates > 0); - J_ASSERT(journal_current_handle() == handle); - - spin_lock(&journal->j_state_lock); - spin_lock(&transaction->t_handle_lock); - transaction->t_outstanding_credits -= handle->h_buffer_credits; - transaction->t_updates--; - - if (!transaction->t_updates) - wake_up(&journal->j_wait_updates); - spin_unlock(&transaction->t_handle_lock); - - jbd_debug(2, "restarting handle %p\n", handle); - __log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); - - lock_map_release(&handle->h_lockdep_map); - handle->h_buffer_credits = nblocks; - ret = start_this_handle(journal, handle); - return ret; -} - - -/** - * void journal_lock_updates () - establish a transaction barrier. - * @journal: Journal to establish a barrier on. - * - * This locks out any further updates from being started, and blocks until all - * existing updates have completed, returning only once the journal is in a - * quiescent state with no updates running. - * - * We do not use simple mutex for synchronization as there are syscalls which - * want to return with filesystem locked and that trips up lockdep. Also - * hibernate needs to lock filesystem but locked mutex then blocks hibernation. - * Since locking filesystem is rare operation, we use simple counter and - * waitqueue for locking. - */ -void journal_lock_updates(journal_t *journal) -{ - DEFINE_WAIT(wait); - -wait: - /* Wait for previous locked operation to finish */ - wait_event(journal->j_wait_transaction_locked, - journal->j_barrier_count == 0); - - spin_lock(&journal->j_state_lock); - /* - * Check reliably under the lock whether we are the ones winning the race - * and locking the journal - */ - if (journal->j_barrier_count > 0) { - spin_unlock(&journal->j_state_lock); - goto wait; - } - ++journal->j_barrier_count; - - /* Wait until there are no running updates */ - while (1) { - transaction_t *transaction = journal->j_running_transaction; - - if (!transaction) - break; - - spin_lock(&transaction->t_handle_lock); - if (!transaction->t_updates) { - spin_unlock(&transaction->t_handle_lock); - break; - } - prepare_to_wait(&journal->j_wait_updates, &wait, - TASK_UNINTERRUPTIBLE); - spin_unlock(&transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); - schedule(); - finish_wait(&journal->j_wait_updates, &wait); - spin_lock(&journal->j_state_lock); - } - spin_unlock(&journal->j_state_lock); -} - -/** - * void journal_unlock_updates (journal_t* journal) - release barrier - * @journal: Journal to release the barrier on. - * - * Release a transaction barrier obtained with journal_lock_updates(). 
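A user-space model of the counter-plus-waitqueue barrier described above, using pthreads. The kernel version uses two wait queues (j_wait_transaction_locked and j_wait_updates); this sketch collapses them into one condition variable, and all names are illustrative.

#include <pthread.h>

struct jnl {
    pthread_mutex_t state_lock;
    pthread_cond_t  wait;
    int barrier_count;      /* callers currently holding the barrier      */
    int running_updates;    /* handles currently attached to transactions */
};

static void lock_updates(struct jnl *j)
{
    pthread_mutex_lock(&j->state_lock);
    while (j->barrier_count > 0)            /* one barrier holder at a time */
        pthread_cond_wait(&j->wait, &j->state_lock);
    j->barrier_count++;
    while (j->running_updates > 0)          /* drain in-flight updates */
        pthread_cond_wait(&j->wait, &j->state_lock);
    pthread_mutex_unlock(&j->state_lock);
}

static void unlock_updates(struct jnl *j)
{
    pthread_mutex_lock(&j->state_lock);
    j->barrier_count--;
    pthread_cond_broadcast(&j->wait);       /* let blocked starters retry */
    pthread_mutex_unlock(&j->state_lock);
}

/* Update paths bracket their work with these two helpers. */
static void start_update(struct jnl *j)
{
    pthread_mutex_lock(&j->state_lock);
    while (j->barrier_count > 0)            /* barrier blocks new updates */
        pthread_cond_wait(&j->wait, &j->state_lock);
    j->running_updates++;
    pthread_mutex_unlock(&j->state_lock);
}

static void stop_update(struct jnl *j)
{
    pthread_mutex_lock(&j->state_lock);
    if (--j->running_updates == 0)
        pthread_cond_broadcast(&j->wait);
    pthread_mutex_unlock(&j->state_lock);
}

The counter-and-wait design mirrors the reason given above: holding a mutex for the whole locked period would trip up lockdep and block hibernation, whereas a counter plus a wait queue gives the same exclusion without a long-held lock.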
- */ -void journal_unlock_updates (journal_t *journal) -{ - J_ASSERT(journal->j_barrier_count != 0); - - spin_lock(&journal->j_state_lock); - --journal->j_barrier_count; - spin_unlock(&journal->j_state_lock); - wake_up(&journal->j_wait_transaction_locked); -} - -static void warn_dirty_buffer(struct buffer_head *bh) -{ - char b[BDEVNAME_SIZE]; - - printk(KERN_WARNING - "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " - "There's a risk of filesystem corruption in case of system " - "crash.\n", - bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); -} - -/* - * If the buffer is already part of the current transaction, then there - * is nothing we need to do. If it is already part of a prior - * transaction which we are still committing to disk, then we need to - * make sure that we do not overwrite the old copy: we do copy-out to - * preserve the copy going to disk. We also account the buffer against - * the handle's metadata buffer credits (unless the buffer is already - * part of the transaction, that is). - * - */ -static int -do_get_write_access(handle_t *handle, struct journal_head *jh, - int force_copy) -{ - struct buffer_head *bh; - transaction_t *transaction; - journal_t *journal; - int error; - char *frozen_buffer = NULL; - int need_copy = 0; - - if (is_handle_aborted(handle)) - return -EROFS; - - transaction = handle->h_transaction; - journal = transaction->t_journal; - - jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); - - JBUFFER_TRACE(jh, "entry"); -repeat: - bh = jh2bh(jh); - - /* @@@ Need to check for errors here at some point. */ - - lock_buffer(bh); - jbd_lock_bh_state(bh); - - /* We now hold the buffer lock so it is safe to query the buffer - * state. Is the buffer dirty? - * - * If so, there are two possibilities. The buffer may be - * non-journaled, and undergoing a quite legitimate writeback. - * Otherwise, it is journaled, and we don't expect dirty buffers - * in that state (the buffers should be marked JBD_Dirty - * instead.) So either the IO is being done under our own - * control and this is a bug, or it's a third party IO such as - * dump(8) (which may leave the buffer scheduled for read --- - * ie. locked but not dirty) or tune2fs (which may actually have - * the buffer dirtied, ugh.) */ - - if (buffer_dirty(bh)) { - /* - * First question: is this buffer already part of the current - * transaction or the existing committing transaction? - */ - if (jh->b_transaction) { - J_ASSERT_JH(jh, - jh->b_transaction == transaction || - jh->b_transaction == - journal->j_committing_transaction); - if (jh->b_next_transaction) - J_ASSERT_JH(jh, jh->b_next_transaction == - transaction); - warn_dirty_buffer(bh); - } - /* - * In any case we need to clean the dirty flag and we must - * do it under the buffer lock to be sure we don't race - * with running write-out. 
- */ - JBUFFER_TRACE(jh, "Journalling dirty buffer"); - clear_buffer_dirty(bh); - set_buffer_jbddirty(bh); - } - - unlock_buffer(bh); - - error = -EROFS; - if (is_handle_aborted(handle)) { - jbd_unlock_bh_state(bh); - goto out; - } - error = 0; - - /* - * The buffer is already part of this transaction if b_transaction or - * b_next_transaction points to it - */ - if (jh->b_transaction == transaction || - jh->b_next_transaction == transaction) - goto done; - - /* - * this is the first time this transaction is touching this buffer, - * reset the modified flag - */ - jh->b_modified = 0; - - /* - * If there is already a copy-out version of this buffer, then we don't - * need to make another one - */ - if (jh->b_frozen_data) { - JBUFFER_TRACE(jh, "has frozen data"); - J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - jh->b_next_transaction = transaction; - goto done; - } - - /* Is there data here we need to preserve? */ - - if (jh->b_transaction && jh->b_transaction != transaction) { - JBUFFER_TRACE(jh, "owned by older transaction"); - J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_committing_transaction); - - /* There is one case we have to be very careful about. - * If the committing transaction is currently writing - * this buffer out to disk and has NOT made a copy-out, - * then we cannot modify the buffer contents at all - * right now. The essence of copy-out is that it is the - * extra copy, not the primary copy, which gets - * journaled. If the primary copy is already going to - * disk then we cannot do copy-out here. */ - - if (jh->b_jlist == BJ_Shadow) { - DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow); - wait_queue_head_t *wqh; - - wqh = bit_waitqueue(&bh->b_state, BH_Unshadow); - - JBUFFER_TRACE(jh, "on shadow: sleep"); - jbd_unlock_bh_state(bh); - /* commit wakes up all shadow buffers after IO */ - for ( ; ; ) { - prepare_to_wait(wqh, &wait.wait, - TASK_UNINTERRUPTIBLE); - if (jh->b_jlist != BJ_Shadow) - break; - schedule(); - } - finish_wait(wqh, &wait.wait); - goto repeat; - } - - /* Only do the copy if the currently-owning transaction - * still needs it. If it is on the Forget list, the - * committing transaction is past that stage. The - * buffer had better remain locked during the kmalloc, - * but that should be true --- we hold the journal lock - * still and the buffer is already on the BUF_JOURNAL - * list so won't be flushed. - * - * Subtle point, though: if this is a get_undo_access, - * then we will be relying on the frozen_data to contain - * the new value of the committed_data record after the - * transaction, so we HAVE to force the frozen_data copy - * in that case. 
*/ - - if (jh->b_jlist != BJ_Forget || force_copy) { - JBUFFER_TRACE(jh, "generate frozen data"); - if (!frozen_buffer) { - JBUFFER_TRACE(jh, "allocate memory for buffer"); - jbd_unlock_bh_state(bh); - frozen_buffer = - jbd_alloc(jh2bh(jh)->b_size, - GFP_NOFS); - if (!frozen_buffer) { - printk(KERN_ERR - "%s: OOM for frozen_buffer\n", - __func__); - JBUFFER_TRACE(jh, "oom!"); - error = -ENOMEM; - jbd_lock_bh_state(bh); - goto done; - } - goto repeat; - } - jh->b_frozen_data = frozen_buffer; - frozen_buffer = NULL; - need_copy = 1; - } - jh->b_next_transaction = transaction; - } - - - /* - * Finally, if the buffer is not journaled right now, we need to make - * sure it doesn't get written to disk before the caller actually - * commits the new data - */ - if (!jh->b_transaction) { - JBUFFER_TRACE(jh, "no transaction"); - J_ASSERT_JH(jh, !jh->b_next_transaction); - JBUFFER_TRACE(jh, "file as BJ_Reserved"); - spin_lock(&journal->j_list_lock); - __journal_file_buffer(jh, transaction, BJ_Reserved); - spin_unlock(&journal->j_list_lock); - } - -done: - if (need_copy) { - struct page *page; - int offset; - char *source; - - J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), - "Possible IO failure.\n"); - page = jh2bh(jh)->b_page; - offset = offset_in_page(jh2bh(jh)->b_data); - source = kmap_atomic(page); - memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); - kunmap_atomic(source); - } - jbd_unlock_bh_state(bh); - - /* - * If we are about to journal a buffer, then any revoke pending on it is - * no longer valid - */ - journal_cancel_revoke(handle, jh); - -out: - if (unlikely(frozen_buffer)) /* It's usually NULL */ - jbd_free(frozen_buffer, bh->b_size); - - JBUFFER_TRACE(jh, "exit"); - return error; -} - -/** - * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. - * @handle: transaction to add buffer modifications to - * @bh: bh to be used for metadata writes - * - * Returns an error code or 0 on success. - * - * In full data journalling mode the buffer may be of type BJ_AsyncData, - * because we're write()ing a buffer which is also part of a shared mapping. - */ - -int journal_get_write_access(handle_t *handle, struct buffer_head *bh) -{ - struct journal_head *jh = journal_add_journal_head(bh); - int rc; - - /* We do not want to get caught playing with fields which the - * log thread also manipulates. Make sure that the buffer - * completes any outstanding IO before proceeding. */ - rc = do_get_write_access(handle, jh, 0); - journal_put_journal_head(jh); - return rc; -} - - -/* - * When the user wants to journal a newly created buffer_head - * (ie. getblk() returned a new buffer and we are going to populate it - * manually rather than reading off disk), then we need to keep the - * buffer_head locked until it has been completely filled with new - * data. In this case, we should be able to make the assertion that - * the bh is not already part of an existing transaction. - * - * The buffer should already be locked by the caller by this point. - * There is no lock ranking violation: it was a newly created, - * unlocked buffer beforehand. */ - -/** - * int journal_get_create_access () - notify intent to use newly created bh - * @handle: transaction to new buffer to - * @bh: new buffer. - * - * Call this if you create a new bh. 
- */ -int journal_get_create_access(handle_t *handle, struct buffer_head *bh) -{ - transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; - struct journal_head *jh = journal_add_journal_head(bh); - int err; - - jbd_debug(5, "journal_head %p\n", jh); - err = -EROFS; - if (is_handle_aborted(handle)) - goto out; - err = 0; - - JBUFFER_TRACE(jh, "entry"); - /* - * The buffer may already belong to this transaction due to pre-zeroing - * in the filesystem's new_block code. It may also be on the previous, - * committing transaction's lists, but it HAS to be in Forget state in - * that case: the transaction must have deleted the buffer for it to be - * reused here. - */ - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - J_ASSERT_JH(jh, (jh->b_transaction == transaction || - jh->b_transaction == NULL || - (jh->b_transaction == journal->j_committing_transaction && - jh->b_jlist == BJ_Forget))); - - J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); - - if (jh->b_transaction == NULL) { - /* - * Previous journal_forget() could have left the buffer - * with jbddirty bit set because it was being committed. When - * the commit finished, we've filed the buffer for - * checkpointing and marked it dirty. Now we are reallocating - * the buffer so the transaction freeing it must have - * committed and so it's safe to clear the dirty bit. - */ - clear_buffer_dirty(jh2bh(jh)); - - /* first access by this transaction */ - jh->b_modified = 0; - - JBUFFER_TRACE(jh, "file as BJ_Reserved"); - __journal_file_buffer(jh, transaction, BJ_Reserved); - } else if (jh->b_transaction == journal->j_committing_transaction) { - /* first access by this transaction */ - jh->b_modified = 0; - - JBUFFER_TRACE(jh, "set next transaction"); - jh->b_next_transaction = transaction; - } - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - - /* - * akpm: I added this. ext3_alloc_branch can pick up new indirect - * blocks which contain freed but then revoked metadata. We need - * to cancel the revoke in case we end up freeing it yet again - * and the reallocating as data - this would cause a second revoke, - * which hits an assertion error. - */ - JBUFFER_TRACE(jh, "cancelling revoke"); - journal_cancel_revoke(handle, jh); -out: - journal_put_journal_head(jh); - return err; -} - -/** - * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences - * @handle: transaction - * @bh: buffer to undo - * - * Sometimes there is a need to distinguish between metadata which has - * been committed to disk and that which has not. The ext3fs code uses - * this for freeing and allocating space, we have to make sure that we - * do not reuse freed space until the deallocation has been committed, - * since if we overwrote that space we would make the delete - * un-rewindable in case of a crash. - * - * To deal with that, journal_get_undo_access requests write access to a - * buffer for parts of non-rewindable operations such as delete - * operations on the bitmaps. The journaling code must keep a copy of - * the buffer's contents prior to the undo_access call until such time - * as we know that the buffer has definitely been committed to disk. 
- * - * We never need to know which transaction the committed data is part - * of, buffers touched here are guaranteed to be dirtied later and so - * will be committed to a new transaction in due course, at which point - * we can discard the old committed data pointer. - * - * Returns error number or 0 on success. - */ -int journal_get_undo_access(handle_t *handle, struct buffer_head *bh) -{ - int err; - struct journal_head *jh = journal_add_journal_head(bh); - char *committed_data = NULL; - - JBUFFER_TRACE(jh, "entry"); - - /* - * Do this first --- it can drop the journal lock, so we want to - * make sure that obtaining the committed_data is done - * atomically wrt. completion of any outstanding commits. - */ - err = do_get_write_access(handle, jh, 1); - if (err) - goto out; - -repeat: - if (!jh->b_committed_data) { - committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); - if (!committed_data) { - printk(KERN_ERR "%s: No memory for committed data\n", - __func__); - err = -ENOMEM; - goto out; - } - } - - jbd_lock_bh_state(bh); - if (!jh->b_committed_data) { - /* Copy out the current buffer contents into the - * preserved, committed copy. */ - JBUFFER_TRACE(jh, "generate b_committed data"); - if (!committed_data) { - jbd_unlock_bh_state(bh); - goto repeat; - } - - jh->b_committed_data = committed_data; - committed_data = NULL; - memcpy(jh->b_committed_data, bh->b_data, bh->b_size); - } - jbd_unlock_bh_state(bh); -out: - journal_put_journal_head(jh); - if (unlikely(committed_data)) - jbd_free(committed_data, bh->b_size); - return err; -} - -/** - * int journal_dirty_data() - mark a buffer as containing dirty data to be flushed - * @handle: transaction - * @bh: bufferhead to mark - * - * Description: - * Mark a buffer as containing dirty data which needs to be flushed before - * we can commit the current transaction. - * - * The buffer is placed on the transaction's data list and is marked as - * belonging to the transaction. - * - * Returns error number or 0 on success. - * - * journal_dirty_data() can be called via page_launder->ext3_writepage - * by kswapd. - */ -int journal_dirty_data(handle_t *handle, struct buffer_head *bh) -{ - journal_t *journal = handle->h_transaction->t_journal; - int need_brelse = 0; - struct journal_head *jh; - int ret = 0; - - if (is_handle_aborted(handle)) - return ret; - - jh = journal_add_journal_head(bh); - JBUFFER_TRACE(jh, "entry"); - - /* - * The buffer could *already* be dirty. Writeout can start - * at any time. - */ - jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); - - /* - * What if the buffer is already part of a running transaction? - * - * There are two cases: - * 1) It is part of the current running transaction. Refile it, - * just in case we have allocated it as metadata, deallocated - * it, then reallocated it as data. - * 2) It is part of the previous, still-committing transaction. - * If all we want to do is to guarantee that the buffer will be - * written to disk before this new transaction commits, then - * being sure that the *previous* transaction has this same - * property is sufficient for us! Just leave it on its old - * transaction. - * - * In case (2), the buffer must not already exist as metadata - * --- that would violate write ordering (a transaction is free - * to write its data at any point, even before the previous - * committing transaction has committed). The caller must - * never, ever allow this to happen: there's nothing we can do - * about it in this layer. 
- */ - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - - /* Now that we have bh_state locked, are we really still mapped? */ - if (!buffer_mapped(bh)) { - JBUFFER_TRACE(jh, "unmapped buffer, bailing out"); - goto no_journal; - } - - if (jh->b_transaction) { - JBUFFER_TRACE(jh, "has transaction"); - if (jh->b_transaction != handle->h_transaction) { - JBUFFER_TRACE(jh, "belongs to older transaction"); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_committing_transaction); - - /* @@@ IS THIS TRUE ? */ - /* - * Not any more. Scenario: someone does a write() - * in data=journal mode. The buffer's transaction has - * moved into commit. Then someone does another - * write() to the file. We do the frozen data copyout - * and set b_next_transaction to point to j_running_t. - * And while we're in that state, someone does a - * writepage() in an attempt to pageout the same area - * of the file via a shared mapping. At present that - * calls journal_dirty_data(), and we get right here. - * It may be too late to journal the data. Simply - * falling through to the next test will suffice: the - * data will be dirty and wil be checkpointed. The - * ordering comments in the next comment block still - * apply. - */ - //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - - /* - * If we're journalling data, and this buffer was - * subject to a write(), it could be metadata, forget - * or shadow against the committing transaction. Now, - * someone has dirtied the same darn page via a mapping - * and it is being writepage()'d. - * We *could* just steal the page from commit, with some - * fancy locking there. Instead, we just skip it - - * don't tie the page's buffers to the new transaction - * at all. - * Implication: if we crash before the writepage() data - * is written into the filesystem, recovery will replay - * the write() data. - */ - if (jh->b_jlist != BJ_None && - jh->b_jlist != BJ_SyncData && - jh->b_jlist != BJ_Locked) { - JBUFFER_TRACE(jh, "Not stealing"); - goto no_journal; - } - - /* - * This buffer may be undergoing writeout in commit. We - * can't return from here and let the caller dirty it - * again because that can cause the write-out loop in - * commit to never terminate. - */ - if (buffer_dirty(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - need_brelse = 1; - sync_dirty_buffer(bh); - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - /* Since we dropped the lock... */ - if (!buffer_mapped(bh)) { - JBUFFER_TRACE(jh, "buffer got unmapped"); - goto no_journal; - } - /* The buffer may become locked again at any - time if it is redirtied */ - } - - /* - * We cannot remove the buffer with io error from the - * committing transaction, because otherwise it would - * miss the error and the commit would not abort. - */ - if (unlikely(!buffer_uptodate(bh))) { - ret = -EIO; - goto no_journal; - } - /* We might have slept so buffer could be refiled now */ - if (jh->b_transaction != NULL && - jh->b_transaction != handle->h_transaction) { - JBUFFER_TRACE(jh, "unfile from commit"); - __journal_temp_unlink_buffer(jh); - /* It still points to the committing - * transaction; move it to this one so - * that the refile assert checks are - * happy. 
*/ - jh->b_transaction = handle->h_transaction; - } - /* The buffer will be refiled below */ - - } - /* - * Special case --- the buffer might actually have been - * allocated and then immediately deallocated in the previous, - * committing transaction, so might still be left on that - * transaction's metadata lists. - */ - if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { - JBUFFER_TRACE(jh, "not on correct data list: unfile"); - J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); - JBUFFER_TRACE(jh, "file as data"); - __journal_file_buffer(jh, handle->h_transaction, - BJ_SyncData); - } - } else { - JBUFFER_TRACE(jh, "not on a transaction"); - __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); - } -no_journal: - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - if (need_brelse) { - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - } - JBUFFER_TRACE(jh, "exit"); - journal_put_journal_head(jh); - return ret; -} - -/** - * int journal_dirty_metadata() - mark a buffer as containing dirty metadata - * @handle: transaction to add buffer to. - * @bh: buffer to mark - * - * Mark dirty metadata which needs to be journaled as part of the current - * transaction. - * - * The buffer is placed on the transaction's metadata list and is marked - * as belonging to the transaction. - * - * Returns error number or 0 on success. - * - * Special care needs to be taken if the buffer already belongs to the - * current committing transaction (in which case we should have frozen - * data present for that commit). In that case, we don't relink the - * buffer: that only gets done when the old transaction finally - * completes its commit. - */ -int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) -{ - transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; - struct journal_head *jh = bh2jh(bh); - - jbd_debug(5, "journal_head %p\n", jh); - JBUFFER_TRACE(jh, "entry"); - if (is_handle_aborted(handle)) - goto out; - - jbd_lock_bh_state(bh); - - if (jh->b_modified == 0) { - /* - * This buffer's got modified and becoming part - * of the transaction. This needs to be done - * once a transaction -bzzz - */ - jh->b_modified = 1; - J_ASSERT_JH(jh, handle->h_buffer_credits > 0); - handle->h_buffer_credits--; - } - - /* - * fastpath, to avoid expensive locking. If this buffer is already - * on the running transaction's metadata list there is nothing to do. - * Nobody can take it off again because there is a handle open. - * I _think_ we're OK here with SMP barriers - a mistaken decision will - * result in this test being false, so we go in and take the locks. - */ - if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) { - JBUFFER_TRACE(jh, "fastpath"); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_running_transaction); - goto out_unlock_bh; - } - - set_buffer_jbddirty(bh); - - /* - * Metadata already on the current transaction list doesn't - * need to be filed. Metadata on another transaction's list must - * be committing, and will be refiled once the commit completes: - * leave it alone for now. - */ - if (jh->b_transaction != transaction) { - JBUFFER_TRACE(jh, "already on other transaction"); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_committing_transaction); - J_ASSERT_JH(jh, jh->b_next_transaction == transaction); - /* And this case is illegal: we can't reuse another - * transaction's data buffer, ever. 
*/ - goto out_unlock_bh; - } - - /* That test should have eliminated the following case: */ - J_ASSERT_JH(jh, jh->b_frozen_data == NULL); - - JBUFFER_TRACE(jh, "file as BJ_Metadata"); - spin_lock(&journal->j_list_lock); - __journal_file_buffer(jh, handle->h_transaction, BJ_Metadata); - spin_unlock(&journal->j_list_lock); -out_unlock_bh: - jbd_unlock_bh_state(bh); -out: - JBUFFER_TRACE(jh, "exit"); - return 0; -} - -/* - * journal_release_buffer: undo a get_write_access without any buffer - * updates, if the update decided in the end that it didn't need access. - * - */ -void -journal_release_buffer(handle_t *handle, struct buffer_head *bh) -{ - BUFFER_TRACE(bh, "entry"); -} - -/** - * void journal_forget() - bforget() for potentially-journaled buffers. - * @handle: transaction handle - * @bh: bh to 'forget' - * - * We can only do the bforget if there are no commits pending against the - * buffer. If the buffer is dirty in the current running transaction we - * can safely unlink it. - * - * bh may not be a journalled buffer at all - it may be a non-JBD - * buffer which came off the hashtable. Check for this. - * - * Decrements bh->b_count by one. - * - * Allow this call even if the handle has aborted --- it may be part of - * the caller's cleanup after an abort. - */ -int journal_forget (handle_t *handle, struct buffer_head *bh) -{ - transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; - struct journal_head *jh; - int drop_reserve = 0; - int err = 0; - int was_modified = 0; - - BUFFER_TRACE(bh, "entry"); - - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - - if (!buffer_jbd(bh)) - goto not_jbd; - jh = bh2jh(bh); - - /* Critical error: attempting to delete a bitmap buffer, maybe? - * Don't do any jbd operations, and return an error. */ - if (!J_EXPECT_JH(jh, !jh->b_committed_data, - "inconsistent data on disk")) { - err = -EIO; - goto not_jbd; - } - - /* keep track of whether or not this transaction modified us */ - was_modified = jh->b_modified; - - /* - * The buffer's going from the transaction, we must drop - * all references -bzzz - */ - jh->b_modified = 0; - - if (jh->b_transaction == handle->h_transaction) { - J_ASSERT_JH(jh, !jh->b_frozen_data); - - /* If we are forgetting a buffer which is already part - * of this transaction, then we can just drop it from - * the transaction immediately. */ - clear_buffer_dirty(bh); - clear_buffer_jbddirty(bh); - - JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); - - /* - * we only want to drop a reference if this transaction - * modified the buffer - */ - if (was_modified) - drop_reserve = 1; - - /* - * We are no longer going to journal this buffer. - * However, the commit of this transaction is still - * important to the buffer: the delete that we are now - * processing might obsolete an old log entry, so by - * committing, we can satisfy the buffer's checkpoint. - * - * So, if we have a checkpoint on the buffer, we should - * now refile the buffer on our BJ_Forget list so that - * we know to remove the checkpoint after we commit. 
- */ - - if (jh->b_cp_transaction) { - __journal_temp_unlink_buffer(jh); - __journal_file_buffer(jh, transaction, BJ_Forget); - } else { - __journal_unfile_buffer(jh); - if (!buffer_jbd(bh)) { - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - __bforget(bh); - goto drop; - } - } - } else if (jh->b_transaction) { - J_ASSERT_JH(jh, (jh->b_transaction == - journal->j_committing_transaction)); - /* However, if the buffer is still owned by a prior - * (committing) transaction, we can't drop it yet... */ - JBUFFER_TRACE(jh, "belongs to older transaction"); - /* ... but we CAN drop it from the new transaction if we - * have also modified it since the original commit. */ - - if (jh->b_next_transaction) { - J_ASSERT(jh->b_next_transaction == transaction); - jh->b_next_transaction = NULL; - - /* - * only drop a reference if this transaction modified - * the buffer - */ - if (was_modified) - drop_reserve = 1; - } - } - -not_jbd: - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - __brelse(bh); -drop: - if (drop_reserve) { - /* no need to reserve log space for this block -bzzz */ - handle->h_buffer_credits++; - } - return err; -} - -/** - * int journal_stop() - complete a transaction - * @handle: tranaction to complete. - * - * All done for a particular handle. - * - * There is not much action needed here. We just return any remaining - * buffer credits to the transaction and remove the handle. The only - * complication is that we need to start a commit operation if the - * filesystem is marked for synchronous update. - * - * journal_stop itself will not usually return an error, but it may - * do so in unusual circumstances. In particular, expect it to - * return -EIO if a journal_abort has been executed since the - * transaction began. - */ -int journal_stop(handle_t *handle) -{ - transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; - int err; - pid_t pid; - - J_ASSERT(journal_current_handle() == handle); - - if (is_handle_aborted(handle)) - err = -EIO; - else { - J_ASSERT(transaction->t_updates > 0); - err = 0; - } - - if (--handle->h_ref > 0) { - jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, - handle->h_ref); - return err; - } - - jbd_debug(4, "Handle %p going down\n", handle); - - /* - * Implement synchronous transaction batching. If the handle - * was synchronous, don't force a commit immediately. Let's - * yield and let another thread piggyback onto this transaction. - * Keep doing that while new threads continue to arrive. - * It doesn't cost much - we're about to run a commit and sleep - * on IO anyway. Speeds up many-threaded, many-dir operations - * by 30x or more... - * - * We try and optimize the sleep time against what the underlying disk - * can do, instead of having a static sleep time. This is useful for - * the case where our storage is so fast that it is more optimal to go - * ahead and force a flush and wait for the transaction to be committed - * than it is to wait for an arbitrary amount of time for new writers to - * join the transaction. We achieve this by measuring how long it takes - * to commit a transaction, and compare it with how long this - * transaction has been running, and if run time < commit time then we - * sleep for the delta and commit. This greatly helps super fast disks - * that would see slowdowns as more threads started doing fsyncs. - * - * But don't do this if this process was the most recent one to - * perform a synchronous write. 
We do this to detect the case where a - * single process is doing a stream of sync writes. No point in waiting - * for joiners in that case. - */ - pid = current->pid; - if (handle->h_sync && journal->j_last_sync_writer != pid) { - u64 commit_time, trans_time; - - journal->j_last_sync_writer = pid; - - spin_lock(&journal->j_state_lock); - commit_time = journal->j_average_commit_time; - spin_unlock(&journal->j_state_lock); - - trans_time = ktime_to_ns(ktime_sub(ktime_get(), - transaction->t_start_time)); - - commit_time = min_t(u64, commit_time, - 1000*jiffies_to_usecs(1)); - - if (trans_time < commit_time) { - ktime_t expires = ktime_add_ns(ktime_get(), - commit_time); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); - } - } - - current->journal_info = NULL; - spin_lock(&journal->j_state_lock); - spin_lock(&transaction->t_handle_lock); - transaction->t_outstanding_credits -= handle->h_buffer_credits; - transaction->t_updates--; - if (!transaction->t_updates) { - wake_up(&journal->j_wait_updates); - if (journal->j_barrier_count) - wake_up(&journal->j_wait_transaction_locked); - } - - /* - * If the handle is marked SYNC, we need to set another commit - * going! We also want to force a commit if the current - * transaction is occupying too much of the log, or if the - * transaction is too old now. - */ - if (handle->h_sync || - transaction->t_outstanding_credits > - journal->j_max_transaction_buffers || - time_after_eq(jiffies, transaction->t_expires)) { - /* Do this even for aborted journals: an abort still - * completes the commit thread, it just doesn't write - * anything to disk. */ - tid_t tid = transaction->t_tid; - - spin_unlock(&transaction->t_handle_lock); - jbd_debug(2, "transaction too old, requesting commit for " - "handle %p\n", handle); - /* This is non-blocking */ - __log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); - - /* - * Special case: JFS_SYNC synchronous updates require us - * to wait for the commit to complete. - */ - if (handle->h_sync && !(current->flags & PF_MEMALLOC)) - err = log_wait_commit(journal, tid); - } else { - spin_unlock(&transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); - } - - lock_map_release(&handle->h_lockdep_map); - - jbd_free_handle(handle); - return err; -} - -/** - * int journal_force_commit() - force any uncommitted transactions - * @journal: journal to force - * - * For synchronous operations: force any uncommitted transactions - * to disk. May seem kludgy, but it reuses all the handle batching - * code in a very simple manner. - */ -int journal_force_commit(journal_t *journal) -{ - handle_t *handle; - int ret; - - handle = journal_start(journal, 1); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - } else { - handle->h_sync = 1; - ret = journal_stop(handle); - } - return ret; -} - -/* - * - * List management code snippets: various functions for manipulating the - * transaction buffer lists. - * - */ - -/* - * Append a buffer to a transaction list, given the transaction's list head - * pointer. - * - * j_list_lock is held. - * - * jbd_lock_bh_state(jh2bh(jh)) is held. 
- */ - -static inline void -__blist_add_buffer(struct journal_head **list, struct journal_head *jh) -{ - if (!*list) { - jh->b_tnext = jh->b_tprev = jh; - *list = jh; - } else { - /* Insert at the tail of the list to preserve order */ - struct journal_head *first = *list, *last = first->b_tprev; - jh->b_tprev = last; - jh->b_tnext = first; - last->b_tnext = first->b_tprev = jh; - } -} - -/* - * Remove a buffer from a transaction list, given the transaction's list - * head pointer. - * - * Called with j_list_lock held, and the journal may not be locked. - * - * jbd_lock_bh_state(jh2bh(jh)) is held. - */ - -static inline void -__blist_del_buffer(struct journal_head **list, struct journal_head *jh) -{ - if (*list == jh) { - *list = jh->b_tnext; - if (*list == jh) - *list = NULL; - } - jh->b_tprev->b_tnext = jh->b_tnext; - jh->b_tnext->b_tprev = jh->b_tprev; -} - -/* - * Remove a buffer from the appropriate transaction list. - * - * Note that this function can *change* the value of - * bh->b_transaction->t_sync_datalist, t_buffers, t_forget, - * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller - * is holding onto a copy of one of thee pointers, it could go bad. - * Generally the caller needs to re-read the pointer from the transaction_t. - * - * Called under j_list_lock. The journal may not be locked. - */ -static void __journal_temp_unlink_buffer(struct journal_head *jh) -{ - struct journal_head **list = NULL; - transaction_t *transaction; - struct buffer_head *bh = jh2bh(jh); - - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); - transaction = jh->b_transaction; - if (transaction) - assert_spin_locked(&transaction->t_journal->j_list_lock); - - J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); - if (jh->b_jlist != BJ_None) - J_ASSERT_JH(jh, transaction != NULL); - - switch (jh->b_jlist) { - case BJ_None: - return; - case BJ_SyncData: - list = &transaction->t_sync_datalist; - break; - case BJ_Metadata: - transaction->t_nr_buffers--; - J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); - list = &transaction->t_buffers; - break; - case BJ_Forget: - list = &transaction->t_forget; - break; - case BJ_IO: - list = &transaction->t_iobuf_list; - break; - case BJ_Shadow: - list = &transaction->t_shadow_list; - break; - case BJ_LogCtl: - list = &transaction->t_log_list; - break; - case BJ_Reserved: - list = &transaction->t_reserved_list; - break; - case BJ_Locked: - list = &transaction->t_locked_list; - break; - } - - __blist_del_buffer(list, jh); - jh->b_jlist = BJ_None; - if (test_clear_buffer_jbddirty(bh)) - mark_buffer_dirty(bh); /* Expose it to the VM */ -} - -/* - * Remove buffer from all transactions. - * - * Called with bh_state lock and j_list_lock - * - * jh and bh may be already freed when this function returns. - */ -void __journal_unfile_buffer(struct journal_head *jh) -{ - __journal_temp_unlink_buffer(jh); - jh->b_transaction = NULL; - journal_put_journal_head(jh); -} - -void journal_unfile_buffer(journal_t *journal, struct journal_head *jh) -{ - struct buffer_head *bh = jh2bh(jh); - - /* Get reference so that buffer cannot be freed before we unlock it */ - get_bh(bh); - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - __journal_unfile_buffer(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - __brelse(bh); -} - -/* - * Called from journal_try_to_free_buffers(). 
- * - * Called under jbd_lock_bh_state(bh) - */ -static void -__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) -{ - struct journal_head *jh; - - jh = bh2jh(bh); - - if (buffer_locked(bh) || buffer_dirty(bh)) - goto out; - - if (jh->b_next_transaction != NULL) - goto out; - - spin_lock(&journal->j_list_lock); - if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) { - if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { - /* A written-back ordered data buffer */ - JBUFFER_TRACE(jh, "release data"); - __journal_unfile_buffer(jh); - } - } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { - /* written-back checkpointed metadata buffer */ - if (jh->b_jlist == BJ_None) { - JBUFFER_TRACE(jh, "remove from checkpoint list"); - __journal_remove_checkpoint(jh); - } - } - spin_unlock(&journal->j_list_lock); -out: - return; -} - -/** - * int journal_try_to_free_buffers() - try to free page buffers. - * @journal: journal for operation - * @page: to try and free - * @gfp_mask: we use the mask to detect how hard should we try to release - * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to - * release the buffers. - * - * - * For all the buffers on this page, - * if they are fully written out ordered data, move them onto BUF_CLEAN - * so try_to_free_buffers() can reap them. - * - * This function returns non-zero if we wish try_to_free_buffers() - * to be called. We do this if the page is releasable by try_to_free_buffers(). - * We also do it if the page has locked or dirty buffers and the caller wants - * us to perform sync or async writeout. - * - * This complicates JBD locking somewhat. We aren't protected by the - * BKL here. We wish to remove the buffer from its committing or - * running transaction's ->t_datalist via __journal_unfile_buffer. - * - * This may *change* the value of transaction_t->t_datalist, so anyone - * who looks at t_datalist needs to lock against this function. - * - * Even worse, someone may be doing a journal_dirty_data on this - * buffer. So we need to lock against that. journal_dirty_data() - * will come out of the lock with the buffer dirty, which makes it - * ineligible for release here. - * - * Who else is affected by this? hmm... Really the only contender - * is do_get_write_access() - it could be looking at the buffer while - * journal_try_to_free_buffer() is changing its state. But that - * cannot happen because we never reallocate freed data as metadata - * while the data is part of a transaction. Yes? - * - * Return 0 on failure, 1 on success - */ -int journal_try_to_free_buffers(journal_t *journal, - struct page *page, gfp_t gfp_mask) -{ - struct buffer_head *head; - struct buffer_head *bh; - int ret = 0; - - J_ASSERT(PageLocked(page)); - - head = page_buffers(page); - bh = head; - do { - struct journal_head *jh; - - /* - * We take our own ref against the journal_head here to avoid - * having to add tons of locking around each instance of - * journal_put_journal_head(). - */ - jh = journal_grab_journal_head(bh); - if (!jh) - continue; - - jbd_lock_bh_state(bh); - __journal_try_to_free_buffer(journal, bh); - journal_put_journal_head(jh); - jbd_unlock_bh_state(bh); - if (buffer_jbd(bh)) - goto busy; - } while ((bh = bh->b_this_page) != head); - - ret = try_to_free_buffers(page); - -busy: - return ret; -} - -/* - * This buffer is no longer needed. 
If it is on an older transaction's - * checkpoint list we need to record it on this transaction's forget list - * to pin this buffer (and hence its checkpointing transaction) down until - * this transaction commits. If the buffer isn't on a checkpoint list, we - * release it. - * Returns non-zero if JBD no longer has an interest in the buffer. - * - * Called under j_list_lock. - * - * Called under jbd_lock_bh_state(bh). - */ -static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) -{ - int may_free = 1; - struct buffer_head *bh = jh2bh(jh); - - if (jh->b_cp_transaction) { - JBUFFER_TRACE(jh, "on running+cp transaction"); - __journal_temp_unlink_buffer(jh); - /* - * We don't want to write the buffer anymore, clear the - * bit so that we don't confuse checks in - * __journal_file_buffer - */ - clear_buffer_dirty(bh); - __journal_file_buffer(jh, transaction, BJ_Forget); - may_free = 0; - } else { - JBUFFER_TRACE(jh, "on running transaction"); - __journal_unfile_buffer(jh); - } - return may_free; -} - -/* - * journal_invalidatepage - * - * This code is tricky. It has a number of cases to deal with. - * - * There are two invariants which this code relies on: - * - * i_size must be updated on disk before we start calling invalidatepage on the - * data. - * - * This is done in ext3 by defining an ext3_setattr method which - * updates i_size before truncate gets going. By maintaining this - * invariant, we can be sure that it is safe to throw away any buffers - * attached to the current transaction: once the transaction commits, - * we know that the data will not be needed. - * - * Note however that we can *not* throw away data belonging to the - * previous, committing transaction! - * - * Any disk blocks which *are* part of the previous, committing - * transaction (and which therefore cannot be discarded immediately) are - * not going to be reused in the new running transaction - * - * The bitmap committed_data images guarantee this: any block which is - * allocated in one transaction and removed in the next will be marked - * as in-use in the committed_data bitmap, so cannot be reused until - * the next transaction to delete the block commits. This means that - * leaving committing buffers dirty is quite safe: the disk blocks - * cannot be reallocated to a different file and so buffer aliasing is - * not possible. - * - * - * The above applies mainly to ordered data mode. In writeback mode we - * don't make guarantees about the order in which data hits disk --- in - * particular we don't guarantee that new dirty data is flushed before - * transaction commit --- so it is always safe just to discard data - * immediately in that mode. --sct - */ - -/* - * The journal_unmap_buffer helper function returns zero if the buffer - * concerned remains pinned as an anonymous buffer belonging to an older - * transaction. - * - * We're outside-transaction here. Either or both of j_running_transaction - * and j_committing_transaction may be NULL. - */ -static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, - int partial_page) -{ - transaction_t *transaction; - struct journal_head *jh; - int may_free = 1; - - BUFFER_TRACE(bh, "entry"); - -retry: - /* - * It is safe to proceed here without the j_list_lock because the - * buffers cannot be stolen by try_to_free_buffers as long as we are - * holding the page lock. 
--sct - */ - - if (!buffer_jbd(bh)) - goto zap_buffer_unlocked; - - spin_lock(&journal->j_state_lock); - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - - jh = journal_grab_journal_head(bh); - if (!jh) - goto zap_buffer_no_jh; - - /* - * We cannot remove the buffer from checkpoint lists until the - * transaction adding inode to orphan list (let's call it T) - * is committed. Otherwise if the transaction changing the - * buffer would be cleaned from the journal before T is - * committed, a crash will cause that the correct contents of - * the buffer will be lost. On the other hand we have to - * clear the buffer dirty bit at latest at the moment when the - * transaction marking the buffer as freed in the filesystem - * structures is committed because from that moment on the - * block can be reallocated and used by a different page. - * Since the block hasn't been freed yet but the inode has - * already been added to orphan list, it is safe for us to add - * the buffer to BJ_Forget list of the newest transaction. - * - * Also we have to clear buffer_mapped flag of a truncated buffer - * because the buffer_head may be attached to the page straddling - * i_size (can happen only when blocksize < pagesize) and thus the - * buffer_head can be reused when the file is extended again. So we end - * up keeping around invalidated buffers attached to transactions' - * BJ_Forget list just to stop checkpointing code from cleaning up - * the transaction this buffer was modified in. - */ - transaction = jh->b_transaction; - if (transaction == NULL) { - /* First case: not on any transaction. If it - * has no checkpoint link, then we can zap it: - * it's a writeback-mode buffer so we don't care - * if it hits disk safely. */ - if (!jh->b_cp_transaction) { - JBUFFER_TRACE(jh, "not on any transaction: zap"); - goto zap_buffer; - } - - if (!buffer_dirty(bh)) { - /* bdflush has written it. We can drop it now */ - goto zap_buffer; - } - - /* OK, it must be in the journal but still not - * written fully to disk: it's metadata or - * journaled data... */ - - if (journal->j_running_transaction) { - /* ... and once the current transaction has - * committed, the buffer won't be needed any - * longer. */ - JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); - may_free = __dispose_buffer(jh, - journal->j_running_transaction); - goto zap_buffer; - } else { - /* There is no currently-running transaction. So the - * orphan record which we wrote for this file must have - * passed into commit. We must attach this buffer to - * the committing transaction, if it exists. */ - if (journal->j_committing_transaction) { - JBUFFER_TRACE(jh, "give to committing trans"); - may_free = __dispose_buffer(jh, - journal->j_committing_transaction); - goto zap_buffer; - } else { - /* The orphan record's transaction has - * committed. We can cleanse this buffer */ - clear_buffer_jbddirty(bh); - goto zap_buffer; - } - } - } else if (transaction == journal->j_committing_transaction) { - JBUFFER_TRACE(jh, "on committing transaction"); - if (jh->b_jlist == BJ_Locked) { - /* - * The buffer is on the committing transaction's locked - * list. We have the buffer locked, so I/O has - * completed. So we can nail the buffer now. - */ - may_free = __dispose_buffer(jh, transaction); - goto zap_buffer; - } - /* - * The buffer is committing, we simply cannot touch - * it. If the page is straddling i_size we have to wait - * for commit and try again. 
- */ - if (partial_page) { - tid_t tid = journal->j_committing_transaction->t_tid; - - journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); - unlock_buffer(bh); - log_wait_commit(journal, tid); - lock_buffer(bh); - goto retry; - } - /* - * OK, buffer won't be reachable after truncate. We just set - * j_next_transaction to the running transaction (if there is - * one) and mark buffer as freed so that commit code knows it - * should clear dirty bits when it is done with the buffer. - */ - set_buffer_freed(bh); - if (journal->j_running_transaction && buffer_jbddirty(bh)) - jh->b_next_transaction = journal->j_running_transaction; - journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); - return 0; - } else { - /* Good, the buffer belongs to the running transaction. - * We are writing our own transaction's data, not any - * previous one's, so it is safe to throw it away - * (remember that we expect the filesystem to have set - * i_size already for this truncate so recovery will not - * expose the disk blocks we are discarding here.) */ - J_ASSERT_JH(jh, transaction == journal->j_running_transaction); - JBUFFER_TRACE(jh, "on running transaction"); - may_free = __dispose_buffer(jh, transaction); - } - -zap_buffer: - /* - * This is tricky. Although the buffer is truncated, it may be reused - * if blocksize < pagesize and it is attached to the page straddling - * EOF. Since the buffer might have been added to BJ_Forget list of the - * running transaction, journal_get_write_access() won't clear - * b_modified and credit accounting gets confused. So clear b_modified - * here. */ - jh->b_modified = 0; - journal_put_journal_head(jh); -zap_buffer_no_jh: - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); -zap_buffer_unlocked: - clear_buffer_dirty(bh); - J_ASSERT_BH(bh, !buffer_jbddirty(bh)); - clear_buffer_mapped(bh); - clear_buffer_req(bh); - clear_buffer_new(bh); - bh->b_bdev = NULL; - return may_free; -} - -/** - * void journal_invalidatepage() - invalidate a journal page - * @journal: journal to use for flush - * @page: page to flush - * @offset: offset of the range to invalidate - * @length: length of the range to invalidate - * - * Reap page buffers containing data in specified range in page. - */ -void journal_invalidatepage(journal_t *journal, - struct page *page, - unsigned int offset, - unsigned int length) -{ - struct buffer_head *head, *bh, *next; - unsigned int stop = offset + length; - unsigned int curr_off = 0; - int partial_page = (offset || length < PAGE_CACHE_SIZE); - int may_free = 1; - - if (!PageLocked(page)) - BUG(); - if (!page_has_buffers(page)) - return; - - BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); - - /* We will potentially be playing with lists other than just the - * data lists (especially for journaled data mode), so be - * cautious in our locking. 
*/ - - head = bh = page_buffers(page); - do { - unsigned int next_off = curr_off + bh->b_size; - next = bh->b_this_page; - - if (next_off > stop) - return; - - if (offset <= curr_off) { - /* This block is wholly outside the truncation point */ - lock_buffer(bh); - may_free &= journal_unmap_buffer(journal, bh, - partial_page); - unlock_buffer(bh); - } - curr_off = next_off; - bh = next; - - } while (bh != head); - - if (!partial_page) { - if (may_free && try_to_free_buffers(page)) - J_ASSERT(!page_has_buffers(page)); - } -} - -/* - * File a buffer on the given transaction list. - */ -void __journal_file_buffer(struct journal_head *jh, - transaction_t *transaction, int jlist) -{ - struct journal_head **list = NULL; - int was_dirty = 0; - struct buffer_head *bh = jh2bh(jh); - - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); - assert_spin_locked(&transaction->t_journal->j_list_lock); - - J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); - J_ASSERT_JH(jh, jh->b_transaction == transaction || - jh->b_transaction == NULL); - - if (jh->b_transaction && jh->b_jlist == jlist) - return; - - if (jlist == BJ_Metadata || jlist == BJ_Reserved || - jlist == BJ_Shadow || jlist == BJ_Forget) { - /* - * For metadata buffers, we track dirty bit in buffer_jbddirty - * instead of buffer_dirty. We should not see a dirty bit set - * here because we clear it in do_get_write_access but e.g. - * tune2fs can modify the sb and set the dirty bit at any time - * so we try to gracefully handle that. - */ - if (buffer_dirty(bh)) - warn_dirty_buffer(bh); - if (test_clear_buffer_dirty(bh) || - test_clear_buffer_jbddirty(bh)) - was_dirty = 1; - } - - if (jh->b_transaction) - __journal_temp_unlink_buffer(jh); - else - journal_grab_journal_head(bh); - jh->b_transaction = transaction; - - switch (jlist) { - case BJ_None: - J_ASSERT_JH(jh, !jh->b_committed_data); - J_ASSERT_JH(jh, !jh->b_frozen_data); - return; - case BJ_SyncData: - list = &transaction->t_sync_datalist; - break; - case BJ_Metadata: - transaction->t_nr_buffers++; - list = &transaction->t_buffers; - break; - case BJ_Forget: - list = &transaction->t_forget; - break; - case BJ_IO: - list = &transaction->t_iobuf_list; - break; - case BJ_Shadow: - list = &transaction->t_shadow_list; - break; - case BJ_LogCtl: - list = &transaction->t_log_list; - break; - case BJ_Reserved: - list = &transaction->t_reserved_list; - break; - case BJ_Locked: - list = &transaction->t_locked_list; - break; - } - - __blist_add_buffer(list, jh); - jh->b_jlist = jlist; - - if (was_dirty) - set_buffer_jbddirty(bh); -} - -void journal_file_buffer(struct journal_head *jh, - transaction_t *transaction, int jlist) -{ - jbd_lock_bh_state(jh2bh(jh)); - spin_lock(&transaction->t_journal->j_list_lock); - __journal_file_buffer(jh, transaction, jlist); - spin_unlock(&transaction->t_journal->j_list_lock); - jbd_unlock_bh_state(jh2bh(jh)); -} - -/* - * Remove a buffer from its current buffer list in preparation for - * dropping it from its current transaction entirely. If the buffer has - * already started to be used by a subsequent transaction, refile the - * buffer on that transaction's metadata list. 
- * - * Called under j_list_lock - * Called under jbd_lock_bh_state(jh2bh(jh)) - * - * jh and bh may be already free when this function returns - */ -void __journal_refile_buffer(struct journal_head *jh) -{ - int was_dirty, jlist; - struct buffer_head *bh = jh2bh(jh); - - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); - if (jh->b_transaction) - assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); - - /* If the buffer is now unused, just drop it. */ - if (jh->b_next_transaction == NULL) { - __journal_unfile_buffer(jh); - return; - } - - /* - * It has been modified by a later transaction: add it to the new - * transaction's metadata list. - */ - - was_dirty = test_clear_buffer_jbddirty(bh); - __journal_temp_unlink_buffer(jh); - /* - * We set b_transaction here because b_next_transaction will inherit - * our jh reference and thus __journal_file_buffer() must not take a - * new one. - */ - jh->b_transaction = jh->b_next_transaction; - jh->b_next_transaction = NULL; - if (buffer_freed(bh)) - jlist = BJ_Forget; - else if (jh->b_modified) - jlist = BJ_Metadata; - else - jlist = BJ_Reserved; - __journal_file_buffer(jh, jh->b_transaction, jlist); - J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); - - if (was_dirty) - set_buffer_jbddirty(bh); -} - -/* - * __journal_refile_buffer() with necessary locking added. We take our bh - * reference so that we can safely unlock bh. - * - * The jh and bh may be freed by this call. - */ -void journal_refile_buffer(journal_t *journal, struct journal_head *jh) -{ - struct buffer_head *bh = jh2bh(jh); - - /* Get reference so that buffer cannot be freed before we unlock it */ - get_bh(bh); - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - __journal_refile_buffer(jh); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_list_lock); - __brelse(bh); -} diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 4227dc4f7437..8c44654ce274 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -417,12 +417,12 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * journal_clean_one_cp_list * * Find all the written-back checkpoint buffers in the given list and - * release them. + * release them. If 'destroy' is set, clean all buffers unconditionally. * * Called with j_list_lock held. * Returns 1 if we freed the transaction, 0 otherwise. */ -static int journal_clean_one_cp_list(struct journal_head *jh) +static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) { struct journal_head *last_jh; struct journal_head *next_jh = jh; @@ -436,7 +436,10 @@ static int journal_clean_one_cp_list(struct journal_head *jh) do { jh = next_jh; next_jh = jh->b_cpnext; - ret = __try_to_free_cp_buf(jh); + if (!destroy) + ret = __try_to_free_cp_buf(jh); + else + ret = __jbd2_journal_remove_checkpoint(jh) + 1; if (!ret) return freed; if (ret == 2) @@ -459,10 +462,11 @@ static int journal_clean_one_cp_list(struct journal_head *jh) * journal_clean_checkpoint_list * * Find all the written-back checkpoint buffers in the journal and release them. + * If 'destroy' is set, release all buffers unconditionally. * * Called with j_list_lock held. 
*/ -void __jbd2_journal_clean_checkpoint_list(journal_t *journal) +void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy) { transaction_t *transaction, *last_transaction, *next_transaction; int ret; @@ -476,7 +480,8 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal) do { transaction = next_transaction; next_transaction = transaction->t_cpnext; - ret = journal_clean_one_cp_list(transaction->t_checkpoint_list); + ret = journal_clean_one_cp_list(transaction->t_checkpoint_list, + destroy); /* * This function only frees up some memory if possible so we * dont have an obligation to finish processing. Bail out if @@ -492,7 +497,7 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal) * we can possibly see not yet submitted buffers on io_list */ ret = journal_clean_one_cp_list(transaction-> - t_checkpoint_io_list); + t_checkpoint_io_list, destroy); if (need_resched()) return; /* @@ -506,6 +511,28 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal) } /* + * Remove buffers from all checkpoint lists as journal is aborted and we just + * need to free memory + */ +void jbd2_journal_destroy_checkpoint(journal_t *journal) +{ + /* + * We loop because __jbd2_journal_clean_checkpoint_list() may abort + * early due to a need of rescheduling. + */ + while (1) { + spin_lock(&journal->j_list_lock); + if (!journal->j_checkpoint_transactions) { + spin_unlock(&journal->j_list_lock); + break; + } + __jbd2_journal_clean_checkpoint_list(journal, true); + spin_unlock(&journal->j_list_lock); + cond_resched(); + } +} + +/* * journal_remove_checkpoint: called after a buffer has been committed * to disk (either by being write-back flushed to disk, or being * committed to the log). diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index b73e0215baa7..362e5f614450 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -510,7 +510,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) * frees some memory */ spin_lock(&journal->j_list_lock); - __jbd2_journal_clean_checkpoint_list(journal); + __jbd2_journal_clean_checkpoint_list(journal, false); spin_unlock(&journal->j_list_lock); jbd_debug(3, "JBD2: commit phase 1\n"); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 4ff3fad4e9e3..8270fe9e3641 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1456,7 +1456,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal) sb->s_errno = cpu_to_be32(journal->j_errno); read_unlock(&journal->j_state_lock); - jbd2_write_superblock(journal, WRITE_SYNC); + jbd2_write_superblock(journal, WRITE_FUA); } EXPORT_SYMBOL(jbd2_journal_update_sb_errno); @@ -1693,8 +1693,17 @@ int jbd2_journal_destroy(journal_t *journal) while (journal->j_checkpoint_transactions != NULL) { spin_unlock(&journal->j_list_lock); mutex_lock(&journal->j_checkpoint_mutex); - jbd2_log_do_checkpoint(journal); + err = jbd2_log_do_checkpoint(journal); mutex_unlock(&journal->j_checkpoint_mutex); + /* + * If checkpointing failed, just free the buffers to avoid + * looping forever + */ + if (err) { + jbd2_journal_destroy_checkpoint(journal); + spin_lock(&journal->j_list_lock); + break; + } spin_lock(&journal->j_list_lock); } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index f3d06174b051..6b8338ec2464 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -204,6 +204,20 @@ static int add_transaction_credits(journal_t *journal, int blocks, * attach this handle to a new transaction. 
*/ atomic_sub(total, &t->t_outstanding_credits); + + /* + * Is the number of reserved credits in the current transaction too + * big to fit this handle? Wait until reserved credits are freed. + */ + if (atomic_read(&journal->j_reserved_credits) + total > + journal->j_max_transaction_buffers) { + read_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_reserved, + atomic_read(&journal->j_reserved_credits) + total <= + journal->j_max_transaction_buffers); + return 1; + } + wait_transaction_locked(journal); return 1; } @@ -262,20 +276,24 @@ static int start_this_handle(journal_t *journal, handle_t *handle, int rsv_blocks = 0; unsigned long ts = jiffies; + if (handle->h_rsv_handle) + rsv_blocks = handle->h_rsv_handle->h_buffer_credits; + /* - * 1/2 of transaction can be reserved so we can practically handle - * only 1/2 of maximum transaction size per operation + * Limit the number of reserved credits to 1/2 of maximum transaction + * size and limit the number of total credits to not exceed maximum + * transaction size per operation. */ - if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) { - printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n", - current->comm, blocks, - journal->j_max_transaction_buffers / 2); + if ((rsv_blocks > journal->j_max_transaction_buffers / 2) || + (rsv_blocks + blocks > journal->j_max_transaction_buffers)) { + printk(KERN_ERR "JBD2: %s wants too many credits " + "credits:%d rsv_credits:%d max:%d\n", + current->comm, blocks, rsv_blocks, + journal->j_max_transaction_buffers); + WARN_ON(1); return -ENOSPC; } - if (handle->h_rsv_handle) - rsv_blocks = handle->h_rsv_handle->h_buffer_credits; - alloc_transaction: if (!journal->j_running_transaction) { /* @@ -1280,8 +1298,6 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh, triggers->t_abort(triggers, jh2bh(jh)); } - - /** * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. @@ -1314,12 +1330,41 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) if (is_handle_aborted(handle)) return -EROFS; - journal = transaction->t_journal; - jh = jbd2_journal_grab_journal_head(bh); - if (!jh) { + if (!buffer_jbd(bh)) { ret = -EUCLEAN; goto out; } + /* + * We don't grab jh reference here since the buffer must be part + * of the running transaction. + */ + jh = bh2jh(bh); + /* + * This and the following assertions are unreliable since we may see jh + * in inconsistent state unless we grab bh_state lock. But this is + * crucial to catch bugs so let's do a reliable check until the + * lockless handling is fully proven. + */ + if (jh->b_transaction != transaction && + jh->b_next_transaction != transaction) { + jbd_lock_bh_state(bh); + J_ASSERT_JH(jh, jh->b_transaction == transaction || + jh->b_next_transaction == transaction); + jbd_unlock_bh_state(bh); + } + if (jh->b_modified == 1) { + /* If it's in our transaction it must be in BJ_Metadata list. 
*/ + if (jh->b_transaction == transaction && + jh->b_jlist != BJ_Metadata) { + jbd_lock_bh_state(bh); + J_ASSERT_JH(jh, jh->b_transaction != transaction || + jh->b_jlist == BJ_Metadata); + jbd_unlock_bh_state(bh); + } + goto out; + } + + journal = transaction->t_journal; jbd_debug(5, "journal_head %p\n", jh); JBUFFER_TRACE(jh, "entry"); @@ -1410,7 +1455,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) spin_unlock(&journal->j_list_lock); out_unlock_bh: jbd_unlock_bh_state(bh); - jbd2_journal_put_journal_head(jh); out: JBUFFER_TRACE(jh, "exit"); return ret; diff --git a/fs/jfs/file.c b/fs/jfs/file.c index b9dc23cd04f2..0e026a7bdcd4 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -107,8 +107,11 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr) if (rc) return rc; - if (is_quota_modification(inode, iattr)) - dquot_initialize(inode); + if (is_quota_modification(inode, iattr)) { + rc = dquot_initialize(inode); + if (rc) + return rc; + } if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { rc = dquot_transfer(inode, iattr); diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 6b0f816201a2..cf7936fe2e68 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -109,7 +109,9 @@ struct inode *ialloc(struct inode *parent, umode_t mode) /* * Allocate inode to quota. */ - dquot_initialize(inode); + rc = dquot_initialize(inode); + if (rc) + goto fail_drop; rc = dquot_alloc_inode(inode); if (rc) goto fail_drop; diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index bc462dcd7a40..a69bdf2a1085 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1999,19 +1999,16 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); bio->bi_bdev = log->bdev; - bio->bi_io_vec[0].bv_page = bp->l_page; - bio->bi_io_vec[0].bv_len = LOGPSIZE; - bio->bi_io_vec[0].bv_offset = bp->l_offset; - bio->bi_vcnt = 1; - bio->bi_iter.bi_size = LOGPSIZE; + bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); + BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); bio->bi_end_io = lbmIODone; bio->bi_private = bp; /*check if journaling to disk has been disabled*/ if (log->no_integrity) { bio->bi_iter.bi_size = 0; - lbmIODone(bio, 0); + lbmIODone(bio); } else { submit_bio(READ_SYNC, bio); } @@ -2145,12 +2142,9 @@ static void lbmStartIO(struct lbuf * bp) bio = bio_alloc(GFP_NOFS, 1); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); bio->bi_bdev = log->bdev; - bio->bi_io_vec[0].bv_page = bp->l_page; - bio->bi_io_vec[0].bv_len = LOGPSIZE; - bio->bi_io_vec[0].bv_offset = bp->l_offset; - bio->bi_vcnt = 1; - bio->bi_iter.bi_size = LOGPSIZE; + bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); + BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); bio->bi_end_io = lbmIODone; bio->bi_private = bp; @@ -2158,7 +2152,7 @@ static void lbmStartIO(struct lbuf * bp) /* check if journaling to disk has been disabled */ if (log->no_integrity) { bio->bi_iter.bi_size = 0; - lbmIODone(bio, 0); + lbmIODone(bio); } else { submit_bio(WRITE_SYNC, bio); INCREMENT(lmStat.submitted); @@ -2196,7 +2190,7 @@ static int lbmIOWait(struct lbuf * bp, int flag) * * executed at INTIODONE level */ -static void lbmIODone(struct bio *bio, int error) +static void lbmIODone(struct bio *bio) { struct lbuf *bp = bio->bi_private; struct lbuf *nextbp, *tail; @@ -2212,7 +2206,7 @@ static void lbmIODone(struct bio *bio, int error) bp->l_flag |= lbmDONE; - if 
(!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + if (bio->bi_error) { bp->l_flag |= lbmERROR; jfs_err("lbmIODone: I/O error in JFS log"); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 16a0922beb59..a3eb316b1ac3 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -276,11 +276,11 @@ static void last_read_complete(struct page *page) unlock_page(page); } -static void metapage_read_end_io(struct bio *bio, int err) +static void metapage_read_end_io(struct bio *bio) { struct page *page = bio->bi_private; - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + if (bio->bi_error) { printk(KERN_ERR "metapage_read_end_io: I/O error\n"); SetPageError(page); } @@ -331,13 +331,13 @@ static void last_write_complete(struct page *page) end_page_writeback(page); } -static void metapage_write_end_io(struct bio *bio, int err) +static void metapage_write_end_io(struct bio *bio) { struct page *page = bio->bi_private; BUG_ON(!PagePrivate(page)); - if (! test_bit(BIO_UPTODATE, &bio->bi_flags)) { + if (bio->bi_error) { printk(KERN_ERR "metapage_write_end_io: I/O error\n"); SetPageError(page); } diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index a5ac97b9a933..35976bdccafc 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -86,7 +86,9 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode, jfs_info("jfs_create: dip:0x%p name:%pd", dip, dentry); - dquot_initialize(dip); + rc = dquot_initialize(dip); + if (rc) + goto out1; /* * search parent directory for entry/freespace @@ -218,7 +220,9 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) jfs_info("jfs_mkdir: dip:0x%p name:%pd", dip, dentry); - dquot_initialize(dip); + rc = dquot_initialize(dip); + if (rc) + goto out1; /* * search parent directory for entry/freespace @@ -355,8 +359,12 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) jfs_info("jfs_rmdir: dip:0x%p name:%pd", dip, dentry); /* Init inode for quota operations. */ - dquot_initialize(dip); - dquot_initialize(ip); + rc = dquot_initialize(dip); + if (rc) + goto out; + rc = dquot_initialize(ip); + if (rc) + goto out; /* directory must be empty to be removed */ if (!dtEmpty(ip)) { @@ -483,8 +491,12 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) jfs_info("jfs_unlink: dip:0x%p name:%pd", dip, dentry); /* Init inode for quota operations. 
*/ - dquot_initialize(dip); - dquot_initialize(ip); + rc = dquot_initialize(dip); + if (rc) + goto out; + rc = dquot_initialize(ip); + if (rc) + goto out; if ((rc = get_UCSname(&dname, dentry))) goto out; @@ -799,7 +811,9 @@ static int jfs_link(struct dentry *old_dentry, jfs_info("jfs_link: %pd %pd", old_dentry, dentry); - dquot_initialize(dir); + rc = dquot_initialize(dir); + if (rc) + goto out; tid = txBegin(ip->i_sb, 0); @@ -810,7 +824,7 @@ static int jfs_link(struct dentry *old_dentry, * scan parent directory for entry/freespace */ if ((rc = get_UCSname(&dname, dentry))) - goto out; + goto out_tx; if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE))) goto free_dname; @@ -842,12 +856,13 @@ static int jfs_link(struct dentry *old_dentry, free_dname: free_UCSname(&dname); - out: + out_tx: txEnd(tid); mutex_unlock(&JFS_IP(ip)->commit_mutex); mutex_unlock(&JFS_IP(dir)->commit_mutex); + out: jfs_info("jfs_link: rc:%d", rc); return rc; } @@ -891,7 +906,9 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name); - dquot_initialize(dip); + rc = dquot_initialize(dip); + if (rc) + goto out1; ssize = strlen(name) + 1; @@ -1082,8 +1099,12 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, jfs_info("jfs_rename: %pd %pd", old_dentry, new_dentry); - dquot_initialize(old_dir); - dquot_initialize(new_dir); + rc = dquot_initialize(old_dir); + if (rc) + goto out1; + rc = dquot_initialize(new_dir); + if (rc) + goto out1; old_ip = d_inode(old_dentry); new_ip = d_inode(new_dentry); @@ -1130,7 +1151,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, } else if (new_ip) { IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL); /* Init inode for quota operations. */ - dquot_initialize(new_ip); + rc = dquot_initialize(new_ip); + if (rc) + goto out_unlock; } /* @@ -1318,6 +1341,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, clear_cflag(COMMIT_Stale, old_dir); } + out_unlock: if (new_ip && !S_ISDIR(new_ip->i_mode)) IWRITE_UNLOCK(new_ip); out3: @@ -1353,7 +1377,9 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, jfs_info("jfs_mknod: %pd", dentry); - dquot_initialize(dir); + rc = dquot_initialize(dir); + if (rc) + goto out; if ((rc = get_UCSname(&dname, dentry))) goto out; diff --git a/fs/libfs.c b/fs/libfs.c index 102edfd39000..c7cbfb092e94 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1185,7 +1185,7 @@ void make_empty_dir_inode(struct inode *inode) inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_rdev = 0; - inode->i_size = 2; + inode->i_size = 0; inode->i_blkbits = PAGE_SHIFT; inode->i_blocks = 0; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 55505cbe11af..d678bcc3cbcb 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -322,6 +322,11 @@ out_rqst: return error; } +static struct svc_serv_ops lockd_sv_ops = { + .svo_shutdown = svc_rpcb_cleanup, + .svo_enqueue_xprt = svc_xprt_do_enqueue, +}; + static struct svc_serv *lockd_create_svc(void) { struct svc_serv *serv; @@ -350,7 +355,7 @@ static struct svc_serv *lockd_create_svc(void) nlm_timeout = LOCKD_DFLT_TIMEO; nlmsvc_timeout = nlm_timeout * HZ; - serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, svc_rpcb_cleanup); + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, &lockd_sv_ops); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); return ERR_PTR(-ENOMEM); @@ -586,6 +591,7 @@ static int lockd_init_net(struct net *net) INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender); 
INIT_LIST_HEAD(&ln->lockd_manager.list); + ln->lockd_manager.block_opens = false; spin_lock_init(&ln->nsm_clnt_lock); return 0; } diff --git a/fs/locks.c b/fs/locks.c index d3d558ba4da7..2a54c800a223 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1568,6 +1568,7 @@ int fcntl_getlease(struct file *filp) * desired lease. * @dentry: dentry to check * @arg: type of lease that we're trying to acquire + * @flags: current lock flags * * Check to see if there's an existing open fd on this file that would * conflict with the lease we're trying to set. diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index 76279e11982d..a7fdbd868474 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -53,16 +53,14 @@ static int bdev_readpage(void *_sb, struct page *page) static DECLARE_WAIT_QUEUE_HEAD(wq); -static void writeseg_end_io(struct bio *bio, int err) +static void writeseg_end_io(struct bio *bio) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec; int i; struct super_block *sb = bio->bi_private; struct logfs_super *super = logfs_super(sb); - BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ - BUG_ON(err); + BUG_ON(bio->bi_error); /* FIXME: Retry io or write elsewhere */ bio_for_each_segment_all(bvec, bio, i) { end_page_writeback(bvec->bv_page); @@ -83,7 +81,7 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, unsigned int max_pages; int i; - max_pages = min(nr_pages, (size_t) bio_get_nr_vecs(super->s_bdev)); + max_pages = min(nr_pages, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOFS, max_pages); BUG_ON(!bio); @@ -153,14 +151,12 @@ static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len) } -static void erase_end_io(struct bio *bio, int err) +static void erase_end_io(struct bio *bio) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct super_block *sb = bio->bi_private; struct logfs_super *super = logfs_super(sb); - BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ - BUG_ON(err); + BUG_ON(bio->bi_error); /* FIXME: Retry io or write elsewhere */ BUG_ON(bio->bi_vcnt == 0); bio_put(bio); if (atomic_dec_and_test(&super->s_pending_writes)) @@ -175,7 +171,7 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, unsigned int max_pages; int i; - max_pages = min(nr_pages, (size_t) bio_get_nr_vecs(super->s_bdev)); + max_pages = min(nr_pages, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOFS, max_pages); BUG_ON(!bio); diff --git a/fs/mpage.c b/fs/mpage.c index ca0244b69de8..778a4ddef77a 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -42,14 +42,14 @@ * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. 
*/ -static void mpage_end_io(struct bio *bio, int err) +static void mpage_end_io(struct bio *bio) { struct bio_vec *bv; int i; bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - page_endio(page, bio_data_dir(bio), err); + page_endio(page, bio_data_dir(bio), bio->bi_error); } bio_put(bio); @@ -277,7 +277,7 @@ alloc_new: goto out; } bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), - min_t(int, nr_pages, bio_get_nr_vecs(bdev)), + min_t(int, nr_pages, BIO_MAX_PAGES), GFP_KERNEL); if (bio == NULL) goto confused; @@ -602,7 +602,7 @@ alloc_new: } } bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), - bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH); + BIO_MAX_PAGES, GFP_NOFS|__GFP_HIGH); if (bio == NULL) goto confused; diff --git a/fs/namei.c b/fs/namei.c index ae4e4c18b2ac..726d211db484 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -560,6 +560,24 @@ static int __nd_alloc_stack(struct nameidata *nd) return 0; } +/** + * path_connected - Verify that a path->dentry is below path->mnt.mnt_root + * @path: nameidate to verify + * + * Rename can sometimes move a file or directory outside of a bind + * mount, path_connected allows those cases to be detected. + */ +static bool path_connected(const struct path *path) +{ + struct vfsmount *mnt = path->mnt; + + /* Only bind mounts can have disconnected paths */ + if (mnt->mnt_root == mnt->mnt_sb->s_root) + return true; + + return is_subdir(path->dentry, mnt->mnt_root); +} + static inline int nd_alloc_stack(struct nameidata *nd) { if (likely(nd->depth != EMBEDDED_LEVELS)) @@ -879,7 +897,7 @@ static inline int may_follow_link(struct nameidata *nd) return 0; /* Allowed if parent directory not sticky and world-writable. */ - parent = nd->path.dentry->d_inode; + parent = nd->inode; if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) return 0; @@ -1296,6 +1314,8 @@ static int follow_dotdot_rcu(struct nameidata *nd) return -ECHILD; nd->path.dentry = parent; nd->seq = seq; + if (unlikely(!path_connected(&nd->path))) + return -ENOENT; break; } else { struct mount *mnt = real_mount(nd->path.mnt); @@ -1396,7 +1416,7 @@ static void follow_mount(struct path *path) } } -static void follow_dotdot(struct nameidata *nd) +static int follow_dotdot(struct nameidata *nd) { if (!nd->root.mnt) set_root(nd); @@ -1412,6 +1432,8 @@ static void follow_dotdot(struct nameidata *nd) /* rare case of legitimate dget_parent()... 
*/ nd->path.dentry = dget_parent(nd->path.dentry); dput(old); + if (unlikely(!path_connected(&nd->path))) + return -ENOENT; break; } if (!follow_up(&nd->path)) @@ -1419,6 +1441,7 @@ static void follow_dotdot(struct nameidata *nd) } follow_mount(&nd->path); nd->inode = nd->path.dentry->d_inode; + return 0; } /* @@ -1634,7 +1657,7 @@ static inline int handle_dots(struct nameidata *nd, int type) if (nd->flags & LOOKUP_RCU) { return follow_dotdot_rcu(nd); } else - follow_dotdot(nd); + return follow_dotdot(nd); } return 0; } @@ -1954,8 +1977,13 @@ OK: continue; } } - if (unlikely(!d_can_lookup(nd->path.dentry))) + if (unlikely(!d_can_lookup(nd->path.dentry))) { + if (nd->flags & LOOKUP_RCU) { + if (unlazy_walk(nd, NULL, 0)) + return -ECHILD; + } return -ENOTDIR; + } } } @@ -2410,7 +2438,7 @@ done: /** * path_mountpoint - look up a path to be umounted - * @nameidata: lookup context + * @nd: lookup context * @flags: lookup flags * @path: pointer to container for result * diff --git a/fs/namespace.c b/fs/namespace.c index 2b8aa15fd6df..0570729c87fd 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3218,6 +3218,8 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) down_read(&namespace_sem); list_for_each_entry(mnt, &ns->list, mnt_list) { struct mount *child; + int mnt_flags; + if (mnt->mnt.mnt_sb->s_type != type) continue; @@ -3227,17 +3229,30 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) continue; + /* Read the mount flags and filter out flags that + * may safely be ignored. + */ + mnt_flags = mnt->mnt.mnt_flags; + if (mnt->mnt.mnt_sb->s_iflags & SB_I_NOEXEC) + mnt_flags &= ~(MNT_LOCK_NOSUID | MNT_LOCK_NOEXEC); + /* Verify the mount flags are equal to or more permissive * than the proposed new mount. */ - if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) && + if ((mnt_flags & MNT_LOCK_READONLY) && !(new_flags & MNT_READONLY)) continue; - if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && + if ((mnt_flags & MNT_LOCK_NODEV) && !(new_flags & MNT_NODEV)) continue; - if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) && - ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK))) + if ((mnt_flags & MNT_LOCK_NOSUID) && + !(new_flags & MNT_NOSUID)) + continue; + if ((mnt_flags & MNT_LOCK_NOEXEC) && + !(new_flags & MNT_NOEXEC)) + continue; + if ((mnt_flags & MNT_LOCK_ATIME) && + ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK))) continue; /* This mount is not fully visible if there are any @@ -3247,16 +3262,18 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { struct inode *inode = child->mnt_mountpoint->d_inode; /* Only worry about locked mounts */ - if (!(mnt->mnt.mnt_flags & MNT_LOCKED)) + if (!(mnt_flags & MNT_LOCKED)) continue; /* Is the directory permanetly empty? 
*/ if (!is_empty_dir_inode(inode)) goto next; } /* Preserve the locked attributes */ - *new_mnt_flags |= mnt->mnt.mnt_flags & (MNT_LOCK_READONLY | \ - MNT_LOCK_NODEV | \ - MNT_LOCK_ATIME); + *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \ + MNT_LOCK_NODEV | \ + MNT_LOCK_NOSUID | \ + MNT_LOCK_NOEXEC | \ + MNT_LOCK_ATIME); visible = true; goto found; next: ; diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index d2554fe140a3..9cd4eb3a1e22 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -116,7 +116,7 @@ bl_submit_bio(int rw, struct bio *bio) static struct bio * bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, - void (*end_io)(struct bio *, int err), struct parallel_io *par) + bio_end_io_t end_io, struct parallel_io *par) { struct bio *bio; @@ -139,8 +139,7 @@ bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, static struct bio * do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, struct page *page, struct pnfs_block_dev_map *map, - struct pnfs_block_extent *be, - void (*end_io)(struct bio *, int err), + struct pnfs_block_extent *be, bio_end_io_t end_io, struct parallel_io *par, unsigned int offset, int *len) { struct pnfs_block_dev *dev = @@ -183,11 +182,11 @@ retry: return bio; } -static void bl_end_io_read(struct bio *bio, int err) +static void bl_end_io_read(struct bio *bio) { struct parallel_io *par = bio->bi_private; - if (err) { + if (bio->bi_error) { struct nfs_pgio_header *header = par->data; if (!header->pnfs_error) @@ -316,13 +315,12 @@ out: return PNFS_ATTEMPTED; } -static void bl_end_io_write(struct bio *bio, int err) +static void bl_end_io_write(struct bio *bio) { struct parallel_io *par = bio->bi_private; - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct nfs_pgio_header *header = par->data; - if (!uptodate) { + if (bio->bi_error) { if (!header->pnfs_error) header->pnfs_error = -EIO; pnfs_set_lo_fail(header->lseg); diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 92dca9e90d8d..c556640dcf3b 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -46,13 +46,6 @@ struct pnfs_block_dev; -enum pnfs_block_volume_type { - PNFS_BLOCK_VOLUME_SIMPLE = 0, - PNFS_BLOCK_VOLUME_SLICE = 1, - PNFS_BLOCK_VOLUME_CONCAT = 2, - PNFS_BLOCK_VOLUME_STRIPE = 3, -}; - #define PNFS_BLOCK_MAX_UUIDS 4 #define PNFS_BLOCK_MAX_DEVICES 64 @@ -117,13 +110,6 @@ struct pnfs_block_dev { struct pnfs_block_dev_map *map); }; -enum exstate4 { - PNFS_BLOCK_READWRITE_DATA = 0, - PNFS_BLOCK_READ_DATA = 1, - PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ - PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ -}; - /* sector_t fields are all in 512-byte sectors */ struct pnfs_block_extent { union { @@ -134,15 +120,12 @@ struct pnfs_block_extent { sector_t be_f_offset; /* the starting offset in the file */ sector_t be_length; /* the size of the extent */ sector_t be_v_offset; /* the starting offset in the volume */ - enum exstate4 be_state; /* the state of this extent */ + enum pnfs_block_extent_state be_state; /* the state of this extent */ #define EXTENT_WRITTEN 1 #define EXTENT_COMMITTING 2 unsigned int be_tag; }; -/* on the wire size of the extent */ -#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE) - struct pnfs_block_layout { struct pnfs_layout_hdr bl_layout; struct rb_root bl_ext_rw; diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 
e535599a0719..a861bbdfe577 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -22,7 +22,7 @@ bl_free_device(struct pnfs_block_dev *dev) kfree(dev->children); } else { if (dev->bdev) - blkdev_put(dev->bdev, FMODE_READ); + blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE); } } @@ -65,6 +65,11 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) return -EIO; p = xdr_decode_hyper(p, &b->simple.sigs[i].offset); b->simple.sigs[i].sig_len = be32_to_cpup(p++); + if (b->simple.sigs[i].sig_len > PNFS_BLOCK_UUID_LEN) { + pr_info("signature too long: %d\n", + b->simple.sigs[i].sig_len); + return -EIO; + } p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len); if (!p) @@ -195,7 +200,7 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, if (!dev) return -EIO; - d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); + d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL); if (IS_ERR(d->bdev)) { printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev)); diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index 31d0b5e53dfd..c59a59c37f3d 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -462,6 +462,12 @@ out: return err; } +static size_t ext_tree_layoutupdate_size(size_t count) +{ + return sizeof(__be32) /* number of entries */ + + PNFS_BLOCK_EXTENT_SIZE * count; +} + static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg, size_t buffer_size) { @@ -489,7 +495,7 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, continue; (*count)++; - if (*count * BL_EXTENT_SIZE > buffer_size) { + if (ext_tree_layoutupdate_size(*count) > buffer_size) { /* keep counting.. 
*/ ret = -ENOSPC; continue; @@ -530,7 +536,7 @@ retry: if (unlikely(ret)) { ext_tree_free_commitdata(arg, buffer_size); - buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count; + buffer_size = ext_tree_layoutupdate_size(count); count = 0; arg->layoutupdate_pages = @@ -549,17 +555,14 @@ retry: } *start_p = cpu_to_be32(count); - arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count; + arg->layoutupdate_len = ext_tree_layoutupdate_size(count); if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) { - __be32 *p = start_p; + void *p = start_p, *end = p + arg->layoutupdate_len; int i = 0; - for (p = start_p; - p < start_p + arg->layoutupdate_len; - p += PAGE_SIZE) { + for ( ; p < end; p += PAGE_SIZE) arg->layoutupdate_pages[i++] = vmalloc_to_page(p); - } } dprintk("%s found %zu ranges\n", __func__, count); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 682529c00996..75f7c0a7538a 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -162,10 +162,6 @@ nfs41_callback_up(struct svc_serv *serv) spin_lock_init(&serv->sv_cb_lock); init_waitqueue_head(&serv->sv_cb_waitq); rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); - if (IS_ERR(rqstp)) { - svc_xprt_put(serv->sv_bc_xprt); - serv->sv_bc_xprt = NULL; - } dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp)); return rqstp; } @@ -308,6 +304,10 @@ err_bind: return ret; } +static struct svc_serv_ops nfs_cb_sv_ops = { + .svo_enqueue_xprt = svc_xprt_do_enqueue, +}; + static struct svc_serv *nfs_callback_create_svc(int minorversion) { struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; @@ -333,7 +333,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n", cb_info->users); - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, &nfs_cb_sv_ops); if (!serv) { printk(KERN_ERR "nfs_callback_create_svc: create service failed\n"); return ERR_PTR(-ENOMEM); diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 29e3c1b011b7..b85cf7a30232 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -40,8 +40,11 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); inode = nfs_delegation_find_inode(cps->clp, &args->fh); - if (inode == NULL) + if (inode == NULL) { + trace_nfs4_cb_getattr(cps->clp, &args->fh, NULL, + -ntohl(res->status)); goto out; + } nfsi = NFS_I(inode); rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); @@ -60,6 +63,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, res->status = 0; out_iput: rcu_read_unlock(); + trace_nfs4_cb_getattr(cps->clp, &args->fh, inode, -ntohl(res->status)); iput(inode); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status)); @@ -194,6 +198,7 @@ unlock: spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me_list); pnfs_put_layout_hdr(lo); + trace_nfs4_cb_layoutrecall_inode(clp, &args->cbl_fh, ino, -rv); iput(ino); out: return rv; @@ -554,7 +559,7 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy, status = htonl(NFS4_OK); nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid); - nfs41_server_notify_target_slotid_update(cps->clp); + nfs41_notify_server(cps->clp); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 
ecebb406cc1a..57c5a02f6213 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -20,6 +20,7 @@ #include <linux/stat.h> #include <linux/errno.h> #include <linux/unistd.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/metrics.h> @@ -285,116 +286,6 @@ void nfs_put_client(struct nfs_client *clp) } EXPORT_SYMBOL_GPL(nfs_put_client); -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -/* - * Test if two ip6 socket addresses refer to the same socket by - * comparing relevant fields. The padding bytes specifically, are not - * compared. sin6_flowinfo is not compared because it only affects QoS - * and sin6_scope_id is only compared if the address is "link local" - * because "link local" addresses need only be unique to a specific - * link. Conversely, ordinary unicast addresses might have different - * sin6_scope_id. - * - * The caller should ensure both socket addresses are AF_INET6. - */ -static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; - const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; - - if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr)) - return 0; - else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL) - return sin1->sin6_scope_id == sin2->sin6_scope_id; - - return 1; -} -#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ -static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - return 0; -} -#endif - -/* - * Test if two ip4 socket addresses refer to the same socket, by - * comparing relevant fields. The padding bytes specifically, are - * not compared. - * - * The caller should ensure both socket addresses are AF_INET. - */ -static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1; - const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2; - - return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; -} - -static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; - const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; - - return nfs_sockaddr_match_ipaddr6(sa1, sa2) && - (sin1->sin6_port == sin2->sin6_port); -} - -static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1; - const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2; - - return nfs_sockaddr_match_ipaddr4(sa1, sa2) && - (sin1->sin_port == sin2->sin_port); -} - -#if defined(CONFIG_NFS_V4_1) -/* - * Test if two socket addresses represent the same actual socket, - * by comparing (only) relevant fields, excluding the port number. - */ -int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - if (sa1->sa_family != sa2->sa_family) - return 0; - - switch (sa1->sa_family) { - case AF_INET: - return nfs_sockaddr_match_ipaddr4(sa1, sa2); - case AF_INET6: - return nfs_sockaddr_match_ipaddr6(sa1, sa2); - } - return 0; -} -EXPORT_SYMBOL_GPL(nfs_sockaddr_match_ipaddr); -#endif /* CONFIG_NFS_V4_1 */ - -/* - * Test if two socket addresses represent the same actual socket, - * by comparing (only) relevant fields, including the port number. 
- */ -static int nfs_sockaddr_cmp(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - if (sa1->sa_family != sa2->sa_family) - return 0; - - switch (sa1->sa_family) { - case AF_INET: - return nfs_sockaddr_cmp_ip4(sa1, sa2); - case AF_INET6: - return nfs_sockaddr_cmp_ip6(sa1, sa2); - } - return 0; -} - /* * Find an nfs_client on the list that matches the initialisation data * that is supplied. @@ -421,7 +312,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat if (clp->cl_minorversion != data->minorversion) continue; /* Match the full socket address */ - if (!nfs_sockaddr_cmp(sap, clap)) + if (!rpc_cmp_addr_port(sap, clap)) continue; atomic_inc(&clp->cl_count); @@ -775,7 +666,7 @@ static int nfs_init_server(struct nfs_server *server, server->options = data->options; server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID| NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP| - NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME|NFS_CAP_CHANGE_ATTR; + NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME; if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 029d688a969f..2714ef835bdd 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -175,7 +175,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, if (delegation->inode != NULL) { nfs4_stateid_copy(&delegation->stateid, &res->delegation); delegation->type = res->delegation_type; - delegation->maxsize = res->maxsize; + delegation->pagemod_limit = res->pagemod_limit; oldcred = delegation->cred; delegation->cred = get_rpccred(cred); clear_bit(NFS_DELEGATION_NEED_RECLAIM, @@ -337,7 +337,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct return -ENOMEM; nfs4_stateid_copy(&delegation->stateid, &res->delegation); delegation->type = res->delegation_type; - delegation->maxsize = res->maxsize; + delegation->pagemod_limit = res->pagemod_limit; delegation->change_attr = inode->i_version; delegation->cred = get_rpccred(cred); delegation->inode = inode; @@ -900,3 +900,28 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, rcu_read_unlock(); return ret; } + +/** + * nfs4_delegation_flush_on_close - Check if we must flush file on close + * @inode: inode to check + * + * This function checks the number of outstanding writes to the file + * against the delegation 'space_limit' field to see if + * the spec requires us to flush the file on close. 
+ */ +bool nfs4_delegation_flush_on_close(const struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_delegation *delegation; + bool ret = true; + + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation == NULL || !(delegation->type & FMODE_WRITE)) + goto out; + if (nfsi->nrequests < delegation->pagemod_limit) + ret = false; +out: + rcu_read_unlock(); + return ret; +} diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index e3c20a3ccc93..a44829173e57 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -18,7 +18,7 @@ struct nfs_delegation { struct inode *inode; nfs4_stateid stateid; fmode_t type; - loff_t maxsize; + unsigned long pagemod_limit; __u64 change_attr; unsigned long flags; spinlock_t lock; @@ -61,6 +61,7 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); int nfs4_have_delegation(struct inode *inode, fmode_t flags); int nfs4_check_delegation(struct inode *inode, fmode_t flags); +bool nfs4_delegation_flush_on_close(const struct inode *inode); #endif diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 547308a5ec6f..3d8e4ffa0a33 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -583,26 +583,19 @@ out_nopages: } static -void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages) +void nfs_readdir_free_pages(struct page **pages, unsigned int npages) { unsigned int i; for (i = 0; i < npages; i++) put_page(pages[i]); } -static -void nfs_readdir_free_large_page(void *ptr, struct page **pages, - unsigned int npages) -{ - nfs_readdir_free_pagearray(pages, npages); -} - /* * nfs_readdir_large_page will allocate pages that must be freed with a call - * to nfs_readdir_free_large_page + * to nfs_readdir_free_pagearray */ static -int nfs_readdir_large_page(struct page **pages, unsigned int npages) +int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages) { unsigned int i; @@ -615,7 +608,7 @@ int nfs_readdir_large_page(struct page **pages, unsigned int npages) return 0; out_freepages: - nfs_readdir_free_pagearray(pages, i); + nfs_readdir_free_pages(pages, i); return -ENOMEM; } @@ -623,7 +616,6 @@ static int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) { struct page *pages[NFS_MAX_READDIR_PAGES]; - void *pages_ptr = NULL; struct nfs_entry entry; struct file *file = desc->file; struct nfs_cache_array *array; @@ -653,7 +645,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, memset(array, 0, sizeof(struct nfs_cache_array)); array->eof_index = -1; - status = nfs_readdir_large_page(pages, array_size); + status = nfs_readdir_alloc_pages(pages, array_size); if (status < 0) goto out_release_array; do { @@ -671,7 +663,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, } } while (array->eof_index < 0); - nfs_readdir_free_large_page(pages_ptr, pages, array_size); + nfs_readdir_free_pages(pages, array_size); out_release_array: nfs_readdir_release_array(page); out_label_free: diff --git a/fs/nfs/file.c b/fs/nfs/file.c index cc4fa1ed61fc..c0f9b1ed12b9 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -82,7 +82,8 @@ nfs_file_release(struct inode *inode, struct file *filp) dprintk("NFS: release(%pD2)\n", filp); nfs_inc_stats(inode, NFSIOS_VFSRELEASE); - return nfs_release(inode, filp); + nfs_file_clear_open_context(filp); + return 0; } EXPORT_SYMBOL_GPL(nfs_file_release); @@ -141,7 +142,7 @@ 
EXPORT_SYMBOL_GPL(nfs_file_llseek); /* * Flush all dirty pages, and check for write errors. */ -int +static int nfs_file_flush(struct file *file, fl_owner_t id) { struct inode *inode = file_inode(file); @@ -152,17 +153,9 @@ nfs_file_flush(struct file *file, fl_owner_t id) if ((file->f_mode & FMODE_WRITE) == 0) return 0; - /* - * If we're holding a write delegation, then just start the i/o - * but don't wait for completion (or send a commit). - */ - if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) - return filemap_fdatawrite(file->f_mapping); - /* Flush writes to the server and return any errors */ return vfs_fsync(file, 0); } -EXPORT_SYMBOL_GPL(nfs_file_flush); ssize_t nfs_file_read(struct kiocb *iocb, struct iov_iter *to) @@ -644,12 +637,10 @@ static const struct vm_operations_struct nfs_file_vm_ops = { .page_mkwrite = nfs_vm_page_mkwrite, }; -static int nfs_need_sync_write(struct file *filp, struct inode *inode) +static int nfs_need_check_write(struct file *filp, struct inode *inode) { struct nfs_open_context *ctx; - if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC)) - return 1; ctx = nfs_file_open_context(filp); if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) || nfs_ctx_key_to_expire(ctx)) @@ -699,8 +690,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) if (result > 0) written = result; - /* Return error values for O_DSYNC and IS_SYNC() */ - if (result >= 0 && nfs_need_sync_write(file, inode)) { + /* Return error values */ + if (result >= 0 && nfs_need_check_write(file, inode)) { int err = vfs_fsync(file, 0); if (err < 0) result = err; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index c12951b9551e..fbc5a56de875 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -34,6 +34,7 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) ffl = kzalloc(sizeof(*ffl), gfp_flags); if (ffl) { INIT_LIST_HEAD(&ffl->error_list); + INIT_LIST_HEAD(&ffl->mirrors); return &ffl->generic_hdr; } else return NULL; @@ -135,6 +136,95 @@ decode_name(struct xdr_stream *xdr, u32 *id) return 0; } +static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, + const struct nfs4_ff_layout_mirror *m2) +{ + int i, j; + + if (m1->fh_versions_cnt != m2->fh_versions_cnt) + return false; + for (i = 0; i < m1->fh_versions_cnt; i++) { + bool found_fh = false; + for (j = 0; j < m2->fh_versions_cnt; i++) { + if (nfs_compare_fh(&m1->fh_versions[i], + &m2->fh_versions[j]) == 0) { + found_fh = true; + break; + } + } + if (!found_fh) + return false; + } + return true; +} + +static struct nfs4_ff_layout_mirror * +ff_layout_add_mirror(struct pnfs_layout_hdr *lo, + struct nfs4_ff_layout_mirror *mirror) +{ + struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo); + struct nfs4_ff_layout_mirror *pos; + struct inode *inode = lo->plh_inode; + + spin_lock(&inode->i_lock); + list_for_each_entry(pos, &ff_layout->mirrors, mirrors) { + if (mirror->mirror_ds != pos->mirror_ds) + continue; + if (!ff_mirror_match_fh(mirror, pos)) + continue; + if (atomic_inc_not_zero(&pos->ref)) { + spin_unlock(&inode->i_lock); + return pos; + } + } + list_add(&mirror->mirrors, &ff_layout->mirrors); + mirror->layout = lo; + spin_unlock(&inode->i_lock); + return mirror; +} + +static void +ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror) +{ + struct inode *inode; + if (mirror->layout == NULL) + return; + inode = mirror->layout->plh_inode; + spin_lock(&inode->i_lock); + list_del(&mirror->mirrors); 
+ spin_unlock(&inode->i_lock); + mirror->layout = NULL; +} + +static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags) +{ + struct nfs4_ff_layout_mirror *mirror; + + mirror = kzalloc(sizeof(*mirror), gfp_flags); + if (mirror != NULL) { + spin_lock_init(&mirror->lock); + atomic_set(&mirror->ref, 1); + INIT_LIST_HEAD(&mirror->mirrors); + } + return mirror; +} + +static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror) +{ + ff_layout_remove_mirror(mirror); + kfree(mirror->fh_versions); + if (mirror->cred) + put_rpccred(mirror->cred); + nfs4_ff_layout_put_deviceid(mirror->mirror_ds); + kfree(mirror); +} + +static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror) +{ + if (mirror != NULL && atomic_dec_and_test(&mirror->ref)) + ff_layout_free_mirror(mirror); +} + static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls) { int i; @@ -144,11 +234,7 @@ static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls) /* normally mirror_ds is freed in * .free_deviceid_node but we still do it here * for .alloc_lseg error path */ - if (fls->mirror_array[i]) { - kfree(fls->mirror_array[i]->fh_versions); - nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds); - kfree(fls->mirror_array[i]); - } + ff_layout_put_mirror(fls->mirror_array[i]); } kfree(fls->mirror_array); fls->mirror_array = NULL; @@ -181,6 +267,65 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls) } } +static bool +ff_lseg_range_is_after(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) +{ + u64 end1, end2; + + if (l1->iomode != l2->iomode) + return l1->iomode != IOMODE_READ; + end1 = pnfs_calc_offset_end(l1->offset, l1->length); + end2 = pnfs_calc_offset_end(l2->offset, l2->length); + if (end1 < l2->offset) + return false; + if (end2 < l1->offset) + return true; + return l2->offset <= l1->offset; +} + +static bool +ff_lseg_merge(struct pnfs_layout_segment *new, + struct pnfs_layout_segment *old) +{ + u64 new_end, old_end; + + if (new->pls_range.iomode != old->pls_range.iomode) + return false; + old_end = pnfs_calc_offset_end(old->pls_range.offset, + old->pls_range.length); + if (old_end < new->pls_range.offset) + return false; + new_end = pnfs_calc_offset_end(new->pls_range.offset, + new->pls_range.length); + if (new_end < old->pls_range.offset) + return false; + + /* Mergeable: copy info from 'old' to 'new' */ + if (new_end < old_end) + new_end = old_end; + if (new->pls_range.offset < old->pls_range.offset) + new->pls_range.offset = old->pls_range.offset; + new->pls_range.length = pnfs_calc_offset_length(new->pls_range.offset, + new_end); + if (test_bit(NFS_LSEG_ROC, &old->pls_flags)) + set_bit(NFS_LSEG_ROC, &new->pls_flags); + if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags)) + set_bit(NFS_LSEG_LAYOUTRETURN, &new->pls_flags); + return true; +} + +static void +ff_layout_add_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg, + struct list_head *free_me) +{ + pnfs_generic_layout_insert_lseg(lo, lseg, + ff_lseg_range_is_after, + ff_lseg_merge, + free_me); +} + static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls) { int i, j; @@ -246,6 +391,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, goto out_err_free; for (i = 0; i < fls->mirror_array_cnt; i++) { + struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid devid; struct nfs4_deviceid_node *idnode; u32 ds_count; @@ -262,17 +408,13 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, if (ds_count != 1) goto out_err_free; 
- fls->mirror_array[i] = - kzalloc(sizeof(struct nfs4_ff_layout_mirror), - gfp_flags); + fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags); if (fls->mirror_array[i] == NULL) { rc = -ENOMEM; goto out_err_free; } - spin_lock_init(&fls->mirror_array[i]->lock); fls->mirror_array[i]->ds_count = ds_count; - fls->mirror_array[i]->lseg = &fls->generic_hdr; /* deviceid */ rc = decode_deviceid(&stream, &devid); @@ -338,6 +480,12 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, if (rc) goto out_err_free; + mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]); + if (mirror != fls->mirror_array[i]) { + ff_layout_free_mirror(fls->mirror_array[i]); + fls->mirror_array[i] = mirror; + } + dprintk("%s: uid %d gid %d\n", __func__, fls->mirror_array[i]->uid, fls->mirror_array[i]->gid); @@ -379,21 +527,9 @@ static void ff_layout_free_lseg(struct pnfs_layout_segment *lseg) { struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); - int i; dprintk("--> %s\n", __func__); - for (i = 0; i < fls->mirror_array_cnt; i++) { - if (fls->mirror_array[i]) { - nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds); - fls->mirror_array[i]->mirror_ds = NULL; - if (fls->mirror_array[i]->cred) { - put_rpccred(fls->mirror_array[i]->cred); - fls->mirror_array[i]->cred = NULL; - } - } - } - if (lseg->pls_range.iomode == IOMODE_RW) { struct nfs4_flexfile_layout *ffl; struct inode *inode; @@ -419,48 +555,44 @@ ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls) } static void -nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer) +nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now) { /* first IO request? */ if (atomic_inc_return(&timer->n_ops) == 1) { - timer->start_time = ktime_get(); + timer->start_time = now; } } static ktime_t -nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer) +nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now) { - ktime_t start, now; + ktime_t start; if (atomic_dec_return(&timer->n_ops) < 0) WARN_ON_ONCE(1); - now = ktime_get(); start = timer->start_time; timer->start_time = now; return ktime_sub(now, start); } -static ktime_t -nfs4_ff_layout_calc_completion_time(struct rpc_task *task) -{ - return ktime_sub(ktime_get(), task->tk_start); -} - static bool nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, - struct nfs4_ff_layoutstat *layoutstat) + struct nfs4_ff_layoutstat *layoutstat, + ktime_t now) { static const ktime_t notime = {0}; - ktime_t now = ktime_get(); + s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL; - nfs4_ff_start_busy_timer(&layoutstat->busy_timer); + nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now); if (ktime_equal(mirror->start_time, notime)) mirror->start_time = now; if (ktime_equal(mirror->last_report_time, notime)) mirror->last_report_time = now; + if (layoutstats_timer != 0) + report_interval = (s64)layoutstats_timer * 1000LL; if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >= - FF_LAYOUTSTATS_REPORT_INTERVAL) { + report_interval) { mirror->last_report_time = now; return true; } @@ -482,35 +614,39 @@ static void nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat, __u64 requested, __u64 completed, - ktime_t time_completed) + ktime_t time_completed, + ktime_t time_started) { struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat; + ktime_t completion_time = ktime_sub(time_completed, time_started); ktime_t timer; iostat->ops_completed++; iostat->bytes_completed += completed; iostat->bytes_not_delivered += requested - completed; - timer = 
nfs4_ff_end_busy_timer(&layoutstat->busy_timer); + timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer, time_completed); iostat->total_busy_time = ktime_add(iostat->total_busy_time, timer); iostat->aggregate_completion_time = - ktime_add(iostat->aggregate_completion_time, time_completed); + ktime_add(iostat->aggregate_completion_time, + completion_time); } static void -nfs4_ff_layout_stat_io_start_read(struct nfs4_ff_layout_mirror *mirror, - __u64 requested) +nfs4_ff_layout_stat_io_start_read(struct inode *inode, + struct nfs4_ff_layout_mirror *mirror, + __u64 requested, ktime_t now) { bool report; spin_lock(&mirror->lock); - report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat); + report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now); nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested); spin_unlock(&mirror->lock); if (report) - pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode); + pnfs_report_layoutstat(inode, GFP_KERNEL); } static void @@ -522,23 +658,24 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task, spin_lock(&mirror->lock); nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat, requested, completed, - nfs4_ff_layout_calc_completion_time(task)); + ktime_get(), task->tk_start); spin_unlock(&mirror->lock); } static void -nfs4_ff_layout_stat_io_start_write(struct nfs4_ff_layout_mirror *mirror, - __u64 requested) +nfs4_ff_layout_stat_io_start_write(struct inode *inode, + struct nfs4_ff_layout_mirror *mirror, + __u64 requested, ktime_t now) { bool report; spin_lock(&mirror->lock); - report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat); + report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now); nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested); spin_unlock(&mirror->lock); if (report) - pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode); + pnfs_report_layoutstat(inode, GFP_NOIO); } static void @@ -553,8 +690,7 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, spin_lock(&mirror->lock); nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat, - requested, completed, - nfs4_ff_layout_calc_completion_time(task)); + requested, completed, ktime_get(), task->tk_start); spin_unlock(&mirror->lock); } @@ -728,8 +864,6 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg); /* no lseg means that pnfs is not in use, so no mirroring here */ - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; nfs_pageio_reset_write_mds(pgio); return 1; } @@ -931,18 +1065,26 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, if (task->tk_status >= 0) return 0; - if (task->tk_status != -EJUKEBOX) { + switch (task->tk_status) { + /* File access problems. Don't mark the device as unavailable */ + case -EACCES: + case -ESTALE: + case -EISDIR: + case -EBADHANDLE: + case -ELOOP: + case -ENOSPC: + break; + case -EJUKEBOX: + nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); + goto out_retry; + default: dprintk("%s DS connection error %d\n", __func__, task->tk_status); nfs4_mark_deviceid_unavailable(devid); - if (ff_layout_has_available_ds(lseg)) - return -NFS4ERR_RESET_TO_PNFS; - else - return -NFS4ERR_RESET_TO_MDS; } - - if (task->tk_status == -EJUKEBOX) - nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); + /* FIXME: Need to prevent infinite looping here. 
*/ + return -NFS4ERR_RESET_TO_PNFS; +out_retry: task->tk_status = 0; rpc_restart_call(task); rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); @@ -972,15 +1114,41 @@ static int ff_layout_async_handle_error(struct rpc_task *task, static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, int idx, u64 offset, u64 length, - u32 status, int opnum) + u32 status, int opnum, int error) { struct nfs4_ff_layout_mirror *mirror; int err; + if (status == 0) { + switch (error) { + case -ETIMEDOUT: + case -EPFNOSUPPORT: + case -EPROTONOSUPPORT: + case -EOPNOTSUPP: + case -ECONNREFUSED: + case -ECONNRESET: + case -EHOSTDOWN: + case -EHOSTUNREACH: + case -ENETUNREACH: + case -EADDRINUSE: + case -ENOBUFS: + case -EPIPE: + case -EPERM: + status = NFS4ERR_NXIO; + break; + case -EACCES: + status = NFS4ERR_ACCESS; + break; + default: + return; + } + } + mirror = FF_LAYOUT_COMP(lseg, idx); err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), mirror, offset, length, status, opnum, GFP_NOIO); + pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg); dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status); } @@ -989,16 +1157,14 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, static int ff_layout_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { - struct inode *inode; int err; trace_nfs4_pnfs_read(hdr, task->tk_status); - if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status) - hdr->res.op_status = NFS4ERR_NXIO; - if (task->tk_status < 0 && hdr->res.op_status) + if (task->tk_status < 0) ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, hdr->args.offset, hdr->args.count, - hdr->res.op_status, OP_READ); + hdr->res.op_status, OP_READ, + task->tk_status); err = ff_layout_async_handle_error(task, hdr->args.context->state, hdr->ds_clp, hdr->lseg, hdr->pgio_mirror_idx); @@ -1010,8 +1176,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task, pnfs_read_resend_pnfs(hdr); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: - inode = hdr->lseg->pls_layout->plh_inode; - pnfs_error_mark_layout_for_return(inode, hdr->lseg); ff_layout_reset_read(hdr); return task->tk_status; case -EAGAIN: @@ -1061,9 +1225,10 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx) static int ff_layout_read_prepare_common(struct rpc_task *task, struct nfs_pgio_header *hdr) { - nfs4_ff_layout_stat_io_start_read( + nfs4_ff_layout_stat_io_start_read(hdr->inode, FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count); + hdr->args.count, + task->tk_start); if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { rpc_exit(task, -EIO); @@ -1163,32 +1328,26 @@ static void ff_layout_read_count_stats(struct rpc_task *task, void *data) static int ff_layout_write_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { - struct inode *inode; int err; trace_nfs4_pnfs_write(hdr, task->tk_status); - if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status) - hdr->res.op_status = NFS4ERR_NXIO; - if (task->tk_status < 0 && hdr->res.op_status) + if (task->tk_status < 0) ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, hdr->args.offset, hdr->args.count, - hdr->res.op_status, OP_WRITE); + hdr->res.op_status, OP_WRITE, + task->tk_status); err = ff_layout_async_handle_error(task, hdr->args.context->state, hdr->ds_clp, hdr->lseg, hdr->pgio_mirror_idx); switch (err) { case -NFS4ERR_RESET_TO_PNFS: + pnfs_set_retry_layoutget(hdr->lseg->pls_layout); + ff_layout_reset_write(hdr, true); + return 
task->tk_status; case -NFS4ERR_RESET_TO_MDS: - inode = hdr->lseg->pls_layout->plh_inode; - pnfs_error_mark_layout_for_return(inode, hdr->lseg); - if (err == -NFS4ERR_RESET_TO_PNFS) { - pnfs_set_retry_layoutget(hdr->lseg->pls_layout); - ff_layout_reset_write(hdr, true); - } else { - pnfs_clear_retry_layoutget(hdr->lseg->pls_layout); - ff_layout_reset_write(hdr, false); - } + pnfs_clear_retry_layoutget(hdr->lseg->pls_layout); + ff_layout_reset_write(hdr, false); return task->tk_status; case -EAGAIN: rpc_restart_call_prepare(task); @@ -1199,34 +1358,35 @@ static int ff_layout_write_done_cb(struct rpc_task *task, hdr->res.verf->committed == NFS_DATA_SYNC) ff_layout_set_layoutcommit(hdr); + /* zero out fattr since we don't care DS attr at all */ + hdr->fattr.valid = 0; + if (task->tk_status >= 0) + nfs_writeback_update_inode(hdr); + return 0; } static int ff_layout_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data) { - struct inode *inode; int err; trace_nfs4_pnfs_commit_ds(data, task->tk_status); - if (task->tk_status == -ETIMEDOUT && !data->res.op_status) - data->res.op_status = NFS4ERR_NXIO; - if (task->tk_status < 0 && data->res.op_status) + if (task->tk_status < 0) ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index, data->args.offset, data->args.count, - data->res.op_status, OP_COMMIT); + data->res.op_status, OP_COMMIT, + task->tk_status); err = ff_layout_async_handle_error(task, NULL, data->ds_clp, data->lseg, data->ds_commit_index); switch (err) { case -NFS4ERR_RESET_TO_PNFS: + pnfs_set_retry_layoutget(data->lseg->pls_layout); + pnfs_generic_prepare_to_resend_writes(data); + return -EAGAIN; case -NFS4ERR_RESET_TO_MDS: - inode = data->lseg->pls_layout->plh_inode; - pnfs_error_mark_layout_for_return(inode, data->lseg); - if (err == -NFS4ERR_RESET_TO_PNFS) - pnfs_set_retry_layoutget(data->lseg->pls_layout); - else - pnfs_clear_retry_layoutget(data->lseg->pls_layout); + pnfs_clear_retry_layoutget(data->lseg->pls_layout); pnfs_generic_prepare_to_resend_writes(data); return -EAGAIN; case -EAGAIN: @@ -1244,9 +1404,10 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, static int ff_layout_write_prepare_common(struct rpc_task *task, struct nfs_pgio_header *hdr) { - nfs4_ff_layout_stat_io_start_write( + nfs4_ff_layout_stat_io_start_write(hdr->inode, FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count); + hdr->args.count, + task->tk_start); if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { rpc_exit(task, -EIO); @@ -1325,9 +1486,9 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data) static void ff_layout_commit_prepare_common(struct rpc_task *task, struct nfs_commit_data *cdata) { - nfs4_ff_layout_stat_io_start_write( + nfs4_ff_layout_stat_io_start_write(cdata->inode, FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), - 0); + 0, task->tk_start); } static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data) @@ -1842,53 +2003,55 @@ ff_layout_encode_layoutstats(struct xdr_stream *xdr, *start = cpu_to_be32((xdr->p - start - 1) * 4); } -static bool +static int ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args, - struct pnfs_layout_segment *pls, - int *dev_count, int dev_limit) + struct pnfs_layout_hdr *lo, + int dev_limit) { + struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo); struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid_node *dev; struct nfs42_layoutstat_devinfo *devinfo; - int i; + int i = 0; - for (i = 0; i <= FF_LAYOUT_MIRROR_COUNT(pls); i++) 
{ - if (*dev_count >= dev_limit) + list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) { + if (i >= dev_limit) break; - mirror = FF_LAYOUT_COMP(pls, i); - if (!mirror || !mirror->mirror_ds) + if (!mirror->mirror_ds) + continue; + /* mirror refcount put in cleanup_layoutstats */ + if (!atomic_inc_not_zero(&mirror->ref)) continue; - dev = FF_LAYOUT_DEVID_NODE(pls, i); - devinfo = &args->devinfo[*dev_count]; + dev = &mirror->mirror_ds->id_node; + devinfo = &args->devinfo[i]; memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE); - devinfo->offset = pls->pls_range.offset; - devinfo->length = pls->pls_range.length; - /* well, we don't really know if IO is continuous or not! */ - devinfo->read_count = mirror->read_stat.io_stat.bytes_completed; + devinfo->offset = 0; + devinfo->length = NFS4_MAX_UINT64; + devinfo->read_count = mirror->read_stat.io_stat.ops_completed; devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed; - devinfo->write_count = mirror->write_stat.io_stat.bytes_completed; + devinfo->write_count = mirror->write_stat.io_stat.ops_completed; devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed; devinfo->layout_type = LAYOUT_FLEX_FILES; devinfo->layoutstats_encode = ff_layout_encode_layoutstats; devinfo->layout_private = mirror; - /* lseg refcount put in cleanup_layoutstats */ - pnfs_get_lseg(pls); - ++(*dev_count); + i++; } - - return *dev_count < dev_limit; + return i; } static int ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) { - struct pnfs_layout_segment *pls; + struct nfs4_flexfile_layout *ff_layout; + struct nfs4_ff_layout_mirror *mirror; int dev_count = 0; spin_lock(&args->inode->i_lock); - list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) { - dev_count += FF_LAYOUT_MIRROR_COUNT(pls); + ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout); + list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) { + if (atomic_read(&mirror->ref) != 0) + dev_count ++; } spin_unlock(&args->inode->i_lock); /* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */ @@ -1897,20 +2060,14 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) __func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV); dev_count = PNFS_LAYOUTSTATS_MAXDEV; } - args->devinfo = kmalloc(dev_count * sizeof(*args->devinfo), GFP_KERNEL); + args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_NOIO); if (!args->devinfo) return -ENOMEM; - dev_count = 0; spin_lock(&args->inode->i_lock); - list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) { - if (!ff_layout_mirror_prepare_stats(args, pls, &dev_count, - PNFS_LAYOUTSTATS_MAXDEV)) { - break; - } - } + args->num_dev = ff_layout_mirror_prepare_stats(args, + &ff_layout->generic_hdr, dev_count); spin_unlock(&args->inode->i_lock); - args->num_dev = dev_count; return 0; } @@ -1924,7 +2081,7 @@ ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data) for (i = 0; i < data->args.num_dev; i++) { mirror = data->args.devinfo[i].layout_private; data->args.devinfo[i].layout_private = NULL; - pnfs_put_lseg(mirror->lseg); + ff_layout_put_mirror(mirror); } } @@ -1936,6 +2093,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .free_layout_hdr = ff_layout_free_layout_hdr, .alloc_lseg = ff_layout_alloc_lseg, .free_lseg = ff_layout_free_lseg, + .add_lseg = ff_layout_add_lseg, .pg_read_ops = &ff_layout_pg_read_ops, .pg_write_ops = &ff_layout_pg_write_ops, .get_ds_info = ff_layout_get_ds_info, diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h 
b/fs/nfs/flexfilelayout/flexfilelayout.h index f92f9a0a856b..68cc0d9828f9 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -67,7 +67,8 @@ struct nfs4_ff_layoutstat { }; struct nfs4_ff_layout_mirror { - struct pnfs_layout_segment *lseg; /* back pointer */ + struct pnfs_layout_hdr *layout; + struct list_head mirrors; u32 ds_count; u32 efficiency; struct nfs4_ff_layout_ds *mirror_ds; @@ -77,6 +78,7 @@ struct nfs4_ff_layout_mirror { u32 uid; u32 gid; struct rpc_cred *cred; + atomic_t ref; spinlock_t lock; struct nfs4_ff_layoutstat read_stat; struct nfs4_ff_layoutstat write_stat; @@ -95,6 +97,7 @@ struct nfs4_ff_layout_segment { struct nfs4_flexfile_layout { struct pnfs_layout_hdr generic_hdr; struct pnfs_ds_commit_info commit_info; + struct list_head mirrors; struct list_head error_list; /* nfs4_ff_layout_ds_err */ }; diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index f13e1969eedd..e125e55de86d 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -172,6 +172,32 @@ out_err: return NULL; } +static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg, + struct nfs4_deviceid_node *devid) +{ + nfs4_mark_deviceid_unavailable(devid); + if (!ff_layout_has_available_ds(lseg)) + pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, + lseg); +} + +static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg, + struct nfs4_ff_layout_mirror *mirror) +{ + if (mirror == NULL || mirror->mirror_ds == NULL) { + pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, + lseg); + return false; + } + if (mirror->mirror_ds->ds == NULL) { + struct nfs4_deviceid_node *devid; + devid = &mirror->mirror_ds->id_node; + ff_layout_mark_devid_invalid(lseg, devid); + return false; + } + return true; +} + static u64 end_offset(u64 start, u64 len) { @@ -336,16 +362,10 @@ nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx) { struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx); struct nfs_fh *fh = NULL; - struct nfs4_deviceid_node *devid; - if (mirror == NULL || mirror->mirror_ds == NULL || - mirror->mirror_ds->ds == NULL) { - printk(KERN_ERR "NFS: %s: No data server for mirror offset index %d\n", + if (!ff_layout_mirror_valid(lseg, mirror)) { + pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n", __func__, mirror_idx); - if (mirror && mirror->mirror_ds) { - devid = &mirror->mirror_ds->id_node; - pnfs_generic_mark_devid_invalid(devid); - } goto out; } @@ -368,14 +388,9 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, unsigned int max_payload; rpc_authflavor_t flavor; - if (mirror == NULL || mirror->mirror_ds == NULL || - mirror->mirror_ds->ds == NULL) { - printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", + if (!ff_layout_mirror_valid(lseg, mirror)) { + pr_err_ratelimited("NFS: %s: No data server for offset index %d\n", __func__, ds_idx); - if (mirror && mirror->mirror_ds) { - devid = &mirror->mirror_ds->id_node; - pnfs_generic_mark_devid_invalid(devid); - } goto out; } @@ -500,16 +515,19 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo, range->offset, range->length)) continue; /* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE) - * + deviceid(NFS4_DEVICEID4_SIZE) + status(4) + opnum(4) + * + array length + deviceid(NFS4_DEVICEID4_SIZE) + * + status(4) + opnum(4) */ p = xdr_reserve_space(xdr, - 24 + 
NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE); + 28 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE); if (unlikely(!p)) return -ENOBUFS; p = xdr_encode_hyper(p, err->offset); p = xdr_encode_hyper(p, err->length); p = xdr_encode_opaque_fixed(p, &err->stateid, NFS4_STATEID_SIZE); + /* Encode 1 error */ + *p++ = cpu_to_be32(1); p = xdr_encode_opaque_fixed(p, &err->deviceid, NFS4_DEVICEID4_SIZE); *p++ = cpu_to_be32(err->status); @@ -525,11 +543,11 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo, return 0; } -bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) +static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg) { struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid_node *devid; - int idx; + u32 idx; for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); @@ -543,6 +561,32 @@ bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) return false; } +static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg) +{ + struct nfs4_ff_layout_mirror *mirror; + struct nfs4_deviceid_node *devid; + u32 idx; + + for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { + mirror = FF_LAYOUT_COMP(lseg, idx); + if (!mirror || !mirror->mirror_ds) + return false; + devid = &mirror->mirror_ds->id_node; + if (ff_layout_test_devid_unavailable(devid)) + return false; + } + + return FF_LAYOUT_MIRROR_COUNT(lseg) != 0; +} + +bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) +{ + if (lseg->pls_range.iomode == IOMODE_READ) + return ff_read_layout_has_available_ds(lseg); + /* Note: RW layout needs all mirrors available */ + return ff_rw_layout_has_available_ds(lseg); +} + module_param(dataserver_retrans, uint, 0644); MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client " "retries a request before it attempts further " diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b77b328a06d7..326d9e10d833 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -442,8 +442,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) inode->i_version = fattr->change_attr; - else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + else + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_PAGECACHE); if (fattr->valid & NFS_ATTR_FATTR_SIZE) inode->i_size = nfs_size_to_loff_t(fattr->size); else @@ -503,7 +504,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct nfs_fattr *fattr; - int error = -ENOMEM; + int error = 0; nfs_inc_stats(inode, NFSIOS_VFSSETATTR); @@ -512,15 +513,14 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_valid &= ~ATTR_MODE; if (attr->ia_valid & ATTR_SIZE) { - loff_t i_size; - BUG_ON(!S_ISREG(inode->i_mode)); - i_size = i_size_read(inode); - if (attr->ia_size == i_size) + error = inode_newsize_ok(inode, attr->ia_size); + if (error) + return error; + + if (attr->ia_size == i_size_read(inode)) attr->ia_valid &= ~ATTR_SIZE; - else if (attr->ia_size < i_size && IS_SWAPFILE(inode)) - return -ETXTBSY; } /* Optimization: if the end result is no change, don't RPC */ @@ -535,8 +535,11 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) nfs_sync_inode(inode); fattr = nfs_alloc_fattr(); - if (fattr == NULL) + if (fattr == NULL) { + error = -ENOMEM; goto out; + } + /* * Return any delegations if we're going to 
change ACLs */ @@ -758,11 +761,13 @@ EXPORT_SYMBOL_GPL(nfs_put_lock_context); * @ctx: pointer to context * @is_sync: is this a synchronous close * - * always ensure that the attributes are up to date if we're mounted - * with close-to-open semantics + * Ensure that the attributes are up to date if we're mounted + * with close-to-open semantics and we have cached data that will + * need to be revalidated on open. */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) { + struct nfs_inode *nfsi; struct inode *inode; struct nfs_server *server; @@ -771,7 +776,12 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) if (!is_sync) return; inode = d_inode(ctx->dentry); - if (!list_empty(&NFS_I(inode)->open_files)) + nfsi = NFS_I(inode); + if (inode->i_mapping->nrpages == 0) + return; + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + return; + if (!list_empty(&nfsi->open_files)) return; server = NFS_SERVER(inode); if (server->flags & NFS_MOUNT_NOCTO) @@ -843,6 +853,11 @@ void put_nfs_open_context(struct nfs_open_context *ctx) } EXPORT_SYMBOL_GPL(put_nfs_open_context); +static void put_nfs_open_context_sync(struct nfs_open_context *ctx) +{ + __put_nfs_open_context(ctx, 1); +} + /* * Ensure that mmap has a recent RPC credential for use when writing out * shared pages @@ -887,7 +902,7 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c return ctx; } -static void nfs_file_clear_open_context(struct file *filp) +void nfs_file_clear_open_context(struct file *filp) { struct nfs_open_context *ctx = nfs_file_open_context(filp); @@ -898,7 +913,7 @@ static void nfs_file_clear_open_context(struct file *filp) spin_lock(&inode->i_lock); list_move_tail(&ctx->list, &NFS_I(inode)->open_files); spin_unlock(&inode->i_lock); - __put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 0 : 1); + put_nfs_open_context_sync(ctx); } } @@ -918,12 +933,6 @@ int nfs_open(struct inode *inode, struct file *filp) return 0; } -int nfs_release(struct inode *inode, struct file *filp) -{ - nfs_file_clear_open_context(filp); - return 0; -} - /* * This function is called whenever some part of NFS notices that * the cached attributes have to be refreshed. @@ -1244,9 +1253,11 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if (fattr->valid & NFS_ATTR_FATTR_SIZE) { cur_size = i_size_read(inode); new_isize = nfs_size_to_loff_t(fattr->size); - if (cur_size != new_isize && nfsi->nrequests == 0) + if (cur_size != new_isize) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; } + if (nfsi->nrequests != 0) + invalid &= ~NFS_INO_REVAL_PAGECACHE; /* Have any file permissions changed? 
*/ if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) @@ -1270,13 +1281,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat return 0; } -static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr) -{ - if (!(fattr->valid & NFS_ATTR_FATTR_CTIME)) - return 0; - return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; -} - static atomic_long_t nfs_attr_generation_counter; static unsigned long nfs_read_attr_generation_counter(void) @@ -1425,7 +1429,6 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n const struct nfs_inode *nfsi = NFS_I(inode); return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 || - nfs_ctime_need_update(inode, fattr) || ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); } @@ -1488,6 +1491,13 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr { unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + /* + * Don't revalidate the pagecache if we hold a delegation, but do + * force an attribute update + */ + if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_FORCED; + if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; nfs_set_cache_invalid(inode, invalid); @@ -1684,13 +1694,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_REVAL_PAGECACHE; + | NFS_INO_INVALID_ACL; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); inode->i_version = fattr->change_attr; } - } else if (server->caps & NFS_CAP_CHANGE_ATTR) + } else nfsi->cache_validity |= save_cache_validity; if (fattr->valid & NFS_ATTR_FATTR_MTIME) { @@ -1717,7 +1726,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if ((nfsi->nrequests == 0) || new_isize > cur_isize) { i_size_write(inode, new_isize); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; - invalid &= ~NFS_INO_REVAL_PAGECACHE; } dprintk("NFS: isize change on server for file %s/%ld " "(%Ld to %Ld)\n", diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7e3c4604bea8..56cfde26fb9c 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -219,10 +219,6 @@ static inline void nfs_fs_proc_exit(void) } #endif -#ifdef CONFIG_NFS_V4_1 -int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *); -#endif - /* callback_xdr.c */ extern struct svc_version nfs4_callback_version1; extern struct svc_version nfs4_callback_version4; @@ -296,6 +292,22 @@ extern struct rpc_procinfo nfs4_procedures[]; #ifdef CONFIG_NFS_V4_SECURITY_LABEL extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags); +static inline struct nfs4_label * +nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) +{ + if (!dst || !src) + return NULL; + + if (src->len > NFS4_MAXLABELLEN) + return NULL; + + dst->lfs = src->lfs; + dst->pi = src->pi; + dst->len = src->len; + memcpy(dst->label, src->label, src->len); + + return dst; +} static inline void nfs4_label_free(struct nfs4_label *label) { if (label) { @@ -316,6 +328,11 @@ static inline void nfs4_label_free(void *label) {} static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) { } +static inline struct nfs4_label * +nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) +{ + return NULL; +} #endif /* 
CONFIG_NFS_V4_SECURITY_LABEL */ /* proc.c */ @@ -343,7 +360,6 @@ int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *) /* file.c */ int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int); loff_t nfs_file_llseek(struct file *, loff_t, int); -int nfs_file_flush(struct file *, fl_owner_t); ssize_t nfs_file_read(struct kiocb *, struct iov_iter *); ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); @@ -469,6 +485,9 @@ void nfs_retry_commit(struct list_head *page_list, void nfs_commitdata_release(struct nfs_commit_data *data); void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, struct nfs_commit_info *cinfo); +void nfs_request_add_commit_list_locked(struct nfs_page *req, + struct list_head *dst, + struct nfs_commit_info *cinfo); void nfs_request_remove_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo); void nfs_init_cinfo(struct nfs_commit_info *cinfo, @@ -602,13 +621,15 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) * Record the page as unstable and mark its inode as dirty. */ static inline -void nfs_mark_page_unstable(struct page *page) +void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo) { - struct inode *inode = page_file_mapping(page)->host; + if (!cinfo->dreq) { + struct inode *inode = page_file_mapping(page)->host; - inc_zone_page_state(page, NR_UNSTABLE_NFS); - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE); - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + inc_zone_page_state(page, NR_UNSTABLE_NFS); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE); + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + } } /* diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 9b04c2e6fffc..267126d32ec0 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1103,6 +1103,7 @@ static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req, { encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen); encode_symlinkdata3(xdr, args); + xdr->buf->flags |= XDRBUF_WRITE; } /* diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index ff66ae700b89..814c1255f1d2 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -17,7 +17,5 @@ int nfs42_proc_deallocate(struct file *, loff_t, loff_t); loff_t nfs42_proc_llseek(struct file *, loff_t, int); int nfs42_proc_layoutstats_generic(struct nfs_server *, struct nfs42_layoutstat_data *); -/* nfs4.2xdr.h */ -extern struct rpc_procinfo nfs4_2_procedures[]; #endif /* __LINUX_FS_NFS_NFS4_2_H */ diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index f486b80f927a..d731bbf974aa 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -135,7 +135,7 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) return err; } -loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) +static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) { struct inode *inode = file_inode(filep); struct nfs42_seek_args args = { @@ -171,6 +171,23 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes); } +loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) +{ + struct nfs_server *server = NFS_SERVER(file_inode(filep)); + struct nfs4_exception exception = { }; + int err; + + do { + err = _nfs42_proc_llseek(filep, offset, whence); + if (err == -ENOTSUPP) + return -EOPNOTSUPP; + err = nfs4_handle_exception(server, err, &exception); + } while 
(exception.retry); + + return err; +} + + static void nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata) { diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index a6bd27da6286..0eb29e14070d 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -238,8 +238,7 @@ out_overflow: return -EIO; } -static int decode_layoutstats(struct xdr_stream *xdr, - struct nfs42_layoutstat_res *res) +static int decode_layoutstats(struct xdr_stream *xdr) { return decode_op_hdr(xdr, OP_LAYOUTSTATS); } @@ -343,7 +342,7 @@ static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp, goto out; WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV); for (i = 0; i < res->num_dev; i++) { - status = decode_layoutstats(xdr, res); + status = decode_layoutstats(xdr); if (status) goto out; } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index ea3bee919a76..50cfc4ca7a02 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -405,9 +405,7 @@ int nfs40_discover_server_trunking(struct nfs_client *clp, int nfs41_discover_server_trunking(struct nfs_client *clp, struct nfs_client **, struct rpc_cred *); extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); -extern void nfs41_server_notify_target_slotid_update(struct nfs_client *clp); -extern void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp); - +extern void nfs41_notify_server(struct nfs_client *); #else static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) { diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 3aa6a9ba5113..223bedda64ae 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -729,10 +729,7 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr, return false; /* Match only the IP address, not the port number */ - if (!nfs_sockaddr_match_ipaddr(addr, clap)) - return false; - - return true; + return rpc_cmp_addr(addr, clap); } /* diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index dcd39d4e2efe..b0dbe0abed53 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -6,7 +6,9 @@ #include <linux/fs.h> #include <linux/falloc.h> #include <linux/nfs_fs.h> +#include "delegation.h" #include "internal.h" +#include "iostat.h" #include "fscache.h" #include "pnfs.h" @@ -27,7 +29,6 @@ nfs4_file_open(struct inode *inode, struct file *filp) struct inode *dir; unsigned openflags = filp->f_flags; struct iattr attr; - int opened = 0; int err; /* @@ -66,7 +67,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) nfs_sync_inode(inode); } - inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened); + inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL); if (IS_ERR(inode)) { err = PTR_ERR(inode); switch (err) { @@ -100,6 +101,31 @@ out_drop: goto out_put_ctx; } +/* + * Flush all dirty pages, and check for write errors. + */ +static int +nfs4_file_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file_inode(file); + + dprintk("NFS: flush(%pD2)\n", file); + + nfs_inc_stats(inode, NFSIOS_VFSFLUSH); + if ((file->f_mode & FMODE_WRITE) == 0) + return 0; + + /* + * If we're holding a write delegation, then check if we're required + * to flush the i/o on close. If not, then just start the i/o now. 
+ */ + if (!nfs4_delegation_flush_on_close(inode)) + return filemap_fdatawrite(file->f_mapping); + + /* Flush writes to the server and return any errors */ + return vfs_fsync(file, 0); +} + static int nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { @@ -178,7 +204,7 @@ const struct file_operations nfs4_file_operations = { .write_iter = nfs_file_write, .mmap = nfs_file_mmap, .open = nfs4_file_open, - .flush = nfs_file_flush, + .flush = nfs4_file_flush, .release = nfs_file_release, .fsync = nfs4_file_fsync, .lock = nfs_lock, diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 535dfc69c628..2e4902203c35 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -184,7 +184,7 @@ static struct key_type key_type_id_resolver = { .read = user_read, }; -static int nfs_idmap_init_keyring(void) +int nfs_idmap_init(void) { struct cred *cred; struct key *keyring; @@ -230,7 +230,7 @@ failed_put_cred: return ret; } -static void nfs_idmap_quit_keyring(void) +void nfs_idmap_quit(void) { key_revoke(id_resolver_cache->thread_keyring); unregister_key_type(&key_type_id_resolver); @@ -492,16 +492,6 @@ nfs_idmap_delete(struct nfs_client *clp) kfree(idmap); } -int nfs_idmap_init(void) -{ - return nfs_idmap_init_keyring(); -} - -void nfs_idmap_quit(void) -{ - nfs_idmap_quit_keyring(); -} - static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap, struct idmap_msg *im, struct rpc_pipe_msg *msg) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8bee93469617..693b903b48bd 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -467,7 +467,10 @@ static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp) static void renew_lease(const struct nfs_server *server, unsigned long timestamp) { - do_renew_lease(server->nfs_client, timestamp); + struct nfs_client *clp = server->nfs_client; + + if (!nfs4_has_session(clp)) + do_renew_lease(clp, timestamp); } struct nfs4_call_sync_data { @@ -583,7 +586,7 @@ out_unlock: spin_unlock(&tbl->slot_tbl_lock); res->sr_slot = NULL; if (send_new_highest_used_slotid) - nfs41_server_notify_highest_slotid_update(session->clp); + nfs41_notify_server(session->clp); } int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) @@ -616,8 +619,7 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) clp = session->clp; do_renew_lease(clp, res->sr_timestamp); /* Check sequence flags */ - if (res->sr_status_flags != 0) - nfs4_schedule_lease_recovery(clp); + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); nfs41_update_target_slotid(slot->table, slot, res); break; case 1: @@ -910,6 +912,7 @@ struct nfs4_opendata { struct nfs_open_confirmres c_res; struct nfs4_string owner_name; struct nfs4_string group_name; + struct nfs4_label *a_label; struct nfs_fattr f_attr; struct nfs4_label *f_label; struct dentry *dir; @@ -1013,6 +1016,10 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, if (IS_ERR(p->f_label)) goto err_free_p; + p->a_label = nfs4_label_alloc(server, gfp_mask); + if (IS_ERR(p->a_label)) + goto err_free_f; + alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid; p->o_arg.seqid = alloc_seqid(&sp->so_seqid, gfp_mask); if (IS_ERR(p->o_arg.seqid)) @@ -1041,7 +1048,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.server = server; p->o_arg.bitmask = nfs4_bitmask(server, label); p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0]; - p->o_arg.label = label; + p->o_arg.label = nfs4_label_copy(p->a_label, label); 
p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim); switch (p->o_arg.claim) { case NFS4_OPEN_CLAIM_NULL: @@ -1074,6 +1081,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, return p; err_free_label: + nfs4_label_free(p->a_label); +err_free_f: nfs4_label_free(p->f_label); err_free_p: kfree(p); @@ -1093,6 +1102,7 @@ static void nfs4_opendata_free(struct kref *kref) nfs4_put_open_state(p->state); nfs4_put_state_owner(p->owner); + nfs4_label_free(p->a_label); nfs4_label_free(p->f_label); dput(p->dir); @@ -1140,7 +1150,8 @@ out: return ret; } -static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode) +static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode, + enum open_claim_type4 claim) { if (delegation == NULL) return 0; @@ -1148,6 +1159,16 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode) return 0; if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) return 0; + switch (claim) { + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_FH: + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + if (!test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags)) + break; + default: + return 0; + } nfs_mark_delegation_referenced(delegation); return 1; } @@ -1198,15 +1219,19 @@ static bool nfs_need_update_open_stateid(struct nfs4_state *state, static void nfs_resync_open_stateid_locked(struct nfs4_state *state) { + if (!(state->n_wronly || state->n_rdonly || state->n_rdwr)) + return; if (state->n_wronly) set_bit(NFS_O_WRONLY_STATE, &state->flags); if (state->n_rdonly) set_bit(NFS_O_RDONLY_STATE, &state->flags); if (state->n_rdwr) set_bit(NFS_O_RDWR_STATE, &state->flags); + set_bit(NFS_OPEN_STATE, &state->flags); } static void nfs_clear_open_stateid_locked(struct nfs4_state *state, + nfs4_stateid *arg_stateid, nfs4_stateid *stateid, fmode_t fmode) { clear_bit(NFS_O_RDWR_STATE, &state->flags); @@ -1225,8 +1250,9 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state, if (stateid == NULL) return; /* Handle races with OPEN */ - if (!nfs4_stateid_match_other(stateid, &state->open_stateid) || - !nfs4_stateid_is_newer(stateid, &state->open_stateid)) { + if (!nfs4_stateid_match_other(arg_stateid, &state->open_stateid) || + (nfs4_stateid_match_other(stateid, &state->open_stateid) && + !nfs4_stateid_is_newer(stateid, &state->open_stateid))) { nfs_resync_open_stateid_locked(state); return; } @@ -1235,10 +1261,12 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state, nfs4_stateid_copy(&state->open_stateid, stateid); } -static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +static void nfs_clear_open_stateid(struct nfs4_state *state, + nfs4_stateid *arg_stateid, + nfs4_stateid *stateid, fmode_t fmode) { write_seqlock(&state->seqlock); - nfs_clear_open_stateid_locked(state, stateid, fmode); + nfs_clear_open_stateid_locked(state, arg_stateid, stateid, fmode); write_sequnlock(&state->seqlock); if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) nfs4_schedule_state_manager(state->owner->so_server->nfs_client); @@ -1363,6 +1391,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) struct nfs_delegation *delegation; int open_mode = opendata->o_arg.open_flags; fmode_t fmode = opendata->o_arg.fmode; + enum open_claim_type4 claim = opendata->o_arg.claim; nfs4_stateid stateid; int ret = -EAGAIN; @@ -1376,7 +1405,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) 
spin_unlock(&state->owner->so_lock); rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); - if (!can_open_delegated(delegation, fmode)) { + if (!can_open_delegated(delegation, fmode, claim)) { rcu_read_unlock(); break; } @@ -1839,6 +1868,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) struct nfs4_opendata *data = calldata; struct nfs4_state_owner *sp = data->owner; struct nfs_client *clp = sp->so_server->nfs_client; + enum open_claim_type4 claim = data->o_arg.claim; if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) goto out_wait; @@ -1853,15 +1883,15 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) goto out_no_action; rcu_read_lock(); delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); - if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR && - data->o_arg.claim != NFS4_OPEN_CLAIM_DELEG_CUR_FH && - can_open_delegated(delegation, data->o_arg.fmode)) + if (can_open_delegated(delegation, data->o_arg.fmode, claim)) goto unlock_no_action; rcu_read_unlock(); } /* Update client id. */ data->o_arg.clientid = clp->cl_clientid; - switch (data->o_arg.claim) { + switch (claim) { + default: + break; case NFS4_OPEN_CLAIM_PREVIOUS: case NFS4_OPEN_CLAIM_DELEG_CUR_FH: case NFS4_OPEN_CLAIM_DELEG_PREV_FH: @@ -2281,15 +2311,25 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st * fields corresponding to attributes that were used to store the verifier. * Make sure we clobber those fields in the later setattr call */ -static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct iattr *sattr) +static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, + struct iattr *sattr, struct nfs4_label **label) { - if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_ACCESS) && + const u32 *attrset = opendata->o_res.attrset; + + if ((attrset[1] & FATTR4_WORD1_TIME_ACCESS) && !(sattr->ia_valid & ATTR_ATIME_SET)) sattr->ia_valid |= ATTR_ATIME; - if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_MODIFY) && + if ((attrset[1] & FATTR4_WORD1_TIME_MODIFY) && !(sattr->ia_valid & ATTR_MTIME_SET)) sattr->ia_valid |= ATTR_MTIME; + + /* Except MODE, it seems harmless of setting twice. 
*/ + if ((attrset[1] & FATTR4_WORD1_MODE)) + sattr->ia_valid &= ~ATTR_MODE; + + if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL) + *label = NULL; } static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, @@ -2412,9 +2452,9 @@ static int _nfs4_do_open(struct inode *dir, goto err_free_label; state = ctx->state; - if ((opendata->o_arg.open_flags & O_EXCL) && + if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) && (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) { - nfs4_exclusive_attrset(opendata, sattr); + nfs4_exclusive_attrset(opendata, sattr, &label); nfs_fattr_init(opendata->o_res.f_attr); status = nfs4_do_setattr(state->inode, cred, @@ -2426,7 +2466,7 @@ static int _nfs4_do_open(struct inode *dir, nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); } } - if (opendata->file_created) + if (opened && opendata->file_created) *opened |= FILE_CREATED; if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) { @@ -2648,7 +2688,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) switch (task->tk_status) { case 0: res_stateid = &calldata->res.stateid; - if (calldata->arg.fmode == 0 && calldata->roc) + if (calldata->roc) pnfs_roc_set_barrier(state->inode, calldata->roc_barrier); renew_lease(server, calldata->timestamp); @@ -2671,7 +2711,8 @@ static void nfs4_close_done(struct rpc_task *task, void *data) goto out_release; } } - nfs_clear_open_stateid(state, res_stateid, calldata->arg.fmode); + nfs_clear_open_stateid(state, &calldata->arg.stateid, + res_stateid, calldata->arg.fmode); out_release: nfs_release_seqid(calldata->arg.seqid); nfs_refresh_inode(calldata->inode, calldata->res.fattr); @@ -2722,14 +2763,11 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) goto out_no_action; } - if (calldata->arg.fmode == 0) { + if (calldata->arg.fmode == 0) task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; - if (calldata->roc && - pnfs_roc_drain(inode, &calldata->roc_barrier, task)) { - nfs_release_seqid(calldata->arg.seqid); - goto out_wait; - } - } + if (calldata->roc) + pnfs_roc_get_barrier(inode, &calldata->roc_barrier); + calldata->arg.share_access = nfs4_map_atomic_open_share(NFS_SERVER(inode), calldata->arg.fmode, 0); @@ -2870,8 +2908,10 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { + u32 bitmask[3] = {}, minorversion = server->nfs_client->cl_minorversion; struct nfs4_server_caps_arg args = { .fhandle = fhandle, + .bitmask = bitmask, }; struct nfs4_server_caps_res res = {}; struct rpc_message msg = { @@ -2881,10 +2921,18 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f }; int status; + bitmask[0] = FATTR4_WORD0_SUPPORTED_ATTRS | + FATTR4_WORD0_FH_EXPIRE_TYPE | + FATTR4_WORD0_LINK_SUPPORT | + FATTR4_WORD0_SYMLINK_SUPPORT | + FATTR4_WORD0_ACLSUPPORT; + if (minorversion) + bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT; + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == 0) { /* Sanity check the server answers */ - switch (server->nfs_client->cl_minorversion) { + switch (minorversion) { case 0: res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK; res.attr_bitmask[2] = 0; @@ -2937,6 +2985,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; server->cache_consistency_bitmask[1] &= 
FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; server->cache_consistency_bitmask[2] = 0; + memcpy(server->exclcreat_bitmask, res.exclcreat_bitmask, + sizeof(server->exclcreat_bitmask)); server->acl_bitmask = res.acl_bitmask; server->fh_expire_type = res.fh_expire_type; } @@ -3539,7 +3589,6 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, struct nfs4_label l, *ilabel = NULL; struct nfs_open_context *ctx; struct nfs4_state *state; - int opened = 0; int status = 0; ctx = alloc_nfs_open_context(dentry, FMODE_READ); @@ -3549,7 +3598,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, ilabel = nfs4_label_init_security(dir, dentry, sattr, &l); sattr->ia_mode &= ~current_umask(); - state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, &opened); + state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, NULL); if (IS_ERR(state)) { status = PTR_ERR(state); goto out; @@ -4965,13 +5014,12 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp) int result; size_t len; char *str; - bool retried = false; if (clp->cl_owner_id != NULL) return 0; -retry: + rcu_read_lock(); - len = 10 + strlen(clp->cl_ipaddr) + 1 + + len = 14 + strlen(clp->cl_ipaddr) + 1 + strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) + 1 + strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)) + @@ -4997,14 +5045,6 @@ retry: rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)); rcu_read_unlock(); - /* Did something change? */ - if (result >= len) { - kfree(str); - if (retried) - return -EINVAL; - retried = true; - goto retry; - } clp->cl_owner_id = str; return 0; } @@ -5036,10 +5076,6 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp) clp->rpc_ops->version, clp->cl_minorversion, nfs4_client_id_uniquifier, clp->cl_rpcclient->cl_nodename); - if (result >= len) { - kfree(str); - return -EINVAL; - } clp->cl_owner_id = str; return 0; } @@ -5075,10 +5111,6 @@ nfs4_init_uniform_client_string(struct nfs_client *clp) result = scnprintf(str, len, "Linux NFSv%u.%u %s", clp->rpc_ops->version, clp->cl_minorversion, clp->cl_rpcclient->cl_nodename); - if (result >= len) { - kfree(str); - return -EINVAL; - } clp->cl_owner_id = str; return 0; } @@ -5276,9 +5308,8 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) d_data = (struct nfs4_delegreturndata *)data; - if (d_data->roc && - pnfs_roc_drain(d_data->inode, &d_data->roc_barrier, task)) - return; + if (d_data->roc) + pnfs_roc_get_barrier(d_data->inode, &d_data->roc_barrier); nfs4_setup_sequence(d_data->res.server, &d_data->args.seq_args, @@ -7571,13 +7602,8 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) goto out; } ret = rpc_wait_for_completion_task(task); - if (!ret) { - struct nfs4_sequence_res *res = task->tk_msg.rpc_resp; - - if (task->tk_status == 0) - nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); + if (!ret) ret = task->tk_status; - } rpc_put_task(task); out: dprintk("<-- %s status=%d\n", __func__, ret); @@ -7738,10 +7764,19 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) case 0: goto out; /* + * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of + * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3). 
+ */ + case -NFS4ERR_BADLAYOUT: + goto out_overflow; + /* * NFS4ERR_LAYOUTTRYLATER is a conflict with another client - * (or clients) writing to the same RAID stripe + * (or clients) writing to the same RAID stripe except when + * the minlength argument is 0 (see RFC5661 section 18.43.3). */ case -NFS4ERR_LAYOUTTRYLATER: + if (lgp->args.minlength == 0) + goto out_overflow; /* * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall * existing layout before getting a new one). @@ -7797,6 +7832,10 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) rpc_restart_call_prepare(task); out: dprintk("<-- %s\n", __func__); + return; +out_overflow: + task->tk_status = -EOVERFLOW; + goto out; } static size_t max_response_pages(struct nfs_server *server) @@ -7965,16 +8004,17 @@ static void nfs4_layoutreturn_release(void *calldata) { struct nfs4_layoutreturn *lrp = calldata; struct pnfs_layout_hdr *lo = lrp->args.layout; + LIST_HEAD(freeme); dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); if (lrp->res.lrs_present) pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); + pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range); pnfs_clear_layoutreturn_waitbit(lo); - clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags); - rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); lo->plh_block_lgets--; spin_unlock(&lo->plh_inode->i_lock); + pnfs_free_lseg_list(&freeme); pnfs_put_layout_hdr(lrp->args.layout); nfs_iput_and_deactive(lrp->inode); kfree(calldata); @@ -8588,7 +8628,6 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .minor_version = 0, .init_caps = NFS_CAP_READDIRPLUS | NFS_CAP_ATOMIC_OPEN - | NFS_CAP_CHANGE_ATTR | NFS_CAP_POSIX_LOCK, .init_client = nfs40_init_client, .shutdown_client = nfs40_shutdown_client, @@ -8614,7 +8653,6 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .minor_version = 1, .init_caps = NFS_CAP_READDIRPLUS | NFS_CAP_ATOMIC_OPEN - | NFS_CAP_CHANGE_ATTR | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1, @@ -8637,7 +8675,6 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { .minor_version = 2, .init_caps = NFS_CAP_READDIRPLUS | NFS_CAP_ATOMIC_OPEN - | NFS_CAP_CHANGE_ATTR | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1 @@ -8655,6 +8692,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, .state_renewal_ops = &nfs41_state_renewal_ops, + .mig_recovery_ops = &nfs41_mig_recovery_ops, }; #endif diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 605840dc89cf..da73bc443238 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2152,23 +2152,13 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) } EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); -static void nfs41_ping_server(struct nfs_client *clp) +void nfs41_notify_server(struct nfs_client *clp) { /* Use CHECK_LEASE to ping the server with a SEQUENCE */ set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); nfs4_schedule_state_manager(clp); } -void nfs41_server_notify_target_slotid_update(struct nfs_client *clp) -{ - nfs41_ping_server(clp); -} - -void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp) -{ - nfs41_ping_server(clp); -} - static void nfs4_reset_all_state(struct nfs_client *clp) { if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { @@ -2191,25 +2181,35 @@ static void 
nfs41_handle_server_reboot(struct nfs_client *clp) } } -static void nfs41_handle_state_revoked(struct nfs_client *clp) +static void nfs41_handle_all_state_revoked(struct nfs_client *clp) { nfs4_reset_all_state(clp); dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname); } +static void nfs41_handle_some_state_revoked(struct nfs_client *clp) +{ + nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); + nfs4_schedule_state_manager(clp); + + dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname); +} + static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp) { - /* This will need to handle layouts too */ - nfs_expire_all_delegations(clp); + /* FIXME: For now, we destroy all layouts. */ + pnfs_destroy_all_layouts(clp); + /* FIXME: For now, we test all delegations+open state+locks. */ + nfs41_handle_some_state_revoked(clp); dprintk("%s: Recallable state revoked on server %s!\n", __func__, clp->cl_hostname); } static void nfs41_handle_backchannel_fault(struct nfs_client *clp) { - nfs_expire_all_delegations(clp); - if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0) - nfs4_schedule_state_manager(clp); + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + nfs4_schedule_state_manager(clp); + dprintk("%s: server %s declared a backchannel fault\n", __func__, clp->cl_hostname); } @@ -2231,10 +2231,11 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) nfs41_handle_server_reboot(clp); - if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | - SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | + if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED)) + nfs41_handle_all_state_revoked(clp); + if (flags & (SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | SEQ4_STATUS_ADMIN_STATE_REVOKED)) - nfs41_handle_state_revoked(clp); + nfs41_handle_some_state_revoked(clp); if (flags & SEQ4_STATUS_LEASE_MOVED) nfs4_schedule_lease_moved_recovery(clp); if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 470af1a78bec..28df12e525ba 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -884,6 +884,66 @@ DEFINE_NFS4_GETATTR_EVENT(nfs4_getattr); DEFINE_NFS4_GETATTR_EVENT(nfs4_lookup_root); DEFINE_NFS4_GETATTR_EVENT(nfs4_fsinfo); +DECLARE_EVENT_CLASS(nfs4_inode_callback_event, + TP_PROTO( + const struct nfs_client *clp, + const struct nfs_fh *fhandle, + const struct inode *inode, + int error + ), + + TP_ARGS(clp, fhandle, inode, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __string(dstaddr, clp ? + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR) : "unknown") + ), + + TP_fast_assign( + __entry->error = error; + __entry->fhandle = nfs_fhandle_hash(fhandle); + if (inode != NULL) { + __entry->fileid = NFS_FILEID(inode); + __entry->dev = inode->i_sb->s_dev; + } else { + __entry->fileid = 0; + __entry->dev = 0; + } + __assign_str(dstaddr, clp ? 
+ rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR) : "unknown") + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "dstaddr=%s", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __get_str(dstaddr) + ) +); + +#define DEFINE_NFS4_INODE_CALLBACK_EVENT(name) \ + DEFINE_EVENT(nfs4_inode_callback_event, name, \ + TP_PROTO( \ + const struct nfs_client *clp, \ + const struct nfs_fh *fhandle, \ + const struct inode *inode, \ + int error \ + ), \ + TP_ARGS(clp, fhandle, inode, error)) +DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_getattr); +DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_layoutrecall_inode); + + DECLARE_EVENT_CLASS(nfs4_idmap_event, TP_PROTO( const char *name, @@ -1136,6 +1196,7 @@ TRACE_EVENT(nfs4_layoutget, DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit); DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn); +DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close); #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 558cd65dbdb7..788adf3897c7 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -400,7 +400,8 @@ static int nfs4_stat_to_errno(int); #define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) #define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ encode_stateid_maxsz + \ - 1 /* FIXME: opaque lrf_body always empty at the moment */) + 1 + \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT)) #define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ 1 + decode_stateid_maxsz) #define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1) @@ -1001,7 +1002,8 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs4_label *label, - const struct nfs_server *server) + const struct nfs_server *server, + bool excl_check) { char owner_name[IDMAP_NAMESZ]; char owner_group[IDMAP_NAMESZ]; @@ -1067,6 +1069,17 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; len += 4; } + + if (excl_check) { + const u32 *excl_bmval = server->exclcreat_bitmask; + bmval[0] &= excl_bmval[0]; + bmval[1] &= excl_bmval[1]; + bmval[2] &= excl_bmval[2]; + + if (!(excl_bmval[2] & FATTR4_WORD2_SECURITY_LABEL)) + label = NULL; + } + if (label) { len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2); bmval[2] |= FATTR4_WORD2_SECURITY_LABEL; @@ -1154,7 +1167,9 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * case NF4LNK: p = reserve_space(xdr, 4); *p = cpu_to_be32(create->u.symlink.len); - xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len); + xdr_write_pages(xdr, create->u.symlink.pages, 0, + create->u.symlink.len); + xdr->buf->flags |= XDRBUF_WRITE; break; case NF4BLK: case NF4CHR: @@ -1168,7 +1183,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * } encode_string(xdr, create->name->len, create->name->name); - encode_attrs(xdr, create->attrs, create->label, create->server); + encode_attrs(xdr, create->attrs, create->label, create->server, false); } static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) @@ -1382,18 +1397,17 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) { - struct iattr dummy; __be32 *p; p = reserve_space(xdr, 4); 
switch(arg->createmode) { case NFS4_CREATE_UNCHECKED: *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false); break; case NFS4_CREATE_GUARDED: *p = cpu_to_be32(NFS4_CREATE_GUARDED); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false); break; case NFS4_CREATE_EXCLUSIVE: *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); @@ -1402,8 +1416,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op case NFS4_CREATE_EXCLUSIVE4_1: *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); encode_nfs4_verifier(xdr, &arg->u.verifier); - dummy.ia_valid = 0; - encode_attrs(xdr, &dummy, arg->label, arg->server); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true); } } @@ -1659,7 +1672,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs { encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr); encode_nfs4_stateid(xdr, &arg->stateid); - encode_attrs(xdr, arg->iap, arg->label, server); + encode_attrs(xdr, arg->iap, arg->label, server, false); } static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr) @@ -2580,6 +2593,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req, struct xdr_stream *xdr, struct nfs4_server_caps_arg *args) { + const u32 *bitmask = args->bitmask; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; @@ -2587,11 +2601,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fhandle, &hdr); - encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS| - FATTR4_WORD0_FH_EXPIRE_TYPE| - FATTR4_WORD0_LINK_SUPPORT| - FATTR4_WORD0_SYMLINK_SUPPORT| - FATTR4_WORD0_ACLSUPPORT, &hdr); + encode_getattr_three(xdr, bitmask[0], bitmask[1], bitmask[2], &hdr); encode_nops(&hdr); } @@ -3368,6 +3378,22 @@ out_overflow: return -EIO; } +static int decode_attr_exclcreat_supported(struct xdr_stream *xdr, + uint32_t *bitmap, uint32_t *bitmask) +{ + if (likely(bitmap[2] & FATTR4_WORD2_SUPPATTR_EXCLCREAT)) { + int ret; + ret = decode_attr_bitmap(xdr, bitmask); + if (unlikely(ret < 0)) + return ret; + bitmap[2] &= ~FATTR4_WORD2_SUPPATTR_EXCLCREAT; + } else + bitmask[0] = bitmask[1] = bitmask[2] = 0; + dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__, + bitmask[0], bitmask[1], bitmask[2]); + return 0; +} + static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh) { __be32 *p; @@ -4321,6 +4347,9 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re goto xdr_error; if ((status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask)) != 0) goto xdr_error; + if ((status = decode_attr_exclcreat_supported(xdr, bitmap, + res->exclcreat_bitmask)) != 0) + goto xdr_error; status = verify_attr_len(xdr, savep, attrlen); xdr_error: dprintk("%s: xdr returned %d!\n", __func__, -status); @@ -4903,24 +4932,28 @@ static int decode_lookup(struct xdr_stream *xdr) } /* This is too sick! 
*/ -static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) +static int decode_space_limit(struct xdr_stream *xdr, + unsigned long *pagemod_limit) { __be32 *p; uint32_t limit_type, nblocks, blocksize; + u64 maxsize = 0; p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) goto out_overflow; limit_type = be32_to_cpup(p++); switch (limit_type) { - case 1: - xdr_decode_hyper(p, maxsize); + case NFS4_LIMIT_SIZE: + xdr_decode_hyper(p, &maxsize); break; - case 2: + case NFS4_LIMIT_BLOCKS: nblocks = be32_to_cpup(p++); blocksize = be32_to_cpup(p); - *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; + maxsize = (uint64_t)nblocks * (uint64_t)blocksize; } + maxsize >>= PAGE_CACHE_SHIFT; + *pagemod_limit = min_t(u64, maxsize, ULONG_MAX); return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -4948,7 +4981,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr, break; case NFS4_OPEN_DELEGATE_WRITE: res->delegation_type = FMODE_WRITE|FMODE_READ; - if (decode_space_limit(xdr, &res->maxsize) < 0) + if (decode_space_limit(xdr, &res->pagemod_limit) < 0) return -EIO; } return decode_ace(xdr, NULL, res->server->nfs_client); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 1da68d3b1eda..7c5718ba625e 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -77,8 +77,8 @@ EXPORT_SYMBOL_GPL(nfs_pgheader_init); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos) { spin_lock(&hdr->lock); - if (pos < hdr->io_start + hdr->good_bytes) { - set_bit(NFS_IOHDR_ERROR, &hdr->flags); + if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags) + || pos < hdr->io_start + hdr->good_bytes) { clear_bit(NFS_IOHDR_EOF, &hdr->flags); hdr->good_bytes = pos - hdr->io_start; hdr->error = error; @@ -1100,8 +1100,6 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) mirror->pg_base = 0; mirror->pg_recoalesce = 0; - desc->pg_moreio = 0; - while (!list_empty(&head)) { struct nfs_page *req; @@ -1109,8 +1107,11 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) nfs_list_remove_request(req); if (__nfs_pageio_add_request(desc, req)) continue; - if (desc->pg_error < 0) + if (desc->pg_error < 0) { + list_splice_tail(&head, &mirror->pg_list); + mirror->pg_recoalesce = 1; return 0; + } break; } } while (mirror->pg_recoalesce); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0ba9a02c9566..ba1246433794 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -352,7 +352,7 @@ pnfs_layout_need_return(struct pnfs_layout_hdr *lo, { struct pnfs_layout_segment *s; - if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + if (!test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) return false; list_for_each_entry(s, &lo->plh_segs, pls_list) @@ -362,6 +362,17 @@ pnfs_layout_need_return(struct pnfs_layout_hdr *lo, return true; } +static bool +pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) +{ + if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + return false; + lo->plh_return_iomode = 0; + pnfs_get_layout_hdr(lo); + clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags); + return true; +} + static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg, struct pnfs_layout_hdr *lo, struct inode *inode) { @@ -372,17 +383,16 @@ static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg, if (pnfs_layout_need_return(lo, lseg)) { nfs4_stateid stateid; enum pnfs_iomode iomode; + bool send; stateid = lo->plh_stateid; iomode = lo->plh_return_iomode; - /* decreased in pnfs_send_layoutreturn() */ - lo->plh_block_lgets++; - 
lo->plh_return_iomode = 0; + send = pnfs_prepare_layoutreturn(lo); spin_unlock(&inode->i_lock); - pnfs_get_layout_hdr(lo); - - /* Send an async layoutreturn so we dont deadlock */ - pnfs_send_layoutreturn(lo, stateid, iomode, false); + if (send) { + /* Send an async layoutreturn so we dont deadlock */ + pnfs_send_layoutreturn(lo, stateid, iomode, false); + } } else spin_unlock(&inode->i_lock); } @@ -411,6 +421,10 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) pnfs_layoutreturn_before_put_lseg(lseg, lo, inode); if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { + if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { + spin_unlock(&inode->i_lock); + return; + } pnfs_get_layout_hdr(lo); pnfs_layout_remove_lseg(lo, lseg); spin_unlock(&inode->i_lock); @@ -451,6 +465,8 @@ pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg) test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); if (atomic_dec_and_test(&lseg->pls_refcount)) { struct pnfs_layout_hdr *lo = lseg->pls_layout; + if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) + return; pnfs_get_layout_hdr(lo); pnfs_layout_remove_lseg(lo, lseg); pnfs_free_lseg_async(lseg); @@ -800,25 +816,12 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo, return !pnfs_seqid_is_newer(seqid, lo->plh_barrier); } -static bool -pnfs_layout_returning(const struct pnfs_layout_hdr *lo, - struct pnfs_layout_range *range) -{ - return test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) && - (lo->plh_return_iomode == IOMODE_ANY || - lo->plh_return_iomode == range->iomode); -} - /* lget is set to 1 if called from inside send_layoutget call chain */ static bool -pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, - struct pnfs_layout_range *range, int lget) +pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo) { return lo->plh_block_lgets || - test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || - (list_empty(&lo->plh_segs) && - (atomic_read(&lo->plh_outstanding) > lget)) || - pnfs_layout_returning(lo, range); + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); } int @@ -830,7 +833,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); - if (pnfs_layoutgets_blocked(lo, range, 1)) { + if (pnfs_layoutgets_blocked(lo)) { status = -EAGAIN; } else if (!nfs4_valid_open_stateid(open_state)) { status = -EBADF; @@ -865,6 +868,7 @@ send_layoutget(struct pnfs_layout_hdr *lo, struct nfs_server *server = NFS_SERVER(ino); struct nfs4_layoutget *lgp; struct pnfs_layout_segment *lseg; + loff_t i_size; dprintk("--> %s\n", __func__); @@ -872,9 +876,17 @@ send_layoutget(struct pnfs_layout_hdr *lo, if (lgp == NULL) return NULL; + i_size = i_size_read(ino); + lgp->args.minlength = PAGE_CACHE_SIZE; if (lgp->args.minlength > range->length) lgp->args.minlength = range->length; + if (range->iomode == IOMODE_READ) { + if (range->offset >= i_size) + lgp->args.minlength = 0; + else if (i_size - range->offset < lgp->args.minlength) + lgp->args.minlength = i_size - range->offset; + } lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; lgp->args.range = *range; lgp->args.type = server->pnfs_curr_ld->id; @@ -924,6 +936,7 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags); smp_mb__after_atomic(); wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); + rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); } static int @@ -938,9 +951,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, if (unlikely(lrp == NULL)) { status = 
-ENOMEM; spin_lock(&ino->i_lock); - lo->plh_block_lgets--; pnfs_clear_layoutreturn_waitbit(lo); - rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq); spin_unlock(&ino->i_lock); pnfs_put_layout_hdr(lo); goto out; @@ -978,6 +989,7 @@ _pnfs_return_layout(struct inode *ino) LIST_HEAD(tmp_list); nfs4_stateid stateid; int status = 0, empty; + bool send; dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino); @@ -1007,17 +1019,18 @@ _pnfs_return_layout(struct inode *ino) /* Don't send a LAYOUTRETURN if list was initially empty */ if (empty) { spin_unlock(&ino->i_lock); - pnfs_put_layout_hdr(lo); dprintk("NFS: %s no layout segments to return\n", __func__); - goto out; + goto out_put_layout_hdr; } set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - lo->plh_block_lgets++; + send = pnfs_prepare_layoutreturn(lo); spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); - - status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); + if (send) + status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); +out_put_layout_hdr: + pnfs_put_layout_hdr(lo); out: dprintk("<-- %s status: %d\n", __func__, status); return status; @@ -1060,15 +1073,14 @@ bool pnfs_roc(struct inode *ino) struct pnfs_layout_segment *lseg, *tmp; nfs4_stateid stateid; LIST_HEAD(tmp_list); - bool found = false, layoutreturn = false; + bool found = false, layoutreturn = false, roc = false; spin_lock(&ino->i_lock); lo = nfsi->layout; - if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) || - test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) + if (!lo || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) goto out_noroc; - /* Don't return layout if we hold a delegation */ + /* no roc if we hold a delegation */ if (nfs4_check_delegation(ino, FMODE_READ)) goto out_noroc; @@ -1079,38 +1091,41 @@ bool pnfs_roc(struct inode *ino) goto out_noroc; } + stateid = lo->plh_stateid; + /* always send layoutreturn if being marked so */ + if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &lo->plh_flags)) + layoutreturn = pnfs_prepare_layoutreturn(lo); + pnfs_clear_retry_layoutget(lo); list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) - if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { + /* If we are sending layoutreturn, invalidate all valid lsegs */ + if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { mark_lseg_invalid(lseg, &tmp_list); found = true; } - if (!found) - goto out_noroc; - lo->plh_block_lgets++; - pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ - spin_unlock(&ino->i_lock); - pnfs_free_lseg_list(&tmp_list); - pnfs_layoutcommit_inode(ino, true); - return true; + /* pnfs_prepare_layoutreturn() grabs lo ref and it will be put + * in pnfs_roc_release(). We don't really send a layoutreturn but + * still want others to view us like we are sending one! + * + * If pnfs_prepare_layoutreturn() fails, it means someone else is doing + * LAYOUTRETURN, so we proceed like there are no layouts to return. + * + * ROC in three conditions: + * 1. there are ROC lsegs + * 2. we don't send layoutreturn + * 3. 
no others are sending layoutreturn + */ + if (found && !layoutreturn && pnfs_prepare_layoutreturn(lo)) + roc = true; out_noroc: - if (lo) { - stateid = lo->plh_stateid; - layoutreturn = - test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, - &lo->plh_flags); - if (layoutreturn) { - lo->plh_block_lgets++; - pnfs_get_layout_hdr(lo); - } - } spin_unlock(&ino->i_lock); - if (layoutreturn) { - pnfs_layoutcommit_inode(ino, true); + pnfs_free_lseg_list(&tmp_list); + pnfs_layoutcommit_inode(ino, true); + if (layoutreturn) pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); - } - return false; + return roc; } void pnfs_roc_release(struct inode *ino) @@ -1119,7 +1134,7 @@ void pnfs_roc_release(struct inode *ino) spin_lock(&ino->i_lock); lo = NFS_I(ino)->layout; - lo->plh_block_lgets--; + pnfs_clear_layoutreturn_waitbit(lo); if (atomic_dec_and_test(&lo->plh_refcount)) { pnfs_detach_layout_hdr(lo); spin_unlock(&ino->i_lock); @@ -1137,24 +1152,16 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) lo->plh_barrier = barrier; spin_unlock(&ino->i_lock); + trace_nfs4_layoutreturn_on_close(ino, 0); } -bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) +void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier) { struct nfs_inode *nfsi = NFS_I(ino); struct pnfs_layout_hdr *lo; - struct pnfs_layout_segment *lseg; - nfs4_stateid stateid; u32 current_seqid; - bool found = false, layoutreturn = false; spin_lock(&ino->i_lock); - list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) - if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { - rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); - found = true; - goto out; - } lo = nfsi->layout; current_seqid = be32_to_cpu(lo->plh_stateid.seqid); @@ -1162,23 +1169,7 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) * a barrier, we choose the worst-case barrier. 
*/ *barrier = current_seqid + atomic_read(&lo->plh_outstanding); -out: - if (!found) { - stateid = lo->plh_stateid; - layoutreturn = - test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, - &lo->plh_flags); - if (layoutreturn) { - lo->plh_block_lgets++; - pnfs_get_layout_hdr(lo); - } - } spin_unlock(&ino->i_lock); - if (layoutreturn) { - rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); - pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false); - } - return found; } /* @@ -1206,16 +1197,41 @@ pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1, return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ); } -static void -pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo, - struct pnfs_layout_segment *lseg) +static bool +pnfs_lseg_range_is_after(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) +{ + return pnfs_lseg_range_cmp(l1, l2) > 0; +} + +static bool +pnfs_lseg_no_merge(struct pnfs_layout_segment *lseg, + struct pnfs_layout_segment *old) { - struct pnfs_layout_segment *lp; + return false; +} + +void +pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg, + bool (*is_after)(const struct pnfs_layout_range *, + const struct pnfs_layout_range *), + bool (*do_merge)(struct pnfs_layout_segment *, + struct pnfs_layout_segment *), + struct list_head *free_me) +{ + struct pnfs_layout_segment *lp, *tmp; dprintk("%s:Begin\n", __func__); - list_for_each_entry(lp, &lo->plh_segs, pls_list) { - if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0) + list_for_each_entry_safe(lp, tmp, &lo->plh_segs, pls_list) { + if (test_bit(NFS_LSEG_VALID, &lp->pls_flags) == 0) + continue; + if (do_merge(lseg, lp)) { + mark_lseg_invalid(lp, free_me); + continue; + } + if (is_after(&lseg->pls_range, &lp->pls_range)) continue; list_add_tail(&lseg->pls_list, &lp->pls_list); dprintk("%s: inserted lseg %p " @@ -1237,6 +1253,24 @@ out: dprintk("%s:Return\n", __func__); } +EXPORT_SYMBOL_GPL(pnfs_generic_layout_insert_lseg); + +static void +pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg, + struct list_head *free_me) +{ + struct inode *inode = lo->plh_inode; + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + + if (ld->add_lseg != NULL) + ld->add_lseg(lo, lseg, free_me); + else + pnfs_generic_layout_insert_lseg(lo, lseg, + pnfs_lseg_range_is_after, + pnfs_lseg_no_merge, + free_me); +} static struct pnfs_layout_hdr * alloc_init_layout_hdr(struct inode *ino, @@ -1329,8 +1363,6 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, ret = pnfs_get_lseg(lseg); break; } - if (lseg->pls_range.offset > range->offset) - break; } dprintk("%s:Return lseg %p ref %d\n", @@ -1423,6 +1455,8 @@ static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key) static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) { + if (!pnfs_should_retry_layoutget(lo)) + return false; /* * send layoutcommit as it can hold up layoutreturn due to lseg * reference @@ -1469,6 +1503,9 @@ pnfs_update_layout(struct inode *ino, if (!pnfs_enabled_sb(NFS_SERVER(ino))) goto out; + if (iomode == IOMODE_READ && i_size_read(ino) == 0) + goto out; + if (pnfs_within_mdsthreshold(ctx, ino, iomode)) goto out; @@ -1518,8 +1555,7 @@ lookup_again: * Because we free lsegs before sending LAYOUTRETURN, we need to wait * for LAYOUTRETURN even if first is true. 
*/ - if (!lseg && pnfs_should_retry_layoutget(lo) && - test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { + if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { spin_unlock(&ino->i_lock); dprintk("%s wait for layoutreturn\n", __func__); if (pnfs_prepare_to_retry_layoutget(lo)) { @@ -1532,7 +1568,7 @@ lookup_again: goto out_put_layout_hdr; } - if (pnfs_layoutgets_blocked(lo, &arg, 0)) + if (pnfs_layoutgets_blocked(lo)) goto out_unlock; atomic_inc(&lo->plh_outstanding); spin_unlock(&ino->i_lock); @@ -1578,6 +1614,26 @@ out_unlock: } EXPORT_SYMBOL_GPL(pnfs_update_layout); +static bool +pnfs_sanity_check_layout_range(struct pnfs_layout_range *range) +{ + switch (range->iomode) { + case IOMODE_READ: + case IOMODE_RW: + break; + default: + return false; + } + if (range->offset == NFS4_MAX_UINT64) + return false; + if (range->length == 0) + return false; + if (range->length != NFS4_MAX_UINT64 && + range->length > NFS4_MAX_UINT64 - range->offset) + return false; + return true; +} + struct pnfs_layout_segment * pnfs_layout_process(struct nfs4_layoutget *lgp) { @@ -1586,7 +1642,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) struct pnfs_layout_segment *lseg; struct inode *ino = lo->plh_inode; LIST_HEAD(free_me); - int status = 0; + int status = -EINVAL; + + if (!pnfs_sanity_check_layout_range(&res->range)) + goto out; /* Inject layout blob into I/O device driver */ lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); @@ -1604,12 +1663,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) lseg->pls_range = res->range; spin_lock(&ino->i_lock); - if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { - dprintk("%s forget reply due to recall\n", __func__); - goto out_forget_reply; - } - - if (pnfs_layoutgets_blocked(lo, &lgp->args.range, 1)) { + if (pnfs_layoutgets_blocked(lo)) { dprintk("%s forget reply due to state\n", __func__); goto out_forget_reply; } @@ -1636,12 +1690,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); pnfs_get_lseg(lseg); - pnfs_layout_insert_lseg(lo, lseg); + pnfs_layout_insert_lseg(lo, lseg, &free_me); - if (res->return_on_close) { + if (res->return_on_close) set_bit(NFS_LSEG_ROC, &lseg->pls_flags); - set_bit(NFS_LAYOUT_ROC, &lo->plh_flags); - } spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me); @@ -1677,6 +1729,8 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, lseg->pls_range.length); set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); mark_lseg_invalid(lseg, tmp_list); + set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &lo->plh_flags); } } @@ -1695,7 +1749,6 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, spin_lock(&inode->i_lock); /* set failure bit so that pnfs path will be retried later */ pnfs_layout_set_fail_bit(lo, iomode); - set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); if (lo->plh_return_iomode == 0) lo->plh_return_iomode = range.iomode; else if (lo->plh_return_iomode != range.iomode) @@ -2207,13 +2260,12 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) if (ld->prepare_layoutcommit) { status = ld->prepare_layoutcommit(&data->args); if (status) { + put_rpccred(data->cred); spin_lock(&inode->i_lock); set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); if (end_pos > nfsi->layout->plh_lwb) nfsi->layout->plh_lwb = end_pos; - spin_unlock(&inode->i_lock); - put_rpccred(data->cred); - goto clear_layoutcommitting; + goto out_unlock; } } @@ -2254,7 +2306,7 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) #if IS_ENABLED(CONFIG_NFS_V4_2) int 
-pnfs_report_layoutstat(struct inode *inode) +pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags) { struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; struct nfs_server *server = NFS_SERVER(inode); @@ -2281,7 +2333,7 @@ pnfs_report_layoutstat(struct inode *inode) pnfs_get_layout_hdr(hdr); spin_unlock(&inode->i_lock); - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), gfp_flags); if (!data) { status = -ENOMEM; goto out_put; @@ -2311,3 +2363,7 @@ out_put: } EXPORT_SYMBOL_GPL(pnfs_report_layoutstat); #endif + +unsigned int layoutstats_timer; +module_param(layoutstats_timer, uint, 0644); +EXPORT_SYMBOL_GPL(layoutstats_timer); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 3e6ab7bfbabd..78c9351ff117 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -94,7 +94,6 @@ enum { NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ - NFS_LAYOUT_ROC, /* some lseg had roc bit set */ NFS_LAYOUT_RETURN, /* Return this layout ASAP */ NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */ NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ @@ -129,6 +128,9 @@ struct pnfs_layoutdriver_type { struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); void (*free_lseg) (struct pnfs_layout_segment *lseg); + void (*add_lseg) (struct pnfs_layout_hdr *layoutid, + struct pnfs_layout_segment *lseg, + struct list_head *free_me); void (*return_range) (struct pnfs_layout_hdr *lo, struct pnfs_layout_range *range); @@ -184,15 +186,15 @@ struct pnfs_layoutdriver_type { struct pnfs_layout_hdr { atomic_t plh_refcount; + atomic_t plh_outstanding; /* number of RPCs out */ struct list_head plh_layouts; /* other client layouts */ struct list_head plh_bulk_destroy; struct list_head plh_segs; /* layout segments list */ - nfs4_stateid plh_stateid; - atomic_t plh_outstanding; /* number of RPCs out */ unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ - u32 plh_barrier; /* ignore lower seqids */ unsigned long plh_retry_timestamp; unsigned long plh_flags; + nfs4_stateid plh_stateid; + u32 plh_barrier; /* ignore lower seqids */ enum pnfs_iomode plh_return_iomode; loff_t plh_lwb; /* last write byte for layoutcommit */ struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ @@ -267,7 +269,7 @@ int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); -bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); +void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier); void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); @@ -286,6 +288,14 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, gfp_t gfp_flags); void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo); +void pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg, + bool (*is_after)(const struct pnfs_layout_range *lseg_range, + const struct pnfs_layout_range *old), + bool (*do_merge)(struct pnfs_layout_segment *lseg, + struct pnfs_layout_segment *old), + struct list_head *free_me); + void 
nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *); int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *); @@ -529,12 +539,31 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src, nfss->pnfs_curr_ld->id == src->l_type); } +static inline u64 +pnfs_calc_offset_end(u64 offset, u64 len) +{ + if (len == NFS4_MAX_UINT64 || len >= NFS4_MAX_UINT64 - offset) + return NFS4_MAX_UINT64; + return offset + len - 1; +} + +static inline u64 +pnfs_calc_offset_length(u64 offset, u64 end) +{ + if (end == NFS4_MAX_UINT64 || end <= offset) + return NFS4_MAX_UINT64; + return 1 + end - offset; +} + +extern unsigned int layoutstats_timer; + #ifdef NFS_DEBUG void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id); #else static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id) { } + #endif /* NFS_DEBUG */ #else /* CONFIG_NFS_V4_1 */ @@ -605,10 +634,9 @@ pnfs_roc_set_barrier(struct inode *ino, u32 barrier) { } -static inline bool -pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) +static inline void +pnfs_roc_get_barrier(struct inode *ino, u32 *barrier) { - return false; } static inline void set_pnfs_layoutdriver(struct nfs_server *s, @@ -691,10 +719,10 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void) #endif /* CONFIG_NFS_V4_1 */ #if IS_ENABLED(CONFIG_NFS_V4_2) -int pnfs_report_layoutstat(struct inode *inode); +int pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags); #else static inline int -pnfs_report_layoutstat(struct inode *inode) +pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags) { return 0; } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index f37e25b6311c..24655b807d44 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -124,11 +124,12 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, if (ret) { cinfo->ds->nwritten -= ret; cinfo->ds->ncommitting += ret; - bucket->clseg = bucket->wlseg; - if (list_empty(src)) + if (bucket->clseg == NULL) + bucket->clseg = pnfs_get_lseg(bucket->wlseg); + if (list_empty(src)) { + pnfs_put_lseg_locked(bucket->wlseg); bucket->wlseg = NULL; - else - pnfs_get_lseg(bucket->clseg); + } } return ret; } @@ -182,19 +183,23 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx) struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; struct pnfs_commit_bucket *bucket; struct pnfs_layout_segment *freeme; + LIST_HEAD(pages); int i; + spin_lock(cinfo->lock); for (i = idx; i < fl_cinfo->nbuckets; i++) { bucket = &fl_cinfo->buckets[i]; if (list_empty(&bucket->committing)) continue; - nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo, i); - spin_lock(cinfo->lock); freeme = bucket->clseg; bucket->clseg = NULL; + list_splice_init(&bucket->committing, &pages); spin_unlock(cinfo->lock); + nfs_retry_commit(&pages, freeme, cinfo, i); pnfs_put_lseg(freeme); + spin_lock(cinfo->lock); } + spin_unlock(cinfo->lock); } static unsigned int @@ -216,10 +221,6 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo, if (!data) break; data->ds_commit_index = i; - spin_lock(cinfo->lock); - data->lseg = bucket->clseg; - bucket->clseg = NULL; - spin_unlock(cinfo->lock); list_add(&data->pages, list); nreq++; } @@ -229,6 +230,22 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo, return nreq; } +static inline +void pnfs_fetch_commit_bucket_list(struct list_head *pages, + struct nfs_commit_data *data, + struct nfs_commit_info *cinfo) +{ + struct pnfs_commit_bucket 
*bucket; + + bucket = &cinfo->ds->buckets[data->ds_commit_index]; + spin_lock(cinfo->lock); + list_splice_init(&bucket->committing, pages); + data->lseg = bucket->clseg; + bucket->clseg = NULL; + spin_unlock(cinfo->lock); + +} + /* This follows nfs_commit_list pretty closely */ int pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, @@ -243,7 +260,7 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, if (!list_empty(mds_pages)) { data = nfs_commitdata_alloc(); if (data != NULL) { - data->lseg = NULL; + data->ds_commit_index = -1; list_add(&data->pages, &list); nreq++; } else { @@ -265,19 +282,16 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, list_for_each_entry_safe(data, tmp, &list, pages) { list_del_init(&data->pages); - if (!data->lseg) { + if (data->ds_commit_index < 0) { nfs_init_commit(data, mds_pages, NULL, cinfo); nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(data->inode), data->mds_ops, how, 0); } else { - struct pnfs_commit_bucket *buckets; + LIST_HEAD(pages); - buckets = cinfo->ds->buckets; - nfs_init_commit(data, - &buckets[data->ds_commit_index].committing, - data->lseg, - cinfo); + pnfs_fetch_commit_bucket_list(&pages, data, cinfo); + nfs_init_commit(data, &pages, data->lseg, cinfo); initiate_commit(data, how); } } @@ -359,26 +373,31 @@ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2) return false; } +/* + * Checks if 'dsaddrs1' contains a subset of 'dsaddrs2'. If it does, + * declare a match. + */ static bool _same_data_server_addrs_locked(const struct list_head *dsaddrs1, const struct list_head *dsaddrs2) { struct nfs4_pnfs_ds_addr *da1, *da2; - - /* step through both lists, comparing as we go */ - for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node), - da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node); - da1 != NULL && da2 != NULL; - da1 = list_entry(da1->da_node.next, typeof(*da1), da_node), - da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) { - if (!same_sockaddr((struct sockaddr *)&da1->da_addr, - (struct sockaddr *)&da2->da_addr)) - return false; + struct sockaddr *sa1, *sa2; + bool match = false; + + list_for_each_entry(da1, dsaddrs1, da_node) { + sa1 = (struct sockaddr *)&da1->da_addr; + match = false; + list_for_each_entry(da2, dsaddrs2, da_node) { + sa2 = (struct sockaddr *)&da2->da_addr; + match = same_sockaddr(sa1, sa2); + if (match) + break; + } + if (!match) + break; } - if (da1 == NULL && da2 == NULL) - return true; - - return false; + return match; } /* @@ -863,9 +882,10 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, } set_bit(PG_COMMIT_TO_DS, &req->wb_flags); cinfo->ds->nwritten++; - spin_unlock(cinfo->lock); - nfs_request_add_commit_list(req, list, cinfo); + nfs_request_add_commit_list_locked(req, list, cinfo); + spin_unlock(cinfo->lock); + nfs_mark_page_unstable(req->wb_page, cinfo); } EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index aa62004f1706..383a027de452 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -381,9 +381,12 @@ int __init register_nfs_fs(void) ret = nfs_register_sysctl(); if (ret < 0) goto error_2; - register_shrinker(&acl_shrinker); + ret = register_shrinker(&acl_shrinker); + if (ret < 0) + goto error_3; return 0; - +error_3: + nfs_unregister_sysctl(); error_2: unregister_nfs4_fs(); error_1: diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 65869ca9c851..388f48079c43 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -768,6 +768,28 @@ 
nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, } /** + * nfs_request_add_commit_list_locked - add request to a commit list + * @req: pointer to a struct nfs_page + * @dst: commit list head + * @cinfo: holds list lock and accounting info + * + * This sets the PG_CLEAN bit, updates the cinfo count of + * number of outstanding requests requiring a commit as well as + * the MM page stats. + * + * The caller must hold the cinfo->lock, and the nfs_page lock. + */ +void +nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, + struct nfs_commit_info *cinfo) +{ + set_bit(PG_CLEAN, &req->wb_flags); + nfs_list_add_request(req, dst); + cinfo->mds->ncommit++; +} +EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); + +/** * nfs_request_add_commit_list - add request to a commit list * @req: pointer to a struct nfs_page * @dst: commit list head @@ -784,13 +806,10 @@ void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, struct nfs_commit_info *cinfo) { - set_bit(PG_CLEAN, &(req)->wb_flags); spin_lock(cinfo->lock); - nfs_list_add_request(req, dst); - cinfo->mds->ncommit++; + nfs_request_add_commit_list_locked(req, dst, cinfo); spin_unlock(cinfo->lock); - if (!cinfo->dreq) - nfs_mark_page_unstable(req->wb_page); + nfs_mark_page_unstable(req->wb_page, cinfo); } EXPORT_SYMBOL_GPL(nfs_request_add_commit_list); @@ -1379,24 +1398,27 @@ static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr, { struct nfs_pgio_args *argp = &hdr->args; struct nfs_pgio_res *resp = &hdr->res; + u64 size = argp->offset + resp->count; if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) + fattr->size = size; + if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode)) { + fattr->valid &= ~NFS_ATTR_FATTR_SIZE; return; - if (argp->offset + resp->count != fattr->size) - return; - if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode)) + } + if (size != fattr->size) return; /* Set attribute barrier */ nfs_fattr_set_barrier(fattr); + /* ...and update size */ + fattr->valid |= NFS_ATTR_FATTR_SIZE; } void nfs_writeback_update_inode(struct nfs_pgio_header *hdr) { - struct nfs_fattr *fattr = hdr->res.fattr; + struct nfs_fattr *fattr = &hdr->fattr; struct inode *inode = hdr->inode; - if (fattr == NULL) - return; spin_lock(&inode->i_lock); nfs_writeback_check_extend(hdr, fattr); nfs_post_op_update_inode_force_wcc_locked(inode, fattr); @@ -1790,7 +1812,7 @@ out_mark_dirty: return res; } -static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) +int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct nfs_inode *nfsi = NFS_I(inode); int flags = FLUSH_SYNC; @@ -1825,11 +1847,6 @@ out_mark_dirty: __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return ret; } - -int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) -{ - return nfs_commit_unstable_pages(inode, wbc); -} EXPORT_SYMBOL_GPL(nfs_write_inode); /* diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index ae6e58ea4de5..fd8c9a5bcac4 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -63,14 +63,33 @@ EXPORT_SYMBOL_GPL(locks_end_grace); * lock reclaims. 
*/ int -locks_in_grace(struct net *net) +__state_in_grace(struct net *net, bool open) { struct list_head *grace_list = net_generic(net, grace_net_id); + struct lock_manager *lm; - return !list_empty(grace_list); + if (!open) + return !list_empty(grace_list); + + list_for_each_entry(lm, grace_list, list) { + if (lm->block_opens) + return true; + } + return false; +} + +int locks_in_grace(struct net *net) +{ + return __state_in_grace(net, 0); } EXPORT_SYMBOL_GPL(locks_in_grace); +int opens_in_grace(struct net *net) +{ + return __state_in_grace(net, 1); +} +EXPORT_SYMBOL_GPL(opens_in_grace); + static int __net_init grace_init_net(struct net *net) { diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 9aa2796da90d..6d834dc9bbc8 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -101,7 +101,7 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, } nr_iomaps = be32_to_cpup(p++); - expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE; + expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE; if (len != expected) { dprintk("%s: extent array size mismatch: %u/%u\n", __func__, len, expected); diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h index fdc79037c0e7..6de925fe8499 100644 --- a/fs/nfsd/blocklayoutxdr.h +++ b/fs/nfsd/blocklayoutxdr.h @@ -7,13 +7,6 @@ struct iomap; struct xdr_stream; -enum pnfs_block_extent_state { - PNFS_BLOCK_READWRITE_DATA = 0, - PNFS_BLOCK_READ_DATA = 1, - PNFS_BLOCK_INVALID_DATA = 2, - PNFS_BLOCK_NONE_DATA = 3, -}; - struct pnfs_block_extent { struct nfsd4_deviceid vol_id; u64 foff; @@ -21,14 +14,6 @@ struct pnfs_block_extent { u64 soff; enum pnfs_block_extent_state es; }; -#define NFS4_BLOCK_EXTENT_SIZE 44 - -enum pnfs_block_volume_type { - PNFS_BLOCK_VOLUME_SIMPLE = 0, - PNFS_BLOCK_VOLUME_SLICE = 1, - PNFS_BLOCK_VOLUME_CONCAT = 2, - PNFS_BLOCK_VOLUME_STRIPE = 3, -}; /* * Random upper cap for the uuid length to avoid unbounded allocation. 
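A note on the fs/nfs_common/grace.c change shown above: the patch splits the old locks_in_grace() test into a shared __state_in_grace() helper plus two predicates, where opens_in_grace() only counts lock managers that set block_opens (nfsd's own manager sets that flag later in this diff, in nfs4_state_start_net). The following is a minimal userspace sketch of that predicate logic, not the kernel code itself; struct lm and the hand-rolled list are simplified stand-ins for the kernel's struct lock_manager and list_for_each_entry().

#include <stdbool.h>
#include <stddef.h>

struct lm { bool block_opens; struct lm *next; };	/* stand-in for struct lock_manager */

static bool state_in_grace(const struct lm *grace_list, bool open)
{
	if (!open)
		return grace_list != NULL;		/* any manager still in grace blocks lock reclaims */
	for (const struct lm *lm = grace_list; lm != NULL; lm = lm->next)
		if (lm->block_opens)			/* only managers that block opens matter here */
			return true;
	return false;
}

static bool locks_in_grace(const struct lm *grace_list) { return state_in_grace(grace_list, false); }
static bool opens_in_grace(const struct lm *grace_list) { return state_in_grace(grace_list, true); }

The practical effect visible in the nfsd4 hunks further down is that open-path checks (nfsd4_open, nfsd4_remove, nfsd4_rename, check_special_stateids, grace_disallows_io) switch from locks_in_grace() to opens_in_grace(), so they stop waiting once only managers that do not block opens remain in grace.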
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index f79521a59747..b4d84b579f20 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1075,73 +1075,6 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) return rv; } -/* Iterator */ - -static void *e_start(struct seq_file *m, loff_t *pos) - __acquires(((struct cache_detail *)m->private)->hash_lock) -{ - loff_t n = *pos; - unsigned hash, export; - struct cache_head *ch; - struct cache_detail *cd = m->private; - struct cache_head **export_table = cd->hash_table; - - read_lock(&cd->hash_lock); - if (!n--) - return SEQ_START_TOKEN; - hash = n >> 32; - export = n & ((1LL<<32) - 1); - - - for (ch=export_table[hash]; ch; ch=ch->next) - if (!export--) - return ch; - n &= ~((1LL<<32) - 1); - do { - hash++; - n += 1LL<<32; - } while(hash < EXPORT_HASHMAX && export_table[hash]==NULL); - if (hash >= EXPORT_HASHMAX) - return NULL; - *pos = n+1; - return export_table[hash]; -} - -static void *e_next(struct seq_file *m, void *p, loff_t *pos) -{ - struct cache_head *ch = p; - int hash = (*pos >> 32); - struct cache_detail *cd = m->private; - struct cache_head **export_table = cd->hash_table; - - if (p == SEQ_START_TOKEN) - hash = 0; - else if (ch->next == NULL) { - hash++; - *pos += 1LL<<32; - } else { - ++*pos; - return ch->next; - } - *pos &= ~((1LL<<32) - 1); - while (hash < EXPORT_HASHMAX && export_table[hash] == NULL) { - hash++; - *pos += 1LL<<32; - } - if (hash >= EXPORT_HASHMAX) - return NULL; - ++*pos; - return export_table[hash]; -} - -static void e_stop(struct seq_file *m, void *p) - __releases(((struct cache_detail *)m->private)->hash_lock) -{ - struct cache_detail *cd = m->private; - - read_unlock(&cd->hash_lock); -} - static struct flags { int flag; char *name[2]; @@ -1270,9 +1203,9 @@ static int e_show(struct seq_file *m, void *p) } const struct seq_operations nfs_exports_op = { - .start = e_start, - .next = e_next, - .stop = e_stop, + .start = cache_seq_start, + .next = cache_seq_next, + .stop = cache_seq_stop, .show = e_show, }; diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index 1f52bfcc436f..2e315072bf3f 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -6,6 +6,7 @@ #include <linux/sunrpc/cache.h> #include <uapi/linux/nfsd/export.h> +#include <linux/nfs4.h> struct knfsd_fh; struct svc_fh; diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h index a3f34900091f..23cc85d1efdd 100644 --- a/fs/nfsd/idmap.h +++ b/fs/nfsd/idmap.h @@ -37,9 +37,7 @@ #include <linux/in.h> #include <linux/sunrpc/svc.h> - -/* XXX from linux/nfs_idmap.h */ -#define IDMAP_NAMESZ 128 +#include <linux/nfs_idmap.h> #ifdef CONFIG_NFSD_V4 int nfsd_idmap_init(struct net *); diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index ea6749a32760..d8b16c2568f3 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -110,6 +110,7 @@ struct nfsd_net { unsigned int max_connections; u32 clientid_counter; + u32 clverifier_counter; struct svc_serv *nfsd_serv; }; diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index d54701f6dc78..1580ea6fd64d 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -44,13 +44,13 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, inode = d_inode(fh->fh_dentry); - if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) + if (argp->mask & ~NFS_ACL_MASK) RETURN_STATUS(nfserr_inval); resp->mask = argp->mask; nfserr = fh_getattr(fh, &resp->stat); if (nfserr) - goto fail; + RETURN_STATUS(nfserr); if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { acl = get_acl(inode, ACL_TYPE_ACCESS); @@ -202,7 +202,7 @@ static int 
nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p, if (!p) return 0; argp->mask = ntohl(*p++); - if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) || + if (argp->mask & ~NFS_ACL_MASK || !xdr_argsize_check(rqstp, p)) return 0; @@ -293,9 +293,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, resp->acl_default, resp->mask & NFS_DFACL, NFS_ACL_DEFAULT); - if (n <= 0) - return 0; - return 1; + return (n > 0); } static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p, diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 882b1a14bc3e..01df4cd7c753 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -41,7 +41,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, inode = d_inode(fh->fh_dentry); - if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) + if (argp->mask & ~NFS_ACL_MASK) RETURN_STATUS(nfserr_inval); resp->mask = argp->mask; @@ -148,7 +148,7 @@ static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p, if (!p) return 0; args->mask = ntohl(*p++); - if (args->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) || + if (args->mask & ~NFS_ACL_MASK || !xdr_argsize_check(rqstp, p)) return 0; diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index eb5accf1b37f..6adabd6049b7 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -34,8 +34,10 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include <linux/fs.h> #include <linux/slab.h> -#include <linux/nfs_fs.h> +#include <linux/posix_acl.h> + #include "nfsfh.h" #include "nfsd.h" #include "acl.h" @@ -100,7 +102,7 @@ deny_mask_from_posix(unsigned short perm, u32 flags) /* We only map from NFSv4 to POSIX ACLs when setting ACLs, when we err on the * side of being more restrictive, so the mode bit mapping below is * pessimistic. An optimistic version would be needed to handle DENY's, - * but we espect to coalesce all ALLOWs and DENYs before mapping to mode + * but we expect to coalesce all ALLOWs and DENYs before mapping to mode * bits. 
*/ static void @@ -458,7 +460,7 @@ init_state(struct posix_acl_state *state, int cnt) state->empty = 1; /* * In the worst case, each individual acl could be for a distinct - * named user or group, but we don't no which, so we allocate + * named user or group, but we don't know which, so we allocate * enough space for either: */ alloc = sizeof(struct posix_ace_state_array) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index a49201835a97..e7f50c4081d6 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -435,12 +435,12 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr, */ status = 0; out: - if (status) - nfsd4_mark_cb_fault(cb->cb_clp, status); + cb->cb_seq_status = status; return status; out_overflow: print_overflow_msg(__func__, xdr); - return -EIO; + status = -EIO; + goto out; } static int decode_cb_sequence4res(struct xdr_stream *xdr, @@ -451,11 +451,10 @@ static int decode_cb_sequence4res(struct xdr_stream *xdr, if (cb->cb_minorversion == 0) return 0; - status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_status); - if (unlikely(status || cb->cb_status)) + status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_seq_status); + if (unlikely(status || cb->cb_seq_status)) return status; - cb->cb_update_seq_nr = true; return decode_cb_sequence4resok(xdr, cb); } @@ -527,7 +526,7 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, if (cb != NULL) { status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_status)) + if (unlikely(status || cb->cb_seq_status)) return status; } @@ -617,7 +616,7 @@ static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, if (cb) { status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_status)) + if (unlikely(status || cb->cb_seq_status)) return status; } return decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &cb->cb_status); @@ -876,7 +875,11 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) u32 minorversion = clp->cl_minorversion; cb->cb_minorversion = minorversion; - cb->cb_update_seq_nr = false; + /* + * cb_seq_status is only set in decode_cb_sequence4res, + * and so will remain 1 if an rpc level failure occurs. + */ + cb->cb_seq_status = 1; cb->cb_status = 0; if (minorversion) { if (!nfsd41_cb_get_slot(clp, task)) @@ -885,15 +888,30 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) rpc_call_start(task); } -static void nfsd4_cb_done(struct rpc_task *task, void *calldata) +static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback *cb) { - struct nfsd4_callback *cb = calldata; struct nfs4_client *clp = cb->cb_clp; + struct nfsd4_session *session = clp->cl_cb_session; + bool ret = true; - dprintk("%s: minorversion=%d\n", __func__, - clp->cl_minorversion); + if (!clp->cl_minorversion) { + /* + * If the backchannel connection was shut down while this + * task was queued, we need to resubmit it after setting up + * a new backchannel connection. + * + * Note that if we lost our callback connection permanently + * the submission code will error out, so we don't need to + * handle that case here. + */ + if (task->tk_flags & RPC_TASK_KILLED) + goto need_restart; + + return true; + } - if (clp->cl_minorversion) { + switch (cb->cb_seq_status) { + case 0: /* * No need for lock, access serialized in nfsd4_cb_prepare * @@ -901,29 +919,63 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) * If CB_SEQUENCE returns an error, then the state of the slot * (sequence ID, cached reply) MUST NOT change. 
*/ - if (cb->cb_update_seq_nr) - ++clp->cl_cb_session->se_cb_seq_nr; - - clear_bit(0, &clp->cl_cb_slot_busy); - rpc_wake_up_next(&clp->cl_cb_waitq); - dprintk("%s: freed slot, new seqid=%d\n", __func__, - clp->cl_cb_session->se_cb_seq_nr); + ++session->se_cb_seq_nr; + break; + case -ESERVERFAULT: + ++session->se_cb_seq_nr; + case 1: + case -NFS4ERR_BADSESSION: + nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); + ret = false; + break; + case -NFS4ERR_DELAY: + if (!rpc_restart_call(task)) + goto out; + + rpc_delay(task, 2 * HZ); + return false; + case -NFS4ERR_BADSLOT: + goto retry_nowait; + case -NFS4ERR_SEQ_MISORDERED: + if (session->se_cb_seq_nr != 1) { + session->se_cb_seq_nr = 1; + goto retry_nowait; + } + break; + default: + dprintk("%s: unprocessed error %d\n", __func__, + cb->cb_seq_status); } - /* - * If the backchannel connection was shut down while this - * task was queued, we need to resubmit it after setting up - * a new backchannel connection. - * - * Note that if we lost our callback connection permanently - * the submission code will error out, so we don't need to - * handle that case here. - */ - if (task->tk_flags & RPC_TASK_KILLED) { - task->tk_status = 0; - cb->cb_need_restart = true; + clear_bit(0, &clp->cl_cb_slot_busy); + rpc_wake_up_next(&clp->cl_cb_waitq); + dprintk("%s: freed slot, new seqid=%d\n", __func__, + clp->cl_cb_session->se_cb_seq_nr); + + if (task->tk_flags & RPC_TASK_KILLED) + goto need_restart; +out: + return ret; +retry_nowait: + if (rpc_restart_call_prepare(task)) + ret = false; + goto out; +need_restart: + task->tk_status = 0; + cb->cb_need_restart = true; + return false; +} + +static void nfsd4_cb_done(struct rpc_task *task, void *calldata) +{ + struct nfsd4_callback *cb = calldata; + struct nfs4_client *clp = cb->cb_clp; + + dprintk("%s: minorversion=%d\n", __func__, + clp->cl_minorversion); + + if (!nfsd4_cb_sequence_done(task, cb)) return; - } if (cb->cb_status) { WARN_ON_ONCE(task->tk_status); @@ -1099,8 +1151,8 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, cb->cb_msg.rpc_resp = cb; cb->cb_ops = ops; INIT_WORK(&cb->cb_work, nfsd4_run_cb_work); + cb->cb_seq_status = 1; cb->cb_status = 0; - cb->cb_update_seq_nr = false; cb->cb_need_restart = false; } diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index e1b3d3d472da..5b20577dcdd2 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -59,9 +59,6 @@ MODULE_PARM_DESC(nfs4_disable_idmapping, * that. 
*/ -#define IDMAP_TYPE_USER 0 -#define IDMAP_TYPE_GROUP 1 - struct ent { struct cache_head h; int type; /* User / Group */ diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 6904213a4363..ebf90e487c75 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -212,6 +212,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, BUG_ON(!ls->ls_file); if (nfsd4_layout_setlease(ls)) { + fput(ls->ls_file); put_nfs4_file(fp); kmem_cache_free(nfs4_layout_stateid_cache, ls); return NULL; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 90cfda75313c..4ce6b97b31ad 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -276,13 +276,13 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval); /* - * Following rfc 3530 14.2.16, use the returned bitmask - * to indicate which attributes we used to store the - * verifier: + * Following rfc 3530 14.2.16, and rfc 5661 18.16.4 + * use the returned bitmask to indicate which attributes + * we used to store the verifier: */ - if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0) - open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS | - FATTR4_WORD1_TIME_MODIFY); + if (nfsd_create_is_exclusive(open->op_createmode) && status == 0) + open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS | + FATTR4_WORD1_TIME_MODIFY); } else /* * Note this may exit with the parent still locked. @@ -362,7 +362,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; struct svc_fh *resfh = NULL; - struct nfsd4_compoundres *resp; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -389,8 +388,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, copy_clientid(&open->op_clientid, cstate->session); /* check seqid for replay. set nfs4_owner */ - resp = rqstp->rq_resp; - status = nfsd4_process_open1(&resp->cstate, open, nn); + status = nfsd4_process_open1(cstate, open, nn); if (status == nfserr_replay_me) { struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay; fh_put(&cstate->current_fh); @@ -417,10 +415,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Openowner is now set, so sequence id will get bumped. 
Now we need * these checks before we do any creates: */ status = nfserr_grace; - if (locks_in_grace(net) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) + if (opens_in_grace(net) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) goto out; status = nfserr_no_grace; - if (!locks_in_grace(net) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) + if (!opens_in_grace(net) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) goto out; switch (open->op_claim_type) { @@ -829,7 +827,7 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; - if (locks_in_grace(SVC_NET(rqstp))) + if (opens_in_grace(SVC_NET(rqstp))) return nfserr_grace; status = nfsd_unlink(rqstp, &cstate->current_fh, 0, remove->rm_name, remove->rm_namelen); @@ -848,7 +846,7 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!cstate->save_fh.fh_dentry) return status; - if (locks_in_grace(SVC_NET(rqstp)) && + if (opens_in_grace(SVC_NET(rqstp)) && !(cstate->save_fh.fh_export->ex_flags & NFSEXP_NOSUBTREECHECK)) return nfserr_grace; status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname, @@ -1364,10 +1362,6 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, goto out; } - nfserr = ops->proc_layoutcommit(inode, lcp); - if (nfserr) - goto out_put_stid; - if (new_size > i_size_read(inode)) { lcp->lc_size_chg = 1; lcp->lc_newsize = new_size; @@ -1375,7 +1369,7 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, lcp->lc_size_chg = 0; } -out_put_stid: + nfserr = ops->proc_layoutcommit(inode, lcp); nfs4_put_stid(&ls->ls_stid); out: return nfserr; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index d88ea7b9a85c..e3d47091b191 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -272,6 +272,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) .ctx.actor = nfsd4_build_namelist, .names = LIST_HEAD_INIT(ctx.names) }; + struct name_list *entry, *tmp; int status; status = nfs4_save_creds(&original_cred); @@ -286,9 +287,8 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) status = iterate_dir(nn->rec_file, &ctx.ctx); mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); - while (!list_empty(&ctx.names)) { - struct name_list *entry; - entry = list_entry(ctx.names.next, struct name_list, list); + + list_for_each_entry_safe(entry, tmp, &ctx.names, list) { if (!status) { struct dentry *dentry; dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); @@ -304,6 +304,12 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) } mutex_unlock(&d_inode(dir)->i_mutex); nfs4_reset_creds(original_cred); + + list_for_each_entry_safe(entry, tmp, &ctx.names, list) { + dprintk("NFSD: %s. Left entry %s\n", __func__, entry->name); + list_del(&entry->list); + kfree(entry); + } return status; } @@ -541,8 +547,7 @@ nfsd4_legacy_tracking_init(struct net *net) /* XXX: The legacy code won't work in a container */ if (net != &init_net) { - WARN(1, KERN_ERR "NFSD: attempt to initialize legacy client " - "tracking in a container!\n"); + pr_warn("NFSD: attempt to initialize legacy client tracking in a container ignored.\n"); return -EINVAL; } @@ -1254,8 +1259,7 @@ nfsd4_umh_cltrack_init(struct net *net) /* XXX: The usermode helper s not working in container yet. 
*/ if (net != &init_net) { - WARN(1, KERN_ERR "NFSD: attempt to initialize umh client " - "tracking in a container!\n"); + pr_warn("NFSD: attempt to initialize umh client tracking in a container ignored.\n"); return -EINVAL; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 61dfb33f0559..0f1d5691b795 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -777,13 +777,16 @@ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); } -static void +static bool unhash_delegation_locked(struct nfs4_delegation *dp) { struct nfs4_file *fp = dp->dl_stid.sc_file; lockdep_assert_held(&state_lock); + if (list_empty(&dp->dl_perfile)) + return false; + dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID; /* Ensure that deleg break won't try to requeue it */ ++dp->dl_time; @@ -792,16 +795,21 @@ unhash_delegation_locked(struct nfs4_delegation *dp) list_del_init(&dp->dl_recall_lru); list_del_init(&dp->dl_perfile); spin_unlock(&fp->fi_lock); + return true; } static void destroy_delegation(struct nfs4_delegation *dp) { + bool unhashed; + spin_lock(&state_lock); - unhash_delegation_locked(dp); + unhashed = unhash_delegation_locked(dp); spin_unlock(&state_lock); - put_clnt_odstate(dp->dl_clnt_odstate); - nfs4_put_deleg_lease(dp->dl_stid.sc_file); - nfs4_put_stid(&dp->dl_stid); + if (unhashed) { + put_clnt_odstate(dp->dl_clnt_odstate); + nfs4_put_deleg_lease(dp->dl_stid.sc_file); + nfs4_put_stid(&dp->dl_stid); + } } static void revoke_delegation(struct nfs4_delegation *dp) @@ -990,6 +998,12 @@ release_all_access(struct nfs4_ol_stateid *stp) } } +static inline void nfs4_free_stateowner(struct nfs4_stateowner *sop) +{ + kfree(sop->so_owner.data); + sop->so_ops->so_free(sop); +} + static void nfs4_put_stateowner(struct nfs4_stateowner *sop) { struct nfs4_client *clp = sop->so_client; @@ -1000,20 +1014,23 @@ static void nfs4_put_stateowner(struct nfs4_stateowner *sop) return; sop->so_ops->so_unhash(sop); spin_unlock(&clp->cl_lock); - kfree(sop->so_owner.data); - sop->so_ops->so_free(sop); + nfs4_free_stateowner(sop); } -static void unhash_ol_stateid(struct nfs4_ol_stateid *stp) +static bool unhash_ol_stateid(struct nfs4_ol_stateid *stp) { struct nfs4_file *fp = stp->st_stid.sc_file; lockdep_assert_held(&stp->st_stateowner->so_client->cl_lock); + if (list_empty(&stp->st_perfile)) + return false; + spin_lock(&fp->fi_lock); - list_del(&stp->st_perfile); + list_del_init(&stp->st_perfile); spin_unlock(&fp->fi_lock); list_del(&stp->st_perstateowner); + return true; } static void nfs4_free_ol_stateid(struct nfs4_stid *stid) @@ -1063,25 +1080,27 @@ static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp, list_add(&stp->st_locks, reaplist); } -static void unhash_lock_stateid(struct nfs4_ol_stateid *stp) +static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp) { struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner); lockdep_assert_held(&oo->oo_owner.so_client->cl_lock); list_del_init(&stp->st_locks); - unhash_ol_stateid(stp); nfs4_unhash_stid(&stp->st_stid); + return unhash_ol_stateid(stp); } static void release_lock_stateid(struct nfs4_ol_stateid *stp) { struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner); + bool unhashed; spin_lock(&oo->oo_owner.so_client->cl_lock); - unhash_lock_stateid(stp); + unhashed = unhash_lock_stateid(stp); spin_unlock(&oo->oo_owner.so_client->cl_lock); - nfs4_put_stid(&stp->st_stid); + if (unhashed) + nfs4_put_stid(&stp->st_stid); } static void 
unhash_lockowner_locked(struct nfs4_lockowner *lo) @@ -1129,7 +1148,7 @@ static void release_lockowner(struct nfs4_lockowner *lo) while (!list_empty(&lo->lo_owner.so_stateids)) { stp = list_first_entry(&lo->lo_owner.so_stateids, struct nfs4_ol_stateid, st_perstateowner); - unhash_lock_stateid(stp); + WARN_ON(!unhash_lock_stateid(stp)); put_ol_stateid_locked(stp, &reaplist); } spin_unlock(&clp->cl_lock); @@ -1142,21 +1161,26 @@ static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp, { struct nfs4_ol_stateid *stp; + lockdep_assert_held(&open_stp->st_stid.sc_client->cl_lock); + while (!list_empty(&open_stp->st_locks)) { stp = list_entry(open_stp->st_locks.next, struct nfs4_ol_stateid, st_locks); - unhash_lock_stateid(stp); + WARN_ON(!unhash_lock_stateid(stp)); put_ol_stateid_locked(stp, reaplist); } } -static void unhash_open_stateid(struct nfs4_ol_stateid *stp, +static bool unhash_open_stateid(struct nfs4_ol_stateid *stp, struct list_head *reaplist) { + bool unhashed; + lockdep_assert_held(&stp->st_stid.sc_client->cl_lock); - unhash_ol_stateid(stp); + unhashed = unhash_ol_stateid(stp); release_open_stateid_locks(stp, reaplist); + return unhashed; } static void release_open_stateid(struct nfs4_ol_stateid *stp) @@ -1164,8 +1188,8 @@ static void release_open_stateid(struct nfs4_ol_stateid *stp) LIST_HEAD(reaplist); spin_lock(&stp->st_stid.sc_client->cl_lock); - unhash_open_stateid(stp, &reaplist); - put_ol_stateid_locked(stp, &reaplist); + if (unhash_open_stateid(stp, &reaplist)) + put_ol_stateid_locked(stp, &reaplist); spin_unlock(&stp->st_stid.sc_client->cl_lock); free_ol_stateid_reaplist(&reaplist); } @@ -1210,8 +1234,8 @@ static void release_openowner(struct nfs4_openowner *oo) while (!list_empty(&oo->oo_owner.so_stateids)) { stp = list_first_entry(&oo->oo_owner.so_stateids, struct nfs4_ol_stateid, st_perstateowner); - unhash_open_stateid(stp, &reaplist); - put_ol_stateid_locked(stp, &reaplist); + if (unhash_open_stateid(stp, &reaplist)) + put_ol_stateid_locked(stp, &reaplist); } spin_unlock(&clp->cl_lock); free_ol_stateid_reaplist(&reaplist); @@ -1714,7 +1738,7 @@ __destroy_client(struct nfs4_client *clp) spin_lock(&state_lock); while (!list_empty(&clp->cl_delegations)) { dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); - unhash_delegation_locked(dp); + WARN_ON(!unhash_delegation_locked(dp)); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -1894,7 +1918,7 @@ static void gen_confirm(struct nfs4_client *clp, struct nfsd_net *nn) * __force to keep sparse happy */ verf[0] = (__force __be32)get_seconds(); - verf[1] = (__force __be32)nn->clientid_counter; + verf[1] = (__force __be32)nn->clverifier_counter++; memcpy(clp->cl_confirm.data, verf, sizeof(clp->cl_confirm.data)); } @@ -2241,6 +2265,9 @@ static bool client_has_state(struct nfs4_client *clp) * Also note we should probably be using this in 4.0 case too. 
*/ return !list_empty(&clp->cl_openowners) +#ifdef CONFIG_NFSD_PNFS + || !list_empty(&clp->cl_lo_states) +#endif || !list_empty(&clp->cl_delegations) || !list_empty(&clp->cl_sessions); } @@ -2547,11 +2574,9 @@ nfsd4_create_session(struct svc_rqst *rqstp, goto out_free_conn; cs_slot = &conf->cl_cs_slot; status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); - if (status == nfserr_replay_cache) { - status = nfsd4_replay_create_session(cr_ses, cs_slot); - goto out_free_conn; - } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) { - status = nfserr_seq_misordered; + if (status) { + if (status == nfserr_replay_cache) + status = nfsd4_replay_create_session(cr_ses, cs_slot); goto out_free_conn; } } else if (unconf) { @@ -3041,10 +3066,11 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, unconf = find_unconfirmed_client_by_name(&clname, nn); if (unconf) unhash_client_locked(unconf); - if (conf && same_verf(&conf->cl_verifier, &clverifier)) + if (conf && same_verf(&conf->cl_verifier, &clverifier)) { /* case 1: probable callback update */ copy_clid(new, conf); - else /* case 4 (new client) or cases 2, 3 (client reboot): */ + gen_confirm(new, nn); + } else /* case 4 (new client) or cases 2, 3 (client reboot): */ gen_clid(new, nn); new->cl_minorversion = 0; gen_callback(new, setclid, rqstp); @@ -3085,10 +3111,11 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, /* * We try hard to give out unique clientid's, so if we get an * attempt to confirm the same clientid with a different cred, - * there's a bug somewhere. Let's charitably assume it's our - * bug. + * the client may be buggy; this should never happen. + * + * Nevertheless, RFC 7530 recommends INUSE for this case: */ - status = nfserr_serverfault; + status = nfserr_clid_inuse; if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred)) goto out; if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred)) @@ -3315,7 +3342,8 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, hash_openowner(oo, clp, strhashval); ret = oo; } else - nfs4_free_openowner(&oo->oo_owner); + nfs4_free_stateowner(&oo->oo_owner); + spin_unlock(&clp->cl_lock); return ret; } @@ -3482,6 +3510,9 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb, { struct nfs4_delegation *dp = cb_to_delegation(cb); + if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID) + return 1; + switch (task->tk_status) { case 0: return 1; @@ -3885,12 +3916,6 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c return status; } -static void -nfs4_set_claim_prev(struct nfsd4_open *open, bool has_session) -{ - open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; -} - /* Should we give out recallable state?: */ static bool nfsd4_cb_channel_good(struct nfs4_client *clp) { @@ -3923,7 +3948,7 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag) static int nfs4_setlease(struct nfs4_delegation *dp) { struct nfs4_file *fp = dp->dl_stid.sc_file; - struct file_lock *fl, *ret; + struct file_lock *fl; struct file *filp; int status = 0; @@ -3934,10 +3959,10 @@ static int nfs4_setlease(struct nfs4_delegation *dp) if (!filp) { /* We should always have a readable file here */ WARN_ON_ONCE(1); + locks_free_lock(fl); return -EBADF; } fl->fl_file = filp; - ret = fl; status = vfs_setlease(filp, fl->fl_type, &fl, NULL); if (fl) locks_free_lock(fl); @@ -4063,7 +4088,8 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, case NFS4_OPEN_CLAIM_FH: /* * Let's not give out any 
delegations till everyone's - * had the chance to reclaim theirs.... + * had the chance to reclaim theirs, *and* until + * NLM locks have all been reclaimed: */ if (locks_in_grace(clp->net)) goto out_no_deleg; @@ -4209,7 +4235,7 @@ out: if (fp) put_nfs4_file(fp); if (status == 0 && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) - nfs4_set_claim_prev(open, nfsd4_has_session(&resp->cstate)); + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; /* * To finish the open response, we just need to set the rflags. */ @@ -4338,14 +4364,12 @@ nfs4_laundromat(struct nfsd_net *nn) spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - if (net_generic(dp->dl_stid.sc_client->net, nfsd_net_id) != nn) - continue; if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) { t = dp->dl_time - cutoff; new_timeo = min(new_timeo, t); break; } - unhash_delegation_locked(dp); + WARN_ON(!unhash_delegation_locked(dp)); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -4396,9 +4420,9 @@ laundromat_main(struct work_struct *laundry) queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ); } -static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) +static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) { - if (!fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle)) + if (!fh_match(&fhp->fh_handle, &stp->sc_file->fi_fhandle)) return nfserr_bad_stateid; return nfs_ok; } @@ -4440,7 +4464,7 @@ check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid, { if (ONE_STATEID(stateid) && (flags & RD_STATE)) return nfs_ok; - else if (locks_in_grace(net)) { + else if (opens_in_grace(net)) { /* Answer in remaining cases depends on existence of * conflicting state; so we must wait out the grace period. 
*/ return nfserr_grace; @@ -4459,7 +4483,7 @@ check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid, static inline int grace_disallows_io(struct net *net, struct inode *inode) { - return locks_in_grace(net) && mandatory_lock(inode); + return opens_in_grace(net) && mandatory_lock(inode); } /* Returns true iff a is later than b: */ @@ -4601,9 +4625,6 @@ nfs4_check_olstateid(struct svc_fh *fhp, struct nfs4_ol_stateid *ols, int flags) { __be32 status; - status = nfs4_check_fh(fhp, ols); - if (status) - return status; status = nfsd4_check_openowner_confirmed(ols); if (status) return status; @@ -4690,6 +4711,9 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, status = nfserr_bad_stateid; break; } + if (status) + goto out; + status = nfs4_check_fh(fhp, s); done: if (!status && filpp) @@ -4751,7 +4775,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (check_for_locks(stp->st_stid.sc_file, lockowner(stp->st_stateowner))) break; - unhash_lock_stateid(stp); + WARN_ON(!unhash_lock_stateid(stp)); spin_unlock(&cl->cl_lock); nfs4_put_stid(s); ret = nfs_ok; @@ -4798,7 +4822,7 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); if (status) return status; - return nfs4_check_fh(current_fh, stp); + return nfs4_check_fh(current_fh, &stp->st_stid); } /* @@ -4967,20 +4991,23 @@ out: static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) { struct nfs4_client *clp = s->st_stid.sc_client; + bool unhashed; LIST_HEAD(reaplist); s->st_stid.sc_type = NFS4_CLOSED_STID; spin_lock(&clp->cl_lock); - unhash_open_stateid(s, &reaplist); + unhashed = unhash_open_stateid(s, &reaplist); if (clp->cl_minorversion) { - put_ol_stateid_locked(s, &reaplist); + if (unhashed) + put_ol_stateid_locked(s, &reaplist); spin_unlock(&clp->cl_lock); free_ol_stateid_reaplist(&reaplist); } else { spin_unlock(&clp->cl_lock); free_ol_stateid_reaplist(&reaplist); - move_to_close_lru(s, clp->net); + if (unhashed) + move_to_close_lru(s, clp->net); } } @@ -5045,9 +5072,6 @@ out: return status; } - -#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start)) - static inline u64 end_offset(u64 start, u64 len) { @@ -5139,8 +5163,7 @@ nevermind: } static struct nfs4_lockowner * -find_lockowner_str_locked(clientid_t *clid, struct xdr_netobj *owner, - struct nfs4_client *clp) +find_lockowner_str_locked(struct nfs4_client *clp, struct xdr_netobj *owner) { unsigned int strhashval = ownerstr_hashval(owner); struct nfs4_stateowner *so; @@ -5158,13 +5181,12 @@ find_lockowner_str_locked(clientid_t *clid, struct xdr_netobj *owner, } static struct nfs4_lockowner * -find_lockowner_str(clientid_t *clid, struct xdr_netobj *owner, - struct nfs4_client *clp) +find_lockowner_str(struct nfs4_client *clp, struct xdr_netobj *owner) { struct nfs4_lockowner *lo; spin_lock(&clp->cl_lock); - lo = find_lockowner_str_locked(clid, owner, clp); + lo = find_lockowner_str_locked(clp, owner); spin_unlock(&clp->cl_lock); return lo; } @@ -5208,14 +5230,14 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, lo->lo_owner.so_seqid = lock->lk_new_lock_seqid; lo->lo_owner.so_ops = &lockowner_ops; spin_lock(&clp->cl_lock); - ret = find_lockowner_str_locked(&clp->cl_clientid, - &lock->lk_new_owner, clp); + ret = find_lockowner_str_locked(clp, &lock->lk_new_owner); if (ret == NULL) { list_add(&lo->lo_owner.so_strhash, &clp->cl_ownerstr_hashtbl[strhashval]); ret = 
lo; } else - nfs4_free_lockowner(&lo->lo_owner); + nfs4_free_stateowner(&lo->lo_owner); + spin_unlock(&clp->cl_lock); return ret; } @@ -5298,8 +5320,8 @@ find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi, static int check_lock_length(u64 offset, u64 length) { - return ((length == 0) || ((length != NFS4_MAX_UINT64) && - LOFF_OVERFLOW(offset, length))); + return ((length == 0) || ((length != NFS4_MAX_UINT64) && + (length > ~offset))); } static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access) @@ -5328,9 +5350,9 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_lockowner *lo; unsigned int strhashval; - lo = find_lockowner_str(&cl->cl_clientid, &lock->v.new.owner, cl); + lo = find_lockowner_str(cl, &lock->lk_new_owner); if (!lo) { - strhashval = ownerstr_hashval(&lock->v.new.owner); + strhashval = ownerstr_hashval(&lock->lk_new_owner); lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock); if (lo == NULL) return nfserr_jukebox; @@ -5391,7 +5413,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (lock->lk_is_new) { if (nfsd4_has_session(cstate)) /* See rfc 5661 18.10.3: given clientid is ignored: */ - memcpy(&lock->v.new.clientid, + memcpy(&lock->lk_new_clientid, &cstate->session->se_client->cl_clientid, sizeof(clientid_t)); @@ -5409,7 +5431,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, open_sop = openowner(open_stp->st_stateowner); status = nfserr_bad_stateid; if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid, - &lock->v.new.clientid)) + &lock->lk_new_clientid)) goto out; status = lookup_or_create_lock_state(cstate, open_stp, lock, &lock_stp, &new); @@ -5603,8 +5625,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - lo = find_lockowner_str(&lockt->lt_clientid, &lockt->lt_owner, - cstate->clp); + lo = find_lockowner_str(cstate->clp, &lockt->lt_owner); if (lo) file_lock->fl_owner = (fl_owner_t)lo; file_lock->fl_pid = current->tgid; @@ -6019,7 +6040,7 @@ nfsd_inject_add_lock_to_list(struct nfs4_ol_stateid *lst, static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max, struct list_head *collect, - void (*func)(struct nfs4_ol_stateid *)) + bool (*func)(struct nfs4_ol_stateid *)) { struct nfs4_openowner *oop; struct nfs4_ol_stateid *stp, *st_next; @@ -6033,9 +6054,9 @@ static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max, list_for_each_entry_safe(lst, lst_next, &stp->st_locks, st_locks) { if (func) { - func(lst); - nfsd_inject_add_lock_to_list(lst, - collect); + if (func(lst)) + nfsd_inject_add_lock_to_list(lst, + collect); } ++count; /* @@ -6305,7 +6326,7 @@ static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max, continue; atomic_inc(&clp->cl_refcount); - unhash_delegation_locked(dp); + WARN_ON(!unhash_delegation_locked(dp)); list_add(&dp->dl_recall_lru, victims); } ++count; @@ -6584,6 +6605,7 @@ nfs4_state_start_net(struct net *net) return ret; nn->boot_time = get_seconds(); nn->grace_ended = false; + nn->nfsd4_manager.block_opens = true; locks_start_grace(net, &nn->nfsd4_manager); nfsd4_client_tracking_init(net); printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n", @@ -6602,7 +6624,7 @@ nfs4_state_start(void) ret = set_callback_cred(); if (ret) return -ENOMEM; - laundry_wq = create_singlethread_workqueue("nfsd4"); + laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4"); if (laundry_wq == NULL) { ret = -ENOMEM; goto out_recovery; @@ -6635,7 +6657,7 
@@ nfs4_state_shutdown_net(struct net *net) spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - unhash_delegation_locked(dp); + WARN_ON(!unhash_delegation_locked(dp)); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 54633858733a..51c9e9ca39a4 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2140,9 +2140,31 @@ nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp, return nfsd4_encode_user(xdr, rqstp, ace->who_uid); } +static inline __be32 +nfsd4_encode_layout_type(struct xdr_stream *xdr, enum pnfs_layouttype layout_type) +{ + __be32 *p; + + if (layout_type) { + p = xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(layout_type); + } else { + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(0); + } + + return 0; +} + #define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \ FATTR4_WORD0_RDATTR_ERROR) #define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID +#define WORD2_ABSENT_FS_ATTRS 0 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL static inline __be32 @@ -2171,7 +2193,7 @@ nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp, { return 0; } #endif -static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err) +static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *bmval2, u32 *rdattr_err) { /* As per referral draft: */ if (*bmval0 & ~WORD0_ABSENT_FS_ATTRS || @@ -2184,6 +2206,7 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err) } *bmval0 &= WORD0_ABSENT_FS_ATTRS; *bmval1 &= WORD1_ABSENT_FS_ATTRS; + *bmval2 &= WORD2_ABSENT_FS_ATTRS; return 0; } @@ -2203,6 +2226,39 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat) return err; } +static __be32 +nfsd4_encode_bitmap(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2) +{ + __be32 *p; + + if (bmval2) { + p = xdr_reserve_space(xdr, 16); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(bmval0); + *p++ = cpu_to_be32(bmval1); + *p++ = cpu_to_be32(bmval2); + } else if (bmval1) { + p = xdr_reserve_space(xdr, 12); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(bmval0); + *p++ = cpu_to_be32(bmval1); + } else { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(bmval0); + } + + return 0; +out_resource: + return nfserr_resource; +} + /* * Note: @fhp can be NULL; in this case, we might have to compose the filehandle * ourselves. 
@@ -2246,8 +2302,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion)); if (exp->ex_fslocs.migrated) { - BUG_ON(bmval[2]); - status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err); + status = fattr_handle_absent_fs(&bmval0, &bmval1, &bmval2, &rdattr_err); if (status) goto out; } @@ -2286,8 +2341,8 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, } #ifdef CONFIG_NFSD_V4_SECURITY_LABEL - if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) || - bmval[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { + if ((bmval2 & FATTR4_WORD2_SECURITY_LABEL) || + bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { err = security_inode_getsecctx(d_inode(dentry), &context, &contextlen); contextsupport = (err == 0); @@ -2300,28 +2355,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, } #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ - if (bmval2) { - p = xdr_reserve_space(xdr, 16); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(3); - *p++ = cpu_to_be32(bmval0); - *p++ = cpu_to_be32(bmval1); - *p++ = cpu_to_be32(bmval2); - } else if (bmval1) { - p = xdr_reserve_space(xdr, 12); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(2); - *p++ = cpu_to_be32(bmval0); - *p++ = cpu_to_be32(bmval1); - } else { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - *p++ = cpu_to_be32(bmval0); - } + status = nfsd4_encode_bitmap(xdr, bmval0, bmval1, bmval2); + if (status) + goto out; attrlen_offset = xdr->buf->len; p = xdr_reserve_space(xdr, 4); @@ -2674,6 +2710,9 @@ out_acl: *p++ = cpu_to_be32(stat.mtime.tv_nsec); } if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { + struct kstat parent_stat; + u64 ino = stat.ino; + p = xdr_reserve_space(xdr, 8); if (!p) goto out_resource; @@ -2682,25 +2721,25 @@ out_acl: * and this is the root of a cross-mounted filesystem. 
*/ if (ignore_crossmnt == 0 && - dentry == exp->ex_path.mnt->mnt_root) - get_parent_attributes(exp, &stat); - p = xdr_encode_hyper(p, stat.ino); + dentry == exp->ex_path.mnt->mnt_root) { + err = get_parent_attributes(exp, &parent_stat); + if (err) + goto out_nfserr; + ino = parent_stat.ino; + } + p = xdr_encode_hyper(p, ino); } #ifdef CONFIG_NFSD_PNFS - if ((bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) || - (bmval2 & FATTR4_WORD2_LAYOUT_TYPES)) { - if (exp->ex_layout_type) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - *p++ = cpu_to_be32(exp->ex_layout_type); - } else { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(0); - } + if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) { + status = nfsd4_encode_layout_type(xdr, exp->ex_layout_type); + if (status) + goto out; + } + + if (bmval2 & FATTR4_WORD2_LAYOUT_TYPES) { + status = nfsd4_encode_layout_type(xdr, exp->ex_layout_type); + if (status) + goto out; } if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) { @@ -2710,21 +2749,20 @@ out_acl: *p++ = cpu_to_be32(stat.blksize); } #endif /* CONFIG_NFSD_PNFS */ + if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { + status = nfsd4_encode_bitmap(xdr, NFSD_SUPPATTR_EXCLCREAT_WORD0, + NFSD_SUPPATTR_EXCLCREAT_WORD1, + NFSD_SUPPATTR_EXCLCREAT_WORD2); + if (status) + goto out; + } + if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { status = nfsd4_encode_security_label(xdr, rqstp, context, contextlen); if (status) goto out; } - if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { - p = xdr_reserve_space(xdr, 16); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(3); - *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD0); - *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD1); - *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD2); - } attrlen = htonl(xdr->buf->len - attrlen_offset - 4); write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, 4); @@ -3043,13 +3081,12 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ __be32 *p; if (!nfserr) { - p = xdr_reserve_space(xdr, 32); + p = xdr_reserve_space(xdr, 20); if (!p) return nfserr_resource; - p = encode_cinfo(p, &create->cr_cinfo); - *p++ = cpu_to_be32(2); - *p++ = cpu_to_be32(create->cr_bmval[0]); - *p++ = cpu_to_be32(create->cr_bmval[1]); + encode_cinfo(p, &create->cr_cinfo); + nfserr = nfsd4_encode_bitmap(xdr, create->cr_bmval[0], + create->cr_bmval[1], create->cr_bmval[2]); } return nfserr; } @@ -3189,16 +3226,22 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid); if (nfserr) goto out; - p = xdr_reserve_space(xdr, 40); + p = xdr_reserve_space(xdr, 24); if (!p) return nfserr_resource; p = encode_cinfo(p, &open->op_cinfo); *p++ = cpu_to_be32(open->op_rflags); - *p++ = cpu_to_be32(2); - *p++ = cpu_to_be32(open->op_bmval[0]); - *p++ = cpu_to_be32(open->op_bmval[1]); - *p++ = cpu_to_be32(open->op_delegate_type); + nfserr = nfsd4_encode_bitmap(xdr, open->op_bmval[0], open->op_bmval[1], + open->op_bmval[2]); + if (nfserr) + goto out; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + + *p++ = cpu_to_be32(open->op_delegate_type); switch (open->op_delegate_type) { case NFS4_OPEN_DELEGATE_NONE: break; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 9277cc91c21b..ad4e2377dd63 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -391,6 +391,14 @@ static int nfsd_get_default_max_blksize(void) return ret; } +static struct svc_serv_ops nfsd_thread_sv_ops = { + .svo_shutdown = 
nfsd_last_thread, + .svo_function = nfsd, + .svo_enqueue_xprt = svc_xprt_do_enqueue, + .svo_setup = svc_set_num_threads, + .svo_module = THIS_MODULE, +}; + int nfsd_create_serv(struct net *net) { int error; @@ -405,7 +413,7 @@ int nfsd_create_serv(struct net *net) nfsd_max_blksize = nfsd_get_default_max_blksize(); nfsd_reset_versions(); nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, - nfsd_last_thread, nfsd, THIS_MODULE); + &nfsd_thread_sv_ops); if (nn->nfsd_serv == NULL) return -ENOMEM; @@ -500,8 +508,8 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) /* apply the new numbers */ svc_get(nn->nfsd_serv); for (i = 0; i < n; i++) { - err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i], - nthreads[i]); + err = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv, + &nn->nfsd_serv->sv_pools[i], nthreads[i]); if (err) break; } @@ -540,7 +548,8 @@ nfsd_svc(int nrservs, struct net *net) error = nfsd_startup_net(nrservs, net); if (error) goto out_destroy; - error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs); + error = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv, + NULL, nrservs); if (error) goto out_shutdown; /* We are holding a reference to nn->nfsd_serv which diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 4874ce515fc1..583ffc13cae2 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -67,8 +67,8 @@ struct nfsd4_callback { struct rpc_message cb_msg; struct nfsd4_callback_ops *cb_ops; struct work_struct cb_work; + int cb_seq_status; int cb_status; - bool cb_update_seq_nr; bool cb_need_restart; }; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index b5e077a6e7d4..45c04979e7b3 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1249,12 +1249,6 @@ out_nfserr: #ifdef CONFIG_NFSD_V3 -static inline int nfsd_create_is_exclusive(int createmode) -{ - return createmode == NFS3_CREATE_EXCLUSIVE - || createmode == NFS4_CREATE_EXCLUSIVE4_1; -} - /* * NFSv3 and NFSv4 version of nfsd_create */ diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 5be875e3e638..fee2451ae248 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -131,4 +131,10 @@ static inline __be32 fh_getattr(struct svc_fh *fh, struct kstat *stat) return nfserrno(vfs_getattr(&p, stat)); } +static inline int nfsd_create_is_exclusive(int createmode) +{ + return createmode == NFS3_CREATE_EXCLUSIVE + || createmode == NFS4_CREATE_EXCLUSIVE4_1; +} + #endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 42468e5ab3e7..f63620ce3892 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -338,12 +338,11 @@ void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed) /* * BIO operations */ -static void nilfs_end_bio_write(struct bio *bio, int err) +static void nilfs_end_bio_write(struct bio *bio) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct nilfs_segment_buffer *segbuf = bio->bi_private; - if (!uptodate) + if (bio->bi_error) atomic_inc(&segbuf->sb_err); bio_put(bio); @@ -415,7 +414,7 @@ static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf, { wi->bio = NULL; wi->rest_blocks = segbuf->sb_sum.nblocks; - wi->max_pages = bio_get_nr_vecs(wi->nilfs->ns_bdev); + wi->max_pages = BIO_MAX_PAGES; wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); wi->start = wi->end = 0; wi->blocknr = segbuf->sb_pseg_start; diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 44523f4a6084..6faaf710e563 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -154,6 +154,7 @@ void dnotify_flush(struct 
file *filp, fl_owner_t id) struct dnotify_struct *dn; struct dnotify_struct **prev; struct inode *inode; + bool free = false; inode = file_inode(filp); if (!S_ISDIR(inode->i_mode)) @@ -182,11 +183,15 @@ void dnotify_flush(struct file *filp, fl_owner_t id) /* nothing else could have found us thanks to the dnotify_groups mark_mutex */ - if (dn_mark->dn == NULL) - fsnotify_destroy_mark_locked(fsn_mark, dnotify_group); + if (dn_mark->dn == NULL) { + fsnotify_detach_mark(fsn_mark); + free = true; + } mutex_unlock(&dnotify_group->mark_mutex); + if (free) + fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); } @@ -362,9 +367,10 @@ out: spin_unlock(&fsn_mark->lock); if (destroy) - fsnotify_destroy_mark_locked(fsn_mark, dnotify_group); - + fsnotify_detach_mark(fsn_mark); mutex_unlock(&dnotify_group->mark_mutex); + if (destroy) + fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); out_err: if (new_fsn_mark) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index cf275500a665..8e8e6bcd1d43 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -529,8 +529,10 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); if (destroy_mark) - fsnotify_destroy_mark_locked(fsn_mark, group); + fsnotify_detach_mark(fsn_mark); mutex_unlock(&group->mark_mutex); + if (destroy_mark) + fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); if (removed & real_mount(mnt)->mnt_fsnotify_mask) @@ -557,8 +559,10 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); if (destroy_mark) - fsnotify_destroy_mark_locked(fsn_mark, group); + fsnotify_detach_mark(fsn_mark); mutex_unlock(&group->mark_mutex); + if (destroy_mark) + fsnotify_free_mark(fsn_mark); /* matches the fsnotify_find_inode_mark() */ fsnotify_put_mark(fsn_mark); diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 58b7cdb63da9..6b6f0d472ae8 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -76,7 +76,8 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) struct inotify_inode_mark *inode_mark; struct inode *inode; - if (!(mark->flags & (FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_INODE))) + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE) || + !(mark->flags & FSNOTIFY_MARK_FLAG_INODE)) return; inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index dd3fb0b17be7..db39de2dd4cb 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -26,7 +26,6 @@ #include <linux/fsnotify_backend.h> #include "fsnotify.h" -#include "../mount.h" /* * Clear all of the marks on an inode when it is being evicted from core @@ -205,6 +204,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, mnt = NULL; /* + * Optimization: srcu_read_lock() has a memory barrier which can + * be expensive. It protects walking the *_fsnotify_marks lists. + * However, if we do not walk the lists, we do not have to do + * SRCU because we have no references to any objects and do not + * need SRCU to keep them "alive". 
+ */ + if (hlist_empty(&to_tell->i_fsnotify_marks) && + (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks))) + return 0; + /* * if this is a modify event we may need to clear the ignored masks * otherwise return if neither the inode nor the vfsmount care about * this type of event. diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 13a00be516d2..b44c68a857e7 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -6,6 +6,8 @@ #include <linux/srcu.h> #include <linux/types.h> +#include "../mount.h" + /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); @@ -38,15 +40,22 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); /* inode specific destruction of a mark */ extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); -/* Destroy all marks in the given list */ -extern void fsnotify_destroy_marks(struct list_head *to_free); /* Find mark belonging to given group in the list of marks */ extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, struct fsnotify_group *group); -/* run the list of all marks associated with inode and flag them to be freed */ -extern void fsnotify_clear_marks_by_inode(struct inode *inode); -/* run the list of all marks associated with vfsmount and flag them to be freed */ -extern void fsnotify_clear_marks_by_mount(struct vfsmount *mnt); +/* Destroy all marks in the given list protected by 'lock' */ +extern void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock); +/* run the list of all marks associated with inode and destroy them */ +static inline void fsnotify_clear_marks_by_inode(struct inode *inode) +{ + fsnotify_destroy_marks(&inode->i_fsnotify_marks, &inode->i_lock); +} +/* run the list of all marks associated with vfsmount and destroy them */ +static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) +{ + fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks, + &mnt->mnt_root->d_lock); +} /* * update the dentry->d_flags of all of inode's children to indicate if inode cares * about events that happen to its children. diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 3daf513ee99e..e785fd954c30 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -65,26 +65,6 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) } /* - * Given an inode, destroy all of the marks associated with that inode. - */ -void fsnotify_clear_marks_by_inode(struct inode *inode) -{ - struct fsnotify_mark *mark; - struct hlist_node *n; - LIST_HEAD(free_list); - - spin_lock(&inode->i_lock); - hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, obj_list) { - list_add(&mark->free_list, &free_list); - hlist_del_init_rcu(&mark->obj_list); - fsnotify_get_mark(mark); - } - spin_unlock(&inode->i_lock); - - fsnotify_destroy_marks(&free_list); -} - -/* * Given a group clear all of the inode marks associated with that group. */ void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) @@ -163,17 +143,17 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, /** * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. - * @list: list of inodes being unmounted (sb->s_inodes) + * @sb: superblock being unmounted. * * Called during unmount with no locks held, so needs to be safe against - * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block. 
+ * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. */ -void fsnotify_unmount_inodes(struct list_head *list) +void fsnotify_unmount_inodes(struct super_block *sb) { struct inode *inode, *next_i, *need_iput = NULL; - spin_lock(&inode_sb_list_lock); - list_for_each_entry_safe(inode, next_i, list, i_sb_list) { + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry_safe(inode, next_i, &sb->s_inodes, i_sb_list) { struct inode *need_iput_tmp; /* @@ -209,7 +189,7 @@ void fsnotify_unmount_inodes(struct list_head *list) spin_unlock(&inode->i_lock); /* In case the dropping of a reference would nuke next_i. */ - while (&next_i->i_sb_list != list) { + while (&next_i->i_sb_list != &sb->s_inodes) { spin_lock(&next_i->i_lock); if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) && atomic_read(&next_i->i_count)) { @@ -224,12 +204,12 @@ void fsnotify_unmount_inodes(struct list_head *list) } /* - * We can safely drop inode_sb_list_lock here because either + * We can safely drop s_inode_list_lock here because either * we actually hold references on both inode and next_i or * end of list. Also no new inodes will be added since the * umount has begun. */ - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); if (need_iput_tmp) iput(need_iput_tmp); @@ -241,7 +221,7 @@ void fsnotify_unmount_inodes(struct list_head *list) iput(inode); - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); } diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 92e48c70f0f0..fc0df4442f7b 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -122,26 +122,27 @@ u32 fsnotify_recalc_mask(struct hlist_head *head) } /* - * Any time a mark is getting freed we end up here. - * The caller had better be holding a reference to this mark so we don't actually - * do the final put under the mark->lock + * Remove mark from inode / vfsmount list, group list, drop inode reference + * if we got one. + * + * Must be called with group->mark_mutex held. */ -void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, - struct fsnotify_group *group) +void fsnotify_detach_mark(struct fsnotify_mark *mark) { struct inode *inode = NULL; + struct fsnotify_group *group = mark->group; BUG_ON(!mutex_is_locked(&group->mark_mutex)); spin_lock(&mark->lock); /* something else already called this function on this mark */ - if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { spin_unlock(&mark->lock); return; } - mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; + mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { inode = mark->inode; @@ -150,6 +151,12 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, fsnotify_destroy_vfsmount_mark(mark); else BUG(); + /* + * Note that we didn't update flags telling whether inode cares about + * what's happening with children. We update these flags from + * __fsnotify_parent() lazily when next event happens on one of our + * children. + */ list_del_init(&mark->g_list); @@ -157,18 +164,32 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) iput(inode); - /* release lock temporarily */ - mutex_unlock(&group->mark_mutex); + + atomic_dec(&group->num_marks); +} + +/* + * Free fsnotify mark. The freeing is actually happening from a kthread which + * first waits for srcu period end. 
Caller must have a reference to the mark + * or be protected by fsnotify_mark_srcu. + */ +void fsnotify_free_mark(struct fsnotify_mark *mark) +{ + struct fsnotify_group *group = mark->group; + + spin_lock(&mark->lock); + /* something else already called this function on this mark */ + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { + spin_unlock(&mark->lock); + return; + } + mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; + spin_unlock(&mark->lock); spin_lock(&destroy_lock); list_add(&mark->g_list, &destroy_list); spin_unlock(&destroy_lock); wake_up(&destroy_waitq); - /* - * We don't necessarily have a ref on mark from caller so the above destroy - * may have actually freed it, unless this group provides a 'freeing_mark' - * function which must be holding a reference. - */ /* * Some groups like to know that marks are being freed. This is a @@ -177,50 +198,45 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, */ if (group->ops->freeing_mark) group->ops->freeing_mark(mark, group); - - /* - * __fsnotify_update_child_dentry_flags(inode); - * - * I really want to call that, but we can't, we have no idea if the inode - * still exists the second we drop the mark->lock. - * - * The next time an event arrive to this inode from one of it's children - * __fsnotify_parent will see that the inode doesn't care about it's - * children and will update all of these flags then. So really this - * is just a lazy update (and could be a perf win...) - */ - - atomic_dec(&group->num_marks); - - mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); } void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group) { mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); - fsnotify_destroy_mark_locked(mark, group); + fsnotify_detach_mark(mark); mutex_unlock(&group->mark_mutex); + fsnotify_free_mark(mark); } -/* - * Destroy all marks in the given list. The marks must be already detached from - * the original inode / vfsmount. - */ -void fsnotify_destroy_marks(struct list_head *to_free) +void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock) { - struct fsnotify_mark *mark, *lmark; - struct fsnotify_group *group; - - list_for_each_entry_safe(mark, lmark, to_free, free_list) { - spin_lock(&mark->lock); - fsnotify_get_group(mark->group); - group = mark->group; - spin_unlock(&mark->lock); + struct fsnotify_mark *mark; - fsnotify_destroy_mark(mark, group); + while (1) { + /* + * We have to be careful since we can race with e.g. + * fsnotify_clear_marks_by_group() and once we drop 'lock', + * mark can get removed from the obj_list and destroyed. But + * we are holding mark reference so mark cannot be freed and + * calling fsnotify_destroy_mark() more than once is fine. + */ + spin_lock(lock); + if (hlist_empty(head)) { + spin_unlock(lock); + break; + } + mark = hlist_entry(head->first, struct fsnotify_mark, obj_list); + /* + * We don't update i_fsnotify_mask / mnt_fsnotify_mask here + * since inode / mount is going away anyway. So just remove + * mark from the list. 
+ */ + hlist_del_init_rcu(&mark->obj_list); + fsnotify_get_mark(mark); + spin_unlock(lock); + fsnotify_destroy_mark(mark, mark->group); fsnotify_put_mark(mark); - fsnotify_put_group(group); } } @@ -332,7 +348,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, * inode->i_lock */ spin_lock(&mark->lock); - mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE; + mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED; fsnotify_get_group(group); mark->group = group; @@ -412,16 +428,37 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags) { struct fsnotify_mark *lmark, *mark; + LIST_HEAD(to_free); + /* + * We have to be really careful here. Anytime we drop mark_mutex, e.g. + * fsnotify_clear_marks_by_inode() can come and free marks. Even in our + * to_free list so we have to use mark_mutex even when accessing that + * list. And freeing mark requires us to drop mark_mutex. So we can + * reliably free only the first mark in the list. That's why we first + * move marks to free to to_free list in one go and then free marks in + * to_free list one by one. + */ mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { - if (mark->flags & flags) { - fsnotify_get_mark(mark); - fsnotify_destroy_mark_locked(mark, group); - fsnotify_put_mark(mark); - } + if (mark->flags & flags) + list_move(&mark->g_list, &to_free); } mutex_unlock(&group->mark_mutex); + + while (1) { + mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); + if (list_empty(&to_free)) { + mutex_unlock(&group->mark_mutex); + break; + } + mark = list_first_entry(&to_free, struct fsnotify_mark, g_list); + fsnotify_get_mark(mark); + fsnotify_detach_mark(mark); + mutex_unlock(&group->mark_mutex); + fsnotify_free_mark(mark); + fsnotify_put_mark(mark); + } } /* diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index 326b148e623c..a8fcab68faef 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -28,25 +28,6 @@ #include <linux/fsnotify_backend.h> #include "fsnotify.h" -#include "../mount.h" - -void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) -{ - struct fsnotify_mark *mark; - struct hlist_node *n; - struct mount *m = real_mount(mnt); - LIST_HEAD(free_list); - - spin_lock(&mnt->mnt_root->d_lock); - hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, obj_list) { - list_add(&mark->free_list, &free_list); - hlist_del_init_rcu(&mark->obj_list); - fsnotify_get_mark(mark); - } - spin_unlock(&mnt->mnt_root->d_lock); - - fsnotify_destroy_marks(&free_list); -} void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) { diff --git a/fs/nsfs.c b/fs/nsfs.c index 99521e7c492b..e4905fbf3396 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -4,6 +4,7 @@ #include <linux/proc_ns.h> #include <linux/magic.h> #include <linux/ktime.h> +#include <linux/seq_file.h> static struct vfsmount *nsfs_mnt; @@ -136,9 +137,18 @@ out_invalid: return ERR_PTR(-EINVAL); } +static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + const struct proc_ns_operations *ns_ops = dentry->d_fsdata; + + return seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino); +} + static const struct super_operations nsfs_ops = { .statfs = simple_statfs, .evict_inode = nsfs_evict, + .show_path = nsfs_show_path, }; static struct dentry *nsfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) diff --git a/fs/ntfs/super.c 
b/fs/ntfs/super.c index 9e1e112074fb..d1a853585b53 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -543,7 +543,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) return -EROFS; } if (!ntfs_stamp_usnjrnl(vol)) { - ntfs_error(sb, "Failed to stamp transation log " + ntfs_error(sb, "Failed to stamp transaction log " "($UsnJrnl)%s", es); NVolSetErrors(vol); return -EROFS; @@ -2204,17 +2204,12 @@ get_ctx_vol_failed: return true; #ifdef NTFS_RW iput_usnjrnl_err_out: - if (vol->usnjrnl_j_ino) - iput(vol->usnjrnl_j_ino); - if (vol->usnjrnl_max_ino) - iput(vol->usnjrnl_max_ino); - if (vol->usnjrnl_ino) - iput(vol->usnjrnl_ino); + iput(vol->usnjrnl_j_ino); + iput(vol->usnjrnl_max_ino); + iput(vol->usnjrnl_ino); iput_quota_err_out: - if (vol->quota_q_ino) - iput(vol->quota_q_ino); - if (vol->quota_ino) - iput(vol->quota_ino); + iput(vol->quota_q_ino); + iput(vol->quota_ino); iput(vol->extend_ino); #endif /* NTFS_RW */ iput_sec_err_out: @@ -2223,8 +2218,7 @@ iput_root_err_out: iput(vol->root_ino); iput_logfile_err_out: #ifdef NTFS_RW - if (vol->logfile_ino) - iput(vol->logfile_ino); + iput(vol->logfile_ino); iput_vol_err_out: #endif /* NTFS_RW */ iput(vol->vol_ino); @@ -2254,8 +2248,7 @@ iput_mftbmp_err_out: iput(vol->mftbmp_ino); iput_mirr_err_out: #ifdef NTFS_RW - if (vol->mftmirr_ino) - iput(vol->mftmirr_ino); + iput(vol->mftmirr_ino); #endif /* NTFS_RW */ return false; } diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index c58a1bcfda0f..0cdf497c91ef 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -284,7 +284,19 @@ int ocfs2_set_acl(handle_t *handle, int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL); + struct buffer_head *bh = NULL; + int status = 0; + + status = ocfs2_inode_lock(inode, &bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); + ocfs2_inode_unlock(inode, 1); + brelse(bh); + return status; } struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) @@ -292,19 +304,21 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) struct ocfs2_super *osb; struct buffer_head *di_bh = NULL; struct posix_acl *acl; - int ret = -EAGAIN; + int ret; osb = OCFS2_SB(inode->i_sb); if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return NULL; - - ret = ocfs2_read_inode_block(inode, &di_bh); - if (ret < 0) + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret < 0) { + if (ret != -ENOENT) + mlog_errno(ret); return ERR_PTR(ret); + } acl = ocfs2_get_acl_nolock(inode, type, di_bh); + ocfs2_inode_unlock(inode, 0); brelse(di_bh); - return acl; } diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 5997c00a1515..86181d6526dc 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -908,32 +908,30 @@ static int ocfs2_validate_extent_block(struct super_block *sb, */ if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - ocfs2_error(sb, - "Extent block #%llu has bad signature %.*s", - (unsigned long long)bh->b_blocknr, 7, - eb->h_signature); - return -EINVAL; + rc = ocfs2_error(sb, + "Extent block #%llu has bad signature %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + eb->h_signature); + goto bail; } if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) { - ocfs2_error(sb, - "Extent block #%llu has an invalid h_blkno " - "of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(eb->h_blkno)); - return -EINVAL; + rc = ocfs2_error(sb, + "Extent block #%llu 
has an invalid h_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(eb->h_blkno)); + goto bail; } if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, - "Extent block #%llu has an invalid " - "h_fs_generation of #%u", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(eb->h_fs_generation)); - return -EINVAL; + rc = ocfs2_error(sb, + "Extent block #%llu has an invalid h_fs_generation of #%u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(eb->h_fs_generation)); + goto bail; } - - return 0; +bail: + return rc; } int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, @@ -1446,8 +1444,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, while(le16_to_cpu(el->l_tree_depth) > 1) { if (le16_to_cpu(el->l_next_free_rec) == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has empty " - "extent list (next_free_rec == 0)", + "Owner %llu has empty extent list (next_free_rec == 0)\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); status = -EIO; goto bail; @@ -1456,9 +1453,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, blkno = le64_to_cpu(el->l_recs[i].e_blkno); if (!blkno) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has extent " - "list where extent # %d has no physical " - "block start", + "Owner %llu has extent list where extent # %d has no physical block start\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i); status = -EIO; goto bail; @@ -1788,8 +1783,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, while (el->l_tree_depth) { if (le16_to_cpu(el->l_next_free_rec) == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), - "Owner %llu has empty extent list at " - "depth %u\n", + "Owner %llu has empty extent list at depth %u\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), le16_to_cpu(el->l_tree_depth)); ret = -EROFS; @@ -1814,8 +1808,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, blkno = le64_to_cpu(el->l_recs[i].e_blkno); if (blkno == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), - "Owner %llu has bad blkno in extent list " - "at depth %u (index %d)\n", + "Owner %llu has bad blkno in extent list at depth %u (index %d)\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), le16_to_cpu(el->l_tree_depth), i); ret = -EROFS; @@ -1836,8 +1829,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, if (le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), - "Owner %llu has bad count in extent list " - "at block %llu (next free=%u, count=%u)\n", + "Owner %llu has bad count in extent list at block %llu (next free=%u, count=%u)\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long)bh->b_blocknr, le16_to_cpu(el->l_next_free_rec), @@ -2116,8 +2108,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle, if (left_el->l_next_free_rec != left_el->l_count) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Inode %llu has non-full interior leaf node %llu" - "(next free = %u)", + "Inode %llu has non-full interior leaf node %llu (next free = %u)\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)left_leaf_bh->b_blocknr, le16_to_cpu(left_el->l_next_free_rec)); @@ -2256,8 +2247,7 @@ int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, * If we got here, we never found a valid node where * the tree indicated one should be. 
*/ - ocfs2_error(sb, - "Invalid extent tree at extent block %llu\n", + ocfs2_error(sb, "Invalid extent tree at extent block %llu\n", (unsigned long long)blkno); ret = -EROFS; goto out; @@ -2872,8 +2862,7 @@ int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, * If we got here, we never found a valid node where * the tree indicated one should be. */ - ocfs2_error(sb, - "Invalid extent tree at extent block %llu\n", + ocfs2_error(sb, "Invalid extent tree at extent block %llu\n", (unsigned long long)blkno); ret = -EROFS; goto out; @@ -3131,6 +3120,30 @@ out: return ret; } +static int ocfs2_remove_rightmost_empty_extent(struct ocfs2_super *osb, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + handle_t *handle; + int ret; + int credits = path->p_tree_depth * 2 + 1; + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + return ret; + } + + ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc); + if (ret) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); + return ret; +} + /* * Left rotation of btree records. * @@ -3200,7 +3213,7 @@ rightmost_no_delete: if (le16_to_cpu(el->l_next_free_rec) == 0) { ret = -EIO; ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has empty extent block at %llu", + "Owner %llu has empty extent block at %llu\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)le64_to_cpu(eb->h_blkno)); goto out; @@ -3930,7 +3943,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle, next_free = le16_to_cpu(el->l_next_free_rec); if (next_free == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has a bad extent list", + "Owner %llu has a bad extent list\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); ret = -EIO; return; @@ -4355,10 +4368,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, bh = path_leaf_bh(left_path); eb = (struct ocfs2_extent_block *)bh->b_data; ocfs2_error(sb, - "Extent block #%llu has an " - "invalid l_next_free_rec of " - "%d. It should have " - "matched the l_count of %d", + "Extent block #%llu has an invalid l_next_free_rec of %d. 
It should have matched the l_count of %d\n", (unsigned long long)le64_to_cpu(eb->h_blkno), le16_to_cpu(new_el->l_next_free_rec), le16_to_cpu(new_el->l_count)); @@ -4413,8 +4423,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, bh = path_leaf_bh(right_path); eb = (struct ocfs2_extent_block *)bh->b_data; ocfs2_error(sb, - "Extent block #%llu has an " - "invalid l_next_free_rec of %d", + "Extent block #%llu has an invalid l_next_free_rec of %d\n", (unsigned long long)le64_to_cpu(eb->h_blkno), le16_to_cpu(new_el->l_next_free_rec)); status = -EINVAL; @@ -4970,10 +4979,9 @@ leftright: split_index = ocfs2_search_extent_list(el, cpos); if (split_index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has an extent at cpos %u " - "which can no longer be found.\n", - (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), - cpos); + "Owner %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); ret = -EROFS; goto out; } @@ -5158,10 +5166,9 @@ int ocfs2_change_extent_flag(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { ocfs2_error(sb, - "Owner %llu has an extent at cpos %u which can no " - "longer be found.\n", - (unsigned long long) - ocfs2_metadata_cache_owner(et->et_ci), cpos); + "Owner %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); ret = -EROFS; goto out; } @@ -5228,9 +5235,7 @@ int ocfs2_mark_extent_written(struct inode *inode, cpos, len, phys); if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " - "that are being written to, but the feature bit " - "is not set in the super block.", + ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents that are being written to, but the feature bit is not set in the super block\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); ret = -EROFS; goto out; @@ -5514,8 +5519,7 @@ int ocfs2_remove_extent(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has an extent at cpos %u which can no " - "longer be found.\n", + "Owner %llu has an extent at cpos %u which can no longer be found\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos); ret = -EROFS; @@ -5580,7 +5584,7 @@ int ocfs2_remove_extent(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu: split at cpos %u lost record.", + "Owner %llu: split at cpos %u lost record\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos); ret = -EROFS; @@ -5596,8 +5600,7 @@ int ocfs2_remove_extent(handle_t *handle, ocfs2_rec_clusters(el, rec); if (rec_range != trunc_range) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu: error after split at cpos %u" - "trunc len %u, existing record is (%u,%u)", + "Owner %llu: error after split at cpos %u trunc len %u, existing record is (%u,%u)\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos, len, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); @@ -6175,7 +6178,7 @@ bail: iput(tl_inode); brelse(tl_bh); - if (status < 0 && (*tl_copy)) { + if (status < 0) { kfree(*tl_copy); *tl_copy = NULL; mlog_errno(status); @@ -7108,15 +7111,23 @@ start: * to check it up here before changing the tree. 
*/ if (root_el->l_tree_depth && rec->e_int_clusters == 0) { - ocfs2_error(inode->i_sb, "Inode %lu has an empty " + mlog(ML_ERROR, "Inode %lu has an empty " "extent record, depth %u\n", inode->i_ino, le16_to_cpu(root_el->l_tree_depth)); - status = -EROFS; - goto bail; + status = ocfs2_remove_rightmost_empty_extent(osb, + &et, path, &dealloc); + if (status) { + mlog_errno(status); + goto bail; + } + + ocfs2_reinit_path(path, 1); + goto start; + } else { + trunc_cpos = le32_to_cpu(rec->e_cpos); + trunc_len = 0; + blkno = 0; } - trunc_cpos = le32_to_cpu(rec->e_cpos); - trunc_len = 0; - blkno = 0; } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) { /* * Truncate entire record. @@ -7204,8 +7215,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) || !ocfs2_supports_inline_data(osb)) { ocfs2_error(inode->i_sb, - "Inline data flags for inode %llu don't agree! " - "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n", + "Inline data flags for inode %llu don't agree! Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, le16_to_cpu(di->i_dyn_features), OCFS2_I(inode)->ip_dyn_features, diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 1a35c6139656..64b11d90eca6 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) { - ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag", + ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); return -EROFS; } @@ -237,7 +237,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, if (size > PAGE_CACHE_SIZE || size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { ocfs2_error(inode->i_sb, - "Inode %llu has with inline data has bad size: %Lu", + "Inode %llu has with inline data has bad size: %Lu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)size); return -EROFS; @@ -533,10 +533,14 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); + down_read(&OCFS2_I(inode)->ip_alloc_sem); + /* This figures out the size of the next contiguous block, and * our logical offset */ ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &contig_blocks, &ext_flags); + up_read(&OCFS2_I(inode)->ip_alloc_sem); + if (ret) { mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", (unsigned long long)iblock); @@ -557,6 +561,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, alloc_locked = 1; + down_write(&OCFS2_I(inode)->ip_alloc_sem); + /* fill hole, allocate blocks can't be larger than the size * of the hole */ clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); @@ -569,6 +575,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, ret = ocfs2_extend_allocation(inode, cpos, clusters_to_alloc, 0); if (ret < 0) { + up_write(&OCFS2_I(inode)->ip_alloc_sem); mlog_errno(ret); goto bail; } @@ -576,11 +583,13 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &contig_blocks, &ext_flags); if (ret < 0) { + up_write(&OCFS2_I(inode)->ip_alloc_sem); mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", (unsigned long 
long)iblock); ret = -EIO; goto bail; } + up_write(&OCFS2_I(inode)->ip_alloc_sem); } /* @@ -627,10 +636,13 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); } - ocfs2_iocb_clear_rw_locked(iocb); + /* Let rw unlock to be done later to protect append direct io write */ + if (offset + bytes <= i_size_read(inode)) { + ocfs2_iocb_clear_rw_locked(iocb); - level = ocfs2_iocb_rw_locked_level(iocb); - ocfs2_rw_unlock(inode, level); + level = ocfs2_iocb_rw_locked_level(iocb); + ocfs2_rw_unlock(inode, level); + } } static int ocfs2_releasepage(struct page *page, gfp_t wait) @@ -685,7 +697,7 @@ static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb, if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { u64 s = i_size_read(inode); - sector_t sector = (p_cpos << (osb->s_clustersize_bits - 9)) + + sector_t sector = ((u64)p_cpos << (osb->s_clustersize_bits - 9)) + (do_div(s, osb->s_clustersize) >> 9); ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector, @@ -832,12 +844,17 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, /* zeroing out the previously allocated cluster tail * that but not zeroed */ - if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { + down_read(&OCFS2_I(inode)->ip_alloc_sem); ret = ocfs2_direct_IO_zero_extend(osb, inode, offset, zero_len_tail, cluster_align_tail); - else + up_read(&OCFS2_I(inode)->ip_alloc_sem); + } else { + down_write(&OCFS2_I(inode)->ip_alloc_sem); ret = ocfs2_direct_IO_extend_no_holes(osb, inode, offset); + up_write(&OCFS2_I(inode)->ip_alloc_sem); + } if (ret < 0) { mlog_errno(ret); ocfs2_inode_unlock(inode, 1); @@ -857,7 +874,8 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, offset, ocfs2_direct_IO_get_blocks, ocfs2_dio_end_io, NULL, 0); - if (unlikely(written < 0)) { + /* overwrite aio may return -EIOCBQUEUED, and it is not an error */ + if ((written < 0) && (written != -EIOCBQUEUED)) { loff_t i_size = i_size_read(inode); if (offset + count > i_size) { @@ -876,12 +894,14 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, ocfs2_inode_unlock(inode, 1); brelse(di_bh); + di_bh = NULL; goto clean_orphan; } } ocfs2_inode_unlock(inode, 1); brelse(di_bh); + di_bh = NULL; ret = jbd2_journal_force_commit(journal); if (ret < 0) @@ -910,7 +930,7 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)); ret = blkdev_issue_zeroout(osb->sb->s_bdev, - p_cpos << (osb->s_clustersize_bits - 9), + (u64)p_cpos << (osb->s_clustersize_bits - 9), zero_len_head >> 9, GFP_NOFS, false); if (ret < 0) mlog_errno(ret); @@ -936,10 +956,12 @@ clean_orphan: if (tmp_ret < 0) { ret = tmp_ret; mlog_errno(ret); + brelse(di_bh); goto out; } ocfs2_inode_unlock(inode, 1); + brelse(di_bh); tmp_ret = jbd2_journal_force_commit(journal); if (tmp_ret < 0) { @@ -2185,10 +2207,7 @@ try_again: if (ret) goto out_commit; } - /* - * We don't want this to fail in ocfs2_write_end(), so do it - * here. 
- */ + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { @@ -2345,7 +2364,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - int i; + int i, ret; unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); struct inode *inode = mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -2354,6 +2373,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping, handle_t *handle = wc->w_handle; struct page *tmppage; + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + copied = ret; + mlog_errno(ret); + goto out; + } + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { ocfs2_write_end_inline(inode, pos, len, &copied, di, wc); goto out_write_size; @@ -2409,6 +2436,7 @@ out_write_size: ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, wc->w_di_bh); +out: /* unlock pages before dealloc since it needs acquiring j_trans_barrier * lock, or it will cause a deadlock since journal commit threads holds * this lock and will ask for the page lock when flushing the data. diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 1edcb141f639..fe50ded1b4ce 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -316,6 +316,12 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, bh = bhs[i]; if (!(flags & OCFS2_BH_READAHEAD)) { + if (status) { + /* Clear the rest of the buffers on error */ + put_bh(bh); + bhs[i] = NULL; + continue; + } /* We know this can't have changed as we hold the * owner sem. Avoid doing any work on the bh if the * journal has it. */ diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 16eff45727ee..fa15debcc02b 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -36,7 +36,7 @@ #include <linux/debugfs.h> #include <linux/slab.h> #include <linux/bitmap.h> - +#include <linux/ktime.h> #include "heartbeat.h" #include "tcp.h" #include "nodemanager.h" @@ -372,14 +372,13 @@ static void o2hb_wait_on_io(struct o2hb_region *reg, wait_for_completion(&wc->wc_io_complete); } -static void o2hb_bio_end_io(struct bio *bio, - int error) +static void o2hb_bio_end_io(struct bio *bio) { struct o2hb_bio_wait_ctxt *wc = bio->bi_private; - if (error) { - mlog(ML_ERROR, "IO Error %d\n", error); - wc->wc_error = error; + if (bio->bi_error) { + mlog(ML_ERROR, "IO Error %d\n", bio->bi_error); + wc->wc_error = bio->bi_error; } o2hb_bio_wait_dec(wc, 1); @@ -1061,37 +1060,6 @@ bail: return ret; } -/* Subtract b from a, storing the result in a. a *must* have a larger - * value than b. */ -static void o2hb_tv_subtract(struct timeval *a, - struct timeval *b) -{ - /* just return 0 when a is after b */ - if (a->tv_sec < b->tv_sec || - (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) { - a->tv_sec = 0; - a->tv_usec = 0; - return; - } - - a->tv_sec -= b->tv_sec; - a->tv_usec -= b->tv_usec; - while ( a->tv_usec < 0 ) { - a->tv_sec--; - a->tv_usec += 1000000; - } -} - -static unsigned int o2hb_elapsed_msecs(struct timeval *start, - struct timeval *end) -{ - struct timeval res = *end; - - o2hb_tv_subtract(&res, start); - - return res.tv_sec * 1000 + res.tv_usec / 1000; -} - /* * we ride the region ref that the region dir holds. 
before the region * dir is removed and drops it ref it will wait to tear down this @@ -1102,7 +1070,7 @@ static int o2hb_thread(void *data) int i, ret; struct o2hb_region *reg = data; struct o2hb_bio_wait_ctxt write_wc; - struct timeval before_hb, after_hb; + ktime_t before_hb, after_hb; unsigned int elapsed_msec; mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); @@ -1119,18 +1087,18 @@ static int o2hb_thread(void *data) * hr_timeout_ms between disk writes. On busy systems * this should result in a heartbeat which is less * likely to time itself out. */ - do_gettimeofday(&before_hb); + before_hb = ktime_get_real(); ret = o2hb_do_disk_heartbeat(reg); - do_gettimeofday(&after_hb); - elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); + after_hb = ktime_get_real(); + + elapsed_msec = (unsigned int) + ktime_ms_delta(after_hb, before_hb); mlog(ML_HEARTBEAT, - "start = %lu.%lu, end = %lu.%lu, msec = %u, ret = %d\n", - before_hb.tv_sec, (unsigned long) before_hb.tv_usec, - after_hb.tv_sec, (unsigned long) after_hb.tv_usec, - elapsed_msec, ret); + "start = %lld, end = %lld, msec = %u, ret = %d\n", + before_hb.tv64, after_hb.tv64, elapsed_msec, ret); if (!kthread_should_stop() && elapsed_msec < reg->hr_timeout_ms) { @@ -1620,17 +1588,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg) struct o2hb_disk_slot *slot; reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL); - if (reg->hr_tmp_block == NULL) { - mlog_errno(-ENOMEM); + if (reg->hr_tmp_block == NULL) return -ENOMEM; - } reg->hr_slots = kcalloc(reg->hr_blocks, sizeof(struct o2hb_disk_slot), GFP_KERNEL); - if (reg->hr_slots == NULL) { - mlog_errno(-ENOMEM); + if (reg->hr_slots == NULL) return -ENOMEM; - } for(i = 0; i < reg->hr_blocks; i++) { slot = &reg->hr_slots[i]; @@ -1646,17 +1610,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg) reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *), GFP_KERNEL); - if (!reg->hr_slot_data) { - mlog_errno(-ENOMEM); + if (!reg->hr_slot_data) return -ENOMEM; - } for(i = 0; i < reg->hr_num_pages; i++) { page = alloc_page(GFP_KERNEL); - if (!page) { - mlog_errno(-ENOMEM); + if (!page) return -ENOMEM; - } reg->hr_slot_data[i] = page; @@ -1688,10 +1648,8 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg) struct o2hb_disk_heartbeat_block *hb_block; ret = o2hb_read_slots(reg, reg->hr_blocks); - if (ret) { - mlog_errno(ret); + if (ret) goto out; - } /* We only want to get an idea of the values initially in each * slot, so we do no verification - o2hb_check_slot will diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 02878a83f0b4..ffecf89c8c1c 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -480,33 +480,26 @@ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh) trailer = ocfs2_trailer_from_bh(bh, dir->i_sb); if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { - rc = -EINVAL; - ocfs2_error(dir->i_sb, - "Invalid dirblock #%llu: " - "signature = %.*s\n", - (unsigned long long)bh->b_blocknr, 7, - trailer->db_signature); + rc = ocfs2_error(dir->i_sb, + "Invalid dirblock #%llu: signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + trailer->db_signature); goto out; } if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) { - rc = -EINVAL; - ocfs2_error(dir->i_sb, - "Directory block #%llu has an invalid " - "db_blkno of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(trailer->db_blkno)); + rc = ocfs2_error(dir->i_sb, + "Directory block #%llu has an invalid db_blkno of %llu\n", + (unsigned long 
long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); goto out; } if (le64_to_cpu(trailer->db_parent_dinode) != OCFS2_I(dir)->ip_blkno) { - rc = -EINVAL; - ocfs2_error(dir->i_sb, - "Directory block #%llu on dinode " - "#%llu has an invalid parent_dinode " - "of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)OCFS2_I(dir)->ip_blkno, - (unsigned long long)le64_to_cpu(trailer->db_blkno)); + rc = ocfs2_error(dir->i_sb, + "Directory block #%llu on dinode #%llu has an invalid parent_dinode of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); goto out; } out: @@ -604,14 +597,13 @@ static int ocfs2_validate_dx_root(struct super_block *sb, } if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) { - ocfs2_error(sb, - "Dir Index Root # %llu has bad signature %.*s", - (unsigned long long)le64_to_cpu(dx_root->dr_blkno), - 7, dx_root->dr_signature); - return -EINVAL; + ret = ocfs2_error(sb, + "Dir Index Root # %llu has bad signature %.*s\n", + (unsigned long long)le64_to_cpu(dx_root->dr_blkno), + 7, dx_root->dr_signature); } - return 0; + return ret; } static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di, @@ -648,12 +640,11 @@ static int ocfs2_validate_dx_leaf(struct super_block *sb, } if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) { - ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s", - 7, dx_leaf->dl_signature); - return -EROFS; + ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s\n", + 7, dx_leaf->dl_signature); } - return 0; + return ret; } static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno, @@ -812,11 +803,10 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "btree tree block %llu\n", inode->i_ino, - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in btree tree block %llu\n", + inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -832,11 +822,11 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, } if (!found) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0) in btree", inode->i_ino, - le32_to_cpu(rec->e_cpos), - ocfs2_rec_clusters(el, rec)); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %lu has bad extent record (%u, %u, 0) in btree\n", + inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); goto out; } diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 7df88a6dd626..6918f30d02cd 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1465,39 +1465,46 @@ static int dlm_request_join(struct dlm_ctxt *dlm, if (status == -ENOPROTOOPT) { status = 0; *response = JOIN_OK_NO_MAP; - } else if (packet.code == JOIN_DISALLOW || - packet.code == JOIN_OK_NO_MAP) { - *response = packet.code; - } else if (packet.code == JOIN_PROTOCOL_MISMATCH) { - mlog(ML_NOTICE, - "This node requested DLM locking protocol %u.%u and " - "filesystem locking protocol %u.%u. 
At least one of " - "the protocol versions on node %d is not compatible, " - "disconnecting\n", - dlm->dlm_locking_proto.pv_major, - dlm->dlm_locking_proto.pv_minor, - dlm->fs_locking_proto.pv_major, - dlm->fs_locking_proto.pv_minor, - node); - status = -EPROTO; - *response = packet.code; - } else if (packet.code == JOIN_OK) { - *response = packet.code; - /* Use the same locking protocol as the remote node */ - dlm->dlm_locking_proto.pv_minor = packet.dlm_minor; - dlm->fs_locking_proto.pv_minor = packet.fs_minor; - mlog(0, - "Node %d responds JOIN_OK with DLM locking protocol " - "%u.%u and fs locking protocol %u.%u\n", - node, - dlm->dlm_locking_proto.pv_major, - dlm->dlm_locking_proto.pv_minor, - dlm->fs_locking_proto.pv_major, - dlm->fs_locking_proto.pv_minor); } else { - status = -EINVAL; - mlog(ML_ERROR, "invalid response %d from node %u\n", - packet.code, node); + *response = packet.code; + switch (packet.code) { + case JOIN_DISALLOW: + case JOIN_OK_NO_MAP: + break; + case JOIN_PROTOCOL_MISMATCH: + mlog(ML_NOTICE, + "This node requested DLM locking protocol %u.%u and " + "filesystem locking protocol %u.%u. At least one of " + "the protocol versions on node %d is not compatible, " + "disconnecting\n", + dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor, + dlm->fs_locking_proto.pv_major, + dlm->fs_locking_proto.pv_minor, + node); + status = -EPROTO; + break; + case JOIN_OK: + /* Use the same locking protocol as the remote node */ + dlm->dlm_locking_proto.pv_minor = packet.dlm_minor; + dlm->fs_locking_proto.pv_minor = packet.fs_minor; + mlog(0, + "Node %d responds JOIN_OK with DLM locking protocol " + "%u.%u and fs locking protocol %u.%u\n", + node, + dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor, + dlm->fs_locking_proto.pv_major, + dlm->fs_locking_proto.pv_minor); + break; + default: + status = -EINVAL; + mlog(ML_ERROR, "invalid response %d from node %u\n", + packet.code, node); + /* Reset response to JOIN_DISALLOW */ + *response = JOIN_DISALLOW; + break; + } } mlog(0, "status %d, node %d response is %d\n", status, node, @@ -1725,12 +1732,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); + o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, + dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); + status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down); if (status) goto bail; - o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, - dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up); if (status) goto bail; @@ -1845,8 +1853,6 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) sizeof(struct dlm_exit_domain), dlm_begin_exit_domain_handler, dlm, NULL, &dlm->dlm_domain_handlers); - if (status) - goto bail; bail: if (status) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index fdf4b41d0609..46b8b2bbc95a 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -498,16 +498,6 @@ static void dlm_lockres_release(struct kref *kref) mlog(0, "destroying lockres %.*s\n", res->lockname.len, res->lockname.name); - spin_lock(&dlm->track_lock); - if (!list_empty(&res->tracking)) - list_del_init(&res->tracking); - else { - mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", - res->lockname.len, res->lockname.name); - dlm_print_one_lock_resource(res); - } - spin_unlock(&dlm->track_lock); - atomic_dec(&dlm->res_cur_count); if 
(!hlist_unhashed(&res->hash_node) || @@ -795,8 +785,18 @@ lookup: dlm_lockres_grab_inflight_ref(dlm, tmpres); spin_unlock(&tmpres->spinlock); - if (res) + if (res) { + spin_lock(&dlm->track_lock); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else + mlog(ML_ERROR, "Resource %.*s not " + "on the Tracking list\n", + res->lockname.len, + res->lockname.name); + spin_unlock(&dlm->track_lock); dlm_lockres_put(res); + } res = tmpres; goto leave; } diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index ce12e0b1a31f..d0e436dc6437 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1776,7 +1776,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, struct dlm_migratable_lockres *mres) { struct dlm_migratable_lock *ml; - struct list_head *queue, *iter; + struct list_head *queue; struct list_head *tmpq = NULL; struct dlm_lock *newlock = NULL; struct dlm_lockstatus *lksb = NULL; @@ -1821,9 +1821,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { tmpq = dlm_list_idx_to_ptr(res, j); - list_for_each(iter, tmpq) { - lock = list_entry(iter, - struct dlm_lock, list); + list_for_each_entry(lock, tmpq, list) { if (lock->ml.cookie == ml->cookie) break; lock = NULL; diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 69aac6f088ad..2e5e6d5fffe8 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -211,6 +211,16 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, __dlm_unhash_lockres(dlm, res); + spin_lock(&dlm->track_lock); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else { + mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", + res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + } + spin_unlock(&dlm->track_lock); + /* lockres is not in the hash now. drop the flag and wake up * any processes waiting in dlm_get_lock_resource. */ if (!master) { diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 8b23aa2f52dd..1c91103c1333 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3035,8 +3035,6 @@ local: ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb); osb->cconn = conn; - - status = 0; bail: if (status < 0) { ocfs2_dlm_shutdown_debug(osb); @@ -4025,9 +4023,13 @@ static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb) osb->dc_work_sequence = osb->dc_wake_sequence; processed = osb->blocked_lock_count; - while (processed) { - BUG_ON(list_empty(&osb->blocked_lock_list)); - + /* + * blocked lock processing in this loop might call iput which can + * remove items off osb->blocked_lock_list. Downconvert up to + * 'processed' number of locks, but stop short if we had some + * removed in ocfs2_mark_lockres_freeing when downconverting. 
+ */ + while (processed && !list_empty(&osb->blocked_lock_list)) { lockres = list_entry(osb->blocked_lock_list.next, struct ocfs2_lock_res, l_blocked_list); list_del_init(&lockres->l_blocked_list); diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 767370b656ca..e4719e0a3f99 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -305,8 +305,8 @@ static int ocfs2_last_eb_is_empty(struct inode *inode, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "leaf block %llu\n", inode->i_ino, + "Inode %lu has non zero tree depth in leaf block %llu\n", + inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; goto out; @@ -441,8 +441,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "leaf block %llu\n", inode->i_ino, + "Inode %lu has non zero tree depth in leaf block %llu\n", + inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; goto out; @@ -475,8 +475,9 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); if (!rec->e_blkno) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0)", inode->i_ino, + ocfs2_error(inode->i_sb, + "Inode %lu has bad extent record (%u, %u, 0)\n", + inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); ret = -EROFS; @@ -564,8 +565,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "xattr leaf block %llu\n", inode->i_ino, + "Inode %lu has non zero tree depth in xattr leaf block %llu\n", + inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; goto out; @@ -582,8 +583,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); if (!rec->e_blkno) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0) in xattr", inode->i_ino, + ocfs2_error(inode->i_sb, + "Inode %lu has bad extent record (%u, %u, 0) in xattr\n", + inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); ret = -EROFS; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 719f7f4c7a37..0e5b4515f92e 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -105,8 +105,11 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name, mode); - if (file->f_mode & FMODE_WRITE) - dquot_initialize(inode); + if (file->f_mode & FMODE_WRITE) { + status = dquot_initialize(inode); + if (status) + goto leave; + } spin_lock(&oi->ip_lock); @@ -1127,6 +1130,7 @@ out: int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) { int status = 0, size_change; + int inode_locked = 0; struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; struct ocfs2_super *osb = OCFS2_SB(sb); @@ -1155,8 +1159,11 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) if (status) return status; - if (is_quota_modification(inode, attr)) - dquot_initialize(inode); + if (is_quota_modification(inode, attr)) { + status = dquot_initialize(inode); + if (status) + return status; + } size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; if (size_change) { status = ocfs2_rw_lock(inode, 1); @@ -1172,6 +1179,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) mlog_errno(status); goto bail_unlock_rw; } + inode_locked = 1; if 
(size_change) { status = inode_newsize_ok(inode, attr->ia_size); @@ -1209,8 +1217,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) && OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid)); - if (!transfer_to[USRQUOTA]) { - status = -ESRCH; + if (IS_ERR(transfer_to[USRQUOTA])) { + status = PTR_ERR(transfer_to[USRQUOTA]); goto bail_unlock; } } @@ -1218,8 +1226,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) && OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid)); - if (!transfer_to[GRPQUOTA]) { - status = -ESRCH; + if (IS_ERR(transfer_to[GRPQUOTA])) { + status = PTR_ERR(transfer_to[GRPQUOTA]); goto bail_unlock; } } @@ -1252,7 +1260,10 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: - ocfs2_inode_unlock(inode, 1); + if (status) { + ocfs2_inode_unlock(inode, 1); + inode_locked = 0; + } bail_unlock_rw: if (size_change) ocfs2_rw_unlock(inode, 1); @@ -1268,6 +1279,8 @@ bail: if (status < 0) mlog_errno(status); } + if (inode_locked) + ocfs2_inode_unlock(inode, 1); return status; } @@ -2256,8 +2269,6 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, ssize_t written = 0; ssize_t ret; size_t count = iov_iter_count(from), orig_count; - loff_t old_size; - u32 old_clusters; struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -2265,6 +2276,8 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, OCFS2_MOUNT_COHERENCY_BUFFERED); int unaligned_dio = 0; int dropped_dio = 0; + int append_write = ((iocb->ki_pos + count) >= + i_size_read(inode) ? 1 : 0); trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -2284,8 +2297,9 @@ relock: /* * Concurrent O_DIRECT writes are allowed with * mount_option "coherency=buffered". + * For append write, we must take rw EX. */ - rw_level = (!direct_io || full_coherency); + rw_level = (!direct_io || full_coherency || append_write); ret = ocfs2_rw_lock(inode, rw_level); if (ret < 0) { @@ -2358,13 +2372,6 @@ relock: ocfs2_iocb_set_unaligned_aio(iocb); } - /* - * To later detect whether a journal commit for sync writes is - * necessary, we sample i_size, and cluster count here. - */ - old_size = i_size_read(inode); - old_clusters = OCFS2_I(inode)->ip_clusters; - /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_rw_locked(iocb, rw_level); @@ -2372,6 +2379,20 @@ relock: /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT)); + /* + * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io + * function pointer which is called when o_direct io completes so that + * it can unlock our rw lock. + * Unfortunately there are error cases which call end_io and others + * that don't. so we don't have to unlock the rw_lock if either an + * async dio is going to do it in the future or an end_io after an + * error has already done it. 
+ */ + if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { + rw_level = -1; + unaligned_dio = 0; + } + if (unlikely(written <= 0)) goto no_sync; @@ -2396,21 +2417,7 @@ relock: } no_sync: - /* - * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io - * function pointer which is called when o_direct io completes so that - * it can unlock our rw lock. - * Unfortunately there are error cases which call end_io and others - * that don't. so we don't have to unlock the rw_lock if either an - * async dio is going to do it in the future or an end_io after an - * error has already done it. - */ - if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { - rw_level = -1; - unaligned_dio = 0; - } - - if (unaligned_dio) { + if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) { ocfs2_iocb_clear_unaligned_aio(iocb); mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); } diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index b254416dc8d9..8f87e05ee25d 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -971,6 +971,7 @@ static void ocfs2_delete_inode(struct inode *inode) int wipe, status; sigset_t oldset; struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; trace_ocfs2_delete_inode(inode->i_ino, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -1025,6 +1026,14 @@ static void ocfs2_delete_inode(struct inode *inode) goto bail_unlock_nfs_sync; } + di = (struct ocfs2_dinode *)di_bh->b_data; + /* Skip inode deletion and wait for dio orphan entry recovered + * first */ + if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { + ocfs2_cleanup_delete_inode(inode, 0); + goto bail_unlock_inode; + } + /* Query the cluster. This will be the final decision made * before we go ahead and wipe the inode. */ status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); @@ -1191,17 +1200,19 @@ void ocfs2_evict_inode(struct inode *inode) int ocfs2_drop_inode(struct inode *inode) { struct ocfs2_inode_info *oi = OCFS2_I(inode); - int res; trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); - if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) - res = 1; - else - res = generic_drop_inode(inode); + assert_spin_locked(&inode->i_lock); + inode->i_state |= I_WILL_FREE; + spin_unlock(&inode->i_lock); + write_inode_now(inode, 1); + spin_lock(&inode->i_lock); + WARN_ON(inode->i_state & I_NEW); + inode->i_state &= ~I_WILL_FREE; - return res; + return 1; } /* @@ -1350,32 +1361,32 @@ int ocfs2_validate_inode_block(struct super_block *sb, rc = -EINVAL; if (!OCFS2_IS_VALID_DINODE(di)) { - ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", - (unsigned long long)bh->b_blocknr, 7, - di->i_signature); + rc = ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + di->i_signature); goto bail; } if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { - ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(di->i_blkno)); + rc = ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(di->i_blkno)); goto bail; } if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { - ocfs2_error(sb, - "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", - (unsigned long long)bh->b_blocknr); + rc = ocfs2_error(sb, + "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", + (unsigned long long)bh->b_blocknr); goto bail; } if (le32_to_cpu(di->i_fs_generation) != OCFS2_SB(sb)->fs_generation) { - 
ocfs2_error(sb, - "Invalid dinode #%llu: fs_generation is %u\n", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(di->i_fs_generation)); + rc = ocfs2_error(sb, + "Invalid dinode #%llu: fs_generation is %u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_fs_generation)); goto bail; } diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 5e86b247c821..ca3431ee7f24 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -81,8 +81,6 @@ struct ocfs2_inode_info tid_t i_sync_tid; tid_t i_datasync_tid; - wait_queue_head_t append_dio_wq; - struct dquot *i_dquot[MAXQUOTAS]; }; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 7c099f7032fd..ff82b28462a6 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -374,7 +374,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) mlog_errno(PTR_ERR(handle)); if (is_journal_aborted(journal)) { - ocfs2_abort(osb->sb, "Detected aborted journal"); + ocfs2_abort(osb->sb, "Detected aborted journal\n"); handle = ERR_PTR(-EROFS); } } else { @@ -668,7 +668,23 @@ static int __ocfs2_journal_access(handle_t *handle, mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); mlog(ML_ERROR, "b_blocknr=%llu\n", (unsigned long long)bh->b_blocknr); - BUG(); + + lock_buffer(bh); + /* + * A previous attempt to write this buffer head failed. + * Nothing we can do but to retry the write and hope for + * the best. + */ + if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) { + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + + if (!buffer_uptodate(bh)) { + unlock_buffer(bh); + return -EIO; + } + unlock_buffer(bh); } /* Set the current transaction information on the ci so @@ -2170,6 +2186,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, iter = oi->ip_next_orphan; oi->ip_next_orphan = NULL; + mutex_lock(&inode->i_mutex); ret = ocfs2_rw_lock(inode, 1); if (ret < 0) { mlog_errno(ret); @@ -2193,7 +2210,9 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, * ocfs2_delete_inode. 
*/ oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; spin_unlock(&oi->ip_lock); - } else if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) && + } + + if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) && (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { ret = ocfs2_truncate_file(inode, di_bh, i_size_read(inode)); @@ -2206,17 +2225,16 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); if (ret) mlog_errno(ret); - - wake_up(&OCFS2_I(inode)->append_dio_wq); } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ unlock_inode: ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + di_bh = NULL; unlock_rw: ocfs2_rw_unlock(inode, 1); next: + mutex_unlock(&inode->i_mutex); iput(inode); - brelse(di_bh); - di_bh = NULL; inode = iter; } diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 857bbbcd39f3..0a4457fb0711 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -665,8 +665,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, #ifdef CONFIG_OCFS2_DEBUG_FS if (le32_to_cpu(alloc->id1.bitmap1.i_used) != ocfs2_local_alloc_count_bits(alloc)) { - ocfs2_error(osb->sb, "local alloc inode %llu says it has " - "%u used bits, but a count shows %u", + ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n", (unsigned long long)le64_to_cpu(alloc->i_blkno), le32_to_cpu(alloc->id1.bitmap1.i_used), ocfs2_local_alloc_count_bits(alloc)); diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 56a768d06aa6..124471d26a73 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -99,11 +99,9 @@ static int __ocfs2_move_extent(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { - ocfs2_error(inode->i_sb, - "Inode %llu has an extent at cpos %u which can no " - "longer be found.\n", - (unsigned long long)ino, cpos); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ino, cpos); goto out; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 6e6abb93fda5..b7dfac226b1e 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -200,11 +200,12 @@ bail: static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) { struct inode *inode; + int status; inode = new_inode(dir->i_sb); if (!inode) { mlog(ML_ERROR, "new_inode failed!\n"); - return NULL; + return ERR_PTR(-ENOMEM); } /* populate as many fields early on as possible - many of @@ -213,7 +214,10 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) if (S_ISDIR(mode)) set_nlink(inode, 2); inode_init_owner(inode, dir, mode); - dquot_initialize(inode); + status = dquot_initialize(inode); + if (status) + return ERR_PTR(status); + return inode; } @@ -264,7 +268,11 @@ static int ocfs2_mknod(struct inode *dir, (unsigned long long)OCFS2_I(dir)->ip_blkno, (unsigned long)dev, mode); - dquot_initialize(dir); + status = dquot_initialize(dir); + if (status) { + mlog_errno(status); + return status; + } /* get our super block */ osb = OCFS2_SB(dir->i_sb); @@ -311,8 +319,9 @@ static int ocfs2_mknod(struct inode *dir, } inode = ocfs2_get_init_inode(dir, mode); - if (!inode) { - status = -ENOMEM; + if (IS_ERR(inode)) { + status = PTR_ERR(inode); + inode = NULL; mlog_errno(status); goto leave; } @@ -708,7 +717,11 @@ static int ocfs2_link(struct dentry *old_dentry, if (S_ISDIR(inode->i_mode)) return -EPERM; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) { + 
mlog_errno(err); + return err; + } err = ocfs2_double_lock(osb, &old_dir_bh, old_dir, &parent_fe_bh, dir, 0); @@ -896,7 +909,11 @@ static int ocfs2_unlink(struct inode *dir, (unsigned long long)OCFS2_I(dir)->ip_blkno, (unsigned long long)OCFS2_I(inode)->ip_blkno); - dquot_initialize(dir); + status = dquot_initialize(dir); + if (status) { + mlog_errno(status); + return status; + } BUG_ON(d_inode(dentry->d_parent) != dir); @@ -1018,11 +1035,6 @@ leave: if (handle) ocfs2_commit_trans(osb, handle); - if (child_locked) - ocfs2_inode_unlock(inode, 1); - - ocfs2_inode_unlock(dir, 1); - if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); @@ -1030,6 +1042,11 @@ leave: iput(orphan_dir); } + if (child_locked) + ocfs2_inode_unlock(inode, 1); + + ocfs2_inode_unlock(dir, 1); + brelse(fe_bh); brelse(parent_node_bh); @@ -1230,8 +1247,16 @@ static int ocfs2_rename(struct inode *old_dir, old_dentry->d_name.len, old_dentry->d_name.name, new_dentry->d_name.len, new_dentry->d_name.name); - dquot_initialize(old_dir); - dquot_initialize(new_dir); + status = dquot_initialize(old_dir); + if (status) { + mlog_errno(status); + goto bail; + } + status = dquot_initialize(new_dir); + if (status) { + mlog_errno(status); + goto bail; + } osb = OCFS2_SB(old_dir->i_sb); @@ -1284,6 +1309,11 @@ static int ocfs2_rename(struct inode *old_dir, } parents_locked = 1; + if (!new_dir->i_nlink) { + status = -EACCES; + goto bail; + } + /* make sure both dirs have bhs * get an extra ref on old_dir_bh if old==new */ if (!new_dir_bh) { @@ -1544,12 +1574,25 @@ static int ocfs2_rename(struct inode *old_dir, status = ocfs2_find_entry(old_dentry->d_name.name, old_dentry->d_name.len, old_dir, &old_entry_lookup); - if (status) + if (status) { + if (!is_journal_aborted(osb->journal->j_journal)) { + ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s " + "is not deleted.", + new_dentry->d_name.len, new_dentry->d_name.name, + old_dentry->d_name.len, old_dentry->d_name.name); + } goto bail; + } status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup); if (status < 0) { mlog_errno(status); + if (!is_journal_aborted(osb->journal->j_journal)) { + ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s " + "is not deleted.", + new_dentry->d_name.len, new_dentry->d_name.name, + old_dentry->d_name.len, old_dentry->d_name.name); + } goto bail; } @@ -1608,21 +1651,9 @@ static int ocfs2_rename(struct inode *old_dir, ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); status = 0; bail: - if (rename_lock) - ocfs2_rename_unlock(osb); - if (handle) ocfs2_commit_trans(osb, handle); - if (parents_locked) - ocfs2_double_unlock(old_dir, new_dir); - - if (old_child_locked) - ocfs2_inode_unlock(old_inode, 1); - - if (new_child_locked) - ocfs2_inode_unlock(new_inode, 1); - if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); @@ -1630,6 +1661,18 @@ bail: iput(orphan_dir); } + if (new_child_locked) + ocfs2_inode_unlock(new_inode, 1); + + if (old_child_locked) + ocfs2_inode_unlock(old_inode, 1); + + if (parents_locked) + ocfs2_double_unlock(old_dir, new_dir); + + if (rename_lock) + ocfs2_rename_unlock(osb); + if (new_inode) sync_mapping_buffers(old_inode->i_mapping); @@ -1786,7 +1829,11 @@ static int ocfs2_symlink(struct inode *dir, trace_ocfs2_symlink_begin(dir, dentry, symname, dentry->d_name.len, dentry->d_name.name); - dquot_initialize(dir); + status = dquot_initialize(dir); + if (status) { + 
mlog_errno(status); + goto bail; + } sb = dir->i_sb; osb = OCFS2_SB(sb); @@ -1831,8 +1878,9 @@ static int ocfs2_symlink(struct inode *dir, } inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO); - if (!inode) { - status = -ENOMEM; + if (IS_ERR(inode)) { + status = PTR_ERR(inode); + inode = NULL; mlog_errno(status); goto bail; } @@ -2485,8 +2533,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, } inode = ocfs2_get_init_inode(dir, mode); - if (!inode) { - status = -ENOMEM; + if (IS_ERR(inode)) { + status = PTR_ERR(inode); + inode = NULL; mlog_errno(status); goto leave; } @@ -2570,27 +2619,6 @@ leave: return status; } -static int ocfs2_dio_orphan_recovered(struct inode *inode) -{ - int ret; - struct buffer_head *di_bh = NULL; - struct ocfs2_dinode *di = NULL; - - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret < 0) { - mlog_errno(ret); - return 0; - } - - di = (struct ocfs2_dinode *) di_bh->b_data; - ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)); - ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - - return ret; -} - -#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000 int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, struct inode *inode) { @@ -2602,7 +2630,6 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, handle_t *handle = NULL; struct ocfs2_dinode *di = NULL; -restart: status = ocfs2_inode_lock(inode, &di_bh, 1); if (status < 0) { mlog_errno(status); @@ -2612,15 +2639,21 @@ restart: di = (struct ocfs2_dinode *) di_bh->b_data; /* * Another append dio crashed? - * If so, wait for recovery first. + * If so, manually recover it first. */ if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { - ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq, - ocfs2_dio_orphan_recovered(inode), - msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL)); - goto restart; + status = ocfs2_truncate_file(inode, di_bh, i_size_read(inode)); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail_unlock_inode; + } + + status = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_inode; + } } status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode, diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 690ddc60189b..7a0126267847 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -286,6 +286,8 @@ enum ocfs2_mount_options OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ + OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */ + OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */ }; #define OCFS2_OSB_SOFT_RO 0x0001 diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index db64ce2d4667..540ab5b75dbb 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -168,7 +168,7 @@ /* Refcount tree support */ #define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000 -/* Discontigous block groups */ +/* Discontiguous block groups */ #define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 /* @@ -939,7 +939,7 @@ struct ocfs2_group_desc /* * Block groups may be discontiguous when * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set. - * The extents of a discontigous block group are + * The extents of a discontiguous block group are * stored in bg_list. It is a flat list. * l_tree_depth must always be zero. 
A * discontiguous group is signified by a non-zero diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 3d0b63d34225..8a54fd8a4fa5 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -138,8 +138,7 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block, if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { ocfs2_error(inode->i_sb, - "Quota file %llu is probably corrupted! Requested " - "to read block %Lu but file has size only %Lu\n", + "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)v_block, (unsigned long long)i_size_read(inode)); @@ -499,8 +498,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, dquot = dqget(sb, make_kqid(&init_user_ns, type, le64_to_cpu(dqblk->dqb_id))); - if (!dquot) { - status = -EIO; + if (IS_ERR(dquot)) { + status = PTR_ERR(dquot); mlog(ML_ERROR, "Failed to get quota structure " "for id %u, type %d. Cannot finish quota " "file recovery.\n", diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index b69dd14c0b9b..e5d57cd32505 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -102,32 +102,30 @@ static int ocfs2_validate_refcount_block(struct super_block *sb, if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) { - ocfs2_error(sb, - "Refcount block #%llu has bad signature %.*s", - (unsigned long long)bh->b_blocknr, 7, - rb->rf_signature); - return -EINVAL; + rc = ocfs2_error(sb, + "Refcount block #%llu has bad signature %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + rb->rf_signature); + goto out; } if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) { - ocfs2_error(sb, - "Refcount block #%llu has an invalid rf_blkno " - "of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(rb->rf_blkno)); - return -EINVAL; + rc = ocfs2_error(sb, + "Refcount block #%llu has an invalid rf_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(rb->rf_blkno)); + goto out; } if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, - "Refcount block #%llu has an invalid " - "rf_fs_generation of #%u", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(rb->rf_fs_generation)); - return -EINVAL; + rc = ocfs2_error(sb, + "Refcount block #%llu has an invalid rf_fs_generation of #%u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(rb->rf_fs_generation)); + goto out; } - - return 0; +out: + return rc; } static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci, @@ -1102,12 +1100,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(sb, - "refcount tree %llu has non zero tree " - "depth in leaf btree tree block %llu\n", - (unsigned long long)ocfs2_metadata_cache_owner(ci), - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(sb, + "refcount tree %llu has non zero tree depth in leaf btree tree block %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -2359,10 +2355,8 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode, cpos, len, phys); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " - "tree, but the feature bit is not set in the " - "super block.", inode->i_ino); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount 
tree, but the feature bit is not set in the super block\n", + inode->i_ino); goto out; } @@ -2545,10 +2539,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " - "tree, but the feature bit is not set in the " - "super block.", inode->i_ino); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + inode->i_ino); goto out; } @@ -2672,11 +2664,10 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "leaf block %llu\n", inode->i_ino, - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in leaf block %llu\n", + inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -3106,11 +3097,9 @@ static int ocfs2_clear_ext_refcount(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { - ocfs2_error(sb, - "Inode %llu has an extent at cpos %u which can no " - "longer be found.\n", - (unsigned long long)ino, cpos); - ret = -EROFS; + ret = ocfs2_error(sb, + "Inode %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ino, cpos); goto out; } @@ -3376,10 +3365,8 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " - "tree, but the feature bit is not set in the " - "super block.", inode->i_ino); - return -EROFS; + return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + inode->i_ino); } ocfs2_init_dealloc_ctxt(&context->dealloc); @@ -4419,8 +4406,9 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, } mutex_lock(&inode->i_mutex); - dquot_initialize(dir); - error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); + error = dquot_initialize(dir); + if (!error) + error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); mutex_unlock(&inode->i_mutex); if (!error) fsnotify_create(dir, new_dentry); diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 2768eb1da2b8..ced70c8139f7 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -655,14 +655,7 @@ static int ocfs2_control_init(void) static void ocfs2_control_exit(void) { - int rc; - - rc = misc_deregister(&ocfs2_control_device); - if (rc) - printk(KERN_ERR - "ocfs2: Unable to deregister ocfs2_control device " - "(errno %d)\n", - -rc); + misc_deregister(&ocfs2_control_device); } static void fsdlm_lock_ast_wrapper(void *astarg) diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 4479029630bb..d83d2602cf2b 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -149,10 +149,8 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) brelse(ac->ac_bh); ac->ac_bh = NULL; ac->ac_resv = NULL; - if (ac->ac_find_loc_priv) { - kfree(ac->ac_find_loc_priv); - ac->ac_find_loc_priv = NULL; - } + kfree(ac->ac_find_loc_priv); + ac->ac_find_loc_priv = NULL; } void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) @@ -167,12 +165,12 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) } #define 
do_error(fmt, ...) \ - do{ \ - if (resize) \ - mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ - else \ - ocfs2_error(sb, fmt, ##__VA_ARGS__); \ - } while (0) +do { \ + if (resize) \ + mlog(ML_ERROR, fmt, ##__VA_ARGS__); \ + else \ + return ocfs2_error(sb, fmt, ##__VA_ARGS__); \ +} while (0) static int ocfs2_validate_gd_self(struct super_block *sb, struct buffer_head *bh, @@ -181,44 +179,35 @@ static int ocfs2_validate_gd_self(struct super_block *sb, struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { - do_error("Group descriptor #%llu has bad signature %.*s", + do_error("Group descriptor #%llu has bad signature %.*s\n", (unsigned long long)bh->b_blocknr, 7, gd->bg_signature); - return -EINVAL; } if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) { - do_error("Group descriptor #%llu has an invalid bg_blkno " - "of %llu", + do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(gd->bg_blkno)); - return -EINVAL; } if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) { - do_error("Group descriptor #%llu has an invalid " - "fs_generation of #%u", + do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n", (unsigned long long)bh->b_blocknr, le32_to_cpu(gd->bg_generation)); - return -EINVAL; } if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { - do_error("Group descriptor #%llu has bit count %u but " - "claims that %u are free", + do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits), le16_to_cpu(gd->bg_free_bits_count)); - return -EINVAL; } if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { - do_error("Group descriptor #%llu has bit count %u but " - "max bitmap bits of %u", + do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits), 8 * le16_to_cpu(gd->bg_size)); - return -EINVAL; } return 0; @@ -233,20 +222,17 @@ static int ocfs2_validate_gd_parent(struct super_block *sb, struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; if (di->i_blkno != gd->bg_parent_dinode) { - do_error("Group descriptor #%llu has bad parent " - "pointer (%llu, expected %llu)", + do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), (unsigned long long)le64_to_cpu(di->i_blkno)); - return -EINVAL; } max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); if (le16_to_cpu(gd->bg_bits) > max_bits) { - do_error("Group descriptor #%llu has bit count of %u", + do_error("Group descriptor #%llu has bit count of %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits)); - return -EINVAL; } /* In resize, we may meet the case bg_chain == cl_next_free_rec. 
*/ @@ -254,10 +240,9 @@ static int ocfs2_validate_gd_parent(struct super_block *sb, le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) || ((le16_to_cpu(gd->bg_chain) == le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) { - do_error("Group descriptor #%llu has bad chain %u", + do_error("Group descriptor #%llu has bad chain %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_chain)); - return -EINVAL; } return 0; @@ -384,11 +369,10 @@ static int ocfs2_block_group_fill(handle_t *handle, struct super_block * sb = alloc_inode->i_sb; if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { - ocfs2_error(alloc_inode->i_sb, "group block (%llu) != " - "b_blocknr (%llu)", - (unsigned long long)group_blkno, - (unsigned long long) bg_bh->b_blocknr); - status = -EIO; + status = ocfs2_error(alloc_inode->i_sb, + "group block (%llu) != b_blocknr (%llu)\n", + (unsigned long long)group_blkno, + (unsigned long long) bg_bh->b_blocknr); goto bail; } @@ -834,9 +818,9 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { - ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", - (unsigned long long)le64_to_cpu(fe->i_blkno)); - status = -EIO; + status = ocfs2_error(alloc_inode->i_sb, + "Invalid chain allocator %llu\n", + (unsigned long long)le64_to_cpu(fe->i_blkno)); goto bail; } @@ -1370,12 +1354,11 @@ int ocfs2_block_group_set_bits(handle_t *handle, le16_add_cpu(&bg->bg_free_bits_count, -num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { - ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" - " count %u but claims %u are freed. num_bits %d", - (unsigned long long)le64_to_cpu(bg->bg_blkno), - le16_to_cpu(bg->bg_bits), - le16_to_cpu(bg->bg_free_bits_count), num_bits); - return -EROFS; + return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), + num_bits); } while(num_bits--) ocfs2_set_bit(bit_off++, bitmap); @@ -1905,13 +1888,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, if (le32_to_cpu(fe->id1.bitmap1.i_used) >= le32_to_cpu(fe->id1.bitmap1.i_total)) { - ocfs2_error(ac->ac_inode->i_sb, - "Chain allocator dinode %llu has %u used " - "bits but only %u total.", - (unsigned long long)le64_to_cpu(fe->i_blkno), - le32_to_cpu(fe->id1.bitmap1.i_used), - le32_to_cpu(fe->id1.bitmap1.i_total)); - status = -EIO; + status = ocfs2_error(ac->ac_inode->i_sb, + "Chain allocator dinode %llu has %u used bits but only %u total\n", + (unsigned long long)le64_to_cpu(fe->i_blkno), + le32_to_cpu(fe->id1.bitmap1.i_used), + le32_to_cpu(fe->id1.bitmap1.i_total)); goto bail; } @@ -2429,12 +2410,11 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, } le16_add_cpu(&bg->bg_free_bits_count, num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { - ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" - " count %u but claims %u are freed. num_bits %d", - (unsigned long long)le64_to_cpu(bg->bg_blkno), - le16_to_cpu(bg->bg_bits), - le16_to_cpu(bg->bg_free_bits_count), num_bits); - return -EROFS; + return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. 
num_bits %d\n", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), + num_bits); } if (undo_fn) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 403c5660b306..2de4c8a9340c 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -192,6 +192,7 @@ enum { Opt_resv_level, Opt_dir_resv_level, Opt_journal_async_commit, + Opt_err_cont, Opt_err, }; @@ -224,6 +225,7 @@ static const match_table_t tokens = { {Opt_resv_level, "resv_level=%u"}, {Opt_dir_resv_level, "dir_resv_level=%u"}, {Opt_journal_async_commit, "journal_async_commit"}, + {Opt_err_cont, "errors=continue"}, {Opt_err, NULL} }; @@ -1330,10 +1332,19 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->mount_opt |= OCFS2_MOUNT_NOINTR; break; case Opt_err_panic: + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT; + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS; mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; break; case Opt_err_ro: + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT; mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; + mopt->mount_opt |= OCFS2_MOUNT_ERRORS_ROFS; + break; + case Opt_err_cont: + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS; + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; + mopt->mount_opt |= OCFS2_MOUNT_ERRORS_CONT; break; case Opt_data_ordered: mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; @@ -1530,6 +1541,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) if (opts & OCFS2_MOUNT_ERRORS_PANIC) seq_printf(s, ",errors=panic"); + else if (opts & OCFS2_MOUNT_ERRORS_CONT) + seq_printf(s, ",errors=continue"); else seq_printf(s, ",errors=remount-ro"); @@ -1550,8 +1563,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",localflocks,"); if (osb->osb_cluster_stack[0]) - seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, - osb->osb_cluster_stack); + seq_show_option_n(s, "cluster_stack", osb->osb_cluster_stack, + OCFS2_STACK_LABEL_LEN); if (opts & OCFS2_MOUNT_USRQUOTA) seq_printf(s, ",usrquota"); if (opts & OCFS2_MOUNT_GRPQUOTA) @@ -1746,8 +1759,6 @@ static void ocfs2_inode_init_once(void *data) ocfs2_lock_res_init_once(&oi->ip_inode_lockres); ocfs2_lock_res_init_once(&oi->ip_open_lockres); - init_waitqueue_head(&oi->append_dio_wq); - ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), &ocfs2_inode_caching_ops); @@ -2541,31 +2552,43 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) memset(osb, 0, sizeof(struct ocfs2_super)); } -/* Put OCFS2 into a readonly state, or (if the user specifies it), - * panic(). We do not support continue-on-error operation. */ -static void ocfs2_handle_error(struct super_block *sb) +/* Depending on the mount option passed, perform one of the following: + * Put OCFS2 into a readonly state (default) + * Return EIO so that only the process errs + * Fix the error as if fsck.ocfs2 -y + * panic + */ +static int ocfs2_handle_error(struct super_block *sb) { struct ocfs2_super *osb = OCFS2_SB(sb); - - if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) - panic("OCFS2: (device %s): panic forced after error\n", - sb->s_id); + int rv = 0; ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); + pr_crit("On-disk corruption discovered. " + "Please run fsck.ocfs2 once the filesystem is unmounted.\n"); - if (sb->s_flags & MS_RDONLY && - (ocfs2_is_soft_readonly(osb) || - ocfs2_is_hard_readonly(osb))) - return; - - printk(KERN_CRIT "File system is now read-only due to the potential " - "of on-disk corruption. 
Please run fsck.ocfs2 once the file " - "system is unmounted.\n"); - sb->s_flags |= MS_RDONLY; - ocfs2_set_ro_flag(osb, 0); + if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) { + panic("OCFS2: (device %s): panic forced after error\n", + sb->s_id); + } else if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_CONT) { + pr_crit("OCFS2: Returning error to the calling process.\n"); + rv = -EIO; + } else { /* default option */ + rv = -EROFS; + if (sb->s_flags & MS_RDONLY && + (ocfs2_is_soft_readonly(osb) || + ocfs2_is_hard_readonly(osb))) + return rv; + + pr_crit("OCFS2: File system is now read-only.\n"); + sb->s_flags |= MS_RDONLY; + ocfs2_set_ro_flag(osb, 0); + } + + return rv; } -void __ocfs2_error(struct super_block *sb, const char *function, +int __ocfs2_error(struct super_block *sb, const char *function, const char *fmt, ...) { struct va_format vaf; @@ -2577,12 +2600,12 @@ void __ocfs2_error(struct super_block *sb, const char *function, /* Not using mlog here because we want to show the actual * function the error came from. */ - printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n", + printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV", sb->s_id, function, &vaf); va_end(args); - ocfs2_handle_error(sb); + return ocfs2_handle_error(sb); } /* Handle critical errors. This is intentionally more drastic than @@ -2599,7 +2622,7 @@ void __ocfs2_abort(struct super_block *sb, const char *function, vaf.fmt = fmt; vaf.va = &args; - printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n", + printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV", sb->s_id, function, &vaf); va_end(args); diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index 74ff74cf78fe..b477d0b1c7b6 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -32,16 +32,18 @@ int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, int node_num); __printf(3, 4) -void __ocfs2_error(struct super_block *sb, const char *function, +int __ocfs2_error(struct super_block *sb, const char *function, const char *fmt, ...); -#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) +#define ocfs2_error(sb, fmt, ...) \ + __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__) __printf(3, 4) void __ocfs2_abort(struct super_block *sb, const char *function, const char *fmt, ...); -#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) +#define ocfs2_abort(sb, fmt, ...) 
\ + __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__) /* * Void signal blockers, because in-kernel sigprocmask() only fails diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 889f3796a0d7..ebfdea78659b 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -499,30 +499,24 @@ static int ocfs2_validate_xattr_block(struct super_block *sb, */ if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { - ocfs2_error(sb, - "Extended attribute block #%llu has bad " - "signature %.*s", - (unsigned long long)bh->b_blocknr, 7, - xb->xb_signature); - return -EINVAL; + return ocfs2_error(sb, + "Extended attribute block #%llu has bad signature %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + xb->xb_signature); } if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) { - ocfs2_error(sb, - "Extended attribute block #%llu has an " - "invalid xb_blkno of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(xb->xb_blkno)); - return -EINVAL; + return ocfs2_error(sb, + "Extended attribute block #%llu has an invalid xb_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(xb->xb_blkno)); } if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, - "Extended attribute block #%llu has an invalid " - "xb_fs_generation of #%u", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(xb->xb_fs_generation)); - return -EINVAL; + return ocfs2_error(sb, + "Extended attribute block #%llu has an invalid xb_fs_generation of #%u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(xb->xb_fs_generation)); } return 0; @@ -3694,11 +3688,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "xattr tree block %llu\n", inode->i_ino, - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in xattr tree block %llu\n", + inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -3713,11 +3706,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode, } if (!e_blkno) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0) in xattr", inode->i_ino, - le32_to_cpu(rec->e_cpos), - ocfs2_rec_clusters(el, rec)); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent record (%u, %u, 0) in xattr\n", + inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); goto out; } @@ -7334,6 +7326,9 @@ static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list, const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; + if (!capable(CAP_SYS_ADMIN)) + return 0; + if (list && total_len <= list_size) { memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); memcpy(list + prefix_len, name, name_len); diff --git a/fs/open.c b/fs/open.c index e33dab287fa0..b6f1e96a7c0b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -377,7 +377,7 @@ retry: * with the "noexec" flag. 
*/ res = -EACCES; - if (path.mnt->mnt_flags & MNT_NOEXEC) + if (path_noexec(&path)) goto out_path_release; } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 7466ff339c66..79073d68b475 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -588,10 +588,10 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) struct super_block *sb = dentry->d_sb; struct ovl_fs *ufs = sb->s_fs_info; - seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir); + seq_show_option(m, "lowerdir", ufs->config.lowerdir); if (ufs->config.upperdir) { - seq_printf(m, ",upperdir=%s", ufs->config.upperdir); - seq_printf(m, ",workdir=%s", ufs->config.workdir); + seq_show_option(m, "upperdir", ufs->config.upperdir); + seq_show_option(m, "workdir", ufs->config.workdir); } return 0; } diff --git a/fs/proc/array.c b/fs/proc/array.c index ce065cf3104f..f60f0121e331 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -308,7 +308,8 @@ static void render_cap_t(struct seq_file *m, const char *header, static inline void task_cap(struct seq_file *m, struct task_struct *p) { const struct cred *cred; - kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset; + kernel_cap_t cap_inheritable, cap_permitted, cap_effective, + cap_bset, cap_ambient; rcu_read_lock(); cred = __task_cred(p); @@ -316,12 +317,14 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) cap_permitted = cred->cap_permitted; cap_effective = cred->cap_effective; cap_bset = cred->cap_bset; + cap_ambient = cred->cap_ambient; rcu_read_unlock(); render_cap_t(m, "CapInh:\t", &cap_inheritable); render_cap_t(m, "CapPrm:\t", &cap_permitted); render_cap_t(m, "CapEff:\t", &cap_effective); render_cap_t(m, "CapBnd:\t", &cap_bset); + render_cap_t(m, "CapAmb:\t", &cap_ambient); } static inline void task_seccomp(struct seq_file *m, struct task_struct *p) diff --git a/fs/proc/base.c b/fs/proc/base.c index aa50d1ac28fc..b25eee4cead5 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1230,10 +1230,9 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); - char *page, *tmp; - ssize_t length; uid_t loginuid; kuid_t kloginuid; + int rv; rcu_read_lock(); if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { @@ -1242,46 +1241,28 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, } rcu_read_unlock(); - if (count >= PAGE_SIZE) - count = PAGE_SIZE - 1; - if (*ppos != 0) { /* No partial writes. */ return -EINVAL; } - page = (char*)__get_free_page(GFP_TEMPORARY); - if (!page) - return -ENOMEM; - length = -EFAULT; - if (copy_from_user(page, buf, count)) - goto out_free_page; - - page[count] = '\0'; - loginuid = simple_strtoul(page, &tmp, 10); - if (tmp == page) { - length = -EINVAL; - goto out_free_page; - } + rv = kstrtou32_from_user(buf, count, 10, &loginuid); + if (rv < 0) + return rv; /* is userspace tring to explicitly UNSET the loginuid? 
*/ if (loginuid == AUDIT_UID_UNSET) { kloginuid = INVALID_UID; } else { kloginuid = make_kuid(file->f_cred->user_ns, loginuid); - if (!uid_valid(kloginuid)) { - length = -EINVAL; - goto out_free_page; - } + if (!uid_valid(kloginuid)) + return -EINVAL; } - length = audit_set_loginuid(kloginuid); - if (likely(length == 0)) - length = count; - -out_free_page: - free_page((unsigned long) page); - return length; + rv = audit_set_loginuid(kloginuid); + if (rv < 0) + return rv; + return count; } static const struct file_operations proc_loginuid_operations = { @@ -1335,8 +1316,9 @@ static ssize_t proc_fault_inject_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct task_struct *task; - char buffer[PROC_NUMBUF], *end; + char buffer[PROC_NUMBUF]; int make_it_fail; + int rv; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; @@ -1345,9 +1327,9 @@ static ssize_t proc_fault_inject_write(struct file * file, count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - make_it_fail = simple_strtol(strstrip(buffer), &end, 0); - if (*end) - return -EINVAL; + rv = kstrtoint(strstrip(buffer), 0, &make_it_fail); + if (rv < 0) + return rv; if (make_it_fail < 0 || make_it_fail > 1) return -EINVAL; @@ -1836,8 +1818,6 @@ end_instantiate: return dir_emit(ctx, name, len, 1, DT_UNKNOWN); } -#ifdef CONFIG_CHECKPOINT_RESTORE - /* * dname_to_vma_addr - maps a dentry name into two unsigned longs * which represent vma start and end addresses. @@ -1864,11 +1844,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - if (!capable(CAP_SYS_ADMIN)) { - status = -EPERM; - goto out_notask; - } - inode = d_inode(dentry); task = get_proc_task(inode); if (!task) @@ -1957,6 +1932,29 @@ struct map_files_info { unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ }; +/* + * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the + * symlinks may be used to bypass permissions on ancestor directories in the + * path to the file in question. 
+ */ +static const char * +proc_map_files_follow_link(struct dentry *dentry, void **cookie) +{ + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + return proc_pid_follow_link(dentry, NULL); +} + +/* + * Identical to proc_pid_link_inode_operations except for follow_link() + */ +static const struct inode_operations proc_map_files_link_inode_operations = { + .readlink = proc_pid_readlink, + .follow_link = proc_map_files_follow_link, + .setattr = proc_setattr, +}; + static int proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) @@ -1972,7 +1970,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, ei = PROC_I(inode); ei->op.proc_get_link = proc_map_files_get_link; - inode->i_op = &proc_pid_link_inode_operations; + inode->i_op = &proc_map_files_link_inode_operations; inode->i_size = 64; inode->i_mode = S_IFLNK; @@ -1996,10 +1994,6 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, int result; struct mm_struct *mm; - result = -EPERM; - if (!capable(CAP_SYS_ADMIN)) - goto out; - result = -ENOENT; task = get_proc_task(dir); if (!task) @@ -2053,10 +2047,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) struct map_files_info *p; int ret; - ret = -EPERM; - if (!capable(CAP_SYS_ADMIN)) - goto out; - ret = -ENOENT; task = get_proc_task(file_inode(file)); if (!task) @@ -2245,7 +2235,6 @@ static const struct file_operations proc_timers_operations = { .llseek = seq_lseek, .release = seq_release_private, }; -#endif /* CONFIG_CHECKPOINT_RESTORE */ static int proc_pident_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) @@ -2481,32 +2470,20 @@ static ssize_t proc_coredump_filter_write(struct file *file, { struct task_struct *task; struct mm_struct *mm; - char buffer[PROC_NUMBUF], *end; unsigned int val; int ret; int i; unsigned long mask; - ret = -EFAULT; - memset(buffer, 0, sizeof(buffer)); - if (count > sizeof(buffer) - 1) - count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) - goto out_no_task; - - ret = -EINVAL; - val = (unsigned int)simple_strtoul(buffer, &end, 0); - if (*end == '\n') - end++; - if (end - buffer == 0) - goto out_no_task; + ret = kstrtouint_from_user(buf, count, 0, &val); + if (ret < 0) + return ret; ret = -ESRCH; task = get_proc_task(file_inode(file)); if (!task) goto out_no_task; - ret = end - buffer; mm = get_task_mm(task); if (!mm) goto out_no_mm; @@ -2522,7 +2499,9 @@ static ssize_t proc_coredump_filter_write(struct file *file, out_no_mm: put_task_struct(task); out_no_task: - return ret; + if (ret < 0) + return ret; + return count; } static const struct file_operations proc_coredump_filter_operations = { @@ -2744,9 +2723,7 @@ static const struct inode_operations proc_task_inode_operations; static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), -#ifdef CONFIG_CHECKPOINT_RESTORE DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), -#endif DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET diff --git a/fs/proc/generic.c b/fs/proc/generic.c index e5dee5c3188e..ff3ffc76a937 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -26,7 +26,7 @@ #include "internal.h" -static 
DEFINE_SPINLOCK(proc_subdir_lock); +static DEFINE_RWLOCK(proc_subdir_lock); static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) { @@ -172,9 +172,9 @@ static int xlate_proc_name(const char *name, struct proc_dir_entry **ret, { int rv; - spin_lock(&proc_subdir_lock); + read_lock(&proc_subdir_lock); rv = __xlate_proc_name(name, ret, residual); - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); return rv; } @@ -231,11 +231,11 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, { struct inode *inode; - spin_lock(&proc_subdir_lock); + read_lock(&proc_subdir_lock); de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len); if (de) { pde_get(de); - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); inode = proc_get_inode(dir->i_sb, de); if (!inode) return ERR_PTR(-ENOMEM); @@ -243,7 +243,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, d_add(dentry, inode); return NULL; } - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); return ERR_PTR(-ENOENT); } @@ -270,12 +270,12 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, if (!dir_emit_dots(file, ctx)) return 0; - spin_lock(&proc_subdir_lock); + read_lock(&proc_subdir_lock); de = pde_subdir_first(de); i = ctx->pos - 2; for (;;) { if (!de) { - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); return 0; } if (!i) @@ -287,19 +287,19 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, do { struct proc_dir_entry *next; pde_get(de); - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); if (!dir_emit(ctx, de->name, de->namelen, de->low_ino, de->mode >> 12)) { pde_put(de); return 0; } - spin_lock(&proc_subdir_lock); + read_lock(&proc_subdir_lock); ctx->pos++; next = pde_subdir_next(de); pde_put(de); de = next; } while (de); - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); return 1; } @@ -338,16 +338,16 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp if (ret) return ret; - spin_lock(&proc_subdir_lock); + write_lock(&proc_subdir_lock); dp->parent = dir; if (pde_subdir_insert(dir, dp) == false) { WARN(1, "proc_dir_entry '%s/%s' already registered\n", dir->name, dp->name); - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); proc_free_inum(dp->low_ino); return -EEXIST; } - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); return 0; } @@ -549,9 +549,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) const char *fn = name; unsigned int len; - spin_lock(&proc_subdir_lock); + write_lock(&proc_subdir_lock); if (__xlate_proc_name(name, &parent, &fn) != 0) { - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); return; } len = strlen(fn); @@ -559,7 +559,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) de = pde_subdir_find(parent, fn, len); if (de) rb_erase(&de->subdir_node, &parent->subdir); - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); if (!de) { WARN(1, "name '%s'\n", name); return; @@ -583,16 +583,16 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) const char *fn = name; unsigned int len; - spin_lock(&proc_subdir_lock); + write_lock(&proc_subdir_lock); if (__xlate_proc_name(name, &parent, &fn) != 0) { - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); return -ENOENT; } len = strlen(fn); root = pde_subdir_find(parent, fn, len); if (!root) 
{ - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); return -ENOENT; } rb_erase(&root->subdir_node, &parent->subdir); @@ -605,7 +605,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) de = next; continue; } - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); proc_entry_rundown(de); next = de->parent; @@ -616,7 +616,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) break; pde_put(de); - spin_lock(&proc_subdir_lock); + write_lock(&proc_subdir_lock); de = next; } pde_put(root); diff --git a/fs/proc/page.c b/fs/proc/page.c index 7eee2d8b97d9..93484034a03d 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -9,12 +9,16 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/hugetlb.h> +#include <linux/memcontrol.h> +#include <linux/mmu_notifier.h> +#include <linux/page_idle.h> #include <linux/kernel-page-flags.h> #include <asm/uaccess.h> #include "internal.h" #define KPMSIZE sizeof(u64) #define KPMMASK (KPMSIZE - 1) +#define KPMBITS (KPMSIZE * BITS_PER_BYTE) /* /proc/kpagecount - an array exposing page counts * @@ -54,6 +58,8 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, pfn++; out++; count -= KPMSIZE; + + cond_resched(); } *ppos += (char __user *)out - buf; @@ -146,6 +152,9 @@ u64 stable_page_flags(struct page *page) if (PageBalloon(page)) u |= 1 << KPF_BALLOON; + if (page_is_idle(page)) + u |= 1 << KPF_IDLE; + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); @@ -212,6 +221,8 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, pfn++; out++; count -= KPMSIZE; + + cond_resched(); } *ppos += (char __user *)out - buf; @@ -225,10 +236,64 @@ static const struct file_operations proc_kpageflags_operations = { .read = kpageflags_read, }; +#ifdef CONFIG_MEMCG +static ssize_t kpagecgroup_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 __user *out = (u64 __user *)buf; + struct page *ppage; + unsigned long src = *ppos; + unsigned long pfn; + ssize_t ret = 0; + u64 ino; + + pfn = src / KPMSIZE; + count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); + if (src & KPMMASK || count & KPMMASK) + return -EINVAL; + + while (count > 0) { + if (pfn_valid(pfn)) + ppage = pfn_to_page(pfn); + else + ppage = NULL; + + if (ppage) + ino = page_cgroup_ino(ppage); + else + ino = 0; + + if (put_user(ino, out)) { + ret = -EFAULT; + break; + } + + pfn++; + out++; + count -= KPMSIZE; + + cond_resched(); + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static const struct file_operations proc_kpagecgroup_operations = { + .llseek = mem_lseek, + .read = kpagecgroup_read, +}; +#endif /* CONFIG_MEMCG */ + static int __init proc_page_init(void) { proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations); proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); +#ifdef CONFIG_MEMCG + proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations); +#endif return 0; } fs_initcall(proc_page_init); diff --git a/fs/proc/root.c b/fs/proc/root.c index 68feb0f70e63..361ab4ee42fc 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -134,6 +134,8 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, } sb->s_flags |= MS_ACTIVE; + /* User space would break if executables appear on proc */ + sb->s_iflags |= SB_I_NOEXEC; } return dget(sb->s_root); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 
ca1e091881d4..e2d46adb54b4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -13,6 +13,7 @@ #include <linux/swap.h> #include <linux/swapops.h> #include <linux/mmu_notifier.h> +#include <linux/page_idle.h> #include <asm/elf.h> #include <asm/uaccess.h> @@ -446,6 +447,7 @@ struct mem_size_stats { unsigned long anonymous_thp; unsigned long swap; u64 pss; + u64 swap_pss; }; static void smaps_account(struct mem_size_stats *mss, struct page *page, @@ -458,7 +460,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, mss->resident += size; /* Accumulate the size in pages that have been accessed. */ - if (young || PageReferenced(page)) + if (young || page_is_young(page) || PageReferenced(page)) mss->referenced += size; mapcount = page_mapcount(page); if (mapcount >= 2) { @@ -492,9 +494,20 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } else if (is_swap_pte(*pte)) { swp_entry_t swpent = pte_to_swp_entry(*pte); - if (!non_swap_entry(swpent)) + if (!non_swap_entry(swpent)) { + int mapcount; + mss->swap += PAGE_SIZE; - else if (is_migration_entry(swpent)) + mapcount = swp_swapcount(swpent); + if (mapcount >= 2) { + u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT; + + do_div(pss_delta, mapcount); + mss->swap_pss += pss_delta; + } else { + mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; + } + } else if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); } @@ -597,6 +610,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_HUGEPAGE)] = "hg", [ilog2(VM_NOHUGEPAGE)] = "nh", [ilog2(VM_MERGEABLE)] = "mg", + [ilog2(VM_UFFD_MISSING)]= "um", + [ilog2(VM_UFFD_WP)] = "uw", }; size_t i; @@ -638,6 +653,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) "Anonymous: %8lu kB\n" "AnonHugePages: %8lu kB\n" "Swap: %8lu kB\n" + "SwapPss: %8lu kB\n" "KernelPageSize: %8lu kB\n" "MMUPageSize: %8lu kB\n" "Locked: %8lu kB\n", @@ -652,6 +668,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) mss.anonymous >> 10, mss.anonymous_thp >> 10, mss.swap >> 10, + (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)), vma_kernel_pagesize(vma) >> 10, vma_mmu_pagesize(vma) >> 10, (vma->vm_flags & VM_LOCKED) ? @@ -710,23 +727,6 @@ const struct file_operations proc_tid_smaps_operations = { .release = proc_map_release, }; -/* - * We do not want to have constant page-shift bits sitting in - * pagemap entries and are about to reuse them some time soon. - * - * Here's the "migration strategy": - * 1. when the system boots these bits remain what they are, - * but a warning about future change is printed in log; - * 2. once anyone clears soft-dirty bits via clear_refs file, - * these flag is set to denote, that user is aware of the - * new API and those page-shift bits change their meaning. - * The respective warning is printed in dmesg; - * 3. In a couple of releases we will remove all the mentions - * of page-shift in pagemap entries. - */ - -static bool soft_dirty_cleared __read_mostly; - enum clear_refs_types { CLEAR_REFS_ALL = 1, CLEAR_REFS_ANON, @@ -808,6 +808,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, /* Clear accessed and referenced bits. */ pmdp_test_and_clear_young(vma, addr, pmd); + test_and_clear_page_young(page); ClearPageReferenced(page); out: spin_unlock(ptl); @@ -835,6 +836,7 @@ out: /* Clear accessed and referenced bits. 
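A brief worked example for the SwapPss accounting added above (numbers are illustrative, not from the patch): with 4 KiB pages, a swapped-out page whose swap entry is referenced from two address spaces has swp_swapcount() == 2, so each process still sees Swap: 4 kB for it but SwapPss: 2 kB, because swap_pss accumulates (PAGE_SIZE << PSS_SHIFT) / 2 and the PSS_SHIFT scaling is divided back out when the value is printed — the same proportional split that Pss already applies to shared resident pages.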
*/ ptep_test_and_clear_young(vma, addr, pte); + test_and_clear_page_young(page); ClearPageReferenced(page); } pte_unmap_unlock(pte - 1, ptl); @@ -887,13 +889,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) return -EINVAL; - if (type == CLEAR_REFS_SOFT_DIRTY) { - soft_dirty_cleared = true; - pr_warn_once("The pagemap bits 55-60 has changed their meaning!" - " See the linux/Documentation/vm/pagemap.txt for " - "details.\n"); - } - task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; @@ -961,36 +956,26 @@ typedef struct { struct pagemapread { int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ pagemap_entry_t *buffer; - bool v2; + bool show_pfn; }; #define PAGEMAP_WALK_SIZE (PMD_SIZE) #define PAGEMAP_WALK_MASK (PMD_MASK) -#define PM_ENTRY_BYTES sizeof(pagemap_entry_t) -#define PM_STATUS_BITS 3 -#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) -#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) -#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK) -#define PM_PSHIFT_BITS 6 -#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) -#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) -#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) -#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) -#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) -/* in "new" pagemap pshift bits are occupied with more status bits */ -#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) - -#define __PM_SOFT_DIRTY (1LL) -#define PM_PRESENT PM_STATUS(4LL) -#define PM_SWAP PM_STATUS(2LL) -#define PM_FILE PM_STATUS(1LL) -#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0) +#define PM_ENTRY_BYTES sizeof(pagemap_entry_t) +#define PM_PFRAME_BITS 55 +#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) +#define PM_SOFT_DIRTY BIT_ULL(55) +#define PM_MMAP_EXCLUSIVE BIT_ULL(56) +#define PM_FILE BIT_ULL(61) +#define PM_SWAP BIT_ULL(62) +#define PM_PRESENT BIT_ULL(63) + #define PM_END_OF_BUFFER 1 -static inline pagemap_entry_t make_pme(u64 val) +static inline pagemap_entry_t make_pme(u64 frame, u64 flags) { - return (pagemap_entry_t) { .pme = val }; + return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags }; } static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, @@ -1011,7 +996,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, while (addr < end) { struct vm_area_struct *vma = find_vma(walk->mm, addr); - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); + pagemap_entry_t pme = make_pme(0, 0); /* End of address space hole, which we mark as non-present. */ unsigned long hole_end; @@ -1031,7 +1016,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, /* Addresses in the VMA. 
*/ if (vma->vm_flags & VM_SOFTDIRTY) - pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY); + pme = make_pme(0, PM_SOFT_DIRTY); for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { err = add_to_pagemap(addr, &pme, pm); if (err) @@ -1042,67 +1027,42 @@ out: return err; } -static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, +static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, struct vm_area_struct *vma, unsigned long addr, pte_t pte) { - u64 frame, flags; + u64 frame = 0, flags = 0; struct page *page = NULL; - int flags2 = 0; if (pte_present(pte)) { - frame = pte_pfn(pte); - flags = PM_PRESENT; + if (pm->show_pfn) + frame = pte_pfn(pte); + flags |= PM_PRESENT; page = vm_normal_page(vma, addr, pte); if (pte_soft_dirty(pte)) - flags2 |= __PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY; } else if (is_swap_pte(pte)) { swp_entry_t entry; if (pte_swp_soft_dirty(pte)) - flags2 |= __PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY; entry = pte_to_swp_entry(pte); frame = swp_type(entry) | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); - flags = PM_SWAP; + flags |= PM_SWAP; if (is_migration_entry(entry)) page = migration_entry_to_page(entry); - } else { - if (vma->vm_flags & VM_SOFTDIRTY) - flags2 |= __PM_SOFT_DIRTY; - *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); - return; } if (page && !PageAnon(page)) flags |= PM_FILE; - if ((vma->vm_flags & VM_SOFTDIRTY)) - flags2 |= __PM_SOFT_DIRTY; - - *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); -} + if (page && page_mapcount(page) == 1) + flags |= PM_MMAP_EXCLUSIVE; + if (vma->vm_flags & VM_SOFTDIRTY) + flags |= PM_SOFT_DIRTY; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, - pmd_t pmd, int offset, int pmd_flags2) -{ - /* - * Currently pmd for thp is always present because thp can not be - * swapped-out, migrated, or HWPOISONed (split in such cases instead.) - * This if-check is just to prepare for future implementation. - */ - if (pmd_present(pmd)) - *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) - | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); - else - *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2)); + return make_pme(frame, flags); } -#else -static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, - pmd_t pmd, int offset, int pmd_flags2) -{ -} -#endif -static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, +static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; @@ -1111,41 +1071,58 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte, *orig_pte; int err = 0; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { - int pmd_flags2; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) { + u64 flags = 0, frame = 0; + pmd_t pmd = *pmdp; - if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) - pmd_flags2 = __PM_SOFT_DIRTY; - else - pmd_flags2 = 0; + if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd)) + flags |= PM_SOFT_DIRTY; + + /* + * Currently pmd for thp is always present because thp + * can not be swapped-out, migrated, or HWPOISONed + * (split in such cases instead.) + * This if-check is just to prepare for future implementation. 
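To keep the reworked pagemap format in view while reading the rest of these hunks: each /proc/<pid>/pagemap entry is one u64 per page, with the page frame number in bits 0-54, soft-dirty in bit 55, the new "exclusively mapped" flag in bit 56, file/shared-anon in bit 61, swapped in bit 62 and present in bit 63; a valid PFN can then be used to index the per-PFN arrays /proc/kpagecount, /proc/kpageflags and (added in this series) /proc/kpagecgroup. A rough userspace sketch, assuming root — after this change the PFN field reads back as zero without CAP_SYS_ADMIN:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	long psize = sysconf(_SC_PAGESIZE);
	char *buf = malloc(psize);
	uint64_t ent = 0, cgino = 0;
	int pm, kcg;

	if (!buf)
		return 1;
	memset(buf, 0, psize);				/* fault the page in */

	pm = open("/proc/self/pagemap", O_RDONLY);
	if (pm < 0 || pread(pm, &ent, sizeof(ent),
			    ((uintptr_t)buf / psize) * sizeof(ent)) != sizeof(ent))
		return 1;

	printf("present=%d exclusive=%d soft-dirty=%d swapped=%d\n",
	       (int)(ent >> 63 & 1), (int)(ent >> 56 & 1),
	       (int)(ent >> 55 & 1), (int)(ent >> 62 & 1));

	if (ent >> 63 & 1) {				/* PFN meaningful only if present */
		uint64_t pfn = ent & ((1ULL << 55) - 1);

		kcg = open("/proc/kpagecgroup", O_RDONLY);
		if (kcg >= 0 && pread(kcg, &cgino, sizeof(cgino),
				      pfn * sizeof(cgino)) == sizeof(cgino))
			printf("pfn=%llu memcg inode=%llu\n",
			       (unsigned long long)pfn, (unsigned long long)cgino);
	}
	return 0;
}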
+ */ + if (pmd_present(pmd)) { + struct page *page = pmd_page(pmd); + + if (page_mapcount(page) == 1) + flags |= PM_MMAP_EXCLUSIVE; + + flags |= PM_PRESENT; + if (pm->show_pfn) + frame = pmd_pfn(pmd) + + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + } for (; addr != end; addr += PAGE_SIZE) { - unsigned long offset; - pagemap_entry_t pme; + pagemap_entry_t pme = make_pme(frame, flags); - offset = (addr & ~PAGEMAP_WALK_MASK) >> - PAGE_SHIFT; - thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2); err = add_to_pagemap(addr, &pme, pm); if (err) break; + if (pm->show_pfn && (flags & PM_PRESENT)) + frame++; } spin_unlock(ptl); return err; } - if (pmd_trans_unstable(pmd)) + if (pmd_trans_unstable(pmdp)) return 0; +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* * We can assume that @vma always points to a valid one and @end never * goes beyond vma->vm_end. */ - orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl); for (; addr < end; pte++, addr += PAGE_SIZE) { pagemap_entry_t pme; - pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); + pme = pte_to_pagemap_entry(pm, vma, addr, *pte); err = add_to_pagemap(addr, &pme, pm); if (err) break; @@ -1158,40 +1135,44 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } #ifdef CONFIG_HUGETLB_PAGE -static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, - pte_t pte, int offset, int flags2) -{ - if (pte_present(pte)) - *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) | - PM_STATUS2(pm->v2, flags2) | - PM_PRESENT); - else - *pme = make_pme(PM_NOT_PRESENT(pm->v2) | - PM_STATUS2(pm->v2, flags2)); -} - /* This function walks within one hugetlb entry in the single call */ -static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, +static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct pagemapread *pm = walk->private; struct vm_area_struct *vma = walk->vma; + u64 flags = 0, frame = 0; int err = 0; - int flags2; - pagemap_entry_t pme; + pte_t pte; if (vma->vm_flags & VM_SOFTDIRTY) - flags2 = __PM_SOFT_DIRTY; - else - flags2 = 0; + flags |= PM_SOFT_DIRTY; + + pte = huge_ptep_get(ptep); + if (pte_present(pte)) { + struct page *page = pte_page(pte); + + if (!PageAnon(page)) + flags |= PM_FILE; + + if (page_mapcount(page) == 1) + flags |= PM_MMAP_EXCLUSIVE; + + flags |= PM_PRESENT; + if (pm->show_pfn) + frame = pte_pfn(pte) + + ((addr & ~hmask) >> PAGE_SHIFT); + } for (; addr != end; addr += PAGE_SIZE) { - int offset = (addr & ~hmask) >> PAGE_SHIFT; - huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2); + pagemap_entry_t pme = make_pme(frame, flags); + err = add_to_pagemap(addr, &pme, pm); if (err) return err; + if (pm->show_pfn && (flags & PM_PRESENT)) + frame++; } cond_resched(); @@ -1209,7 +1190,9 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, * Bits 0-54 page frame number (PFN) if present * Bits 0-4 swap type if swapped * Bits 5-54 swap offset if swapped - * Bits 55-60 page shift (page size = 1<<page shift) + * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt) + * Bit 56 page exclusively mapped + * Bits 57-60 zero * Bit 61 page is file-page or shared-anon * Bit 62 page swapped * Bit 63 page present @@ -1227,42 +1210,37 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, static ssize_t pagemap_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = 
get_proc_task(file_inode(file)); - struct mm_struct *mm; + struct mm_struct *mm = file->private_data; struct pagemapread pm; - int ret = -ESRCH; struct mm_walk pagemap_walk = {}; unsigned long src; unsigned long svpfn; unsigned long start_vaddr; unsigned long end_vaddr; - int copied = 0; + int ret = 0, copied = 0; - if (!task) + if (!mm || !atomic_inc_not_zero(&mm->mm_users)) goto out; ret = -EINVAL; /* file position must be aligned */ if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) - goto out_task; + goto out_mm; ret = 0; if (!count) - goto out_task; + goto out_mm; + + /* do not disclose physical addresses: attack vector */ + pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); - pm.v2 = soft_dirty_cleared; pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); ret = -ENOMEM; if (!pm.buffer) - goto out_task; - - mm = mm_access(task, PTRACE_MODE_READ); - ret = PTR_ERR(mm); - if (!mm || IS_ERR(mm)) - goto out_free; + goto out_mm; - pagemap_walk.pmd_entry = pagemap_pte_range; + pagemap_walk.pmd_entry = pagemap_pmd_range; pagemap_walk.pte_hole = pagemap_pte_hole; #ifdef CONFIG_HUGETLB_PAGE pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; @@ -1273,10 +1251,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, src = *ppos; svpfn = src / PM_ENTRY_BYTES; start_vaddr = svpfn << PAGE_SHIFT; - end_vaddr = TASK_SIZE_OF(task); + end_vaddr = mm->task_size; /* watch out for wraparound */ - if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) + if (svpfn > mm->task_size >> PAGE_SHIFT) start_vaddr = end_vaddr; /* @@ -1303,7 +1281,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, len = min(count, PM_ENTRY_BYTES * pm.pos); if (copy_to_user(buf, pm.buffer, len)) { ret = -EFAULT; - goto out_mm; + goto out_free; } copied += len; buf += len; @@ -1313,24 +1291,31 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!ret || ret == PM_END_OF_BUFFER) ret = copied; -out_mm: - mmput(mm); out_free: kfree(pm.buffer); -out_task: - put_task_struct(task); +out_mm: + mmput(mm); out: return ret; } static int pagemap_open(struct inode *inode, struct file *file) { - /* do not disclose physical addresses: attack vector */ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " - "to stop being page-shift some time soon. 
See the " - "linux/Documentation/vm/pagemap.txt for details.\n"); + struct mm_struct *mm; + + mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(mm)) + return PTR_ERR(mm); + file->private_data = mm; + return 0; +} + +static int pagemap_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + + if (mm) + mmdrop(mm); return 0; } @@ -1338,6 +1323,7 @@ const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, .open = pagemap_open, + .release = pagemap_release, }; #endif /* CONFIG_PROC_PAGE_MONITOR */ diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 20d1f74561cf..ef0d64b2a6d9 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -247,7 +247,7 @@ struct dqstats dqstats; EXPORT_SYMBOL(dqstats); static qsize_t inode_get_rsv_space(struct inode *inode); -static void __dquot_initialize(struct inode *inode, int type); +static int __dquot_initialize(struct inode *inode, int type); static inline unsigned int hashfn(const struct super_block *sb, struct kqid qid) @@ -832,16 +832,17 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type) struct dquot *dqget(struct super_block *sb, struct kqid qid) { unsigned int hashent = hashfn(sb, qid); - struct dquot *dquot = NULL, *empty = NULL; + struct dquot *dquot, *empty = NULL; if (!sb_has_quota_active(sb, qid.type)) - return NULL; + return ERR_PTR(-ESRCH); we_slept: spin_lock(&dq_list_lock); spin_lock(&dq_state_lock); if (!sb_has_quota_active(sb, qid.type)) { spin_unlock(&dq_state_lock); spin_unlock(&dq_list_lock); + dquot = ERR_PTR(-ESRCH); goto out; } spin_unlock(&dq_state_lock); @@ -876,11 +877,15 @@ we_slept: * already finished or it will be canceled due to dq_count > 1 test */ wait_on_dquot(dquot); /* Read the dquot / allocate space in quota file */ - if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && - sb->dq_op->acquire_dquot(dquot) < 0) { - dqput(dquot); - dquot = NULL; - goto out; + if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { + int err; + + err = sb->dq_op->acquire_dquot(dquot); + if (err < 0) { + dqput(dquot); + dquot = ERR_PTR(err); + goto out; + } } #ifdef CONFIG_QUOTA_DEBUG BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */ @@ -923,7 +928,7 @@ static void add_dquot_ref(struct super_block *sb, int type) int reserved = 0; #endif - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || @@ -934,7 +939,7 @@ static void add_dquot_ref(struct super_block *sb, int type) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); #ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) @@ -946,15 +951,15 @@ static void add_dquot_ref(struct super_block *sb, int type) /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the - * inode_sb_list_lock We cannot iput the inode now as we can be + * s_inode_list_lock. We cannot iput the inode now as we can be * holding the last reference and we cannot iput it under - * inode_sb_list_lock. So we keep the reference and iput it + * s_inode_list_lock. So we keep the reference and iput it * later. 
*/ old_inode = inode; - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); iput(old_inode); #ifdef CONFIG_QUOTA_DEBUG @@ -1023,7 +1028,7 @@ static void remove_dquot_ref(struct super_block *sb, int type, struct inode *inode; int reserved = 0; - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { /* * We have to scan also I_NEW inodes because they can already @@ -1039,7 +1044,7 @@ static void remove_dquot_ref(struct super_block *sb, int type, } spin_unlock(&dq_data_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); #ifdef CONFIG_QUOTA_DEBUG if (reserved) { printk(KERN_WARNING "VFS (%s): Writes happened after quota" @@ -1390,15 +1395,16 @@ static int dquot_active(const struct inode *inode) * It is better to call this function outside of any transaction as it * might need a lot of space in journal for dquot structure allocation. */ -static void __dquot_initialize(struct inode *inode, int type) +static int __dquot_initialize(struct inode *inode, int type) { int cnt, init_needed = 0; struct dquot **dquots, *got[MAXQUOTAS]; struct super_block *sb = inode->i_sb; qsize_t rsv; + int ret = 0; if (!dquot_active(inode)) - return; + return 0; dquots = i_dquot(inode); @@ -1407,6 +1413,7 @@ static void __dquot_initialize(struct inode *inode, int type) struct kqid qid; kprojid_t projid; int rc; + struct dquot *dquot; got[cnt] = NULL; if (type != -1 && cnt != type) @@ -1438,16 +1445,25 @@ static void __dquot_initialize(struct inode *inode, int type) qid = make_kqid_projid(projid); break; } - got[cnt] = dqget(sb, qid); + dquot = dqget(sb, qid); + if (IS_ERR(dquot)) { + /* We raced with somebody turning quotas off... 
*/ + if (PTR_ERR(dquot) != -ESRCH) { + ret = PTR_ERR(dquot); + goto out_put; + } + dquot = NULL; + } + got[cnt] = dquot; } /* All required i_dquot has been initialized */ if (!init_needed) - return; + return 0; spin_lock(&dq_data_lock); if (IS_NOQUOTA(inode)) - goto out_err; + goto out_lock; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; @@ -1469,15 +1485,18 @@ static void __dquot_initialize(struct inode *inode, int type) dquot_resv_space(dquots[cnt], rsv); } } -out_err: +out_lock: spin_unlock(&dq_data_lock); +out_put: /* Drop unused references */ dqput_all(got); + + return ret; } -void dquot_initialize(struct inode *inode) +int dquot_initialize(struct inode *inode) { - __dquot_initialize(inode, -1); + return __dquot_initialize(inode, -1); } EXPORT_SYMBOL(dquot_initialize); @@ -1961,18 +1980,37 @@ EXPORT_SYMBOL(__dquot_transfer); int dquot_transfer(struct inode *inode, struct iattr *iattr) { struct dquot *transfer_to[MAXQUOTAS] = {}; + struct dquot *dquot; struct super_block *sb = inode->i_sb; int ret; if (!dquot_active(inode)) return 0; - if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) - transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(iattr->ia_uid)); - if (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid)) - transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(iattr->ia_gid)); - + if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)){ + dquot = dqget(sb, make_kqid_uid(iattr->ia_uid)); + if (IS_ERR(dquot)) { + if (PTR_ERR(dquot) != -ESRCH) { + ret = PTR_ERR(dquot); + goto out_put; + } + dquot = NULL; + } + transfer_to[USRQUOTA] = dquot; + } + if (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid)){ + dquot = dqget(sb, make_kqid_gid(iattr->ia_gid)); + if (IS_ERR(dquot)) { + if (PTR_ERR(dquot) != -ESRCH) { + ret = PTR_ERR(dquot); + goto out_put; + } + dquot = NULL; + } + transfer_to[GRPQUOTA] = dquot; + } ret = __dquot_transfer(inode, transfer_to); +out_put: dqput_all(transfer_to); return ret; } @@ -2518,8 +2556,8 @@ int dquot_get_dqblk(struct super_block *sb, struct kqid qid, struct dquot *dquot; dquot = dqget(sb, qid); - if (!dquot) - return -ESRCH; + if (IS_ERR(dquot)) + return PTR_ERR(dquot); do_get_dqblk(dquot, di); dqput(dquot); @@ -2631,8 +2669,8 @@ int dquot_set_dqblk(struct super_block *sb, struct kqid qid, int rc; dquot = dqget(sb, qid); - if (!dquot) { - rc = -ESRCH; + if (IS_ERR(dquot)) { + rc = PTR_ERR(dquot); goto out; } rc = do_set_dqblk(dquot, di); diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 86ded7375c21..3746367098fd 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -141,9 +141,9 @@ static int quota_getinfo(struct super_block *sb, int type, void __user *addr) if (tstate->flags & QCI_ROOT_SQUASH) uinfo.dqi_flags |= DQF_ROOT_SQUASH; uinfo.dqi_valid = IIF_ALL; - if (!ret && copy_to_user(addr, &uinfo, sizeof(uinfo))) + if (copy_to_user(addr, &uinfo, sizeof(uinfo))) return -EFAULT; - return ret; + return 0; } static int quota_setinfo(struct super_block *sb, int type, void __user *addr) diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index f6f2fbad9777..3d8e7e671d5b 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -3319,8 +3319,11 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) /* must be turned off for recursive notify_change calls */ ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); - if (is_quota_modification(inode, attr)) - dquot_initialize(inode); + if (is_quota_modification(inode, attr)) { + error = 
dquot_initialize(inode); + if (error) + return error; + } reiserfs_write_lock(inode->i_sb); if (attr->ia_valid & ATTR_SIZE) { /* diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index b55a074653d7..5f1c9c29eb8c 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -613,8 +613,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode) * we have to set uid and gid here */ inode_init_owner(inode, dir, mode); - dquot_initialize(inode); - return 0; + return dquot_initialize(inode); } static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, @@ -633,12 +632,18 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod struct reiserfs_transaction_handle th; struct reiserfs_security_handle security; - dquot_initialize(dir); + retval = dquot_initialize(dir); + if (retval) + return retval; if (!(inode = new_inode(dir->i_sb))) { return -ENOMEM; } - new_inode_init(inode, dir, mode); + retval = new_inode_init(inode, dir, mode); + if (retval) { + drop_new_inode(inode); + return retval; + } jbegin_count += reiserfs_cache_default_acl(dir); retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); @@ -710,12 +715,18 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode if (!new_valid_dev(rdev)) return -EINVAL; - dquot_initialize(dir); + retval = dquot_initialize(dir); + if (retval) + return retval; if (!(inode = new_inode(dir->i_sb))) { return -ENOMEM; } - new_inode_init(inode, dir, mode); + retval = new_inode_init(inode, dir, mode); + if (retval) { + drop_new_inode(inode); + return retval; + } jbegin_count += reiserfs_cache_default_acl(dir); retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); @@ -787,7 +798,9 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); - dquot_initialize(dir); + retval = dquot_initialize(dir); + if (retval) + return retval; #ifdef DISPLACE_NEW_PACKING_LOCALITIES /* @@ -800,7 +813,11 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode if (!(inode = new_inode(dir->i_sb))) { return -ENOMEM; } - new_inode_init(inode, dir, mode); + retval = new_inode_init(inode, dir, mode); + if (retval) { + drop_new_inode(inode); + return retval; + } jbegin_count += reiserfs_cache_default_acl(dir); retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); @@ -899,7 +916,9 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); - dquot_initialize(dir); + retval = dquot_initialize(dir); + if (retval) + return retval; reiserfs_write_lock(dir->i_sb); retval = journal_begin(&th, dir->i_sb, jbegin_count); @@ -985,7 +1004,9 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) int jbegin_count; unsigned long savelink; - dquot_initialize(dir); + retval = dquot_initialize(dir); + if (retval) + return retval; inode = d_inode(dentry); @@ -1095,12 +1116,18 @@ static int reiserfs_symlink(struct inode *parent_dir, 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) + REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb)); - dquot_initialize(parent_dir); + retval = dquot_initialize(parent_dir); + if (retval) + return retval; if (!(inode = new_inode(parent_dir->i_sb))) { return -ENOMEM; } - new_inode_init(inode, parent_dir, mode); + retval = new_inode_init(inode, parent_dir, mode); + if (retval) { + 
drop_new_inode(inode); + return retval; + } retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name, &security); @@ -1184,7 +1211,9 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, JOURNAL_PER_BALANCE_CNT * 3 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); - dquot_initialize(dir); + retval = dquot_initialize(dir); + if (retval) + return retval; reiserfs_write_lock(dir->i_sb); if (inode->i_nlink >= REISERFS_LINK_MAX) { @@ -1308,8 +1337,12 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, JOURNAL_PER_BALANCE_CNT * 3 + 5 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb); - dquot_initialize(old_dir); - dquot_initialize(new_dir); + retval = dquot_initialize(old_dir); + if (retval) + return retval; + retval = dquot_initialize(new_dir); + if (retval) + return retval; old_inode = d_inode(old_dentry); new_dentry_inode = d_inode(new_dentry); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 0e4cf728126f..4a62fe8cc3bf 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -714,18 +714,20 @@ static int reiserfs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",acl"); if (REISERFS_SB(s)->s_jdev) - seq_printf(seq, ",jdev=%s", REISERFS_SB(s)->s_jdev); + seq_show_option(seq, "jdev", REISERFS_SB(s)->s_jdev); if (journal->j_max_commit_age != journal->j_default_max_commit_age) seq_printf(seq, ",commit=%d", journal->j_max_commit_age); #ifdef CONFIG_QUOTA if (REISERFS_SB(s)->s_qf_names[USRQUOTA]) - seq_printf(seq, ",usrjquota=%s", REISERFS_SB(s)->s_qf_names[USRQUOTA]); + seq_show_option(seq, "usrjquota", + REISERFS_SB(s)->s_qf_names[USRQUOTA]); else if (opts & (1 << REISERFS_USRQUOTA)) seq_puts(seq, ",usrquota"); if (REISERFS_SB(s)->s_qf_names[GRPQUOTA]) - seq_printf(seq, ",grpjquota=%s", REISERFS_SB(s)->s_qf_names[GRPQUOTA]); + seq_show_option(seq, "grpjquota", + REISERFS_SB(s)->s_qf_names[GRPQUOTA]); else if (opts & (1 << REISERFS_GRPQUOTA)) seq_puts(seq, ",grpquota"); if (REISERFS_SB(s)->s_jquota_fmt) { diff --git a/fs/seq_file.c b/fs/seq_file.c index ce9e39fd5daf..263b125dbcf4 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -12,6 +12,7 @@ #include <linux/slab.h> #include <linux/cred.h> #include <linux/mm.h> +#include <linux/printk.h> #include <asm/uaccess.h> #include <asm/page.h> @@ -773,6 +774,47 @@ void seq_pad(struct seq_file *m, char c) } EXPORT_SYMBOL(seq_pad); +/* A complete analogue of print_hex_dump() */ +void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, + int rowsize, int groupsize, const void *buf, size_t len, + bool ascii) +{ + const u8 *ptr = buf; + int i, linelen, remaining = len; + int ret; + + if (rowsize != 16 && rowsize != 32) + rowsize = 16; + + for (i = 0; i < len && !seq_has_overflowed(m); i += rowsize) { + linelen = min(remaining, rowsize); + remaining -= rowsize; + + switch (prefix_type) { + case DUMP_PREFIX_ADDRESS: + seq_printf(m, "%s%p: ", prefix_str, ptr + i); + break; + case DUMP_PREFIX_OFFSET: + seq_printf(m, "%s%.8x: ", prefix_str, i); + break; + default: + seq_printf(m, "%s", prefix_str); + break; + } + + ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize, + m->buf + m->count, m->size - m->count, + ascii); + if (ret >= m->size - m->count) { + seq_set_overflow(m); + } else { + m->count += ret; + seq_putc(m, '\n'); + } + } +} +EXPORT_SYMBOL(seq_hex_dump); + struct list_head *seq_list_start(struct list_head *head, loff_t pos) { struct list_head *lh; diff --git a/fs/signalfd.c b/fs/signalfd.c index 7e412ad74836..270221fcef42 100644 --- 
a/fs/signalfd.c +++ b/fs/signalfd.c @@ -121,8 +121,9 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, * Other callers might not initialize the si_lsb field, * so check explicitly for the right codes here. */ - if (kinfo->si_code == BUS_MCEERR_AR || - kinfo->si_code == BUS_MCEERR_AO) + if (kinfo->si_signo == SIGBUS && + (kinfo->si_code == BUS_MCEERR_AR || + kinfo->si_code == BUS_MCEERR_AO)) err |= __put_user((short) kinfo->si_addr_lsb, &uinfo->ssi_addr_lsb); #endif diff --git a/fs/super.c b/fs/super.c index b61372354f2b..954aeb80e202 100644 --- a/fs/super.c +++ b/fs/super.c @@ -135,6 +135,24 @@ static unsigned long super_cache_count(struct shrinker *shrink, return total_objects; } +static void destroy_super_work(struct work_struct *work) +{ + struct super_block *s = container_of(work, struct super_block, + destroy_work); + int i; + + for (i = 0; i < SB_FREEZE_LEVELS; i++) + percpu_free_rwsem(&s->s_writers.rw_sem[i]); + kfree(s); +} + +static void destroy_super_rcu(struct rcu_head *head) +{ + struct super_block *s = container_of(head, struct super_block, rcu); + INIT_WORK(&s->destroy_work, destroy_super_work); + schedule_work(&s->destroy_work); +} + /** * destroy_super - frees a superblock * @s: superblock to free @@ -143,16 +161,13 @@ static unsigned long super_cache_count(struct shrinker *shrink, */ static void destroy_super(struct super_block *s) { - int i; list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); - for (i = 0; i < SB_FREEZE_LEVELS; i++) - percpu_counter_destroy(&s->s_writers.counter[i]); security_sb_free(s); WARN_ON(!list_empty(&s->s_mounts)); kfree(s->s_subtype); kfree(s->s_options); - kfree_rcu(s, rcu); + call_rcu(&s->rcu, destroy_super_rcu); } /** @@ -178,19 +193,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) goto fail; for (i = 0; i < SB_FREEZE_LEVELS; i++) { - if (percpu_counter_init(&s->s_writers.counter[i], 0, - GFP_KERNEL) < 0) + if (__percpu_init_rwsem(&s->s_writers.rw_sem[i], + sb_writers_name[i], + &type->s_writers_key[i])) goto fail; - lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], - &type->s_writers_key[i], 0); } - init_waitqueue_head(&s->s_writers.wait); init_waitqueue_head(&s->s_writers.wait_unfrozen); s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); + mutex_init(&s->s_sync_lock); INIT_LIST_HEAD(&s->s_inodes); + spin_lock_init(&s->s_inode_list_lock); if (list_lru_init_memcg(&s->s_dentry_lru)) goto fail; @@ -399,7 +414,7 @@ void generic_shutdown_super(struct super_block *sb) sync_filesystem(sb); sb->s_flags &= ~MS_ACTIVE; - fsnotify_unmount_inodes(&sb->s_inodes); + fsnotify_unmount_inodes(sb); evict_inodes(sb); @@ -1146,72 +1161,46 @@ out: */ void __sb_end_write(struct super_block *sb, int level) { - percpu_counter_dec(&sb->s_writers.counter[level-1]); - /* - * Make sure s_writers are updated before we wake up waiters in - * freeze_super(). - */ - smp_mb(); - if (waitqueue_active(&sb->s_writers.wait)) - wake_up(&sb->s_writers.wait); - rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_); + percpu_up_read(sb->s_writers.rw_sem + level-1); } EXPORT_SYMBOL(__sb_end_write); -#ifdef CONFIG_LOCKDEP -/* - * We want lockdep to tell us about possible deadlocks with freezing but - * it's it bit tricky to properly instrument it. Getting a freeze protection - * works as getting a read lock but there are subtle problems. 
XFS for example - * gets freeze protection on internal level twice in some cases, which is OK - * only because we already hold a freeze protection also on higher level. Due - * to these cases we have to tell lockdep we are doing trylock when we - * already hold a freeze protection for a higher freeze level. - */ -static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock, - unsigned long ip) -{ - int i; - - if (!trylock) { - for (i = 0; i < level - 1; i++) - if (lock_is_held(&sb->s_writers.lock_map[i])) { - trylock = true; - break; - } - } - rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip); -} -#endif - /* * This is an internal function, please use sb_start_{write,pagefault,intwrite} * instead. */ int __sb_start_write(struct super_block *sb, int level, bool wait) { -retry: - if (unlikely(sb->s_writers.frozen >= level)) { - if (!wait) - return 0; - wait_event(sb->s_writers.wait_unfrozen, - sb->s_writers.frozen < level); - } + bool force_trylock = false; + int ret = 1; #ifdef CONFIG_LOCKDEP - acquire_freeze_lock(sb, level, !wait, _RET_IP_); -#endif - percpu_counter_inc(&sb->s_writers.counter[level-1]); /* - * Make sure counter is updated before we check for frozen. - * freeze_super() first sets frozen and then checks the counter. + * We want lockdep to tell us about possible deadlocks with freezing + * but it's it bit tricky to properly instrument it. Getting a freeze + * protection works as getting a read lock but there are subtle + * problems. XFS for example gets freeze protection on internal level + * twice in some cases, which is OK only because we already hold a + * freeze protection also on higher level. Due to these cases we have + * to use wait == F (trylock mode) which must not fail. */ - smp_mb(); - if (unlikely(sb->s_writers.frozen >= level)) { - __sb_end_write(sb, level); - goto retry; + if (wait) { + int i; + + for (i = 0; i < level - 1; i++) + if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) { + force_trylock = true; + break; + } } - return 1; +#endif + if (wait && !force_trylock) + percpu_down_read(sb->s_writers.rw_sem + level-1); + else + ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1); + + WARN_ON(force_trylock & !ret); + return ret; } EXPORT_SYMBOL(__sb_start_write); @@ -1221,37 +1210,33 @@ EXPORT_SYMBOL(__sb_start_write); * @level: type of writers we wait for (normal vs page fault) * * This function waits until there are no writers of given type to given file - * system. Caller of this function should make sure there can be no new writers - * of type @level before calling this function. Otherwise this function can - * livelock. + * system. */ static void sb_wait_write(struct super_block *sb, int level) { - s64 writers; - + percpu_down_write(sb->s_writers.rw_sem + level-1); /* - * We just cycle-through lockdep here so that it does not complain - * about returning with lock to userspace + * We are going to return to userspace and forget about this lock, the + * ownership goes to the caller of thaw_super() which does unlock. + * + * FIXME: we should do this before return from freeze_super() after we + * called sync_filesystem(sb) and s_op->freeze_fs(sb), and thaw_super() + * should re-acquire these locks before s_op->unfreeze_fs(sb). However + * this leads to lockdep false-positives, so currently we do the early + * release right after acquire. 
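Since the freeze machinery being reworked here is normally driven from userspace, the easiest way to exercise the new percpu-rwsem path through freeze_super()/thaw_super() is the FIFREEZE/FITHAW ioctl pair (what fsfreeze(8) uses). A minimal sketch, assuming CAP_SYS_ADMIN and a throwaway filesystem mounted at the hypothetical path /mnt/test:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/test", O_RDONLY);	/* hypothetical mount point */

	if (fd < 0)
		return 1;
	if (ioctl(fd, FIFREEZE, 0))		/* ends up in freeze_super() */
		perror("FIFREEZE");
	else if (ioctl(fd, FITHAW, 0))		/* ends up in thaw_super() */
		perror("FITHAW");
	close(fd);
	return 0;
}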
*/ - rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_); - rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_); - - do { - DEFINE_WAIT(wait); + percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_); +} - /* - * We use a barrier in prepare_to_wait() to separate setting - * of frozen and checking of the counter - */ - prepare_to_wait(&sb->s_writers.wait, &wait, - TASK_UNINTERRUPTIBLE); +static void sb_freeze_unlock(struct super_block *sb) +{ + int level; - writers = percpu_counter_sum(&sb->s_writers.counter[level-1]); - if (writers) - schedule(); + for (level = 0; level < SB_FREEZE_LEVELS; ++level) + percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); - finish_wait(&sb->s_writers.wait, &wait); - } while (writers); + for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) + percpu_up_write(sb->s_writers.rw_sem + level); } /** @@ -1310,20 +1295,14 @@ int freeze_super(struct super_block *sb) return 0; } - /* From now on, no new normal writers can start */ sb->s_writers.frozen = SB_FREEZE_WRITE; - smp_wmb(); - /* Release s_umount to preserve sb_start_write -> s_umount ordering */ up_write(&sb->s_umount); - sb_wait_write(sb, SB_FREEZE_WRITE); + down_write(&sb->s_umount); /* Now we go and block page faults... */ - down_write(&sb->s_umount); sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; - smp_wmb(); - sb_wait_write(sb, SB_FREEZE_PAGEFAULT); /* All writers are done so after syncing there won't be dirty data */ @@ -1331,7 +1310,6 @@ int freeze_super(struct super_block *sb) /* Now wait for internal filesystem counter */ sb->s_writers.frozen = SB_FREEZE_FS; - smp_wmb(); sb_wait_write(sb, SB_FREEZE_FS); if (sb->s_op->freeze_fs) { @@ -1340,7 +1318,7 @@ int freeze_super(struct super_block *sb) printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; - smp_wmb(); + sb_freeze_unlock(sb); wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return ret; @@ -1372,8 +1350,10 @@ int thaw_super(struct super_block *sb) return -EINVAL; } - if (sb->s_flags & MS_RDONLY) + if (sb->s_flags & MS_RDONLY) { + sb->s_writers.frozen = SB_UNFROZEN; goto out; + } if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); @@ -1385,12 +1365,11 @@ int thaw_super(struct super_block *sb) } } -out: sb->s_writers.frozen = SB_UNFROZEN; - smp_wmb(); + sb_freeze_unlock(sb); +out: wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); - return 0; } EXPORT_SYMBOL(thaw_super); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 1c6ac6fcee9f..f3db82071cfb 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -40,6 +40,10 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, SYSFS_MAGIC, &new_sb, ns); if (IS_ERR(root) || !new_sb) kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); + else if (new_sb) + /* Userspace would break if executables appear on sysfs */ + root->d_sb->s_iflags |= SB_I_NOEXEC; + return root; } diff --git a/fs/udf/super.c b/fs/udf/super.c index b96f190bc567..81155b9b445b 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -2070,6 +2070,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) struct udf_options uopt; struct kernel_lb_addr rootdir, fileset; struct udf_sb_info *sbi; + bool lvid_open = false; uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); uopt.uid = INVALID_UID; @@ -2216,8 +2217,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) le16_to_cpu(ts.year), ts.month, ts.day, ts.hour, ts.minute, 
le16_to_cpu(ts.typeAndTimezone)); } - if (!(sb->s_flags & MS_RDONLY)) + if (!(sb->s_flags & MS_RDONLY)) { udf_open_lvid(sb); + lvid_open = true; + } /* Assign the root inode */ /* assign inodes by physical block number */ @@ -2248,7 +2251,7 @@ parse_options_failure: if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) unload_nls(sbi->s_nls_map); #endif - if (!(sb->s_flags & MS_RDONLY)) + if (lvid_open) udf_close_lvid(sb); brelse(sbi->s_lvid_bh); udf_sb_free_partitions(sb); diff --git a/fs/ufs/Makefile b/fs/ufs/Makefile index 4d0e02b022b3..392db25c0b56 100644 --- a/fs/ufs/Makefile +++ b/fs/ufs/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_UFS_FS) += ufs.o ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \ - namei.o super.o symlink.o truncate.o util.o + namei.o super.o symlink.o util.o ccflags-$(CONFIG_UFS_DEBUG) += -DDEBUG diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index a7106eda5024..dc5fae601c24 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -417,12 +417,14 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, if (oldcount == 0) { result = ufs_alloc_fragments (inode, cgno, goal, count, err); if (result) { + ufs_clear_frags(inode, result + oldcount, + newcount - oldcount, locked_page != NULL); + write_seqlock(&UFS_I(inode)->meta_lock); ufs_cpu_to_data_ptr(sb, p, result); + write_sequnlock(&UFS_I(inode)->meta_lock); *err = 0; UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); - ufs_clear_frags(inode, result + oldcount, - newcount - oldcount, locked_page != NULL); } mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT, result %llu\n", (unsigned long long)result); @@ -473,7 +475,9 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, ufs_change_blocknr(inode, fragment - oldcount, oldcount, uspi->s_sbbase + tmp, uspi->s_sbbase + result, locked_page); + write_seqlock(&UFS_I(inode)->meta_lock); ufs_cpu_to_data_ptr(sb, p, result); + write_sequnlock(&UFS_I(inode)->meta_lock); *err = 0; UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index f913a6924b23..a064cf44b143 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -41,9 +41,7 @@ #include "swab.h" #include "util.h" -static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock); - -static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4]) +static int ufs_block_to_path(struct inode *inode, sector_t i_block, unsigned offsets[4]) { struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi; int ptrs = uspi->s_apb; @@ -75,227 +73,232 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off return n; } +typedef struct { + void *p; + union { + __fs32 key32; + __fs64 key64; + }; + struct buffer_head *bh; +} Indirect; + +static inline int grow_chain32(struct ufs_inode_info *ufsi, + struct buffer_head *bh, __fs32 *v, + Indirect *from, Indirect *to) +{ + Indirect *p; + unsigned seq; + to->bh = bh; + do { + seq = read_seqbegin(&ufsi->meta_lock); + to->key32 = *(__fs32 *)(to->p = v); + for (p = from; p <= to && p->key32 == *(__fs32 *)p->p; p++) + ; + } while (read_seqretry(&ufsi->meta_lock, seq)); + return (p > to); +} + +static inline int grow_chain64(struct ufs_inode_info *ufsi, + struct buffer_head *bh, __fs64 *v, + Indirect *from, Indirect *to) +{ + Indirect *p; + unsigned seq; + to->bh = bh; + do { + seq = read_seqbegin(&ufsi->meta_lock); + to->key64 = *(__fs64 *)(to->p = v); + for (p = from; p <= to && p->key64 == *(__fs64 *)p->p; p++) + ; + } while 
(read_seqretry(&ufsi->meta_lock, seq)); + return (p > to); +} + /* * Returns the location of the fragment from * the beginning of the filesystem. */ -static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock) +static u64 ufs_frag_map(struct inode *inode, unsigned offsets[4], int depth) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; u64 mask = (u64) uspi->s_apbmask>>uspi->s_fpbshift; int shift = uspi->s_apbshift-uspi->s_fpbshift; - sector_t offsets[4], *p; - int depth = ufs_block_to_path(inode, frag >> uspi->s_fpbshift, offsets); - u64 ret = 0L; - __fs32 block; - __fs64 u2_block = 0L; + Indirect chain[4], *q = chain; + unsigned *p; unsigned flags = UFS_SB(sb)->s_flags; - u64 temp = 0L; + u64 res = 0; - UFSD(": frag = %llu depth = %d\n", (unsigned long long)frag, depth); UFSD(": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n", uspi->s_fpbshift, uspi->s_apbmask, (unsigned long long)mask); if (depth == 0) - return 0; + goto no_block; +again: p = offsets; - if (needs_lock) - lock_ufs(sb); if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) goto ufs2; - block = ufsi->i_u1.i_data[*p++]; - if (!block) - goto out; + if (!grow_chain32(ufsi, NULL, &ufsi->i_u1.i_data[*p++], chain, q)) + goto changed; + if (!q->key32) + goto no_block; while (--depth) { + __fs32 *ptr; struct buffer_head *bh; - sector_t n = *p++; + unsigned n = *p++; - bh = sb_bread(sb, uspi->s_sbbase + fs32_to_cpu(sb, block)+(n>>shift)); + bh = sb_bread(sb, uspi->s_sbbase + + fs32_to_cpu(sb, q->key32) + (n>>shift)); if (!bh) - goto out; - block = ((__fs32 *) bh->b_data)[n & mask]; - brelse (bh); - if (!block) - goto out; - } - ret = (u64) (uspi->s_sbbase + fs32_to_cpu(sb, block) + (frag & uspi->s_fpbmask)); - goto out; -ufs2: - u2_block = ufsi->i_u1.u2_i_data[*p++]; - if (!u2_block) - goto out; + goto no_block; + ptr = (__fs32 *)bh->b_data + (n & mask); + if (!grow_chain32(ufsi, bh, ptr, chain, ++q)) + goto changed; + if (!q->key32) + goto no_block; + } + res = fs32_to_cpu(sb, q->key32); + goto found; +ufs2: + if (!grow_chain64(ufsi, NULL, &ufsi->i_u1.u2_i_data[*p++], chain, q)) + goto changed; + if (!q->key64) + goto no_block; while (--depth) { + __fs64 *ptr; struct buffer_head *bh; - sector_t n = *p++; - + unsigned n = *p++; - temp = (u64)(uspi->s_sbbase) + fs64_to_cpu(sb, u2_block); - bh = sb_bread(sb, temp +(u64) (n>>shift)); + bh = sb_bread(sb, uspi->s_sbbase + + fs64_to_cpu(sb, q->key64) + (n>>shift)); if (!bh) - goto out; - u2_block = ((__fs64 *)bh->b_data)[n & mask]; - brelse(bh); - if (!u2_block) - goto out; + goto no_block; + ptr = (__fs64 *)bh->b_data + (n & mask); + if (!grow_chain64(ufsi, bh, ptr, chain, ++q)) + goto changed; + if (!q->key64) + goto no_block; } - temp = (u64)uspi->s_sbbase + fs64_to_cpu(sb, u2_block); - ret = temp + (u64) (frag & uspi->s_fpbmask); + res = fs64_to_cpu(sb, q->key64); +found: + res += uspi->s_sbbase; +no_block: + while (q > chain) { + brelse(q->bh); + q--; + } + return res; -out: - if (needs_lock) - unlock_ufs(sb); - return ret; +changed: + while (q > chain) { + brelse(q->bh); + q--; + } + goto again; +} + +/* + * Unpacking tails: we have a file with partial final block and + * we had been asked to extend it. If the fragment being written + * is within the same block, we need to extend the tail just to cover + * that fragment. Otherwise the tail is extended to full block. 
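A concrete reading of the tail-extension rule above (numbers are mine, assuming 8 fragments per block, i.e. s_fpb = 8 and s_fpbmask = 7): with i_lastfrag = 10 the tail block spans fragments 8-15 and (lastfrag | s_fpbmask) = 15, so a write to fragment 13 takes the first branch and the tail is grown to (13 & 7) + 1 = 6 fragments, while a write to fragment 15 or beyond takes the else branch and the old tail is padded out to the full s_fpb = 8 fragments.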
+ * + * Note that we might need to create a _new_ tail, but that will + * be handled elsewhere; this is strictly for resizing old + * ones. + */ +static bool +ufs_extend_tail(struct inode *inode, u64 writes_to, + int *err, struct page *locked_page) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + unsigned lastfrag = ufsi->i_lastfrag; /* it's a short file, so unsigned is enough */ + unsigned block = ufs_fragstoblks(lastfrag); + unsigned new_size; + void *p; + u64 tmp; + + if (writes_to < (lastfrag | uspi->s_fpbmask)) + new_size = (writes_to & uspi->s_fpbmask) + 1; + else + new_size = uspi->s_fpb; + + p = ufs_get_direct_data_ptr(uspi, ufsi, block); + tmp = ufs_new_fragments(inode, p, lastfrag, ufs_data_ptr_to_cpu(sb, p), + new_size, err, locked_page); + return tmp != 0; } /** * ufs_inode_getfrag() - allocate new fragment(s) * @inode: pointer to inode - * @fragment: number of `fragment' which hold pointer - * to new allocated fragment(s) + * @index: number of block pointer within the inode's array. * @new_fragment: number of new allocated fragment(s) - * @required: how many fragment(s) we require * @err: we set it if something wrong - * @phys: pointer to where we save physical number of new allocated fragments, - * NULL if we allocate not data(indirect blocks for example). * @new: we set it if we allocate new block * @locked_page: for ufs_new_fragments() */ -static struct buffer_head * -ufs_inode_getfrag(struct inode *inode, u64 fragment, - sector_t new_fragment, unsigned int required, int *err, - long *phys, int *new, struct page *locked_page) +static u64 +ufs_inode_getfrag(struct inode *inode, unsigned index, + sector_t new_fragment, int *err, + int *new, struct page *locked_page) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - struct buffer_head * result; - unsigned blockoff, lastblockoff; - u64 tmp, goal, lastfrag, block, lastblock; - void *p, *p2; - - UFSD("ENTER, ino %lu, fragment %llu, new_fragment %llu, required %u, " - "metadata %d\n", inode->i_ino, (unsigned long long)fragment, - (unsigned long long)new_fragment, required, !phys); + u64 tmp, goal, lastfrag; + unsigned nfrags = uspi->s_fpb; + void *p; /* TODO : to be done for write support if ( (flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) goto ufs2; */ - block = ufs_fragstoblks (fragment); - blockoff = ufs_fragnum (fragment); - p = ufs_get_direct_data_ptr(uspi, ufsi, block); - - goal = 0; - -repeat: + p = ufs_get_direct_data_ptr(uspi, ufsi, index); tmp = ufs_data_ptr_to_cpu(sb, p); + if (tmp) + goto out; lastfrag = ufsi->i_lastfrag; - if (tmp && fragment < lastfrag) { - if (!phys) { - result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); - if (tmp == ufs_data_ptr_to_cpu(sb, p)) { - UFSD("EXIT, result %llu\n", - (unsigned long long)tmp + blockoff); - return result; - } - brelse (result); - goto repeat; - } else { - *phys = uspi->s_sbbase + tmp + blockoff; - return NULL; - } - } - lastblock = ufs_fragstoblks (lastfrag); - lastblockoff = ufs_fragnum (lastfrag); - /* - * We will extend file into new block beyond last allocated block - */ - if (lastblock < block) { - /* - * We must reallocate last allocated block - */ - if (lastblockoff) { - p2 = ufs_get_direct_data_ptr(uspi, ufsi, lastblock); - tmp = ufs_new_fragments(inode, p2, lastfrag, - ufs_data_ptr_to_cpu(sb, p2), - uspi->s_fpb - lastblockoff, - err, locked_page); - if (!tmp) { - if (lastfrag 
!= ufsi->i_lastfrag) - goto repeat; - else - return NULL; - } - lastfrag = ufsi->i_lastfrag; - - } - tmp = ufs_data_ptr_to_cpu(sb, - ufs_get_direct_data_ptr(uspi, ufsi, - lastblock)); - if (tmp) - goal = tmp + uspi->s_fpb; - tmp = ufs_new_fragments (inode, p, fragment - blockoff, - goal, required + blockoff, - err, - phys != NULL ? locked_page : NULL); - } else if (lastblock == block) { - /* - * We will extend last allocated block - */ - tmp = ufs_new_fragments(inode, p, fragment - - (blockoff - lastblockoff), - ufs_data_ptr_to_cpu(sb, p), - required + (blockoff - lastblockoff), - err, phys != NULL ? locked_page : NULL); - } else /* (lastblock > block) */ { - /* - * We will allocate new block before last allocated block - */ - if (block) { - tmp = ufs_data_ptr_to_cpu(sb, - ufs_get_direct_data_ptr(uspi, ufsi, block - 1)); - if (tmp) - goal = tmp + uspi->s_fpb; - } - tmp = ufs_new_fragments(inode, p, fragment - blockoff, - goal, uspi->s_fpb, err, - phys != NULL ? locked_page : NULL); + /* will that be a new tail? */ + if (new_fragment < UFS_NDIR_FRAGMENT && new_fragment >= lastfrag) + nfrags = (new_fragment & uspi->s_fpbmask) + 1; + + goal = 0; + if (index) { + goal = ufs_data_ptr_to_cpu(sb, + ufs_get_direct_data_ptr(uspi, ufsi, index - 1)); + if (goal) + goal += uspi->s_fpb; } + tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), + goal, uspi->s_fpb, err, locked_page); + if (!tmp) { - if ((!blockoff && ufs_data_ptr_to_cpu(sb, p)) || - (blockoff && lastfrag != ufsi->i_lastfrag)) - goto repeat; *err = -ENOSPC; - return NULL; + return 0; } - if (!phys) { - result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); - } else { - *phys = uspi->s_sbbase + tmp + blockoff; - result = NULL; - *err = 0; + if (new) *new = 1; - } - inode->i_ctime = CURRENT_TIME_SEC; if (IS_SYNC(inode)) ufs_sync_inode (inode); mark_inode_dirty(inode); - UFSD("EXIT, result %llu\n", (unsigned long long)tmp + blockoff); - return result; +out: + return tmp + uspi->s_sbbase; /* This part : To be implemented .... Required only for writing, not required for READ-ONLY. 
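Both ufs_frag_map() above and the reworked ufs_getfrag_block() further down drive everything from an offsets[] array filled in by ufs_block_to_path(): one index per tree level, whether direct, single, double or triple indirect. A rough userspace illustration of that decomposition follows; NDIR and PTRS are invented constants (UFS derives the real fan-out from the superblock via uspi->s_apb and friends), so this is a sketch of the idea, not the kernel helper.

/* Illustrative decomposition of a logical block number into per-level
 * indices for a direct/indirect/double/triple-indirect layout. */
#include <stdio.h>

#define NDIR 12        /* direct pointers in the inode (made up) */
#define PTRS 256       /* pointers per indirect block (made up) */

static int block_to_path(unsigned long block, unsigned offsets[4])
{
    int depth = 0;

    if (block < NDIR) {
        offsets[depth++] = block;
    } else if ((block -= NDIR) < PTRS) {
        offsets[depth++] = NDIR;          /* the single-indirect slot */
        offsets[depth++] = block;
    } else if ((block -= PTRS) < (unsigned long)PTRS * PTRS) {
        offsets[depth++] = NDIR + 1;      /* the double-indirect slot */
        offsets[depth++] = block / PTRS;
        offsets[depth++] = block % PTRS;
    } else if ((block -= (unsigned long)PTRS * PTRS) <
               (unsigned long)PTRS * PTRS * PTRS) {
        offsets[depth++] = NDIR + 2;      /* the triple-indirect slot */
        offsets[depth++] = block / (PTRS * PTRS);
        offsets[depth++] = (block / PTRS) % PTRS;
        offsets[depth++] = block % PTRS;
    }
    return depth;                          /* 0 means "block too big" */
}

int main(void)
{
    unsigned offsets[4];
    int depth = block_to_path(5000, offsets);

    printf("depth %d:", depth);
    for (int i = 0; i < depth; i++)
        printf(" %u", offsets[i]);
    printf("\n");
    return 0;
}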
@@ -316,95 +319,70 @@ repeat2: /** * ufs_inode_getblock() - allocate new block * @inode: pointer to inode - * @bh: pointer to block which hold "pointer" to new allocated block - * @fragment: number of `fragment' which hold pointer - * to new allocated block + * @ind_block: block number of the indirect block + * @index: number of pointer within the indirect block * @new_fragment: number of new allocated fragment * (block will hold this fragment and also uspi->s_fpb-1) * @err: see ufs_inode_getfrag() - * @phys: see ufs_inode_getfrag() * @new: see ufs_inode_getfrag() * @locked_page: see ufs_inode_getfrag() */ -static struct buffer_head * -ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, - u64 fragment, sector_t new_fragment, int *err, - long *phys, int *new, struct page *locked_page) +static u64 +ufs_inode_getblock(struct inode *inode, u64 ind_block, + unsigned index, sector_t new_fragment, int *err, + int *new, struct page *locked_page) { struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - struct buffer_head * result; - unsigned blockoff; - u64 tmp, goal, block; + int shift = uspi->s_apbshift - uspi->s_fpbshift; + u64 tmp = 0, goal; + struct buffer_head *bh; void *p; - block = ufs_fragstoblks (fragment); - blockoff = ufs_fragnum (fragment); - - UFSD("ENTER, ino %lu, fragment %llu, new_fragment %llu, metadata %d\n", - inode->i_ino, (unsigned long long)fragment, - (unsigned long long)new_fragment, !phys); + if (!ind_block) + return 0; - result = NULL; - if (!bh) - goto out; - if (!buffer_uptodate(bh)) { - ll_rw_block (READ, 1, &bh); - wait_on_buffer (bh); - if (!buffer_uptodate(bh)) - goto out; + bh = sb_bread(sb, ind_block + (index >> shift)); + if (unlikely(!bh)) { + *err = -EIO; + return 0; } + + index &= uspi->s_apbmask >> uspi->s_fpbshift; if (uspi->fs_magic == UFS2_MAGIC) - p = (__fs64 *)bh->b_data + block; + p = (__fs64 *)bh->b_data + index; else - p = (__fs32 *)bh->b_data + block; -repeat: + p = (__fs32 *)bh->b_data + index; + tmp = ufs_data_ptr_to_cpu(sb, p); - if (tmp) { - if (!phys) { - result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); - if (tmp == ufs_data_ptr_to_cpu(sb, p)) - goto out; - brelse (result); - goto repeat; - } else { - *phys = uspi->s_sbbase + tmp + blockoff; - goto out; - } - } + if (tmp) + goto out; - if (block && (uspi->fs_magic == UFS2_MAGIC ? - (tmp = fs64_to_cpu(sb, ((__fs64 *)bh->b_data)[block-1])) : - (tmp = fs32_to_cpu(sb, ((__fs32 *)bh->b_data)[block-1])))) + if (index && (uspi->fs_magic == UFS2_MAGIC ? 
+ (tmp = fs64_to_cpu(sb, ((__fs64 *)bh->b_data)[index-1])) : + (tmp = fs32_to_cpu(sb, ((__fs32 *)bh->b_data)[index-1])))) goal = tmp + uspi->s_fpb; else goal = bh->b_blocknr + uspi->s_fpb; tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal, uspi->s_fpb, err, locked_page); - if (!tmp) { - if (ufs_data_ptr_to_cpu(sb, p)) - goto repeat; + if (!tmp) goto out; - } - - if (!phys) { - result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); - } else { - *phys = uspi->s_sbbase + tmp + blockoff; + if (new) *new = 1; - } mark_buffer_dirty(bh); if (IS_SYNC(inode)) sync_dirty_buffer(bh); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - UFSD("result %llu\n", (unsigned long long)tmp + blockoff); out: brelse (bh); UFSD("EXIT\n"); - return result; + if (tmp) + tmp += uspi->s_sbbase; + return tmp; } /** @@ -412,103 +390,64 @@ out: * readpage, writepage and so on */ -int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) +static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) { - struct super_block * sb = inode->i_sb; - struct ufs_sb_info * sbi = UFS_SB(sb); - struct ufs_sb_private_info * uspi = sbi->s_uspi; - struct buffer_head * bh; - int ret, err, new; - unsigned long ptr,phys; + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + int err = 0, new = 0; + unsigned offsets[4]; + int depth = ufs_block_to_path(inode, fragment >> uspi->s_fpbshift, offsets); u64 phys64 = 0; - bool needs_lock = (sbi->mutex_owner != current); - + unsigned frag = fragment & uspi->s_fpbmask; + if (!create) { - phys64 = ufs_frag_map(inode, fragment, needs_lock); - UFSD("phys64 = %llu\n", (unsigned long long)phys64); - if (phys64) - map_bh(bh_result, sb, phys64); - return 0; + phys64 = ufs_frag_map(inode, offsets, depth); + goto out; } /* This code entered only while writing ....? 
*/ - err = -EIO; - new = 0; - ret = 0; - bh = NULL; - - if (needs_lock) - lock_ufs(sb); + mutex_lock(&UFS_I(inode)->truncate_mutex); UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); - if (fragment > - ((UFS_NDADDR + uspi->s_apb + uspi->s_2apb + uspi->s_3apb) - << uspi->s_fpbshift)) - goto abort_too_big; - - err = 0; - ptr = fragment; - - /* - * ok, these macros clean the logic up a bit and make - * it much more readable: - */ -#define GET_INODE_DATABLOCK(x) \ - ufs_inode_getfrag(inode, x, fragment, 1, &err, &phys, &new,\ - bh_result->b_page) -#define GET_INODE_PTR(x) \ - ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, NULL, NULL,\ - bh_result->b_page) -#define GET_INDIRECT_DATABLOCK(x) \ - ufs_inode_getblock(inode, bh, x, fragment, \ - &err, &phys, &new, bh_result->b_page) -#define GET_INDIRECT_PTR(x) \ - ufs_inode_getblock(inode, bh, x, fragment, \ - &err, NULL, NULL, NULL) - - if (ptr < UFS_NDIR_FRAGMENT) { - bh = GET_INODE_DATABLOCK(ptr); + if (unlikely(!depth)) { + ufs_warning(sb, "ufs_get_block", "block > big"); + err = -EIO; goto out; } - ptr -= UFS_NDIR_FRAGMENT; - if (ptr < (1 << (uspi->s_apbshift + uspi->s_fpbshift))) { - bh = GET_INODE_PTR(UFS_IND_FRAGMENT + (ptr >> uspi->s_apbshift)); - goto get_indirect; - } - ptr -= 1 << (uspi->s_apbshift + uspi->s_fpbshift); - if (ptr < (1 << (uspi->s_2apbshift + uspi->s_fpbshift))) { - bh = GET_INODE_PTR(UFS_DIND_FRAGMENT + (ptr >> uspi->s_2apbshift)); - goto get_double; - } - ptr -= 1 << (uspi->s_2apbshift + uspi->s_fpbshift); - bh = GET_INODE_PTR(UFS_TIND_FRAGMENT + (ptr >> uspi->s_3apbshift)); - bh = GET_INDIRECT_PTR((ptr >> uspi->s_2apbshift) & uspi->s_apbmask); -get_double: - bh = GET_INDIRECT_PTR((ptr >> uspi->s_apbshift) & uspi->s_apbmask); -get_indirect: - bh = GET_INDIRECT_DATABLOCK(ptr & uspi->s_apbmask); - -#undef GET_INODE_DATABLOCK -#undef GET_INODE_PTR -#undef GET_INDIRECT_DATABLOCK -#undef GET_INDIRECT_PTR -out: - if (err) - goto abort; - if (new) - set_buffer_new(bh_result); - map_bh(bh_result, sb, phys); -abort: - if (needs_lock) - unlock_ufs(sb); + if (UFS_I(inode)->i_lastfrag < UFS_NDIR_FRAGMENT) { + unsigned lastfrag = UFS_I(inode)->i_lastfrag; + unsigned tailfrags = lastfrag & uspi->s_fpbmask; + if (tailfrags && fragment >= lastfrag) { + if (!ufs_extend_tail(inode, fragment, + &err, bh_result->b_page)) + goto out; + } + } + if (depth == 1) { + phys64 = ufs_inode_getfrag(inode, offsets[0], fragment, + &err, &new, bh_result->b_page); + } else { + int i; + phys64 = ufs_inode_getfrag(inode, offsets[0], fragment, + &err, NULL, NULL); + for (i = 1; i < depth - 1; i++) + phys64 = ufs_inode_getblock(inode, phys64, offsets[i], + fragment, &err, NULL, NULL); + phys64 = ufs_inode_getblock(inode, phys64, offsets[depth - 1], + fragment, &err, &new, bh_result->b_page); + } +out: + if (phys64) { + phys64 += frag; + map_bh(bh_result, sb, phys64); + if (new) + set_buffer_new(bh_result); + } + mutex_unlock(&UFS_I(inode)->truncate_mutex); return err; - -abort_too_big: - ufs_warning(sb, "ufs_get_block", "block > big"); - goto abort; } static int ufs_writepage(struct page *page, struct writeback_control *wbc) @@ -526,12 +465,16 @@ int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len) return __block_write_begin(page, pos, len, ufs_getfrag_block); } +static void ufs_truncate_blocks(struct inode *); + static void ufs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; - if (to > inode->i_size) + if (to > inode->i_size) { truncate_pagecache(inode, 
inode->i_size); + ufs_truncate_blocks(inode); + } } static int ufs_write_begin(struct file *file, struct address_space *mapping, @@ -548,6 +491,18 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping, return ret; } +static int ufs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int ret; + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (ret < len) + ufs_write_failed(mapping, pos + len); + return ret; +} + static sector_t ufs_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping,block,ufs_getfrag_block); @@ -557,7 +512,7 @@ const struct address_space_operations ufs_aops = { .readpage = ufs_readpage, .writepage = ufs_writepage, .write_begin = ufs_write_begin, - .write_end = generic_write_end, + .write_end = ufs_write_end, .bmap = ufs_bmap }; @@ -599,7 +554,7 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); return -1; } - + /* * Linux now has 32-bit uid and gid, so we can support EFT. */ @@ -619,7 +574,7 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow); ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag); - + if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) { memcpy(ufsi->i_u1.i_data, &ufs_inode->ui_u2.ui_addr, sizeof(ufs_inode->ui_u2.ui_addr)); @@ -753,7 +708,7 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode) ufs_set_inode_uid(sb, ufs_inode, i_uid_read(inode)); ufs_set_inode_gid(sb, ufs_inode, i_gid_read(inode)); - + ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size); ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec); ufs_inode->ui_atime.tv_usec = 0; @@ -855,23 +810,19 @@ static int ufs_update_inode(struct inode * inode, int do_sync) ufs1_update_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino)); } - + mark_buffer_dirty(bh); if (do_sync) sync_dirty_buffer(bh); brelse (bh); - + UFSD("EXIT\n"); return 0; } int ufs_write_inode(struct inode *inode, struct writeback_control *wbc) { - int ret; - lock_ufs(inode->i_sb); - ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); - unlock_ufs(inode->i_sb); - return ret; + return ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } int ufs_sync_inode (struct inode *inode) @@ -888,24 +839,389 @@ void ufs_evict_inode(struct inode * inode) truncate_inode_pages_final(&inode->i_data); if (want_delete) { - loff_t old_i_size; - /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ - lock_ufs(inode->i_sb); - mark_inode_dirty(inode); - ufs_update_inode(inode, IS_SYNC(inode)); - old_i_size = inode->i_size; inode->i_size = 0; - if (inode->i_blocks && ufs_truncate(inode, old_i_size)) - ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n"); - unlock_ufs(inode->i_sb); + if (inode->i_blocks) + ufs_truncate_blocks(inode); } invalidate_inode_buffers(inode); clear_inode(inode); - if (want_delete) { - lock_ufs(inode->i_sb); + if (want_delete) ufs_free_inode(inode); - unlock_ufs(inode->i_sb); +} + +struct to_free { + struct inode *inode; + u64 to; + unsigned count; +}; + +static inline void free_data(struct to_free *ctx, u64 from, unsigned count) +{ + if (ctx->count && ctx->to != from) { + ufs_free_blocks(ctx->inode, ctx->to - ctx->count, ctx->count); + ctx->count = 0; + } + ctx->count += count; + 
ctx->to = from + count; +} + +#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift) +#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) + +static void ufs_trunc_direct(struct inode *inode) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block * sb; + struct ufs_sb_private_info * uspi; + void *p; + u64 frag1, frag2, frag3, frag4, block1, block2; + struct to_free ctx = {.inode = inode}; + unsigned i, tmp; + + UFSD("ENTER: ino %lu\n", inode->i_ino); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + + frag1 = DIRECT_FRAGMENT; + frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag); + frag2 = ((frag1 & uspi->s_fpbmask) ? ((frag1 | uspi->s_fpbmask) + 1) : frag1); + frag3 = frag4 & ~uspi->s_fpbmask; + block1 = block2 = 0; + if (frag2 > frag3) { + frag2 = frag4; + frag3 = frag4 = 0; + } else if (frag2 < frag3) { + block1 = ufs_fragstoblks (frag2); + block2 = ufs_fragstoblks (frag3); + } + + UFSD("ino %lu, frag1 %llu, frag2 %llu, block1 %llu, block2 %llu," + " frag3 %llu, frag4 %llu\n", inode->i_ino, + (unsigned long long)frag1, (unsigned long long)frag2, + (unsigned long long)block1, (unsigned long long)block2, + (unsigned long long)frag3, (unsigned long long)frag4); + + if (frag1 >= frag2) + goto next1; + + /* + * Free first free fragments + */ + p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag1)); + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp ) + ufs_panic (sb, "ufs_trunc_direct", "internal error"); + frag2 -= frag1; + frag1 = ufs_fragnum (frag1); + + ufs_free_fragments(inode, tmp + frag1, frag2); + +next1: + /* + * Free whole blocks + */ + for (i = block1 ; i < block2; i++) { + p = ufs_get_direct_data_ptr(uspi, ufsi, i); + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp) + continue; + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); + + free_data(&ctx, tmp, uspi->s_fpb); + } + + free_data(&ctx, 0, 0); + + if (frag3 >= frag4) + goto next3; + + /* + * Free last free fragments + */ + p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag3)); + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp ) + ufs_panic(sb, "ufs_truncate_direct", "internal error"); + frag4 = ufs_fragnum (frag4); + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); + + ufs_free_fragments (inode, tmp, frag4); + next3: + + UFSD("EXIT: ino %lu\n", inode->i_ino); +} + +static void free_full_branch(struct inode *inode, u64 ind_block, int depth) +{ + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct ufs_buffer_head *ubh = ubh_bread(sb, ind_block, uspi->s_bsize); + unsigned i; + + if (!ubh) + return; + + if (--depth) { + for (i = 0; i < uspi->s_apb; i++) { + void *p = ubh_get_data_ptr(uspi, ubh, i); + u64 block = ufs_data_ptr_to_cpu(sb, p); + if (block) + free_full_branch(inode, block, depth); + } + } else { + struct to_free ctx = {.inode = inode}; + + for (i = 0; i < uspi->s_apb; i++) { + void *p = ubh_get_data_ptr(uspi, ubh, i); + u64 block = ufs_data_ptr_to_cpu(sb, p); + if (block) + free_data(&ctx, block, uspi->s_fpb); + } + free_data(&ctx, 0, 0); + } + + ubh_bforget(ubh); + ufs_free_blocks(inode, ind_block, uspi->s_fpb); +} + +static void free_branch_tail(struct inode *inode, unsigned from, struct ufs_buffer_head *ubh, int depth) +{ + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + unsigned i; + + if (--depth) { + for (i = from; i < uspi->s_apb ; 
i++) { + void *p = ubh_get_data_ptr(uspi, ubh, i); + u64 block = ufs_data_ptr_to_cpu(sb, p); + if (block) { + write_seqlock(&UFS_I(inode)->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&UFS_I(inode)->meta_lock); + ubh_mark_buffer_dirty(ubh); + free_full_branch(inode, block, depth); + } + } + } else { + struct to_free ctx = {.inode = inode}; + + for (i = from; i < uspi->s_apb; i++) { + void *p = ubh_get_data_ptr(uspi, ubh, i); + u64 block = ufs_data_ptr_to_cpu(sb, p); + if (block) { + write_seqlock(&UFS_I(inode)->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&UFS_I(inode)->meta_lock); + ubh_mark_buffer_dirty(ubh); + free_data(&ctx, block, uspi->s_fpb); + } + } + free_data(&ctx, 0, 0); + } + if (IS_SYNC(inode) && ubh_buffer_dirty(ubh)) + ubh_sync_block(ubh); + ubh_brelse(ubh); +} + +static int ufs_alloc_lastblock(struct inode *inode, loff_t size) +{ + int err = 0; + struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + unsigned i, end; + sector_t lastfrag; + struct page *lastpage; + struct buffer_head *bh; + u64 phys64; + + lastfrag = (size + uspi->s_fsize - 1) >> uspi->s_fshift; + + if (!lastfrag) + goto out; + + lastfrag--; + + lastpage = ufs_get_locked_page(mapping, lastfrag >> + (PAGE_CACHE_SHIFT - inode->i_blkbits)); + if (IS_ERR(lastpage)) { + err = -EIO; + goto out; + } + + end = lastfrag & ((1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1); + bh = page_buffers(lastpage); + for (i = 0; i < end; ++i) + bh = bh->b_this_page; + + + err = ufs_getfrag_block(inode, lastfrag, bh, 1); + + if (unlikely(err)) + goto out_unlock; + + if (buffer_new(bh)) { + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + /* + * we do not zeroize fragment, because of + * if it maped to hole, it already contains zeroes + */ + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + set_page_dirty(lastpage); + } + + if (lastfrag >= UFS_IND_FRAGMENT) { + end = uspi->s_fpb - ufs_fragnum(lastfrag) - 1; + phys64 = bh->b_blocknr + 1; + for (i = 0; i < end; ++i) { + bh = sb_getblk(sb, i + phys64); + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + sync_dirty_buffer(bh); + brelse(bh); + } + } +out_unlock: + ufs_put_locked_page(lastpage); +out: + return err; +} + +static void __ufs_truncate_blocks(struct inode *inode) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + unsigned offsets[4]; + int depth = ufs_block_to_path(inode, DIRECT_BLOCK, offsets); + int depth2; + unsigned i; + struct ufs_buffer_head *ubh[3]; + void *p; + u64 block; + + if (!depth) + return; + + /* find the last non-zero in offsets[] */ + for (depth2 = depth - 1; depth2; depth2--) + if (offsets[depth2]) + break; + + mutex_lock(&ufsi->truncate_mutex); + if (depth == 1) { + ufs_trunc_direct(inode); + offsets[0] = UFS_IND_BLOCK; + } else { + /* get the blocks that should be partially emptied */ + p = ufs_get_direct_data_ptr(uspi, ufsi, offsets[0]); + for (i = 0; i < depth2; i++) { + offsets[i]++; /* next branch is fully freed */ + block = ufs_data_ptr_to_cpu(sb, p); + if (!block) + break; + ubh[i] = ubh_bread(sb, block, uspi->s_bsize); + if (!ubh[i]) { + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); + break; + } + p = ubh_get_data_ptr(uspi, ubh[i], offsets[i + 1]); + } + 
while (i--) + free_branch_tail(inode, offsets[i + 1], ubh[i], depth - i - 1); + } + for (i = offsets[0]; i <= UFS_TIND_BLOCK; i++) { + p = ufs_get_direct_data_ptr(uspi, ufsi, i); + block = ufs_data_ptr_to_cpu(sb, p); + if (block) { + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); + free_full_branch(inode, block, i - UFS_IND_BLOCK + 1); + } } + ufsi->i_lastfrag = DIRECT_FRAGMENT; + mark_inode_dirty(inode); + mutex_unlock(&ufsi->truncate_mutex); +} + +static int ufs_truncate(struct inode *inode, loff_t size) +{ + int err = 0; + + UFSD("ENTER: ino %lu, i_size: %llu, old_i_size: %llu\n", + inode->i_ino, (unsigned long long)size, + (unsigned long long)i_size_read(inode)); + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return -EINVAL; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + + err = ufs_alloc_lastblock(inode, size); + + if (err) + goto out; + + block_truncate_page(inode->i_mapping, size, ufs_getfrag_block); + + truncate_setsize(inode, size); + + __ufs_truncate_blocks(inode); + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +out: + UFSD("EXIT: err %d\n", err); + return err; +} + +void ufs_truncate_blocks(struct inode *inode) +{ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + __ufs_truncate_blocks(inode); +} + +int ufs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + unsigned int ia_valid = attr->ia_valid; + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { + error = ufs_truncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; } + +const struct inode_operations ufs_file_inode_operations = { + .setattr = ufs_setattr, +}; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 250579a80d90..f6390eec02ca 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -94,22 +94,6 @@ #include "swab.h" #include "util.h" -void lock_ufs(struct super_block *sb) -{ - struct ufs_sb_info *sbi = UFS_SB(sb); - - mutex_lock(&sbi->mutex); - sbi->mutex_owner = current; -} - -void unlock_ufs(struct super_block *sb) -{ - struct ufs_sb_info *sbi = UFS_SB(sb); - - sbi->mutex_owner = NULL; - mutex_unlock(&sbi->mutex); -} - static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; @@ -694,7 +678,6 @@ static int ufs_sync_fs(struct super_block *sb, int wait) struct ufs_super_block_third * usb3; unsigned flags; - lock_ufs(sb); mutex_lock(&UFS_SB(sb)->s_lock); UFSD("ENTER\n"); @@ -714,7 +697,6 @@ static int ufs_sync_fs(struct super_block *sb, int wait) UFSD("EXIT\n"); mutex_unlock(&UFS_SB(sb)->s_lock); - unlock_ufs(sb); return 0; } @@ -758,7 +740,6 @@ static void ufs_put_super(struct super_block *sb) ubh_brelse_uspi (sbi->s_uspi); kfree (sbi->s_uspi); - mutex_destroy(&sbi->mutex); kfree (sbi); sb->s_fs_info = NULL; UFSD("EXIT\n"); @@ -801,7 +782,6 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY)); - mutex_init(&sbi->mutex); mutex_init(&sbi->s_lock); spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); @@ -1257,7 +1237,6 @@ magic_found: return 0; failed: - 
mutex_destroy(&sbi->mutex); if (ubh) ubh_brelse_uspi (uspi); kfree (uspi); @@ -1280,7 +1259,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) unsigned flags; sync_filesystem(sb); - lock_ufs(sb); mutex_lock(&UFS_SB(sb)->s_lock); uspi = UFS_SB(sb)->s_uspi; flags = UFS_SB(sb)->s_flags; @@ -1296,7 +1274,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufs_set_opt (new_mount_opt, ONERROR_LOCK); if (!ufs_parse_options (data, &new_mount_opt)) { mutex_unlock(&UFS_SB(sb)->s_lock); - unlock_ufs(sb); return -EINVAL; } if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { @@ -1304,14 +1281,12 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { pr_err("ufstype can't be changed during remount\n"); mutex_unlock(&UFS_SB(sb)->s_lock); - unlock_ufs(sb); return -EINVAL; } if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { UFS_SB(sb)->s_mount_opt = new_mount_opt; mutex_unlock(&UFS_SB(sb)->s_lock); - unlock_ufs(sb); return 0; } @@ -1335,7 +1310,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) #ifndef CONFIG_UFS_FS_WRITE pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n"); mutex_unlock(&UFS_SB(sb)->s_lock); - unlock_ufs(sb); return -EINVAL; #else if (ufstype != UFS_MOUNT_UFSTYPE_SUN && @@ -1345,13 +1319,11 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufstype != UFS_MOUNT_UFSTYPE_UFS2) { pr_err("this ufstype is read-only supported\n"); mutex_unlock(&UFS_SB(sb)->s_lock); - unlock_ufs(sb); return -EINVAL; } if (!ufs_read_cylinder_structures(sb)) { pr_err("failed during remounting\n"); mutex_unlock(&UFS_SB(sb)->s_lock); - unlock_ufs(sb); return -EPERM; } sb->s_flags &= ~MS_RDONLY; @@ -1359,7 +1331,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) } UFS_SB(sb)->s_mount_opt = new_mount_opt; mutex_unlock(&UFS_SB(sb)->s_lock); - unlock_ufs(sb); return 0; } @@ -1391,8 +1362,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) struct ufs_super_block_third *usb3; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); - lock_ufs(sb); - + mutex_lock(&UFS_SB(sb)->s_lock); usb3 = ubh_get_usb_third(uspi); if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { @@ -1413,7 +1383,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } @@ -1429,6 +1399,8 @@ static struct inode *ufs_alloc_inode(struct super_block *sb) return NULL; ei->vfs_inode.i_version = 1; + seqlock_init(&ei->meta_lock); + mutex_init(&ei->truncate_mutex); return &ei->vfs_inode; } diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c deleted file mode 100644 index 21154704c168..000000000000 --- a/fs/ufs/truncate.c +++ /dev/null @@ -1,523 +0,0 @@ -/* - * linux/fs/ufs/truncate.c - * - * Copyright (C) 1998 - * Daniel Pirkl <daniel.pirkl@email.cz> - * Charles University, Faculty of Mathematics and Physics - * - * from - * - * linux/fs/ext2/truncate.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/truncate.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. 
Miller (davem@caip.rutgers.edu), 1995 - */ - -/* - * Real random numbers for secure rm added 94/02/18 - * Idea from Pierre del Perugia <delperug@gla.ecoledoc.ibp.fr> - */ - -/* - * Adoptation to use page cache and UFS2 write support by - * Evgeniy Dushistov <dushistov@mail.ru>, 2006-2007 - */ - -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/fcntl.h> -#include <linux/time.h> -#include <linux/stat.h> -#include <linux/string.h> -#include <linux/buffer_head.h> -#include <linux/blkdev.h> -#include <linux/sched.h> - -#include "ufs_fs.h" -#include "ufs.h" -#include "swab.h" -#include "util.h" - -/* - * Secure deletion currently doesn't work. It interacts very badly - * with buffers shared with memory mappings, and for that reason - * can't be done in the truncate() routines. It should instead be - * done separately in "release()" before calling the truncate routines - * that will release the actual file blocks. - * - * Linus - */ - -#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift) -#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) - - -static int ufs_trunc_direct(struct inode *inode) -{ - struct ufs_inode_info *ufsi = UFS_I(inode); - struct super_block * sb; - struct ufs_sb_private_info * uspi; - void *p; - u64 frag1, frag2, frag3, frag4, block1, block2; - unsigned frag_to_free, free_count; - unsigned i, tmp; - int retry; - - UFSD("ENTER: ino %lu\n", inode->i_ino); - - sb = inode->i_sb; - uspi = UFS_SB(sb)->s_uspi; - - frag_to_free = 0; - free_count = 0; - retry = 0; - - frag1 = DIRECT_FRAGMENT; - frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag); - frag2 = ((frag1 & uspi->s_fpbmask) ? ((frag1 | uspi->s_fpbmask) + 1) : frag1); - frag3 = frag4 & ~uspi->s_fpbmask; - block1 = block2 = 0; - if (frag2 > frag3) { - frag2 = frag4; - frag3 = frag4 = 0; - } else if (frag2 < frag3) { - block1 = ufs_fragstoblks (frag2); - block2 = ufs_fragstoblks (frag3); - } - - UFSD("ino %lu, frag1 %llu, frag2 %llu, block1 %llu, block2 %llu," - " frag3 %llu, frag4 %llu\n", inode->i_ino, - (unsigned long long)frag1, (unsigned long long)frag2, - (unsigned long long)block1, (unsigned long long)block2, - (unsigned long long)frag3, (unsigned long long)frag4); - - if (frag1 >= frag2) - goto next1; - - /* - * Free first free fragments - */ - p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag1)); - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp ) - ufs_panic (sb, "ufs_trunc_direct", "internal error"); - frag2 -= frag1; - frag1 = ufs_fragnum (frag1); - - ufs_free_fragments(inode, tmp + frag1, frag2); - mark_inode_dirty(inode); - frag_to_free = tmp + frag1; - -next1: - /* - * Free whole blocks - */ - for (i = block1 ; i < block2; i++) { - p = ufs_get_direct_data_ptr(uspi, ufsi, i); - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp) - continue; - ufs_data_ptr_clear(uspi, p); - - if (free_count == 0) { - frag_to_free = tmp; - free_count = uspi->s_fpb; - } else if (free_count > 0 && frag_to_free == tmp - free_count) - free_count += uspi->s_fpb; - else { - ufs_free_blocks (inode, frag_to_free, free_count); - frag_to_free = tmp; - free_count = uspi->s_fpb; - } - mark_inode_dirty(inode); - } - - if (free_count > 0) - ufs_free_blocks (inode, frag_to_free, free_count); - - if (frag3 >= frag4) - goto next3; - - /* - * Free last free fragments - */ - p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag3)); - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp ) - ufs_panic(sb, "ufs_truncate_direct", "internal error"); - frag4 = ufs_fragnum (frag4); - 
ufs_data_ptr_clear(uspi, p); - - ufs_free_fragments (inode, tmp, frag4); - mark_inode_dirty(inode); - next3: - - UFSD("EXIT: ino %lu\n", inode->i_ino); - return retry; -} - - -static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p) -{ - struct super_block * sb; - struct ufs_sb_private_info * uspi; - struct ufs_buffer_head * ind_ubh; - void *ind; - u64 tmp, indirect_block, i, frag_to_free; - unsigned free_count; - int retry; - - UFSD("ENTER: ino %lu, offset %llu, p: %p\n", - inode->i_ino, (unsigned long long)offset, p); - - BUG_ON(!p); - - sb = inode->i_sb; - uspi = UFS_SB(sb)->s_uspi; - - frag_to_free = 0; - free_count = 0; - retry = 0; - - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp) - return 0; - ind_ubh = ubh_bread(sb, tmp, uspi->s_bsize); - if (tmp != ufs_data_ptr_to_cpu(sb, p)) { - ubh_brelse (ind_ubh); - return 1; - } - if (!ind_ubh) { - ufs_data_ptr_clear(uspi, p); - return 0; - } - - indirect_block = (DIRECT_BLOCK > offset) ? (DIRECT_BLOCK - offset) : 0; - for (i = indirect_block; i < uspi->s_apb; i++) { - ind = ubh_get_data_ptr(uspi, ind_ubh, i); - tmp = ufs_data_ptr_to_cpu(sb, ind); - if (!tmp) - continue; - - ufs_data_ptr_clear(uspi, ind); - ubh_mark_buffer_dirty(ind_ubh); - if (free_count == 0) { - frag_to_free = tmp; - free_count = uspi->s_fpb; - } else if (free_count > 0 && frag_to_free == tmp - free_count) - free_count += uspi->s_fpb; - else { - ufs_free_blocks (inode, frag_to_free, free_count); - frag_to_free = tmp; - free_count = uspi->s_fpb; - } - - mark_inode_dirty(inode); - } - - if (free_count > 0) { - ufs_free_blocks (inode, frag_to_free, free_count); - } - for (i = 0; i < uspi->s_apb; i++) - if (!ufs_is_data_ptr_zero(uspi, - ubh_get_data_ptr(uspi, ind_ubh, i))) - break; - if (i >= uspi->s_apb) { - tmp = ufs_data_ptr_to_cpu(sb, p); - ufs_data_ptr_clear(uspi, p); - - ufs_free_blocks (inode, tmp, uspi->s_fpb); - mark_inode_dirty(inode); - ubh_bforget(ind_ubh); - ind_ubh = NULL; - } - if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) - ubh_sync_block(ind_ubh); - ubh_brelse (ind_ubh); - - UFSD("EXIT: ino %lu\n", inode->i_ino); - - return retry; -} - -static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) -{ - struct super_block * sb; - struct ufs_sb_private_info * uspi; - struct ufs_buffer_head *dind_bh; - u64 i, tmp, dindirect_block; - void *dind; - int retry = 0; - - UFSD("ENTER: ino %lu\n", inode->i_ino); - - sb = inode->i_sb; - uspi = UFS_SB(sb)->s_uspi; - - dindirect_block = (DIRECT_BLOCK > offset) - ? 
((DIRECT_BLOCK - offset) >> uspi->s_apbshift) : 0; - retry = 0; - - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp) - return 0; - dind_bh = ubh_bread(sb, tmp, uspi->s_bsize); - if (tmp != ufs_data_ptr_to_cpu(sb, p)) { - ubh_brelse (dind_bh); - return 1; - } - if (!dind_bh) { - ufs_data_ptr_clear(uspi, p); - return 0; - } - - for (i = dindirect_block ; i < uspi->s_apb ; i++) { - dind = ubh_get_data_ptr(uspi, dind_bh, i); - tmp = ufs_data_ptr_to_cpu(sb, dind); - if (!tmp) - continue; - retry |= ufs_trunc_indirect (inode, offset + (i << uspi->s_apbshift), dind); - ubh_mark_buffer_dirty(dind_bh); - } - - for (i = 0; i < uspi->s_apb; i++) - if (!ufs_is_data_ptr_zero(uspi, - ubh_get_data_ptr(uspi, dind_bh, i))) - break; - if (i >= uspi->s_apb) { - tmp = ufs_data_ptr_to_cpu(sb, p); - ufs_data_ptr_clear(uspi, p); - - ufs_free_blocks(inode, tmp, uspi->s_fpb); - mark_inode_dirty(inode); - ubh_bforget(dind_bh); - dind_bh = NULL; - } - if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) - ubh_sync_block(dind_bh); - ubh_brelse (dind_bh); - - UFSD("EXIT: ino %lu\n", inode->i_ino); - - return retry; -} - -static int ufs_trunc_tindirect(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - struct ufs_inode_info *ufsi = UFS_I(inode); - struct ufs_buffer_head * tind_bh; - u64 tindirect_block, tmp, i; - void *tind, *p; - int retry; - - UFSD("ENTER: ino %lu\n", inode->i_ino); - - retry = 0; - - tindirect_block = (DIRECT_BLOCK > (UFS_NDADDR + uspi->s_apb + uspi->s_2apb)) - ? ((DIRECT_BLOCK - UFS_NDADDR - uspi->s_apb - uspi->s_2apb) >> uspi->s_2apbshift) : 0; - - p = ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK); - if (!(tmp = ufs_data_ptr_to_cpu(sb, p))) - return 0; - tind_bh = ubh_bread (sb, tmp, uspi->s_bsize); - if (tmp != ufs_data_ptr_to_cpu(sb, p)) { - ubh_brelse (tind_bh); - return 1; - } - if (!tind_bh) { - ufs_data_ptr_clear(uspi, p); - return 0; - } - - for (i = tindirect_block ; i < uspi->s_apb ; i++) { - tind = ubh_get_data_ptr(uspi, tind_bh, i); - retry |= ufs_trunc_dindirect(inode, UFS_NDADDR + - uspi->s_apb + ((i + 1) << uspi->s_2apbshift), tind); - ubh_mark_buffer_dirty(tind_bh); - } - for (i = 0; i < uspi->s_apb; i++) - if (!ufs_is_data_ptr_zero(uspi, - ubh_get_data_ptr(uspi, tind_bh, i))) - break; - if (i >= uspi->s_apb) { - tmp = ufs_data_ptr_to_cpu(sb, p); - ufs_data_ptr_clear(uspi, p); - - ufs_free_blocks(inode, tmp, uspi->s_fpb); - mark_inode_dirty(inode); - ubh_bforget(tind_bh); - tind_bh = NULL; - } - if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) - ubh_sync_block(tind_bh); - ubh_brelse (tind_bh); - - UFSD("EXIT: ino %lu\n", inode->i_ino); - return retry; -} - -static int ufs_alloc_lastblock(struct inode *inode) -{ - int err = 0; - struct super_block *sb = inode->i_sb; - struct address_space *mapping = inode->i_mapping; - struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - unsigned i, end; - sector_t lastfrag; - struct page *lastpage; - struct buffer_head *bh; - u64 phys64; - - lastfrag = (i_size_read(inode) + uspi->s_fsize - 1) >> uspi->s_fshift; - - if (!lastfrag) - goto out; - - lastfrag--; - - lastpage = ufs_get_locked_page(mapping, lastfrag >> - (PAGE_CACHE_SHIFT - inode->i_blkbits)); - if (IS_ERR(lastpage)) { - err = -EIO; - goto out; - } - - end = lastfrag & ((1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1); - bh = page_buffers(lastpage); - for (i = 0; i < end; ++i) - bh = bh->b_this_page; - - - err = ufs_getfrag_block(inode, lastfrag, bh, 1); - - if (unlikely(err)) - goto 
out_unlock; - - if (buffer_new(bh)) { - clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); - /* - * we do not zeroize fragment, because of - * if it maped to hole, it already contains zeroes - */ - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - set_page_dirty(lastpage); - } - - if (lastfrag >= UFS_IND_FRAGMENT) { - end = uspi->s_fpb - ufs_fragnum(lastfrag) - 1; - phys64 = bh->b_blocknr + 1; - for (i = 0; i < end; ++i) { - bh = sb_getblk(sb, i + phys64); - lock_buffer(bh); - memset(bh->b_data, 0, sb->s_blocksize); - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - unlock_buffer(bh); - sync_dirty_buffer(bh); - brelse(bh); - } - } -out_unlock: - ufs_put_locked_page(lastpage); -out: - return err; -} - -int ufs_truncate(struct inode *inode, loff_t old_i_size) -{ - struct ufs_inode_info *ufsi = UFS_I(inode); - struct super_block *sb = inode->i_sb; - struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - int retry, err = 0; - - UFSD("ENTER: ino %lu, i_size: %llu, old_i_size: %llu\n", - inode->i_ino, (unsigned long long)i_size_read(inode), - (unsigned long long)old_i_size); - - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return -EINVAL; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return -EPERM; - - err = ufs_alloc_lastblock(inode); - - if (err) { - i_size_write(inode, old_i_size); - goto out; - } - - block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block); - - while (1) { - retry = ufs_trunc_direct(inode); - retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK, - ufs_get_direct_data_ptr(uspi, ufsi, - UFS_IND_BLOCK)); - retry |= ufs_trunc_dindirect(inode, UFS_IND_BLOCK + uspi->s_apb, - ufs_get_direct_data_ptr(uspi, ufsi, - UFS_DIND_BLOCK)); - retry |= ufs_trunc_tindirect (inode); - if (!retry) - break; - if (IS_SYNC(inode) && (inode->i_state & I_DIRTY)) - ufs_sync_inode (inode); - yield(); - } - - inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; - ufsi->i_lastfrag = DIRECT_FRAGMENT; - mark_inode_dirty(inode); -out: - UFSD("EXIT: err %d\n", err); - return err; -} - -int ufs_setattr(struct dentry *dentry, struct iattr *attr) -{ - struct inode *inode = d_inode(dentry); - unsigned int ia_valid = attr->ia_valid; - int error; - - error = inode_change_ok(inode, attr); - if (error) - return error; - - if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { - loff_t old_i_size = inode->i_size; - - /* XXX(truncate): truncate_setsize should be called last */ - truncate_setsize(inode, attr->ia_size); - - lock_ufs(inode->i_sb); - error = ufs_truncate(inode, old_i_size); - unlock_ufs(inode->i_sb); - if (error) - return error; - } - - setattr_copy(inode, attr); - mark_inode_dirty(inode); - return 0; -} - -const struct inode_operations ufs_file_inode_operations = { - .setattr = ufs_setattr, -}; diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 2e31ea2e35a3..7da4aca868c0 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -24,8 +24,6 @@ struct ufs_sb_info { unsigned s_cgno[UFS_MAX_GROUP_LOADED]; unsigned short s_cg_loaded; unsigned s_mount_opt; - struct mutex mutex; - struct task_struct *mutex_owner; struct super_block *sb; int work_queued; /* non-zero if the delayed work is queued */ struct delayed_work sync_work; /* FS sync delayed work */ @@ -46,6 +44,8 @@ struct ufs_inode_info { __u32 i_oeftflag; __u16 i_osync; __u64 i_lastfrag; + seqlock_t meta_lock; + struct mutex truncate_mutex; __u32 i_dir_start_lookup; struct inode vfs_inode; }; @@ -122,7 +122,7 @@ extern struct inode *ufs_iget(struct super_block *, 
unsigned long); extern int ufs_write_inode (struct inode *, struct writeback_control *); extern int ufs_sync_inode (struct inode *); extern void ufs_evict_inode (struct inode *); -extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create); +extern int ufs_setattr(struct dentry *dentry, struct iattr *attr); /* namei.c */ extern const struct file_operations ufs_dir_operations; @@ -140,10 +140,6 @@ void ufs_mark_sb_dirty(struct super_block *sb); extern const struct inode_operations ufs_fast_symlink_inode_operations; extern const struct inode_operations ufs_symlink_inode_operations; -/* truncate.c */ -extern int ufs_truncate (struct inode *, loff_t); -extern int ufs_setattr(struct dentry *dentry, struct iattr *attr); - static inline struct ufs_sb_info *UFS_SB(struct super_block *sb) { return sb->s_fs_info; @@ -170,7 +166,4 @@ static inline u32 ufs_dtogd(struct ufs_sb_private_info * uspi, u64 b) return do_div(b, uspi->s_fpg); } -extern void lock_ufs(struct super_block *sb); -extern void unlock_ufs(struct super_block *sb); - #endif /* _UFS_UFS_H */ diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c new file mode 100644 index 000000000000..634e676072cb --- /dev/null +++ b/fs/userfaultfd.c @@ -0,0 +1,1330 @@ +/* + * fs/userfaultfd.c + * + * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> + * Copyright (C) 2008-2009 Red Hat, Inc. + * Copyright (C) 2015 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Some part derived from fs/eventfd.c (anon inode setup) and + * mm/ksm.c (mm hashing). + */ + +#include <linux/hashtable.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/poll.h> +#include <linux/slab.h> +#include <linux/seq_file.h> +#include <linux/file.h> +#include <linux/bug.h> +#include <linux/anon_inodes.h> +#include <linux/syscalls.h> +#include <linux/userfaultfd_k.h> +#include <linux/mempolicy.h> +#include <linux/ioctl.h> +#include <linux/security.h> + +static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; + +enum userfaultfd_state { + UFFD_STATE_WAIT_API, + UFFD_STATE_RUNNING, +}; + +/* + * Start with fault_pending_wqh and fault_wqh so they're more likely + * to be in the same cacheline. + */ +struct userfaultfd_ctx { + /* waitqueue head for the pending (i.e. 
not read) userfaults */ + wait_queue_head_t fault_pending_wqh; + /* waitqueue head for the userfaults */ + wait_queue_head_t fault_wqh; + /* waitqueue head for the pseudo fd to wakeup poll/read */ + wait_queue_head_t fd_wqh; + /* a refile sequence protected by fault_pending_wqh lock */ + struct seqcount refile_seq; + /* pseudo fd refcounting */ + atomic_t refcount; + /* userfaultfd syscall flags */ + unsigned int flags; + /* state machine */ + enum userfaultfd_state state; + /* released */ + bool released; + /* mm with one ore more vmas attached to this userfaultfd_ctx */ + struct mm_struct *mm; +}; + +struct userfaultfd_wait_queue { + struct uffd_msg msg; + wait_queue_t wq; + struct userfaultfd_ctx *ctx; +}; + +struct userfaultfd_wake_range { + unsigned long start; + unsigned long len; +}; + +static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, + int wake_flags, void *key) +{ + struct userfaultfd_wake_range *range = key; + int ret; + struct userfaultfd_wait_queue *uwq; + unsigned long start, len; + + uwq = container_of(wq, struct userfaultfd_wait_queue, wq); + ret = 0; + /* len == 0 means wake all */ + start = range->start; + len = range->len; + if (len && (start > uwq->msg.arg.pagefault.address || + start + len <= uwq->msg.arg.pagefault.address)) + goto out; + ret = wake_up_state(wq->private, mode); + if (ret) + /* + * Wake only once, autoremove behavior. + * + * After the effect of list_del_init is visible to the + * other CPUs, the waitqueue may disappear from under + * us, see the !list_empty_careful() in + * handle_userfault(). try_to_wake_up() has an + * implicit smp_mb__before_spinlock, and the + * wq->private is read before calling the extern + * function "wake_up_state" (which in turns calls + * try_to_wake_up). While the spin_lock;spin_unlock; + * wouldn't be enough, the smp_mb__before_spinlock is + * enough to avoid an explicit smp_mb() here. + */ + list_del_init(&wq->task_list); +out: + return ret; +} + +/** + * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd + * context. + * @ctx: [in] Pointer to the userfaultfd context. + * + * Returns: In case of success, returns not zero. + */ +static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx) +{ + if (!atomic_inc_not_zero(&ctx->refcount)) + BUG(); +} + +/** + * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd + * context. + * @ctx: [in] Pointer to userfaultfd context. + * + * The userfaultfd context reference must have been previously acquired either + * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget(). + */ +static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) +{ + if (atomic_dec_and_test(&ctx->refcount)) { + VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock)); + VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh)); + VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock)); + VM_BUG_ON(waitqueue_active(&ctx->fault_wqh)); + VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock)); + VM_BUG_ON(waitqueue_active(&ctx->fd_wqh)); + mmput(ctx->mm); + kmem_cache_free(userfaultfd_ctx_cachep, ctx); + } +} + +static inline void msg_init(struct uffd_msg *msg) +{ + BUILD_BUG_ON(sizeof(struct uffd_msg) != 32); + /* + * Must use memset to zero out the paddings or kernel data is + * leaked to userland. 
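msg_init() uses memset() rather than an initializer because struct uffd_msg is copied verbatim to userspace and compiler-inserted padding would otherwise carry stale kernel data. The same pitfall is easy to reproduce in plain C; the struct below is invented purely to make the padding bytes visible, it is not the uAPI layout.

/* Why zeroing whole structs matters before handing them across a trust
 * boundary: assigning the named members does not clear padding bytes. */
#include <stdio.h>
#include <string.h>

struct demo_msg {
    char  event;     /* 1 byte, followed by padding on most 64-bit ABIs */
    long  address;   /* 8 bytes */
};

static void dump(const char *tag, const struct demo_msg *m)
{
    const unsigned char *p = (const unsigned char *)m;

    printf("%s:", tag);
    for (size_t i = 0; i < sizeof(*m); i++)
        printf(" %02x", p[i]);
    printf("\n");
}

int main(void)
{
    struct demo_msg a, b;

    memset(&a, 0x5a, sizeof(a));        /* simulate stale stack contents */
    a.event = 1;
    a.address = 0x1000;                 /* padding still holds 0x5a bytes */
    dump("fields assigned only", &a);

    memset(&b, 0, sizeof(b));           /* what msg_init() does */
    b.event = 1;
    b.address = 0x1000;
    dump("memset then assigned", &b);
    return 0;
}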
+ */ + memset(msg, 0, sizeof(struct uffd_msg)); +} + +static inline struct uffd_msg userfault_msg(unsigned long address, + unsigned int flags, + unsigned long reason) +{ + struct uffd_msg msg; + msg_init(&msg); + msg.event = UFFD_EVENT_PAGEFAULT; + msg.arg.pagefault.address = address; + if (flags & FAULT_FLAG_WRITE) + /* + * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the + * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE + * was not set in a UFFD_EVENT_PAGEFAULT, it means it + * was a read fault, otherwise if set it means it's + * a write fault. + */ + msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE; + if (reason & VM_UFFD_WP) + /* + * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the + * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was + * not set in a UFFD_EVENT_PAGEFAULT, it means it was + * a missing fault, otherwise if set it means it's a + * write protect fault. + */ + msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; + return msg; +} + +/* + * Verify the pagetables are still not ok after having reigstered into + * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any + * userfault that has already been resolved, if userfaultfd_read and + * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different + * threads. + */ +static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, + unsigned long address, + unsigned long flags, + unsigned long reason) +{ + struct mm_struct *mm = ctx->mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, _pmd; + pte_t *pte; + bool ret = true; + + VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + pmd = pmd_offset(pud, address); + /* + * READ_ONCE must function as a barrier with narrower scope + * and it must be equivalent to: + * _pmd = *pmd; barrier(); + * + * This is to deal with the instability (as in + * pmd_trans_unstable) of the pmd. + */ + _pmd = READ_ONCE(*pmd); + if (!pmd_present(_pmd)) + goto out; + + ret = false; + if (pmd_trans_huge(_pmd)) + goto out; + + /* + * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it + * and use the standard pte_offset_map() instead of parsing _pmd. + */ + pte = pte_offset_map(pmd, address); + /* + * Lockless access: we're in a wait_event so it's ok if it + * changes under us. + */ + if (pte_none(*pte)) + ret = true; + pte_unmap(pte); + +out: + return ret; +} + +/* + * The locking rules involved in returning VM_FAULT_RETRY depending on + * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and + * FAULT_FLAG_KILLABLE are not straightforward. The "Caution" + * recommendation in __lock_page_or_retry is not an understatement. + * + * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released + * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is + * not set. + * + * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not + * set, VM_FAULT_RETRY can still be returned if and only if there are + * fatal_signal_pending()s, and the mmap_sem must be released before + * returning it. 
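userfault_msg() encodes what kind of fault occurred in msg.arg.pagefault.flags: UFFD_PAGEFAULT_FLAG_WRITE separates write faults from read faults, and UFFD_PAGEFAULT_FLAG_WP (reserved here for the future write-protect mode) separates write-protect faults from missing pages. A monitor might decode a message roughly as follows; this assumes the <linux/userfaultfd.h> uAPI header that accompanies this patch.

/* Sketch: decode one uffd_msg in a userspace monitor. */
#include <linux/userfaultfd.h>
#include <stdio.h>

static void decode_msg(const struct uffd_msg *msg)
{
    if (msg->event != UFFD_EVENT_PAGEFAULT) {
        fprintf(stderr, "unexpected event %#x\n", msg->event);
        return;
    }
    printf("fault at %#llx: %s fault, %s\n",
           (unsigned long long)msg->arg.pagefault.address,
           (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) ?
                "write" : "read",
           (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) ?
                "write-protect" : "missing page");
}

int main(void)
{
    struct uffd_msg msg = { 0 };

    msg.event = UFFD_EVENT_PAGEFAULT;
    msg.arg.pagefault.address = 0x7f1234560000ULL;
    msg.arg.pagefault.flags = UFFD_PAGEFAULT_FLAG_WRITE;
    decode_msg(&msg);
    return 0;
}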
+ */ +int handle_userfault(struct vm_area_struct *vma, unsigned long address, + unsigned int flags, unsigned long reason) +{ + struct mm_struct *mm = vma->vm_mm; + struct userfaultfd_ctx *ctx; + struct userfaultfd_wait_queue uwq; + int ret; + bool must_wait, return_to_userland; + + BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + + ret = VM_FAULT_SIGBUS; + ctx = vma->vm_userfaultfd_ctx.ctx; + if (!ctx) + goto out; + + BUG_ON(ctx->mm != mm); + + VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP)); + VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP)); + + /* + * If it's already released don't get it. This avoids to loop + * in __get_user_pages if userfaultfd_release waits on the + * caller of handle_userfault to release the mmap_sem. + */ + if (unlikely(ACCESS_ONCE(ctx->released))) + goto out; + + /* + * Check that we can return VM_FAULT_RETRY. + * + * NOTE: it should become possible to return VM_FAULT_RETRY + * even if FAULT_FLAG_TRIED is set without leading to gup() + * -EBUSY failures, if the userfaultfd is to be extended for + * VM_UFFD_WP tracking and we intend to arm the userfault + * without first stopping userland access to the memory. For + * VM_UFFD_MISSING userfaults this is enough for now. + */ + if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) { + /* + * Validate the invariant that nowait must allow retry + * to be sure not to return SIGBUS erroneously on + * nowait invocations. + */ + BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT); +#ifdef CONFIG_DEBUG_VM + if (printk_ratelimit()) { + printk(KERN_WARNING + "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags); + dump_stack(); + } +#endif + goto out; + } + + /* + * Handle nowait, not much to do other than tell it to retry + * and wait. + */ + ret = VM_FAULT_RETRY; + if (flags & FAULT_FLAG_RETRY_NOWAIT) + goto out; + + /* take the reference before dropping the mmap_sem */ + userfaultfd_ctx_get(ctx); + + init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); + uwq.wq.private = current; + uwq.msg = userfault_msg(address, flags, reason); + uwq.ctx = ctx; + + return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == + (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); + + spin_lock(&ctx->fault_pending_wqh.lock); + /* + * After the __add_wait_queue the uwq is visible to userland + * through poll/read(). + */ + __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq); + /* + * The smp_mb() after __set_current_state prevents the reads + * following the spin_unlock to happen before the list_add in + * __add_wait_queue. + */ + set_current_state(return_to_userland ? TASK_INTERRUPTIBLE : + TASK_KILLABLE); + spin_unlock(&ctx->fault_pending_wqh.lock); + + must_wait = userfaultfd_must_wait(ctx, address, flags, reason); + up_read(&mm->mmap_sem); + + if (likely(must_wait && !ACCESS_ONCE(ctx->released) && + (return_to_userland ? !signal_pending(current) : + !fatal_signal_pending(current)))) { + wake_up_poll(&ctx->fd_wqh, POLLIN); + schedule(); + ret |= VM_FAULT_MAJOR; + } + + __set_current_state(TASK_RUNNING); + + if (return_to_userland) { + if (signal_pending(current) && + !fatal_signal_pending(current)) { + /* + * If we got a SIGSTOP or SIGCONT and this is + * a normal userland page fault, just let + * userland return so the signal will be + * handled and gdb debugging works. The page + * fault code immediately after we return from + * this function is going to release the + * mmap_sem and it's not depending on it + * (unlike gup would if we were not to return + * VM_FAULT_RETRY). 
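handle_userfault() is the kernel half of the protocol: the faulting thread queues a userfaultfd_wait_queue entry, drops mmap_sem and sleeps until the monitor installs the page and wakes the range. The userspace half begins by creating the descriptor, negotiating the API and registering a range, roughly as in the sketch below (error handling trimmed; assumes __NR_userfaultfd and <linux/userfaultfd.h> are available, and O_NONBLOCK is set because this version of userfaultfd_poll() rejects blocking descriptors).

/* Sketch: create a userfaultfd, negotiate the API and register an
 * anonymous region for missing-page tracking. */
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#define REGION_SIZE (16UL * 4096)

int main(void)
{
    int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (uffd < 0) { perror("userfaultfd"); return 1; }

    struct uffdio_api api = { .api = UFFD_API, .features = 0 };
    if (ioctl(uffd, UFFDIO_API, &api) < 0) { perror("UFFDIO_API"); return 1; }

    void *area = mmap(NULL, REGION_SIZE, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (area == MAP_FAILED) { perror("mmap"); return 1; }

    struct uffdio_register reg = {
        .range = { .start = (unsigned long)area, .len = REGION_SIZE },
        .mode  = UFFDIO_REGISTER_MODE_MISSING,
    };
    if (ioctl(uffd, UFFDIO_REGISTER, &reg) < 0) {
        perror("UFFDIO_REGISTER");
        return 1;
    }

    printf("registered %p (+%lu) on uffd %d\n", area, REGION_SIZE, uffd);
    /* A real monitor would now hand uffd to a handler thread. */
    return 0;
}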
+ * + * If a fatal signal is pending we still take + * the streamlined VM_FAULT_RETRY failure path + * and there's no need to retake the mmap_sem + * in such case. + */ + down_read(&mm->mmap_sem); + ret = 0; + } + } + + /* + * Here we race with the list_del; list_add in + * userfaultfd_ctx_read(), however because we don't ever run + * list_del_init() to refile across the two lists, the prev + * and next pointers will never point to self. list_add also + * would never let any of the two pointers to point to + * self. So list_empty_careful won't risk to see both pointers + * pointing to self at any time during the list refile. The + * only case where list_del_init() is called is the full + * removal in the wake function and there we don't re-list_add + * and it's fine not to block on the spinlock. The uwq on this + * kernel stack can be released after the list_del_init. + */ + if (!list_empty_careful(&uwq.wq.task_list)) { + spin_lock(&ctx->fault_pending_wqh.lock); + /* + * No need of list_del_init(), the uwq on the stack + * will be freed shortly anyway. + */ + list_del(&uwq.wq.task_list); + spin_unlock(&ctx->fault_pending_wqh.lock); + } + + /* + * ctx may go away after this if the userfault pseudo fd is + * already released. + */ + userfaultfd_ctx_put(ctx); + +out: + return ret; +} + +static int userfaultfd_release(struct inode *inode, struct file *file) +{ + struct userfaultfd_ctx *ctx = file->private_data; + struct mm_struct *mm = ctx->mm; + struct vm_area_struct *vma, *prev; + /* len == 0 means wake all */ + struct userfaultfd_wake_range range = { .len = 0, }; + unsigned long new_flags; + + ACCESS_ONCE(ctx->released) = true; + + /* + * Flush page faults out of all CPUs. NOTE: all page faults + * must be retried without returning VM_FAULT_SIGBUS if + * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx + * changes while handle_userfault released the mmap_sem. So + * it's critical that released is set to true (above), before + * taking the mmap_sem for writing. + */ + down_write(&mm->mmap_sem); + prev = NULL; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + cond_resched(); + BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ + !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); + if (vma->vm_userfaultfd_ctx.ctx != ctx) { + prev = vma; + continue; + } + new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); + prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, + new_flags, vma->anon_vma, + vma->vm_file, vma->vm_pgoff, + vma_policy(vma), + NULL_VM_UFFD_CTX); + if (prev) + vma = prev; + else + prev = vma; + vma->vm_flags = new_flags; + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + } + up_write(&mm->mmap_sem); + + /* + * After no new page faults can wait on this fault_*wqh, flush + * the last page faults that may have been already waiting on + * the fault_*wqh. 
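userfaultfd_release() strips VM_UFFD_* from every registered vma, wakes any queued faults and signals POLLHUP, so a monitor loop should treat POLLHUP or POLLERR from poll() as the end of service. Paired with userfaultfd_read() below, an event loop might look like this sketch; handle_fault() is a placeholder for the application's resolver (see the UFFDIO_COPY example further down).

/* Sketch: monitor loop that waits for userfaults and reads one message
 * at a time. */
#include <linux/userfaultfd.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

void handle_fault(int uffd, const struct uffd_msg *msg); /* app-provided */

void monitor_loop(int uffd)
{
    struct pollfd pfd = { .fd = uffd, .events = POLLIN };

    for (;;) {
        if (poll(&pfd, 1, -1) < 0) {
            perror("poll");
            break;
        }
        if (pfd.revents & (POLLERR | POLLHUP))
            break;                      /* fd released or not initialized */

        struct uffd_msg msg;
        ssize_t n = read(uffd, &msg, sizeof(msg));
        if (n != sizeof(msg))
            continue;                   /* nonblocking read raced: poll again */
        if (msg.event == UFFD_EVENT_PAGEFAULT)
            handle_fault(uffd, &msg);
    }
}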
+ */ + spin_lock(&ctx->fault_pending_wqh.lock); + __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, &range); + __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range); + spin_unlock(&ctx->fault_pending_wqh.lock); + + wake_up_poll(&ctx->fd_wqh, POLLHUP); + userfaultfd_ctx_put(ctx); + return 0; +} + +/* fault_pending_wqh.lock must be hold by the caller */ +static inline struct userfaultfd_wait_queue *find_userfault( + struct userfaultfd_ctx *ctx) +{ + wait_queue_t *wq; + struct userfaultfd_wait_queue *uwq; + + VM_BUG_ON(!spin_is_locked(&ctx->fault_pending_wqh.lock)); + + uwq = NULL; + if (!waitqueue_active(&ctx->fault_pending_wqh)) + goto out; + /* walk in reverse to provide FIFO behavior to read userfaults */ + wq = list_last_entry(&ctx->fault_pending_wqh.task_list, + typeof(*wq), task_list); + uwq = container_of(wq, struct userfaultfd_wait_queue, wq); +out: + return uwq; +} + +static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) +{ + struct userfaultfd_ctx *ctx = file->private_data; + unsigned int ret; + + poll_wait(file, &ctx->fd_wqh, wait); + + switch (ctx->state) { + case UFFD_STATE_WAIT_API: + return POLLERR; + case UFFD_STATE_RUNNING: + /* + * poll() never guarantees that read won't block. + * userfaults can be waken before they're read(). + */ + if (unlikely(!(file->f_flags & O_NONBLOCK))) + return POLLERR; + /* + * lockless access to see if there are pending faults + * __pollwait last action is the add_wait_queue but + * the spin_unlock would allow the waitqueue_active to + * pass above the actual list_add inside + * add_wait_queue critical section. So use a full + * memory barrier to serialize the list_add write of + * add_wait_queue() with the waitqueue_active read + * below. + */ + ret = 0; + smp_mb(); + if (waitqueue_active(&ctx->fault_pending_wqh)) + ret = POLLIN; + return ret; + default: + BUG(); + } +} + +static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, + struct uffd_msg *msg) +{ + ssize_t ret; + DECLARE_WAITQUEUE(wait, current); + struct userfaultfd_wait_queue *uwq; + + /* always take the fd_wqh lock before the fault_pending_wqh lock */ + spin_lock(&ctx->fd_wqh.lock); + __add_wait_queue(&ctx->fd_wqh, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + spin_lock(&ctx->fault_pending_wqh.lock); + uwq = find_userfault(ctx); + if (uwq) { + /* + * Use a seqcount to repeat the lockless check + * in wake_userfault() to avoid missing + * wakeups because during the refile both + * waitqueue could become empty if this is the + * only userfault. + */ + write_seqcount_begin(&ctx->refile_seq); + + /* + * The fault_pending_wqh.lock prevents the uwq + * to disappear from under us. + * + * Refile this userfault from + * fault_pending_wqh to fault_wqh, it's not + * pending anymore after we read it. + * + * Use list_del() by hand (as + * userfaultfd_wake_function also uses + * list_del_init() by hand) to be sure nobody + * changes __remove_wait_queue() to use + * list_del_init() in turn breaking the + * !list_empty_careful() check in + * handle_userfault(). The uwq->wq.task_list + * must never be empty at any time during the + * refile, or the waitqueue could disappear + * from under us. The "wait_queue_head_t" + * parameter of __remove_wait_queue() is unused + * anyway. 
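Reading a message only refiles the fault from fault_pending_wqh to fault_wqh; the faulting thread keeps sleeping until a page is installed and its range is woken. The usual resolution for a missing-page fault is UFFDIO_COPY, which installs a filled page atomically and, unless UFFDIO_COPY_MODE_DONTWAKE is passed, also wakes the sleepers in that range. A sketch of such a resolver, assuming 4 KiB pages and application-chosen page contents:

/* Sketch: resolve one missing-page fault by copying a prepared page into
 * the faulting range with UFFDIO_COPY. */
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>

#define PAGE_SZ 4096UL                     /* assumed page size */

void handle_fault(int uffd, const struct uffd_msg *msg)
{
    static char page[PAGE_SZ];

    memset(page, 0xab, sizeof(page));      /* application-provided contents */

    struct uffdio_copy copy = {
        .dst  = msg->arg.pagefault.address & ~(unsigned long long)(PAGE_SZ - 1),
        .src  = (unsigned long)page,
        .len  = PAGE_SZ,
        .mode = 0,                         /* 0: wake the sleepers as well */
    };
    if (ioctl(uffd, UFFDIO_COPY, &copy) < 0)
        perror("UFFDIO_COPY");
    else
        printf("installed %lld bytes at %#llx\n",
               (long long)copy.copy, (unsigned long long)copy.dst);
}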
+ */ + list_del(&uwq->wq.task_list); + __add_wait_queue(&ctx->fault_wqh, &uwq->wq); + + write_seqcount_end(&ctx->refile_seq); + + /* careful to always initialize msg if ret == 0 */ + *msg = uwq->msg; + spin_unlock(&ctx->fault_pending_wqh.lock); + ret = 0; + break; + } + spin_unlock(&ctx->fault_pending_wqh.lock); + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + if (no_wait) { + ret = -EAGAIN; + break; + } + spin_unlock(&ctx->fd_wqh.lock); + schedule(); + spin_lock(&ctx->fd_wqh.lock); + } + __remove_wait_queue(&ctx->fd_wqh, &wait); + __set_current_state(TASK_RUNNING); + spin_unlock(&ctx->fd_wqh.lock); + + return ret; +} + +static ssize_t userfaultfd_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct userfaultfd_ctx *ctx = file->private_data; + ssize_t _ret, ret = 0; + struct uffd_msg msg; + int no_wait = file->f_flags & O_NONBLOCK; + + if (ctx->state == UFFD_STATE_WAIT_API) + return -EINVAL; + + for (;;) { + if (count < sizeof(msg)) + return ret ? ret : -EINVAL; + _ret = userfaultfd_ctx_read(ctx, no_wait, &msg); + if (_ret < 0) + return ret ? ret : _ret; + if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg))) + return ret ? ret : -EFAULT; + ret += sizeof(msg); + buf += sizeof(msg); + count -= sizeof(msg); + /* + * Allow to read more than one fault at time but only + * block if waiting for the very first one. + */ + no_wait = O_NONBLOCK; + } +} + +static void __wake_userfault(struct userfaultfd_ctx *ctx, + struct userfaultfd_wake_range *range) +{ + unsigned long start, end; + + start = range->start; + end = range->start + range->len; + + spin_lock(&ctx->fault_pending_wqh.lock); + /* wake all in the range and autoremove */ + if (waitqueue_active(&ctx->fault_pending_wqh)) + __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, + range); + if (waitqueue_active(&ctx->fault_wqh)) + __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range); + spin_unlock(&ctx->fault_pending_wqh.lock); +} + +static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, + struct userfaultfd_wake_range *range) +{ + unsigned seq; + bool need_wakeup; + + /* + * To be sure waitqueue_active() is not reordered by the CPU + * before the pagetable update, use an explicit SMP memory + * barrier here. PT lock release or up_read(mmap_sem) still + * have release semantics that can allow the + * waitqueue_active() to be reordered before the pte update. + */ + smp_mb(); + + /* + * Use waitqueue_active because it's very frequent to + * change the address space atomically even if there are no + * userfaults yet. So we take the spinlock only when we're + * sure we've userfaults to wake. 
+ */ + do { + seq = read_seqcount_begin(&ctx->refile_seq); + need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) || + waitqueue_active(&ctx->fault_wqh); + cond_resched(); + } while (read_seqcount_retry(&ctx->refile_seq, seq)); + if (need_wakeup) + __wake_userfault(ctx, range); +} + +static __always_inline int validate_range(struct mm_struct *mm, + __u64 start, __u64 len) +{ + __u64 task_size = mm->task_size; + + if (start & ~PAGE_MASK) + return -EINVAL; + if (len & ~PAGE_MASK) + return -EINVAL; + if (!len) + return -EINVAL; + if (start < mmap_min_addr) + return -EINVAL; + if (start >= task_size) + return -EINVAL; + if (len > task_size - start) + return -EINVAL; + return 0; +} + +static int userfaultfd_register(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + struct mm_struct *mm = ctx->mm; + struct vm_area_struct *vma, *prev, *cur; + int ret; + struct uffdio_register uffdio_register; + struct uffdio_register __user *user_uffdio_register; + unsigned long vm_flags, new_flags; + bool found; + unsigned long start, end, vma_end; + + user_uffdio_register = (struct uffdio_register __user *) arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_register, user_uffdio_register, + sizeof(uffdio_register)-sizeof(__u64))) + goto out; + + ret = -EINVAL; + if (!uffdio_register.mode) + goto out; + if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING| + UFFDIO_REGISTER_MODE_WP)) + goto out; + vm_flags = 0; + if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) + vm_flags |= VM_UFFD_MISSING; + if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) { + vm_flags |= VM_UFFD_WP; + /* + * FIXME: remove the below error constraint by + * implementing the wprotect tracking mode. + */ + ret = -EINVAL; + goto out; + } + + ret = validate_range(mm, uffdio_register.range.start, + uffdio_register.range.len); + if (ret) + goto out; + + start = uffdio_register.range.start; + end = start + uffdio_register.range.len; + + down_write(&mm->mmap_sem); + vma = find_vma_prev(mm, start, &prev); + + ret = -ENOMEM; + if (!vma) + goto out_unlock; + + /* check that there's at least one vma in the range */ + ret = -EINVAL; + if (vma->vm_start >= end) + goto out_unlock; + + /* + * Search for not compatible vmas. + * + * FIXME: this shall be relaxed later so that it doesn't fail + * on tmpfs backed vmas (in addition to the current allowance + * on anonymous vmas). + */ + found = false; + for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { + cond_resched(); + + BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ + !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); + + /* check not compatible vmas */ + ret = -EINVAL; + if (cur->vm_ops) + goto out_unlock; + + /* + * Check that this vma isn't already owned by a + * different userfaultfd. We can't allow more than one + * userfaultfd to own a single vma simultaneously or we + * wouldn't know which one to deliver the userfaults to. + */ + ret = -EBUSY; + if (cur->vm_userfaultfd_ctx.ctx && + cur->vm_userfaultfd_ctx.ctx != ctx) + goto out_unlock; + + found = true; + } + BUG_ON(!found); + + if (vma->vm_start < start) + prev = vma; + + ret = 0; + do { + cond_resched(); + + BUG_ON(vma->vm_ops); + BUG_ON(vma->vm_userfaultfd_ctx.ctx && + vma->vm_userfaultfd_ctx.ctx != ctx); + + /* + * Nothing to do: this vma is already registered into this + * userfaultfd and with the right tracking mode too. 
+ */ + if (vma->vm_userfaultfd_ctx.ctx == ctx && + (vma->vm_flags & vm_flags) == vm_flags) + goto skip; + + if (vma->vm_start > start) + start = vma->vm_start; + vma_end = min(end, vma->vm_end); + + new_flags = (vma->vm_flags & ~vm_flags) | vm_flags; + prev = vma_merge(mm, prev, start, vma_end, new_flags, + vma->anon_vma, vma->vm_file, vma->vm_pgoff, + vma_policy(vma), + ((struct vm_userfaultfd_ctx){ ctx })); + if (prev) { + vma = prev; + goto next; + } + if (vma->vm_start < start) { + ret = split_vma(mm, vma, start, 1); + if (ret) + break; + } + if (vma->vm_end > end) { + ret = split_vma(mm, vma, end, 0); + if (ret) + break; + } + next: + /* + * In the vma_merge() successful mprotect-like case 8: + * the next vma was merged into the current one and + * the current one has not been updated yet. + */ + vma->vm_flags = new_flags; + vma->vm_userfaultfd_ctx.ctx = ctx; + + skip: + prev = vma; + start = vma->vm_end; + vma = vma->vm_next; + } while (vma && vma->vm_start < end); +out_unlock: + up_write(&mm->mmap_sem); + if (!ret) { + /* + * Now that we scanned all vmas we can already tell + * userland which ioctls methods are guaranteed to + * succeed on this range. + */ + if (put_user(UFFD_API_RANGE_IOCTLS, + &user_uffdio_register->ioctls)) + ret = -EFAULT; + } +out: + return ret; +} + +static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + struct mm_struct *mm = ctx->mm; + struct vm_area_struct *vma, *prev, *cur; + int ret; + struct uffdio_range uffdio_unregister; + unsigned long new_flags; + bool found; + unsigned long start, end, vma_end; + const void __user *buf = (void __user *)arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) + goto out; + + ret = validate_range(mm, uffdio_unregister.start, + uffdio_unregister.len); + if (ret) + goto out; + + start = uffdio_unregister.start; + end = start + uffdio_unregister.len; + + down_write(&mm->mmap_sem); + vma = find_vma_prev(mm, start, &prev); + + ret = -ENOMEM; + if (!vma) + goto out_unlock; + + /* check that there's at least one vma in the range */ + ret = -EINVAL; + if (vma->vm_start >= end) + goto out_unlock; + + /* + * Search for not compatible vmas. + * + * FIXME: this shall be relaxed later so that it doesn't fail + * on tmpfs backed vmas (in addition to the current allowance + * on anonymous vmas). + */ + found = false; + ret = -EINVAL; + for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { + cond_resched(); + + BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ + !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); + + /* + * Check not compatible vmas, not strictly required + * here as not compatible vmas cannot have an + * userfaultfd_ctx registered on them, but this + * provides for more strict behavior to notice + * unregistration errors. + */ + if (cur->vm_ops) + goto out_unlock; + + found = true; + } + BUG_ON(!found); + + if (vma->vm_start < start) + prev = vma; + + ret = 0; + do { + cond_resched(); + + BUG_ON(vma->vm_ops); + + /* + * Nothing to do: this vma is already registered into this + * userfaultfd and with the right tracking mode too. 
+ */ + if (!vma->vm_userfaultfd_ctx.ctx) + goto skip; + + if (vma->vm_start > start) + start = vma->vm_start; + vma_end = min(end, vma->vm_end); + + new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); + prev = vma_merge(mm, prev, start, vma_end, new_flags, + vma->anon_vma, vma->vm_file, vma->vm_pgoff, + vma_policy(vma), + NULL_VM_UFFD_CTX); + if (prev) { + vma = prev; + goto next; + } + if (vma->vm_start < start) { + ret = split_vma(mm, vma, start, 1); + if (ret) + break; + } + if (vma->vm_end > end) { + ret = split_vma(mm, vma, end, 0); + if (ret) + break; + } + next: + /* + * In the vma_merge() successful mprotect-like case 8: + * the next vma was merged into the current one and + * the current one has not been updated yet. + */ + vma->vm_flags = new_flags; + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + + skip: + prev = vma; + start = vma->vm_end; + vma = vma->vm_next; + } while (vma && vma->vm_start < end); +out_unlock: + up_write(&mm->mmap_sem); +out: + return ret; +} + +/* + * userfaultfd_wake may be used in combination with the + * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches. + */ +static int userfaultfd_wake(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + int ret; + struct uffdio_range uffdio_wake; + struct userfaultfd_wake_range range; + const void __user *buf = (void __user *)arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake))) + goto out; + + ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len); + if (ret) + goto out; + + range.start = uffdio_wake.start; + range.len = uffdio_wake.len; + + /* + * len == 0 means wake all and we don't want to wake all here, + * so check it again to be sure. + */ + VM_BUG_ON(!range.len); + + wake_userfault(ctx, &range); + ret = 0; + +out: + return ret; +} + +static int userfaultfd_copy(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + __s64 ret; + struct uffdio_copy uffdio_copy; + struct uffdio_copy __user *user_uffdio_copy; + struct userfaultfd_wake_range range; + + user_uffdio_copy = (struct uffdio_copy __user *) arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_copy, user_uffdio_copy, + /* don't copy "copy" last field */ + sizeof(uffdio_copy)-sizeof(__s64))) + goto out; + + ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); + if (ret) + goto out; + /* + * double check for wraparound just in case. copy_from_user() + * will later check uffdio_copy.src + uffdio_copy.len to fit + * in the userland range. + */ + ret = -EINVAL; + if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src) + goto out; + if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE) + goto out; + + ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, + uffdio_copy.len); + if (unlikely(put_user(ret, &user_uffdio_copy->copy))) + return -EFAULT; + if (ret < 0) + goto out; + BUG_ON(!ret); + /* len == 0 would wake all */ + range.len = ret; + if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) { + range.start = uffdio_copy.dst; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_copy.len ? 
0 : -EAGAIN; +out: + return ret; +} + +static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + __s64 ret; + struct uffdio_zeropage uffdio_zeropage; + struct uffdio_zeropage __user *user_uffdio_zeropage; + struct userfaultfd_wake_range range; + + user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage, + /* don't copy "zeropage" last field */ + sizeof(uffdio_zeropage)-sizeof(__s64))) + goto out; + + ret = validate_range(ctx->mm, uffdio_zeropage.range.start, + uffdio_zeropage.range.len); + if (ret) + goto out; + ret = -EINVAL; + if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE) + goto out; + + ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, + uffdio_zeropage.range.len); + if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) + return -EFAULT; + if (ret < 0) + goto out; + /* len == 0 would wake all */ + BUG_ON(!ret); + range.len = ret; + if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) { + range.start = uffdio_zeropage.range.start; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN; +out: + return ret; +} + +/* + * userland asks for a certain API version and we return which bits + * and ioctl commands are implemented in this kernel for such API + * version or -EINVAL if unknown. + */ +static int userfaultfd_api(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + struct uffdio_api uffdio_api; + void __user *buf = (void __user *)arg; + int ret; + + ret = -EINVAL; + if (ctx->state != UFFD_STATE_WAIT_API) + goto out; + ret = -EFAULT; + if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) + goto out; + if (uffdio_api.api != UFFD_API || uffdio_api.features) { + memset(&uffdio_api, 0, sizeof(uffdio_api)); + if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) + goto out; + ret = -EINVAL; + goto out; + } + uffdio_api.features = UFFD_API_FEATURES; + uffdio_api.ioctls = UFFD_API_IOCTLS; + ret = -EFAULT; + if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) + goto out; + ctx->state = UFFD_STATE_RUNNING; + ret = 0; +out: + return ret; +} + +static long userfaultfd_ioctl(struct file *file, unsigned cmd, + unsigned long arg) +{ + int ret = -EINVAL; + struct userfaultfd_ctx *ctx = file->private_data; + + if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API) + return -EINVAL; + + switch(cmd) { + case UFFDIO_API: + ret = userfaultfd_api(ctx, arg); + break; + case UFFDIO_REGISTER: + ret = userfaultfd_register(ctx, arg); + break; + case UFFDIO_UNREGISTER: + ret = userfaultfd_unregister(ctx, arg); + break; + case UFFDIO_WAKE: + ret = userfaultfd_wake(ctx, arg); + break; + case UFFDIO_COPY: + ret = userfaultfd_copy(ctx, arg); + break; + case UFFDIO_ZEROPAGE: + ret = userfaultfd_zeropage(ctx, arg); + break; + } + return ret; +} + +#ifdef CONFIG_PROC_FS +static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct userfaultfd_ctx *ctx = f->private_data; + wait_queue_t *wq; + struct userfaultfd_wait_queue *uwq; + unsigned long pending = 0, total = 0; + + spin_lock(&ctx->fault_pending_wqh.lock); + list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) { + uwq = container_of(wq, struct userfaultfd_wait_queue, wq); + pending++; + total++; + } + list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) { + uwq = container_of(wq, struct userfaultfd_wait_queue, wq); + total++; + } + spin_unlock(&ctx->fault_pending_wqh.lock); + + /* + * If more protocols will be 
added, there will be all shown + * separated by a space. Like this: + * protocols: aa:... bb:... + */ + seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n", + pending, total, UFFD_API, UFFD_API_FEATURES, + UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS); +} +#endif + +static const struct file_operations userfaultfd_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = userfaultfd_show_fdinfo, +#endif + .release = userfaultfd_release, + .poll = userfaultfd_poll, + .read = userfaultfd_read, + .unlocked_ioctl = userfaultfd_ioctl, + .compat_ioctl = userfaultfd_ioctl, + .llseek = noop_llseek, +}; + +static void init_once_userfaultfd_ctx(void *mem) +{ + struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem; + + init_waitqueue_head(&ctx->fault_pending_wqh); + init_waitqueue_head(&ctx->fault_wqh); + init_waitqueue_head(&ctx->fd_wqh); + seqcount_init(&ctx->refile_seq); +} + +/** + * userfaultfd_file_create - Creates an userfaultfd file pointer. + * @flags: Flags for the userfaultfd file. + * + * This function creates an userfaultfd file pointer, w/out installing + * it into the fd table. This is useful when the userfaultfd file is + * used during the initialization of data structures that require + * extra setup after the userfaultfd creation. So the userfaultfd + * creation is split into the file pointer creation phase, and the + * file descriptor installation phase. In this way races with + * userspace closing the newly installed file descriptor can be + * avoided. Returns an userfaultfd file pointer, or a proper error + * pointer. + */ +static struct file *userfaultfd_file_create(int flags) +{ + struct file *file; + struct userfaultfd_ctx *ctx; + + BUG_ON(!current->mm); + + /* Check the UFFD_* constants for consistency. */ + BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK); + + file = ERR_PTR(-EINVAL); + if (flags & ~UFFD_SHARED_FCNTL_FLAGS) + goto out; + + file = ERR_PTR(-ENOMEM); + ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); + if (!ctx) + goto out; + + atomic_set(&ctx->refcount, 1); + ctx->flags = flags; + ctx->state = UFFD_STATE_WAIT_API; + ctx->released = false; + ctx->mm = current->mm; + /* prevent the mm struct to be freed */ + atomic_inc(&ctx->mm->mm_users); + + file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx, + O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS)); + if (IS_ERR(file)) + kmem_cache_free(userfaultfd_ctx_cachep, ctx); +out: + return file; +} + +SYSCALL_DEFINE1(userfaultfd, int, flags) +{ + int fd, error; + struct file *file; + + error = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS); + if (error < 0) + return error; + fd = error; + + file = userfaultfd_file_create(flags); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_put_unused_fd; + } + fd_install(fd, file); + + return fd; + +err_put_unused_fd: + put_unused_fd(fd); + + return error; +} + +static int __init userfaultfd_init(void) +{ + userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache", + sizeof(struct userfaultfd_ctx), + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + init_once_userfaultfd_ctx); + return 0; +} +__initcall(userfaultfd_init); diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index df6828570e87..a096841bd06c 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -33,6 +33,7 @@ xfs-y += $(addprefix libxfs/, \ xfs_attr.o \ xfs_attr_leaf.o \ xfs_attr_remote.o \ + xfs_bit.o \ xfs_bmap.o \ xfs_bmap_btree.o \ xfs_btree.o \ @@ -63,7 +64,6 @@ xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \ xfs-y += xfs_aops.o \ xfs_attr_inactive.o \ 
xfs_attr_list.o \ - xfs_bit.o \ xfs_bmap_util.o \ xfs_buf.o \ xfs_dir2_readdir.o \ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index f9e9ffe6fb46..ffad7f20342f 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -464,7 +464,7 @@ xfs_agfl_verify( struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); int i; - if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC) return false; @@ -1937,7 +1937,7 @@ xfs_alloc_fix_freelist( struct xfs_alloc_arg targs; /* local allocation arguments */ xfs_agblock_t bno; /* freelist block */ xfs_extlen_t need; /* total blocks needed in freelist */ - int error; + int error = 0; if (!pag->pagf_init) { error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); @@ -2260,7 +2260,7 @@ xfs_agf_verify( struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); if (xfs_sb_version_hascrc(&mp->m_sb) && - !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid)) + !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 59d521c09a17..90de071dd4c2 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -295,7 +295,7 @@ xfs_allocbt_verify( case cpu_to_be32(XFS_ABTB_CRC_MAGIC): if (!xfs_sb_version_hascrc(&mp->m_sb)) return false; - if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) return false; @@ -313,7 +313,7 @@ xfs_allocbt_verify( case cpu_to_be32(XFS_ABTC_CRC_MAGIC): if (!xfs_sb_version_hascrc(&mp->m_sb)) return false; - if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) return false; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 3349c9a1e845..ff065578969f 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -139,6 +139,8 @@ xfs_attr_get( args.value = value; args.valuelen = *valuelenp; + /* Entirely possible to look up a name which doesn't exist */ + args.op_flags = XFS_DA_OP_OKNOENT; lock_mode = xfs_ilock_attr_map_shared(ip); if (!xfs_inode_hasattr(ip)) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index e9d401ce93bb..33df52d97ec7 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -262,7 +262,7 @@ xfs_attr3_leaf_verify( if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC) return false; - if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) return false; @@ -1056,7 +1056,7 @@ xfs_attr3_leaf_create( hdr3->blkno = cpu_to_be64(bp->b_bn); hdr3->owner = cpu_to_be64(dp->i_ino); - uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr); } else { diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 20de88d1bf86..f38f9bd81557 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -100,7 +100,7 @@ xfs_attr3_rmt_verify( return false; if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC)) return false; - if 
(!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(rmt->rm_blkno) != bno) return false; @@ -159,11 +159,10 @@ xfs_attr3_rmt_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + int blksize = mp->m_attr_geo->blksize; char *ptr; int len; xfs_daddr_t bno; - int blksize = mp->m_attr_geo->blksize; /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) @@ -175,16 +174,22 @@ xfs_attr3_rmt_write_verify( ASSERT(len >= blksize); while (len > 0) { + struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr; + if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; } - if (bip) { - struct xfs_attr3_rmt_hdr *rmt; - rmt = (struct xfs_attr3_rmt_hdr *)ptr; - rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); + /* + * Ensure we aren't writing bogus LSNs to disk. See + * xfs_attr3_rmt_hdr_set() for the explanation. + */ + if (rmt->rm_lsn != cpu_to_be64(NULLCOMMITLSN)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; } xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF); @@ -217,10 +222,22 @@ xfs_attr3_rmt_hdr_set( rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC); rmt->rm_offset = cpu_to_be32(offset); rmt->rm_bytes = cpu_to_be32(size); - uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid); rmt->rm_owner = cpu_to_be64(ino); rmt->rm_blkno = cpu_to_be64(bno); + /* + * Remote attribute blocks are written synchronously, so we don't + * have an LSN that we can stamp in them that makes any sense to log + * recovery. To ensure that log recovery handles overwrites of these + * blocks sanely (i.e. once they've been freed and reallocated as some + * other type of metadata) we need to ensure that the LSN has a value + * that tells log recovery to ignore the LSN and overwrite the buffer + * with whatever is in it's log. To do this, we use the magic + * NULLCOMMITLSN to indicate that the LSN is invalid. + */ + rmt->rm_lsn = cpu_to_be64(NULLCOMMITLSN); + return sizeof(struct xfs_attr3_rmt_hdr); } @@ -434,14 +451,21 @@ xfs_attr_rmtval_set( /* * Allocate a single extent, up to the size of the value. + * + * Note that we have to consider this a data allocation as we + * write the remote attribute without logging the contents. + * Hence we must ensure that we aren't using blocks that are on + * the busy list so that we don't overwrite blocks which have + * recently been freed but their transactions are not yet + * committed to disk. If we overwrite the contents of a busy + * extent and then crash then the block may not contain the + * correct metadata after log recovery occurs. 
*/ xfs_bmap_init(args->flist, args->firstblock); nmap = 1; error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, - blkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - args->firstblock, args->total, &map, &nmap, - args->flist); + blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock, + args->total, &map, &nmap, args->flist); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); @@ -594,9 +618,8 @@ xfs_attr_rmtval_remove( xfs_bmap_init(args->flist, args->firstblock); error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - 1, args->firstblock, args->flist, - &done); + XFS_BMAPI_ATTRFORK, 1, args->firstblock, + args->flist, &done); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c index 0e8885a59646..0e8885a59646 100644 --- a/fs/xfs/xfs_bit.c +++ b/fs/xfs/libxfs/xfs_bit.c diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 63e05b663380..8e2010d53b07 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5945,6 +5945,7 @@ xfs_bmap_split_extent( return xfs_trans_commit(tp); out: + xfs_bmap_cancel(&free_list); xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 2c44c8e50782..6b0cf6546a82 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -349,7 +349,8 @@ xfs_bmbt_to_bmdr( if (xfs_sb_version_hascrc(&mp->m_sb)) { ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC)); - ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid)); + ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, + &mp->m_sb.sb_meta_uuid)); ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL)); } else @@ -647,7 +648,7 @@ xfs_bmbt_verify( case cpu_to_be32(XFS_BMAP_CRC_MAGIC): if (!xfs_sb_version_hascrc(&mp->m_sb)) return false; - if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn) return false; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index c72283dd8d44..f7d7ee7a2607 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -65,7 +65,8 @@ xfs_btree_check_lblock( if (xfs_sb_version_hascrc(&mp->m_sb)) { lblock_ok = lblock_ok && - uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) && + uuid_equal(&block->bb_u.l.bb_uuid, + &mp->m_sb.sb_meta_uuid) && block->bb_u.l.bb_blkno == cpu_to_be64( bp ? bp->b_bn : XFS_BUF_DADDR_NULL); } @@ -115,7 +116,8 @@ xfs_btree_check_sblock( if (xfs_sb_version_hascrc(&mp->m_sb)) { sblock_ok = sblock_ok && - uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) && + uuid_equal(&block->bb_u.s.bb_uuid, + &mp->m_sb.sb_meta_uuid) && block->bb_u.s.bb_blkno == cpu_to_be64( bp ? 
bp->b_bn : XFS_BUF_DADDR_NULL); } @@ -1000,7 +1002,7 @@ xfs_btree_init_block_int( if (flags & XFS_BTREE_CRC_BLOCKS) { buf->bb_u.l.bb_blkno = cpu_to_be64(blkno); buf->bb_u.l.bb_owner = cpu_to_be64(owner); - uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid); buf->bb_u.l.bb_pad = 0; buf->bb_u.l.bb_lsn = 0; } @@ -1013,7 +1015,7 @@ xfs_btree_init_block_int( if (flags & XFS_BTREE_CRC_BLOCKS) { buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); buf->bb_u.s.bb_owner = cpu_to_be32(__owner); - uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid); buf->bb_u.s.bb_lsn = 0; } } diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 2385f8cd08ab..be43248a5822 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -146,7 +146,7 @@ xfs_da3_node_verify( if (ichdr.magic != XFS_DA3_NODE_MAGIC) return false; - if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) return false; @@ -233,6 +233,7 @@ xfs_da3_node_read_verify( bp->b_ops->verify_read(bp); return; default: + xfs_buf_ioerror(bp, -EFSCORRUPTED); break; } @@ -324,7 +325,7 @@ xfs_da3_node_create( ichdr.magic = XFS_DA3_NODE_MAGIC; hdr3->info.blkno = cpu_to_be64(bp->b_bn); hdr3->info.owner = cpu_to_be64(args->dp->i_ino); - uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid); + uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid); } else { ichdr.magic = XFS_DA_NODE_MAGIC; } @@ -1822,6 +1823,7 @@ xfs_da3_path_shift( struct xfs_da_args *args; struct xfs_da_node_entry *btree; struct xfs_da3_icnode_hdr nodehdr; + struct xfs_buf *bp; xfs_dablk_t blkno = 0; int level; int error; @@ -1866,20 +1868,24 @@ xfs_da3_path_shift( */ for (blk++, level++; level < path->active; blk++, level++) { /* - * Release the old block. - * (if it's dirty, trans won't actually let go) + * Read the next child block into a local buffer. */ - if (release) - xfs_trans_brelse(args->trans, blk->bp); + error = xfs_da3_node_read(args->trans, dp, blkno, -1, &bp, + args->whichfork); + if (error) + return error; /* - * Read the next child block. + * Release the old block (if it's dirty, the trans doesn't + * actually let go) and swap the local buffer into the path + * structure. This ensures failure of the above read doesn't set + * a NULL buffer in an active slot in the path. */ + if (release) + xfs_trans_brelse(args->trans, blk->bp); blk->blkno = blkno; - error = xfs_da3_node_read(args->trans, dp, blkno, -1, - &blk->bp, args->whichfork); - if (error) - return error; + blk->bp = bp; + info = blk->bp->b_addr; ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || @@ -2351,8 +2357,8 @@ xfs_da_shrink_inode( * the last block to the place we want to kill. 
*/ error = xfs_bunmapi(tp, dp, dead_blkno, count, - xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, - 0, args->firstblock, args->flist, &done); + xfs_bmapi_aflag(w), 0, args->firstblock, + args->flist, &done); if (error == -ENOSPC) { if (w != XFS_DATA_FORK) break; diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 74bcbabfa523..b14bbd6bb05f 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -680,8 +680,15 @@ typedef struct xfs_attr_leaf_name_remote { typedef struct xfs_attr_leafblock { xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */ xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */ - xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */ - xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */ + /* + * The rest of the block contains the following structures after the + * leaf entries, growing from the bottom up. The variables are never + * referenced and definining them can actually make gcc optimize away + * accesses to the 'entries' array above index 0 so don't do that. + * + * xfs_attr_leaf_name_local_t namelist; + * xfs_attr_leaf_name_remote_t valuelist; + */ } xfs_attr_leafblock_t; /* diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index a69fb3a1e161..9de401d297e5 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -362,6 +362,7 @@ xfs_dir_lookup( struct xfs_da_args *args; int rval; int v; /* type-checking value */ + int lock_mode; ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_lookup); @@ -387,6 +388,7 @@ xfs_dir_lookup( if (ci_name) args->op_flags |= XFS_DA_OP_CILOOKUP; + lock_mode = xfs_ilock_data_map_shared(dp); if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_lookup(args); goto out_check_rval; @@ -419,6 +421,7 @@ out_check_rval: } } out_free: + xfs_iunlock(dp, lock_mode); kmem_free(args); return rval; } @@ -674,25 +677,22 @@ xfs_dir2_shrink_inode( mp = dp->i_mount; tp = args->trans; da = xfs_dir2_db_to_da(args->geo, db); - /* - * Unmap the fsblock(s). - */ - if ((error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, - XFS_BMAPI_METADATA, 0, args->firstblock, args->flist, - &done))) { + + /* Unmap the fsblock(s). */ + error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, 0, 0, + args->firstblock, args->flist, &done); + if (error) { /* - * ENOSPC actually can happen if we're in a removename with - * no space reservation, and the resulting block removal - * would cause a bmap btree split or conversion from extents - * to btree. This can only happen for un-fragmented - * directory blocks, since you need to be punching out - * the middle of an extent. - * In this case we need to leave the block in the file, - * and not binval it. - * So the block has to be in a consistent empty state - * and appropriately logged. - * We don't free up the buffer, the caller can tell it - * hasn't happened since it got an error back. + * ENOSPC actually can happen if we're in a removename with no + * space reservation, and the resulting block removal would + * cause a bmap btree split or conversion from extents to btree. + * This can only happen for un-fragmented directory blocks, + * since you need to be punching out the middle of an extent. + * In this case we need to leave the block in the file, and not + * binval it. So the block has to be in a consistent empty + * state and appropriately logged. We don't free up the buffer, + * the caller can tell it hasn't happened since it got an error + * back. 
*/ return error; } diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 9354e190b82e..4778d1dd511a 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -67,7 +67,7 @@ xfs_dir3_block_verify( if (xfs_sb_version_hascrc(&mp->m_sb)) { if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) return false; - if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return false; @@ -157,7 +157,7 @@ xfs_dir3_block_init( hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); hdr3->blkno = cpu_to_be64(bp->b_bn); hdr3->owner = cpu_to_be64(dp->i_ino); - uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); return; } diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index de1ea16f5748..824131e71bc5 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -220,7 +220,7 @@ xfs_dir3_data_verify( if (xfs_sb_version_hascrc(&mp->m_sb)) { if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC)) return false; - if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return false; @@ -252,7 +252,8 @@ xfs_dir3_data_reada_verify( return; case cpu_to_be32(XFS_DIR2_DATA_MAGIC): case cpu_to_be32(XFS_DIR3_DATA_MAGIC): - xfs_dir3_data_verify(bp); + bp->b_ops = &xfs_dir3_data_buf_ops; + bp->b_ops->verify_read(bp); return; default: xfs_buf_ioerror(bp, -EFSCORRUPTED); @@ -604,7 +605,7 @@ xfs_dir3_data_init( hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); hdr3->blkno = cpu_to_be64(bp->b_bn); hdr3->owner = cpu_to_be64(dp->i_ino); - uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); } else hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 106119955400..f300240ebb8d 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -160,7 +160,7 @@ xfs_dir3_leaf_verify( if (leaf3->info.hdr.magic != cpu_to_be16(magic3)) return false; - if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) return false; @@ -310,7 +310,7 @@ xfs_dir3_leaf_init( : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); leaf3->info.blkno = cpu_to_be64(bp->b_bn); leaf3->info.owner = cpu_to_be64(owner); - uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid); + uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid); } else { memset(leaf, 0, sizeof(*leaf)); leaf->hdr.info.magic = cpu_to_be16(type); diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 41b80d3d3877..cc28e924545b 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -93,7 +93,7 @@ xfs_dir3_free_verify( if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC)) return false; - if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return false; @@ -226,7 +226,7 @@ xfs_dir3_free_get_buf( hdr3->hdr.blkno = cpu_to_be64(bp->b_bn); hdr3->hdr.owner = cpu_to_be64(dp->i_ino); - uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid); + uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid); } else hdr.magic = XFS_DIR2_FREE_MAGIC; dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr); @@ -1845,8 
+1845,7 @@ xfs_dir2_node_addname_int( if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) { xfs_alert(mp, - "%s: dir ino %llu needed freesp block %lld for\n" - " data block %lld, got %lld ifbno %llu lastfbno %d", +"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld ifbno %llu lastfbno %d", __func__, (unsigned long long)dp->i_ino, (long long)dp->d_ops->db_to_fdb( args->geo, dbno), @@ -2132,6 +2131,7 @@ xfs_dir2_node_replace( int error; /* error return value */ int i; /* btree level */ xfs_ino_t inum; /* new inode number */ + int ftype; /* new file type */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry being changed */ int rval; /* internal return value */ @@ -2145,7 +2145,14 @@ xfs_dir2_node_replace( state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; + + /* + * We have to save new inode number and ftype since + * xfs_da3_node_lookup_int() is going to overwrite them + */ inum = args->inumber; + ftype = args->filetype; + /* * Lookup the entry to change in the btree. */ @@ -2183,7 +2190,7 @@ xfs_dir2_node_replace( * Fill in the new inode number and log the entry. */ dep->inumber = cpu_to_be64(inum); - args->dp->d_ops->data_put_ftype(dep, args->filetype); + args->dp->d_ops->data_put_ftype(dep, ftype); xfs_dir2_data_log_entry(args, state->extrablk.bp, dep); rval = 0; } diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 6fbf2d853a54..5331b7f0460c 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -163,7 +163,7 @@ xfs_dqcheck( d->dd_diskdq.d_id = cpu_to_be32(id); if (xfs_sb_version_hascrc(&mp->m_sb)) { - uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid); xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } @@ -198,7 +198,7 @@ xfs_dquot_buf_verify_crc( if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF)) return false; - if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_meta_uuid)) return false; } return true; diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index a0ae572051de..9590a069e556 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -100,7 +100,7 @@ typedef struct xfs_sb { xfs_rfsblock_t sb_dblocks; /* number of data blocks */ xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */ xfs_rtblock_t sb_rextents; /* number of realtime extents */ - uuid_t sb_uuid; /* file system unique id */ + uuid_t sb_uuid; /* user-visible file system unique id */ xfs_fsblock_t sb_logstart; /* starting block of log if internal */ xfs_ino_t sb_rootino; /* root inode number */ xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */ @@ -174,6 +174,7 @@ typedef struct xfs_sb { xfs_ino_t sb_pquotino; /* project quota inode */ xfs_lsn_t sb_lsn; /* last write sequence */ + uuid_t sb_meta_uuid; /* metadata file system unique id */ /* must be padded to 64 bit alignment */ } xfs_sb_t; @@ -190,7 +191,7 @@ typedef struct xfs_dsb { __be64 sb_dblocks; /* number of data blocks */ __be64 sb_rblocks; /* number of realtime blocks */ __be64 sb_rextents; /* number of realtime extents */ - uuid_t sb_uuid; /* file system unique id */ + uuid_t sb_uuid; /* user-visible file system unique id */ __be64 sb_logstart; /* starting block of log if internal */ __be64 sb_rootino; /* root inode number */ __be64 sb_rbmino; /* bitmap inode for realtime extents */ @@ -260,6 +261,7 @@ typedef struct xfs_dsb { __be64 
sb_pquotino; /* project quota inode */ __be64 sb_lsn; /* last write sequence */ + uuid_t sb_meta_uuid; /* metadata file system unique id */ /* must be padded to 64 bit alignment */ } xfs_dsb_t; @@ -458,9 +460,11 @@ xfs_sb_has_ro_compat_feature( #define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ #define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */ +#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */ #define XFS_SB_FEAT_INCOMPAT_ALL \ (XFS_SB_FEAT_INCOMPAT_FTYPE| \ - XFS_SB_FEAT_INCOMPAT_SPINODES) + XFS_SB_FEAT_INCOMPAT_SPINODES| \ + XFS_SB_FEAT_INCOMPAT_META_UUID) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool @@ -515,6 +519,18 @@ static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp) } /* + * XFS_SB_FEAT_INCOMPAT_META_UUID indicates that the metadata UUID + * is stored separately from the user-visible UUID; this allows the + * user-visible UUID to be changed on V5 filesystems which have a + * filesystem UUID stamped into every piece of metadata. + */ +static inline bool xfs_sb_version_hasmetauuid(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID); +} + +/* * end of superblock version macros */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 66efc702452a..54deb2d12ac6 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -338,7 +338,8 @@ xfs_ialloc_inode_init( if (version == 3) { free->di_ino = cpu_to_be64(ino); ino++; - uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&free->di_uuid, + &mp->m_sb.sb_meta_uuid); xfs_dinode_calc_crc(mp, free); } else if (tp) { /* just log the inode core */ @@ -2232,7 +2233,7 @@ xfs_imap_lookup( } xfs_trans_brelse(tp, agbp); - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); if (error) return error; @@ -2500,7 +2501,7 @@ xfs_agi_verify( struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); if (xfs_sb_version_hascrc(&mp->m_sb) && - !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid)) + !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid)) return false; /* * Validate the magic number of the agi block. 
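The verifier hunks above all switch their UUID comparison from sb_uuid to the new sb_meta_uuid field, gated by the XFS_SB_FEAT_INCOMPAT_META_UUID bit described in the xfs_format.h hunk. As a quick orientation aid only (this fragment is not part of the patch, and sbp/ondisk are stand-in names for the in-core and on-disk superblocks), the read-in rule that makes the unconditional comparison safe follows the __xfs_sb_from_disk() hunk in xfs_sb.c below:

	/* sketch only -- mirrors the superblock read-in logic, not new code */
	if (xfs_sb_version_hasmetauuid(sbp))
		/* feature bit set: metadata carries its own UUID on disk */
		uuid_copy(&sbp->sb_meta_uuid, &ondisk->sb_meta_uuid);
	else
		/* otherwise sb_meta_uuid just mirrors the user-visible sb_uuid */
		uuid_copy(&sbp->sb_meta_uuid, &ondisk->sb_uuid);

Either way every metadata verifier can compare against sb_meta_uuid without checking the feature bit itself.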
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 674ad8f760be..f39b285beb19 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -239,7 +239,7 @@ xfs_inobt_verify( case cpu_to_be32(XFS_FIBT_CRC_MAGIC): if (!xfs_sb_version_hascrc(&mp->m_sb)) return false; - if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) return false; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 6526e7696184..268c00f4f83a 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -304,7 +304,7 @@ xfs_dinode_verify( return false; if (be64_to_cpu(dip->di_ino) != ip->i_ino) return false; - if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid)) return false; return true; } @@ -366,7 +366,7 @@ xfs_iread( if (xfs_sb_version_hascrc(&mp->m_sb)) { ip->i_d.di_version = 3; ip->i_d.di_ino = ip->i_ino; - uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid); } else ip->i_d.di_version = 2; return 0; diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index df9851c46b5c..47425140f343 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -131,10 +131,11 @@ xfs_mount_validate_sb( if (xfs_sb_has_compat_feature(sbp, XFS_SB_FEAT_COMPAT_UNKNOWN)) { xfs_warn(mp, -"Superblock has unknown compatible features (0x%x) enabled.\n" -"Using a more recent kernel is recommended.", +"Superblock has unknown compatible features (0x%x) enabled.", (sbp->sb_features_compat & XFS_SB_FEAT_COMPAT_UNKNOWN)); + xfs_warn(mp, +"Using a more recent kernel is recommended."); } if (xfs_sb_has_ro_compat_feature(sbp, @@ -145,18 +146,21 @@ xfs_mount_validate_sb( XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { xfs_warn(mp, -"Attempted to mount read-only compatible filesystem read-write.\n" +"Attempted to mount read-only compatible filesystem read-write."); + xfs_warn(mp, "Filesystem can only be safely mounted read only."); + return -EINVAL; } } if (xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_UNKNOWN)) { xfs_warn(mp, -"Superblock has unknown incompatible features (0x%x) enabled.\n" -"Filesystem can not be safely mounted by this kernel.", +"Superblock has unknown incompatible features (0x%x) enabled.", (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_UNKNOWN)); + xfs_warn(mp, +"Filesystem can not be safely mounted by this kernel."); return -EINVAL; } } @@ -182,9 +186,6 @@ xfs_mount_validate_sb( if (xfs_sb_version_hassparseinodes(sbp)) { uint32_t align; - xfs_alert(mp, - "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!"); - align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize >> sbp->sb_blocklog; if (sbp->sb_inoalignmt != align) { @@ -398,6 +399,14 @@ __xfs_sb_from_disk( to->sb_spino_align = be32_to_cpu(from->sb_spino_align); to->sb_pquotino = be64_to_cpu(from->sb_pquotino); to->sb_lsn = be64_to_cpu(from->sb_lsn); + /* + * sb_meta_uuid is only on disk if it differs from sb_uuid and the + * feature flag is set; if not set we keep it only in memory. + */ + if (xfs_sb_version_hasmetauuid(to)) + uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); + else + uuid_copy(&to->sb_meta_uuid, &from->sb_uuid); /* Convert on-disk flags to in-memory flags? 
*/ if (convert_xquota) xfs_sb_quota_from_disk(to); @@ -539,6 +548,8 @@ xfs_sb_to_disk( cpu_to_be32(from->sb_features_log_incompat); to->sb_spino_align = cpu_to_be32(from->sb_spino_align); to->sb_lsn = cpu_to_be64(from->sb_lsn); + if (xfs_sb_version_hasmetauuid(from)) + uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); } } diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index e7e26bd6468f..8f8af05b3f13 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -63,7 +63,7 @@ xfs_symlink_hdr_set( dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC); dsl->sl_offset = cpu_to_be32(offset); dsl->sl_bytes = cpu_to_be32(size); - uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid); dsl->sl_owner = cpu_to_be64(ino); dsl->sl_blkno = cpu_to_be64(bp->b_bn); bp->b_ops = &xfs_symlink_buf_ops; @@ -107,7 +107,7 @@ xfs_symlink_verify( return false; if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC)) return false; - if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (bp->b_bn != be64_to_cpu(dsl->sl_blkno)) return false; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3859f5e27a4d..50ab2879b9da 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -119,8 +119,7 @@ xfs_setfilesize_trans_alloc( * We may pass freeze protection with a transaction. So tell lockdep * we released it. */ - rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 1, _THIS_IP_); + __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS); /* * We hand off the transaction to the completion thread now, so * clear the flag here. @@ -171,8 +170,7 @@ xfs_setfilesize_ioend( * Similarly for freeze protection. */ current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); - rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); + __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); } @@ -351,12 +349,12 @@ xfs_imap_valid( */ STATIC void xfs_end_bio( - struct bio *bio, - int error) + struct bio *bio) { xfs_ioend_t *ioend = bio->bi_private; - ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 
0 : error; + if (!ioend->io_error) + ioend->io_error = bio->bi_error; /* Toss bio and pass work off to an xfsdatad thread */ bio->bi_private = NULL; @@ -382,8 +380,7 @@ STATIC struct bio * xfs_alloc_ioend_bio( struct buffer_head *bh) { - int nvecs = bio_get_nr_vecs(bh->b_bdev); - struct bio *bio = bio_alloc(GFP_NOIO, nvecs); + struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); ASSERT(bio->bi_private == NULL); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 0f34886cf726..3bf4ad0d19e4 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -67,16 +67,15 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) */ int /* error */ xfs_bmap_finish( - xfs_trans_t **tp, /* transaction pointer addr */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - int *committed) /* xact committed or not */ + struct xfs_trans **tp, /* transaction pointer addr */ + struct xfs_bmap_free *flist, /* i/o: list extents to free */ + int *committed)/* xact committed or not */ { - xfs_efd_log_item_t *efd; /* extent free data */ - xfs_efi_log_item_t *efi; /* extent free intention */ - int error; /* error return value */ - xfs_bmap_free_item_t *free; /* free extent item */ - xfs_mount_t *mp; /* filesystem mount structure */ - xfs_bmap_free_item_t *next; /* next item on free list */ + struct xfs_efd_log_item *efd; /* extent free data */ + struct xfs_efi_log_item *efi; /* extent free intention */ + int error; /* error return value */ + struct xfs_bmap_free_item *free; /* free extent item */ + struct xfs_bmap_free_item *next; /* next item on free list */ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); if (flist->xbf_count == 0) { @@ -88,40 +87,48 @@ xfs_bmap_finish( xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock, free->xbfi_blockcount); - error = xfs_trans_roll(tp, NULL); - *committed = 1; - /* - * We have a new transaction, so we should return committed=1, - * even though we're returning an error. - */ - if (error) + error = __xfs_trans_roll(tp, NULL, committed); + if (error) { + /* + * If the transaction was committed, drop the EFD reference + * since we're bailing out of here. The other reference is + * dropped when the EFI hits the AIL. + * + * If the transaction was not committed, the EFI is freed by the + * EFI item unlock handler on abort. Also, we have a new + * transaction so we should return committed=1 even though we're + * returning an error. + */ + if (*committed) { + xfs_efi_release(efi); + xfs_force_shutdown((*tp)->t_mountp, + (error == -EFSCORRUPTED) ? + SHUTDOWN_CORRUPT_INCORE : + SHUTDOWN_META_IO_ERROR); + } else { + *committed = 1; + } + return error; + } + /* + * Get an EFD and free each extent in the list, logging to the EFD in + * the process. The remaining bmap free list is cleaned up by the caller + * on error. + */ efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count); for (free = flist->xbf_first; free != NULL; free = next) { next = free->xbfi_next; - if ((error = xfs_free_extent(*tp, free->xbfi_startblock, - free->xbfi_blockcount))) { - /* - * The bmap free list will be cleaned up at a - * higher level. The EFI will be canceled when - * this transaction is aborted. - * Need to force shutdown here to make sure it - * happens, since this transaction may not be - * dirty yet. - */ - mp = (*tp)->t_mountp; - if (!XFS_FORCED_SHUTDOWN(mp)) - xfs_force_shutdown(mp, - (error == -EFSCORRUPTED) ? 
- SHUTDOWN_CORRUPT_INCORE : - SHUTDOWN_META_IO_ERROR); + + error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock, + free->xbfi_blockcount); + if (error) return error; - } - xfs_trans_log_efd_extent(*tp, efd, free->xbfi_startblock, - free->xbfi_blockcount); + xfs_bmap_del_free(flist, NULL, free); } + return 0; } @@ -1467,7 +1474,7 @@ xfs_shift_file_space( XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, XFS_QMOPT_RES_REGBLKS); if (error) - goto out; + goto out_trans_cancel; xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); @@ -1481,18 +1488,20 @@ xfs_shift_file_space( &done, stop_fsb, &first_block, &free_list, direction, XFS_BMAP_MAX_SHIFT_EXTENTS); if (error) - goto out; + goto out_bmap_cancel; error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) - goto out; + goto out_bmap_cancel; error = xfs_trans_commit(tp); } return error; -out: +out_bmap_cancel: + xfs_bmap_cancel(&free_list); +out_trans_cancel: xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index a4b7d92e946c..8ecffb35935b 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -438,7 +438,6 @@ _xfs_buf_find( xfs_buf_flags_t flags, xfs_buf_t *new_bp) { - size_t numbytes; struct xfs_perag *pag; struct rb_node **rbp; struct rb_node *parent; @@ -450,10 +449,9 @@ _xfs_buf_find( for (i = 0; i < nmaps; i++) numblks += map[i].bm_len; - numbytes = BBTOB(numblks); /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(numbytes < btp->bt_meta_sectorsize)); + ASSERT(!(BBTOB(numblks) < btp->bt_meta_sectorsize)); ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask)); /* @@ -1096,8 +1094,7 @@ xfs_bwrite( STATIC void xfs_buf_bio_end_io( - struct bio *bio, - int error) + struct bio *bio) { xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; @@ -1105,10 +1102,10 @@ xfs_buf_bio_end_io( * don't overwrite existing errors - otherwise we can lose errors on * buffers that require multiple bios to complete. 
*/ - if (error) { + if (bio->bi_error) { spin_lock(&bp->b_lock); if (!bp->b_io_error) - bp->b_io_error = error; + bp->b_io_error = bio->bi_error; spin_unlock(&bp->b_lock); } @@ -1533,9 +1530,10 @@ xfs_wait_buftarg( list_del_init(&bp->b_lru); if (bp->b_flags & XBF_WRITE_FAIL) { xfs_alert(btp->bt_mount, -"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n" -"Please run xfs_repair to determine the extent of the problem.", +"Corruption Alert: Buffer at block 0x%llx had permanent write failures!", (long long)bp->b_bn); + xfs_alert(btp->bt_mount, +"Please run xfs_repair to determine the extent of the problem."); } xfs_buf_rele(bp); } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 331c1ccf8264..c79b717d9b88 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -23,6 +23,7 @@ #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/fs.h> +#include <linux/dax.h> #include <linux/buffer_head.h> #include <linux/uio.h> #include <linux/list_lru.h> diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 092d652bc03d..7e986da34f6c 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -647,11 +647,7 @@ xfs_buf_item_unlock( xfs_buf_item_relse(bp); else if (aborted) { ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); - if (lip->li_flags & XFS_LI_IN_AIL) { - spin_lock(&lip->li_ailp->xa_lock); - xfs_trans_ail_delete(lip->li_ailp, lip, - SHUTDOWN_LOG_IO_ERROR); - } + xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); } } @@ -750,13 +746,13 @@ xfs_buf_item_free_format( * buffer (see xfs_buf_attach_iodone() below), then put the * buf log item at the front. */ -void +int xfs_buf_item_init( - xfs_buf_t *bp, - xfs_mount_t *mp) + struct xfs_buf *bp, + struct xfs_mount *mp) { - xfs_log_item_t *lip = bp->b_fspriv; - xfs_buf_log_item_t *bip; + struct xfs_log_item *lip = bp->b_fspriv; + struct xfs_buf_log_item *bip; int chunks; int map_size; int error; @@ -770,12 +766,11 @@ xfs_buf_item_init( */ ASSERT(bp->b_target->bt_mount == mp); if (lip != NULL && lip->li_type == XFS_LI_BUF) - return; + return 0; bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); bip->bli_buf = bp; - xfs_buf_hold(bp); /* * chunks is the number of XFS_BLF_CHUNK size pieces the buffer @@ -788,6 +783,11 @@ xfs_buf_item_init( */ error = xfs_buf_item_get_format(bip, bp->b_map_count); ASSERT(error == 0); + if (error) { /* to stop gcc throwing set-but-unused warnings */ + kmem_zone_free(xfs_buf_item_zone, bip); + return error; + } + for (i = 0; i < bip->bli_format_count; i++) { chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len), @@ -807,6 +807,8 @@ xfs_buf_item_init( if (bp->b_fspriv) bip->bli_item.li_bio_list = bp->b_fspriv; bp->b_fspriv = bip; + xfs_buf_hold(bp); + return 0; } diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 3f3455a41510..f7eba99d19dd 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -61,7 +61,7 @@ typedef struct xfs_buf_log_item { struct xfs_buf_log_format __bli_format; /* embedded in-log header */ } xfs_buf_log_item_t; -void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); +int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_relse(struct xfs_buf *); void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); uint xfs_buf_item_dirty(xfs_buf_log_item_t *); diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 098cd78fe708..a989a9c7edb7 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ 
-171,6 +171,7 @@ xfs_dir2_block_getdents( int wantoff; /* starting block offset */ xfs_off_t cook; struct xfs_da_geometry *geo = args->geo; + int lock_mode; /* * If the block number in the offset is out of range, we're done. @@ -178,7 +179,9 @@ xfs_dir2_block_getdents( if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) return 0; + lock_mode = xfs_ilock_data_map_shared(dp); error = xfs_dir3_block_read(NULL, dp, &bp); + xfs_iunlock(dp, lock_mode); if (error) return error; @@ -529,9 +532,12 @@ xfs_dir2_leaf_getdents( * current buffer, need to get another one. */ if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) { + int lock_mode; + lock_mode = xfs_ilock_data_map_shared(dp); error = xfs_dir2_leaf_readbuf(args, bufsize, map_info, &curoff, &bp); + xfs_iunlock(dp, lock_mode); if (error || !map_info->map_valid) break; @@ -653,7 +659,6 @@ xfs_readdir( struct xfs_da_args args = { NULL }; int rval; int v; - uint lock_mode; trace_xfs_readdir(dp); @@ -666,7 +671,7 @@ xfs_readdir( args.dp = dp; args.geo = dp->i_mount->m_dir_geo; - lock_mode = xfs_ilock_data_map_shared(dp); + xfs_ilock(dp, XFS_IOLOCK_SHARED); if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_getdents(&args, ctx); else if ((rval = xfs_dir2_isblock(&args, &v))) @@ -675,7 +680,7 @@ xfs_readdir( rval = xfs_dir2_block_getdents(&args, ctx); else rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize); - xfs_iunlock(dp, lock_mode); + xfs_iunlock(dp, XFS_IOLOCK_SHARED); return rval; } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 4143dc75dca4..30cb3afb67f0 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -251,7 +251,7 @@ xfs_qm_init_dquot_blk( d->dd_diskdq.d_id = cpu_to_be32(curid); d->dd_diskdq.d_flags = type; if (xfs_sb_version_hascrc(&mp->m_sb)) { - uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid); xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } @@ -954,12 +954,8 @@ xfs_qm_dqflush( struct xfs_log_item *lip = &dqp->q_logitem.qli_item; dqp->dq_flags &= ~XFS_DQ_DIRTY; - spin_lock(&mp->m_ail->xa_lock); - if (lip->li_flags & XFS_LI_IN_AIL) - xfs_trans_ail_delete(mp->m_ail, lip, - SHUTDOWN_CORRUPT_INCORE); - else - spin_unlock(&mp->m_ail->xa_lock); + xfs_trans_ail_remove(lip, SHUTDOWN_CORRUPT_INCORE); + error = -EIO; goto out_unlock; } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index adc8f8fdd145..4aa0153214f9 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -47,28 +47,6 @@ xfs_efi_item_free( } /* - * Freeing the efi requires that we remove it from the AIL if it has already - * been placed there. However, the EFI may not yet have been placed in the AIL - * when called by xfs_efi_release() from EFD processing due to the ordering of - * committed vs unpin operations in bulk insert operations. Hence the reference - * count to ensure only the last caller frees the EFI. - */ -STATIC void -__xfs_efi_release( - struct xfs_efi_log_item *efip) -{ - struct xfs_ail *ailp = efip->efi_item.li_ailp; - - if (atomic_dec_and_test(&efip->efi_refcount)) { - spin_lock(&ailp->xa_lock); - /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, &efip->efi_item, - SHUTDOWN_LOG_IO_ERROR); - xfs_efi_item_free(efip); - } -} - -/* * This returns the number of iovecs needed to log the given efi item. * We only need 1 iovec for an efi item. It just logs the efi_log_format * structure. 
@@ -128,12 +106,12 @@ xfs_efi_item_pin( } /* - * While EFIs cannot really be pinned, the unpin operation is the last place at - * which the EFI is manipulated during a transaction. If we are being asked to - * remove the EFI it's because the transaction has been cancelled and by - * definition that means the EFI cannot be in the AIL so remove it from the - * transaction and free it. Otherwise coordinate with xfs_efi_release() - * to determine who gets to free the EFI. + * The unpin operation is the last place an EFI is manipulated in the log. It is + * either inserted in the AIL or aborted in the event of a log I/O error. In + * either case, the EFI transaction has been successfully committed to make it + * this far. Therefore, we expect whoever committed the EFI to either construct + * and commit the EFD or drop the EFD's reference in the event of error. Simply + * drop the log's EFI reference now that the log is done with it. */ STATIC void xfs_efi_item_unpin( @@ -141,15 +119,7 @@ xfs_efi_item_unpin( int remove) { struct xfs_efi_log_item *efip = EFI_ITEM(lip); - - if (remove) { - ASSERT(!(lip->li_flags & XFS_LI_IN_AIL)); - if (lip->li_desc) - xfs_trans_del_item(lip); - xfs_efi_item_free(efip); - return; - } - __xfs_efi_release(efip); + xfs_efi_release(efip); } /* @@ -167,6 +137,11 @@ xfs_efi_item_push( return XFS_ITEM_PINNED; } +/* + * The EFI has been either committed or aborted if the transaction has been + * cancelled. If the transaction was cancelled, an EFD isn't going to be + * constructed and thus we free the EFI here directly. + */ STATIC void xfs_efi_item_unlock( struct xfs_log_item *lip) @@ -301,23 +276,19 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) } /* - * This is called by the efd item code below to release references to the given - * efi item. Each efd calls this with the number of extents that it has - * logged, and when the sum of these reaches the total number of extents logged - * by this efi item we can free the efi item. + * Freeing the efi requires that we remove it from the AIL if it has already + * been placed there. However, the EFI may not yet have been placed in the AIL + * when called by xfs_efi_release() from EFD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the EFI. */ void -xfs_efi_release(xfs_efi_log_item_t *efip, - uint nextents) +xfs_efi_release( + struct xfs_efi_log_item *efip) { - ASSERT(atomic_read(&efip->efi_next_extent) >= nextents); - if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) { - /* recovery needs us to drop the EFI reference, too */ - if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) - __xfs_efi_release(efip); - - __xfs_efi_release(efip); - /* efip may now have been freed, do not reference it again. */ + if (atomic_dec_and_test(&efip->efi_refcount)) { + xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR); + xfs_efi_item_free(efip); } } @@ -415,20 +386,27 @@ xfs_efd_item_push( return XFS_ITEM_PINNED; } +/* + * The EFD is either committed or aborted if the transaction is cancelled. If + * the transaction is cancelled, drop our reference to the EFI and free the EFD. 
+ */ STATIC void xfs_efd_item_unlock( struct xfs_log_item *lip) { - if (lip->li_flags & XFS_LI_ABORTED) - xfs_efd_item_free(EFD_ITEM(lip)); + struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + + if (lip->li_flags & XFS_LI_ABORTED) { + xfs_efi_release(efdp->efd_efip); + xfs_efd_item_free(efdp); + } } /* - * When the efd item is committed to disk, all we need to do - * is delete our reference to our partner efi item and then - * free ourselves. Since we're freeing ourselves we must - * return -1 to keep the transaction code from further referencing - * this item. + * When the efd item is committed to disk, all we need to do is delete our + * reference to our partner efi item and then free ourselves. Since we're + * freeing ourselves we must return -1 to keep the transaction code from further + * referencing this item. */ STATIC xfs_lsn_t xfs_efd_item_committed( @@ -438,13 +416,14 @@ xfs_efd_item_committed( struct xfs_efd_log_item *efdp = EFD_ITEM(lip); /* - * If we got a log I/O error, it's always the case that the LR with the - * EFI got unpinned and freed before the EFD got aborted. + * Drop the EFI reference regardless of whether the EFD has been + * aborted. Once the EFD transaction is constructed, it is the sole + * responsibility of the EFD to release the EFI (even if the EFI is + * aborted due to log I/O error). */ - if (!(lip->li_flags & XFS_LI_ABORTED)) - xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents); - + xfs_efi_release(efdp->efd_efip); xfs_efd_item_free(efdp); + return (xfs_lsn_t)-1; } diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 0ffbce32d569..8fa8651705e1 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -39,9 +39,28 @@ struct kmem_zone; * "extent free done" log item described below. * * The EFI is reference counted so that it is not freed prior to both the EFI - * and EFD being committed and unpinned. This ensures that when the last - * reference goes away the EFI will always be in the AIL as it has been - * unpinned, regardless of whether the EFD is processed before or after the EFI. + * and EFD being committed and unpinned. This ensures the EFI is inserted into + * the AIL even in the event of out of order EFI/EFD processing. In other words, + * an EFI is born with two references: + * + * 1.) an EFI held reference to track EFI AIL insertion + * 2.) an EFD held reference to track EFD commit + * + * On allocation, both references are the responsibility of the caller. Once the + * EFI is added to and dirtied in a transaction, ownership of reference one + * transfers to the transaction. The reference is dropped once the EFI is + * inserted to the AIL or in the event of failure along the way (e.g., commit + * failure, log I/O error, etc.). Note that the caller remains responsible for + * the EFD reference under all circumstances to this point. The caller has no + * means to detect failure once the transaction is committed, however. + * Therefore, an EFD is required after this point, even in the event of + * unrelated failure. + * + * Once an EFD is allocated and dirtied in a transaction, reference two + * transfers to the transaction. The EFD reference is dropped once it reaches + * the unpin handler. Similar to the EFI, the reference also drops in the event + * of commit failure or log I/O errors. Note that the EFD is not inserted in the + * AIL, so at this point both the EFI and EFD are freed. 
*/ typedef struct xfs_efi_log_item { xfs_log_item_t efi_item; @@ -77,5 +96,6 @@ xfs_efd_log_item_t *xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *, int xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt); void xfs_efi_item_free(xfs_efi_log_item_t *); +void xfs_efi_release(struct xfs_efi_log_item *); #endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f0e8249722d4..e78feb400e22 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -317,24 +317,33 @@ xfs_file_read_iter( return -EIO; /* - * Locking is a bit tricky here. If we take an exclusive lock - * for direct IO, we effectively serialise all new concurrent - * read IO to this file and block it behind IO that is currently in - * progress because IO in progress holds the IO lock shared. We only - * need to hold the lock exclusive to blow away the page cache, so - * only take lock exclusively if the page cache needs invalidation. - * This allows the normal direct IO case of no page cache pages to - * proceeed concurrently without serialisation. + * Locking is a bit tricky here. If we take an exclusive lock for direct + * IO, we effectively serialise all new concurrent read IO to this file + * and block it behind IO that is currently in progress because IO in + * progress holds the IO lock shared. We only need to hold the lock + * exclusive to blow away the page cache, so only take lock exclusively + * if the page cache needs invalidation. This allows the normal direct + * IO case of no page cache pages to proceeed concurrently without + * serialisation. */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) { xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); + /* + * The generic dio code only flushes the range of the particular + * I/O. Because we take an exclusive lock here, this whole + * sequence is considerably more expensive for us. This has a + * noticeable performance impact for any file with cached pages, + * even when outside of the range of the particular I/O. + * + * Hence, amortize the cost of the lock against a full file + * flush and reduce the chances of repeated iolock cycles going + * forward. + */ if (inode->i_mapping->nrpages) { - ret = filemap_write_and_wait_range( - VFS_I(ip)->i_mapping, - pos, pos + size - 1); + ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); if (ret) { xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); return ret; @@ -345,9 +354,7 @@ xfs_file_read_iter( * we fail to invalidate a page, but this should never * happen on XFS. Warn if it does fail. */ - ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, - pos >> PAGE_CACHE_SHIFT, - (pos + size - 1) >> PAGE_CACHE_SHIFT); + ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); WARN_ON_ONCE(ret); ret = 0; } @@ -733,19 +740,19 @@ xfs_file_dio_aio_write( pos = iocb->ki_pos; end = pos + count - 1; + /* + * See xfs_file_read_iter() for why we do a full-file flush here. + */ if (mapping->nrpages) { - ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - pos, end); + ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); if (ret) goto out; /* - * Invalidate whole pages. This can return an error if - * we fail to invalidate a page, but this should never - * happen on XFS. Warn if it does fail. + * Invalidate whole pages. This can return an error if we fail + * to invalidate a page, but this should never happen on XFS. + * Warn if it does fail. 
*/ - ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, - pos >> PAGE_CACHE_SHIFT, - end >> PAGE_CACHE_SHIFT); + ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); WARN_ON_ONCE(ret); ret = 0; } @@ -1514,24 +1521,61 @@ xfs_filemap_fault( struct vm_area_struct *vma, struct vm_fault *vmf) { - struct xfs_inode *ip = XFS_I(file_inode(vma->vm_file)); + struct inode *inode = file_inode(vma->vm_file); int ret; - trace_xfs_filemap_fault(ip); + trace_xfs_filemap_fault(XFS_I(inode)); /* DAX can shortcut the normal fault path on write faults! */ - if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip))) + if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode)) return xfs_filemap_page_mkwrite(vma, vmf); - xfs_ilock(ip, XFS_MMAPLOCK_SHARED); - ret = filemap_fault(vma, vmf); - xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + if (IS_DAX(inode)) { + /* + * we do not want to trigger unwritten extent conversion on read + * faults - that is unnecessary overhead and would also require + * changes to xfs_get_blocks_direct() to map unwritten extent + * ioend for conversion on read-only mappings. + */ + ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL); + } else + ret = filemap_fault(vma, vmf); + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + + return ret; +} + +STATIC int +xfs_filemap_pmd_fault( + struct vm_area_struct *vma, + unsigned long addr, + pmd_t *pmd, + unsigned int flags) +{ + struct inode *inode = file_inode(vma->vm_file); + struct xfs_inode *ip = XFS_I(inode); + int ret; + + if (!IS_DAX(inode)) + return VM_FAULT_FALLBACK; + + trace_xfs_filemap_pmd_fault(ip); + + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct, + xfs_end_io_dax_write); + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + sb_end_pagefault(inode->i_sb); return ret; } static const struct vm_operations_struct xfs_file_vm_ops = { .fault = xfs_filemap_fault, + .pmd_fault = xfs_filemap_pmd_fault, .map_pages = filemap_map_pages, .page_mkwrite = xfs_filemap_page_mkwrite, }; @@ -1544,7 +1588,7 @@ xfs_file_mmap( file_accessed(filp); vma->vm_ops = &xfs_file_vm_ops; if (IS_DAX(file_inode(filp))) - vma->vm_flags |= VM_MIXEDMAP; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; return 0; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 9b3438a7680f..ee3aaa0a5317 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -250,7 +250,7 @@ xfs_growfs_data_private( agf->agf_freeblks = cpu_to_be32(tmpsize); agf->agf_longest = cpu_to_be32(tmpsize); if (xfs_sb_version_hascrc(&mp->m_sb)) - uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); error = xfs_bwrite(bp); xfs_buf_relse(bp); @@ -273,7 +273,7 @@ xfs_growfs_data_private( if (xfs_sb_version_hascrc(&mp->m_sb)) { agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); agfl->agfl_seqno = cpu_to_be32(agno); - uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid); } agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); @@ -309,7 +309,7 @@ xfs_growfs_data_private( agi->agi_newino = cpu_to_be32(NULLAGINO); agi->agi_dirino = cpu_to_be32(NULLAGINO); if (xfs_sb_version_hascrc(&mp->m_sb)) - uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid); if (xfs_sb_version_hasfinobt(&mp->m_sb)) { agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp)); agi->agi_free_level = cpu_to_be32(1); diff --git 
a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 76a9f2783282..0a326bd64d4e 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -412,6 +412,8 @@ xfs_iget( if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) return -EINVAL; + XFS_STATS_INC(xs_ig_attempts); + /* get the perag structure and ensure that it's inode capable */ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); agino = XFS_INO_TO_AGINO(mp, ino); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3da9f4da4f3d..dc40a6d5ae0d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -164,7 +164,7 @@ xfs_ilock( (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); if (lock_flags & XFS_IOLOCK_EXCL) mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); @@ -212,7 +212,7 @@ xfs_ilock_nowait( (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); if (lock_flags & XFS_IOLOCK_EXCL) { if (!mrtryupdate(&ip->i_iolock)) @@ -281,7 +281,7 @@ xfs_iunlock( (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); ASSERT(lock_flags != 0); if (lock_flags & XFS_IOLOCK_EXCL) @@ -363,31 +363,57 @@ int xfs_lock_delays; #endif /* + * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when + * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined + * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build + * errors and warnings. + */ +#if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP) +static bool +xfs_lockdep_subclass_ok( + int subclass) +{ + return subclass < MAX_LOCKDEP_SUBCLASSES; +} +#else +#define xfs_lockdep_subclass_ok(subclass) (true) +#endif + +/* * Bump the subclass so xfs_lock_inodes() acquires each lock with a different - * value. This shouldn't be called for page fault locking, but we also need to - * ensure we don't overrun the number of lockdep subclasses for the iolock or - * mmaplock as that is limited to 12 by the mmap lock lockdep annotations. + * value. This can be called for any type of inode lock combination, including + * parent locking. Care must be taken to ensure we don't overrun the subclass + * storage fields in the class mask we build. 
*/ static inline int xfs_lock_inumorder(int lock_mode, int subclass) { + int class = 0; + + ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP | + XFS_ILOCK_RTSUM))); + ASSERT(xfs_lockdep_subclass_ok(subclass)); + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { - ASSERT(subclass + XFS_LOCK_INUMORDER < - (1 << (XFS_MMAPLOCK_SHIFT - XFS_IOLOCK_SHIFT))); - lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT; + ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS); + ASSERT(xfs_lockdep_subclass_ok(subclass + + XFS_IOLOCK_PARENT_VAL)); + class += subclass << XFS_IOLOCK_SHIFT; + if (lock_mode & XFS_IOLOCK_PARENT) + class += XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT; } if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) { - ASSERT(subclass + XFS_LOCK_INUMORDER < - (1 << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT))); - lock_mode |= (subclass + XFS_LOCK_INUMORDER) << - XFS_MMAPLOCK_SHIFT; + ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS); + class += subclass << XFS_MMAPLOCK_SHIFT; } - if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) - lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT; + if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) { + ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS); + class += subclass << XFS_ILOCK_SHIFT; + } - return lock_mode; + return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class; } /* @@ -399,6 +425,11 @@ xfs_lock_inumorder(int lock_mode, int subclass) * transaction (such as truncate). This can result in deadlock since the long * running trans might need to wait for the inode we just locked in order to * push the tail and free space in the log. + * + * xfs_lock_inodes() can only be used to lock one type of lock at a time - + * the iolock, the mmaplock or the ilock, but not more than one at a time. If we + * lock more than one at a time, lockdep will report false positives saying we + * have violated locking orders. */ void xfs_lock_inodes( @@ -409,8 +440,29 @@ xfs_lock_inodes( int attempts = 0, i, j, try_lock; xfs_log_item_t *lp; - /* currently supports between 2 and 5 inodes */ + /* + * Currently supports between 2 and 5 inodes with exclusive locking. We + * support an arbitrary depth of locking here, but absolute limits on + * inodes depend on the type of locking and the limits placed by + * lockdep annotations in xfs_lock_inumorder. These are all checked by + * the asserts.
+ */ ASSERT(ips && inodes >= 2 && inodes <= 5); + ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL | + XFS_ILOCK_EXCL)); + ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED | + XFS_ILOCK_SHARED))); + ASSERT(!(lock_mode & XFS_IOLOCK_EXCL) || + inodes <= XFS_IOLOCK_MAX_SUBCLASS + 1); + ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) || + inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1); + ASSERT(!(lock_mode & XFS_ILOCK_EXCL) || + inodes <= XFS_ILOCK_MAX_SUBCLASS + 1); + + if (lock_mode & XFS_IOLOCK_EXCL) { + ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL))); + } else if (lock_mode & XFS_MMAPLOCK_EXCL) + ASSERT(!(lock_mode & XFS_ILOCK_EXCL)); try_lock = 0; i = 0; @@ -629,30 +681,29 @@ xfs_lookup( { xfs_ino_t inum; int error; - uint lock_mode; trace_xfs_lookup(dp, name); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - lock_mode = xfs_ilock_data_map_shared(dp); + xfs_ilock(dp, XFS_IOLOCK_SHARED); error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); - xfs_iunlock(dp, lock_mode); - if (error) - goto out; + goto out_unlock; error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp); if (error) goto out_free_name; + xfs_iunlock(dp, XFS_IOLOCK_SHARED); return 0; out_free_name: if (ci_name) kmem_free(ci_name->name); -out: +out_unlock: + xfs_iunlock(dp, XFS_IOLOCK_SHARED); *ipp = NULL; return error; } @@ -787,7 +838,7 @@ xfs_ialloc( if (ip->i_d.di_version == 3) { ASSERT(ip->i_d.di_ino == ino); - ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid)); + ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid)); ip->i_d.di_crc = 0; ip->i_d.di_changecount = 1; ip->i_d.di_lsn = 0; @@ -1149,7 +1200,8 @@ xfs_create( goto out_trans_cancel; - xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | + XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); unlock_dp_on_error = true; xfs_bmap_init(&free_list, &first_block); @@ -1175,11 +1227,8 @@ xfs_create( */ error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, resblks > 0, &ip, &committed); - if (error) { - if (error == -ENOSPC) - goto out_trans_cancel; + if (error) goto out_trans_cancel; - } /* * Now we join the directory inode to the transaction. We do not do it @@ -1188,7 +1237,7 @@ xfs_create( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); unlock_dp_on_error = false; error = xfs_dir_createname(tp, dp, name, ip->i_ino, @@ -1261,7 +1310,7 @@ xfs_create( xfs_qm_dqrele(pdqp); if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); return error; } @@ -1318,11 +1367,8 @@ xfs_create_tmpfile( error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, resblks > 0, &ip, NULL); - if (error) { - if (error == -ENOSPC) - goto out_trans_cancel; + if (error) goto out_trans_cancel; - } if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); @@ -1409,10 +1455,11 @@ xfs_link( if (error) goto error_return; + xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, tdp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); /* * If we are using project inheritance, we only allow hard link @@ -1791,14 +1838,15 @@ xfs_inactive_ifree( xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1); /* - * Just ignore errors at this point. There is nothing we can - * do except to try to keep going. 
Make sure it's not a silent - * error. + * Just ignore errors at this point. There is nothing we can do except + * to try to keep going. Make sure it's not a silent error. */ error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) + if (error) { xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", __func__, error); + xfs_bmap_cancel(&free_list); + } error = xfs_trans_commit(tp); if (error) xfs_notice(mp, "%s: xfs_trans_commit returned error %d", @@ -2515,9 +2563,10 @@ xfs_remove( goto out_trans_cancel; } + xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* @@ -2898,6 +2947,12 @@ xfs_rename( * whether the target directory is the same as the source * directory, we can lock from 2 to 4 inodes. */ + if (!new_parent) + xfs_ilock(src_dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); + else + xfs_lock_two_inodes(src_dp, target_dp, + XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); + xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); /* @@ -2905,9 +2960,9 @@ xfs_rename( * we can rely on either trans_commit or trans_cancel to unlock * them. */ - xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); if (new_parent) - xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, target_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); if (target_ip) xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 8f22d20368d8..ca9e11989cbd 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -284,9 +284,9 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) * Flags for lockdep annotations. * * XFS_LOCK_PARENT - for directory operations that require locking a - * parent directory inode and a child entry inode. The parent gets locked - * with this flag so it gets a lockdep subclass of 1 and the child entry - * lock will have a lockdep subclass of 0. + * parent directory inode and a child entry inode. IOLOCK requires nesting, + * MMAPLOCK does not support this class, ILOCK requires a single subclass + * to differentiate parent from child. * * XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary * inodes do not participate in the normal lock order, and thus have their @@ -295,30 +295,63 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) * XFS_LOCK_INUMORDER - for locking several inodes at the some time * with xfs_lock_inodes(). This flag is used as the starting subclass * and each subsequent lock acquired will increment the subclass by one. - * So the first lock acquired will have a lockdep subclass of 4, the - * second lock will have a lockdep subclass of 5, and so on. It is - * the responsibility of the class builder to shift this to the correct - * portion of the lock_mode lockdep mask. + * However, MAX_LOCKDEP_SUBCLASSES == 8, which means we are greatly + * limited to the subclasses we can represent via nesting. We need at least + * 5 inodes nest depth for the ILOCK through rename, and we also have to support + * XFS_ILOCK_PARENT, which gives 6 subclasses. Then we have XFS_ILOCK_RTBITMAP + * and XFS_ILOCK_RTSUM, which are another 2 unique subclasses, so that's all + * 8 subclasses supported by lockdep. 
+ * + * This also means we have to number the sub-classes in the lowest bits of + * the mask we keep, and we have to ensure we never exceed 3 bits of lockdep + * mask and we can't use bit-masking to build the subclasses. What a mess. + * + * Bit layout: + * + * Bit Lock Region + * 16-19 XFS_IOLOCK_SHIFT dependencies + * 20-23 XFS_MMAPLOCK_SHIFT dependencies + * 24-31 XFS_ILOCK_SHIFT dependencies + * + * IOLOCK values + * + * 0-3 subclass value + * 4-7 PARENT subclass values + * + * MMAPLOCK values + * + * 0-3 subclass value + * 4-7 unused + * + * ILOCK values + * 0-4 subclass values + * 5 PARENT subclass (not nestable) + * 6 RTBITMAP subclass (not nestable) + * 7 RTSUM subclass (not nestable) + * */ -#define XFS_LOCK_PARENT 1 -#define XFS_LOCK_RTBITMAP 2 -#define XFS_LOCK_RTSUM 3 -#define XFS_LOCK_INUMORDER 4 - -#define XFS_IOLOCK_SHIFT 16 -#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) - -#define XFS_MMAPLOCK_SHIFT 20 - -#define XFS_ILOCK_SHIFT 24 -#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) -#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT) -#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT) - -#define XFS_IOLOCK_DEP_MASK 0x000f0000 -#define XFS_MMAPLOCK_DEP_MASK 0x00f00000 -#define XFS_ILOCK_DEP_MASK 0xff000000 -#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | \ +#define XFS_IOLOCK_SHIFT 16 +#define XFS_IOLOCK_PARENT_VAL 4 +#define XFS_IOLOCK_MAX_SUBCLASS (XFS_IOLOCK_PARENT_VAL - 1) +#define XFS_IOLOCK_DEP_MASK 0x000f0000 +#define XFS_IOLOCK_PARENT (XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT) + +#define XFS_MMAPLOCK_SHIFT 20 +#define XFS_MMAPLOCK_NUMORDER 0 +#define XFS_MMAPLOCK_MAX_SUBCLASS 3 +#define XFS_MMAPLOCK_DEP_MASK 0x00f00000 + +#define XFS_ILOCK_SHIFT 24 +#define XFS_ILOCK_PARENT_VAL 5 +#define XFS_ILOCK_MAX_SUBCLASS (XFS_ILOCK_PARENT_VAL - 1) +#define XFS_ILOCK_RTBITMAP_VAL 6 +#define XFS_ILOCK_RTSUM_VAL 7 +#define XFS_ILOCK_DEP_MASK 0xff000000 +#define XFS_ILOCK_PARENT (XFS_ILOCK_PARENT_VAL << XFS_ILOCK_SHIFT) +#define XFS_ILOCK_RTBITMAP (XFS_ILOCK_RTBITMAP_VAL << XFS_ILOCK_SHIFT) +#define XFS_ILOCK_RTSUM (XFS_ILOCK_RTSUM_VAL << XFS_ILOCK_SHIFT) + +#define XFS_LOCK_SUBCLASS_MASK (XFS_IOLOCK_DEP_MASK | \ XFS_MMAPLOCK_DEP_MASK | \ XFS_ILOCK_DEP_MASK) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index bf13a5a7e2f4..62bd80f4edd9 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -703,17 +703,10 @@ xfs_iflush_abort( xfs_inode_log_item_t *iip = ip->i_itemp; if (iip) { - struct xfs_ail *ailp = iip->ili_item.li_ailp; if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { - spin_lock(&ailp->xa_lock); - if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { - /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, &iip->ili_item, - stale ? - SHUTDOWN_LOG_IO_ERROR : + xfs_trans_ail_remove(&iip->ili_item, + stale ? SHUTDOWN_LOG_IO_ERROR : SHUTDOWN_CORRUPT_INCORE); - } else - spin_unlock(&ailp->xa_lock); } iip->ili_logged = 0; /* diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 766b23f86ce9..8294132e6a3c 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -609,7 +609,7 @@ xfs_setattr_nonsize( tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (error) - goto out_dqrele; + goto out_trans_cancel; xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -640,7 +640,7 @@ xfs_setattr_nonsize( NULL, capable(CAP_FOWNER) ? 
XFS_QMOPT_FORCE_RES : 0); if (error) /* out of quota */ - goto out_trans_cancel; + goto out_unlock; } } @@ -729,10 +729,10 @@ xfs_setattr_nonsize( return 0; +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); out_trans_cancel: xfs_trans_cancel(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); -out_dqrele: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); return error; diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f41b0c3fddab..930ebd86beba 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -473,7 +473,8 @@ xfs_bulkstat( * pending error, then we are done. */ del_cursor: - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, error ? + XFS_BTREE_ERROR : XFS_BTREE_NOERROR); xfs_buf_relse(agbp); if (error) break; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 08d4fe46f0fa..aaadee0969c9 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -668,9 +668,9 @@ xfs_log_mount( ASSERT(0); goto out_free_log; } + xfs_crit(mp, "Log size out of supported range."); xfs_crit(mp, -"Log size out of supported range. Continuing onwards, but if log hangs are\n" -"experienced then please report this message in the bug report."); +"Continuing onwards, but if log hangs are experienced then please report this message in the bug report."); } /* @@ -700,6 +700,7 @@ xfs_log_mount( if (error) { xfs_warn(mp, "log mount/recovery failed: error %d", error); + xlog_recover_cancel(mp->m_log); goto out_destroy_ail; } } @@ -740,18 +741,35 @@ out: * it. */ int -xfs_log_mount_finish(xfs_mount_t *mp) +xfs_log_mount_finish( + struct xfs_mount *mp) { int error = 0; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { - error = xlog_recover_finish(mp->m_log); - if (!error) - xfs_log_work_queue(mp); - } else { + if (mp->m_flags & XFS_MOUNT_NORECOVERY) { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + return 0; } + error = xlog_recover_finish(mp->m_log); + if (!error) + xfs_log_work_queue(mp); + + return error; +} + +/* + * The mount has failed. Cancel the recovery if it hasn't completed and destroy + * the log. + */ +int +xfs_log_mount_cancel( + struct xfs_mount *mp) +{ + int error; + + error = xlog_recover_cancel(mp->m_log); + xfs_log_unmount(mp); return error; } @@ -1142,11 +1160,13 @@ xlog_space_left( * In this case we just want to return the size of the * log as the amount of space left. 
*/ + xfs_alert(log->l_mp, "xlog_space_left: head behind tail"); xfs_alert(log->l_mp, - "xlog_space_left: head behind tail\n" - " tail_cycle = %d, tail_bytes = %d\n" - " GH cycle = %d, GH bytes = %d", - tail_cycle, tail_bytes, head_cycle, head_bytes); + " tail_cycle = %d, tail_bytes = %d", + tail_cycle, tail_bytes); + xfs_alert(log->l_mp, + " GH cycle = %d, GH bytes = %d", + head_cycle, head_bytes); ASSERT(0); free_bytes = log->l_logsize; } @@ -1652,8 +1672,13 @@ xlog_cksum( if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; int i; + int xheads; + + xheads = size / XLOG_HEADER_CYCLE_SIZE; + if (size % XLOG_HEADER_CYCLE_SIZE) + xheads++; - for (i = 1; i < log->l_iclog_heads; i++) { + for (i = 1; i < xheads; i++) { crc = crc32c(crc, &xhdr[i].hic_xheader, sizeof(struct xlog_rec_ext_header)); } @@ -2028,26 +2053,24 @@ xlog_print_tic_res( "SWAPEXT" }; - xfs_warn(mp, - "xlog_write: reservation summary:\n" - " trans type = %s (%u)\n" - " unit res = %d bytes\n" - " current res = %d bytes\n" - " total reg = %u bytes (o/flow = %u bytes)\n" - " ophdrs = %u (ophdr space = %u bytes)\n" - " ophdr + reg = %u bytes\n" - " num regions = %u", - ((ticket->t_trans_type <= 0 || - ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? + xfs_warn(mp, "xlog_write: reservation summary:"); + xfs_warn(mp, " trans type = %s (%u)", + ((ticket->t_trans_type <= 0 || + ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), - ticket->t_trans_type, - ticket->t_unit_res, - ticket->t_curr_res, - ticket->t_res_arr_sum, ticket->t_res_o_flow, - ticket->t_res_num_ophdrs, ophdr_spc, - ticket->t_res_arr_sum + - ticket->t_res_o_flow + ophdr_spc, - ticket->t_res_num); + ticket->t_trans_type); + xfs_warn(mp, " unit res = %d bytes", + ticket->t_unit_res); + xfs_warn(mp, " current res = %d bytes", + ticket->t_curr_res); + xfs_warn(mp, " total reg = %u bytes (o/flow = %u bytes)", + ticket->t_res_arr_sum, ticket->t_res_o_flow); + xfs_warn(mp, " ophdrs = %u (ophdr space = %u bytes)", + ticket->t_res_num_ophdrs, ophdr_spc); + xfs_warn(mp, " ophdr + reg = %u bytes", + ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc); + xfs_warn(mp, " num regions = %u", + ticket->t_res_num); for (i = 0; i < ticket->t_res_num; i++) { uint r_type = ticket->t_res_arr[i].r_type; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index fa27aaec72cb..09d91d3166cd 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -147,6 +147,7 @@ int xfs_log_mount(struct xfs_mount *mp, xfs_daddr_t start_block, int num_bblocks); int xfs_log_mount_finish(struct xfs_mount *mp); +int xfs_log_mount_cancel(struct xfs_mount *); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); void xfs_log_space_wake(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index abc2ccbff739..4e7649351f5a 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -307,7 +307,13 @@ xlog_cil_insert_items( if (!(lidp->lid_flags & XFS_LID_DIRTY)) continue; - list_move_tail(&lip->li_cil, &cil->xc_cil); + /* + * Only move the item if it isn't already at the tail. This is + * to prevent a transient list_empty() state when reinserting + * an item that is already the only item in the CIL. 
+ */ + if (!list_is_last(&lip->li_cil, &cil->xc_cil)) + list_move_tail(&lip->li_cil, &cil->xc_cil); } /* account for space used by new iovec headers */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 1c87c8abfbed..950f3f94720c 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -426,6 +426,8 @@ xlog_recover( extern int xlog_recover_finish( struct xlog *log); +extern int +xlog_recover_cancel(struct xlog *); extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, char *dp, int size); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 01dd228ca05e..512a0945d52a 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1886,19 +1886,34 @@ xlog_recover_get_buf_lsn( uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; break; case XFS_ATTR3_RMT_MAGIC: - lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn); - uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid; - break; + /* + * Remote attr blocks are written synchronously, rather than + * being logged. That means they do not contain a valid LSN + * (i.e. transactionally ordered) in them, and hence any time we + * see a buffer to replay over the top of a remote attribute + * block we should simply do so. + */ + goto recover_immediately; case XFS_SB_MAGIC: + /* + * superblock uuids are magic. We may or may not have a + * sb_meta_uuid on disk, but it will be set in the in-core + * superblock. We set the uuid pointer for verification + * according to the superblock feature mask to ensure we check + * the relevant UUID in the superblock. + */ lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); - uuid = &((struct xfs_dsb *)blk)->sb_uuid; + if (xfs_sb_version_hasmetauuid(&mp->m_sb)) + uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid; + else + uuid = &((struct xfs_dsb *)blk)->sb_uuid; break; default: break; } if (lsn != (xfs_lsn_t)-1) { - if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid)) goto recover_immediately; return lsn; } @@ -2928,16 +2943,16 @@ xlog_recover_efi_pass2( struct xlog_recover_item *item, xfs_lsn_t lsn) { - int error; - xfs_mount_t *mp = log->l_mp; - xfs_efi_log_item_t *efip; - xfs_efi_log_format_t *efi_formatp; + int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_efi_log_item *efip; + struct xfs_efi_log_format *efi_formatp; efi_formatp = item->ri_buf[0].i_addr; efip = xfs_efi_init(mp, efi_formatp->efi_nextents); - if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), - &(efip->efi_format)))) { + error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format); + if (error) { xfs_efi_item_free(efip); return error; } @@ -2945,20 +2960,23 @@ xlog_recover_efi_pass2( spin_lock(&log->l_ailp->xa_lock); /* - * xfs_trans_ail_update() drops the AIL lock. + * The EFI has two references. One for the EFD and one for EFI to ensure + * it makes it into the AIL. Insert the EFI into the AIL directly and + * drop the EFI reference. Note that xfs_trans_ail_update() drops the + * AIL lock. */ xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn); + xfs_efi_release(efip); return 0; } /* - * This routine is called when an efd format structure is found in - * a committed transaction in the log. It's purpose is to cancel - * the corresponding efi if it was still in the log. To do this - * it searches the AIL for the efi with an id equal to that in the - * efd format structure. If we find it, we remove the efi from the - * AIL and free it. 
+ * This routine is called when an EFD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding EFI if it + * was still in the log. To do this it searches the AIL for the EFI with an id + * equal to that in the EFD format structure. If we find it we drop the EFD + * reference, which removes the EFI from the AIL and frees it. */ STATIC int xlog_recover_efd_pass2( @@ -2980,8 +2998,8 @@ xlog_recover_efd_pass2( efi_id = efd_formatp->efd_efi_id; /* - * Search for the efi with the id in the efd format structure - * in the AIL. + * Search for the EFI with the id in the EFD format structure in the + * AIL. */ spin_lock(&ailp->xa_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); @@ -2990,18 +3008,18 @@ xlog_recover_efd_pass2( efip = (xfs_efi_log_item_t *)lip; if (efip->efi_format.efi_id == efi_id) { /* - * xfs_trans_ail_delete() drops the - * AIL lock. + * Drop the EFD reference to the EFI. This + * removes the EFI from the AIL and frees it. */ - xfs_trans_ail_delete(ailp, lip, - SHUTDOWN_CORRUPT_INCORE); - xfs_efi_item_free(efip); + spin_unlock(&ailp->xa_lock); + xfs_efi_release(efip); spin_lock(&ailp->xa_lock); break; } } lip = xfs_trans_ail_cursor_next(ailp, &cur); } + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); @@ -3029,6 +3047,11 @@ xlog_recover_do_icreate_pass2( unsigned int count; unsigned int isize; xfs_agblock_t length; + int blks_per_cluster; + int bb_per_cluster; + int cancel_count; + int nbufs; + int i; icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; if (icl->icl_type != XFS_LI_ICREATE) { @@ -3087,22 +3110,45 @@ xlog_recover_do_icreate_pass2( } /* - * Inode buffers can be freed. Do not replay the inode initialisation as - * we could be overwriting something written after this inode buffer was - * cancelled. + * The icreate transaction can cover multiple cluster buffers and these + * buffers could have been freed and reused. Check the individual + * buffers for cancellation so we don't overwrite anything written after + * a cancellation. + */ + blks_per_cluster = xfs_icluster_size_fsb(mp); + bb_per_cluster = XFS_FSB_TO_BB(mp, blks_per_cluster); + nbufs = length / blks_per_cluster; + for (i = 0, cancel_count = 0; i < nbufs; i++) { + xfs_daddr_t daddr; + + daddr = XFS_AGB_TO_DADDR(mp, agno, + agbno + i * blks_per_cluster); + if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0)) + cancel_count++; + } + + /* + * We currently only use icreate for a single allocation at a time. This + * means we should expect either all or none of the buffers to be + * cancelled. Be conservative and skip replay if at least one buffer is + * cancelled, but warn the user that something is awry if the buffers + * are not consistent. * - * XXX: we need to iterate all buffers and only init those that are not - * cancelled. I think that a more fine grained factoring of - * xfs_ialloc_inode_init may be appropriate here to enable this to be - * done easily. + * XXX: This must be refined to only skip cancelled clusters once we use + * icreate for multiple chunk allocations. 
*/ - if (xlog_check_buffer_cancelled(log, - XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) + ASSERT(!cancel_count || cancel_count == nbufs); + if (cancel_count) { + if (cancel_count != nbufs) + xfs_warn(mp, + "WARNING: partial inode chunk cancellation, skipped icreate."); + trace_xfs_log_recover_icreate_cancel(log, icl); return 0; + } - xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, length, - be32_to_cpu(icl->icl_gen)); - return 0; + trace_xfs_log_recover_icreate_recover(log, icl); + return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, + length, be32_to_cpu(icl->icl_gen)); } STATIC void @@ -3380,14 +3426,24 @@ xlog_recover_add_to_cont_trans( char *ptr, *old_ptr; int old_len; + /* + * If the transaction is empty, the header was split across this and the + * previous record. Copy the rest of the header. + */ if (list_empty(&trans->r_itemq)) { - /* finish copying rest of trans header */ + ASSERT(len < sizeof(struct xfs_trans_header)); + if (len > sizeof(struct xfs_trans_header)) { + xfs_warn(log->l_mp, "%s: bad header length", __func__); + return -EIO; + } + xlog_recover_add_item(&trans->r_itemq); ptr = (char *)&trans->r_theader + - sizeof(xfs_trans_header_t) - len; + sizeof(struct xfs_trans_header) - len; memcpy(ptr, dp, len); return 0; } + /* take the tail entry */ item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); @@ -3436,7 +3492,19 @@ xlog_recover_add_to_trans( ASSERT(0); return -EIO; } - if (len == sizeof(xfs_trans_header_t)) + + if (len > sizeof(struct xfs_trans_header)) { + xfs_warn(log->l_mp, "%s: bad header length", __func__); + ASSERT(0); + return -EIO; + } + + /* + * The transaction header can be arbitrarily split across op + * records. If we don't have the whole thing here, copy what we + * do have and handle the rest in the next record. + */ + if (len == sizeof(struct xfs_trans_header)) xlog_recover_add_item(&trans->r_itemq); memcpy(&trans->r_theader, dp, len); return 0; @@ -3739,7 +3807,7 @@ xlog_recover_process_efi( * free the memory associated with it. */ set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); - xfs_efi_release(efip, efip->efi_format.efi_nextents); + xfs_efi_release(efip); return -EIO; } } @@ -3752,11 +3820,11 @@ xlog_recover_process_efi( for (i = 0; i < efip->efi_format.efi_nextents; i++) { extp = &(efip->efi_format.efi_extents[i]); - error = xfs_free_extent(tp, extp->ext_start, extp->ext_len); + error = xfs_trans_free_extent(tp, efdp, extp->ext_start, + extp->ext_len); if (error) goto abort_error; - xfs_trans_log_efd_extent(tp, efdp, extp->ext_start, - extp->ext_len); + } set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); @@ -3788,10 +3856,10 @@ abort_error: */ STATIC int xlog_recover_process_efis( - struct xlog *log) + struct xlog *log) { - xfs_log_item_t *lip; - xfs_efi_log_item_t *efip; + struct xfs_log_item *lip; + struct xfs_efi_log_item *efip; int error = 0; struct xfs_ail_cursor cur; struct xfs_ail *ailp; @@ -3815,7 +3883,7 @@ xlog_recover_process_efis( /* * Skip EFIs that we've already processed. */ - efip = (xfs_efi_log_item_t *)lip; + efip = container_of(lip, struct xfs_efi_log_item, efi_item); if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) { lip = xfs_trans_ail_cursor_next(ailp, &cur); continue; @@ -3835,6 +3903,50 @@ out: } /* + * A cancel occurs when the mount has failed and we're bailing out. Release all + * pending EFIs so they don't pin the AIL. 
+ */ +STATIC int +xlog_recover_cancel_efis( + struct xlog *log) +{ + struct xfs_log_item *lip; + struct xfs_efi_log_item *efip; + int error = 0; + struct xfs_ail_cursor cur; + struct xfs_ail *ailp; + + ailp = log->l_ailp; + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + while (lip != NULL) { + /* + * We're done when we see something other than an EFI. + * There should be no EFIs left in the AIL now. + */ + if (lip->li_type != XFS_LI_EFI) { +#ifdef DEBUG + for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) + ASSERT(lip->li_type != XFS_LI_EFI); +#endif + break; + } + + efip = container_of(lip, struct xfs_efi_log_item, efi_item); + + spin_unlock(&ailp->xa_lock); + xfs_efi_release(efip); + spin_lock(&ailp->xa_lock); + + lip = xfs_trans_ail_cursor_next(ailp, &cur); + } + + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); + return error; +} + +/* * This routine performs a transaction to null out a bad inode pointer * in an agi unlinked inode hash bucket. */ @@ -4527,11 +4639,13 @@ xlog_recover( xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb, XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) { xfs_warn(log->l_mp, -"Superblock has unknown incompatible log features (0x%x) enabled.\n" -"The log can not be fully and/or safely recovered by this kernel.\n" -"Please recover the log on a kernel that supports the unknown features.", +"Superblock has unknown incompatible log features (0x%x) enabled.", (log->l_mp->m_sb.sb_features_log_incompat & XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); + xfs_warn(log->l_mp, +"The log can not be fully and/or safely recovered by this kernel."); + xfs_warn(log->l_mp, +"Please recover the log on a kernel that supports the unknown features."); return -EINVAL; } @@ -4607,6 +4721,17 @@ xlog_recover_finish( return 0; } +int +xlog_recover_cancel( + struct xlog *log) +{ + int error = 0; + + if (log->l_flags & XLOG_RECOVERY_NEEDED) + error = xlog_recover_cancel_efis(log); + + return error; +} #if defined(DEBUG) /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 461e791efad7..bf92e0c037c7 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -615,14 +615,14 @@ xfs_default_resblks(xfs_mount_t *mp) */ int xfs_mountfs( - xfs_mount_t *mp) + struct xfs_mount *mp) { - xfs_sb_t *sbp = &(mp->m_sb); - xfs_inode_t *rip; - __uint64_t resblks; - uint quotamount = 0; - uint quotaflags = 0; - int error = 0; + struct xfs_sb *sbp = &(mp->m_sb); + struct xfs_inode *rip; + __uint64_t resblks; + uint quotamount = 0; + uint quotaflags = 0; + int error = 0; xfs_sb_mount_common(mp, sbp); @@ -799,7 +799,9 @@ xfs_mountfs( } /* - * log's mount-time initialization. Perform 1st part recovery if needed + * Log's mount-time initialization. The first part of recovery can place + * some items on the AIL, to be handled when recovery is finished or + * cancelled. */ error = xfs_log_mount(mp, mp->m_logdev_targp, XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), @@ -910,9 +912,9 @@ xfs_mountfs( } /* - * Finish recovering the file system. This part needed to be - * delayed until after the root and real-time bitmap inodes - * were consistently read in. + * Finish recovering the file system. This part needed to be delayed + * until after the root and real-time bitmap inodes were consistently + * read in. 
*/ error = xfs_log_mount_finish(mp); if (error) { @@ -955,8 +957,10 @@ xfs_mountfs( xfs_rtunmount_inodes(mp); out_rele_rip: IRELE(rip); + cancel_delayed_work_sync(&mp->m_reclaim_work); + xfs_reclaim_inodes(mp, SYNC_WAIT); out_log_dealloc: - xfs_log_unmount(mp); + xfs_log_mount_cancel(mp); out_fail_wait: if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_wait_buftarg(mp->m_logdev_targp); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index f4e8c06eee26..ab1bac6a3a1c 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -757,31 +757,30 @@ xfs_rtallocate_extent_size( /* * Allocate space to the bitmap or summary file, and zero it, for growfs. */ -STATIC int /* error */ +STATIC int xfs_growfs_rt_alloc( - xfs_mount_t *mp, /* file system mount point */ - xfs_extlen_t oblocks, /* old count of blocks */ - xfs_extlen_t nblocks, /* new count of blocks */ - xfs_inode_t *ip) /* inode (bitmap/summary) */ + struct xfs_mount *mp, /* file system mount point */ + xfs_extlen_t oblocks, /* old count of blocks */ + xfs_extlen_t nblocks, /* new count of blocks */ + struct xfs_inode *ip) /* inode (bitmap/summary) */ { - xfs_fileoff_t bno; /* block number in file */ - xfs_buf_t *bp; /* temporary buffer for zeroing */ - int committed; /* transaction committed flag */ - xfs_daddr_t d; /* disk block address */ - int error; /* error return value */ - xfs_fsblock_t firstblock; /* first block allocated in xaction */ - xfs_bmap_free_t flist; /* list of freed blocks */ - xfs_fsblock_t fsbno; /* filesystem block for bno */ - xfs_bmbt_irec_t map; /* block map output */ - int nmap; /* number of block maps */ - int resblks; /* space reservation */ + xfs_fileoff_t bno; /* block number in file */ + struct xfs_buf *bp; /* temporary buffer for zeroing */ + int committed; /* transaction committed flag */ + xfs_daddr_t d; /* disk block address */ + int error; /* error return value */ + xfs_fsblock_t firstblock;/* first block allocated in xaction */ + struct xfs_bmap_free flist; /* list of freed blocks */ + xfs_fsblock_t fsbno; /* filesystem block for bno */ + struct xfs_bmbt_irec map; /* block map output */ + int nmap; /* number of block maps */ + int resblks; /* space reservation */ + struct xfs_trans *tp; /* * Allocate space to the file, as necessary. */ while (oblocks < nblocks) { - xfs_trans_t *tp; - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); /* @@ -790,7 +789,7 @@ xfs_growfs_rt_alloc( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc, resblks, 0); if (error) - goto error_cancel; + goto out_trans_cancel; /* * Lock the inode. */ @@ -808,16 +807,16 @@ xfs_growfs_rt_alloc( if (!error && nmap < 1) error = -ENOSPC; if (error) - goto error_cancel; + goto out_bmap_cancel; /* * Free any blocks freed up in the transaction, then commit. */ error = xfs_bmap_finish(&tp, &flist, &committed); if (error) - goto error_cancel; + goto out_bmap_cancel; error = xfs_trans_commit(tp); if (error) - goto error; + return error; /* * Now we need to clear the allocated blocks. * Do this one block per transaction, to keep it simple. @@ -832,7 +831,7 @@ xfs_growfs_rt_alloc( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero, 0, 0); if (error) - goto error_cancel; + goto out_trans_cancel; /* * Lock the bitmap inode. 
*/ @@ -846,9 +845,7 @@ xfs_growfs_rt_alloc( mp->m_bsize, 0); if (bp == NULL) { error = -EIO; -error_cancel: - xfs_trans_cancel(tp); - goto error; + goto out_trans_cancel; } memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); @@ -857,16 +854,20 @@ error_cancel: */ error = xfs_trans_commit(tp); if (error) - goto error; + return error; } /* * Go on to the next extent, if any. */ oblocks = map.br_startoff + map.br_blockcount; } + return 0; -error: +out_bmap_cancel: + xfs_bmap_cancel(&flist); +out_trans_cancel: + xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 1fb16562c159..904f637cfa5f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -261,16 +261,8 @@ xfs_parseargs( mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL); if (!mp->m_rtname) return -ENOMEM; - } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { - if (!value || !*value) { - xfs_warn(mp, "%s option requires an argument", - this_char); - return -EINVAL; - } - if (kstrtoint(value, 10, &iosize)) - return -EINVAL; - iosizelog = ffs(iosize) - 1; - } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { + } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE) || + !strcmp(this_char, MNTOPT_BIOSIZE)) { if (!value || !*value) { xfs_warn(mp, "%s option requires an argument", this_char); @@ -511,9 +503,9 @@ xfs_showargs( seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); if (mp->m_logname) - seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname); + seq_show_option(m, MNTOPT_LOGDEV, mp->m_logname); if (mp->m_rtname) - seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname); + seq_show_option(m, MNTOPT_RTDEV, mp->m_rtname); if (mp->m_dalign > 0) seq_printf(m, "," MNTOPT_SUNIT "=%d", @@ -1528,6 +1520,10 @@ xfs_fs_fill_super( } } + if (xfs_sb_version_hassparseinodes(&mp->m_sb)) + xfs_alert(mp, + "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!"); + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 4be27b0210af..996481eeb491 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -240,7 +240,8 @@ xfs_symlink( if (error) goto out_trans_cancel; - xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | + XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); unlock_dp_on_error = true; /* @@ -288,7 +289,7 @@ xfs_symlink( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); unlock_dp_on_error = false; /* @@ -421,7 +422,7 @@ out_release_inode: xfs_qm_dqrele(pdqp); if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); return error; } @@ -501,7 +502,7 @@ xfs_inactive_symlink_rmt( /* * Unmap the dead block(s) to the free_list. 
 	 */
-	error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
+	error = xfs_bunmapi(tp, ip, 0, size, 0, nmaps,
 			&first_block, &free_list, &done);
 	if (error)
 		goto error_bmap_cancel;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 8d916d33d93d..5ed36b1e04c1 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -687,6 +687,7 @@ DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
 
 DEFINE_INODE_EVENT(xfs_filemap_fault);
+DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
 DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
 
 DECLARE_EVENT_CLASS(xfs_iref_class,
@@ -2089,6 +2090,40 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
 DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
 DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
 
+DECLARE_EVENT_CLASS(xfs_log_recover_icreate_item_class,
+	TP_PROTO(struct xlog *log, struct xfs_icreate_log *in_f),
+	TP_ARGS(log, in_f),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(unsigned int, count)
+		__field(unsigned int, isize)
+		__field(xfs_agblock_t, length)
+		__field(unsigned int, gen)
+	),
+	TP_fast_assign(
+		__entry->dev = log->l_mp->m_super->s_dev;
+		__entry->agno = be32_to_cpu(in_f->icl_ag);
+		__entry->agbno = be32_to_cpu(in_f->icl_agbno);
+		__entry->count = be32_to_cpu(in_f->icl_count);
+		__entry->isize = be32_to_cpu(in_f->icl_isize);
+		__entry->length = be32_to_cpu(in_f->icl_length);
+		__entry->gen = be32_to_cpu(in_f->icl_gen);
+	),
+	TP_printk("dev %d:%d agno %u agbno %u count %u isize %u length %u "
+		  "gen %u", MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno, __entry->agbno, __entry->count, __entry->isize,
+		  __entry->length, __entry->gen)
+)
+#define DEFINE_LOG_RECOVER_ICREATE_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_icreate_item_class, name, \
+	TP_PROTO(struct xlog *log, struct xfs_icreate_log *in_f), \
+	TP_ARGS(log, in_f))
+
+DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_cancel);
+DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_recover);
+
 DECLARE_EVENT_CLASS(xfs_discard_class,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
 		 xfs_agblock_t agbno, xfs_extlen_t len),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 0582a27107d4..a0ab1dae9c31 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1019,9 +1019,10 @@ xfs_trans_cancel(
  * chunk we've been working on and get a new transaction to continue.
  */
 int
-xfs_trans_roll(
+__xfs_trans_roll(
 	struct xfs_trans	**tpp,
-	struct xfs_inode	*dp)
+	struct xfs_inode	*dp,
+	int			*committed)
 {
 	struct xfs_trans	*trans;
 	struct xfs_trans_res	tres;
@@ -1052,6 +1053,7 @@ xfs_trans_roll(
 	if (error)
 		return error;
 
+	*committed = 1;
 	trans = *tpp;
 
 	/*
@@ -1074,3 +1076,12 @@ xfs_trans_roll(
 	xfs_trans_ijoin(trans, dp, 0);
 	return 0;
 }
+
+int
+xfs_trans_roll(
+	struct xfs_trans	**tpp,
+	struct xfs_inode	*dp)
+{
+	int			committed = 0;
+	return __xfs_trans_roll(tpp, dp, &committed);
+}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 3b21b4e5e467..4643070d7cae 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -213,7 +213,6 @@ void		xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
 void		xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
 void		xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
 struct xfs_efi_log_item	*xfs_trans_get_efi(xfs_trans_t *, uint);
-void		xfs_efi_release(struct xfs_efi_log_item *, uint);
 void		xfs_trans_log_efi_extent(xfs_trans_t *,
 					 struct xfs_efi_log_item *,
 					 xfs_fsblock_t,
@@ -221,11 +220,11 @@ void		xfs_trans_log_efi_extent(xfs_trans_t *,
 struct xfs_efd_log_item	*xfs_trans_get_efd(xfs_trans_t *,
 				  struct xfs_efi_log_item *,
 				  uint);
-void		xfs_trans_log_efd_extent(xfs_trans_t *,
-					 struct xfs_efd_log_item *,
-					 xfs_fsblock_t,
-					 xfs_extlen_t);
+int		xfs_trans_free_extent(struct xfs_trans *,
+				      struct xfs_efd_log_item *, xfs_fsblock_t,
+				      xfs_extlen_t);
 int		xfs_trans_commit(struct xfs_trans *);
+int		__xfs_trans_roll(struct xfs_trans **, struct xfs_inode *, int *);
 int		xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
 void		xfs_trans_cancel(xfs_trans_t *);
 int		xfs_trans_ail_init(struct xfs_mount *);
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 284397dd7990..a96ae540eb62 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -25,6 +25,7 @@
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
 #include "xfs_extfree_item.h"
+#include "xfs_alloc.h"
 
 /*
  * This routine is called to allocate an "extent free intention"
@@ -108,19 +109,30 @@ xfs_trans_get_efd(xfs_trans_t *tp,
 }
 
 /*
- * This routine is called to indicate that the described
- * extent is to be logged as having been freed.  It should
- * be called once for each extent freed.
+ * Free an extent and log it to the EFD. Note that the transaction is marked
+ * dirty regardless of whether the extent free succeeds or fails to support the
+ * EFI/EFD lifecycle rules.
  */
-void
-xfs_trans_log_efd_extent(xfs_trans_t *tp,
-			 xfs_efd_log_item_t *efdp,
-			 xfs_fsblock_t start_block,
-			 xfs_extlen_t ext_len)
+int
+xfs_trans_free_extent(
+	struct xfs_trans	*tp,
+	struct xfs_efd_log_item	*efdp,
+	xfs_fsblock_t		start_block,
+	xfs_extlen_t		ext_len)
 {
 	uint			next_extent;
-	xfs_extent_t		*extp;
+	struct xfs_extent	*extp;
+	int			error;
 
+	error = xfs_free_extent(tp, start_block, ext_len);
+
+	/*
+	 * Mark the transaction dirty, even on error. This ensures the
+	 * transaction is aborted, which:
+	 *
+	 * 1.) releases the EFI and frees the EFD
+	 * 2.) shuts down the filesystem
+	 */
 	tp->t_flags |= XFS_TRANS_DIRTY;
 	efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY;
 
@@ -130,4 +142,6 @@ xfs_trans_log_efd_extent(xfs_trans_t *tp,
 	extp->ext_start = start_block;
 	extp->ext_len = ext_len;
 	efdp->efd_next_extent++;
+
+	return error;
 }
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 1b736294558a..49931b72da8a 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -119,6 +119,21 @@ xfs_trans_ail_delete(
 	xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type);
 }
 
+static inline void
+xfs_trans_ail_remove(
+	struct xfs_log_item	*lip,
+	int			shutdown_type)
+{
+	struct xfs_ail		*ailp = lip->li_ailp;
+
+	spin_lock(&ailp->xa_lock);
+	/* xfs_trans_ail_delete() drops the AIL lock */
+	if (lip->li_flags & XFS_LI_IN_AIL)
+		xfs_trans_ail_delete(ailp, lip, shutdown_type);
+	else
+		spin_unlock(&ailp->xa_lock);
+}
+
 void			xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
 void			xfs_ail_push_all(struct xfs_ail *);
 void			xfs_ail_push_all_sync(struct xfs_ail *);
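
For illustration only, not part of the diff above: the xfs_trans_priv.h hunk adds an xfs_trans_ail_remove() inline helper that wraps the AIL-lock/XFS_LI_IN_AIL check around xfs_trans_ail_delete(). A minimal, hypothetical caller could look like the sketch below; xfs_example_item_release() is an invented name, and SHUTDOWN_LOG_IO_ERROR is assumed to be the shutdown reason a real log-item release path would pass.

/*
 * Hypothetical sketch (not from the patch): tear down a log item using
 * the new helper.  xfs_trans_ail_remove() takes the AIL lock, checks
 * XFS_LI_IN_AIL and only then calls xfs_trans_ail_delete(), which drops
 * the lock, so callers no longer need to open-code that pattern.
 */
STATIC void
xfs_example_item_release(
	struct xfs_log_item	*lip)
{
	/* Drop the item from the AIL; harmless if it was never inserted. */
	xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);

	/* ...free the item's private memory here... */
}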