diff options
Diffstat (limited to 'fs')
167 files changed, 2691 insertions, 1621 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 4802d75b3cf7..03c9e325bfbc 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -288,7 +288,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) fl->fl_end = OFFSET_MAX; else fl->fl_end = glock.start + glock.length - 1; - fl->fl_pid = glock.proc_id; + fl->fl_pid = -glock.proc_id; } kfree(glock.client_id); return res; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 879ff9c7ffd0..6466153f2bf0 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -664,8 +664,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top) { unsigned long random_variable = 0; - if ((current->flags & PF_RANDOMIZE) && - !(current->personality & ADDR_NO_RANDOMIZE)) { + if (current->flags & PF_RANDOMIZE) { random_variable = get_random_long(); random_variable &= STACK_RND_MASK; random_variable <<= PAGE_SHIFT; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 69ec23daa25e..a1e6860b6f46 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -574,7 +574,7 @@ static int load_flat_file(struct linux_binprm *bprm, MAX_SHARED_LIBS * sizeof(unsigned long), FLAT_DATA_ALIGN); - pr_debug("Allocated data+bss+stack (%ld bytes): %lx\n", + pr_debug("Allocated data+bss+stack (%u bytes): %lx\n", data_len + bss_len + stack_len, datapos); fpos = ntohl(hdr->data_start); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 080e2ebb8aa0..f45b61fe9a9a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3516,7 +3516,7 @@ static blk_status_t wait_dev_flush(struct btrfs_device *device) struct bio *bio = device->flush_bio; if (!device->flush_bio_sent) - return 0; + return BLK_STS_OK; device->flush_bio_sent = 0; wait_for_completion_io(&device->flush_wait); @@ -3563,7 +3563,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info) continue; write_dev_flush(dev); - dev->last_flush_error = 0; + dev->last_flush_error = BLK_STS_OK; } /* wait for all the barriers */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 375f8c728d91..e3b0b4196d3d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4825,10 +4825,6 @@ skip_async: else flush = BTRFS_RESERVE_NO_FLUSH; spin_lock(&space_info->lock); - if (can_overcommit(fs_info, space_info, orig, flush, false)) { - spin_unlock(&space_info->lock); - break; - } if (list_empty(&space_info->tickets) && list_empty(&space_info->priority_tickets)) { spin_unlock(&space_info->lock); @@ -7589,6 +7585,10 @@ search: u64 offset; int cached; + /* If the block group is read-only, we can skip it entirely. */ + if (unlikely(block_group->ro)) + continue; + btrfs_grab_block_group(block_group, delalloc); search_start = block_group->key.objectid; @@ -7624,8 +7624,6 @@ have_block_group: if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) goto loop; - if (unlikely(block_group->ro)) - goto loop; /* * Ok we want to try and use the cluster allocator, so @@ -7839,6 +7837,7 @@ loop: failed_alloc = false; BUG_ON(index != get_block_group_index(block_group)); btrfs_release_block_group(block_group, delalloc); + cond_resched(); } up_read(&space_info->groups_sem); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 95c212037095..24bcd5cd9cf2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7924,11 +7924,12 @@ err: return ret; } -static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio, - int mirror_num) +static inline blk_status_t submit_dio_repair_bio(struct inode *inode, + struct bio *bio, + int mirror_num) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int ret; + blk_status_t ret; BUG_ON(bio_op(bio) == REQ_OP_WRITE); @@ -7980,10 +7981,10 @@ static int btrfs_check_dio_repairable(struct inode *inode, return 1; } -static int dio_read_error(struct inode *inode, struct bio *failed_bio, - struct page *page, unsigned int pgoff, - u64 start, u64 end, int failed_mirror, - bio_end_io_t *repair_endio, void *repair_arg) +static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio, + struct page *page, unsigned int pgoff, + u64 start, u64 end, int failed_mirror, + bio_end_io_t *repair_endio, void *repair_arg) { struct io_failure_record *failrec; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -7993,18 +7994,19 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, int read_mode = 0; int segs; int ret; + blk_status_t status; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); ret = btrfs_get_io_failure_record(inode, start, end, &failrec); if (ret) - return ret; + return errno_to_blk_status(ret); ret = btrfs_check_dio_repairable(inode, failed_bio, failrec, failed_mirror); if (!ret) { free_io_failure(failure_tree, io_tree, failrec); - return -EIO; + return BLK_STS_IOERR; } segs = bio_segments(failed_bio); @@ -8022,13 +8024,13 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n", read_mode, failrec->this_mirror, failrec->in_validation); - ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror); - if (ret) { + status = submit_dio_repair_bio(inode, bio, failrec->this_mirror); + if (status) { free_io_failure(failure_tree, io_tree, failrec); bio_put(bio); } - return ret; + return status; } struct btrfs_retry_complete { @@ -8065,8 +8067,8 @@ end: bio_put(bio); } -static int __btrfs_correct_data_nocsum(struct inode *inode, - struct btrfs_io_bio *io_bio) +static blk_status_t __btrfs_correct_data_nocsum(struct inode *inode, + struct btrfs_io_bio *io_bio) { struct btrfs_fs_info *fs_info; struct bio_vec bvec; @@ -8076,8 +8078,8 @@ static int __btrfs_correct_data_nocsum(struct inode *inode, unsigned int pgoff; u32 sectorsize; int nr_sectors; - int ret; - int err = 0; + blk_status_t ret; + blk_status_t err = BLK_STS_OK; fs_info = BTRFS_I(inode)->root->fs_info; sectorsize = fs_info->sectorsize; @@ -8183,11 +8185,12 @@ static blk_status_t __btrfs_subio_endio_read(struct inode *inode, int csum_pos; bool uptodate = (err == 0); int ret; + blk_status_t status; fs_info = BTRFS_I(inode)->root->fs_info; sectorsize = fs_info->sectorsize; - err = 0; + err = BLK_STS_OK; start = io_bio->logical; done.inode = inode; io_bio->bio.bi_iter = io_bio->iter; @@ -8209,12 +8212,12 @@ try_again: done.start = start; init_completion(&done.done); - ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page, - pgoff, start, start + sectorsize - 1, - io_bio->mirror_num, - btrfs_retry_endio, &done); - if (ret) { - err = errno_to_blk_status(ret); + status = dio_read_error(inode, &io_bio->bio, bvec.bv_page, + pgoff, start, start + sectorsize - 1, + io_bio->mirror_num, btrfs_retry_endio, + &done); + if (status) { + err = status; goto next; } @@ -8250,7 +8253,7 @@ static blk_status_t btrfs_subio_endio_read(struct inode *inode, if (unlikely(err)) return __btrfs_correct_data_nocsum(inode, io_bio); else - return 0; + return BLK_STS_OK; } else { return __btrfs_subio_endio_read(inode, io_bio, err); } @@ -8423,9 +8426,9 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, return 0; } -static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, - u64 file_offset, int skip_sum, - int async_submit) +static inline blk_status_t +__btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset, + int skip_sum, int async_submit) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; @@ -8488,6 +8491,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, int clone_offset = 0; int clone_len; int ret; + blk_status_t status; map_length = orig_bio->bi_iter.bi_size; submit_len = map_length; @@ -8537,9 +8541,9 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, */ atomic_inc(&dip->pending_bios); - ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, - async_submit); - if (ret) { + status = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, + async_submit); + if (status) { bio_put(bio); atomic_dec(&dip->pending_bios); goto out_err; @@ -8557,9 +8561,9 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, } while (submit_len > 0); submit: - ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, - async_submit); - if (!ret) + status = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, + async_submit); + if (!status) return 0; bio_put(bio); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 208638384cd2..2cf6ba40f7c4 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -905,7 +905,7 @@ static void raid_write_end_io(struct bio *bio) if (!atomic_dec_and_test(&rbio->stripes_pending)) return; - err = 0; + err = BLK_STS_OK; /* OK, we have read all the stripes we need to. */ max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? @@ -1324,7 +1324,7 @@ write_data: return; cleanup: - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); } /* @@ -1475,7 +1475,7 @@ static void raid_rmw_end_io(struct bio *bio) cleanup: - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); } static void async_rmw_stripe(struct btrfs_raid_bio *rbio) @@ -1579,7 +1579,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) return 0; cleanup: - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); return -EIO; finish: @@ -1795,12 +1795,12 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) void **pointers; int faila = -1, failb = -1; struct page *page; - int err; + blk_status_t err; int i; pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); if (!pointers) { - err = -ENOMEM; + err = BLK_STS_RESOURCE; goto cleanup_io; } @@ -1856,7 +1856,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) * a bad data or Q stripe. * TODO, we should redo the xor here. */ - err = -EIO; + err = BLK_STS_IOERR; goto cleanup; } /* @@ -1882,7 +1882,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) { if (rbio->bbio->raid_map[faila] == RAID5_P_STRIPE) { - err = -EIO; + err = BLK_STS_IOERR; goto cleanup; } /* @@ -1954,13 +1954,13 @@ pstripe: } } - err = 0; + err = BLK_STS_OK; cleanup: kfree(pointers); cleanup_io: if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { - if (err == 0) + if (err == BLK_STS_OK) cache_rbio_pages(rbio); else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); @@ -1968,7 +1968,7 @@ cleanup_io: rbio_orig_end_io(rbio, err); } else if (rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { rbio_orig_end_io(rbio, err); - } else if (err == 0) { + } else if (err == BLK_STS_OK) { rbio->faila = -1; rbio->failb = -1; @@ -2005,7 +2005,7 @@ static void raid_recover_end_io(struct bio *bio) return; if (atomic_read(&rbio->error) > rbio->bbio->max_errors) - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); else __raid_recover_end_io(rbio); } @@ -2104,7 +2104,7 @@ out: cleanup: if (rbio->operation == BTRFS_RBIO_READ_REBUILD || rbio->operation == BTRFS_RBIO_REBUILD_MISSING) - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); return -EIO; } @@ -2431,7 +2431,7 @@ submit_write: nr_data = bio_list_size(&bio_list); if (!nr_data) { /* Every parity is right */ - rbio_orig_end_io(rbio, 0); + rbio_orig_end_io(rbio, BLK_STS_OK); return; } @@ -2451,7 +2451,7 @@ submit_write: return; cleanup: - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); } static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) @@ -2519,7 +2519,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) return; cleanup: - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); } /* @@ -2633,7 +2633,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) return; cleanup: - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); return; finish: diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f20ef211a73d..3a11ae63676e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2153,8 +2153,7 @@ process_leaf: u32 this_len = sizeof(*di) + name_len + data_len; char *name; - ret = verify_dir_item(fs_info, path->nodes[0], - path->slots[0], di); + ret = verify_dir_item(fs_info, path->nodes[0], i, di); if (ret) { ret = -EIO; goto out; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5eb7217738ed..bd679bc7a1a9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2702,7 +2702,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, mutex_lock(&fs_info->chunk_mutex); old_total = btrfs_super_total_bytes(super_copy); - diff = new_size - device->total_bytes; + diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); if (new_size <= device->total_bytes || device->is_tgtdev_for_dev_replace) { @@ -4406,7 +4406,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) u64 diff; new_size = round_down(new_size, fs_info->sectorsize); - diff = old_size - new_size; + diff = round_down(old_size - new_size, fs_info->sectorsize); if (device->is_tgtdev_for_dev_replace) return -EINVAL; @@ -6212,8 +6212,8 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) } } -int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num, int async_submit) +blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, + int mirror_num, int async_submit) { struct btrfs_device *dev; struct bio *first_bio = bio; @@ -6233,7 +6233,7 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, &map_length, &bbio, mirror_num, 1); if (ret) { btrfs_bio_counter_dec(fs_info); - return ret; + return errno_to_blk_status(ret); } total_devs = bbio->num_stripes; @@ -6256,7 +6256,7 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, } btrfs_bio_counter_dec(fs_info); - return ret; + return errno_to_blk_status(ret); } if (map_length < length) { @@ -6283,7 +6283,7 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, dev_nr, async_submit); } btrfs_bio_counter_dec(fs_info); - return 0; + return BLK_STS_OK; } struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6f45fd60d15a..93277fc60930 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -74,7 +74,7 @@ struct btrfs_device { int missing; int can_discard; int is_tgtdev_for_dev_replace; - int last_flush_error; + blk_status_t last_flush_error; int flush_bio_sent; #ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED @@ -416,8 +416,8 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 type); void btrfs_mapping_init(struct btrfs_mapping_tree *tree); void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); -int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num, int async_submit); +blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, + int mirror_num, int async_submit); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 50836280a6f8..1bc709fe330a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -189,7 +189,7 @@ static int ceph_releasepage(struct page *page, gfp_t g) /* * read a single page, without unlocking it. */ -static int readpage_nounlock(struct file *filp, struct page *page) +static int ceph_do_readpage(struct file *filp, struct page *page) { struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); @@ -219,7 +219,7 @@ static int readpage_nounlock(struct file *filp, struct page *page) err = ceph_readpage_from_fscache(inode, page); if (err == 0) - goto out; + return -EINPROGRESS; dout("readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); @@ -249,8 +249,11 @@ out: static int ceph_readpage(struct file *filp, struct page *page) { - int r = readpage_nounlock(filp, page); - unlock_page(page); + int r = ceph_do_readpage(filp, page); + if (r != -EINPROGRESS) + unlock_page(page); + else + r = 0; return r; } @@ -1237,7 +1240,7 @@ retry_locked: goto retry_locked; r = writepage_nounlock(page, NULL); if (r < 0) - goto fail_nosnap; + goto fail_unlock; goto retry_locked; } @@ -1265,11 +1268,14 @@ retry_locked: } /* we need to read it. */ - r = readpage_nounlock(file, page); - if (r < 0) - goto fail_nosnap; + r = ceph_do_readpage(file, page); + if (r < 0) { + if (r == -EINPROGRESS) + return -EAGAIN; + goto fail_unlock; + } goto retry_locked; -fail_nosnap: +fail_unlock: unlock_page(page); return r; } diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index fd1172823f86..337f88673ed9 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -297,13 +297,7 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) } } -static void ceph_vfs_readpage_complete(struct page *page, void *data, int error) -{ - if (!error) - SetPageUptodate(page); -} - -static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int error) +static void ceph_readpage_from_fscache_complete(struct page *page, void *data, int error) { if (!error) SetPageUptodate(page); @@ -331,7 +325,7 @@ int ceph_readpage_from_fscache(struct inode *inode, struct page *page) return -ENOBUFS; ret = fscache_read_or_alloc_page(ci->fscache, page, - ceph_vfs_readpage_complete, NULL, + ceph_readpage_from_fscache_complete, NULL, GFP_KERNEL); switch (ret) { @@ -360,7 +354,7 @@ int ceph_readpages_from_fscache(struct inode *inode, return -ENOBUFS; ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages, - ceph_vfs_readpage_complete_unlock, + ceph_readpage_from_fscache_complete, NULL, mapping_gfp_mask(mapping)); switch (ret) { diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index e071d23f6148..ef7240ace576 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -271,6 +271,11 @@ out: if (ret < 0) err = ret; dput(last); + /* last_name no longer match cache index */ + if (fi->readdir_cache_idx >= 0) { + fi->readdir_cache_idx = -1; + fi->dir_release_count = 0; + } } return err; } diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 64ae74472046..8cd63e8123d8 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -79,7 +79,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, err = ceph_mdsc_do_request(mdsc, inode, req); if (operation == CEPH_MDS_OP_GETFILELOCK) { - fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid); + fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid); if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) fl->fl_type = F_RDLCK; else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type) diff --git a/fs/char_dev.c b/fs/char_dev.c index fb8507f521b2..ebcc8fb3fa66 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -28,6 +28,8 @@ static struct kobj_map *cdev_map; static DEFINE_MUTEX(chrdevs_lock); +#define CHRDEV_MAJOR_HASH_SIZE 255 + static struct char_device_struct { struct char_device_struct *next; unsigned int major; @@ -49,16 +51,39 @@ void chrdev_show(struct seq_file *f, off_t offset) { struct char_device_struct *cd; - if (offset < CHRDEV_MAJOR_HASH_SIZE) { - mutex_lock(&chrdevs_lock); - for (cd = chrdevs[offset]; cd; cd = cd->next) + mutex_lock(&chrdevs_lock); + for (cd = chrdevs[major_to_index(offset)]; cd; cd = cd->next) { + if (cd->major == offset) seq_printf(f, "%3d %s\n", cd->major, cd->name); - mutex_unlock(&chrdevs_lock); } + mutex_unlock(&chrdevs_lock); } #endif /* CONFIG_PROC_FS */ +static int find_dynamic_major(void) +{ + int i; + struct char_device_struct *cd; + + for (i = ARRAY_SIZE(chrdevs)-1; i > CHRDEV_MAJOR_DYN_END; i--) { + if (chrdevs[i] == NULL) + return i; + } + + for (i = CHRDEV_MAJOR_DYN_EXT_START; + i > CHRDEV_MAJOR_DYN_EXT_END; i--) { + for (cd = chrdevs[major_to_index(i)]; cd; cd = cd->next) + if (cd->major == i) + break; + + if (cd == NULL || cd->major != i) + return i; + } + + return -EBUSY; +} + /* * Register a single major with a specified minor range. * @@ -84,22 +109,21 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, mutex_lock(&chrdevs_lock); - /* temporary */ if (major == 0) { - for (i = ARRAY_SIZE(chrdevs)-1; i > 0; i--) { - if (chrdevs[i] == NULL) - break; - } - - if (i < CHRDEV_MAJOR_DYN_END) - pr_warn("CHRDEV \"%s\" major number %d goes below the dynamic allocation range\n", - name, i); - - if (i == 0) { - ret = -EBUSY; + ret = find_dynamic_major(); + if (ret < 0) { + pr_err("CHRDEV \"%s\" dynamic allocation region is full\n", + name); goto out; } - major = i; + major = ret; + } + + if (major >= CHRDEV_MAJOR_MAX) { + pr_err("CHRDEV \"%s\" major requested (%d) is greater than the maximum (%d)\n", + name, major, CHRDEV_MAJOR_MAX); + ret = -EINVAL; + goto out; } cd->major = major; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 72a53bd19865..118a63e7e221 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2522,7 +2522,7 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon, pLockData->fl_start = le64_to_cpu(parm_data->start); pLockData->fl_end = pLockData->fl_start + le64_to_cpu(parm_data->length) - 1; - pLockData->fl_pid = le32_to_cpu(parm_data->pid); + pLockData->fl_pid = -le32_to_cpu(parm_data->pid); } } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 59647eb72c5f..83a8f52cd879 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1223,6 +1223,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, char *tmp_end, *value; char delim; bool got_ip = false; + bool got_version = false; unsigned short port = 0; struct sockaddr *dstaddr = (struct sockaddr *)&vol->dstaddr; @@ -1874,24 +1875,35 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, pr_warn("CIFS: server netbiosname longer than 15 truncated.\n"); break; case Opt_ver: + /* version of mount userspace tools, not dialect */ string = match_strdup(args); if (string == NULL) goto out_nomem; + /* If interface changes in mount.cifs bump to new ver */ if (strncasecmp(string, "1", 1) == 0) { + if (strlen(string) > 1) { + pr_warn("Bad mount helper ver=%s. Did " + "you want SMB1 (CIFS) dialect " + "and mean to type vers=1.0 " + "instead?\n", string); + goto cifs_parse_mount_err; + } /* This is the default */ break; } /* For all other value, error */ - pr_warn("CIFS: Invalid version specified\n"); + pr_warn("CIFS: Invalid mount helper version specified\n"); goto cifs_parse_mount_err; case Opt_vers: + /* protocol version (dialect) */ string = match_strdup(args); if (string == NULL) goto out_nomem; if (cifs_parse_smb_version(string, vol) != 0) goto cifs_parse_mount_err; + got_version = true; break; case Opt_sec: string = match_strdup(args); @@ -1973,6 +1985,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, else if (override_gid == 1) pr_notice("CIFS: ignoring forcegid mount option specified with no gid= option.\n"); + if (got_version == false) + pr_warn("No dialect specified on mount. Default has changed to " + "a more secure dialect, SMB3 (vers=3.0), from CIFS " + "(SMB1). To use the less secure SMB1 dialect to access " + "old servers which do not support SMB3 specify vers=1.0" + " on mount. For somewhat newer servers such as Windows " + "7 try vers=2.1.\n"); + kfree(mountdata_copy); return 0; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 56366e984076..e702d48bd023 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -194,15 +194,20 @@ cifs_bp_rename_retry: } /* + * Don't allow path components longer than the server max. * Don't allow the separator character in a path component. * The VFS will not allow "/", but "\" is allowed by posix. */ static int -check_name(struct dentry *direntry) +check_name(struct dentry *direntry, struct cifs_tcon *tcon) { struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); int i; + if (unlikely(direntry->d_name.len > + le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength))) + return -ENAMETOOLONG; + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) { for (i = 0; i < direntry->d_name.len; i++) { if (direntry->d_name.name[i] == '\\') { @@ -500,10 +505,6 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, return finish_no_open(file, res); } - rc = check_name(direntry); - if (rc) - return rc; - xid = get_xid(); cifs_dbg(FYI, "parent inode = 0x%p name is: %pd and dentry = 0x%p\n", @@ -516,6 +517,11 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, } tcon = tlink_tcon(tlink); + + rc = check_name(direntry, tcon); + if (rc) + goto out_free_xid; + server = tcon->ses->server; if (server->ops->new_lease_key) @@ -776,7 +782,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } pTcon = tlink_tcon(tlink); - rc = check_name(direntry); + rc = check_name(direntry, pTcon); if (rc) goto lookup_out; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 5fb2fc2d0080..7aa67206f6da 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -514,7 +514,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) * No tcon so can't do * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]); */ - if (rc != 0) + if (rc == -EOPNOTSUPP) { + cifs_dbg(VFS, "Dialect not supported by server. Consider " + "specifying vers=1.0 or vers=2.1 on mount for accessing" + " older servers\n"); + goto neg_exit; + } else if (rc != 0) goto neg_exit; cifs_dbg(FYI, "mode 0x%x\n", rsp->SecurityMode); @@ -3219,8 +3224,8 @@ copy_fs_info_to_kstatfs(struct smb2_fs_full_size_info *pfs_inf, kst->f_bsize = le32_to_cpu(pfs_inf->BytesPerSector) * le32_to_cpu(pfs_inf->SectorsPerAllocationUnit); kst->f_blocks = le64_to_cpu(pfs_inf->TotalAllocationUnits); - kst->f_bfree = le64_to_cpu(pfs_inf->ActualAvailableAllocationUnits); - kst->f_bavail = le64_to_cpu(pfs_inf->CallerAvailableAllocationUnits); + kst->f_bfree = kst->f_bavail = + le64_to_cpu(pfs_inf->CallerAvailableAllocationUnits); return; } diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 18700fd25a0b..2826882c81d1 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -84,8 +84,8 @@ #define NUMBER_OF_SMB2_COMMANDS 0x0013 -/* BB FIXME - analyze following length BB */ -#define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */ +/* 4 len + 52 transform hdr + 64 hdr + 56 create rsp */ +#define MAX_SMB2_HDR_SIZE 0x00b0 #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) #define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd) @@ -646,11 +646,10 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, pte_t pte, *ptep = NULL; pmd_t *pmdp = NULL; spinlock_t *ptl; - bool changed; i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { - unsigned long address; + unsigned long address, start, end; cond_resched(); @@ -658,8 +657,13 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, continue; address = pgoff_address(index, vma); - changed = false; - if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl)) + + /* + * Note because we provide start/end to follow_pte_pmd it will + * call mmu_notifier_invalidate_range_start() on our behalf + * before taking any lock. + */ + if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl)) continue; if (pmdp) { @@ -676,7 +680,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, pmd = pmd_wrprotect(pmd); pmd = pmd_mkclean(pmd); set_pmd_at(vma->vm_mm, address, pmdp, pmd); - changed = true; + mmu_notifier_invalidate_range(vma->vm_mm, start, end); unlock_pmd: spin_unlock(ptl); #endif @@ -691,13 +695,12 @@ unlock_pmd: pte = pte_wrprotect(pte); pte = pte_mkclean(pte); set_pte_at(vma->vm_mm, address, ptep, pte); - changed = true; + mmu_notifier_invalidate_range(vma->vm_mm, start, end); unlock_pte: pte_unmap_unlock(ptep, ptl); } - if (changed) - mmu_notifier_invalidate_page(vma->vm_mm, address); + mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); } i_mmap_unlock_read(mapping); } @@ -1383,6 +1386,16 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); + /* + * Make sure that the faulting address's PMD offset (color) matches + * the PMD offset from the start of the file. This is necessary so + * that a PMD range in the page table overlaps exactly with a PMD + * range in the radix tree. + */ + if ((vmf->pgoff & PG_PMD_COLOUR) != + ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) + goto fallback; + /* Fall back to PTEs if we're going to COW */ if (write && !(vma->vm_flags & VM_SHARED)) goto fallback; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 108df2e3602c..7eae33ffa3fc 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -133,6 +133,50 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) return sb->s_fs_info; } +static int devpts_ptmx_path(struct path *path) +{ + struct super_block *sb; + int err; + + /* Has the devpts filesystem already been found? */ + if (path->mnt->mnt_sb->s_magic == DEVPTS_SUPER_MAGIC) + return 0; + + /* Is a devpts filesystem at "pts" in the same directory? */ + err = path_pts(path); + if (err) + return err; + + /* Is the path the root of a devpts filesystem? */ + sb = path->mnt->mnt_sb; + if ((sb->s_magic != DEVPTS_SUPER_MAGIC) || + (path->mnt->mnt_root != sb->s_root)) + return -ENODEV; + + return 0; +} + +struct vfsmount *devpts_mntget(struct file *filp, struct pts_fs_info *fsi) +{ + struct path path; + int err; + + path = filp->f_path; + path_get(&path); + + err = devpts_ptmx_path(&path); + dput(path.dentry); + if (err) { + mntput(path.mnt); + path.mnt = ERR_PTR(err); + } + if (DEVPTS_SB(path.mnt->mnt_sb) != fsi) { + mntput(path.mnt); + path.mnt = ERR_PTR(-ENODEV); + } + return path.mnt; +} + struct pts_fs_info *devpts_acquire(struct file *filp) { struct pts_fs_info *result; @@ -143,27 +187,16 @@ struct pts_fs_info *devpts_acquire(struct file *filp) path = filp->f_path; path_get(&path); - /* Has the devpts filesystem already been found? */ - sb = path.mnt->mnt_sb; - if (sb->s_magic != DEVPTS_SUPER_MAGIC) { - /* Is a devpts filesystem at "pts" in the same directory? */ - err = path_pts(&path); - if (err) { - result = ERR_PTR(err); - goto out; - } - - /* Is the path the root of a devpts filesystem? */ - result = ERR_PTR(-ENODEV); - sb = path.mnt->mnt_sb; - if ((sb->s_magic != DEVPTS_SUPER_MAGIC) || - (path.mnt->mnt_root != sb->s_root)) - goto out; + err = devpts_ptmx_path(&path); + if (err) { + result = ERR_PTR(err); + goto out; } /* * pty code needs to hold extra references in case of last /dev/tty close */ + sb = path.mnt->mnt_sb; atomic_inc(&sb->s_active); result = DEVPTS_SB(sb); diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index ca7089aeadab..fa08448e35dd 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -68,7 +68,7 @@ static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb, if (lkb->lkb_wait_type) seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); - seq_puts(s, "\n"); + seq_putc(s, '\n'); } static void print_format1(struct dlm_rsb *res, struct seq_file *s) @@ -111,7 +111,7 @@ static void print_format1(struct dlm_rsb *res, struct seq_file *s) } if (rsb_flag(res, RSB_VALNOTVALID)) seq_puts(s, " (INVALID)"); - seq_puts(s, "\n"); + seq_putc(s, '\n'); if (seq_has_overflowed(s)) goto out; } @@ -156,7 +156,7 @@ static void print_format1(struct dlm_rsb *res, struct seq_file *s) lkb->lkb_id, print_lockmode(lkb->lkb_rqmode)); if (lkb->lkb_wait_type) seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); - seq_puts(s, "\n"); + seq_putc(s, '\n'); if (seq_has_overflowed(s)) goto out; } @@ -287,7 +287,7 @@ static void print_format3(struct dlm_rsb *r, struct seq_file *s) else seq_printf(s, " %02x", (unsigned char)r->res_name[i]); } - seq_puts(s, "\n"); + seq_putc(s, '\n'); if (seq_has_overflowed(s)) goto out; @@ -298,7 +298,7 @@ static void print_format3(struct dlm_rsb *r, struct seq_file *s) for (i = 0; i < lvblen; i++) seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]); - seq_puts(s, "\n"); + seq_putc(s, '\n'); if (seq_has_overflowed(s)) goto out; @@ -361,8 +361,7 @@ static void print_format4(struct dlm_rsb *r, struct seq_file *s) else seq_printf(s, " %02x", (unsigned char)r->res_name[i]); } - seq_puts(s, "\n"); - + seq_putc(s, '\n'); unlock_rsb(r); } @@ -436,7 +435,7 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos) if (bucket >= ls->ls_rsbtbl_size) return NULL; - ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_NOFS); + ri = kzalloc(sizeof(*ri), GFP_NOFS); if (!ri) return NULL; if (n == 0) @@ -742,7 +741,7 @@ void dlm_delete_debug_file(struct dlm_ls *ls) int dlm_create_debug_file(struct dlm_ls *ls) { - char name[DLM_LOCKSPACE_LEN+8]; + char name[DLM_LOCKSPACE_LEN + 8]; /* format 1 */ @@ -757,7 +756,7 @@ int dlm_create_debug_file(struct dlm_ls *ls) /* format 2 */ memset(name, 0, sizeof(name)); - snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name); + snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_locks", ls->ls_name); ls->ls_debug_locks_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, @@ -770,7 +769,7 @@ int dlm_create_debug_file(struct dlm_ls *ls) /* format 3 */ memset(name, 0, sizeof(name)); - snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_all", ls->ls_name); + snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_all", ls->ls_name); ls->ls_debug_all_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, @@ -783,7 +782,7 @@ int dlm_create_debug_file(struct dlm_ls *ls) /* format 4 */ memset(name, 0, sizeof(name)); - snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_toss", ls->ls_name); + snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_toss", ls->ls_name); ls->ls_debug_toss_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, @@ -794,7 +793,7 @@ int dlm_create_debug_file(struct dlm_ls *ls) goto fail; memset(name, 0, sizeof(name)); - snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name); + snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_waiters", ls->ls_name); ls->ls_debug_waiters_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 6df332296c66..d4aaddec1b16 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1426,7 +1426,7 @@ void dlm_scan_waiters(struct dlm_ls *ls) if (!num_nodes) { num_nodes = ls->ls_num_nodes; - warned = kzalloc(num_nodes * sizeof(int), GFP_KERNEL); + warned = kcalloc(num_nodes, sizeof(int), GFP_KERNEL); } if (!warned) continue; @@ -5119,11 +5119,9 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) int wait_type, stub_unlock_result, stub_cancel_result; int dir_nodeid; - ms_stub = kmalloc(sizeof(struct dlm_message), GFP_KERNEL); - if (!ms_stub) { - log_error(ls, "dlm_recover_waiters_pre no mem"); + ms_stub = kmalloc(sizeof(*ms_stub), GFP_KERNEL); + if (!ms_stub) return; - } mutex_lock(&ls->ls_waiters_mutex); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 91592b75c309..78a7c855b06b 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -235,7 +235,7 @@ static int dlm_uevent(struct kset *kset, struct kobject *kobj, return 0; } -static struct kset_uevent_ops dlm_uevent_ops = { +static const struct kset_uevent_ops dlm_uevent_ops = { .uevent = dlm_uevent, }; @@ -453,9 +453,14 @@ static int new_lockspace(const char *name, const char *cluster, *ops_result = 0; } + if (!cluster) + log_print("dlm cluster name '%s' is being used without an application provided cluster name", + dlm_config.ci_cluster_name); + if (dlm_config.ci_recover_callbacks && cluster && strncmp(cluster, dlm_config.ci_cluster_name, DLM_LOCKSPACE_LEN)) { - log_print("dlm cluster name %s mismatch %s", + log_print("dlm cluster name '%s' does not match " + "the application cluster name '%s'", dlm_config.ci_cluster_name, cluster); error = -EBADR; goto out; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 9382db998ec9..4813d0e0cd9b 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -729,7 +729,7 @@ static int tcp_accept_from_sock(struct connection *con) mutex_unlock(&connections_lock); memset(&peeraddr, 0, sizeof(peeraddr)); - result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + result = sock_create_lite(dlm_local_addr[0]->ss_family, SOCK_STREAM, IPPROTO_TCP, &newsock); if (result < 0) return -ENOMEM; diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 9c47f1c14a8b..3fda3832cf6a 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -217,8 +217,7 @@ int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size, } array_size = max + need; - - array = kzalloc(array_size * sizeof(struct dlm_slot), GFP_NOFS); + array = kcalloc(array_size, sizeof(*array), GFP_NOFS); if (!array) return -ENOMEM; @@ -319,7 +318,7 @@ static int dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node) struct dlm_member *memb; int error; - memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS); + memb = kzalloc(sizeof(*memb), GFP_NOFS); if (!memb) return -ENOMEM; @@ -405,8 +404,7 @@ static void make_member_array(struct dlm_ls *ls) } ls->ls_total_weight = total; - - array = kmalloc(sizeof(int) * total, GFP_NOFS); + array = kmalloc_array(total, sizeof(*array), GFP_NOFS); if (!array) return; @@ -492,8 +490,7 @@ void dlm_lsop_recover_done(struct dlm_ls *ls) return; num = ls->ls_num_nodes; - - slots = kzalloc(num * sizeof(struct dlm_slot), GFP_KERNEL); + slots = kcalloc(num, sizeof(*slots), GFP_KERNEL); if (!slots) return; @@ -673,11 +670,11 @@ int dlm_ls_stop(struct dlm_ls *ls) int dlm_ls_start(struct dlm_ls *ls) { - struct dlm_recover *rv = NULL, *rv_old; + struct dlm_recover *rv, *rv_old; struct dlm_config_node *nodes; int error, count; - rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS); + rv = kzalloc(sizeof(*rv), GFP_NOFS); if (!rv) return -ENOMEM; diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index d401425f602a..e631b1689228 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -367,7 +367,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, locks_init_lock(fl); fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK; fl->fl_flags = FL_POSIX; - fl->fl_pid = op->info.pid; + fl->fl_pid = -op->info.pid; fl->fl_start = op->info.start; fl->fl_end = op->info.end; rv = 0; diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 23488f559cf9..d18e7a539f11 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -123,6 +123,8 @@ static void compat_input(struct dlm_write_request *kb, static void compat_output(struct dlm_lock_result *res, struct dlm_lock_result32 *res32) { + memset(res32, 0, sizeof(*res32)); + res32->version[0] = res->version[0]; res32->version[1] = res->version[1]; res32->version[2] = res->version[2]; @@ -355,6 +357,10 @@ static int dlm_device_register(struct dlm_ls *ls, char *name) error = misc_register(&ls->ls_device); if (error) { kfree(ls->ls_device.name); + /* this has to be set to NULL + * to avoid a double-free in dlm_device_deregister + */ + ls->ls_device.name = NULL; } fail: return error; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index e767e4389cb1..adbe328b957c 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -600,8 +600,13 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq) wait_queue_head_t *whead; rcu_read_lock(); - /* If it is cleared by POLLFREE, it should be rcu-safe */ - whead = rcu_dereference(pwq->whead); + /* + * If it is cleared by POLLFREE, it should be rcu-safe. + * If we read NULL we need a barrier paired with + * smp_store_release() in ep_poll_callback(), otherwise + * we rely on whead->lock. + */ + whead = smp_load_acquire(&pwq->whead); if (whead) remove_wait_queue(whead, &pwq->wait); rcu_read_unlock(); @@ -1134,17 +1139,6 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v struct eventpoll *ep = epi->ep; int ewake = 0; - if ((unsigned long)key & POLLFREE) { - ep_pwq_from_wait(wait)->whead = NULL; - /* - * whead = NULL above can race with ep_remove_wait_queue() - * which can do another remove_wait_queue() after us, so we - * can't use __remove_wait_queue(). whead->lock is held by - * the caller. - */ - list_del_init(&wait->entry); - } - spin_lock_irqsave(&ep->lock, flags); ep_set_busy_poll_napi_id(epi); @@ -1228,10 +1222,26 @@ out_unlock: if (pwake) ep_poll_safewake(&ep->poll_wait); - if (epi->event.events & EPOLLEXCLUSIVE) - return ewake; + if (!(epi->event.events & EPOLLEXCLUSIVE)) + ewake = 1; + + if ((unsigned long)key & POLLFREE) { + /* + * If we race with ep_remove_wait_queue() it can miss + * ->whead = NULL and do another remove_wait_queue() after + * us, so we can't use __remove_wait_queue(). + */ + list_del_init(&wait->entry); + /* + * ->whead != NULL protects us from the race with ep_free() + * or ep_remove(), ep_remove_wait_queue() takes whead->lock + * held by the caller. Once we nullify it, nothing protects + * ep/epi or even wait. + */ + smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); + } - return 1; + return ewake; } /* diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 79dafa71effd..51f0aea70cb4 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -175,11 +175,8 @@ ext2_get_acl(struct inode *inode, int type) return acl; } -/* - * inode->i_mutex: down - */ -int -ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) +static int +__ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int name_index; void *value = NULL; @@ -189,13 +186,6 @@ ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) switch(type) { case ACL_TYPE_ACCESS: name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (error) - return error; - inode->i_ctime = current_time(inode); - mark_inode_dirty(inode); - } break; case ACL_TYPE_DEFAULT: @@ -222,6 +212,31 @@ ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) } /* + * inode->i_mutex: down + */ +int +ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int error; + int update_mode = 0; + umode_t mode = inode->i_mode; + + if (type == ACL_TYPE_ACCESS && acl) { + error = posix_acl_update_mode(inode, &mode, &acl); + if (error) + return error; + update_mode = 1; + } + error = __ext2_set_acl(inode, acl, type); + if (!error && update_mode) { + inode->i_mode = mode; + inode->i_ctime = current_time(inode); + mark_inode_dirty(inode); + } + return error; +} + +/* * Initialize the ACLs of a new inode. Called from ext2_new_inode. * * dir->i_mutex: down @@ -238,12 +253,12 @@ ext2_init_acl(struct inode *inode, struct inode *dir) return error; if (default_acl) { - error = ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + error = __ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); posix_acl_release(default_acl); } if (acl) { if (!error) - error = ext2_set_acl(inode, acl, ACL_TYPE_ACCESS); + error = __ext2_set_acl(inode, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); } return error; diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 09441ae07a5b..46ff2229ff5e 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -193,13 +193,6 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (error) - return error; - inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); - } break; case ACL_TYPE_DEFAULT: @@ -221,8 +214,9 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, value, size, xattr_flags); kfree(value); - if (!error) + if (!error) { set_cached_acl(inode, type, acl); + } return error; } @@ -233,6 +227,8 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) handle_t *handle; int error, credits, retries = 0; size_t acl_size = acl ? ext4_acl_size(acl->a_count) : 0; + umode_t mode = inode->i_mode; + int update_mode = 0; error = dquot_initialize(inode); if (error) @@ -247,7 +243,20 @@ retry: if (IS_ERR(handle)) return PTR_ERR(handle); + if ((type == ACL_TYPE_ACCESS) && acl) { + error = posix_acl_update_mode(inode, &mode, &acl); + if (error) + goto out_stop; + update_mode = 1; + } + error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */); + if (!error && update_mode) { + inode->i_mode = mode; + inode->i_ctime = current_time(inode); + ext4_mark_inode_dirty(handle, inode); + } +out_stop: ext4_journal_stop(handle); if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index e8b365000d73..b04e882179c6 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -411,7 +411,7 @@ static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp, { struct dir_private_info *p; - p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); + p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) return NULL; p->curr_hash = pos2maj_hash(filp, pos); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9ebde0cd632e..84b9da192238 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -838,13 +838,11 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) { if (unlikely(sizeof(time->tv_sec) > 4 && (extra & cpu_to_le32(EXT4_EPOCH_MASK)))) { -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,20,0) + +#if 1 /* Handle legacy encoding of pre-1970 dates with epoch - * bits 1,1. We assume that by kernel version 4.20, - * everyone will have run fsck over the affected - * filesystems to correct the problem. (This - * backwards compatibility may be removed before this - * time, at the discretion of the ext4 developers.) + * bits 1,1. (This backwards compatibility may be removed + * at the discretion of the ext4 developers.) */ u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK; if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0) @@ -961,7 +959,7 @@ struct ext4_inode_info { /* * i_block_group is the number of the block group which contains * this file's inode. Constant across the lifetime of the inode, - * it is ued for making block allocation decisions - we try to + * it is used for making block allocation decisions - we try to * place a file's data blocks near its inode block, and new inodes * near to their parent directory's inode. */ @@ -1049,10 +1047,8 @@ struct ext4_inode_info { ext4_group_t i_last_alloc_group; /* allocation reservation info for delalloc */ - /* In case of bigalloc, these refer to clusters rather than blocks */ + /* In case of bigalloc, this refer to clusters rather than blocks */ unsigned int i_reserved_data_blocks; - unsigned int i_reserved_meta_blocks; - unsigned int i_allocated_meta_blocks; ext4_lblk_t i_da_metadata_calc_last_lblock; int i_da_metadata_calc_len; @@ -1569,6 +1565,7 @@ enum { nolocking */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ + EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -2022,7 +2019,8 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) #define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \ ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) -#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) +#define EXT4_DIR_LINK_MAX(dir) unlikely((dir)->i_nlink >= EXT4_LINK_MAX && \ + !(ext4_has_feature_dir_nlink((dir)->i_sb) && is_dx(dir))) #define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) /* Legal values for the dx_root hash_version field: */ @@ -2462,6 +2460,8 @@ extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); int ext4_inode_is_fast_symlink(struct inode *inode); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); +int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, + bool wait, struct buffer_head **bhs); int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, @@ -3074,7 +3074,7 @@ extern int ext4_handle_dirty_dirent_node(handle_t *handle, struct inode *inode, struct buffer_head *bh); #define S_SHIFT 12 -static const unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { +static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = { [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index dabad1bc8617..48143e32411c 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -227,6 +227,9 @@ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); +int ext4_expand_extra_isize(struct inode *inode, + unsigned int new_extra_isize, + struct ext4_iloc *iloc); /* * Wrapper functions with which ext4 calls into JBD. */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e0a8425ff74d..97f0fd06728d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4652,7 +4652,7 @@ retry: static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, ext4_lblk_t len, loff_t new_size, - int flags, int mode) + int flags) { struct inode *inode = file_inode(file); handle_t *handle; @@ -4815,7 +4815,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, round_down(offset, 1 << blkbits) >> blkbits, (round_up((offset + len), 1 << blkbits) - round_down(offset, 1 << blkbits)) >> blkbits, - new_size, flags, mode); + new_size, flags); if (ret) goto out_dio; @@ -4841,7 +4841,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, inode->i_mtime = inode->i_ctime = current_time(inode); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, - flags, mode); + flags); up_write(&EXT4_I(inode)->i_mmap_sem); if (ret) goto out_dio; @@ -4976,8 +4976,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, - flags, mode); + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); ext4_inode_resume_unlocked_dio(inode); if (ret) goto out; @@ -5837,7 +5836,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, if (e1_blk > lblk1) next1 = e1_blk; if (e2_blk > lblk2) - next2 = e1_blk; + next2 = e2_blk; /* Do we have something to swap */ if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS) goto finish; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 58294c9a7e1d..197653ea6041 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -279,7 +279,20 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, handle_t *handle = NULL; struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; - bool write = vmf->flags & FAULT_FLAG_WRITE; + + /* + * We have to distinguish real writes from writes which will result in a + * COW page; COW writes should *not* poke the journal (the file will not + * be changed). Doing so would cause unintended failures when mounted + * read-only. + * + * We check for VM_SHARED rather than vmf->cow_page since the latter is + * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for + * other sizes, dax_iomap_fault will handle splitting / fallback so that + * we eventually come back with a COW page. + */ + bool write = (vmf->flags & FAULT_FLAG_WRITE) && + (vmf->vma->vm_flags & VM_SHARED); if (write) { sb_start_pagefault(sb); @@ -537,6 +550,8 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, lastoff = page_offset(page); bh = head = page_buffers(page); do { + if (lastoff + bh->b_size <= startoff) + goto next; if (buffer_uptodate(bh) || buffer_unwritten(bh)) { if (whence == SEEK_DATA) @@ -551,6 +566,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, unlock_page(page); goto out; } +next: lastoff += bh->b_size; bh = bh->b_this_page; } while (bh != head); @@ -592,7 +608,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) inode_lock(inode); isize = i_size_read(inode); - if (offset >= isize) { + if (offset < 0 || offset >= isize) { inode_unlock(inode); return -ENXIO; } @@ -655,7 +671,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) inode_lock(inode); isize = i_size_read(inode); - if (offset >= isize) { + if (offset < 0 || offset >= isize) { inode_unlock(inode); return -ENXIO; } diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 38b8a96eb97c..00c6dd29e621 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -148,8 +148,6 @@ static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) if (len > num*4) len = num * 4; for (i = 0; i < len; i++) { - if ((i % 4) == 0) - val = pad; val = ((int) scp[i]) + (val << 8); if ((i % 4) == 3) { *buf++ = val; @@ -176,8 +174,6 @@ static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) if (len > num*4) len = num * 4; for (i = 0; i < len; i++) { - if ((i % 4) == 0) - val = pad; val = ((int) ucp[i]) + (val << 8); if ((i % 4) == 3) { *buf++ = val; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 507bfb3344d4..71e93a23cec3 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -692,24 +692,25 @@ static int find_group_other(struct super_block *sb, struct inode *parent, * somewhat arbitrary...) */ #define RECENTCY_MIN 5 -#define RECENTCY_DIRTY 30 +#define RECENTCY_DIRTY 300 static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) { struct ext4_group_desc *gdp; struct ext4_inode *raw_inode; struct buffer_head *bh; - unsigned long dtime, now; - int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; - int offset, ret = 0, recentcy = RECENTCY_MIN; + int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + int offset, ret = 0; + int recentcy = RECENTCY_MIN; + u32 dtime, now; gdp = ext4_get_group_desc(sb, group, NULL); if (unlikely(!gdp)) return 0; - bh = sb_getblk(sb, ext4_inode_table(sb, gdp) + + bh = sb_find_get_block(sb, ext4_inode_table(sb, gdp) + (ino / inodes_per_block)); - if (unlikely(!bh) || !buffer_uptodate(bh)) + if (!bh || !buffer_uptodate(bh)) /* * If the block is not in the buffer cache, then it * must have been written out. @@ -718,18 +719,45 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb); raw_inode = (struct ext4_inode *) (bh->b_data + offset); + + /* i_dtime is only 32 bits on disk, but we only care about relative + * times in the range of a few minutes (i.e. long enough to sync a + * recently-deleted inode to disk), so using the low 32 bits of the + * clock (a 68 year range) is enough, see time_before32() */ dtime = le32_to_cpu(raw_inode->i_dtime); - now = get_seconds(); + now = ktime_get_real_seconds(); if (buffer_dirty(bh)) recentcy += RECENTCY_DIRTY; - if (dtime && (dtime < now) && (now < dtime + recentcy)) + if (dtime && time_before32(dtime, now) && + time_before32(now, dtime + recentcy)) ret = 1; out: brelse(bh); return ret; } +static int find_inode_bit(struct super_block *sb, ext4_group_t group, + struct buffer_head *bitmap, unsigned long *ino) +{ +next: + *ino = ext4_find_next_zero_bit((unsigned long *) + bitmap->b_data, + EXT4_INODES_PER_GROUP(sb), *ino); + if (*ino >= EXT4_INODES_PER_GROUP(sb)) + return 0; + + if ((EXT4_SB(sb)->s_journal == NULL) && + recently_deleted(sb, group, *ino)) { + *ino = *ino + 1; + if (*ino < EXT4_INODES_PER_GROUP(sb)) + goto next; + return 0; + } + + return 1; +} + /* * There are two policies for allocating an inode. If the new inode is * a directory, then a forward search is made for a block group with both @@ -892,19 +920,13 @@ got_group: /* * Check free inodes count before loading bitmap. */ - if (ext4_free_inodes_count(sb, gdp) == 0) { - if (++group == ngroups) - group = 0; - continue; - } + if (ext4_free_inodes_count(sb, gdp) == 0) + goto next_group; grp = ext4_get_group_info(sb, group); /* Skip groups with already-known suspicious inode tables */ - if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { - if (++group == ngroups) - group = 0; - continue; - } + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + goto next_group; brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); @@ -912,27 +934,20 @@ got_group: if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || IS_ERR(inode_bitmap_bh)) { inode_bitmap_bh = NULL; - if (++group == ngroups) - group = 0; - continue; + goto next_group; } repeat_in_this_group: - ino = ext4_find_next_zero_bit((unsigned long *) - inode_bitmap_bh->b_data, - EXT4_INODES_PER_GROUP(sb), ino); - if (ino >= EXT4_INODES_PER_GROUP(sb)) + ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); + if (!ret2) goto next_group; - if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) { + + if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) { ext4_error(sb, "reserved inode found cleared - " "inode=%lu", ino + 1); - continue; - } - if ((EXT4_SB(sb)->s_journal == NULL) && - recently_deleted(sb, group, ino)) { - ino++; - goto next_inode; + goto next_group; } + if (!handle) { BUG_ON(nblocks <= 0); handle = __ext4_journal_start_sb(dir->i_sb, line_no, @@ -952,11 +967,23 @@ repeat_in_this_group: } ext4_lock_group(sb, group); ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); + if (ret2) { + /* Someone already took the bit. Repeat the search + * with lock held. + */ + ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); + if (ret2) { + ext4_set_bit(ino, inode_bitmap_bh->b_data); + ret2 = 0; + } else { + ret2 = 1; /* we didn't grab the inode */ + } + } ext4_unlock_group(sb, group); ino++; /* the inode bitmap is zero-based */ if (!ret2) goto got; /* we grabbed the inode! */ -next_inode: + if (ino < EXT4_INODES_PER_GROUP(sb)) goto repeat_in_this_group; next_group: diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3c600f02673f..714396760616 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -892,7 +892,7 @@ static int ext4_dio_get_block_unwritten_async(struct inode *inode, /* * Get block function for non-AIO DIO writes when we create unwritten extent if * blocks are not allocated yet. The extent will be converted to written - * after IO is complete from ext4_ext_direct_IO() function. + * after IO is complete by ext4_direct_IO_write(). */ static int ext4_dio_get_block_unwritten_sync(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -907,7 +907,7 @@ static int ext4_dio_get_block_unwritten_sync(struct inode *inode, /* * Mark inode as having pending DIO writes to unwritten extents. - * ext4_ext_direct_IO() checks this flag and converts extents to + * ext4_direct_IO_write() checks this flag and converts extents to * written. */ if (!ret && buffer_unwritten(bh_result)) @@ -1015,6 +1015,50 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, return ERR_PTR(-EIO); } +/* Read a contiguous batch of blocks. */ +int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, + bool wait, struct buffer_head **bhs) +{ + int i, err; + + for (i = 0; i < bh_count; i++) { + bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */); + if (IS_ERR(bhs[i])) { + err = PTR_ERR(bhs[i]); + bh_count = i; + goto out_brelse; + } + } + + for (i = 0; i < bh_count; i++) + /* Note that NULL bhs[i] is valid because of holes. */ + if (bhs[i] && !buffer_uptodate(bhs[i])) + ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, + &bhs[i]); + + if (!wait) + return 0; + + for (i = 0; i < bh_count; i++) + if (bhs[i]) + wait_on_buffer(bhs[i]); + + for (i = 0; i < bh_count; i++) { + if (bhs[i] && !buffer_uptodate(bhs[i])) { + err = -EIO; + goto out_brelse; + } + } + return 0; + +out_brelse: + for (i = 0; i < bh_count; i++) { + brelse(bhs[i]); + bhs[i] = NULL; + } + return err; +} + int ext4_walk_page_buffers(handle_t *handle, struct buffer_head *head, unsigned from, @@ -4853,14 +4897,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) brelse(iloc.bh); ext4_set_inode_flags(inode); - if (ei->i_flags & EXT4_EA_INODE_FL) { - ext4_xattr_inode_set_class(inode); - - inode_lock(inode); - inode->i_flags |= S_NOQUOTA; - inode_unlock(inode); - } - unlock_new_inode(inode); return inode; @@ -5658,22 +5694,16 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, return err; } -/* - * Expand an inode by new_extra_isize bytes. - * Returns 0 on success or negative error number on failure. - */ -static int ext4_expand_extra_isize(struct inode *inode, - unsigned int new_extra_isize, - struct ext4_iloc iloc, - handle_t *handle) +static int __ext4_expand_extra_isize(struct inode *inode, + unsigned int new_extra_isize, + struct ext4_iloc *iloc, + handle_t *handle, int *no_expand) { struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; + int error; - if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) - return 0; - - raw_inode = ext4_raw_inode(&iloc); + raw_inode = ext4_raw_inode(iloc); header = IHDR(inode, raw_inode); @@ -5688,8 +5718,98 @@ static int ext4_expand_extra_isize(struct inode *inode, } /* try to expand with EAs present */ - return ext4_expand_extra_isize_ea(inode, new_extra_isize, - raw_inode, handle); + error = ext4_expand_extra_isize_ea(inode, new_extra_isize, + raw_inode, handle); + if (error) { + /* + * Inode size expansion failed; don't try again + */ + *no_expand = 1; + } + + return error; +} + +/* + * Expand an inode by new_extra_isize bytes. + * Returns 0 on success or negative error number on failure. + */ +static int ext4_try_to_expand_extra_isize(struct inode *inode, + unsigned int new_extra_isize, + struct ext4_iloc iloc, + handle_t *handle) +{ + int no_expand; + int error; + + if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) + return -EOVERFLOW; + + /* + * In nojournal mode, we can immediately attempt to expand + * the inode. When journaled, we first need to obtain extra + * buffer credits since we may write into the EA block + * with this same handle. If journal_extend fails, then it will + * only result in a minor loss of functionality for that inode. + * If this is felt to be critical, then e2fsck should be run to + * force a large enough s_min_extra_isize. + */ + if (ext4_handle_valid(handle) && + jbd2_journal_extend(handle, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) != 0) + return -ENOSPC; + + if (ext4_write_trylock_xattr(inode, &no_expand) == 0) + return -EBUSY; + + error = __ext4_expand_extra_isize(inode, new_extra_isize, &iloc, + handle, &no_expand); + ext4_write_unlock_xattr(inode, &no_expand); + + return error; +} + +int ext4_expand_extra_isize(struct inode *inode, + unsigned int new_extra_isize, + struct ext4_iloc *iloc) +{ + handle_t *handle; + int no_expand; + int error, rc; + + if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { + brelse(iloc->bh); + return -EOVERFLOW; + } + + handle = ext4_journal_start(inode, EXT4_HT_INODE, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + brelse(iloc->bh); + return error; + } + + ext4_write_lock_xattr(inode, &no_expand); + + BUFFER_TRACE(iloc.bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, iloc->bh); + if (error) { + brelse(iloc->bh); + goto out_stop; + } + + error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc, + handle, &no_expand); + + rc = ext4_mark_iloc_dirty(handle, inode, iloc); + if (!error) + error = rc; + + ext4_write_unlock_xattr(inode, &no_expand); +out_stop: + ext4_journal_stop(handle); + return error; } /* @@ -5709,44 +5829,18 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) { struct ext4_iloc iloc; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - static unsigned int mnt_count; - int err, ret; + int err; might_sleep(); trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) return err; - if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && - !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { - /* - * In nojournal mode, we can immediately attempt to expand - * the inode. When journaled, we first need to obtain extra - * buffer credits since we may write into the EA block - * with this same handle. If journal_extend fails, then it will - * only result in a minor loss of functionality for that inode. - * If this is felt to be critical, then e2fsck should be run to - * force a large enough s_min_extra_isize. - */ - if (!ext4_handle_valid(handle) || - jbd2_journal_extend(handle, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) == 0) { - ret = ext4_expand_extra_isize(inode, - sbi->s_want_extra_isize, - iloc, handle); - if (ret) { - if (mnt_count != - le16_to_cpu(sbi->s_es->s_mnt_count)) { - ext4_warning(inode->i_sb, - "Unable to expand inode %lu. Delete" - " some EAs or run e2fsck.", - inode->i_ino); - mnt_count = - le16_to_cpu(sbi->s_es->s_mnt_count); - } - } - } - } + + if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize) + ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize, + iloc, handle); + return ext4_mark_iloc_dirty(handle, inode, &iloc); } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 42b3a73143cf..afb66d4ab5cf 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -64,18 +64,16 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) ei1 = EXT4_I(inode1); ei2 = EXT4_I(inode2); - memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags)); - memswap(&inode1->i_version, &inode2->i_version, - sizeof(inode1->i_version)); - memswap(&inode1->i_blocks, &inode2->i_blocks, - sizeof(inode1->i_blocks)); - memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes)); - memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime)); - memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime)); + swap(inode1->i_flags, inode2->i_flags); + swap(inode1->i_version, inode2->i_version); + swap(inode1->i_blocks, inode2->i_blocks); + swap(inode1->i_bytes, inode2->i_bytes); + swap(inode1->i_atime, inode2->i_atime); + swap(inode1->i_mtime, inode2->i_mtime); memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); - memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags)); - memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); + swap(ei1->i_flags, ei2->i_flags); + swap(ei1->i_disksize, ei2->i_disksize); ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); @@ -351,11 +349,14 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) raw_inode = ext4_raw_inode(&iloc); if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { - err = -EOVERFLOW; + err = ext4_expand_extra_isize(inode, + EXT4_SB(sb)->s_want_extra_isize, + &iloc); + if (err) + goto out_unlock; + } else { brelse(iloc.bh); - goto out_unlock; } - brelse(iloc.bh); dquot_initialize(inode); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 581e357e8406..701085620cd8 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2295,6 +2295,9 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) int err, buddy_loaded = 0; struct ext4_buddy e4b; struct ext4_group_info *grinfo; + unsigned char blocksize_bits = min_t(unsigned char, + sb->s_blocksize_bits, + EXT4_MAX_BLOCK_LOG_SIZE); struct sg { struct ext4_group_info info; ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; @@ -2306,8 +2309,9 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); - i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + + i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + sizeof(struct ext4_group_info); + grinfo = ext4_get_group_info(sb, group); /* Load the group info in memory only if not already loaded. */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { @@ -2327,7 +2331,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, sg.info.bb_fragments, sg.info.bb_first_free); for (i = 0; i <= 13; i++) - seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? + seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? sg.info.bb_counters[i] : 0); seq_printf(seq, " ]\n"); @@ -2892,8 +2896,10 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) break; } - if (discard_bio) + if (discard_bio) { submit_bio_wait(discard_bio); + bio_put(discard_bio); + } } list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index eb9835638680..77cdce1f17ce 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -367,7 +367,7 @@ skip: goto failed; } - mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL); + mmpd_data = kmalloc(sizeof(*mmpd_data), GFP_KERNEL); if (!mmpd_data) { ext4_warning(sb, "not enough memory for mmpd_data"); goto failed; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 13f0cadb1238..c1cf020d1889 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1342,13 +1342,12 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, struct super_block *sb; struct buffer_head *bh_use[NAMEI_RA_SIZE]; struct buffer_head *bh, *ret = NULL; - ext4_lblk_t start, block, b; + ext4_lblk_t start, block; const u8 *name = d_name->name; - int ra_max = 0; /* Number of bh's in the readahead + size_t ra_max = 0; /* Number of bh's in the readahead buffer, bh_use[] */ - int ra_ptr = 0; /* Current index into readahead + size_t ra_ptr = 0; /* Current index into readahead buffer */ - int num = 0; ext4_lblk_t nblocks; int i, namelen, retval; struct ext4_filename fname; @@ -1411,31 +1410,17 @@ restart: if (ra_ptr >= ra_max) { /* Refill the readahead buffer */ ra_ptr = 0; - b = block; - for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { - /* - * Terminate if we reach the end of the - * directory and must wrap, or if our - * search has finished at this block. - */ - if (b >= nblocks || (num && block == start)) { - bh_use[ra_max] = NULL; - break; - } - num++; - bh = ext4_getblk(NULL, dir, b++, 0); - if (IS_ERR(bh)) { - if (ra_max == 0) { - ret = bh; - goto cleanup_and_exit; - } - break; - } - bh_use[ra_max] = bh; - if (bh) - ll_rw_block(REQ_OP_READ, - REQ_META | REQ_PRIO, - 1, &bh); + if (block < start) + ra_max = start - block; + else + ra_max = nblocks - block; + ra_max = min(ra_max, ARRAY_SIZE(bh_use)); + retval = ext4_bread_batch(dir, block, ra_max, + false /* wait */, bh_use); + if (retval) { + ret = ERR_PTR(retval); + ra_max = 0; + goto cleanup_and_exit; } } if ((bh = bh_use[ra_ptr++]) == NULL) @@ -2395,19 +2380,22 @@ out: } /* - * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2, - * since this indicates that nlinks count was previously 1. + * Set directory link count to 1 if nlinks > EXT4_LINK_MAX, or if nlinks == 2 + * since this indicates that nlinks count was previously 1 to avoid overflowing + * the 16-bit i_links_count field on disk. Directories with i_nlink == 1 mean + * that subdirectory link counts are not being maintained accurately. + * + * The caller has already checked for i_nlink overflow in case the DIR_LINK + * feature is not enabled and returned -EMLINK. The is_dx() check is a proxy + * for checking S_ISDIR(inode) (since the INODE_INDEX feature will not be set + * on regular files) and to avoid creating huge/slow non-HTREE directories. */ static void ext4_inc_count(handle_t *handle, struct inode *inode) { inc_nlink(inode); - if (is_dx(inode) && inode->i_nlink > 1) { - /* limit is 16-bit i_links_count */ - if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) { - set_nlink(inode, 1); - ext4_set_feature_dir_nlink(inode->i_sb); - } - } + if (is_dx(inode) && + (inode->i_nlink > EXT4_LINK_MAX || inode->i_nlink == 2)) + set_nlink(inode, 1); } /* diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index c3ed9021b781..035cd3f4785e 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1927,7 +1927,8 @@ retry: n_desc_blocks = o_desc_blocks + le16_to_cpu(es->s_reserved_gdt_blocks); n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb); - n_blocks_count = n_group * EXT4_BLOCKS_PER_GROUP(sb); + n_blocks_count = (ext4_fsblk_t)n_group * + EXT4_BLOCKS_PER_GROUP(sb); n_group--; /* set to last group number */ } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0886fe82e9c4..c9e7be58756b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -978,8 +978,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_es_shk_nr = 0; ei->i_es_shrink_lblk = 0; ei->i_reserved_data_blocks = 0; - ei->i_reserved_meta_blocks = 0; - ei->i_allocated_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; ei->i_da_metadata_calc_last_lblock = 0; spin_lock_init(&(ei->i_block_reservation_lock)); @@ -2406,6 +2404,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, unsigned int s_flags = sb->s_flags; int ret, nr_orphans = 0, nr_truncates = 0; #ifdef CONFIG_QUOTA + int quota_update = 0; int i; #endif if (!es->s_last_orphan) { @@ -2444,14 +2443,32 @@ static void ext4_orphan_cleanup(struct super_block *sb, #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ sb->s_flags |= MS_ACTIVE; - /* Turn on quotas so that they are updated correctly */ + + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. + */ + if (ext4_has_feature_quota(sb) && (s_flags & MS_RDONLY)) { + int ret = ext4_enable_quotas(sb); + + if (!ret) + quota_update = 1; + else + ext4_msg(sb, KERN_ERR, + "Cannot turn on quotas: error %d", ret); + } + + /* Turn on journaled quotas used for old sytle */ for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (EXT4_SB(sb)->s_qf_names[i]) { int ret = ext4_quota_on_mount(sb, i); - if (ret < 0) + + if (!ret) + quota_update = 1; + else ext4_msg(sb, KERN_ERR, "Cannot turn on journaled " - "quota: error %d", ret); + "quota: type %d: error %d", i, ret); } } #endif @@ -2512,10 +2529,12 @@ static void ext4_orphan_cleanup(struct super_block *sb, ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA - /* Turn quotas off */ - for (i = 0; i < EXT4_MAXQUOTAS; i++) { - if (sb_dqopt(sb)->files[i]) - dquot_quota_off(sb, i); + /* Turn off quotas if they were enabled for orphan cleanup */ + if (quota_update) { + for (i = 0; i < EXT4_MAXQUOTAS; i++) { + if (sb_dqopt(sb)->files[i]) + dquot_quota_off(sb, i); + } } #endif sb->s_flags = s_flags; /* Restore MS_RDONLY status */ @@ -5514,6 +5533,9 @@ static int ext4_enable_quotas(struct super_block *sb) DQUOT_USAGE_ENABLED | (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); if (err) { + for (type--; type >= 0; type--) + dquot_quota_off(sb, type); + ext4_warning(sb, "Failed to enable quota tracking " "(type=%d, err=%d). Please run " diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index cff4f41ced61..3b69330a4250 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -317,32 +317,47 @@ static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash) */ static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size) { - unsigned long block = 0; - struct buffer_head *bh; - int blocksize = ea_inode->i_sb->s_blocksize; - size_t csize, copied = 0; - void *copy_pos = buf; - - while (copied < size) { - csize = (size - copied) > blocksize ? blocksize : size - copied; - bh = ext4_bread(NULL, ea_inode, block, 0); - if (IS_ERR(bh)) - return PTR_ERR(bh); - if (!bh) - return -EFSCORRUPTED; + int blocksize = 1 << ea_inode->i_blkbits; + int bh_count = (size + blocksize - 1) >> ea_inode->i_blkbits; + int tail_size = (size % blocksize) ?: blocksize; + struct buffer_head *bhs_inline[8]; + struct buffer_head **bhs = bhs_inline; + int i, ret; + + if (bh_count > ARRAY_SIZE(bhs_inline)) { + bhs = kmalloc_array(bh_count, sizeof(*bhs), GFP_NOFS); + if (!bhs) + return -ENOMEM; + } - memcpy(copy_pos, bh->b_data, csize); - brelse(bh); + ret = ext4_bread_batch(ea_inode, 0 /* block */, bh_count, + true /* wait */, bhs); + if (ret) + goto free_bhs; - copy_pos += csize; - block += 1; - copied += csize; + for (i = 0; i < bh_count; i++) { + /* There shouldn't be any holes in ea_inode. */ + if (!bhs[i]) { + ret = -EFSCORRUPTED; + goto put_bhs; + } + memcpy((char *)buf + blocksize * i, bhs[i]->b_data, + i < bh_count - 1 ? blocksize : tail_size); } - return 0; + ret = 0; +put_bhs: + for (i = 0; i < bh_count; i++) + brelse(bhs[i]); +free_bhs: + if (bhs != bhs_inline) + kfree(bhs); + return ret; } +#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec) + static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, - struct inode **ea_inode) + u32 ea_inode_hash, struct inode **ea_inode) { struct inode *inode; int err; @@ -372,6 +387,24 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, goto error; } + ext4_xattr_inode_set_class(inode); + + /* + * Check whether this is an old Lustre-style xattr inode. Lustre + * implementation does not have hash validation, rather it has a + * backpointer from ea_inode to the parent inode. + */ + if (ea_inode_hash != ext4_xattr_inode_get_hash(inode) && + EXT4_XATTR_INODE_GET_PARENT(inode) == parent->i_ino && + inode->i_generation == parent->i_generation) { + ext4_set_inode_state(inode, EXT4_STATE_LUSTRE_EA_INODE); + ext4_xattr_inode_set_ref(inode, 1); + } else { + inode_lock(inode); + inode->i_flags |= S_NOQUOTA; + inode_unlock(inode); + } + *ea_inode = inode; return 0; error: @@ -404,8 +437,6 @@ ext4_xattr_inode_verify_hashes(struct inode *ea_inode, return 0; } -#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec) - /* * Read xattr value from the EA inode. */ @@ -418,7 +449,7 @@ ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry, int err; err = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum), - &ea_inode); + le32_to_cpu(entry->e_hash), &ea_inode); if (err) { ea_inode = NULL; goto out; @@ -436,28 +467,20 @@ ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry, if (err) goto out; - err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer, size); - /* - * Compatibility check for old Lustre ea_inode implementation. Old - * version does not have hash validation, but it has a backpointer - * from ea_inode to the parent inode. - */ - if (err == -EFSCORRUPTED) { - if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != inode->i_ino || - ea_inode->i_generation != inode->i_generation) { + if (!ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) { + err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer, + size); + if (err) { ext4_warning_inode(ea_inode, "EA inode hash validation failed"); goto out; } - /* Do not add ea_inode to the cache. */ - ea_inode_cache = NULL; - } else if (err) - goto out; - if (ea_inode_cache) - mb_cache_entry_create(ea_inode_cache, GFP_NOFS, - ext4_xattr_inode_get_hash(ea_inode), - ea_inode->i_ino, true /* reusable */); + if (ea_inode_cache) + mb_cache_entry_create(ea_inode_cache, GFP_NOFS, + ext4_xattr_inode_get_hash(ea_inode), + ea_inode->i_ino, true /* reusable */); + } out: iput(ea_inode); return err; @@ -824,10 +847,15 @@ static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len) return err; } -static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len) +static void ext4_xattr_inode_free_quota(struct inode *parent, + struct inode *ea_inode, + size_t len) { - dquot_free_space_nodirty(inode, round_up_cluster(inode, len)); - dquot_free_inode(inode); + if (ea_inode && + ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) + return; + dquot_free_space_nodirty(parent, round_up_cluster(parent, len)); + dquot_free_inode(parent); } int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, @@ -1057,7 +1085,9 @@ static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent, if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); - err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode); + err = ext4_xattr_inode_iget(parent, ea_ino, + le32_to_cpu(entry->e_hash), + &ea_inode); if (err) goto cleanup; err = ext4_xattr_inode_inc_ref(handle, ea_inode); @@ -1079,7 +1109,9 @@ cleanup: if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); - err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode); + err = ext4_xattr_inode_iget(parent, ea_ino, + le32_to_cpu(entry->e_hash), + &ea_inode); if (err) { ext4_warning(parent->i_sb, "cleanup ea_ino %u iget error %d", ea_ino, @@ -1117,7 +1149,9 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); - err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode); + err = ext4_xattr_inode_iget(parent, ea_ino, + le32_to_cpu(entry->e_hash), + &ea_inode); if (err) continue; @@ -1145,7 +1179,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, } if (!skip_quota) - ext4_xattr_inode_free_quota(parent, + ext4_xattr_inode_free_quota(parent, ea_inode, le32_to_cpu(entry->e_value_size)); /* @@ -1529,7 +1563,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, /* Clear padding bytes. */ memset(val + i->value_len, 0, new_size - i->value_len); } - return 0; + goto update_hash; } /* Compute min_offs and last. */ @@ -1577,6 +1611,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, if (!s->not_found && here->e_value_inum) { ret = ext4_xattr_inode_iget(inode, le32_to_cpu(here->e_value_inum), + le32_to_cpu(here->e_hash), &old_ea_inode); if (ret) { old_ea_inode = NULL; @@ -1595,7 +1630,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, &new_ea_inode); if (ret) { new_ea_inode = NULL; - ext4_xattr_inode_free_quota(inode, i->value_len); + ext4_xattr_inode_free_quota(inode, NULL, i->value_len); goto out; } } @@ -1614,13 +1649,13 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, ext4_warning_inode(new_ea_inode, "dec ref new_ea_inode err=%d", err); - ext4_xattr_inode_free_quota(inode, + ext4_xattr_inode_free_quota(inode, new_ea_inode, i->value_len); } goto out; } - ext4_xattr_inode_free_quota(inode, + ext4_xattr_inode_free_quota(inode, old_ea_inode, le32_to_cpu(here->e_value_size)); } @@ -1693,6 +1728,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, here->e_value_size = cpu_to_le32(i->value_len); } +update_hash: if (i->value) { __le32 hash = 0; @@ -1711,7 +1747,8 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, here->e_name_len, &crc32c_hash, 1); } else if (is_block) { - __le32 *value = s->base + min_offs - new_size; + __le32 *value = s->base + le16_to_cpu( + here->e_value_offs); hash = ext4_xattr_hash_entry(here->e_name, here->e_name_len, value, @@ -1789,8 +1826,10 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct mb_cache_entry *ce = NULL; int error = 0; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); - struct inode *ea_inode = NULL; - size_t old_ea_inode_size = 0; + struct inode *ea_inode = NULL, *tmp_inode; + size_t old_ea_inode_quota = 0; + unsigned int ea_ino; + #define header(x) ((struct ext4_xattr_header *)(x)) @@ -1815,9 +1854,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, ea_bdebug(bs->bh, "modifying in-place"); error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */); - if (!error) - ext4_xattr_block_cache_insert(ea_block_cache, - bs->bh); ext4_xattr_block_csum_set(inode, bs->bh); unlock_buffer(bs->bh); if (error == -EFSCORRUPTED) @@ -1852,12 +1888,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, * like it has an empty value. */ if (!s->not_found && s->here->e_value_inum) { - /* - * Defer quota free call for previous inode - * until success is guaranteed. - */ - old_ea_inode_size = le32_to_cpu( + ea_ino = le32_to_cpu(s->here->e_value_inum); + error = ext4_xattr_inode_iget(inode, ea_ino, + le32_to_cpu(s->here->e_hash), + &tmp_inode); + if (error) + goto cleanup; + + if (!ext4_test_inode_state(tmp_inode, + EXT4_STATE_LUSTRE_EA_INODE)) { + /* + * Defer quota free call for previous + * inode until success is guaranteed. + */ + old_ea_inode_quota = le32_to_cpu( s->here->e_value_size); + } + iput(tmp_inode); + s->here->e_value_inum = 0; s->here->e_value_size = 0; } @@ -1884,8 +1932,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, goto cleanup; if (i->value && s->here->e_value_inum) { - unsigned int ea_ino; - /* * A ref count on ea_inode has been taken as part of the call to * ext4_xattr_set_entry() above. We would like to drop this @@ -1893,7 +1939,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, * initialized and has its own ref count on the ea_inode. */ ea_ino = le32_to_cpu(s->here->e_value_inum); - error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode); + error = ext4_xattr_inode_iget(inode, ea_ino, + le32_to_cpu(s->here->e_hash), + &ea_inode); if (error) { ea_inode = NULL; goto cleanup; @@ -1973,6 +2021,7 @@ inserted: } else if (bs->bh && s->base == bs->bh->b_data) { /* We were modifying this block in-place. */ ea_bdebug(bs->bh, "keeping this block"); + ext4_xattr_block_cache_insert(ea_block_cache, bs->bh); new_bh = bs->bh; get_bh(new_bh); } else { @@ -2042,8 +2091,8 @@ getblk_failed: } } - if (old_ea_inode_size) - ext4_xattr_inode_free_quota(inode, old_ea_inode_size); + if (old_ea_inode_quota) + ext4_xattr_inode_free_quota(inode, NULL, old_ea_inode_quota); /* Update the inode. */ EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; @@ -2070,7 +2119,7 @@ cleanup: /* If there was an error, revert the quota charge. */ if (error) - ext4_xattr_inode_free_quota(inode, + ext4_xattr_inode_free_quota(inode, ea_inode, i_size_read(ea_inode)); iput(ea_inode); } @@ -2625,23 +2674,21 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle) { struct ext4_xattr_ibody_header *header; - struct buffer_head *bh = NULL; + struct buffer_head *bh; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + static unsigned int mnt_count; size_t min_offs; size_t ifree, bfree; int total_ino; void *base, *end; int error = 0, tried_min_extra_isize = 0; - int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); + int s_min_extra_isize = le16_to_cpu(sbi->s_es->s_min_extra_isize); int isize_diff; /* How much do we need to grow i_extra_isize */ - int no_expand; - - if (ext4_write_trylock_xattr(inode, &no_expand) == 0) - return 0; retry: isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize; if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) - goto out; + return 0; header = IHDR(inode, raw_inode); @@ -2676,6 +2723,7 @@ retry: EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); error = -EFSCORRUPTED; + brelse(bh); goto cleanup; } base = BHDR(bh); @@ -2683,11 +2731,11 @@ retry: min_offs = end - base; bfree = ext4_xattr_free_space(BFIRST(bh), &min_offs, base, NULL); + brelse(bh); if (bfree + ifree < isize_diff) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; - brelse(bh); goto retry; } error = -ENOSPC; @@ -2705,7 +2753,6 @@ retry: s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; - brelse(bh); goto retry; } goto cleanup; @@ -2717,18 +2764,13 @@ shift: EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, (void *)header, total_ino); EXT4_I(inode)->i_extra_isize = new_extra_isize; - brelse(bh); -out: - ext4_write_unlock_xattr(inode, &no_expand); - return 0; cleanup: - brelse(bh); - /* - * Inode size expansion failed; don't try again - */ - no_expand = 1; - ext4_write_unlock_xattr(inode, &no_expand); + if (error && (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count))) { + ext4_warning(inode->i_sb, "Unable to expand inode %lu. Delete some EAs or run e2fsck.", + inode->i_ino); + mnt_count = le16_to_cpu(sbi->s_es->s_mnt_count); + } return error; } @@ -2793,6 +2835,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, struct ext4_xattr_ibody_header *header; struct ext4_iloc iloc = { .bh = NULL }; struct ext4_xattr_entry *entry; + struct inode *ea_inode; int error; error = ext4_xattr_ensure_credits(handle, inode, extra_credits, @@ -2847,10 +2890,19 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, if (ext4_has_feature_ea_inode(inode->i_sb)) { for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); - entry = EXT4_XATTR_NEXT(entry)) - if (entry->e_value_inum) - ext4_xattr_inode_free_quota(inode, + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + error = ext4_xattr_inode_iget(inode, + le32_to_cpu(entry->e_value_inum), + le32_to_cpu(entry->e_hash), + &ea_inode); + if (error) + continue; + ext4_xattr_inode_free_quota(inode, ea_inode, le32_to_cpu(entry->e_value_size)); + iput(ea_inode); + } } diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index a140c5e3dc54..b4b8438c42ef 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -211,7 +211,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { + if (acl && !ipage) { error = posix_acl_update_mode(inode, &inode->i_mode, &acl); if (error) return error; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 56bbf592e487..5b876f6d3f6b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -879,6 +879,7 @@ int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) struct inode *inode; struct f2fs_inode_info *fi; bool is_dir = (type == DIR_INODE); + unsigned long ino = 0; trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir, get_pages(sbi, is_dir ? @@ -901,8 +902,17 @@ retry: inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[type]); if (inode) { + unsigned long cur_ino = inode->i_ino; + filemap_fdatawrite(inode->i_mapping); iput(inode); + /* We need to give cpu to another writers. */ + if (ino == cur_ino) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + } else { + ino = cur_ino; + } } else { /* * We should submit bio, since it exists several diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 40fb3d4bb9c2..843a0d99f7ea 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1538,7 +1538,6 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) /* Is it quota file? Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) { - inode_unlock(inode); ret = -EPERM; goto unlock_out; } @@ -1549,9 +1548,8 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { - inode_unlock(inode); ret = -EPERM; - goto out; + goto unlock_out; } } @@ -1564,7 +1562,6 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) f2fs_mark_inode_dirty_sync(inode, false); unlock_out: inode_unlock(inode); -out: mnt_drop_write_file(filp); return ret; } diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 9adc202fcd6f..71191d89917d 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -11,6 +11,7 @@ */ #include <linux/proc_fs.h> #include <linux/f2fs_fs.h> +#include <linux/seq_file.h> #include "f2fs.h" #include "segment.h" diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e2ffc499d106..d66789804287 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -46,7 +46,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) { struct fuse_file *ff; - ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); + ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL); if (unlikely(!ff)) return NULL; @@ -609,7 +609,7 @@ static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req) struct fuse_io_priv *io = req->io; ssize_t pos = -1; - fuse_release_user_pages(req, !io->write); + fuse_release_user_pages(req, io->should_dirty); if (io->write) { if (req->misc.write.in.size != req->misc.write.out.size) @@ -1316,7 +1316,6 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, loff_t *ppos, int flags) { int write = flags & FUSE_DIO_WRITE; - bool should_dirty = !write && iter_is_iovec(iter); int cuse = flags & FUSE_DIO_CUSE; struct file *file = io->file; struct inode *inode = file->f_mapping->host; @@ -1346,6 +1345,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, inode_unlock(inode); } + io->should_dirty = !write && iter_is_iovec(iter); while (count) { size_t nres; fl_owner_t owner = current->files; @@ -1360,7 +1360,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, nres = fuse_send_read(req, io, pos, nbytes, owner); if (!io->async) - fuse_release_user_pages(req, should_dirty); + fuse_release_user_pages(req, io->should_dirty); if (req->out.h.error) { err = req->out.h.error; break; @@ -1669,6 +1669,7 @@ err_nofile: err_free: fuse_request_free(req); err: + mapping_set_error(page->mapping, error); end_page_writeback(page); return error; } @@ -2101,11 +2102,11 @@ static int convert_fuse_file_lock(struct fuse_conn *fc, fl->fl_end = ffl->end; /* - * Convert pid into the caller's pid namespace. If the pid - * does not map into the namespace fl_pid will get set to 0. + * Convert pid into init's pid namespace. The locks API will + * translate it into the caller's pid namespace. */ rcu_read_lock(); - fl->fl_pid = pid_vnr(find_pid_ns(ffl->pid, fc->pid_ns)); + fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns); rcu_read_unlock(); break; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1bd7ffdad593..bd4d2a3e1ec1 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -249,6 +249,7 @@ struct fuse_io_priv { size_t size; __u64 offset; bool write; + bool should_dirty; int err; struct kiocb *iocb; struct file *file; diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 2524807ee070..9d5eecb123de 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -86,19 +86,6 @@ int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) char *data; const char *name = gfs2_acl_name(type); - if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) - return -E2BIG; - - if (type == ACL_TYPE_ACCESS) { - umode_t mode = inode->i_mode; - - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (error) - return error; - if (mode != inode->i_mode) - mark_inode_dirty(inode); - } - if (acl) { len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); if (len == 0) @@ -129,6 +116,10 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) struct gfs2_holder gh; bool need_unlock = false; int ret; + umode_t mode; + + if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) + return -E2BIG; ret = gfs2_rsqa_alloc(ip); if (ret) @@ -140,7 +131,20 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) return ret; need_unlock = true; } + + mode = inode->i_mode; + if (type == ACL_TYPE_ACCESS && acl) { + ret = posix_acl_update_mode(inode, &mode, &acl); + if (ret) + goto unlock; + } + ret = __gfs2_set_acl(inode, acl, type); + if (!ret && mode != inode->i_mode) { + inode->i_mode = mode; + mark_inode_dirty(inode); + } +unlock: if (need_unlock) gfs2_glock_dq_uninit(&gh); return ret; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index ed7a2e252ad8..68ed06962537 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -234,7 +234,19 @@ out: static int gfs2_writepages(struct address_space *mapping, struct writeback_control *wbc) { - return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); + struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); + int ret = mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); + + /* + * Even if we didn't write any pages here, we might still be holding + * dirty pages in the ail. We forcibly flush the ail because we don't + * want balance_dirty_pages() to loop indefinitely trying to write out + * pages held in the ail that it can't find. + */ + if (ret == 0) + set_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags); + + return ret; } /** diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 9fa3aef9a5b3..3dd0cceefa43 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -291,8 +291,9 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl, if (trylock_buffer(rabh)) { if (!buffer_uptodate(rabh)) { rabh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META, - rabh); + submit_bh(REQ_OP_READ, + REQ_RAHEAD | REQ_META | REQ_PRIO, + rabh); continue; } unlock_buffer(rabh); @@ -1103,8 +1104,15 @@ static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp, while (true) { ptr = metapointer(h, mp); - if (*ptr) /* if we have a non-null pointer */ + if (*ptr) { /* if we have a non-null pointer */ + /* Now zero the metapath after the current height. */ + h++; + if (h < GFS2_MAX_META_HEIGHT) + memset(&mp->mp_list[h], 0, + (GFS2_MAX_META_HEIGHT - h) * + sizeof(mp->mp_list[0])); return true; + } if (mp->mp_list[h] < ptrs) mp->mp_list[h]++; @@ -1120,6 +1128,13 @@ enum dealloc_states { DEALLOC_DONE = 3, /* process complete */ }; +static bool mp_eq_to_hgt(struct metapath *mp, __u16 *nbof, unsigned int h) +{ + if (memcmp(mp->mp_list, nbof, h * sizeof(mp->mp_list[0]))) + return false; + return true; +} + /** * trunc_dealloc - truncate a file down to a desired size * @ip: inode to truncate @@ -1197,8 +1212,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize) /* If we're truncating to a non-zero size and the mp is at the beginning of file for the strip height, we need to preserve the first metadata pointer. */ - preserve1 = (newsize && - (mp.mp_list[mp_h] == nbof[mp_h])); + preserve1 = (newsize && mp_eq_to_hgt(&mp, nbof, mp_h)); bh = mp.mp_bh[mp_h]; gfs2_assert_withdraw(sdp, bh); if (gfs2_assert_withdraw(sdp, diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 5ee2e2f8576c..06a0d1947c77 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1513,7 +1513,9 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index, continue; } bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META, bh); + submit_bh(REQ_OP_READ, + REQ_RAHEAD | REQ_META | REQ_PRIO, + bh); continue; } brelse(bh); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c53ac6efd04c..33a0cb5701a3 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1032,8 +1032,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) mutex_lock(&fp->f_fl_mutex); - gl = fl_gh->gh_gl; - if (gl) { + if (gfs2_holder_initialized(fl_gh)) { if (fl_gh->gh_state == state) goto out; locks_lock_file_wait(file, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c38ab6c81898..98e845b7841b 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -15,6 +15,7 @@ #include <linux/buffer_head.h> #include <linux/delay.h> #include <linux/sort.h> +#include <linux/hash.h> #include <linux/jhash.h> #include <linux/kallsyms.h> #include <linux/gfs2_ondisk.h> @@ -71,7 +72,7 @@ static DEFINE_SPINLOCK(lru_lock); #define GFS2_GL_HASH_SHIFT 15 #define GFS2_GL_HASH_SIZE BIT(GFS2_GL_HASH_SHIFT) -static struct rhashtable_params ht_parms = { +static const struct rhashtable_params ht_parms = { .nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4, .key_len = offsetofend(struct lm_lockname, ln_type), .key_offset = offsetof(struct gfs2_glock, gl_name), @@ -80,6 +81,49 @@ static struct rhashtable_params ht_parms = { static struct rhashtable gl_hash_table; +#define GLOCK_WAIT_TABLE_BITS 12 +#define GLOCK_WAIT_TABLE_SIZE (1 << GLOCK_WAIT_TABLE_BITS) +static wait_queue_head_t glock_wait_table[GLOCK_WAIT_TABLE_SIZE] __cacheline_aligned; + +struct wait_glock_queue { + struct lm_lockname *name; + wait_queue_entry_t wait; +}; + +static int glock_wake_function(wait_queue_entry_t *wait, unsigned int mode, + int sync, void *key) +{ + struct wait_glock_queue *wait_glock = + container_of(wait, struct wait_glock_queue, wait); + struct lm_lockname *wait_name = wait_glock->name; + struct lm_lockname *wake_name = key; + + if (wake_name->ln_sbd != wait_name->ln_sbd || + wake_name->ln_number != wait_name->ln_number || + wake_name->ln_type != wait_name->ln_type) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + +static wait_queue_head_t *glock_waitqueue(struct lm_lockname *name) +{ + u32 hash = jhash2((u32 *)name, sizeof(*name) / 4, 0); + + return glock_wait_table + hash_32(hash, GLOCK_WAIT_TABLE_BITS); +} + +/** + * wake_up_glock - Wake up waiters on a glock + * @gl: the glock + */ +static void wake_up_glock(struct gfs2_glock *gl) +{ + wait_queue_head_t *wq = glock_waitqueue(&gl->gl_name); + + if (waitqueue_active(wq)) + __wake_up(wq, TASK_NORMAL, 1, &gl->gl_name); +} + static void gfs2_glock_dealloc(struct rcu_head *rcu) { struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); @@ -96,6 +140,9 @@ void gfs2_glock_free(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms); + smp_mb(); + wake_up_glock(gl); call_rcu(&gl->gl_rcu, gfs2_glock_dealloc); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) wake_up(&sdp->sd_glock_wait); @@ -107,7 +154,7 @@ void gfs2_glock_free(struct gfs2_glock *gl) * */ -static void gfs2_glock_hold(struct gfs2_glock *gl) +void gfs2_glock_hold(struct gfs2_glock *gl) { GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref)); lockref_get(&gl->gl_lockref); @@ -150,6 +197,9 @@ void gfs2_glock_add_to_lru(struct gfs2_glock *gl) static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) { + if (!(gl->gl_ops->go_flags & GLOF_LRU)) + return; + spin_lock(&lru_lock); if (!list_empty(&gl->gl_lru)) { list_del_init(&gl->gl_lru); @@ -191,13 +241,20 @@ static void __gfs2_glock_put(struct gfs2_glock *gl) gfs2_glock_remove_from_lru(gl); spin_unlock(&gl->gl_lockref.lock); - rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); GLOCK_BUG_ON(gl, mapping && mapping->nrpages); trace_gfs2_glock_put(gl); sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); } +/* + * Cause the glock to be put in work queue context. + */ +void gfs2_glock_queue_put(struct gfs2_glock *gl) +{ + gfs2_glock_queue_work(gl, 0); +} + /** * gfs2_glock_put() - Decrement reference count on glock * @gl: The glock to put @@ -676,6 +733,40 @@ static void glock_work_func(struct work_struct *work) spin_unlock(&gl->gl_lockref.lock); } +static struct gfs2_glock *find_insert_glock(struct lm_lockname *name, + struct gfs2_glock *new) +{ + struct wait_glock_queue wait; + wait_queue_head_t *wq = glock_waitqueue(name); + struct gfs2_glock *gl; + + wait.name = name; + init_wait(&wait.wait); + wait.wait.func = glock_wake_function; + +again: + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + rcu_read_lock(); + if (new) { + gl = rhashtable_lookup_get_insert_fast(&gl_hash_table, + &new->gl_node, ht_parms); + if (IS_ERR(gl)) + goto out; + } else { + gl = rhashtable_lookup_fast(&gl_hash_table, + name, ht_parms); + } + if (gl && !lockref_get_not_dead(&gl->gl_lockref)) { + rcu_read_unlock(); + schedule(); + goto again; + } +out: + rcu_read_unlock(); + finish_wait(wq, &wait.wait); + return gl; +} + /** * gfs2_glock_get() - Get a glock, or create one if one doesn't exist * @sdp: The GFS2 superblock @@ -702,15 +793,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, struct kmem_cache *cachep; int ret = 0; - rcu_read_lock(); - gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms); - if (gl && !lockref_get_not_dead(&gl->gl_lockref)) - gl = NULL; - rcu_read_unlock(); - - *glp = gl; - if (gl) + gl = find_insert_glock(&name, NULL); + if (gl) { + *glp = gl; return 0; + } if (!create) return -ENOENT; @@ -764,10 +851,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, mapping->writeback_index = 0; } -again: - rcu_read_lock(); - tmp = rhashtable_lookup_get_insert_fast(&gl_hash_table, &gl->gl_node, - ht_parms); + tmp = find_insert_glock(&name, gl); if (!tmp) { *glp = gl; goto out; @@ -776,13 +860,7 @@ again: ret = PTR_ERR(tmp); goto out_free; } - if (lockref_get_not_dead(&tmp->gl_lockref)) { - *glp = tmp; - goto out_free; - } - rcu_read_unlock(); - cond_resched(); - goto again; + *glp = tmp; out_free: kfree(gl->gl_lksb.sb_lvbptr); @@ -790,7 +868,6 @@ out_free: atomic_dec(&sdp->sd_glock_disposal); out: - rcu_read_unlock(); return ret; } @@ -1473,14 +1550,15 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) do { gl = ERR_PTR(rhashtable_walk_start(&iter)); - if (gl) - continue; + if (IS_ERR(gl)) + goto walk_stop; while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl)) - if ((gl->gl_name.ln_sbd == sdp) && + if (gl->gl_name.ln_sbd == sdp && lockref_get_not_dead(&gl->gl_lockref)) examiner(gl); +walk_stop: rhashtable_walk_stop(&iter); } while (cond_resched(), gl == ERR_PTR(-EAGAIN)); @@ -1803,7 +1881,7 @@ static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr) int __init gfs2_glock_init(void) { - int ret; + int i, ret; ret = rhashtable_init(&gl_hash_table, &ht_parms); if (ret < 0) @@ -1832,6 +1910,9 @@ int __init gfs2_glock_init(void) return ret; } + for (i = 0; i < GLOCK_WAIT_TABLE_SIZE; i++) + init_waitqueue_head(glock_wait_table + i); + return 0; } @@ -1860,6 +1941,7 @@ static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi) } static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) { struct gfs2_glock_iter *gi = seq->private; loff_t n = *pos; @@ -1892,6 +1974,7 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, } static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) + __releases(RCU) { struct gfs2_glock_iter *gi = seq->private; diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 9ad4a6ac6c84..5e12220cc0c2 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -13,6 +13,7 @@ #include <linux/sched.h> #include <linux/parser.h> #include "incore.h" +#include "util.h" /* Options for hostdata parser */ @@ -181,7 +182,9 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, int create, struct gfs2_glock **glp); +extern void gfs2_glock_hold(struct gfs2_glock *gl); extern void gfs2_glock_put(struct gfs2_glock *gl); +extern void gfs2_glock_queue_put(struct gfs2_glock *gl); extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags, struct gfs2_holder *gh); extern void gfs2_holder_reinit(unsigned int state, u16 flags, @@ -257,11 +260,44 @@ static inline bool gfs2_holder_initialized(struct gfs2_holder *gh) return gh->gh_gl; } +/** + * glock_set_object - set the gl_object field of a glock + * @gl: the glock + * @object: the object + */ static inline void glock_set_object(struct gfs2_glock *gl, void *object) { spin_lock(&gl->gl_lockref.lock); + if (gfs2_assert_warn(gl->gl_name.ln_sbd, gl->gl_object == NULL)) + gfs2_dump_glock(NULL, gl); gl->gl_object = object; spin_unlock(&gl->gl_lockref.lock); } +/** + * glock_clear_object - clear the gl_object field of a glock + * @gl: the glock + * @object: the object + * + * I'd love to similarly add this: + * else if (gfs2_assert_warn(gl->gl_sbd, gl->gl_object == object)) + * gfs2_dump_glock(NULL, gl); + * Unfortunately, that's not possible because as soon as gfs2_delete_inode + * frees the block in the rgrp, another process can reassign it for an I_NEW + * inode in gfs2_create_inode because that calls new_inode, not gfs2_iget. + * That means gfs2_delete_inode may subsequently try to call this function + * for a glock that's already pointing to a brand new inode. If we clear the + * new inode's gl_object, we'll introduce metadata corruption. Function + * gfs2_delete_inode calls clear_inode which calls gfs2_clear_inode which also + * tries to clear gl_object, so it's more than just gfs2_delete_inode. + * + */ +static inline void glock_clear_object(struct gfs2_glock *gl, void *object) +{ + spin_lock(&gl->gl_lockref.lock); + if (gl->gl_object == object) + gl->gl_object = NULL; + spin_unlock(&gl->gl_lockref.lock); +} + #endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 5e69636d4dd3..dac6559e2195 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -329,32 +329,6 @@ static int inode_go_demote_ok(const struct gfs2_glock *gl) return 1; } -/** - * gfs2_set_nlink - Set the inode's link count based on on-disk info - * @inode: The inode in question - * @nlink: The link count - * - * If the link count has hit zero, it must never be raised, whatever the - * on-disk inode might say. When new struct inodes are created the link - * count is set to 1, so that we can safely use this test even when reading - * in on disk information for the first time. - */ - -static void gfs2_set_nlink(struct inode *inode, u32 nlink) -{ - /* - * We will need to review setting the nlink count here in the - * light of the forthcoming ro bind mount work. This is a reminder - * to do that. - */ - if ((inode->i_nlink != nlink) && (inode->i_nlink != 0)) { - if (nlink == 0) - clear_nlink(inode); - else - set_nlink(inode, nlink); - } -} - static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) { const struct gfs2_dinode *str = buf; @@ -376,7 +350,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid)); i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid)); - gfs2_set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink)); + set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink)); i_size_write(&ip->i_inode, be64_to_cpu(str->di_size)); gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); atime.tv_sec = be64_to_cpu(str->di_atime); @@ -470,7 +444,7 @@ static int inode_go_lock(struct gfs2_holder *gh) (gh->gh_state == LM_ST_EXCLUSIVE)) { spin_lock(&sdp->sd_trunc_lock); if (list_empty(&ip->i_trunc_list)) - list_add(&sdp->sd_trunc_list, &ip->i_trunc_list); + list_add(&ip->i_trunc_list, &sdp->sd_trunc_list); spin_unlock(&sdp->sd_trunc_lock); wake_up(&sdp->sd_quota_wait); return 1; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 73fce76e67ee..6e18e9793ec4 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -606,6 +606,7 @@ enum { SDF_NOJOURNALID = 6, SDF_RORECOVERY = 7, /* read only recovery */ SDF_SKIP_DLM_UNLOCK = 8, + SDF_FORCE_AIL_FLUSH = 9, }; enum gfs2_freeze_state { @@ -816,6 +817,7 @@ struct gfs2_sbd { atomic_t sd_log_in_flight; struct bio *sd_log_bio; wait_queue_head_t sd_log_flush_wait; + int sd_log_error; atomic_t sd_reserving_log; wait_queue_head_t sd_reserving_log_wait; @@ -831,7 +833,7 @@ struct gfs2_sbd { atomic_t sd_freeze_state; struct mutex sd_freeze_mutex; - char sd_fsname[GFS2_FSNAME_LEN]; + char sd_fsname[GFS2_FSNAME_LEN + 3 * sizeof(int) + 2]; char sd_table_name[GFS2_FSNAME_LEN]; char sd_proto_name[GFS2_FSNAME_LEN]; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index acca501f8110..863749e29bf9 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -109,7 +109,7 @@ static void gfs2_set_iop(struct inode *inode) * @no_addr: The inode number * @no_formal_ino: The inode generation number * @blktype: Requested block type (GFS2_BLKST_DINODE or GFS2_BLKST_UNLINKED; - * GFS2_BLKST_FREE do indicate not to verify) + * GFS2_BLKST_FREE to indicate not to verify) * * If @type is DT_UNKNOWN, the inode type is fetched from disk. * @@ -145,7 +145,6 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (unlikely(error)) goto fail; flush_delayed_work(&ip->i_gl->gl_work); - glock_set_object(ip->i_gl, ip); error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (unlikely(error)) @@ -170,11 +169,11 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, } } + glock_set_object(ip->i_gl, ip); set_bit(GIF_INVALID, &ip->i_flags); error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (unlikely(error)) goto fail_put; - flush_delayed_work(&ip->i_iopen_gh.gh_gl->gl_work); glock_set_object(ip->i_iopen_gh.gh_gl, ip); gfs2_glock_put(io_gl); io_gl = NULL; @@ -202,14 +201,14 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, fail_refresh: ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - glock_set_object(ip->i_iopen_gh.gh_gl, NULL); + glock_clear_object(ip->i_iopen_gh.gh_gl, ip); gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_put: if (io_gl) gfs2_glock_put(io_gl); + glock_clear_object(ip->i_gl, ip); if (gfs2_holder_initialized(&i_gh)) gfs2_glock_dq_uninit(&i_gh); - glock_set_object(ip->i_gl, NULL); fail: iget_failed(inode); return ERR_PTR(error); @@ -706,8 +705,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); if (error) goto fail_free_inode; - + flush_delayed_work(&ip->i_gl->gl_work); glock_set_object(ip->i_gl, ip); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); if (error) goto fail_free_inode; @@ -775,14 +775,17 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, return error; fail_gunlock3: + glock_clear_object(io_gl, ip); gfs2_glock_dq_uninit(&ip->i_iopen_gh); gfs2_glock_put(io_gl); fail_gunlock2: if (io_gl) clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); fail_free_inode: - if (ip->i_gl) + if (ip->i_gl) { + glock_clear_object(ip->i_gl, ip); gfs2_glock_put(ip->i_gl); + } gfs2_rsqa_delete(ip, NULL); fail_free_acls: if (default_acl) diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 0515f0a68637..65f33a0ac190 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -23,8 +23,6 @@ #include "sys.h" #include "trace_gfs2.h" -extern struct workqueue_struct *gfs2_control_wq; - /** * gfs2_update_stats - Update time based stats * @mv: Pointer to mean/variance structure to update @@ -1059,6 +1057,7 @@ static void free_recover_size(struct lm_lockstruct *ls) ls->ls_recover_submit = NULL; ls->ls_recover_result = NULL; ls->ls_recover_size = 0; + ls->ls_lvb_bits = NULL; } /* dlm calls before it does lock recovery */ @@ -1175,7 +1174,7 @@ static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid, spin_unlock(&ls->ls_recover_spin); } -const struct dlm_lockspace_ops gdlm_lockspace_ops = { +static const struct dlm_lockspace_ops gdlm_lockspace_ops = { .recover_prep = gdlm_recover_prep, .recover_slot = gdlm_recover_slot, .recover_done = gdlm_recover_done, diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 9a624f694400..f72c44231406 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -898,6 +898,10 @@ static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) { unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free); + + if (test_and_clear_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags)) + return 1; + return used_blocks + atomic_read(&sdp->sd_log_blks_needed) >= atomic_read(&sdp->sd_log_thresh2); } @@ -919,6 +923,15 @@ int gfs2_logd(void *data) while (!kthread_should_stop()) { + /* Check for errors writing to the journal */ + if (sdp->sd_log_error) { + gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: error %d: " + "withdrawing the file system to " + "prevent further damage.\n", + sdp->sd_fsname, sdp->sd_log_error); + } + did_flush = false; if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { gfs2_ail1_empty(sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 3010f9edd177..7dabbe721dba 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -207,8 +207,11 @@ static void gfs2_end_log_write(struct bio *bio) struct page *page; int i; - if (bio->bi_status) - fs_err(sdp, "Error %d writing to log\n", bio->bi_status); + if (bio->bi_status) { + fs_err(sdp, "Error %d writing to journal, jid=%u\n", + bio->bi_status, sdp->sd_jdesc->jd_jid); + wake_up(&sdp->sd_logd_waitq); + } bio_for_each_segment_all(bvec, bio, i) { page = bvec->bv_page; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index fabe1614f879..61ef6c9be816 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -419,8 +419,9 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) { brelse(bh); ret = -EIO; + } else { + *bhp = bh; } - *bhp = bh; return ret; } @@ -452,7 +453,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(REQ_OP_READ, REQ_META, 1, &first_bh); + ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &first_bh); dblock++; extlen--; @@ -461,7 +462,9 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) bh = gfs2_getbuf(gl, dblock, CREATE); if (!buffer_uptodate(bh) && !buffer_locked(bh)) - ll_rw_block(REQ_OP_READ, REQ_RAHEAD | REQ_META, 1, &bh); + ll_rw_block(REQ_OP_READ, + REQ_RAHEAD | REQ_META | REQ_PRIO, + 1, &bh); brelse(bh); dblock++; extlen--; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e76058d34b74..c0a4b3778f3f 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1113,7 +1113,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent return error; } - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name); + snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s", sdp->sd_table_name); error = gfs2_sys_fs_add(sdp); /* @@ -1159,10 +1159,10 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent } if (sdp->sd_args.ar_spectator) - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", + snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s.s", sdp->sd_table_name); else - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", + snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s.%u", sdp->sd_table_name, sdp->sd_lockstruct.ls_jid); error = init_inodes(sdp, DO); @@ -1388,7 +1388,6 @@ static void gfs2_kill_sb(struct super_block *sb) sdp->sd_root_dir = NULL; sdp->sd_master_dir = NULL; shrink_dcache_sb(sb); - gfs2_delete_debugfs_file(sdp); free_percpu(sdp->sd_lkstats); kill_block_super(sb); } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index c2ca9566b764..e647938432bd 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -730,7 +730,7 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, if (PageUptodate(page)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { - ll_rw_block(REQ_OP_READ, REQ_META, 1, &bh); + ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) goto unlock_out; @@ -1474,8 +1474,11 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error) { if (error == 0 || error == -EROFS) return; - if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) { fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error); + sdp->sd_log_error = error; + wake_up(&sdp->sd_logd_waitq); + } } static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg, diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 836e38ba5d0a..95b2a57ded33 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -705,8 +705,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) rb_erase(n, &sdp->sd_rindex_tree); if (gl) { - glock_set_object(gl, NULL); - gfs2_glock_add_to_lru(gl); + glock_clear_object(gl, rgd); gfs2_glock_put(gl); } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index fdedec379b78..769841185ce5 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -924,6 +924,7 @@ restart: gfs2_jindex_free(sdp); /* Take apart glock structures and buffer lists */ gfs2_gl_hash_clear(sdp); + gfs2_delete_debugfs_file(sdp); /* Unmount the locking protocol */ gfs2_lm_unmount(sdp); @@ -943,9 +944,9 @@ static int gfs2_sync_fs(struct super_block *sb, int wait) struct gfs2_sbd *sdp = sb->s_fs_info; gfs2_quota_sync(sb, -1); - if (wait && sdp) + if (wait) gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); - return 0; + return sdp->sd_log_error; } void gfs2_freeze_func(struct work_struct *work) @@ -1295,7 +1296,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) * gfs2_drop_inode - Drop an inode (test for remote unlink) * @inode: The inode to drop * - * If we've received a callback on an iopen lock then its because a + * If we've received a callback on an iopen lock then it's because a * remote node tried to deallocate the inode but failed due to this node * still having the inode open. Here we mark the link count zero * since we know that it must have reached zero if the GLF_DEMOTE flag @@ -1317,6 +1318,23 @@ static int gfs2_drop_inode(struct inode *inode) if (test_bit(GLF_DEMOTE, &gl->gl_flags)) clear_nlink(inode); } + + /* + * When under memory pressure when an inode's link count has dropped to + * zero, defer deleting the inode to the delete workqueue. This avoids + * calling into DLM under memory pressure, which can deadlock. + */ + if (!inode->i_nlink && + unlikely(current->flags & PF_MEMALLOC) && + gfs2_holder_initialized(&ip->i_iopen_gh)) { + struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; + + gfs2_glock_hold(gl); + if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + gfs2_glock_queue_put(gl); + return false; + } + return generic_drop_inode(inode); } @@ -1501,6 +1519,22 @@ out_qs: } /** + * gfs2_glock_put_eventually + * @gl: The glock to put + * + * When under memory pressure, trigger a deferred glock put to make sure we + * won't call into DLM and deadlock. Otherwise, put the glock directly. + */ + +static void gfs2_glock_put_eventually(struct gfs2_glock *gl) +{ + if (current->flags & PF_MEMALLOC) + gfs2_glock_queue_put(gl); + else + gfs2_glock_put(gl); +} + +/** * gfs2_evict_inode - Remove an inode from cache * @inode: The inode to evict * @@ -1544,9 +1578,14 @@ static void gfs2_evict_inode(struct inode *inode) goto alloc_failed; } + /* Deletes should never happen under memory pressure anymore. */ + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) + goto out; + /* Must not read inode block until block type has been verified */ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); if (unlikely(error)) { + glock_clear_object(ip->i_iopen_gh.gh_gl, ip); ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&ip->i_iopen_gh); goto out; @@ -1562,6 +1601,12 @@ static void gfs2_evict_inode(struct inode *inode) goto out_truncate; } + /* + * The inode may have been recreated in the meantime. + */ + if (inode->i_nlink) + goto out_truncate; + alloc_failed: if (gfs2_holder_initialized(&ip->i_iopen_gh) && test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { @@ -1595,6 +1640,11 @@ alloc_failed: goto out_unlock; } + /* We're about to clear the bitmap for the dinode, but as soon as we + do, gfs2_create_inode can create another inode at the same block + location and try to set gl_object again. We clear gl_object here so + that subsequent inode creates don't see an old gl_object. */ + glock_clear_object(ip->i_gl, ip); error = gfs2_dinode_dealloc(ip); goto out_unlock; @@ -1623,14 +1673,17 @@ out_unlock: gfs2_rs_deltree(&ip->i_res); if (gfs2_holder_initialized(&ip->i_iopen_gh)) { + glock_clear_object(ip->i_iopen_gh.gh_gl, ip); if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq(&ip->i_iopen_gh); } gfs2_holder_uninit(&ip->i_iopen_gh); } - if (gfs2_holder_initialized(&gh)) + if (gfs2_holder_initialized(&gh)) { + glock_clear_object(ip->i_gl, ip); gfs2_glock_dq_uninit(&gh); + } if (error && error != GLR_TRYFAILED && error != -EROFS) fs_warn(sdp, "gfs2_evict_inode: %d\n", error); out: @@ -1640,15 +1693,19 @@ out: gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); - glock_set_object(ip->i_gl, NULL); + glock_clear_object(ip->i_gl, ip); wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE); gfs2_glock_add_to_lru(ip->i_gl); - gfs2_glock_put(ip->i_gl); + gfs2_glock_put_eventually(ip->i_gl); ip->i_gl = NULL; if (gfs2_holder_initialized(&ip->i_iopen_gh)) { - glock_set_object(ip->i_iopen_gh.gh_gl, NULL); + struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; + + glock_clear_object(gl, ip); ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_hold(gl); gfs2_glock_dq_uninit(&ip->i_iopen_gh); + gfs2_glock_put_eventually(gl); } } diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index c81295f407f6..3926f95a6eb7 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -151,6 +151,7 @@ extern struct kmem_cache *gfs2_rgrpd_cachep; extern struct kmem_cache *gfs2_quotad_cachep; extern struct kmem_cache *gfs2_qadata_cachep; extern mempool_t *gfs2_page_pool; +extern struct workqueue_struct *gfs2_control_wq; static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, unsigned int *p) diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 54179554c7d2..ea09e41dbb49 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -25,6 +25,7 @@ #include "meta_io.h" #include "quota.h" #include "rgrp.h" +#include "super.h" #include "trans.h" #include "util.h" @@ -1209,8 +1210,12 @@ int __gfs2_xattr_set(struct inode *inode, const char *name, if (namel > GFS2_EA_MAX_NAME_LEN) return -ERANGE; - if (value == NULL) - return gfs2_xattr_remove(ip, type, name); + if (value == NULL) { + error = gfs2_xattr_remove(ip, type, name); + if (error == -ENODATA && !(flags & XATTR_REPLACE)) + error = 0; + return error; + } if (ea_check_size(sdp, namel, size)) return -ERANGE; diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c index 9b92058a1240..6bb5d7c42888 100644 --- a/fs/hfsplus/posix_acl.c +++ b/fs/hfsplus/posix_acl.c @@ -51,8 +51,8 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type) return acl; } -int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, - int type) +static int __hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, + int type) { int err; char *xattr_name; @@ -64,12 +64,6 @@ int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, switch (type) { case ACL_TYPE_ACCESS: xattr_name = XATTR_NAME_POSIX_ACL_ACCESS; - if (acl) { - err = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (err) - return err; - } - err = 0; break; case ACL_TYPE_DEFAULT: @@ -105,6 +99,18 @@ end_set_acl: return err; } +int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int err; + + if (type == ACL_TYPE_ACCESS && acl) { + err = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (err) + return err; + } + return __hfsplus_set_posix_acl(inode, acl, type); +} + int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) { int err = 0; @@ -122,15 +128,15 @@ int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) return err; if (default_acl) { - err = hfsplus_set_posix_acl(inode, default_acl, - ACL_TYPE_DEFAULT); + err = __hfsplus_set_posix_acl(inode, default_acl, + ACL_TYPE_DEFAULT); posix_acl_release(default_acl); } if (acl) { if (!err) - err = hfsplus_set_posix_acl(inode, acl, - ACL_TYPE_ACCESS); + err = __hfsplus_set_posix_acl(inode, acl, + ACL_TYPE_ACCESS); posix_acl_release(acl); } return err; diff --git a/fs/inode.c b/fs/inode.c index 50370599e371..6a1626e0edaf 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -637,6 +637,7 @@ again: dispose_list(&dispose); } +EXPORT_SYMBOL_GPL(evict_inodes); /** * invalidate_inodes - attempt to free all inodes on a superblock diff --git a/fs/internal.h b/fs/internal.h index 9676fe11c093..fedfe94d84ba 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -132,7 +132,6 @@ static inline bool atime_needs_update_rcu(const struct path *path, extern void inode_io_list_del(struct inode *inode); extern long get_nr_dirty_inodes(void); -extern void evict_inodes(struct super_block *); extern int invalidate_inodes(struct super_block *, bool); /* diff --git a/fs/iomap.c b/fs/iomap.c index 039266128b7f..8554a8d75281 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -278,7 +278,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, unsigned long bytes; /* Bytes to write to page */ offset = (pos & (PAGE_SIZE - 1)); - bytes = min_t(unsigned long, PAGE_SIZE - offset, length); + bytes = min_t(loff_t, PAGE_SIZE - offset, length); rpage = __iomap_read_page(inode, pos); if (IS_ERR(rpage)) @@ -373,7 +373,7 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, unsigned offset, bytes; offset = pos & (PAGE_SIZE - 1); /* Within page */ - bytes = min_t(unsigned, PAGE_SIZE - offset, count); + bytes = min_t(loff_t, PAGE_SIZE - offset, count); if (IS_DAX(inode)) status = iomap_dax_zero(pos, offset, bytes, iomap); @@ -477,10 +477,10 @@ int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) set_page_dirty(page); wait_for_stable_page(page); - return 0; + return VM_FAULT_LOCKED; out_unlock: unlock_page(page); - return ret; + return block_page_mkwrite_return(ret); } EXPORT_SYMBOL_GPL(iomap_page_mkwrite); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 8cf898a59730..217a5e7815da 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -410,7 +410,11 @@ static int parse_options(char *options, struct iso9660_options *popt) if (match_int(&args[0], &option)) return 0; n = option; - if (n > 99) + /* + * Track numbers are supposed to be in range 1-99, the + * mount option starts indexing at 0. + */ + if (n >= 99) return 0; popt->session = n + 1; break; @@ -543,7 +547,7 @@ static unsigned int isofs_get_last_session(struct super_block *sb, s32 session) vol_desc_start=0; ms_info.addr_format=CDROM_LBA; - if(session >= 0 && session <= 99) { + if (session > 0) { struct cdrom_tocentry Te; Te.cdte_track=session; Te.cdte_format=CDROM_LBA; diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 7bc186f4ed4d..2e71b6e7e646 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -77,13 +77,6 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, switch (type) { case ACL_TYPE_ACCESS: ea_name = XATTR_NAME_POSIX_ACL_ACCESS; - if (acl) { - rc = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (rc) - return rc; - inode->i_ctime = current_time(inode); - mark_inode_dirty(inode); - } break; case ACL_TYPE_DEFAULT: ea_name = XATTR_NAME_POSIX_ACL_DEFAULT; @@ -115,12 +108,27 @@ int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int rc; tid_t tid; + int update_mode = 0; + umode_t mode = inode->i_mode; tid = txBegin(inode->i_sb, 0); mutex_lock(&JFS_IP(inode)->commit_mutex); + if (type == ACL_TYPE_ACCESS && acl) { + rc = posix_acl_update_mode(inode, &mode, &acl); + if (rc) + goto end_tx; + update_mode = 1; + } rc = __jfs_set_acl(tid, inode, type, acl); - if (!rc) + if (!rc) { + if (update_mode) { + inode->i_mode = mode; + inode->i_ctime = current_time(inode); + mark_inode_dirty(inode); + } rc = txCommit(tid, 1, &inode, 0); + } +end_tx: txEnd(tid); mutex_unlock(&JFS_IP(inode)->commit_mutex); return rc; diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c index bd9b641ada2c..7ddcb445a3d9 100644 --- a/fs/jfs/resize.c +++ b/fs/jfs/resize.c @@ -98,7 +98,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) goto out; } - VolumeSize = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; + VolumeSize = i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits; if (VolumeSize) { if (newLVSize > VolumeSize) { @@ -211,7 +211,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) txQuiesce(sb); /* Reset size of direct inode */ - sbi->direct_inode->i_size = sb->s_bdev->bd_inode->i_size; + sbi->direct_inode->i_size = i_size_read(sb->s_bdev->bd_inode); if (sbi->mntflag & JFS_INLINELOG) { /* diff --git a/fs/jfs/super.c b/fs/jfs/super.c index e8aad7d87b8c..60726ae7cf26 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -313,7 +313,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, } case Opt_resize_nosize: { - *newLVSize = sb->s_bdev->bd_inode->i_size >> + *newLVSize = i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits; if (*newLVSize == 0) pr_err("JFS: Cannot determine volume size\n"); @@ -579,7 +579,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) goto out_unload; } inode->i_ino = 0; - inode->i_size = sb->s_bdev->bd_inode->i_size; + inode->i_size = i_size_read(sb->s_bdev->bd_inode); inode->i_mapping->a_ops = &jfs_metapage_aops; hlist_add_fake(&inode->i_hash); mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); @@ -619,16 +619,10 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) if (!sb->s_root) goto out_no_root; - /* logical blocks are represented by 40 bits in pxd_t, etc. */ - sb->s_maxbytes = ((u64) sb->s_blocksize) << 40; -#if BITS_PER_LONG == 32 - /* - * Page cache is indexed by long. - * I would use MAX_LFS_FILESIZE, but it's only half as big + /* logical blocks are represented by 40 bits in pxd_t, etc. + * and page cache is indexed by long */ - sb->s_maxbytes = min(((u64) PAGE_SIZE << 32) - 1, - (u64)sb->s_maxbytes); -#endif + sb->s_maxbytes = min(((loff_t)sb->s_blocksize) << 40, MAX_LFS_FILESIZE); sb->s_time_gran = 1; return 0; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index ac2dfe0c5a9c..e6c8954a4e89 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -997,7 +997,7 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, #ifdef CONFIG_DEBUG_LOCK_ALLOC if (key) { - lockdep_init_map(&kn->dep_map, "s_active", key, 0); + lockdep_init_map(&kn->dep_map, "kn->count", key, 0); kn->flags |= KERNFS_LOCKDEP; } #endif diff --git a/fs/locks.c b/fs/locks.c index afefeb4ad6de..1bd71c4d663a 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -137,6 +137,7 @@ #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) #define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) #define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) +#define IS_REMOTELCK(fl) (fl->fl_pid <= 0) static inline bool is_remote_lock(struct file *filp) { @@ -270,6 +271,22 @@ locks_check_ctx_lists(struct inode *inode) } } +static void +locks_check_ctx_file_list(struct file *filp, struct list_head *list, + char *list_type) +{ + struct file_lock *fl; + struct inode *inode = locks_inode(filp); + + list_for_each_entry(fl, list, fl_list) + if (fl->fl_file == filp) + pr_warn("Leaked %s lock on dev=0x%x:0x%x ino=0x%lx " + " fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", + list_type, MAJOR(inode->i_sb->s_dev), + MINOR(inode->i_sb->s_dev), inode->i_ino, + fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid); +} + void locks_free_lock_context(struct inode *inode) { @@ -733,7 +750,6 @@ static void locks_wake_up_blocks(struct file_lock *blocker) static void locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before) { - fl->fl_nspid = get_pid(task_tgid(current)); list_add_tail(&fl->fl_list, before); locks_insert_global_locks(fl); } @@ -743,10 +759,6 @@ locks_unlink_lock_ctx(struct file_lock *fl) { locks_delete_global_locks(fl); list_del_init(&fl->fl_list); - if (fl->fl_nspid) { - put_pid(fl->fl_nspid); - fl->fl_nspid = NULL; - } locks_wake_up_blocks(fl); } @@ -823,8 +835,6 @@ posix_test_lock(struct file *filp, struct file_lock *fl) list_for_each_entry(cfl, &ctx->flc_posix, fl_list) { if (posix_locks_conflict(fl, cfl)) { locks_copy_conflock(fl, cfl); - if (cfl->fl_nspid) - fl->fl_pid = pid_vnr(cfl->fl_nspid); goto out; } } @@ -2048,9 +2058,33 @@ int vfs_test_lock(struct file *filp, struct file_lock *fl) } EXPORT_SYMBOL_GPL(vfs_test_lock); +/** + * locks_translate_pid - translate a file_lock's fl_pid number into a namespace + * @fl: The file_lock who's fl_pid should be translated + * @ns: The namespace into which the pid should be translated + * + * Used to tranlate a fl_pid into a namespace virtual pid number + */ +static pid_t locks_translate_pid(struct file_lock *fl, struct pid_namespace *ns) +{ + pid_t vnr; + struct pid *pid; + + if (IS_OFDLCK(fl)) + return -1; + if (IS_REMOTELCK(fl)) + return fl->fl_pid; + + rcu_read_lock(); + pid = find_pid_ns(fl->fl_pid, &init_pid_ns); + vnr = pid_nr_ns(pid, ns); + rcu_read_unlock(); + return vnr; +} + static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) { - flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid; + flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current)); #if BITS_PER_LONG == 32 /* * Make sure we can represent the posix lock via @@ -2072,7 +2106,7 @@ static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) #if BITS_PER_LONG == 32 static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) { - flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid; + flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current)); flock->l_start = fl->fl_start; flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : fl->fl_end - fl->fl_start + 1; @@ -2086,14 +2120,17 @@ static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) */ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock) { - struct file_lock file_lock; + struct file_lock *fl; int error; + fl = locks_alloc_lock(); + if (fl == NULL) + return -ENOMEM; error = -EINVAL; if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) goto out; - error = flock_to_posix_lock(filp, &file_lock, flock); + error = flock_to_posix_lock(filp, fl, flock); if (error) goto out; @@ -2103,23 +2140,22 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock) goto out; cmd = F_GETLK; - file_lock.fl_flags |= FL_OFDLCK; - file_lock.fl_owner = filp; + fl->fl_flags |= FL_OFDLCK; + fl->fl_owner = filp; } - error = vfs_test_lock(filp, &file_lock); + error = vfs_test_lock(filp, fl); if (error) goto out; - flock->l_type = file_lock.fl_type; - if (file_lock.fl_type != F_UNLCK) { - error = posix_lock_to_flock(flock, &file_lock); + flock->l_type = fl->fl_type; + if (fl->fl_type != F_UNLCK) { + error = posix_lock_to_flock(flock, fl); if (error) - goto rel_priv; + goto out; } -rel_priv: - locks_release_private(&file_lock); out: + locks_free_lock(fl); return error; } @@ -2298,14 +2334,18 @@ out: */ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock) { - struct file_lock file_lock; + struct file_lock *fl; int error; + fl = locks_alloc_lock(); + if (fl == NULL) + return -ENOMEM; + error = -EINVAL; if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) goto out; - error = flock64_to_posix_lock(filp, &file_lock, flock); + error = flock64_to_posix_lock(filp, fl, flock); if (error) goto out; @@ -2315,20 +2355,20 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock) goto out; cmd = F_GETLK64; - file_lock.fl_flags |= FL_OFDLCK; - file_lock.fl_owner = filp; + fl->fl_flags |= FL_OFDLCK; + fl->fl_owner = filp; } - error = vfs_test_lock(filp, &file_lock); + error = vfs_test_lock(filp, fl); if (error) goto out; - flock->l_type = file_lock.fl_type; - if (file_lock.fl_type != F_UNLCK) - posix_lock_to_flock64(flock, &file_lock); + flock->l_type = fl->fl_type; + if (fl->fl_type != F_UNLCK) + posix_lock_to_flock64(flock, fl); - locks_release_private(&file_lock); out: + locks_free_lock(fl); return error; } @@ -2525,6 +2565,12 @@ void locks_remove_file(struct file *filp) /* remove any leases */ locks_remove_lease(filp, ctx); + + spin_lock(&ctx->flc_lock); + locks_check_ctx_file_list(filp, &ctx->flc_posix, "POSIX"); + locks_check_ctx_file_list(filp, &ctx->flc_flock, "FLOCK"); + locks_check_ctx_file_list(filp, &ctx->flc_lease, "LEASE"); + spin_unlock(&ctx->flc_lock); } /** @@ -2578,22 +2624,16 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, { struct inode *inode = NULL; unsigned int fl_pid; + struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info; - if (fl->fl_nspid) { - struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info; - - /* Don't let fl_pid change based on who is reading the file */ - fl_pid = pid_nr_ns(fl->fl_nspid, proc_pidns); - - /* - * If there isn't a fl_pid don't display who is waiting on - * the lock if we are called from locks_show, or if we are - * called from __show_fd_info - skip lock entirely - */ - if (fl_pid == 0) - return; - } else - fl_pid = fl->fl_pid; + fl_pid = locks_translate_pid(fl, proc_pidns); + /* + * If there isn't a fl_pid don't display who is waiting on + * the lock if we are called from locks_show, or if we are + * called from __show_fd_info - skip lock entirely + */ + if (fl_pid == 0) + return; if (fl->fl_file != NULL) inode = locks_inode(fl->fl_file); @@ -2668,7 +2708,7 @@ static int locks_show(struct seq_file *f, void *v) fl = hlist_entry(v, struct file_lock, fl_link); - if (fl->fl_nspid && !pid_nr_ns(fl->fl_nspid, proc_pidns)) + if (locks_translate_pid(fl, proc_pidns) == 0) return 0; lock_get_status(f, fl, iter->li_pos, ""); diff --git a/fs/mount.h b/fs/mount.h index de45d9e76748..6790767d1883 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -16,7 +16,7 @@ struct mnt_namespace { u64 event; unsigned int mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; -}; +} __randomize_layout; struct mnt_pcp { int mnt_count; @@ -69,7 +69,7 @@ struct mount { struct hlist_head mnt_pins; struct fs_pin mnt_umount; struct dentry *mnt_ex_mountpoint; -}; +} __randomize_layout; #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */ diff --git a/fs/namei.c b/fs/namei.c index 88fd38d1e3e7..ddb6a7c2b3d4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -524,7 +524,7 @@ struct nameidata { struct inode *link_inode; unsigned root_seq; int dfd; -}; +} __randomize_layout; static void set_nameidata(struct nameidata *p, int dfd, struct filename *name) { diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 69d02cf8cf37..5f93cfacb3d1 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -121,6 +121,7 @@ config PNFS_FILE_LAYOUT config PNFS_BLOCK tristate depends on NFS_V4_1 && BLK_DEV_DM + depends on 64BIT || LBDAF default NFS_V4 config PNFS_FLEXFILE_LAYOUT diff --git a/fs/nfs/client.c b/fs/nfs/client.c index ee5ddbd36088..efebe6cf4378 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -820,6 +820,7 @@ void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *sour target->caps = source->caps; target->options = source->options; target->auth_info = source->auth_info; + target->port = source->port; } EXPORT_SYMBOL_GPL(nfs_server_copy_userdata); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 5ac484fe0dee..3522b1249019 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2372,16 +2372,40 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) } EXPORT_SYMBOL_GPL(nfs_access_add_cache); +#define NFS_MAY_READ (NFS4_ACCESS_READ) +#define NFS_MAY_WRITE (NFS4_ACCESS_MODIFY | \ + NFS4_ACCESS_EXTEND | \ + NFS4_ACCESS_DELETE) +#define NFS_FILE_MAY_WRITE (NFS4_ACCESS_MODIFY | \ + NFS4_ACCESS_EXTEND) +#define NFS_DIR_MAY_WRITE NFS_MAY_WRITE +#define NFS_MAY_LOOKUP (NFS4_ACCESS_LOOKUP) +#define NFS_MAY_EXECUTE (NFS4_ACCESS_EXECUTE) +static int +nfs_access_calc_mask(u32 access_result, umode_t umode) +{ + int mask = 0; + + if (access_result & NFS_MAY_READ) + mask |= MAY_READ; + if (S_ISDIR(umode)) { + if ((access_result & NFS_DIR_MAY_WRITE) == NFS_DIR_MAY_WRITE) + mask |= MAY_WRITE; + if ((access_result & NFS_MAY_LOOKUP) == NFS_MAY_LOOKUP) + mask |= MAY_EXEC; + } else if (S_ISREG(umode)) { + if ((access_result & NFS_FILE_MAY_WRITE) == NFS_FILE_MAY_WRITE) + mask |= MAY_WRITE; + if ((access_result & NFS_MAY_EXECUTE) == NFS_MAY_EXECUTE) + mask |= MAY_EXEC; + } else if (access_result & NFS_MAY_WRITE) + mask |= MAY_WRITE; + return mask; +} + void nfs_access_set_mask(struct nfs_access_entry *entry, u32 access_result) { - entry->mask = 0; - if (access_result & NFS4_ACCESS_READ) - entry->mask |= MAY_READ; - if (access_result & - (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE)) - entry->mask |= MAY_WRITE; - if (access_result & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) - entry->mask |= MAY_EXEC; + entry->mask = access_result; } EXPORT_SYMBOL_GPL(nfs_access_set_mask); @@ -2389,6 +2413,7 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) { struct nfs_access_entry cache; bool may_block = (mask & MAY_NOT_BLOCK) == 0; + int cache_mask; int status; trace_nfs_access_enter(inode); @@ -2404,7 +2429,8 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) goto out; /* Be clever: ask server to check for all possible rights */ - cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ; + cache.mask = NFS_MAY_LOOKUP | NFS_MAY_EXECUTE + | NFS_MAY_WRITE | NFS_MAY_READ; cache.cred = cred; cache.jiffies = jiffies; status = NFS_PROTO(inode)->access(inode, &cache); @@ -2418,7 +2444,8 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) } nfs_access_add_cache(inode, &cache); out_cached: - if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0) + cache_mask = nfs_access_calc_mask(cache.mask, inode->i_mode); + if ((mask & ~cache_mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0) status = -EACCES; out: trace_nfs_access_exit(inode, status); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 5713eb32a45e..af330c31f627 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -617,6 +617,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) if (result) goto out; } + if (iocb->ki_pos > i_size_read(inode)) + nfs_revalidate_mapping(inode, file->f_mapping); nfs_start_io_write(inode); result = generic_write_checks(iocb, from); @@ -750,7 +752,7 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) */ nfs_sync_mapping(filp->f_mapping); if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) - nfs_zap_mapping(inode, filp->f_mapping); + nfs_zap_caches(inode); out: return status; } diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 080fc6b278bd..44c638b7876c 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -542,6 +542,10 @@ filelayout_check_deviceid(struct pnfs_layout_hdr *lo, struct nfs4_file_layout_dsaddr *dsaddr; int status = -EINVAL; + /* Is the deviceid already set? If so, we're good. */ + if (fl->dsaddr != NULL) + return 0; + /* find and reference the deviceid */ d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &fl->deviceid, lo->plh_lc_cred, gfp_flags); @@ -553,8 +557,6 @@ filelayout_check_deviceid(struct pnfs_layout_hdr *lo, if (filelayout_test_devid_unavailable(&dsaddr->id_node)) goto out_put; - fl->dsaddr = dsaddr; - if (fl->first_stripe_index >= dsaddr->stripe_count) { dprintk("%s Bad first_stripe_index %u\n", __func__, fl->first_stripe_index); @@ -570,6 +572,13 @@ filelayout_check_deviceid(struct pnfs_layout_hdr *lo, goto out_put; } status = 0; + + /* + * Atomic compare and xchange to ensure we don't scribble + * over a non-NULL pointer. + */ + if (cmpxchg(&fl->dsaddr, NULL, dsaddr) != NULL) + goto out_put; out: return status; out_put: diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 1f2ac3dd0fe5..b0fa83a60754 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1842,6 +1842,10 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) int vers, ret; struct nfs_fh *fh; + if (!lseg || !(pnfs_is_valid_lseg(lseg) || + test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))) + goto out_err; + idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); ds = nfs4_ff_layout_prepare_ds(lseg, idx, true); if (!ds) diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 6df7a0cf5660..f32c58bbe556 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -32,6 +32,7 @@ void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds) { nfs4_print_deviceid(&mirror_ds->id_node.deviceid); nfs4_pnfs_ds_put(mirror_ds->ds); + kfree(mirror_ds->ds_versions); kfree_rcu(mirror_ds, id_node.rcu); } diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 3efe946672be..60bad882c123 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -512,7 +512,7 @@ static const struct rpc_version mnt_version1 = { .counts = mnt_counts, }; -static unsigned int mnt3_counts[ARRAY_SIZE(mnt_procedures)]; +static unsigned int mnt3_counts[ARRAY_SIZE(mnt3_procedures)]; static const struct rpc_version mnt_version3 = { .number = 3, .nrprocs = ARRAY_SIZE(mnt3_procedures), diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index df4a7d3ab915..d1e87ec0df84 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -220,15 +220,8 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); nfs_refresh_inode(inode, res.fattr); - if (status == 0) { - entry->mask = 0; - if (res.access & NFS3_ACCESS_READ) - entry->mask |= MAY_READ; - if (res.access & (NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND | NFS3_ACCESS_DELETE)) - entry->mask |= MAY_WRITE; - if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE)) - entry->mask |= MAY_EXEC; - } + if (status == 0) + nfs_access_set_mask(entry, res.access); nfs_free_fattr(res.fattr); out: dprintk("NFS reply access: %d\n", status); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 50566acb5469..e9bea90dc017 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -660,9 +660,6 @@ int nfs4_detect_session_trunking(struct nfs_client *clp, if (!nfs4_check_server_scope(clp->cl_serverscope, res->server_scope)) goto out_err; - /* Session trunking passed, add the xprt */ - rpc_clnt_xprt_switch_add_xprt(clp->cl_rpcclient, xprt); - pr_info("NFS: %s: Session trunking succeeded for %s\n", clp->cl_hostname, xprt->address_strings[RPC_DISPLAY_ADDR]); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a0b4e1091340..d90132642340 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2236,7 +2236,7 @@ static int nfs4_opendata_access(struct rpc_cred *cred, int openflags) { struct nfs_access_entry cache; - u32 mask; + u32 mask, flags; /* access call failed or for some reason the server doesn't * support any access modes -- defer access call until later */ @@ -2250,16 +2250,20 @@ static int nfs4_opendata_access(struct rpc_cred *cred, */ if (openflags & __FMODE_EXEC) { /* ONLY check for exec rights */ - mask = MAY_EXEC; + if (S_ISDIR(state->inode->i_mode)) + mask = NFS4_ACCESS_LOOKUP; + else + mask = NFS4_ACCESS_EXECUTE; } else if ((fmode & FMODE_READ) && !opendata->file_created) - mask = MAY_READ; + mask = NFS4_ACCESS_READ; cache.cred = cred; cache.jiffies = jiffies; nfs_access_set_mask(&cache, opendata->o_res.access_result); nfs_access_add_cache(state->inode, &cache); - if ((mask & ~cache.mask & (MAY_READ | MAY_EXEC)) == 0) + flags = NFS4_ACCESS_READ | NFS4_ACCESS_EXECUTE | NFS4_ACCESS_LOOKUP; + if ((mask & ~cache.mask & flags) == 0) return 0; return -EACCES; @@ -2549,9 +2553,8 @@ static int nfs41_check_open_stateid(struct nfs4_state *state) clear_bit(NFS_O_RDWR_STATE, &state->flags); clear_bit(NFS_OPEN_STATE, &state->flags); stateid->type = NFS4_INVALID_STATEID_TYPE; - } - if (status != NFS_OK) return status; + } if (nfs_open_stateid_recover_openmode(state)) return -NFS4ERR_OPENMODE; return NFS_OK; @@ -6492,7 +6495,7 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) set_current_state(TASK_INTERRUPTIBLE); spin_unlock_irqrestore(&q->lock, flags); - freezable_schedule_timeout_interruptible(NFS4_LOCK_MAXTIMEOUT); + freezable_schedule_timeout(NFS4_LOCK_MAXTIMEOUT); } finish_wait(q, &wait); @@ -7457,7 +7460,7 @@ static void nfs4_exchange_id_done(struct rpc_task *task, void *data) cdata->res.server_scope = NULL; } /* Save the EXCHANGE_ID verifier session trunk tests */ - memcpy(clp->cl_confirm.data, cdata->args.verifier->data, + memcpy(clp->cl_confirm.data, cdata->args.verifier.data, sizeof(clp->cl_confirm.data)); } out: @@ -7470,10 +7473,6 @@ static void nfs4_exchange_id_release(void *data) struct nfs41_exchange_id_data *cdata = (struct nfs41_exchange_id_data *)data; - if (cdata->xprt) { - xprt_put(cdata->xprt); - rpc_clnt_xprt_switch_put(cdata->args.client->cl_rpcclient); - } nfs_put_client(cdata->args.client); kfree(cdata->res.impl_id); kfree(cdata->res.server_scope); @@ -7494,7 +7493,6 @@ static const struct rpc_call_ops nfs4_exchange_id_call_ops = { static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, u32 sp4_how, struct rpc_xprt *xprt) { - nfs4_verifier verifier; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID], .rpc_cred = cred, @@ -7503,7 +7501,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, .rpc_client = clp->cl_rpcclient, .callback_ops = &nfs4_exchange_id_call_ops, .rpc_message = &msg, - .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, + .flags = RPC_TASK_TIMEOUT, }; struct nfs41_exchange_id_data *calldata; struct rpc_task *task; @@ -7518,8 +7516,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, return -ENOMEM; } - if (!xprt) - nfs4_init_boot_verifier(clp, &verifier); + nfs4_init_boot_verifier(clp, &calldata->args.verifier); status = nfs4_init_uniform_client_string(clp); if (status) @@ -7558,11 +7555,9 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, if (xprt) { calldata->xprt = xprt; task_setup_data.rpc_xprt = xprt; - task_setup_data.flags = - RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC; - calldata->args.verifier = &clp->cl_confirm; - } else { - calldata->args.verifier = &verifier; + task_setup_data.flags |= RPC_TASK_SOFTCONN; + memcpy(calldata->args.verifier.data, clp->cl_confirm.data, + sizeof(calldata->args.verifier.data)); } calldata->args.client = clp; #ifdef CONFIG_NFS_V4_1_MIGRATION @@ -7581,12 +7576,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, if (IS_ERR(task)) return PTR_ERR(task); - if (!xprt) { - status = rpc_wait_for_completion_task(task); - if (!status) - status = calldata->rpc_status; - } else /* session trunking test */ - status = calldata->rpc_status; + status = calldata->rpc_status; rpc_put_task(task); out: diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index fa3eb361d4f8..37c8af003275 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1785,7 +1785,7 @@ static void encode_exchange_id(struct xdr_stream *xdr, int len = 0; encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr); - encode_nfs4_verifier(xdr, args->verifier); + encode_nfs4_verifier(xdr, &args->verifier); encode_string(xdr, strlen(args->client->cl_owner_id), args->client->cl_owner_id); diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index d40755a0984b..25f28fa64c57 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -159,13 +159,18 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst, { struct pnfs_commit_bucket *b; struct pnfs_layout_segment *freeme; + int nwritten; int i; lockdep_assert_held(&cinfo->inode->i_lock); restart: for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { - if (pnfs_generic_transfer_commit_list(&b->written, dst, - cinfo, 0)) { + nwritten = pnfs_generic_transfer_commit_list(&b->written, + dst, cinfo, 0); + if (!nwritten) + continue; + cinfo->ds->nwritten -= nwritten; + if (list_empty(&b->written)) { freeme = b->wlseg; b->wlseg = NULL; spin_unlock(&cinfo->inode->i_lock); @@ -174,7 +179,6 @@ restart: goto restart; } } - cinfo->ds->nwritten = 0; } EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs); @@ -183,6 +187,7 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx) struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; struct pnfs_commit_bucket *bucket; struct pnfs_layout_segment *freeme; + struct list_head *pos; LIST_HEAD(pages); int i; @@ -193,6 +198,8 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx) continue; freeme = bucket->clseg; bucket->clseg = NULL; + list_for_each(pos, &bucket->committing) + cinfo->ds->ncommitting--; list_splice_init(&bucket->committing, &pages); spin_unlock(&cinfo->inode->i_lock); nfs_retry_commit(&pages, freeme, cinfo, i); @@ -217,13 +224,6 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo, for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) { if (list_empty(&bucket->committing)) continue; - /* - * If the layout segment is invalid, then let - * pnfs_generic_retry_commit() clean up the bucket. - */ - if (bucket->clseg && !pnfs_is_valid_lseg(bucket->clseg) && - !test_bit(NFS_LSEG_LAYOUTRETURN, &bucket->clseg->pls_flags)) - break; data = nfs_commitdata_alloc(false); if (!data) break; @@ -243,9 +243,12 @@ void pnfs_fetch_commit_bucket_list(struct list_head *pages, struct nfs_commit_info *cinfo) { struct pnfs_commit_bucket *bucket; + struct list_head *pos; bucket = &cinfo->ds->buckets[data->ds_commit_index]; spin_lock(&cinfo->inode->i_lock); + list_for_each(pos, &bucket->committing) + cinfo->ds->ncommitting--; list_splice_init(&bucket->committing, pages); data->lseg = bucket->clseg; bucket->clseg = NULL; @@ -330,7 +333,6 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, } } out: - cinfo->ds->ncommitting = 0; return PNFS_ATTEMPTED; } EXPORT_SYMBOL_GPL(pnfs_generic_commit_pagelist); diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index b45083c0f9ae..49b0a9e7ff18 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -720,8 +720,8 @@ static const struct rpc_version nfs_cb_version4 = { .counts = nfs4_cb_counts, }; -static const struct rpc_version *nfs_cb_version[] = { - &nfs_cb_version4, +static const struct rpc_version *nfs_cb_version[2] = { + [1] = &nfs_cb_version4, }; static const struct rpc_program cb_program; @@ -795,7 +795,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c .saddress = (struct sockaddr *) &conn->cb_saddr, .timeout = &timeparms, .program = &cb_program, - .version = 0, + .version = 1, .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), }; struct rpc_clnt *client; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 20fbcab97753..5f940d2a136b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -144,7 +144,7 @@ static void next_decode_page(struct nfsd4_compoundargs *argp) argp->p = page_address(argp->pagelist[0]); argp->pagelist++; if (argp->pagelen < PAGE_SIZE) { - argp->end = argp->p + (argp->pagelen>>2); + argp->end = argp->p + XDR_QUADLEN(argp->pagelen); argp->pagelen = 0; } else { argp->end = argp->p + (PAGE_SIZE>>2); @@ -1279,9 +1279,7 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) argp->pagelen -= pages * PAGE_SIZE; len -= pages * PAGE_SIZE; - argp->p = (__be32 *)page_address(argp->pagelist[0]); - argp->pagelist++; - argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE); + next_decode_page(argp); } argp->p += XDR_QUADLEN(len); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 38d0383dc7f9..bc69d40c4e8b 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -969,7 +969,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, int use_wgather; loff_t pos = offset; unsigned int pflags = current->flags; - int flags = 0; + rwf_t flags = 0; if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) /* diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index dc22ba8c710f..e50a387959bf 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -240,18 +240,6 @@ int ocfs2_set_acl(handle_t *handle, switch (type) { case ACL_TYPE_ACCESS: name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - umode_t mode; - - ret = posix_acl_update_mode(inode, &mode, &acl); - if (ret) - return ret; - - ret = ocfs2_acl_set_mode(inode, di_bh, - handle, mode); - if (ret) - return ret; - } break; case ACL_TYPE_DEFAULT: name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT; @@ -289,7 +277,19 @@ int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh); if (had_lock < 0) return had_lock; + if (type == ACL_TYPE_ACCESS && acl) { + umode_t mode; + + status = posix_acl_update_mode(inode, &mode, &acl); + if (status) + goto unlock; + + status = ocfs2_acl_set_mode(inode, bh, NULL, mode); + if (status) + goto unlock; + } status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); +unlock: ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); brelse(bh); return status; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 641d9ee97f91..48b70e6490f3 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -481,17 +481,30 @@ out_cleanup: } static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, - struct cattr *attr, struct dentry *hardlink) + struct cattr *attr, struct dentry *hardlink, + bool origin) { int err; const struct cred *old_cred; struct cred *override_cred; + struct dentry *parent = dentry->d_parent; - err = ovl_copy_up(dentry->d_parent); + err = ovl_copy_up(parent); if (err) return err; old_cred = ovl_override_creds(dentry->d_sb); + + /* + * When linking a file with copy up origin into a new parent, mark the + * new parent dir "impure". + */ + if (origin) { + err = ovl_set_impure(parent, ovl_dentry_upper(parent)); + if (err) + goto out_revert_creds; + } + err = -ENOMEM; override_cred = prepare_creds(); if (override_cred) { @@ -550,7 +563,7 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, inode_init_owner(inode, dentry->d_parent->d_inode, mode); attr.mode = inode->i_mode; - err = ovl_create_or_link(dentry, inode, &attr, NULL); + err = ovl_create_or_link(dentry, inode, &attr, NULL, false); if (err) iput(inode); @@ -609,7 +622,8 @@ static int ovl_link(struct dentry *old, struct inode *newdir, inode = d_inode(old); ihold(inode); - err = ovl_create_or_link(new, inode, NULL, ovl_dentry_upper(old)); + err = ovl_create_or_link(new, inode, NULL, ovl_dentry_upper(old), + ovl_type_origin(old)); if (err) iput(inode); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 69f4fc26ee39..5bc71642b226 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -202,37 +202,38 @@ bool ovl_is_private_xattr(const char *name) sizeof(OVL_XATTR_PREFIX) - 1) == 0; } -int ovl_xattr_set(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) +int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, + const void *value, size_t size, int flags) { int err; - struct path realpath; - enum ovl_path_type type = ovl_path_real(dentry, &realpath); + struct dentry *upperdentry = ovl_i_dentry_upper(inode); + struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); const struct cred *old_cred; err = ovl_want_write(dentry); if (err) goto out; - if (!value && !OVL_TYPE_UPPER(type)) { - err = vfs_getxattr(realpath.dentry, name, NULL, 0); + if (!value && !upperdentry) { + err = vfs_getxattr(realdentry, name, NULL, 0); if (err < 0) goto out_drop_write; } - err = ovl_copy_up(dentry); - if (err) - goto out_drop_write; + if (!upperdentry) { + err = ovl_copy_up(dentry); + if (err) + goto out_drop_write; - if (!OVL_TYPE_UPPER(type)) - ovl_path_upper(dentry, &realpath); + realdentry = ovl_dentry_upper(dentry); + } old_cred = ovl_override_creds(dentry->d_sb); if (value) - err = vfs_setxattr(realpath.dentry, name, value, size, flags); + err = vfs_setxattr(realdentry, name, value, size, flags); else { WARN_ON(flags != XATTR_REPLACE); - err = vfs_removexattr(realpath.dentry, name); + err = vfs_removexattr(realdentry, name); } revert_creds(old_cred); @@ -242,12 +243,13 @@ out: return err; } -int ovl_xattr_get(struct dentry *dentry, const char *name, +int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { - struct dentry *realdentry = ovl_dentry_real(dentry); ssize_t res; const struct cred *old_cred; + struct dentry *realdentry = + ovl_i_dentry_upper(inode) ?: ovl_dentry_lower(dentry); old_cred = ovl_override_creds(dentry->d_sb); res = vfs_getxattr(realdentry, name, value, size); diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 9bc0e580a5b3..8aef2b304b2d 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -397,8 +397,19 @@ int ovl_verify_index(struct dentry *index, struct path *lowerstack, if (!d_inode(index)) return 0; - err = -EISDIR; - if (d_is_dir(index)) + /* + * Directory index entries are going to be used for looking up + * redirected upper dirs by lower dir fh when decoding an overlay + * file handle of a merge dir. Whiteout index entries are going to be + * used as an indication that an exported overlay file handle should + * be treated as stale (i.e. after unlink of the overlay inode). + * We don't know the verification rules for directory and whiteout + * index entries, because they have not been implemented yet, so return + * EROFS if those entries are found to avoid corrupting an index that + * was created by a newer kernel. + */ + err = -EROFS; + if (d_is_dir(index) || ovl_is_whiteout(index)) goto fail; err = -EINVAL; @@ -436,8 +447,8 @@ out: return err; fail: - pr_warn_ratelimited("overlayfs: failed to verify index (%pd2, err=%i)\n", - index, err); + pr_warn_ratelimited("overlayfs: failed to verify index (%pd2, ftype=%x, err=%i)\n", + index, d_inode(index)->i_mode & S_IFMT, err); goto out; } @@ -502,6 +513,7 @@ static struct dentry *ovl_lookup_index(struct dentry *dentry, goto out; } + inode = d_inode(index); if (d_is_negative(index)) { if (upper && d_inode(origin)->i_nlink > 1) { pr_warn_ratelimited("overlayfs: hard link with origin but no index (ino=%lu).\n", @@ -511,11 +523,22 @@ static struct dentry *ovl_lookup_index(struct dentry *dentry, dput(index); index = NULL; - } else if (upper && d_inode(index) != d_inode(upper)) { - inode = d_inode(index); - pr_warn_ratelimited("overlayfs: wrong index found (index ino: %lu, upper ino: %lu).\n", - d_inode(index)->i_ino, - d_inode(upper)->i_ino); + } else if (upper && d_inode(upper) != inode) { + pr_warn_ratelimited("overlayfs: wrong index found (index=%pd2, ino=%lu, upper ino=%lu).\n", + index, inode->i_ino, d_inode(upper)->i_ino); + goto fail; + } else if (ovl_dentry_weird(index) || ovl_is_whiteout(index) || + ((inode->i_mode ^ d_inode(origin)->i_mode) & S_IFMT)) { + /* + * Index should always be of the same file type as origin + * except for the case of a whiteout index. A whiteout + * index should only exist if all lower aliases have been + * unlinked, which means that finding a lower origin on lookup + * whose index is a whiteout should be treated as an error. + */ + pr_warn_ratelimited("overlayfs: bad index found (index=%pd2, ftype=%x, origin ftype=%x).\n", + index, d_inode(index)->i_mode & S_IFMT, + d_inode(origin)->i_mode & S_IFMT); goto fail; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 60d26605e039..e927a62c97ae 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -47,7 +47,8 @@ enum ovl_flag { /* Is the real inode encoded in fid an upper inode? */ #define OVL_FH_FLAG_PATH_UPPER (1 << 2) -#define OVL_FH_FLAG_ALL (OVL_FH_FLAG_BIG_ENDIAN | OVL_FH_FLAG_ANY_ENDIAN) +#define OVL_FH_FLAG_ALL (OVL_FH_FLAG_BIG_ENDIAN | OVL_FH_FLAG_ANY_ENDIAN | \ + OVL_FH_FLAG_PATH_UPPER) #if defined(__LITTLE_ENDIAN) #define OVL_FH_FLAG_CPU_ENDIAN 0 @@ -199,6 +200,7 @@ enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); struct dentry *ovl_dentry_upper(struct dentry *dentry); struct dentry *ovl_dentry_lower(struct dentry *dentry); struct dentry *ovl_dentry_real(struct dentry *dentry); +struct dentry *ovl_i_dentry_upper(struct inode *inode); struct inode *ovl_inode_upper(struct inode *inode); struct inode *ovl_inode_lower(struct inode *inode); struct inode *ovl_inode_real(struct inode *inode); @@ -270,9 +272,9 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr); int ovl_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); int ovl_permission(struct inode *inode, int mask); -int ovl_xattr_set(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags); -int ovl_xattr_get(struct dentry *dentry, const char *name, +int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, + const void *value, size_t size, int flags); +int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size); ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); struct posix_acl *ovl_get_acl(struct inode *inode, int type); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 0298463cf9c3..f0fd3adb1693 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -446,14 +446,14 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, ovl_path_upper(dentry, &upperpath); realfile = ovl_path_open(&upperpath, O_RDONLY); - smp_mb__before_spinlock(); + inode_lock(inode); if (!od->upperfile) { if (IS_ERR(realfile)) { inode_unlock(inode); return PTR_ERR(realfile); } - od->upperfile = realfile; + smp_store_release(&od->upperfile, realfile); } else { /* somebody has beaten us to it */ if (!IS_ERR(realfile)) @@ -703,7 +703,10 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, err = PTR_ERR(index); break; } - if (ovl_verify_index(index, lowerstack, numlower)) { + err = ovl_verify_index(index, lowerstack, numlower); + if (err) { + if (err == -EROFS) + break; err = ovl_cleanup(dir, index); if (err) break; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 44dc2d6ffe0f..d86e89f97201 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -692,7 +692,7 @@ ovl_posix_acl_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) { - return ovl_xattr_get(dentry, handler->name, buffer, size); + return ovl_xattr_get(dentry, inode, handler->name, buffer, size); } static int __maybe_unused @@ -742,7 +742,7 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler, return err; } - err = ovl_xattr_set(dentry, handler->name, value, size, flags); + err = ovl_xattr_set(dentry, inode, handler->name, value, size, flags); if (!err) ovl_copyattr(ovl_inode_real(inode), inode); @@ -772,7 +772,7 @@ static int ovl_other_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) { - return ovl_xattr_get(dentry, name, buffer, size); + return ovl_xattr_get(dentry, inode, name, buffer, size); } static int ovl_other_xattr_set(const struct xattr_handler *handler, @@ -780,7 +780,7 @@ static int ovl_other_xattr_set(const struct xattr_handler *handler, const char *name, const void *value, size_t size, int flags) { - return ovl_xattr_set(dentry, name, value, size, flags); + return ovl_xattr_set(dentry, inode, name, value, size, flags); } static const struct xattr_handler __maybe_unused @@ -1058,10 +1058,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) ufs->indexdir = ovl_workdir_create(sb, ufs, workpath.dentry, OVL_INDEXDIR_NAME, true); - err = PTR_ERR(ufs->indexdir); - if (IS_ERR(ufs->indexdir)) - goto out_put_lower_mnt; - if (ufs->indexdir) { /* Verify upper root is index dir origin */ err = ovl_verify_origin(ufs->indexdir, ufs->upper_mnt, @@ -1090,6 +1086,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) else sb->s_d_op = &ovl_dentry_operations; + err = -ENOMEM; ufs->creator_cred = cred = prepare_creds(); if (!cred) goto out_put_indexdir; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index c492ba75c659..f46ad75dc96a 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -157,9 +157,14 @@ struct dentry *ovl_dentry_real(struct dentry *dentry) return ovl_dentry_upper(dentry) ?: ovl_dentry_lower(dentry); } +struct dentry *ovl_i_dentry_upper(struct inode *inode) +{ + return ovl_upperdentry_dereference(OVL_I(inode)); +} + struct inode *ovl_inode_upper(struct inode *inode) { - struct dentry *upperdentry = ovl_upperdentry_dereference(OVL_I(inode)); + struct dentry *upperdentry = ovl_i_dentry_upper(inode); return upperdentry ? d_inode(upperdentry) : NULL; } diff --git a/fs/proc/base.c b/fs/proc/base.c index 719c2e943ea1..98fd8f6df851 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1408,12 +1408,13 @@ static const struct file_operations proc_fail_nth_operations = { static int sched_show(struct seq_file *m, void *v) { struct inode *inode = m->private; + struct pid_namespace *ns = inode->i_sb->s_fs_info; struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; - proc_sched_show_task(p, m); + proc_sched_show_task(p, ns, m); put_task_struct(p); diff --git a/fs/proc/devices.c b/fs/proc/devices.c index 50493edc30e5..e5709343feb7 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -7,14 +7,14 @@ static int devinfo_show(struct seq_file *f, void *v) { int i = *(loff_t *) v; - if (i < CHRDEV_MAJOR_HASH_SIZE) { + if (i < CHRDEV_MAJOR_MAX) { if (i == 0) seq_puts(f, "Character devices:\n"); chrdev_show(f, i); } #ifdef CONFIG_BLOCK else { - i -= CHRDEV_MAJOR_HASH_SIZE; + i -= CHRDEV_MAJOR_MAX; if (i == 0) seq_puts(f, "\nBlock devices:\n"); blkdev_show(f, i); @@ -25,7 +25,7 @@ static int devinfo_show(struct seq_file *f, void *v) static void *devinfo_start(struct seq_file *f, loff_t *pos) { - if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) + if (*pos < (BLKDEV_MAJOR_MAX + CHRDEV_MAJOR_MAX)) return pos; return NULL; } @@ -33,7 +33,7 @@ static void *devinfo_start(struct seq_file *f, loff_t *pos) static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos) { (*pos)++; - if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) + if (*pos >= (BLKDEV_MAJOR_MAX + CHRDEV_MAJOR_MAX)) return NULL; return pos; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 18694598bebf..aa2b89071630 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -51,7 +51,7 @@ struct proc_dir_entry { spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */ u8 namelen; char name[]; -}; +} __randomize_layout; union proc_op { int (*proc_get_link)(struct dentry *, struct path *); @@ -70,7 +70,7 @@ struct proc_inode { struct hlist_node sysctl_inodes; const struct proc_ns_operations *ns_ops; struct inode vfs_inode; -}; +} __randomize_layout; /* * General functions @@ -279,7 +279,7 @@ struct proc_maps_private { #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; #endif -}; +} __randomize_layout; struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 8a428498d6b2..509a61668d90 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -106,13 +106,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_node_page_state(NR_FILE_MAPPED)); show_val_kb(m, "Shmem: ", i.sharedram); show_val_kb(m, "Slab: ", - global_page_state(NR_SLAB_RECLAIMABLE) + - global_page_state(NR_SLAB_UNRECLAIMABLE)); + global_node_page_state(NR_SLAB_RECLAIMABLE) + + global_node_page_state(NR_SLAB_UNRECLAIMABLE)); show_val_kb(m, "SReclaimable: ", - global_page_state(NR_SLAB_RECLAIMABLE)); + global_node_page_state(NR_SLAB_RECLAIMABLE)); show_val_kb(m, "SUnreclaim: ", - global_page_state(NR_SLAB_UNRECLAIMABLE)); + global_node_page_state(NR_SLAB_UNRECLAIMABLE)); seq_printf(m, "KernelStack: %8lu kB\n", global_page_state(NR_KERNEL_STACK_KB)); show_val_kb(m, "PageTables: ", diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b836fd61ed87..fe8f3265e877 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -16,9 +16,10 @@ #include <linux/mmu_notifier.h> #include <linux/page_idle.h> #include <linux/shmem_fs.h> +#include <linux/uaccess.h> #include <asm/elf.h> -#include <linux/uaccess.h> +#include <asm/tlb.h> #include <asm/tlbflush.h> #include "internal.h" @@ -1008,6 +1009,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, struct mm_struct *mm; struct vm_area_struct *vma; enum clear_refs_types type; + struct mmu_gather tlb; int itype; int rv; @@ -1054,6 +1056,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, } down_read(&mm->mmap_sem); + tlb_gather_mmu(&tlb, mm, 0, -1); if (type == CLEAR_REFS_SOFT_DIRTY) { for (vma = mm->mmap; vma; vma = vma->vm_next) { if (!(vma->vm_flags & VM_SOFTDIRTY)) @@ -1075,7 +1078,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, walk_page_range(0, mm->highest_vm_end, &clear_refs_walk); if (type == CLEAR_REFS_SOFT_DIRTY) mmu_notifier_invalidate_range_end(mm, 0, -1); - flush_tlb_mm(mm); + tlb_finish_mmu(&tlb, 0, -1); up_read(&mm->mmap_sem); out_mm: mmput(mm); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 53a17496c5c5..566e6ef99f07 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1124,6 +1124,10 @@ void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) WARN_ON_ONCE(1); dquot->dq_dqb.dqb_rsvspace = 0; } + if (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace <= + dquot->dq_dqb.dqb_bsoftlimit) + dquot->dq_dqb.dqb_btime = (time64_t) 0; + clear_bit(DQ_BLKS_B, &dquot->dq_flags); } static void dquot_decr_inodes(struct dquot *dquot, qsize_t number) @@ -1145,7 +1149,8 @@ static void dquot_decr_space(struct dquot *dquot, qsize_t number) dquot->dq_dqb.dqb_curspace -= number; else dquot->dq_dqb.dqb_curspace = 0; - if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit) + if (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace <= + dquot->dq_dqb.dqb_bsoftlimit) dquot->dq_dqb.dqb_btime = (time64_t) 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); } @@ -1381,14 +1386,18 @@ static int info_idq_free(struct dquot *dquot, qsize_t inodes) static int info_bdq_free(struct dquot *dquot, qsize_t space) { + qsize_t tspace; + + tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace; + if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || - dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit) + tspace <= dquot->dq_dqb.dqb_bsoftlimit) return QUOTA_NL_NOWARN; - if (dquot->dq_dqb.dqb_curspace - space <= dquot->dq_dqb.dqb_bsoftlimit) + if (tspace - space <= dquot->dq_dqb.dqb_bsoftlimit) return QUOTA_NL_BSOFTBELOW; - if (dquot->dq_dqb.dqb_curspace >= dquot->dq_dqb.dqb_bhardlimit && - dquot->dq_dqb.dqb_curspace - space < dquot->dq_dqb.dqb_bhardlimit) + if (tspace >= dquot->dq_dqb.dqb_bhardlimit && + tspace - space < dquot->dq_dqb.dqb_bhardlimit) return QUOTA_NL_BHARDBELOW; return QUOTA_NL_NOWARN; } @@ -2681,7 +2690,7 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di) if (check_blim) { if (!dm->dqb_bsoftlimit || - dm->dqb_curspace < dm->dqb_bsoftlimit) { + dm->dqb_curspace + dm->dqb_rsvspace < dm->dqb_bsoftlimit) { dm->dqb_btime = 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); } else if (!(di->d_fieldmask & QC_SPC_TIMER)) diff --git a/fs/read_write.c b/fs/read_write.c index 0cc7033aa413..61b58c7b6531 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -33,7 +33,7 @@ const struct file_operations generic_ro_fops = { EXPORT_SYMBOL(generic_ro_fops); -static inline int unsigned_offsets(struct file *file) +static inline bool unsigned_offsets(struct file *file) { return file->f_mode & FMODE_UNSIGNED_OFFSET; } @@ -633,7 +633,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) EXPORT_SYMBOL(iov_shorten); static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, - loff_t *ppos, int type, int flags) + loff_t *ppos, int type, rwf_t flags) { struct kiocb kiocb; ssize_t ret; @@ -655,7 +655,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, /* Do it by hand, with file-ops */ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, - loff_t *ppos, int type, int flags) + loff_t *ppos, int type, rwf_t flags) { ssize_t ret = 0; @@ -871,7 +871,7 @@ out: #endif static ssize_t do_iter_read(struct file *file, struct iov_iter *iter, - loff_t *pos, int flags) + loff_t *pos, rwf_t flags) { size_t tot_len; ssize_t ret = 0; @@ -899,7 +899,7 @@ out: } ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, - int flags) + rwf_t flags) { if (!file->f_op->read_iter) return -EINVAL; @@ -908,7 +908,7 @@ ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, EXPORT_SYMBOL(vfs_iter_read); static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, - loff_t *pos, int flags) + loff_t *pos, rwf_t flags) { size_t tot_len; ssize_t ret = 0; @@ -935,7 +935,7 @@ static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, } ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, - int flags) + rwf_t flags) { if (!file->f_op->write_iter) return -EINVAL; @@ -944,7 +944,7 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, EXPORT_SYMBOL(vfs_iter_write); ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, - unsigned long vlen, loff_t *pos, int flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -962,7 +962,7 @@ ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, EXPORT_SYMBOL(vfs_readv); ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, - unsigned long vlen, loff_t *pos, int flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -981,7 +981,7 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, EXPORT_SYMBOL(vfs_writev); static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, - unsigned long vlen, int flags) + unsigned long vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; @@ -1001,7 +1001,7 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, } static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, - unsigned long vlen, int flags) + unsigned long vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; @@ -1027,7 +1027,7 @@ static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) } static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, - unsigned long vlen, loff_t pos, int flags) + unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret = -EBADF; @@ -1050,7 +1050,7 @@ static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, } static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec, - unsigned long vlen, loff_t pos, int flags) + unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret = -EBADF; @@ -1094,7 +1094,7 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, - int, flags) + rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); @@ -1114,7 +1114,7 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, - int, flags) + rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); @@ -1127,7 +1127,7 @@ SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, #ifdef CONFIG_COMPAT static size_t compat_readv(struct file *file, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos, int flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -1147,7 +1147,7 @@ static size_t compat_readv(struct file *file, static size_t do_compat_readv(compat_ulong_t fd, const struct compat_iovec __user *vec, - compat_ulong_t vlen, int flags) + compat_ulong_t vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret; @@ -1173,7 +1173,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, static long do_compat_preadv64(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos, int flags) + unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret; @@ -1211,7 +1211,7 @@ COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen, loff_t, pos, int, flags) + unsigned long, vlen, loff_t, pos, rwf_t, flags) { return do_compat_preadv64(fd, vec, vlen, pos, flags); } @@ -1220,7 +1220,7 @@ COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, const struct compat_iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, - int, flags) + rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; @@ -1232,7 +1232,7 @@ COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, static size_t compat_writev(struct file *file, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos, int flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -1254,7 +1254,7 @@ static size_t compat_writev(struct file *file, static size_t do_compat_writev(compat_ulong_t fd, const struct compat_iovec __user* vec, - compat_ulong_t vlen, int flags) + compat_ulong_t vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret; @@ -1279,7 +1279,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, static long do_compat_pwritev64(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos, int flags) + unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret; @@ -1317,7 +1317,7 @@ COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen, loff_t, pos, int, flags) + unsigned long, vlen, loff_t, pos, rwf_t, flags) { return do_compat_pwritev64(fd, vec, vlen, pos, flags); } @@ -1325,7 +1325,7 @@ COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, const struct compat_iovec __user *,vec, - compat_ulong_t, vlen, u32, pos_low, u32, pos_high, int, flags) + compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 3d2256a425ee..54415f0e3d18 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -23,7 +23,8 @@ reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) struct reiserfs_transaction_handle th; size_t jcreate_blocks; int size = acl ? posix_acl_xattr_size(acl->a_count) : 0; - + int update_mode = 0; + umode_t mode = inode->i_mode; /* * Pessimism: We can't assume that anything from the xattr root up @@ -37,7 +38,16 @@ reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) error = journal_begin(&th, inode->i_sb, jcreate_blocks); reiserfs_write_unlock(inode->i_sb); if (error == 0) { + if (type == ACL_TYPE_ACCESS && acl) { + error = posix_acl_update_mode(inode, &mode, &acl); + if (error) + goto unlock; + update_mode = 1; + } error = __reiserfs_set_acl(&th, inode, type, acl); + if (!error && update_mode) + inode->i_mode = mode; +unlock: reiserfs_write_lock(inode->i_sb); error2 = journal_end(&th); reiserfs_write_unlock(inode->i_sb); @@ -241,11 +251,6 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; - if (acl) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (error) - return error; - } break; case ACL_TYPE_DEFAULT: name = XATTR_NAME_POSIX_ACL_DEFAULT; diff --git a/fs/select.c b/fs/select.c index 9d5f15ed87fe..c6362e38ae92 100644 --- a/fs/select.c +++ b/fs/select.c @@ -1164,11 +1164,7 @@ int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, if (ufdset) { return compat_get_bitmap(fdset, ufdset, nr); } else { - /* Tricky, must clear full unsigned long in the - * kernel fdset at the end, ALIGN makes sure that - * actually happens. - */ - memset(fdset, 0, ALIGN(nr, BITS_PER_LONG)); + zero_fd_set(nr, fdset); return 0; } } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index cadcd12a3d35..886085b47c75 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -109,27 +109,24 @@ static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, goto out; WRITE_ONCE(uwq->waken, true); /* - * The implicit smp_mb__before_spinlock in try_to_wake_up() - * renders uwq->waken visible to other CPUs before the task is - * waken. + * The Program-Order guarantees provided by the scheduler + * ensure uwq->waken is visible before the task is woken. */ ret = wake_up_state(wq->private, mode); - if (ret) + if (ret) { /* * Wake only once, autoremove behavior. * - * After the effect of list_del_init is visible to the - * other CPUs, the waitqueue may disappear from under - * us, see the !list_empty_careful() in - * handle_userfault(). try_to_wake_up() has an - * implicit smp_mb__before_spinlock, and the - * wq->private is read before calling the extern - * function "wake_up_state" (which in turns calls - * try_to_wake_up). While the spin_lock;spin_unlock; - * wouldn't be enough, the smp_mb__before_spinlock is - * enough to avoid an explicit smp_mb() here. + * After the effect of list_del_init is visible to the other + * CPUs, the waitqueue may disappear from under us, see the + * !list_empty_careful() in handle_userfault(). + * + * try_to_wake_up() has an implicit smp_mb(), and the + * wq->private is read before calling the extern function + * "wake_up_state" (which in turns calls try_to_wake_up). */ list_del_init(&wq->entry); + } out: return ret; } @@ -854,6 +851,9 @@ wakeup: __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range); spin_unlock(&ctx->fault_pending_wqh.lock); + /* Flush pending events that may still wait on event_wqh */ + wake_up_all(&ctx->event_wqh); + wake_up_poll(&ctx->fd_wqh, POLLHUP); userfaultfd_ctx_put(ctx); return 0; @@ -1597,7 +1597,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, uffdio_copy.len); mmput(ctx->mm); } else { - return -ENOSPC; + return -ESRCH; } if (unlikely(put_user(ret, &user_uffdio_copy->copy))) return -EFAULT; @@ -1643,6 +1643,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, uffdio_zeropage.range.len); mmput(ctx->mm); + } else { + return -ESRCH; } if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) return -EFAULT; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index de7b9bd30bec..6249c92671de 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -328,20 +328,19 @@ xfs_attr_set( */ xfs_defer_init(args.dfops, args.firstblock); error = xfs_attr_shortform_to_leaf(&args); - if (!error) - error = xfs_defer_finish(&args.trans, args.dfops, dp); - if (error) { - args.trans = NULL; - xfs_defer_cancel(&dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args.dfops, dp); + error = xfs_defer_finish(&args.trans, args.dfops); + if (error) + goto out_defer_cancel; /* * Commit the leaf transformation. We'll need another (linked) * transaction to add the new attribute to the leaf. */ - error = xfs_trans_roll(&args.trans, dp); + error = xfs_trans_roll_inode(&args.trans, dp); if (error) goto out; @@ -373,6 +372,9 @@ xfs_attr_set( return error; +out_defer_cancel: + xfs_defer_cancel(&dfops); + args.trans = NULL; out: if (args.trans) xfs_trans_cancel(args.trans); @@ -593,19 +595,18 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_node(args); - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - return error; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; /* * Commit the current trans (including the inode) and start * a new one. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) return error; @@ -620,7 +621,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Commit the transaction that added the attr name so that * later routines can manage their own transactions. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) return error; @@ -684,20 +685,18 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) - error = xfs_defer_finish(&args->trans, - args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - return error; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; } /* * Commit the remove and start the next trans in series. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); } else if (args->rmtblkno > 0) { /* @@ -706,6 +705,10 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) error = xfs_attr3_leaf_clearflag(args); } return error; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + return error; } /* @@ -747,15 +750,18 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - return error; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; } return 0; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + return error; } /* @@ -872,20 +878,18 @@ restart: state = NULL; xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_node(args); - if (!error) - error = xfs_defer_finish(&args->trans, - args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; /* * Commit the node conversion and start the next * trans in the chain. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) goto out; @@ -900,13 +904,12 @@ restart: */ xfs_defer_init(args->dfops, args->firstblock); error = xfs_da3_split(state); - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; } else { /* * Addition succeeded, update Btree hashvals. @@ -925,7 +928,7 @@ restart: * Commit the leaf addition or btree split and start the next * trans in the chain. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) goto out; @@ -999,20 +1002,18 @@ restart: if (retval && (state->path.active > 1)) { xfs_defer_init(args->dfops, args->firstblock); error = xfs_da3_join(state); - if (!error) - error = xfs_defer_finish(&args->trans, - args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; } /* * Commit and start the next trans in the chain. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) goto out; @@ -1032,6 +1033,10 @@ out: if (error) return error; return retval; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + goto out; } /* @@ -1122,17 +1127,16 @@ xfs_attr_node_removename(xfs_da_args_t *args) if (retval && (state->path.active > 1)) { xfs_defer_init(args->dfops, args->firstblock); error = xfs_da3_join(state); - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; /* * Commit the Btree join operation and start a new trans. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) goto out; } @@ -1156,14 +1160,12 @@ xfs_attr_node_removename(xfs_da_args_t *args) xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) - error = xfs_defer_finish(&args->trans, - args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; } else xfs_trans_brelse(args->trans, bp); } @@ -1172,6 +1174,10 @@ xfs_attr_node_removename(xfs_da_args_t *args) out: xfs_da_state_free(state); return error; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + goto out; } /* diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index c6c15e5717e4..5c16db86b38f 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -2608,7 +2608,7 @@ xfs_attr3_leaf_clearflag( /* * Commit the flag value change and start the next trans in series. */ - return xfs_trans_roll(&args->trans, args->dp); + return xfs_trans_roll_inode(&args->trans, args->dp); } /* @@ -2659,7 +2659,7 @@ xfs_attr3_leaf_setflag( /* * Commit the flag value change and start the next trans in series. */ - return xfs_trans_roll(&args->trans, args->dp); + return xfs_trans_roll_inode(&args->trans, args->dp); } /* @@ -2777,7 +2777,7 @@ xfs_attr3_leaf_flipflags( /* * Commit the flag value change and start the next trans in series. */ - error = xfs_trans_roll(&args->trans, args->dp); + error = xfs_trans_roll_inode(&args->trans, args->dp); return error; } diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 5236d8e45146..d56caf037ca0 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -467,13 +467,12 @@ xfs_attr_rmtval_set( error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock, args->total, &map, &nmap, args->dfops); - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - return error; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && @@ -484,7 +483,7 @@ xfs_attr_rmtval_set( /* * Start the next trans in the chain. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) return error; } @@ -539,6 +538,10 @@ xfs_attr_rmtval_set( } ASSERT(valuelen == 0); return 0; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + return error; } /* @@ -609,21 +612,23 @@ xfs_attr_rmtval_remove( error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK, 1, args->firstblock, args->dfops, &done); - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, - args->dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - return error; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, args->dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; /* * Close out trans and start the next one in the chain. */ - error = xfs_trans_roll(&args->trans, args->dp); + error = xfs_trans_roll_inode(&args->trans, args->dp); if (error) return error; } return 0; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + return error; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 0a9880777c9c..459f4b4f08fe 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -579,7 +579,7 @@ xfs_bmap_validate_ret( #else #define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) -#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) +#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) do { } while (0) #endif /* DEBUG */ /* @@ -880,7 +880,7 @@ xfs_bmap_local_to_extents( xfs_ifork_t *ifp; /* inode fork pointer */ xfs_alloc_arg_t args; /* allocation arguments */ xfs_buf_t *bp; /* buffer for extent block */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + struct xfs_bmbt_irec rec; /* * We don't want to deal with the case of keeping inode data inline yet. @@ -943,9 +943,12 @@ xfs_bmap_local_to_extents( xfs_bmap_local_to_extents_empty(ip, whichfork); flags |= XFS_ILOG_CORE; - xfs_iext_add(ifp, 0, 1); - ep = xfs_iext_get_ext(ifp, 0); - xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); + rec.br_startoff = 0; + rec.br_startblock = args.fsbno; + rec.br_blockcount = 1; + rec.br_state = XFS_EXT_NORM; + xfs_iext_insert(ip, 0, 1, &rec, 0); + trace_xfs_bmap_post_update(ip, 0, whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0, _THIS_IP_); @@ -1196,7 +1199,7 @@ xfs_bmap_add_attrfork( xfs_log_sb(tp); } - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto bmap_cancel; error = xfs_trans_commit(tp); @@ -1356,7 +1359,6 @@ xfs_bmap_first_unused( xfs_fileoff_t lastaddr; /* last block number seen */ xfs_fileoff_t lowest; /* lowest useful block */ xfs_fileoff_t max; /* starting useful block */ - xfs_fileoff_t off; /* offset for this block */ xfs_extnum_t nextents; /* number of extent entries */ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE || @@ -1373,16 +1375,19 @@ xfs_bmap_first_unused( lowest = *first_unused; nextents = xfs_iext_count(ifp); for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); - off = xfs_bmbt_get_startoff(ep); + struct xfs_bmbt_irec got; + + xfs_iext_get_extent(ifp, idx, &got); + /* * See if the hole before this extent will work. */ - if (off >= lowest + len && off - max >= len) { + if (got.br_startoff >= lowest + len && + got.br_startoff - max >= len) { *first_unused = max; return 0; } - lastaddr = off + xfs_bmbt_get_blockcount(ep); + lastaddr = got.br_startoff + got.br_blockcount; max = XFS_FILEOFF_MAX(lastaddr, lowest); } *first_unused = max; @@ -4918,7 +4923,7 @@ xfs_bmap_del_extent_delay( da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, got->br_blockcount), da_old); got->br_startblock = nullstartblock((int)da_new); - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); break; case BMAP_RIGHT_CONTIG: @@ -4930,7 +4935,7 @@ xfs_bmap_del_extent_delay( da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, got->br_blockcount), da_old); got->br_startblock = nullstartblock((int)da_new); - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); break; case 0: @@ -4956,7 +4961,7 @@ xfs_bmap_del_extent_delay( del->br_blockcount); got->br_startblock = nullstartblock((int)got_indlen); - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, 0, _THIS_IP_); new.br_startoff = del_endoff; @@ -5026,7 +5031,7 @@ xfs_bmap_del_extent_cow( got->br_startoff = del_endoff; got->br_blockcount -= del->br_blockcount; got->br_startblock = del->br_startblock + del->br_blockcount; - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); break; case BMAP_RIGHT_CONTIG: @@ -5035,7 +5040,7 @@ xfs_bmap_del_extent_cow( */ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); got->br_blockcount -= del->br_blockcount; - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); break; case 0: @@ -5044,7 +5049,7 @@ xfs_bmap_del_extent_cow( */ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); got->br_blockcount = del->br_startoff - got->br_startoff; - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); new.br_startoff = del_endoff; @@ -5435,6 +5440,7 @@ __xfs_bunmapi( xfs_fsblock_t sum; xfs_filblks_t len = *rlen; /* length to unmap in file */ xfs_fileoff_t max_len; + xfs_agnumber_t prev_agno = NULLAGNUMBER, agno; trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); @@ -5534,6 +5540,17 @@ __xfs_bunmapi( */ del = got; wasdel = isnullstartblock(del.br_startblock); + + /* + * Make sure we don't touch multiple AGF headers out of order + * in a single transaction, as that could cause AB-BA deadlocks. + */ + if (!wasdel) { + agno = XFS_FSB_TO_AGNO(mp, del.br_startblock); + if (prev_agno != NULLAGNUMBER && prev_agno > agno) + break; + prev_agno = agno; + } if (got.br_startoff < start) { del.br_startoff = start; del.br_blockcount -= start - got.br_startoff; @@ -5864,32 +5881,26 @@ xfs_bmse_merge( int whichfork, xfs_fileoff_t shift, /* shift fsb */ int current_ext, /* idx of gotp */ - struct xfs_bmbt_rec_host *gotp, /* extent to shift */ - struct xfs_bmbt_rec_host *leftp, /* preceding extent */ + struct xfs_bmbt_irec *got, /* extent to shift */ + struct xfs_bmbt_irec *left, /* preceding extent */ struct xfs_btree_cur *cur, - int *logflags) /* output */ + int *logflags, /* output */ + struct xfs_defer_ops *dfops) { - struct xfs_bmbt_irec got; - struct xfs_bmbt_irec left; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_bmbt_irec new; xfs_filblks_t blockcount; int error, i; struct xfs_mount *mp = ip->i_mount; - xfs_bmbt_get_all(gotp, &got); - xfs_bmbt_get_all(leftp, &left); - blockcount = left.br_blockcount + got.br_blockcount; + blockcount = left->br_blockcount + got->br_blockcount; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(xfs_bmse_can_merge(&left, &got, shift)); + ASSERT(xfs_bmse_can_merge(left, got, shift)); - /* - * Merge the in-core extents. Note that the host record pointers and - * current_ext index are invalid once the extent has been removed via - * xfs_iext_remove(). - */ - xfs_bmbt_set_blockcount(leftp, blockcount); - xfs_iext_remove(ip, current_ext, 1, 0); + new = *left; + new.br_blockcount = blockcount; /* * Update the on-disk extent count, the btree if necessary and log the @@ -5900,12 +5911,12 @@ xfs_bmse_merge( *logflags |= XFS_ILOG_CORE; if (!cur) { *logflags |= XFS_ILOG_DEXT; - return 0; + goto done; } /* lookup and remove the extent to merge */ - error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock, - got.br_blockcount, &i); + error = xfs_bmbt_lookup_eq(cur, got->br_startoff, got->br_startblock, + got->br_blockcount, &i); if (error) return error; XFS_WANT_CORRUPTED_RETURN(mp, i == 1); @@ -5916,16 +5927,28 @@ xfs_bmse_merge( XFS_WANT_CORRUPTED_RETURN(mp, i == 1); /* lookup and update size of the previous extent */ - error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock, - left.br_blockcount, &i); + error = xfs_bmbt_lookup_eq(cur, left->br_startoff, left->br_startblock, + left->br_blockcount, &i); if (error) return error; XFS_WANT_CORRUPTED_RETURN(mp, i == 1); - left.br_blockcount = blockcount; + error = xfs_bmbt_update(cur, new.br_startoff, new.br_startblock, + new.br_blockcount, new.br_state); + if (error) + return error; - return xfs_bmbt_update(cur, left.br_startoff, left.br_startblock, - left.br_blockcount, left.br_state); +done: + xfs_iext_update_extent(ifp, current_ext - 1, &new); + xfs_iext_remove(ip, current_ext, 1, 0); + + /* update reverse mapping. rmap functions merge the rmaps for us */ + error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, got); + if (error) + return error; + memcpy(&new, got, sizeof(new)); + new.br_startoff = left->br_startoff + left->br_blockcount; + return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &new); } /* @@ -5937,7 +5960,7 @@ xfs_bmse_shift_one( int whichfork, xfs_fileoff_t offset_shift_fsb, int *current_ext, - struct xfs_bmbt_rec_host *gotp, + struct xfs_bmbt_irec *got, struct xfs_btree_cur *cur, int *logflags, enum shift_direction direction, @@ -5946,9 +5969,7 @@ xfs_bmse_shift_one( struct xfs_ifork *ifp; struct xfs_mount *mp; xfs_fileoff_t startoff; - struct xfs_bmbt_rec_host *adj_irecp; - struct xfs_bmbt_irec got; - struct xfs_bmbt_irec adj_irec; + struct xfs_bmbt_irec adj_irec, new; int error; int i; int total_extents; @@ -5957,13 +5978,11 @@ xfs_bmse_shift_one( ifp = XFS_IFORK_PTR(ip, whichfork); total_extents = xfs_iext_count(ifp); - xfs_bmbt_get_all(gotp, &got); - /* delalloc extents should be prevented by caller */ - XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock)); + XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got->br_startblock)); if (direction == SHIFT_LEFT) { - startoff = got.br_startoff - offset_shift_fsb; + startoff = got->br_startoff - offset_shift_fsb; /* * Check for merge if we've got an extent to the left, @@ -5971,46 +5990,39 @@ xfs_bmse_shift_one( * of the file for the shift. */ if (!*current_ext) { - if (got.br_startoff < offset_shift_fsb) + if (got->br_startoff < offset_shift_fsb) return -EINVAL; goto update_current_ext; } + /* - * grab the left extent and check for a large - * enough hole. + * grab the left extent and check for a large enough hole. */ - adj_irecp = xfs_iext_get_ext(ifp, *current_ext - 1); - xfs_bmbt_get_all(adj_irecp, &adj_irec); - - if (startoff < - adj_irec.br_startoff + adj_irec.br_blockcount) + xfs_iext_get_extent(ifp, *current_ext - 1, &adj_irec); + if (startoff < adj_irec.br_startoff + adj_irec.br_blockcount) return -EINVAL; /* check whether to merge the extent or shift it down */ - if (xfs_bmse_can_merge(&adj_irec, &got, - offset_shift_fsb)) { - error = xfs_bmse_merge(ip, whichfork, offset_shift_fsb, - *current_ext, gotp, adj_irecp, - cur, logflags); - if (error) - return error; - adj_irec = got; - goto update_rmap; + if (xfs_bmse_can_merge(&adj_irec, got, offset_shift_fsb)) { + return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, + *current_ext, got, &adj_irec, + cur, logflags, dfops); } } else { - startoff = got.br_startoff + offset_shift_fsb; + startoff = got->br_startoff + offset_shift_fsb; /* nothing to move if this is the last extent */ if (*current_ext >= (total_extents - 1)) goto update_current_ext; + /* * If this is not the last extent in the file, make sure there * is enough room between current extent and next extent for * accommodating the shift. */ - adj_irecp = xfs_iext_get_ext(ifp, *current_ext + 1); - xfs_bmbt_get_all(adj_irecp, &adj_irec); - if (startoff + got.br_blockcount > adj_irec.br_startoff) + xfs_iext_get_extent(ifp, *current_ext + 1, &adj_irec); + if (startoff + got->br_blockcount > adj_irec.br_startoff) return -EINVAL; + /* * Unlike a left shift (which involves a hole punch), * a right shift does not modify extent neighbors @@ -6018,45 +6030,48 @@ xfs_bmse_shift_one( * in this scenario. Check anyways and warn if we * encounter two extents that could be one. */ - if (xfs_bmse_can_merge(&got, &adj_irec, offset_shift_fsb)) + if (xfs_bmse_can_merge(got, &adj_irec, offset_shift_fsb)) WARN_ON_ONCE(1); } + /* * Increment the extent index for the next iteration, update the start * offset of the in-core extent and update the btree if applicable. */ update_current_ext: - if (direction == SHIFT_LEFT) - (*current_ext)++; - else - (*current_ext)--; - xfs_bmbt_set_startoff(gotp, startoff); *logflags |= XFS_ILOG_CORE; - adj_irec = got; - if (!cur) { + + new = *got; + new.br_startoff = startoff; + + if (cur) { + error = xfs_bmbt_lookup_eq(cur, got->br_startoff, + got->br_startblock, got->br_blockcount, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + + error = xfs_bmbt_update(cur, new.br_startoff, + new.br_startblock, new.br_blockcount, + new.br_state); + if (error) + return error; + } else { *logflags |= XFS_ILOG_DEXT; - goto update_rmap; } - error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock, - got.br_blockcount, &i); - if (error) - return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + xfs_iext_update_extent(ifp, *current_ext, &new); - got.br_startoff = startoff; - error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, - got.br_blockcount, got.br_state); - if (error) - return error; + if (direction == SHIFT_LEFT) + (*current_ext)++; + else + (*current_ext)--; -update_rmap: /* update reverse mapping */ - error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, &adj_irec); + error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, got); if (error) return error; - adj_irec.br_startoff = startoff; - return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &adj_irec); + return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &new); } /* @@ -6083,7 +6098,6 @@ xfs_bmap_shift_extents( int num_exts) { struct xfs_btree_cur *cur = NULL; - struct xfs_bmbt_rec_host *gotp; struct xfs_bmbt_irec got; struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; @@ -6110,7 +6124,6 @@ xfs_bmap_shift_extents( ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); - ASSERT(*next_fsb != NULLFSBLOCK || direction == SHIFT_RIGHT); ifp = XFS_IFORK_PTR(ip, whichfork); if (!(ifp->if_flags & XFS_IFEXTENTS)) { @@ -6142,10 +6155,26 @@ xfs_bmap_shift_extents( * In case of first right shift, we need to initialize next_fsb */ if (*next_fsb == NULLFSBLOCK) { - gotp = xfs_iext_get_ext(ifp, total_extents - 1); - xfs_bmbt_get_all(gotp, &got); + ASSERT(direction == SHIFT_RIGHT); + + current_ext = total_extents - 1; + xfs_iext_get_extent(ifp, current_ext, &got); + if (stop_fsb > got.br_startoff) { + *done = 1; + goto del_cursor; + } *next_fsb = got.br_startoff; - if (stop_fsb > *next_fsb) { + } else { + /* + * Look up the extent index for the fsb where we start shifting. We can + * henceforth iterate with current_ext as extent list changes are locked + * out via ilock. + * + * If next_fsb lies in a hole beyond which there are no extents we are + * done. + */ + if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, ¤t_ext, + &got)) { *done = 1; goto del_cursor; } @@ -6153,37 +6182,26 @@ xfs_bmap_shift_extents( /* Lookup the extent index at which we have to stop */ if (direction == SHIFT_RIGHT) { - gotp = xfs_iext_bno_to_ext(ifp, stop_fsb, &stop_extent); + struct xfs_bmbt_irec s; + + xfs_iext_lookup_extent(ip, ifp, stop_fsb, &stop_extent, &s); /* Make stop_extent exclusive of shift range */ stop_extent--; - } else + if (current_ext <= stop_extent) { + error = -EIO; + goto del_cursor; + } + } else { stop_extent = total_extents; - - /* - * Look up the extent index for the fsb where we start shifting. We can - * henceforth iterate with current_ext as extent list changes are locked - * out via ilock. - * - * gotp can be null in 2 cases: 1) if there are no extents or 2) - * *next_fsb lies in a hole beyond which there are no extents. Either - * way, we are done. - */ - gotp = xfs_iext_bno_to_ext(ifp, *next_fsb, ¤t_ext); - if (!gotp) { - *done = 1; - goto del_cursor; - } - - /* some sanity checking before we finally start shifting extents */ - if ((direction == SHIFT_LEFT && current_ext >= stop_extent) || - (direction == SHIFT_RIGHT && current_ext <= stop_extent)) { - error = -EIO; - goto del_cursor; + if (current_ext >= stop_extent) { + error = -EIO; + goto del_cursor; + } } while (nexts++ < num_exts) { error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb, - ¤t_ext, gotp, cur, &logflags, + ¤t_ext, &got, cur, &logflags, direction, dfops); if (error) goto del_cursor; @@ -6201,13 +6219,11 @@ xfs_bmap_shift_extents( *next_fsb = NULLFSBLOCK; break; } - gotp = xfs_iext_get_ext(ifp, current_ext); + xfs_iext_get_extent(ifp, current_ext, &got); } - if (!*done) { - xfs_bmbt_get_all(gotp, &got); + if (!*done) *next_fsb = got.br_startoff; - } del_cursor: if (cur) @@ -6236,7 +6252,6 @@ xfs_bmap_split_extent_at( { int whichfork = XFS_DATA_FORK; struct xfs_btree_cur *cur = NULL; - struct xfs_bmbt_rec_host *gotp; struct xfs_bmbt_irec got; struct xfs_bmbt_irec new; /* split extent */ struct xfs_mount *mp = ip->i_mount; @@ -6268,21 +6283,10 @@ xfs_bmap_split_extent_at( } /* - * gotp can be null in 2 cases: 1) if there are no extents - * or 2) split_fsb lies in a hole beyond which there are - * no extents. Either way, we are done. + * If there are not extents, or split_fsb lies in a hole we are done. */ - gotp = xfs_iext_bno_to_ext(ifp, split_fsb, ¤t_ext); - if (!gotp) - return 0; - - xfs_bmbt_get_all(gotp, &got); - - /* - * Check split_fsb lies in a hole or the start boundary offset - * of the extent. - */ - if (got.br_startoff >= split_fsb) + if (!xfs_iext_lookup_extent(ip, ifp, split_fsb, ¤t_ext, &got) || + got.br_startoff >= split_fsb) return 0; gotblkcnt = split_fsb - got.br_startoff; @@ -6305,8 +6309,8 @@ xfs_bmap_split_extent_at( XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor); } - xfs_bmbt_set_blockcount(gotp, gotblkcnt); got.br_blockcount = gotblkcnt; + xfs_iext_update_extent(ifp, current_ext, &got); logflags = XFS_ILOG_CORE; if (cur) { @@ -6390,7 +6394,7 @@ xfs_bmap_split_extent( if (error) goto out; - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out; @@ -6440,7 +6444,7 @@ __xfs_bmap_add( bi->bi_whichfork = whichfork; bi->bi_bmap = *bmap; - error = xfs_defer_join(dfops, bi->bi_owner); + error = xfs_defer_ijoin(dfops, bi->bi_owner); if (error) { kmem_free(bi); return error; @@ -6499,6 +6503,15 @@ xfs_bmap_finish_one( xfs_fsblock_t firstfsb; int error = 0; + /* + * firstfsb is tied to the transaction lifetime and is used to + * ensure correct AG locking order and schedule work item + * continuations. XFS_BUI_MAX_FAST_EXTENTS (== 1) restricts us + * to only making one bmap call per transaction, so it should + * be safe to have it as a local variable here. + */ + firstfsb = NULLFSBLOCK; + trace_xfs_bmap_deferred(tp->t_mountp, XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 85de22513014..a6331ffa51e3 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -858,6 +858,7 @@ xfs_bmbt_change_owner( cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); if (!cur) return -ENOMEM; + cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER; error = xfs_btree_change_owner(cur, new_owner, buffer_list); xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 4da85fff69ad..5bfb88261c7e 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -728,7 +728,8 @@ xfs_btree_firstrec( * Get the block pointer for this level. */ block = xfs_btree_get_block(cur, level, &bp); - xfs_btree_check_block(cur, block, level, bp); + if (xfs_btree_check_block(cur, block, level, bp)) + return 0; /* * It's empty, there is no such record. */ @@ -757,7 +758,8 @@ xfs_btree_lastrec( * Get the block pointer for this level. */ block = xfs_btree_get_block(cur, level, &bp); - xfs_btree_check_block(cur, block, level, bp); + if (xfs_btree_check_block(cur, block, level, bp)) + return 0; /* * It's empty, there is no such record. */ @@ -1789,6 +1791,7 @@ xfs_btree_lookup_get_block( /* Check the inode owner since the verifiers don't. */ if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && + !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) && (cur->bc_flags & XFS_BTREE_LONG_PTRS) && be64_to_cpu((*blkp)->bb_u.l.bb_owner) != cur->bc_private.b.ip->i_ino) @@ -4449,10 +4452,15 @@ xfs_btree_block_change_owner( /* modify the owner */ block = xfs_btree_get_block(cur, level, &bp); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner)) + return 0; block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner); - else + } else { + if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner)) + return 0; block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner); + } /* * If the block is a root block hosted in an inode, we might not have a @@ -4461,16 +4469,19 @@ xfs_btree_block_change_owner( * block is formatted into the on-disk inode fork. We still change it, * though, so everything is consistent in memory. */ - if (bp) { - if (cur->bc_tp) { - xfs_trans_ordered_buf(cur->bc_tp, bp); + if (!bp) { + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(level == cur->bc_nlevels - 1); + return 0; + } + + if (cur->bc_tp) { + if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) { xfs_btree_log_block(cur, bp, XFS_BB_OWNER); - } else { - xfs_buf_delwri_queue(bp, bbcoi->buffer_list); + return -EAGAIN; } } else { - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); - ASSERT(level == cur->bc_nlevels - 1); + xfs_buf_delwri_queue(bp, bbcoi->buffer_list); } return 0; diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 9c95e965cfe5..f2a88c3b1159 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -233,7 +233,8 @@ typedef struct xfs_btree_cur short forksize; /* fork's inode space */ char whichfork; /* data or attr fork */ char flags; /* flags */ -#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */ +#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */ +#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */ } b; } bc_private; /* per-btree type data */ } xfs_btree_cur_t; diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 5c2929f94bd3..072ebfe1d6ae 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -240,23 +240,19 @@ xfs_defer_trans_abort( STATIC int xfs_defer_trans_roll( struct xfs_trans **tp, - struct xfs_defer_ops *dop, - struct xfs_inode *ip) + struct xfs_defer_ops *dop) { int i; int error; - /* Log all the joined inodes except the one we passed in. */ - for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) { - if (dop->dop_inodes[i] == ip) - continue; + /* Log all the joined inodes. */ + for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) xfs_trans_log_inode(*tp, dop->dop_inodes[i], XFS_ILOG_CORE); - } trace_xfs_defer_trans_roll((*tp)->t_mountp, dop); /* Roll the transaction. */ - error = xfs_trans_roll(tp, ip); + error = xfs_trans_roll(tp); if (error) { trace_xfs_defer_trans_roll_error((*tp)->t_mountp, dop, error); xfs_defer_trans_abort(*tp, dop, error); @@ -264,12 +260,9 @@ xfs_defer_trans_roll( } dop->dop_committed = true; - /* Rejoin the joined inodes except the one we passed in. */ - for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) { - if (dop->dop_inodes[i] == ip) - continue; + /* Rejoin the joined inodes. */ + for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) xfs_trans_ijoin(*tp, dop->dop_inodes[i], 0); - } return error; } @@ -284,11 +277,10 @@ xfs_defer_has_unfinished_work( /* * Add this inode to the deferred op. Each joined inode is relogged - * each time we roll the transaction, in addition to any inode passed - * to xfs_defer_finish(). + * each time we roll the transaction. */ int -xfs_defer_join( +xfs_defer_ijoin( struct xfs_defer_ops *dop, struct xfs_inode *ip) { @@ -317,8 +309,7 @@ xfs_defer_join( int xfs_defer_finish( struct xfs_trans **tp, - struct xfs_defer_ops *dop, - struct xfs_inode *ip) + struct xfs_defer_ops *dop) { struct xfs_defer_pending *dfp; struct list_head *li; @@ -337,7 +328,7 @@ xfs_defer_finish( xfs_defer_intake_work(*tp, dop); /* Roll the transaction. */ - error = xfs_defer_trans_roll(tp, dop, ip); + error = xfs_defer_trans_roll(tp, dop); if (error) goto out; diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index f6e93ef0bffe..d4f046dd44bd 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -72,12 +72,11 @@ struct xfs_defer_ops { void xfs_defer_add(struct xfs_defer_ops *dop, enum xfs_defer_ops_type type, struct list_head *h); -int xfs_defer_finish(struct xfs_trans **tp, struct xfs_defer_ops *dop, - struct xfs_inode *ip); +int xfs_defer_finish(struct xfs_trans **tp, struct xfs_defer_ops *dop); void xfs_defer_cancel(struct xfs_defer_ops *dop); void xfs_defer_init(struct xfs_defer_ops *dop, xfs_fsblock_t *fbp); bool xfs_defer_has_unfinished_work(struct xfs_defer_ops *dop); -int xfs_defer_join(struct xfs_defer_ops *dop, struct xfs_inode *ip); +int xfs_defer_ijoin(struct xfs_defer_ops *dop, struct xfs_inode *ip); /* Description of a deferred type. */ struct xfs_defer_op_type { diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index d478065b9544..8727a43115ef 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -136,6 +136,8 @@ __xfs_dir3_data_check( */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { XFS_WANT_CORRUPTED_RETURN(mp, lastfree == 0); + XFS_WANT_CORRUPTED_RETURN(mp, endp >= + p + be16_to_cpu(dup->length)); XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == (char *)dup - (char *)hdr); @@ -164,6 +166,8 @@ __xfs_dir3_data_check( XFS_WANT_CORRUPTED_RETURN(mp, dep->namelen != 0); XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); + XFS_WANT_CORRUPTED_RETURN(mp, endp >= + p + ops->data_entsize(dep->namelen)); XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(*ops->data_entry_tag_p(dep)) == (char *)dep - (char *)hdr); diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index ffd5a15d1bb6..988bb3f31446 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -378,8 +378,6 @@ xfs_ialloc_inode_init( * transaction and pin the log appropriately. */ xfs_trans_ordered_buf(tp, fbuf); - xfs_trans_log_buf(tp, fbuf, 0, - BBTOB(fbuf->b_length) - 1); } } else { fbuf->b_flags |= XBF_DONE; @@ -1133,6 +1131,7 @@ xfs_dialloc_ag_inobt( int error; int offset; int i, j; + int searchdistance = 10; pag = xfs_perag_get(mp, agno); @@ -1159,7 +1158,6 @@ xfs_dialloc_ag_inobt( if (pagno == agno) { int doneleft; /* done, to the left */ int doneright; /* done, to the right */ - int searchdistance = 10; error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); if (error) @@ -1220,21 +1218,9 @@ xfs_dialloc_ag_inobt( /* * Loop until we find an inode chunk with a free inode. */ - while (!doneleft || !doneright) { + while (--searchdistance > 0 && (!doneleft || !doneright)) { int useleft; /* using left inode chunk this time */ - if (!--searchdistance) { - /* - * Not in range - save last search - * location and allocate a new inode - */ - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - pag->pagl_leftrec = trec.ir_startino; - pag->pagl_rightrec = rec.ir_startino; - pag->pagl_pagino = pagino; - goto newino; - } - /* figure out the closer block if both are valid. */ if (!doneleft && !doneright) { useleft = pagino - @@ -1246,13 +1232,13 @@ xfs_dialloc_ag_inobt( /* free inodes to the left? */ if (useleft && trec.ir_freecount) { - rec = trec; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); cur = tcur; pag->pagl_leftrec = trec.ir_startino; pag->pagl_rightrec = rec.ir_startino; pag->pagl_pagino = pagino; + rec = trec; goto alloc_inode; } @@ -1278,26 +1264,37 @@ xfs_dialloc_ag_inobt( goto error1; } - /* - * We've reached the end of the btree. because - * we are only searching a small chunk of the - * btree each search, there is obviously free - * inodes closer to the parent inode than we - * are now. restart the search again. - */ - pag->pagl_pagino = NULLAGINO; - pag->pagl_leftrec = NULLAGINO; - pag->pagl_rightrec = NULLAGINO; - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); - goto restart_pagno; + if (searchdistance <= 0) { + /* + * Not in range - save last search + * location and allocate a new inode + */ + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; + + } else { + /* + * We've reached the end of the btree. because + * we are only searching a small chunk of the + * btree each search, there is obviously free + * inodes closer to the parent inode than we + * are now. restart the search again. + */ + pag->pagl_pagino = NULLAGINO; + pag->pagl_leftrec = NULLAGINO; + pag->pagl_rightrec = NULLAGINO; + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + goto restart_pagno; + } } /* * In a different AG from the parent. * See if the most recently allocated block has any free. */ -newino: if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), XFS_LOOKUP_EQ, &i); diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 0e80f34fe97c..31840ca24018 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -1499,14 +1499,11 @@ xfs_iext_realloc_indirect( xfs_ifork_t *ifp, /* inode fork pointer */ int new_size) /* new indirection array size */ { - int nlists; /* number of irec's (ex lists) */ - int size; /* current indirection array size */ - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - size = nlists * sizeof(xfs_ext_irec_t); ASSERT(ifp->if_real_bytes); - ASSERT((new_size >= 0) && (new_size != size)); + ASSERT((new_size >= 0) && + (new_size != ((ifp->if_real_bytes / XFS_IEXT_BUFSZ) * + sizeof(xfs_ext_irec_t)))); if (new_size == 0) { xfs_iext_destroy(ifp); } else { @@ -2023,3 +2020,15 @@ xfs_iext_get_extent( xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), gotp); return true; } + +void +xfs_iext_update_extent( + struct xfs_ifork *ifp, + xfs_extnum_t idx, + struct xfs_bmbt_irec *gotp) +{ + ASSERT(idx >= 0); + ASSERT(idx < xfs_iext_count(ifp)); + + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, idx), gotp); +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 7fb8365326d1..11af705219f6 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -187,6 +187,8 @@ bool xfs_iext_lookup_extent(struct xfs_inode *ip, xfs_extnum_t *idxp, struct xfs_bmbt_irec *gotp); bool xfs_iext_get_extent(struct xfs_ifork *ifp, xfs_extnum_t idx, struct xfs_bmbt_irec *gotp); +void xfs_iext_update_extent(struct xfs_ifork *ifp, xfs_extnum_t idx, + struct xfs_bmbt_irec *gotp); extern struct kmem_zone *xfs_ifork_zone; diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 900ea231f9a3..9d5406b4f663 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1638,6 +1638,10 @@ xfs_refcount_recover_cow_leftovers( error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); if (error) goto out_trans; + if (!agbp) { + error = -ENOMEM; + goto out_trans; + } cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); /* Find all the leftover CoW staging extents. */ @@ -1675,7 +1679,7 @@ xfs_refcount_recover_cow_leftovers( xfs_bmap_add_free(mp, &dfops, fsb, rr->rr_rrec.rc_blockcount, NULL); - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_defer; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 6bf120bb1a17..f9efd67f6fa1 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -85,11 +85,11 @@ xfs_find_bdev_for_inode( * associated buffer_heads, paying attention to the start and end offsets that * we need to process on the page. * - * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last - * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or - * the page at all, as we may be racing with memory reclaim and it can free both - * the bufferhead chain and the page as it will see the page as clean and - * unused. + * Note that we open code the action in end_buffer_async_write here so that we + * only have to iterate over the buffers attached to the page once. This is not + * only more efficient, but also ensures that we only calls end_page_writeback + * at the end of the iteration, and thus avoids the pitfall of having the page + * and buffers potentially freed after every call to end_buffer_async_write. */ static void xfs_finish_page_writeback( @@ -97,29 +97,44 @@ xfs_finish_page_writeback( struct bio_vec *bvec, int error) { - unsigned int end = bvec->bv_offset + bvec->bv_len - 1; - struct buffer_head *head, *bh, *next; + struct buffer_head *head = page_buffers(bvec->bv_page), *bh = head; + bool busy = false; unsigned int off = 0; - unsigned int bsize; + unsigned long flags; ASSERT(bvec->bv_offset < PAGE_SIZE); ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0); - ASSERT(end < PAGE_SIZE); + ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE); ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0); - bh = head = page_buffers(bvec->bv_page); - - bsize = bh->b_size; + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &head->b_state); do { - if (off > end) - break; - next = bh->b_this_page; - if (off < bvec->bv_offset) - goto next_bh; - bh->b_end_io(bh, !error); -next_bh: - off += bsize; - } while ((bh = next) != head); + if (off >= bvec->bv_offset && + off < bvec->bv_offset + bvec->bv_len) { + ASSERT(buffer_async_write(bh)); + ASSERT(bh->b_end_io == NULL); + + if (error) { + mark_buffer_write_io_error(bh); + clear_buffer_uptodate(bh); + SetPageError(bvec->bv_page); + } else { + set_buffer_uptodate(bh); + } + clear_buffer_async_write(bh); + unlock_buffer(bh); + } else if (buffer_async_write(bh)) { + ASSERT(buffer_locked(bh)); + busy = true; + } + off += bh->b_size; + } while ((bh = bh->b_this_page) != head); + bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); + local_irq_restore(flags); + + if (!busy) + end_page_writeback(bvec->bv_page); } /* @@ -133,8 +148,10 @@ xfs_destroy_ioend( int error) { struct inode *inode = ioend->io_inode; - struct bio *last = ioend->io_bio; - struct bio *bio, *next; + struct bio *bio = &ioend->io_inline_bio; + struct bio *last = ioend->io_bio, *next; + u64 start = bio->bi_iter.bi_sector; + bool quiet = bio_flagged(bio, BIO_QUIET); for (bio = &ioend->io_inline_bio; bio; bio = next) { struct bio_vec *bvec; @@ -155,6 +172,11 @@ xfs_destroy_ioend( bio_put(bio); } + + if (unlikely(error && !quiet)) { + xfs_err_ratelimited(XFS_I(inode)->i_mount, + "writeback error on sector %llu", start); + } } /* @@ -423,7 +445,8 @@ xfs_start_buffer_writeback( ASSERT(!buffer_delay(bh)); ASSERT(!buffer_unwritten(bh)); - mark_buffer_async_write(bh); + bh->b_end_io = NULL; + set_buffer_async_write(bh); set_buffer_uptodate(bh); clear_buffer_dirty(bh); } diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index be0b79d8900f..ebd66b19fbfc 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -97,7 +97,7 @@ xfs_attr3_leaf_freextent( /* * Roll to next transaction. */ - error = xfs_trans_roll(trans, dp); + error = xfs_trans_roll_inode(trans, dp); if (error) return error; } @@ -308,7 +308,7 @@ xfs_attr3_node_inactive( /* * Atomically commit the whole invalidate stuff. */ - error = xfs_trans_roll(trans, dp); + error = xfs_trans_roll_inode(trans, dp); if (error) return error; } @@ -375,7 +375,7 @@ xfs_attr3_root_inactive( /* * Commit the invalidate and start the next transaction. */ - error = xfs_trans_roll(trans, dp); + error = xfs_trans_roll_inode(trans, dp); return error; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 88073910fa5d..dd136f7275e4 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -502,7 +502,7 @@ xfs_bui_recover( } /* Finish transaction, free inodes. */ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto err_dfops; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 93e955262d07..cd9a5400ba4f 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -222,22 +222,21 @@ xfs_bmap_eof( * Count leaf blocks given a range of extent records. Delayed allocation * extents are not counted towards the totals. */ -STATIC void +xfs_extnum_t xfs_bmap_count_leaves( struct xfs_ifork *ifp, - xfs_extnum_t *numrecs, xfs_filblks_t *count) { - xfs_extnum_t i; - xfs_extnum_t nr_exts = xfs_iext_count(ifp); - - for (i = 0; i < nr_exts; i++) { - xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, i); - if (!isnullstartblock(xfs_bmbt_get_startblock(frp))) { - (*numrecs)++; - *count += xfs_bmbt_get_blockcount(frp); + struct xfs_bmbt_irec got; + xfs_extnum_t numrecs = 0, i = 0; + + while (xfs_iext_get_extent(ifp, i++, &got)) { + if (!isnullstartblock(got.br_startblock)) { + *count += got.br_blockcount; + numrecs++; } } + return numrecs; } /* @@ -370,7 +369,7 @@ xfs_bmap_count_blocks( switch (XFS_IFORK_FORMAT(ip, whichfork)) { case XFS_DINODE_FMT_EXTENTS: - xfs_bmap_count_leaves(ifp, nextents, count); + *nextents = xfs_bmap_count_leaves(ifp, count); return 0; case XFS_DINODE_FMT_BTREE: if (!(ifp->if_flags & XFS_IFEXTENTS)) { @@ -1136,7 +1135,7 @@ xfs_alloc_file_space( /* * Complete the transaction */ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto error0; @@ -1202,7 +1201,8 @@ xfs_unmap_extent( if (error) goto out_bmap_cancel; - error = xfs_defer_finish(&tp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -1496,7 +1496,7 @@ xfs_shift_file_space( if (error) goto out_bmap_cancel; - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -1777,7 +1777,8 @@ xfs_swap_extent_rmap( if (error) goto out_defer; - error = xfs_defer_finish(tpp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(tpp, &dfops); if (error) goto out_defer; @@ -1840,29 +1841,18 @@ xfs_swap_extent_forks( } /* - * Before we've swapped the forks, lets set the owners of the forks - * appropriately. We have to do this as we are demand paging the btree - * buffers, and so the validation done on read will expect the owner - * field to be correctly set. Once we change the owners, we can swap the - * inode forks. + * Btree format (v3) inodes have the inode number stamped in the bmbt + * block headers. We can't start changing the bmbt blocks until the + * inode owner change is logged so recovery does the right thing in the + * event of a crash. Set the owner change log flags now and leave the + * bmbt scan as the last step. */ if (ip->i_d.di_version == 3 && - ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + ip->i_d.di_format == XFS_DINODE_FMT_BTREE) (*target_log_flags) |= XFS_ILOG_DOWNER; - error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, - tip->i_ino, NULL); - if (error) - return error; - } - if (tip->i_d.di_version == 3 && - tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + tip->i_d.di_format == XFS_DINODE_FMT_BTREE) (*src_log_flags) |= XFS_ILOG_DOWNER; - error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, - ip->i_ino, NULL); - if (error) - return error; - } /* * Swap the data forks of the inodes @@ -1940,6 +1930,48 @@ xfs_swap_extent_forks( return 0; } +/* + * Fix up the owners of the bmbt blocks to refer to the current inode. The + * change owner scan attempts to order all modified buffers in the current + * transaction. In the event of ordered buffer failure, the offending buffer is + * physically logged as a fallback and the scan returns -EAGAIN. We must roll + * the transaction in this case to replenish the fallback log reservation and + * restart the scan. This process repeats until the scan completes. + */ +static int +xfs_swap_change_owner( + struct xfs_trans **tpp, + struct xfs_inode *ip, + struct xfs_inode *tmpip) +{ + int error; + struct xfs_trans *tp = *tpp; + + do { + error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino, + NULL); + /* success or fatal error */ + if (error != -EAGAIN) + break; + + error = xfs_trans_roll(tpp); + if (error) + break; + tp = *tpp; + + /* + * Redirty both inodes so they can relog and keep the log tail + * moving forward. + */ + xfs_trans_ijoin(tp, ip, 0); + xfs_trans_ijoin(tp, tmpip, 0); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE); + } while (true); + + return error; +} + int xfs_swap_extents( struct xfs_inode *ip, /* target inode */ @@ -1954,7 +1986,7 @@ xfs_swap_extents( int lock_flags; struct xfs_ifork *cowfp; uint64_t f; - int resblks; + int resblks = 0; /* * Lock the inodes against other IO, page faults and truncate to @@ -2002,11 +2034,8 @@ xfs_swap_extents( XFS_SWAP_RMAP_SPACE_RES(mp, XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK), XFS_DATA_FORK); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, - 0, 0, &tp); - } else - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, - 0, 0, &tp); + } + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); if (error) goto out_unlock; @@ -2092,6 +2121,23 @@ xfs_swap_extents( xfs_trans_log_inode(tp, tip, target_log_flags); /* + * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems + * have inode number owner values in the bmbt blocks that still refer to + * the old inode. Scan each bmbt to fix up the owner values with the + * inode number of the current inode. + */ + if (src_log_flags & XFS_ILOG_DOWNER) { + error = xfs_swap_change_owner(&tp, ip, tip); + if (error) + goto out_trans_cancel; + } + if (target_log_flags & XFS_ILOG_DOWNER) { + error = xfs_swap_change_owner(&tp, tip, ip); + if (error) + goto out_trans_cancel; + } + + /* * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. */ diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 0cede1043571..0eaa81dc49be 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -70,6 +70,7 @@ int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb); +xfs_extnum_t xfs_bmap_count_leaves(struct xfs_ifork *ifp, xfs_filblks_t *count); int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, xfs_extnum_t *nextents, xfs_filblks_t *count); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index f6a8422e9562..e0a0af0946f2 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -29,6 +29,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_log.h" +#include "xfs_inode.h" kmem_zone_t *xfs_buf_item_zone; @@ -322,6 +323,8 @@ xfs_buf_item_format( ASSERT((bip->bli_flags & XFS_BLI_STALE) || (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF)); + ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) || + (bip->bli_flags & XFS_BLI_STALE)); /* @@ -346,16 +349,6 @@ xfs_buf_item_format( bip->bli_flags &= ~XFS_BLI_INODE_BUF; } - if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) == - XFS_BLI_ORDERED) { - /* - * The buffer has been logged just to order it. It is not being - * included in the transaction commit, so don't format it. - */ - trace_xfs_buf_item_format_ordered(bip); - return; - } - for (i = 0; i < bip->bli_format_count; i++) { xfs_buf_item_format_segment(bip, lv, &vecp, offset, &bip->bli_formats[i]); @@ -574,26 +567,20 @@ xfs_buf_item_unlock( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - bool clean; - bool aborted; - int flags; + bool aborted = !!(lip->li_flags & XFS_LI_ABORTED); + bool hold = !!(bip->bli_flags & XFS_BLI_HOLD); + bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY); +#if defined(DEBUG) || defined(XFS_WARN) + bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED); +#endif /* Clear the buffer's association with this transaction. */ bp->b_transp = NULL; /* - * If this is a transaction abort, don't return early. Instead, allow - * the brelse to happen. Normally it would be done for stale - * (cancelled) buffers at unpin time, but we'll never go through the - * pin/unpin cycle if we abort inside commit. + * The per-transaction state has been copied above so clear it from the + * bli. */ - aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false; - /* - * Before possibly freeing the buf item, copy the per-transaction state - * so we can reference it safely later after clearing it from the - * buffer log item. - */ - flags = bip->bli_flags; bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); /* @@ -601,7 +588,7 @@ xfs_buf_item_unlock( * unlock the buffer and free the buf item when the buffer is unpinned * for the last time. */ - if (flags & XFS_BLI_STALE) { + if (bip->bli_flags & XFS_BLI_STALE) { trace_xfs_buf_item_unlock_stale(bip); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); if (!aborted) { @@ -619,20 +606,11 @@ xfs_buf_item_unlock( * regardless of whether it is dirty or not. A dirty abort implies a * shutdown, anyway. * - * Ordered buffers are dirty but may have no recorded changes, so ensure - * we only release clean items here. + * The bli dirty state should match whether the blf has logged segments + * except for ordered buffers, where only the bli should be dirty. */ - clean = (flags & XFS_BLI_DIRTY) ? false : true; - if (clean) { - int i; - for (i = 0; i < bip->bli_format_count; i++) { - if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, - bip->bli_formats[i].blf_map_size)) { - clean = false; - break; - } - } - } + ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) || + (ordered && dirty && !xfs_buf_item_dirty_format(bip))); /* * Clean buffers, by definition, cannot be in the AIL. However, aborted @@ -651,11 +629,11 @@ xfs_buf_item_unlock( ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); - } else if (clean) + } else if (!dirty) xfs_buf_item_relse(bp); } - if (!(flags & XFS_BLI_HOLD)) + if (!hold) xfs_buf_relse(bp); } @@ -945,14 +923,22 @@ xfs_buf_item_log( /* - * Return 1 if the buffer has been logged or ordered in a transaction (at any - * point, not just the current transaction) and 0 if not. + * Return true if the buffer has any ranges logged/dirtied by a transaction, + * false otherwise. */ -uint -xfs_buf_item_dirty( - xfs_buf_log_item_t *bip) +bool +xfs_buf_item_dirty_format( + struct xfs_buf_log_item *bip) { - return (bip->bli_flags & XFS_BLI_DIRTY); + int i; + + for (i = 0; i < bip->bli_format_count; i++) { + if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, + bip->bli_formats[i].blf_map_size)) + return true; + } + + return false; } STATIC void @@ -1054,6 +1040,31 @@ xfs_buf_do_callbacks( } } +/* + * Invoke the error state callback for each log item affected by the failed I/O. + * + * If a metadata buffer write fails with a non-permanent error, the buffer is + * eventually resubmitted and so the completion callbacks are not run. The error + * state may need to be propagated to the log items attached to the buffer, + * however, so the next AIL push of the item knows hot to handle it correctly. + */ +STATIC void +xfs_buf_do_callbacks_fail( + struct xfs_buf *bp) +{ + struct xfs_log_item *next; + struct xfs_log_item *lip = bp->b_fspriv; + struct xfs_ail *ailp = lip->li_ailp; + + spin_lock(&ailp->xa_lock); + for (; lip; lip = next) { + next = lip->li_bio_list; + if (lip->li_ops->iop_error) + lip->li_ops->iop_error(lip, bp); + } + spin_unlock(&ailp->xa_lock); +} + static bool xfs_buf_iodone_callback_error( struct xfs_buf *bp) @@ -1123,7 +1134,11 @@ xfs_buf_iodone_callback_error( if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) goto permanent_error; - /* still a transient error, higher layers will retry */ + /* + * Still a transient error, run IO completion failure callbacks and let + * the higher layers retry the buffer. + */ + xfs_buf_do_callbacks_fail(bp); xfs_buf_ioerror(bp, 0); xfs_buf_relse(bp); return true; @@ -1204,3 +1219,31 @@ xfs_buf_iodone( xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); xfs_buf_item_free(BUF_ITEM(lip)); } + +/* + * Requeue a failed buffer for writeback + * + * Return true if the buffer has been re-queued properly, false otherwise + */ +bool +xfs_buf_resubmit_failed_buffers( + struct xfs_buf *bp, + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + struct xfs_log_item *next; + + /* + * Clear XFS_LI_FAILED flag from all items before resubmit + * + * XFS_LI_FAILED set/clear is protected by xa_lock, caller this + * function already have it acquired + */ + for (; lip; lip = next) { + next = lip->li_bio_list; + xfs_clear_li_failed(lip); + } + + /* Add this buffer back to the delayed write list */ + return xfs_buf_delwri_queue(bp, buffer_list); +} diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index f7eba99d19dd..9690ce62c9a7 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -64,12 +64,15 @@ typedef struct xfs_buf_log_item { int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_relse(struct xfs_buf *); void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); -uint xfs_buf_item_dirty(xfs_buf_log_item_t *); +bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_attach_iodone(struct xfs_buf *, void(*)(struct xfs_buf *, xfs_log_item_t *), xfs_log_item_t *); void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); +bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *, + struct xfs_log_item *, + struct list_head *); extern kmem_zone_t *xfs_buf_item_zone; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index fd2ef8c2c9a7..cd82429d8df7 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -383,7 +383,7 @@ xfs_qm_dqalloc( xfs_trans_bhold(tp, bp); - error = xfs_defer_finish(tpp, &dfops, NULL); + error = xfs_defer_finish(tpp, &dfops); if (error) goto error1; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 2f4feb959bfb..bd786a9ac2c3 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -57,6 +57,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_AG_RESV_CRITICAL, XFS_RANDOM_DROP_WRITES, XFS_RANDOM_LOG_BAD_CRC, + XFS_RANDOM_LOG_ITEM_PIN, }; struct xfs_errortag_attr { @@ -161,6 +162,7 @@ XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE); XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL); XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES); XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); +XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -193,6 +195,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(ag_resv_critical), XFS_ERRORTAG_ATTR_LIST(drop_writes), XFS_ERRORTAG_ATTR_LIST(log_bad_crc), + XFS_ERRORTAG_ATTR_LIST(log_item_pin), NULL, }; diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 7577be5f09bc..7c4bef3bddb7 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -106,7 +106,8 @@ extern void xfs_verifier_error(struct xfs_buf *bp); */ #define XFS_ERRTAG_DROP_WRITES 28 #define XFS_ERRTAG_LOG_BAD_CRC 29 -#define XFS_ERRTAG_MAX 30 +#define XFS_ERRTAG_LOG_ITEM_PIN 30 +#define XFS_ERRTAG_MAX 31 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -141,6 +142,7 @@ extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_RANDOM_AG_RESV_CRITICAL 4 #define XFS_RANDOM_DROP_WRITES 1 #define XFS_RANDOM_LOG_BAD_CRC 1 +#define XFS_RANDOM_LOG_ITEM_PIN 1 #ifdef DEBUG extern int xfs_errortag_init(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c4893e226fd8..0debbc7e3f03 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1011,96 +1011,67 @@ xfs_file_llseek( * page_lock (MM) * i_lock (XFS - extent map serialisation) */ - -/* - * mmap()d file has taken write protection fault and is being made writable. We - * can set the page state up correctly for a writable page, which means we can - * do correct delalloc accounting (ENOSPC checking!) and unwritten extent - * mapping. - */ -STATIC int -xfs_filemap_page_mkwrite( - struct vm_fault *vmf) +static int +__xfs_filemap_fault( + struct vm_fault *vmf, + enum page_entry_size pe_size, + bool write_fault) { struct inode *inode = file_inode(vmf->vma->vm_file); + struct xfs_inode *ip = XFS_I(inode); int ret; - trace_xfs_filemap_page_mkwrite(XFS_I(inode)); + trace_xfs_filemap_fault(ip, pe_size, write_fault); - sb_start_pagefault(inode->i_sb); - file_update_time(vmf->vma->vm_file); - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + if (write_fault) { + sb_start_pagefault(inode->i_sb); + file_update_time(vmf->vma->vm_file); + } + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops); } else { - ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); - ret = block_page_mkwrite_return(ret); + if (write_fault) + ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); + else + ret = filemap_fault(vmf); } - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - sb_end_pagefault(inode->i_sb); + if (write_fault) + sb_end_pagefault(inode->i_sb); return ret; } -STATIC int +static int xfs_filemap_fault( struct vm_fault *vmf) { - struct inode *inode = file_inode(vmf->vma->vm_file); - int ret; - - trace_xfs_filemap_fault(XFS_I(inode)); - /* DAX can shortcut the normal fault path on write faults! */ - if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode)) - return xfs_filemap_page_mkwrite(vmf); - - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - if (IS_DAX(inode)) - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); - else - ret = filemap_fault(vmf); - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - - return ret; + return __xfs_filemap_fault(vmf, PE_SIZE_PTE, + IS_DAX(file_inode(vmf->vma->vm_file)) && + (vmf->flags & FAULT_FLAG_WRITE)); } -/* - * Similar to xfs_filemap_fault(), the DAX fault path can call into here on - * both read and write faults. Hence we need to handle both cases. There is no - * ->huge_mkwrite callout for huge pages, so we have a single function here to - * handle both cases here. @flags carries the information on the type of fault - * occuring. - */ -STATIC int +static int xfs_filemap_huge_fault( struct vm_fault *vmf, enum page_entry_size pe_size) { - struct inode *inode = file_inode(vmf->vma->vm_file); - struct xfs_inode *ip = XFS_I(inode); - int ret; - - if (!IS_DAX(inode)) + if (!IS_DAX(file_inode(vmf->vma->vm_file))) return VM_FAULT_FALLBACK; - trace_xfs_filemap_huge_fault(ip); - - if (vmf->flags & FAULT_FLAG_WRITE) { - sb_start_pagefault(inode->i_sb); - file_update_time(vmf->vma->vm_file); - } - - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops); - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - - if (vmf->flags & FAULT_FLAG_WRITE) - sb_end_pagefault(inode->i_sb); + /* DAX can shortcut the normal fault path on write faults! */ + return __xfs_filemap_fault(vmf, pe_size, + (vmf->flags & FAULT_FLAG_WRITE)); +} - return ret; +static int +xfs_filemap_page_mkwrite( + struct vm_fault *vmf) +{ + return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); } /* diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0a9e6985a0d0..34227115a5d6 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1124,11 +1124,11 @@ reclaim: * Because we use RCU freeing we need to ensure the inode always appears * to be reclaimed with an invalid inode number when in the free state. * We do this as early as possible under the ILOCK so that - * xfs_iflush_cluster() can be guaranteed to detect races with us here. - * By doing this, we guarantee that once xfs_iflush_cluster has locked - * XFS_ILOCK that it will see either a valid, flushable inode that will - * serialise correctly, or it will see a clean (and invalid) inode that - * it can skip. + * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to + * detect races with us here. By doing this, we guarantee that once + * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that + * it will see either a valid inode that will serialise correctly, or it + * will see an invalid inode that it can skip. */ spin_lock(&ip->i_flags_lock); ip->i_flags = XFS_IRECLAIM; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ceef77c0416a..5599dda4727a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -874,7 +874,6 @@ xfs_ialloc( case S_IFREG: case S_IFDIR: if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { - uint64_t di_flags2 = 0; uint di_flags = 0; if (S_ISDIR(mode)) { @@ -911,20 +910,23 @@ xfs_ialloc( di_flags |= XFS_DIFLAG_NODEFRAG; if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; - if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) - di_flags2 |= XFS_DIFLAG2_DAX; ip->i_d.di_flags |= di_flags; - ip->i_d.di_flags2 |= di_flags2; } if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) && pip->i_d.di_version == 3 && ip->i_d.di_version == 3) { + uint64_t di_flags2 = 0; + if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { - ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; } + if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) + di_flags2 |= XFS_DIFLAG2_DAX; + + ip->i_d.di_flags2 |= di_flags2; } /* FALLTHROUGH */ case S_IFLNK: @@ -1053,7 +1055,7 @@ xfs_dir_ialloc( tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); } - code = xfs_trans_roll(&tp, NULL); + code = xfs_trans_roll(&tp); if (committed != NULL) *committed = 1; @@ -1283,7 +1285,7 @@ xfs_create( */ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -1511,7 +1513,7 @@ xfs_link( if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) { xfs_defer_cancel(&dfops); goto error_return; @@ -1605,11 +1607,12 @@ xfs_itruncate_extents( * Duplicate the transaction that has the permanent * reservation and commit the old transaction. */ - error = xfs_defer_finish(&tp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; - error = xfs_trans_roll(&tp, ip); + error = xfs_trans_roll_inode(&tp, ip); if (error) goto out; } @@ -1853,7 +1856,7 @@ xfs_inactive_ifree( * Just ignore errors at this point. There is nothing we can do except * to try to keep going. Make sure it's not a silent error. */ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) { xfs_notice(mp, "%s: xfs_defer_finish returned error %d", __func__, error); @@ -2357,11 +2360,24 @@ retry: * already marked stale. If we can't lock it, back off * and retry. */ - if (ip != free_ip && - !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - rcu_read_unlock(); - delay(1); - goto retry; + if (ip != free_ip) { + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + rcu_read_unlock(); + delay(1); + goto retry; + } + + /* + * Check the inode number again in case we're + * racing with freeing in xfs_reclaim_inode(). + * See the comments in that function for more + * information as to why the initial check is + * not sufficient. + */ + if (ip->i_ino != inum + i) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + continue; + } } rcu_read_unlock(); @@ -2635,7 +2651,7 @@ xfs_remove( if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -2721,7 +2737,7 @@ xfs_finish_rename( if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_defer_finish(&tp, dfops, NULL); + error = xfs_defer_finish(&tp, dfops); if (error) { xfs_defer_cancel(dfops); xfs_trans_cancel(tp); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 013cc78d7daf..6d0f74ec31e8 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -27,6 +27,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_trans_priv.h" +#include "xfs_buf_item.h" #include "xfs_log.h" @@ -475,6 +476,23 @@ xfs_inode_item_unpin( wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); } +/* + * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer + * have been failed during writeback + * + * This informs the AIL that the inode is already flush locked on the next push, + * and acquires a hold on the buffer to ensure that it isn't reclaimed before + * dirty data makes it to disk. + */ +STATIC void +xfs_inode_item_error( + struct xfs_log_item *lip, + struct xfs_buf *bp) +{ + ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode)); + xfs_set_li_failed(lip, bp); +} + STATIC uint xfs_inode_item_push( struct xfs_log_item *lip, @@ -484,13 +502,28 @@ xfs_inode_item_push( { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - struct xfs_buf *bp = NULL; + struct xfs_buf *bp = lip->li_buf; uint rval = XFS_ITEM_SUCCESS; int error; if (xfs_ipincount(ip) > 0) return XFS_ITEM_PINNED; + /* + * The buffer containing this item failed to be written back + * previously. Resubmit the buffer for IO. + */ + if (lip->li_flags & XFS_LI_FAILED) { + if (!xfs_buf_trylock(bp)) + return XFS_ITEM_LOCKED; + + if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list)) + rval = XFS_ITEM_FLUSHING; + + xfs_buf_unlock(bp); + return rval; + } + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) return XFS_ITEM_LOCKED; @@ -622,7 +655,8 @@ static const struct xfs_item_ops xfs_inode_item_ops = { .iop_unlock = xfs_inode_item_unlock, .iop_committed = xfs_inode_item_committed, .iop_push = xfs_inode_item_push, - .iop_committing = xfs_inode_item_committing + .iop_committing = xfs_inode_item_committing, + .iop_error = xfs_inode_item_error }; @@ -710,7 +744,8 @@ xfs_iflush_done( * the AIL lock. */ iip = INODE_ITEM(blip); - if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) + if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || + lip->li_flags & XFS_LI_FAILED) need_ail++; blip = next; @@ -718,7 +753,8 @@ xfs_iflush_done( /* make sure we capture the state of the initial inode. */ iip = INODE_ITEM(lip); - if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) + if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) || + lip->li_flags & XFS_LI_FAILED) need_ail++; /* @@ -739,6 +775,9 @@ xfs_iflush_done( if (INODE_ITEM(blip)->ili_logged && blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) mlip_changed |= xfs_ail_delete_one(ailp, blip); + else { + xfs_clear_li_failed(blip); + } } if (mlip_changed) { diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 9c0c7a920304..5049e8ab6e30 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -931,16 +931,15 @@ xfs_ioc_fsgetxattr( return 0; } -STATIC void -xfs_set_diflags( +STATIC uint16_t +xfs_flags2diflags( struct xfs_inode *ip, unsigned int xflags) { - unsigned int di_flags; - uint64_t di_flags2; - /* can't set PREALLOC this way, just preserve it */ - di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); + uint16_t di_flags = + (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); + if (xflags & FS_XFLAG_IMMUTABLE) di_flags |= XFS_DIFLAG_IMMUTABLE; if (xflags & FS_XFLAG_APPEND) @@ -970,19 +969,24 @@ xfs_set_diflags( if (xflags & FS_XFLAG_EXTSIZE) di_flags |= XFS_DIFLAG_EXTSIZE; } - ip->i_d.di_flags = di_flags; - /* diflags2 only valid for v3 inodes. */ - if (ip->i_d.di_version < 3) - return; + return di_flags; +} + +STATIC uint64_t +xfs_flags2diflags2( + struct xfs_inode *ip, + unsigned int xflags) +{ + uint64_t di_flags2 = + (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); - di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); if (xflags & FS_XFLAG_DAX) di_flags2 |= XFS_DIFLAG2_DAX; if (xflags & FS_XFLAG_COWEXTSIZE) di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; - ip->i_d.di_flags2 = di_flags2; + return di_flags2; } STATIC void @@ -1008,11 +1012,12 @@ xfs_diflags_to_linux( inode->i_flags |= S_NOATIME; else inode->i_flags &= ~S_NOATIME; +#if 0 /* disabled until the flag switching races are sorted out */ if (xflags & FS_XFLAG_DAX) inode->i_flags |= S_DAX; else inode->i_flags &= ~S_DAX; - +#endif } static int @@ -1022,6 +1027,7 @@ xfs_ioctl_setattr_xflags( struct fsxattr *fa) { struct xfs_mount *mp = ip->i_mount; + uint64_t di_flags2; /* Can't change realtime flag if any extents are allocated. */ if ((ip->i_d.di_nextents || ip->i_delayed_blks) && @@ -1052,7 +1058,14 @@ xfs_ioctl_setattr_xflags( !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; - xfs_set_diflags(ip, fa->fsx_xflags); + /* diflags2 only valid for v3 inodes. */ + di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); + if (di_flags2 && ip->i_d.di_version < 3) + return -EINVAL; + + ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags); + ip->i_d.di_flags2 = di_flags2; + xfs_diflags_to_linux(ip); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 813394c62849..79cb5b3d140c 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -274,7 +274,7 @@ xfs_iomap_write_direct( /* * Complete the transaction */ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -520,7 +520,6 @@ xfs_file_iomap_begin_delay( struct inode *inode, loff_t offset, loff_t count, - unsigned flags, struct iomap *iomap) { struct xfs_inode *ip = XFS_I(inode); @@ -784,7 +783,7 @@ xfs_iomap_write_allocate( if (error) goto trans_cancel; - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto trans_cancel; @@ -906,7 +905,7 @@ xfs_iomap_write_unwritten( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto error_on_bmapi_transaction; @@ -984,8 +983,7 @@ xfs_file_iomap_begin( if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) && !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { /* Reserve delalloc blocks for regular writeback. */ - return xfs_file_iomap_begin_delay(inode, offset, length, flags, - iomap); + return xfs_file_iomap_begin_delay(inode, offset, length, iomap); } if (need_excl_ilock(ip, flags)) { diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 469c9fa4c178..17081c77ef86 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -817,7 +817,7 @@ xfs_vn_setattr_nonsize( * Caution: The caller of this function is responsible for calling * setattr_prepare() or otherwise verifying the change is fine. */ -int +STATIC int xfs_setattr_size( struct xfs_inode *ip, struct iattr *iattr) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 0053bcf2b10a..c5107c7bc4bf 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -743,15 +743,45 @@ xfs_log_mount_finish( struct xfs_mount *mp) { int error = 0; + bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY); if (mp->m_flags & XFS_MOUNT_NORECOVERY) { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); return 0; + } else if (readonly) { + /* Allow unlinked processing to proceed */ + mp->m_flags &= ~XFS_MOUNT_RDONLY; } + /* + * During the second phase of log recovery, we need iget and + * iput to behave like they do for an active filesystem. + * xfs_fs_drop_inode needs to be able to prevent the deletion + * of inodes before we're done replaying log items on those + * inodes. Turn it off immediately after recovery finishes + * so that we don't leak the quota inodes if subsequent mount + * activities fail. + * + * We let all inodes involved in redo item processing end up on + * the LRU instead of being evicted immediately so that if we do + * something to an unlinked inode, the irele won't cause + * premature truncation and freeing of the inode, which results + * in log recovery failure. We have to evict the unreferenced + * lru inodes after clearing MS_ACTIVE because we don't + * otherwise clean up the lru if there's a subsequent failure in + * xfs_mountfs, which leads to us leaking the inodes if nothing + * else (e.g. quotacheck) references the inodes before the + * mount failure occurs. + */ + mp->m_super->s_flags |= MS_ACTIVE; error = xlog_recover_finish(mp->m_log); if (!error) xfs_log_work_queue(mp); + mp->m_super->s_flags &= ~MS_ACTIVE; + evict_inodes(mp->m_super); + + if (readonly) + mp->m_flags |= XFS_MOUNT_RDONLY; return error; } @@ -801,11 +831,14 @@ xfs_log_unmount_write(xfs_mount_t *mp) int error; /* - * Don't write out unmount record on read-only mounts. + * Don't write out unmount record on norecovery mounts or ro devices. * Or, if we are doing a forced umount (typically because of IO errors). */ - if (mp->m_flags & XFS_MOUNT_RDONLY) + if (mp->m_flags & XFS_MOUNT_NORECOVERY || + xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); return 0; + } error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); @@ -3342,8 +3375,6 @@ maybe_sleep: */ if (iclog->ic_state & XLOG_STATE_IOERROR) return -EIO; - if (log_flushed) - *log_flushed = 1; } else { no_sleep: @@ -3447,8 +3478,6 @@ try_again: xlog_wait(&iclog->ic_prev->ic_write_wait, &log->l_icloglock); - if (log_flushed) - *log_flushed = 1; already_slept = 1; goto try_again; } @@ -3482,9 +3511,6 @@ try_again: */ if (iclog->ic_state & XLOG_STATE_IOERROR) return -EIO; - - if (log_flushed) - *log_flushed = 1; } else { /* just return */ spin_unlock(&log->l_icloglock); } diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index fbe72b134bef..43aa42a3a5d3 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -539,6 +539,7 @@ xlog_discard_endio( INIT_WORK(&ctx->discard_endio_work, xlog_discard_endio_work); queue_work(xfs_discard_wq, &ctx->discard_endio_work); + bio_put(bio); } static void diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9549188f5a36..ee34899396b2 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1029,61 +1029,106 @@ out_error: } /* - * Check the log tail for torn writes. This is required when torn writes are - * detected at the head and the head had to be walked back to a previous record. - * The tail of the previous record must now be verified to ensure the torn - * writes didn't corrupt the previous tail. + * Calculate distance from head to tail (i.e., unused space in the log). + */ +static inline int +xlog_tail_distance( + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk) +{ + if (head_blk < tail_blk) + return tail_blk - head_blk; + + return tail_blk + (log->l_logBBsize - head_blk); +} + +/* + * Verify the log tail. This is particularly important when torn or incomplete + * writes have been detected near the front of the log and the head has been + * walked back accordingly. + * + * We also have to handle the case where the tail was pinned and the head + * blocked behind the tail right before a crash. If the tail had been pushed + * immediately prior to the crash and the subsequent checkpoint was only + * partially written, it's possible it overwrote the last referenced tail in the + * log with garbage. This is not a coherency problem because the tail must have + * been pushed before it can be overwritten, but appears as log corruption to + * recovery because we have no way to know the tail was updated if the + * subsequent checkpoint didn't write successfully. * - * Return an error if CRC verification fails as recovery cannot proceed. + * Therefore, CRC check the log from tail to head. If a failure occurs and the + * offending record is within max iclog bufs from the head, walk the tail + * forward and retry until a valid tail is found or corruption is detected out + * of the range of a possible overwrite. */ STATIC int xlog_verify_tail( struct xlog *log, xfs_daddr_t head_blk, - xfs_daddr_t tail_blk) + xfs_daddr_t *tail_blk, + int hsize) { struct xlog_rec_header *thead; struct xfs_buf *bp; xfs_daddr_t first_bad; - int count; int error = 0; bool wrapped; - xfs_daddr_t tmp_head; + xfs_daddr_t tmp_tail; + xfs_daddr_t orig_tail = *tail_blk; bp = xlog_get_bp(log, 1); if (!bp) return -ENOMEM; /* - * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get - * a temporary head block that points after the last possible - * concurrently written record of the tail. + * Make sure the tail points to a record (returns positive count on + * success). */ - count = xlog_seek_logrec_hdr(log, head_blk, tail_blk, - XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead, - &wrapped); - if (count < 0) { - error = count; + error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp, + &tmp_tail, &thead, &wrapped); + if (error < 0) goto out; - } - - /* - * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran - * into the actual log head. tmp_head points to the start of the record - * so update it to the actual head block. - */ - if (count < XLOG_MAX_ICLOGS + 1) - tmp_head = head_blk; + if (*tail_blk != tmp_tail) + *tail_blk = tmp_tail; /* - * We now have a tail and temporary head block that covers at least - * XLOG_MAX_ICLOGS records from the tail. We need to verify that these - * records were completely written. Run a CRC verification pass from - * tail to head and return the result. + * Run a CRC check from the tail to the head. We can't just check + * MAX_ICLOGS records past the tail because the tail may point to stale + * blocks cleared during the search for the head/tail. These blocks are + * overwritten with zero-length records and thus record count is not a + * reliable indicator of the iclog state before a crash. */ - error = xlog_do_recovery_pass(log, tmp_head, tail_blk, + first_bad = 0; + error = xlog_do_recovery_pass(log, head_blk, *tail_blk, XLOG_RECOVER_CRCPASS, &first_bad); + while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) { + int tail_distance; + + /* + * Is corruption within range of the head? If so, retry from + * the next record. Otherwise return an error. + */ + tail_distance = xlog_tail_distance(log, head_blk, first_bad); + if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize)) + break; + /* skip to the next record; returns positive count on success */ + error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp, + &tmp_tail, &thead, &wrapped); + if (error < 0) + goto out; + + *tail_blk = tmp_tail; + first_bad = 0; + error = xlog_do_recovery_pass(log, head_blk, *tail_blk, + XLOG_RECOVER_CRCPASS, &first_bad); + } + + if (!error && *tail_blk != orig_tail) + xfs_warn(log->l_mp, + "Tail block (0x%llx) overwrite detected. Updated to 0x%llx", + orig_tail, *tail_blk); out: xlog_put_bp(bp); return error; @@ -1143,7 +1188,7 @@ xlog_verify_head( */ error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk, XLOG_RECOVER_CRCPASS, &first_bad); - if (error == -EFSBADCRC) { + if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) { /* * We've hit a potential torn write. Reset the error and warn * about it. @@ -1183,31 +1228,12 @@ xlog_verify_head( ASSERT(0); return 0; } - - /* - * Now verify the tail based on the updated head. This is - * required because the torn writes trimmed from the head could - * have been written over the tail of a previous record. Return - * any errors since recovery cannot proceed if the tail is - * corrupt. - * - * XXX: This leaves a gap in truly robust protection from torn - * writes in the log. If the head is behind the tail, the tail - * pushes forward to create some space and then a crash occurs - * causing the writes into the previous record's tail region to - * tear, log recovery isn't able to recover. - * - * How likely is this to occur? If possible, can we do something - * more intelligent here? Is it safe to push the tail forward if - * we can determine that the tail is within the range of the - * torn write (e.g., the kernel can only overwrite the tail if - * it has actually been pushed forward)? Alternatively, could we - * somehow prevent this condition at runtime? - */ - error = xlog_verify_tail(log, *head_blk, *tail_blk); } + if (error) + return error; - return error; + return xlog_verify_tail(log, *head_blk, tail_blk, + be32_to_cpu((*rhead)->h_size)); } /* @@ -4801,12 +4827,16 @@ xlog_recover_process_intents( int error = 0; struct xfs_ail_cursor cur; struct xfs_ail *ailp; +#if defined(DEBUG) || defined(XFS_WARN) xfs_lsn_t last_lsn; +#endif ailp = log->l_ailp; spin_lock(&ailp->xa_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); +#if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); +#endif while (lip != NULL) { /* * We're done when we see something other than an intent. @@ -5218,7 +5248,7 @@ xlog_do_recovery_pass( xfs_daddr_t *first_bad) /* out: first bad log rec */ { xlog_rec_header_t *rhead; - xfs_daddr_t blk_no; + xfs_daddr_t blk_no, rblk_no; xfs_daddr_t rhead_blk; char *offset; xfs_buf_t *hbp, *dbp; @@ -5231,7 +5261,7 @@ xlog_do_recovery_pass( LIST_HEAD (buffer_list); ASSERT(head_blk != tail_blk); - rhead_blk = 0; + blk_no = rhead_blk = tail_blk; for (i = 0; i < XLOG_RHASH_SIZE; i++) INIT_HLIST_HEAD(&rhash[i]); @@ -5309,7 +5339,6 @@ xlog_do_recovery_pass( } memset(rhash, 0, sizeof(rhash)); - blk_no = rhead_blk = tail_blk; if (tail_blk > head_blk) { /* * Perform recovery around the end of the physical log. @@ -5371,9 +5400,19 @@ xlog_do_recovery_pass( bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); blk_no += hblks; - /* Read in data for log record */ - if (blk_no + bblks <= log->l_logBBsize) { - error = xlog_bread(log, blk_no, bblks, dbp, + /* + * Read the log record data in multiple reads if it + * wraps around the end of the log. Note that if the + * header already wrapped, blk_no could point past the + * end of the log. The record data is contiguous in + * that case. + */ + if (blk_no + bblks <= log->l_logBBsize || + blk_no >= log->l_logBBsize) { + /* mod blk_no in case the header wrapped and + * pushed it beyond the end of the log */ + rblk_no = do_mod(blk_no, log->l_logBBsize); + error = xlog_bread(log, rblk_no, bblks, dbp, &offset); if (error) goto bread_err2; @@ -5563,6 +5602,8 @@ xlog_do_recover( xfs_buf_t *bp; xfs_sb_t *sbp; + trace_xfs_log_recover(log, head_blk, tail_blk); + /* * First replay the images in the log. */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 40d4e8b4e193..ea7d4b4e50d0 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -945,15 +945,6 @@ xfs_mountfs( } /* - * During the second phase of log recovery, we need iget and - * iput to behave like they do for an active filesystem. - * xfs_fs_drop_inode needs to be able to prevent the deletion - * of inodes before we're done replaying log items on those - * inodes. - */ - mp->m_super->s_flags |= MS_ACTIVE; - - /* * Finish recovering the file system. This part needed to be delayed * until after the root and real-time bitmap inodes were consistently * read in. @@ -1028,12 +1019,13 @@ xfs_mountfs( out_quota: xfs_qm_unmount_quotas(mp); out_rtunmount: - mp->m_super->s_flags &= ~MS_ACTIVE; xfs_rtunmount_inodes(mp); out_rele_rip: IRELE(rip); cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_WAIT); + /* Clean out dquots that might be in memory after quotacheck. */ + xfs_qm_unmount(mp); out_log_dealloc: mp->m_flags |= XFS_MOUNT_UNMOUNTING; xfs_log_mount_cancel(mp); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 6ce948c436d5..010a13a201aa 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -31,6 +31,7 @@ #include "xfs_error.h" #include "xfs_bmap.h" #include "xfs_bmap_btree.h" +#include "xfs_bmap_util.h" #include "xfs_trans.h" #include "xfs_trans_space.h" #include "xfs_qm.h" @@ -111,6 +112,9 @@ restart: skipped = 0; break; } + /* we're done if id overflows back to zero */ + if (!next_index) + break; } if (skipped) { @@ -1117,31 +1121,6 @@ xfs_qm_quotacheck_dqadjust( return 0; } -STATIC int -xfs_qm_get_rtblks( - xfs_inode_t *ip, - xfs_qcnt_t *O_rtblks) -{ - xfs_filblks_t rtblks; /* total rt blks */ - xfs_extnum_t idx; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t nextents; /* number of extent entries */ - int error; - - ASSERT(XFS_IS_REALTIME_INODE(ip)); - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK))) - return error; - } - rtblks = 0; - nextents = xfs_iext_count(ifp); - for (idx = 0; idx < nextents; idx++) - rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx)); - *O_rtblks = (xfs_qcnt_t)rtblks; - return 0; -} - /* * callback routine supplied to bulkstat(). Given an inumber, find its * dquots and update them to account for resources taken by that inode. @@ -1157,7 +1136,8 @@ xfs_qm_dqusage_adjust( int *res) /* result code value */ { xfs_inode_t *ip; - xfs_qcnt_t nblks, rtblks = 0; + xfs_qcnt_t nblks; + xfs_filblks_t rtblks = 0; /* total rt blks */ int error; ASSERT(XFS_IS_QUOTA_RUNNING(mp)); @@ -1187,12 +1167,15 @@ xfs_qm_dqusage_adjust( ASSERT(ip->i_delayed_blks == 0); if (XFS_IS_REALTIME_INODE(ip)) { - /* - * Walk thru the extent list and count the realtime blocks. - */ - error = xfs_qm_get_rtblks(ip, &rtblks); - if (error) - goto error0; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + goto error0; + } + + xfs_bmap_count_leaves(ifp, &rtblks); } nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 96fe209b5eb6..8f2e2fac4255 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -525,7 +525,7 @@ xfs_cui_recover( } xfs_refcount_finish_one_cleanup(tp, rcur, error); - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto abort_defer; set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index ab2270a87196..3246815c24d6 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -170,6 +170,8 @@ xfs_reflink_find_shared( error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); if (error) return error; + if (!agbp) + return -ENOMEM; cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); @@ -329,7 +331,7 @@ xfs_reflink_convert_cow_extent( xfs_filblks_t count_fsb, struct xfs_defer_ops *dfops) { - xfs_fsblock_t first_block; + xfs_fsblock_t first_block = NULLFSBLOCK; int nimaps = 1; if (imap->br_state == XFS_EXT_NORM) @@ -462,7 +464,7 @@ retry: goto out_bmap_cancel; /* Finish up. */ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -600,7 +602,8 @@ xfs_reflink_cancel_cow_blocks( -(long)del.br_blockcount); /* Roll the transaction */ - error = xfs_defer_finish(tpp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(tpp, &dfops); if (error) { xfs_defer_cancel(&dfops); break; @@ -789,7 +792,8 @@ xfs_reflink_end_cow( /* Remove the mapping from the CoW fork. */ xfs_bmap_del_extent_cow(ip, &idx, &got, &del); - error = xfs_defer_finish(&tp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_defer; next_extent: @@ -1150,7 +1154,8 @@ xfs_reflink_remap_extent( next_extent: /* Process all the deferred stuff. */ - error = xfs_defer_finish(&tp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_defer; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 91472193643b..488719d43ca8 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -810,7 +810,7 @@ xfs_growfs_rt_alloc( /* * Free any blocks freed up in the transaction, then commit. */ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; error = xfs_trans_commit(tp); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 38aaacdbb8b3..c1c4c2ea1014 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1220,7 +1220,7 @@ xfs_test_remount_options( tmp_mp->m_super = sb; error = xfs_parseargs(tmp_mp, options); xfs_free_fsname(tmp_mp); - kfree(tmp_mp); + kmem_free(tmp_mp); return error; } diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 23a50d7aa46a..68d3ca2c4968 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -378,7 +378,7 @@ xfs_symlink( xfs_trans_set_sync(tp); } - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -497,7 +497,8 @@ xfs_inactive_symlink_rmt( /* * Commit the first transaction. This logs the EFI and the inode. */ - error = xfs_defer_finish(&tp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(&tp, &dfops); if (error) goto error_bmap_cancel; /* diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index bcc3cdf8e1c5..bb5514688d47 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -517,7 +517,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); @@ -689,11 +688,34 @@ DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); -DEFINE_INODE_EVENT(xfs_filemap_fault); -DEFINE_INODE_EVENT(xfs_filemap_huge_fault); -DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite); DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite); +TRACE_EVENT(xfs_filemap_fault, + TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size, + bool write_fault), + TP_ARGS(ip, pe_size, write_fault), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(enum page_entry_size, pe_size) + __field(bool, write_fault) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->pe_size = pe_size; + __entry->write_fault = write_fault; + ), + TP_printk("dev %d:%d ino 0x%llx %s write_fault %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->pe_size, + { PE_SIZE_PTE, "PTE" }, + { PE_SIZE_PMD, "PMD" }, + { PE_SIZE_PUD, "PUD" }), + __entry->write_fault) +) + DECLARE_EVENT_CLASS(xfs_iref_class, TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), TP_ARGS(ip, caller_ip), @@ -1963,6 +1985,24 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \ DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); +TRACE_EVENT(xfs_log_recover, + TP_PROTO(struct xlog *log, xfs_daddr_t headblk, xfs_daddr_t tailblk), + TP_ARGS(log, headblk, tailblk), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, headblk) + __field(xfs_daddr_t, tailblk) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->headblk = headblk; + __entry->tailblk = tailblk; + ), + TP_printk("dev %d:%d headblk 0x%llx tailblk 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->headblk, + __entry->tailblk) +) + TRACE_EVENT(xfs_log_recover_record, TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass), TP_ARGS(log, rhead, pass), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 2011620008de..a87f657f59c9 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1035,25 +1035,18 @@ xfs_trans_cancel( */ int xfs_trans_roll( - struct xfs_trans **tpp, - struct xfs_inode *dp) + struct xfs_trans **tpp) { - struct xfs_trans *trans; + struct xfs_trans *trans = *tpp; struct xfs_trans_res tres; int error; /* - * Ensure that the inode is always logged. - */ - trans = *tpp; - if (dp) - xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); - - /* * Copy the critical parameters from one trans to the next. */ tres.tr_logres = trans->t_log_res; tres.tr_logcount = trans->t_log_count; + *tpp = xfs_trans_dup(trans); /* @@ -1067,10 +1060,8 @@ xfs_trans_roll( if (error) return error; - trans = *tpp; - /* - * Reserve space in the log for th next transaction. + * Reserve space in the log for the next transaction. * This also pushes items in the "AIL", the list of logged items, * out to disk if they are taking up space at the tail of the log * that we want to use. This requires that either nothing be locked @@ -1078,14 +1069,5 @@ xfs_trans_roll( * the prior and the next transactions. */ tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - error = xfs_trans_reserve(trans, &tres, 0, 0); - /* - * Ensure that the inode is in the new transaction and locked. - */ - if (error) - return error; - - if (dp) - xfs_trans_ijoin(trans, dp, 0); - return 0; + return xfs_trans_reserve(*tpp, &tres, 0, 0); } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 6bdad6f58934..815b53d20e26 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -49,6 +49,7 @@ typedef struct xfs_log_item { struct xfs_ail *li_ailp; /* ptr to AIL */ uint li_type; /* item type */ uint li_flags; /* misc flags */ + struct xfs_buf *li_buf; /* real buffer pointer */ struct xfs_log_item *li_bio_list; /* buffer item list */ void (*li_cb)(struct xfs_buf *, struct xfs_log_item *); @@ -64,11 +65,13 @@ typedef struct xfs_log_item { } xfs_log_item_t; #define XFS_LI_IN_AIL 0x1 -#define XFS_LI_ABORTED 0x2 +#define XFS_LI_ABORTED 0x2 +#define XFS_LI_FAILED 0x4 #define XFS_LI_FLAGS \ { XFS_LI_IN_AIL, "IN_AIL" }, \ - { XFS_LI_ABORTED, "ABORTED" } + { XFS_LI_ABORTED, "ABORTED" }, \ + { XFS_LI_FAILED, "FAILED" } struct xfs_item_ops { void (*iop_size)(xfs_log_item_t *, int *, int *); @@ -79,6 +82,7 @@ struct xfs_item_ops { void (*iop_unlock)(xfs_log_item_t *); xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); + void (*iop_error)(xfs_log_item_t *, xfs_buf_t *); }; void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, @@ -208,12 +212,14 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *); void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); -void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); +bool xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); -void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); +void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint, + uint); +void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); void xfs_extent_free_init_defer_op(void); @@ -224,7 +230,8 @@ int xfs_trans_free_extent(struct xfs_trans *, struct xfs_efd_log_item *, xfs_fsblock_t, xfs_extlen_t, struct xfs_owner_info *); int xfs_trans_commit(struct xfs_trans *); -int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); +int xfs_trans_roll(struct xfs_trans **); +int xfs_trans_roll_inode(struct xfs_trans **, struct xfs_inode *); void xfs_trans_cancel(xfs_trans_t *); int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 9056c0f34a3c..354368a906e5 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -325,6 +325,21 @@ xfs_ail_delete( xfs_trans_ail_cursor_clear(ailp, lip); } +static inline uint +xfsaild_push_item( + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + /* + * If log item pinning is enabled, skip the push and track the item as + * pinned. This can help induce head-behind-tail conditions. + */ + if (XFS_TEST_ERROR(false, ailp->xa_mount, XFS_ERRTAG_LOG_ITEM_PIN)) + return XFS_ITEM_PINNED; + + return lip->li_ops->iop_push(lip, &ailp->xa_buf_list); +} + static long xfsaild_push( struct xfs_ail *ailp) @@ -382,7 +397,7 @@ xfsaild_push( * rely on the AIL cursor implementation to be able to deal with * the dropped lock. */ - lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list); + lock_result = xfsaild_push_item(ailp, lip); switch (lock_result) { case XFS_ITEM_SUCCESS: XFS_STATS_INC(mp, xs_push_ail_success); @@ -687,12 +702,13 @@ xfs_trans_ail_update_bulk( bool xfs_ail_delete_one( struct xfs_ail *ailp, - struct xfs_log_item *lip) + struct xfs_log_item *lip) { struct xfs_log_item *mlip = xfs_ail_min(ailp); trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); xfs_ail_delete(ailp, lip); + xfs_clear_li_failed(lip); lip->li_flags &= ~XFS_LI_IN_AIL; lip->li_lsn = 0; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 86987d823d76..3ba7a96a8abd 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -435,7 +435,7 @@ xfs_trans_brelse(xfs_trans_t *tp, if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) { xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); - } else if (!xfs_buf_item_dirty(bip)) { + } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) { /*** ASSERT(bp->b_pincount == 0); ***/ @@ -493,25 +493,17 @@ xfs_trans_bhold_release(xfs_trans_t *tp, } /* - * This is called to mark bytes first through last inclusive of the given - * buffer as needing to be logged when the transaction is committed. - * The buffer must already be associated with the given transaction. - * - * First and last are numbers relative to the beginning of this buffer, - * so the first byte in the buffer is numbered 0 regardless of the - * value of b_blkno. + * Mark a buffer dirty in the transaction. */ void -xfs_trans_log_buf(xfs_trans_t *tp, - xfs_buf_t *bp, - uint first, - uint last) +xfs_trans_dirty_buf( + struct xfs_trans *tp, + struct xfs_buf *bp) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_fspriv; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); - ASSERT(first <= last && last < BBTOB(bp->b_length)); ASSERT(bp->b_iodone == NULL || bp->b_iodone == xfs_buf_iodone_callbacks); @@ -531,8 +523,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, bp->b_iodone = xfs_buf_iodone_callbacks; bip->bli_item.li_cb = xfs_buf_iodone; - trace_xfs_trans_log_buf(bip); - /* * If we invalidated the buffer within this transaction, then * cancel the invalidation now that we're dirtying the buffer @@ -545,17 +535,37 @@ xfs_trans_log_buf(xfs_trans_t *tp, bp->b_flags &= ~XBF_STALE; bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL; } + bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; tp->t_flags |= XFS_TRANS_DIRTY; bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; +} - /* - * If we have an ordered buffer we are not logging any dirty range but - * it still needs to be marked dirty and that it has been logged. - */ - bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; - if (!(bip->bli_flags & XFS_BLI_ORDERED)) - xfs_buf_item_log(bip, first, last); +/* + * This is called to mark bytes first through last inclusive of the given + * buffer as needing to be logged when the transaction is committed. + * The buffer must already be associated with the given transaction. + * + * First and last are numbers relative to the beginning of this buffer, + * so the first byte in the buffer is numbered 0 regardless of the + * value of b_blkno. + */ +void +xfs_trans_log_buf( + struct xfs_trans *tp, + struct xfs_buf *bp, + uint first, + uint last) +{ + struct xfs_buf_log_item *bip = bp->b_fspriv; + + ASSERT(first <= last && last < BBTOB(bp->b_length)); + ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED)); + + xfs_trans_dirty_buf(tp, bp); + + trace_xfs_trans_log_buf(bip); + xfs_buf_item_log(bip, first, last); } @@ -708,14 +718,13 @@ xfs_trans_inode_alloc_buf( } /* - * Mark the buffer as ordered for this transaction. This means - * that the contents of the buffer are not recorded in the transaction - * but it is tracked in the AIL as though it was. This allows us - * to record logical changes in transactions rather than the physical - * changes we make to the buffer without changing writeback ordering - * constraints of metadata buffers. + * Mark the buffer as ordered for this transaction. This means that the contents + * of the buffer are not recorded in the transaction but it is tracked in the + * AIL as though it was. This allows us to record logical changes in + * transactions rather than the physical changes we make to the buffer without + * changing writeback ordering constraints of metadata buffers. */ -void +bool xfs_trans_ordered_buf( struct xfs_trans *tp, struct xfs_buf *bp) @@ -726,8 +735,18 @@ xfs_trans_ordered_buf( ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); + if (xfs_buf_item_dirty_format(bip)) + return false; + bip->bli_flags |= XFS_BLI_ORDERED; trace_xfs_buf_item_ordered(bip); + + /* + * We don't log a dirty range of an ordered buffer but it still needs + * to be marked dirty and that it has been logged. + */ + xfs_trans_dirty_buf(tp, bp); + return true; } /* diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index dab8daa676f9..daa7615497f9 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -134,3 +134,17 @@ xfs_trans_log_inode( flags |= ip->i_itemp->ili_last_fields; ip->i_itemp->ili_fields |= flags; } + +int +xfs_trans_roll_inode( + struct xfs_trans **tpp, + struct xfs_inode *ip) +{ + int error; + + xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); + error = xfs_trans_roll(tpp); + if (!error) + xfs_trans_ijoin(*tpp, ip, 0); + return error; +} diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index d91706c56c63..b317a3644c00 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -164,4 +164,35 @@ xfs_trans_ail_copy_lsn( *dst = *src; } #endif + +static inline void +xfs_clear_li_failed( + struct xfs_log_item *lip) +{ + struct xfs_buf *bp = lip->li_buf; + + ASSERT(lip->li_flags & XFS_LI_IN_AIL); + lockdep_assert_held(&lip->li_ailp->xa_lock); + + if (lip->li_flags & XFS_LI_FAILED) { + lip->li_flags &= ~XFS_LI_FAILED; + lip->li_buf = NULL; + xfs_buf_rele(bp); + } +} + +static inline void +xfs_set_li_failed( + struct xfs_log_item *lip, + struct xfs_buf *bp) +{ + lockdep_assert_held(&lip->li_ailp->xa_lock); + + if (!(lip->li_flags & XFS_LI_FAILED)) { + xfs_buf_hold(bp); + lip->li_flags |= XFS_LI_FAILED; + lip->li_buf = bp; + } +} + #endif /* __XFS_TRANS_PRIV_H__ */ |